comic_book_downloader/web.py
2023-08-20 14:49:51 +08:00

75 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
class HtmlParser:
    """Extract image URLs from the HTML of a reading page."""

    def __init__(self, html_content: str):
        """Store the raw HTML; parsing happens in ``get_img_url_list``.

        Args:
            html_content: HTML source of the page to parse.
        """
        self.html_content = html_content

    def get_img_url_list(self):
        """Return the image URLs inside the ``div.reading-content`` element.

        Each ``<img>`` contributes its ``data-src`` attribute; when that
        attribute is absent or empty, ``src`` is used instead. Tags that
        carry neither are skipped. Surrounding whitespace is stripped from
        every URL.

        Returns:
            list[str]: URLs in document order. Empty when the page has no
            ``div.reading-content`` element or it contains no usable images.
        """
        soup = BeautifulSoup(self.html_content, "html.parser")
        container = soup.find("div", class_="reading-content")
        if container is None:
            # find() yields None when nothing matches; chaining .find_all on
            # it would raise an opaque AttributeError.
            print("No 'reading-content' container found in HTML, no images extracted.")
            return []

        img_urls = []
        for img_tag in container.find_all("img"):
            # .get() instead of attrs[...] so one tag without data-src does
            # not abort extraction for the whole page.
            img_url = (img_tag.get("data-src") or img_tag.get("src") or "").strip()
            if img_url:
                img_urls.append(img_url)
        return img_urls
class Requester:
    """Small wrapper around ``requests`` for fetching HTML pages and images."""

    def fetch_html(self, url: str, task_name: str, max_retries=3, timeout=30):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            task_name: Label placed in the raised error message so the
                caller can tell which task failed.
            max_retries: Passed to ``HTTPAdapter(max_retries=...)`` for both
                the ``http://`` and ``https://`` prefixes.
            timeout: Seconds passed to ``session.get``; without it a stalled
                server would block the call indefinitely.

        Returns:
            str: The decoded response body.

        Raises:
            Exception: If the request fails or the status code indicates an
                error. The original error is attached as ``__cause__``.
        """
        adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
        # The context manager closes the session (and its pooled
        # connections) on every exit path.
        with requests.Session() as session:
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            try:
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                return response.text
            except Exception as e:
                print(f"Error occurred while fetching HTML from {url}: {e}")
                # Chain the cause so the traceback keeps the real failure.
                raise Exception(f"{task_name}, 获取网页html失败") from e

    def fetch_image(self, img_url: str, max_retries=5, timeout=30):
        """Download the image at ``img_url`` and return its raw bytes.

        The request is attempted up to ``max_retries`` times. After each
        failed attempt a message with the failure reason is printed; once
        the attempts are exhausted a final message is printed and ``None``
        is returned instead of raising.

        Args:
            img_url: URL of the image.
            max_retries: Maximum number of download attempts. Defaults to 5.
            timeout: Seconds passed to ``requests.get`` for each attempt.

        Returns:
            bytes or None: The image content on success, ``None`` if every
            attempt failed (or if ``max_retries`` is 0).

        Example:
            image_data = Requester().fetch_image("https://example.com/image.jpg")
            if image_data:
                ...  # process the image bytes
            else:
                print("Image download failed.")
        """
        for retry in range(max_retries):
            try:
                # stream=True defers the body download until .content is
                # read, so raise_for_status() runs before an error body
                # would be fetched.
                with requests.get(img_url, stream=True, timeout=timeout) as response:
                    response.raise_for_status()
                    return response.content
            except Exception as e:
                # Deliberately broad: this method is best-effort and signals
                # failure through its None return value.
                if retry < max_retries - 1:
                    print(
                        f"Failed to download image {img_url}: {e}; "
                        f"retrying ({retry+1}/{max_retries})..."
                    )
                else:
                    print(
                        f"Failed to download image {img_url} after "
                        f"{max_retries} attempts, skipping: {e}"
                    )
        return None