comic_book_downloader/web.py
2023-08-21 10:00:38 +08:00

54 lines
2.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
class HtmlParser:
    """Extract comic page image URLs from a chapter page's HTML."""

    def __init__(self, html_content: str):
        """Keep the raw HTML markup so it can be parsed on demand.

        Args:
            html_content: The HTML document as text.
        """
        self.html_content = html_content

    def get_img_url_list(self) -> list:
        """Collect image URLs from the ``<div class="reading-content">`` element.

        Each ``<img>`` inside that div contributes its ``data-src`` attribute;
        when that attribute is absent or empty the plain ``src`` attribute is
        used instead. Tags carrying neither attribute are skipped.

        Returns:
            A list of URL strings in document order. The list is empty when
            the page has no ``reading-content`` div or the div has no usable
            images.
        """
        soup = BeautifulSoup(self.html_content, "html.parser")
        container = soup.find("div", class_="reading-content")
        # find() yields None when nothing matches; guard against it instead of
        # failing with an opaque AttributeError on the chained call.
        if container is None:
            return []

        img_urls = []
        for img_tag in container.find_all("img"):
            img_url = img_tag.get("data-src") or img_tag.get("src")
            if img_url:
                img_urls.append(img_url)
        return img_urls
class Requester:
    """Thin HTTP helper used to download chapter pages and page images."""

    def fetch_html(self, url: str, task_name: str, max_retries=3, timeout=30):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch.
            task_name: Label used in the error message when fetching fails.
            max_retries: Retry setting handed to the ``HTTPAdapter`` mounted
                for both ``http://`` and ``https://``.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block forever.

        Returns:
            The response body decoded as text.

        Raises:
            Exception: If the request fails or the server answers with an
                error status. The original error is attached as the cause.
        """
        # The context manager closes the session's pooled connections even
        # when the request fails.
        with requests.Session() as session:
            adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            try:
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                return response.text
            except Exception as e:
                print(f"Error occurred while fetching HTML from {url}: {e}")
                raise Exception(f"{task_name}, 获取网页html失败") from e

    def fetch_image(self, img_url: str, max_retries=5, timeout=30):
        """Request the content of an image from the given URL.

        Args:
            img_url: Address of the image.
            max_retries: Maximum number of download attempts. Defaults to 5.
            timeout: Seconds to wait for the server on each attempt before
                treating the attempt as failed.

        Returns:
            The image's binary data (``bytes``) on success, or ``None`` when
            every attempt failed (or ``max_retries`` is not positive).
        """
        for retry in range(max_retries):
            try:
                # stream=True defers the body download until .content is read,
                # so an error status is detected before the payload is pulled.
                with requests.get(img_url, stream=True, timeout=timeout) as response:
                    response.raise_for_status()
                    return response.content
            except Exception:
                # Best-effort download: report the failure and either try
                # again or give up, never propagate.
                if retry < max_retries - 1:
                    print(
                        f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                    )
                else:
                    print("Failed to download image after multiple retries, skipping.")
        return None