"""Helpers for downloading a web page and extracting its image URLs."""

from typing import List, Optional

import requests
from bs4 import BeautifulSoup

# Seconds to wait on the network before giving up. Without an explicit
# timeout, requests may block indefinitely on an unresponsive server.
DEFAULT_TIMEOUT = 30


class HtmlParser:
    """Extract image URLs from the HTML of a reader page."""

    def __init__(self, html_content: str):
        """Store the raw HTML to be parsed.

        Args:
            html_content: Full HTML document as text.
        """
        self.html_content = html_content

    def get_img_url_list(self) -> List[str]:
        """Return the ``data-src`` value of each image in the reading area.

        Only ``<img>`` tags inside the first ``<div class="reading-content">``
        are considered, in document order. Values are returned exactly as
        they appear in the markup (no whitespace stripping).

        Returns:
            A list of ``data-src`` values. The list is empty when the page
            has no ``reading-content`` container; images without a
            ``data-src`` attribute are skipped.
        """
        soup = BeautifulSoup(self.html_content, "html.parser")
        container = soup.find("div", class_="reading-content")
        if container is None:
            # find() returns None when nothing matches; chaining .find_all()
            # onto it would raise AttributeError.
            return []
        return [
            img_tag["data-src"]
            for img_tag in container.find_all("img")
            if img_tag.has_attr("data-src")
        ]


class Requester:
    """Thin wrapper around ``requests`` for fetching pages and images."""

    def fetch_html(
        self,
        url: str,
        task_name: str,
        max_retries=3,
        *,
        timeout: float = DEFAULT_TIMEOUT,
    ) -> str:
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            task_name: Label included in the raised error message so the
                caller can tell which task failed.
            max_retries: Retry setting handed to the transport adapter.
                Per the requests documentation this covers failed
                connections only, not HTTP error statuses.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded response body.

        Raises:
            Exception: If the request fails or the server answers with an
                HTTP error status. The original error is attached as the
                cause.
        """
        # The context manager closes the session's pooled connections even
        # when the request fails.
        with requests.Session() as session:
            adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            try:
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                return response.text
            except Exception as e:
                # Deliberately broad: every failure is reported to the caller
                # as one uniform, task-labelled error.
                print(f"Error occurred while fetching HTML from {url}: {e}")
                raise Exception(f"{task_name}, 获取网页html失败") from e

    def fetch_image(
        self,
        img_url: str,
        max_retries=5,
        *,
        timeout: float = DEFAULT_TIMEOUT,
    ) -> Optional[bytes]:
        """Download the image at ``img_url``.

        Args:
            img_url: Address of the image.
            max_retries: Maximum number of download attempts. Defaults to 5.
            timeout: Seconds to wait for the server on each attempt.

        Returns:
            The image's binary content on success, or ``None`` when every
            attempt failed (or when ``max_retries`` is not positive).
        """
        for retry in range(max_retries):
            try:
                # The whole body is needed at once, so streaming is not used.
                with requests.get(img_url, timeout=timeout) as response:
                    response.raise_for_status()
                    return response.content
            except Exception:
                # Deliberately broad: this method is best-effort and signals
                # failure by returning None rather than raising.
                if retry < max_retries - 1:
                    print(
                        f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                    )
                else:
                    print("Failed to download image after multiple retries, skipping.")
                    return None
        # Reached only when max_retries <= 0 and no attempt was made.
        return None