import os
import io
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from PIL import Image

from web_img import WebImg


class Requester:
    """HTTP helper that fetches HTML pages and downloads images, with retries."""

    def fetch_html(self, url: str, task_name: str, max_retries: int = 3) -> str:
        """Fetch the HTML body of *url*.

        Args:
            url: Page URL to fetch.
            task_name: Label used in the error message if fetching fails.
            max_retries: Connection-level retry count applied via HTTPAdapter.

        Returns:
            The response body as text.

        Raises:
            Exception: If the request fails or returns a non-2xx status.
        """
        session = requests.Session()
        # HTTPAdapter(max_retries=...) retries at the connection level only;
        # HTTP error statuses are handled by raise_for_status below.
        adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            response = session.get(url)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error occurred while fetching HTML from {url}: {e}")
            # Chain the original error so the root cause stays visible.
            raise Exception(f"{task_name}, 获取网页html失败") from e
        finally:
            # Close the session so pooled connections are released
            # (the original leaked it).
            session.close()

    def fetch_image(self, img_url: str, max_retries: int = 5) -> bytes | None:
        """Download the image at *img_url*.

        Args:
            img_url: URL of the image.
            max_retries: Maximum number of attempts before giving up.
                Defaults to 5.

        Returns:
            The image content as bytes on success, or ``None`` if every
            attempt failed.

        Notes:
            A progress message is printed between attempts, and a final
            failure message is printed when all retries are exhausted.

        Example:
            ```
            image_url = "https://example.com/image.jpg"
            image_data = requester.fetch_image(image_url)
            if image_data:
                # process the image data...
            else:
                print("Failed to download the image.")
            ```
        """
        for retry in range(max_retries):
            try:
                with requests.get(img_url, stream=True) as response:
                    response.raise_for_status()
                    return response.content
            except Exception as e:
                if retry < max_retries - 1:
                    print(
                        f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                    )
                else:
                    print("Failed to download image after multiple retries, skipping.")
                    return None

    def fetch_images_to_img_obj(self, web_img: WebImg) -> None:
        """Download the image for *web_img* and store it on ``web_img.data``.

        Raises:
            Exception: If the download ultimately fails.
        """
        url = web_img.url
        data = self.fetch_image(url)
        if data is None:
            task_name = web_img.task_name
            print(f"{task_name}, 下载图片失败")
            raise Exception(f"{task_name}, 下载图片失败")
        web_img.data = data

    def batch_fetch_images_to_img_obj_list(self, web_img_list: list[WebImg]) -> None:
        """Download all images in *web_img_list* concurrently.

        Raises:
            Exception: Re-raised from the first failing download.
        """
        with ThreadPoolExecutor() as executor:
            # Materialize the map results: executor.map is lazy, and without
            # consuming it, worker exceptions (e.g. a failed download) would
            # be silently discarded instead of propagating to the caller.
            list(executor.map(self.fetch_images_to_img_obj, web_img_list))