diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..f2a24d0 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "main.py", + "console": "integratedTerminal", + "justMyCode": true + } + ] +} \ No newline at end of file diff --git a/__pycache__/requester.cpython-310.pyc b/__pycache__/requester.cpython-310.pyc new file mode 100644 index 0000000..86d062f Binary files /dev/null and b/__pycache__/requester.cpython-310.pyc differ diff --git a/__pycache__/web_img.cpython-310.pyc b/__pycache__/web_img.cpython-310.pyc new file mode 100644 index 0000000..dd09ba5 Binary files /dev/null and b/__pycache__/web_img.cpython-310.pyc differ diff --git a/__pycache__/web_parser.cpython-310.pyc b/__pycache__/web_parser.cpython-310.pyc new file mode 100644 index 0000000..e33cb9c Binary files /dev/null and b/__pycache__/web_parser.cpython-310.pyc differ diff --git a/main.py b/main.py index 9b8df50..132c6f3 100644 --- a/main.py +++ b/main.py @@ -1,127 +1,98 @@ -import os -import requests -from bs4 import BeautifulSoup from concurrent.futures import ThreadPoolExecutor from PIL import Image import io +from web_img import WebImg +from requester import Requester +from web_parser import WebParser -def get_html(url, file_name, max_retries=3): - session = requests.Session() - adapter = requests.adapters.HTTPAdapter(max_retries=max_retries) - session.mount("http://", adapter) - session.mount("https://", adapter) +# def fetch_image(img_url: str, max_retries=5): +# """ +# 通过给定的图片URL下载图片内容。 - try: - response = session.get(url) - response.raise_for_status() - return response.text - except Exception as e: - print(f"Error occurred while fetching HTML from {url}: {e}") - raise Exception(f"{file_name}, 获取网页html失败") +# 参数: +# img_url (str): 图片的URL地址。 +# max_retries (int, 可选): 下载失败时的最大重试次数。默认为5次。 + +# 返回值: +# bytes or None: 成功下载图片的二进制数据,若下载失败则返回None。 + +# 注解: +# 这个函数通过发送HTTP请求下载图片文件。它使用`requests`库来获取URL返回的响应。 +# 如果下载成功,函数将返回图片的二进制内容(bytes格式)。 +# 如果下载失败,函数将尝试最多`max_retries`次重试,直到成功或达到重试次数上限。 +# 在每次重试之间,函数会打印错误消息来指示重试进度。 +# 如果重试次数用尽后仍然无法下载图片,函数将输出失败消息并返回None。 + +# 例子: +# ``` +# image_url = "https://example.com/image.jpg" +# image_data = download_image(image_url) +# if image_data: +# # 处理图片数据... +# else: +# print("无法下载图片,下载失败。") +# ``` +# """ +# for retry in range(max_retries): +# try: +# with requests.get(img_url, stream=True) as response: +# response.raise_for_status() +# return response.content +# except Exception as e: +# if retry < max_retries - 1: +# print( +# f"Failed to download image, retrying ({retry+1}/{max_retries})..." +# ) +# else: +# print("Failed to download image after multiple retries, skipping.") +# return None -def download_image(img_url, max_retries=5): - """ - 通过给定的图片URL下载图片内容。 - - 参数: - img_url (str): 图片的URL地址。 - max_retries (int, 可选): 下载失败时的最大重试次数。默认为5次。 - - 返回值: - bytes or None: 成功下载图片的二进制数据,若下载失败则返回None。 - - 注解: - 这个函数通过发送HTTP请求下载图片文件。它使用`requests`库来获取URL返回的响应。 - 如果下载成功,函数将返回图片的二进制内容(bytes格式)。 - 如果下载失败,函数将尝试最多`max_retries`次重试,直到成功或达到重试次数上限。 - 在每次重试之间,函数会打印错误消息来指示重试进度。 - 如果重试次数用尽后仍然无法下载图片,函数将输出失败消息并返回None。 - - 例子: - ``` - image_url = "https://example.com/image.jpg" - image_data = download_image(image_url) - if image_data: - # 处理图片数据... - else: - print("无法下载图片,下载失败。") - ``` - """ - for retry in range(max_retries): - try: - with requests.get(img_url, stream=True) as response: - response.raise_for_status() - return response.content - except Exception as e: - if retry < max_retries - 1: - print( - f"Failed to download image, retrying ({retry+1}/{max_retries})..." - ) - else: - print("Failed to download image after multiple retries, skipping.") - return None - - -def get_img_urls(html_content): - soup = BeautifulSoup(html_content, "html.parser") - img_tags = soup.find("div", class_="reading-content").find_all("img") - - img_urls = [] - for img_tag in img_tags: - img_url = img_tag.attrs["data-src"] - img_urls.append(img_url) - return img_urls - - -def create_img_obj_list(img_url_list, file_name): +def create_web_img_list(img_url_list, task_name): img_obj_list = [] for url in img_url_list: - obj = dict() - obj["file_name"] = file_name - obj["url"] = url - obj["data"] = None - img_obj_list.append(obj) + img = WebImg(task_name, url) + img_obj_list.append(img) return img_obj_list -def download_images_to_img_obj(img_obj): - url = img_obj["url"] - data = download_image(url) - if data is None: - file_name = img_obj["file_name"] - print(f"{file_name}, 下载图片失败") - raise Exception(f"{file_name}, 下载图片失败") - img_obj["data"] = data +# def fetch_images_to_img_obj(web_img: WebImg): +# url = web_img.url +# data = fetch_image(url) +# if data is None: +# task_name = web_img.task_name +# print(f"{task_name}, 下载图片失败") +# raise Exception(f"{task_name}, 下载图片失败") +# web_img.data = data -def batch_download_images_to_img_obj_list(img_obj_list): - """ - 使用 ThreadPoolExecutor 创建线程池,对 img_obj_list 中的每个图片对象调用 set_img_obj_data 函数。 +# def batch_fetch_images_to_img_obj_list(web_img_list: list[WebImg]): +# """ +# 使用 ThreadPoolExecutor 创建线程池,对 img_obj_list 中的每个图片对象调用 set_img_obj_data 函数。 - Args: - img_obj_list (list): 图片对象列表,每个对象包含图片的数据等信息。 +# Args: +# img_obj_list (list): 图片对象列表,每个对象包含图片的数据等信息。 - Returns: - None - """ - with ThreadPoolExecutor() as executor: - executor.map(download_images_to_img_obj, img_obj_list) +# Returns: +# None +# """ +# with ThreadPoolExecutor() as executor: +# executor.map(fetch_images_to_img_obj, web_img_list) -def concatenate_images_vertically(img_obj_list): +def concatenate_images_vertically(web_img_list: list[WebImg]): """ 垂直拼接长图片 """ try: # 计算拼接后的长图宽度和总高度 max_width = max( - Image.open(io.BytesIO(img_obj["data"])).width for img_obj in img_obj_list + Image.open(io.BytesIO(web_img.data)).width for web_img in web_img_list ) total_height = sum( - Image.open(io.BytesIO(img_obj["data"])).height for img_obj in img_obj_list + Image.open(io.BytesIO(web_img.data)).height for web_img in web_img_list ) # 创建一张新的长图 @@ -129,8 +100,8 @@ def concatenate_images_vertically(img_obj_list): # 依次将图片在垂直方向上拼接起来 y_offset = 0 - for img_obj in img_obj_list: - img = Image.open(io.BytesIO(img_obj["data"])) + for web_img in web_img_list: + img = Image.open(io.BytesIO(web_img.data)) img_width, img_height = img.size x_offset = (max_width - img_width) // 2 # 居中拼接 long_image.paste(img, (x_offset, y_offset)) @@ -139,27 +110,28 @@ def concatenate_images_vertically(img_obj_list): return long_image except Exception as e: - file_name = img_obj_list[0]["file_name"] - print(f"{file_name}, 拼接图片失败:{e}") + task_name = web_img_list[0].task_name + print(f"{task_name}, 拼接图片失败:{e}") return None -def pre_batch_task(lines): +def pre_batch_task(lines: list[str]): """ 每个线程的批次任务 """ for line in lines: line = line.strip() # 去掉每行开头和结尾的空白字符 if line: - file_name, _, url = line.partition(" - ") # 解析出 HTML 文件名和 URL 地址 - print(f"{file_name}, 开始下载") - html_content = get_html(url, file_name) - img_url_list = get_img_urls(html_content) - img_obj_list = create_img_obj_list(img_url_list, file_name) - batch_download_images_to_img_obj_list(img_obj_list) - long_image = concatenate_images_vertically(img_obj_list) # 垂直拼接长图片 - long_image.save(f"output/{file_name}.png") # 保存长图到本地 - print(f"{file_name}, 完成!!") + requester = Requester() + task_name, _, url = line.partition(" - ") # 解析出 HTML 文件名和 URL 地址 + print(f"{task_name}, 开始下载") + html_content = requester.fetch_html(url, task_name) + img_url_list = WebParser(html_content).parse_img_urls() + web_img_list = create_web_img_list(img_url_list, task_name) + requester.batch_fetch_images_to_img_obj_list(web_img_list) + long_image = concatenate_images_vertically(web_img_list) # 垂直拼接长图片 + long_image.save(f"output/{task_name}.png") # 保存长图到本地 + print(f"{task_name}, 完成!!") def read_lines_from_file(task_file): @@ -167,7 +139,7 @@ def read_lines_from_file(task_file): 从文件中读取所有行并返回一个包含行的列表。 参数: - file_name (str): 要读取的文件名。 + task_file (file): 任务文件。 返回值: lines (list): 包含文件中所有行的列表。 diff --git a/requester.py b/requester.py new file mode 100644 index 0000000..27cf08b --- /dev/null +++ b/requester.py @@ -0,0 +1,78 @@ +import os +import requests +from bs4 import BeautifulSoup +from concurrent.futures import ThreadPoolExecutor +from PIL import Image +import io +from web_img import WebImg + + +class Requester: + def fetch_html(self, url: str, task_name: str, max_retries=3): + session = requests.Session() + adapter = requests.adapters.HTTPAdapter(max_retries=max_retries) + session.mount("http://", adapter) + session.mount("https://", adapter) + + try: + response = session.get(url) + response.raise_for_status() + return response.text + except Exception as e: + print(f"Error occurred while fetching HTML from {url}: {e}") + raise Exception(f"{task_name}, 获取网页html失败") + + def fetch_image(self, img_url: str, max_retries=5): + """ + 通过给定的图片URL下载图片内容。 + + 参数: + img_url (str): 图片的URL地址。 + max_retries (int, 可选): 下载失败时的最大重试次数。默认为5次。 + + 返回值: + bytes or None: 成功下载图片的二进制数据,若下载失败则返回None。 + + 注解: + 这个函数通过发送HTTP请求下载图片文件。它使用`requests`库来获取URL返回的响应。 + 如果下载成功,函数将返回图片的二进制内容(bytes格式)。 + 如果下载失败,函数将尝试最多`max_retries`次重试,直到成功或达到重试次数上限。 + 在每次重试之间,函数会打印错误消息来指示重试进度。 + 如果重试次数用尽后仍然无法下载图片,函数将输出失败消息并返回None。 + + 例子: + ``` + image_url = "https://example.com/image.jpg" + image_data = download_image(image_url) + if image_data: + # 处理图片数据... + else: + print("无法下载图片,下载失败。") + ``` + """ + for retry in range(max_retries): + try: + with requests.get(img_url, stream=True) as response: + response.raise_for_status() + return response.content + except Exception as e: + if retry < max_retries - 1: + print( + f"Failed to download image, retrying ({retry+1}/{max_retries})..." + ) + else: + print("Failed to download image after multiple retries, skipping.") + return None + + def fetch_images_to_img_obj(self, web_img: WebImg): + url = web_img.url + data = self.fetch_image(url) + if data is None: + task_name = web_img.task_name + print(f"{task_name}, 下载图片失败") + raise Exception(f"{task_name}, 下载图片失败") + web_img.data = data + + def batch_fetch_images_to_img_obj_list(self, web_img_list: list[WebImg]): + with ThreadPoolExecutor() as executor: + executor.map(self.fetch_images_to_img_obj, web_img_list) diff --git a/test.py b/test.py new file mode 100644 index 0000000..b441c78 --- /dev/null +++ b/test.py @@ -0,0 +1,7 @@ +from web_img import WebImg + + +def main(): + i = WebImg("name", "url.com") + print(i) +main() \ No newline at end of file diff --git a/web_img.py b/web_img.py new file mode 100644 index 0000000..3550c7a --- /dev/null +++ b/web_img.py @@ -0,0 +1,5 @@ +class WebImg: + def __init__(self, file_name, url): + self.task_name = file_name + self.url = url + self.data = None diff --git a/web_parser.py b/web_parser.py new file mode 100644 index 0000000..39d23ea --- /dev/null +++ b/web_parser.py @@ -0,0 +1,18 @@ +from bs4 import BeautifulSoup +from web_img import WebImg + + +class WebParser: + def __init__(self, html_content: str): + self.html_content = html_content + + def parse_img_urls(self): + soup = BeautifulSoup(self.html_content, "html.parser") + img_tags = soup.find("div", class_="reading-content").find_all("img") + img_urls = [] + for img_tag in img_tags: + img_url = img_tag.attrs["data-src"] + img_urls.append(img_url) + return img_urls + +