Move Function

vincent 2023-08-19 23:39:18 +08:00
parent 197043d048
commit dacae684d7
9 changed files with 207 additions and 111 deletions

16 .vscode/launch.json vendored Normal file

@@ -0,0 +1,16 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Current File",
            "type": "python",
            "request": "launch",
            "program": "main.py",
            "console": "integratedTerminal",
            "justMyCode": true
        }
    ]
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

194 main.py

@@ -1,127 +1,98 @@
import os
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
import io
from web_img import WebImg
from requester import Requester
from web_parser import WebParser


def get_html(url, file_name, max_retries=3):
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

# def fetch_image(img_url: str, max_retries=5):
#     """
#     Download image content from the given image URL.
    try:
        response = session.get(url)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error occurred while fetching HTML from {url}: {e}")
        raise Exception(f"{file_name}, failed to fetch the page HTML")

#     Args:
#         img_url (str): URL of the image.
#         max_retries (int, optional): Maximum number of retries when the download fails. Defaults to 5.
#     Returns:
#         bytes or None: The image's binary data on success, or None if the download fails.
#     Notes:
#         This function downloads an image file via an HTTP request, using the `requests` library to fetch the response for the URL.
#         On success, it returns the image's binary content (bytes).
#         On failure, it retries up to `max_retries` times until it succeeds or hits the retry limit.
#         Between retries, it prints an error message to show retry progress.
#         If the image still cannot be downloaded once the retries are exhausted, it prints a failure message and returns None.
#     Example:
#         ```
#         image_url = "https://example.com/image.jpg"
#         image_data = download_image(image_url)
#         if image_data:
#             # process the image data...
#         else:
#             print("Unable to download the image; download failed.")
#         ```
#     """
#     for retry in range(max_retries):
#         try:
#             with requests.get(img_url, stream=True) as response:
#                 response.raise_for_status()
#                 return response.content
#         except Exception as e:
#             if retry < max_retries - 1:
#                 print(
#                     f"Failed to download image, retrying ({retry+1}/{max_retries})..."
#                 )
#             else:
#                 print("Failed to download image after multiple retries, skipping.")
#                 return None


def download_image(img_url, max_retries=5):
    """
    Download image content from the given image URL.
    Args:
        img_url (str): URL of the image.
        max_retries (int, optional): Maximum number of retries when the download fails. Defaults to 5.
    Returns:
        bytes or None: The image's binary data on success, or None if the download fails.
    Notes:
        This function downloads an image file via an HTTP request, using the `requests` library to fetch the response for the URL.
        On success, it returns the image's binary content (bytes).
        On failure, it retries up to `max_retries` times until it succeeds or hits the retry limit.
        Between retries, it prints an error message to show retry progress.
        If the image still cannot be downloaded once the retries are exhausted, it prints a failure message and returns None.
    Example:
        ```
        image_url = "https://example.com/image.jpg"
        image_data = download_image(image_url)
        if image_data:
            # process the image data...
        else:
            print("Unable to download the image; download failed.")
        ```
    """
    for retry in range(max_retries):
        try:
            with requests.get(img_url, stream=True) as response:
                response.raise_for_status()
                return response.content
        except Exception as e:
            if retry < max_retries - 1:
                print(
                    f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                )
            else:
                print("Failed to download image after multiple retries, skipping.")
                return None


def get_img_urls(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    img_tags = soup.find("div", class_="reading-content").find_all("img")
    img_urls = []
    for img_tag in img_tags:
        img_url = img_tag.attrs["data-src"]
        img_urls.append(img_url)
    return img_urls


def create_img_obj_list(img_url_list, file_name):
def create_web_img_list(img_url_list, task_name):
    img_obj_list = []
    for url in img_url_list:
        obj = dict()
        obj["file_name"] = file_name
        obj["url"] = url
        obj["data"] = None
        img_obj_list.append(obj)
        img = WebImg(task_name, url)
        img_obj_list.append(img)
    return img_obj_list


def download_images_to_img_obj(img_obj):
    url = img_obj["url"]
    data = download_image(url)
    if data is None:
        file_name = img_obj["file_name"]
        print(f"{file_name}, image download failed")
        raise Exception(f"{file_name}, image download failed")
    img_obj["data"] = data

# def fetch_images_to_img_obj(web_img: WebImg):
#     url = web_img.url
#     data = fetch_image(url)
#     if data is None:
#         task_name = web_img.task_name
#         print(f"{task_name}, image download failed")
#         raise Exception(f"{task_name}, image download failed")
#     web_img.data = data


def batch_download_images_to_img_obj_list(img_obj_list):
    """
    Create a thread pool with ThreadPoolExecutor and call download_images_to_img_obj on each image object in img_obj_list.
# def batch_fetch_images_to_img_obj_list(web_img_list: list[WebImg]):
#     """
#     Create a thread pool with ThreadPoolExecutor and call fetch_images_to_img_obj on each image object in web_img_list.
    Args:
        img_obj_list (list): List of image objects; each holds the image's data and related information.
#     Args:
#         web_img_list (list): List of image objects; each holds the image's data and related information.
    Returns:
        None
    """
    with ThreadPoolExecutor() as executor:
        executor.map(download_images_to_img_obj, img_obj_list)
#     Returns:
#         None
#     """
#     with ThreadPoolExecutor() as executor:
#         executor.map(fetch_images_to_img_obj, web_img_list)


def concatenate_images_vertically(img_obj_list):
def concatenate_images_vertically(web_img_list: list[WebImg]):
    """
    Vertically stitch the images into one long image.
    """
    try:
        # Compute the stitched long image's width and total height
        max_width = max(
            Image.open(io.BytesIO(img_obj["data"])).width for img_obj in img_obj_list
            Image.open(io.BytesIO(web_img.data)).width for web_img in web_img_list
        )
        total_height = sum(
            Image.open(io.BytesIO(img_obj["data"])).height for img_obj in img_obj_list
            Image.open(io.BytesIO(web_img.data)).height for web_img in web_img_list
        )

        # Create a new long image
@@ -129,8 +100,8 @@ def concatenate_images_vertically(img_obj_list):
        # Paste the images one after another in the vertical direction
        y_offset = 0
        for img_obj in img_obj_list:
            img = Image.open(io.BytesIO(img_obj["data"]))
        for web_img in web_img_list:
            img = Image.open(io.BytesIO(web_img.data))
            img_width, img_height = img.size
            x_offset = (max_width - img_width) // 2  # center horizontally
            long_image.paste(img, (x_offset, y_offset))
@@ -139,27 +110,28 @@ def concatenate_images_vertically(img_obj_list):
        return long_image
    except Exception as e:
        file_name = img_obj_list[0]["file_name"]
        print(f"{file_name}, failed to stitch images: {e}")
        task_name = web_img_list[0].task_name
        print(f"{task_name}, failed to stitch images: {e}")
        return None


def pre_batch_task(lines):
def pre_batch_task(lines: list[str]):
    """
    The batch task run by each thread.
    """
    for line in lines:
        line = line.strip()  # strip leading and trailing whitespace
        if line:
            file_name, _, url = line.partition(" - ")  # parse out the task name and the URL
            print(f"{file_name}, starting download")
            html_content = get_html(url, file_name)
            img_url_list = get_img_urls(html_content)
            img_obj_list = create_img_obj_list(img_url_list, file_name)
            batch_download_images_to_img_obj_list(img_obj_list)
            long_image = concatenate_images_vertically(img_obj_list)  # vertically stitch into one long image
            long_image.save(f"output/{file_name}.png")  # save the long image locally
            print(f"{file_name}, done!!")
            requester = Requester()
            task_name, _, url = line.partition(" - ")  # parse out the task name and the URL
            print(f"{task_name}, starting download")
            html_content = requester.fetch_html(url, task_name)
            img_url_list = WebParser(html_content).parse_img_urls()
            web_img_list = create_web_img_list(img_url_list, task_name)
            requester.batch_fetch_images_to_img_obj_list(web_img_list)
            long_image = concatenate_images_vertically(web_img_list)  # vertically stitch into one long image
            long_image.save(f"output/{task_name}.png")  # save the long image locally
            print(f"{task_name}, done!!")


def read_lines_from_file(task_file):
@@ -167,7 +139,7 @@ def read_lines_from_file(task_file):
    Read all lines from a file and return them as a list.
    Args:
        file_name (str): The name of the file to read.
        task_file (file): The task file.
    Returns:
        lines (list): A list containing all lines in the file.
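
Read end to end, the refactored `pre_batch_task` is now a straight composition of the moved pieces. Below is a minimal sketch of one task's flow under the same assumptions the code above makes (an existing `output/` directory and task lines in the `name - url` format; the URL is a placeholder):

```
from requester import Requester
from web_parser import WebParser
from web_img import WebImg

# One task line in the "name - url" format parsed by pre_batch_task.
line = "chapter-01 - https://example.com/chapter/1"  # placeholder URL
task_name, _, url = line.strip().partition(" - ")

requester = Requester()
html = requester.fetch_html(url, task_name)          # raises on failure
img_urls = WebParser(html).parse_img_urls()          # data-src of each <img>
web_imgs = [WebImg(task_name, u) for u in img_urls]  # what create_web_img_list builds
requester.batch_fetch_images_to_img_obj_list(web_imgs)  # fills .data in parallel
# concatenate_images_vertically(web_imgs) then stitches the images, and the
# result is saved to output/{task_name}.png, as in pre_batch_task above.
```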

78 requester.py Normal file

@@ -0,0 +1,78 @@
import os
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
import io
from web_img import WebImg


class Requester:
    def fetch_html(self, url: str, task_name: str, max_retries=3):
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            response = session.get(url)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error occurred while fetching HTML from {url}: {e}")
            raise Exception(f"{task_name}, failed to fetch the page HTML")

    def fetch_image(self, img_url: str, max_retries=5):
        """
        Download image content from the given image URL.
        Args:
            img_url (str): URL of the image.
            max_retries (int, optional): Maximum number of retries when the download fails. Defaults to 5.
        Returns:
            bytes or None: The image's binary data on success, or None if the download fails.
        Notes:
            This method downloads an image file via an HTTP request, using the `requests` library to fetch the response for the URL.
            On success, it returns the image's binary content (bytes).
            On failure, it retries up to `max_retries` times until it succeeds or hits the retry limit.
            Between retries, it prints an error message to show retry progress.
            If the image still cannot be downloaded once the retries are exhausted, it prints a failure message and returns None.
        Example:
            ```
            image_url = "https://example.com/image.jpg"
            image_data = requester.fetch_image(image_url)
            if image_data:
                # process the image data...
            else:
                print("Unable to download the image; download failed.")
            ```
        """
        for retry in range(max_retries):
            try:
                with requests.get(img_url, stream=True) as response:
                    response.raise_for_status()
                    return response.content
            except Exception as e:
                if retry < max_retries - 1:
                    print(
                        f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                    )
                else:
                    print("Failed to download image after multiple retries, skipping.")
                    return None

    def fetch_images_to_img_obj(self, web_img: WebImg):
        url = web_img.url
        data = self.fetch_image(url)
        if data is None:
            task_name = web_img.task_name
            print(f"{task_name}, image download failed")
            raise Exception(f"{task_name}, image download failed")
        web_img.data = data

    def batch_fetch_images_to_img_obj_list(self, web_img_list: list[WebImg]):
        with ThreadPoolExecutor() as executor:
            executor.map(self.fetch_images_to_img_obj, web_img_list)
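
Worth noting: `fetch_html` gets transport-level retries from `HTTPAdapter` and raises on failure, while `fetch_image` retries by hand and returns `None` instead. A small sketch of exercising the class on its own (the task name and URL are placeholders):

```
from requester import Requester
from web_img import WebImg

requester = Requester()
img = WebImg("demo-task", "https://example.com/image.jpg")  # placeholder task/URL

# fetch_image returns None after exhausting its retries rather than raising,
# so standalone callers must check for None themselves.
data = requester.fetch_image(img.url, max_retries=2)
if data is None:
    print(f"{img.task_name}, image download failed")
else:
    img.data = data
    print(f"{img.task_name}, got {len(img.data)} bytes")
```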

7 test.py Normal file

@@ -0,0 +1,7 @@
from web_img import WebImg


def main():
    i = WebImg("name", "url.com")
    print(i)


main()

5 web_img.py Normal file

@@ -0,0 +1,5 @@
class WebImg:
    def __init__(self, task_name, url):
        self.task_name = task_name
        self.url = url
        self.data = None
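
Since WebImg is a plain value holder, the standard-library `dataclasses` module would express the same three fields with a generated `__init__` and a readable `__repr__`. A sketch of that alternative (not what this commit ships):

```
from dataclasses import dataclass
from typing import Optional


@dataclass
class WebImg:
    task_name: str
    url: str
    data: Optional[bytes] = None  # filled in once the image bytes are fetched
```

With this variant, `print(i)` in test.py would show `WebImg(task_name='name', url='url.com', data=None)` instead of a bare object address.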

18 web_parser.py Normal file

@@ -0,0 +1,18 @@
from bs4 import BeautifulSoup
from web_img import WebImg


class WebParser:
    def __init__(self, html_content: str):
        self.html_content = html_content

    def parse_img_urls(self):
        soup = BeautifulSoup(self.html_content, "html.parser")
        img_tags = soup.find("div", class_="reading-content").find_all("img")
        img_urls = []
        for img_tag in img_tags:
            img_url = img_tag.attrs["data-src"]
            img_urls.append(img_url)
        return img_urls
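
The parser expects the markup shape used by lazy-loading readers: a `div.reading-content` wrapper whose `<img>` tags carry the real source in `data-src`. A self-contained check against made-up HTML in that shape:

```
from web_parser import WebParser

html = """
<div class="reading-content">
  <img data-src="https://example.com/page-1.jpg">
  <img data-src="https://example.com/page-2.jpg">
</div>
"""

# Note: parse_img_urls raises AttributeError if the wrapper div is missing,
# since soup.find() returns None in that case.
print(WebParser(html).parse_img_urls())
# ['https://example.com/page-1.jpg', 'https://example.com/page-2.jpg']
```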