From 317fb272f555543a57305372325212dd02d7305b Mon Sep 17 00:00:00 2001
From: vincent
Date: Fri, 21 Jul 2023 13:34:49 +0800
Subject: [PATCH] Modify backup file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main copy.py | 109 ------------------------------
 main.py.bak  | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+), 109 deletions(-)
 delete mode 100644 main copy.py
 create mode 100644 main.py.bak

diff --git a/main copy.py b/main copy.py
deleted file mode 100644
index ddb6f90..0000000
--- a/main copy.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import os
-import requests
-import base64
-from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor
-
-
-def get_html(url, max_retries=3):
-    session = requests.Session()
-    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
-    session.mount("http://", adapter)
-    session.mount("https://", adapter)
-
-    try:
-        response = session.get(url)
-        response.raise_for_status()
-        return response.text
-    except Exception as e:
-        print(f"Error occurred while fetching HTML from {url}: {e}")
-        return None
-
-
-def save_html_to_file(html_content, file_path):
-    try:
-        with open(file_path, "w", encoding="utf-8") as file:
-            file.write(html_content)
-        print(f"HTML content saved to {file_path} successfully!")
-    except IOError as e:
-        print(f"Error occurred while saving HTML content: {e}")
-
-
-def download_image(img_url):
-    MAX_RETRY = 3
-    for retry in range(MAX_RETRY):
-        try:
-            with requests.get(img_url, stream=True) as response:
-                response.raise_for_status()
-                return response.content
-        except Exception as e:
-            if retry < MAX_RETRY - 1:
-                print(f"Failed to download image, retrying ({retry+1}/{MAX_RETRY})...")
-            else:
-                print("Failed to download image after multiple retries, skipping.")
-                return None
-
-
-def replace_img_with_base64(soup):
-    img_tag_list = soup.find_all("img")
-    for img_tag in img_tag_list:
-        if "data-src" in img_tag.attrs:
-            img_url = img_tag["data-src"]
-            try:
-                response = requests.get(img_url)
-
-                if response.status_code == 200:
-                    img_data = response.content
-                    img_base64 = base64.b64encode(img_data).decode("utf-8")
-                    img_tag["src"] = f"data:image/png;base64,{img_base64}"
-            except Exception as e:
-                print(f"Error occurred while fetching image: {e}")
-
-
-def replace_img_with_base64(img_tag):
-    if "data-src" in img_tag.attrs:
-        img_url = img_tag["data-src"]
-        img_data = download_image(img_url)
-        if img_data is not None:
-            img_base64 = base64.b64encode(img_data).decode("utf-8")
-            img_tag["src"] = f"data:image/png;base64,{img_base64}"
-
-
-def process_batch(lines):
-    for line in lines:
-        line = line.strip()  # strip leading/trailing whitespace from the line
-        if line:
-            # parse the HTML file name and the URL from the line
-            file_name, _, url = line.partition(" - ")
-
-            html_content = get_html(url)
-            if html_content:
-                soup = BeautifulSoup(html_content, "html.parser")
-
-                # replace_img_with_base64(soup)
-
-                img_tag_list = soup.find_all("img")
-                # create a thread pool with ThreadPoolExecutor
-                with ThreadPoolExecutor() as executor:
-                    # download and replace the images concurrently
-                    executor.map(replace_img_with_base64, img_tag_list)
-
-                # save as the specified HTML file
-                file_path = f"{file_name}.html"
-                modified_html = soup.prettify()
-                save_html_to_file(modified_html, file_path)
-
-
-if __name__ == "__main__":
-    file_name = "input.txt"
-    batch_size = 3  # number of lines handled per batch
-
-    with open(file_name, "r", encoding="utf-8") as file:
-        lines = file.readlines()
-
-    # create a thread pool with ThreadPoolExecutor
-    with ThreadPoolExecutor() as executor:
-        # split the lines into batches of batch_size
-        for i in range(0, len(lines), batch_size):
-            batch_lines = lines[i : i + batch_size]
-            executor.submit(process_batch, batch_lines)
diff --git a/main.py.bak b/main.py.bak
new file mode 100644
index 0000000..fe24a9e
--- /dev/null
+++ b/main.py.bak
@@ -0,0 +1,187 @@
+import os
+import requests
+from bs4 import BeautifulSoup
+from concurrent.futures import ThreadPoolExecutor
+
+from PIL import Image
+import io
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+
+def get_html(url, max_retries=3):
+    session = requests.Session()
+    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    try:
+        response = session.get(url)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"Error occurred while fetching HTML from {url}: {e}")
+        raise Exception("Failed to fetch the page HTML")
+
+
+def download_image(img_url, max_retries=5):
+    for retry in range(max_retries):
+        try:
+            with requests.get(img_url, stream=True) as response:
+                response.raise_for_status()
+                return response.content
+        except Exception as e:
+            if retry < max_retries - 1:
+                print(
+                    f"Failed to download image, retrying ({retry+1}/{max_retries})..."
+                )
+            else:
+                print("Failed to download image after multiple retries, skipping.")
+                return None
+
+
+def get_img_urls(html_content):
+    soup = BeautifulSoup(html_content, "html.parser")
+    img_tags = soup.find("div", class_="reading-content").find_all("img")
+
+    img_urls = []
+    for img_tag in img_tags:
+        img_url = img_tag.attrs["data-src"]
+        img_urls.append(img_url)
+    return img_urls
+
+
+def create_img_obj_list(img_url_list):
+    img_obj_list = []
+    for url in img_url_list:
+        obj = dict()
+        obj["url"] = url
+        obj["data"] = None
+        img_obj_list.append(obj)
+
+    return img_obj_list
+
+
+def set_img_obj_data(img_obj):
+    url = img_obj["url"]
+    data = download_image(url)
+    if data is None:
+        raise Exception("Failed to download image")
+    img_obj["data"] = data
+
+
+def save_images_to_directory(img_obj_list, directory_path):
+    try:
+        # create the target directory if it does not already exist
+        os.makedirs(directory_path, exist_ok=True)
+
+        for idx, img_obj in enumerate(img_obj_list):
+            url = img_obj["url"]
+            data = img_obj["data"]
+
+            # get the image extension (assuming the URL ends with one)
+            extension = os.path.splitext(url)[1]
+
+            # name the image file by its index
+            file_name = f"image_{idx}{extension}"
+            file_path = os.path.join(directory_path, file_name)
+
+            # write the image data to a local file
+            with open(file_path, "wb") as file:
+                file.write(data)
+
+        print("Images saved successfully!")
+    except Exception as e:
+        print(f"Failed to save images: {e}")
+
+
+def generate_pdf_from_images(img_obj_list, output_file):
+    try:
+        c = canvas.Canvas(output_file, pagesize=letter)
+
+        for img_obj in img_obj_list:
+            # build an image object from the data field of the image dict
+            img_data = img_obj["data"]
+            img = Image.open(io.BytesIO(img_data))
+
+            # scale the image to fit the PDF page size
+            img_width, img_height = img.size
+            pdf_width, pdf_height = letter
+            scale = min(pdf_width / img_width, pdf_height / img_height)
+            new_width, new_height = int(img_width * scale), int(img_height * scale)
+            img = img.resize((new_width, new_height), Image.LANCZOS)
+
+            # draw the image onto the PDF page
+            c.drawInlineImage(img, 0, 0, width=new_width, height=new_height)
+
+            # start a new page
+            c.showPage()
+
+        c.save()
+        print("PDF generated successfully!")
+    except Exception as e:
+        print(f"Failed to generate PDF: {e}")
+
+
+def concatenate_images_vertically(img_obj_list, output_file):
+    try:
+        # compute the width and total height of the concatenated image
+        max_width = max(
+            Image.open(io.BytesIO(img_obj["data"])).width for img_obj in img_obj_list
+        )
+        total_height = sum(
+            Image.open(io.BytesIO(img_obj["data"])).height for img_obj in img_obj_list
+        )
+
+        # create a new blank long image
+        long_image = Image.new("RGB", (max_width, total_height), color=(255, 255, 255))
+
+        # paste the images one below another
+        y_offset = 0
+        for img_obj in img_obj_list:
+            img = Image.open(io.BytesIO(img_obj["data"]))
+            img_width, img_height = img.size
+            x_offset = (max_width - img_width) // 2  # center horizontally
+            long_image.paste(img, (x_offset, y_offset))
+            y_offset += img_height
+
+        # save the concatenated long image locally
+        long_image.save(output_file)
+
+    except Exception as e:
+        print(f"Failed to concatenate images: {e}")
+        return None
+
+
+def process_batch(lines):
+    for line in lines:
+        line = line.strip()  # strip leading/trailing whitespace from the line
+        if line:
+            # parse the file name and the URL from the line
+            file_name, _, url = line.partition(" - ")
+            html_content = get_html(url)
+            img_url_list = get_img_urls(html_content)
+            img_obj_list = create_img_obj_list(img_url_list)
+
+            # create a thread pool with ThreadPoolExecutor
+            with ThreadPoolExecutor() as executor:
+                # download the images concurrently
+                executor.map(set_img_obj_data, img_obj_list)
+
+            # save_images_to_directory(img_obj_list, directory_path="imgs")
+            concatenate_images_vertically(img_obj_list, output_file=f"imgs/{file_name}.pdf")
+
+
+if __name__ == "__main__":
+    file_name = "input.txt"
+    batch_size = 3  # number of lines handled per batch
+
+    with open(file_name, "r", encoding="utf-8") as file:
+        lines = file.readlines()
+
+    # create a thread pool with ThreadPoolExecutor
+    with ThreadPoolExecutor() as executor:
+        # split the lines into batches of batch_size
+        for i in range(0, len(lines), batch_size):
+            batch_lines = lines[i : i + batch_size]
+            executor.submit(process_batch, batch_lines)
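
Note on the expected input: process_batch in main.py.bak splits each non-empty line of input.txt on the first " - ", taking the left part as the output file name and the right part as the page URL. A minimal sketch of input.txt, with hypothetical names and URLs:

    chapter-01 - https://example.com/comic/chapter-1
    chapter-02 - https://example.com/comic/chapter-2

Each line produces imgs/<file_name>.pdf via concatenate_images_vertically. On this code path nothing creates the imgs/ directory (save_images_to_directory, which does call os.makedirs, is commented out), so imgs/ must exist beforehand, or an os.makedirs("imgs", exist_ok=True) can be added before the save.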
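
For reference, a sketch of driving the new pipeline for a single page, using only the functions defined in main.py.bak (the URL and output name are placeholders):

    # assumes the definitions from main.py.bak are in scope
    html = get_html("https://example.com/comic/chapter-1")  # raises on failure
    urls = get_img_urls(html)       # data-src of each <img> under .reading-content
    objs = create_img_obj_list(urls)  # [{"url": ..., "data": None}, ...]
    for obj in objs:
        set_img_obj_data(obj)       # sequential variant of the executor.map call
    concatenate_images_vertically(objs, output_file="imgs/chapter-01.pdf")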