This commit is contained in:
vincent 2023-07-21 10:06:59 +08:00
commit f9dff30eac

109
main copy.py Normal file
View File

@ -0,0 +1,109 @@
import os
import requests
import base64
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
def get_html(url, max_retries=3, timeout=None):
    """Fetch the HTML body of *url*, with connection-level retries.

    Parameters:
        url: Page to download.
        max_retries: Retry count handed to the HTTPAdapter mounted for
            both http:// and https:// (connection-level retries only).
        timeout: Optional per-request timeout in seconds. Default None
            preserves the original unbounded behaviour; pass a value to
            avoid hanging forever on a stalled server.

    Returns:
        The response text on success, or None on any error (the error
        is printed, not raised).
    """
    # Use the session as a context manager so its connection pool is
    # closed even on failure — the original leaked the session.
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        try:
            response = session.get(url, timeout=timeout)
            # Turn 4xx/5xx statuses into exceptions handled below.
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error occurred while fetching HTML from {url}: {e}")
            return None
def save_html_to_file(html_content, file_path):
    """Write *html_content* to *file_path* as UTF-8 text.

    On success a confirmation message is printed; an IOError is caught
    and reported rather than propagated, so callers never see an
    exception from this function.
    """
    try:
        with open(file_path, mode="w", encoding="utf-8") as out_file:
            out_file.write(html_content)
    except IOError as err:
        print(f"Error occurred while saving HTML content: {err}")
    else:
        # Only announce success when the write actually completed.
        print(f"HTML content saved to {file_path} successfully!")
def download_image(img_url, max_retries=3, timeout=None):
    """Download raw image bytes from *img_url*, retrying on any failure.

    Parameters:
        img_url: URL of the image to fetch.
        max_retries: Number of attempts before giving up. Replaces the
            previously hard-coded MAX_RETRY constant; default 3 keeps
            the original behaviour.
        timeout: Optional per-request timeout in seconds. Default None
            preserves the original unbounded behaviour.

    Returns:
        The image bytes on success, or None after all attempts fail.
    """
    for attempt in range(max_retries):
        try:
            # stream=True defers the body download until .content is read;
            # the with-block guarantees the connection is released.
            with requests.get(img_url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                return response.content
        except Exception:
            if attempt < max_retries - 1:
                print(f"Failed to download image, retrying ({attempt+1}/{max_retries})...")
            else:
                print("Failed to download image after multiple retries, skipping.")
                return None
def replace_img_with_base64(soup):
    """Inline every lazy-loaded <img> in *soup* as a base64 data URI.

    NOTE(review): this definition is DEAD CODE — it is shadowed by the
    later function of the same name (the per-tag variant defined below),
    so this soup-level variant is never the one invoked at runtime.
    Consider removing it or renaming one of the two.
    """
    img_tag_list = soup.find_all("img")
    for img_tag in img_tag_list:
        # Lazy-loaded images keep their real URL in "data-src".
        if "data-src" in img_tag.attrs:
            img_url = img_tag["data-src"]
            try:
                response = requests.get(img_url)
                if response.status_code == 200:
                    img_data = response.content
                    img_base64 = base64.b64encode(img_data).decode("utf-8")
                    # NOTE(review): MIME type is hard-coded to PNG; the
                    # actual content type is never checked.
                    img_tag["src"] = f"data:image/png;base64,{img_base64}"
            except Exception as e:
                print(f"Error occurred while fetching image: {e}")
def replace_img_with_base64(img_tag):
    """Replace one lazy-loaded <img> tag's src with an inline data URI.

    Tags without a "data-src" attribute are left untouched, as are tags
    whose image could not be downloaded (download_image returned None).
    """
    # Guard clauses instead of nested ifs: bail out early when there is
    # nothing to do.
    if "data-src" not in img_tag.attrs:
        return
    raw_bytes = download_image(img_tag["data-src"])
    if raw_bytes is None:
        return
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    img_tag["src"] = f"data:image/png;base64,{encoded}"
def process_batch(lines):
    """Fetch and save one batch of "name - url" entries.

    Each non-blank line is split on " - " into an output file name and a
    page URL. The page is downloaded, every <img> tag is inlined as a
    base64 data URI by a thread pool, and the prettified document is
    written to "<name>.html".
    """
    for raw_line in lines:
        entry = raw_line.strip()  # drop leading/trailing whitespace
        if not entry:
            continue
        # Split the entry into the HTML file name and the URL.
        target_name, _, page_url = entry.partition(" - ")
        page_html = get_html(page_url)
        if not page_html:
            continue
        soup = BeautifulSoup(page_html, "html.parser")
        # Download and inline every image concurrently; the with-block
        # waits for all workers before the document is serialized.
        with ThreadPoolExecutor() as pool:
            pool.map(replace_img_with_base64, soup.find_all("img"))
        save_html_to_file(soup.prettify(), f"{target_name}.html")
if __name__ == "__main__":
    # Script entry point: read the URL list and fan batches of lines
    # out to a thread pool, one process_batch task per batch.
    INPUT_FILE = "input.txt"
    BATCH_SIZE = 3  # lines handled by each worker task

    with open(INPUT_FILE, "r", encoding="utf-8") as src:
        all_lines = src.readlines()

    with ThreadPoolExecutor() as pool:
        for start in range(0, len(all_lines), BATCH_SIZE):
            pool.submit(process_batch, all_lines[start : start + BATCH_SIZE])