commit f9dff30eacd92d613176f091948d6883792001ed
Author: vincent
Date:   Fri Jul 21 10:06:59 2023 +0800

    backup

diff --git a/main copy.py b/main copy.py
new file mode 100644
index 0000000..ddb6f90
--- /dev/null
+++ b/main copy.py
@@ -0,0 +1,93 @@
+import base64
+from concurrent.futures import ThreadPoolExecutor
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_html(url, max_retries=3):
+    session = requests.Session()
+    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    try:
+        response = session.get(url, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"Error occurred while fetching HTML from {url}: {e}")
+        return None
+
+
+def save_html_to_file(html_content, file_path):
+    try:
+        with open(file_path, "w", encoding="utf-8") as file:
+            file.write(html_content)
+        print(f"HTML content saved to {file_path} successfully!")
+    except IOError as e:
+        print(f"Error occurred while saving HTML content: {e}")
+
+
+def download_image(img_url):
+    MAX_RETRY = 3
+    for retry in range(MAX_RETRY):
+        try:
+            with requests.get(img_url, stream=True, timeout=10) as response:
+                response.raise_for_status()
+                return response.content
+        except Exception as e:
+            if retry < MAX_RETRY - 1:
+                print(f"Failed to download image ({e}), retrying ({retry+1}/{MAX_RETRY})...")
+            else:
+                print(f"Failed to download image after {MAX_RETRY} retries, skipping: {img_url}")
+                return None
+
+
+def replace_img_with_base64(img_tag):
+    # Only lazy-loaded images that carry a data-src attribute are handled;
+    # the data URI assumes an image/png MIME type.
+    if "data-src" in img_tag.attrs:
+        img_url = img_tag["data-src"]
+        img_data = download_image(img_url)
+        if img_data is not None:
+            img_base64 = base64.b64encode(img_data).decode("utf-8")
+            img_tag["src"] = f"data:image/png;base64,{img_base64}"
+
+
+def process_batch(lines):
+    for line in lines:
+        line = line.strip()  # strip leading and trailing whitespace
+        if line:
+            # Parse the HTML file name and the URL from the line
+            file_name, _, url = line.partition(" - ")
+
+            html_content = get_html(url)
+            if html_content:
+                soup = BeautifulSoup(html_content, "html.parser")
+
+                img_tag_list = soup.find_all("img")
+                # Create a thread pool with ThreadPoolExecutor
+                with ThreadPoolExecutor() as executor:
+                    # Download and inline the images concurrently
+                    executor.map(replace_img_with_base64, img_tag_list)
+
+                # Save the result as the specified HTML file
+                file_path = f"{file_name}.html"
+                modified_html = soup.prettify()
+                save_html_to_file(modified_html, file_path)
+
+
+if __name__ == "__main__":
+    file_name = "input.txt"
+    batch_size = 3  # number of lines handled per batch
+
+    with open(file_name, "r", encoding="utf-8") as file:
+        lines = file.readlines()
+
+    # Create a thread pool with ThreadPoolExecutor
+    with ThreadPoolExecutor() as executor:
+        # Split the lines into batches of batch_size and submit each batch
+        for i in range(0, len(lines), batch_size):
+            batch_lines = lines[i : i + batch_size]
+            executor.submit(process_batch, batch_lines)
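
Note: process_batch expects each line of input.txt to have the form "<file name> - <url>" (inferred from the line.partition(" - ") call), and writes the result to "<file name>.html". A hypothetical input.txt might look like:

    article one - https://example.com/posts/1
    article two - https://example.com/posts/2
    article three - https://example.com/posts/3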
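
Note: replace_img_with_base64 hardcodes image/png in the data URI even when the downloaded image is a JPEG or GIF; most browsers sniff the payload and render it anyway. A stricter variant could take the MIME type from the server-reported Content-Type header. A minimal sketch, not part of this commit (image_to_data_uri is a hypothetical helper):

    import base64

    import requests


    def image_to_data_uri(img_url, timeout=10):
        # Build a data: URI using the Content-Type reported by the server,
        # falling back to image/png when the header is missing.
        response = requests.get(img_url, timeout=timeout)
        response.raise_for_status()
        mime = response.headers.get("Content-Type", "image/png").split(";")[0].strip()
        encoded = base64.b64encode(response.content).decode("utf-8")
        return f"data:{mime};base64,{encoded}"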