import os  # NOTE(review): unused in this file, kept in case an external tool relies on it
import base64
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

# One timeout for every HTTP request so a hung server cannot stall a worker
# thread forever (the original code had no timeouts at all).
REQUEST_TIMEOUT = 30


def get_html(url, max_retries=3):
    """Fetch *url* and return the response body as text, or None on failure.

    A dedicated Session is mounted with an HTTPAdapter so that transient
    connection errors are retried up to *max_retries* times by urllib3.

    Args:
        url: The page URL to download.
        max_retries: Connection-level retry count passed to HTTPAdapter.

    Returns:
        The decoded response text, or ``None`` if the request failed.
    """
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error occurred while fetching HTML from {url}: {e}")
        return None
    finally:
        # Bug fix: the original leaked the session; close it so pooled
        # connections are released.
        session.close()


def save_html_to_file(html_content, file_path):
    """Write *html_content* to *file_path* as UTF-8 text.

    IO errors are reported on stdout rather than raised, matching the
    best-effort style of the rest of this script.
    """
    try:
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html_content)
        print(f"HTML content saved to {file_path} successfully!")
    except IOError as e:
        print(f"Error occurred while saving HTML content: {e}")


def download_image(img_url):
    """Download *img_url*, retrying up to 3 times.

    Returns:
        The raw image bytes, or ``None`` after all retries fail.
    """
    MAX_RETRY = 3
    for retry in range(MAX_RETRY):
        try:
            with requests.get(img_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                return response.content
        except Exception:
            if retry < MAX_RETRY - 1:
                print(f"Failed to download image, retrying ({retry+1}/{MAX_RETRY})...")
            else:
                print("Failed to download image after multiple retries, skipping.")
                return None


def replace_img_with_base64(img_tag):
    """Inline the image referenced by *img_tag*'s ``data-src`` as a data URI.

    Tags without a ``data-src`` attribute are left untouched, and a failed
    download simply leaves the tag's ``src`` unchanged.

    Note: the original file defined this function twice; the first
    (soup-wide, synchronous) version was dead code shadowed by this one and
    has been removed.
    """
    if "data-src" in img_tag.attrs:
        img_url = img_tag["data-src"]
        img_data = download_image(img_url)
        if img_data is not None:
            img_base64 = base64.b64encode(img_data).decode("utf-8")
            # NOTE(review): the MIME type is hard-coded to PNG even though the
            # source format is unknown; browsers generally sniff the real type.
            img_tag["src"] = f"data:image/png;base64,{img_base64}"


def process_batch(lines):
    """Process a batch of ``"<file name> - <url>"`` lines.

    For each non-empty line: fetch the page, inline all images concurrently,
    and save the result as ``<file name>.html``.
    """
    for line in lines:
        line = line.strip()  # drop leading/trailing whitespace
        if not line:
            continue
        # Parse the HTML file name and the URL out of the line.
        file_name, sep, url = line.partition(" - ")
        if not sep:
            # Robustness fix: a malformed line used to yield url == "" and a
            # guaranteed failed request; skip it explicitly instead.
            print(f"Skipping malformed line (missing ' - '): {line}")
            continue
        html_content = get_html(url)
        if not html_content:
            continue
        soup = BeautifulSoup(html_content, "html.parser")
        img_tag_list = soup.find_all("img")
        # Download and inline every image concurrently.
        with ThreadPoolExecutor() as executor:
            executor.map(replace_img_with_base64, img_tag_list)
        # Save the modified document under the requested name.
        save_html_to_file(soup.prettify(), f"{file_name}.html")


if __name__ == "__main__":
    input_path = "input.txt"
    batch_size = 3  # number of lines each submitted task handles
    with open(input_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
    # Fan the batches out over a thread pool.
    with ThreadPoolExecutor() as executor:
        for i in range(0, len(lines), batch_size):
            executor.submit(process_batch, lines[i : i + batch_size])