import os  # NOTE(review): unused in this file, kept in case an external tool relies on it
import base64
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

# One timeout for every HTTP request so a hung server cannot stall a worker
# thread forever (the original code had no timeouts at all).
REQUEST_TIMEOUT = 30


def get_html(url, max_retries=3):
    """Fetch *url* and return the response body as text, or None on failure.

    A dedicated Session is mounted with an HTTPAdapter so that transient
    connection errors are retried up to *max_retries* times by urllib3.

    Args:
        url: The page URL to download.
        max_retries: Connection-level retry count passed to HTTPAdapter.

    Returns:
        The decoded response text, or ``None`` if the request failed.
    """
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error occurred while fetching HTML from {url}: {e}")
        return None
    finally:
        # Bug fix: the original leaked the session; close it so pooled
        # connections are released.
        session.close()


def save_html_to_file(html_content, file_path):
    """Write *html_content* to *file_path* as UTF-8 text.

    IO errors are reported on stdout rather than raised, matching the
    best-effort style of the rest of this script.
    """
    try:
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html_content)
        print(f"HTML content saved to {file_path} successfully!")
    except IOError as e:
        print(f"Error occurred while saving HTML content: {e}")


def download_image(img_url):
    """Download *img_url*, retrying up to 3 times.

    Returns:
        The raw image bytes, or ``None`` after all retries fail.
    """
    MAX_RETRY = 3
    for retry in range(MAX_RETRY):
        try:
            with requests.get(img_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                return response.content
        except Exception:
            if retry < MAX_RETRY - 1:
                print(f"Failed to download image, retrying ({retry+1}/{MAX_RETRY})...")
            else:
                print("Failed to download image after multiple retries, skipping.")
                return None


def replace_img_with_base64(img_tag):
    """Inline the image referenced by *img_tag*'s ``data-src`` as a data URI.

    Tags without a ``data-src`` attribute are left untouched, and a failed
    download simply leaves the tag's ``src`` unchanged.

    Note: the original file defined this function twice; the first
    (soup-wide, synchronous) version was dead code shadowed by this one and
    has been removed.
    """
    if "data-src" in img_tag.attrs:
        img_url = img_tag["data-src"]
        img_data = download_image(img_url)
        if img_data is not None:
            img_base64 = base64.b64encode(img_data).decode("utf-8")
            # NOTE(review): the MIME type is hard-coded to PNG even though the
            # source format is unknown; browsers generally sniff the real type.
            img_tag["src"] = f"data:image/png;base64,{img_base64}"


def process_batch(lines):
    """Process a batch of ``"<file name> - <url>"`` lines.

    For each non-empty line: fetch the page, inline all images concurrently,
    and save the result as ``<file name>.html``.
    """
    for line in lines:
        line = line.strip()  # drop leading/trailing whitespace
        if not line:
            continue
        # Parse the HTML file name and the URL out of the line.
        file_name, sep, url = line.partition(" - ")
        if not sep:
            # Robustness fix: a malformed line used to yield url == "" and a
            # guaranteed failed request; skip it explicitly instead.
            print(f"Skipping malformed line (missing ' - '): {line}")
            continue
        html_content = get_html(url)
        if not html_content:
            continue
        soup = BeautifulSoup(html_content, "html.parser")
        img_tag_list = soup.find_all("img")
        # Download and inline every image concurrently.
        with ThreadPoolExecutor() as executor:
            executor.map(replace_img_with_base64, img_tag_list)
        # Save the modified document under the requested name.
        save_html_to_file(soup.prettify(), f"{file_name}.html")


if __name__ == "__main__":
    input_path = "input.txt"
    batch_size = 3  # number of lines each submitted task handles
    with open(input_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
    # Fan the batches out over a thread pool.
    with ThreadPoolExecutor() as executor:
        for i in range(0, len(lines), batch_size):
            executor.submit(process_batch, lines[i : i + batch_size])