# comic_book_downloader/main.py

import io
import os
from concurrent.futures import ThreadPoolExecutor

from PIL import Image

from requester import Requester
from web_img import WebImg
from web_parser import WebParser


# def fetch_image(img_url: str, max_retries=5):
#     """
#     通过给定的图片URL下载图片内容。

#     参数：
#         img_url (str): 图片的URL地址。
#         max_retries (int, 可选): 下载失败时的最大重试次数。默认为5次。

#     返回值：
#         bytes or None: 成功下载图片的二进制数据，若下载失败则返回None。

#     注解：
#         这个函数通过发送HTTP请求下载图片文件。它使用`requests`库来获取URL返回的响应。
#         如果下载成功，函数将返回图片的二进制内容（bytes格式）。
#         如果下载失败，函数将尝试最多`max_retries`次重试，直到成功或达到重试次数上限。
#         在每次重试之间，函数会打印错误消息来指示重试进度。
#         如果重试次数用尽后仍然无法下载图片，函数将输出失败消息并返回None。

#         例子：
#         ```
#         image_url = "https://example.com/image.jpg"
#         image_data = fetch_image(image_url)
#         if image_data:
#             # 处理图片数据...
#         else:
#             print("无法下载图片，下载失败。")
#         ```
#     """
#     for retry in range(max_retries):
#         try:
#             with requests.get(img_url, stream=True) as response:
#                 response.raise_for_status()
#                 return response.content
#         except Exception as e:
#             if retry < max_retries - 1:
#                 print(
#                     f"Failed to download image, retrying ({retry+1}/{max_retries})..."
#                 )
#             else:
#                 print("Failed to download image after multiple retries, skipping.")
#                 return None


def create_web_img_list(img_url_list, task_name):
    """Wrap every image URL in a ``WebImg`` bound to the given task name.

    Args:
        img_url_list: Iterable of image URLs, in the order they should appear.
        task_name: Name of the task the images belong to.

    Returns:
        list: One ``WebImg(task_name, url)`` per URL, in input order.
    """
    return [WebImg(task_name, img_url) for img_url in img_url_list]


# def fetch_images_to_img_obj(web_img: WebImg):
#     url = web_img.url
#     data = fetch_image(url)
#     if data is None:
#         task_name = web_img.task_name
#         print(f"{task_name}, 下载图片失败")
#         raise Exception(f"{task_name}, 下载图片失败")
#     web_img.data = data


# def batch_fetch_images_to_img_obj_list(web_img_list: list[WebImg]):
#     """
#     使用 ThreadPoolExecutor 创建线程池，对 web_img_list 中的每个图片对象调用 fetch_images_to_img_obj 函数。

#     Args:
#         web_img_list (list): 图片对象列表，每个对象包含图片的数据等信息。

#     Returns:
#         None
#     """
#     with ThreadPoolExecutor() as executor:
#         executor.map(fetch_images_to_img_obj, web_img_list)


def concatenate_images_vertically(web_img_list: list[WebImg]):
    """Stitch the downloaded images into one tall image, top to bottom.

    Each image is decoded from ``web_img.data`` and pasted, horizontally
    centred, onto a white RGB canvas that is as wide as the widest image
    and as tall as the sum of all heights.

    Args:
        web_img_list: Images to stitch, in top-to-bottom order. Each item
            must expose ``data`` (encoded image bytes) and ``task_name``.

    Returns:
        The stitched ``PIL.Image.Image``, or ``None`` when the list is empty
        or any image cannot be decoded/pasted (the failure is printed).
    """
    try:
        if not web_img_list:
            # max() on an empty sequence would raise anyway; fail with a
            # clearer message instead.
            raise ValueError("no images to concatenate")

        # Open every image exactly once; Image.open only reads the header,
        # pixel data is decoded lazily when the image is pasted.
        images = [Image.open(io.BytesIO(web_img.data)) for web_img in web_img_list]

        # Canvas size: widest image by the total height of all images.
        max_width = max(img.width for img in images)
        total_height = sum(img.height for img in images)
        long_image = Image.new("RGB", (max_width, total_height), color=(255, 255, 255))

        y_offset = 0
        for img in images:
            x_offset = (max_width - img.width) // 2  # centre horizontally
            long_image.paste(img, (x_offset, y_offset))
            y_offset += img.height
            # Release the decoded pixels right away so that only one source
            # image is held in memory at a time.
            img.close()

        return long_image

    except Exception as e:
        # Guard the lookup: indexing an empty list here would raise a new
        # exception from inside the handler and escape the function.
        task_name = web_img_list[0].task_name if web_img_list else "unknown"
        print(f"{task_name}, 拼接图片失败：{e}")
        return None


def pre_batch_task(lines: list[str]):
    """Process one batch of task lines inside a worker thread.

    Each non-blank line has the form ``<task name> - <page url>``. Lines are
    handled one after another; a failure in one line is reported and does
    not prevent the remaining lines of the batch from being processed.

    Args:
        lines: Raw lines from the task file (may contain blank lines and
            trailing newlines).
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            _process_task_line(line)
        except Exception as e:
            # This runs inside a thread pool; an escaping exception would
            # abort the rest of the batch, so report it and carry on.
            task_name = line.partition(" - ")[0]
            print(f"{task_name}, 任务失败：{e}")


def _process_task_line(line: str) -> None:
    """Download, stitch and save the images for a single ``name - url`` line."""
    task_name, separator, url = line.partition(" - ")
    if not separator:
        # Without the " - " separator there is no URL to fetch.
        print(f"{line}, 任务行格式错误，已跳过")
        return

    requester = Requester()
    print(f"{task_name}, 开始下载")
    html_content = requester.fetch_html(url, task_name)
    img_url_list = WebParser(html_content).parse_img_urls()
    web_img_list = create_web_img_list(img_url_list, task_name)
    requester.batch_fetch_images_to_img_obj_list(web_img_list)

    long_image = concatenate_images_vertically(web_img_list)
    if long_image is None:
        # Stitching returns None on failure; there is nothing to save.
        print(f"{task_name}, 未生成长图，已跳过保存")
        return

    # Image.save does not create missing parent directories.
    os.makedirs("output", exist_ok=True)
    long_image.save(f"output/{task_name}.png")
    print(f"{task_name}, 完成!!")


def read_lines_from_file(task_file):
    """Load the task file and return its lines.

    Args:
        task_file: Path of the task file; it is opened as UTF-8 text.

    Returns:
        list: Every line of the file in order, line endings included.
    """
    with open(task_file, mode="r", encoding="utf-8") as handle:
        return list(handle)


def process_lines_in_batches(lines, batch_size):
    """Split the lines into batches and process them on a thread pool.

    Every batch of up to ``batch_size`` consecutive lines is submitted as one
    ``pre_batch_task`` call. The function returns after all batches have
    finished; a batch that raised is reported instead of being dropped
    silently.

    Args:
        lines (list): All lines to process.
        batch_size (int): Number of lines handled by each submitted task;
            must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() rejects a zero step and yields nothing for a negative one,
        # which would silently skip all the work.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(pre_batch_task, lines[start : start + batch_size])
            for start in range(0, len(lines), batch_size)
        ]
        # Future.exception() waits for completion, so this loop also
        # surfaces errors that would otherwise stay hidden in the future.
        for future in futures:
            error = future.exception()
            if error is not None:
                print(f"批次任务失败：{error}")


def main():
    """Read the tasks from ``input.txt`` and process them in batches of three lines."""
    pending_lines = read_lines_from_file("input.txt")
    process_lines_in_batches(pending_lines, 3)  # three lines per submitted task
    print("finish, 程序结束...")


if __name__ == "__main__":
    main()