comic_book_downloader/main.py
2024-02-29 14:58:41 +08:00

72 lines
2.0 KiB
Python

from concurrent.futures import ThreadPoolExecutor
from web import Requester
from web import HtmlParser
from web_img import ImgManager
def pre_batch_task(lines: list[str]):
    """Process one batch of task lines on a worker thread.

    Blank lines and lines starting with "#" are skipped; every other
    line is split on the first " - " into a task name and a URL, which
    are then handed to run_task.
    """
    for raw in lines:
        entry = raw.strip()  # drop leading/trailing whitespace
        # Only non-empty, non-comment lines describe real tasks.
        if entry and not entry.startswith("#"):
            task_name, _, url = entry.partition(" - ")
            run_task(task_name, url)
def read_lines(task_file):
    """Read every line from a file and return them as a list.

    Parameters:
        task_file: Path of the task file to read (UTF-8 encoded).

    Returns:
        list[str]: All lines of the file, newline characters included.
    """
    with open(task_file, "r", encoding="utf-8") as fh:
        # Iterating a text file yields its lines, newlines kept —
        # equivalent to readlines().
        return list(fh)
def process_lines_in_batches(lines: list[str], batch_size: int):
    """Split the lines into batches and process them on a thread pool.

    Each batch of at most *batch_size* lines is submitted to
    pre_batch_task on a worker thread. The futures are collected and
    checked afterwards so that any exception raised inside a worker is
    propagated to the caller instead of being silently discarded (the
    original submit-and-forget pattern lost all worker errors).

    Parameters:
        lines: All task lines to process.
        batch_size: Maximum number of lines per batch / thread task.

    Raises:
        ValueError: If batch_size is not a positive integer.
        Exception: Whatever pre_batch_task raised in a worker thread.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(pre_batch_task, lines[i : i + batch_size])
            for i in range(0, len(lines), batch_size)
        ]
        # Re-raise the first worker exception, if any; without this,
        # errors inside pre_batch_task would vanish silently.
        for future in futures:
            future.result()
def run_task(task_name: str, url: str):
    """Execute a single download task.

    Fetches the HTML at *url*, extracts the list of image URLs from it,
    downloads the image data, and saves everything as one long image.
    """
    print(f"{task_name}, 开始下载")
    parser = HtmlParser(Requester().fetch_html(url, task_name))
    manager = ImgManager(parser.get_img_url_list(), task_name)
    manager.batch_fill_image_data()
    manager.save_long_image()
    print(f"{task_name}, 完成!!")
if __name__ == "__main__":
    # Entry point: read the task list and fan the work out to threads.
    task_file = "task.txt"
    per_thread_line_size = 3  # number of lines handled per thread
    lines = read_lines(task_file)
    process_lines_in_batches(lines, per_thread_line_size)
    print("finish, 程序结束...")