72 lines
2.0 KiB
Python
72 lines
2.0 KiB
Python
from concurrent.futures import ThreadPoolExecutor
|
|
from web import Requester
|
|
from web import HtmlParser
|
|
from web_img import ImgManager
|
|
|
|
|
|
def pre_batch_task(lines: list[str]):
    """Run every task described by one batch of task-file lines.

    A usable line has the form ``<task name> - <url>``. Blank lines and
    lines starting with ``#`` are ignored. A line without the `` - ``
    separator (or with an empty name/URL) is reported and skipped, so a
    single malformed entry is never passed to ``run_task`` with an empty
    URL and cannot abort the remaining tasks of the batch.

    Args:
        lines: Raw lines of the batch, with or without trailing newlines.
    """
    for raw_line in lines:
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        # partition() yields an empty separator when " - " is absent.
        task_name, separator, url = line.partition(" - ")
        task_name, url = task_name.strip(), url.strip()
        if not (separator and task_name and url):
            print(f"skip malformed task line: {line!r}")
            continue

        run_task(task_name, url)
|
|
|
|
|
|
def read_lines(task_file):
    """Read the task file and return its lines.

    Args:
        task_file: Path of the task file, opened as UTF-8 text.

    Returns:
        A list with one string per line, line endings included.
    """
    with open(task_file, "r", encoding="utf-8") as handle:
        # Iterating a text file yields the same items as readlines().
        return list(handle)
|
|
|
|
|
|
def process_lines_in_batches(lines: list[str], batch_size: int):
    """Split lines into fixed-size batches and run them on a thread pool.

    Each batch is submitted to ``pre_batch_task`` as one job. The function
    returns only after every submitted job has finished. A job that raised
    is reported on stdout instead of being silently dropped with its
    discarded future.

    Args:
        lines: All lines to process.
        batch_size: Maximum number of lines handed to a single job.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently process nothing.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    with ThreadPoolExecutor() as executor:
        submitted = [
            (start, executor.submit(pre_batch_task, lines[start : start + batch_size]))
            for start in range(0, len(lines), batch_size)
        ]

        # Future.exception() waits for the job and returns what it raised.
        for start, future in submitted:
            error = future.exception()
            if error is not None:
                print(f"batch starting at line index {start} failed: {error!r}")
|
|
|
|
|
|
def run_task(task_name: str, url: str):
    """Run one download task from start to finish.

    Fetches the HTML for ``url``, extracts the image URL list from it and
    passes that list to an ``ImgManager``, which is asked to fill its image
    data and save a long image. Progress is printed before and after.
    NOTE(review): what the project helpers do is inferred from their
    names; confirm against ``web`` / ``web_img``.

    Args:
        task_name: Name of the task, forwarded to the fetcher and manager.
        url: Address of the page to fetch.
    """
    print(f"{task_name}, 开始下载")

    page_html = Requester().fetch_html(url, task_name)
    image_urls = HtmlParser(page_html).get_img_url_list()

    manager = ImgManager(image_urls, task_name)
    manager.batch_fill_image_data()
    manager.save_long_image()

    print(f"{task_name}, 完成!!")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the task list, then process it in batches.
    task_file = "task.txt"
    per_thread_line_size = 3  # number of lines handed to each submitted batch

    process_lines_in_batches(read_lines(task_file), per_thread_line_size)
    print("finish, 程序结束...")
|