里程碑v2

This commit is contained in:
vincent 2023-07-21 13:47:59 +08:00
parent 317fb272f5
commit 9cab9ac777

49
main.py
View File

@ -3,10 +3,12 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from PIL import Image from PIL import Image
import io import io
from reportlab.lib.pagesizes import letter from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas from reportlab.pdfgen import canvas
from utils import ArrayUtil
def get_html(url, max_retries=3): def get_html(url, max_retries=3):
@ -24,7 +26,7 @@ def get_html(url, max_retries=3):
raise Exception("获取网页html失败") raise Exception("获取网页html失败")
def download_image(img_url, max_retries=3): def download_image(img_url, max_retries=5):
for retry in range(max_retries): for retry in range(max_retries):
try: try:
with requests.get(img_url, stream=True) as response: with requests.get(img_url, stream=True) as response:
@ -59,13 +61,10 @@ def create_img_obj_list(img_url_list):
obj["data"] = None obj["data"] = None
img_obj_list.append(obj) img_obj_list.append(obj)
# TODO remember to delete
if len(img_obj_list) > 2:
break
return img_obj_list return img_obj_list
def fill_img_obj(img_obj): def set_img_obj_data(img_obj):
url = img_obj["url"] url = img_obj["url"]
data = download_image(url) data = download_image(url)
if data is None: if data is None:
@ -104,7 +103,7 @@ def generate_pdf_from_images(img_obj_list, output_file):
for img_obj in img_obj_list: for img_obj in img_obj_list:
# 从图片对象的 data 字段中创建图像对象 # 从图片对象的 data 字段中创建图像对象
img_data = img_obj['data'] img_data = img_obj["data"]
img = Image.open(io.BytesIO(img_data)) img = Image.open(io.BytesIO(img_data))
# 将图像大小调整为 PDF 页面大小 # 将图像大小调整为 PDF 页面大小
@ -125,6 +124,38 @@ def generate_pdf_from_images(img_obj_list, output_file):
except Exception as e: except Exception as e:
print(f"PDF 生成失败:{e}") print(f"PDF 生成失败:{e}")
def concatenate_images_vertically(img_obj_list, output_file):
    """Stack the images in ``img_obj_list`` top-to-bottom on one white canvas.

    Each image is horizontally centred on a canvas as wide as the widest
    image and as tall as the sum of all image heights.

    Args:
        img_obj_list: Iterable of dicts whose ``"data"`` entry holds the
            encoded bytes of one image. Entries whose ``"data"`` is ``None``
            are skipped instead of aborting the whole concatenation.
        output_file: Destination path for the long image. Currently unused
            because saving is disabled (see the note near the end).

    Returns:
        None. Any failure is reported on stdout and swallowed (best-effort).
    """
    try:
        # Decode every payload exactly once; the same Image objects are then
        # reused for measuring and for pasting.
        images = [
            Image.open(io.BytesIO(img_obj["data"]))
            for img_obj in img_obj_list
            if img_obj["data"] is not None
        ]

        # Canvas size: widest image by total height.  With no usable images
        # max() raises ValueError, which is reported by the handler below.
        max_width = max(img.width for img in images)
        total_height = sum(img.height for img in images)

        # Start from a white background so narrower images get white margins.
        long_image = Image.new(
            "RGB", (max_width, total_height), color=(255, 255, 255)
        )

        # Paste the images one below another, each centred horizontally.
        y_offset = 0
        for img in images:
            x_offset = (max_width - img.width) // 2
            long_image.paste(img, (x_offset, y_offset))
            y_offset += img.height

        # NOTE(review): saving is disabled, so the composed image is discarded
        # and output_file is unused - confirm whether this is intentional.
        # long_image.save(output_file)
    except Exception as e:
        print(f"拼接图片失败：{e}")
    return None
def process_batch(lines): def process_batch(lines):
for line in lines: for line in lines:
line = line.strip() # 去掉每行开头和结尾的空白字符 line = line.strip() # 去掉每行开头和结尾的空白字符
@ -138,10 +169,12 @@ def process_batch(lines):
# 使用 ThreadPoolExecutor 创建线程池 # 使用 ThreadPoolExecutor 创建线程池
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
# 多线程处理图片下载和替换 # 多线程处理图片下载和替换
executor.map(fill_img_obj, img_obj_list) executor.map(set_img_obj_data, img_obj_list)
# save_images_to_directory(img_obj_list, directory_path="imgs") # save_images_to_directory(img_obj_list, directory_path="imgs")
generate_pdf_from_images(img_obj_list, output_file=f"imgs/{file_name}.pdf") concatenate_images_vertically(
img_obj_list, output_file=f"imgs/{file_name}.pdf"
)
if __name__ == "__main__": if __name__ == "__main__":