里程碑v2

This commit is contained in:
vincent 2023-07-21 13:47:59 +08:00
parent 317fb272f5
commit 9cab9ac777

49
main.py
View File

@ -3,10 +3,12 @@ import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
import io
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from utils import ArrayUtil
def get_html(url, max_retries=3):
@ -24,7 +26,7 @@ def get_html(url, max_retries=3):
raise Exception("获取网页html失败")
def download_image(img_url, max_retries=3):
def download_image(img_url, max_retries=5):
for retry in range(max_retries):
try:
with requests.get(img_url, stream=True) as response:
@ -59,13 +61,10 @@ def create_img_obj_list(img_url_list):
obj["data"] = None
img_obj_list.append(obj)
# TODO remember to delete
if len(img_obj_list) > 2:
break
return img_obj_list
def fill_img_obj(img_obj):
def set_img_obj_data(img_obj):
url = img_obj["url"]
data = download_image(url)
if data is None:
@ -104,7 +103,7 @@ def generate_pdf_from_images(img_obj_list, output_file):
for img_obj in img_obj_list:
# 从图片对象的 data 字段中创建图像对象
img_data = img_obj['data']
img_data = img_obj["data"]
img = Image.open(io.BytesIO(img_data))
# 将图像大小调整为 PDF 页面大小
@ -125,6 +124,38 @@ def generate_pdf_from_images(img_obj_list, output_file):
except Exception as e:
print(f"PDF 生成失败:{e}")
def concatenate_images_vertically(img_obj_list, output_file):
    """Stitch the given images into one tall image, top to bottom.

    Each image is centred horizontally on a white RGB canvas whose width is
    that of the widest image and whose height is the sum of all heights.

    Args:
        img_obj_list: Sequence of dicts whose ``"data"`` entry holds the
            encoded bytes of one image.
        output_file: Intended destination path for the stitched image.
            Currently unused: this function does not write to disk.

    Returns:
        The stitched ``PIL.Image.Image`` on success, or ``None`` when the
        list is empty or any image cannot be decoded or pasted (the failure
        is reported on stdout rather than raised).
    """
    if not img_obj_list:
        # Fail explicitly instead of relying on max() raising on an empty
        # sequence inside the broad handler below.
        print("拼接图片失败:图片列表为空")
        return None

    try:
        # Decode each image exactly once and reuse it for both the size
        # calculation and the paste step.
        images = [
            Image.open(io.BytesIO(img_obj["data"])) for img_obj in img_obj_list
        ]

        # Canvas size: widest image wide, all images stacked high.
        max_width = max(img.width for img in images)
        total_height = sum(img.height for img in images)

        # White background fills the margins beside narrower images.
        long_image = Image.new(
            "RGB", (max_width, total_height), color=(255, 255, 255)
        )

        # Paste the images one below another, centred horizontally.
        y_offset = 0
        for img in images:
            x_offset = (max_width - img.width) // 2
            long_image.paste(img, (x_offset, y_offset))
            y_offset += img.height
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on.
        print(f"拼接图片失败:{e}")
        return None

    # NOTE(review): the original had `long_image.save(output_file)` commented
    # out, so nothing is persisted here; the stitched image is returned so the
    # work is not discarded.
    return long_image
def process_batch(lines):
for line in lines:
line = line.strip() # 去掉每行开头和结尾的空白字符
@ -138,10 +169,12 @@ def process_batch(lines):
# 使用 ThreadPoolExecutor 创建线程池
with ThreadPoolExecutor() as executor:
# 多线程处理图片下载和替换
executor.map(fill_img_obj, img_obj_list)
executor.map(set_img_obj_data, img_obj_list)
# save_images_to_directory(img_obj_list, directory_path="imgs")
generate_pdf_from_images(img_obj_list, output_file=f"imgs/{file_name}.pdf")
concatenate_images_vertically(
img_obj_list, output_file=f"imgs/{file_name}.pdf"
)
if __name__ == "__main__":