Delete obsolete code
This commit is contained in:
parent
301e6f9662
commit
beba0d1060
main.py.bak (187 deletions)
@@ -1,187 +0,0 @@
import os

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

from PIL import Image
import io
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas


def get_html(url, max_retries=3):
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    try:
        response = session.get(url)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error occurred while fetching HTML from {url}: {e}")
        raise Exception("Failed to fetch page HTML")


def download_image(img_url, max_retries=5):
    for retry in range(max_retries):
        try:
            with requests.get(img_url, stream=True) as response:
                response.raise_for_status()
                return response.content
        except Exception as e:
            if retry < max_retries - 1:
                print(
                    f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                )
            else:
                print("Failed to download image after multiple retries, skipping.")
                return None


def get_img_urls(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    # The reader page keeps the real image URLs in the data-src attribute
    img_tags = soup.find("div", class_="reading-content").find_all("img")

    img_urls = []
    for img_tag in img_tags:
        img_url = img_tag.attrs["data-src"]
        img_urls.append(img_url)
    return img_urls


def create_img_obj_list(img_url_list):
    img_obj_list = []
    for url in img_url_list:
        obj = dict()
        obj["url"] = url
        obj["data"] = None
        img_obj_list.append(obj)

    return img_obj_list


def set_img_obj_data(img_obj):
    url = img_obj["url"]
    data = download_image(url)
    if data is None:
        raise Exception("Failed to download image")
    img_obj["data"] = data


def save_images_to_directory(img_obj_list, directory_path):
    try:
        # Create the directory for saving images (if it does not exist)
        os.makedirs(directory_path, exist_ok=True)

        for idx, img_obj in enumerate(img_obj_list):
            url = img_obj["url"]
            data = img_obj["data"]

            # Get the image extension (assuming the URL ends with one)
            extension = os.path.splitext(url)[1]

            # Image file name; the index is used as the name here
            file_name = f"image_{idx}{extension}"
            file_path = os.path.join(directory_path, file_name)

            # Write the image data to a local file
            with open(file_path, "wb") as file:
                file.write(data)

        print("Images saved successfully!")
    except Exception as e:
        print(f"Failed to save images: {e}")


def generate_pdf_from_images(img_obj_list, output_file):
    try:
        c = canvas.Canvas(output_file, pagesize=letter)

        for img_obj in img_obj_list:
            # Create an image object from the data field of the image object
            img_data = img_obj["data"]
            img = Image.open(io.BytesIO(img_data))

            # Scale the image to fit the PDF page size
            img_width, img_height = img.size
            pdf_width, pdf_height = letter
            scale = min(pdf_width / img_width, pdf_height / img_height)
            new_width, new_height = int(img_width * scale), int(img_height * scale)
            # NOTE: Image.ANTIALIAS was removed in Pillow 10;
            # Image.LANCZOS is the modern equivalent
            img = img.resize((new_width, new_height), Image.ANTIALIAS)

            # Add the image to the PDF page
            c.drawInlineImage(img, 0, 0, width=new_width, height=new_height)

            # Start a new page
            c.showPage()

        c.save()
        print("PDF generated successfully!")
    except Exception as e:
        print(f"PDF generation failed: {e}")


def concatenate_images_vertically(img_obj_list, output_file):
    try:
        # Compute the width and total height of the concatenated long image
        max_width = max(
            Image.open(io.BytesIO(img_obj["data"])).width for img_obj in img_obj_list
        )
        total_height = sum(
            Image.open(io.BytesIO(img_obj["data"])).height for img_obj in img_obj_list
        )

        # Create a new blank long image
        long_image = Image.new("RGB", (max_width, total_height), color=(255, 255, 255))

        # Paste the images one after another, stacking vertically
        y_offset = 0
        for img_obj in img_obj_list:
            img = Image.open(io.BytesIO(img_obj["data"]))
            img_width, img_height = img.size
            x_offset = (max_width - img_width) // 2  # center horizontally
            long_image.paste(img, (x_offset, y_offset))
            y_offset += img_height

        # Save the concatenated long image locally
        long_image.save(output_file)

    except Exception as e:
        print(f"Failed to concatenate images: {e}")
        return None


def process_batch(lines):
    for line in lines:
        line = line.strip()  # strip leading and trailing whitespace
        if line:
            # Parse out the file name and the URL
            file_name, _, url = line.partition(" - ")
            html_content = get_html(url)
            img_url_list = get_img_urls(html_content)
            img_obj_list = create_img_obj_list(img_url_list)

            # Create a thread pool with ThreadPoolExecutor
            with ThreadPoolExecutor() as executor:
                # Download image data concurrently
                # (NOTE: executor.map returns a lazy iterator, so exceptions
                # raised in set_img_obj_data are silently discarded here)
                executor.map(set_img_obj_data, img_obj_list)

            # save_images_to_directory(img_obj_list, directory_path="imgs")
            concatenate_images_vertically(img_obj_list, output_file=f"imgs/{file_name}.pdf")


if __name__ == "__main__":
    file_name = "input.txt"
    batch_size = 3  # number of lines processed per thread

    with open(file_name, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Create a thread pool with ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        # Process the lines in batches of batch_size
        for i in range(0, len(lines), batch_size):
            batch_lines = lines[i : i + batch_size]
            executor.submit(process_batch, batch_lines)
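For reference, process_batch splits each non-empty line of input.txt on the first " - ", pairing an output file name with a chapter URL. A hypothetical example line (the name and URL are illustrative only, not taken from this repository):

chapter_01 - https://example.com/chapter-1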