From 317fb272f555543a57305372325212dd02d7305b Mon Sep 17 00:00:00 2001
From: vincent
Date: Fri, 21 Jul 2023 13:34:49 +0800
Subject: [PATCH] Modify backup file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main copy.py | 109 ------------------------------
 main.py.bak  | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+), 109 deletions(-)
 delete mode 100644 main copy.py
 create mode 100644 main.py.bak

diff --git a/main copy.py b/main copy.py
deleted file mode 100644
index ddb6f90..0000000
--- a/main copy.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import os
-import requests
-import base64
-from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor
-
-
-def get_html(url, max_retries=3):
-    session = requests.Session()
-    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
-    session.mount("http://", adapter)
-    session.mount("https://", adapter)
-
-    try:
-        response = session.get(url)
-        response.raise_for_status()
-        return response.text
-    except Exception as e:
-        print(f"Error occurred while fetching HTML from {url}: {e}")
-        return None
-
-
-def save_html_to_file(html_content, file_path):
-    try:
-        with open(file_path, "w", encoding="utf-8") as file:
-            file.write(html_content)
-        print(f"HTML content saved to {file_path} successfully!")
-    except IOError as e:
-        print(f"Error occurred while saving HTML content: {e}")
-
-
-def download_image(img_url):
-    MAX_RETRY = 3
-    for retry in range(MAX_RETRY):
-        try:
-            with requests.get(img_url, stream=True) as response:
-                response.raise_for_status()
-                return response.content
-        except Exception as e:
-            if retry < MAX_RETRY - 1:
-                print(f"Failed to download image, retrying ({retry+1}/{MAX_RETRY})...")
-            else:
-                print("Failed to download image after multiple retries, skipping.")
-                return None
-
-
-def replace_img_with_base64(soup):
-    img_tag_list = soup.find_all("img")
-    for img_tag in img_tag_list:
-        if "data-src" in img_tag.attrs:
-            img_url = img_tag["data-src"]
-            try:
-                response = requests.get(img_url)
-
-                if response.status_code == 200:
-                    img_data = response.content
-                    img_base64 = base64.b64encode(img_data).decode("utf-8")
-                    img_tag["src"] = f"data:image/png;base64,{img_base64}"
-            except Exception as e:
-                print(f"Error occurred while fetching image: {e}")
-
-
-def replace_img_with_base64(img_tag):
-    if "data-src" in img_tag.attrs:
-        img_url = img_tag["data-src"]
-        img_data = download_image(img_url)
-        if img_data is not None:
-            img_base64 = base64.b64encode(img_data).decode("utf-8")
-            img_tag["src"] = f"data:image/png;base64,{img_base64}"
-
-
-def process_batch(lines):
-    for line in lines:
-        line = line.strip()  # strip leading/trailing whitespace from the line
-        if line:
-            # parse the HTML file name and the URL from the line
-            file_name, _, url = line.partition(" - ")
-
-            html_content = get_html(url)
-            if html_content:
-                soup = BeautifulSoup(html_content, "html.parser")
-
-                # replace_img_with_base64(soup)
-
-                img_tag_list = soup.find_all("img")
-                # create a thread pool with ThreadPoolExecutor
-                with ThreadPoolExecutor() as executor:
-                    # download and replace the images concurrently
-                    executor.map(replace_img_with_base64, img_tag_list)
-
-                # save as the specified HTML file
-                file_path = f"{file_name}.html"
-                modified_html = soup.prettify()
-                save_html_to_file(modified_html, file_path)
-
-
-if __name__ == "__main__":
-    file_name = "input.txt"
-    batch_size = 3  # number of lines handled per batch
-
-    with open(file_name, "r", encoding="utf-8") as file:
-        lines = file.readlines()
-
-    # create a thread pool with ThreadPoolExecutor
-    with ThreadPoolExecutor() as executor:
-        # split the lines into batches of batch_size
-        for i in range(0, len(lines), batch_size):
-            batch_lines = lines[i : i + batch_size]
-            executor.submit(process_batch, batch_lines)
diff --git a/main.py.bak b/main.py.bak
new file mode 100644
index 0000000..fe24a9e
--- /dev/null
+++ b/main.py.bak
@@ -0,0 +1,187 @@
+import os
+import requests
+from bs4 import BeautifulSoup
+from concurrent.futures import ThreadPoolExecutor
+
+from PIL import Image
+import io
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+
+def get_html(url, max_retries=3):
+    session = requests.Session()
+    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    try:
+        response = session.get(url)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"Error occurred while fetching HTML from {url}: {e}")
+        raise Exception("Failed to fetch the page HTML")
+
+
+def download_image(img_url, max_retries=5):
+    for retry in range(max_retries):
+        try:
+            with requests.get(img_url, stream=True) as response:
+                response.raise_for_status()
+                return response.content
+        except Exception as e:
+            if retry < max_retries - 1:
+                print(
+                    f"Failed to download image, retrying ({retry+1}/{max_retries})..."
+                )
+            else:
+                print("Failed to download image after multiple retries, skipping.")
+                return None
+
+
+def get_img_urls(html_content):
+    soup = BeautifulSoup(html_content, "html.parser")
+    img_tags = soup.find("div", class_="reading-content").find_all("img")
+
+    img_urls = []
+    for img_tag in img_tags:
+        img_url = img_tag.attrs["data-src"]
+        img_urls.append(img_url)
+    return img_urls
+
+
+def create_img_obj_list(img_url_list):
+    img_obj_list = []
+    for url in img_url_list:
+        obj = dict()
+        obj["url"] = url
+        obj["data"] = None
+        img_obj_list.append(obj)
+
+    return img_obj_list
+
+
+def set_img_obj_data(img_obj):
+    url = img_obj["url"]
+    data = download_image(url)
+    if data is None:
+        raise Exception("Failed to download image")
+    img_obj["data"] = data
+
+
+def save_images_to_directory(img_obj_list, directory_path):
+    try:
+        # create the target directory if it does not already exist
+        os.makedirs(directory_path, exist_ok=True)
+
+        for idx, img_obj in enumerate(img_obj_list):
+            url = img_obj["url"]
+            data = img_obj["data"]
+
+            # get the image extension (assuming the URL ends with one)
+            extension = os.path.splitext(url)[1]
+
+            # name the image file by its index
+            file_name = f"image_{idx}{extension}"
+            file_path = os.path.join(directory_path, file_name)
+
+            # write the image data to a local file
+            with open(file_path, "wb") as file:
+                file.write(data)
+
+        print("Images saved successfully!")
+    except Exception as e:
+        print(f"Failed to save images: {e}")
+
+
+def generate_pdf_from_images(img_obj_list, output_file):
+    try:
+        c = canvas.Canvas(output_file, pagesize=letter)
+
+        for img_obj in img_obj_list:
+            # build an image object from the data field of the image dict
+            img_data = img_obj["data"]
+            img = Image.open(io.BytesIO(img_data))
+
+            # scale the image to fit the PDF page size
+            img_width, img_height = img.size
+            pdf_width, pdf_height = letter
+            scale = min(pdf_width / img_width, pdf_height / img_height)
+            new_width, new_height = int(img_width * scale), int(img_height * scale)
+            img = img.resize((new_width, new_height), Image.LANCZOS)
+
+            # draw the image onto the PDF page
+            c.drawInlineImage(img, 0, 0, width=new_width, height=new_height)
+
+            # start a new page
+            c.showPage()
+
+        c.save()
+        print("PDF generated successfully!")
+    except Exception as e:
+        print(f"Failed to generate PDF: {e}")
+
+
+def concatenate_images_vertically(img_obj_list, output_file):
+    try:
+        # compute the width and total height of the concatenated image
+        max_width = max(
+            Image.open(io.BytesIO(img_obj["data"])).width for img_obj in img_obj_list
+        )
+        total_height = sum(
+            Image.open(io.BytesIO(img_obj["data"])).height for img_obj in img_obj_list
+        )
+
+        # create a new blank long image
+        long_image = Image.new("RGB", (max_width, total_height), color=(255, 255, 255))
+
+        # paste the images one below another
+        y_offset = 0
+        for img_obj in img_obj_list:
+            img = Image.open(io.BytesIO(img_obj["data"]))
+            img_width, img_height = img.size
+            x_offset = (max_width - img_width) // 2  # center horizontally
+            long_image.paste(img, (x_offset, y_offset))
+            y_offset += img_height
+
+        # save the concatenated long image locally
+        long_image.save(output_file)
+
+    except Exception as e:
+        print(f"Failed to concatenate images: {e}")
+        return None
+
+
+def process_batch(lines):
+    for line in lines:
+        line = line.strip()  # strip leading/trailing whitespace from the line
+        if line:
+            # parse the file name and the URL from the line
+            file_name, _, url = line.partition(" - ")
+            html_content = get_html(url)
+            img_url_list = get_img_urls(html_content)
+            img_obj_list = create_img_obj_list(img_url_list)
+
+            # create a thread pool with ThreadPoolExecutor
+            with ThreadPoolExecutor() as executor:
+                # download the images concurrently
+                executor.map(set_img_obj_data, img_obj_list)
+
+            # save_images_to_directory(img_obj_list, directory_path="imgs")
+            concatenate_images_vertically(img_obj_list, output_file=f"imgs/{file_name}.pdf")
+
+
+if __name__ == "__main__":
+    file_name = "input.txt"
+    batch_size = 3  # number of lines handled per batch
+
+    with open(file_name, "r", encoding="utf-8") as file:
+        lines = file.readlines()
+
+    # create a thread pool with ThreadPoolExecutor
+    with ThreadPoolExecutor() as executor:
+        # split the lines into batches of batch_size
+        for i in range(0, len(lines), batch_size):
+            batch_lines = lines[i : i + batch_size]
+            executor.submit(process_batch, batch_lines)
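
Note on the expected input: process_batch in main.py.bak splits each non-empty line of input.txt on the first " - ", taking the left part as the output file name and the right part as the page URL. A minimal sketch of input.txt, with hypothetical names and URLs:

    chapter-01 - https://example.com/comic/chapter-1
    chapter-02 - https://example.com/comic/chapter-2

Each line produces imgs/<file_name>.pdf via concatenate_images_vertically. On this code path nothing creates the imgs/ directory (save_images_to_directory, which does call os.makedirs, is commented out), so imgs/ must exist beforehand, or an os.makedirs("imgs", exist_ok=True) can be added before the save.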
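
For reference, a sketch of driving the new pipeline for a single page, using only the functions defined in main.py.bak (the URL and output name are placeholders):

    # assumes the definitions from main.py.bak are in scope
    html = get_html("https://example.com/comic/chapter-1")  # raises on failure
    urls = get_img_urls(html)       # data-src of each <img> under .reading-content
    objs = create_img_obj_list(urls)  # [{"url": ..., "data": None}, ...]
    for obj in objs:
        set_img_obj_data(obj)       # sequential variant of the executor.map call
    concatenate_images_vertically(objs, output_file="imgs/chapter-01.pdf")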