backup
commit f9dff30eac
main copy.py (new file, 109 lines added)
@@ -0,0 +1,109 @@
import os
import requests
import base64
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor


def get_html(url, max_retries=3):
    # Mount an HTTPAdapter so transient connection errors are retried.
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    try:
        response = session.get(url)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error occurred while fetching HTML from {url}: {e}")
        return None


def save_html_to_file(html_content, file_path):
    try:
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html_content)
        print(f"HTML content saved to {file_path} successfully!")
    except IOError as e:
        print(f"Error occurred while saving HTML content: {e}")


def download_image(img_url):
    # Retry the download up to MAX_RETRY times before giving up.
    MAX_RETRY = 3
    for retry in range(MAX_RETRY):
        try:
            with requests.get(img_url, stream=True) as response:
                response.raise_for_status()
                return response.content
        except Exception as e:
            if retry < MAX_RETRY - 1:
                print(f"Failed to download image, retrying ({retry+1}/{MAX_RETRY})...")
            else:
                print("Failed to download image after multiple retries, skipping.")
                return None


def replace_img_with_base64(soup):
    # Single-threaded variant: inline every lazily loaded image (data-src)
    # in the soup as a base64 data URI.
    # Note: this definition is shadowed by the per-tag function of the same
    # name below; its only call site in process_batch is commented out.
    img_tag_list = soup.find_all("img")
    for img_tag in img_tag_list:
        if "data-src" in img_tag.attrs:
            img_url = img_tag["data-src"]
            try:
                response = requests.get(img_url)

                if response.status_code == 200:
                    img_data = response.content
                    img_base64 = base64.b64encode(img_data).decode("utf-8")
                    img_tag["src"] = f"data:image/png;base64,{img_base64}"
            except Exception as e:
                print(f"Error occurred while fetching image: {e}")


def replace_img_with_base64(img_tag):
    # Per-tag variant used by the thread pool: download the data-src image
    # and rewrite src as a base64 data URI.
    if "data-src" in img_tag.attrs:
        img_url = img_tag["data-src"]
        img_data = download_image(img_url)
        if img_data is not None:
            img_base64 = base64.b64encode(img_data).decode("utf-8")
            img_tag["src"] = f"data:image/png;base64,{img_base64}"


def process_batch(lines):
    for line in lines:
        line = line.strip()  # Strip leading/trailing whitespace from each line
        if line:
            # Parse the HTML file name and the URL out of the line
            file_name, _, url = line.partition(" - ")

            html_content = get_html(url)
            if html_content:
                soup = BeautifulSoup(html_content, "html.parser")

                # replace_img_with_base64(soup)

                img_tag_list = soup.find_all("img")
                # Create a thread pool with ThreadPoolExecutor
                with ThreadPoolExecutor() as executor:
                    # Download and replace the images concurrently
                    executor.map(replace_img_with_base64, img_tag_list)

                # Save to the specified HTML file
                file_path = f"{file_name}.html"
                modified_html = soup.prettify()
                save_html_to_file(modified_html, file_path)


if __name__ == "__main__":
    file_name = "input.txt"
    batch_size = 3  # Number of lines handled per thread

    with open(file_name, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Create a thread pool with ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        # Submit the lines for processing in batches of batch_size
        for i in range(0, len(lines), batch_size):
            batch_lines = lines[i : i + batch_size]
            executor.submit(process_batch, batch_lines)
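
The script reads its work list from input.txt, and process_batch splits each line on " - " into an output file name and a page URL. A minimal sketch of that input format, using hypothetical names and URLs inferred from line.partition(" - ") above:

# Hypothetical sample input: one "<output name> - <url>" pair per line; the
# " - " separator is assumed from line.partition(" - ") in process_batch.
sample_lines = [
    "example_page - https://example.com/",
    "another_page - https://example.org/article",
]
with open("input.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sample_lines) + "\n")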