搬移函数
This commit is contained in:
parent
197043d048
commit
dacae684d7
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
// 使用 IntelliSense 了解相关属性。
|
||||||
|
// 悬停以查看现有属性的描述。
|
||||||
|
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Python: Current File",
|
||||||
|
"type": "python",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "main.py",
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"justMyCode": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
BIN
__pycache__/requester.cpython-310.pyc
Normal file
BIN
__pycache__/requester.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/web_img.cpython-310.pyc
Normal file
BIN
__pycache__/web_img.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/web_parser.cpython-310.pyc
Normal file
BIN
__pycache__/web_parser.cpython-310.pyc
Normal file
Binary file not shown.
194
main.py
194
main.py
@ -1,127 +1,98 @@
|
|||||||
import os
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import io
|
import io
|
||||||
|
from web_img import WebImg
|
||||||
|
from requester import Requester
|
||||||
|
from web_parser import WebParser
|
||||||
|
|
||||||
|
|
||||||
def get_html(url, file_name, max_retries=3):
|
# def fetch_image(img_url: str, max_retries=5):
|
||||||
session = requests.Session()
|
# """
|
||||||
adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
|
# 通过给定的图片URL下载图片内容。
|
||||||
session.mount("http://", adapter)
|
|
||||||
session.mount("https://", adapter)
|
|
||||||
|
|
||||||
try:
|
# 参数:
|
||||||
response = session.get(url)
|
# img_url (str): 图片的URL地址。
|
||||||
response.raise_for_status()
|
# max_retries (int, 可选): 下载失败时的最大重试次数。默认为5次。
|
||||||
return response.text
|
|
||||||
except Exception as e:
|
# 返回值:
|
||||||
print(f"Error occurred while fetching HTML from {url}: {e}")
|
# bytes or None: 成功下载图片的二进制数据,若下载失败则返回None。
|
||||||
raise Exception(f"{file_name}, 获取网页html失败")
|
|
||||||
|
# 注解:
|
||||||
|
# 这个函数通过发送HTTP请求下载图片文件。它使用`requests`库来获取URL返回的响应。
|
||||||
|
# 如果下载成功,函数将返回图片的二进制内容(bytes格式)。
|
||||||
|
# 如果下载失败,函数将尝试最多`max_retries`次重试,直到成功或达到重试次数上限。
|
||||||
|
# 在每次重试之间,函数会打印错误消息来指示重试进度。
|
||||||
|
# 如果重试次数用尽后仍然无法下载图片,函数将输出失败消息并返回None。
|
||||||
|
|
||||||
|
# 例子:
|
||||||
|
# ```
|
||||||
|
# image_url = "https://example.com/image.jpg"
|
||||||
|
# image_data = download_image(image_url)
|
||||||
|
# if image_data:
|
||||||
|
# # 处理图片数据...
|
||||||
|
# else:
|
||||||
|
# print("无法下载图片,下载失败。")
|
||||||
|
# ```
|
||||||
|
# """
|
||||||
|
# for retry in range(max_retries):
|
||||||
|
# try:
|
||||||
|
# with requests.get(img_url, stream=True) as response:
|
||||||
|
# response.raise_for_status()
|
||||||
|
# return response.content
|
||||||
|
# except Exception as e:
|
||||||
|
# if retry < max_retries - 1:
|
||||||
|
# print(
|
||||||
|
# f"Failed to download image, retrying ({retry+1}/{max_retries})..."
|
||||||
|
# )
|
||||||
|
# else:
|
||||||
|
# print("Failed to download image after multiple retries, skipping.")
|
||||||
|
# return None
|
||||||
|
|
||||||
|
|
||||||
def download_image(img_url, max_retries=5):
|
def create_web_img_list(img_url_list, task_name):
|
||||||
"""
|
|
||||||
通过给定的图片URL下载图片内容。
|
|
||||||
|
|
||||||
参数:
|
|
||||||
img_url (str): 图片的URL地址。
|
|
||||||
max_retries (int, 可选): 下载失败时的最大重试次数。默认为5次。
|
|
||||||
|
|
||||||
返回值:
|
|
||||||
bytes or None: 成功下载图片的二进制数据,若下载失败则返回None。
|
|
||||||
|
|
||||||
注解:
|
|
||||||
这个函数通过发送HTTP请求下载图片文件。它使用`requests`库来获取URL返回的响应。
|
|
||||||
如果下载成功,函数将返回图片的二进制内容(bytes格式)。
|
|
||||||
如果下载失败,函数将尝试最多`max_retries`次重试,直到成功或达到重试次数上限。
|
|
||||||
在每次重试之间,函数会打印错误消息来指示重试进度。
|
|
||||||
如果重试次数用尽后仍然无法下载图片,函数将输出失败消息并返回None。
|
|
||||||
|
|
||||||
例子:
|
|
||||||
```
|
|
||||||
image_url = "https://example.com/image.jpg"
|
|
||||||
image_data = download_image(image_url)
|
|
||||||
if image_data:
|
|
||||||
# 处理图片数据...
|
|
||||||
else:
|
|
||||||
print("无法下载图片,下载失败。")
|
|
||||||
```
|
|
||||||
"""
|
|
||||||
for retry in range(max_retries):
|
|
||||||
try:
|
|
||||||
with requests.get(img_url, stream=True) as response:
|
|
||||||
response.raise_for_status()
|
|
||||||
return response.content
|
|
||||||
except Exception as e:
|
|
||||||
if retry < max_retries - 1:
|
|
||||||
print(
|
|
||||||
f"Failed to download image, retrying ({retry+1}/{max_retries})..."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
print("Failed to download image after multiple retries, skipping.")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_img_urls(html_content):
|
|
||||||
soup = BeautifulSoup(html_content, "html.parser")
|
|
||||||
img_tags = soup.find("div", class_="reading-content").find_all("img")
|
|
||||||
|
|
||||||
img_urls = []
|
|
||||||
for img_tag in img_tags:
|
|
||||||
img_url = img_tag.attrs["data-src"]
|
|
||||||
img_urls.append(img_url)
|
|
||||||
return img_urls
|
|
||||||
|
|
||||||
|
|
||||||
def create_img_obj_list(img_url_list, file_name):
|
|
||||||
img_obj_list = []
|
img_obj_list = []
|
||||||
for url in img_url_list:
|
for url in img_url_list:
|
||||||
obj = dict()
|
img = WebImg(task_name, url)
|
||||||
obj["file_name"] = file_name
|
img_obj_list.append(img)
|
||||||
obj["url"] = url
|
|
||||||
obj["data"] = None
|
|
||||||
img_obj_list.append(obj)
|
|
||||||
|
|
||||||
return img_obj_list
|
return img_obj_list
|
||||||
|
|
||||||
|
|
||||||
def download_images_to_img_obj(img_obj):
|
# def fetch_images_to_img_obj(web_img: WebImg):
|
||||||
url = img_obj["url"]
|
# url = web_img.url
|
||||||
data = download_image(url)
|
# data = fetch_image(url)
|
||||||
if data is None:
|
# if data is None:
|
||||||
file_name = img_obj["file_name"]
|
# task_name = web_img.task_name
|
||||||
print(f"{file_name}, 下载图片失败")
|
# print(f"{task_name}, 下载图片失败")
|
||||||
raise Exception(f"{file_name}, 下载图片失败")
|
# raise Exception(f"{task_name}, 下载图片失败")
|
||||||
img_obj["data"] = data
|
# web_img.data = data
|
||||||
|
|
||||||
|
|
||||||
def batch_download_images_to_img_obj_list(img_obj_list):
|
# def batch_fetch_images_to_img_obj_list(web_img_list: list[WebImg]):
|
||||||
"""
|
# """
|
||||||
使用 ThreadPoolExecutor 创建线程池,对 img_obj_list 中的每个图片对象调用 set_img_obj_data 函数。
|
# 使用 ThreadPoolExecutor 创建线程池,对 img_obj_list 中的每个图片对象调用 set_img_obj_data 函数。
|
||||||
|
|
||||||
Args:
|
# Args:
|
||||||
img_obj_list (list): 图片对象列表,每个对象包含图片的数据等信息。
|
# img_obj_list (list): 图片对象列表,每个对象包含图片的数据等信息。
|
||||||
|
|
||||||
Returns:
|
# Returns:
|
||||||
None
|
# None
|
||||||
"""
|
# """
|
||||||
with ThreadPoolExecutor() as executor:
|
# with ThreadPoolExecutor() as executor:
|
||||||
executor.map(download_images_to_img_obj, img_obj_list)
|
# executor.map(fetch_images_to_img_obj, web_img_list)
|
||||||
|
|
||||||
|
|
||||||
def concatenate_images_vertically(img_obj_list):
|
def concatenate_images_vertically(web_img_list: list[WebImg]):
|
||||||
"""
|
"""
|
||||||
垂直拼接长图片
|
垂直拼接长图片
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# 计算拼接后的长图宽度和总高度
|
# 计算拼接后的长图宽度和总高度
|
||||||
max_width = max(
|
max_width = max(
|
||||||
Image.open(io.BytesIO(img_obj["data"])).width for img_obj in img_obj_list
|
Image.open(io.BytesIO(web_img.data)).width for web_img in web_img_list
|
||||||
)
|
)
|
||||||
total_height = sum(
|
total_height = sum(
|
||||||
Image.open(io.BytesIO(img_obj["data"])).height for img_obj in img_obj_list
|
Image.open(io.BytesIO(web_img.data)).height for web_img in web_img_list
|
||||||
)
|
)
|
||||||
|
|
||||||
# 创建一张新的长图
|
# 创建一张新的长图
|
||||||
@ -129,8 +100,8 @@ def concatenate_images_vertically(img_obj_list):
|
|||||||
|
|
||||||
# 依次将图片在垂直方向上拼接起来
|
# 依次将图片在垂直方向上拼接起来
|
||||||
y_offset = 0
|
y_offset = 0
|
||||||
for img_obj in img_obj_list:
|
for web_img in web_img_list:
|
||||||
img = Image.open(io.BytesIO(img_obj["data"]))
|
img = Image.open(io.BytesIO(web_img.data))
|
||||||
img_width, img_height = img.size
|
img_width, img_height = img.size
|
||||||
x_offset = (max_width - img_width) // 2 # 居中拼接
|
x_offset = (max_width - img_width) // 2 # 居中拼接
|
||||||
long_image.paste(img, (x_offset, y_offset))
|
long_image.paste(img, (x_offset, y_offset))
|
||||||
@ -139,27 +110,28 @@ def concatenate_images_vertically(img_obj_list):
|
|||||||
return long_image
|
return long_image
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
file_name = img_obj_list[0]["file_name"]
|
task_name = web_img_list[0].task_name
|
||||||
print(f"{file_name}, 拼接图片失败:{e}")
|
print(f"{task_name}, 拼接图片失败:{e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def pre_batch_task(lines):
|
def pre_batch_task(lines: list[str]):
|
||||||
"""
|
"""
|
||||||
每个线程的批次任务
|
每个线程的批次任务
|
||||||
"""
|
"""
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line = line.strip() # 去掉每行开头和结尾的空白字符
|
line = line.strip() # 去掉每行开头和结尾的空白字符
|
||||||
if line:
|
if line:
|
||||||
file_name, _, url = line.partition(" - ") # 解析出 HTML 文件名和 URL 地址
|
requester = Requester()
|
||||||
print(f"{file_name}, 开始下载")
|
task_name, _, url = line.partition(" - ") # 解析出 HTML 文件名和 URL 地址
|
||||||
html_content = get_html(url, file_name)
|
print(f"{task_name}, 开始下载")
|
||||||
img_url_list = get_img_urls(html_content)
|
html_content = requester.fetch_html(url, task_name)
|
||||||
img_obj_list = create_img_obj_list(img_url_list, file_name)
|
img_url_list = WebParser(html_content).parse_img_urls()
|
||||||
batch_download_images_to_img_obj_list(img_obj_list)
|
web_img_list = create_web_img_list(img_url_list, task_name)
|
||||||
long_image = concatenate_images_vertically(img_obj_list) # 垂直拼接长图片
|
requester.batch_fetch_images_to_img_obj_list(web_img_list)
|
||||||
long_image.save(f"output/{file_name}.png") # 保存长图到本地
|
long_image = concatenate_images_vertically(web_img_list) # 垂直拼接长图片
|
||||||
print(f"{file_name}, 完成!!")
|
long_image.save(f"output/{task_name}.png") # 保存长图到本地
|
||||||
|
print(f"{task_name}, 完成!!")
|
||||||
|
|
||||||
|
|
||||||
def read_lines_from_file(task_file):
|
def read_lines_from_file(task_file):
|
||||||
@ -167,7 +139,7 @@ def read_lines_from_file(task_file):
|
|||||||
从文件中读取所有行并返回一个包含行的列表。
|
从文件中读取所有行并返回一个包含行的列表。
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
file_name (str): 要读取的文件名。
|
task_file (file): 任务文件。
|
||||||
|
|
||||||
返回值:
|
返回值:
|
||||||
lines (list): 包含文件中所有行的列表。
|
lines (list): 包含文件中所有行的列表。
|
||||||
|
|||||||
78
requester.py
Normal file
78
requester.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
import os
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from PIL import Image
|
||||||
|
import io
|
||||||
|
from web_img import WebImg
|
||||||
|
|
||||||
|
|
||||||
|
class Requester:
    """Fetch HTML pages and image bytes over HTTP using ``requests``."""

    def fetch_html(self, url: str, task_name: str, max_retries=3):
        """Return the body text of the page at ``url``.

        Args:
            url: Address of the page to fetch.
            task_name: Label included in the error raised on failure.
            max_retries: Passed to ``HTTPAdapter(max_retries=...)``, which is
                mounted for both the ``http://`` and ``https://`` prefixes.

        Returns:
            ``response.text`` of the successful response.

        Raises:
            Exception: If the request fails or the response has an error
                status. The underlying error is attached as ``__cause__``.
        """
        # Using the session as a context manager closes it (and its pooled
        # connections) on every exit path instead of leaking it.
        with requests.Session() as session:
            adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            try:
                response = session.get(url)
                response.raise_for_status()
                return response.text
            except Exception as e:
                print(f"Error occurred while fetching HTML from {url}: {e}")
                # Chain the original error so the traceback keeps the root cause.
                raise Exception(f"{task_name}, 获取网页html失败") from e

    def fetch_image(self, img_url: str, max_retries=5):
        """Download the image at ``img_url`` and return its raw bytes.

        Up to ``max_retries`` attempts are made. After each failed attempt a
        progress message is printed; after the last one a final failure
        message is printed instead.

        Args:
            img_url: Address of the image.
            max_retries: Maximum number of attempts (default 5).

        Returns:
            The response content as ``bytes``, or ``None`` if every attempt
            failed.

        Example:
            image_data = Requester().fetch_image("https://example.com/image.jpg")
            if image_data is None:
                print("download failed")
        """
        # NOTE(review): no timeout is passed to requests.get, so a stalled
        # server can block this call indefinitely — consider adding one.
        for retry in range(max_retries):
            try:
                with requests.get(img_url, stream=True) as response:
                    response.raise_for_status()
                    return response.content
            except Exception:
                if retry < max_retries - 1:
                    print(
                        f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                    )
                else:
                    print("Failed to download image after multiple retries, skipping.")
        return None

    def fetch_images_to_img_obj(self, web_img: "WebImg"):
        """Download ``web_img.url`` and store the bytes in ``web_img.data``.

        Raises:
            Exception: If ``fetch_image`` returns ``None`` for the URL.
        """
        data = self.fetch_image(web_img.url)
        if data is None:
            task_name = web_img.task_name
            print(f"{task_name}, 下载图片失败")
            raise Exception(f"{task_name}, 下载图片失败")
        web_img.data = data

    def batch_fetch_images_to_img_obj_list(self, web_img_list: "list[WebImg]"):
        """Fill ``data`` on every item of ``web_img_list`` using a thread pool.

        Raises:
            Exception: Re-raises the first failure (in list order) reported by
                ``fetch_images_to_img_obj``.
        """
        with ThreadPoolExecutor() as executor:
            # Executor.map only raises a worker's exception when its result is
            # retrieved; draining the iterator makes failures surface here
            # instead of being silently discarded.
            list(executor.map(self.fetch_images_to_img_obj, web_img_list))
|
||||||
7
test.py
Normal file
7
test.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from web_img import WebImg
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build a sample ``WebImg`` and print it as a quick manual check."""
    i = WebImg("name", "url.com")
    print(i)


# Guard the entry point so importing this module does not run the check.
if __name__ == "__main__":
    main()
|
||||||
5
web_img.py
Normal file
5
web_img.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
class WebImg:
    """An image tied to a task: its source URL and, once set, its data."""

    def __init__(self, file_name: str, url: str):
        """Create an image record with no data yet.

        Args:
            file_name: Stored as ``task_name``; the parameter keeps its
                original name so existing callers are unaffected.
            url: Address of the image.
        """
        self.task_name = file_name
        self.url = url
        # Starts as None; filled in later by whoever downloads the image.
        self.data = None

    def __repr__(self):
        """Return a readable summary; the payload is shown as a length only."""
        size = None if self.data is None else len(self.data)
        return (
            f"{type(self).__name__}(task_name={self.task_name!r}, "
            f"url={self.url!r}, data_len={size!r})"
        )
|
||||||
18
web_parser.py
Normal file
18
web_parser.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from web_img import WebImg
|
||||||
|
|
||||||
|
|
||||||
|
class WebParser:
    """Extract image URLs from an HTML document."""

    def __init__(self, html_content: str):
        """Keep the raw HTML text; parsing happens in ``parse_img_urls``."""
        self.html_content = html_content

    def parse_img_urls(self):
        """Return the ``data-src`` value of every ``<img>`` in the reading area.

        The reading area is the first ``<div class="reading-content">`` in
        the document.

        Returns:
            A list of the ``data-src`` attribute values, in document order.

        Raises:
            ValueError: If the document has no ``reading-content`` div.
            KeyError: If an ``<img>`` in that div has no ``data-src``.
        """
        soup = BeautifulSoup(self.html_content, "html.parser")
        container = soup.find("div", class_="reading-content")
        if container is None:
            # find() returns None on no match; fail with an explicit message
            # rather than an AttributeError on None.
            raise ValueError('no <div class="reading-content"> found in the HTML')
        return [img_tag.attrs["data-src"] for img_tag in container.find_all("img")]
|
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in New Issue
Block a user