75 lines
3.1 KiB
Python
75 lines
3.1 KiB
Python
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
|
||
class HtmlParser:
    """Extract image URLs from the HTML of a reader page.

    The parser looks for ``<img>`` tags inside the
    ``<div class="reading-content">`` container and reads the
    ``data-src`` attribute of each one.
    """

    def __init__(self, html_content: str):
        """Store the raw HTML that will be parsed.

        Args:
            html_content: HTML source of the page.
        """
        self.html_content = html_content

    def get_img_url_list(self) -> list:
        """Return the ``data-src`` URL of every image in the reading area.

        Returns:
            A list of URL strings in document order, each stripped of
            surrounding whitespace. The list is empty when the page has no
            ``reading-content`` container or when none of its images
            carries a non-empty ``data-src`` attribute.
        """
        soup = BeautifulSoup(self.html_content, "html.parser")
        container = soup.find("div", class_="reading-content")
        if container is None:
            # ``find`` returns None when nothing matches; calling
            # ``find_all`` on it would raise AttributeError.
            return []

        img_urls = []
        for img_tag in container.find_all("img"):
            # ``get`` tolerates images that lack the attribute, and the
            # strip removes whitespace/newlines around the attribute value.
            img_url = (img_tag.get("data-src") or "").strip()
            if img_url:
                img_urls.append(img_url)
        return img_urls
|
||
|
||
|
||
class Requester:
    """Thin wrapper around ``requests`` for fetching pages and images."""

    def fetch_html(self, url: str, task_name: str, max_retries=3, timeout=30):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch.
            task_name: Label of the calling task; used as the prefix of the
                error message when the download fails.
            max_retries: Retry count handed to the ``HTTPAdapter`` mounted
                for both ``http://`` and ``https://``.
            timeout: Seconds passed to ``requests`` as the request timeout,
                so a stalled server cannot block forever. Pass ``None`` to
                wait indefinitely.

        Returns:
            The decoded response body (``response.text``).

        Raises:
            Exception: If the request fails or the server answers with an
                error status. The underlying error is attached as
                ``__cause__``.
        """
        adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
        # The context manager closes the session (and its connection pool)
        # on every exit path.
        with requests.Session() as session:
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            try:
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                return response.text
            except Exception as e:
                print(f"Error occurred while fetching HTML from {url}: {e}")
                raise Exception(f"{task_name}, 获取网页html失败") from e

    def fetch_image(self, img_url: str, max_retries=5, timeout=30):
        """Download the image at ``img_url`` and return its raw bytes.

        The request is attempted up to ``max_retries`` times. After each
        failed attempt a progress message is printed; once the attempts are
        exhausted a final failure message is printed and ``None`` is
        returned instead of raising.

        Args:
            img_url: URL of the image.
            max_retries: Maximum number of download attempts. Defaults to 5.
            timeout: Seconds passed to ``requests`` as the request timeout,
                so a hung connection counts as a failed attempt rather than
                blocking forever. Pass ``None`` to wait indefinitely.

        Returns:
            The image content as ``bytes``, or ``None`` if every attempt
            failed (or ``max_retries`` is 0).

        Example:
            ```
            image_url = "https://example.com/image.jpg"
            image_data = Requester().fetch_image(image_url)
            if image_data:
                ...  # process the image data
            else:
                print("Could not download the image.")
            ```
        """
        for retry in range(max_retries):
            try:
                with requests.get(img_url, stream=True, timeout=timeout) as response:
                    response.raise_for_status()
                    return response.content
            except Exception:
                # Best-effort download: any failure just consumes one attempt.
                if retry < max_retries - 1:
                    print(
                        f"Failed to download image, retrying ({retry+1}/{max_retries})..."
                    )
                else:
                    print("Failed to download image after multiple retries, skipping.")
        return None
|