Scraping Images from a Target Website with Python
Test domain: https://www.xgzwk.com (tested successfully, works nicely).
The script below focuses on downloading original images while keeping the other core features (multithreading, exception handling, anti-scraping measures, etc.):
import os
import random
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from io import BytesIO
# -------------------- Core features --------------------
def get_html_content(url, headers=None, max_retries=3):
    """Fetch a page's HTML content (with a simple retry mechanism)."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
            time.sleep(random.uniform(1, 2))  # brief backoff before retrying
    return None
def extract_image_urls(html_content, base_url):
    """Parse the HTML and extract all JPG image URLs (resolving relative paths)."""
    soup = BeautifulSoup(html_content, 'html.parser')
    img_tags = soup.find_all('img', src=True)
    image_urls = []
    for img in img_tags:
        img_url = img['src']
        if not img_url.lower().endswith('.jpg'):
            continue
        # Convert relative paths to absolute URLs
        absolute_url = urljoin(base_url, img_url)
        image_urls.append(absolute_url)
    return list(set(image_urls))  # deduplicate
def download_image(img_url, save_folder, headers=None):
    """Download a single image (with integrity check)."""
    try:
        response = requests.get(img_url, headers=headers, stream=True, timeout=15)
        if response.status_code == 200:
            # Check that the payload is a valid image file
            img_data = BytesIO(response.content)
            try:
                Image.open(img_data).verify()  # verify image integrity
                img_data.seek(0)
                # Build a safe file name
                img_name = os.path.basename(urlparse(img_url).path)
                if not img_name:
                    img_name = f"img_{hash(img_url)}.jpg"
                save_path = os.path.join(save_folder, img_name)
                with open(save_path, 'wb') as f:
                    f.write(img_data.read())
                print(f"Downloaded: {save_path}")
                return True
            except Exception as e:
                print(f"Image validation failed: {img_url} - error: {e}")
        else:
            print(f"Download failed: HTTP {response.status_code}")
    except Exception as e:
        print(f"Download error: {img_url} - error: {e}")
    return False
# -------------------- Extended features --------------------
def batch_download_with_threadpool(image_urls, save_folder, max_workers=5):
    """Download images in parallel with a thread pool."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for url in image_urls:
            futures.append(executor.submit(
                download_image,
                url,
                save_folder,
                {'User-Agent': 'Mozilla/5.0'}
            ))
            time.sleep(random.uniform(0.3, 1.2))  # random delay to avoid bans
        # Wait for all tasks to finish
        for future in as_completed(futures):
            future.result()
# -------------------- Main function --------------------
def crawl_jpg_images(target_url, save_folder='downloaded_images', enable_threadpool=True):
    """Main crawl function."""
    # Initial setup
    os.makedirs(save_folder, exist_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Referer': target_url
    }
    # Fetch the page
    html_content = get_html_content(target_url, headers)
    if not html_content:
        return
    # Extract image URLs
    image_urls = extract_image_urls(html_content, target_url)
    print(f"Found {len(image_urls)} JPG images")
    # Download (single-threaded or multithreaded)
    if enable_threadpool:
        batch_download_with_threadpool(image_urls, save_folder)
    else:
        for idx, url in enumerate(image_urls):
            print(f"Downloading ({idx+1}/{len(image_urls)}): {url}")
            download_image(url, save_folder, headers)
            time.sleep(random.uniform(0.5, 1.5))
if __name__ == "__main__":
    target_url = "https://www.xgzwk.com"
    crawl_jpg_images(target_url, enable_threadpool=True)

Main optimizations:
- Proxy download support fully removed: the proxy-related functions and their call sites were deleted.
- Core download logic kept:
  - requests with stream=True (note: download_image still reads response.content, buffering the whole file; a truly chunked variant is sketched right after this list)
  - Pillow verification of image integrity
  - automatic fallback file name (hash-based) when the URL path carries no usable name
- Improved stability:
  - thread-pool downloads with a capped worker count
  - random delays (0.3-1.2 s threaded, 0.5-1.5 s single-threaded) to dodge anti-scraping checks
  - a complete exception-handling chain
- Safe path handling:
  - urlparse to extract a clean file name
  - os.path.join for cross-platform compatibility
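As flagged in the list above, a download that actually streams to disk would use iter_content instead of response.content. A minimal sketch (download_image_chunked is a hypothetical helper; the 8 KB chunk size is an assumption):

import requests

def download_image_chunked(img_url, save_path, headers=None, chunk_size=8192):
    """Stream an image straight to disk in chunks instead of buffering it in memory."""
    with requests.get(img_url, headers=headers, stream=True, timeout=15) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)

The trade-off: the Pillow verify step can no longer run before the bytes hit disk, so you would verify the saved file afterwards and delete it if the check fails.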
Usage suggestions:
- Change the target site: replace target_url with the URL you actually want to scrape.
- Throttle request frequency: tune the time.sleep parameters to avoid getting banned.
- Extensions: if you need proxy support, modify the download_image function:

    # Proxy support inside download_image:
    proxies = {'http': 'http://proxy_ip:port', 'https': 'https://proxy_ip:port'}
    response = requests.get(url, headers=headers, proxies=proxies)

- Legal compliance: follow the target site's robots.txt rules; a programmatic check is sketched below.
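One way to run that robots.txt check in code is the standard library's urllib.robotparser. A minimal sketch (the user-agent string and URL reuse this post's test domain as assumptions):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

target_url = "https://www.xgzwk.com"
rp = RobotFileParser()
rp.set_url(urljoin(target_url, "/robots.txt"))
rp.read()  # fetch and parse robots.txt

if rp.can_fetch("Mozilla/5.0", target_url):
    print("robots.txt allows crawling", target_url)
else:
    print("robots.txt disallows crawling", target_url)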
This code combines best practices from several sources, keeping the functionality complete while dropping unnecessary image-processing steps. For further optimization, consider an async-IO approach (e.g. aiohttp) or dynamic-content handling (e.g. selenium).
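For the async-IO direction, a minimal aiohttp sketch of the download loop (fetch_image and download_all are hypothetical names; the semaphore limit of 5 and the index-based file names are assumptions):

import asyncio
import os

import aiohttp

async def fetch_image(session, url, path, sem):
    async with sem:  # cap the number of concurrent requests
        async with session.get(url) as resp:
            resp.raise_for_status()
            data = await resp.read()
    with open(path, 'wb') as f:
        f.write(data)

async def download_all(urls, folder):
    os.makedirs(folder, exist_ok=True)
    sem = asyncio.Semaphore(5)
    async with aiohttp.ClientSession(headers={'User-Agent': 'Mozilla/5.0'}) as session:
        tasks = [
            fetch_image(session, url, os.path.join(folder, f"img_{i}.jpg"), sem)
            for i, url in enumerate(urls)
        ]
        await asyncio.gather(*tasks, return_exceptions=True)

# Example: asyncio.run(download_all(image_urls, 'downloaded_images'))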
Source: https://uniomo.com/archives/shi-yong-python-pa-qu-mu-biao-wang-zhan-de-tu-pian