Scraping Images from a Target Website with Python

Test domain:

https://www.xgzwk.com

Tested it successfully, very nice.

The script below focuses on downloading original images while keeping the other core features (multithreading, exception handling, anti-scraping measures, and so on):

import os
import random
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from io import BytesIO

# -------------------- Core functionality --------------------
def get_html_content(url, headers=None, retries=3):
    """Fetch a page's HTML content (with a simple retry mechanism)."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Request failed (attempt {attempt + 1}/{retries}): {e}")
            time.sleep(random.uniform(1, 3))  # brief backoff before retrying
    return None

def extract_image_urls(html_content, base_url):
    """解析HTML并提取所有JPG图片URL(支持相对路径转换)"""
    soup = BeautifulSoup(html_content, 'html.parser')
    img_tags = soup.find_all('img', src=True)
    image_urls = []
    
    for img in img_tags:
        img_url = img['src']
        if not img_url.lower().endswith('.jpg'):
            continue
        
        # Convert relative paths to absolute URLs
        absolute_url = urljoin(base_url, img_url)
        image_urls.append(absolute_url)
    
    return list(set(image_urls))  # deduplicate

def download_image(img_url, save_folder, headers=None):
    """下载单张图片(含完整性校验)"""
    try:
        response = requests.get(img_url, headers=headers, stream=True, timeout=15)
        if response.status_code == 200:
            # Check that the payload is a valid JPG file
            img_data = BytesIO(response.content)
            try:
                Image.open(img_data).verify()  # verify image integrity
                img_data.seek(0)
                
                # Build a safe file name
                img_name = os.path.basename(urlparse(img_url).path)
                if not img_name:
                    img_name = f"img_{hash(img_url)}.jpg"
                
                save_path = os.path.join(save_folder, img_name)
                with open(save_path, 'wb') as f:
                    f.write(img_data.read())
                print(f"下载成功: {save_path}")
                return True
            except Exception as e:
                print(f"图片校验失败: {img_url} - 错误: {e}")
        else:
            print(f"下载失败: HTTP {response.status_code}")
    except Exception as e:
        print(f"下载异常: {img_url} - 错误: {e}")
    return False

# -------------------- Extended functionality --------------------
def batch_download_with_threadpool(image_urls, save_folder, headers=None, max_workers=5):
    """Download images in batch with a thread pool."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for url in image_urls:
            futures.append(executor.submit(
                download_image,
                url,
                save_folder,
                headers or {'User-Agent': 'Mozilla/5.0'}
            ))
            time.sleep(random.uniform(0.3, 1.2))  # random delay to avoid bans

        # Wait for all tasks to complete
        for future in as_completed(futures):
            future.result()

# -------------------- Main function --------------------
def crawl_jpg_images(target_url, save_folder='downloaded_images', enable_threadpool=True):
    """主爬取函数"""
    # 初始化设置
    os.makedirs(save_folder, exist_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Referer': target_url
    }
    
    # Fetch the page content
    html_content = get_html_content(target_url, headers)
    if not html_content:
        return
    
    # Extract image URLs
    image_urls = extract_image_urls(html_content, target_url)
    print(f"发现 {len(image_urls)} 张JPG图片")
    
    # Download the images (single-threaded or via thread pool)
    if enable_threadpool:
        batch_download_with_threadpool(image_urls, save_folder, headers)
    else:
        for idx, url in enumerate(image_urls):
            print(f"正在下载 ({idx+1}/{len(image_urls)}): {url}")
            download_image(url, save_folder, headers)
            time.sleep(random.uniform(0.5, 1.5))

if __name__ == "__main__":
    target_url = " https://www.xgzwk.com "
    crawl_jpg_images(target_url, enable_threadpool=True)
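
To test with a single thread first (easier to debug, gentler on the server), pass enable_threadpool=False:

crawl_jpg_images("https://www.xgzwk.com", enable_threadpool=False)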

Key optimizations:

  1. Proxy download support fully removed: the related function and its call sites are gone

  2. Core download logic retained

    • stream=True on download requests (note that the body is still buffered in memory so it can be verified)

    • Image integrity checked with Pillow

    • Automatic handling of missing or unusable file names (hash-based fallback)

  3. Improved stability

    • Thread-pool downloads with a bounded concurrency level

    • Random delays (0.3-1.5 s) to dodge anti-scraping checks

    • Thorough exception handling throughout

  4. Safe path handling (a short illustration follows this list)

    • urlparse extracts a clean file name from the URL

    • os.path.join keeps paths portable across platforms
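
A quick illustration of the file-name extraction (the URL is a made-up example):

import os
from urllib.parse import urlparse

url = "https://example.com/images/photo.jpg?size=large"  # hypothetical URL
print(os.path.basename(urlparse(url).path))  # -> photo.jpg (query string dropped)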

Usage tips:

  1. Change the target site: replace target_url with the site you actually want to crawl

  2. Control request frequency: tune the time.sleep parameters to avoid getting banned

  3. Extending the script

    # For proxy support, modify the download_image function:
    proxies = {'http': 'http://proxy_ip:port', 'https': 'https://proxy_ip:port'}
    response = requests.get(url, headers=headers, proxies=proxies)
  4. Legal compliance: respect the target site's robots.txt rules (a minimal check is sketched after this list)
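
A minimal sketch of such a check, using the standard-library urllib.robotparser (the user agent string here is a placeholder):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def is_allowed(target_url, user_agent='Mozilla/5.0'):
    """Return True if robots.txt permits fetching target_url."""
    parser = RobotFileParser()
    parser.set_url(urljoin(target_url, '/robots.txt'))
    parser.read()  # fetch and parse the site's robots.txt
    return parser.can_fetch(user_agent, target_url)

if not is_allowed("https://www.xgzwk.com"):
    print("Crawling disallowed by robots.txt")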

This code pulls together best practices from several sources; it keeps the feature set complete while dropping unnecessary image-processing steps. For further optimization, look at an async-IO approach (such as aiohttp, sketched below) or dynamic-content handling (such as selenium).
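
As a rough starting point for the async-IO route, here is a minimal aiohttp sketch (assuming aiohttp is installed; it only downloads a list of already-extracted URLs and omits the integrity checks and delays of the main script):

import asyncio
import os
from urllib.parse import urlparse

import aiohttp

async def fetch_image(session, img_url, save_folder):
    """Download one image asynchronously and write it to disk."""
    async with session.get(img_url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
        if resp.status == 200:
            data = await resp.read()
            name = os.path.basename(urlparse(img_url).path) or f"img_{hash(img_url)}.jpg"
            with open(os.path.join(save_folder, name), 'wb') as f:
                f.write(data)

async def download_all(image_urls, save_folder='downloaded_images'):
    os.makedirs(save_folder, exist_ok=True)
    async with aiohttp.ClientSession(headers={'User-Agent': 'Mozilla/5.0'}) as session:
        await asyncio.gather(*(fetch_image(session, url, save_folder) for url in image_urls))

# asyncio.run(download_all(["https://example.com/a.jpg"]))  # hypothetical URL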


Source: Scraping Images from a Target Website with Python
https://uniomo.com/archives/shi-yong-python-pa-qu-mu-biao-wang-zhan-de-tu-pian
Author: 雨落秋垣, published 2025-09-15