chinaz 站长素材图片

单页面图片抓取下载,并将图片地址写入 Excel 文件

sc_chinaz_single.py

import requests,time,os,pandas
from lxml import etree


# Single-page scraper: reads one listing page, follows every item to its
# detail page, downloads the image found there into ./images/ and exports the
# collected (url, name) pairs to img.xlsx.

# Listing (channel) page to scrape
url = 'https://sc.chinaz.com/tupian/jianzhutupian.html'

headers = {
    'Referer': 'https://sc.chinaz.com/tupian/',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the script forever.
request_timeout = 10

# Make sure the output directory exists; open(..., 'wb') does not create it.
os.makedirs('./images/', exist_ok=True)

# Request the listing page
response = requests.get(url, headers=headers, timeout=request_timeout)
response.raise_for_status()
response.encoding = 'utf-8'

# Extract the per-image item nodes from the listing page
tree = etree.HTML(response.text)
tags = tree.xpath('/html/body/div[3]/div[2]/div')

# Rows for the final table; building the DataFrame once from a list avoids
# growing it row by row.
rows = []

for tag in tags:
    # Detail-page link and image title; items lacking either are skipped
    # instead of raising IndexError.
    hrefs = tag.xpath('./div/a/@href')
    names = tag.xpath('./div/a/text()')
    if not hrefs or not names:
        continue

    # Throttle requests to the site
    time.sleep(0.3)

    img_page_url = 'https://sc.chinaz.com' + hrefs[0]
    img_name = names[0]

    try:
        # Request the image detail page
        res_img_page = requests.get(img_page_url, headers=headers, timeout=request_timeout)
        res_img_page.raise_for_status()
        res_img_page.encoding = 'utf-8'
        img_tree = etree.HTML(res_img_page.text)

        # Real image address (the page gives a protocol-relative URL)
        srcs = img_tree.xpath('/html/body/div[3]/div[1]/div[1]/div[2]/div[2]/img/@src')
        if not srcs:
            print(f'Skipped {img_page_url}: image node not found')
            continue
        img_url = 'https:' + srcs[0]

        # Request the image data; the status check keeps an HTTP error body
        # from being written to disk as if it were an image.
        image = requests.get(img_url, headers=headers, timeout=request_timeout)
        image.raise_for_status()
    except requests.RequestException as exc:
        # One failing item should not abort the whole run.
        print(f'Skipped {img_page_url}: {exc}')
        continue

    # File extension taken from the image URL (used for the local file name)
    extension = os.path.splitext(img_url)[1]

    # Only images that were actually downloaded are recorded
    rows.append({'url': img_url, 'name': img_name})

    # Path separators in the title would point outside ./images/ or at a
    # directory that does not exist, so they are replaced in the file name.
    file_name = img_name.replace('/', '_').replace('\\', '_')
    with open('./images/' + file_name + extension, 'wb') as f:
        f.write(image.content)

# Build the table with two columns, "url" and "name"
table = pandas.DataFrame(rows, columns=['url', 'name'])

# Export the table to a local Excel file
table.to_excel('img.xlsx', index=False, sheet_name='img_url')
print(table)
print('Images downloaded.')

多页面图片抓取下载,并将图片地址写入 Excel 文件

sc_chinaz_multi.py

import requests,time,os,pandas
from lxml import etree


# Multi-page scraper: walks several listing pages, follows every item to its
# detail page, downloads the image found there into ./images/ and exports the
# collected (url, name) pairs to img.xlsx after each page.

# First listing page; later pages use the "_<page_num>" suffix form
url_init = 'https://sc.chinaz.com/tupian/jianzhutupian.html'

# Request headers are identical for every request, so they are built once
headers = {
    'Referer': 'https://sc.chinaz.com/tupian/',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the script forever.
request_timeout = 10

# Make sure the output directory exists; open(..., 'wb') does not create it.
os.makedirs('./images/', exist_ok=True)

# Image info accumulated across all pages, later loaded into the pandas table
img_info = []

for page_num in range(1, 3):
    if page_num == 1:
        url = url_init
    else:
        url = f'https://sc.chinaz.com/tupian/jianzhutupian_{page_num}.html'

    # Request the listing page
    response = requests.get(url, headers=headers, timeout=request_timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'

    # Extract the per-image item nodes from the listing page
    tree = etree.HTML(response.text)
    tags = tree.xpath('/html/body/div[3]/div[2]/div')

    for tag in tags:
        # Detail-page link and image title; items lacking either are skipped
        # instead of raising IndexError.
        hrefs = tag.xpath('./div/a/@href')
        names = tag.xpath('./div/a/text()')
        if not hrefs or not names:
            continue

        # Throttle requests to the site
        time.sleep(0.3)

        img_page_url = 'https://sc.chinaz.com' + hrefs[0]
        img_name = names[0]

        try:
            # Request the image detail page
            res_img_page = requests.get(img_page_url, headers=headers, timeout=request_timeout)
            res_img_page.raise_for_status()
            res_img_page.encoding = 'utf-8'
            img_tree = etree.HTML(res_img_page.text)

            # Real image address (the page gives a protocol-relative URL)
            srcs = img_tree.xpath('/html/body/div[3]/div[1]/div[1]/div[2]/div[2]/img/@src')
            if not srcs:
                print(f'Skipped {img_page_url}: image node not found')
                continue
            img_url = 'https:' + srcs[0]

            # Request the image data; the status check keeps an HTTP error
            # body from being written to disk as if it were an image.
            image = requests.get(img_url, headers=headers, timeout=request_timeout)
            image.raise_for_status()
        except requests.RequestException as exc:
            # One failing item should not abort the whole run.
            print(f'Skipped {img_page_url}: {exc}')
            continue

        # File extension taken from the image URL (used for the local file name)
        extension = os.path.splitext(img_url)[1]

        # Only images that were actually downloaded are recorded
        img_info.append({'url': img_url, 'name': img_name})

        # Path separators in the title would point outside ./images/ or at a
        # directory that does not exist, so they are replaced in the file name.
        file_name = img_name.replace('/', '_').replace('\\', '_')
        with open('./images/' + file_name + extension, 'wb') as f:
            f.write(image.content)

    # Build the table (two columns) from everything collected so far
    table = pandas.DataFrame(columns=['url', 'name'], data=img_info)
    # Rewrite the Excel file after every page; img_info is cumulative, so the
    # file always holds all pages processed up to this point.
    table.to_excel('img.xlsx', index=False, sheet_name='img_url')

    print(table)
    print(f'Page{page_num} images downloaded.')

Last updated