chinaz 站长素材图片
单页面图片抓取下载并将地址写入excel文件
sc_chinaz_single.py
import requests,time,os,pandas
from lxml import etree
# Scrape a single chinaz image channel page: download every listed image into
# ./images/ and record each image's address and title in img.xlsx.
url = 'https://sc.chinaz.com/tupian/jianzhutupian.html'
headers = {
    'Referer': 'https://sc.chinaz.com/tupian/',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}
# Directory the downloaded image files are written into.
image_dir = './images/'
# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the script forever.
request_timeout = 10

# open() does not create missing directories, so make sure the target exists.
os.makedirs(image_dir, exist_ok=True)

# Request the channel (listing) page. Nothing can be done without it, so an
# HTTP error here is allowed to abort the script.
response = requests.get(url, headers=headers, timeout=request_timeout)
response.raise_for_status()
response.encoding = 'utf-8'

# Extract the per-image item tags from the channel page.
tree = etree.HTML(response.text)
tags = tree.xpath('/html/body/div[3]/div[2]/div') if tree is not None else []

# Rows of [url, name]; collected in a list and turned into a table once at the
# end instead of growing a DataFrame one row at a time.
rows = []
for tag in tags:
    hrefs = tag.xpath('./div/a/@href')
    names = tag.xpath('./div/a/text()')
    if not hrefs or not names:
        # This div has no titled link, so there is no detail page to follow.
        continue

    # Pause between items to keep the request rate low.
    time.sleep(0.3)
    img_page_url = 'https://sc.chinaz.com' + hrefs[0]
    img_name = names[0]

    # Request the image detail page.
    res_img_page = requests.get(img_page_url, headers=headers, timeout=request_timeout)
    if not res_img_page.ok:
        print(f'Skipped {img_page_url}: HTTP {res_img_page.status_code}')
        continue
    res_img_page.encoding = 'utf-8'
    img_tree = etree.HTML(res_img_page.text)

    # Real address of the image file.
    srcs = [] if img_tree is None else img_tree.xpath(
        '/html/body/div[3]/div[1]/div[1]/div[2]/div[2]/img/@src')
    if not srcs:
        print(f'Skipped {img_page_url}: image address not found')
        continue
    src = srcs[0]
    # The scheme is only prepended when the src does not already carry one.
    img_url = src if src.startswith(('http://', 'https://')) else 'https:' + src
    # File extension for the local copy; a query string, if any, is not part of it.
    extension = os.path.splitext(img_url.split('?', 1)[0])[1]

    # Record the image in the table data.
    rows.append([img_url, img_name])

    # Request the image data.
    image = requests.get(img_url, headers=headers, timeout=request_timeout)
    if not image.ok:
        # Do not save an error response as if it were an image.
        print(f'Skipped {img_url}: HTTP {image.status_code}')
        continue

    # Path separators in the page title would otherwise be treated as directories.
    file_name = img_name.strip().replace('/', '_').replace('\\', '_') + extension
    with open(os.path.join(image_dir, file_name), 'wb') as f:
        f.write(image.content)

# Build the two-column table ("url", "name") and export it to a local Excel file.
table = pandas.DataFrame(rows, columns=['url', 'name'])
table.to_excel('img.xlsx', index=False, sheet_name='img_url')
print(table)
print('Images downloaded.')

多页面图片抓取下载并将地址写入excel文件
sc_chinaz_multi.py
import requests,time,os,pandas
from lxml import etree
# Scrape several pages of a chinaz image channel: download every listed image
# into ./images/ and record each image's address and title in img.xlsx.
url_init = 'https://sc.chinaz.com/tupian/jianzhutupian.html'
# The headers are the same for every request, so they are built once.
headers = {
    'Referer': 'https://sc.chinaz.com/tupian/',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}
# Directory the downloaded image files are written into.
image_dir = './images/'
# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the script forever.
request_timeout = 10

# open() does not create missing directories, so make sure the target exists.
os.makedirs(image_dir, exist_ok=True)

# Image records ({'url': ..., 'name': ...}) used to fill the pandas table later.
img_info = []
for page_num in range(1, 3):
    # The first page has no numeric suffix; later pages are "<name>_<n>.html".
    if page_num == 1:
        url = url_init
    else:
        url = f'https://sc.chinaz.com/tupian/jianzhutupian_{page_num}.html'

    # Request the channel (listing) page. A failed page is skipped so that the
    # records gathered from the other pages are still exported.
    response = requests.get(url, headers=headers, timeout=request_timeout)
    if not response.ok:
        print(f'Skipped {url}: HTTP {response.status_code}')
        continue
    response.encoding = 'utf-8'

    # Extract the per-image item tags from the channel page.
    tree = etree.HTML(response.text)
    tags = tree.xpath('/html/body/div[3]/div[2]/div') if tree is not None else []

    for tag in tags:
        hrefs = tag.xpath('./div/a/@href')
        names = tag.xpath('./div/a/text()')
        if not hrefs or not names:
            # This div has no titled link, so there is no detail page to follow.
            continue

        # Pause between items to keep the request rate low.
        time.sleep(0.3)
        img_page_url = 'https://sc.chinaz.com' + hrefs[0]
        img_name = names[0]

        # Request the image detail page.
        res_img_page = requests.get(img_page_url, headers=headers, timeout=request_timeout)
        if not res_img_page.ok:
            print(f'Skipped {img_page_url}: HTTP {res_img_page.status_code}')
            continue
        res_img_page.encoding = 'utf-8'
        img_tree = etree.HTML(res_img_page.text)

        # Real address of the image file.
        srcs = [] if img_tree is None else img_tree.xpath(
            '/html/body/div[3]/div[1]/div[1]/div[2]/div[2]/img/@src')
        if not srcs:
            print(f'Skipped {img_page_url}: image address not found')
            continue
        src = srcs[0]
        # The scheme is only prepended when the src does not already carry one.
        img_url = src if src.startswith(('http://', 'https://')) else 'https:' + src
        # File extension for the local copy; a query string, if any, is not part of it.
        extension = os.path.splitext(img_url.split('?', 1)[0])[1]

        # Record the image in the info list.
        img_info.append({'url': img_url, 'name': img_name})

        # Request the image data.
        image = requests.get(img_url, headers=headers, timeout=request_timeout)
        if not image.ok:
            # Do not save an error response as if it were an image.
            print(f'Skipped {img_url}: HTTP {image.status_code}')
            continue

        # Path separators in the page title would otherwise be treated as directories.
        file_name = img_name.strip().replace('/', '_').replace('\\', '_') + extension
        with open(os.path.join(image_dir, file_name), 'wb') as f:
            f.write(image.content)

# Build the two-column table from the collected records and export it to a
# local Excel file, once, after all pages have been processed.
table = pandas.DataFrame(columns=['url', 'name'], data=img_info)
table.to_excel('img.xlsx', index=False, sheet_name='img_url')
print(table)
print(f'Page{page_num} images downloaded.')

Last updated