pkdoutu 图片
multiThreadsSpider.py 多线程抓取图片
import requests
from lxml import etree
import os
import time
import threading
# Landing page whose image list is scraped.
url = 'https://www.pkdoutu.com/'

# Request headers sent with every HTTP call; the User-Agent is a desktop
# Chrome string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/116.0.0.0 Safari/537.36'
    ),
}
def getImgUrls(url, headers):
    """Fetch a page and return the image URLs found in it.

    The URLs are the ``data-original`` attribute values selected by the
    XPath below: the second ``<img>`` inside each anchor under
    ``div.pic-content.text-center > div``.

    Args:
        url: Address of the page to fetch.
        headers: Dict of HTTP headers sent with the request.

    Returns:
        A list with one attribute value per matching ``<img>`` element;
        empty when nothing matches.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
    """
    # Without a timeout a stalled server would block this call forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Surface HTTP errors instead of parsing an error page into an empty list.
    res.raise_for_status()
    selector = etree.HTML(res.text)
    imgUrls = selector.xpath(
        '//div[@class="pic-content text-center"]/div/a//img[2]/@data-original'
    )
    return imgUrls
def downloadImg(imgUrl):
    """Download one image into the ``images/`` directory.

    The file name is the last ``/``-separated segment of ``imgUrl``. The
    ``images`` directory must already exist. Uses the module-level
    ``headers`` for the request and prints a line once the file is written.

    Args:
        imgUrl: URL of the image to fetch.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
        OSError: If the target file cannot be opened or written.
    """
    time.sleep(0.2)  # brief pause before issuing the request
    name = imgUrl.split('/')[-1]
    # Without a timeout a stalled server would block this worker forever.
    res = requests.get(imgUrl, headers=headers, timeout=10)
    # Fail instead of saving an HTTP error page under an image file name.
    res.raise_for_status()
    with open('images/' + name, 'wb') as f:
        # The body is already fully loaded (the request is not streamed), so
        # write it in one call; iter_content() defaults to 1-byte chunks,
        # which meant one write() per byte.
        f.write(res.content)
    print('{} is downloaded'.format(name))
# Driver: scrape the image URLs, then download them concurrently with one
# thread per image and report the total elapsed time.
imgUrls = getImgUrls(url, headers)

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs('images', exist_ok=True)

print('Start download')
# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments.
start = time.perf_counter()

threadList = []
for imgUrl in imgUrls:
    t = threading.Thread(target=downloadImg, args=(imgUrl,))
    t.start()
    threadList.append(t)

# Wait for every download thread to finish before reporting the duration.
for t in threadList:
    t.join()

print('End download,耗时: {}s'.format(time.perf_counter() - start))
asyncSpider.py 异步协程抓取图片
Last updated