pkdoutu 图片

multiThreadsSpider.py 多线程抓取图片

import requests
from lxml import etree
import os
import time
import threading

# Page that is fetched and scanned for image links.
url = 'https://www.pkdoutu.com/'

# Headers sent with every HTTP request in this script; the User-Agent is a
# desktop Chrome string (presumably so the site serves the normal page --
# NOTE(review): confirm the site actually rejects the default client UA).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}


def getImgUrls(url, headers):
    """Fetch a page and return the image URLs found in it.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP headers sent with the request.

    Returns:
        A list with the ``data-original`` attribute of every matching
        ``<img>`` element; empty when nothing matches.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled server would block the script forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of parsing an error page into an empty result.
    res.raise_for_status()
    selector = etree.HTML(res.text)
    # The second <img> inside each link holds the real image address in
    # its data-original attribute.
    return selector.xpath('//div[@class="pic-content text-center"]/div/a//img[2]/@data-original')


def downloadImg(imgUrl):
    """Download one image into the ``images`` directory.

    The file name is the last path segment of ``imgUrl``. The ``images``
    directory must already exist.

    Args:
        imgUrl: Absolute URL of the image to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the target file cannot be written.
    """
    # Short pause before each request (presumably to go easy on the server).
    time.sleep(0.2)
    name = imgUrl.split('/')[-1]
    # stream=True avoids loading the whole body into memory up front; the
    # context manager releases the connection even if writing fails.
    with requests.get(imgUrl, headers=headers, stream=True, timeout=10) as res:
        # Check the status before opening the file so an error page is never
        # saved under an image name.
        res.raise_for_status()
        with open(os.path.join('images', name), 'wb') as f:
            # iter_content() defaults to 1-byte chunks; read in 8 KiB pieces.
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)
    print('{} is downloaded'.format(name))


# Collect every image URL from the page before any worker is started.
imgUrls = getImgUrls(url, headers)

# exist_ok=True replaces the exists()/mkdir() pair, which could fail if the
# directory appeared between the check and the creation.
os.makedirs('images', exist_ok=True)

print('Start download')

# perf_counter() is monotonic, so the measured duration is not affected by
# system clock adjustments.
start = time.perf_counter()

# Start one thread per image; the downloads are I/O-bound, so their waits
# overlap.
threadList = []
for imgUrl in imgUrls:
    t = threading.Thread(target=downloadImg, args=(imgUrl,))
    t.start()
    threadList.append(t)

# Wait for every download to finish before reporting the elapsed time.
for t in threadList:
    t.join()

print('End download,耗时: {}s'.format(time.perf_counter() - start))

asyncSpider.py 异步协程抓取图片

Last updated