pkdoutu 图片

multiThreadsSpider.py 多线程抓取图片

import requests
from lxml import etree
import os
import time
import threading

# Page that is scraped for image URLs.
url = 'https://www.pkdoutu.com/'

# Headers sent with every HTTP request made by this script; the User-Agent
# is a desktop Chrome string (presumably so the site serves its normal page).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}


def getImgUrls(url, headers, timeout=10):
    """Fetch a page and return the image URLs found on it.

    Args:
        url: Address of the page to scrape.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        A list holding the ``data-original`` attribute of every matching
        ``<img>`` element; empty when nothing on the page matches.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page
    # into an empty result.
    res.raise_for_status()
    selector = etree.HTML(res.text)
    # Takes the second <img> under each link of a "pic-content text-center"
    # card and reads its data-original attribute.
    return selector.xpath('//div[@class="pic-content text-center"]/div/a//img[2]/@data-original')


def downloadImg(imgUrl, timeout=10):
    """Download one image into the ``images/`` directory.

    The file is named after the last ``/``-separated segment of ``imgUrl``.
    The ``images`` directory must already exist.

    Args:
        imgUrl: Absolute URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (nothing is written in that case).
    """
    # Short pause before each request (presumably to go easy on the server).
    time.sleep(0.2)
    name = imgUrl.split('/')[-1]
    # Stream the body and use a real chunk size: iter_content() defaults to
    # 1-byte chunks, which meant one write() call per byte of image data.
    with requests.get(imgUrl, headers=headers, timeout=timeout, stream=True) as res:
        # Do not save an HTTP error page under an image file name.
        res.raise_for_status()
        with open('images/' + name, 'wb') as f:
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)
    print('{} is downloaded'.format(name))


# Collect every image URL from the page before any worker is created.
imgUrls = getImgUrls(url, headers)

# downloadImg() writes into ./images, so create it on the first run.
if not os.path.exists('images'):
    os.mkdir('images')

print('Start download')

start = time.time()

# One worker thread per image URL.
threadList = [
    threading.Thread(target=downloadImg, args=(imgUrl,))
    for imgUrl in imgUrls
]

# Launch all workers, then block until the last one has finished.
for t in threadList:
    t.start()
for t in threadList:
    t.join()

elapsed = time.time() - start
print('End download,耗时: {}s'.format(elapsed))

asyncSpider.py 异步协程抓取图片

import asyncio
import aiohttp
from lxml import etree
import os
import time

# Page that is scraped for image URLs.
url = 'https://www.pkdoutu.com/'

# Headers sent with every HTTP request made by this script; the User-Agent
# is a desktop Chrome string (presumably so the site serves its normal page).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

async def getImgUrls(url, headers):
    """Fetch a page asynchronously and return the image URLs found on it.

    Args:
        url: Address of the page to scrape.
        headers: HTTP headers sent with the request.

    Returns:
        A list holding the ``data-original`` attribute of every matching
        ``<img>`` element.
    """
    # Second <img> under each link of a "pic-content text-center" card.
    imgXpath = '//div[@class="pic-content text-center"]/div/a//img[2]/@data-original'

    async with aiohttp.ClientSession() as session:
        # ssl=False switches off TLS certificate verification for this request.
        async with session.get(url, headers=headers, ssl=False) as response:
            # Raw bytes of the body are handed straight to the HTML parser.
            body = await response.content.read()
            document = etree.HTML(body)
            return document.xpath(imgXpath)


async def downloadImg(imgUrl):
    """Download one image asynchronously into the ``images/`` directory.

    The file is named after the last ``/``-separated segment of ``imgUrl``.
    The ``images`` directory must already exist.

    Args:
        imgUrl: Absolute URL of the image to fetch.
    """
    name = imgUrl.rsplit('/', 1)[-1]
    target = 'images/' + name

    async with aiohttp.ClientSession() as session:
        # ssl=False switches off TLS certificate verification for this request.
        async with session.get(imgUrl, headers=headers, ssl=False) as response:
            # The file is opened before the body is read, then the whole
            # body is written in a single call.
            with open(target, 'wb') as out:
                payload = await response.content.read()
                out.write(payload)

            print('{} is downloaded'.format(name))


async def mainTask():
    """Scrape the image URLs and download all of them concurrently.

    One task is created per URL returned by ``getImgUrls``; the coroutine
    returns once every task has finished. When the page yields no URLs it
    returns immediately without downloading anything.
    """
    imgUrls = await getImgUrls(url, headers)
    tasks = [asyncio.create_task(downloadImg(imgUrl)) for imgUrl in imgUrls]

    # asyncio.wait() raises ValueError when given an empty collection, so a
    # page with no matching images must be handled before calling it.
    if not tasks:
        return

    await asyncio.wait(tasks)


if __name__ == '__main__':
    # The timer covers directory setup as well as the downloads.
    start = time.time()

    # downloadImg() writes into ./images, so create it on the first run.
    if not os.path.exists('images'):
        os.mkdir('images')

    # Run the whole scrape-and-download pipeline on a fresh event loop.
    asyncio.run(mainTask())

    elapsed = time.time() - start
    print('End download,耗时: {}s'.format(elapsed))

Last updated