pkdoutu 图片
multiThreadsSpider.py 多线程抓取图片
import requests
from lxml import etree
import os
import time
import threading
# Landing page whose image list is scraped.
url = 'https://www.pkdoutu.com/'

# Request headers sent with every HTTP call; only a desktop-browser
# User-Agent string is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/116.0.0.0 Safari/537.36'
    ),
}
def getImgUrls(url, headers):
    """Fetch a page and return the image URLs found on it.

    Args:
        url: Address of the page to scrape.
        headers: HTTP headers sent with the request.

    Returns:
        A list with the ``data-original`` attribute values matched by the
        XPath query; empty when nothing matches.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # requests has no default timeout; without one a stalled server would
    # block the script forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    res.raise_for_status()
    selector = etree.HTML(res.text)
    # Guard in case the parser produced no tree (e.g. an empty body), which
    # would otherwise surface as an AttributeError on ``.xpath``.
    if selector is None:
        return []
    return selector.xpath(
        '//div[@class="pic-content text-center"]/div/a//img[2]/@data-original'
    )
def downloadImg(imgUrl):
    """Download one image and store it under ``images/``.

    The file name is the last ``/``-separated segment of ``imgUrl``. The
    request uses the module-level ``headers``. The ``images`` directory must
    already exist.

    Args:
        imgUrl: Absolute URL of the image to download.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (nothing is written in that case).
    """
    # Short pause before each request, kept from the original
    # (presumably to go easy on the server).
    time.sleep(0.2)
    name = imgUrl.split('/')[-1]
    # stream=True lets the body be written in chunks rather than loaded whole;
    # the timeout keeps a stalled connection from hanging the thread.
    with requests.get(imgUrl, headers=headers, stream=True, timeout=10) as res:
        # Do not save an HTML error page under an image file name.
        res.raise_for_status()
        with open('images/' + name, 'wb') as f:
            # iter_content() defaults to 1-byte chunks; use a sensible size.
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)
    print('{} is downloaded'.format(name))
# Script body: collect the image URLs, then download each one in its own
# thread and report the total elapsed time.
imgUrls = getImgUrls(url, headers)

# exist_ok=True replaces the check-then-create pair: it is a no-op when the
# directory is already present and has no window between check and mkdir.
os.makedirs('images', exist_ok=True)

print('Start download')
start = time.time()

# Start one thread per image first, then wait for all of them, so the
# downloads overlap instead of running one after another.
threadList = []
for imgUrl in imgUrls:
    t = threading.Thread(target=downloadImg, args=(imgUrl,))
    t.start()
    threadList.append(t)
for t in threadList:
    t.join()

print('End download,耗时: {}s'.format(time.time() - start))
asyncSpider.py 异步协程抓取图片
import asyncio
import aiohttp
from lxml import etree
import os
import time
# Landing page whose image list is scraped.
url = 'https://www.pkdoutu.com/'

# Request headers sent with every HTTP call; only a desktop-browser
# User-Agent string is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/116.0.0.0 Safari/537.36'
    ),
}
async def getImgUrls(url, headers):
    """Fetch a page asynchronously and return the image URLs found on it.

    Args:
        url: Address of the page to scrape.
        headers: HTTP headers sent with the request.

    Returns:
        A list with the ``data-original`` attribute values matched by the
        XPath query; empty when nothing matches.

    Raises:
        aiohttp.ClientError: On connection failure or a non-success HTTP
            status.
    """
    async with aiohttp.ClientSession() as session:
        # NOTE(review): ssl=False turns off TLS certificate verification;
        # kept from the original -- confirm it is really required.
        async with session.get(url, headers=headers, ssl=False) as response:
            # Fail loudly on 4xx/5xx instead of parsing an error page.
            response.raise_for_status()
            text = await response.read()
    selector = etree.HTML(text)
    # Guard in case the parser produced no tree (e.g. an empty body), which
    # would otherwise surface as an AttributeError on ``.xpath``.
    if selector is None:
        return []
    return selector.xpath(
        '//div[@class="pic-content text-center"]/div/a//img[2]/@data-original'
    )
async def downloadImg(imgUrl):
    """Download one image asynchronously and store it under ``images/``.

    The file name is the last ``/``-separated segment of ``imgUrl``. The
    request uses the module-level ``headers``. The ``images`` directory must
    already exist.

    Args:
        imgUrl: Absolute URL of the image to download.

    Raises:
        aiohttp.ClientError: On connection failure or a non-success HTTP
            status (nothing is written in that case).
    """
    name = imgUrl.split('/')[-1]
    async with aiohttp.ClientSession() as session:
        # NOTE(review): ssl=False turns off TLS certificate verification;
        # kept from the original -- confirm it is really required.
        async with session.get(imgUrl, headers=headers, ssl=False) as response:
            # Do not save an HTML error page under an image file name.
            response.raise_for_status()
            data = await response.read()
    # Open the file only after the whole body has arrived, so a failed
    # download never leaves an empty or truncated file behind.
    with open('images/' + name, 'wb') as f:
        f.write(data)
    print('{} is downloaded'.format(name))
async def mainTask():
    """Scrape the image URLs and download all of them concurrently.

    One task is created per image URL and all tasks are awaited together.
    A failed download is reported on stdout and does not stop the others.
    """
    imgUrls = await getImgUrls(url, headers)
    tasks = [asyncio.create_task(downloadImg(imgUrl)) for imgUrl in imgUrls]
    # gather() accepts an empty argument list, whereas asyncio.wait() raises
    # ValueError when no images were found. return_exceptions=True keeps the
    # run best-effort and retrieves every task's exception.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for imgUrl, result in zip(imgUrls, results):
        if isinstance(result, Exception):
            print('{} failed: {!r}'.format(imgUrl, result))
if __name__ == '__main__':
    # Script entry point: make sure the output folder exists, run the
    # asynchronous downloads, and report the total elapsed time.
    start = time.time()
    # exist_ok=True replaces the check-then-create pair: it is a no-op when
    # the directory is already present.
    os.makedirs('images', exist_ok=True)
    asyncio.run(mainTask())
    print('End download,耗时: {}s'.format(time.time() - start))
Last updated