豆瓣
提取豆瓣 Top250:电影名、评分、评论人数
# This example is implemented with XPath.
import re

from lxml import etree

# Read the locally saved page. The encoding is passed explicitly so the
# Chinese text does not depend on the platform's default locale encoding
# (assumes the saved page is UTF-8 -- TODO confirm for your copy).
with open('/Users/will/Desktop/Top250.html', 'r', encoding='utf-8') as f:
    data = f.read()

selector = etree.HTML(data)
# One <div class="item"> element is selected per entry to extract.
items = selector.xpath('//div[@class="item"]')

# Compiled once outside the loop; the raw string avoids the invalid "\d"
# escape that a plain string literal triggers.
comment_num_pattern = re.compile(r'\d+')

# Collected (title, rating_num, comment_num) tuples; all values are strings.
infos = []
for item in items:
    titles = item.xpath('.//span[@class="title"][1]/text()')
    rating_nums = item.xpath('.//span[@class="rating_num"]/text()')
    comments = item.xpath('.//div[@class="star"]//span[last()]/text()')

    # xpath() returns a list; skip entries that lack any of the three fields
    # instead of crashing with IndexError on [0].
    if not (titles and rating_nums and comments):
        continue

    # The comment text embeds the count among other characters; keep only
    # the first run of digits.
    match = comment_num_pattern.search(comments[0])
    if match is None:
        continue

    info = (titles[0], rating_nums[0], match.group())
    infos.append(info)
print(infos)

提取豆瓣“选电影”
https://movie.douban.com/explore
提示:此页面有 Referer 反爬措施,请求时需要在请求头中携带 Referer
import requests

# Recommendation API URL; the query string asks for 20 entries from offset 0.
url = 'https://m.douban.com/rexxar/api/v2/movie/recommend?refresh=0&start=0&count=20&selected_categories=%7B%7D&uncollect=false&tags='
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    # Sent because of the site's Referer-based anti-scraping check.
    'Referer': 'https://movie.douban.com/explore'
}

# A timeout keeps the script from hanging forever if the server never answers.
res = requests.get(url, headers=header, timeout=10)

# Show the status before parsing, so a failed request is visible even when
# the body is not valid JSON.
print(res.status_code)
# Stop with an HTTPError on 4xx/5xx instead of failing later inside .json().
res.raise_for_status()

# The Response object deserializes JSON itself, so the json module is not
# needed. A missing 'items' key is treated as an empty result.
data = res.json().get('items', [])

for item in data:
    # Look the title up once; entries without a (non-empty) title are skipped.
    title = item.get('title')
    if title:
        print(title)