豆瓣
提取豆瓣250:电影名、评分、评论人数
# This example extracts the data with XPath.
from lxml import etree
import re

# Read the locally saved Top250 page. The encoding is passed explicitly so the
# Chinese text decodes the same way on every platform (assumes the saved page
# is UTF-8 -- adjust if the file was saved with a different encoding).
with open('/Users/will/Desktop/Top250.html', 'r', encoding='utf-8') as f:
    data = f.read()

selector = etree.HTML(data)
# One <div class="item"> node is selected per entry to extract from.
items = selector.xpath('//div[@class="item"]')

# Compiled once outside the loop; a raw string keeps \d a regex escape rather
# than an invalid string escape.
digits_re = re.compile(r'\d+')

# Collected as (title, rating, comment count) tuples; all values are strings.
infos = []
for item in items:
    # First <span class="title"> under the item.
    title = item.xpath('.//span[@class="title"][1]/text()')[0]
    rating_num = item.xpath('.//span[@class="rating_num"]/text()')[0]
    # The last <span> inside the "star" div carries the comment-count text;
    # only its first run of digits is kept.
    comment = item.xpath('.//div[@class="star"]//span[last()]/text()')[0]
    comment_num = digits_re.search(comment).group()
    info = (title, rating_num, comment_num)
    infos.append(info)
print(infos)
提取豆瓣“选电影”
Last updated