17K 小说
17k_spider.py
写入本地文件函数版本
import requests
from lxml import etree
import os
import time
import random
# Shared HTTP session used by login() and by every later page fetch, so that
# whatever cookies the login response sets are sent with the following requests.
session = requests.session()
def login(loginName='15676752369', password='9217K..', timeout=10):
    """Log in to 17k.com through the shared module-level ``session``.

    The POST is sent with the shared session so that later requests made
    through the same session reuse whatever cookies the server sets.

    Args:
        loginName: Account name sent as the ``loginName`` form field.
        password: Password sent as the ``password`` form field.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of hanging forever.

    Returns:
        The ``requests.Response`` of the login call, so callers can inspect
        the status if they want to (previous callers may ignore it).
    """
    # NOTE(review): the default credentials are kept only for backward
    # compatibility; they should be moved out of source control.
    urlLogin = 'https://passport.17k.com/ck/user/login'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    dataLogin = {
        'loginName': loginName,
        'password': password,
    }
    return session.post(urlLogin, headers=headers, data=dataLogin, timeout=timeout)
def getShelfInfo(shelfUrl, timeout=10):
    """Fetch the bookshelf JSON and build one link record per book.

    The response body is parsed as JSON and its ``'data'`` entry is iterated;
    each item must provide ``'bookName'`` and ``'bookId'``.

    Args:
        shelfUrl: URL of the bookshelf endpoint.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``bookName``, ``bookId``, ``bookUrl``
        (book detail page) and ``chaptersUrl`` (chapter list page).

    Raises:
        KeyError: If the JSON payload has no ``'data'`` entry or an item
            lacks ``'bookName'`` / ``'bookId'``.
    """
    response = session.get(shelfUrl, timeout=timeout)
    items = response.json()['data']
    return [
        {
            'bookName': item['bookName'],
            'bookId': item['bookId'],
            'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
            'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html',
        }
        for item in items
    ]
def getListChapters(list_books, timeout=10):
    """Collect the chapter page links of every book on the shelf.

    For each book the chapter list page is downloaded, decoded as UTF-8 and
    searched for the anchors under the volume whose title span contains
    "正文".

    Args:
        list_books: Book records as returned by ``getShelfInfo``; the keys
            ``chaptersUrl`` and ``bookName`` are read.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list with one ``[bookName, pageLinks]`` entry per book, where
        ``pageLinks`` is a list of ``(chapterName, chapterUrl)`` tuples.
    """
    listChapters = []
    for book in list_books:
        response = session.get(book['chaptersUrl'], timeout=timeout)
        response.encoding = 'utf-8'
        selector = etree.HTML(response.text)
        anchors = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')

        pageLinks = []
        for anchor in anchors:
            names = anchor.xpath('./span/text()')
            hrefs = anchor.xpath('./@href')
            # Skip malformed anchors instead of crashing with IndexError.
            if not names or not hrefs:
                continue
            pageLinks.append((names[0].strip(), 'https://www.17k.com' + hrefs[0]))

        listChapters.append([book['bookName'], pageLinks])
    return listChapters
def getContent(listChapters, timeout=10):
    """Download every chapter page and hand its paragraphs to ``downloadContent``.

    Args:
        listChapters: ``[bookName, pageLinks]`` entries as returned by
            ``getListChapters``; each page link is a
            ``(chapterName, chapterUrl)`` pair.
        timeout: Seconds to wait for each chapter page before giving up.

    Returns:
        None. The extracted text is written to disk by ``downloadContent``.
    """
    for book in listChapters:
        print(book)
        bookName = book[0]
        for chapterName, chapterUrl in book[1]:
            response = session.get(chapterUrl, timeout=timeout)
            response.encoding = 'utf-8'
            selector_content = etree.HTML(response.text)
            # The last <p> of the reading area is excluded by the XPath.
            p_list_content = selector_content.xpath(
                '//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()'
            )
            # Persist this chapter to a local file.
            downloadContent(bookName, chapterName, p_list_content)
def downloadContent(bookName, chapterName, p_list_content, delay=None):
    """Append one chapter's paragraphs to ``<bookName>/<chapterName>.txt``.

    Args:
        bookName: Directory that receives the chapter files; created when
            missing.
        chapterName: Chapter title used as the file name. Path separators in
            it are replaced by ``_`` so the title cannot escape the book
            directory or point at a missing sub-directory.
        p_list_content: Iterable of paragraph strings; each is written
            followed by a newline.
        delay: Seconds to sleep before writing (throttles the crawl). When
            ``None`` a random pause between 0.3 and 1.5 seconds is used.

    Returns:
        None.
    """
    if delay is None:
        delay = random.uniform(0.3, 1.5)
    time.sleep(delay)

    # Create the book directory; exist_ok avoids the check-then-create race.
    os.makedirs(bookName, exist_ok=True)

    print('《{}》{}: 开始写入本地数据'.format(bookName, chapterName))

    # No paragraphs means no file is created.
    if not p_list_content:
        return

    safeChapterName = chapterName.replace('/', '_').replace('\\', '_')
    chapterPath = os.path.join(bookName, safeChapterName + '.txt')
    # Open once per chapter, with an explicit encoding so Chinese text is
    # written the same way on every platform.
    with open(chapterPath, 'a', encoding='utf-8') as f:
        f.writelines(p + '\n' for p in p_list_content)
# Big step 0: ------------- log in -------------
login()
# Big step 1: ------------- fetch the bookshelf info -------------
shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
list_books = getShelfInfo(shelfUrl)
# Big step 2: ------------- collect the chapter page links -------------
listChapters = getListChapters(list_books)
# Big step 3: ------------- fetch each chapter's text and write it to local files -------------
getContent(listChapters)
# Page heading that was fused onto the line above in the source listing:
# "local-file writer, class-based version".
写入 MySQL 类版本
模拟登录获取书架数据
Last updated