17K Novels
17k_spider.py
Function version: write to local files
import requests
from lxml import etree
import os
import time
import random
# One shared session, so the login cookie is reused by every later request
session = requests.session()

# Log in
def login():
    urlLogin = 'https://passport.17k.com/ck/user/login'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    dataLogin = {
        'loginName': '15676752369',
        'password': '9217K..'
    }
    session.post(urlLogin, headers=headers, data=dataLogin)
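
# Note: after a successful POST, the login cookies are stored on the shared
# session object, so every later session.get() automatically sends them;
# no manual cookie handling is needed.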

# Get the link of every novel on the bookshelf page
def getShelfInfo(shelfUrl):
    data = session.get(shelfUrl)
    items = data.json()['data']
    list_books = []
    for item in items:
        list_books.append({'bookName': item['bookName'],
                           'bookId': item['bookId'],
                           'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
                           'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'
                           })
    return list_books
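
# Shape of one list_books element, with illustrative (not real) values:
# {'bookName': 'Example Book', 'bookId': 12345,
#  'bookUrl': 'https://www.17k.com/book/12345.html',
#  'chaptersUrl': 'https://www.17k.com/list/12345.html'}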

# From the chapter-list page, get the link of each chapter's content page
def getListChapters(list_books):
    listChapters = []
    for book in list_books:
        chaptersUrl = book['chaptersUrl']
        chapterData = session.get(chaptersUrl)
        chapterData.encoding = 'utf-8'
        chapterHtmlData = chapterData.text
        selector = etree.HTML(chapterHtmlData)
        # "正文" ("main text") labels the chapter section on the list page
        aResults = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')
        pageLinks = []
        for item in aResults:
            chapterName = item.xpath('./span/text()')[0].strip()
            chapterHref = item.xpath('./@href')[0]
            pageLink = chapterName, 'https://www.17k.com' + chapterHref
            pageLinks.append(pageLink)
        listChapters.append([book['bookName'], pageLinks])
    return listChapters

# Get the text of each chapter's content page
def getContent(listChapters):
    for book in listChapters:
        print(book)
        bookName = book[0]
        for chapter in book[1]:
            chapterName = chapter[0]
            chapterUrl = chapter[1]
            # print(bookName, chapterName, chapterUrl)
            content = session.get(chapterUrl)
            content.encoding = 'utf-8'
            data_content = content.text
            selector_content = etree.HTML(data_content)
            # Take every <p> in the reading area except the last one
            p_list_content = selector_content.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')
            # Download the novel: write the data to a local file
            downloadContent(bookName, chapterName, p_list_content)

# Download the novel: write the data to a local file
def downloadContent(bookName, chapterName, p_list_content):
    time.sleep(random.uniform(0.3, 1.5))
    # Create the book's folder
    if not os.path.exists(bookName):
        os.mkdir(bookName)
    # Write the chapter data
    print('《{}》{}: writing chapter to local file'.format(bookName, chapterName))
    # Open the chapter file once and append every paragraph
    with open(bookName + '/' + chapterName + '.txt', 'a', encoding='utf-8') as f:
        for p in p_list_content:
            f.write(p + '\n')

# Big step 0: ------------- log in -------------
login()
# Big step 1: ------------- get the bookshelf info -------------
shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
list_books = getShelfInfo(shelfUrl)
# print(list_books)
# Big step 2: ------------- get the chapter page links -------------
listChapters = getListChapters(list_books)
# print(listChapters)
# Big step 3: ------------- get the chapter text and write it to local files -------------
getContent(listChapters)
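
The chapter-list XPath works by climbing from the span whose text contains 正文 up two levels to the enclosing list element, then back down into every dd/a link. A minimal, self-contained sketch of that ../../ pattern; the HTML stub below is an assumed simplification for illustration, not 17k.com's actual markup:

from lxml import etree

# Assumed, simplified stub of the chapter-list structure.
html = """
<dl>
  <dt><span>正文</span></dt>
  <dd><a href="/chapter/1/1.html"><span>第一章</span></a></dd>
  <dd><a href="/chapter/1/2.html"><span>第二章</span></a></dd>
</dl>
"""
selector = etree.HTML(html)
# From the span containing 正文, go up to the <dl>, then into every dd/a.
for a in selector.xpath('//span[contains(text(), "正文")]/../../dd/a'):
    print(a.xpath('./span/text()')[0], 'https://www.17k.com' + a.xpath('./@href')[0])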
Class version: write to local files

import requests
from lxml import etree
import os
import time
import random

class fiction_spider():
    def __init__(self):
        self.session = requests.session()

    # Log in
    def login(self):
        urlLogin = 'https://passport.17k.com/ck/user/login'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
        }
        dataLogin = {
            'loginName': '15676752369',
            'password': '9217K..'
        }
        self.session.post(urlLogin, headers=headers, data=dataLogin)

    # Get the link of every novel on the bookshelf page
    def getShelfInfo(self, shelfUrl):
        data = self.session.get(shelfUrl)
        items = data.json()['data']
        list_books = []
        for item in items:
            list_books.append({'bookName': item['bookName'],
                               'bookId': item['bookId'],
                               'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
                               'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'
                               })
        return list_books

    # From the chapter-list page, get the link of each chapter's content page
    def getListChapters(self, list_books):
        listChapters = []
        for book in list_books:
            chaptersUrl = book['chaptersUrl']
            chapterData = self.session.get(chaptersUrl)
            chapterData.encoding = 'utf-8'
            chapterHtmlData = chapterData.text
            selector = etree.HTML(chapterHtmlData)
            aResults = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')
            pageLinks = []
            for item in aResults:
                chapterName = item.xpath('./span/text()')[0].strip()
                chapterHref = item.xpath('./@href')[0]
                pageLink = chapterName, 'https://www.17k.com' + chapterHref
                pageLinks.append(pageLink)
            listChapters.append([book['bookName'], pageLinks])
        return listChapters

    # Get the text of each chapter's content page
    def getContent(self, listChapters):
        for book in listChapters:
            print(book)
            bookName = book[0]
            for chapter in book[1]:
                chapterName = chapter[0]
                chapterUrl = chapter[1]
                # print(bookName, chapterName, chapterUrl)
                content = self.session.get(chapterUrl)
                content.encoding = 'utf-8'
                data_content = content.text
                selector_content = etree.HTML(data_content)
                p_list_content = selector_content.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')
                # Download the novel: write the data to a local file
                self.downloadContent(bookName, chapterName, p_list_content)

    # Download the novel: write the data to a local file
    def downloadContent(self, bookName, chapterName, p_list_content):
        time.sleep(random.uniform(0.3, 1.5))
        # Create the book's folder
        if not os.path.exists(bookName):
            os.mkdir(bookName)
        # Write the chapter data
        print('《{}》{}: writing chapter to local file'.format(bookName, chapterName))
        # Open the chapter file once and append every paragraph
        with open(bookName + '/' + chapterName + '.txt', 'a', encoding='utf-8') as f:
            for p in p_list_content:
                f.write(p + '\n')

    # Main logic
    def run(self):
        # Big step 0: ------------- log in -------------
        self.login()
        # Big step 1: ------------- get the bookshelf info -------------
        shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
        list_books = self.getShelfInfo(shelfUrl)
        # print(list_books)
        # Big step 2: ------------- get the chapter page links -------------
        listChapters = self.getListChapters(list_books)
        # print(listChapters)
        # Big step 3: ------------- get the chapter text and write it to local files -------------
        self.getContent(listChapters)

s1 = fiction_spider()
s1.run()
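
Chapter titles scraped from the page can contain characters that are illegal in file names on some platforms (for example `?` or `:` on Windows), which would make the `open()` call in `downloadContent` fail. A minimal sanitizing sketch; `safe_name` is a hypothetical helper, not part of the original script:

import re

def safe_name(name):
    # Hypothetical helper: replace characters that are illegal in file
    # names on common platforms with an underscore.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# e.g. open(safe_name(bookName) + '/' + safe_name(chapterName) + '.txt', 'a', encoding='utf-8')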
MySQL version: write to a database

import requests
from lxml import etree
import pymysql

class fiction_spider():
    def __init__(self, host, user, password, database, **kwargs):
        # Connect to the database server
        self.conn = pymysql.connect(host=host, user=user, password=password, database=database,
                                    cursorclass=pymysql.cursors.DictCursor, **kwargs)
        # Get a cursor
        self.cursor = self.conn.cursor()
        # Initialize the tables
        self.init_MySQL()
        self.session = requests.session()

    # Initialize the data tables
    def init_MySQL(self):
        # Create the bookList table
        sql_bookList = """
            create table if not exists bookList(
                bookId int primary key,
                bookName varchar(64),
                bookUrl varchar(64),
                chaptersUrl varchar(64)
            )
        """
        self.cursor.execute(sql_bookList)
        self.conn.commit()
        # Create the bookContent table
        sql_book_content = """
            create table if not exists bookContent(
                bookId int,
                chapterId int,
                chapterName varchar(64),
                chapterUrl varchar(64),
                chapterContent text,
                primary key (bookId, chapterId)
            )
        """
        self.cursor.execute(sql_book_content)
        self.conn.commit()

    # Log in
    def login(self):
        urlLogin = 'https://passport.17k.com/ck/user/login'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
        }
        dataLogin = {
            'loginName': '15676752369',
            'password': '9217K..'
        }
        self.session.post(urlLogin, headers=headers, data=dataLogin)

    # Get the link of every novel on the bookshelf page
    def getShelfInfo(self, shelfUrl):
        data = self.session.get(shelfUrl)
        list_books = []
        items = data.json()['data']
        for item in items:
            list_books.append({'bookName': item['bookName'],
                               'bookId': item['bookId'],
                               'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
                               'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'
                               })
            # Insert the book-list row; a parameterized query keeps quotes
            # in book names from breaking the SQL
            self.cursor.execute(
                "insert into bookList values (%s, %s, %s, %s)",
                (item['bookId'], item['bookName'],
                 'https://www.17k.com/book/{}.html'.format(item['bookId']),
                 'https://www.17k.com/list/{}.html'.format(item['bookId'])))
            self.conn.commit()
            print("Book info: {} {} written to the database".format(item['bookId'], item['bookName']))
        return list_books

    # From the chapter-list page, get the link of each chapter's content page
    def getListChapters(self, list_books):
        listChapters = []
        for book in list_books:
            chaptersUrl = book['chaptersUrl']
            chapterData = self.session.get(chaptersUrl)
            chapterData.encoding = 'utf-8'
            chapterHtmlData = chapterData.text
            selector = etree.HTML(chapterHtmlData)
            aResults = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')
            pageLinks = []
            for item in aResults:
                chapterName = item.xpath('./span/text()')[0].strip()
                chapterHref = item.xpath('./@href')[0]
                pageLink = chapterName, 'https://www.17k.com' + chapterHref
                pageLinks.append(pageLink)
            listChapters.append([book['bookName'], book['bookId'], pageLinks])
        return listChapters

    # Get the text of each chapter's content page
    def getContent(self, listChapters):
        for book in listChapters:
            bookName = book[0]
            bookId = book[1]
            chapterId = 0
            print("Chapter info read: {}\nWriting the chapters of 《{}》:".format(book, bookName))
            for chapter in book[2]:
                chapterName = chapter[0]
                chapterUrl = chapter[1]
                chapterId += 1
                # print(bookName, chapterName, chapterUrl)
                content = self.session.get(chapterUrl)
                content.encoding = 'utf-8'
                data_content = content.text
                selector_content = etree.HTML(data_content)
                p_list_content = selector_content.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')
                chapter_content = "\n".join(p_list_content)
                # Insert the chapter row; a parameterized query keeps quotes
                # in the chapter text from breaking the SQL
                self.cursor.execute(
                    "insert into bookContent values (%s, %s, %s, %s, %s)",
                    (bookId, chapterId, chapterName, chapterUrl, chapter_content))
                self.conn.commit()
                print("Chapter: {} {} written to the database".format(chapterName, chapterUrl))
            print('《{}》written to the database\n'.format(bookName))

    # Main logic
    def run(self):
        # Big step 0: ------------- initialize the database tables -------------
        # (already done in __init__; harmless to repeat thanks to IF NOT EXISTS)
        print('Step 0: initializing the database tables')
        self.init_MySQL()
        # Big step 1: ------------- log in -------------
        print('\n\nStep 1: logging in')
        self.login()
        # Big step 2: ------------- get the bookshelf info -------------
        print("\n\nStep 2: reading the books on the bookshelf...")
        shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
        list_books = self.getShelfInfo(shelfUrl)
        # print(list_books)
        # Big step 3: ------------- get the chapter page links -------------
        print("\n\nStep 3: reading the chapter details of each book...")
        listChapters = self.getListChapters(list_books)
        # Big step 4: ------------- get the chapter text and write it to the database -------------
        print("\n\nStep 4: reading the chapter content...")
        self.getContent(listChapters)
        # Big step 5: ------------- close the cursor and the connection -------------
        self.cursor.close()
        self.conn.close()

if __name__ == '__main__':
    s1 = fiction_spider(host='localhost', user='root', password='12345678', database='fiction')
    s1.run()
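
Once the run finishes, the stored rows can be read back to verify the load. A minimal check, assuming the same connection parameters as above:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='12345678',
                       database='fiction', cursorclass=pymysql.cursors.DictCursor)
with conn.cursor() as cursor:
    # List every stored chapter in (bookId, chapterId) order.
    cursor.execute("select bookId, chapterId, chapterName from bookContent order by bookId, chapterId")
    for row in cursor.fetchall():
        print(row['bookId'], row['chapterId'], row['chapterName'])
conn.close()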
Simulated login to fetch the bookshelf data

import requests

session = requests.session()

def login():
    urlLogin = 'https://passport.17k.com/ck/user/login'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    dataLogin = {
        'loginName': 'xxxxxx',
        'password': 'xxxxxx'
    }
    res = session.post(urlLogin, headers=headers, data=dataLogin)
    print('Request headers:', res.request.headers, '\n')
    print('Response headers:', res.headers, '\n')
    print('Session Cookie:', session.cookies, '\n')

def getShelf():
    res = session.get('https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919')
    res.encoding = 'utf-8'
    print('Shelf info:', res.json()['data'])

login()
getShelf()
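
To see that it really is the session cookie that authorizes the shelf request, compare it with an anonymous call to the same endpoint. A quick check; the exact payload the site returns to anonymous callers is an assumption:

import requests

# The same shelf endpoint, requested without the logged-in session; it
# should not return the shelf data (likely an error or login-required payload).
anon = requests.get('https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919')
print(anon.status_code)
print(anon.text[:200])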