17K Novels

17k_spider.py

Function version that writes to local files

import requests
from lxml import etree
import os
import time
import random

session = requests.session()


# Log in
def login():
    urlLogin = 'https://passport.17k.com/ck/user/login'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }

    dataLogin = {
        'loginName': '15676752369',
        'password': '9217K..'
    }

    session.post(urlLogin, headers=headers, data=dataLogin)


# Get the link for each novel on the bookshelf page
def getShelfInfo(shelfUrl):
    data = session.get(shelfUrl)
    items = data.json()['data']
    
    list_books = []
    for item in items:
        list_books.append({'bookName': item['bookName'],
                           'bookId': item['bookId'],
                           'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
                           'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'
                           })
    return list_books


# Get the chapter links from each book's chapter-list page
def getListChapters(list_books):
    listChapters = []
    for book in list_books:
        chaptersUrl = book['chaptersUrl']

        chapterData = session.get(chaptersUrl)
        chapterData.encoding = 'utf-8'
        chapterHtmlData = chapterData.text

        selector = etree.HTML(chapterHtmlData)
        # "正文" (main text) is the heading on the chapter-list page and is matched literally
        aResults = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')

        pageLinks = []
        for item in aResults:
            chapterName = item.xpath('./span/text()')[0].strip()
            chapterHref = item.xpath('./@href')[0]
            pageLink = (chapterName, 'https://www.17k.com' + chapterHref)
            pageLinks.append(pageLink)
        listChapters.append([book['bookName'], pageLinks])
    return listChapters


# Get the text content of each chapter page
def getContent(listChapters):
    for book in listChapters:
        print(book)
        bookName = book[0]
        for chapter in book[1]:
            chapterName = chapter[0]
            chapterUrl = chapter[1]
            # print(bookName, chapterName, chapterUrl)
            content = session.get(chapterUrl)
            content.encoding = 'utf-8'
            data_content = content.text

            selector_content = etree.HTML(data_content)
            p_list_content = selector_content.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')

            # Download the chapter and write the data to a local file
            downloadContent(bookName, chapterName, p_list_content)


# Download a chapter and write the data to a local file
def downloadContent(bookName, chapterName, p_list_content):
    # Random delay so requests are not fired too quickly
    time.sleep(random.uniform(0.3, 1.5))
    # Create one folder per book
    if not os.path.exists(bookName):
        os.mkdir(bookName)
    # Write the chapter, one paragraph per line
    print('"{}" {}: writing chapter to local file'.format(bookName, chapterName))
    with open(bookName + '/' + chapterName + '.txt', 'w', encoding='utf-8') as f:
        for p in p_list_content:
            f.write(p + '\n')


# Big step 0: ------------- Log in -------------
login()

# Big step 1: ------------- Fetch bookshelf info -------------
shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
list_books = getShelfInfo(shelfUrl)
# print(list_books)


# Big step 2: ------------- Fetch chapter page links -------------
listChapters = getListChapters(list_books)
# print(listChapters)


# Big step 3: ------------- Fetch chapter text and write it to local files -------------
getContent(listChapters)

Class version that writes to local files
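The class version itself is not included on this page. The sketch below is one possible way to fold the functions above into a single class, reusing the same endpoints, XPath expressions, and account details; the class and method names (Spider17k, get_shelf, get_chapters, save_chapter, run) are illustrative choices, not taken from the original file.

import os
import random
import time

import requests
from lxml import etree


class Spider17k:
    """Class rewrite of the function version above (illustrative sketch)."""

    def __init__(self, login_name, password):
        self.session = requests.session()
        self.login_name = login_name
        self.password = password
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
        }

    def login(self):
        # Same login endpoint and form fields as the function version
        url = 'https://passport.17k.com/ck/user/login'
        data = {'loginName': self.login_name, 'password': self.password}
        self.session.post(url, headers=self.headers, data=data)

    def get_shelf(self, shelf_url):
        # The shelf endpoint returns JSON; keep only what the later steps need
        items = self.session.get(shelf_url).json()['data']
        return [{'bookName': item['bookName'],
                 'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'}
                for item in items]

    def get_chapters(self, book):
        # Chapter-list page: collect (chapterName, chapterUrl) pairs under the "正文" section
        resp = self.session.get(book['chaptersUrl'])
        resp.encoding = 'utf-8'
        selector = etree.HTML(resp.text)
        links = []
        for a in selector.xpath('//span[contains(text(), "正文")]/../../dd/a'):
            links.append((a.xpath('./span/text()')[0].strip(),
                          'https://www.17k.com' + a.xpath('./@href')[0]))
        return links

    def save_chapter(self, book_name, chapter_name, chapter_url):
        # Fetch one chapter and write it to <book_name>/<chapter_name>.txt
        time.sleep(random.uniform(0.3, 1.5))
        resp = self.session.get(chapter_url)
        resp.encoding = 'utf-8'
        selector = etree.HTML(resp.text)
        paragraphs = selector.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')
        os.makedirs(book_name, exist_ok=True)
        with open(os.path.join(book_name, chapter_name + '.txt'), 'w', encoding='utf-8') as f:
            f.write('\n'.join(paragraphs))

    def run(self, shelf_url):
        self.login()
        for book in self.get_shelf(shelf_url):
            for chapter_name, chapter_url in self.get_chapters(book):
                print(book['bookName'], chapter_name)
                self.save_chapter(book['bookName'], chapter_name, chapter_url)


if __name__ == '__main__':
    spider = Spider17k('15676752369', '9217K..')
    spider.run('https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919')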

Class version that writes to MySQL
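The MySQL version is also not included here. The sketch below covers only the storage side, assuming a pymysql connection and a novel table with book_name, chapter_name, and content columns; the connection parameters, table name, and schema are all assumptions, not from the original.

import pymysql


class MysqlWriter:
    """Stores scraped chapters in MySQL instead of local .txt files (illustrative sketch)."""

    def __init__(self, host='localhost', user='root', password='root', database='spider'):
        # Connection parameters are placeholders; point them at your own MySQL instance
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset='utf8mb4')

    def save_chapter(self, book_name, chapter_name, p_list_content):
        # Assumed table (not from the original):
        # CREATE TABLE novel (
        #     id INT AUTO_INCREMENT PRIMARY KEY,
        #     book_name VARCHAR(255),
        #     chapter_name VARCHAR(255),
        #     content TEXT
        # );
        sql = 'INSERT INTO novel (book_name, chapter_name, content) VALUES (%s, %s, %s)'
        with self.conn.cursor() as cursor:
            cursor.execute(sql, (book_name, chapter_name, '\n'.join(p_list_content)))
        self.conn.commit()

    def close(self):
        self.conn.close()

In getContent(), the downloadContent(bookName, chapterName, p_list_content) call would then become writer.save_chapter(bookName, chapterName, p_list_content), with one MysqlWriter created before the loop and closed afterwards.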

Simulated login to fetch bookshelf data
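No code is captured for this part either. Below is a minimal sketch of the simulated login plus the shelf request, reusing the endpoints from the script above; the cookie-based alternative is an assumption and the cookie name shown is a placeholder.

import requests

session = requests.session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

# Option 1: post the login form so the session picks up the auth cookies
session.post('https://passport.17k.com/ck/user/login',
             headers=headers,
             data={'loginName': '15676752369', 'password': '9217K..'})

# Option 2 (assumption, not from the original): reuse cookies copied from a
# logged-in browser session instead of posting the form
# session.cookies.update({'accessToken': '<value copied from browser dev tools>'})

# With a logged-in session the shelf endpoint returns JSON, one entry per book
shelf_url = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
for item in session.get(shelf_url, headers=headers).json()['data']:
    print(item['bookId'], item['bookName'])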
