17K Novels

17k_spider.py

Function version: write to local files

import requests
from lxml import etree
import os
import time
import random

session = requests.session()


# Log in
def login():
    urlLogin = 'https://passport.17k.com/ck/user/login'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }

    dataLogin = {
        'loginName': 'xxxxxx',  # fill in your own account
        'password': 'xxxxxx'    # fill in your own password
    }

    session.post(urlLogin, headers=headers, data=dataLogin)


# Get the link for every novel on the bookshelf page
def getShelfInfo(shelfUrl):
    data = session.get(shelfUrl)
    items = data.json()['data']
    
    list_books = []
    for item in items:
        list_books.append({'bookName': item['bookName'],
                           'bookId': item['bookId'],
                           'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
                           'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'
                           })
    return list_books


# Get each chapter's detail-page link from the chapter list page (the "正文" span marks the main-body section)
def getListChapters(list_books):
    listChapters = []
    for book in list_books:
        chaptersUrl = book['chaptersUrl']

        chapterData = session.get(chaptersUrl)
        chapterData.encoding = 'utf-8'
        chapterHtmlData = chapterData.text

        selector = etree.HTML(chapterHtmlData)
        aResults = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')

        pageLinks = []
        for item in aResults:
            chapterName = item.xpath('./span/text()')[0].strip()
            chapterHref = item.xpath('./@href')[0]
            pageLink = chapterName, 'https://www.17k.com' + chapterHref
            pageLinks.append(pageLink)
        listChapters.append([book['bookName'], pageLinks])
    return listChapters


# Get the text content from each chapter page
def getContent(listChapters):
    for book in listChapters:
        print(book)
        bookName = book[0]
        for chapter in book[1]:
            chapterName = chapter[0]
            chapterUrl = chapter[1]
            # print(bookName, chapterName, chapterUrl)
            content = session.get(chapterUrl)
            content.encoding = 'utf-8'
            data_content = content.text

            selector_content = etree.HTML(data_content)
            p_list_content = selector_content.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')

            # Download the novel and write the data to a local file
            downloadContent(bookName, chapterName, p_list_content)


# Download the novel and write the data to a local file
def downloadContent(bookName, chapterName, p_list_content):
    time.sleep(random.uniform(0.3, 1.5))
    # Create the book folder if it does not exist yet
    if not os.path.exists(bookName):
        os.mkdir(bookName)
    # Write the chapter data (append mode, UTF-8 so Chinese text survives on any platform)
    print('《{}》{}: writing chapter to local file'.format(bookName, chapterName))
    with open(bookName + '/' + chapterName + '.txt', 'a', encoding='utf-8') as f:
        for p in p_list_content:
            f.write(p + '\n')


# Big step 0: ------------- Log in -------------
login()

# Big step 1: ------------- Get bookshelf info -------------
shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
list_books = getShelfInfo(shelfUrl)
# print(list_books)


# Big step 2: ------------- Get chapter detail-page links -------------
listChapters = getListChapters(list_books)
# print(listChapters)


# Big step 3: ------------- Get chapter text content and write to local files -------------
getContent(listChapters)
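
The script above assumes every request succeeds; a single transient network error kills the whole run. Below is a minimal sketch of a fetch helper with status checking and simple exponential backoff. The fetch name, timeout, and retry policy are my own assumptions, not part of the original script.

import time
import requests

def fetch(session, url, retries=3):
    # Retry a few times before giving up on the page
    for attempt in range(retries):
        try:
            res = session.get(url, timeout=10)
            res.raise_for_status()        # raise on HTTP 4xx/5xx
            res.encoding = 'utf-8'
            return res.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)      # back off: 1s, 2s, 4s, ...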

Class version: write to local files

import requests
from lxml import etree
import os
import time
import random


class fiction_spider:
    def __init__(self):
        self.session = requests.session()

    # Log in
    def login(self):
        urlLogin = 'https://passport.17k.com/ck/user/login'

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
        }

        dataLogin = {
            'loginName': 'xxxxxx',  # fill in your own account
            'password': 'xxxxxx'    # fill in your own password
        }

        self.session.post(urlLogin, headers=headers, data=dataLogin)


    # Get the link for every novel on the bookshelf page
    def getShelfInfo(self, shelfUrl):
        data = self.session.get(shelfUrl)
        items = data.json()['data']
        
        list_books = []
        for item in items:
            list_books.append({'bookName': item['bookName'],
                               'bookId': item['bookId'],
                               'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
                               'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'
                               })
        return list_books


    # Get each chapter's detail-page link from the chapter list page
    def getListChapters(self, list_books):
        listChapters = []
        for book in list_books:
            chaptersUrl = book['chaptersUrl']

            chapterData = self.session.get(chaptersUrl)
            chapterData.encoding = 'utf-8'
            chapterHtmlData = chapterData.text

            selector = etree.HTML(chapterHtmlData)
            aResults = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')

            pageLinks = []
            for item in aResults:
                chapterName = item.xpath('./span/text()')[0].strip()
                chapterHref = item.xpath('./@href')[0]
                pageLink = chapterName, 'https://www.17k.com' + chapterHref
                pageLinks.append(pageLink)
            listChapters.append([book['bookName'], pageLinks])
        return listChapters


    # Get the text content from each chapter page
    def getContent(self, listChapters):
        for book in listChapters:
            print(book)
            bookName = book[0]
            for chapter in book[1]:
                chapterName = chapter[0]
                chapterUrl = chapter[1]
                # print(bookName, chapterName, chapterUrl)
                content = self.session.get(chapterUrl)
                content.encoding = 'utf-8'
                data_content = content.text

                selector_content = etree.HTML(data_content)
                p_list_content = selector_content.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')

                # Download the novel and write the data to a local file
                self.downloadContent(bookName, chapterName, p_list_content)


    # Download the novel and write the data to a local file
    def downloadContent(self, bookName, chapterName, p_list_content):
        time.sleep(random.uniform(0.3, 1.5))
        # Create the book folder if it does not exist yet
        if not os.path.exists(bookName):
            os.mkdir(bookName)
        # Write the chapter data (append mode, UTF-8)
        print('《{}》{}: writing chapter to local file'.format(bookName, chapterName))
        with open(bookName + '/' + chapterName + '.txt', 'a', encoding='utf-8') as f:
            for p in p_list_content:
                f.write(p + '\n')


    # Main logic
    def run(self):
        # Big step 0: ------------- Log in -------------
        self.login()

        # Big step 1: ------------- Get bookshelf info -------------
        shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
        list_books = self.getShelfInfo(shelfUrl)
        # print(list_books)


        # Big step 2: ------------- Get chapter detail-page links -------------
        listChapters = self.getListChapters(list_books)
        # print(listChapters)


        # Big step 3: ------------- Get chapter text content and write to local files -------------
        self.getContent(listChapters)

if __name__ == '__main__':
    s1 = fiction_spider()
    s1.run()
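
One caveat in both file-writing versions: chapter names are scraped straight from the page and used as file names, so a title containing / or : would break the open() call on some systems. Below is a minimal sketch of a sanitizer; the safe_filename helper is an assumption, not in the original code.

import re

def safe_filename(name):
    # Replace characters that are illegal in common file systems
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

Calling safe_filename(chapterName) before building the .txt path would make downloadContent robust to unusual chapter titles.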

Class version: write to MySQL

import requests
from lxml import etree
import pymysql

class fiction_spider:
    def __init__(self, host, user, password, database, **kwargs):
        # Connect to the database server
        self.conn = pymysql.connect(host=host, user=user, password=password, database=database,
                             cursorclass=pymysql.cursors.DictCursor, **kwargs)
        # Get a cursor (table initialization happens in run(), Step 0)
        self.cursor = self.conn.cursor()

        self.session = requests.session()


    # Initialize the data tables
    def init_MySQL(self):
        # Create the bookList table
        sql_bookList = """
        create table if not exists bookList(
            bookId int primary key,
            bookName varchar(64),
            bookUrl varchar(64),
            chaptersUrl varchar(64)
        )
        """
        self.cursor.execute(sql_bookList)
        self.conn.commit()

        # Create the bookContent table
        sql_book_content = """
        create table if not exists bookContent(
            bookId int,
            chapterId int,
            primary key (bookId, chapterId),
            chapterName varchar(64),
            chapterUrl varchar(64),
            chapterContent text
        )
        """
        self.cursor.execute(sql_book_content)
        self.conn.commit()


    # Log in
    def login(self):
        urlLogin = 'https://passport.17k.com/ck/user/login'

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
        }

        dataLogin = {
            'loginName': 'xxxxxx',  # fill in your own account
            'password': 'xxxxxx'    # fill in your own password
        }

        self.session.post(urlLogin, headers=headers, data=dataLogin)


    # Get the link for every novel on the bookshelf page
    def getShelfInfo(self, shelfUrl):
        data = self.session.get(shelfUrl)
        list_books = []
        items = data.json()['data']
        for item in items:
            list_books.append({'bookName': item['bookName'],
                               'bookId': item['bookId'],
                               'bookUrl': 'https://www.17k.com/book/' + str(item['bookId']) + '.html',
                               'chaptersUrl': 'https://www.17k.com/list/' + str(item['bookId']) + '.html'
                               })
            # Build the bookList insert as a parameterized query, so quotes in
            # a book name cannot break the SQL
            item_list_book_sql = "insert into bookList values (%s, %s, %s, %s)"
            # Write to the database
            self.cursor.execute(item_list_book_sql,
                                (item['bookId'], item['bookName'],
                                 'https://www.17k.com/book/{}.html'.format(item['bookId']),
                                 'https://www.17k.com/list/{}.html'.format(item['bookId'])))
            self.conn.commit()
            print("Book info: {} {} written to database".format(item['bookId'], item['bookName']))

        return list_books


    # Get each chapter's detail-page link from the chapter list page
    def getListChapters(self, list_books):
        listChapters = []
        for book in list_books:
            chaptersUrl = book['chaptersUrl']
            chapterData = self.session.get(chaptersUrl)
            chapterData.encoding = 'utf-8'
            chapterHtmlData = chapterData.text

            selector = etree.HTML(chapterHtmlData)
            aResults = selector.xpath('//span[contains(text(), "正文")]/../../dd/a')

            pageLinks = []
            for item in aResults:
                chapterName = item.xpath('./span/text()')[0].strip()
                chapterHref = item.xpath('./@href')[0]
                pageLink = chapterName, 'https://www.17k.com' + chapterHref
                pageLinks.append(pageLink)
            listChapters.append([book['bookName'], book['bookId'], pageLinks])

        return listChapters


    # Get the text content from each chapter page
    def getContent(self, listChapters):
        for book in listChapters:
            bookName = book[0]
            bookId = book[1]
            chapterId = 0

            print("Chapter info read: {}\nStart writing chapter content for 《{}》:".format(book, bookName))
            for chapter in book[2]:
                chapterName = chapter[0]
                chapterUrl = chapter[1]
                chapterId += 1
                # print(bookName, chapterName, chapterUrl)
                content = self.session.get(chapterUrl)
                content.encoding = 'utf-8'
                data_content = content.text

                selector_content = etree.HTML(data_content)
                p_list_content = selector_content.xpath('//*[@id="readArea"]/div[1]/div[2]//p[position()<last()]/text()')
                chapter_content = "\n".join(p_list_content)

                # Build the content insert as a parameterized query; chapter text
                # and names routinely contain quotes, so never inline them into SQL
                ChapterContentSQL = "insert into bookContent values (%s, %s, %s, %s, %s)"
                # Write to the database
                self.cursor.execute(ChapterContentSQL,
                                    (bookId, chapterId, chapterName, chapterUrl, chapter_content))
                self.conn.commit()
                print("Chapter content: {} {} written to database".format(chapterName, chapterUrl))
            print('《{}》 written to database\n'.format(bookName))


    # Main logic
    def run(self):
        # Big step 0: ------------- Initialize the database tables -------------
        print('Step0: initializing database tables')
        self.init_MySQL()

        # Big step 1: ------------- Log in -------------
        print('\n\nStep1: logging in')
        self.login()

        # Big step 2: ------------- Get bookshelf info -------------
        print("\n\nStep2: reading book info from the bookshelf...")
        shelfUrl = 'https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919'
        list_books = self.getShelfInfo(shelfUrl)
        # print(list_books)

        # Big step 3: ------------- Get chapter detail-page links -------------
        print("\n\nStep3: reading detailed chapter info for each book...")
        listChapters = self.getListChapters(list_books)

        # Big step 4: ------------- Get chapter text content and write to the database -------------
        print("\n\nStep4: reading chapter content...")
        self.getContent(listChapters)

        # Big step 5: ------------- Close the cursor and connection -------------
        self.cursor.close()
        self.conn.close()


if __name__ == '__main__':
    s1 = fiction_spider(host='localhost', user='root', password='12345678', database='fiction')
    s1.run()
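
Note that re-running this version fails once a bookId or (bookId, chapterId) already exists, because a plain insert rejects duplicate primary keys. Below is a minimal sketch of an idempotent alternative using MySQL's on duplicate key update against the same bookContent schema; cursor and conn are assumed to be the connection objects from the class above.

upsert_sql = """
insert into bookContent (bookId, chapterId, chapterName, chapterUrl, chapterContent)
values (%s, %s, %s, %s, %s)
on duplicate key update chapterContent = values(chapterContent)
"""
# Re-running now overwrites the stored chapter instead of raising a duplicate-key error
cursor.execute(upsert_sql, (bookId, chapterId, chapterName, chapterUrl, chapter_content))
conn.commit()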

Simulated login to fetch bookshelf data

import requests

session = requests.session()

def login():
    urlLogin = 'https://passport.17k.com/ck/user/login'
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }

    dataLogin = {
        'loginName': 'xxxxxx',
        'password': 'xxxxxx'
    }

    res = session.post(urlLogin, headers=headers, data=dataLogin)
    print('Request headers:', res.request.headers, '\n')
    print('Response headers:', res.headers, '\n')
    print('Session Cookie:', session.cookies, '\n')

def getShelf():
    res = session.get('https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919')
    res.encoding = 'utf-8'
    print('Shelf info:', res.json()['data'])


login()
getShelf()
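
To avoid logging in on every run, the session cookies can be saved after one successful login and restored later. Below is a minimal sketch using requests' cookie-jar helpers, assuming the session object from the script above; the cookies.json file name is arbitrary.

import json

# After login(): save the session cookies to disk
with open('cookies.json', 'w') as f:
    json.dump(requests.utils.dict_from_cookiejar(session.cookies), f)

# In a later run: restore them instead of logging in again
with open('cookies.json') as f:
    session.cookies = requests.utils.cookiejar_from_dict(json.load(f))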
