ks.wangxiao.cn 试题

spider.py

import time, os, pandas, requests
from lxml import etree

# Site root; get_category() downloads this page to read the category menu.
url = 'https://ks.wangxiao.cn/'

# Headers sent with the plain GET requests in get_category(): a desktop Chrome
# User-Agent and a same-site Referer (presumably so the requests resemble
# ordinary browser traffic -- confirm against the site's behaviour).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Referer': 'https://ks.wangxiao.cn/',
}


def get_category(max_first_categories=1, max_second_categories=1, timeout=10):
    """Collect the category tree and the daily-practice quiz links.

    Downloads the module-level ``url`` page, reads the first-level categories
    from the banner list and, for each second-level category link, fetches
    its ``practice/listEveryday`` page to collect the quiz links found there.
    A 2-second pause precedes every listEveryday request.

    Args:
        max_first_categories: Number of first-level categories to process,
            counted from the top of the page; ``None`` processes all of them.
            The default of 1 keeps the earlier behaviour of stopping after
            the first category.
        max_second_categories: Same limit, applied to the second-level
            categories inside each first-level category.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        A list shaped like::

            [{'first_category_title': '工程类',
              'data': [{'second_category_title': '一级建筑师',
                        'second_category_listEveryday_url': 'https://...',
                        'quiz_list': [{'20250206': 'https://...'}, ...]}]}]

        Every ``quiz_list`` entry is a single-item dict that maps the quiz
        title to its absolute URL.
    """
    category_data = []
    response = requests.get(url, headers=headers, timeout=timeout).text

    tree = etree.HTML(response)
    category_list = tree.xpath('//div[@id="banner"]/div[@class="min-center-width"]/ul/li')
    for category in category_list[:max_first_categories]:
        first_titles = category.xpath('./p/span/text()')
        if not first_titles:
            # A list item without a title cannot be named; skip it rather
            # than abort the whole crawl with an IndexError.
            continue

        category_dic = {'first_category_title': first_titles[0], 'data': []}
        category_data.append(category_dic)

        second_category_list = category.xpath('./div/a')
        for second_category in second_category_list[:max_second_categories]:
            second_titles = second_category.xpath('./text()')
            second_hrefs = second_category.xpath('./@href')
            # The listEveryday URL is built from the link's query string, so
            # a link without text, href or '?' cannot be followed.
            if not second_titles or not second_hrefs or '?' not in second_hrefs[0]:
                continue

            second_category_listEveryday_url = (
                'https://ks.wangxiao.cn/practice/listEveryday?'
                + second_hrefs[0].split('?')[1]
            )
            second_category_dic = {
                'second_category_title': second_titles[0],
                'second_category_listEveryday_url': second_category_listEveryday_url,
                'quiz_list': [],
            }

            # Fetch the page that lists this category's practice quizzes.
            time.sleep(2)
            listEveryday_response = requests.get(
                second_category_listEveryday_url, headers=headers, timeout=timeout).text
            listEveryday_tree = etree.HTML(listEveryday_response)
            for quiz in listEveryday_tree.xpath('//div[@class="test-panel"]/div/ul'):
                quiz_titles = quiz.xpath('./li[1]/text()')
                quiz_hrefs = quiz.xpath('./li[4]/a/@href')
                if not quiz_titles or not quiz_hrefs:
                    # Rows lacking a title or a link carry no quiz to fetch.
                    continue
                second_category_dic['quiz_list'].append(
                    {quiz_titles[0]: 'https://ks.wangxiao.cn' + quiz_hrefs[0]})

            category_dic['data'].append(second_category_dic)
    return category_data


def get_quiz_data(second_category_page, cookie=None, timeout=10):
    """Fetch the questions of one quiz from the ``listQuestions`` endpoint.

    The ``practiceType``, ``sign``, ``subsign`` and ``day`` query parameters
    of the quiz page URL are sent as a JSON body in a POST request, after a
    2-second pause.

    Args:
        second_category_page: Quiz page URL, e.g.
            ``https://ks.wangxiao.cn/practice/getQuestion?practiceType=1&sign=jzs1&subsign=...&day=20250206``.
            It is also sent as the ``Referer`` header.
        cookie: Value for the ``Cookie`` header. ``None`` uses the session
            cookie embedded below.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the URL lacks one of the four required parameters.
    """
    # Imported locally so this function does not depend on the file header.
    from urllib.parse import parse_qs, urlsplit

    quiz_url = 'https://ks.wangxiao.cn/practice/listQuestions'

    # NOTE(review): this is a captured login session (user name, token and
    # password cookies) committed to source. It will expire and should not
    # live in the repository; pass `cookie=` with a fresh value instead.
    default_cookie = 'mantis6894=a8629d4c6a944276a6a377bf6927f874@6894; UserCookieName=pc_867224853; OldUsername2=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; OldUsername=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; UserCookieName_=pc_867224853; OldUsername2_=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; OldUsername_=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; OldPassword=Zjaf4MyWoxw%3D; OldPassword_=Zjaf4MyWoxw%3D; pc_867224853_exam=fangchan; userInfo=%7B%22userName%22%3A%22pc_867224853%22%2C%22token%22%3A%226554bb42-6449-4776-b3b7-a202b1de6b78%22%2C%22headImg%22%3Anull%2C%22nickName%22%3A%22156****2369%22%2C%22sign%22%3A%22fangchan%22%2C%22isBindingMobile%22%3A%221%22%2C%22isSubPa%22%3A%220%22%2C%22userNameCookies%22%3A%22N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D%22%2C%22passwordCookies%22%3A%22Zjaf4MyWoxw%3D%22%7D; token=6554bb42-6449-4776-b3b7-a202b1de6b78; tfstk=gOyxK1jTygKY2KMhnt1ojBqjE4SlKRE2errBIV0D1zU8lll6osrMWha0Wqc1ul4-qyqwIV5qIdhsPzZDnV2TadEUb-jqIik40Ak1-wX36orqQIXavbyvPUi0VcOXG_D5vgSImwXhKo1j0Aq5--b-51gqfAij1AZ52cisCmG_54GSADYjCAa12YitvIOsCjtSVcgS5AM_5us-jUAPW4tjSdFdgRceuHiJCdw-Db1uc41aP8iYn2EbHd96AmhxRogR8vMRBfUxM8Xv8mF7vzoY79JqhfFbOxwOJtH7Y74j2P_p64NL3Rh3HwdEl-uaJxNRPKHseDEtt7j694V_48h8H6xZ4-q_3XDCLEggZketXR7NeyEbkWGQhEIyXJ2Klg8nJcAf2gdw_jiy2qLJE33oUdm--iB9_ClP403h2edw_jir22juZCRZag5..; acw_tc=76700a2017388212223958982e8e5734a8510d561159f6b965fd7b4b13; safedog-flow-item='

    headers2 = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Referer': second_category_page,
        'Cookie': default_cookie if cookie is None else cookie,
    }

    # Look parameters up by name: splitting on '&' by position would send the
    # wrong values as soon as the parameter order in the URL changes.
    query = parse_qs(urlsplit(second_category_page).query, keep_blank_values=True)
    try:
        payload = {
            name: query[name][0]
            for name in ('practiceType', 'sign', 'subsign', 'day')
        }
    except KeyError as err:
        raise ValueError(
            f'quiz URL is missing query parameter {err}: {second_category_page}'
        ) from err

    time.sleep(2)
    quiz_response = requests.post(
        quiz_url, headers=headers2, json=payload, timeout=timeout).json()
    day = payload['day']
    print(f'{day} 爬取···')

    return quiz_response


def data_process(quiz_response):
    """Flatten a ``listQuestions`` response into a list of question records.

    Args:
        quiz_response: Decoded JSON response. The questions are read from
            ``quiz_response['Data'][0]['questions']``; each question needs a
            ``'content'`` value and an ``'options'`` list whose items carry
            ``'content'`` and ``'isRight'``.

    Returns:
        One dict per question, in response order::

            {'question': <content>,
             'answers': ['A. ...', 'B. ...', ...],
             'isRight': [<isRight of option A>, <isRight of option B>, ...]}

    Raises:
        KeyError, IndexError: If the response lacks the expected structure.
    """
    quiz_questions_list = quiz_response['Data'][0]['questions']
    quiz_items = []

    for question in quiz_questions_list:
        quiz_item_dic = {'question': question['content']}

        quiz_answers = []
        quiz_is_right = []
        for index, answer in enumerate(question['options']):
            # Derive the letter from the position instead of a fixed A-D
            # table, so a question with a fifth option gets 'E' rather than
            # raising KeyError.
            label = chr(ord('A') + index)
            quiz_answers.append(label + '. ' + str(answer['content']))
            quiz_is_right.append(answer['isRight'])

        quiz_item_dic['answers'] = quiz_answers
        quiz_item_dic['isRight'] = quiz_is_right
        quiz_items.append(quiz_item_dic)

    return quiz_items


# Driver: crawl the category tree, download every listed quiz and write each
# quiz to its own sheet of a per-category Excel workbook.

# Fetch the category tree together with the quiz URLs of each sub-category.
category_data = get_category()
print(category_data)

for category in category_data:
    for second_category in category['data']:
        first_title = category['first_category_title']
        second_title = second_category['second_category_title']

        # One workbook per (first-level, second-level) category pair.
        file_name = first_title + '-' + second_title + '.xlsx'
        sheet_name = ''

        for quiz_url_dic in second_category['quiz_list']:
            # Each entry is a single-item dict {quiz title: quiz page URL};
            # the title doubles as the sheet name.
            sheet_name, quiz_page_url = next(iter(quiz_url_dic.items()))

            # Download, then flatten into
            # [{'question': ..., 'answers': [...], 'isRight': [...]}, ...].
            quiz_response = get_quiz_data(quiz_page_url)
            quiz_items = data_process(quiz_response)

            # Always emit columns A-D, and widen the table when a question
            # has more options, so no answer is dropped and a question with
            # fewer than four options leaves blank cells instead of raising
            # IndexError.
            option_count = max([4] + [len(quiz_item['answers']) for quiz_item in quiz_items])
            answer_columns = ['Answers-' + chr(ord('A') + i) for i in range(option_count)]

            quiz_data = []
            for quiz_item in quiz_items:
                row = {'Question': quiz_item['question'], 'isRight': quiz_item['isRight']}
                row.update(zip(answer_columns, quiz_item['answers']))
                quiz_data.append(row)
                print(quiz_item)

            # Column order: Question, Answers-A..., isRight.
            table = pandas.DataFrame(
                columns=['Question'] + answer_columns + ['isRight'], data=quiz_data)

            # Append a new sheet when the workbook already exists, otherwise
            # create the workbook; if_sheet_exists is left unset in write mode.
            if os.path.exists(file_name):
                mode = 'a'
                if_sheet_exists = 'new'
            else:
                mode = 'w'
                if_sheet_exists = None

            with pandas.ExcelWriter(file_name, engine='openpyxl', mode=mode, if_sheet_exists=if_sheet_exists) as writer:
                table.to_excel(writer, sheet_name=sheet_name, index=False)

            # Built from plain names: reusing the enclosing quote character
            # inside an f-string is a SyntaxError before Python 3.12.
            print(f'{first_title} - {second_title} : {sheet_name} - 写入数据完毕')

        print('写入本地xlsx完毕!')

Last updated