ks.wangxiao.cn 试题
spider.py
import os
import time
from urllib.parse import parse_qs, urlsplit

import pandas
import requests
from lxml import etree
# Landing page of the exam site; get_category() scrapes the category menu from it.
url = 'https://ks.wangxiao.cn/'

# Browser-like headers sent with the plain GET requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36'
    ),
    'Referer': url,
}
def get_category(max_categories=1, max_second_categories=1, timeout=10):
    """Scrape the category menu and the daily-practice links beneath it.

    The home page (module-level ``url``) lists first-level categories, each
    holding links to second-level categories.  For every second-level
    category visited, its ``listEveryday`` page is fetched and each row of
    the practice table becomes one ``{title: absolute_url}`` entry.

    Args:
        max_categories: How many first-level categories to visit
            (non-negative).  The default of 1 keeps the historical
            behaviour, where the loop stopped after the first category;
            pass ``None`` to visit all of them.
        max_second_categories: Same limit, applied to the second-level
            categories inside each first-level category.
        timeout: Seconds to wait for each HTTP response, so a stalled
            connection cannot hang the crawl forever.

    Returns:
        A list shaped like::

            [{'first_category_title': '...',
              'data': [{'second_category_title': '...',
                        'second_category_listEveryday_url': 'https://...',
                        'quiz_list': [{'20250206': 'https://...'}, ...]}]}]
    """
    category_data = []
    response = requests.get(url, headers=headers, timeout=timeout).text
    tree = etree.HTML(response)
    category_list = tree.xpath('//div[@id="banner"]/div[@class="min-center-width"]/ul/li')
    # Slicing with None keeps every element, so None means "no limit".
    for category in category_list[:max_categories]:
        first_titles = category.xpath('./p/span/text()')
        if not first_titles:
            # A menu entry without a caption cannot be labelled; skip it
            # instead of aborting the whole crawl with an IndexError.
            continue
        category_dic = {'first_category_title': first_titles[0], 'data': []}
        category_data.append(category_dic)
        for second_category in category.xpath('./div/a')[:max_second_categories]:
            second_titles = second_category.xpath('./text()')
            hrefs = second_category.xpath('./@href')
            if not second_titles or not hrefs:
                continue
            # Reuse the link's query string on the listEveryday endpoint.
            _, has_query, query = hrefs[0].partition('?')
            if not has_query:
                # Without a query string there is nothing to forward.
                continue
            second_category_listEveryday_url = 'https://ks.wangxiao.cn/practice/listEveryday?' + query
            second_category_dic = {
                'second_category_title': second_titles[0],
                'second_category_listEveryday_url': second_category_listEveryday_url,
                'quiz_list': [],
            }
            # Fetch the practice list page; pause first (presumably to avoid
            # hammering the site).
            time.sleep(2)
            listEveryday_response = requests.get(
                second_category_listEveryday_url, headers=headers, timeout=timeout).text
            listEveryday_tree = etree.HTML(listEveryday_response)
            for quiz in listEveryday_tree.xpath('//div[@class="test-panel"]/div/ul'):
                quiz_titles = quiz.xpath('./li[1]/text()')
                quiz_urls = quiz.xpath('./li[4]/a/@href')
                if not quiz_titles or not quiz_urls:
                    # Rows lacking a title or a practice link are skipped
                    # rather than raising IndexError.
                    continue
                second_category_dic['quiz_list'].append(
                    {quiz_titles[0]: 'https://ks.wangxiao.cn' + quiz_urls[0]})
            category_dic['data'].append(second_category_dic)
    return category_data
def _quiz_payload_from_url(quiz_page_url):
    """Extract the four listQuestions parameters from a getQuestion URL."""
    query = parse_qs(urlsplit(quiz_page_url).query, keep_blank_values=True)
    payload = {}
    for name in ('practiceType', 'sign', 'subsign', 'day'):
        values = query.get(name)
        if not values:
            raise ValueError(f'missing query parameter {name!r} in {quiz_page_url!r}')
        payload[name] = values[-1]
    return payload


def get_quiz_data(second_category_page, timeout=10):
    """Fetch the question list behind one daily-practice page.

    Args:
        second_category_page: URL of a ``getQuestion`` page whose query
            string carries ``practiceType``, ``sign``, ``subsign`` and
            ``day``.  The parameters are looked up by name, so their order
            in the URL does not matter.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The decoded JSON body returned by the ``listQuestions`` endpoint.

    Raises:
        ValueError: If one of the four query parameters is missing.
        requests.HTTPError: If the server answers with an error status.
    """
    quiz_url = 'https://ks.wangxiao.cn/practice/listQuestions'
    # SECURITY: the fallback literal below embeds a logged-in session (user
    # name, token and encoded credential cookies).  It should be rotated and
    # supplied through the WANGXIAO_COOKIE environment variable; the literal
    # is kept only so existing runs keep working unchanged.
    cookie = os.environ.get(
        'WANGXIAO_COOKIE',
        'mantis6894=a8629d4c6a944276a6a377bf6927f874@6894; UserCookieName=pc_867224853; OldUsername2=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; OldUsername=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; UserCookieName_=pc_867224853; OldUsername2_=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; OldUsername_=N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D; OldPassword=Zjaf4MyWoxw%3D; OldPassword_=Zjaf4MyWoxw%3D; pc_867224853_exam=fangchan; userInfo=%7B%22userName%22%3A%22pc_867224853%22%2C%22token%22%3A%226554bb42-6449-4776-b3b7-a202b1de6b78%22%2C%22headImg%22%3Anull%2C%22nickName%22%3A%22156****2369%22%2C%22sign%22%3A%22fangchan%22%2C%22isBindingMobile%22%3A%221%22%2C%22isSubPa%22%3A%220%22%2C%22userNameCookies%22%3A%22N1cDvlBveQZUi62%2Ff0mlNQ%3D%3D%22%2C%22passwordCookies%22%3A%22Zjaf4MyWoxw%3D%22%7D; token=6554bb42-6449-4776-b3b7-a202b1de6b78; tfstk=gOyxK1jTygKY2KMhnt1ojBqjE4SlKRE2errBIV0D1zU8lll6osrMWha0Wqc1ul4-qyqwIV5qIdhsPzZDnV2TadEUb-jqIik40Ak1-wX36orqQIXavbyvPUi0VcOXG_D5vgSImwXhKo1j0Aq5--b-51gqfAij1AZ52cisCmG_54GSADYjCAa12YitvIOsCjtSVcgS5AM_5us-jUAPW4tjSdFdgRceuHiJCdw-Db1uc41aP8iYn2EbHd96AmhxRogR8vMRBfUxM8Xv8mF7vzoY79JqhfFbOxwOJtH7Y74j2P_p64NL3Rh3HwdEl-uaJxNRPKHseDEtt7j694V_48h8H6xZ4-q_3XDCLEggZketXR7NeyEbkWGQhEIyXJ2Klg8nJcAf2gdw_jiy2qLJE33oUdm--iB9_ClP403h2edw_jir22juZCRZag5..; acw_tc=76700a2017388212223958982e8e5734a8510d561159f6b965fd7b4b13; safedog-flow-item=',
    )
    headers2 = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        # The endpoint is called with the practice page itself as referer.
        'Referer': second_category_page,
        'Cookie': cookie,
    }
    payload = _quiz_payload_from_url(second_category_page)
    day = payload['day']
    # Pause before each request (presumably to avoid hammering the site).
    time.sleep(2)
    response = requests.post(quiz_url, headers=headers2, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    quiz_response = response.json()
    print(f'{day} 爬取···')
    return quiz_response
def data_process(quiz_response):
    """Flatten the listQuestions JSON into one dict per question.

    Args:
        quiz_response: Decoded JSON from the listQuestions endpoint.  Only
            ``quiz_response['Data'][0]['questions']`` is read; each question
            needs ``content`` and ``options``, and each option needs
            ``content`` and ``isRight``.

    Returns:
        A list of dicts such as
        ``{'question': '...', 'answers': ['A. ...', 'B. ...'],
        'isRight': [0, 1]}`` where ``answers`` and ``isRight`` are parallel
        lists in option order.
    """
    quiz_items = []
    for question in quiz_response['Data'][0]['questions']:
        options = question['options']
        quiz_items.append({
            'question': question['content'],
            # Label options A, B, C, ... by position.  Computing the letter
            # (instead of looking it up in a fixed A-D table) keeps questions
            # with five or more options from raising KeyError.
            'answers': [
                chr(ord('A') + index) + '. ' + str(option['content'])
                for index, option in enumerate(options)
            ],
            'isRight': [option['isRight'] for option in options],
        })
    return quiz_items
# Crawl the category tree, then write one workbook per second-level
# category with one sheet per practice day.
category_data = get_category()
print(category_data)
for category in category_data:
    first_title = category['first_category_title']
    for second_category in category['data']:
        second_title = second_category['second_category_title']
        file_name = first_title + '-' + second_title + '.xlsx'
        sheet_name = ''
        for quiz_url_dic in second_category['quiz_list']:
            # Each entry is a single-item dict: {day: quiz page URL}.
            sheet_name, quiz_page_url = next(iter(quiz_url_dic.items()))
            # Download the questions for that day, then flatten them.
            quiz_response = get_quiz_data(quiz_page_url)
            quiz_items = data_process(quiz_response)
            # Always emit the four historical answer columns; widen the
            # sheet when a question carries more options so none are lost.
            option_count = max([4] + [len(item['answers']) for item in quiz_items])
            answer_columns = ['Answers-' + chr(ord('A') + i) for i in range(option_count)]
            quiz_data = []
            for quiz_item in quiz_items:
                answers = quiz_item['answers']
                row = {'Question': quiz_item['question'], 'isRight': quiz_item['isRight']}
                for position, column in enumerate(answer_columns):
                    # Questions with fewer options leave the trailing cells
                    # empty instead of raising IndexError.
                    row[column] = answers[position] if position < len(answers) else None
                quiz_data.append(row)
                print(quiz_item)
            table = pandas.DataFrame(
                columns=['Question'] + answer_columns + ['isRight'], data=quiz_data)
            # Append a new sheet when the workbook already exists; otherwise
            # create it (if_sheet_exists is only meaningful in append mode).
            if os.path.exists(file_name):
                mode = 'a'
                if_sheet_exists = 'new'
            else:
                mode = 'w'
                if_sheet_exists = None
            # Write the sheet to the local Excel file.
            with pandas.ExcelWriter(file_name, engine='openpyxl', mode=mode,
                                    if_sheet_exists=if_sheet_exists) as writer:
                table.to_excel(writer, sheet_name=sheet_name, index=False)
            # Plain names inside the f-string: reusing the enclosing quote
            # character in a replacement field is a SyntaxError before 3.12.
            print(f'{first_title} - {second_title} : {sheet_name} - 写入数据完毕')
print('写入本地xlsx完毕!')
Last updated