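# Proxy_AntiBanIP.py ("某WZ", i.e. "a certain WZ site")
# Crawls list pages through a pool of rotating proxies to avoid IP bans.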
import requests,random,time
from lxml import etree
# Proxy-extraction API endpoint.
# NOTE(review): the API token is embedded in this URL; consider loading it
# from the environment or a config file instead of committing it.
proxy_url = 'http://proxy.siyetian.com/apis_get.html?token=MesJWLNp2Y35keJdXTqV1dNRVT45ERVdnT31STqFUeNpXQ10EVBl3TE1UNPR1Yx8ERjFTTE1ke.QO4MTOxMDOzcTA&limit=5&type=1&time=&data_format=json'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}
# Listing URL prefix; the page number is appended to it.
url_init = 'https://wz.sun0769.com/political/index/politicsNewest?id=1&page='

# Seconds to wait on any single HTTP request; without a timeout a dead proxy
# would block the crawl forever.
REQUEST_TIMEOUT = 10
# Number of (randomly chosen) proxies to try for one page before skipping it.
MAX_ATTEMPTS = 3
# Absolute XPath of the title links on a listing page.
TITLE_XPATH = '/html/body/div[2]/div[3]/ul[2]/li/span[3]/a/text()'


def build_proxy_pool(payload):
    """Turn the proxy API's decoded JSON reply into a list of proxy mappings.

    Args:
        payload: Decoded JSON from the proxy API. The expected form is a dict
            whose ``'data'`` entry is a list of dicts with ``'ip'`` and
            ``'port'`` keys.

    Returns:
        A list of ``{'http': ..., 'https': ...}`` dicts suitable for the
        ``proxies=`` argument of ``requests``. The list is empty when the
        payload has no usable ``'data'`` list (e.g. an error reply);
        malformed entries are skipped instead of raising.
    """
    entries = payload.get('data') if isinstance(payload, dict) else None
    if not isinstance(entries, list):
        return []
    pool = []
    for entry in entries:
        try:
            socks5 = f"socks5://{entry['ip']}:{entry['port']}"
        except (KeyError, TypeError):
            # Entry is not a mapping or lacks ip/port; ignore it.
            continue
        pool.append({'http': socks5, 'https': socks5})
    return pool


def page_url(page):
    """Return the listing URL for page number *page*."""
    return url_init + str(page)


def fetch_titles(page, proxies_pool):
    """Download one listing page through a random proxy and extract titles.

    Up to ``MAX_ATTEMPTS`` proxies are tried, each picked at random from
    *proxies_pool* (which must be non-empty).

    Args:
        page: Page number to fetch.
        proxies_pool: Non-empty list of proxy mappings as produced by
            ``build_proxy_pool``.

    Returns:
        The list of title strings matched by ``TITLE_XPATH`` (possibly
        empty), or ``None`` if every attempt failed.
    """
    url = page_url(page)
    for _ in range(MAX_ATTEMPTS):
        choice_proxy = random.choice(proxies_pool)
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies=choice_proxy,
                timeout=REQUEST_TIMEOUT,
            )
            # Do not parse HTTP error pages (403/5xx) as if they were listings.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Page{page}: request via {choice_proxy['http']} failed: {exc}")
            continue
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)
        if tree is None:
            # The parser may yield no tree for an empty body; treat that as
            # "no titles" rather than crashing on tree.xpath.
            return []
        return tree.xpath(TITLE_XPATH)
    return None


def main():
    """Fetch proxies from the API, build the pool, and crawl pages 1-100."""
    # Pull proxy IPs from the extraction API.
    try:
        reply = requests.get(proxy_url, headers=headers, timeout=REQUEST_TIMEOUT)
        reply.raise_for_status()
        res_proxies = reply.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a reply body that is not valid JSON.
        raise SystemExit(f'Failed to fetch proxies from the API: {exc}')
    print(res_proxies)

    # Build the proxy pool.
    proxies_pool = build_proxy_pool(res_proxies)
    print(proxies_pool)
    if not proxies_pool:
        # random.choice would raise IndexError on an empty pool.
        raise SystemExit('Proxy API returned no usable proxies; aborting.')

    # Crawl 100 pages through the proxy pool.
    for i in range(1, 101):
        time.sleep(0.1)
        titles = fetch_titles(i, proxies_pool)
        if titles is None:
            print(f'Page{i}: skipped after {MAX_ATTEMPTS} failed attempts')
            continue
        print(f'Page{i}:', titles)


if __name__ == '__main__':
    main()