Commit 9fa18dac by 薛凌堃

Merge remote-tracking branch 'origin/master'


# Scrape the revision-history timeline ("HistoryAssociation") of a statute
# from sclx.pkulaw.com: search by exact title, follow the history page, and
# log each entry's publish date, theme and relevance.
import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
from requests.packages import urllib3

urllib3.disable_warnings()

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # 'Cookie': 'pkulaw_v6_sessionid=v1z41wppegb5phyqattpozp4; agency=sclx.pkulaw.com; referer=; Hm_lvt_25f0770f77e5e05b70c050b7d0f2f4a8=1707209811; Hm_lpvt_25f0770f77e5e05b70c050b7d0f2f4a8=1707209811; xCloseNew=7',
    'Host': 'sclx.pkulaw.com',
    'Origin': 'https://sclx.pkulaw.com',
    'Pragma': 'no-cache',
    'Referer': 'https://sclx.pkulaw.com/law',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


@retry(tries=2, delay=5)
def getHref(Keywords):
    """Search the statute by exact title and return the URL of its
    revision-history ('HistoryAssociation') page, or '' if none is found."""
    data = {
        'Menu': 'law',
        'Keywords': Keywords,
        'PreKeywords': Keywords,
        'SearchKeywordType': 'Title',
        'MatchType': 'Exact',
        'RangeType': 'Piece',
        'Library': 'chl',
        'ClassFlag': 'chl',
        'GroupLibraries': '',
        'QuerySearchCondition': 'Title+Exact+Piece+0',
        'QueryOnClick': False,
        'AfterSearch': True,
        'RequestFrom': 'btnSearch',
        'SearchInResult': '',
        'PreviousLib': 'chl',
        'RecordShowType': 'List',
        'ClassCodeKey': ',,,,,,',
        'IsSearchErrorKeyword': '',
        'FirstQueryKeywords': Keywords,
        'FirstQueryKeywordType': 'Title',
        'IsSynonymSearch': 'false',  # was listed twice; a dict keeps only one
        'X-Requested-With': 'XMLHttpRequest',
    }
    ip = baseCore.get_proxy()
    url = 'https://sclx.pkulaw.com/law/chl'
    # The form-encoded payload plus the Origin/Content-Type headers mark this
    # as an AJAX form submission, so send it as a POST (the original used
    # requests.get, whose request body many servers ignore).
    req = requests.post(url, headers=headers, data=data, proxies=ip, verify=False)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    try:
        tag = soup.find('div', class_='accompanying-wrap').find('div', class_='item').find(
            'li', attrs={'name': 'HistoryAssociation'})
        href = 'https://sclx.pkulaw.com' + tag.get('url')
    except (AttributeError, TypeError):
        # Any missing node leaves tag (or its 'url' attribute) as None.
        href = ''
    return href


@retry(tries=2, delay=5)
def getData(href):
    """Fetch the revision-history page and log every timeline entry."""
    ip = baseCore.get_proxy()
    req = requests.get(href, headers=headers, proxies=ip, verify=False)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    li_list = soup.find_all('li')
    for li in li_list:
        time_tag = li.find('span', class_='time')
        if time_tag is None:
            continue  # not a timeline entry; the page has unrelated <li> tags
        publishDate = time_tag.text.strip()
        try:
            theme = li.find('div', class_='theme').text.strip()
        except AttributeError:
            theme = ''
        try:
            relevance = li.find('div', class_='relevance').text.strip()
        except AttributeError:
            relevance = ''
        log.info(f'{publishDate}==={theme}==={relevance}')


def doJob():
    Keywords = '中华人民共和国公司法(2023修订)'
    href = getHref(Keywords)
    if href:
        getData(href)


if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Release BaseCore resources even if the scrape raises.
        baseCore.close()
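
Note: base.BaseCore is an in-house helper module that is not part of this commit, so the script will not run outside that codebase as-is. A minimal stand-in sketch, assuming the class only needs the three methods used above (getLogger, get_proxy, close) and lives at base/BaseCore.py, could look like the following; the real module may do considerably more (proxy rotation, database handles):

# Hypothetical stand-in for base/BaseCore.py -- an assumption for local
# testing, not the real module shipped with this project.
import logging


class BaseCore:
    def getLogger(self):
        # Plain stdlib logger in place of whatever the real helper wires up.
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')
        return logging.getLogger('pkulaw')

    def get_proxy(self):
        # The real helper presumably returns a requests-style proxies dict,
        # e.g. {'http': 'http://host:port', 'https': 'http://host:port'}.
        # Returning None makes requests connect directly.
        return None

    def close(self):
        # Nothing to release in this stub.
        pass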