Commit 841ed4b6 by 薛凌堃

北大法宝 regulations (pkulaw)

Parent 9b27f1fe
import csv
import time
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.common import StaleElementReferenceException
from base import BaseCore
from requests.packages import urllib3
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

urllib3.disable_warnings()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
...@@ -30,52 +40,100 @@ headers = {
    'sec-ch-ua-platform': '"Windows"',
}
# TODO: use a simulated (Selenium-driven) browser
def create_driver():
    path = r'D:\soft\msedgedriver.exe'
    # options = webdriver.EdgeOptions()
    options = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [],
            "args": ["--start-maximized"]  # launch the window maximized
        }
    }
    driver = webdriver.Edge(executable_path=path, capabilities=options)
    return driver
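The capabilities-dict call above matches the Selenium 3 Edge API (the commented-out EdgeOptions line hints at the newer style). For reference, a minimal sketch of the same setup against the Selenium 4 Service/Options API; create_driver_v4 is an illustrative name, not part of this commit:

# Sketch only: Selenium 4 removed executable_path/capabilities in favor of
# Service and Options; this mirrors the settings used above.
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options

def create_driver_v4():
    options = Options()
    options.add_argument("--start-maximized")  # same maximize flag as above
    service = Service(executable_path=r'D:\soft\msedgedriver.exe')
    return webdriver.Edge(service=service, options=options)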
@retry(tries=2, delay=5)
def getHref(Keywords, driver):
    # data = {
    # 'Menu': 'law',
    # 'Keywords': Keywords,
    # 'PreKeywords': Keywords,
    # 'SearchKeywordType': 'Title',
    # 'MatchType': 'Exact',
    # 'RangeType': 'Piece',
    # 'Library': 'chl',
    # 'ClassFlag': 'chl',
    # 'GroupLibraries': '',
    # 'QuerySearchCondition': 'Title+Exact+Piece+0',
    # 'QueryOnClick': False,
    # 'AfterSearch': True,
    # 'RequestFrom': 'btnSearch',
    # 'SearchInResult': '',
    # 'PreviousLib': 'chl',
    # 'IsSynonymSearch': 'false',
    # 'RecordShowType': 'List',
    # 'ClassCodeKey': ',,,,,,',
    # 'IsSearchErrorKeyword': '',
    # 'FirstQueryKeywords': Keywords,
    # 'FirstQueryKeywordType': 'Title',
    # 'IsSynonymSearch': 'false',
    # 'X-Requested-With': 'XMLHttpRequest',
    # }
    driver.get('https://sclx.pkulaw.com/law')
    # ip = baseCore.get_proxy()
    driver.find_element(By.ID, 'txtSearch').send_keys(Keywords)
    time.sleep(0.5)
    driver.find_element(By.CLASS_NAME, 'btn-search').click()
    wait = WebDriverWait(driver, 30)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "accompanying-wrap")))
    getpart = driver.find_element(By.CLASS_NAME, 'accompanying-wrap')
    # li_list = getpart.find_elements(By.TAG_NAME, 'li')
    # for li in li_list:
    driver.execute_script("arguments[0].scrollIntoView();", getpart)
    time.sleep(2)
    try:
        element = getpart.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
        time.sleep(1)
        driver.execute_script("arguments[0].scrollIntoView();", element)
        time.sleep(1)
        element.click()
        href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "a-tab-col")))
        info_part = driver.find_element(By.CLASS_NAME, 'a-tab-col').find_element(By.XPATH, './/div[@name="HistoryAssociation"]')
    # except Exception as e:
    except StaleElementReferenceException:
        # the element went stale, so locate it again
        element = driver.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
        element.click()  # and retry the interaction
        href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
        # log.info(e)
        # href = ''
    return href
    # url = 'https://sclx.pkulaw.com/law/chl'
    # req = requests.post(url, headers=headers, data=data, proxies=ip)
    # req = requests.post(url, headers=headers, data=data, verify=False)
    # req.encoding = req.apparent_encoding
    # soup = BeautifulSoup(req.text, 'html.parser')
    # try:
    #     tag = soup.find('div', class_='accompanying-wrap').find('div', class_='item').find('li', attrs={
    #         'name': 'HistoryAssociation'})
    #     href = 'https://sclx.pkulaw.com' + tag.get('url')
    # except:
    #     href = ''
    # return href
@retry(tries=3, delay=5)
def getData(href, Keywords):
    term = Keywords
    # ip = baseCore.get_proxy()
    # req = requests.get(href, headers=headers, proxies=ip)
    req = requests.get(href, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    li_list = soup.find_all('li')
...@@ -85,19 +143,59 @@ def getData(href):
            theme = li.find('div', class_='theme').text.strip()
        except:
            theme = ''
        # try:
        #     relevance = li.find('div', class_='relevance').text.strip()
        # except:
        #     relevance = ''
        # log.info(f'{publishDate}==={theme}==')
        term += ',' + theme + '_' + publishDate
    log.info(term)
    if ',' not in term or '_' not in term:
        r.rpush('ShenjisclxError:', Keywords)
        return None
    return term
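getData flattens each history entry into a comma-separated row of theme_publishDate pairs appended to the keyword. A minimal sketch of reading that format back; parse_term is a hypothetical helper, assuming themes contain no commas:

# Hypothetical helper: split a getData row back into its fields.
# Row format built above: '<keyword>,<theme>_<publishDate>,...'
def parse_term(term):
    keyword, *pairs = term.split(',')
    history = []
    for pair in pairs:
        if '_' in pair:
            theme, publish_date = pair.rsplit('_', 1)  # date follows the last '_'
            history.append((theme, publish_date))
    return keyword, history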
def doJob():
    data_list = []
    driver = create_driver()
    driver.maximize_window()

    while True:
        try:
            Keywords = r.lpop('Shenjisclx:').decode()
            # Keywords = '中华人民共和国银行业监督管理法(2006修正)'
        except:
            Keywords = ''
        if Keywords:
            try:
                href = getHref(Keywords, driver)
                if href:
                    r.rpush('ShenjisclxHref:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====found=== {href}')
                    term = getData(href, Keywords)
                else:
                    term = Keywords + ','
                    r.rpush('ShenjisclxHrefNull:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====not found')
                if term:
                    # data_list.append(term)
                    r.rpush('ShenjisclxReault:', term)
            except:
                r.rpush('ShenjisclxError:', Keywords)
                continue
            time.sleep(2)
        else:
            break
    # print(data_list)
    # with open('./output.csv', 'w', newline='') as file:
    #     writer = csv.writer(file)
    #
    #     # write the rows
    #     for row in data_list:
    #         writer.writerow(row.split(','))
    #
    #     print('Data successfully written to the CSV file')

if __name__ == '__main__':
    doJob()
...
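For context, doJob consumes law titles from the Redis list 'Shenjisclx:' and reports to 'ShenjisclxHref:', 'ShenjisclxHrefNull:', 'ShenjisclxReault:', and 'ShenjisclxError:'. A minimal seeding/inspection sketch against the same instance; the connection values are copied from the code above, and the sample title is the commented-out test keyword:

import redis

# Same Redis instance and db as the scraper above.
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

# Seed the work queue with law titles to look up.
r.rpush('Shenjisclx:', '中华人民共和国银行业监督管理法(2006修正)')

# After doJob() drains the queue, inspect the results
# (each row is '<keyword>,<theme>_<publishDate>,...').
for raw in r.lrange('ShenjisclxReault:', 0, -1):
    print(raw.decode())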