Commit 841ed4b6 by 薛凌堃

北大法宝 regulations (pkulaw)

Parent 9b27f1fe
import csv
import time
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.common import StaleElementReferenceException
from base import BaseCore
from requests.packages import urllib3
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

urllib3.disable_warnings()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
...@@ -30,52 +40,100 @@ headers = {
    'sec-ch-ua-platform': '"Windows"',
}
# TODO: use a simulated (Selenium-driven) browser
def create_driver():
    path = r'D:\soft\msedgedriver.exe'
    # options = webdriver.EdgeOptions()
    options = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [],
            "args": ["--start-maximized"]  # launch the window maximized
        }
    }
    driver = webdriver.Edge(executable_path=path, capabilities=options)
    return driver
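The capabilities-dict call above matches the Selenium 3 Edge API (the commented-out EdgeOptions line hints at the newer style). For reference, a minimal sketch of the same setup against the Selenium 4 Service/Options API; create_driver_v4 is an illustrative name, not part of this commit:

# Sketch only: Selenium 4 removed executable_path/capabilities in favor of
# Service and Options; this mirrors the settings used above.
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options

def create_driver_v4():
    options = Options()
    options.add_argument("--start-maximized")  # same maximize flag as above
    service = Service(executable_path=r'D:\soft\msedgedriver.exe')
    return webdriver.Edge(service=service, options=options)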
@retry(tries=2, delay=5)
def getHref(Keywords, driver):
    # data = {
    # 'Menu': 'law',
    # 'Keywords': Keywords,
    # 'PreKeywords': Keywords,
    # 'SearchKeywordType': 'Title',
    # 'MatchType': 'Exact',
    # 'RangeType': 'Piece',
    # 'Library': 'chl',
    # 'ClassFlag': 'chl',
    # 'GroupLibraries': '',
    # 'QuerySearchCondition': 'Title+Exact+Piece+0',
    # 'QueryOnClick': False,
    # 'AfterSearch': True,
    # 'RequestFrom': 'btnSearch',
    # 'SearchInResult': '',
    # 'PreviousLib': 'chl',
    # 'IsSynonymSearch': 'false',
    # 'RecordShowType': 'List',
    # 'ClassCodeKey': ',,,,,,',
    # 'IsSearchErrorKeyword': '',
    # 'FirstQueryKeywords': Keywords,
    # 'FirstQueryKeywordType': 'Title',
    # 'IsSynonymSearch': 'false',
    # 'X-Requested-With': 'XMLHttpRequest',
    # }
    driver.get('https://sclx.pkulaw.com/law')
    # ip = baseCore.get_proxy()
    driver.find_element(By.ID, 'txtSearch').send_keys(Keywords)
    time.sleep(0.5)
    driver.find_element(By.CLASS_NAME, 'btn-search').click()
    wait = WebDriverWait(driver, 30)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "accompanying-wrap")))
    getpart = driver.find_element(By.CLASS_NAME, 'accompanying-wrap')
    # li_list = getpart.find_elements(By.TAG_NAME, 'li')
    # for li in li_list:
    driver.execute_script("arguments[0].scrollIntoView();", getpart)
    time.sleep(2)
    try:
        element = getpart.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
        time.sleep(1)
        driver.execute_script("arguments[0].scrollIntoView();", element)
        time.sleep(1)
        element.click()
        href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "a-tab-col")))
        info_part = driver.find_element(By.CLASS_NAME, 'a-tab-col').find_element(By.XPATH, './/div[@name="HistoryAssociation"]')
    # except Exception as e:
    except StaleElementReferenceException:
        # the element went stale, so locate it again
        element = driver.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
        element.click()  # and retry the interaction
        href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
        # log.info(e)
        # href = ''
    return href
    # url = 'https://sclx.pkulaw.com/law/chl'
    # req = requests.post(url, headers=headers, data=data, proxies=ip)
    # req = requests.post(url, headers=headers, data=data, verify=False)
    # req.encoding = req.apparent_encoding
    # soup = BeautifulSoup(req.text, 'html.parser')
    # try:
    #     tag = soup.find('div', class_='accompanying-wrap').find('div', class_='item').find('li', attrs={
    #         'name': 'HistoryAssociation'})
    #     href = 'https://sclx.pkulaw.com' + tag.get('url')
    # except:
    #     href = ''
    # return href
@retry(tries=3, delay=5)
def getData(href, Keywords):
    term = Keywords
    # ip = baseCore.get_proxy()
    # req = requests.get(href, headers=headers, proxies=ip)
    req = requests.get(href, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    li_list = soup.find_all('li')
...@@ -85,19 +143,59 @@ def getData(href):
            theme = li.find('div', class_='theme').text.strip()
        except:
            theme = ''
        # try:
        #     relevance = li.find('div', class_='relevance').text.strip()
        # except:
        #     relevance = ''
        # log.info(f'{publishDate}==={theme}==')
        term += ',' + theme + '_' + publishDate
    log.info(term)
    if ',' not in term or '_' not in term:
        r.rpush('ShenjisclxError:', Keywords)
        return None
    return term
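getData flattens each history entry into a comma-separated row of theme_publishDate pairs appended to the keyword. A minimal sketch of reading that format back; parse_term is a hypothetical helper, assuming themes contain no commas:

# Hypothetical helper: split a getData row back into its fields.
# Row format built above: '<keyword>,<theme>_<publishDate>,...'
def parse_term(term):
    keyword, *pairs = term.split(',')
    history = []
    for pair in pairs:
        if '_' in pair:
            theme, publish_date = pair.rsplit('_', 1)  # date follows the last '_'
            history.append((theme, publish_date))
    return keyword, history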
def doJob():
    data_list = []
    driver = create_driver()
    driver.maximize_window()

    while True:
        try:
            Keywords = r.lpop('Shenjisclx:').decode()
            # Keywords = '中华人民共和国银行业监督管理法(2006修正)'
        except:
            Keywords = ''
        if Keywords:
            try:
                href = getHref(Keywords, driver)
                if href:
                    r.rpush('ShenjisclxHref:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====found=== {href}')
                    term = getData(href, Keywords)
                else:
                    term = Keywords + ','
                    r.rpush('ShenjisclxHrefNull:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====not found')
                if term:
                    # data_list.append(term)
                    r.rpush('ShenjisclxReault:', term)
            except:
                r.rpush('ShenjisclxError:', Keywords)
                continue
            time.sleep(2)
        else:
            break
    # print(data_list)
    # with open('./output.csv', 'w', newline='') as file:
    #     writer = csv.writer(file)
    #
    #     # write the rows
    #     for row in data_list:
    #         writer.writerow(row.split(','))
    #
    #     print('Data successfully written to the CSV file')

if __name__ == '__main__':
    doJob()
...
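For context, doJob consumes law titles from the Redis list 'Shenjisclx:' and reports to 'ShenjisclxHref:', 'ShenjisclxHrefNull:', 'ShenjisclxReault:', and 'ShenjisclxError:'. A minimal seeding/inspection sketch against the same instance; the connection values are copied from the code above, and the sample title is the commented-out test keyword:

import redis

# Same Redis instance and db as the scraper above.
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

# Seed the work queue with law titles to look up.
r.rpush('Shenjisclx:', '中华人民共和国银行业监督管理法(2006修正)')

# After doJob() drains the queue, inspect the results
# (each row is '<keyword>,<theme>_<publishDate>,...').
for raw in r.lrange('ShenjisclxReault:', 0, -1):
    print(raw.decode())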