提交 422c5516 作者: LiuLiYuan

审计法规 02/06

上级 e976fee1
import time
import pandas as pd
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from retry import retry from retry import retry
...@@ -69,11 +72,13 @@ def getHref(Keywords): ...@@ -69,11 +72,13 @@ def getHref(Keywords):
href = 'https://sclx.pkulaw.com' + tag.get('url') href = 'https://sclx.pkulaw.com' + tag.get('url')
except: except:
href = '' href = ''
time.sleep(1)
return href return href
@retry(tries=2, delay=5) @retry(tries=2, delay=5)
def getData(href): def getData(href):
data = []
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
req = requests.get(href, headers=headers, proxies=ip, verify=False) req = requests.get(href, headers=headers, proxies=ip, verify=False)
req.encoding = req.apparent_encoding req.encoding = req.apparent_encoding
...@@ -89,15 +94,18 @@ def getData(href): ...@@ -89,15 +94,18 @@ def getData(href):
relevance = li.find('div', class_='relevance').text.strip() relevance = li.find('div', class_='relevance').text.strip()
except: except:
relevance = '' relevance = ''
log.info(f'{publishDate}==={theme}==={relevance}') data.append([publishDate,theme,relevance])
time.sleep(1)
return data
def doJob():
    """Look up one statute by title, scrape its page, and print the rows.

    The hard-coded title is passed to ``getHref``. When that returns a
    non-empty link, the rows returned by ``getData`` for that link are
    collected. The collected rows (possibly none) are loaded into a pandas
    DataFrame, which is printed to stdout.

    Returns:
        None. The result is only printed.
    """
    rows = []
    keywords = '中华人民共和国公司法(2023修订)'
    href = getHref(keywords)
    # getHref falls back to an empty string on failure, so only fetch the
    # page when a link was actually resolved.
    if href:
        # getData returns a list of [publishDate, theme, relevance] rows.
        rows.extend(getData(href))
    frame = pd.DataFrame(rows)
    print(frame)


if __name__ == '__main__':
    doJob()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论