提交 6173700f 作者: 薛凌堃

11/16

上级 d3fd7612
import os import os
...@@ -36,6 +36,11 @@ class Policy(): ...@@ -36,6 +36,11 @@ class Policy():
data_json = req.json() data_json = req.json()
return data_json return data_json
def requestPost_html(self, headers, url, payload):
    """POST *payload* to *url* with the given headers and parse the reply as HTML.

    :param headers: HTTP request headers dict.
    :param url: target URL.
    :param payload: form data for the POST body.
    :return: a BeautifulSoup tree built with the 'html.parser' backend.
    """
    response = requests.post(url=url, headers=headers, data=payload)
    return BeautifulSoup(response.content, 'html.parser')
def createDriver(self): def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe' chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver) path = Service(chrome_driver)
...@@ -48,11 +53,22 @@ class Policy(): ...@@ -48,11 +53,22 @@ class Policy():
return driver return driver
def deletep(self,soup,i,tag,attribute_to_delete,value_to_delete): def deletep(self,soup,i,tag,attribute_to_delete,value_to_delete):
# 查找带有指定属性的P标签并删除 # 查找带有指定属性的标签并删除
tags = soup.find_all(tag, {attribute_to_delete: value_to_delete}) tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
for tag in tags[:i]: for tag in tags[:i]:
tag.decompose() tag.decompose()
def deletespan(self, td):
    """Remove every <span> element (and its contents) from *td* in place.

    Thin wrapper around :meth:`deletetag` — the original duplicated its
    loop for the span-only case; delegating keeps the two consistent.

    :param td: a BeautifulSoup Tag to clean up.
    """
    self.deletetag(td, 'span')  # 删除span标签
def deletetag(self, td, tag):
    """Remove every occurrence of *tag* from *td* in place.

    :param td: a BeautifulSoup Tag to clean up.
    :param tag: tag name to strip, e.g. 'span' or 'strong'.
    """
    # 删除指定标签 — extract() detaches the element from the tree.
    for found in td.find_all(tag):
        found.extract()
def deletek(self,soup): def deletek(self,soup):
# 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外) # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text()==' '): for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text()==' '):
...@@ -386,7 +402,6 @@ def zhengquanqihuo(wb,file_path): ...@@ -386,7 +402,6 @@ def zhengquanqihuo(wb,file_path):
#深圳交易所 http://www.szse.cn/lawrules/index.html #深圳交易所 http://www.szse.cn/lawrules/index.html
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs #上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse(wb,file_path): def sse(wb,file_path):
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761' url = 
'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
...@@ -483,20 +498,24 @@ def sse(wb,file_path): ...@@ -483,20 +498,24 @@ def sse(wb,file_path):
fu_jian_name = '' fu_jian_name = ''
fu_jian_href = '' fu_jian_href = ''
for fujian in fujian_list: for fujian in fujian_list:
file_href = fujian['href'] try:
file_href = fujian['href']
except:
continue
file_name = fujian.text.strip(' ') file_name = fujian.text.strip(' ')
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category in file_name: if category in file_name:
pass pass
else: else:
file_name = file_name + category file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','') rename_file = f'{str(num)}_{publishDate[:10]}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','').replace('-','')
fu_jian_name += rename_file + '\n' fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n' fu_jian_href += file_href + '\n'
try: try:
policy.downloadfile(file_href, f'{path}/{rename_file}') policy.downloadfile(file_href, f'{path}/{rename_file}')
except: except:
log.info(f'--{page}-{num}======{newsUrl}') log.info(f'--{page}-{num}======{newsUrl}')
continue
dic_info = { dic_info = {
'序号': num, '序号': num,
'标题': title, '标题': title,
...@@ -525,10 +544,7 @@ def sse(wb,file_path): ...@@ -525,10 +544,7 @@ def sse(wb,file_path):
baseCore.writerToExcel(DataList, file_path, sheet_name) baseCore.writerToExcel(DataList, file_path, sheet_name)
#北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs #北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing(): def beijing():
url = 'https://www.beijing.gov.cn/so/ss/query/s' url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = { payload = {
...@@ -622,12 +638,264 @@ def beijing(): ...@@ -622,12 +638,264 @@ def beijing():
# print(dic_info) # print(dic_info)
# break # break
# 河北省人民政府
def _hebei_payload(appName, pNo):
    """Build the search form payload for one request to hebei.gov.cn.

    The stray leading spaces in most field values are preserved verbatim —
    they are what the captured live request sent; changing them is untested.

    :param appName: channel name to search within, e.g. '热点专题'.
    :param pNo: page number as a string (the probe request uses ' 1').
    :return: dict suitable for an application/x-www-form-urlencoded POST.
    """
    return {'qAnd': ' ',
            'qOr': ' ',
            'qAll': ' ',
            'qNot': ' ',
            'startTime': ' ',
            'endTime': ' ',
            'advSearch': ' ',
            'originalSearchUrl': ' /search/pcRender?pageId=b97a38833f7343cebc31dec44544f684',
            'originalSearch': ' ',
            'app': ' 20c723b3a36e4906b0d91e6950d3dc29,8b157f193fb54ea7837d6380a37bb84a,0ad7369c794e4b2fbd6a4e76f9b84e9c,47fb4bc5c08d49d3b937c56c7960a909,9f54f8001d8747e4826d542fedcc6abc,b42baf238f43435ea7f796bec4ef7592,c943f166fb9042d288743397b12978fc,4b2050e6bb5d48dc9b200385dd99b4e3,7b5b083a6d254960ab34e34009e7e8d7,aa9d0848dcb84e8b919fd02b2da090b4,54e1a38a0e2846a4bc60258af5ced450,b88b6ee476494a16b66ea9cacc0456ee,4d0e00783a2e4037a6d3bdcd1fe98fb1,a8cb58e7494e4ae4a682b0e79df63dc6,f70c53427500439cbdeee467c5a185a6,d3f6aaca16c54e7b8626993314ad27b7,4d63955d8ec441018e8fddc6131997b0',
            'searchArea': ' ',
            'appName': appName,
            'sr': ' score desc',
            'advtime': ' ',
            'advrange': ' ',
            'articleType': ' ',
            'siteId': ' ',
            'siteName': ' ',
            'ext': ' ',
            'pNo': pNo,
            'deviceType': ' pc',
            'q2': ' ',
            'q': ' REITs'}


def _hebei_parse_detail(news_soup):
    """Extract article fields from a Hebei government article page.

    Two known page layouts are tried in turn: the ``div#zoom`` layout
    (with either an ``xxgk_bmxl`` info table or free-form <p> text) and
    the ``xxgk_gfxwjk_xqy-wznr`` layout.  Every text field defaults to ''
    and contentWithTag to None when nothing matches.

    :param news_soup: BeautifulSoup tree of the article page.
    :return: tuple (content, contentWithTag, source, pub_origin, pub_hao, writeDate).
    """
    writeDate = ''
    pub_hao = ''
    source = ''
    content = ''
    pub_origin = ''
    contentWithTag = None
    try:
        contentWithTag = news_soup.find('div', id='zoom')
        content = contentWithTag.text  # raises if div#zoom is absent -> fall through
        try:
            source = news_soup.find('div', class_='article_tit').find('li', class_='xl_laiyuan').text
        except Exception:
            source = ''
        try:
            info_ = news_soup.find('div', class_='xxgk_bmxl')
            policy.deletetag(info_, 'strong')
            policy.deletek(info_)
            info_list = info_.find_all('td')
            pub_origin = info_list[1].text
            pub_hao = info_list[2].text
        except Exception:
            # 处理空标签 — no info table: scan <p> text for the document
            # number (冀政办字〔…〕…号) or a Chinese-format write date,
            # stopping at whichever is found first.
            policy.deletek(news_soup)
            for p in news_soup.find_all('p'):
                text_pubhao = p.text
                if '号' in text_pubhao and '〔' in text_pubhao:
                    match = re.search(r"冀政办字〔\d+〕\d+号", text_pubhao)
                    if match:
                        pub_hao = match.group(0)
                        break
                    continue  # looked like a 文号 but didn't parse; skip date check
                match = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", p.text)
                if match:
                    writeDate = match.group(0)
                    break
    except Exception:
        # Alternative layout (规范性文件 pages).
        try:
            contentWithTag = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr')
            content = contentWithTag.text
            info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
            policy.deletespan(info)
            pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
            pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
            writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
        except Exception:
            pass
    return content, contentWithTag, source, pub_origin, pub_hao, writeDate


def hebei():
    """Crawl REITs policy articles from the Hebei provincial government search.

    Downloads attachments into data/河北省人民政府 and appends one row per
    article to an Excel sheet named after the searched channel.

    Relies on module-level globals: ``policy``, ``log``, ``baseCore``,
    ``wb`` and ``file_path`` (set in ``__main__``).
    """
    path = 'data/河北省人民政府'
    if not os.path.exists(path):
        os.makedirs(path)
    num = 0
    url = "https://www.hebei.gov.cn/search/pcRender?pageId=b97a38833f7343cebc31dec44544f684"
    appNames = ['热点专题']
    for appName in appNames:
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # BUG FIX: the captured 'Content-Length: 907' header was dropped —
            # the body length varies with pNo/appName, and requests computes
            # the correct value automatically.
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cookie': 'aisearchbehavior=42b33c1f2d22475bb571093346193219; JSESSIONID=251311215A6447AE509141936F4569D4; arialoadData=true',
            'Host': 'www.hebei.gov.cn',
            'Origin': 'https://www.hebei.gov.cn',
            'Referer': 'https://www.hebei.gov.cn/search/pcRender?pageId=b97a38833f7343cebc31dec44544f684',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"'
        }
        # 第一次请求获取页数 — probe request only to learn the page count.
        soup_ = policy.requestPost_html(headers, url, _hebei_payload(appName, ' 1'))
        pages = int(soup_.find('span', class_='default-result-tolal-records').find('span').text)
        DataList = []
        for page in range(1, pages + 1):
            soup = policy.requestPost_html(headers, url, _hebei_payload(appName, str(page)))
            list_news = soup.find_all('div', class_='szf-data-tpl1-item')
            for news in list_news:
                num += 1
                title = news.find('h3').text
                summary = news.find('div').find('p', class_='txtCon').text
                publishDate = news.find('div').find('p', class_='dates').text.replace('发布日期:', '').replace('\n', '')
                news_href = news.find('div').find('p', class_='txtCon').find('a')['href']
                # BUG FIX: headers must be passed as a keyword argument —
                # requests.get(url, headers) binds the dict to `params`
                # (the query string), not to the request headers.
                news_req = requests.get(news_href, headers=headers)
                news_soup = BeautifulSoup(news_req.content, 'html.parser')
                content, contentWithTag, source, pub_origin, pub_hao, writeDate = _hebei_parse_detail(news_soup)
                # 附件: collect and download attachments linked from the body.
                fu_jian_name = ''
                fu_jian_href = ''
                try:
                    fujian_href = contentWithTag.find_all('a')
                    # paserUrl rewrites relative hrefs in place; the tags found
                    # above are live references, so they see the absolute URLs.
                    policy.paserUrl(contentWithTag, news_href)
                    for file_href_ in fujian_href:
                        file_href = file_href_['href']
                        file_name = file_href_.text
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # NOTE(review): unlike sse(), illegal filename characters
                        # are not stripped here — confirm whether that is needed.
                        rename_file = f'{str(num)}_{publishDate}_{file_name}'
                        fu_jian_name += rename_file + '\n'
                        fu_jian_href += file_href + '\n'
                        policy.downloadfile(file_href, f'{path}/{rename_file}')
                except Exception:
                    # Best-effort: a missing/unparsable body simply means
                    # no attachments are recorded for this article.
                    pass
                if content == '':
                    continue  # article body not found in any known layout
                dic_info = {
                    '序号': num,
                    '标题': title.replace('\n', ''),
                    '发布时间': publishDate,
                    '来源': source,
                    '原文链接': news_href,
                    '发文时间': writeDate,
                    '发文机构': pub_origin,
                    '发文字号': pub_hao,
                    '摘要': summary.replace('\n', ''),
                    '正文': content,
                    '附件名称': fu_jian_name,
                    '附件链接': fu_jian_href,
                }
                print(dic_info)
                DataList.append(dic_info)
        sheet_name = appName
        if sheet_name in wb.sheetnames:
            log.info(f"{sheet_name}工作表已存在!")
        else:
            # 创建新工作表
            wb.create_sheet(sheet_name)
            print(f"{sheet_name}新工作表创建完成!")
        # 保存Excel文件
        wb.save(file_path)
        baseCore.writerToExcel(DataList, file_path, sheet_name)
        break
# 广东省人民政府
def guangdong():
    """Placeholder for the Guangdong provincial government crawler — not implemented yet."""
    pass
# 贵州省人民政府
def guizhou():
    """Fetch REITs search results from the Guizhou provincial government portal.

    Work in progress: each hit and its article page are fetched, but nothing
    is persisted yet — the loop body ends in ``pass``.  Relies on the
    module-level ``policy`` helper for HTTP requests.
    """
    url = "https://www.guizhou.gov.cn/irs/front/search"
    # Raw JSON body as a string (not a dict) — posted as-is by requestPost.
    payload = "{\"tenantId\":\"186\",\"configTenantId\":\"\",\"tenantIds\":\"\",\"searchWord\":\"REITs\",\"historySearchWords\":[\"REITs\"],\"dataTypeId\":\"965\",\"orderBy\":\"related\",\"searchBy\":\"all\",\"appendixType\":\"\",\"granularity\":\"ALL\",\"beginDateTime\":\"\",\"endDateTime\":\"\",\"isSearchForced\":0,\"filters\":[],\"pageNo\":1,\"pageSize\":9}"
    # Headers captured from a live browser session, including its cookie.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Content-Length': '291',
        'Content-Type': 'application/json',
        'Cookie': 'SESSION=MGY2NWQ3NjctZTNhZC00OTJhLWIzNGQtMDI1MmQ5MWVlZmNm; _trs_uv=lp15qktj_367_a56u; _trs_ua_s_1=lp15qktj_367_lac; yfx_c_g_u_id_10000921=_ck23111620182819813554574558557; yfx_f_l_v_t_10000921=f_t_1700137108976__r_t_1700137108976__v_t_1700137108976__r_c_0; arialoadData=false',
        'Host': 'www.guizhou.gov.cn',
        'Origin': 'https://www.guizhou.gov.cn',
        'Referer': 'https://www.guizhou.gov.cn/so/search.shtml?tenantId=186&tenantIds=&configTenantId=&searchWord=REITs&dataTypeId=965&sign=6bd8592c-2e19-4f22-ae6d-f129f729e795',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    jsonData = policy.requestPost(headers, url, payload)
    # Search hits live under data.middle.list in the response JSON.
    result_list = jsonData['data']['middle']["list"]
    for datainfo in result_list:
        title = datainfo['title']
        publishData = datainfo['time']
        source = datainfo['source']
        summary = datainfo['content']
        newsUrl = datainfo['url']
        soup = policy.getrequest_soup(headers,newsUrl)
        # print(soup)
        # NOTE(review): this is the page's <head><title> Tag object, not a
        # document number string — .text extraction and real 发文字号 parsing
        # still to be implemented.
        pub_hao = soup.find('head').find('title')
        print(pub_hao)
        pass
if __name__=="__main__": if __name__=="__main__":
file_path = f'data/REITs国家改革发展委员会.xlsx' file_path = f'data/REITs深圳交易所.xlsx'
wb = policy.createfile(file_path) wb = policy.createfile(file_path)
# reform(wb,file_path) # reform(wb,file_path)
# shenzhen()
# zhengquanqihuo(wb,file_path) # zhengquanqihuo(wb,file_path)
sse(wb,file_path) # sse(wb,file_path)
# hebei()
guizhou()
# zhengquanqihuo() # zhengquanqihuo()
\ No newline at end of file
""" """
...@@ -291,18 +291,18 @@ def run_threads(num_threads,esMethod): ...@@ -291,18 +291,18 @@ def run_threads(num_threads,esMethod):
thread.join() thread.join()
if __name__ == '__main__': if __name__ == '__main__':
# while True: for i in range(0,5):
esMethod = EsMethod() esMethod = EsMethod()
p = 0 p = 0
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p) result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value'] total = result['hits']['total']['value']
# if total == 0: if total == 0:
# log.info('++++已没有数据+++++') log.info('++++已没有数据+++++')
# break break
start = time.time() start = time.time()
num_threads = 8 num_threads = 10
run_threads(num_threads,esMethod) run_threads(num_threads,esMethod)
log.info(f'8线程 每个处理200条数据 总耗时{time.time()-start}秒') log.info(f'10线程 每个处理200条数据 总耗时{time.time()-start}秒')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论