Commit eeb41ef7 by 薛凌堃

Policies and regulations

Parent d5722767
@@ -725,7 +725,7 @@ def ji_lin():
     if is_href:
         continue
     try:
-        # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj/202211/t20221123_2310750.html'
+        # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
         href_text = requests.get(url=real_href, headers=headers, verify=False)
         i_html = href_text.text.encode("ISO-8859-1")
         i_html = i_html.decode("utf-8")
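
Review note: the ISO-8859-1 round-trip in the hunk above is the standard fix for requests' charset fallback: when a server sends no charset header, requests decodes the body as ISO-8859-1, so UTF-8 bytes arrive garbled. A minimal sketch under that assumption (fetch_utf8 is a hypothetical helper, not part of this patch):

    import requests

    def fetch_utf8(url, headers=None):
        resp = requests.get(url, headers=headers, verify=False)
        # undo requests' ISO-8859-1 fallback and re-decode as UTF-8;
        # resp.content.decode("utf-8") is the simpler equivalent, and
        # resp.encoding = resp.apparent_encoding (used in fu_jian below)
        # lets requests sniff the charset instead
        return resp.text.encode("ISO-8859-1").decode("utf-8")
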
@@ -733,7 +733,8 @@ def ji_lin():
     # print(i_soup)
     # convert relative paths into absolute paths
     soup = paserUrl(i_soup, real_href)
-    text = str(soup.prettify())
+    soup.prettify()
     try:
         i_come = i_soup.find('span', class_='source')
         i_time = i_soup.find('span', class_='time')
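
Review note: paserUrl's body is not shown in this diff. Assuming it rewrites relative href/src attributes against the page URL, a sketch could look like the following (paser_url_sketch is hypothetical, not the project's implementation):

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    def paser_url_sketch(soup, base_url):
        # rewrite relative links and image sources into absolute URLs
        for tag in soup.find_all(href=True):
            tag['href'] = urljoin(base_url, tag['href'])
        for tag in soup.find_all(src=True):
            tag['src'] = urljoin(base_url, tag['src'])
        return soup
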
@@ -756,9 +757,18 @@ def ji_lin():
     pub_time = pub.find(class_='left').find('span', class_='time').text
     pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
     # print(pub_come)
-    i_content = i_soup.find(class_='zsy_comain')
+    i_content = soup.find(class_='zsy_comain')
     if i_content:
-        content = str(i_content)
+        print(real_href)
+        # remove the "scan QR code" widget
+        soup.find('div', id='qr_container').decompose()
+        soup.find('div', id='div_div').decompose()
+        # remove style tags
+        for styleTag in soup.find_all('style'):
+            styleTag.extract()
+        contentWithTag = soup.find(class_='zsy_comain')
+        content = contentWithTag.text.strip()
         # issued document number
         find_hao = i_content.find_all('p')[:3]
         pub_hao = ''
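
Review note: decompose() destroys a tag in place while extract() detaches and returns it; both removal calls above are unguarded, so find() returning None (e.g. a page without the QR widget) raises AttributeError. A runnable sketch of the safer pattern:

    from bs4 import BeautifulSoup

    html = '<div id="qr_container">scan</div><style>p{}</style><p>body</p>'
    soup = BeautifulSoup(html, 'html.parser')
    qr = soup.find('div', id='qr_container')
    if qr:                       # guard: find() returns None when absent
        qr.decompose()           # removes and frees the tag
    for style_tag in soup.find_all('style'):
        style_tag.extract()      # detaches the tag (and returns it)
    print(soup)                  # -> <p>body</p>
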
@@ -767,7 +777,7 @@ def ji_lin():
             pub_hao = j.text
         else:
             continue
-    fj = i_soup.find('div', style='width:920px; margin: 0 auto;')
+    fj = soup.find('div', style='width:920px; margin: 0 auto;')
     if fj:
         li_list = fj.find_all('li')
         for li in li_list:
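
Review note: matching on the literal style string 'width:920px; margin: 0 auto;' breaks as soon as the inline CSS shifts by one space. BeautifulSoup also accepts a compiled regex as an attribute value, which would be sturdier here (a sketch, not part of the patch):

    import re
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div style="width:920px; margin:0 auto;">fj</div>', 'html.parser')
    fj = soup.find('div', style=re.compile(r'width:\s*920px'))  # survives spacing changes
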
@@ -790,16 +800,20 @@ def ji_lin():
         else:
             continue
     else:
-        i_content = i_soup.find(class_="content")
+        i_content = soup.find(class_="content")
         # remove the attachment entries from the article body
         pattern = r'\d+\.'
+        # pattern = r"附件:\d+\.\s*(.*)"
         for p in i_content.find_all('div')[-10:]:
             p_text = p.text
             matches = re.findall(pattern, p_text)
-            for k in matches:
-                if k in p_text:
-                    p.extract()
-        content = str(i_content)
+            if matches:
+                for k in matches:
+                    if k in p_text:
+                        p.extract()
+        contentWithTag = i_content
+        content = contentWithTag.text.strip()
     # find attachments and upload them to the file server
     fj_soup = i_soup.find('div', class_='wenjianfujian')
     fj_list = fj_soup.find_all('a')
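
Review note: the pattern r'\d+\.' flags numbered attachment entries; the new "if matches:" guard only skips an empty inner loop, and the "k in p_text" test is always true because findall's results are substrings of p_text. The pruning therefore condenses to this equivalent sketch:

    import re
    from bs4 import BeautifulSoup

    html = '<div class="content"><div>正文</div><div>附件:1.方案.pdf</div></div>'
    i_content = BeautifulSoup(html, 'html.parser').find(class_='content')
    for p in i_content.find_all('div')[-10:]:
        if re.findall(r'\d+\.', p.text):   # looks like a numbered attachment line
            p.extract()                    # drop it from the body
    print(i_content.text)                  # -> 正文
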
@@ -815,7 +829,7 @@ def ji_lin():
         pass
     else:
         continue
     att_id, full_path = baseCore.tableUpdate(retData, '吉林国资委', file_name, num)
     id_list.append(att_id)
     # TODO: update the returned address back into the soup
@@ -836,8 +850,8 @@ def ji_lin():
 dic_news = {
     'attachmentIds': id_list,
     'author': '',
-    'content': str(i_content.text),
-    'contentWithTag': content,
+    'content': content,
+    'contentWithTag': contentWithTag,
     'createDate': time_now,
     'deleteFlag': 0,
     'id': '',
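
Review note: sendKafka and save_data are defined elsewhere in this file and are not shown in the diff. Assuming a kafka-python producer serializing dic_news as JSON, a stand-in could look like this (broker address and topic name are placeholders):

    import json
    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',  # assumed broker address
        value_serializer=lambda d: json.dumps(d, ensure_ascii=False).encode('utf-8'),
    )

    def send_kafka_sketch(dic_news):
        producer.send('policy', dic_news)    # 'policy' topic is an assumption
        producer.flush()
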
@@ -1168,15 +1182,17 @@ def fu_jian():
     i_html = href_text.text
     i_soup = BeautifulSoup(i_html, 'html.parser')
     real_href = href
+    real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
     # print(real_href)
-    is_href = db_storage.find_one({'网址': real_href})
-    if is_href:
-        continue
+    # is_href = db_storage.find_one({'网址': real_href})
+    # if is_href:
+    #     continue
     try:
-        # # the article is a remote PDF
-        #download the file straight to the server and parse out the body text
+        # the article is a remote PDF
+        # download the file straight to the server and parse out the body text
         if '.pdf' in real_href:
+            # pass
             resp_content = requests.get(real_href, headers=headers, verify=False, timeout=20).content
             # parse the text content out of the PDF
             content = baseCore.pdf_content(resp_content)
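
Review note: baseCore.pdf_content is not shown here. Assuming it extracts plain text from the downloaded bytes, a stand-in using PyMuPDF could be:

    import fitz  # PyMuPDF

    def pdf_content_sketch(pdf_bytes):
        # open the in-memory PDF and concatenate the text of every page
        with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
            return ''.join(page.get_text() for page in doc)
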
@@ -1195,8 +1211,6 @@ def fu_jian():
 else:
     try:
-        real_href = 'http://gzw.fujian.gov.cn/ztzl/gzjgfzjs/gfxwj_7426/201809/t20180911_4492105.htm'
         href_text = requests.get(url=real_href, headers=headers, verify=False)
         href_text.encoding = href_text.apparent_encoding
         i_html = href_text.text
@@ -1208,6 +1222,7 @@ def fu_jian():
 try:
     fu_jian_list = i_soup.find('ul', class_='clearflx myzj_xl_list').find_all('a')
 except:
+    pass
     fu_jian_list = []
 for fu_jian in fu_jian_list:
     fj_href = fu_jian['href']
@@ -1234,29 +1249,12 @@ def fu_jian():
             pub_hao = ''
     except:
-        print(f'-------其他情况:{real_href}-------')
-        continue
-        # href_text = requests.get(url=real_href, headers=headers, verify=False)
-        # href_text.encoding = href_text.apparent_encoding
-        # i_html = href_text.text
-        # i_soup = BeautifulSoup(i_html, 'html.parser')
-        # i_soup = paserUrl(i_soup, real_href)
-        # # print(i_soup)
-        # source = str(i_soup.find('table', attrs={'class': 'tp-pho'}).text)
-        # pub_hao = source.split('文号')[1].split('发布机构')[0].strip().lstrip()
-        # pub_source = source.split('发布机构')[1].split('生成日期')[0].strip().lstrip()
-        # pub_time = source.split('生成日期')[1].split('标题')[0].strip().lstrip()
-        # content = i_soup.find('div', attrs={'class': 'xl-article-nr'})
-        # fu_jian_result = re.findall('href="(.*?)"', str(content))
-        # fu_jian_href_list = []
-        # if len(fu_jian_result) > 0:
-        #     for fu_jian_re in fu_jian_result:
-        #         if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
-        #                 or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
-        #                 or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
-        #             fu_jian_href = fu_jian_re
-        #             print(fu_jian_href)
-        #             fu_jian_href_list.append(fu_jian_href)
+        pub_source = ''
+        pub_time = ''
+        contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')
+        content = contentwithtag.text.strip()
+        pub_hao = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf').text.strip()
 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 # TODO: fields sent to Kafka
 dic_news = {
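
Review note: BeautifulSoup treats a bare string passed to find() as a tag name, never a class, so multi-class containers such as the rules_con1 node in the hunk above must be matched with class_= (exact class-attribute string) or a CSS selector (order-independent). A runnable sketch:

    from bs4 import BeautifulSoup

    i_soup = BeautifulSoup('<div class="tabs tab_base_01 rules_con1">正文</div>', 'html.parser')
    node = i_soup.find(class_='tabs tab_base_01 rules_con1')    # exact class string
    node = i_soup.select_one('.tabs.tab_base_01.rules_con1')    # order-independent
    print(node.text)                                            # -> 正文
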
@@ -1283,7 +1281,6 @@ def fu_jian():
     sendKafka(dic_news)
     save_data(dic_news)
     print(title)
-    # save_data(result_dict)
     num += 1
 except:
     pass
@@ -1727,7 +1724,8 @@ def hai_nan():
     'summary': '',
     'title': title
 }
+sendKafka(dic_news)
+save_data(dic_news)
 href_text.close()
 # save_data(result_dict)
 print(title)
@@ -1777,7 +1775,7 @@ def hai_nan():
     contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
     content = contentWithTag.text
 except:
-    print(href)
+    # print(href)
     pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
     topicClassification = ''
     origin = str(pub_result.text).split('来源:')[1].split(' 【字体:')[0].lstrip().strip()
@@ -1991,31 +1989,31 @@ def hai_nan():
         pub_hao = ''
     contentWithTag = doc_href.find(class_='pages_content')
     content = contentWithTag.text
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     # TODO: fields sent to Kafka
     dic_news = {
         'attachmentIds': [],
         'author': '',
         'content': content,
         'contentWithTag': str(contentWithTag),
         'createDate': time_now,
         'deleteFlag': 0,
         'id': '',
         'labels': [{'relationId': "1677", 'relationName': "海南省国资委", 'labelMark': "policy"}],
         'origin': '',
         'organ': pub_source,
         'topicClassification': '',
         'issuedNumber': pub_hao,
         'publishDate': pub_time,
         'writtenDate': '',
         'sid': '1697458829758697473',
         'sourceAddress': i_href,
         'summary': '',
         'title': title
     }
     sendKafka(dic_news)
     save_data(dic_news)
     href_text.close()
     # save_data(result_dict)
     print(title)
@@ -2411,9 +2409,8 @@ def gui_zhou():
     'title': title
 }
 # print(dic_news)
-# sendKafka(dic_news)
-# save_data(dic_news)
+sendKafka(dic_news)
+save_data(dic_news)
 print(title)
 # save_data(result_dict)
 num = num + 1
@@ -2700,7 +2697,7 @@ def chong_qing():
     contentWithTag = doc_href.find('div', class_='zwxl-article')
     content = contentWithTag.text
 except:
-    pub_source = ''
+    origin = ''
     topicClassification = ''
     pub_time = ''
     writtenDate = ''
@@ -2742,7 +2739,7 @@ def chong_qing():
     'id': '',
     'labels': [{'relationId': "1693", 'relationName': "重庆市国资委",
                 'labelMark': "policy"}],
-    'origin': '',
+    'origin': origin,
     'organ': '',
     'topicClassification': topicClassification,
     'issuedNumber': pub_hao,
@@ -5392,7 +5389,7 @@ if __name__ == '__main__':
     # ji_lin()
     # shang_hai()
     # zhe_jiang()
-    # fu_jian()
+    fu_jian()
     # shan_dong()
     # guang_dong()
     # hai_nan()
...