Commit eeb41ef7 Author: 薛凌堃

Policies and regulations

Parent d5722767
@@ -725,7 +725,7 @@ def ji_lin():
if is_href:
continue
try:
# real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj/202211/t20221123_2310750.html'
# real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
href_text = requests.get(url=real_href, headers=headers, verify=False)
i_html = href_text.text.encode("ISO-8859-1")
i_html = i_html.decode("utf-8")
@@ -733,7 +733,8 @@ def ji_lin():
# print(i_soup)
#Convert relative paths to absolute paths
soup = paserUrl(i_soup, real_href)
text = str(soup.prettify())
soup.prettify()
try:
i_come = i_soup.find('span', class_='source')
i_time = i_soup.find('span', class_='time')
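`paserUrl` is not part of this diff; per the comment above it, it rewrites the relative links in the fetched page against `real_href`. A minimal sketch of that idea, assuming it simply wraps `urllib.parse.urljoin`:

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paserUrl(soup: BeautifulSoup, base_url: str) -> BeautifulSoup:
    # Rewrite relative href/src attributes against the page URL so that
    # attachment and image links stay valid once the HTML is stored.
    for tag in soup.find_all(True):
        for attr in ('href', 'src'):
            if tag.has_attr(attr):
                tag[attr] = urljoin(base_url, tag[attr])
    return soup
```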
@@ -756,9 +757,18 @@ def ji_lin():
pub_time = pub.find(class_='left').find('span', class_='time').text
pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
# print(pub_come)
i_content = i_soup.find(class_='zsy_comain')
i_content = soup.find(class_='zsy_comain')
if i_content:
content = str(i_content)
print(real_href)
#Remove the "scan QR code" share widget
soup.find('div',id='qr_container').decompose()
soup.find('div',id='div_div').decompose()
#Remove style
# Remove the style tags
for styleTag in soup.find_all('style'):
styleTag.extract()
contentWithTag = soup.find(class_='zsy_comain')
content = contentWithTag.text.strip()
#Issuing document number
find_hao = i_content.find_all('p')[:3]
pub_hao = ''
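Note that the new cleanup calls `.decompose()` on the result of `find()` directly, which raises `AttributeError` on pages that lack the QR widgets. A guarded sketch of the same cleanup (the function name is illustrative, not from the repo):

```python
from bs4 import BeautifulSoup

def clean_zsy_content(soup: BeautifulSoup) -> str:
    # find() returns None when a widget is absent; guard before decompose().
    for div_id in ('qr_container', 'div_div'):
        widget = soup.find('div', id=div_id)
        if widget is not None:
            widget.decompose()
    # Strip <style> blocks so their CSS does not leak into .text.
    for style_tag in soup.find_all('style'):
        style_tag.extract()
    content_tag = soup.find(class_='zsy_comain')
    return content_tag.text.strip() if content_tag is not None else ''
```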
@@ -767,7 +777,7 @@ def ji_lin():
pub_hao = j.text
else:
continue
fj = i_soup.find('div', style='width:920px; margin: 0 auto;')
fj = soup.find('div', style='width:920px; margin: 0 auto;')
if fj:
li_list = fj.find_all('li')
for li in li_list:
@@ -790,16 +800,20 @@ def ji_lin():
else:
continue
else:
i_content= i_soup.find(class_="content")
i_content= soup.find(class_="content")
#Remove the attachment fields from the article body
pattern = r'\d+\.'
# pattern = r"附件:\d+\.\s*(.*)"
for p in i_content.find_all('div')[-10:]:
p_text = p.text
matches = re.findall(pattern, p_text)
if matches:
for k in matches:
if k in p_text:
p.extract()
content = str(i_content)
contentWithTag = i_content
content = contentWithTag.text.strip()
#Find attachments and upload them to the file server
fj_soup = i_soup.find('div',class_='wenjianfujian')
fj_list = fj_soup.find_all('a')
@@ -815,7 +829,7 @@ def ji_lin():
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '吉林国资委', file_name, num)
id_list.append(att_id)
# todo: update the returned address back into the soup
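A sketch of the `todo` above, pointing the in-page link at the uploaded copy. `replace_attachment_href` is a hypothetical helper; it assumes `full_path`, as returned by `baseCore.tableUpdate`, is the file-server URL:

```python
from bs4 import BeautifulSoup

def replace_attachment_href(soup: BeautifulSoup, original_href: str, full_path: str) -> None:
    # Swap the source site's attachment link for the uploaded copy so the
    # stored contentWithTag references the file server.
    for a in soup.find_all('a', href=original_href):
        a['href'] = full_path
```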
@@ -836,8 +850,8 @@ def ji_lin():
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(i_content.text),
'contentWithTag': content,
'content': content,
'contentWithTag': contentWithTag,
'createDate': time_now,
'deleteFlag': 0,
'id': '',
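`sendKafka` is not shown in the diff. A minimal sketch of what it presumably does, assuming kafka-python; the broker address and topic name are placeholders, not values from the repo:

```python
import json
from kafka import KafkaProducer  # pip install kafka-python

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',  # placeholder broker address
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
)

def sendKafka(dic_news: dict) -> None:
    producer.send('policy', dic_news)  # 'policy' is a placeholder topic
    producer.flush()
```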
@@ -1168,15 +1182,17 @@ def fu_jian():
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
# print(real_href)
is_href = db_storage.find_one({'网址': real_href})
if is_href:
continue
# is_href = db_storage.find_one({'网址': real_href})
# if is_href:
# continue
try:
# # The article is a remote PDF
#Download the file straight to the server and parse out the body text
# The article is a remote PDF
# Download the file straight to the server and parse out the body text
if '.pdf' in real_href:
# pass
resp_content = requests.get(real_href, headers=headers, verify=False, timeout=20).content
#Parse out the pdf content
content = baseCore.pdf_content(resp_content)
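`baseCore.pdf_content` is likewise not shown. A sketch of the same idea using PyMuPDF, which may well not be the library the repo actually uses:

```python
import fitz  # PyMuPDF: pip install pymupdf

def pdf_content(resp_content: bytes) -> str:
    # Open the downloaded bytes in memory and join the text of every page.
    with fitz.open(stream=resp_content, filetype='pdf') as doc:
        return '\n'.join(page.get_text() for page in doc)
```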
@@ -1195,8 +1211,6 @@ def fu_jian():
else:
try:
real_href = 'http://gzw.fujian.gov.cn/ztzl/gzjgfzjs/gfxwj_7426/201809/t20180911_4492105.htm'
href_text = requests.get(url=real_href, headers=headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
@@ -1208,6 +1222,7 @@ def fu_jian():
try:
fu_jian_list = i_soup.find('ul',class_='clearflx myzj_xl_list').find_all('a')
except:
pass
fu_jian_list = []
for fu_jian in fu_jian_list:
fj_href = fu_jian['href']
@@ -1234,29 +1249,12 @@ def fu_jian():
pub_hao = ''
except:
print(f'-------other case: {real_href}-------')
continue
# href_text = requests.get(url=real_href, headers=headers, verify=False)
# href_text.encoding = href_text.apparent_encoding
# i_html = href_text.text
# i_soup = BeautifulSoup(i_html, 'html.parser')
# i_soup = paserUrl(i_soup, real_href)
# # print(i_soup)
# source = str(i_soup.find('table', attrs={'class': 'tp-pho'}).text)
# pub_hao = source.split('文号')[1].split('发布机构')[0].strip().lstrip()
# pub_source = source.split('发布机构')[1].split('生成日期')[0].strip().lstrip()
# pub_time = source.split('生成日期')[1].split('标题')[0].strip().lstrip()
# content = i_soup.find('div', attrs={'class': 'xl-article-nr'})
# fu_jian_result = re.findall('href="(.*?)"', str(content))
# fu_jian_href_list = []
# if len(fu_jian_result) > 0:
# for fu_jian_re in fu_jian_result:
# if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
# or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
# or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
# fu_jian_href = fu_jian_re
# print(fu_jian_href)
# fu_jian_href_list.append(fu_jian_href)
pub_source = ''
pub_time = ''
contentwithtag = i_soup.find('div', class_='tabs tab_base_01 rules_con1')
content = contentwithtag.text.strip()
pub_hao = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf').text.strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo: fields sent to kafka
dic_news = {
@@ -1283,7 +1281,6 @@ def fu_jian():
sendKafka(dic_news)
save_data(dic_news)
print(title)
# save_data(result_dict)
num += 1
except:
pass
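On the `find` fix in the `fu_jian()` hunk above: passing a space-separated string to `class_` only matches a class attribute that is exactly that string, in that order, so a CSS selector is the safer way to require all three classes. A self-contained illustration:

```python
from bs4 import BeautifulSoup

html = '<div class="tab_base_01 tabs rules_con1">...</div>'  # same classes, different order
soup = BeautifulSoup(html, 'html.parser')

assert soup.find(class_='tabs tab_base_01 rules_con1') is None          # exact-string match fails
assert soup.select_one('div.tabs.tab_base_01.rules_con1') is not None   # CSS selector matches
```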
@@ -1727,7 +1724,8 @@ def hai_nan():
'summary': '',
'title': title
}
sendKafka(dic_news)
save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
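The added `href_text.close()` releases the pooled connection. `requests.Response` is also a context manager, which makes the release automatic; a sketch with placeholder URL and headers:

```python
import requests

url = 'https://example.org/page.html'     # placeholder
headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder

with requests.get(url, headers=headers, verify=False, timeout=20) as resp:
    resp.encoding = resp.apparent_encoding
    html = resp.text
# The connection is returned to the pool when the with-block exits.
```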
@@ -1777,7 +1775,7 @@ def hai_nan():
contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
content = contentWithTag.text
except:
print(href)
# print(href)
pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
topicClassification = ''
origin = str(pub_result.text).split('来源:')[1].split(' 【字体:')[0].strip()
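The chained `split('来源:')` raises `IndexError` whenever the label is missing from the page, which is exactly the case the surrounding try/except papers over. A hedged regex alternative (`parse_origin` is an illustrative name):

```python
import re

def parse_origin(text: str) -> str:
    # The page renders '来源:<name> 【字体:' as one run of text.
    m = re.search(r'来源:\s*(.*?)\s*【字体:', text)
    return m.group(1) if m else ''
```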
@@ -2411,9 +2409,8 @@ def gui_zhou():
'title': title
}
# print(dic_news)
# sendKafka(dic_news)
# save_data(dic_news)
sendKafka(dic_news)
save_data(dic_news)
print(title)
# save_data(result_dict)
num = num + 1
@@ -2700,7 +2697,7 @@ def chong_qing():
contentWithTag = doc_href.find('div',class_='zwxl-article')
content = contentWithTag.text
except:
pub_source = ''
origin = ''
topicClassification = ''
pub_time = ''
writtenDate = ''
@@ -2742,7 +2739,7 @@ def chong_qing():
'id': '',
'labels': [{'relationId': "1693", 'relationName': "重庆市国资委",
'labelMark': "policy"}],
'origin': '',
'origin': origin,
'organ': '',
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
@@ -5392,7 +5389,7 @@ if __name__ == '__main__':
# ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
fu_jian()
# shan_dong()
# guang_dong()
# hai_nan()