Commit 07bd3604  Author: 薛凌堃

浙江省人民政府 (Zhejiang Provincial People's Government)

Parent 3d6d75b6
 import os
@@ -6,7 +6,7 @@ import requests
 from bs4 import BeautifulSoup
 from retry import retry
-from base import BaseCore
+import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
@@ -96,13 +96,19 @@ class Policy():
             category = os.path.splitext(file_href)[1]
             if category not in file_name:
                 file_name = file_name + category
-            retData = baseCore.uptoOBS(file_href, '', file_name)
+            try:
+                retData = baseCore.uptoOBS(file_href, '', file_name)
+            except:
+                return '', ''
             if retData['state']:
                 pass
             else:
                 return '', ''
-            att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
-            return att_id, full_path
+            try:
+                att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
+                return att_id, full_path
+            except:
+                return '', ''
         else:
             return '', ''
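
The two try/except blocks added above make attachment handling non-fatal: a failure in the OBS upload or in the attachment-table insert now returns empty ids instead of raising out of the method. A minimal standalone sketch of the same pattern follows; the helper name safe_attachment, the logging, and the use of Exception instead of the commit's bare except are illustrative choices, and the module-level baseCore and log objects are assumed from the surrounding script.

def safe_attachment(file_href, file_name, num, publishDate):
    # Upload the attachment to OBS; if the upload itself fails,
    # return empty ids so the caller can still save the article.
    try:
        retData = baseCore.uptoOBS(file_href, '', file_name)
    except Exception as e:
        log.info(f'upload failed for {file_href}: {e}')
        return '', ''
    if not retData['state']:
        return '', ''
    # Register the uploaded file in the attachment table; a failure here
    # likewise drops only this attachment, not the whole record.
    try:
        att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
        return att_id, full_path
    except Exception as e:
        log.info(f'attachment registration failed for {file_href}: {e}')
        return '', ''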
@@ -422,6 +428,8 @@ def getContent(url, publishDate, num):
     contentWithTag = soup.find('body > div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(3)')
     if not contentWithTag:
         contentWithTag = soup.find('div',class_='detail-pic')
+    if not contentWithTag:
+        contentWithTag = soup.find('div',class_='mian')
     try:
         contentWithTag.find('video').decompose()
         contentWithTag = None
@@ -439,9 +447,18 @@ def getContent(url, publishDate, num):
             style.decompose()
     except:
         pass
+    if contentWithTag:
+        pass
+    else:
+        log.info(f"内容未解析出来===={url}")
+        return '','',[]
     a_list = contentWithTag.find_all('a')
     for a in a_list:
         href = a.get('href')
+        if href:
+            pass
+        else:
+            continue
         fj_title = a.text.strip().lstrip()
         category = os.path.splitext(href)[1]
         if category not in fj_title:
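
Both getContent hunks follow the same defensive idea: add div.mian as a third candidate container, return empty results when no container is parsed, and skip anchors without an href. One detail worth noting is that BeautifulSoup's find() treats a string argument as a tag name, so the CSS-selector string in the first lookup never matches and the class-based fallbacks do the actual work; select_one() is the call that accepts CSS selectors. A compact sketch of the fallback chain under that reading, assuming the module-level log object:

def locate_content(soup, url):
    # Positional CSS selector first (via select_one), then the known container classes.
    contentWithTag = (soup.select_one('body > div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(3)')
                      or soup.find('div', class_='detail-pic')
                      or soup.find('div', class_='mian'))
    if not contentWithTag:
        # Same early exit as the commit: log the page and return nothing.
        log.info(f"内容未解析出来===={url}")
        return None
    return contentWithTag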
@@ -464,6 +481,10 @@ def getDatas(page):
             '\r\n', ' ')
         href = soup.find('div', class_='titleWrapper').find('a').get('href')
         href = href.split('url=')[1].split('.html')[0].replace('%3A', ':').replace('%2F', '/') + '.html'
+        # 根据链接判重
+        is_member = baseCore.r.sismember('REITs::' + webname, href)
+        if is_member:
+            continue
         try:
             info = soup.find('table', class_='fgwj_table_list').text
             organ = info.split('发布机构:')[1].split('成文日期:')[0].lstrip().strip()
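
The sismember check added above is a per-site dedup on the article URL: links already recorded in the Redis set 'REITs::' + webname are skipped before the page is fetched again. A minimal sketch of the intended round trip, assuming baseCore.r is a redis-py client, that webname holds the site name used in the key, and that a link is only marked as seen after the record is sent to Kafka successfully (the marking side is not visible in this hunk):

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # stand-in for baseCore.r
webname = '浙江省人民政府'                            # assumed value of the site name

def already_seen(href):
    # Membership test against the per-site dedup set, e.g. 'REITs::浙江省人民政府'.
    return r.sismember('REITs::' + webname, href)

def mark_seen(href):
    # Assumed to run only after baseCore.sendkafka() succeeds,
    # so pages that fail mid-way are retried on the next crawl.
    r.sadd('REITs::' + webname, href)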
@@ -475,6 +496,10 @@ def getDatas(page):
                 ' ', '').replace(' ', '').replace('\r\n', '')
             publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
             contentWithTag, content, id_list = getContent(href, publishDate, num)
+            if contentWithTag:
+                pass
+            else:
+                continue
             num += 1
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_info = {
@@ -494,7 +519,7 @@ def getDatas(page):
                 'issuedNumber': '',
                 'summary': '',
                 'createDate': time_now,
-                'sid': '1729041791539326977',
+                'sid': '1730472253306552321',
             }
             try:
                 baseCore.sendkafka(dic_info, topic)
...