提交 07bd3604 作者: 薛凌堃

浙江省人民政府

上级 3d6d75b6
import os
import os
......@@ -6,7 +6,7 @@ import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
......@@ -96,13 +96,19 @@ class Policy():
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '', file_name)
try:
retData = baseCore.uptoOBS(file_href, '', file_name)
except:
return '', ''
if retData['state']:
pass
else:
return '', ''
att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
return att_id, full_path
try:
att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
return att_id, full_path
except:
return '', ''
else:
return '', ''
......@@ -422,6 +428,8 @@ def getContent(url, publishDate, num):
contentWithTag = soup.find('body > div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(3)')
if not contentWithTag:
contentWithTag = soup.find('div',class_='detail-pic')
if not contentWithTag:
contentWithTag = soup.find('div',class_='mian')
try:
contentWithTag.find('video').decompose()
contentWithTag = None
......@@ -439,9 +447,18 @@ def getContent(url, publishDate, num):
style.decompose()
except:
pass
if contentWithTag:
pass
else:
log.info(f"内容未解析出来===={url}")
return '','',[]
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
if href:
pass
else:
continue
fj_title = a.text.strip().lstrip()
category = os.path.splitext(href)[1]
if category not in fj_title:
......@@ -464,6 +481,10 @@ def getDatas(page):
'\r\n', ' ')
href = soup.find('div', class_='titleWrapper').find('a').get('href')
href = href.split('url=')[1].split('.html')[0].replace('%3A', ':').replace('%2F', '/') + '.html'
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, href)
if is_member:
continue
try:
info = soup.find('table', class_='fgwj_table_list').text
organ = info.split('发布机构:')[1].split('成文日期:')[0].lstrip().strip()
......@@ -475,6 +496,10 @@ def getDatas(page):
' ', '').replace(' ', '').replace('\r\n', '')
publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
contentWithTag, content, id_list = getContent(href, publishDate, num)
if contentWithTag:
pass
else:
continue
num += 1
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
......@@ -494,7 +519,7 @@ def getDatas(page):
'issuedNumber': '',
'summary': '',
'createDate': time_now,
'sid': '1729041791539326977',
'sid': '1730472253306552321',
}
try:
baseCore.sendkafka(dic_info, topic)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论