提交 88209302 作者: 薛凌堃

REITs政策法规

上级 049e8a5e
......@@ -13,8 +13,8 @@ from reits import Policy
policy = Policy()
topic = 'policy'
webname = '重庆市人民政府'
topic = 'research_center_fourth'
webname = '重庆市人民政府_'
headers = {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -124,7 +124,10 @@ def getContent(url):
contentWithTag = soup.find('div', class_='view')
if not contentWithTag:
contentWithTag = soup.find('div',class_='document')
contentWithTag.find('div',class_='item').decompose()
try:
contentWithTag.find('div',class_='item').decompose()
except:
pass
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
......@@ -168,14 +171,17 @@ def getData(data_, num):
content, contentWithTag = getContent(href)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': [],
'subjectId': '1729315113088765953',
'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'checkStatus': 1,
'id': '',
'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -19,7 +19,7 @@ from reits import Policy
policy = Policy()
topic = 'policy'
topic = 'research_center_fourth'
webname = '广东省人民政府'
headers = {
'Content-Type': 'application/json',
......@@ -144,14 +144,17 @@ def getData(data_, num,sid):
content, contentWithTag, id_list = getContent(href, publishDate, num)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'checkStatus': 1,
'id': '',
'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -15,7 +15,7 @@ from reits import Policy
policy = Policy()
topic = 'policy'
topic = 'research_center_fourth'
webname = '江西省人民政府'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -130,14 +130,17 @@ def doJob():
num += 1
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'checkStatus': 1,
'id': '',
'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -150,6 +153,8 @@ def doJob():
'createDate': time_now,
'sid': '1729043445107838978'
}
# print(dic_info['id'])
# print(publishDate)
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论