提交 88209302 作者: 薛凌堃

REITs政策法规

上级 049e8a5e
...@@ -13,8 +13,8 @@ from reits import Policy ...@@ -13,8 +13,8 @@ from reits import Policy
policy = Policy() policy = Policy()
topic = 'policy' topic = 'research_center_fourth'
webname = '重庆市人民政府' webname = '重庆市人民政府_'
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
...@@ -124,7 +124,10 @@ def getContent(url): ...@@ -124,7 +124,10 @@ def getContent(url):
contentWithTag = soup.find('div', class_='view') contentWithTag = soup.find('div', class_='view')
if not contentWithTag: if not contentWithTag:
contentWithTag = soup.find('div',class_='document') contentWithTag = soup.find('div',class_='document')
try:
contentWithTag.find('div',class_='item').decompose() contentWithTag.find('div',class_='item').decompose()
except:
pass
try: try:
scripts = contentWithTag.find_all('script') scripts = contentWithTag.find_all('script')
for script in scripts: for script in scripts:
...@@ -168,14 +171,17 @@ def getData(data_, num): ...@@ -168,14 +171,17 @@ def getData(data_, num):
content, contentWithTag = getContent(href) content, contentWithTag = getContent(href)
contentWithTag_str = str(contentWithTag) contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
lang = baseCore.detect_language(content)
dic_info = { dic_info = {
'attachmentIds': [], 'attachmentIds': [],
'subjectId': '1729315113088765953',
'lang': lang,
'author': '', 'author': '',
'content': content, 'content': content,
'contentWithTag': contentWithTag_str, 'contentWithTag': contentWithTag_str,
'deleteFlag': 0, 'deleteFlag': 0,
'checkStatus': 1, 'checkStatus': 1,
'id': '', 'id': '1729315113088765953'+str(int(time.time())),
'title': title, 'title': title,
'publishDate': publishDate, 'publishDate': publishDate,
'origin': origin, 'origin': origin,
......
...@@ -19,7 +19,7 @@ from reits import Policy ...@@ -19,7 +19,7 @@ from reits import Policy
policy = Policy() policy = Policy()
topic = 'policy' topic = 'research_center_fourth'
webname = '广东省人民政府' webname = '广东省人民政府'
headers = { headers = {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
...@@ -144,14 +144,17 @@ def getData(data_, num,sid): ...@@ -144,14 +144,17 @@ def getData(data_, num,sid):
content, contentWithTag, id_list = getContent(href, publishDate, num) content, contentWithTag, id_list = getContent(href, publishDate, num)
contentWithTag_str = str(contentWithTag) contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
lang = baseCore.detect_language(content)
dic_info = { dic_info = {
'attachmentIds': id_list, 'attachmentIds': id_list,
'subjectId': '1729315113088765953',
'lang': lang,
'author': '', 'author': '',
'content': content, 'content': content,
'contentWithTag': contentWithTag_str, 'contentWithTag': contentWithTag_str,
'deleteFlag': 0, 'deleteFlag': 0,
'checkStatus': 1, 'checkStatus': 1,
'id': '', 'id': '1729315113088765953'+str(int(time.time())),
'title': title, 'title': title,
'publishDate': publishDate, 'publishDate': publishDate,
'origin': origin, 'origin': origin,
......
...@@ -15,7 +15,7 @@ from reits import Policy ...@@ -15,7 +15,7 @@ from reits import Policy
policy = Policy() policy = Policy()
topic = 'policy' topic = 'research_center_fourth'
webname = '江西省人民政府' webname = '江西省人民政府'
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
...@@ -130,14 +130,17 @@ def doJob(): ...@@ -130,14 +130,17 @@ def doJob():
num += 1 num += 1
contentWithTag_str = str(contentWithTag) contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
lang = baseCore.detect_language(content)
dic_info = { dic_info = {
'attachmentIds': id_list, 'attachmentIds': id_list,
'subjectId': '1729315113088765953',
'lang': lang,
'author': '', 'author': '',
'content': content, 'content': content,
'contentWithTag': contentWithTag_str, 'contentWithTag': contentWithTag_str,
'deleteFlag': 0, 'deleteFlag': 0,
'checkStatus': 1, 'checkStatus': 1,
'id': '', 'id': '1729315113088765953'+str(int(time.time())),
'title': title, 'title': title,
'publishDate': publishDate, 'publishDate': publishDate,
'origin': origin, 'origin': origin,
...@@ -150,6 +153,8 @@ def doJob(): ...@@ -150,6 +153,8 @@ def doJob():
'createDate': time_now, 'createDate': time_now,
'sid': '1729043445107838978' 'sid': '1729043445107838978'
} }
# print(dic_info['id'])
# print(publishDate)
try: try:
baseCore.sendkafka(dic_info, topic) baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href) baseCore.r.sadd('REITs::' + webname, href)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论