提交 be4f16b4 作者: XveLingKun

0717

上级 c887e9d2
...@@ -51,7 +51,7 @@ def get_html(tycid, driver, headers): ...@@ -51,7 +51,7 @@ def get_html(tycid, driver, headers):
' ', '') ' ', '')
return int(total) return int(total)
else: else:
return -1 return 0
except: except:
return 0 return 0
...@@ -226,7 +226,7 @@ def doJob(): ...@@ -226,7 +226,7 @@ def doJob():
elif charge == 0: elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示") log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1' url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff/announcement?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try: try:
total_page1, data_page1 = get_page(url1, s, headers) total_page1, data_page1 = get_page(url1, s, headers)
except: except:
......
import pandas as pd import pandas as pd
...@@ -2,10 +2,10 @@ import pandas as pd ...@@ -2,10 +2,10 @@ import pandas as pd
import pymongo import pymongo
# 7649 # 7649
data_list = [] data_list = []
db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['全球企业资讯'] db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['7-17全球企业资讯删除数据']
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}}) # datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# 导出标签是空的数据 # 导出标签是空的数据
datas = db_stroage.find({"标签": ""}) datas = db_stroage.find()
link = [] link = []
for data in datas: for data in datas:
del data['_id'] del data['_id']
...@@ -14,7 +14,8 @@ for data in datas: ...@@ -14,7 +14,8 @@ for data in datas:
if data['标题'] not in link: if data['标题'] not in link:
data_list.append(data) data_list.append(data)
link.append(data['标题']) link.append(data['标题'])
# print(data) # print(data)
print(len(data_list)) print(len(data_list))
df = pd.DataFrame(data_list) df = pd.DataFrame(data_list)
df.to_excel('./不保留企业资讯.xlsx',index=False) df.to_excel('./7-17全球企业资讯不保留数据.xlsx',index=False)
\ No newline at end of file \ No newline at end of file
import json import json
...@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False) ...@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger() log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'智库-不保留222'] '7-17全球企业资讯删除数据']
lock = threading.Lock() lock = threading.Lock()
...@@ -37,12 +37,13 @@ class EsMethod(object): ...@@ -37,12 +37,13 @@ class EsMethod(object):
"must": [ "must": [
{ {
"match": { "match": {
"subjectId": "1537739653432397825" "subjectId": "1734030182269853697"
} }
}, },
{ {
"match": { "match": {
"deleteFlag": "1" "deleteFlag": "1"
#checkStatus 保留
} }
}, },
{ {
......
import json import json
...@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False) ...@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger() log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'全球企业资讯0710'] '新华丝路-丝路投资']
lock = threading.Lock() lock = threading.Lock()
...@@ -37,17 +37,9 @@ class EsMethod(object): ...@@ -37,17 +37,9 @@ class EsMethod(object):
"must": [ "must": [
{ {
"match": { "match": {
"subjectId": "1734030182269853697" "sid": "1597878873601540098"
}
},
{
"range": {
"createDate": {
"gte": "2024-07-01T00:00:00",
"lte": "2024-07-11T00:00:00"
} }
} }
}
] ]
} }
}, },
...@@ -140,42 +132,44 @@ def main(page, p, esMethod): ...@@ -140,42 +132,44 @@ def main(page, p, esMethod):
pre_summary = preprocess(clean_summary) pre_summary = preprocess(clean_summary)
except: except:
pre_summary = summary pre_summary = summary
try: origin = mms['_source']['origin']
contentRaw = mms['_source']['contentRaw'] publishDate = mms['_source']['publishDate']
except: # try:
contentRaw = '' # contentRaw = mms['_source']['contentRaw']
# except:
try: # contentRaw = ''
clean_contentRaw = clean_html_tag(contentRaw) #
pre_contentRaw = preprocess(clean_contentRaw) # try:
except: # clean_contentRaw = clean_html_tag(contentRaw)
pre_contentRaw = contentRaw # pre_contentRaw = preprocess(clean_contentRaw)
try: # except:
titleRaw = mms['_source']['titleRaw'] # pre_contentRaw = contentRaw
except: # try:
titleRaw = '' # titleRaw = mms['_source']['titleRaw']
try: # except:
summaryRaw = mms['_source']['summaryRaw'] # titleRaw = ''
except: # try:
summaryRaw = '' # summaryRaw = mms['_source']['summaryRaw']
try: # except:
clean_summaryRaw = clean_html_tag(summaryRaw) # summaryRaw = ''
pre_summaryRaw = preprocess(clean_summaryRaw) # try:
except: # clean_summaryRaw = clean_html_tag(summaryRaw)
pre_summaryRaw = summaryRaw # pre_summaryRaw = preprocess(clean_summaryRaw)
# except:
# pre_summaryRaw = summaryRaw
log.info(f'{id}--{title}---') log.info(f'{id}--{title}---')
labels = mms['_source']['labels'] # labels = mms['_source']['labels']
tags = [] # tags = []
for label in labels: # for label in labels:
label_name = label['labelMark'] # label_name = label['labelMark']
if label_name == "dynamic_tags": # if label_name == "dynamic_tags":
relationName = label['relationName'] # relationName = label['relationName']
tags.append(relationName) # tags.append(relationName)
else: # else:
continue # continue
info_tags = ','.join(tags) # info_tags = ','.join(tags)
# 存入数据库 # 存入数据库
dic = { dic = {
...@@ -184,10 +178,8 @@ def main(page, p, esMethod): ...@@ -184,10 +178,8 @@ def main(page, p, esMethod):
"摘要": pre_summary, "摘要": pre_summary,
"内容": pre_content, "内容": pre_content,
"带标签内容": contentWithTag, "带标签内容": contentWithTag,
"标题译文": titleRaw, "来源": origin,
"摘要译文": pre_summaryRaw, "发布时间": publishDate,
"内容译文": pre_contentRaw,
"标签": info_tags,
} }
db_storage.insert_one(dic) db_storage.insert_one(dic)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论