提交 be4f16b4 作者: XveLingKun

0717

上级 c887e9d2
......@@ -51,7 +51,7 @@ def get_html(tycid, driver, headers):
' ', '')
return int(total)
else:
return -1
return 0
except:
return 0
......@@ -226,7 +226,7 @@ def doJob():
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff/announcement?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1, data_page1 = get_page(url1, s, headers)
except:
......
import pandas as pd
import pandas as pd
......@@ -2,10 +2,10 @@ import pandas as pd
import pymongo
# 7649
data_list = []
db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['全球企业资讯']
db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['7-17全球企业资讯删除数据']
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# 导出标签是空的数据
datas = db_stroage.find({"标签": ""})
datas = db_stroage.find()
link = []
for data in datas:
del data['_id']
......@@ -14,7 +14,8 @@ for data in datas:
if data['标题'] not in link:
data_list.append(data)
link.append(data['标题'])
# print(data)
print(len(data_list))
df = pd.DataFrame(data_list)
df.to_excel('./不保留企业资讯.xlsx',index=False)
\ No newline at end of file
df.to_excel('./7-17全球企业资讯不保留数据.xlsx',index=False)
\ No newline at end of file
import json
import json
......@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'智库-不保留222']
'7-17全球企业资讯删除数据']
lock = threading.Lock()
......@@ -37,12 +37,13 @@ class EsMethod(object):
"must": [
{
"match": {
"subjectId": "1537739653432397825"
"subjectId": "1734030182269853697"
}
},
{
"match": {
"deleteFlag": "1"
#checkStatus 保留
}
},
{
......
import json
import json
......@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'全球企业资讯0710']
'新华丝路-丝路投资']
lock = threading.Lock()
......@@ -37,15 +37,7 @@ class EsMethod(object):
"must": [
{
"match": {
"subjectId": "1734030182269853697"
}
},
{
"range": {
"createDate": {
"gte": "2024-07-01T00:00:00",
"lte": "2024-07-11T00:00:00"
}
"sid": "1597878873601540098"
}
}
]
......@@ -140,42 +132,44 @@ def main(page, p, esMethod):
pre_summary = preprocess(clean_summary)
except:
pre_summary = summary
try:
contentRaw = mms['_source']['contentRaw']
except:
contentRaw = ''
try:
clean_contentRaw = clean_html_tag(contentRaw)
pre_contentRaw = preprocess(clean_contentRaw)
except:
pre_contentRaw = contentRaw
try:
titleRaw = mms['_source']['titleRaw']
except:
titleRaw = ''
try:
summaryRaw = mms['_source']['summaryRaw']
except:
summaryRaw = ''
try:
clean_summaryRaw = clean_html_tag(summaryRaw)
pre_summaryRaw = preprocess(clean_summaryRaw)
except:
pre_summaryRaw = summaryRaw
origin = mms['_source']['origin']
publishDate = mms['_source']['publishDate']
# try:
# contentRaw = mms['_source']['contentRaw']
# except:
# contentRaw = ''
#
# try:
# clean_contentRaw = clean_html_tag(contentRaw)
# pre_contentRaw = preprocess(clean_contentRaw)
# except:
# pre_contentRaw = contentRaw
# try:
# titleRaw = mms['_source']['titleRaw']
# except:
# titleRaw = ''
# try:
# summaryRaw = mms['_source']['summaryRaw']
# except:
# summaryRaw = ''
# try:
# clean_summaryRaw = clean_html_tag(summaryRaw)
# pre_summaryRaw = preprocess(clean_summaryRaw)
# except:
# pre_summaryRaw = summaryRaw
log.info(f'{id}--{title}---')
labels = mms['_source']['labels']
tags = []
for label in labels:
label_name = label['labelMark']
if label_name == "dynamic_tags":
relationName = label['relationName']
tags.append(relationName)
else:
continue
info_tags = ','.join(tags)
# labels = mms['_source']['labels']
# tags = []
# for label in labels:
# label_name = label['labelMark']
# if label_name == "dynamic_tags":
# relationName = label['relationName']
# tags.append(relationName)
# else:
# continue
# info_tags = ','.join(tags)
# 存入数据库
dic = {
......@@ -184,10 +178,8 @@ def main(page, p, esMethod):
"摘要": pre_summary,
"内容": pre_content,
"带标签内容": contentWithTag,
"标题译文": titleRaw,
"摘要译文": pre_summaryRaw,
"内容译文": pre_contentRaw,
"标签": info_tags,
"来源": origin,
"发布时间": publishDate,
}
db_storage.insert_one(dic)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论