Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
be4f16b4
提交
be4f16b4
authored
7月 30, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
0717
上级
c887e9d2
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
52 行增加
和
58 行删除
+52
-58
CorePerson_Update.py
comData/Tyc/CorePerson_Update.py
+2
-2
mongo导数据.py
es拉取全球企业资讯/mongo导数据.py
+6
-5
拉取不保留数据.py
es拉取全球企业资讯/拉取不保留数据.py
+4
-3
拉取保留数据.py
es拉取全球企业资讯/拉取保留数据.py
+40
-48
没有找到文件。
comData/Tyc/CorePerson_Update.py
浏览文件 @
be4f16b4
...
@@ -51,7 +51,7 @@ def get_html(tycid, driver, headers):
...
@@ -51,7 +51,7 @@ def get_html(tycid, driver, headers):
' '
,
''
)
' '
,
''
)
return
int
(
total
)
return
int
(
total
)
else
:
else
:
return
-
1
return
0
except
:
except
:
return
0
return
0
...
@@ -226,7 +226,7 @@ def doJob():
...
@@ -226,7 +226,7 @@ def doJob():
elif
charge
==
0
:
elif
charge
==
0
:
log
.
info
(
f
"{id}---{xydm}----{tycid}----没有最新公示"
)
log
.
info
(
f
"{id}---{xydm}----{tycid}----没有最新公示"
)
url1
=
f
'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url1
=
f
'https://capi.tianyancha.com/cloud-company-background/company/dim/staff/announcement?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try
:
try
:
total_page1
,
data_page1
=
get_page
(
url1
,
s
,
headers
)
total_page1
,
data_page1
=
get_page
(
url1
,
s
,
headers
)
except
:
except
:
...
...
es拉取全球企业资讯/mongo导数据.py
浏览文件 @
be4f16b4
import
pandas
as
pd
import
pandas
as
pd
...
@@ -2,10 +2,10 @@ import pandas as pd
...
@@ -2,10 +2,10 @@ import pandas as pd
import
pymongo
import
pymongo
# 7649
# 7649
data_list
=
[]
data_list
=
[]
db_stroage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'
全球企业资讯
'
]
db_stroage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'
7-17全球企业资讯删除数据
'
]
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# 导出标签是空的数据
# 导出标签是空的数据
datas
=
db_stroage
.
find
(
{
"标签"
:
""
}
)
datas
=
db_stroage
.
find
()
link
=
[]
link
=
[]
for
data
in
datas
:
for
data
in
datas
:
del
data
[
'_id'
]
del
data
[
'_id'
]
...
@@ -14,7 +14,8 @@ for data in datas:
...
@@ -14,7 +14,8 @@ for data in datas:
if
data
[
'标题'
]
not
in
link
:
if
data
[
'标题'
]
not
in
link
:
data_list
.
append
(
data
)
data_list
.
append
(
data
)
link
.
append
(
data
[
'标题'
])
link
.
append
(
data
[
'标题'
])
# print(data)
# print(data)
print
(
len
(
data_list
))
print
(
len
(
data_list
))
df
=
pd
.
DataFrame
(
data_list
)
df
=
pd
.
DataFrame
(
data_list
)
df
.
to_excel
(
'./不保留企业资讯.xlsx'
,
index
=
False
)
df
.
to_excel
(
'./7-17全球企业资讯不保留数据.xlsx'
,
index
=
False
)
\ No newline at end of file
\ No newline at end of file
es拉取全球企业资讯/拉取不保留数据.py
浏览文件 @
be4f16b4
import
json
import
json
...
@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
...
@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
log
=
baseCore
.
getLogger
()
log
=
baseCore
.
getLogger
()
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017/'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017/'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'
智库-不保留222
'
]
'
7-17全球企业资讯删除数据
'
]
lock
=
threading
.
Lock
()
lock
=
threading
.
Lock
()
...
@@ -37,12 +37,13 @@ class EsMethod(object):
...
@@ -37,12 +37,13 @@ class EsMethod(object):
"must"
:
[
"must"
:
[
{
{
"match"
:
{
"match"
:
{
"subjectId"
:
"1537739653432397825"
"subjectId"
:
"1734030182269853697"
}
}
},
},
{
{
"match"
:
{
"match"
:
{
"deleteFlag"
:
"1"
"deleteFlag"
:
"1"
#checkStatus 保留
}
}
},
},
{
{
...
...
es拉取全球企业资讯/拉取保留数据.py
浏览文件 @
be4f16b4
import
json
import
json
...
@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
...
@@ -18,7 +18,7 @@ baseCore = BaseCore.BaseCore(sqlFlg=False)
log
=
baseCore
.
getLogger
()
log
=
baseCore
.
getLogger
()
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017/'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017/'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'
全球企业资讯0710
'
]
'
新华丝路-丝路投资
'
]
lock
=
threading
.
Lock
()
lock
=
threading
.
Lock
()
...
@@ -37,15 +37,7 @@ class EsMethod(object):
...
@@ -37,15 +37,7 @@ class EsMethod(object):
"must"
:
[
"must"
:
[
{
{
"match"
:
{
"match"
:
{
"subjectId"
:
"1734030182269853697"
"sid"
:
"1597878873601540098"
}
},
{
"range"
:
{
"createDate"
:
{
"gte"
:
"2024-07-01T00:00:00"
,
"lte"
:
"2024-07-11T00:00:00"
}
}
}
}
}
]
]
...
@@ -140,42 +132,44 @@ def main(page, p, esMethod):
...
@@ -140,42 +132,44 @@ def main(page, p, esMethod):
pre_summary
=
preprocess
(
clean_summary
)
pre_summary
=
preprocess
(
clean_summary
)
except
:
except
:
pre_summary
=
summary
pre_summary
=
summary
try
:
origin
=
mms
[
'_source'
][
'origin'
]
contentRaw
=
mms
[
'_source'
][
'contentRaw'
]
publishDate
=
mms
[
'_source'
][
'publishDate'
]
except
:
# try:
contentRaw
=
''
# contentRaw = mms['_source']['contentRaw']
# except:
try
:
# contentRaw = ''
clean_contentRaw
=
clean_html_tag
(
contentRaw
)
#
pre_contentRaw
=
preprocess
(
clean_contentRaw
)
# try:
except
:
# clean_contentRaw = clean_html_tag(contentRaw)
pre_contentRaw
=
contentRaw
# pre_contentRaw = preprocess(clean_contentRaw)
try
:
# except:
titleRaw
=
mms
[
'_source'
][
'titleRaw'
]
# pre_contentRaw = contentRaw
except
:
# try:
titleRaw
=
''
# titleRaw = mms['_source']['titleRaw']
try
:
# except:
summaryRaw
=
mms
[
'_source'
][
'summaryRaw'
]
# titleRaw = ''
except
:
# try:
summaryRaw
=
''
# summaryRaw = mms['_source']['summaryRaw']
try
:
# except:
clean_summaryRaw
=
clean_html_tag
(
summaryRaw
)
# summaryRaw = ''
pre_summaryRaw
=
preprocess
(
clean_summaryRaw
)
# try:
except
:
# clean_summaryRaw = clean_html_tag(summaryRaw)
pre_summaryRaw
=
summaryRaw
# pre_summaryRaw = preprocess(clean_summaryRaw)
# except:
# pre_summaryRaw = summaryRaw
log
.
info
(
f
'{id}--{title}---'
)
log
.
info
(
f
'{id}--{title}---'
)
labels
=
mms
[
'_source'
][
'labels'
]
#
labels = mms['_source']['labels']
tags
=
[]
#
tags = []
for
label
in
labels
:
#
for label in labels:
label_name
=
label
[
'labelMark'
]
#
label_name = label['labelMark']
if
label_name
==
"dynamic_tags"
:
#
if label_name == "dynamic_tags":
relationName
=
label
[
'relationName'
]
#
relationName = label['relationName']
tags
.
append
(
relationName
)
#
tags.append(relationName)
else
:
#
else:
continue
#
continue
info_tags
=
','
.
join
(
tags
)
#
info_tags = ','.join(tags)
# 存入数据库
# 存入数据库
dic
=
{
dic
=
{
...
@@ -184,10 +178,8 @@ def main(page, p, esMethod):
...
@@ -184,10 +178,8 @@ def main(page, p, esMethod):
"摘要"
:
pre_summary
,
"摘要"
:
pre_summary
,
"内容"
:
pre_content
,
"内容"
:
pre_content
,
"带标签内容"
:
contentWithTag
,
"带标签内容"
:
contentWithTag
,
"标题译文"
:
titleRaw
,
"来源"
:
origin
,
"摘要译文"
:
pre_summaryRaw
,
"发布时间"
:
publishDate
,
"内容译文"
:
pre_contentRaw
,
"标签"
:
info_tags
,
}
}
db_storage
.
insert_one
(
dic
)
db_storage
.
insert_one
(
dic
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论