Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
621196c7
提交
621196c7
authored
8月 15, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
天眼查企业动态
上级
80e7804c
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
55 行增加
和
43 行删除
+55
-43
BaseCore.py
base/BaseCore.py
+6
-0
tyc_qydt_add.py
comData/tcyQydt/tyc_qydt_add.py
+49
-43
没有找到文件。
base/BaseCore.py
浏览文件 @
621196c7
...
@@ -458,6 +458,7 @@ class BaseCore:
...
@@ -458,6 +458,7 @@ class BaseCore:
print
(
e
)
print
(
e
)
self
.
cnx
.
commit
()
self
.
cnx
.
commit
()
#获取企查查token
def
GetToken
(
self
):
def
GetToken
(
self
):
#获取企查查token
#获取企查查token
query
=
"select token from QCC_token "
query
=
"select token from QCC_token "
...
@@ -476,6 +477,7 @@ class BaseCore:
...
@@ -476,6 +477,7 @@ class BaseCore:
return
'cn'
return
'cn'
return
result
[
0
]
return
result
[
0
]
#追加接入excel
def
writerToExcel
(
self
,
detailList
,
filename
):
def
writerToExcel
(
self
,
detailList
,
filename
):
# filename='baidu搜索.xlsx'
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
# 读取已存在的xlsx文件
...
@@ -488,4 +490,8 @@ class BaseCore:
...
@@ -488,4 +490,8 @@ class BaseCore:
combined_data
.
to_excel
(
filename
,
index
=
False
)
combined_data
.
to_excel
(
filename
,
index
=
False
)
# return combined_data
# return combined_data
# Re-queue an enterprise whose collection failed or was interrupted.
def rePutIntoR(self, item, key='NewsEnterprise:gwqy_socialCode'):
    """Push *item* back onto a Redis list so it can be collected again.

    Args:
        item: Value to re-queue; callers in this commit pass the
            enterprise's social credit code.
        key: Name of the Redis list to push onto. Defaults to the queue
            name that was previously hard-coded, so existing
            one-argument calls behave exactly as before.

    Returns:
        None. The value returned by ``rpush`` is discarded.
    """
    # NOTE(review): self.r is presumably the redis client created in
    # BaseCore.__init__ (not visible here) -- confirm.
    # rpush appends to the tail of the list, so a retried item queues
    # behind work that is already pending.
    self.r.rpush(key, item)
comData/tcyQydt/tyc_qydt_add.py
浏览文件 @
621196c7
"""
增量采集:
取state为3、update_state为空的企业 表示上次采集成功的企业,
新增update_state字段,取一个企业更新为2,表示该企业正在采集。
采集完毕更新为1.
表示已经采集完成。根据date_time 来排列 每次就不会拿到重复的数据。
okCount
errorCount
repectCount
新增三个字段分别对应更新的up_okCount up_errorCount up_repectCount ,
记录这些更新的数据 然后加到原来的数据上表示该企业已采集多少动态
8.8日改版,企业动态也传kafka
"""
import
json
import
json
import
requests
,
time
,
pymysql
import
requests
,
time
,
pymysql
import
jieba
import
jieba
import
sys
import
sys
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
from
getTycId
import
getTycIdByXYDM
from
base.BaseCore
import
BaseCore
from
base.BaseCore
import
BaseCore
from
base.smart
import
smart_extractor
from
base.smart
import
smart_extractor
# sys.path.append('D:\\KK\\zzsn_spider\\base')
# sys.path.append('D:\\KK\\zzsn_spider\\base')
...
@@ -53,8 +38,8 @@ headers = {
...
@@ -53,8 +38,8 @@ headers = {
taskType
=
'企业动态/天眼查'
taskType
=
'企业动态/天眼查'
def
beinWork
(
tyc_code
,
social_code
):
def
beinWork
(
tyc_code
,
social_code
,
start_time
):
start_time
=
time
.
time
()
time
.
sleep
(
3
)
time
.
sleep
(
3
)
# retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
# retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
retData
=
{
'total'
:
0
,
'up_okCount'
:
0
,
'up_errorCount'
:
0
,
'up_repetCount'
:
0
}
retData
=
{
'total'
:
0
,
'up_okCount'
:
0
,
'up_errorCount'
:
0
,
'up_repetCount'
:
0
}
...
@@ -230,12 +215,13 @@ def beinWork(tyc_code, social_code):
...
@@ -230,12 +215,13 @@ def beinWork(tyc_code, social_code):
'sid'
:
'1684032033495392257'
,
'sid'
:
'1684032033495392257'
,
'sourceAddress'
:
link
,
# 原文链接
'sourceAddress'
:
link
,
# 原文链接
'summary'
:
info_page
[
'abstracts'
],
'summary'
:
info_page
[
'abstracts'
],
'title'
:
contentText
,
'title'
:
title
,
'type'
:
2
,
'type'
:
2
,
'socialCreditCode'
:
social_code
,
'socialCreditCode'
:
social_code
,
'year'
:
time_format
[:
4
]
'year'
:
time_format
[:
4
]
}
}
except
Exception
as
e
:
except
Exception
as
e
:
log
.
info
(
f
'传输失败:{social_code}----{link}'
)
log
.
info
(
f
'传输失败:{social_code}----{link}'
)
e
=
'数据库传输失败'
e
=
'数据库传输失败'
state
=
0
state
=
0
...
@@ -263,6 +249,7 @@ def beinWork(tyc_code, social_code):
...
@@ -263,6 +249,7 @@ def beinWork(tyc_code, social_code):
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
link
,
''
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
link
,
''
)
# return True
# return True
except
Exception
as
e
:
except
Exception
as
e
:
dic_result
=
{
dic_result
=
{
'success'
:
'false'
,
'success'
:
'false'
,
'message'
:
'操作失败'
,
'message'
:
'操作失败'
,
...
@@ -276,8 +263,6 @@ def beinWork(tyc_code, social_code):
...
@@ -276,8 +263,6 @@ def beinWork(tyc_code, social_code):
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
link
,
e
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
link
,
e
)
log
.
info
(
f
"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}"
)
log
.
info
(
f
"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}"
)
retData
[
'up_okCount'
]
=
up_okCount
retData
[
'up_okCount'
]
=
up_okCount
retData
[
'up_errorCount'
]
=
up_errorCount
retData
[
'up_errorCount'
]
=
up_errorCount
retData
[
'up_repetCount'
]
=
up_repetCount
retData
[
'up_repetCount'
]
=
up_repetCount
...
@@ -295,30 +280,49 @@ def doJob():
...
@@ -295,30 +280,49 @@ def doJob():
if
social_code
==
'None'
:
if
social_code
==
'None'
:
time
.
sleep
(
20
)
time
.
sleep
(
20
)
continue
continue
data
=
baseCore
.
getInfomation
(
social_code
)
start
=
time
.
time
()
id
=
data
[
0
]
try
:
xydm
=
data
[
2
]
data
=
baseCore
.
getInfomation
(
social_code
)
tycid
=
data
[
11
]
id
=
data
[
0
]
count
=
data
[
17
]
xydm
=
data
[
2
]
log
.
info
(
f
"{id}---{xydm}----{tycid}----开始处理"
)
tycid
=
data
[
11
]
start_time
=
time
.
time
()
if
tycid
==
None
:
try
:
retData
=
getTycIdByXYDM
(
xydm
)
tycid
=
retData
[
'tycData'
][
'id'
]
#todo:写入数据库
except
:
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
'获取天眼查id失败'
)
baseCore
.
rePutIntoR
(
social_code
)
continue
count
=
data
[
17
]
log
.
info
(
f
"{id}---{xydm}----{tycid}----开始处理"
)
start_time
=
time
.
time
()
# updateBeginSql = f"update ssqy_tyc set update_state=2,date_time=now() where id={id}"
# cursor.execute(updateBeginSql)
# cnx.commit()
# updateBeginSql = f"update ssqy_tyc set update_state=2,date_time=now() where id={id}"
# 开始采集企业动态
# cursor.execute(updateBeginSql)
retData
=
beinWork
(
tycid
,
xydm
,
start_time
)
# cnx.commit()
# 信息采集完成后将该企业的采集次数更新
runType
=
'NewsRunCount'
count
+=
1
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
total
=
retData
[
'total'
]
up_okCount
=
retData
[
'up_okCount'
]
up_errorCount
=
retData
[
'up_errorCount'
]
up_repetCount
=
retData
[
'up_repetCount'
]
log
.
info
(
f
"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}"
)
except
:
log
.
info
(
f
'==={social_code}=====获取企业信息失败===='
)
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
''
,
'获取企业信息失败'
)
# 开始采集企业动态
retData
=
beinWork
(
tycid
,
xydm
)
# 信息采集完成后将该企业的采集次数更新
runType
=
'NewsRunCount'
count
+=
1
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
total
=
retData
[
'total'
]
up_okCount
=
retData
[
'up_okCount'
]
up_errorCount
=
retData
[
'up_errorCount'
]
up_repetCount
=
retData
[
'up_repetCount'
]
log
.
info
(
f
"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}"
)
cursor
.
close
()
cursor
.
close
()
cnx
.
close
()
cnx
.
close
()
...
@@ -328,4 +332,6 @@ def doJob():
...
@@ -328,4 +332,6 @@ def doJob():
# Script entry point: start the collection job only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    doJob()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论