Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
08e4725c
提交
08e4725c
authored
12月 21, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
12/21
上级
8f2915d4
全部展开
显示空白字符变更
内嵌
并排
正在显示
43 个修改的文件
包含
232 行增加
和
22 行删除
+232
-22
BaseCore.py
REITs_policyData/BaseCore.py
+0
-0
DisInfo-shanghai.py
REITs_policyData/DisInfo-shanghai.py
+0
-0
FundAnncmnt-shenzhen.py
REITs_policyData/FundAnncmnt-shenzhen.py
+0
-0
FundsList-shenzhen.py
REITs_policyData/FundsList-shenzhen.py
+0
-0
LawRules_2_shenzhen.py
REITs_policyData/LawRules_2_shenzhen.py
+0
-0
LawRules_shenzhen.py
REITs_policyData/LawRules_shenzhen.py
+0
-0
MarketOverview-shenzhen.py
REITs_policyData/MarketOverview-shenzhen.py
+0
-0
ProductQuotes-shanghai.py
REITs_policyData/ProductQuotes-shanghai.py
+0
-0
ProjectDynamics-shanghai.py
REITs_policyData/ProjectDynamics-shanghai.py
+0
-0
ProjectDynamics-shenzhen.py
REITs_policyData/ProjectDynamics-shenzhen.py
+0
-0
REITsDailyFund-shanghai.py
REITs_policyData/REITsDailyFund-shanghai.py
+0
-0
RuleGuide_shanghai.py
REITs_policyData/RuleGuide_shanghai.py
+0
-0
RuleGuide_shenzhen.py
REITs_policyData/RuleGuide_shenzhen.py
+0
-0
cushman.py
REITs_policyData/cushman.py
+0
-0
info-shanghai.py
REITs_policyData/info-shanghai.py
+0
-0
policy_beijing.py
REITs_policyData/policy_beijing.py
+0
-0
policy_chongqing.py
REITs_policyData/policy_chongqing.py
+0
-0
policy_fujian.py
REITs_policyData/policy_fujian.py
+0
-0
policy_guangdong.py
REITs_policyData/policy_guangdong.py
+0
-0
policy_guangxi.py
REITs_policyData/policy_guangxi.py
+0
-0
policy_gwy.py
REITs_policyData/policy_gwy.py
+0
-0
policy_hainan.py
REITs_policyData/policy_hainan.py
+0
-0
policy_heilongjiang.py
REITs_policyData/policy_heilongjiang.py
+0
-0
policy_hubei.py
REITs_policyData/policy_hubei.py
+0
-0
policy_jiangsu.py
REITs_policyData/policy_jiangsu.py
+0
-0
policy_jiangxi.py
REITs_policyData/policy_jiangxi.py
+0
-0
policy_jilin.py
REITs_policyData/policy_jilin.py
+0
-0
policy_liaoning.py
REITs_policyData/policy_liaoning.py
+0
-0
policy_neimenggu.py
REITs_policyData/policy_neimenggu.py
+0
-0
policy_shandong.py
REITs_policyData/policy_shandong.py
+0
-0
policy_shanghai.py
REITs_policyData/policy_shanghai.py
+0
-0
policy_shanxi.py
REITs_policyData/policy_shanxi.py
+0
-0
policy_sichuan.py
REITs_policyData/policy_sichuan.py
+0
-0
policy_tianjin.py
REITs_policyData/policy_tianjin.py
+0
-0
policy_yunnan.py
REITs_policyData/policy_yunnan.py
+0
-0
policy_zhejiang.py
REITs_policyData/policy_zhejiang.py
+0
-0
reits.py
REITs_policyData/reits.py
+0
-0
start.py
REITs_policyData/start.py
+5
-3
Singapore Exchange.py
REITs_policyData/国际市场/Singapore Exchange.py
+0
-0
baseinfo1122.py
comData/BaseInfo_qcc/baseinfo1122.py
+18
-15
zhaiquan.py
comData/bond_zjh/zhaiquan.py
+0
-0
gwyparts.py
comData/policylaw/gwyparts.py
+3
-4
tyc_zhuanli.py
comData/zhuanli/tyc_zhuanli.py
+206
-0
没有找到文件。
REITs
专题数据
/BaseCore.py
→
REITs
_policyData
/BaseCore.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/DisInfo-shanghai.py
→
REITs
_policyData
/DisInfo-shanghai.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/FundAnncmnt-shenzhen.py
→
REITs
_policyData
/FundAnncmnt-shenzhen.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/FundsList-shenzhen.py
→
REITs
_policyData
/FundsList-shenzhen.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/LawRules_2_shenzhen.py
→
REITs
_policyData
/LawRules_2_shenzhen.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/LawRules_shenzhen.py
→
REITs
_policyData
/LawRules_shenzhen.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/MarketOverview-shenzhen.py
→
REITs
_policyData
/MarketOverview-shenzhen.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/ProductQuotes-shanghai.py
→
REITs
_policyData
/ProductQuotes-shanghai.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/ProjectDynamics-shanghai.py
→
REITs
_policyData
/ProjectDynamics-shanghai.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/ProjectDynamics-shenzhen.py
→
REITs
_policyData
/ProjectDynamics-shenzhen.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/REITsDailyFund-shanghai.py
→
REITs
_policyData
/REITsDailyFund-shanghai.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/RuleGuide_shanghai.py
→
REITs
_policyData
/RuleGuide_shanghai.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/RuleGuide_shenzhen.py
→
REITs
_policyData
/RuleGuide_shenzhen.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/cushman.py
→
REITs
_policyData
/cushman.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/info-shanghai.py
→
REITs
_policyData
/info-shanghai.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_beijing.py
→
REITs
_policyData
/policy_beijing.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_chongqing.py
→
REITs
_policyData
/policy_chongqing.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_fujian.py
→
REITs
_policyData
/policy_fujian.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_guangdong.py
→
REITs
_policyData
/policy_guangdong.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_guangxi.py
→
REITs
_policyData
/policy_guangxi.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_gwy.py
→
REITs
_policyData
/policy_gwy.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_hainan.py
→
REITs
_policyData
/policy_hainan.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_heilongjiang.py
→
REITs
_policyData
/policy_heilongjiang.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_hubei.py
→
REITs
_policyData
/policy_hubei.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_jiangsu.py
→
REITs
_policyData
/policy_jiangsu.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_jiangxi.py
→
REITs
_policyData
/policy_jiangxi.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_jilin.py
→
REITs
_policyData
/policy_jilin.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_liaoning.py
→
REITs
_policyData
/policy_liaoning.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_neimenggu.py
→
REITs
_policyData
/policy_neimenggu.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_shandong.py
→
REITs
_policyData
/policy_shandong.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_shanghai.py
→
REITs
_policyData
/policy_shanghai.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_shanxi.py
→
REITs
_policyData
/policy_shanxi.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_sichuan.py
→
REITs
_policyData
/policy_sichuan.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_tianjin.py
→
REITs
_policyData
/policy_tianjin.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_yunnan.py
→
REITs
_policyData
/policy_yunnan.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/policy_zhejiang.py
→
REITs
_policyData
/policy_zhejiang.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/reits.py
→
REITs
_policyData
/reits.py
浏览文件 @
08e4725c
File moved
REITs
专题数据
/start.py
→
REITs
_policyData
/start.py
浏览文件 @
08e4725c
import
reits
import
reits
import
reits
import
policy_
beijing
,
policy_
chongqing
,
policy_fujian
,
policy_guangdong
import
policy_chongqing
,
policy_fujian
,
policy_guangdong
import
policy_guangxi
,
policy_gwy
,
policy_hainan
,
policy_heilongjiang
,
policy_hubei
,
policy_jiangsu
import
policy_jiangxi
,
policy_jilin
,
policy_liaoning
,
policy_neimenggu
,
policy_shandong
,
policy_hubei
import
policy_shanxi
,
policy_sichuan
,
policy_tianjin
,
policy_yunnan
,
policy_zhejiang
import
RuleGuide_shanghai
,
RuleGuide_shenzhen
import
LawRules_shenzhen
,
LawRules_2_shenzhen
from
REITs_policyData.policy_beijing
import
beijing
if
__name__
==
"__mian__"
:
policy_beijing
.
beijing
()
beijing
()
reits
.
sse
()
reits
.
reform
()
reits
.
hebei
()
...
...
REITs
专题数据
/国际市场/Singapore Exchange.py
→
REITs
_policyData
/国际市场/Singapore Exchange.py
浏览文件 @
08e4725c
File moved
comData/BaseInfo_qcc/baseinfo1122.py
浏览文件 @
08e4725c
# -*- coding: utf-8 -*-
"""
模拟点击的方法不行,涉及到需要账号登录
"""
import
json
import
re
import
time
...
...
@@ -296,7 +292,7 @@ def dic_handle(result_dic):
return
aa_dict
# 采集准备
def
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
def
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
# if social_code:
# dic_info = baseCore.getInfomation(social_code)
...
...
@@ -342,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else
:
# 开始采集
try
:
if
spiderwork
(
soup
,
com_name
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
if
spiderwork
(
soup
,
com_name
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
count
+=
1
log
.
info
(
f
'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}'
)
token
.
updateTokeen
(
id_cookie
,
3
)
...
...
@@ -377,7 +373,7 @@ def ifbeforename(company_url):
return
''
# 采集基本信息和工商信息
def
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
def
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
qccid
=
company_url
.
split
(
'firm/'
)[
1
]
.
split
(
'.html'
)[
0
]
# 将采集到的企查查id更新
updateSql
=
f
"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
...
...
@@ -467,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic
[
'listingDate'
]
=
listingDate
aa_dic
[
'category'
]
=
category
aa_dic
[
'exchange'
]
=
exchange
aa_dic
[
'listingType'
]
=
listType
# print(aa_dic)
sendkafka
(
aa_dic
)
...
...
@@ -486,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic
[
'listingDate'
]
=
listingDate
aa_dic
[
'category'
]
=
category
aa_dic
[
'exchange'
]
=
exchange
aa_dic
[
'listingType'
]
=
listType
sendkafka
(
aa_dic
)
# 判断名称是否统一
def
spiderwork
(
soup
,
receptname
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
):
def
spiderwork
(
soup
,
receptname
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
):
company_url
=
''
try
:
company_list
=
soup
.
find
(
'table'
,
class_
=
'app-ltable ntable ntable-list ntable ntable-list'
)
...
...
@@ -530,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url
=
info_t
.
find
(
'a'
)[
'href'
]
beforename
=
ifbeforename
(
company_url
)
if
beforename
==
receptname
:
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
)
spiderinfo
(
company_url
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
else
:
#没有搜到相同的企业名称
data
=
[
com_name
,
social_code
]
...
...
@@ -544,7 +540,7 @@ if __name__ == '__main__':
while
True
:
nowtime
=
baseCore
.
getNowTime
(
1
)
.
replace
(
'-'
,
''
)[:
8
]
file_name
=
f
'./data/国内企业基本信息采集情况
_{nowtime}
.xlsx'
file_name
=
f
'./data/国内企业基本信息采集情况.xlsx'
file
.
createFile
(
file_name
)
cookieinfo
=
token
.
getToken
()
...
...
@@ -553,6 +549,7 @@ if __name__ == '__main__':
else
:
log
.
info
(
'==========已无cookies=========='
)
time
.
sleep
(
30
)
continue
id_cookie
=
cookieinfo
[
0
]
cookie_
=
json
.
loads
(
cookieinfo
[
1
])
...
...
@@ -599,6 +596,11 @@ if __name__ == '__main__':
while
flag
:
log
.
info
(
'--------已没有数据---------'
)
time
.
sleep
(
30
)
if
not
baseCore
.
check_mysql_conn
(
cnx_
):
# 144数据库
cnx_
=
baseCore
.
cnx
cursor_
=
cnx_
.
cursor
()
log
.
info
(
'===11数据库重新连接成功==='
)
company_field
=
baseCore
.
redicPullData
(
'BaseInfoEnterprise:gnqy_socialCode'
)
if
company_field
:
flag
=
False
...
...
@@ -608,7 +610,7 @@ if __name__ == '__main__':
continue
social_code
=
company_field
.
split
(
'|'
)[
0
]
com_name
=
company_field
.
split
(
'|'
)[
2
]
.
replace
(
' '
,
''
)
com_name
=
company_field
.
split
(
'|'
)[
1
]
.
replace
(
' '
,
''
)
ynDomestic
=
company_field
.
split
(
'|'
)[
15
]
countryName
=
company_field
.
split
(
'|'
)[
16
]
...
...
@@ -617,6 +619,7 @@ if __name__ == '__main__':
listingDate
=
company_field
.
split
(
'|'
)[
21
]
category
=
company_field
.
split
(
'|'
)[
19
]
exchange
=
company_field
.
split
(
'|'
)[
20
]
listType
=
company_field
.
split
(
'|'
)[
21
]
# ynDomestic = ''
# countryName = ''
# securitiesCode = ''
...
...
@@ -625,8 +628,8 @@ if __name__ == '__main__':
# category = ''
# exchange = ''
count
=
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
ynDomestic
,
countryName
,
file_name
)
time
.
sleep
(
40
)
count
=
redaytowork
(
com_name
,
social_code
,
securitiesCode
,
securitiesShortName
,
listingDate
,
category
,
exchange
,
listType
,
ynDomestic
,
countryName
,
file_name
)
time
.
sleep
(
2
)
# break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
...
...
comData/bond_zjh/zhaiquan.py
0 → 100644
浏览文件 @
08e4725c
差异被折叠。
点击展开。
comData/policylaw/gwyparts.py
浏览文件 @
08e4725c
...
...
@@ -94,7 +94,7 @@ def get_content2():
child_type
=
content_dict
[
'childtype'
]
# 主题分类
except
:
child_type
=
''
#
#
判断是否已经爬取过
# 判断是否已经爬取过
is_href
=
baseTool
.
db_storage
.
find_one
({
'网址'
:
href
})
if
is_href
:
num
+=
1
...
...
@@ -102,6 +102,7 @@ def get_content2():
time
.
sleep
(
1
)
continue
try
:
# href = 'https://www.gov.cn/zhengce/zhengceku/202312/content_6921452.htm'
resp
=
requests
.
get
(
url
=
href
,
headers
=
baseTool
.
headers
,
verify
=
False
)
resp
.
encoding
=
resp
.
apparent_encoding
resp_text
=
resp
.
text
...
...
@@ -120,9 +121,7 @@ def get_content2():
except
Exception
as
e
:
log
.
info
(
f
'---{href}--------{e}-------'
)
continue
if
'.pdf'
in
file_href
or
'.docx'
in
file_href
or
'.doc'
in
file_href
or
'xls'
in
file_href
or
'.zip'
in
file_href
\
or
'.rar'
in
file_href
or
'.ppt'
in
file_href
or
'.PDF'
in
file_href
or
'.DOC'
in
file_href
\
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
or
'.odf'
in
file_href
:
if
'.ofd'
in
file_href
or
'.docx'
in
file_href
or
'.doc'
in
file_href
or
'xls'
in
file_href
or
'.zip'
in
file_href
or
'.rar'
in
file_href
or
'.ppt'
in
file_href
or
'.PDF'
in
file_href
or
'.DOC'
in
file_href
or
'.XLS'
in
file_href
or
'.ZIP'
in
file_href
or
'.RAR'
in
file_href
or
'.pdf'
in
file_href
:
file_name
=
file
.
text
.
strip
()
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
if
category
not
in
file_name
:
...
...
comData/zhuanli/tyc_zhuanli.py
0 → 100644
浏览文件 @
08e4725c
import
requests
,
time
,
re
,
random
from
base
import
BaseCore
import
pandas
as
pd
from
bs4
import
BeautifulSoup
as
bs
from
comData.Tyc.getTycId
import
getTycIdByXYDM
# Label passed to baseCore.recordLog for every log row written by this script.
taskType = '天眼查专利/国内上市'

# One shared project helper instance; the DB connection, cursor and logger
# used throughout this module are all taken from it.
baseCore = BaseCore.BaseCore()
cnx, cursor = baseCore.cnx, baseCore.cursor
log = baseCore.getLogger()
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
    """Fetch one page of a company's patent list from Tianyancha and store new rows.

    Each returned patent is looked up in ``zhuanli_sh_tyc`` by its application
    number (``shenqing_code``); rows that are not present yet are inserted and
    committed one by one.

    Args:
        com_name: Company name written to the ``com_name`` column.
        social_code: Unified social credit code written to ``social_code``.
        tycid: Tianyancha company id used in the request URL.
        page: Page number to request (the page size is fixed at 100).
        list_all_info: Unused; kept so existing callers keep working.

    Returns:
        ``page`` when the response contained at least one item, ``0`` when the
        item list was empty (the caller treats ``0`` as "no more pages").

    Raises:
        Exception: the last request/JSON error when the proxied request and
            all three direct retries fail.
        KeyError / TypeError: when the response does not have the expected
            ``data.items`` structure or an item has no ``title``.
    """
    start_time = time.time()
    log.info(f'===正在处理第{page}页===')
    # Millisecond timestamp sent as the "_" query parameter.
    t = int(time.time() * 1000)
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Host': 'capi.tianyancha.com',
        'Origin': 'https://www.tianyancha.com',
        'Referer': 'https://www.tianyancha.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        # SECURITY NOTE(review): a session token is hard-coded in source; it
        # should be loaded from configuration/secret storage and rotated.
        'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
        'X-TYCID': '6f6298905d3011ee96146793e725899d',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'version': 'TYC-Web',
    }
    url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'

    # Getting a proxy is retried once after a short pause.
    try:
        ip = baseCore.get_proxy()
    except Exception:
        time.sleep(2)
        ip = baseCore.get_proxy()

    # First attempt goes through the proxy; on failure fall back to up to three
    # direct requests, stopping at the first one that succeeds.
    try:
        res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
    except Exception as first_error:
        last_error = first_error
        for _ in range(3):
            try:
                res_j = requests.get(url=url, headers=header, verify=False).json()
                break
            except Exception as retry_error:
                last_error = retry_error
                time.sleep(2)
        else:
            # Every attempt failed: surface the real cause instead of failing
            # later on an unbound ``res_j``.
            raise last_error

    list_all = res_j['data']['items']
    if not list_all:
        return 0

    for one_zhuanli in list_all:
        title = one_zhuanli['title']
        # Optional fields default to '' when the key is absent.
        shenqingri = one_zhuanli.get('applicationTime', '')
        shenqing_code = one_zhuanli.get('patentNum', '')
        leixing = one_zhuanli.get('patentType', '')
        status = one_zhuanli.get('lprs', '')
        gongkairi = one_zhuanli.get('pubDate', '')
        gongkai_code = one_zhuanli.get('pubnumber', '')
        famingren = one_zhuanli.get('inventor', '')
        shenqingren = one_zhuanli.get('applicantName', '')
        gongneng = one_zhuanli.get('cat', '')
        uuid = one_zhuanli.get('uuid', '')

        # Key order must match the column order of the INSERT below, because
        # the values are passed positionally.
        dic_info = {
            '企业名称': com_name,
            '统一信用代码': social_code,
            '专利名称': title,
            '申请日': shenqingri,
            '申请号': shenqing_code,
            '专利类型': leixing,
            '专利状态': status,
            '公开日': gongkairi,
            '公开号': gongkai_code,
            '发明人': famingren,
            '申请人': shenqingren,
            '功能': gongneng,
            '天眼查详情id': uuid,
            # First four characters of the application date; guarded so a
            # missing/None date does not raise.
            '年份': shenqingri[:4] if shenqingri else '',
        }

        # Parameterized like the INSERT below, so values containing quotes
        # cannot break or inject into the statement.
        selectSql = "select count(1) from zhuanli_sh_tyc where shenqing_code=%s"
        cursor.execute(selectSql, (shenqing_code,))
        count = cursor.fetchone()[0]
        if count > 0:
            log.info(f"{com_name}-------{shenqing_code}---已经存在")
            continue

        values_tuple = tuple(dic_info.values())
        insertSql = (
            "insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,"
            "shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,"
            "shenqingren,gongneng,uuid,year) "
            "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        cursor.execute(insertSql, values_tuple)
        cnx.commit()
        log.info(f"{com_name}-------{shenqing_code}---新增")
        # NOTE(review): the original indentation was lost in this view; the
        # pause is assumed to apply after each newly inserted row — confirm.
        time.sleep(2)

    log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
    return page
if __name__ == "__main__":
    # Redis list this task consumes; anything that must be retried by this
    # task has to go back onto the same list.
    queue_key = 'ZhuanLi:gnshSocial_code'

    while True:
        list_all_info = []
        # Pull the next social credit code from Redis.
        social_code = baseCore.redicPullData(queue_key)
        # Stop when the queue is drained.
        if social_code is None:
            break
        start = time.time()
        try:
            # Look up the company's base record for this social credit code.
            data = baseCore.getInfomation(social_code)
            if not data:
                # No record came back: push the code back and move on.
                baseCore.rePutIntoR(queue_key, social_code)
                continue
            company_id = data[0]
            com_name = data[1]
            xydm = data[2]
            tycid = data[11]

            if tycid is None or tycid == '':
                # No Tianyancha id stored yet: resolve it from the credit code.
                try:
                    retData = getTycIdByXYDM(xydm)
                    if retData['tycData'] and retData['reput']:
                        tycid = retData['tycData']['id']
                        # Persist the resolved id (parameterized statement).
                        updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
                        cursor.execute(updateSql, (tycid, xydm))
                        cnx.commit()
                    elif not retData['tycData'] and retData['reput']:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        # Re-queue on this task's own list; the original pushed
                        # to 'NewsEnterprise:gnqy_socialCode', a different list.
                        baseCore.rePutIntoR(queue_key, social_code)
                        continue
                    elif not retData['reput'] and not retData['tycData']:
                        continue
                    # NOTE(review): when 'tycData' is set but 'reput' is falsy,
                    # execution falls through with an empty tycid, as in the
                    # original — confirm whether that case can occur.
                except Exception:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR(queue_key, social_code)
                    continue

            log.info(f"{company_id}---{xydm}----{tycid}----开始处理")
            page = 1
            # Walk the pages until spider_zhuanli reports an empty page (0).
            while True:
                page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
                if page != 0:
                    page += 1
                else:
                    log.info(f"{company_id}---{xydm}----{tycid}----结束处理")
                    break
        except Exception as e:
            log.info(f'==={social_code}=====获取企业信息失败==={e}=')
            # Put the code back so it is retried later.
            baseCore.rePutIntoR(queue_key, social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            # NOTE(review): original indentation was lost in this view; the
            # pause is assumed to belong to the failure path — confirm.
            time.sleep(5)
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论