丁双波 / zzsn_spider · Commits · 55610b8f

Commit 55610b8f
authored Jan 05, 2024 by 薛凌堃

24/01/05

Parent: 23d4dd76

Showing 6 changed files with 563 additions and 121 deletions (+563 / -121)
base/BaseCore.py                            +1    -0
comData/BaseInfo_qcc/getcode.py             +31   -0
comData/YanBao/resentYanbao.py              +153  -120
comData/policylaw/gwyparts.py               +1    -1
comData/shangbiao/tyc_shangbiao_zg500.py    +151  -0
comData/zhuanli/tyc_zhuanli_zg500.py        +226  -0
base/BaseCore.py

@@ -403,6 +403,7 @@ class BaseCore:
         sql = "select proxy from clb_proxy"
         self.cursor.execute(sql)
         proxy_lists = self.cursor.fetchall()
+        self.cnx.commit()
         ip_list = []
         for proxy_ in proxy_lists:
             ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
...
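The one added line commits the connection after the proxy query; the surrounding code still turns each fetched row into a string and strips the tuple punctuation. As a minimal sketch (not part of the commit), the same list can be built by indexing the tuples that fetchall() already returns:

# Sketch only: each row of "select proxy from clb_proxy" comes back as a 1-tuple,
# e.g. ("1.2.3.4:8080",), so indexing replaces the str()/replace("('", ...) round trip.
def rows_to_ip_list(proxy_lists):
    return [row[0] for row in proxy_lists]

print(rows_to_ip_list([("1.2.3.4:8080",), ("5.6.7.8:3128",)]))  # ['1.2.3.4:8080', '5.6.7.8:3128']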
comData/BaseInfo_qcc/getcode.py (new file, 0 → 100644)

import pandas as pd
# from pandas import DataFrame as df
import pymysql

cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')

import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

df_all = pd.read_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', dtype=str)
list_com = []
for num_df in range(len(df_all)):
    com_name = str(df_all['企业名称'][num_df])
    dic_com = {'social_code': '', 'com_name': com_name}
    with cnx.cursor() as cursor:
        sel_sql = '''select social_credit_code from sys_base_enterprise where name = %s '''
        cursor.execute(sel_sql, com_name)
        selects = cursor.fetchone()
        if selects:
            print(f'【{num_df}/{len(df_all)}】==={com_name}找到')
            social_code = selects[0]
        else:
            print(f'【{num_df}/{len(df_all)}】==={com_name}未找到')
            social_code = ''
    df_all['信用代码'][num_df] = str(social_code)
df_all.to_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', index=False)
\ No newline at end of file
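getcode.py writes the looked-up code back with df_all['信用代码'][num_df] = ..., which is chained indexing; pandas does not guarantee that this assignment reaches the original frame. A small sketch of the same loop using .at follows; the frame, company names and code are made-up stand-ins for the Excel sheet and the sys_base_enterprise query:

import pandas as pd

# Hypothetical stand-ins for the real Excel sheet and the database lookup
df = pd.DataFrame({'企业名称': ['甲公司', '乙公司'], '信用代码': ['', '']}, dtype=str)
lookup = {'甲公司': '91110000XXXXXXXXXX'}

for num_df in range(len(df)):
    com_name = str(df['企业名称'][num_df])
    social_code = lookup.get(com_name, '')
    df.at[num_df, '信用代码'] = str(social_code)  # .at/.loc assigns back to the frame itself

print(df)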
comData/YanBao/resentYanbao.py

@@ -228,7 +228,7 @@ def download(data, order_by):
         'sid': sid,
         'sourceAddress': sourceAddress,
         'summary': summary,
-        'title': name_pdf,
+        'title': name_pdf.split('.pdf')[0],
         'type': '0'
     }
     # 将相应字段通过kafka传输保存
...
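The changed line strips the .pdf suffix from the title by splitting on the literal '.pdf'. A tiny sketch comparing that with os.path.splitext, which gives the same result here and is extension-agnostic (the file name is invented):

import os

name_pdf = '行业研究报告2023.pdf'        # invented example
print(name_pdf.split('.pdf')[0])        # '行业研究报告2023' - what the new line produces
print(os.path.splitext(name_pdf)[0])    # same result for any suffix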
@@ -396,8 +396,8 @@ def Mob():
     # usecount = loginfo.split('|')[2]
     usecount = 0
     # 测试用
-    # account = '13636711746'
-    # password = 'Zhenghao123'
+    account = '13636711746'
+    password = 'Zhenghao123'
     # account = '18703752600'
     # password = 'Axlk010208!'
...
@@ -407,8 +407,8 @@ def Mob():
     # password = 'xlk123456!'
     # account = '17103126138'
     # password = '171BlackOne'
-    account = '17103128590'
-    password = '171BlackTwo'
+    # account = '17103128590'
+    # password = '171BlackTwo'
     browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
     f_url = 'https://www.mob.com/developer/login'
     browser.get(f_url)
...
@@ -469,12 +469,8 @@ def Mob():
            i_soup = BeautifulSoup(res_href, 'html.parser')
            summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
            news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
-            # headers['token'] = '92b42171-7a33-4f3b-a25b-9ca689699e10'
-            # headers['token'] = '495f9714-7ea8-4987-91c0-2b0ede38238b'
-            headers['token'] = '05bc441a-b09b-40cb-ab65-8d9e63e5c529'
-            # headers['token'] = '0dcbde4a-9aaa-4651-b886-856add4b8df9'
-            # headers['token'] = '2fcdd67b-da81-4f2f-9d6f-529fdbf6ae1f'
-            # headers['token'] = 'dd54bc77-50fa-4a25-aec7-95ec45bd17f8'
+            headers['token'] = '2fd143d3-a1ec-4d9d-9d9b-38a1d4cf8387'
            news_req = session.get(url=news_url, headers=headers)
            pdf_url = news_req.json()['data']
...
@@ -693,31 +689,75 @@ def juliangsuanshu():
    getnews(browser)
    browser.quit()


def ke36switch(browser, info_url):
    try:
        browser.get(info_url)  # 跳到指定页面
        page_source = browser.page_source  # 获取页面信息
        soup_info = BeautifulSoup(page_source, 'html.parser')
        info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
        return soup_info
    except:
        browser.quit()
        proxy = baseCore.get_proxy()
        # proxy = {
        #     'http': '222.90.4.73:40018',
        #     'httpS': '222.90.4.73:40018'
        # }
        opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
        # opt.add_argument('--proxy-server=' + proxy['http'])
        browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
        browser.refresh()
        ke36switch(browser, info_url)


# 36氪
def ke36():
    # browser = webdriver.Chrome(chromedriver)
    proxy = baseCore.get_proxy()
    opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
    # opt.add_argument('--proxy-server=' + proxy['http'])
    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    url = 'https://36kr.com/academe'
    browser.get(url)  # 跳到指定页面
    time.sleep(3)
    for i in range(10):
        try:
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'show-more')))
            js = "var q=document.documentElement.scrollTop=3000"
            browser.execute_script(js)
            time.sleep(2)
            browser.find_element(By.CLASS_NAME, 'show-more').click()
        except:
            break
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    page_source = browser.page_source  # 获取页面信息
    soup = BeautifulSoup(page_source, 'html.parser')
    list_all = soup.find('div', {'class': 'report-list-wrapper'}).find_all('div', {'class': 'report-card type-4'})
    for one_info in list_all[::-1]:
        info_title = one_info.find('div', {'class': 'title'}).text
        info_zhaiyao = one_info.find('div', {'class': 'desc'}).text
        info_url = one_info.a.get('href')
        # is_member = r.sismember('report_pdf_three_history', info_url)
        # if is_member:
        #     continue
        soup_info = ke36switch(browser, info_url)
        info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
        if info_date < '2023-05-10':
            pass
        else:
            continue
        try:
            info_content = soup_info.find('div', {'class': 'common-width margin-bottom-20'}).text
        except:
            proxy = baseCore.get_proxy()
            opt.add_argument('--proxy-server=' + proxy['http'])
            browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
            ke36switch(browser, info_url)
        dic_post = {
            'title': info_title,  # 报告名称
            'url_pdf': '',  # 报告链接
@@ -734,7 +774,7 @@ def ke36():
            'sid': '1662008421217378306',  # 信息源id
        }
        order_by = 1
-        download(dic_post, order_by)
+        # download(dic_post, order_by)
        order_by += 1
        # print(page,dic_post)
        # url = 'http://114.115.155.139:5002/report_download'
...
@@ -742,6 +782,7 @@ def ke36():
        # res = requests.post(url, data=json.dumps(dic_post))
        # print(res.json())
        time.sleep(2)
    browser.quit()
...
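The ke36switch() helper added above reloads a page behind a fresh proxy when the Selenium fetch fails. The same retry-behind-a-new-proxy idea can be sketched without a browser, assuming only that baseCore.get_proxy() returns a requests-style proxy mapping such as {'http': 'http://ip:port'}; the function name and attempt count below are illustrative, not part of the commit:

import requests

def fetch_with_fresh_proxy(url, get_proxy, attempts=3):
    # Try up to `attempts` proxies; return the first successful response, else None.
    for _ in range(attempts):
        try:
            return requests.get(url, proxies=get_proxy(), timeout=10)
        except requests.RequestException:
            continue
    return None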
@@ -922,6 +963,28 @@ def shijiejingjiluntan():
     time.sleep(2)
     browser.quit()
+
+
+def get_json(key_word, page, headers):
+    param = {"uid": "", "keyword": key_word, "type": ["researchReport"], "client": "web", "clientVersion": "curr",
+             "clientType": "web", "param": {"researchReport": {"client": "web", "pageSize": 10, "pageIndex": page}}}
+    param_url = parse.quote(str(param).replace(" ", ""))
+    # param_url = parse.quote(str(param))
+    # param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
+    t = int(time.time() * 1000)
+    url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
+    # url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
+    res = requests.get(url=url, headers=headers).text[1:-1]
+    res_json = json.loads(res)
+    return res_json
+
+
 # 东方财富网
 def dongfangcaifu():
     headers = {
...
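get_json() calls the eastmoney search endpoint with an empty JSONP callback (cb=), so the body arrives wrapped in parentheses and the function strips them with .text[1:-1] before json.loads. A minimal sketch with a simulated payload; the keys mirror the ones the script reads, the numbers are invented:

import json

raw = '({"hitsTotal": 23, "result": {"researchReport": []}})'  # simulated JSONP body
res_json = json.loads(raw[1:-1])   # drop the surrounding parentheses, as .text[1:-1] does
print(res_json['hitsTotal'])       # 23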
@@ -965,101 +1028,70 @@ def dongfangcaifu():
        page = 1
        # for page in range(1,500):
        # log.info(page)
        res_json_ = get_json(key_word, page, headers)
        # 添加页数
        total = res_json_['hitsTotal']
        page = (total / 10) + 1
        for page_ in range(1, page + 1):
            res_json = get_json(key_word, page_, headers)
            list_all = res_json['result']['researchReport']
            if list_all:
                pass
            else:
                continue
            for one_news in list_all:
                news_title = one_news['title']
                news_title = news_title.replace('<em>', '').replace('</em>', '')
                news_date = one_news['date'][:10]
                comparison_date = "2023-12-08"
                # 比较发布日期是否小于2023-10-06
                if news_date < comparison_date:
                    continue
                else:
                    pass
                news_come = one_news['source']
                news_code = one_news['code']
                news_url = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={news_code}'
                news_res = requests.get(news_url)
                news_soup = BeautifulSoup(news_res.content, 'html.parser')
                try:
                    if '抱歉,您访问的页面不存在或已删除!' in news_soup.title.text:
                        continue
                except:
                    continue
                try:
                    news_content = news_soup.find('div', {'class': 'newsContent'}).text.strip()
                except:
                    news_content = news_soup.find('div', {'class': 'ctx-content'}).text.strip()
                try:
                    news_pdf = news_soup.find('div', {'class': 'detail-header'}).find_all('a')[-1].get('href')
                except:
                    news_pdf = news_soup.find('span', {'class': 'to-link'}).a.get('href')
                dic_post = {
                    'title': news_title,  # 报告名称
                    'url_pdf': news_pdf,  # 报告链接
                    'year': news_date[:4],  # 报告年份
                    'type_id': '4',  # 报告种类,(年报:1,季报:2,月报:3,研报:4)
                    'item_id': social_code,  # 关联记录id,如:企业信用代码
                    'category': 'pdf',  # 文件后缀名,如:pdf
                    'create_by': 'TangYuHang',  # 创建人,使用驼峰命名,如:TangYuHang
                    'publishDate': news_date,  # 时间
                    'origin': '东方财富网-研报中心',  # 来源
                    'sourceAddress': news_url,  # 原文链接
                    'content': '',  # 内容
                    'summary': news_content,  # 摘要
                    'sid': '1662008733005160449',  # 信息源id
                    'come': news_come,
                }
                order_by = 1
                download(dic_post, order_by)
                order_by += 1


# 东方财富网2
def dongfangcaifu2():
...
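The rewritten dongfangcaifu() derives the page count as page = (total / 10) + 1, which produces a float under Python 3 (one that range(1, page + 1) cannot consume) and one extra page when total is a multiple of 10. A hedged sketch of the same calculation with integer arithmetic, keeping the pageSize of 10 used by get_json:

import math

def page_count(total_hits, page_size=10):
    # Integer page count; avoids the float from (total / 10) + 1 and the extra page
    # when total_hits is an exact multiple of page_size.
    return math.ceil(total_hits / page_size)

for total in (0, 9, 10, 23):
    print(total, page_count(total))   # 0 0, 9 1, 10 1, 23 3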
@@ -1590,11 +1622,11 @@ def dongfangcaifu7():
 if __name__ == '__main__':
-    # try:
-    #     log.info('mob')
-    #     Mob()
-    # except Exception as e:
-    #     pass
+    try:
+        log.info('mob')
+        Mob()
+    except Exception as e:
+        pass
     # try:
     #     log.info('yidong_guanxiangtai')
     #     yidong_guanxiangtai()
...
@@ -1605,11 +1637,12 @@ if __name__ == '__main__':
     #     juliangsuanshu()
     # except Exception as e:
     #     pass
-    # try:
-    #     log.info('ke36')
-    #     ke36()
-    # except:
-    #     pass
-    ke36()
+    try:
+        log.info('ke36')
+        ke36()
+    except Exception as e:
+        pass
     # try:
     #     log.info('qianyanzhishiku')
     #     qianyanzhishiku()
...
comData/policylaw/gwyparts.py

@@ -121,7 +121,7 @@ def get_content2():
         except Exception as e:
             log.info(f'---{href}--------{e}-------')
             continue
-        if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
+        if '.wps' in file_href or '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
             file_name = file.text.strip()
             category = os.path.splitext(file_href)[1]
             if category not in file_name:
...
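The widened condition above is a long chain of substring checks, now also matching '.wps'. A compact sketch of the same filter driven by a suffix tuple; the extension list is illustrative and the check assumes plain file URLs without query strings:

import os

ALLOWED_EXTS = ('.wps', '.ofd', '.docx', '.doc', '.xls', '.xlsx', '.zip', '.rar', '.ppt', '.pdf')

def is_attachment(file_href):
    # One lowercased comparison also covers the '.PDF' / '.DOC' / '.XLS' spellings.
    return os.path.splitext(file_href)[1].lower() in ALLOWED_EXTS

print(is_attachment('http://example.gov.cn/files/通知附件.PDF'))   # True
print(is_attachment('http://example.gov.cn/files/index.html'))     # False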
comData/shangbiao/tyc_shangbiao_zg500.py (new file, 0 → 100644)

# 天眼查商标申请数量
# 接口 https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 请求方式 POST
import requests, time, re, random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM

baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/中国500强'

header = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Host': 'capi.tianyancha.com',
    'Origin': 'https://www.tianyancha.com',
    'Referer': 'https://www.tianyancha.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
    'X-TYCID': '6f6298905d3011ee96146793e725899d',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'version': 'TYC-Web'
}

if __name__ == "__main__":
    while True:
        start_time = time.time()
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('ShangBiao:zg500shSocial_code')
        # social_code = '91350700856994874M'
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == None:
            # time.sleep(20)
            break
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                pass
            else:
                # 数据重新塞入redis
                baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
                continue
            id = data[0]
            com_name = data[1]
            xydm = data[2]
            tycid = data[11]
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(xydm)
                    if retData['tycData'] and retData['reput']:
                        tycid = retData['tycData']['id']
                        # todo:写入数据库
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor.execute(updateSql)
                        cnx.commit()
                    elif not retData['tycData'] and retData['reput']:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
                        continue
                    elif not retData['reput'] and not retData['tycData']:
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
                    continue
            # count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始处理")
            t = int(time.time() * 1000)
            # url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
            url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
            # tycid = '209252214'
            # payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
            #            "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
            #            "sortType": "-100"}
            request = requests.get(url=url, headers=header, verify=False)
            # request = requests.post(url=url, headers=header, data=payload)
            # print(request.text)
            data_json = request.json()
            # print(data_json)
            try:
                all_data = data_json['data']['applyYearGraph']['statisticGraphData']
            except:
                dic_info = {
                    '企业名称': com_name,
                    '统一信用代码': social_code,
                }
                selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
                cursor.execute(selectSql)
                count = cursor.fetchone()[0]
                if count > 0:
                    log.info(f"{com_name}----已经存在---无商标数据")
                    continue
                else:
                    values_tuple = tuple(dic_info.values())
                    # log.info(f"{gpdm}-------{companyname}---新增")
                    insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
                    cursor.execute(insertSql, values_tuple)
                    cnx.commit()
                    log.info(f"{com_name}-----新增---无商标数据")
                    continue
            for info in all_data:
                year = info['desc']
                num = info['num']  # 申请商标数量
                dic_info = {
                    '企业名称': com_name,
                    '统一信用代码': social_code,
                    '年份': year,
                    '数量': num
                }
                selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
                cursor.execute(selectSql)
                count = cursor.fetchone()[0]
                if count > 0:
                    log.info(f"{com_name}-------{year}---已经存在")
                    continue
                else:
                    values_tuple = tuple(dic_info.values())
                    # log.info(f"{gpdm}-------{companyname}---新增")
                    insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
                    cursor.execute(insertSql, values_tuple)
                    cnx.commit()
                    log.info(f"{com_name}-------{year}---新增")
                    time.sleep(2)
            # list_all_info.append(dic_info)
            log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
        except Exception as e:
            log.info(f'==={social_code}=====获取企业信息失败==={e}=')
            # 重新塞入redis
            baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
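tyc_shangbiao_zg500.py interpolates social_code and year straight into its SELECT/INSERT strings. A sketch of the same check-then-insert step with parameterized SQL, assuming the same shangbiao_sh_tyc table and a PyMySQL-style cursor:

def upsert_year_count(cursor, cnx, com_name, social_code, year, num):
    # Same flow as the script: skip if the (company, year) row exists, otherwise insert.
    cursor.execute(
        "select count(1) from shangbiao_sh_tyc where social_code=%s and year=%s",
        (social_code, year),
    )
    if cursor.fetchone()[0] > 0:
        return False   # already recorded
    cursor.execute(
        "insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)",
        (com_name, social_code, year, num),
    )
    cnx.commit()
    return True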
comData/zhuanli/tyc_zhuanli_zg500.py (new file, 0 → 100644)

import requests, time, re, random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM

baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'


def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
    start_time = time.time()
    log.info(f'===正在处理第{page}页===')
    # list_all_info = []
    t = int(time.time() * 1000)
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Host': 'capi.tianyancha.com',
        'Origin': 'https://www.tianyancha.com',
        'Referer': 'https://www.tianyancha.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
        'X-TYCID': '6f6298905d3011ee96146793e725899d',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'version': 'TYC-Web'
    }
    url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
    try:
        ip = baseCore.get_proxy()
    except:
        time.sleep(2)
        ip = baseCore.get_proxy()
    try:
        res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
    except:
        for i in range(3):
            try:
                res_j = requests.get(url=url, headers=header, verify=False).json()
            except:
                time.sleep(2)
                continue
    # print(res_j)
    try:
        list_all = res_j['data']['items']
    except:
        dic_info = {
            '企业名称': com_name,
            '统一信用代码': social_code
        }
        selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
        cursor.execute(selectSql)
        count = cursor.fetchone()[0]
        if count > 0:
            log.info(f"{com_name}---{social_code}---已经存在---无专利")
            return 0
        else:
            values_tuple = tuple(dic_info.values())
            # log.info(f"{gpdm}-------{companyname}---新增")
            insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
            cursor.execute(insertSql, values_tuple)
            cnx.commit()
            log.info(f"{com_name}---{social_code}---新增---无专利")
            return 0
    # print(list_all)
    if list_all:
        for one_zhuanli in list_all:
            title = one_zhuanli['title']
            try:
                shenqingri = one_zhuanli['applicationTime']
            except:
                shenqingri = ''
            try:
                shenqing_code = one_zhuanli['patentNum']
            except:
                shenqing_code = ''
            try:
                leixing = one_zhuanli['patentType']
            except:
                leixing = ''
            try:
                status = one_zhuanli['lprs']
            except:
                status = ''
            try:
                gongkairi = one_zhuanli['pubDate']
            except:
                gongkairi = ''
            try:
                gongkai_code = one_zhuanli['pubnumber']
            except:
                gongkai_code = ''
            try:
                famingren = one_zhuanli['inventor']
            except:
                famingren = ''
            try:
                shenqingren = one_zhuanli['applicantName']
            except:
                shenqingren = ''
            try:
                gongneng = one_zhuanli['cat']
            except:
                gongneng = ''
            try:
                uuid = one_zhuanli['uuid']
            except:
                uuid = ''
            dic_info = {
                '企业名称': com_name,
                '统一信用代码': social_code,
                '专利名称': title,
                '申请日': shenqingri,
                '申请号': shenqing_code,
                '专利类型': leixing,
                '专利状态': status,
                '公开日': gongkairi,
                '公开号': gongkai_code,
                '发明人': famingren,
                '申请人': shenqingren,
                '功能': gongneng,
                '天眼查详情id': uuid,
                '年份': shenqingri[:4]
            }
            selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
            cursor.execute(selectSql)
            count = cursor.fetchone()[0]
            if count > 0:
                log.info(f"{com_name}-------{shenqing_code}---已经存在")
                continue
            else:
                values_tuple = tuple(dic_info.values())
                # log.info(f"{gpdm}-------{companyname}---新增")
                insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(insertSql, values_tuple)
                cnx.commit()
                log.info(f"{com_name}-------{shenqing_code}---新增")
                time.sleep(2)
        # list_all_info.append(dic_info)
        log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
        return page
    else:
        return 0


if __name__ == "__main__":
    while True:
        list_all_info = []
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code_zg500')
        # social_code = '91350700856994874M'
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == None:
            # time.sleep(20)
            break
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                pass
            else:
                # 数据重新塞入redis
                baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
                continue
            id = data[0]
            com_name = data[1]
            xydm = data[2]
            tycid = data[11]
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(xydm)
                    if retData['tycData'] and retData['reput']:
                        tycid = retData['tycData']['id']
                        # todo:写入数据库
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor.execute(updateSql)
                        cnx.commit()
                    elif not retData['tycData'] and retData['reput']:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
                        continue
                    elif not retData['reput'] and not retData['tycData']:
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始处理")
            page = 1
            while True:
                page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
                if page != 0:
                    page += 1
                else:
                    # print(len(list_all_info))
                    # df_all_info = pd.DataFrame(list_all_info)
                    # df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
                    log.info(f"{id}---{xydm}----{tycid}----结束处理")
                    break
        except Exception as e:
            log.info(f'==={social_code}=====获取企业信息失败==={e}=')
            # 重新塞入redis
            baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
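spider_zhuanli() guards every optional field of a patent item with its own try/except. A sketch of the same flattening done with dict.get defaults; the field names are the ones the script reads, and the sample item and company values are invented:

def parse_patent(one_zhuanli, com_name, social_code):
    g = one_zhuanli.get
    shenqingri = g('applicationTime', '')
    return {
        '企业名称': com_name,
        '统一信用代码': social_code,
        '专利名称': g('title', ''),
        '申请日': shenqingri,
        '申请号': g('patentNum', ''),
        '专利类型': g('patentType', ''),
        '专利状态': g('lprs', ''),
        '公开日': g('pubDate', ''),
        '公开号': g('pubnumber', ''),
        '发明人': g('inventor', ''),
        '申请人': g('applicantName', ''),
        '功能': g('cat', ''),
        '天眼查详情id': g('uuid', ''),
        '年份': shenqingri[:4],
    }

print(parse_patent({'title': '一种示例装置', 'applicationTime': '2023-06-01'}, '示例公司', '91350700XXXXXXXXXX'))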