丁双波 / zzsn_spider · Commits

Commit 4e84d611
Authored Oct 26, 2023 by 薛凌堃
Commit message: 10/26
Parent: c2749092

Showing 9 changed files with 361 additions and 43 deletions (+361 / -43)
Changed files:

    base/RedisPPData.py                      +0    -0
    base/smart/smart_extractor.py            +15   -2
    comData/Tyc/getTycId.py                  +4    -2
    comData/Tyc/newsbucai.py                 +38   -7
    comData/annualReport/雪球网-年报.py        +3    -3
    comData/negative_news/creditchina.py     +131  -26
    comData/tyctest/tycdt.py                 +2    -2
    comData/weixin_solo/get_tokenCookies.py  +1    -1
    qiushi_leaderspeech.py                   +167  -0
base/RedisPPData.py (+0 -0)

    (diff collapsed in the original view; no textual changes shown)
base/smart/smart_extractor.py (+15 -2)

 # -*- coding: utf-8 -*-
+import sys
 import pandas as pd
 import requests
 from goose3 import Goose
 from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
-from base.smart.entity import *
-from base.smart.smart_extractor_utility import SmartExtractorUtility
+sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
+from entity import *
+from smart_extractor_utility import SmartExtractorUtility
 # goose3自带的lxml,提示找不到etree,但仍可使用
 from lxml import etree
 from lxml.html import HtmlElement
...
@@ -135,6 +138,16 @@ class SmartExtractor:
         return self.get_extraction_result(article, link_text)

+    def extract_by_html(self, html, link_text=''):
+        """
+        按HTML采集内容
+        """
+        # 采集正文:传入html
+        article = self.goose.extract(raw_html=html)
+        return self.get_extraction_result(article, link_text)
+
 #url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
 def extract_by_url_test(url_list, list_info_all):
     # 测试:按URL采集
...
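The new `extract_by_html` method lets callers download a page themselves and hand the raw HTML to Goose, instead of letting Goose fetch the URL. A minimal usage sketch, not from the repository: the import path and the `'cn'` constructor argument are assumptions (the class is constructed elsewhere as `SmartExtractor(lang)`); `extract_by_url`, `extract_by_html`, `.text` and `.cleaned_text` come from this commit.

```python
import requests
from smart_extractor import SmartExtractor  # assumption: base/smart is on sys.path

smart = SmartExtractor('cn')  # assumption: language code selecting Chinese stop words

url = 'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023'

# Existing path: Goose downloads the page itself.
article = smart.extract_by_url(url)

# New path: fetch the page with your own headers/timeouts, then parse the raw HTML.
resp = requests.get(url, timeout=8)
resp.encoding = resp.apparent_encoding
article = smart.extract_by_html(resp.text)

print(article.cleaned_text)  # plain-text body
print(article.text)          # body with tags
```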
comData/Tyc/getTycId.py (+4 -2)

 # 根据信用代码获取天眼查id
 import json
 import random
+import sys
 import time
 import pymysql
 import requests
-from base.BaseCore import BaseCore
+sys.path.append('D:\\kkwork\\zzsn_spider\\base')
+import BaseCore
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 requests.adapters.DEFAULT_RETRIES = 5
-baseCore = BaseCore()
+baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 headers = {
     'Accept': 'application/json, text/plain, */*',
...
comData/Tyc/newsbucai.py (+38 -7)

@@ -6,11 +6,12 @@ import requests, time, pymysql
 import jieba
 import sys
+from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 from getTycId import getTycIdByXYDM
 # from base.BaseCore import BaseCore
 # from base.smart import smart_extractor
-sys.path.append('D:\\zzsn_spider\\base')
+sys.path.append('D:\\kkwork\\zzsn_spider\\base')
 import BaseCore
 from smart import smart_extractor
 import urllib3
...
@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor
 taskType = '企业动态/天眼查/补采20W+'

+def reqDetailmsg(url, headers):
+    # proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
+    for i in range(0, 1):
+        try:
+            response = requests.get(url=url, headers=headers, timeout=8, verify=False)
+            response.encoding = response.apparent_encoding
+            htmltext = response.text
+        except Exception as e:
+            htmltext = ''
+            log.info(f'{url}---详情请求失败--{e}')
+        if htmltext:
+            log.info(f'{url}---详情请求成功')
+            break
+    return htmltext
+
 def beinWork(tyc_code, social_code, start_time):
     time.sleep(3)
...
@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time):
                     # 开始进行智能解析
                     # lang = baseCore.detect_language(title)
                     # smart = smart_extractor.SmartExtractor(lang)
-                    #带标签正文
-                    contentText = smart.extract_by_url(link).text
-                    #不带标签正文
-                    content = smart.extract_by_url(link).cleaned_text
-                    # time.sleep(3)
+                    # req = requests.get(url=link,headers=headers,timeout=10)
+                    # html = BeautifulSoup(req.content,'html.parser')
+                    raw_html = reqDetailmsg(link, headers)
+                    if raw_html:
+                        # soup = BeautifulSoup(raw_html, 'html.parser')
+                        try:
+                            article = smart.extract_by_html(raw_html)
+                            content = article.cleaned_text
+                            contentText = article.text
+                        except Exception as e:
+                            log.info(f'抽取失败!!{e}')
+                    # #带标签正文
+                    # contentText = smart.extract_by_url(link).text
+                    # #不带标签正文
+                    # content = smart.extract_by_url(link).cleaned_text
+                    # # time.sleep(3)
                 except Exception as e:
                     contentText = ''
                 if contentText == '':
                     log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
                     e = '获取正文失败'
...
@@ -281,7 +312,7 @@ def doJob():
     while True:
         # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
         social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
-        # social_code = '91440300665899831W'
+        # social_code = '913205007764477744'
         # 判断 如果Redis中已经没有数据,则等待
         if social_code == None:
             time.sleep(20)
...
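The beinWork hunk replaces the "let Goose fetch the link" call with a fetch-then-extract flow: request the page once via `reqDetailmsg`, then parse the raw HTML with the new `smart.extract_by_html`. A standalone sketch of that pattern under the same assumptions (only `reqDetailmsg`-style fetching and `extract_by_html` come from the commit; `fetch_html` and `extract_article` are hypothetical helper names for illustration):

```python
import requests

def fetch_html(url, headers, timeout=8):
    # Same idea as reqDetailmsg: single attempt, decode with apparent_encoding,
    # return '' on failure so the caller can fall back or log.
    try:
        resp = requests.get(url=url, headers=headers, timeout=timeout, verify=False)
        resp.encoding = resp.apparent_encoding
        return resp.text
    except Exception:
        return ''

def extract_article(smart, url, headers):
    # Mirror of the new beinWork logic: fetch locally, then extract.
    # Returns (cleaned_text, tagged_text), or ('', '') when either step fails.
    raw_html = fetch_html(url, headers)
    if not raw_html:
        return '', ''
    try:
        article = smart.extract_by_html(raw_html)
        return article.cleaned_text, article.text
    except Exception:
        return '', ''
```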
comData/annualReport/雪球网-年报.py (+3 -3)

 # -*- coding: utf-8 -*-
...
@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num):
                 'sid': '1684032033495392257',
                 'sourceAddress': year_url,  # 原文链接
                 'summary': '',
-                'title': name_pdf.replace(',pdf', ''),
+                'title': name_pdf.replace('.pdf', ''),
                 'type': 1,
                 'socialCreditCode': social_code,
                 'year': year
...
@@ -260,7 +260,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # 获取企业信息
         # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
-        social_code = '913412007050444417'
+        social_code = '91330000734507783B'
         if not social_code:
             time.sleep(20)
             continue
...
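The functional fix here is the title cleanup: `replace(',pdf', '')` never matched a real filename, so report titles kept their ".pdf" suffix; the commit changes it to `replace('.pdf', '')`. A small illustration (the filename is made up): `str.replace` drops the substring wherever it appears, while `removesuffix` (Python 3.9+) only strips a trailing extension, which is a stricter alternative if that matters.

```python
name_pdf = '2022年年度报告.pdf'  # hypothetical example value

# What the commit does: remove every '.pdf' occurrence.
title = name_pdf.replace('.pdf', '')    # '2022年年度报告'

# Stricter alternative (Python 3.9+): only strip a trailing '.pdf'.
title = name_pdf.removesuffix('.pdf')   # '2022年年度报告'
```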
comData/negative_news/creditchina.py (+131 -26)

@@ -33,13 +33,14 @@ def getRequest(url,headers):
     return json_data

 # 严重失信
-def dishonesty():
+def dishonesty(headers, com_name, social_code):
+    list_dishonesty = []
     param = {
         'tableName': 'credit_zgf_fr_sxbzxr',
         'searchState': '1',
         'scenes': 'defaultscenario',
-        'keyword': '雷州市白金银座演艺文化实业有限公司',
-        'tyshxydm': '91440882315032592M',
+        'keyword': com_name,
+        'tyshxydm': social_code,
         'page': '1',
         'pageSize': '10'
     }
...
@@ -50,14 +51,14 @@ def dishonesty():
     if json_data['status'] == 1:
         pass
     total_size = json_data['data']['totalSize']
-    for page in total_size:
+    for page in range(1, total_size + 1):
         param_page = {
             'tableName': 'credit_zgf_fr_sxbzxr',
             'searchState': '1',
             'scenes': 'defaultscenario',
-            'keyword': '雷州市白金银座演艺文化实业有限公司',
-            'tyshxydm': '91440882315032592M',
-            'page': f'{page}',
+            'keyword': com_name,
+            'tyshxydm': social_code,
+            'page': page,
             'pageSize': '10'
         }
         url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...
@@ -67,7 +68,7 @@ def dishonesty():
         pass
         info_list = json_data['data']['list']
         for info in info_list:
-            entity = info['entity']
+            entity = info
             iname = entity['iname']  # 失信被执行人姓名/名称
             cardnumber = entity['cardnumber']  # 组织机构代码
             court_name = entity['court_name']  # 执行法院
...
@@ -83,15 +84,34 @@ def dishonesty():
             performed_part = entity['performed_part']  # 已履行部分
             unperform_part = entity['unperform_part']  # 未履行部分
             dataSource = info['dataSource']  # 数据来源
+            dic_dishonesty = {
+                '失信被执行人姓名/名称': iname,
+                '组织机构代码': cardnumber,
+                '执行法院': court_name,
+                '省份': area_name,
+                '执行依据文号': case_code,
+                '立案时间': reg_date,
+                '案号': gist_cid,
+                '做出执行依据单位': gist_unit,
+                '生效法律文书确定的义务': duty,
+                '被执行人的履行情况': performance,
+                '失信被执行人行为具体情形': disreput_type_name,
+                '发布时间': publish_date,
+                '已履行部分': performed_part,
+                '未履行部分': unperform_part,
+                '数据来源': dataSource
+            }
+            list_dishonesty.append(dic_dishonesty)
+    return list_dishonesty

 # 行政处罚
-def punish():
+def punish(headers, com_name, social_code):
+    list_punish = []
     param = {
         'tableName': 'credit_xyzx_fr_xzcf_new',
         'searchState': '1',
         'scenes': 'defaultscenario',
-        'keyword': '雷州市白金银座演艺文化实业有限公司',
-        'tyshxydm': '91440882315032592M',
+        'keyword': com_name,
+        'tyshxydm': social_code,
         'page': '1',
         'pageSize': '10'
     }
...
@@ -106,15 +126,16 @@ def punish():
     if total_size > 0:
         pass
     else:
-        log.info()
-    for page in total_size:
+        log.info(f'该企业{com_name}无行政处罚信息')
+        return list_punish
+    for page in range(1, total_size + 1):
         param_page = {
             'tableName': 'credit_xyzx_fr_xzcf_new',
             'searchState': '1',
             'scenes': 'defaultscenario',
-            'keyword': '雷州市白金银座演艺文化实业有限公司',
-            'tyshxydm': '91440882315032592M',
-            'page': f'{page}',
+            'keyword': com_name,
+            'tyshxydm': social_code,
+            'page': page,
             'pageSize': '10'
         }
         url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...
@@ -141,6 +162,88 @@ def punish():
             cf_sjly = entity['cf_sjly']  # 数据来源
             cf_sjlydm = entity['cf_sjlydm']  # 数据来源单位统一社会信用代码
+            dic_punish = {
+                '行政处罚决定书文号': cf_wsh,
+                '处罚类别': cf_cflb,
+                '处罚决定日期': cf_jdrq,
+                '处罚内容': cf_nr,
+                '罚款金额(万元)': cf_nr_fk,
+                '没收违法所得、没收非法财物的金额(万元)': cf_nr_wfff,
+                '暂扣或吊销证照名称及编号': cf_nr_zkdx,
+                '违法行为类型': cf_wfxw,
+                '违法事实': cf_sy,
+                '处罚依据': cf_yj,
+                '处罚机关': cf_cfjg,
+                '处罚机关统一社会信用代码': cf_cfjgdm,
+                '数据来源': cf_sjly,
+                '数据来源单位统一社会信用代码': cf_sjlydm
+            }
+            list_punish.append(dic_punish)
+    return list_punish
+
+# 经营异常
+def abnormal(headers, com_name, social_code):
+    list_abhormal = []
+    param = {
+        'tableName': 'credit_scjdglzj_fr_ycjyml',
+        'searchState': '1',
+        'scenes': 'defaultscenario',
+        'keyword': com_name,
+        'tyshxydm': social_code,
+        'page': '1',
+        'pageSize': '10'
+    }
+    url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page=1&pageSize=10'
+    json_data = getRequest(url, headers)
+    # print(json_data)
+    if json_data['status'] == 1:
+        pass
+    # 总条数
+    total_size = json_data['data']['totalSize']
+    if total_size > 0:
+        pass
+    else:
+        log.info()
+    for page in total_size:
+        param_page = {
+            'tableName': 'credit_xyzx_fr_xzcf_new',
+            'searchState': '1',
+            'scenes': 'defaultscenario',
+            'keyword': com_name,
+            'tyshxydm': social_code,
+            'page': page,
+            'pageSize': '10'
+        }
+        url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
+        json_data = getRequest(url, headers)
+        if json_data['status'] == 1:
+            pass
+        info_list = json_data['data']['list']
+        for entity in info_list:
+            entname = entity['entname']  # 企业名称
+            uniscid = entity['uniscid']  # 社会统一信用代码
+            lerep = entity['lerep']  # 法定代表人
+            pripid = entity['pripid']  # 主体身份代码
+            regno = entity['regno']  # 注册号
+            specausename = entity['specausename']  # 列入经营异常名录原因类型名称
+            abntime = entity['abntime']  # 设立日期
+            decorgname = entity['decorgname']  # 列入决定机关名称
+            dataSource = entity['dataSource']  # 数据来源
+            dic_abnormal = {
+                '企业名称': entname,
+                '社会统一信用代码': uniscid,
+                '法定代表人': lerep,
+                '主体身份代码': pripid,
+                '注册号': regno,
+                '列入经营异常名录原因类型名称': specausename,
+                '设立日期': abntime,
+                '列入决定机关名称': decorgname,
+                '数据来源': dataSource
+            }
+            list_abhormal.append(dic_abnormal)
+    return list_abhormal

 if __name__=='__main__':
...
@@ -154,16 +257,18 @@ if __name__=='__main__':
         'sec-ch-ua-mobile': '?0',
         'sec-ch-ua-platform': '"Windows"'
     }
-    type_list = ['严重失信主体名单', '行政管理']
-    com_name = '石家庄交投集团工程服务有限责任公司'
-    social_code = '91130100MA7EK14C8L'
-    dishonesty()
-    punish()
+    com_name = ''
+    social_code = ''
+    # list_dishonesty = dishonesty(headers,com_name,social_code)
+    # print(list_dishonesty)
+    list_punish = punish(headers, com_name, social_code)
+    print(list_punish)
+    # abnormal(headers,com_name,social_code)
     # 报告链接
-    url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
-    report_json = getRequest(url_report, headers)
-    reportNumber = report_json['data']['reportNumber']
-    pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
+    # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
+    # report_json = getRequest(url_report, headers)
+    # reportNumber = report_json['data']['reportNumber']
+    # pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
     # respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30)
...
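After this commit, `dishonesty`, `punish` and the new `abnormal` all take `(headers, com_name, social_code)` and return a list of dicts instead of operating on hard-coded test values. A hedged calling sketch, not from the repository: the import path and the placeholder header/company values are assumptions; the signatures and dict keys come from the diff above.

```python
from creditchina import dishonesty, punish, abnormal  # assumption: run from comData/negative_news

headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder; the script builds a fuller header set
com_name = '某某有限公司'                   # placeholder company name
social_code = '91xxxxxxxxxxxxxxxxxx'      # placeholder unified social credit code

blacklist = dishonesty(headers, com_name, social_code)        # 严重失信
penalties = punish(headers, com_name, social_code)            # 行政处罚
abnormal_entries = abnormal(headers, com_name, social_code)   # 经营异常

for row in penalties:
    print(row['行政处罚决定书文号'], row['处罚决定日期'])
```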
comData/tyctest/tycdt.py (+2 -2)

@@ -58,8 +58,8 @@ class Tycdt(object):
     def doJob(self):
         while True:
             # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
-            # social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
-            social_code = '913205002517479347'
+            social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
+            # social_code = '913205002517479347'
             # 判断 如果Redis中已经没有数据,则等待
             if social_code == None:
                 time.sleep(20)
...
comData/weixin_solo/get_tokenCookies.py (+1 -1)

@@ -50,7 +50,7 @@ if __name__=="__main__":
     opt.add_experimental_option("excludeSwitches", ["enable-automation"])
     opt.add_experimental_option('excludeSwitches', ['enable-logging'])
     opt.add_experimental_option('useAutomationExtension', False)
-    opt.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
+    opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
     chromedriver = r'D:\cmd100\chromedriver.exe'
     browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
     url = "https://mp.weixin.qq.com/"
...
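The only change here repoints `opt.binary_location` at the local Chrome install. As a side note, `chrome_options=` and `executable_path=` are the Selenium 3 keyword names; if this environment were ever moved to Selenium 4, an equivalent setup would look like the sketch below (paths copied from the script, API per current Selenium 4; this is not part of the commit).

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

opt = webdriver.ChromeOptions()
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'

# Selenium 4: the driver path goes through Service, the options through options=.
service = Service(r'D:\cmd100\chromedriver.exe')
browser = webdriver.Chrome(service=service, options=opt)
browser.get("https://mp.weixin.qq.com/")
```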
qiushi_leaderspeech.py (new file, mode 100644, +167 -0)

import datetime
import json
import time

import redis
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from kafka import KafkaProducer
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)


def sendKafka(dic_news):
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
        kafka_result = producer.send("crawlerInfo", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'ture',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False


def getRequest(url, headers):
    req = requests.get(url=url, headers=headers, timeout=30)
    if req.status_code == 200:
        pass
    soup = BeautifulSoup(req.content, 'html.parser')
    return soup


def deletep(soup, attribute_to_delete, value_to_delete):
    # 查找带有指定属性的P标签并删除
    p_tags = soup.find_all('p', {attribute_to_delete: value_to_delete})
    for p_tag in p_tags:
        p_tag.decompose()


def deletek(soup):
    # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' '):
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                break
        else:
            i.decompose()


# 将html中的相对地址转换成绝对地址
def paserUrl(html, listurl):
    # 获取所有的<a>标签和<img>标签
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')
    links = html.find_all(['a', 'img'])
    # 遍历标签,将相对地址转换为绝对地址
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html


if __name__ == '__main__':
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': 'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
        'Host': 'www.qstheory.cn',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = 'http://www.qstheory.cn/qs/mulu.htm'
    soup_report = getRequest(url, headers)
    report_list = soup_report.find_all('div', class_='col-sm-3')
    for book in report_list:
        href = book.find('div', class_='booktitle').find('a')['href']
        year = book.find('div', class_='booktitle').find('a').text
        soup_href = getRequest(href, headers)
        period = soup_href.find('div', class_='highlight')
        deletep(period, 'align', 'center')
        deletek(period)
        period_list = period.find_all('p')
        for p in period_list:
            period_href = p.find('a')['href']
            period_title = p.find('a').text
            soup_news = getRequest(period_href, headers)
            deletep(soup_news, 'align', 'center')
            deletek(soup_news)
            title_list = soup_news.select('div[class="highlight"]>p')[1:]
            for new in title_list:
                try:
                    deletek(new)
                    try:
                        author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
                    except:
                        continue
                    if len(author) > 4:
                        continue
                    # if '(' in author or '本刊' in author or '国家' in author\
                    #         or '中共' in author or '记者' in author or '新闻社' in author\
                    #         or '党委' in author or '调研组' in author or '研究中心' in author\
                    #         or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
                    if '(' in author or '本刊' in author or '国家' in author \
                            or '中共' in author or '记者' in author or '新闻社' in author \
                            or '党委' in author or '”' in author \
                            or '大学' in author or '洛桑江村' in author:
                        continue
                    new_href = new.find('a')['href']
                    is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
                    if is_member:
                        continue
                    new_title = new.find('a').text.replace('\u3000', ' ').lstrip(' ').replace('——', '').replace('\xa0', '')
                except:
                    continue
                soup_new = getRequest(new_href, headers)
                deletek(soup_new)
                deletep(soup_new, 'style', 'TEXT-ALIGN: center')
                result = soup_new.find('div', class_='inner')
                if result:
                    pass
                else:
                    continue
                span_list = result.find_all('span')
                source = span_list[0].text.replace('来源:', '').strip('\r\n')
                pub_time = span_list[2].text.strip('\r\n')
                content = soup_new.find('div', class_='highlight').text
                paserUrl(soup_new, new_href)
                contentWithTag = soup_new.find('div', class_='highlight')
                nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dic_news = {
                    'sid': '1716996740019585025',
                    'title': new_title,
                    'source': "16",
                    'origin': source,
                    'author': author,
                    'publishDate': pub_time,
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'sourceAddress': new_href,
                    "createDate": nowDate
                }
                # log.info(dic_news)
                if sendKafka(dic_news):
                    r.sadd('qiushileaderspeech::' + period_title, new_href)
                    log.info(f'采集成功----{dic_news["sourceAddress"]}')