王景浩 / zzsn_spider / Commits

Commit 4e84d611
Authored Oct 26, 2023 by 薛凌堃
Commit message: 10/26
Parent: c2749092

Showing 9 changed files with 361 additions and 43 deletions (+361, -43)
Changed files:

- base/RedisPPData.py (+0, -0)
- base/smart/smart_extractor.py (+15, -2)
- comData/Tyc/getTycId.py (+4, -2)
- comData/Tyc/newsbucai.py (+38, -7)
- comData/annualReport/雪球网-年报.py (+3, -3)
- comData/negative_news/creditchina.py (+131, -26)
- comData/tyctest/tycdt.py (+2, -2)
- comData/weixin_solo/get_tokenCookies.py (+1, -1)
- qiushi_leaderspeech.py (+167, -0)
base/RedisPPData.py @ 4e84d611 (diff collapsed)
base/smart/smart_extractor.py @ 4e84d611
```
# -*- coding: utf-8 -*-
import sys

import pandas as pd
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from base.smart.entity import *
from base.smart.smart_extractor_utility import SmartExtractorUtility

sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
from entity import *
from smart_extractor_utility import SmartExtractorUtility

# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement

...

@@ -135,6 +138,16 @@ class SmartExtractor:
        return self.get_extraction_result(article, link_text)

    def extract_by_html(self, html, link_text=''):
        """
        按HTML采集内容
        """
        # 采集正文:传入html
        article = self.goose.extract(raw_html=html)
        return self.get_extraction_result(article, link_text)

# url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
def extract_by_url_test(url_list, list_info_all):
    # 测试:按URL采集
    ...
```
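The new `extract_by_html` entry point feeds pre-fetched HTML into the same Goose pipeline that `extract_by_url` uses. A minimal usage sketch (not part of the commit); the `'cn'` language argument is an assumption modeled on the commented-out `SmartExtractor(lang)` call in comData/Tyc/newsbucai.py:

```python
# Hedged sketch: fetch a page ourselves, then hand the raw HTML to the extractor,
# mirroring how newsbucai.py combines reqDetailmsg with extract_by_html.
import requests
from base.smart.smart_extractor import SmartExtractor

def extract_from_fetched_page(url):
    smart = SmartExtractor('cn')                       # assumed language code
    resp = requests.get(url, timeout=8, verify=False)  # fetch the page locally
    resp.encoding = resp.apparent_encoding
    article = smart.extract_by_html(resp.text)         # parse the raw HTML
    return article.cleaned_text, article.text          # 不带标签正文, 带标签正文
```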
comData/Tyc/getTycId.py @ 4e84d611
```
# 根据信用代码获取天眼查id
import json
import random
import sys
import time

import pymysql
import requests
from base.BaseCore import BaseCore

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'Accept': 'application/json, text/plain, */*',
...
```
comData/Tyc/newsbucai.py @ 4e84d611
```
...
@@ -6,11 +6,12 @@ import requests, time, pymysql
import jieba
import sys
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
# from base.BaseCore import BaseCore
# from base.smart import smart_extractor
sys.path.append('D:\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
from smart import smart_extractor
import urllib3
...

@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor
taskType = '企业动态/天眼查/补采20W+'

def reqDetailmsg(url, headers):
    # proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    for i in range(0, 1):
        try:
            response = requests.get(url=url, headers=headers, timeout=8, verify=False)
            response.encoding = response.apparent_encoding
            htmltext = response.text
        except Exception as e:
            htmltext = ''
            log.info(f'{url}---详情请求失败--{e}')
        if htmltext:
            log.info(f'{url}---详情请求成功')
            break
    return htmltext

def beinWork(tyc_code, social_code, start_time):
    time.sleep(3)
...

@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time):
                # 开始进行智能解析
                # lang = baseCore.detect_language(title)
                # smart = smart_extractor.SmartExtractor(lang)
                #带标签正文
                contentText = smart.extract_by_url(link).text
                #不带标签正文
                content = smart.extract_by_url(link).cleaned_text
                # time.sleep(3)
                # req = requests.get(url=link,headers=headers,timeout=10)
                # html = BeautifulSoup(req.content,'html.parser')
                raw_html = reqDetailmsg(link, headers)
                if raw_html:
                    # soup = BeautifulSoup(raw_html, 'html.parser')
                    try:
                        article = smart.extract_by_html(raw_html)
                        content = article.cleaned_text
                        contentText = article.text
                    except Exception as e:
                        log.info(f'抽取失败!!{e}')
                # #带标签正文
                # contentText = smart.extract_by_url(link).text
                # #不带标签正文
                # content = smart.extract_by_url(link).cleaned_text
                # # time.sleep(3)
            except Exception as e:
                contentText = ''
            if contentText == '':
                log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
                e = '获取正文失败'
...

@@ -281,7 +312,7 @@ def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
        # social_code = '91440300665899831W'
        # social_code = '913205007764477744'
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == None:
            time.sleep(20)
...
```
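The `doJob` hunk keeps the usual worker-loop shape: pull one social credit code from a Redis-backed queue and back off when the queue is empty. A minimal sketch of that pattern (not part of the commit), assuming `redicPullData` returns `None` once the key is exhausted, as the sleep branch implies:

```python
# Hedged sketch of the worker loop used in doJob().
import time

def worker_loop(baseCore, handle_one):
    while True:
        social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
        if social_code is None:
            time.sleep(20)   # queue drained: wait before polling again
            continue
        handle_one(social_code)
```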
comData/annualReport/雪球网-年报.py @ 4e84d611
```
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
...

@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num):
        'sid': '1684032033495392257',
        'sourceAddress': year_url,  # 原文链接
        'summary': '',
        'title': name_pdf.replace(',pdf', ''),
        'title': name_pdf.replace('.pdf', ''),
        'type': 1,
        'socialCreditCode': social_code,
        'year': year
...

@@ -260,7 +260,7 @@ if __name__ == '__main__':
        start_time = time.time()
        # 获取企业信息
        # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
        social_code = '913412007050444417'
        social_code = '91330000734507783B'
        if not social_code:
            time.sleep(20)
            continue
...
```
comData/negative_news/creditchina.py @ 4e84d611
```
...
@@ -33,13 +33,14 @@ def getRequest(url,headers):
    return json_data

# 严重失信
def dishonesty():
def dishonesty(headers, com_name, social_code):
    list_dishonesty = []
    param = {
        'tableName': 'credit_zgf_fr_sxbzxr',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': '雷州市白金银座演艺文化实业有限公司',
        'tyshxydm': '91440882315032592M',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
...

@@ -50,14 +51,14 @@ def dishonesty():
    if json_data['status'] == 1:
        pass
    total_size = json_data['data']['totalSize']
    for page in total_size:
    for page in range(1, total_size + 1):
        param_page = {
            'tableName': 'credit_zgf_fr_sxbzxr',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': '雷州市白金银座演艺文化实业有限公司',
            'tyshxydm': '91440882315032592M',
            'page': f'{page}',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...

@@ -67,7 +68,7 @@ def dishonesty():
            pass
        info_list = json_data['data']['list']
        for info in info_list:
            entity = info['entity']
            entity = info
            iname = entity['iname']  # 失信被执行人姓名/名称
            cardnumber = entity['cardnumber']  # 组织机构代码
            court_name = entity['court_name']  # 执行法院
...

@@ -83,15 +84,34 @@ def dishonesty():
            performed_part = entity['performed_part']  # 已履行部分
            unperform_part = entity['unperform_part']  # 未履行部分
            dataSource = info['dataSource']  # 数据来源
            dic_dishonesty = {
                '失信被执行人姓名/名称': iname,
                '组织机构代码': cardnumber,
                '执行法院': court_name,
                '省份': area_name,
                '执行依据文号': case_code,
                '立案时间': reg_date,
                '案号': gist_cid,
                '做出执行依据单位': gist_unit,
                '生效法律文书确定的义务': duty,
                '被执行人的履行情况': performance,
                '失信被执行人行为具体情形': disreput_type_name,
                '发布时间': publish_date,
                '已履行部分': performed_part,
                '未履行部分': unperform_part,
                '数据来源': dataSource
            }
            list_dishonesty.append(dic_dishonesty)
    return list_dishonesty

# 行政处罚
def punish():
def punish(headers, com_name, social_code):
    list_punish = []
    param = {
        'tableName': 'credit_xyzx_fr_xzcf_new',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': '雷州市白金银座演艺文化实业有限公司',
        'tyshxydm': '91440882315032592M',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
...

@@ -106,15 +126,16 @@ def punish():
    if total_size > 0:
        pass
    else:
        log.info()
    for page in total_size:
        log.info(f'该企业{com_name}无行政处罚信息')
        return list_punish
    for page in range(1, total_size + 1):
        param_page = {
            'tableName': 'credit_xyzx_fr_xzcf_new',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': '雷州市白金银座演艺文化实业有限公司',
            'tyshxydm': '91440882315032592M',
            'page': f'{page}',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...

@@ -141,6 +162,88 @@ def punish():
            cf_sjly = entity['cf_sjly']  # 数据来源
            cf_sjlydm = entity['cf_sjlydm']  # 数据来源单位统一社会信用代码
            dic_punish = {
                '行政处罚决定书文号': cf_wsh,
                '处罚类别': cf_cflb,
                '处罚决定日期': cf_jdrq,
                '处罚内容': cf_nr,
                '罚款金额(万元)': cf_nr_fk,
                '没收违法所得、没收非法财物的金额(万元)': cf_nr_wfff,
                '暂扣或吊销证照名称及编号': cf_nr_zkdx,
                '违法行为类型': cf_wfxw,
                '违法事实': cf_sy,
                '处罚依据': cf_yj,
                '处罚机关': cf_cfjg,
                '处罚机关统一社会信用代码': cf_cfjgdm,
                '数据来源': cf_sjly,
                '数据来源单位统一社会信用代码': cf_sjlydm
            }
            list_punish.append(dic_punish)
    return list_punish

# 经营异常
def abnormal(headers, com_name, social_code):
    list_abhormal = []
    param = {
        'tableName': 'credit_scjdglzj_fr_ycjyml',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
    url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page=1&pageSize=10'
    json_data = getRequest(url, headers)
    # print(json_data)
    if json_data['status'] == 1:
        pass
    # 总条数
    total_size = json_data['data']['totalSize']
    if total_size > 0:
        pass
    else:
        log.info()
    for page in total_size:
        param_page = {
            'tableName': 'credit_xyzx_fr_xzcf_new',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
        json_data = getRequest(url, headers)
        if json_data['status'] == 1:
            pass
        info_list = json_data['data']['list']
        for entity in info_list:
            entname = entity['entname']  # 企业名称
            uniscid = entity['uniscid']  # 社会统一信用代码
            lerep = entity['lerep']  # 法定代表人
            pripid = entity['pripid']  # 主体身份代码
            regno = entity['regno']  # 注册号
            specausename = entity['specausename']  # 列入经营异常名录原因类型名称
            abntime = entity['abntime']  # 设立日期
            decorgname = entity['decorgname']  # 列入决定机关名称
            dataSource = entity['dataSource']  # 数据来源
            dic_abnormal = {
                '企业名称': entname,
                '社会统一信用代码': uniscid,
                '法定代表人': lerep,
                '主体身份代码': pripid,
                '注册号': regno,
                '列入经营异常名录原因类型名称': specausename,
                '设立日期': abntime,
                '列入决定机关名称': decorgname,
                '数据来源': dataSource
            }
            list_abhormal.append(dic_abnormal)
    return list_abhormal

if __name__=='__main__':
...

@@ -154,16 +257,18 @@ if __name__=='__main__':
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    type_list = ['严重失信主体名单', '行政管理']
    com_name = ''
    social_code = ''
    dishonesty()
    punish()
    com_name = '石家庄交投集团工程服务有限责任公司'
    social_code = '91130100MA7EK14C8L'
    # list_dishonesty = dishonesty(headers,com_name,social_code)
    # print(list_dishonesty)
    list_punish = punish(headers, com_name, social_code)
    print(list_punish)
    # abnormal(headers,com_name,social_code)
    # 报告链接
    url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
    report_json = getRequest(url_report, headers)
    reportNumber = report_json['data']['reportNumber']
    pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
    # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
    # report_json = getRequest(url_report, headers)
    # reportNumber = report_json['data']['reportNumber']
    # pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
    # respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30)
...
```
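All three query helpers page through the same creditchina catalogSearch endpoint with `tableName`, `keyword`, `tyshxydm`, `page` and `pageSize` parameters. A hedged sketch of that pagination pattern as one generic helper; `catalog_search_all` and its page-count arithmetic are illustrative and not part of the commit, but it reuses the file's existing `getRequest(url, headers)` JSON helper and the URL pattern shown above:

```python
# Hedged sketch: one paginated fetch over catalogSearch, following the URL
# pattern used by dishonesty()/punish()/abnormal(). Assumes getRequest() from
# this file is in scope and returns the parsed JSON response.
import math

def catalog_search_all(table_name, com_name, social_code, headers, page_size=10):
    base = ('https://public.creditchina.gov.cn/private-api/catalogSearch'
            f'?tableName={table_name}&searchState=1&scenes=defaultscenario'
            f'&keyword={com_name}&tyshxydm={social_code}')
    first = getRequest(f'{base}&page=1&pageSize={page_size}', headers)
    total_size = first['data']['totalSize']          # total number of records
    records = list(first['data']['list'])
    for page in range(2, math.ceil(total_size / page_size) + 1):
        page_data = getRequest(f'{base}&page={page}&pageSize={page_size}', headers)
        records.extend(page_data['data']['list'])
    return records
```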
comData/tyctest/tycdt.py @ 4e84d611
```
...
@@ -58,8 +58,8 @@ class Tycdt(object):
    def doJob(self):
        while True:
            # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
            # social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
            social_code = '913205002517479347'
            social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
            # social_code = '913205002517479347'
            # 判断 如果Redis中已经没有数据,则等待
            if social_code == None:
                time.sleep(20)
...
```
comData/weixin_solo/get_tokenCookies.py @ 4e84d611
```
...
@@ -50,7 +50,7 @@ if __name__=="__main__":
    opt.add_experimental_option("excludeSwitches", ["enable-automation"])
    opt.add_experimental_option('excludeSwitches', ['enable-logging'])
    opt.add_experimental_option('useAutomationExtension', False)
    opt.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
    opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    chromedriver = r'D:\cmd100\chromedriver.exe'
    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    url = "https://mp.weixin.qq.com/"
...
```
qiushi_leaderspeech.py @ 4e84d611 (new file, mode 0 → 100644)
```
import datetime
import json
import time

import redis
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from kafka import KafkaProducer
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)

def sendKafka(dic_news):
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
        kafka_result = producer.send("crawlerInfo", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'ture',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False

def getRequest(url, headers):
    req = requests.get(url=url, headers=headers, timeout=30)
    if req.status_code == 200:
        pass
    soup = BeautifulSoup(req.content, 'html.parser')
    return soup

def deletep(soup, attribute_to_delete, value_to_delete):
    # 查找带有指定属性的P标签并删除
    p_tags = soup.find_all('p', {attribute_to_delete: value_to_delete})
    for p_tag in p_tags:
        p_tag.decompose()

def deletek(soup):
    # 删除空白标签(例如<p></p>、<p><br></p>,img、video、hr除外)
    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' '):
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                break
        else:
            i.decompose()

# 将html中的相对地址转换成绝对地址
def paserUrl(html, listurl):
    # 获取所有的<a>标签和<img>标签
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')
    links = html.find_all(['a', 'img'])
    # 遍历标签,将相对地址转换为绝对地址
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html

if __name__ == '__main__':
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': 'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
        'Host': 'www.qstheory.cn',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = 'http://www.qstheory.cn/qs/mulu.htm'
    soup_report = getRequest(url, headers)
    report_list = soup_report.find_all('div', class_='col-sm-3')
    for book in report_list:
        href = book.find('div', class_='booktitle').find('a')['href']
        year = book.find('div', class_='booktitle').find('a').text
        soup_href = getRequest(href, headers)
        period = soup_href.find('div', class_='highlight')
        deletep(period, 'align', 'center')
        deletek(period)
        period_list = period.find_all('p')
        for p in period_list:
            period_href = p.find('a')['href']
            period_title = p.find('a').text
            soup_news = getRequest(period_href, headers)
            deletep(soup_news, 'align', 'center')
            deletek(soup_news)
            title_list = soup_news.select('div[class="highlight"]>p')[1:]
            for new in title_list:
                try:
                    deletek(new)
                    try:
                        author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
                    except:
                        continue
                    if len(author) > 4:
                        continue
                    # if '(' in author or '本刊' in author or '国家' in author\
                    #     or '中共' in author or '记者' in author or '新闻社' in author\
                    #     or '党委' in author or '调研组' in author or '研究中心' in author\
                    #     or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
                    if '(' in author or '本刊' in author or '国家' in author \
                            or '中共' in author or '记者' in author or '新闻社' in author \
                            or '党委' in author or '”' in author \
                            or '大学' in author or '洛桑江村' in author:
                        continue
                    new_href = new.find('a')['href']
                    is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
                    if is_member:
                        continue
                    new_title = new.find('a').text.replace('\u3000', ' ').lstrip(' ').replace('——', '').replace('\xa0', '')
                except:
                    continue
                soup_new = getRequest(new_href, headers)
                deletek(soup_new)
                deletep(soup_new, 'style', 'TEXT-ALIGN: center')
                result = soup_new.find('div', class_='inner')
                if result:
                    pass
                else:
                    continue
                span_list = result.find_all('span')
                source = span_list[0].text.replace('来源:', '').strip('\r\n')
                pub_time = span_list[2].text.strip('\r\n')
                content = soup_new.find('div', class_='highlight').text
                paserUrl(soup_new, new_href)
                contentWithTag = soup_new.find('div', class_='highlight')
                nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dic_news = {
                    'sid': '1716996740019585025',
                    'title': new_title,
                    'source': "16",
                    'origin': source,
                    'author': author,
                    'publishDate': pub_time,
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'sourceAddress': new_href,
                    "createDate": nowDate
                }
                # log.info(dic_news)
                if sendKafka(dic_news):
                    r.sadd('qiushileaderspeech::' + period_title, new_href)
                    log.info(f'采集成功----{dic_news["sourceAddress"]}')
```
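qiushi_leaderspeech.py de-duplicates with a Redis set per issue: `sismember` is checked before an article is crawled, and `sadd` runs only after the Kafka send succeeds. A standalone sketch of that pattern (the connection details below are placeholders, not the production host):

```python
# Hedged sketch of the per-issue de-duplication used above; not part of the commit.
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # placeholder connection

def already_collected(period_title, article_url):
    return bool(r.sismember('qiushileaderspeech::' + period_title, article_url))

def mark_collected(period_title, article_url):
    # Call only after the article was successfully pushed to Kafka.
    r.sadd('qiushileaderspeech::' + period_title, article_url)
```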