丁双波 / zzsn_spider / Commits / 1bb5b282

Commit 1bb5b282 authored Oct 27, 2023 by 薛凌堃
Commit message: 10/27
Parent: fb61875d
Showing 5 changed files with 89 additions and 19 deletions.
base/RedisPPData.py (+0 −0)
comData/annualReport/fbs_annualreport.py (+5 −4)
comData/annualReport/证监会-年报.py (+4 −4)
comData/annualReport1014/report.py (+2 −1)
comData/negative_news/creditchina.py (+78 −10)
base/RedisPPData.py
This source diff could not be displayed because it is too large. You can view the blob instead.
comData/annualReport/fbs_annualreport.py
@@ -58,7 +58,7 @@ if __name__ == '__main__':
         'Accept-Encoding': 'gzip, deflate, br',
         'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
     }
-    query = "select * from clb_sys_attachment where id= 383007"
+    query = "SELECT * FROM clb_sys_attachment WHERE type_id=1 AND source='证监会'"
     cursor_.execute(query)
     results = cursor_.fetchall()
     for result in results:
@@ -74,9 +74,10 @@ if __name__ == '__main__':
             pass
         else:
             com_name = selects[1]
-        full_path = 'http://114.115.215.96/' + result[6]
+        full_path = 'http://zzsn.luyuen.com/' + result[19]
         year = result[9]
         create_time = result[13]
+        publish = str(result[21])
         content = ''
         for i in range(0, 3):
             try:
@@ -102,9 +103,9 @@ if __name__ == '__main__':
             'id': '',
             'keyWords': '',
             'lang': detect_language,
-            'origin': com_name + '企业官网',
+            'origin': '证监会',
             # 'origin': '雪球网',
-            'publishDate': str(year) + '-12-31',
+            'publishDate': publish,
             'sid': '1684032033495392257',
             'sourceAddress': '',  # 原文链接
             'summary': '',
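Net effect of these hunks: the job now bulk-selects every 证监会 attachment instead of one hard-coded test row, builds download URLs against zzsn.luyuen.com rather than the raw IP, stamps records with their real publish time, and labels the origin 证监会. A minimal sketch of the resulting loop, assuming a pymysql cursor and the column layout implied above (result[19] = relative file path, result[21] = publish time); the connection settings are hypothetical, since the commit gets cursor_ from elsewhere:

import pymysql

# Hypothetical connection settings; not part of the commit.
cnx = pymysql.connect(host='localhost', user='root', password='***',
                      database='clb', charset='utf8mb4')
cursor_ = cnx.cursor()

# Bulk-select all CSRC (证监会) attachments, as in the new query.
cursor_.execute("SELECT * FROM clb_sys_attachment WHERE type_id=1 AND source='证监会'")
for result in cursor_.fetchall():
    full_path = 'http://zzsn.luyuen.com/' + result[19]  # new host + stored relative path
    publish = str(result[21])                           # record's real publish timestamp
    # ... fetch the PDF, then emit dic_news with 'origin': '证监会' and 'publishDate': publish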
comData/annualReport/证监会-年报.py
 import json
@@ -125,8 +125,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
         except Exception as e:
             # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
-            year = int(pub_time) - 1
-            year = str(year)
+            year = int(pub_time[:4]) - 1
+            # year = str(year)
         # page_size = 0
@@ -322,7 +322,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # 获取企业信息
         social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
-        # social_code = '91100000100003962T'
+        # social_code = '91210800765420138L'
         if not social_code:
             time.sleep(20)
             continue
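The year fix is the substantive change here: pub_time evidently carries a full timestamp rather than a bare year, so int(pub_time) would raise ValueError; slicing the first four characters before converting yields the publication year, and subtracting one gives the fiscal year the report covers. A worked example with a hypothetical pub_time value (the commit only shows that the first four characters are the year):

pub_time = '2023-04-28'          # assumed format
year = int(pub_time[:4]) - 1     # 2022: a report published in 2023 covers fiscal year 2022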
comData/annualReport1014/report.py
@@ -180,6 +180,7 @@ if __name__=='__main__':
             #retData, com_name, year, pdf_name, num, pub_time
             att_id = baseCore.tableUpdate(retData_f, cname, file_year, file_name, num, file_year + '-12-31', origin)
             if att_id:
+                detect_language = baseCore.detect_language(content)
                 dic_news = {
                     'attachmentIds': att_id,
                     'author': '',
@@ -189,7 +190,7 @@ if __name__=='__main__':
                     'deleteFlag': '0',
                     'id': '',
                     'keyWords': '',
-                    'lang': 'zh',
+                    'lang': detect_language,
                     'origin': origin,
                     'publishDate': file_year + '-12-31',
                     'sid': '1684032033495392257',
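The commit does not show baseCore.detect_language itself, only that it replaces the hard-coded 'zh'. As a stand-in for reading the diff, a helper of the same shape could be built on the langdetect package (an assumption, not the project's implementation):

from langdetect import detect

def detect_language(content):
    # Hypothetical stand-in for baseCore.detect_language.
    # langdetect returns ISO-style codes such as 'zh-cn' or 'en'.
    try:
        return detect(content)
    except Exception:
        return 'zh'  # fall back to the previously hard-coded default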
comData/negative_news/creditchina.py
@@ -12,11 +12,14 @@
     pageSize: 10
 }
 """
 import json
 import time
 from urllib import parse
+import redis
 import requests
 from bs4 import BeautifulSoup
+from kafka import KafkaProducer
+from retry import retry
 from base.BaseCore import BaseCore
@@ -24,6 +27,41 @@ baseCore = BaseCore()
 log = baseCore.getLogger()
 cnx = baseCore.cnx
 cursor = baseCore.cursor
+r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
 taskType = '企业负面新闻'
+
+def sendKafka(dic_news):
+    start_time = time.time()
+    try:
+        # 114.116.116.241
+        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
+        kafka_result = producer.send("crawlerInfo", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+        print(kafka_result.get(timeout=10))
+        dic_result = {
+            'success': 'ture',
+            'message': '操作成功',
+            'code': '200',
+        }
+        log.info(dic_result)
+        # 传输成功,写入日志中
+        state = 1
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        return True
+    except Exception as e:
+        dic_result = {
+            'success': 'false',
+            'message': '操作失败',
+            'code': '204',
+            'e': e
+        }
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, dic_news['title'], 'Kafka操作失败')
+        log.info(dic_result)
+        return False

 @retry(tries=3, delay=1)
 def getRequest(url, headers):
@@ -51,6 +89,11 @@ def dishonesty(headers,com_name,social_code):
     if json_data['status'] == 1:
         pass
     total_size = json_data['data']['totalSize']
+    if total_size > 0:
+        pass
+    else:
+        log.info(f'该企业{com_name}无严重失信信息')
+        return list_dishonesty
     for page in range(1, total_size + 1):
         param_page = {
             'tableName': 'credit_zgf_fr_sxbzxr',
@@ -102,7 +145,9 @@ def dishonesty(headers,com_name,social_code):
             '数据来源': dataSource
         }
         list_dishonesty.append(dic_dishonesty)
-    return list_dishonesty
+    # r.sadd('dishonesty::' +social_code , )
+    return url, list_dishonesty

 # 行政处罚
 def punish(headers, com_name, social_code):
     list_punish = []
@@ -179,7 +224,7 @@ def punish(headers,com_name,social_code):
             '数据来源单位统一社会信用代码': cf_sjlydm
         }
         list_punish.append(dic_punish)
-    return list_punish
+    return url, list_punish

 # 经营异常
 def abnormal(headers, com_name, social_code):
@@ -204,8 +249,9 @@ def abnormal(headers,com_name,social_code):
     if total_size > 0:
         pass
     else:
-        log.info()
-        for page in total_size:
+        log.info(f'该企业{com_name}无经营异常信息')
+        return list_abhormal
     for page in range(1, total_size + 1):
         param_page = {
             'tableName': 'credit_xyzx_fr_xzcf_new',
             'searchState': '1',
@@ -242,8 +288,20 @@ def abnormal(headers,com_name,social_code):
             '数据来源': dataSource
         }
         list_abhormal.append(dic_abnormal)
-    return list_abhormal
+    return url, list_abhormal
+
+def dic_data(com_name, listData, type, detailurl):
+    dic_news = {
+        'title': com_name + type,
+        'structuredData': listData,
+        'ynStructure': 1,
+        'content': '',
+        'contentHtml': '',
+        'source': '信用中国',
+        'publishtime': '',
+        'detailurl': detailurl,
+    }
+    return dic_news

 if __name__ == '__main__':
@@ -259,11 +317,20 @@ if __name__=='__main__':
     }
     com_name = '石家庄交投集团工程服务有限责任公司'
     social_code = '91130100MA7EK14C8L'
-    # list_dishonesty = dishonesty(headers,com_name,social_code)
-    # print(list_dishonesty)
-    list_punish = punish(headers, com_name, social_code)
-    print(list_punish)
-    # abnormal(headers,com_name,social_code)
+    url_dishonesty, list_dishonesty = dishonesty(headers, com_name, social_code)
+    dic_dishonesty = dic_data(com_name, list_dishonesty, '严重违法失信信息', url_dishonesty)
+    sendKafka(dic_dishonesty)
+    url_punish, list_punish = punish(headers, com_name, social_code)
+    dic_punish = dic_data(com_name, list_punish, '行政处罚信息', url_punish)
+    # print(dic_punish)
+    sendKafka(dic_punish)
+    url_abnormal, list_abnormal = abnormal(headers, com_name, social_code)
+    dic_abnormal = dic_data(com_name, list_abnormal, '经营异常信息', url_abnormal)
+    # print(dic_abnormal)
+    sendKafka(dic_abnormal)
     # 报告链接
     # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
     # report_json = getRequest(url_report, headers)
@@ -273,3 +340,4 @@ if __name__=='__main__':
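To check what the new sendKafka actually publishes, a throwaway consumer on the same topic and broker will echo the messages. A minimal sketch using kafka-python, the same library this commit imports KafkaProducer from (topic and broker taken from the diff; everything else is illustrative):

import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'crawlerInfo',                                # topic sendKafka writes to
    bootstrap_servers=['114.115.159.144:9092'],   # broker from the commit
    auto_offset_reset='earliest',
    value_deserializer=lambda b: json.loads(b.decode('utf8')),
)
for msg in consumer:
    print(msg.value['title'])  # e.g. '<com_name>严重违法失信信息'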