zzsn_spider · Commit 793652a0

Authored August 11, 2023 by 薛凌堃
Parent: 610b0b53

公告动态自动化 (automate announcement and company-news collection)

Showing 10 changed files with 419 additions and 233 deletions (+419 -233)
Files changed:
  base/BaseCore.py                          +64   -32
  base/RedisPPData.py                       +108  -36
  base/smart/smart_extractor.py             +36   -0
  comData/BaseInfo_qcc/getQccId.py          +49   -0   (new file)
  comData/annualReport_ZJH/证监会-年报.py     +41   -67
  comData/noticeReport_ZJH/证监会-公告.py     +21   -14
  comData/tcyQydt/test.py                   +22   -9
  comData/tcyQydt/tyc_qydt_add.py           +17   -15
  comData/yhcj/NewsYahooAuto.py             +0    -0   (new file)
  comData/yhcj/雅虎财经_企业动态.py            +61   -60
base/BaseCore.py

# 核心工具包
import os
import random
import socket
...
@@ -5,13 +6,18 @@ import sys
import time
import logbook
import logbook.more
# 核心工具包
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
import langid


class BaseCore:
    # 序列号
    __seq = 0
...
@@ -211,8 +217,16 @@ class BaseCore:
        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
    ]

    def __init__(self):
        self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
        self.__cursor_proxy = self.__cnx_proxy.cursor()
        self.cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
        self.cursor = self.cnx.cursor()
        # 连接到Redis
        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
        self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

    def close(self):
        try:
...
@@ -222,15 +236,7 @@ class BaseCore:
            self.cnx.close()
        except:
            pass

    def __init__(self):
        self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
        self.__cursor_proxy = self.__cnx_proxy.cursor()
        self.cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
        self.cursor = self.cnx.cursor()
        pass

    # 计算耗时
    def getTimeCost(self, start, end):
...
@@ -354,28 +360,37 @@ class BaseCore:
        str = str[0:end + 1]
        return str

    # def pullDateFromSql(self):
    #     query = "select SocialCode from EnterpriseInfo "
    #     self.cursor.execute(query)
    #     result = self.cursor.fetchall()
    #     social_list = list(result)
    #     return social_list
    #
    # def redisPushData(self,social_list):
    #
    #     #将数据插入到redis中
    #     for item in social_list:
    #         self.r.rpush('qy_socialCode', item)

    # 繁体字转简体字
    def hant_2_hans(hant_str: str):
        '''
        Function: 将 hant_str 由繁体转化为简体
        '''
        return zhconv.convert(hant_str, 'zh-hans')

    # 从Redis的List中获取并移除一个元素
    def redicPullData(self, type):
        if type == 1:
            gn_item = self.r.lpop('gnqy_socialCode')
            return gn_item.decode() if gn_item else None
        if type == 2:
            gw_item = self.r.lpop('gwqy_socialCode')
            return gw_item.decode() if gw_item else None

    # 判断字符串里是否含数字
    def str_have_num(str_num):
        panduan = False
        for str_1 in str_num:
            ppp = str_1.isdigit()
            if ppp:
                panduan = ppp
        return panduan

    # # 从Redis的List中获取并移除一个元素
    # def redicPullData(self,type,key):
    #     #1 表示国内 2 表示国外
    #     if type == 1:
    #         gn_item = self.r.lpop(key)
    #         return gn_item.decode() if gn_item else None
    #     if type == 2:
    #         gw_item = self.r.lpop(key)
    #         return gw_item.decode() if gw_item else None

    # 从Redis的List中获取并移除一个元素
    def redicPullData(self, key):
        item = self.r.lpop(key)
        return item.decode() if item else None

    # 获得脚本进程PID
    def getPID(self):
...
@@ -401,8 +416,9 @@ class BaseCore:
            "excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        chrome_options.add_argument(self.getRandomUserAgent())
        # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
        with open('../../base/stealth.min.js') as f:
            js = f.read()
...
@@ -438,4 +454,20 @@ class BaseCore:
            print(e)
        self.cnx.commit()

    def GetToken(self):
        #获取企查查token
        query = "select token from QCC_token "
        # token = '67ec7402166df1da84ae83c4b95cefc0'  # 需要隔两个小时左右抓包修改
        self.cursor.execute(query)
        token = self.cursor.fetchone()[0]

    def detect_language(self, text):
        # 使用langid.py判断文本的语言
        result = langid.classify(text)
        if result == '':
            return 'cn'
        if result[0] == '':
            return 'cn'
        return result[0]
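This commit moves BaseCore.redicPullData from a hard-coded type flag (1 = domestic, 2 = foreign) to an explicit Redis list key. Below is a minimal consumer sketch of the new calling convention (not part of the commit): the queue name is taken from this diff, the 20-second idle wait mirrors the collector scripts changed elsewhere in the commit, and process() is a hypothetical stand-in for the real per-company work.

import time
from base.BaseCore import BaseCore

baseCore = BaseCore()

def process(social_code):
    # hypothetical placeholder for the per-company collection logic
    print('handling', social_code)

while True:
    # new keyed signature: the caller names the queue instead of passing 1 or 2
    social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
    if social_code is None:   # queue drained: wait, then poll again
        time.sleep(20)
        continue
    process(social_code)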
base/RedisPPData.py

import time

import pymysql
import redis
from base import BaseCore
from apscheduler.schedulers.blocking import BlockingScheduler

basecore = BaseCore.BaseCore()
log = basecore.getLogger()
cnx = basecore.cnx
cursor = basecore.cursor
r = basecore.r
# 连接到Redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# # 连接到Redis
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
#
# cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
#                       charset='utf8mb4')
# cursor = cnx.cursor()
cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
cursor = cnx.cursor()

# def pullDateFromSql():
#     gn_query = "select SocialCode from EnterpriseInfo where Place = '1' "
#     cursor.execute(gn_query)
#     gn_result = cursor.fetchall()
#
#     gw_query = "select SocialCode from EnterpriseInfo where Place = '2' "
#     cursor.execute(gw_query)
#     gw_result = cursor.fetchall()
#
#     gw_social_list = [item[0] for item in gw_result]
#     gn_social_list = [item[0] for item in gn_result]
#     return gn_social_list,gw_social_list

def pullDateFromSql():
    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 1 "
def NewsEnterprise():
    #获取国内企业
    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
    cursor.execute(gn_query)
    gn_result = cursor.fetchall()
    gw_query = "select SocialCode from EnterpriseInfo where Place = '2' limit 1 "
    #获取国外企业
    gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
    cursor.execute(gw_query)
    gw_result = cursor.fetchall()

    gw_social_list = [item[0] for item in gw_result]
    gn_social_list = [item[0] for item in gn_result]
    return gn_social_list, gw_social_list

def redisPushData():
    # return gn_social_list, gw_social_list
    print('=======')
    gn_social_list, gw_social_list = pullDateFromSql()
    # gn_social_list,gw_social_list = pullDateFromSql()
    #将数据插入到redis中
    for item in gn_social_list:
        r.rpush('gnqy_socialCode', item)
        r.rpush('NewsEnterprise:gnqy_socialCode', item)
    for item in gw_social_list:
        r.rpush('gwqy_socialCode', item)

# 从Redis的List中获取并移除一个元素
def redicPullData(type):
    gn_item = r.lpop('gn_socialCode')
    gw_item = r.lpop('gw_socialCode')
    #1 表示国内 2 表示国外
    if type == 1:
        return gn_item.decode() if gn_item else None
    if type == 2:
        return gw_item.decode() if gw_item else None

def task(task_time):
        r.rpush('NewsEnterprise:gwqy_socialCode', item)

def NewsEnterprise_task():
    # 实例化一个调度器
    scheduler = BlockingScheduler()
    # 每天执行一次
    scheduler.add_job(NewsEnterprise, 'cron', hour=12, minute=0, max_instances=2)
    try:
        # redisPushData  # 定时开始前执行一次
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)
        pass

def NoticeEnterprise():
    # 获取国内企业
    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 1 "
    cursor.execute(gn_query)
    gn_result = cursor.fetchall()
    gn_social_list = [item[0] for item in gn_result]
    print('=======')
    for item in gn_social_list:
        r.rpush('NoticeEnterprise:gnqy_socialCode', item)

def NoticeEnterprise_task():
    # 实例化一个调度器
    scheduler = BlockingScheduler()
    # 每天执行一次
    scheduler.add_job(NoticeEnterprise, 'cron', hour=12, minute=0)
    try:
        # redisPushData  # 定时开始前执行一次
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)
        pass

def AnnualEnterprise():
    # 获取国内企业
    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
    cursor.execute(gn_query)
    gn_result = cursor.fetchall()
    gn_social_list = [item[0] for item in gn_result]
    print('=======')
    for item in gn_social_list:
        r.rpush('AnnualEnterprise:gnqy_socialCode', item)

def AnnualEnterprise_task():
    # 实例化一个调度器
    scheduler = BlockingScheduler()
    # 每年执行一次
    scheduler.add_job(AnnualEnterprise, 'cron', second='*/10')
    try:
        # redisPushData  # 定时开始前执行一次
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)
        pass

def BaseInfoEnterprise():
    # 获取国内企业
    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 1 "
    cursor.execute(gn_query)
    gn_result = cursor.fetchall()
    gn_social_list = [item[0] for item in gn_result]
    print('=======')
    for item in gn_social_list:
        r.rpush('BaseInfoEnterprise:gnqy_socialCode', item)

#企业基本信息
def BaseInfoEnterprise_task():
    # 实例化一个调度器
    scheduler = BlockingScheduler()
    # 每半分钟执行一次
    scheduler.add_job(redisPushData, 'cron', second=task_time, max_instances=3)
    # 每天早上9点执行一次
    # scheduler.add_job(self.auto_tb(), 'cron', day='*', hour=12, minute=5, start_date='2021-12-16 09:00:00',end_date='2023-11-30 23:59:59')
    # 每年执行一次
    scheduler.add_job(BaseInfoEnterprise, 'cron', second='*/10')
    try:
        # redisPushData  # 定时开始前执行一次
        scheduler.start()
...
@@ -63,8 +130,13 @@ def task(task_time):
        print('定时采集异常', e)
        pass

if __name__ == "__main__":
    start = time.time()
    task_time = '*/10'
    task(task_time)
    # NewsEnterprise_task()
    # NoticeEnterprise_task()
    AnnualEnterprise_task()
    log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
    # cnx.close()
    # cursor.close()
    # basecore.close()
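RedisPPData.py now splits the old generic push job into per-task producers (NewsEnterprise, NoticeEnterprise, AnnualEnterprise, BaseInfoEnterprise), each filling its own namespaced Redis list from a BlockingScheduler cron job. The sketch below (not part of the commit) shows that producer shape in isolation: the Redis connection details and list name come from this diff, while the hard-coded value stands in for the MySQL EnterpriseInfo query the real functions run.

import redis
from apscheduler.schedulers.blocking import BlockingScheduler

# connection parameters as they appear in this diff
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

def push_notice_companies():
    # the real NoticeEnterprise() reads SocialCode values from MySQL;
    # a fixed placeholder list stands in for that query here
    for social_code in ['placeholder_social_code']:
        r.rpush('NoticeEnterprise:gnqy_socialCode', social_code)

scheduler = BlockingScheduler()
# same cron shape as NoticeEnterprise_task: once a day at 12:00
scheduler.add_job(push_notice_companies, 'cron', hour=12, minute=0)
scheduler.start()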
base/smart/smart_extractor.py

...
@@ -50,7 +50,43 @@ class SmartExtractor:
            构造器:未指定 lang_code 参数时,默认为 cn
        """
        # 支持语言
        supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())

        # 初始化 goose 对象:
        # 1、根据语言代码,创建 goose 对象
        if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn' or lang_code == 'zh':
            # 需要分词:中文
            # 1、不指定lang_code参数,或不指定lang_code为 None 时,默认为中文分词
            # 2、Flask Web接口:未指定get参数 lang_code 时,lang_code 会接收为 None
            self.goose = Goose({'stopwords_class': StopWordsChinese})
        elif lang_code == 'ko':
            # 需要分词:韩语
            # 1、测试:只传递语言,不传递分词器
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'})  # 测试失败:正文采集为空
            # self.goose = Goose()  # 测试失败:正文采集为空
            # 韩语分词:测试成功
            self.goose = Goose({'stopwords_class': StopWordsKorean})
        elif lang_code == 'ar':
            # 需要分词:阿拉伯语
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})  # 测试失败:正文采集为空
            # self.goose = Goose()  # 测试成功
            # self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})  # 测试成功:直接传递语言编码
            self.goose = Goose({'stopwords_class': StopWordsArabic})
        elif lang_code == 'en':
            # 单独测试:英文
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
            # 测试成功:创建Goose对象时,不指定语言默认为英文分词
            self.goose = Goose()
        elif lang_code == 'ru':
            # 单独测试:俄语
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})  # 测试失败:正文采集为空
            self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})  # 测试成功:直接传递语言编码
        elif lang_code in supported_lang_code_list:
            # 其它语言编码,统一处理,不再单独测试
            self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
        else:
            # 未识别的语言代码
            raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')

    def get_extraction_result(self, article, link_text=''):
        """
...
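The enlarged constructor above selects a Goose configuration per language code (Chinese, Korean and Arabic stop-word classes, a default English extractor, and a target_language pass-through for the rest). In tyc_qydt_add.py, also changed in this commit, the language is detected per article title before a matching extractor is built. A minimal sketch of that calling pattern follows (not part of the commit); the import path and URL are assumptions for illustration.

from base.BaseCore import BaseCore
from base.smart import smart_extractor   # assumed import path

baseCore = BaseCore()

title = "Company announces quarterly results"
link = "https://example.com/news/article"   # placeholder URL

# detect_language() wraps langid.classify and falls back to 'cn'
lang = baseCore.detect_language(title)

# the constructor maps the code to one of the Goose set-ups shown above
smart = smart_extractor.SmartExtractor(lang)
contentText = smart.extract_by_url(link).text
print(contentText)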
comData/BaseInfo_qcc/getQccId.py
0 → 100644 (new file)

# -*- coding: utf-8 -*-
import time
from urllib.parse import quote

import requests
import urllib3

headers = {
    'Host': 'xcx.qcc.com',
    'Connection': 'keep-alive',
    'Qcc-Platform': 'mp-weixin',
    'Qcc-Timestamp': '',
    'Qcc-Version': '1.0.0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
    'content-type': 'application/json',
    'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
    'Accept-Encoding': 'gzip, deflate, br,'
}

# 通过企业名称或信用代码获取企查查id
def find_id_by_name(name):
    urllib3.disable_warnings()
    qcc_key = name
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
    for lll in range(1, 6):
        try:
            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
            break
        except:
            print('重试')
            time.sleep(5)
            continue
    time.sleep(2)
    if resp_dict['result']['Result']:
        result_dict = resp_dict['result']['Result'][0]
        KeyNo = result_dict['KeyNo']
        Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
        if Name == '':
            KeyNo = ''
    else:
        KeyNo = ''
    print("{},企业代码为:{}".format(qcc_key, KeyNo))
    return KeyNo
\ No newline at end of file
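getQccId.py is a new helper that resolves a company name or credit code to its Qichacha KeyNo through the mini-program search API, retrying the request up to five times. Note that the URL f-string references a token that is not defined in this file; BaseCore.GetToken (also added in this commit) reads one from the QCC_token table, so the caller is expected to supply it. A hedged usage sketch (not part of the commit), with the import path, token value and company name as placeholders:

from comData.BaseInfo_qcc import getQccId   # assumed import path

# the token normally comes from the QCC_token table (see BaseCore.GetToken);
# a placeholder value is injected into the module namespace here
getQccId.token = 'placeholder_qcc_token'

key_no = getQccId.find_id_by_name('placeholder 企业名称')
print(key_no)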
comData/annualReport_ZJH/证监会-年报.py

(diff collapsed: +41 -67, not expanded in this view)
comData/noticeReport_ZJH/证监会-公告.py

"""
"""
...
@@ -28,11 +28,11 @@ cursor_ = cnx_.cursor()
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)

taskType = '企业公告/证监会'
def RequestUrl(url, payload, social_code):
def RequestUrl(url, payload, social_code, start_time):
    # ip = get_proxy()[random.randint(0, 3)]
    start_time_url = time.time()
    taskType = '公告'
    for m in range(0, 3):
        try:
            response = requests.post(url=url, headers=headers, data=payload)  # ,proxies=ip)
...
@@ -46,19 +46,17 @@ def RequestUrl(url, payload, social_code):
    if response.status_code == 200:
        # 请求成功,处理响应数据
        # print(response.text)
        soup = BeautifulSoup(response.text, 'html.parser')
        pass
    else:
        # 请求失败,输出错误信息
        log.error('请求失败:', url)
        state = 0
        takeTime = baseCore.getTimeCost(start_time_url, time.time())
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
        soup = BeautifulSoup(response.text, 'html.parser')
        soup = ''
    return soup

def getUrl(code, url_parms, Catagory2_parms):
    # 深市
    if code[0] == '2' or code[0] == '0' or code[0] == '3':
...
@@ -147,6 +145,7 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type
        return inster

    # 信息插入数据库
    try:
        insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        list_info = [
...
@@ -165,10 +164,14 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type
        cnx_.commit()
        insert = True
        return insert
    except:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
        return insert

def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time):
    taskType = '公告'
    sel_sql = "select article_id from brpa_source_article where source_address = %s"
    cursor_.execute(sel_sql, pdf_url)
    row = cursor_.fetchone()
...
@@ -251,7 +254,8 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
    short_name = dic_info[4]
    soup = RequestUrl(url, payload, social_code, start_time)
    if soup == '':
        return
    # 先获取页数
    page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
...
@@ -274,6 +278,8 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
        href = url.split('index')[0] + f'index_{i}_f.html'
        soup = RequestUrl(href, payload, social_code, start_time)
        if soup == '':
            continue
        tr_list = soup.find('div', id='txt').find_all('tr')
        pageIndex = 0
        for tr in tr_list[1:]:
...
@@ -303,7 +309,7 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
                log.info(f'{short_name}==============解析传输操作成功')
                state = 1
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, '公告', state, takeTime, pdf_url, '')
                baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
                pass
            else:
                errorCount += 1
...
@@ -354,8 +360,9 @@ if __name__ == '__main__':
    while True:
        start_time = time.time()
        # 获取企业信息
        social_code = ''
        if social_code == '':
            social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == None:
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
...
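The 证监会-公告.py change threads the overall start_time into RequestUrl so failures are logged against the whole task rather than a per-request timer, and returns an empty string so callers can skip the page. Below is a small standalone sketch of that bounded-retry-then-sentinel pattern (not part of the commit); the record_log callback is a stand-in for baseCore.recordLog.

import time
import requests
from bs4 import BeautifulSoup

def fetch_soup(url, payload, headers, social_code, start_time, record_log):
    """POST with up to three attempts; return '' on failure, like RequestUrl in this diff."""
    response = None
    for m in range(0, 3):
        try:
            response = requests.post(url=url, headers=headers, data=payload)
            break
        except Exception:
            time.sleep(5)
    if response is not None and response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    # on failure, log how long the whole task has been running, not just this request
    record_log(social_code, time.time() - start_time, url)
    return ''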
comData/tcyQydt/test.py

import json

from bs4 import BeautifulSoup
import langid
from base.BaseCore import BaseCore
baseCore = BaseCore()
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
#
# def detect_language(text):
#     # 使用langid.py判断文本的语言
#     lang, confidence = langid.classify(text)
#     print(lang,confidence)
#     return lang

# detect_language("123")
# sample JSONP stock-quote payload used to exercise getSubStr and json.loads below
s = 'jQuery1124020359136113854692_1688967721474({"rc":0,"rt":6,"svr":182993358,"lt":1,"full":1,"dlmkts":"","data":{"total":5488,"diff":[{"f1":2,"f2":35.37,"f3":130.87,"f4":20.05,"f5":505082,"f6":1561753667.0,"f7":72.85,"f8":73.63,"f9":79.87,"f10":"-","f11":-0.34,"f12":"603119","f13":1,"f14":"N浙荣","f15":37.54,"f16":26.38,"f17":28.88,"f18":15.32,"f20":9903600000,"f21":2426214099,"f22":-0.03,"f23":6.46,"f24":130.87,"f25":130.87,"f62":503279629.0,"f115":70.77,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":70.7,"f3":26.98,"f4":15.02,"f5":278191,"f6":2015432017.69,"f7":19.83,"f8":73.92,"f9":44.38,"f10":"-","f11":0.41,"f12":"301371","f13":0,"f14":"N敷尔佳","f15":80.04,"f16":69.0,"f17":80.0,"f18":55.68,"f20":28285656000,"f21":2660599297,"f22":0.11,"f23":5.64,"f24":26.98,"f25":26.98,"f62":476657031.0,"f115":33.47,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":27.6,"f3":20.0,"f4":4.6,"f5":135775,"f6":348360366.27,"f7":21.04,"f8":33.94,"f9":212.8,"f10":3.1,"f11":0.0,"f12":"301316","f13":0,"f14":"慧博云通","f15":27.6,"f16":22.76,"f17":23.11,"f18":23.0,"f20":11040276000,"f21":1104274261,"f22":0.0,"f23":11.68,"f24":18.1,"f25":44.43,"f62":107348086.0,"f115":124.43,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":43.62,"f3":20.0,"f4":7.27,"f5":75204,"f6":311935188.44,"f7":21.79,"f8":29.67,"f9":56.11,"f10":13.27,"f11":0.0,"f12":"301289","f13":0,"f14":"国缆检测","f15":43.62,"f16":35.7,"f17":36.61,"f18":36.35,"f20":3402360000,"f21":1105762682,"f22":0.0,"f23":3.86,"f24":28.26,"f25":35.55,"f62":80534335.0,"f115":47.25,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":40.98,"f3":20.0,"f4":6.83,"f5":118733,"f6":464542197.42,"f7":20.73,"f8":40.73,"f9":56.02,"f10":2.57,"f11":0.0,"f12":"300881","f13":0,"f14":"盛德鑫泰","f15":40.98,"f16":33.9,"f17":33.9,"f18":34.15,"f20":4507800000,"f21":1194567000,"f22":0.0,"f23":5.48,"f24":23.81,"f25":42.05,"f62":16802132.0,"f115":56.01,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":21.0,"f3":19.45,"f4":3.42,"f5":50301,"f6":97244231.42,"f7":16.1,"f8":16.87,"f9":46.64,"f10":1.95,"f11":1.35,"f12":"873576","f13":0,"f14":"天力复合","f15":21.0,"f16":18.17,"f17":18.18,"f18":17.58,"f20":2247000000,"f21":626162250,"f22":0.72,"f23":5.16,"f24":50.21,"f25":50.21,"f62":11286257.0,"f115":29.96,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":76.8,"f3":16.21,"f4":10.71,"f5":153518,"f6":1100431330.98,"f7":23.24,"f8":73.58,"f9":190.79,"f10":1.6,"f11":0.27,"f12":"301315","f13":0,"f14":"威士顿","f15":79.31,"f16":63.95,"f17":63.95,"f18":66.09,"f20":6758400000,"f21":1602347750,"f22":0.17,"f23":7.03,"f24":137.84,"f25":137.84,"f62":112419255.0,"f115":102.68,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":72.99,"f3":16.17,"f4":10.16,"f5":106236,"f6":714127513.24,"f7":23.68,"f8":52.41,"f9":123.41,"f10":1.71,"f11":0.4,"f12":"301141","f13":0,"f14":"中科磁业","f15":74.88,"f16":60.0,"f17":62.85,"f18":62.83,"f20":6466528467,"f21":1479619267,"f22":0.07,"f23":3.14,"f24":96.74,"f25":78.02,"f62":-26422445.0,"f115":87.31,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":27.3,"f3":12.81,"f4":3.1,"f5":171865,"f6":442577004.48,"f7":15.25,"f8":7.3,"f9":-156.2,"f10":0.94,"f11":-0.15,"f12":"300551","f13":0,"f14":"古鳌科技","f15":27.55,"f16":23.86,"f17":24.2,"f18":24.2,"f20":9439055235,"f21":6427896275,"f22":-0.11,"f23":8.93,"f24":48.37,"f25":133.73,"f62":16013778.0,"f115":-126.12,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":84.3,"f3":12.18,"f4":9.15,"f5":124022,"f6":989104033.4,"f7":17.33,"f8":64.35,"f9":99.53,"f10":1.15,"f11":0.19,"f12":"301398","f13":0,"f14":"星源卓镁","f15":86.5,"f16":73.48,"f17":75.48,"f18":75.15,"f20":6744000000,"f21":1624735481,"f22":-0.04,"f23":6.81,"f24":157.88,"f25":173.35,"f62":-26812467.0,"f115":105.29,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":34.85,"f3":10.95,"f4":3.44,"f5":27626,"f6":95746251.0,"f7":9.87,"f8":7.18,"f9":-37.27,"f10":9.74,"f11":-0.03,"f12":"688622","f13":1,"f14":"禾信仪器","f15":36.0,"f16":32.9,"f17":35.0,"f18":31.41,"f20":2439416569,"f21":1341637317,"f22":-0.03,"f23":4.74,"f24":-5.76,"f25":7.23,"f62":18152096.0,"f115":-36.22,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":87.8,"f3":10.66,"f4":8.46,"f5":22037,"f6":184811228.0,"f7":11.33,"f8":6.52,"f9":116.36,"f10":4.84,"f11":1.09,"f12":"688776","f13":1,"f14":"国光电气","f15":87.99,"f16":79.0,"f17":79.0,"f18":79.34,"f20":9516064188,"f21":2968587801,"f22":-0.22,"f23":5.39,"f24":-5.88,"f25":-29.79,"f62":2907315.0,"f115":65.69,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.05,"f3":10.22,"f4":0.19,"f5":3258788,"f6":657251653.18,"f7":9.68,"f8":6.48,"f9":-12.82,"f10":3.95,"f11":0.0,"f12":"000413","f13":0,"f14":"东旭光电","f15":2.05,"f16":1.87,"f17":1.87,"f18":1.86,"f20":11547137393,"f21":10310048690,"f22":0.0,"f23":0.52,"f24":17.82,"f25":15.82,"f62":213263692.0,"f115":-8.55,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.7,"f3":10.2,"f4":0.25,"f5":1107878,"f6":291343381.08,"f7":11.84,"f8":7.94,"f9":-19.65,"f10":2.01,"f11":0.0,"f12":"002256","f13":0,"f14":"兆新股份","f15":2.7,"f16":2.41,"f17":2.44,"f18":2.45,"f20":5082512054,"f21":3769280384,"f22":0.0,"f23":4.31,"f24":11.11,"f25":12.97,"f62":96164236.0,"f115":-99.3,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.92,"f3":10.19,"f4":0.27,"f5":1178068,"f6":333498626.0,"f7":9.06,"f8":7.34,"f9":7.63,"f10":1.4,"f11":0.0,"f12":"600239","f13":1,"f14":"云南城投","f15":2.92,"f16":2.68,"f17":2.69,"f18":2.65,"f20":4688605774,"f21":4688605774,"f22":0.0,"f23":2.89,"f24":28.07,"f25":51.3,"f62":27795948.0,"f115":-16.59,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":3.15,"f3":10.14,"f4":0.29,"f5":2973491,"f6":920586623.66,"f7":8.74,"f8":28.9,"f9":-7.07,"f10":4.18,"f11":0.0,"f12":"002630","f13":0,"f14":"华西能源","f15":3.15,"f16":2.9,"f17":2.95,"f18":2.86,"f20":3719520000,"f21":3240482440,"f22":0.0,"f23":4.9,"f24":26.51,"f25":7.14,"f62":-18293260.0,"f115":-5.07,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":4.79,"f3":10.11,"f4":0.44,"f5":1857359,"f6":864538200.0,"f7":10.8,"f8":9.31,"f9":24.64,"f10":9.05,"f11":0.0,"f12":"600577","f13":1,"f14":"精达股份","f15":4.79,"f16":4.32,"f17":4.35,"f18":4.35,"f20":9959122877,"f21":9559956211,"f22":0.0,"f23":2.07,"f24":14.05,"f25":16.26,"f62":161845983.0,"f115":26.21,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":4.36,"f3":10.1,"f4":0.4,"f5":617159,"f6":264661451.0,"f7":11.62,"f8":2.74,"f9":122.48,"f10":3.79,"f11":0.0,"f12":"601777","f13":1,"f14":"力帆科技","f15":4.36,"f16":3.9,"f17":3.95,"f18":3.96,"f20":19931840280,"f21":9811962000,"f22":0.0,"f23":1.94,"f24":24.22,"f25":12.95,"f62":41966291.0,"f115":137.9,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":3.27,"f3":10.1,"f4":0.3,"f5":290547,"f6":93712867.6,"f7":8.08,"f8":2.28,"f9":1394.52,"f10":1.03,"f11":0.0,"f12":"002175","f13":0,"f14":"东方智造","f15":3.27,"f16":3.03,"f17":3.04,"f18":2.97,"f20":4175072977,"f21":4175040277,"f22":0.0,"f23":8.53,"f24":13.54,"f25":-8.66,"f62":52561839.0,"f115":41.98,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2},{"f1":2,"f2":2.51,"f3":10.09,"f4":0.23,"f5":1715205,"f6":423246793.0,"f7":10.96,"f8":5.97,"f9":-4.84,"f10":2.8,"f11":0.0,"f12":"600569","f13":1,"f14":"安阳钢铁","f15":2.51,"f16":2.26,"f17":2.26,"f18":2.28,"f20":7209777679,"f21":7209777679,"f22":0.0,"f23":1.02,"f24":17.84,"f25":21.26,"f62":88473646.0,"f115":-2.55,"f128":"-","f140":"-","f141":"-","f136":"-","f152":2}]}});'
from textblob import TextBlob
baseCore = BaseCore()
jsondata = baseCore.getSubStr(s, '{', '}')

def detect_language(text):
    blob = TextBlob(text)
    lang = blob.detect_language()
    return lang

retJsonData = json.loads(jsondata)
text = "Hello, how are you?"
language = detect_language(text)
print(language)
dataList = retJsonData['data']['diff']
print(len(dataList))
for dataIndex in range(len(dataList)):
    print(f"{dataList[dataIndex]['f12']}----{dataList[dataIndex]['f14']}")
\ No newline at end of file
comData/tcyQydt/tyc_qydt_add.py

...
@@ -31,7 +31,7 @@ import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
jieba.cut("必须加载jieba")
# 初始化,设置中文分词
smart = smart_extractor.SmartExtractor('cn')
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
...
@@ -51,9 +51,10 @@ headers = {
}
taskType = '企业动态/天眼查'

def beinWork(tyc_code, social_code):
    start_time = time.time()
    taskType = '企业动态'
    time.sleep(3)
    # retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
    retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
...
@@ -67,10 +68,6 @@ def beinWork(tyc_code, social_code):
            # time.sleep(random.randint(3, 5))
            break
        except Exception as e:
            log.error(f"request请求异常----{m}-----{e}")
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
            pass
    if (response.status_code == 200):
...
@@ -87,7 +84,7 @@ def beinWork(tyc_code, social_code):
        total = json_1['data']['total']
    except:
        log.error(f"{tyc_code}-----获取总数失败")
        e = '获取总是失败'
        e = '获取总数失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
...
@@ -171,6 +168,8 @@ def beinWork(tyc_code, social_code):
            time_format = baseCore.getNowTime(1)
            try:
                # 开始进行智能解析
                lang = baseCore.detect_language(title)
                smart = smart_extractor.SmartExtractor(lang)
                contentText = smart.extract_by_url(link).text
                # time.sleep(3)
            except Exception as e:
...
@@ -236,6 +235,13 @@ def beinWork(tyc_code, social_code):
                'socialCreditCode': social_code,
                'year': time_format[:4]
            }
            except Exception as e:
                log.info(f'传输失败:{social_code}----{link}')
                e = '数据库传输失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
                continue
            # print(dic_news)
            # 将相应字段通过kafka传输保存
            try:
...
@@ -264,16 +270,11 @@ def beinWork(tyc_code, social_code):
                    'e': e
                }
                log.error(dic_result)
                e = str(e) + '操作失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
            except Exception as e:
                log.info(f'传输失败:{social_code}----{link}')
                e = '传输失败'
                e = 'Kafka操作失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
            log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
...
@@ -287,8 +288,9 @@ def beinWork(tyc_code, social_code):
# 日志信息保存至现已创建好数据库中,因此并没有再对此前保存日志信息数据库进行保存
def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData(1)
        social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == 'None':
            time.sleep(20)
...
comData/yhcj/NewsYahooAuto.py
0 → 100644 (new empty file)

comData/yhcj/雅虎财经_企业动态.py

(diff collapsed: +61 -60, not expanded in this view)