Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
2aca66c0
提交
2aca66c0
authored
12月 06, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
es操作
上级
61eecac3
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
144 行增加
和
69 行删除
+144
-69
deletebyid.py
comData/YanBao/deletebyid.py
+1
-2
es_mysql.py
comData/annualReport1023/es_mysql.py
+16
-12
uptoes.py
comData/annualReport1023/uptoes.py
+122
-50
Esmethod.py
estool/Esmethod.py
+5
-5
没有找到文件。
comData/YanBao/deletebyid.py
浏览文件 @
2aca66c0
...
...
@@ -22,8 +22,7 @@ cnx_ = baseCore.cnx_
cursor_
=
cnx_
.
cursor
()
lock
=
threading
.
Lock
()
pathType_
=
'QYResearchReport/'
taskType
=
'企业研报/东方财富网'
pool
=
redis
.
ConnectionPool
(
host
=
"114.115.236.206"
,
port
=
6379
,
password
=
'clbzzsn'
,
db
=
6
)
...
...
comData/annualReport1023/es_mysql.py
浏览文件 @
2aca66c0
import
pandas
as
pd
import
urllib3
import
BaseCore
from
elasticsearch
import
Elasticsearch
...
...
@@ -11,20 +10,25 @@ es = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('el
baseCore
=
BaseCore
.
BaseCore
()
log
=
baseCore
.
getLogger
()
r
=
baseCore
.
r
cnx
=
baseCore
.
cnx
cursor
=
baseCore
.
cursor
cnx_11
=
baseCore
.
cnx_
cursor_11
=
baseCore
.
cursor_
def getList():
    """Queue the attachments that match the rows of the ranking workbook.

    Reads ``./中国500强榜单年报.xlsx`` (sheet ``Sheet1``) and, for every row,
    looks up ``clb_sys_attachment`` records with ``type_id = 1`` whose
    ``item_id`` and ``year`` equal the row's credit code and report year.
    Each match is pushed to the Redis list ``NianBao:info`` as
    ``item_id|id|year``; rows without any match are logged.
    """
    df = pd.read_excel('./中国500强榜单年报.xlsx', sheet_name='Sheet1')
    sql = ('Select id,item_id,year from clb_sys_attachment '
           'where type_id = 1 and item_id = %s and year = %s')
    # Walk the two columns in lockstep instead of indexing by row number.
    for social_code, year in zip(df['企业信用代码'], df['年报年份']):
        cursor_11.execute(sql, (social_code, year))
        datas = cursor_11.fetchall()
        if not datas:
            log.info(f'{social_code}|{year}======不存在')
            continue
        print('=======')
        # Column order follows the SELECT list: id, item_id, year.
        for att_id, item_id, att_year in datas:
            r.rpush("NianBao:info", f'{item_id}|{att_id}|{att_year}')
if
__name__
==
"__main__"
:
getList
()
comData/annualReport1023/uptoes.py
浏览文件 @
2aca66c0
"""
年报上传到es
content 需要重新解析
lang语言
origin 来源 从数据库中获取
title 从数据库中获取
dic_info = {
'attachmentIds': att_id,
'author': '',
'content': content,
'contentWithTag': '',
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': lang,
'origin': origin,
'publishDate': datetime_string,
'sid': '1684032033495392257',
'sourceAddress': year_url, # 原文链接
'summary': '',
'title': name_pdf.replace('.pdf', ''),
'type': 1,
'socialCreditCode': social_code,
'year': year
}
"""
# 1.无年份和信用代码 另外存一个redis的key中
# 2.有信用代码 id 年份的 (1)es 中的id 需要更新为附件表中的id
# (2)通过信用代码 查出名称相同的个数 如果有两个的话,说明其中有一个没有在es库中 需要把es库中的id获取到,并删除没有在es库中的那个记录
...
...
@@ -32,11 +6,14 @@ dic_info = {
#todo:查出有一条记录的,先更新 其他的先保存到另一个redis中
import
json
import
threading
import
redis
import
requests
,
re
,
time
,
pymysql
,
fitz
import
urllib3
from
kafka
import
KafkaProducer
from
base
import
BaseCore
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
...
...
@@ -51,45 +28,139 @@ lock = threading.Lock()
taskType
=
'企业年报'
pathType
=
'QYYearReport/'
def
secrchATT
(
type_id
,
xydm
):
sel_sql
=
'''select * from clb_sys_attachment where item_id=
%
s '''
def sendKafka(dic_news):
    """Publish one document to the ``researchReportTopic`` Kafka topic.

    Args:
        dic_news: JSON-serialisable dict; it is dumped with
            ``ensure_ascii=False`` and sent UTF-8 encoded.

    Returns:
        True when the send was acknowledged within 10 seconds, False when
        anything raised while connecting, serialising or sending (the
        failure is logged together with the exception).
    """
    producer = None
    try:
        # 114.116.116.241  (NOTE(review): looks like an alternate broker address — confirm)
        producer = KafkaProducer(
            bootstrap_servers=['114.115.159.144:9092'],
            max_request_size=1024 * 1024 * 20,
        )
        payload = json.dumps(dic_news, ensure_ascii=False).encode('utf8')
        kafka_result = producer.send("researchReportTopic", payload)
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e,
        }
        log.info(dic_result)
        return False
    finally:
        # A producer is created per call, so it must be closed per call;
        # otherwise its connections and sender thread accumulate.
        if producer is not None:
            producer.close()
def getContent(file_href):
    """Download a PDF and return the text of all of its pages.

    Args:
        file_href: URL of the PDF to fetch.

    Returns:
        The text of every page, in page order, concatenated into one string.

    Raises:
        requests.RequestException: when all three download attempts fail
            (the last failure is re-raised instead of falling through with
            no response object).
    """
    headers = {'User-Agent': baseCore.getRandomUserAgent()}
    attempts = 3
    for attempt in range(attempts):
        try:
            response = requests.get(file_href, headers=headers, verify=False, timeout=20)
            break
        except requests.RequestException:
            if attempt == attempts - 1:
                # Out of retries: surface the real network error.
                raise
            time.sleep(3)

    with fitz.open(stream=response.content, filetype='pdf') as doc:
        page_size = doc.page_count
        log.info(f'当前页码----{page_size}')
        return ''.join(page.get_text() for page in doc.pages())
def secrchATT(type_id, xydm, year):
    """Fetch the attachment rows for one item, attachment type and year.

    Args:
        type_id: value compared with ``clb_sys_attachment.type_id``.
        xydm: value compared with ``clb_sys_attachment.item_id``.
        year: value compared with ``clb_sys_attachment.year``.

    Returns:
        The result of ``cursor_.fetchall()`` for the query.
    """
    sel_sql = '''select * from clb_sys_attachment where item_id=%s and type_id=%s and year=%s'''
    # cursor_ is a module-level cursor; keep the execute/fetch pair under the
    # lock, and let the context manager release it even if the query raises.
    with lock:
        # Parameter order must follow the placeholders: item_id, type_id, year.
        cursor_.execute(sel_sql, (xydm, type_id, year))
        return cursor_.fetchall()
def selectShortName(xydm):
    """Fetch one ``sys_base_enterprise`` row by social credit code.

    Args:
        xydm: value compared with ``social_credit_code``.

    Returns:
        The result of ``cursor_.fetchone()`` for the query.
    """
    sel_sql = "select * from sys_base_enterprise where social_credit_code = %s"
    # The context manager guarantees the lock is released when execute or
    # fetchone raises, so other threads are not blocked forever.
    with lock:
        cursor_.execute(sel_sql, xydm)
        return cursor_.fetchone()
def main():
    """Process one queued record and hand the resulting document to Kafka.

    Pops one ``xydm|att_id|year`` entry from the Redis list ``NianBao:id``.
    Entries with an empty credit code or year are pushed to the ``info``
    list; entries matching more than one attachment row are pushed to
    ``NianBao:info``; entries whose single attachment row has an empty file
    name are pushed to ``Noname:info``. For the remaining single-match case
    the PDF text is extracted, its language detected, and the assembled
    document is passed to ``sendKafka``. Entries with no matching row are
    dropped without further action.
    """
    redis_conn = redis.Redis(connection_pool=pool)
    info_ = redis_conn.lpop("NianBao:id")
    if not info_:
        log.info("++++已没有数据++++")
        return

    # lpop returns bytes unless the connection decodes responses; accept both.
    info = info_.decode() if isinstance(info_, bytes) else info_
    parts = info.split('|')
    xydm = parts[0]
    att_id = parts[1]
    year = parts[2]

    if not xydm or not year:
        redis_conn.lpush('info', info)
        return

    selects = secrchATT('1', xydm, year)
    if len(selects) > 1:
        redis_conn.lpush('NianBao:info', info)
        return
    if len(selects) != 1:
        return

    # Positional access into a `select *` row of clb_sys_attachment.
    select = selects[0]
    name = select[1]
    if not name:
        redis_conn.lpush('Noname:info', info)
        return
    # Title is the stored name up to its first dot.
    file_name = name.split('.')[0]
    log.info(f'-----------{file_name}-----------')

    origin = select[18]
    create_time = select[13]
    publishDate = select[21]
    # NOTE(review): this comparison only matches when the column comes back
    # as a string — confirm the column type.
    if publishDate == '2023-12-31':
        publishDate = '2023-08-31'
    file_href = 'http://zzsn.luyuen.com' + str(select[5])

    content = getContent(file_href)
    lang = baseCore.detect_language(content)
    if lang == 'cn':
        lang = 'zh'

    dic_info = {
        'attachmentIds': att_id,
        'author': '',
        'content': content,
        'contentWithTag': '',
        'createDate': str(create_time),
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': lang,
        'origin': origin,
        'publishDate': publishDate,
        'sid': '1684032033495392257',
        'sourceAddress': '',  # link to the original document
        'summary': '',
        'title': file_name,
        'type': 1,
        'socialCreditCode': xydm,
        'year': year,
    }
    sendKafka(dic_info)
    time.sleep(1)
def
run_threads
(
num_threads
):
threads
=
[]
...
...
@@ -105,8 +176,9 @@ def run_threads(num_threads):
thread
.
join
()
if __name__ == "__main__":
    # Keep draining the queue: each pass starts the worker threads, waits for
    # them via run_threads, and logs how long the pass took.
    num_threads = 5
    while True:
        start = time.time()
        run_threads(num_threads)
        log.info(f'5线程 总耗时{time.time() - start}秒')
estool/Esmethod.py
浏览文件 @
2aca66c0
...
...
@@ -162,11 +162,11 @@ class EsMethod(object):
'query'
:
{
'bool'
:
{
'should'
:[
# {'term':{'origin'
: '雪球网
'}},
{
'term'
:{
'type'
:
1
}},
# {'term':{'origin'
: '证监会
'}},
{
'term'
:{
'type'
:
1
}},
],
'must'
:
[
{
'match'
:
{
'title'
:
'.
pdf
'
}}
{
'match'
:
{
'title'
:
'.
PDF
'
}}
]
}
},
...
...
@@ -221,11 +221,11 @@ if __name__ == '__main__':
for
mms
in
msglist
:
id
=
mms
[
'_source'
][
'id'
]
title
=
mms
[
'_source'
][
'title'
]
utitle
=
title
.
replace
(
'.
pdf
'
,
''
)
utitle
=
title
.
replace
(
'.
PDF
'
,
''
)
print
(
f
'id:{id}---title:{title}--utitle:{utitle}'
)
esMethod
.
updateaunn
(
esMethod
.
index_name
,
str
(
id
),
utitle
)
# esMethod.delete(esMethod.index_name,str(id))
print
(
'跟
新成功!!'
)
# print('更
新成功!!')
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论