Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
35396c0b
提交
35396c0b
authored
11月 01, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
11/01
上级
bf1890e2
显示空白字符变更
内嵌
并排
正在显示
7 个修改的文件
包含
125 行增加
和
31 行删除
+125
-31
RedisPPData.py
base/RedisPPData.py
+0
-0
BaseCore.py
comData/annualReport1014/BaseCore.py
+1
-1
report.py
comData/annualReport1014/report.py
+4
-4
report1.py
comData/annualReport1014/report1.py
+44
-15
delete_obsfile.py
comData/noticeReport/delete_obsfile.py
+30
-0
get_tokenCookies.py
comData/weixin_solo/get_tokenCookies.py
+1
-1
Esmethod.py
estool/Esmethod.py
+45
-10
没有找到文件。
base/RedisPPData.py
浏览文件 @
35396c0b
This source diff could not be displayed because it is too large. You can
view the blob
instead.
comData/annualReport1014/BaseCore.py
浏览文件 @
35396c0b
...
...
@@ -666,7 +666,7 @@ class BaseCore:
self
.
cnx_
.
commit
()
# 插入到att表 返回附件id
def
tableUpdate
(
self
,
retData
,
com_name
,
year
,
pdf_name
,
num
,
pub_time
,
origin
):
def
tableUpdate
(
self
,
retData
,
year
,
pdf_name
,
num
,
pub_time
,
origin
):
item_id
=
retData
[
'item_id'
]
type_id
=
retData
[
'type_id'
]
group_name
=
retData
[
'group_name'
]
...
...
comData/annualReport1014/report.py
浏览文件 @
35396c0b
...
...
@@ -28,8 +28,8 @@ pathType = 'QYYearReport/'
type_id
=
1
create_by
=
'XueLingKun'
taskType
=
'企业年报'
#付俊雪的需要改为巨潮资讯网1_福布斯2000_PDF_60_付
file_path
=
'D:
\\
年报
\\
失败
'
file_path
=
'D:
\\
年报
\\
欧盟记分牌2500_年报补充_718_20231018
'
log
.
info
(
f
'=============当前pid为{baseCore.getPID()}=============='
)
def
sendKafka
(
dic_news
):
...
...
@@ -146,9 +146,9 @@ if __name__=='__main__':
social_code
=
data
[
1
]
ename
=
data
[
2
]
cname
=
data
[
3
]
file_name
=
c
name
+
':'
+
file_year
+
'年年度报告'
+
'.pdf'
file_name
=
e
name
+
':'
+
file_year
+
'年年度报告'
+
'.pdf'
content
=
''
origin
=
c
name
+
'官网'
origin
=
e
name
+
'官网'
#解析文件页数和内容
log
.
info
(
f
"-----------正在处理{file_name}--------------"
)
with
open
(
pdf_path
,
'rb'
)
as
file
:
...
...
comData/annualReport1014/report1.py
浏览文件 @
35396c0b
...
...
@@ -3,7 +3,9 @@
"""
import
json
import
os
import
re
import
time
import
uuid
from
kafka
import
KafkaProducer
from
obs
import
ObsClient
...
...
@@ -24,12 +26,15 @@ baseCore = BaseCore.BaseCore()
log
=
baseCore
.
getLogger
()
cnx
=
baseCore
.
cnx
cursor
=
baseCore
.
cursor
cnx_
=
baseCore
.
cnx_
cursor_
=
baseCore
.
cursor_
pathType
=
'QYYearReport/'
type_id
=
1
create_by
=
'XueLingKun'
taskType
=
'企业年报'
#付俊雪的需要改为巨潮资讯网
file_path
=
'D:
\\
BaiduNetdiskDownload
\\
1_福布斯2000_PDF_50_郑
'
file_path
=
'D:
\\
年报
\\
年度报告
\\
中石化炼化工程年度报告
'
log
.
info
(
f
'=============当前pid为{baseCore.getPID()}=============='
)
def
sendKafka
(
dic_news
):
...
...
@@ -66,6 +71,10 @@ def sendKafka(dic_news):
log
.
info
(
dic_result
)
return
False
def
getuuid
():
get_timestamp_uuid
=
uuid
.
uuid1
()
# 根据 时间戳生成 uuid , 保证全球唯一
return
get_timestamp_uuid
def
uptoOBS
(
retData
,
pathType
,
taskType
,
start_time
,
file_name
,
pdf_path
):
"""
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
...
...
@@ -92,7 +101,8 @@ def uptoOBS(retData, pathType, taskType, start_time,file_name,pdf_path):
'category'
:
category
,
'file_size'
:
file_size
,
'status'
:
status
,
'create_by'
:
create_by
,
'create_time'
:
create_time
,
'page_size'
:
page_size
,
'content'
:
content
}
try
:
result
=
getOBSres
(
pathType
,
file_name
,
pdf_path
)
name
=
str
(
getuuid
())
+
'.pdf'
result
=
getOBSres
(
pathType
,
name
,
pdf_path
)
except
:
log
=
baseCore
.
getLogger
()
log
.
error
(
f
'OBS发送失败'
)
...
...
@@ -117,6 +127,13 @@ def getOBSres(pathType, name, response):
result
=
obsClient
.
putFile
(
'zzsn'
,
pathType
+
name
,
file_path
=
response
)
return
result
def
secrchATT
(
item_id
,
year
,
type_id
):
sel_sql
=
'''select id from clb_sys_attachment where item_id =
%
s and year =
%
s and type_id=
%
s'''
cursor_
.
execute
(
sel_sql
,
(
item_id
,
year
,
type_id
))
selects
=
cursor_
.
fetchone
()
return
selects
if
__name__
==
'__main__'
:
log
.
info
(
f
'-----------当前文件{file_path}---------------'
)
file_list
=
os
.
listdir
(
file_path
)
...
...
@@ -126,19 +143,27 @@ if __name__=='__main__':
start_time
=
time
.
time
()
pdf_path
=
file_path
+
'/'
+
file
file_rank
=
int
(
file
.
split
(
'-'
)[
0
])
file_year
=
file
.
split
(
'-'
)[
1
]
# file_name_ = file.split('-')[0].replace('公司','')
file_year
=
re
.
findall
(
'
\
d{4}'
,
file
)[
0
]
file_name_
=
file
.
split
(
file_year
)[
0
]
#file_rank 对应上企业信用代码
selectsql
=
f
"select * from
rankandcode where id = {file_rank}
"
selectsql
=
f
"select * from
500Report where com_name = '{file_name_}'
"
cursor
.
execute
(
selectsql
)
data
=
cursor
.
fetchone
()
cnx
.
commit
()
social_code
=
data
[
1
]
ename
=
data
[
2
]
cname
=
data
[
3
]
file_name
=
cname
+
':'
+
file_year
+
'年年度报告'
+
'.pdf'
social_code
=
data
[
2
]
file_name
=
file_name_
+
':'
+
file_year
+
'年年度报告'
+
'.pdf'
content
=
''
origin
=
file_name_
+
'官网'
selects
=
secrchATT
(
social_code
,
file_year
,
1
)
if
selects
:
# self.getLogger().info(f'com_name:{com_name}--{year}已存在')
log
.
info
(
f
'===={file_name}--年报已存在==='
)
continue
#解析文件页数和内容
log
.
info
(
f
"-----------正在处理{file_name}--------------"
)
with
open
(
pdf_path
,
'rb'
)
as
file
:
...
...
@@ -153,7 +178,7 @@ if __name__=='__main__':
content
+=
page
.
get_text
()
# print(content)
except
Exception
as
e
:
log
.
info
(
f
'文件已损坏:{
c
name}'
)
log
.
info
(
f
'文件已损坏:{
file_
name}'
)
continue
#解析文件大小
file_size
=
os
.
path
.
getsize
(
pdf_path
)
...
...
@@ -168,8 +193,9 @@ if __name__=='__main__':
retData_f
=
uptoOBS
(
retData
,
pathType
,
taskType
,
start_time
,
file_name
,
pdf_path
)
if
retData_f
[
'state'
]:
#retData, com_name, year, pdf_name, num, pub_time
att_id
=
baseCore
.
tableUpdate
(
retData_f
,
cname
,
file_year
,
file_name
,
num
,
file_year
+
'-12-31'
)
att_id
=
baseCore
.
tableUpdate
(
retData_f
,
file_year
,
file_name
,
num
,
file_year
+
'-12-31'
,
origin
)
if
att_id
:
detect_language
=
baseCore
.
detect_language
(
content
)
dic_news
=
{
'attachmentIds'
:
att_id
,
'author'
:
''
,
...
...
@@ -179,8 +205,8 @@ if __name__=='__main__':
'deleteFlag'
:
'0'
,
'id'
:
''
,
'keyWords'
:
''
,
'lang'
:
'zh'
,
'origin'
:
'企业官网'
,
'lang'
:
detect_language
,
'origin'
:
origin
,
'publishDate'
:
file_year
+
'-12-31'
,
'sid'
:
'1684032033495392257'
,
'sourceAddress'
:
''
,
# 原文链接
...
...
@@ -191,13 +217,15 @@ if __name__=='__main__':
'year'
:
file_year
}
if
sendKafka
(
dic_news
):
log
.
info
(
f
'成功-
{file_rank}
--{file_name}----{att_id}---{social_code}'
)
log
.
info
(
f
'成功---{file_name}----{att_id}---{social_code}'
)
num
+=
1
else
:
log
.
info
(
f
'失败-
{file_rank}
--{file_name}----{att_id}---{social_code}'
)
log
.
info
(
f
'失败---{file_name}----{att_id}---{social_code}'
)
# 删除插入的数据 400表示发送数据失败
baseCore
.
deliteATT
(
att_id
)
log
.
info
(
f
'已删除插入附件表的数据---{file_name}-----{social_code}'
)
else
:
log
.
info
(
f
'-----年报已存在--{social_code}--{file_name}-----'
)
except
Exception
as
e
:
log
.
info
(
f
'error------{e}'
)
\ No newline at end of file
comData/noticeReport/delete_obsfile.py
0 → 100644
浏览文件 @
35396c0b
from
obs
import
ObsClient
from
base.BaseCore
import
BaseCore
baseCore
=
BaseCore
()
cnx_
=
baseCore
.
cnx_
cursor_
=
baseCore
.
cursor_
# 创建ObsClient对象
obs_client
=
ObsClient
(
access_key_id
=
'VEHN7D0TJ9316H8AHCAV'
,
secret_access_key
=
'heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY'
,
server
=
'https://obs.cn-north-1.myhuaweicloud.com'
)
def
delete
(
object_keys
):
# 指定要删除的文件名列表
bucket_name
=
'zzsn'
# object_keys = ['QYNotice/8921b4b0-7853-11ee-bcc0-000c29312880.pdf']
# 批量删除文件
for
object_key
in
object_keys
:
resp
=
obs_client
.
deleteObject
(
bucket_name
,
object_key
)
if
resp
.
status
>=
200
and
resp
.
status
<
300
:
print
(
f
"文件 {object_key} 删除成功!"
)
else
:
print
(
f
"文件 {object_key} 删除失败! 错误码:{resp.errorCode},错误信息:{resp.errorMessage}"
)
# 关闭ObsClient对象
obs_client
.
close
()
if
__name__
==
'__main__'
:
query
=
"SELECT object_key FROM clb_sys_attachment WHERE type_id=8 AND source = '证监会' AND create_time >= '2023-10-30 16:46:09' AND create_time <= '2023-11-01 09:11:12'"
cursor_
.
execute
(
query
)
results
=
cursor_
.
fetchall
()
object_keys
=
[
item
[
0
]
for
item
in
results
]
delete
(
object_keys
)
\ No newline at end of file
comData/weixin_solo/get_tokenCookies.py
浏览文件 @
35396c0b
...
...
@@ -56,7 +56,7 @@ if __name__=="__main__":
url
=
"https://mp.weixin.qq.com/"
browser
.
get
(
url
)
# 可改动
time
.
sleep
(
2
0
)
time
.
sleep
(
4
0
)
s
=
requests
.
session
()
#获取到token和cookies
...
...
estool/Esmethod.py
浏览文件 @
35396c0b
...
...
@@ -149,10 +149,10 @@ class EsMethod(object):
'''
删除
'''
def
delete
(
self
,
index_name
):
def
delete
(
self
,
index_name
,
id
):
result
=
self
.
es
.
delete
(
index
=
index_name
,
doc_type
=
"_doc"
,
id
=
'20220901-XXXXXX'
)
,
id
=
id
)
print
(
'删除结果
%
s'
%
result
)
'''
...
...
@@ -163,16 +163,50 @@ class EsMethod(object):
'query'
:
{
'bool'
:
{
'should'
:[
{
'term'
:{
'
origin'
:
'雪球网
'
}},
{
'term'
:{
'type'
:
1
}},
{
'term'
:{
'
labels.relationId'
:
'91110108740053589U
'
}},
{
'term'
:{
'type'
:
3
}},
],
'must'
:
[
{
'match'
:
{
'title'
:
'.pdf'
}}
]
#
'must': [
#
{'match': {'title': '.pdf'}}
#
]
}
},
'from'
:
pnum
,
'size'
:
6000
,
'size'
:
600
,
}
body
=
{
"query"
:
{
"bool"
:
{
"must"
:
[
{
"nested"
:
{
"path"
:
"labels"
,
"query"
:
{
"match"
:
{
"labels.relationId"
:
"91110108740053589U"
}
}
}
},
{
"term"
:
{
"type.keyword"
:
{
"value"
:
"3"
}
}
}
]
}
},
"sort"
:
[
{
"publishDate"
:
{
"order"
:
"desc"
}
}
],
"track_total_hits"
:
True
,
"size"
:
212
}
filter_path
=
[
'hits.hits._source.title'
,
# 字段1
...
...
@@ -215,13 +249,14 @@ if __name__ == '__main__':
print
(
f
'第{pnum}页数据'
)
result
=
esMethod
.
multi_should
(
index_name
=
esMethod
.
index_name
,
pnum
=
p
)
msglist
=
result
[
'hits'
][
'hits'
]
print
(
msglist
)
#
print(msglist)
for
mms
in
msglist
:
id
=
mms
[
'_source'
][
'id'
]
title
=
mms
[
'_source'
][
'title'
]
utitle
=
title
.
replace
(
'.pdf'
,
''
)
print
(
f
'id:{id}---title:{title}--utitle:{utitle}'
)
esMethod
.
updateaunn
(
esMethod
.
index_name
,
str
(
id
),
utitle
)
# esMethod.updateaunn(esMethod.index_name,str(id),utitle)
esMethod
.
delete
(
esMethod
.
index_name
,
str
(
id
))
print
(
'跟新成功!!'
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论