Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
87f47a13
提交
87f47a13
authored
12月 07, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
政策法规脚本维护
上级
53ccb166
显示空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
131 行增加
和
58 行删除
+131
-58
BaseCore.py
comData/policylaw/BaseCore.py
+131
-58
没有找到文件。
comData/policylaw/BaseCore.py
浏览文件 @
87f47a13
...
...
@@ -4,6 +4,7 @@ import random
import
socket
import
sys
import
time
import
uuid
import
fitz
import
logbook
...
...
@@ -11,26 +12,37 @@ import logbook.more
import
pandas
as
pd
import
requests
import
zhconv
import
pymysql
import
redis
from
docx
import
Document
from
selenium
import
webdriver
from
selenium.webdriver.chrome.service
import
Service
from
openpyxl
import
Workbook
import
langid
#创建连接池
#
创建连接池
import
pymysql
from
pymysql
import
connections
from
DBUtils.PooledDB
import
PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from
fdfs_client.client
import
get_tracker_conf
,
Fdfs_client
tracker_conf
=
get_tracker_conf
(
'D:
\\
zzsn_spider
\\
comData
\\
policylaw
\\
client.conf'
)
client
=
Fdfs_client
(
tracker_conf
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
from
obs
import
ObsClient
import
fitz
from
urllib.parse
import
unquote
obsClient
=
ObsClient
(
access_key_id
=
'VEHN7D0TJ9316H8AHCAV'
,
# 你的华为云的ak码
secret_access_key
=
'heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY'
,
# 你的华为云的sk
server
=
'https://obs.cn-north-1.myhuaweicloud.com'
# 你的桶的地址
)
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class
BaseCore
:
# 序列号
...
...
@@ -236,8 +248,9 @@ class BaseCore:
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST
=
[
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36'
]
# Android agent池
__USER_PHONE_AGENT_LIST
=
[
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36'
]
def
__init__
(
self
):
...
...
@@ -246,7 +259,7 @@ class BaseCore:
self
.
cursor
=
self
.
cnx
.
cursor
()
#11数据库
#
11数据库
self
.
cnx_
=
pymysql
.
connect
(
host
=
'114.116.44.11'
,
user
=
'caiji'
,
password
=
'f7s0&7qqtK'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
self
.
cursor_
=
self
.
cnx_
.
cursor
()
...
...
@@ -271,11 +284,11 @@ class BaseCore:
try
:
self
.
cursor
.
close
()
self
.
cnx
.
close
()
except
:
except
:
pass
# 计算耗时
def
getTimeCost
(
self
,
start
,
end
):
def
getTimeCost
(
self
,
start
,
end
):
seconds
=
int
(
end
-
start
)
m
,
s
=
divmod
(
seconds
,
60
)
h
,
m
=
divmod
(
m
,
60
)
...
...
@@ -288,6 +301,7 @@ class BaseCore:
else
:
ms
=
int
((
end
-
start
)
*
1000
)
return
"
%
d毫秒"
%
(
ms
)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
...
...
@@ -317,7 +331,7 @@ class BaseCore:
return
"ZZSN"
+
self
.
getNowTime
(
2
)
+
str
(
self
.
__seq
)
.
zfill
(
3
)
# 日志格式
def
logFormate
(
self
,
record
,
handler
):
def
logFormate
(
self
,
record
,
handler
):
formate
=
"[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}"
.
format
(
date
=
record
.
time
,
# 日志时间
level
=
record
.
level_name
,
# 日志等级
...
...
@@ -327,8 +341,9 @@ class BaseCore:
msg
=
record
.
message
# 日志内容
)
return
formate
# 获取logger
def
getLogger
(
self
,
fileLogFlag
=
True
,
stdOutFlag
=
True
):
def
getLogger
(
self
,
fileLogFlag
=
True
,
stdOutFlag
=
True
):
dirname
,
filename
=
os
.
path
.
split
(
os
.
path
.
abspath
(
sys
.
argv
[
0
]))
dirname
=
os
.
path
.
join
(
dirname
,
"logs"
)
filename
=
filename
.
replace
(
".py"
,
""
)
+
".log"
...
...
@@ -377,34 +392,34 @@ class BaseCore:
proxy_list
.
append
(
proxy
)
return
proxy_list
[
random
.
randint
(
0
,
3
)]
#字符串截取
def
getSubStr
(
self
,
str
,
beginStr
,
endStr
):
if
beginStr
==
''
:
#
字符串截取
def
getSubStr
(
self
,
str
,
beginStr
,
endStr
):
if
beginStr
==
''
:
pass
else
:
begin
=
str
.
rfind
(
beginStr
)
if
begin
==
-
1
:
begin
=
0
str
=
str
[
begin
:]
if
endStr
==
''
:
begin
=
str
.
rfind
(
beginStr
)
if
begin
==
-
1
:
begin
=
0
str
=
str
[
begin
:]
if
endStr
==
''
:
pass
else
:
end
=
str
.
rfind
(
endStr
)
if
end
==
-
1
:
end
=
str
.
rfind
(
endStr
)
if
end
==
-
1
:
pass
else
:
str
=
str
[
0
:
end
+
1
]
str
=
str
[
0
:
end
+
1
]
return
str
# 繁体字转简体字
def
hant_2_hans
(
self
,
hant_str
:
str
):
def
hant_2_hans
(
self
,
hant_str
:
str
):
'''
Function: 将 hant_str 由繁体转化为简体
'''
return
zhconv
.
convert
(
hant_str
,
'zh-hans'
)
# 判断字符串里是否含数字
def
str_have_num
(
self
,
str_num
):
def
str_have_num
(
self
,
str_num
):
panduan
=
False
for
str_1
in
str_num
:
...
...
@@ -413,7 +428,7 @@ class BaseCore:
panduan
=
ppp
return
panduan
#检测语言
#
检测语言
def
detect_language
(
self
,
text
):
# 使用langid.py判断文本的语言
result
=
langid
.
classify
(
text
)
...
...
@@ -423,11 +438,11 @@ class BaseCore:
return
'cn'
return
result
[
0
]
#追加接入excel
def
writerToExcel
(
self
,
detailList
,
filename
):
#
追加接入excel
def
writerToExcel
(
self
,
detailList
,
filename
):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data
=
pd
.
read_excel
(
filename
,
engine
=
'openpyxl'
,
dtype
=
str
)
existing_data
=
pd
.
read_excel
(
filename
,
engine
=
'openpyxl'
,
dtype
=
str
)
# 创建新的数据
new_data
=
pd
.
DataFrame
(
data
=
detailList
)
# 将新数据添加到现有数据的末尾
...
...
@@ -436,22 +451,92 @@ class BaseCore:
combined_data
.
to_excel
(
filename
,
index
=
False
)
# return combined_data
#解析word文件页数
#
解析word文件页数
# def doc_page(self,
file_path):
#
doc = Document(file_path)
#
return len(doc.sections)
def
doc_page
(
self
,
file_path
):
doc
=
Document
(
file_path
)
return
len
(
doc
.
sections
)
def
pdf_content
(
self
,
resp_content
):
# 解析pdf文件内容
content
=
''
for
i
in
range
(
0
,
3
):
try
:
result
=
client
.
upload_by_buffer
(
resp_content
,
file_ext_name
=
'pdf'
)
with
fitz
.
open
(
stream
=
resp_content
,
filetype
=
'pdf'
)
as
doc
:
# page_size = doc.page_count
for
page
in
doc
.
pages
():
content
+=
page
.
get_text
()
break
except
:
time
.
sleep
(
3
)
continue
return
content
def
getuuid
(
self
):
get_timestamp_uuid
=
uuid
.
uuid1
()
# 根据 时间戳生成 uuid , 保证全球唯一
return
get_timestamp_uuid
# 替换为绝对路径之后,解析出来a.href
def
uploadToserver
(
self
,
file_href
,
item_id
):
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
# 上传至文件服务器
headers
=
{}
retData
=
{
'state'
:
False
,
'type_id'
:
7
,
'item_id'
:
item_id
,
'group_name'
:
'group1'
,
'path'
:
''
,
'full_path'
:
''
,
'category'
:
category
,
'file_size'
:
''
,
'status'
:
1
,
'create_by'
:
'XueLingKun'
,
'create_time'
:
''
,
'page_size'
:
''
,
'content'
:
''
}
headers
[
'User-Agent'
]
=
self
.
getRandomUserAgent
()
resp_content
=
''
for
i
in
range
(
0
,
3
):
try
:
resp_content
=
requests
.
get
(
file_href
,
headers
=
headers
,
verify
=
False
,
timeout
=
20
)
.
content
break
except
:
time
.
sleep
(
3
)
continue
if
resp_content
:
pass
else
:
return
retData
# page_size = 0
# if category == '.doc' or category == '.docx':
# # page_size = self.doc_page(file_href)
# return retData
# if category == '.pdf' or category == '.PDF':
# page_size = self.pdf_page(resp_content)
for
i
in
range
(
0
,
3
):
try
:
result
=
client
.
upload_by_buffer
(
resp_content
,
file_ext_name
=
category
.
replace
(
'.'
,
''
))
self
.
getLogger
()
.
info
(
'-------文件上传成功------'
)
break
except
:
time
.
sleep
(
3
)
continue
# if page_size>0:
# pass
# else:
# self.getLogger().info(f'======解析失败=====')
# return retData
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
retData
[
'state'
]
=
True
retData
[
'path'
]
=
bytes
.
decode
(
result
[
'Remote file_id'
])
.
replace
(
'group1'
,
''
)
retData
[
'full_path'
]
=
bytes
.
decode
(
result
[
'Remote file_id'
])
retData
[
'file_size'
]
=
result
[
'Uploaded size'
]
retData
[
'create_time'
]
=
time_now
# retData['page_size'] = page_size
return
retData
def
secrchATT
(
self
,
item_id
,
file_name
,
type_id
,
order_by
):
sel_sql
=
'''select id from clb_sys_attachment where item_id =
%
s and
name
=
%
s and type_id=
%
s and order_by=
%
s '''
self
.
cursor_
.
execute
(
sel_sql
,
(
item_id
,
file_name
,
type_id
,
order_by
))
def
secrchATT
(
self
,
item_id
,
retData
,
type_id
,
order_by
):
sel_sql
=
'''select id from clb_sys_attachment where item_id =
%
s and
path
=
%
s and type_id=
%
s and order_by=
%
s '''
self
.
cursor_
.
execute
(
sel_sql
,
(
item_id
,
retData
[
'path'
],
type_id
,
order_by
))
selects
=
self
.
cursor_
.
fetchone
()
return
selects
#插入到att表 返回附件id
def
tableUpdate
(
self
,
retData
,
com_name
,
file_name
,
num
,
pub_tim
e
):
#
插入到att表 返回附件id
def
tableUpdate
(
self
,
retData
,
com_name
,
file_name
,
num
,
publishDat
e
):
item_id
=
retData
[
'item_id'
]
type_id
=
retData
[
'type_id'
]
group_name
=
retData
[
'group_name'
]
...
...
@@ -465,24 +550,22 @@ class BaseCore:
create_time
=
retData
[
'create_time'
]
order_by
=
num
Upsql
=
'''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s,
%
s)'''
values
=
(
file_name
,
type_id
,
item_id
,
group_name
,
path
,
full_path
,
category
,
file_size
,
order_by
,
status
,
create_by
,
create_time
,
path
,
'zzsn'
,
pub_tim
e
)
create_time
,
full_path
.
split
(
'https://zzsn.obs.cn-north-1.myhuaweicloud.com/'
)[
1
],
'zzsn'
,
publishDat
e
)
self
.
cursor_
.
execute
(
Upsql
,
values
)
# 插入
self
.
cnx_
.
commit
()
# 提交
self
.
getLogger
()
.
info
(
"更新完成:{}"
.
format
(
Upsql
))
selects
=
self
.
secrchATT
(
item_id
,
file_name
,
type_id
,
order_by
)
selects
=
self
.
secrchATT
(
item_id
,
retData
,
type_id
,
order_by
)
id
=
selects
[
0
]
return
id
,
full_path
return
id
,
full_path
# 获取文件大小
def
convert_size
(
self
,
size_bytes
):
def
convert_size
(
self
,
size_bytes
):
# 定义不同单位的转换值
units
=
[
'bytes'
,
'KB'
,
'MB'
,
'GB'
,
'TB'
]
i
=
0
...
...
@@ -491,7 +574,7 @@ class BaseCore:
i
+=
1
return
f
"{size_bytes:.2f} {units[i]}"
def
uptoOBS
(
self
,
file_href
,
item_id
,
file_name
):
def
uptoOBS
(
self
,
file_href
,
item_id
,
file_name
):
headers
=
{}
category
=
os
.
path
.
splitext
(
file_href
)[
1
]
...
...
@@ -508,30 +591,21 @@ class BaseCore:
except
:
time
.
sleep
(
3
)
continue
# page_size = 0
for
i
in
range
(
0
,
3
):
try
:
# name = file_name
if
category
in
file_name
:
pass
else
:
file_name
=
file_name
+
'.'
+
category
result
=
obsClient
.
putContent
(
'zzsn'
,
'PolicyDocuments/'
+
file_name
,
content
=
response
.
content
)
name
=
str
(
self
.
getuuid
())
+
category
result
=
obsClient
.
putContent
(
'zzsn'
,
'PolicyDocuments/'
+
name
,
content
=
response
.
content
)
break
except
:
time
.
sleep
(
3
)
continue
# if page_size < 1:
# # pdf解析失败
# # print(f'======pdf解析失败=====')
# return retData
# else:
else
:
try
:
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
retData
[
'state'
]
=
True
retData
[
'path'
]
=
result
[
'body'
][
'objectUrl'
]
.
split
(
'.com'
)[
1
]
retData
[
'full_path'
]
=
unquote
(
result
[
'body'
][
'objectUrl'
])
retData
[
'full_path'
]
=
result
[
'body'
][
'objectUrl'
]
retData
[
'file_size'
]
=
self
.
convert_size
(
file_size
)
retData
[
'create_time'
]
=
time_now
except
Exception
as
e
:
...
...
@@ -552,4 +626,3 @@ class BaseCore:
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论