Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
8d78289e
提交
8d78289e
authored
8月 26, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
工具包
上级
80cb5cb4
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
146 行增加
和
9 行删除
+146
-9
BaseCore.py
base/BaseCore.py
+83
-9
client.conf
base/client.conf
+63
-0
没有找到文件。
base/BaseCore.py
浏览文件 @
8d78289e
...
...
@@ -4,9 +4,12 @@ import random
import
socket
import
sys
import
time
import
fitz
import
logbook
import
logbook.more
import
pandas
as
pd
import
requests
import
zhconv
import
pymysql
import
redis
...
...
@@ -20,14 +23,17 @@ import pymysql
from
pymysql
import
connections
from
DBUtils.PooledDB
import
PooledDB
from
fdfs_client.client
import
get_tracker_conf
,
Fdfs_client
# Build the FastDFS client used by this module from the tracker settings in
# './client.conf'.
# NOTE(review): the config path is relative - presumably resolved against the
# current working directory rather than this file's directory; confirm against
# how the spiders are launched.
tracker_conf
=
get_tracker_conf
(
'./client.conf'
)
client
=
Fdfs_client
(
tracker_conf
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class
BaseCore
:
# 序列号
__seq
=
0
# 代理池 数据库连接
__cnx_proxy
=
None
__cursor_proxy
=
None
#
__cnx_proxy =None
#
__cursor_proxy = None
cnx
=
None
cursor
=
None
r
=
None
...
...
@@ -228,9 +234,9 @@ class BaseCore:
__USER_PHONE_AGENT_LIST
=
[
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36'
]
def
__init__
(
self
):
self
.
__cnx_proxy
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'caiji'
,
password
=
'zzsn9988'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
self
.
__cursor_proxy
=
self
.
__cnx_proxy
.
cursor
()
#
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
#
charset='utf8mb4')
#
self.__cursor_proxy = self.__cnx_proxy.cursor()
self
.
cnx
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'caiji'
,
password
=
'zzsn9988'
,
db
=
'caiji'
,
charset
=
'utf8mb4'
)
...
...
@@ -254,8 +260,6 @@ class BaseCore:
def
close
(
self
):
try
:
self
.
__cursor_proxy
.
close
()
self
.
__cnx_proxy
.
close
()
self
.
cursor
.
close
()
self
.
cnx
.
close
()
except
:
...
...
@@ -345,8 +349,8 @@ class BaseCore:
# 获取代理
def
get_proxy
(
self
):
sql
=
"select proxy from clb_proxy"
self
.
__cursor_proxy
.
execute
(
sql
)
proxy_lists
=
self
.
__cursor_proxy
.
fetchall
()
self
.
cursor
.
execute
(
sql
)
proxy_lists
=
self
.
cursor
.
fetchall
()
ip_list
=
[]
for
proxy_
in
proxy_lists
:
ip_list
.
append
(
str
(
proxy_
)
.
replace
(
"('"
,
''
)
.
replace
(
"',)"
,
''
))
...
...
@@ -580,8 +584,78 @@ class BaseCore:
self
.
r
.
set
(
key
,
0
)
self
.
r
.
expire
(
key
,
3600
)
time
.
sleep
(
2
)
#上传至文件服务器
def upLoadToServe(self, pdf_url, type_id, social_code):
    """Download a PDF, extract its text and upload the file to the file server.

    The three stages run in order and each one must succeed:

    1. download ``pdf_url`` (up to 3 attempts, 3 s pause between attempts);
    2. parse the bytes as a PDF to get the page count and the page text;
    3. upload the bytes through the module-level ``client``
       (up to 3 attempts, 3 s pause between attempts).

    The PDF is parsed *before* it is uploaded so that an unreadable document
    is never stored, and a parse failure can no longer trigger repeated
    uploads of the same bytes.

    Args:
        pdf_url: URL the PDF is fetched from.
        type_id: Stored unchanged under ``'type_id'`` in the result.
        social_code: Stored unchanged under ``'item_id'`` in the result.

    Returns:
        dict: Attachment metadata. ``'state'`` is ``True`` only when all three
        stages succeeded; in that case ``'path'``, ``'full_path'``,
        ``'file_size'``, ``'create_time'``, ``'page_size'`` and ``'content'``
        (concatenated text of every page) are filled in. On any failure the
        dict is returned with ``'state'`` still ``False`` and those fields
        left as empty strings.
    """
    retData = {
        'state': False,
        'type_id': type_id,
        'item_id': social_code,
        'group_name': 'group1',
        'path': '',
        'full_path': '',
        'category': 'pdf',
        'file_size': '',
        'status': 1,
        'create_by': 'XueLingKun',
        'create_time': '',
        'page_size': '',
        'content': '',
    }
    headers = {'User-Agent': self.getRandomUserAgent()}
    max_attempts = 3

    # --- 1. download -------------------------------------------------------
    # NOTE(review): certificate verification is disabled (verify=False) and the
    # HTTP status is not checked; both are kept as in the original code.
    resp_content = None
    for attempt in range(max_attempts):
        try:
            resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
            break
        except Exception:
            # Pause only when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(3)
    if resp_content is None:
        # Nothing was downloaded, so there is nothing to parse or upload.
        print('======pdf解析失败=====')
        return retData

    # --- 2. parse ----------------------------------------------------------
    # Parsing the same bytes is deterministic, so it is attempted once. Text is
    # collected locally and only published on success, so a failure midway
    # cannot leave partial or duplicated text in the result.
    page_size = 0
    page_texts = []
    try:
        with fitz.open(stream=resp_content, filetype='pdf') as doc:
            page_size = doc.page_count
            for page in doc.pages():
                page_texts.append(page.get_text())
    except Exception:
        page_size = 0
    if page_size < 1:
        # PDF could not be parsed (or has no pages).
        print('======pdf解析失败=====')
        return retData

    # --- 3. upload ---------------------------------------------------------
    result = None
    for attempt in range(max_attempts):
        try:
            result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
            break
        except Exception:
            if attempt < max_attempts - 1:
                time.sleep(3)
    if result is None:
        # The original code reported every failure with this same message.
        print('======pdf解析失败=====')
        return retData

    remote_file_id = bytes.decode(result['Remote file_id'])
    retData['state'] = True
    # 'path' is the remote file id with the group name stripped.
    retData['path'] = remote_file_id.replace('group1', '')
    retData['full_path'] = remote_file_id
    retData['file_size'] = result['Uploaded size']
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData['page_size'] = page_size
    retData['content'] = ''.join(page_texts)
    return retData
#插入到att表 返回附件id
def tableUpdate(self, retData, com_name, year, order_by):
    """Insert an attachment row into ``clb_sys_attachment`` unless one exists.

    A row is considered to exist when the table already holds a record with
    the same ``item_id``, ``year`` and ``type_id``; in that case only an info
    message is logged and nothing is written.

    Args:
        retData: Mapping with the keys ``'item_id'``, ``'type_id'``,
            ``'group_name'``, ``'path'``, ``'full_path'``, ``'category'``,
            ``'file_size'``, ``'status'``, ``'create_by'``, ``'page_size'``
            and ``'create_time'`` (the shape produced by ``upLoadToServe``).
        com_name: Value written to the ``name`` column.
        year: Value written to the ``year`` column and used in the lookup.
        order_by: Value written to the ``order_by`` column.

    Returns:
        None. (The comment above this method mentions returning the
        attachment id, but no value has ever been returned.)

    Raises:
        KeyError: If ``retData`` lacks one of the keys listed above.
        Exception: Any database error from the lookup or the insert is
            propagated; a failed insert/commit is rolled back first so the
            shared connection is not left inside a broken transaction.
    """
    # Read every field up front so a malformed retData fails before any SQL runs.
    item_id = retData['item_id']
    type_id = retData['type_id']
    values = (
        year,
        com_name,
        type_id,
        item_id,
        retData['group_name'],
        retData['path'],
        retData['full_path'],
        retData['category'],
        retData['file_size'],
        order_by,
        retData['status'],
        retData['create_by'],
        retData['create_time'],
        retData['page_size'],
    )

    sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
    self.cursor.execute(sel_sql, (item_id, year, type_id))
    if self.cursor.fetchone():
        self.getLogger().info(f'com_name:{com_name}已存在')
        return

    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    try:
        self.cursor.execute(Upsql, values)  # insert
        self.cnx.commit()  # commit
    except Exception:
        # Undo the failed write so later statements on this shared connection
        # do not run inside a half-finished transaction, then let the caller
        # see the original error.
        self.cnx.rollback()
        raise
    self.getLogger().info("更新完成:{}".format(Upsql))
...
...
base/client.conf
0 → 100644
浏览文件 @
8d78289e
# connect timeout in seconds
# default value is 30s
connect_timeout
=
300
# network timeout in seconds
# default value is 30s
network_timeout
=
600
# the base path to store log files
#base_path=/home/tarena/django-project/cc_shop1/cc_shop1/logs
# tracker_server can occur more than once, and tracker_server format is
# "host:port", host can be hostname or ip address
tracker_server
=
114
.
115
.
215
.
96
:
22122
#standard log level as syslog, case insensitive, value list:
### emerg for emergency
### alert
### crit for critical
### error
### warn for warning
### notice
### info
### debug
log_level
=
info
# if use connection pool
# default value is false
# since V4.05
use_connection_pool
=
false
# connections whose idle time exceeds this time will be closed
# unit: second
# default value is 3600
# since V4.05
connection_pool_max_idle_time
=
3600
# if load FastDFS parameters from tracker server
# since V4.05
# default value is false
load_fdfs_parameters_from_tracker
=
false
# if use storage ID instead of IP address
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# default value is false
# since V4.05
use_storage_id
=
false
# specify storage ids filename, can use relative or absolute path
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# since V4.05
storage_ids_filename
=
storage_ids
.
conf
#HTTP settings
http
.
tracker_server_port
=
80
#use "#include" directive to include other HTTP settings
##
include
http
.
conf
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论