提交 87f47a13 作者: 薛凌堃

政策法规脚本维护

上级 53ccb166
......@@ -4,6 +4,7 @@ import random
import socket
import sys
import time
import uuid
import fitz
import logbook
......@@ -11,26 +12,37 @@ import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
#创建连接池
# 创建连接池
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\zzsn_spider\\comData\\policylaw\\client.conf')
client = Fdfs_client(tracker_conf)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
from obs import ObsClient
import fitz
from urllib.parse import unquote
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
......@@ -236,8 +248,9 @@ class BaseCore:
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
# Android agent池
__USER_PHONE_AGENT_LIST = [
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
......@@ -246,7 +259,7 @@ class BaseCore:
self.cursor = self.cnx.cursor()
#11数据库
# 11数据库
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
......@@ -271,11 +284,11 @@ class BaseCore:
try:
self.cursor.close()
self.cnx.close()
except :
except:
pass
# 计算耗时
def getTimeCost(self,start, end):
def getTimeCost(self, start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
......@@ -288,6 +301,7 @@ class BaseCore:
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
......@@ -317,7 +331,7 @@ class BaseCore:
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
def logFormate(self, record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
......@@ -327,8 +341,9 @@ class BaseCore:
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
def getLogger(self, fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
......@@ -377,34 +392,34 @@ class BaseCore:
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
#字符串截取
def getSubStr(self,str,beginStr,endStr):
if beginStr=='':
# 字符串截取
def getSubStr(self, str, beginStr, endStr):
if beginStr == '':
pass
else:
begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
if endStr=='':
begin = str.rfind(beginStr)
if begin == -1:
begin = 0
str = str[begin:]
if endStr == '':
pass
else:
end=str.rfind(endStr)
if end==-1:
end = str.rfind(endStr)
if end == -1:
pass
else:
str = str[0:end+1]
str = str[0:end + 1]
return str
# 繁体字转简体字
def hant_2_hans(self,hant_str: str):
def hant_2_hans(self, hant_str: str):
'''
Function: 将 hant_str 由繁体转化为简体
'''
return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字
def str_have_num(self,str_num):
def str_have_num(self, str_num):
panduan = False
for str_1 in str_num:
......@@ -413,7 +428,7 @@ class BaseCore:
panduan = ppp
return panduan
#检测语言
# 检测语言
def detect_language(self, text):
# 使用langid.py判断文本的语言
result = langid.classify(text)
......@@ -423,11 +438,11 @@ class BaseCore:
return 'cn'
return result[0]
#追加接入excel
def writerToExcel(self,detailList,filename):
# 追加接入excel
def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
......@@ -436,22 +451,92 @@ class BaseCore:
combined_data.to_excel(filename, index=False)
# return combined_data
#解析word文件页数
# 解析word文件页数
# def doc_page(self,file_path):
# doc = Document(file_path)
# return len(doc.sections)
def doc_page(self, file_path):
doc = Document(file_path)
return len(doc.sections)
def pdf_content(self, resp_content):
# 解析pdf文件内容
content = ''
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
for page in doc.pages():
content += page.get_text()
break
except:
time.sleep(3)
continue
return content
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 替换为绝对路径之后,解析出来a.href
def uploadToserver(self, file_href, item_id):
category = os.path.splitext(file_href)[1]
# 上传至文件服务器
headers = {}
retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = self.getRandomUserAgent()
resp_content = ''
for i in range(0, 3):
try:
resp_content = requests.get(file_href, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
if resp_content:
pass
else:
return retData
# page_size = 0
# if category == '.doc' or category == '.docx':
# # page_size = self.doc_page(file_href)
# return retData
# if category == '.pdf' or category == '.PDF':
# page_size = self.pdf_page(resp_content)
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name=category.replace('.', ''))
self.getLogger().info('-------文件上传成功------')
break
except:
time.sleep(3)
continue
# if page_size>0:
# pass
# else:
# self.getLogger().info(f'======解析失败=====')
# return retData
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
# retData['page_size'] = page_size
return retData
def secrchATT(self,item_id,file_name,type_id,order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id,order_by))
def secrchATT(self, item_id, retData, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, retData['path'], type_id, order_by))
selects = self.cursor_.fetchone()
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,file_name,num,pub_time):
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, publishDate):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
......@@ -465,24 +550,22 @@ class BaseCore:
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time,path,'zzsn',pub_time)
create_time, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn', publishDate)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,file_name,type_id,order_by)
selects = self.secrchATT(item_id, retData, type_id, order_by)
id = selects[0]
return id,full_path
return id, full_path
# 获取文件大小
def convert_size(self,size_bytes):
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
......@@ -491,7 +574,7 @@ class BaseCore:
i += 1
return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(self,file_href,item_id,file_name):
def uptoOBS(self, file_href, item_id, file_name):
headers = {}
category = os.path.splitext(file_href)[1]
......@@ -508,30 +591,21 @@ class BaseCore:
except:
time.sleep(3)
continue
# page_size = 0
for i in range(0, 3):
try:
# name = file_name
if category in file_name:
pass
else:
file_name = file_name + '.' + category
result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
name = str(self.getuuid()) + category
result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
break
except:
time.sleep(3)
continue
# if page_size < 1:
# # pdf解析失败
# # print(f'======pdf解析失败=====')
# return retData
# else:
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
......@@ -552,4 +626,3 @@ class BaseCore:
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论