提交 87f47a13 作者: 薛凌堃

政策法规脚本维护

上级 53ccb166
...@@ -4,6 +4,7 @@ import random ...@@ -4,6 +4,7 @@ import random
import socket import socket
import sys import sys
import time import time
import uuid
import fitz import fitz
import logbook import logbook
...@@ -11,26 +12,37 @@ import logbook.more ...@@ -11,26 +12,37 @@ import logbook.more
import pandas as pd import pandas as pd
import requests import requests
import zhconv import zhconv
import pymysql
import redis import redis
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid import langid
#创建连接池 # 创建连接池
import pymysql import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB from DBUtils.PooledDB import PooledDB
# import sys # import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client') # sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\zzsn_spider\\comData\\policylaw\\client.conf')
client = Fdfs_client(tracker_conf)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
from obs import ObsClient from obs import ObsClient
import fitz import fitz
from urllib.parse import unquote from urllib.parse import unquote
obsClient = ObsClient( obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码 access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址 server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
) )
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
# 序列号 # 序列号
...@@ -236,8 +248,9 @@ class BaseCore: ...@@ -236,8 +248,9 @@ class BaseCore:
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5' 'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
] ]
#Android agent池 # Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36'] __USER_PHONE_AGENT_LIST = [
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self): def __init__(self):
...@@ -246,7 +259,7 @@ class BaseCore: ...@@ -246,7 +259,7 @@ class BaseCore:
self.cursor = self.cnx.cursor() self.cursor = self.cnx.cursor()
#11数据库 # 11数据库
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4') charset='utf8mb4')
self.cursor_ = self.cnx_.cursor() self.cursor_ = self.cnx_.cursor()
...@@ -271,11 +284,11 @@ class BaseCore: ...@@ -271,11 +284,11 @@ class BaseCore:
try: try:
self.cursor.close() self.cursor.close()
self.cnx.close() self.cnx.close()
except : except:
pass pass
# 计算耗时 # 计算耗时
def getTimeCost(self,start, end): def getTimeCost(self, start, end):
seconds = int(end - start) seconds = int(end - start)
m, s = divmod(seconds, 60) m, s = divmod(seconds, 60)
h, m = divmod(m, 60) h, m = divmod(m, 60)
...@@ -288,6 +301,7 @@ class BaseCore: ...@@ -288,6 +301,7 @@ class BaseCore:
else: else:
ms = int((end - start) * 1000) ms = int((end - start) * 1000)
return "%d毫秒" % (ms) return "%d毫秒" % (ms)
# 当前时间格式化 # 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S # 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S # 2 : 010101120000 %y%m%d%H%M%S
...@@ -317,7 +331,7 @@ class BaseCore: ...@@ -317,7 +331,7 @@ class BaseCore:
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3) return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式 # 日志格式
def logFormate(self,record, handler): def logFormate(self, record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format( formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间 date=record.time, # 日志时间
level=record.level_name, # 日志等级 level=record.level_name, # 日志等级
...@@ -327,8 +341,9 @@ class BaseCore: ...@@ -327,8 +341,9 @@ class BaseCore:
msg=record.message # 日志内容 msg=record.message # 日志内容
) )
return formate return formate
# 获取logger # 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True): def getLogger(self, fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0])) dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs") dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log" filename = filename.replace(".py", "") + ".log"
...@@ -377,34 +392,34 @@ class BaseCore: ...@@ -377,34 +392,34 @@ class BaseCore:
proxy_list.append(proxy) proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)] return proxy_list[random.randint(0, 3)]
#字符串截取 # 字符串截取
def getSubStr(self,str,beginStr,endStr): def getSubStr(self, str, beginStr, endStr):
if beginStr=='': if beginStr == '':
pass pass
else: else:
begin=str.rfind(beginStr) begin = str.rfind(beginStr)
if begin==-1: if begin == -1:
begin=0 begin = 0
str=str[begin:] str = str[begin:]
if endStr=='': if endStr == '':
pass pass
else: else:
end=str.rfind(endStr) end = str.rfind(endStr)
if end==-1: if end == -1:
pass pass
else: else:
str = str[0:end+1] str = str[0:end + 1]
return str return str
# 繁体字转简体字 # 繁体字转简体字
def hant_2_hans(self,hant_str: str): def hant_2_hans(self, hant_str: str):
''' '''
Function: 将 hant_str 由繁体转化为简体 Function: 将 hant_str 由繁体转化为简体
''' '''
return zhconv.convert(hant_str, 'zh-hans') return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字 # 判断字符串里是否含数字
def str_have_num(self,str_num): def str_have_num(self, str_num):
panduan = False panduan = False
for str_1 in str_num: for str_1 in str_num:
...@@ -413,7 +428,7 @@ class BaseCore: ...@@ -413,7 +428,7 @@ class BaseCore:
panduan = ppp panduan = ppp
return panduan return panduan
#检测语言 # 检测语言
def detect_language(self, text): def detect_language(self, text):
# 使用langid.py判断文本的语言 # 使用langid.py判断文本的语言
result = langid.classify(text) result = langid.classify(text)
...@@ -423,11 +438,11 @@ class BaseCore: ...@@ -423,11 +438,11 @@ class BaseCore:
return 'cn' return 'cn'
return result[0] return result[0]
#追加接入excel # 追加接入excel
def writerToExcel(self,detailList,filename): def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx' # filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件 # 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str) existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
# 创建新的数据 # 创建新的数据
new_data = pd.DataFrame(data=detailList) new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾 # 将新数据添加到现有数据的末尾
...@@ -436,22 +451,92 @@ class BaseCore: ...@@ -436,22 +451,92 @@ class BaseCore:
combined_data.to_excel(filename, index=False) combined_data.to_excel(filename, index=False)
# return combined_data # return combined_data
#解析word文件页数 # 解析word文件页数
# def doc_page(self,file_path): def doc_page(self, file_path):
# doc = Document(file_path) doc = Document(file_path)
# return len(doc.sections) return len(doc.sections)
def pdf_content(self, resp_content):
    """Extract the text of an in-memory PDF, retrying on failure.

    Up to three attempts are made. Each attempt first pushes the buffer to
    the file server through the module-level ``client`` and then parses it
    with ``fitz``. The text of an attempt is only kept if the whole
    document was read, so a failure part-way through can no longer leave
    duplicated pages in the result.

    :param resp_content: PDF data handed to ``fitz.open(stream=...)``
        (presumably the raw bytes of an HTTP response -- confirm with callers).
    :return: the text of every page joined in page order, or ``''`` when
        all attempts fail.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            # NOTE(review): the upload result was never used by the original
            # code; the call is kept only to preserve its side effect.
            # Confirm whether this upload is actually wanted here.
            client.upload_by_buffer(resp_content, file_ext_name='pdf')
            with fitz.open(stream=resp_content, filetype='pdf') as doc:
                # Build the text per attempt so a retry starts from scratch.
                return ''.join(page.get_text() for page in doc.pages())
        except Exception:
            # Best-effort: back off and retry, but do not wait after the
            # final attempt since nothing follows it.
            if attempt < max_attempts - 1:
                time.sleep(3)
    return ''
def getuuid(self):
    """Return a new version-1 UUID object produced by ``uuid.uuid1()``.

    The value is returned as a ``uuid.UUID`` instance, not a string;
    callers that need text must convert it themselves.
    """
    return uuid.uuid1()
# 替换为绝对路径之后,解析出来a.href
def uploadToserver(self, file_href, item_id):
    """Download ``file_href`` and store it on the file server via ``client``.

    Both the download and the upload are attempted up to three times with a
    three second pause between attempts. If either step never succeeds, the
    attachment record is returned with ``state`` still ``False``. In the
    original code a failed upload raised ``NameError`` on the unbound
    ``result`` instead.

    :param file_href: absolute URL of the file; its extension (as returned
        by ``os.path.splitext``) becomes ``category``.
    :param item_id: identifier copied into the returned record.
    :return: dict describing the attachment. On success ``state`` is
        ``True`` and ``path``, ``full_path``, ``file_size`` and
        ``create_time`` are filled in from the upload result.
    """
    max_attempts = 3
    category = os.path.splitext(file_href)[1]
    retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
               'full_path': '',
               'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Step 1: fetch the file body.
    # NOTE(review): verify=False turns off TLS certificate verification.
    resp_content = b''
    for attempt in range(max_attempts):
        try:
            resp_content = requests.get(file_href, headers=headers, verify=False, timeout=20).content
            break
        except Exception:
            if attempt < max_attempts - 1:
                time.sleep(3)
    if not resp_content:
        # Download failed or returned an empty body: nothing to upload.
        return retData

    # Step 2: push the bytes to the file server.
    result = None
    for attempt in range(max_attempts):
        try:
            result = client.upload_by_buffer(resp_content, file_ext_name=category.replace('.', ''))
            self.getLogger().info('-------文件上传成功------')
            break
        except Exception:
            if attempt < max_attempts - 1:
                time.sleep(3)
    if result is None:
        # Every upload attempt failed; report failure instead of crashing.
        return retData

    remote_file_id = bytes.decode(result['Remote file_id'])
    retData['state'] = True
    # 'path' is the remote file id with the group name stripped out.
    retData['path'] = remote_file_id.replace('group1', '')
    retData['full_path'] = remote_file_id
    retData['file_size'] = result['Uploaded size']
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return retData
def secrchATT(self,item_id,file_name,type_id,order_by): def secrchATT(self, item_id, retData, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s ''' sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id,order_by)) self.cursor_.execute(sel_sql, (item_id, retData['path'], type_id, order_by))
selects = self.cursor_.fetchone() selects = self.cursor_.fetchone()
return selects return selects
#插入到att表 返回附件id # 插入到att表 返回附件id
def tableUpdate(self,retData,com_name,file_name,num,pub_time): def tableUpdate(self, retData, com_name, file_name, num, publishDate):
item_id = retData['item_id'] item_id = retData['item_id']
type_id = retData['type_id'] type_id = retData['type_id']
group_name = retData['group_name'] group_name = retData['group_name']
...@@ -465,24 +550,22 @@ class BaseCore: ...@@ -465,24 +550,22 @@ class BaseCore:
create_time = retData['create_time'] create_time = retData['create_time']
order_by = num order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = ( values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by, status, create_by,
create_time,path,'zzsn',pub_time) create_time, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn', publishDate)
self.cursor_.execute(Upsql, values) # 插入 self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交 self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql)) self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,file_name,type_id,order_by) selects = self.secrchATT(item_id, retData, type_id, order_by)
id = selects[0] id = selects[0]
return id,full_path return id, full_path
# 获取文件大小 # 获取文件大小
def convert_size(self,size_bytes): def convert_size(self, size_bytes):
# 定义不同单位的转换值 # 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB'] units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0 i = 0
...@@ -491,7 +574,7 @@ class BaseCore: ...@@ -491,7 +574,7 @@ class BaseCore:
i += 1 i += 1
return f"{size_bytes:.2f} {units[i]}" return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(self,file_href,item_id,file_name): def uptoOBS(self, file_href, item_id, file_name):
headers = {} headers = {}
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
...@@ -508,30 +591,21 @@ class BaseCore: ...@@ -508,30 +591,21 @@ class BaseCore:
except: except:
time.sleep(3) time.sleep(3)
continue continue
# page_size = 0
for i in range(0, 3): for i in range(0, 3):
try: try:
# name = file_name name = str(self.getuuid()) + category
if category in file_name:
pass result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
else:
file_name = file_name + '.' + category
result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
break break
except: except:
time.sleep(3) time.sleep(3)
continue continue
else:
# if page_size < 1:
# # pdf解析失败
# # print(f'======pdf解析失败=====')
# return retData
# else:
try: try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1] retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl']) retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = self.convert_size(file_size) retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now retData['create_time'] = time_now
except Exception as e: except Exception as e:
...@@ -552,4 +626,3 @@ class BaseCore: ...@@ -552,4 +626,3 @@ class BaseCore:
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
请注册或者登录后发表评论