Commit 91d93313 Author: bruxellse_li

Platform model management

Parent 6291eec9
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="python@180.76.177.55:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.130.239:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.9.59:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="49">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="tqdm" />
<item index="2" class="java.lang.String" itemvalue="transformers" />
<item index="3" class="java.lang.String" itemvalue="sentencepiece" />
<item index="4" class="java.lang.String" itemvalue="keras" />
<item index="5" class="java.lang.String" itemvalue="gevent" />
<item index="6" class="java.lang.String" itemvalue="torch" />
<item index="7" class="java.lang.String" itemvalue="numpy" />
<item index="8" class="java.lang.String" itemvalue="Flask" />
<item index="9" class="java.lang.String" itemvalue="thulac" />
<item index="10" class="java.lang.String" itemvalue="beautifulsoup4" />
<item index="11" class="java.lang.String" itemvalue="fdfs_client" />
<item index="12" class="java.lang.String" itemvalue="pymysql" />
<item index="13" class="java.lang.String" itemvalue="selenium" />
<item index="14" class="java.lang.String" itemvalue="matplotlib" />
<item index="15" class="java.lang.String" itemvalue="pyecharts" />
<item index="16" class="java.lang.String" itemvalue="requests" />
<item index="17" class="java.lang.String" itemvalue="docx" />
<item index="18" class="java.lang.String" itemvalue="flask_sqlalchemy" />
<item index="19" class="java.lang.String" itemvalue="scikit_learn" />
<item index="20" class="java.lang.String" itemvalue="gensim" />
<item index="21" class="java.lang.String" itemvalue="sentence_transformers" />
<item index="22" class="java.lang.String" itemvalue="elasticsearch" />
<item index="23" class="java.lang.String" itemvalue="nltk" />
<item index="24" class="java.lang.String" itemvalue="symspellpy" />
<item index="25" class="java.lang.String" itemvalue="wordcloud" />
<item index="26" class="java.lang.String" itemvalue="concurrent_log_handler" />
<item index="27" class="java.lang.String" itemvalue="setuptools" />
<item index="28" class="java.lang.String" itemvalue="gunicorn" />
<item index="29" class="java.lang.String" itemvalue="jieba" />
<item index="30" class="java.lang.String" itemvalue="flask" />
<item index="31" class="java.lang.String" itemvalue="flak_cors" />
<item index="32" class="java.lang.String" itemvalue="paddle" />
<item index="33" class="java.lang.String" itemvalue="bert_serving" />
<item index="34" class="java.lang.String" itemvalue="certifi" />
<item index="35" class="java.lang.String" itemvalue="SQLAlchemy" />
<item index="36" class="java.lang.String" itemvalue="xlrd" />
<item index="37" class="java.lang.String" itemvalue="bert_serving_client" />
<item index="38" class="java.lang.String" itemvalue="pytime" />
<item index="39" class="java.lang.String" itemvalue="goose3" />
<item index="40" class="java.lang.String" itemvalue="Flask_Cors" />
<item index="41" class="java.lang.String" itemvalue="paddlepaddle" />
<item index="42" class="java.lang.String" itemvalue="trustai" />
<item index="43" class="java.lang.String" itemvalue="paddle_serving_client" />
<item index="44" class="java.lang.String" itemvalue="tritonclient" />
<item index="45" class="java.lang.String" itemvalue="paddle_serving_server" />
<item index="46" class="java.lang.String" itemvalue="paddlenlp" />
<item index="47" class="java.lang.String" itemvalue="openai" />
<item index="48" class="java.lang.String" itemvalue="feedparser" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Model-Management.iml" filepath="$PROJECT_DIR$/.idea/Model-Management.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Remote Python 3.9.5 (sftp://root@114.116.90.53:22/home/python/anaconda3/envs/JXYQ@py39/bin/python3.9)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="On explicit save action" serverName="FastText-Model" remoteFilesAllowedToDisappearOnAutoupload="false" autoUploadExternalChanges="true">
<serverData>
<paths name="FastText-Model">
<serverdata>
<mappings>
<mapping deploy="/" local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="python@180.76.177.55:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.130.239:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.9.59:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ON_EXPLICIT_SAVE" />
</component>
</project>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="49">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="tqdm" />
<item index="2" class="java.lang.String" itemvalue="transformers" />
<item index="3" class="java.lang.String" itemvalue="sentencepiece" />
<item index="4" class="java.lang.String" itemvalue="keras" />
<item index="5" class="java.lang.String" itemvalue="gevent" />
<item index="6" class="java.lang.String" itemvalue="torch" />
<item index="7" class="java.lang.String" itemvalue="numpy" />
<item index="8" class="java.lang.String" itemvalue="Flask" />
<item index="9" class="java.lang.String" itemvalue="thulac" />
<item index="10" class="java.lang.String" itemvalue="beautifulsoup4" />
<item index="11" class="java.lang.String" itemvalue="fdfs_client" />
<item index="12" class="java.lang.String" itemvalue="pymysql" />
<item index="13" class="java.lang.String" itemvalue="selenium" />
<item index="14" class="java.lang.String" itemvalue="matplotlib" />
<item index="15" class="java.lang.String" itemvalue="pyecharts" />
<item index="16" class="java.lang.String" itemvalue="requests" />
<item index="17" class="java.lang.String" itemvalue="docx" />
<item index="18" class="java.lang.String" itemvalue="flask_sqlalchemy" />
<item index="19" class="java.lang.String" itemvalue="scikit_learn" />
<item index="20" class="java.lang.String" itemvalue="gensim" />
<item index="21" class="java.lang.String" itemvalue="sentence_transformers" />
<item index="22" class="java.lang.String" itemvalue="elasticsearch" />
<item index="23" class="java.lang.String" itemvalue="nltk" />
<item index="24" class="java.lang.String" itemvalue="symspellpy" />
<item index="25" class="java.lang.String" itemvalue="wordcloud" />
<item index="26" class="java.lang.String" itemvalue="concurrent_log_handler" />
<item index="27" class="java.lang.String" itemvalue="setuptools" />
<item index="28" class="java.lang.String" itemvalue="gunicorn" />
<item index="29" class="java.lang.String" itemvalue="jieba" />
<item index="30" class="java.lang.String" itemvalue="flask" />
<item index="31" class="java.lang.String" itemvalue="flak_cors" />
<item index="32" class="java.lang.String" itemvalue="paddle" />
<item index="33" class="java.lang.String" itemvalue="bert_serving" />
<item index="34" class="java.lang.String" itemvalue="certifi" />
<item index="35" class="java.lang.String" itemvalue="SQLAlchemy" />
<item index="36" class="java.lang.String" itemvalue="xlrd" />
<item index="37" class="java.lang.String" itemvalue="bert_serving_client" />
<item index="38" class="java.lang.String" itemvalue="pytime" />
<item index="39" class="java.lang.String" itemvalue="goose3" />
<item index="40" class="java.lang.String" itemvalue="Flask_Cors" />
<item index="41" class="java.lang.String" itemvalue="paddlepaddle" />
<item index="42" class="java.lang.String" itemvalue="trustai" />
<item index="43" class="java.lang.String" itemvalue="paddle_serving_client" />
<item index="44" class="java.lang.String" itemvalue="tritonclient" />
<item index="45" class="java.lang.String" itemvalue="paddle_serving_server" />
<item index="46" class="java.lang.String" itemvalue="paddlenlp" />
<item index="47" class="java.lang.String" itemvalue="openai" />
<item index="48" class="java.lang.String" itemvalue="feedparser" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.9.5 (sftp://root@114.116.90.53:22/home/python/anaconda3/envs/JXYQ@py39/bin/python3.9)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/FastText-Model.iml" filepath="$PROJECT_DIR$/.idea/FastText-Model.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteMappingsManager">
<list>
<list>
<remote-mappings server-id="python@sftp://root@114.116.90.53:22/home/python/anaconda3/envs/JXYQ@py39/bin/python3.9">
<settings>
<list>
<mapping local-root="$PROJECT_DIR$" remote-root="/home/python/lzc/新平台模型管理/FastText-Model" />
</list>
</settings>
</remote-mappings>
</list>
</list>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="WebServers">
<option name="servers">
<webServer id="89b44d2f-6e3e-40a6-8aa0-e1bc3fbcfd0f" name="FastText-Model">
<fileTransfer rootFolder="/home/python/lzc/新平台模型管理/FastText-Model" accessType="SFTP" host="114.116.90.53" port="22" sshConfigId="c0166359-81ab-467c-838f-8c7ee48db0f2" sshConfig="root@114.116.90.53:22 password">
<advancedOptions>
<advancedOptions dataProtectionLevel="Private" passiveMode="true" shareSSLContext="true" />
</advancedOptions>
</fileTransfer>
</webServer>
</option>
</component>
</project>
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : Operation.py
# @Time : 2023/3/21 08:25
# @Author : bruxelles_li
# @Software: PyCharm
import os
import subprocess
import sys
from flask import Flask, request, jsonify
from pathlib import Path
import shutil
from shutil import Error
from tqdm import tqdm
import requests
import socket
import datetime
import pandas as pd
import glob
# Append the working path
sys.path.append('../')
from base.app.base_app import *
from File_Operation.smart_extractor import extract_by_html_test

# Define the operation prefix
operation_prefix = "/platform/operation/process"  # upload, delete, test, publish
operation_file = Blueprint(f'{operation_prefix}', __name__)
UPLOAD_FOLDER = r'../datasets/classification/FastText-Model/'  # upload path
Path(UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
file_types = ['xls', 'xlsx']

# Copy the parent process's environment variables
env = os.environ.copy()
env['PYTHONPATH'] = "/home/python/anaconda3/envs/JXYQ@py39/lib/python3.9/site-packages"
# Find the previous corpus folder and copy its spreadsheet files into the new directory
def find_dir(dir_path, folder_path):
    list_dir = os.listdir(dir_path)
    # Folder names look like "folder-<timestamp>"; sort by the timestamp after the 7-char prefix, newest first
    sorted_list = sorted(list_dir, key=lambda x: x[7:], reverse=True)
    # Take the second entry: the newest is the folder just created for this upload, so the second is the previous version
    source_path = os.path.join(dir_path, sorted_list[1])
    for file_name in os.listdir(source_path):
        file_path = os.path.join(source_path, file_name)
        # If it is an xls/xlsx file, copy it into the timestamp-based directory
        if file_name.endswith('.xlsx') or file_name.endswith('.xls'):
            shutil.copy(file_path, folder_path)
    print("success")
def check_port(port):
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('localhost', port))
print(f"Port {port} is available")
except socket.error as e:
print(f"Port {port} is already in use")
def get_available_port(start_port, end_port):
for port in range(start_port, end_port+1):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(('localhost', port))
return port
except socket.error as e:
continue
raise Exception("No available ports in the specified range")
def get_random_available_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('localhost', 0))
return s.getsockname()[1]
def all_exist(avalue, bvalue):
if all(any(x in y for y in bvalue) for x in avalue):
return "True"
return "False"
def judge_len(file):
if len(file) == 2 or len(file) == 4:
return "True"
return "False"
def merge_df(dataset_path):
    all_files = []
    for file_type in file_types:
        all_files.extend(glob.glob(os.path.join(dataset_path, f'*.{file_type}')))
    # Merge all files into a single DataFrame
    combined_df = pd.concat([pd.read_excel(f) for f in all_files], ignore_index=True)
    # Drop duplicate rows
    combined_df.drop_duplicates(keep='first', inplace=True)
    return combined_df
@operation_file.route('/show_file/', methods=['POST'])
def show():
try:
data = json.loads(request.data.decode('utf-8'))
file_path = data["file_path"]
file_list = os.listdir(file_path)
logger.info(file_list)
if file_list:
new_file_list = []
for file_name in file_list:
if "Origin-Model" in file_name:
continue
else:
new_file_list.append(file_name)
result = {
'handleMsg': 'Success',
'code': 200,
'logs': '处理成功!',
"resultData": ";".join(new_file_list)
}
app.logger.info(result)
return jsonify(result)
else:
result = {
'handleMsg': 'Success',
'code': 200,
'logs': '处理成功!',
"resultData": "当前查询的文件路径下内容为空!"
}
app.logger.info(result)
return jsonify(result)
except Exception as e:
# print(e)
result = {
'handleMsg': 'Failure',
'code': 500,
'logs': '处理失败!当前查询的文件路径不存在,请选择正确的路径参数后重新操作' + str(e),
"resultData": ""
}
app.logger.info(result)
return jsonify(result)
@operation_file.route('/remove_file/', methods=['POST'])
def remove():
try:
data = json.loads(request.data.decode('utf-8'))
file_path = data["file_path"]
flag = data["flag"]
if flag == "/":
os.remove(file_path)
else:
shutil.rmtree(file_path)
result = {
"code": 200,
'handleMsg': 'Success',
'resultData': '文件删除成功!',
'logs': None
}
app.logger.info(result)
return jsonify(result)
except Exception as e:
result = {
"code": 500,
'handleMsg': 'Failure',
'resultData': None,
'logs': '删除失败,当前文件不存在,请选择正确的文件路径参数后重新删除!' + str(e)
}
app.logger.info(result)
return jsonify(result)
# todo: corpus upload comes first
@operation_file.route('/upload_file/', methods=['GET', 'POST'])
def upload_file():
    try:
        # todo: download the file with requests; the request carries a task id and a url
        data = json.loads(request.data.decode('utf-8'))
        request_url = data["request_url"]  # e.g. http://114.115.215.96/group1/M00/01/A3/wKjIbGSFKouAPnsHAApkU0_Y0Bg21.xlsx
        task_id = data["task_id"]
        # First check whether a corpus path already exists for this task id; if it does, merge the new corpus with the previous version
        root_path = app.config['UPLOAD_FOLDER'] + task_id
        if os.path.exists(root_path):
            # The corpus path exists: create a new timestamped directory to hold the previous files plus the file about to be downloaded
            folder_name = "folder" + "-" + str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
            folder_path = app.config['UPLOAD_FOLDER'] + task_id + "/" + folder_name
            Path(folder_path).mkdir(parents=True, exist_ok=True)
            # Call find_dir to copy the previous version's files into the new directory
            find_dir(dir_path=root_path, folder_path=folder_path)
            # Then place the newly uploaded corpus into the same folder
            filename = request_url.split("/")[-1]
            save4path = os.path.join(folder_path, filename)
            # Download the file
            r = requests.get(request_url, stream=True)
            with open(save4path, "wb") as f:
                for chunk in r.iter_content(chunk_size=512):
                    f.write(chunk)
            # Merge the files with merge_df
            combined_df = merge_df(folder_path)
            print(f"Merged length: {len(combined_df)}")
            merge_filename = str(datetime.datetime.now().strftime('%Y%m%d%H%M%S')) + ".xlsx"
            combined_df.to_excel(os.path.join(folder_path, merge_filename), index=False)
            # Remove every other file
            for file_name in os.listdir(folder_path):
                if file_name != merge_filename:
                    os.remove(os.path.join(folder_path, file_name))
        else:
            # The corpus path does not exist: create the directory, then download the file
            folder_name = "folder" + "-" + str(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
            folder_path = app.config['UPLOAD_FOLDER'] + task_id + "/" + folder_name
            Path(folder_path).mkdir(parents=True, exist_ok=True)
            # Place the uploaded corpus into the folder
            filename = request_url.split("/")[-1]
            save4path = os.path.join(folder_path, filename)
            # Download the file
            r = requests.get(request_url, stream=True)
            with open(save4path, "wb") as f:
                for chunk in r.iter_content(chunk_size=512):
                    f.write(chunk)
result = {
"code": 200,
'handleMsg': 'Success',
'logs': '文件上传成功!',
"resultData": folder_name
}
app.logger.info(result)
        return jsonify(result)  # return the success message
except Error as e:
result = {
"code": 500,
'handleMsg': 'Failure',
'resultData': None,
'logs': '上传失败,当前上传的语料版本名称已经存在!' + str(e)
}
app.logger.info(result)
return jsonify(result)
except Exception as e1:
result = {
"code": 500,
'handleMsg': 'Failure',
'resultData': None,
'logs': '上传失败!' + str(e1)
}
app.logger.info(result)
return jsonify(result)
@operation_file.route('/publish_version/', methods=['POST'])
def publish():
    try:
        data = json.loads(request.data.decode('utf-8'))
        model_version = data['trainModelName']
        task_id = data["task_id"]
        classification = r'../../../model_saved/classification/FastText-Model/'
        model_path = classification + task_id + "/" + model_version
        micro_server_port = get_available_port(start_port=3000, end_port=3050)  # find a free port in the range 3000-3050
        print(micro_server_port)
        # subprocess.call(
        #     "python ../app/app_run.py -model_path {} -micro_server_port {}".format(model_path, int(micro_server_port)),
        #     shell=True)
        # subprocess.call(['python', '../app/app_run.py', '-model_path', model_path, '-micro_server_port', str(micro_server_port)],
        #                 env=env,
        #                 executable=sys.executable)
        cmd = ['python', '../app/app_run.py', '-model_path', model_path, '-micro_server_port', str(micro_server_port)]
        # Launch the child process in the background
        subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         stdin=subprocess.PIPE,
                         shell=False,
                         close_fds=True,
                         preexec_fn=os.setsid,
                         env=env,
                         executable=sys.executable
                         )
        config_json = json.load(open('../config.json', 'r', encoding='utf-8'))
        publish_url = 'http://{}:{}/platform/classification/FastText-Model/model_pred/'.format(config_json['ip'], micro_server_port)
        result = {
            'handleMsg': 'Success',
            'code': 200,
            'logs': None,
            "resultData": publish_url
        }
except Exception as e:
# print(e)
app.logger.info(e)
result = {
'handleMsg': 'failure',
'code': 500,
'logs': '模型发布失败,请检查参数后重新发布!' + str(e),
"resultData": None
}
app.logger.info(result)
return jsonify(result)
@operation_file.route('/model_test/', methods=['POST'])
def model_test():
    try:
        data = json.loads(request.data.decode('utf-8'))
        task_id = data['task_id']
        model_version = data['trainModelName']
        data_type = data['data_type']
        request_url = data['request_url']
        # Root path of the saved models
        classification = r'../../../model_saved/classification/FastText-Model/'
        # Path holding the model files that currently serve the application
        model_path = classification + "Origin-Model-2023_03_31-12_15_17"
        # Backup path for the serving model files; back them up only if no backup exists yet
        model_copy = classification + "copy_model"
        if not os.path.exists(model_copy):
            # No backup yet: create the backup path first
            Path(model_copy).mkdir(parents=True, exist_ok=True)
            # Move the previous model files into the backup path, leaving the serving folder empty
            for dirpath, dirnames, filenames in os.walk(model_path):
                for f0 in tqdm(filenames):
                    src_file = os.path.join(dirpath, f0)
                    shutil.move(src_file, model_copy)
        else:
            # A backup already exists: just delete the previous model files, leaving the serving folder empty
            for dirpath, dirnames, filenames in os.walk(model_path):
                for f0 in tqdm(filenames):
                    src_file = os.path.join(dirpath, f0)
                    os.remove(src_file)
        # The serving folder is now empty; build the path of the files produced by the training run under test
        src_path = classification + task_id + "/" + model_version
        # If the trained model files exist and have the expected types, copy them into the serving path
        file_type_list = ['bin', 'json']
        if os.path.exists(src_path):
            for dirpath1, dirnames1, filenames1 in os.walk(src_path):
                for f1 in tqdm(filenames1):
                    file_type = f1.split('.')[1]
                    temp_file = os.path.join(dirpath1, f1)
                    if file_type in file_type_list:
                        shutil.copy(temp_file, model_path)
            # Choose the parsing method according to the test mode
            if data_type.strip() == "url":
                try:
                    # Test: extract the page content from HTML
                    dict_parse = extract_by_html_test(request_url)
                    # Call the model-test endpoint of the serving environment to handle the data
                    url = "http://localhost:4005/platform/classification/FastText-Model/model_test/"
                    MODEL_PATH = model_path + "/" + "model.bin"
                    payload = json.dumps({
                        "model_path": MODEL_PATH,
                        "title": dict_parse["title"],
                        "content": dict_parse["content"]
                    })
                    headers = {
                        'Content-Type': 'application/json'
                    }
                    response = requests.request("POST", url, headers=headers, data=payload)
                    text = response.text.encode('utf-8')
                    obj = json.loads(text)
                    label = obj["result"]["label"]
                    result = {
                        'handleMsg': 'success',
                        'code': 200,
                        'logs': None,
                        "resultData": {
                            "title": dict_parse["title"],
                            "content": dict_parse["content"],
                            "label": label
                        }
                    }
                    # Test finished: empty the serving folder, then restore the backup, mirroring the file branch below
                    for dirpath, dirnames, filenames in os.walk(model_path):
                        for f0 in tqdm(filenames):
                            os.remove(os.path.join(dirpath, f0))
                    for dirpath, dirnames, filenames in os.walk(model_copy):
                        for f0 in tqdm(filenames):
                            shutil.copy(os.path.join(dirpath, f0), model_path)
                    # All cleanup done; return the model test result
                    app.logger.info(result)
                    return jsonify(result)
                except Exception as e:
                    # Test failed: first empty the serving folder
                    for dirpath, dirnames, filenames in os.walk(model_path):
                        for f0 in tqdm(filenames):
                            src_file = os.path.join(dirpath, f0)
                            os.remove(src_file)
                    # Then copy the backed-up files back into the serving folder
                    for dirpath, dirnames, filenames in os.walk(model_copy):
                        for f0 in tqdm(filenames):
                            src_file = os.path.join(dirpath, f0)
                            shutil.copy(src_file, model_path)
                    # All cleanup done; return the failure result
                    result = {
                        'handleMsg': 'failure',
                        'code': 500,
                        'logs': "智能解析url 网页内容失败,请重新选择测试内容" + str(e),
                        "resultData": None
                    }
                    app.logger.info(result)
                    return jsonify(result)
            else:
                # Download the file first, then parse its contents
                filename = request_url.split("/")[-1]
                save4path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
                # Download the file
                r = requests.get(request_url, stream=True)
                with open(save4path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=512):
                        f.write(chunk)
                data_df = pd.read_excel(save4path, keep_default_na=False).astype(str)
                list_one = []
                for idx, row in tqdm(data_df.iterrows()):
                    title = row["title"]
                    content = row["content"]
                    # Call the model-test endpoint of the serving environment to handle the data
                    url = "http://localhost:4005/platform/classification/FastText-Model/model_test/"
                    MODEL_PATH = model_path + "/" + "model.bin"
                    payload = json.dumps({
                        "model_path": MODEL_PATH,
                        "title": title,
                        "content": content
                    })
                    headers = {
                        'Content-Type': 'application/json'
                    }
                    try:
                        response = requests.request("POST", url, headers=headers, data=payload)
                        text = response.text.encode('utf-8')
                        obj = json.loads(text)
                        label = obj["result"]["label"]
                        list_one.append({
                            "title": title,
                            "content": content,
                            "label": label
                        })
                    except Exception:
                        continue
                result = {
                    'handleMsg': 'success',
                    'code': 200,
                    'logs': None,
                    "resultData": list_one
                }
                # Test finished: first empty the serving folder
                for dirpath, dirnames, filenames in os.walk(model_path):
                    for f0 in tqdm(filenames):
                        src_file = os.path.join(dirpath, f0)
                        os.remove(src_file)
                # Then copy the backed-up files back into the serving folder
                for dirpath, dirnames, filenames in os.walk(model_copy):
                    for f0 in tqdm(filenames):
                        src_file = os.path.join(dirpath, f0)
                        shutil.copy(src_file, model_path)
                # All cleanup done; return the model test result
                return jsonify(result)
else:
result = {
'handleMsg': 'failure',
'code': 500,
'logs': f'待测试模型版本-{model_version}不存在,请选择已有的模型版本进行测试!',
"resultData": None
}
app.logger.info(result)
return jsonify(result)
except Exception as e:
# print(e)
app.logger.info(e)
result = {
'handleMsg': 'failure',
'code': 500,
'logs': '模型测试失败,请检查参数后重新测试!' + str(e),
"resultData": None
}
app.logger.info(result)
return jsonify(result)
if __name__ == '__main__':
port = get_available_port(3000, 3050)
print(port)
# app.run(host=HOST, port=PORT, debug=DEBUG)
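# For reference, a minimal client sketch for the upload endpoint defined above.
# The host and port are assumptions (app_run.py listens on 4005 by default, and the
# blueprint is registered under /platform/operation/process in app_run.py); the
# task id and corpus URL below are hypothetical placeholders.
#
# import json
# import requests
#
# url = "http://localhost:4005/platform/operation/process/upload_file/"
# payload = json.dumps({"task_id": "demo-task",
#                       "request_url": "http://example.com/corpus.xlsx"})
# resp = requests.post(url, data=payload, headers={"Content-Type": "application/json"})
# print(resp.json())  # expected on success: {"code": 200, "handleMsg": "Success", ...}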
# -*- coding: utf-8 -*-
# Smart extraction request
# 1. Considered: when requesting smart extraction, stop using an entity class
#    a. Still in use: pass the HTML source directly in the raw HTTP request body, with lang-code and link-text as query parameters
#    b. Reason: this is awkward to test in postman; a pasted HTML source file cannot be used
# 2. Rejected: using an entity class, although the benefits outweigh the drawbacks
#    a. An entity class makes it easy to extend parameter fields
#    b. Easy to show in API docs: calling json_parameter_utility.get_json_parameters can display the request entity class
class ExtractionRequest:
    # Language code
    # 1. Needed when extracting non-Chinese articles
    lang_code = ""
    # Link text
    # 1. Used to extract the title; without it, title accuracy drops
    link_text = ""
    # Article page source
    # 1. Used to extract the title, publish date, content, etc.
    article_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        extraction_request = ExtractionRequest()
        # Possible approach:
        # 1. update the internal __dict__ with the dictionary
        # extraction_request.__dict__.update(dictionary)
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(extraction_request, key, dictionary[key])
        return extraction_request

    def to_dict(self):
        # Convert to a dictionary:
        # 1. Called when serializing to JSON
        # 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. Update the new dictionary with the internal __dict__
        data.update(self.__dict__)
        return data


# Extraction result
class ExtractionResult:
    # Title
    title = ""
    # Publish date
    publish_date = ""
    # Body text (keeps all HTML tags, e.g. br, img)
    text = ""
    # URL
    url = ""
    # Abstract
    meta_description = ""
    # Clean body text (no HTML)
    cleaned_text = ""
    # Source (currently only supported for the "source" field of Chinese sites)
    # source = ""
    # Top image (top_image: never extracts anything, no longer used)
    # top_image = ""

    def to_dict(self):
        # Convert to a dictionary:
        # 1. Called when serializing to JSON
        # 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. Update the new dictionary with the internal __dict__
        data.update(self.__dict__)
        return data


class UrlPickingRequest:
    # Response URL of the list page
    # 1. Used as the Base URL to join extracted relative URLs
    # 2. Base URL: must be the response URL
    # 3. Example: in Python, after requests.get(url), use resp.url as the Base URL
    list_page_resp_url = ""
    # List page source
    # 1. Used to extract article URLs
    list_page_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        url_picking_request = UrlPickingRequest()
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(url_picking_request, key, dictionary[key])
        return url_picking_request

    def to_dict(self):
        # Convert to a dictionary:
        # 1. Called when serializing to JSON
        # 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. Update the new dictionary with the internal __dict__
        data.update(self.__dict__)
        return data
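# A minimal round-trip sketch for the entity classes above (all values hypothetical):
#
# import json
#
# req = ExtractionRequest.from_dict({"lang_code": "cn", "link_text": "示例标题"})
# result = ExtractionResult()
# result.title = req.link_text
# # Serialize with the custom to_dict serializer, as noted in the comments above:
# print(json.dumps(result, default=ExtractionResult.to_dict, ensure_ascii=False))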
# -*- coding: utf-8 -*-
import requests, sys
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
# Append the working path
sys.path.append('../')
from File_Operation.entity import *
from File_Operation.smart_extractor_utility import SmartExtractorUtility
# goose3 bundles lxml; the IDE reports etree as missing here, but it still works
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractor:
    @staticmethod
    def get_supported_lang_code_dict():
        """
        Supported languages:
        1. Need a tokenizer, passed explicitly (3 cases):
           a. Chinese, Korean, Arabic
        2. No tokenizer needed; pass the language code directly (16 languages)
           a. English and Russian are tested separately
        """
        supported_lang_code_dict = {
            'cn': '中文',  # Chinese
            'zh-cn': '简体中文',  # Simplified Chinese
            'ko': '韩语',  # Korean
            'ar': '阿拉伯语',  # Arabic
            'en': '英语',  # English
            'ru': '俄语',  # Russian
            'da': '丹麦语',  # Danish
            'de': '德语',  # German
            'es': '西班牙语',  # Spanish
            'fi': '芬兰语',  # Finnish
            'fr': '法语',  # French
            'hu': '匈牙利语',  # Hungarian
            'id': '印度尼西亚语',  # Indonesian
            'it': '意大利语',  # Italian
            'nb': '挪威语(伯克梅尔)',  # Norwegian (Bokmål)
            'nl': '荷兰语',  # Dutch
            'no': '挪威文(耐诺斯克)',  # Norwegian (Nynorsk)
            'pl': '波兰语',  # Polish
            'pt': '葡萄牙语',  # Portuguese
            'sv': '瑞典语',  # Swedish
        }
        return supported_lang_code_dict
    def __init__(self, lang_code='cn'):
        """
        Constructor: defaults to cn when lang_code is not specified
        """
        # Supported languages
        supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
        # Initialize the goose object:
        # 1. Create the goose object according to the language code
        if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn':
            # Tokenizer needed: Chinese
            # 1. When lang_code is omitted or None, default to Chinese tokenization
            # 2. Flask web interface: when the GET parameter lang_code is missing, lang_code arrives as None
            self.goose = Goose({'stopwords_class': StopWordsChinese})
        elif lang_code == 'ko':
            # Tokenizer needed: Korean
            # 1. Tried passing only the language, without a tokenizer:
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'})  # test failed: empty body
            # self.goose = Goose()  # test failed: empty body
            # Korean tokenizer: test succeeded
            self.goose = Goose({'stopwords_class': StopWordsKorean})
        elif lang_code == 'ar':
            # Tokenizer needed: Arabic
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})  # test failed: empty body
            # self.goose = Goose()  # test succeeded
            # self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})  # test succeeded: pass the language code directly
            self.goose = Goose({'stopwords_class': StopWordsArabic})
        elif lang_code == 'en':
            # Tested separately: English
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
            # Test succeeded: a Goose object without a language defaults to English tokenization
            self.goose = Goose()
        elif lang_code == 'ru':
            # Tested separately: Russian
            # self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})  # test failed: empty body
            self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})  # test succeeded: pass the language code directly
        elif lang_code in supported_lang_code_list:
            # Other language codes are handled uniformly and not tested individually
            self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
        else:
            # Unrecognized language code
            raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')
    def get_extraction_result(self, article, link_text=''):
        """
        Build the extraction result:
        1. Collect data from the article object and wrap it in an ExtractionResult
        """
        # Holds the extracted text
        extraction_result = ExtractionResult()
        # Title
        # extraction_result.title = article.title  # old approach: use the title goose extracted from the title tag
        extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
        # Publish date
        extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
        # Body text (keeps all HTML tags, e.g. br, img)
        extraction_result.text = SmartExtractorUtility.get_article_text(article)
        # URL
        extraction_result.url = article.final_url
        # Abstract
        extraction_result.meta_description = article.meta_description
        # Clean body text (no HTML)
        extraction_result.cleaned_text = article.cleaned_text
        # Source (currently only supported for the "source" field of Chinese sites)
        extraction_result.source = ''
        return extraction_result
    def extract_by_url(self, url, link_text=''):
        """
        Extract content by URL
        """
        # Extract the body: pass the url
        article = self.goose.extract(url=url)
        # article = goose.extract(raw_html=html)
        return self.get_extraction_result(article, link_text)

    def extract_by_html(self, html, link_text=''):
        """
        Extract content by HTML
        """
        # Extract the body: pass the html
        article = self.goose.extract(raw_html=html)
        return self.get_extraction_result(article, link_text)
def extract_by_url_test(url: str, lang_code: str):
    # Test: extract by URL
    # url_list = [
    #     # "http://www.news.cn/politics/2022-07/31/c_1128879636.htm",  # short text
    #     # "https://baijiahao.baidu.com/s?id=1741311527693101670",  # several images
    #     # "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml",  # several images plus a video (content XPath test failed)
    #     # "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html",  # people.com.cn
    #     # Korean: JoongAng Ilbo - politics
    #     # "https://www.joongang.co.kr/article/25094974",
    #     # "https://www.joongang.co.kr/article/25094967",
    #     # English: Kathmandu Post - national-security
    #     # "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
    #     # "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders",  # tests publish-date extraction
    #     # Russian: Belarus Today - world
    #     # "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
    #     # 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
    #     # Arabic
    #     # "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
    #     # "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
    #     # Title-extraction tests
    #     # "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
    #     # "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
    #     # "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html",  # title extracted as empty
    #     # 'http://www.crfeb.com.cn/1j/_124/2005409/index.html',  # content extraction failed
    #     # 'http://www.crfeb.com.cn/1j/_124/912248/index.html',  # content extraction failed
    #     # 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html',  # CRCC work updates (wrong date extracted)
    #     # 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html',  # CCECC, several sections (wrong date extracted)
    #     # 'http://v.people.cn/n1/2022/0901/c444662-32517559.html',  # people.cn video: the title must start with the element title; "contains" is not enough
    #     # 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html',  # CHEC company news (title extraction failed)
    #     # 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html',  # CSCEC key news (title extraction failed)
    #     # 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html',  # CRBC, several sections (title extraction failed)
    #     # 'http://www.cgcoc.com.cn/news/432.html',  # CGCOC news center (title and content extraction failed)
    #     # 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html'  # MCC (test: body extraction failed)
    #     # 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html',  # PowerChina (test: title and body extraction failed)
    #     # PowerChina (title extraction failed): compared with the list link text and the title tag, the element title "秉承丝路精髓 抒写锦绣华章" has an extra space in the middle
    #     # 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html',  # title extraction failed: looks fine on inspection
    #     'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html',  # CSCEC company updates: wrong date extracted (today's date)
    # ]
    # Language codes
    # lang_code = 'cn'
    # lang_code = 'ko'
    # lang_code = 'en'
    # lang_code = 'ru'
    # lang_code = 'ar'
    print("-" * 100)
    print('Request URL:', url)
    extraction_result = SmartExtractor(lang_code).extract_by_url(url)
    # todo: return the content
    dict_parse = {
        "title": extraction_result.title,
        "publistDate": extraction_result.publish_date,
        "content": extraction_result.cleaned_text
    }
    return dict_parse
    # for url in url_list:
    #     print("-" * 100)
    #     print('Request URL:', url)
    #     extraction_result = SmartExtractor(lang_code).extract_by_url(url)
    #
    #     # Test JSON conversion
    #     # 1. Converting directly raises: TypeError: Object of type ExtractionResult is not JSON serializable
    #     # print(json.dumps(extraction_result))
    #     # print(json.dumps(extraction_result, default=ExtractionResult.to_dict))  # works: custom serializer
    #     # print(type(json.dumps(extraction_result.to_dict())))  # returns <class 'str'>; Chinese in the content gets escaped
    #     # print(str(extraction_result.to_dict()))  # converting straight to a string leaves Chinese unescaped
    #
    #     # Print the test result
    #     print_extraction_result(extraction_result)
def extract_by_html_test(url):
    # Test: extract from HTML
    html = '''
        <html>
        <head>
            <title>标题</title>
        </head>
        <body>
            <div>标题</div>
            <div>内容</div>
        </body>
        </html>
    '''
    # Test: fetch the full html by requesting the URL
    # url = "http://www.news.cn/politics/2022-07/31/c_1128879636.htm"  # test succeeded
    # url = "http://views.ce.cn/view/ent/202208/15/t20220815_37961634.shtml"  # 1. test failed: lxml.etree.ParserError: Document is empty
    # url = 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html'  # CRCC work updates (wrong date extracted)
    # url = 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html'  # CCECC, several sections (wrong date extracted)
    print()
    print("-" * 100)
    print('Request URL:', url)
    html = requests.get(url).text
    # Language code
    lang_code = 'cn'
    # Extract the content
    extraction_result = SmartExtractor(lang_code).extract_by_html(html)
    # todo: return the content
    dict_parse = {
        "title": extraction_result.title,
        # "publistDate": extraction_result.publish_date,
        "content": extraction_result.cleaned_text
    }
    # Print the test result
    # print_extraction_result(extraction_result)
    return dict_parse
def print_extraction_result(extraction_result):
    # Print the test result
    print("Title:", extraction_result.title)
    print("Publish date:", extraction_result.publish_date)
    print("Body:", extraction_result.text)
    print("URL:", extraction_result.url)
    print("Abstract:", extraction_result.meta_description)
    print("Clean body:", extraction_result.cleaned_text)
if __name__ == '__main__':
    try:
        # # Test: extract by URL
        # print(extract_by_url_test("http://www.gov.cn/zhengce/zhengceku/2008-03/28/content_6253.htm"))
        # Test: extract from HTML
        dict_parse = extract_by_html_test("https://www.msn.cn/zh-cn/news/other/%E6%9B%BE%E7%BB%8F%E4%B8%91%E5%88%B0%E7%94%B7%E4%B8%BB%E9%80%83%E8%B7%91-%E5%A6%82%E4%BB%8A%E9%80%86%E8%A2%AD%E6%88%90%E6%83%B9%E7%9C%BC%E8%BE%A3%E5%A6%B9-%E6%9C%80%E4%B8%91%E5%A5%B3%E5%9B%A2-cindy%E5%AE%8C%E6%88%90/ar-AA1cmel9?ocid=msedgntp&cvid=2952893909c64335846c8f7d0d608e48&ei=5")
        print(dict_parse)
    except Exception as e:
        print("Extraction failed:", e)
import re

from goose3.article import Article
from lxml import etree
from lxml.html import HtmlElement


class SmartExtractorUtility:
    # Minimum title length
    title_min_len = 6
    @staticmethod
    def extract_publish_date(html):
        pattern_list = [
            # 2010-10-1 8:00:00
            r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
            # 2010-10-1 8:00
            r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
            # 2010年10月1日 8:00:00
            r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
            # 2010年10月1日 8:00
            r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
            # 2010/10/1 8:00:00
            r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
            # 2010/10/1 8:00
            r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
            # 2010-10-1
            r"20\d{2}-\d{1,2}-\d{1,2}",
            # 2010年10月1日
            r"20\d{2}年\d{1,2}月\d{1,2}日",
            # 2010/10/1
            r"20\d{2}/\d{1,2}/\d{1,2}",
            # 2022.08.28
            r"20\d{2}\.\d{1,2}\.\d{1,2}",
            # 12-07-02 10:10
            r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
            # "1月前" (one month ago)
            r"\d+(&nbsp;| )*月前",
            # "12天前" (12 days ago)
            r"\d+(&nbsp;| )*天前",
            # "2小时前" (2 hours ago)
            r"\d+(&nbsp;| )*小时前",
            # "15分钟前" (15 minutes ago)
            r"\d+(&nbsp;| )*分钟前",
            # "昨天&nbsp;17:59" (yesterday 17:59)
            r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
        ]
        # Try every pattern in turn
        for pattern in pattern_list:
            # Extract visible dates only:
            # 1. The date must be inside a tag's text, never in an HTML tag attribute
            # 2. Rule: it must sit between > and <, with no further > in between
            tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
            # Search for the first match
            match = re.search(tag_pattern, html)
            # On a match, return the publish date
            if match:
                return match.group('date')
        # Every pattern failed; return an empty string
        return ""
    @staticmethod
    def add_html_br(cleaned_text):
        # Wrap with HTML line breaks
        # 1. Replace double newlines first: the cleaned_text goose produces always uses double newlines
        cleaned_text = cleaned_text.replace("\n\n", "<br>")
        cleaned_text = cleaned_text.replace("\n", "<br>")
        return cleaned_text
    @staticmethod
    def get_article_title(article: Article, link_text=''):
        # Prefer titles found in h1, div, span or td elements
        # 1. Test task: SASAC - news releases
        #    a. Original title tag: 中国能建:聚焦价值创造 打造国企改革发展“红色引擎”-国务院国有资产监督管理委员会
        #    b. Title in the div element: 中国能建:聚焦价值创造 打造国企改革发展“红色引擎”
        # 2. Test task: National Forestry and Grassland Administration - local updates
        #    a. Original title tag: 上海完成森林资源年度监测遥感解译图斑市级质量检查_地方动态_国家林业和草原局政府网
        #    b. Title in the span element: 上海完成森林资源年度监测遥感解译图斑市级质量检查
        #
        # When querying title elements by xpath:
        # 1. Tag priority: h1, special elements (id or class containing "title"), h2, h3, div, span, td
        title_element_list = [
            'h1',
            'h2',
            'h3',
            'div',
            'span',
            'td',
            'p',
        ]
        # Strip spaces before comparing titles (2022-09-21):
        # 1. Test task: Belt and Road / PowerChina (title extraction failed)
        # 2. Compared with the list link text and the title tag content, the element title "秉承丝路精髓 抒写锦绣华章" has an extra space in the middle
        link_text = link_text.replace(" ", "")
        tag_title = article.title.replace(" ", "")
        for title_element in title_element_list:
            element_list = article.raw_doc.getroottree().xpath(f'//{title_element}')
            # XPath query succeeded; walk every element
            for element in element_list:
                # Take the plain-text content, including child elements
                text = etree.tounicode(element, method='text').strip()
                text_no_space = text.replace(" ", "")
                # Judging the title:
                # 1. If the extracted title starts with the element's content, take the element's content
                # 2. On success, return text as the title; otherwise continue the loop
                # Why "starts with" rather than "contains":
                # 1. The title must start with the element title; "contains" is not enough
                # 2. Test URL: http://v.people.cn/n1/2022/0901/c444662-32517559.html
                # 3. Title tag: <title>亿缕阳光丨小生意,大格局--人民视频--人民网</title>
                #    a. Judging by "contains" would extract: 人民网
                #    b. Because of the element: <a href="http://www.people.com.cn/" class="clink">人民网</a>
                #    c. Judging by "starts with" extracts: 亿缕阳光丨小生意,大格局
                #    d. Title element: <h2>亿缕阳光丨小生意,大格局</h2>
                # New scheme:
                # 1. For the common elements, still judge by "starts with"
                # 2. Compare against the link text first, then the title element
                # 3. Require a minimum length: 6 characters
                # New scheme (2022-09-21):
                # 1. When comparing the link text and title element, allow matching at the end as well as the start
                # 2. Test task: Belt and Road / PowerChina (title extraction failed)
                #    a. Link text in the list: 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电...
                #    b. Title tag content: <title>中国电力建设集团 公司要闻 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠</title>
                #    c. Element title: 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠
                if text_no_space is not None and text_no_space != '' and len(
                        text_no_space) >= SmartExtractorUtility.title_min_len:
                    # Check the 6-character minimum first, to rule out short text elements and ease debugging
                    if link_text.startswith(text_no_space) or link_text.endswith(text_no_space) or tag_title.startswith(
                            text_no_space) or tag_title.endswith(text_no_space):
                        # Return the title with its original spaces intact
                        return text
        # Lookup failed; the old fallback returned the extracted title attribute:
        # return article.title
        # New decision: return an empty string when title extraction fails
        # 1. Reason: article.title is unreliable; it merely copies the content of the title tag
        return ''
    @staticmethod
    def get_publish_date(article: Article):
        # Prefer the regex-based date extraction
        # 1. Test task: Kathmandu Post - national-security
        #    a. Extracting an English date via publish_datetime_utc went wrong
        #    b. Actual date: Friday, August 19, 2022, but it extracted 2015-02-05
        #    c. Cause: a JSON fragment in the page's JS: "datePublished": "2015-02-05T08:00:00+08:00"
        # 2. Note: Chinese sites must always use the regex
        publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
        if publish_date != '':
            return publish_date
        else:
            if article.publish_datetime_utc:
                # Prefer the successfully extracted datetime
                return article.publish_datetime_utc.strftime('%Y-%m-%d')
            elif article.publish_date:
                # Then the extracted date string
                return article.publish_date
            else:
                # Everything failed; return an empty string
                return ''
    @staticmethod
    def get_article_text(article: Article):
        # First approach: add br tags on top of the plain text (cleaned_text)
        # 1. Drawback: loses images and the original p tags (only br can stand in)
        # text = SmartExtractor.add_html_br(article.cleaned_text)

        # Second approach: take the HTML of top_node directly
        # 1. Advantage: keeps the original p tags, etc.
        # 2. Drawback: loses images; img tags are not preserved
        # text = etree.tounicode(article.top_node, method='html')

        # Third approach: get the xpath of top_node, then query the original doc with it
        # 1. Works: querying the original doc yields the full HTML of the body
        # 2. Problem: the xpath obtained from top_node can be off by one element
        #    a. Test URL: https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml
        #    b. Obtained xpath: /html/body/div/div[1]/div[2]/div[4]
        #    c. Actual xpath: /html/body/div/div[1]/div[2]/div[5]
        # 3. Fix:
        #    a. Query by id or class first; only fall back to the top_node xpath when both are absent
        xpath = None
        if type(article.top_node) is HtmlElement:
            if 'id' in article.top_node.attrib:
                xpath = "//*[@id='{}']".format(article.top_node.attrib['id'])
            elif 'class' in article.top_node.attrib:
                xpath = "//*[@class='{}']".format(article.top_node.attrib['class'])
            else:
                xpath = article.top_node.getroottree().getpath(article.top_node)
        else:
            # article.top_node is sometimes empty:
            # 1. Test URL: https://baijiahao.baidu.com/s?id=1741311527693101670
            # 2. Log output: article.top_node is not an HtmlElement: None
            print("SmartExtractor: article.top_node is {}, not an HtmlElement.".format(article.top_node))
            # When article.top_node is empty, fall back to cleaned_text:
            # 1. Add br tags on top of the plain text (cleaned_text)
            text = SmartExtractorUtility.add_html_br(article.cleaned_text)
            return text
        # Query elements by xpath
        element_list = article.raw_doc.getroottree().xpath(xpath)
        if element_list:
            # XPath query succeeded; take the first element's HTML
            text = etree.tounicode(element_list[0], method='html')
        else:
            # XPath query failed; return the original HTML of top_node
            # 1. Drawback: loses images; img tags are not preserved
            text = etree.tounicode(article.top_node, method='html')
        return text
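# A quick sanity check for the date patterns above (hypothetical snippet):
#
# if __name__ == '__main__':
#     sample = '<div class="info">发布时间:2023-03-21 08:25:00</div>'
#     print(SmartExtractorUtility.extract_publish_date(sample))  # -> 2023-03-21 08:25:00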
# FastText-Model

#### Introduction
Model repository for the new platform's NLP algorithm group

#### Installation
1. Pin the Python version of the conda environment
2. Install the packages in requirement.txt
3. Alternatively, use a runtime environment created on the host machine beforehand

#### Usage
1. xxxx
2. xxxx
3. xxxx
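A minimal, hypothetical smoke test against the prediction endpoint exposed by app_run.py (the host, port and the record below are placeholders):

```python
import json
import requests

url = "http://localhost:4005/platform/classification/FastText-Model/model_pred/"
payload = json.dumps([
    {"id": "1", "title": "示例标题", "content": "示例正文"}  # hypothetical record
])
resp = requests.post(url, data=payload, headers={"Content-Type": "application/json"})
print(resp.json())  # expected shape: {"code": 200, "message": "操作成功", "result": [{"id": "1", "labels": "..."}]}
```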
#### Contributing
1. Fork this repository
2. Create a Feat_xxx branch
3. Commit your code
4. Open a Pull Request
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 10:21
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : app_config.py
# @Time : 2023/4/1 10:31
# @Author : bruxelles_li
# @Software: PyCharm
import os
import multiprocessing
from pathlib import Path

bind = '0.0.0.0:4005'  # bound ip and port
backlog = 512  # listen queue
# chdir = '/home/zzsn/liuyan/bin'  # working directory gunicorn switches to
timeout = 300  # timeout -> temporarily relaxed to accommodate the ZZSN_NLP platform's Belt-and-Road element extraction (file) workload
# worker_class = 'gevent'  # use gevent mode; sync mode is also available and is the default
# workers = multiprocessing.cpu_count()  # number of worker processes, 12
workers = 1  # low-resource (13G) server: set this to 1 when the load is too high
threads = 50  # number of threads per worker
loglevel = 'error'  # log level; this only sets the error log level, the access log level cannot be set
access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"'  # gunicorn access log format; the error log format cannot be set
"""
其每个选项的含义如下:
h remote address
l '-'
u currently '-', may be user name in future releases
t date of the request
r status line (e.g. ``GET / HTTP/1.1``)
s status
b response length or '-'
f referer
a user agent
T request time in seconds
D request time in microseconds
L request time in decimal seconds
p process ID
"""
_tmp_path = os.path.dirname(os.path.abspath(__file__))
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
accesslog = os.path.join(_tmp_path, 'gunicorn_access.log')  # access log file
errorlog = os.path.join(_tmp_path, 'gunicorn_error.log')  # error log file
# Launch with: gunicorn -c app_config.py app_run:app -D --daemon
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : app_run.py
# @Time : 2023/3/31 10:31
# @Author : bruxelles_li
# @Software: PyCharm
import json
import os
import sys
import logging
import requests
import argparse
import queue
from pathlib import Path
from flask import Flask, jsonify, request
import re
# Model training service
sys.path.append('../')
# Close redundant connections
s = requests.session()
s.keep_alive = False
from classification.config.config_fast_text import FastTextConfig
from classification.runner.runner_fast_text import FastTextRunner

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
                                               'message)s')
logger = logging.getLogger(__name__)

# queue.Queue is a basic FIFO queue: first in, first out
# maxsize caps the number of queued items; <= 0 means unbounded, otherwise puts block until items are consumed
q = queue.Queue(maxsize=0)

# Training config file
train_config_path = '../classification/config/fasttext_config_train.yml'
# Serving config file
pred_config_path = '../classification/config/fasttext_config_pred.yml'

# Close redundant connections
s = requests.session()
s.keep_alive = False

UPLOAD_FOLDER = r'../datasets/Receive_File'  # upload path
Path(UPLOAD_FOLDER).mkdir(parents=True, exist_ok=True)
TEMPFILE_FOLDER = UPLOAD_FOLDER + "/" + "Temp_file"
Path(TEMPFILE_FOLDER).mkdir(parents=True, exist_ok=True)
ALLOWED_EXTENSIONS = set(['xls', 'xlsx'])  # allowed upload file types

app = Flask(__name__)
# Temporary paths for uploaded files
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['TEMPFILE_FOLDER'] = TEMPFILE_FOLDER
from base.app.base_app import *
# Companion services for model training
from File_Operation.Operation import operation_file
# This part changes per deployment
# from classification.app.app_ssyw_column_classify import classification_ssyw_column_classify

# operation_file: companion services for model training
operation_prefix = "/platform/operation/process"  # upload, delete, test, publish
app.register_blueprint(operation_file, url_prefix="{}".format(operation_prefix))

# classification: training endpoint parameters
train_url = "/platform/classification/FastText-Model/model_train/"
model_name = "FastText-Model"
# Serving url
application_url = "/model_pred/"
# Testing url
model_test_url = "/model_test/"

# CORS support
from flask_cors import CORS
CORS(app, supports_credentials=True)
@app.route('/', methods=['POST'])
def hello_world():
app.logger.info('请选择正确的方式上传!')
return '请选择正确的方式上传!'
@app.route('/subject_consumer', methods=['GET', 'POST'])
def subject_consumer():
if not q.empty():
config_info = q.get()
return jsonify(message='当前队列数量:' + str(q.qsize()),
queue_left_number=str(q.qsize()),
data=config_info)
else:
return jsonify(message='队列为空!', queue_left_number=0)
@app.route('/queue_size', methods=['GET', 'POST'])
def queue_size():
return jsonify(queue_left_number=q.qsize())
@app.route("/platform/classification/FastText-Model/model_train/", methods=['POST'])
def model_train():
"""
{ 'reg_lambda': 1,
'scale_pos_weight': 1,
'reg_alpha': 1,
'modelProcessId': '1453295295008211969',
'learning_rate': '0.02',
'gpu': False,
'min_child_weight': 1,
'train': 'http://39.105.62.235:7000/br/classification/project_info/filter/train',
'data_path': '/datasets/the_belt_and_road/classification/pro_info_filter'
}
-> data:
:return:
"""
    try:
        data = json.loads(request.data.decode('utf-8'))
        modelProcessId = data['modelProcessId']
        task_id = data["task_id"]
        learning_rate = data['learning_rate'] if 'learning_rate' in data else 0.02
        epoch = data['epoch'] if "epoch" in data else 5
        gpu = data['gpu'] if 'gpu' in data else None
        # Names of the corpus version and the model version
        data_path_0 = data['data_path']
        model_path_0 = data['model_path']
        data_path_1 = "/" + task_id + "/" + data_path_0.strip('/')
        model_path_1 = '/' + task_id + "/" + model_path_0.strip('/')
        # Load the config file to get the corpus and model storage paths
        _config = FastTextConfig(config_path=train_config_path).load_config()
        # todo: check the dataset first; two cases to handle
        data_temp_path = _config.data.path0 % data_path_1
        app.logger.info(data_temp_path)
        # todo: then check the model path; if the model version already exists, report it
        temp_path = _config.learn.dir.saved % model_path_1
        app.logger.info(temp_path)
        if os.path.exists(data_temp_path):
            pass
        else:
            dict_result = {
                'code': 500,
                'isHandleSuccess': False,
                'logs': '模型训练失败!当前模型训练的语料文件不存在,请上传语料后再进行训练',
                'result': None
            }
            app.logger.info(dict_result)
            return json.dumps(dict_result, ensure_ascii=False)
        # todo: the model-version check is always needed; a worker thread handles the rest
        if not os.path.exists(temp_path):
            model_path = model_path_1
        else:
            dict_result = {
                'code': 500,
                'isHandleSuccess': False,
                'logs': '模型训练失败!当前模型版本已经存在,请更改模型版本号再重新进行训练',
                'result': None
            }
            app.logger.info(dict_result)
            return json.dumps(dict_result, ensure_ascii=False)
        # Model save version and dataset look fine; queue the training job
        VER = data_path_1
        root_dataset = data_temp_path
        app.logger.info(root_dataset)
        config_info = {
            "modelProcessId": modelProcessId,
            "data_path": VER,
            "model_path": model_path,
            'root_dataset': root_dataset
        }
        q.put(config_info)
        app.logger.info(config_info)
        dict_result = {
            'code': 200,
            'isHandleSuccess': True,
            'logs': '模型训练中 ...',
            'result': None
        }
except Exception as e:
dict_result = {
'code': 500,
'isHandleSuccess': False,
'logs': '训练失败!' + str(e),
'result': None
}
app.logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route("/platform/classification/FastText-Model/model_test/", methods=['POST'])
def model_test():
"""
{
'threshold': 0.5,
'model_path': '/zzsn_nlp_br/classification/model/model_saved',
'url': 'http://39.105.62.235:7000/br/classification/project_info/filter/model_test'
}
-> data:
:return:
"""
try:
data = json.loads(request.data.decode('utf-8'))
title = data['title'] if 'title' in data else None
content = data['content'] if 'content' in data else None
model_path = data['model_path'] if 'model_path' in data else None
runner_test = FastTextRunner(config_path=pred_config_path, model_path=model_path)
dict_result = runner_test.test(title=title, content=content)
if dict_result['code'] != 200:
dict_result['logs'] = '模型测试失败!' + dict_result['logs']
except Exception as e:
dict_result = {
'handleMsg': 'failure',
'code': 500,
'logs': '模型测试失败!' + str(e),
'resultData': None
}
app.logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route("/platform/classification/FastText-Model/model_pred/", methods=['POST'])
def model_pred():
try:
data_list = json.loads(request.data.decode('utf-8'))
result_one = []
for data in data_list:
title = data['title'] if 'title' in data else None
content = data['content'] if 'content' in data else None
infoId = data["id"] if "id" in data else None
level2 = ssyw_runner.pred(
title=title,
content=content
).strip()
result_one.append({
"id": infoId,
'labels': level2
})
dict_result = {
'code': 200,
'message': "操作成功",
'result': result_one
}
except Exception as e:
dict_result = {
'code': 500,
'success': 'false',
'message': "操作失败" + str(e),
'result': None
}
app.logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='')
parser.add_argument('-port', dest='port', help='', default=4005)
parser.add_argument('-host', dest='host', help='', default='0.0.0.0')
    # Microservice parameters
parser.add_argument('-model_path', dest='model_path', help='', default='')
parser.add_argument('-micro_server_port', dest='micro_server_port', help='', default=None)
args = parser.parse_args()
if args.model_path and args.micro_server_port:
model_path = os.path.join(args.model_path, "model.bin")
ssyw_runner = FastTextRunner(config_path=pred_config_path, model_path=model_path)
app.run(host=args.host,
port=int(args.micro_server_port)
)
else:
ssyw_runner = FastTextRunner(config_path=pred_config_path)
app.run(host=args.host,
port=int(args.port)
)
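# A minimal, hypothetical training request against the model_train endpoint above
# (ids and version names are placeholders; data_path / model_path select the corpus
# and model versions under the configured task directory):
#
# import json
# import requests
#
# url = "http://localhost:4005/platform/classification/FastText-Model/model_train/"
# payload = json.dumps({
#     "modelProcessId": "1453295295008211969",
#     "task_id": "demo-task",
#     "data_path": "corpus-v1",
#     "model_path": "model-v1"
# })
# print(requests.post(url, data=payload).json())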
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 资源检测程序.py
# @Time : 2022/9/30 10:39
# @Author : bruxelles_li
# @Software: PyCharm
import logging
import os, time, re, subprocess


# Get the CPU load
def get_cpu():
    last_worktime = 0
    last_idletime = 0
    f = open("/proc/stat", "r")
    line = ""
    while not "cpu " in line:
        line = f.readline()
    f.close()
    spl = line.split(" ")
    worktime = int(spl[2]) + int(spl[3]) + int(spl[4])
    idletime = int(spl[5])
    dworktime = (worktime - last_worktime)
    didletime = (idletime - last_idletime)
    rate = float(dworktime) / (didletime + dworktime)
    last_worktime = worktime
    last_idletime = idletime
    if (last_worktime == 0): return 0
    return rate
# Get memory usage
def get_mem_usage_percent():
    try:
        f = open('/proc/meminfo', 'r')
        for line in f:
            if line.startswith('MemTotal:'):
                mem_total = int(line.split()[1])
            elif line.startswith('MemFree:'):
                mem_free = int(line.split()[1])
            elif line.startswith('Buffers:'):
                mem_buffer = int(line.split()[1])
            elif line.startswith('Cached:'):
                mem_cache = int(line.split()[1])
            elif line.startswith('SwapTotal:'):
                vmem_total = int(line.split()[1])
            elif line.startswith('SwapFree:'):
                vmem_free = int(line.split()[1])
            else:
                continue
        f.close()
    except Exception:
        return None
    physical_percent = usage_percent(mem_total - (mem_free + mem_buffer + mem_cache), mem_total)
    virtual_percent = 0
    if vmem_total > 0:
        virtual_percent = usage_percent((vmem_total - vmem_free), vmem_total)
    return physical_percent, virtual_percent
def usage_percent(use, total):
try:
ret = (float(use) / total) * 100
except ZeroDivisionError:
raise Exception("ERROR - zero division error")
return ret
# Disk usage of the root filesystem
def disk_info():
    statvfs = os.statvfs('/')  # root filesystem; change the path if needed
    total_disk_space = statvfs.f_frsize * statvfs.f_blocks
    free_disk_space = statvfs.f_frsize * statvfs.f_bfree
    disk_usage = (total_disk_space - free_disk_space) * 100.0 / total_disk_space
    disk_usage = int(disk_usage)
    return str(disk_usage)
# Physical memory usage (percent)
def mem_info():
    mem_usage = get_mem_usage_percent()
    mem_usage = int(mem_usage[0])
    return str(mem_usage)
# CPU usage (percent)
def cpu_info():
    cpu_usage = int(get_cpu() * 100)
    return str(cpu_usage)
# System load (any of the 1/5/15-minute averages above ~3 is considered high)
def sys_info():
    load_average = os.getloadavg()
    # The original returned len(load_average), which is always 3; return the
    # highest of the three averages so callers get an actual load figure.
    return max(load_average)
# Current host time
def time_info():
    now_time = time.strftime('%Y-%m-%d %H:%M:%S')
    return "主机的当前时间:%s" % now_time
# Hostname
def hostname_info():
    hostnames = os.popen("hostname").read().strip()
    return "你的主机名是: %s" % hostnames
# IP address (assumes interface ens192)
def ip_info():
    ipadd = os.popen("ip a| grep ens192 | grep inet | awk '{print $2}'").read().strip()
    return ipadd
# Root volume usage via `df -h`
def disk_info_root():
    child = subprocess.Popen(["df", "-h"], stdout=subprocess.PIPE)
    out = child.stdout.readlines()
    content = ''
    for item in out:
        # subprocess yields bytes in Python 3; decode before comparing with str
        # (the original compared a str against bytes fields, which never matched)
        line = item.decode('utf-8', errors='ignore').strip().split()
        # only the CentOS root volume is checked here
        if '/dev/mapper/centos-root' in line:
            title = [u'-文件系统-', u'--容量-', u'-已用-', u'-可用-', u'-已用-', u'-挂载点--']
            content = "\t".join(title)
            if int(line[4].rstrip('%')) > 60:
                line[0] = 'centos-root'
                content += '\r\n' + '\t'.join(line)
    return content
# Test routine (kept for reference)
# if __name__ == "__main__":
# disk_information = disk_info()
# disk_usage = [int(s) for s in re.findall(r'\b\d+\b', disk_information)]
# infomation = [hostname_info(), time_info(), disk_information]
# print(disk_usage)
# # alert by email if disk usage exceeds 60%
# if disk_usage[0] > 60:
# print("当前磁盘占用率已超过60%,建议清除磁盘内存!")
#
# # print(hostname_info())
# # print(time_info())
# # print(ip_info())
# print(sys_info())
# print(cpu_info())
# print(mem_info())
# print(disk_info())
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : main_server.py
# @Time : 2023/3/31 10:31
# @Author : bruxelles_li
# @Software: PyCharm
import logging
import requests
import threading
import sys
import time, os
import json
import pandas as pd
import glob
from pathlib import Path
sys.path.append('../')
# Close idle connections (disable HTTP keep-alive)
s = requests.session()
s.keep_alive = False
from classification.runner.runner_fast_text import FastTextRunner_train
from detector_source import sys_info, cpu_info, mem_info
from classification.data.data_process import pro_data
# Log output format
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a logger and set its level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Console handler
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
# File handlers (error log and info log)
_tmp_path = os.path.dirname(os.path.abspath(__file__))
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
fh = logging.FileHandler(os.path.join(_tmp_path, "main_server_error.log"))
fh1 = logging.FileHandler(os.path.join(_tmp_path, "main_server_info.log"))
fh.setLevel(level=logging.ERROR)
fh1.setLevel(level=logging.INFO)
fh.setFormatter(formatter)
fh1.setFormatter(formatter)
# Emit to both console and files
logger.addHandler(ch)
logger.addHandler(fh)
logger.addHandler(fh1)
# Training config file
train_config_path = '../classification/config/fasttext_config_train.yml'
# TODO: paths used for data preprocessing
root_path = r'../word2vec/doc_similarity/'
stop_words_path = os.path.join(root_path, 'stop_words.txt')
save_data_path = r'../datasets/classification/{}/{}/{}.txt'
file_types = ['xls', 'xlsx']
# Java callback endpoint for status updates
java_call_back_url = "http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus"
# Service port
port = 4005
modelName = "FastText-Model"
# TODO: list of started training threads
all_thread = []
def merge_df(dataset_path):
    all_files = []
    for file_type in file_types:
        all_files.extend(glob.glob(os.path.join(dataset_path, f'*.{file_type}')))
    # Merge all files into one DataFrame
    combined_df = pd.concat([pd.read_excel(f) for f in all_files], ignore_index=True)
    # Drop duplicate rows
    combined_df.drop_duplicates(keep='first', inplace=True)
    return combined_df
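def _example_merge_usage():
    """Hedged sketch (not called anywhere): merge a hypothetical dataset folder of
    .xls/.xlsx exports (title/content/label columns) into one de-duplicated frame."""
    combined = merge_df('../datasets/classification/FastText-Model/V0')  # hypothetical path
    print(len(combined), list(combined.columns))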
def train_model4FastText(data_path, model_path, modelProcessId, root_dataset):
"""
train
:return:
"""
combined_df = merge_df(dataset_path=root_dataset)
    # Preprocess the data
    pro_data(dataFolderName=data_path, data_df=combined_df, stop_words_path=stop_words_path,
             save_data_path=save_data_path, modelName=modelName)
    logger.info("====数据预处理成功,准备进入训练阶段===")
    # Train, then evaluate on the validation split
runner_train = FastTextRunner_train(config_path=train_config_path, model_train=True)
runner_train.train(data_path=data_path, model_path=model_path, auto_tune_duration=300)
dict_result = runner_train.test(data_path=data_path, model_path=model_path)
str_dict_result = json.dumps(dict_result, ensure_ascii=False)
logger.info(str_dict_result)
    # TODO: report the training result back through the Java status-update endpoint
    payload = json.dumps({
        "id": modelProcessId,
        "result": str_dict_result
    })
    # TODO: generate currentTime / appId through the parameter-generation endpoint
    headers = {
        'Content-Type': 'application/json'
    }
    r1 = requests.post(url=f"{java_call_back_url}",
                       headers=headers, data=payload)
    r1_json = json.loads(r1.text)
    logger.info(r1_json)
return str_dict_result
def env_eval(modelProcessId):
    # TODO gather resource info (system load, CPU usage, physical memory usage; disk optional)
    # disk_usage = disk_info()
    sys_usage = sys_info()
    cpu_usage = cpu_info()
    men_usage = mem_info()
    # TODO return False when resources are insufficient.
    # Note: the original compared the percentage strings lexicographically
    # (e.g. '100' > '95' is False) and tested len(loadavg) > 10000, which never
    # fired; compare numerically instead. The load threshold of 10 follows the
    # message text below and is an assumption.
    if sys_usage > 10 or int(cpu_usage) > 95 or int(men_usage) > 95:
        # TODO: report the over-utilisation through the Java status-update endpoint
        str_dict_result = {
            'handleMsg': 'failure',
            'isHandleSuccess': False,
            'logs': '模型训练失败!当前模型训练资源占用率过高,请检查系统占用信息【超过10个为高】、CPU占用率【超过85%为高】、物理内存占用率【超过85%为高】',
            'resultData': None
        }
        logger.info(str_dict_result)
        payload = json.dumps({
            "id": modelProcessId,
            "result": str_dict_result
        })
        headers = {
            'Content-Type': 'application/json'
        }
        r1 = requests.post(
            url=f"{java_call_back_url}",
            headers=headers, data=payload)
        r1_json = json.loads(r1.text)
        logger.info(r1_json)
        return False
    # TODO resources sufficient: return True
    return True
def system_start():
while True:
# print("=====正在进行训练服务=====")
headers = {
'Content-Type': 'application/json'
}
r1 = requests.post(url=f'http://localhost:{int(port)}/queue_size', headers=headers)
r1_json = json.loads(r1.text)
queue_left_number = r1_json['queue_left_number']
logger.info("当前队列任务总数:" + str(queue_left_number))
if queue_left_number == 0:
# logger.warning("队列为空!无可处理任务。")
time.sleep(30)
else:
for i in range(queue_left_number):
r2 = requests.post(url=f'http://localhost:{int(port)}/subject_consumer', headers=headers)
r2_json = json.loads(r2.text)
config_info = r2_json['data']
logger.info(config_info)
modelProcessId = config_info["modelProcessId"]
model_path = config_info["model_path"]
data_path = config_info["data_path"]
root_dataset = config_info["root_dataset"]
logger.info('##########FastText-Model###############')
t = threading.Thread(target=train_model4FastText,
args=(data_path, model_path, modelProcessId, root_dataset),
daemon=True)
while True:
if env_eval(modelProcessId):
break
else:
time.sleep(600)
                # start the training thread
t.start()
all_thread.append(t)
def system_resume():
"""
恢复模型训练服务状态
:return:
"""
headers = {
'Content-Type': 'application/json'
}
    # Drain the queue so the same training job is not started twice
r1 = requests.post(url=f'http://localhost:{int(port)}/queue_size', headers=headers)
r1_json = r1.json()
logger.info('当前队列数量:%d' % r1_json['queue_left_number'])
if r1_json['queue_left_number'] > 0:
logger.info('正在消费队列,直到队列为空!')
while True:
r2 = requests.post(url=f'http://localhost:{int(port)}/subject_consumer', headers=headers)
r2_json = r2.json()
if r2_json['queue_left_number'] == 0:
logger.info('队列消费完毕!可放心进行模型训练 ...')
break
else:
logger.info('队列为空!可放心进行模型训练 ...')
def start_up_check():
    """
    Pre-start check: make sure the queue server is reachable before training starts.
    (The original wrapped this in a while-loop whose branches always exit or break,
    so the loop was dead and is removed here.)
    :return:
    """
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        requests.post(url=f'http://localhost:{int(port)}/queue_size', headers=headers)
    except requests.exceptions.ConnectionError:
        logger.error("Error: ConnectionError")
        logger.warning('服务未启动,请先启动server! 程序已退出。')
        exit(123)
    logger.info("server启动成功!模型训练服务已启动...")
if __name__ == '__main__':
# root_path = "../datasets/classification/zcjd_column_classify/zcjd_V0"
# data_df = merge_df(root_path)
# print(len(data_df))
# print(data_df)
# 开始启动模型训练服务
start_up_check()
logger.info('模型训练服务恢复中 ...')
system_resume()
time.sleep(30)
logger.info('模型训练服务恢复完成!')
logger.info('模型训练服务运行中 ...')
system_start()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 10:21
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/21 9:30
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_app
# @Author : LiuYan
# @Time : 2021/4/21 9:30
import json
from flask import Flask, Blueprint, request
from utils.log import logger
app = Flask(__name__)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:03
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_config
# @Author : LiuYan
# @Time : 2021/4/16 18:06
import os
import pymysql
from abc import abstractmethod, ABC
# root_dir = '/data/lzc/zzsn_nlp_br'
# root_dir = '/data/lzc'
root_dir = '..' # deploy
db_config = {
    'host': os.environ.get('brpa_tidb_host'),
    'port': int(os.environ['brpa_tidb_port']) if 'brpa_tidb_port' in os.environ else None,
    'user': os.environ.get('brpa_tidb_user'),
    'password': os.environ.get('brpa_tidb_password'),
    'database': os.environ.get('brpa_tidb_database'),
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor
}
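def _example_db_connect():
    """Hedged sketch (not called anywhere): open a connection with db_config.
    Assumes the brpa_tidb_* environment variables are set; pymysql raises otherwise."""
    conn = pymysql.connect(**db_config)
    try:
        with conn.cursor() as cursor:  # DictCursor: rows come back as dicts
            cursor.execute('SELECT 1 AS ok;')
            print(cursor.fetchone())
    finally:
        conn.close()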
class BaseConfig(ABC):
@abstractmethod
def __init__(self):
super(BaseConfig, self).__init__()
@abstractmethod
def load_config(self):
"""
Add the config you need.
:return: config(YamlDict)
"""
pass
home:
dir: '/data/lzc'
# Please set the GPU or CPU to be used for your model training in the LoadConfig object
device: "cuda:0"
# shared for multiple projects in this machine, raw data, read only
data:
# base: '/data'
base: 'd:/data'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:03
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_data_loader
# @Author : LiuYan
# @Time : 2021/4/19 9:37
from abc import ABC, abstractmethod
class BaseDataLoader(ABC):
@abstractmethod
def __init__(self):
super(BaseDataLoader, self).__init__()
@abstractmethod
def _load_data(self):
"""
load raw data according to data config
:return:
"""
pass
@abstractmethod
def load_train(self):
pass
@abstractmethod
def load_valid(self):
pass
@abstractmethod
def load_test(self):
pass
pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_data_process
# @Author : LiuYan
# @Time : 2021/4/19 9:37
from abc import ABC, abstractmethod
class BaseDataProcess(ABC):
"""
data processing
"""
@abstractmethod
def __init__(self):
super(BaseDataProcess, self).__init__()
@abstractmethod
def process(self):
pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_data_reader
# @Author : LiuYan
# @Time : 2021/4/19 9:37
from abc import ABC, abstractmethod
class BaseDataReader(ABC):
@abstractmethod
def __init__(self):
super(BaseDataReader, self).__init__()
@abstractmethod
    def read(self):
pass
@abstractmethod
def save(self):
pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_evaluator
# @Author : LiuYan
# @Time : 2021/4/19 10:39
from abc import ABC, abstractmethod
class BaseEvaluator(ABC):
@abstractmethod
def __init__(self):
super(BaseEvaluator, self).__init__()
@abstractmethod
def evaluate(self, dict_inputs: dict) -> tuple:
pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_loss
# @Author : LiuYan
# @Time : 2021/4/19 10:41
from abc import abstractmethod
import torch.nn as nn
class BaseLoss(nn.Module):
def __init__(self, loss_config):
super(BaseLoss, self).__init__()
self._config = loss_config
@abstractmethod
def forward(self, dict_outputs: dict) -> dict:
pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_model
# @Author : LiuYan
# @Time : 2021/4/19 10:42
from abc import ABC, abstractmethod
import torch.nn as nn
class BaseModel(nn.Module, ABC):
def __init__(self):
super(BaseModel, self).__init__()
@abstractmethod
def forward(self, dict_inputs: dict) -> dict:
pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_runner
# @Author : LiuYan
# @Time : 2021/4/19 10:42
from abc import ABC, abstractmethod
from typing import Union

from utils.utils import timeit
class BaseRunner(ABC):
"""
Abstract definition for runner
"""
@abstractmethod
def __init__(self):
pass
@timeit
@abstractmethod
def _build_config(self):
pass
@timeit
@abstractmethod
def _build_data(self):
pass
@timeit
@abstractmethod
def _build_model(self):
pass
@timeit
@abstractmethod
def _build_loss(self):
pass
@timeit
@abstractmethod
def _build_optimizer(self):
pass
@timeit
@abstractmethod
def _build_evaluator(self):
pass
@abstractmethod
def train(self):
pass
@abstractmethod
def _train_epoch(self, epoch: int):
pass
@abstractmethod
def _valid(self, epoch: int):
pass
@abstractmethod
def test(self):
pass
@abstractmethod
    def pred(self, title: str, content: str) -> Union[str, dict]:
pass
@abstractmethod
def _display_result(self, dict_result: dict):
pass
@abstractmethod
def _save_model(self):
pass
@abstractmethod
def _load_model(self):
pass
class train_BaseRunner(ABC):
"""
Abstract definition for runner
"""
@abstractmethod
def __init__(self):
pass
@timeit
@abstractmethod
def _build_config(self):
pass
@timeit
@abstractmethod
def _build_data(self):
pass
@timeit
@abstractmethod
def _build_model(self):
pass
@timeit
@abstractmethod
def _build_loss(self):
pass
@timeit
@abstractmethod
def _build_optimizer(self):
pass
@timeit
@abstractmethod
def _build_evaluator(self):
pass
@abstractmethod
def train(self):
pass
@abstractmethod
def _train_epoch(self, epoch: int):
pass
@abstractmethod
def _valid(self, data_path, model_path, epoch: int):
pass
@abstractmethod
def test(self):
pass
@abstractmethod
    def pred(self, title: str, content: str) -> Union[str, dict]:
pass
@abstractmethod
def _display_result(self, dict_result: dict):
pass
@abstractmethod
def _save_model(self, model_path):
pass
@abstractmethod
def _load_model(self):
pass
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 17:24
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/21 9:59
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:31
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : fast_text_config
# @Author : LiuYan
# @Time : 2021/4/19 10:46
import dynamic_yaml
import torch
from base.config.base_config import BaseConfig
class FastTextConfig(BaseConfig):
def __init__(self, config_path):
super(FastTextConfig, self).__init__()
self._config_path = config_path
pass
def load_config(self):
with open(self._config_path, mode='r', encoding='UTF-8') as f:
config = dynamic_yaml.load(f)
config.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')
return config
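def _example_load_config():
    """Hedged sketch (not called anywhere): load a config and read a few fields.
    The relative path below is an assumption; dynamic_yaml resolves '{...}'
    references such as learn.dir.load_model when the attribute is accessed."""
    config = FastTextConfig(config_path='../config/fasttext_config_train.yml').load_config()  # hypothetical path
    print(config.status)                # 'pred' / 'test' / 'train'
    print(config.learn.dir.load_model)  # interpolated model path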
home:
# dir: '/home/zzsn/liuyan' # train or test
dir: '../../..' # deploy
# Shared for multiple modules in the project
project:
name: 'platform_project'
dir:
work: '{home.dir}/{project.name}'
# Please set the GPU or CPU to be used for your model training in the LoadConfig object
device: 'cpu'
status: 'pred' # pred / test / train
# shared for multiple projects in this machine, raw data, read only
data:
dir: ''
name: 'FastText-Model'
num_vocab: ~
num_tag: ~
model:
name: 'Origin-Model'
loss:
name: 'ft_loss'
learn:
time: '2023_03_31-12_15_17'
dir:
work: '{home.dir}/model_saved/classification/{data.name}'
logs: '{learn.dir.work}/log'
saved: '{learn.dir.work}/{model.name}'
result: '{learn.dir.work}/data/result'
# save_model: '{learn.dir.saved}-{learn.time}/model.bin'
load_model: '{learn.dir.saved}-{learn.time}/model.bin'
home:
# dir: '/data/lzc' # train or test
dir: '../../..' # deploy
# Shared for multiple modules in the project
project:
name: 'platform_project'
dir:
work: '{home.dir}/{project.name}'
# Please set the GPU or CPU to be used for your model training in the LoadConfig object
#device: 'cpu'
device: 'cuda:0'
status: 'train' # pred / test / train
# shared for multiple projects in this machine, raw data, read only
data:
dir: '../datasets/classification'
name: 'FastText-Model'
path0: '{data.dir}/{data.name}%s'
train_path: '{data.dir}/{data.name}%s/train.txt'
valid_path: '{data.dir}/{data.name}%s/valid.txt'
test_path: '{data.dir}/{data.name}%s/valid.txt'
batch_size: 4
num_vocab: ~
num_tag: ~
model:
name: 'Origin-Model'
loss:
name: 'ft_loss'
learn:
time: '2023_03_31-12_15_17'
dir:
work: '{home.dir}/model_saved/classification'
logs: '{learn.dir.work}/log'
saved0: '{learn.dir.work}%s'
saved: '{learn.dir.work}/{data.name}%s'
result: '{learn.dir.work}/data/result'
# save_model: '{learn.dir.saved}-{learn.time}/model.bin'
load_model: '{learn.dir.saved}-{learn.time}/model.bin'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:31
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : data_process
# @Author : bruxellse_li
# @Time : 2023/3/31 08:39
import os
import pandas as pd
import sys
from pathlib import Path
from pandas import DataFrame
from sklearn.model_selection import train_test_split
# Append the working path
sys.path.append('../../')
from classification.utils.utils import *
def process_txt(data_loader: DataFrame, train_file_path: str, valid_file_path: str, stop_words_path: str):
articles = data_loader['article']
labels = data_loader['label']
article_list = []
    sw = stop_words(path=stop_words_path)  # load the stop-word list once, not per article
    for article, label in zip(articles, labels):
        if type(article) is str:
            text = article.replace('\n', '').replace('\r', '').replace('\t', '')
        else:
            print('{} is not str!'.format(article))
            continue
        text = seg(text=text, sw=sw)
        text = '__label__{} {}'.format(label, text)
        article_list.append(text)
train_data, valid_data = train_test_split(
article_list, train_size=0.8, random_state=2021, shuffle=True
)
with open(
train_file_path, 'w', encoding='utf-8'
) as train_file, open(
valid_file_path, 'w', encoding='utf-8'
) as valid_file:
for train in train_data:
train_file.write(train + '\n')
for valid in valid_data:
valid_file.write(valid + '\n')
pass
def process(data_loader, train_file_path: str, valid_file_path: str, stop_words_path: str):
    # Create the corpus directory (kept for reference)
# Path(os.path.abspath(os.path.join(train_file_path, os.path.pardir))).mkdir(parents=True, exist_ok=True)
# data_loader = pd.read_excel(path, keep_default_na=False).astype(str)
data_loader['article'] = data_loader['title'] + '。' + data_loader['content']
data_loader['article'] = data_loader.article.apply(clean_tag).apply(clean_txt)
process_txt(
data_loader=data_loader,
train_file_path=train_file_path,
valid_file_path=valid_file_path,
stop_words_path=stop_words_path
)
return None
# Corpus-processing entry point
def pro_data(modelName, dataFolderName, data_df, stop_words_path, save_data_path):
# save_data_path = '/home/python/lzc/datasets/classification/{}/{}/{}.txt'
process(
data_loader=data_df,
train_file_path=save_data_path.format(modelName, dataFolderName, 'train'),
valid_file_path=save_data_path.format(modelName, dataFolderName, 'valid'),
stop_words_path=stop_words_path
)
return None
if __name__ == '__main__':
    modelName, dataFolderName, data_path = "gzdt_dataset", "gzdt_V1", "../../datasets/Receive_File/测试数据.xlsx"
    save_data_path = r'../../datasets/classification/{}/{}/{}.txt'
    root_path = r'../../word2vec/doc_similarity/'
    stop_words_path = os.path.join(root_path, 'stop_words.txt')
    # pro_data expects a DataFrame, so read the Excel file first
    # (the original passed the file path itself, which would fail inside process()).
    data_df = pd.read_excel(data_path, keep_default_na=False).astype(str)
    pro_data(modelName, dataFolderName, data_df, stop_words_path, save_data_path)
    # date = '20230329'
    # path = '../datasets/{}_total_{}.xlsx'
    #
    # save_data_path = '/home/zzsn/liuyan/datasets/the_belt_and_road/classification/{}/{}_{}.txt'
    # # Machinery sentiment: current-affairs (时事要闻) column classifier
    # ssyw_name = 'ssyw_column_classify'
    # # Machinery sentiment: SOE news (国资动态) column classifier
    # gzdt_name = 'gzdt_column_classify'
    # # Machinery sentiment: supply-chain (上下游) column classifier
    # sxy_name = 'sxy_column_classify'
    # # Machinery sentiment: industry sentiment (行业舆情) column classifier
    # hyyq_name = 'hyyq_column_classify'
    # # Machinery sentiment: management news (管理动态) column classifier
    # gldt_name = 'gldt_column_classify'
    # # Machinery sentiment: leading enterprises (龙头企业) column classifier
    # ltqy_name = 'ltqy_column_classify'
    # # Machinery sentiment: emerging fields (新兴领域) column classifier
    # xxly_name = 'xxly_column_classify'
    # # Machinery sentiment: general news (综合资讯) column classifier
    # zhzx_name = 'zhzx_column_classify'
    # # Machinery sentiment: negative sentiment (负面舆情) column classifier
    # fmyq_name = 'fmyq_column_classify'
    #
    # process(
    #     path=path.format(gzdt_name, date),
    #     train_file_path=save_data_path.format(gzdt_name, 'train', date),
    #     valid_file_path=save_data_path.format(gzdt_name, 'valid', date)
    # )
    # pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : data_stats
# @Author : LiuYan
# @Time : 2021/4/15 16:52
import pandas as pd
from collections import Counter
if __name__ == '__main__':
pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:33
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : eval_classification
# @Author : LiuYan
# @Time : 2021/4/20 21:19
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score
from base.evaluation.base_evaluator import BaseEvaluator
class ClassifyEvaluator(BaseEvaluator):
# def __init__(self, label_dict: dict):
def __init__(self):
super(ClassifyEvaluator, self).__init__()
# self._label_dict = label_dict
# self._count_dict = {'TP': 0}
pass
def evaluate(self, true_list: list, pred_list: list) -> tuple:
dict_result = {}
true_labels = Counter(true_list)
pred_labels = Counter(pred_list)
print(true_labels)
print(pred_labels)
for true_label in true_labels:
# print(true_labels[true_label], pred_labels[true_label])
dict_result[true_label] = {
'precision': 0,
'recall': 0,
'f1-score': 0,
'true_num': 0,
'pred_num': pred_labels[true_label],
'total_num': true_labels[true_label]
}
for true, pred in zip(true_list, pred_list):
if true == pred:
dict_result[true]['true_num'] += 1
print('\n' + ''.join('-' for i in range(89)))
print('label_type\t\t\tp\t\t\tr\t\t\tf1\t\t\ttrue_num\t\t\tpred_num\ttotal_num')
string = '{0}{1:<12.4f}{2:<12.4f}{3:<12.4f}{4:<12}{5:<12}{6:<12}'
true_nums, pred_nums, total_nums = 0, 0, 0
for label_type in dict_result:
true_nums += dict_result[label_type]['true_num']
pred_nums += dict_result[label_type]['pred_num']
total_nums += dict_result[label_type]['total_num']
p = dict_result[label_type]['true_num'] / dict_result[label_type]['pred_num'] if dict_result[label_type]['pred_num'] != 0 else 0
r = dict_result[label_type]['true_num'] / dict_result[label_type]['total_num'] if dict_result[label_type]['total_num'] != 0 else 0
f1 = 2 * p * r / (p + r) if p + r != 0 else 0
chunk_type_out = label_type + ''.join(
' ' for i in range(20 - (((len(label_type.encode('utf-8')) - len(label_type)) // 2) + len(label_type)))
)
print(string.format(chunk_type_out, p, r, f1, dict_result[label_type]['true_num'],
dict_result[label_type]['pred_num'], dict_result[label_type]['total_num']), chr(12288))
dict_result[label_type]['precision'] = p
dict_result[label_type]['recall'] = r
dict_result[label_type]['f1-score'] = f1
p = true_nums / pred_nums if pred_nums != 0 else 0
r = true_nums / total_nums if total_nums != 0 else 0
f1 = 2 * p * r / (p + r) if p + r != 0 else 0
print(string.format('average{}'.format(''.join(' ' for i in range(13))), p, r, f1,
true_nums, pred_nums, total_nums), chr(12288))
print(''.join('-' for i in range(89)) + '\n')
dict_result['average'] = {
'precision': p,
'recall': r,
'f1-score': f1,
'true_num': true_nums,
'pred_num': pred_nums,
'total_num': total_nums
}
return p, r, f1, dict_result
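def _example_evaluate():
    """Hedged sketch (not called anywhere): evaluate toy predictions; prints the
    per-label P/R/F1 table and returns the aggregate scores."""
    evaluator = ClassifyEvaluator()
    p, r, f1, detail = evaluator.evaluate(
        true_list=['A', 'A', 'B', 'B'],
        pred_list=['A', 'B', 'B', 'B']
    )
    return p, r, f1, detail['average']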
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/8/2 15:47
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:31
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : runner_fast_text
# @Author : LiuYan
# @Time : 2021/4/15 16:44
import os
import sys
import time
import json
import warnings
import fasttext
import pandas as pd
from pathlib import Path
from typing import Optional, Union
sys.path.append('../../')
from utils.utils import timeit
from base.runner.base_runner import BaseRunner, train_BaseRunner
from classification.config.config_fast_text import FastTextConfig
from classification.evaluation.classify_evaluator import ClassifyEvaluator
from classification.utils.utils import *
warnings.filterwarnings('ignore')
fasttext.FastText.eprint = lambda x: None
class FastTextRunner_train(train_BaseRunner):
def __init__(self, config_path: str, model_train=False, model_path=None):
super(FastTextRunner_train, self).__init__()
self._config_path = config_path
self._config = None
self._time = time.strftime('%Y_%m_%d-%H_%M_%S')
self._model_train = model_train
self._model_path = model_path
self._train_dataloader = None
self._valid_dataloader = None
self._test_dataloader = None
self._model = None
self._loss = None
self._optimizer = None
self._evaluator = None
self._build()
@timeit
def _build(self):
self._build_config()
# self._time = self._config.learn.time
self._build_data()
self._build_model()
self._build_loss()
self._build_optimizer()
self._build_evaluator()
pass
@timeit
def _build_config(self):
self._config = FastTextConfig(config_path=self._config_path).load_config()
pass
@timeit
def _build_data(self):
if self._config.status in ['train', 'test'] or self._model_train:
self._train_path = self._config.data.train_path
self._valid_path = self._config.data.valid_path
self._test_path = self._config.data.test_path
else:
self._stop_words = stop_words(
path=r'../../word2vec/f_zp_gp/stop_words.txt'
)
# self._stop_words = stop_words(
# path=os.path.join(self._config.home.dir, 'word2vec/f_zp_gp/stop_words.txt')
# )
pass
@timeit
def _build_model(self):
if self._model_path:
self._config.learn.dir.load_model = self._model_path
if self._config.status in ['test', 'pred'] and not self._model_train:
self._load_model()
pass
@timeit
def _build_loss(self):
pass
@timeit
def _build_optimizer(self):
pass
@timeit
def _build_evaluator(self):
self._evaluator = ClassifyEvaluator()
pass
@timeit
def train(self, data_path, model_path, auto_tune_duration=500, auto_tune_model_size='200M'):
self._model = fasttext.train_supervised(
input=self._train_path % data_path, autotuneValidationFile=self._test_path % data_path,
autotuneDuration=auto_tune_duration, autotuneModelSize=auto_tune_model_size
)
self._save_model(model_path)
pass
def _train_epoch(self, epoch: int):
pass
    def _valid(self, data_path, model_path, epoch: int) -> Optional[dict]:
with open(self._valid_path % data_path, encoding='utf-8') as file:
self._valid_dataloader = file.readlines()
labels = []
pre_labels = []
        for text in self._valid_dataloader:
            label = text.replace('__label__', '').split(' ')[0]
            labels.append(label)
            # Strip the '__label__<label> ' prefix and the trailing newline.
            # (The original sliced off one leading character, which only worked
            # for single-character labels.)
            text = text.replace('__label__', '').split(' ', 1)[-1].strip()
            pre_label = self._model.predict(text)[0][0].replace('__label__', '')
            pre_labels.append(pre_label)
p, r, f1, dict_result = self._evaluator.evaluate(true_list=labels, pred_list=pre_labels)
if self._config.status == 'train' or self._model_train:
json_result = json.dumps(dict_result)
with open(self._config.learn.dir.saved % model_path + '-{}/evaluation_metrics.json'.format(self._time),
'w', encoding='utf-8') as f:
f.write(json_result)
if self._model_train:
dict_result = {
'code': 200,
'result': '模型训练成功!模型评测指标为: precision: {:.0f}% recall: {:.0f}% f1-score: {:.0f}%'.format(
dict_result['average']['precision'] * 100,
dict_result['average']['recall'] * 100,
dict_result['average']['f1-score'] * 100
),
'model_path': self._config.learn.dir.saved % model_path + '-{}/model.bin'.format(self._time)
}
return dict_result
    def test(self, data_path=None, model_path=None, title=None, content=None) -> Optional[dict]:
if self._model_train:
return self._valid(data_path=data_path, model_path=model_path, epoch=100)
elif self._model_path:
with open(
os.path.join(os.path.split(self._model_path)[0], 'evaluation_metrics.json'),
'r', encoding='utf-8'
) as f:
json_result = json.load(f)
evaluation_metrics = {
'精确率(P)': '{:.0f}%'.format(json_result['average']['precision'] * 100),
'召回率(R)': '{:.0f}%'.format(json_result['average']['recall'] * 100),
'F1值(F1)': '{:.0f}%'.format(json_result['average']['f1-score'] * 100)
}
result = self.pred(title=title, content=content)
dict_result = {
'handleMsg': 'success',
'code': 200,
'logs': '模型测试成功!',
'result': {
'label': result,
'evaluation_metrics': evaluation_metrics
}
} if type(result) == str else result
return dict_result
else:
self._valid(data_path=data_path, model_path=model_path, epoch=100)
    def pred(self, title: str, content: str) -> Union[str, dict]:
        # Guard against None fields: the calling endpoints may omit title/content.
        title = title if type(title) is str else ''
        content = content if type(content) is str else ''
        text = (title + '。') * 2 + content
text = clean_txt(raw=clean_tag(text=text))
if type(text) is str:
text = text.replace('\n', '').replace('\r', '').replace('\t', '')
else:
return {
'handleMsg': 'failure',
'code': 300,
'logs': '{} is not str!'.format(text),
'result': {
'label': None
}
}
text = seg(text=text, sw=self._stop_words)
pre_label = self._model.predict(text)[0][0].replace('__label__', '')
return pre_label
    def pred_file(self, file_path: str, result_path: str) -> Optional[dict]:
data_loader = pd.read_excel(file_path)
titles, contents = data_loader['title'], data_loader['content']
labels = []
for title, content in zip(titles, contents):
pred_result = self.pred(title, content)
if type(pred_result) == str:
labels.append('是' if pred_result == '1' else '否')
else:
return pred_result
data_loader['label'] = labels
data_loader.to_excel(result_path)
def _display_result(self, dict_result: dict):
pass
def _save_model(self, model_path):
print(self._config.learn.dir.saved % model_path + '-{}/model.bin'.format(self._time))
Path(self._config.learn.dir.saved % model_path + '-{}'.format(self._time)).mkdir(parents=True, exist_ok=True)
self._model.save_model(self._config.learn.dir.saved % model_path + '-{}/model.bin'.format(self._time))
def _load_model(self):
self._model = fasttext.load_model(self._config.learn.dir.load_model)
class FastTextRunner(BaseRunner):
def __init__(self, config_path: str, model_train=False, model_path=None):
super(FastTextRunner, self).__init__()
self._config_path = config_path
self._config = None
self._time = time.strftime('%Y_%m_%d-%H_%M_%S')
self._model_train = model_train
self._model_path = model_path
self._train_dataloader = None
self._valid_dataloader = None
self._test_dataloader = None
self._model = None
self._loss = None
self._optimizer = None
self._evaluator = None
self._build()
@timeit
def _build(self):
self._build_config()
self._build_data()
self._build_model()
self._build_loss()
self._build_optimizer()
self._build_evaluator()
pass
@timeit
def _build_config(self):
self._config = FastTextConfig(config_path=self._config_path).load_config()
pass
@timeit
def _build_data(self):
if self._config.status in ['train', 'test'] or self._model_train:
self._train_path = self._config.data.train_path
self._valid_path = self._config.data.valid_path
self._test_path = self._config.data.test_path
else:
self._stop_words = stop_words(
path=os.path.join(self._config.data.dir, '../word2vec/f_zp_gp/stop_words.txt')
)
# self._stop_words = stop_words(
# path=os.path.join(self._config.home.dir, 'word2vec/f_zp_gp/stop_words.txt')
# )
pass
@timeit
def _build_model(self):
if self._model_path:
self._config.learn.dir.load_model = self._model_path
if self._config.status in ['test', 'pred'] and not self._model_train:
self._load_model()
pass
@timeit
def _build_loss(self):
pass
@timeit
def _build_optimizer(self):
pass
@timeit
def _build_evaluator(self):
self._evaluator = ClassifyEvaluator()
pass
@timeit
def train(self, auto_tune_duration=5000, auto_tune_model_size='200M'):
self._model = fasttext.train_supervised(
input=self._train_path, autotuneValidationFile=self._test_path,
autotuneDuration=auto_tune_duration, autotuneModelSize=auto_tune_model_size
)
self._save_model()
pass
def _train_epoch(self, epoch: int):
pass
    def _valid(self, epoch: int) -> Optional[dict]:
with open(self._valid_path, encoding='utf-8') as file:
self._valid_dataloader = file.readlines()
labels = []
pre_labels = []
        for text in self._valid_dataloader:
            label = text.replace('__label__', '').split(' ')[0]
            labels.append(label)
            # Strip the '__label__<label> ' prefix and the trailing newline.
            # (The original sliced off one leading character, which only worked
            # for single-character labels.)
            text = text.replace('__label__', '').split(' ', 1)[-1].strip()
            pre_label = self._model.predict(text)[0][0].replace('__label__', '')
            pre_labels.append(pre_label)
p, r, f1, dict_result = self._evaluator.evaluate(true_list=labels, pred_list=pre_labels)
if self._config.status == 'train' or self._model_train:
json_result = json.dumps(dict_result)
with open(self._config.learn.dir.saved + '-{}/evaluation_metrics.json'.format(self._time),
'w', encoding='utf-8') as f:
f.write(json_result)
if self._model_train:
dict_result = {
'code': 200,
'result': '模型训练成功!模型评测指标为: precision: {:.0f}% recall: {:.0f}% f1-score: {:.0f}%'.format(
dict_result['average']['precision'] * 100,
dict_result['average']['recall'] * 100,
dict_result['average']['f1-score'] * 100
),
'model_path': self._config.learn.dir.saved + '-{}/model.bin'.format(self._time)
}
return dict_result
    def test(self, title=None, content=None) -> Optional[dict]:
if self._model_train:
return self._valid(epoch=100)
elif self._model_path:
with open(
os.path.join(os.path.split(self._model_path)[0], 'evaluation_metrics.json'),
'r', encoding='utf-8'
) as f:
json_result = json.load(f)
evaluation_metrics = {
'精确率(P)': '{:.0f}%'.format(json_result['average']['precision'] * 100),
'召回率(R)': '{:.0f}%'.format(json_result['average']['recall'] * 100),
'F1值(F1)': '{:.0f}%'.format(json_result['average']['f1-score'] * 100)
}
result = self.pred(title=title, content=content)
dict_result = {
'handleMsg': 'success',
'code': 200,
'logs': '模型测试成功!',
'result': {
'label': result,
'evaluation_metrics': evaluation_metrics
}
} if type(result) == str else result
return dict_result
else:
self._valid(epoch=100)
    def pred(self, title: str, content: str) -> Union[str, dict]:
        # Guard against None fields: the calling endpoints may omit title/content.
        title = title if type(title) is str else ''
        content = content if type(content) is str else ''
        text = (title + '。') * 2 + content
text = clean_txt(raw=clean_tag(text=text))
if type(text) is str:
text = text.replace('\n', '').replace('\r', '').replace('\t', '')
else:
return {
'handleMsg': 'failure',
'code': 500,
'logs': '{} is not str!'.format(text),
'result': {
'label': None
}
}
text = seg(text=text, sw=self._stop_words)
pre_label = self._model.predict(text)[0][0].replace('__label__', '')
return pre_label
    def pred_file(self, file_path: str, result_path: str) -> Optional[dict]:
data_loader = pd.read_excel(file_path)
titles, contents = data_loader['title'], data_loader['content']
labels = []
for title, content in zip(titles, contents):
pred_result = self.pred(title, content)
if type(pred_result) == str:
labels.append('是' if pred_result == '1' else '否')
else:
return pred_result
data_loader['label'] = labels
data_loader.to_excel(result_path)
def _display_result(self, dict_result: dict):
pass
def _save_model(self):
Path(self._config.learn.dir.saved + '-{}'.format(self._time)).mkdir(parents=True, exist_ok=True)
self._model.save_model(self._config.learn.dir.saved + '-{}/model.bin'.format(self._time))
def _load_model(self):
self._model = fasttext.load_model(self._config.learn.dir.load_model)
if __name__ == '__main__':
    # Belt & Road: project-news recognition / filtering model
    ft_config_path = '../config/config_br_pro_info_filter.yml'
    # Belt & Road: project-info knowledge classification model
    # ft_config_path = '../config/config_br_pro_info_type.yml'
    # Belt & Road: project business-opportunity recognition model
    # ft_config_path = '../config/config_br_buss_op_recognition.yml'
    # Belt & Road: project risk recognition model
    # ft_config_path = '../config/config_br_pro_risk_recognition.yml'
    # Belt & Road: project-news sentiment (positive/negative) model
    # ft_config_path = '../config/config_br_pro_sentiment_analysis.yml'
    runner = FastTextRunner(config_path=ft_config_path)
    # runner.train(
    #     auto_tune_duration=15000
    # )
    runner.test()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : __init__.py.py
# @Time : 2022/1/5 18:09
# @Author : Mr.Ygg
# @Software: PyCharm
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : test_br_pro_risk_recognition.py
# @Time : 2022/1/5 18:09
# @Author : Mr.Ygg
# @Software: PyCharm
from base.app.base_app import *
from classification.runner.runner_fast_text import FastTextRunner
from classification.utils.utils import load_risk_keywords, is_include_compound_words
# Risk categories
risk_info = [
'外部政治风险',
'主权政治风险',
'社会动荡风险',
'对华关系风险',
'资金风险',
'财政风险',
    '汇率风险',
    '通货膨胀风险',
'环保风险',
'法律风险',
'突发事件风险',
'项目实施风险',
'企业风险',
'其他风险'
]
ft_config_path = '../config/config_br_pro_risk_recognition.yml'
runner = FastTextRunner(config_path=ft_config_path)
# Recruitment / stock-info filter model
ft_config_path_rc_f_zp_gp = '../config/config_rc_f_zp_gp.yml'
runner_rc_f_zp_gp = FastTextRunner(config_path=ft_config_path_rc_f_zp_gp)
# Project-news sentiment (positive/negative) analysis model
ft_config_path_psa = '../config/config_br_pro_sentiment_analysis.yml'
runner_psa = FastTextRunner(config_path=ft_config_path_psa)
list_country = []
with open('../config/country.txt', 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
list_country.append(line.strip().split('(')[0].split('(')[0])
# Risk types the model itself can recognise
risk_model_info = [
    '社会动荡风险',
    '突发事件风险'
]
# Risk-category keywords
dict_risk_keywords = load_risk_keywords('../config/risk_keywords.xlsx')
def pred(title: str, content: str) -> dict:
dict_result = {
'风险类别1': '',
'风险类别2': '',
'风险类别3': '',
'风险类别4': ''
}
    # Recruitment / stock filter
    result_rc_f_zp_gp = runner_rc_f_zp_gp.pred(title=title, content=content)
    # 0: neither, 1: recruitment info, 2: stock info
    bool_rc_f_zp_gp = False if result_rc_f_zp_gp == '1' else True
    logger.info('招聘股票筛选模型: {}'.format(result_rc_f_zp_gp))
    logger.info('招聘股票筛选模型: {}'.format(bool_rc_f_zp_gp))
    # Positive/negative sentiment filter
    result_psa = runner_psa.pred(title=title, content=content)
    bool_psa = True if result_psa == '项目负面资讯信息' else False
    logger.info('正负面筛选模型: {}'.format(result_psa))
    logger.info('正负面筛选模型: {}'.format(bool_psa))
    # Country recognition filter
bool_country = False
text = title + '。' + content[: len(content) // 5]
for country in list_country:
if country in text:
bool_country = True
logger.info('国家识别筛选模型: {}'.format(country))
break
logger.info('国家识别筛选模型: {}'.format(bool_country))
text = title + '。' + content
if bool_country and bool_psa:
"""
1. 招聘股票筛选模型 -> 非招聘股票信息
2. 国家识别筛选模型 -> 一带一路相关国家
3. 正负面筛选模型 -> 负面信息
"""
# 风险识别筛选模型
result = runner.pred(
title=title,
content=content
)
dict_result['风险类别1'] = result
dict_result['风险类别2'] = result
dict_result['风险类别3'] = result
dict_result['风险类别4'] = result
logger.info('风险识别筛选模型: {}'.format(result))
        # Keyword-based filtering
        if type(result) is str and result in risk_model_info:
            # categories in risk_model_info are keyword-checked to drop noisy hits
bool_risk_keyword = False
for risk_keyword in dict_risk_keywords[result]:
compound_words = risk_keyword.split('+')
if is_include_compound_words(text=text, compound_words=compound_words):
bool_risk_keyword = True
break
result = result if bool_risk_keyword else '无风险'
dict_result['风险类别3'] = result
dict_result['风险类别4'] = result
logger.info('关键词筛选: {}'.format(bool_risk_keyword))
if result == '无风险':
dict_risk_keywords_num = {
risk_keywords_key: 0 for risk_keywords_key in dict_risk_keywords
}
bool_risk_keyword, risk_category = False, result
for risk_keywords_key in dict_risk_keywords_num:
for risk_keyword in dict_risk_keywords[risk_keywords_key]:
compound_words = risk_keyword.split('+')
if is_include_compound_words(text=text, compound_words=compound_words):
bool_risk_keyword = True
dict_risk_keywords_num[risk_keywords_key] += 1
if bool_risk_keyword:
risk_category = max(dict_risk_keywords_num, key=dict_risk_keywords_num.get)
dict_result['风险类别3'] = risk_category
logger.info('关键词筛选后召回风险信息: {}'.format(risk_category))
elif type(result) is str and result == '无风险':
            # for items the model marks risk-free, use keywords to recall useful risk info
dict_risk_keywords_num = {
risk_keywords_key: 0 for risk_keywords_key in dict_risk_keywords
}
            # do not recall categories the model itself can recognise
            for risk_keywords_key in risk_model_info:
                if risk_keywords_key in dict_risk_keywords_num:
                    dict_risk_keywords_num.pop(risk_keywords_key)
bool_risk_keyword, risk_category = False, result
for risk_keywords_key in dict_risk_keywords_num:
for risk_keyword in dict_risk_keywords[risk_keywords_key]:
compound_words = risk_keyword.split('+')
if is_include_compound_words(text=text, compound_words=compound_words):
bool_risk_keyword = True
dict_risk_keywords_num[risk_keywords_key] += 1
if bool_risk_keyword:
risk_category = max(dict_risk_keywords_num, key=dict_risk_keywords_num.get)
dict_result['风险类别2'] = risk_category
dict_result['风险类别3'] = risk_category
dict_result['风险类别4'] = risk_category
logger.info('关键词召回风险信息: {}'.format(risk_category))
else:
result = result if type(result) is str else 'error'
dict_result['风险类别3'] = result
dict_result['风险类别4'] = result
logger.info('ELSE 风险信息: {}'.format(result))
else:
dict_result['风险类别1'] = '无风险'
dict_result['风险类别2'] = '无风险'
dict_result['风险类别3'] = '无风险'
dict_result['风险类别4'] = '无风险'
logger.info('招聘股票|国家识别筛选: 无风险')
return dict_result
if __name__ == '__main__':
import os
import pandas
root_dir = '../data/datasource/test'
# file_name = 'br总资讯'
file_name = '境外快讯_1.4'
df = pandas.read_excel(os.path.join(root_dir, 'input_file/{}.xlsx'.format(file_name)))
df.drop_duplicates(subset='标题', keep='first', inplace=True)
list_title = df['标题']
list_content = df['正文']
dict_risk_result = {
'风险类别1': [],
'风险类别2': [],
'风险类别3': [],
'风险类别4': []
}
list_risk, list_risk_old = [], []
for index, (title, content) in enumerate(zip(list_title, list_content)):
dict_result = pred(title=title, content=content)
for key in dict_risk_result:
dict_risk_result[key].append(dict_result[key] if key in dict_result else 'error')
result_old = runner.pred(title=title, content=content)
list_risk_old.append(result_old)
logger.info('{} / {}\n'.format(index + 1, len(list_title)))
df['风险类别_old'] = list_risk_old
for key in dict_risk_result:
df[key] = dict_risk_result[key]
df.to_excel(os.path.join(root_dir, 'output_file/{}_result_20220112_s.xlsx'.format(file_name)))
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : test_label.py
# @Time : 2022/1/7 18:28
# @Author : Mr.Ygg
# @Software: PyCharm
import os
import pandas as pd
from classification.utils.utils import load_risk_keywords, is_include_compound_words
root_dir = '../data/datasource/test'
# file_name = '项目风险模型数据集_总'
file_name = '去重_F_ZP_GP'
df = pd.read_excel(os.path.join(root_dir, 'input_file/{}.xlsx'.format(file_name)))
list_title = df['标题']
list_content = df['正文']
list_country = []
with open('../config/country.txt', 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
list_country.append(line.strip().split('(')[0].split('(')[0])
# Risk-category keywords
dict_risk_keywords = load_risk_keywords('../config/risk_keywords.xlsx')
list_bool_yiqing = []
list_bool_country = []
list_risk_key_words_category = []
for title, content in zip(list_title, list_content):
if type(title) is float:
title = ''
if type(content) is float:
content = ''
    # Country recognition filter
bool_country = False
text = title + '。' + content[: len(content) // 5]
for country in list_country:
if country in text:
bool_country = True
list_bool_country.append('是')
break
if not bool_country:
list_bool_country.append('否')
text = title + '。' + content
    # keyword: 疫情 (epidemic)
if '疫情' in text:
list_bool_yiqing.append('是')
else:
list_bool_yiqing.append('否')
    # Risk keywords
dict_risk_keywords_num = {
risk_keywords_key: 0 for risk_keywords_key in dict_risk_keywords
}
bool_risk_keyword = False
risk_category = '无风险'
for risk_keywords_key in dict_risk_keywords_num:
for risk_keyword in dict_risk_keywords[risk_keywords_key]:
compound_words = risk_keyword.split('+')
if is_include_compound_words(text=text, compound_words=compound_words):
bool_risk_keyword = True
dict_risk_keywords_num[risk_keywords_key] += 1
if bool_risk_keyword:
risk_category = max(dict_risk_keywords_num, key=dict_risk_keywords_num.get)
list_risk_key_words_category.append(risk_category)
df['是否含"疫情"关键词'] = list_bool_yiqing
df['是否含一带一路相关国家'] = list_bool_country
df['关键词分类'] = list_risk_key_words_category
df.to_excel(os.path.join(root_dir, 'output_file/{}_result.xlsx'.format(file_name)))
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : test_label_merge.py
# @Time : 2022/1/10 10:32
# @Author : Mr.Ygg
# @Software: PyCharm
import os
import pandas as pd
root_dir = '../data/datasource/test'
df = pd.read_excel(os.path.join(root_dir, 'input_file/br风险模型数据集_总_20220110.xlsx'))
list_label_1 = df['风险类别'].to_list()
list_label_2 = df['修正风险类别'].to_list()
list_label_3 = df['雪珂终审'].to_list()
list_label = []
for label_1, label_2, label_3 in zip(
list_label_1, list_label_2, list_label_3
):
label = ''
if type(label_1) is str:
label = label_1
if type(label_2) is str:
label = label_2
if type(label_3) is str:
label = label_3
list_label.append(label)
df['label'] = list_label
df.to_excel(os.path.join(root_dir, 'output_file/br风险模型数据集_总_20220110.xlsx'))
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : test_merge.py
# @Time : 2022/1/7 17:23
# @Author : Mr.Ygg
# @Software: PyCharm
import os
import pandas as pd
dict_df = {
'标题': [],
'正文': [],
'状态': [],
'类型': []
}
root_dir = '../data/datasource/test/input_file'
list_file = os.listdir(root_dir)
for file_name in list_file:
file_path = os.path.join(root_dir, file_name)
print(file_path)
df = pd.read_excel(file_path)
list_title = df['标题'].to_list()
list_content = df['正文'].to_list()
list_status = df['审核状态'].to_list()
list_type = df['资讯类型'].to_list()
dict_df['标题'].extend(list_title)
dict_df['正文'].extend(list_content)
dict_df['状态'].extend(list_status)
dict_df['类型'].extend(list_type)
df = pd.DataFrame(dict_df)
df.to_excel(os.path.join(root_dir, 'br总资讯.xlsx'))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 16:40
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : utils
# @Author : LiuYan
# @Time : 2021/4/16 16:40
import re
import jieba
import pandas
from bs4 import BeautifulSoup
def clean_tag(text):
    """
    Strip HTML tags.
    :param text:
    :return:
    """
    bs = BeautifulSoup(str(text), 'html.parser')
    return bs.text
def clean_txt(raw):
    """
    Remove emoji and other astral-plane characters.
    :param raw:
    :return:
    """
    res = re.compile(u'[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]')
    return res.sub('', raw)
def seg(text, sw):
    """
    Tokenise with jieba and drop stop words. (An earlier variant used HanLP's
    NLPTokenizer, which segments with full NER and POS tagging; see below.)
    :param text:
    :param sw: stop-word list
    :return:
    """
    # text = ' '.join([i.word for i in NLPTokenizer.segment(text) if i.word.strip() and i.word not in sw])
    text = ' '.join([i.strip() for i in jieba.cut(text) if i.strip() and i not in sw])
    return text
def stop_words(path: str) -> list:
    """
    Load the stop-word list (one word per line).
    :return:
    """
    with open(path, 'r', encoding='utf-8') as swf:
        return [line.strip() for line in swf]
def segment_para(text):
"""
:param text:
:return:
"""
split_pattern = re.compile(r'\n|。|?|!|\?|\!|\s')
global_sentences = split_pattern.split(text)
global_sentences = ''.join([str(i).strip() + '。' for i in global_sentences if len(i) >= 13])
return global_sentences
def cut_sent(para):
    """
    Split a paragraph into sentences.
    :param para:
    :return:
    """
    para = re.sub('([。!?\?])([^”’])', r"\1\n\2", para)  # single-character terminators
    para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
    para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
    para = re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
    # A closing quote ends the sentence only when preceded by a terminator, so the
    # newline goes after the quote; the rules above deliberately keep the quotes.
    para = para.rstrip()  # drop any trailing \n at the end of the paragraph
    return para.split("\n")
def transform_data(text, label):
"""
:param text:
:param label:
:return:
"""
fasttext_line = '__label__{} {}'.format(label, text)
return fasttext_line
def load_risk_keywords(path: str) -> dict:
    """
    Load risk-category keywords from an Excel sheet (one column per category).
    :param path:
    :return:
    """
    df = pandas.read_excel(path)
dict_risk_keywords = dict()
for key in df:
list_risk_keywords = []
list_df = df[key].to_list()
for keyword in list_df:
if type(keyword) is str:
list_risk_keywords.append(keyword.strip())
dict_risk_keywords[key] = list_risk_keywords
return dict_risk_keywords
def is_include_compound_words(text: str, compound_words: list) -> bool:
    """
    Whether text contains every compound word, in the given order:
    each word must appear after the previous match.
    :param text:
    :param compound_words:
    :return: True if all match, else False
    """
for compound_word in compound_words:
if compound_word not in text:
return False
else:
text = text[text.find(compound_word) + len(compound_word):]
return True
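def _example_compound_words():
    """Hedged sketch (not called anywhere): compound words must appear in order."""
    text = '项目因工人罢工而停工'
    print(is_include_compound_words(text, ['罢工', '停工']))  # True: '停工' follows '罢工'
    print(is_include_compound_words(text, ['停工', '罢工']))  # False: order matters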
{
"port": 4005,
"ip": "114.116.90.53",
"model_name": "FastText-Model",
"train_url": "/platform/classification/FastText-Model/model_train/",
"application_url": "/platform/classification/FastText-Model/pred/",
"show_file_url": "/platform/operation/process/show_file/",
"remove_file_url": "/platform/operation/process/remove_file/",
"upload_file_url": "/platform/operation/process/upload_file/",
"publish_version_url": "/platform/operation/process/publish_version/",
"model_test_url": "/platform/operation/process/model_test/",
"dataset_saved_path": "../datasets/classification/FastText-Model",
"model_saved_path": "../../../model_saved/classification/FastText-Model",
"java_call_back_url": "http://114.115.205.50:9988/manage/algorithmModel/process/changeStatus",
"train_info": {
"modelProcessId": {
"paramter_name": "训练日志Id",
"paramter_data": "",
"paramter_description": "模型训练日志id"
},
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
},
"learning_rate": {
"paramter_name": "学习率",
"paramter_data": 0.03,
"paramter_description": "学习率"
},
"gpu": {
"paramter_name": "GPU",
"paramter_data": "",
"paramter_description": "是否使用GPU"
},
"data_path": {
"paramter_name": "语料版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——语料版本"
},
"model_path": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——模型版本"
}
},
"application_info": {
"title": {
"paramter_name": "文章标题",
"paramter_data": "",
"paramter_description": "文章标题"
},
"content": {
"paramter_name": "文章内容",
"paramter_data": "",
"paramter_description": "文章内容"
},
"id": {
"paramter_name": "文章id",
"paramter_data": "",
"paramter_description": "文章id"
}
},
"show_file_info": {
"file_path": {
"paramter_name": "查询文件的相对路径",
"paramter_data": "",
"paramter_description": "要查询的文件目录,注意这里是相对地址,eg: 查询语料保存根目录dataset_saved_path的语料情况可传入../datasets/classification/"
}
},
"remove_file_info": {
"file_path": {
"paramter_name": "删除文件的相对路径",
"paramter_data": "",
"paramter_description": "要删除的文件,注意这里是相对地址,eg: 删除语料保存根目录dataset_saved_path下的ssyw_column_classify语料文件夹可传入../datasets/classification/ssyw_column_classify"
},
"flag": {
"paramter_name": "文件删除标识",
"paramter_data": "",
"paramter_description": "删除文件还是文件夹的标识,删除文件时flag=“/”,删除文件夹时flag为空字符串"
}
},
"upload_file_info": {
"request_url": {
"paramter_name": "语料下载地址",
"paramter_data": "",
"paramter_description": "待上传的语料文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
},
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
}
},
"publish_version": {
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待发布的模型版本"
},
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
}
},
"model_test_info": {
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
},
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待测试的模型版本"
},
"data_type": {
"paramter_name": "测试方式",
"paramter_data": "",
"paramter_description": "可选项:url地址解析标题正文|file文件"
},
"request_url": {
"paramter_name": "测试文件下载地址",
"paramter_data": "",
"paramter_description": "待上传的测试文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
}
}
}
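#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Hedged client sketch (not part of the original repo): read the service-description
# JSON above and call its model_test_url. The file name 'service_config.json' and
# the payload values are assumptions based on the "model_test_info" section.
import json
import requests

with open('service_config.json', 'r', encoding='utf-8') as f:
    cfg = json.load(f)

resp = requests.post(
    url='http://{}:{}{}'.format(cfg['ip'], cfg['port'], cfg['model_test_url']),
    headers={'Content-Type': 'application/json'},
    data=json.dumps({
        'task_id': '11111',             # hypothetical task id
        'trainModelName': 'V0',         # hypothetical model version
        'data_type': 'url',
        'request_url': 'http://example.com/test.xlsx'  # hypothetical test file
    }, ensure_ascii=False).encode('utf-8')
)
print(resp.text)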
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import requests
import json
# Java callback endpoint
java_call_back_url = "http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus"
# Log output format
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a logger and set its level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
url = "http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus"
payload = json.dumps({
"result": "{'code': 200, 'result': '模型训练成功!模型评测指标为: precision: 100% recall: 100% f1-score: 100%', 'model_path': '../../../model_saved/classification/FastText-Model/11111/V0-2023_06_11-15_33_15/model.bin'}",
"id": "1455372078906662913"
})
headers = {
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
# dict_result = {'code': 200, 'result': '模型训练成功!模型评测指标为: precision: 100% recall: 100% f1-score: 100%', 'model_path': '../../../model_saved/classification/FastText-Model/11111/V0-2023_06_11-15_33_15/model.bin'}
# modelProcessId = "1455372078906662913"
# str_dict_result = json.dumps(dict_result, ensure_ascii=False)
# print(str_dict_result)
# # TODO: report the training result back through the Java status-update endpoint
# payload = json.dumps({
# "id": modelProcessId,
# "result": str_dict_result
# })
# print(payload)
# # TODO: generate currentTime / appId through the parameter-generation endpoint
# headers = {
# 'Content-Type': 'application/json'
# }
# r1 = requests.post(url="http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus",
# headers=headers, data=payload)
#
# r1_json = json.loads(r1.text)
# # print(r1_json)
# print(r1_json)
# python3.9.5
gunicorn==20.1.0
beautifulsoup4==4.11.1
datasketch==1.5.3
dynamic_yaml==1.2.3
emoji==1.4.2
Flask==2.0.1
hanlp==2.1.0b3
jieba==0.42.1
jionlp_py39==1.3.45
keras_bert==0.88.0
matplotlib==3.3.4
numpy==1.19.5
pandas==1.1.5
psutil==5.8.0
PyMySQL==1.0.2
python_Levenshtein==0.20.5
pytorch_pretrained_bert==0.6.2
PyYAML==5.3.1
rarfile==4.0
requests==2.28.1
scikit_learn==1.1.2
seaborn==0.11.2
simhash==2.0.0
tensorflow==2.6.0
torch==1.9.0
tqdm==4.62.2
Werkzeug==2.2.2
xlrd==1.1.0
XlsxWriter==3.0.1
protobuf==3.19.5
Levenshtein==0.20.5
fasttext==0.9.2
#!/bin/sh
# Run the API (gunicorn) and the training loop in the background; logs go to separate files.
nohup gunicorn -c app/app_config.py app/app_run:app --timeout 1200 >gunicorn.log 2>&1 &
nohup python app/main_server.py >service.log 2>&1 &
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 17:36
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : build_word2vec_weights
# @Author : LiuYan
# @Time : 2020/6/24 14:46
from itertools import islice
import numpy as np
import torch
from utils.utils import timeit
@timeit
def load_word2vec(path=None, word_vocab=None, embedding_dim=None):
    """
    loading word vector
    :param path: None
    :param word_vocab: None
    :param embedding_dim: 768/100 bert/glove.6B.100d
    :return: a vector corresponding to word_vocab.
    """
    word_vocab_dict = word_vocab.stoi
    vectors_vocab = load_vec(path, embedding_dim=embedding_dim)
    # Default pad/unk to zero vectors so they are always defined, even when the
    # vector file has no [PAD]/[UNK] entries (the original left them unbound).
    pad = [0.0] * embedding_dim
    unk = [0.0] * embedding_dim
    if '[PAD]' in vectors_vocab:
        pad = vectors_vocab['[PAD]']
    elif 'pad' in vectors_vocab:
        pad = vectors_vocab['pad']
    if '[UNK]' in vectors_vocab:
        unk = vectors_vocab['[UNK]']
    elif 'unk' in vectors_vocab:
        unk = vectors_vocab['unk']
vocab_size = len(word_vocab)
embed_weights = torch.zeros(vocab_size, embedding_dim)
for word, index in word_vocab_dict.items(): # word and index
if word in vectors_vocab:
em = vectors_vocab[word]
elif word == '<pad>':
em = pad
else:
em = unk
embed_weights[index, :] = torch.from_numpy(np.array(em))
return embed_weights
@timeit
def load_vec(path=None, embedding_dim=None):
"""
loading word vector
:param path: None
:param embedding_dim: 768/100 bert/glove.6B.100d
:return: a dictionary of word vectors
"""
vectors_vocab = {}
with open(path, 'r', encoding='utf-8') as f:
for line in islice(f, 1, None): # skip the first row
items = line.split()
char, vectors = items[0], items[-embedding_dim:]
vectors = [float(vector) for vector in vectors]
vectors_vocab[char] = vectors
return vectors_vocab
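def _example_load_vec():
    """Hedged sketch (not called anywhere): load_vec expects a word2vec-style text
    file, a header row that is skipped, then '<token> <v1> ... <vN>' per line.
    The file name is hypothetical."""
    vectors = load_vec(path='glove.6B.100d.txt', embedding_dim=100)
    print(len(vectors), len(next(iter(vectors.values()))))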
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : database_mysql
# @Author : LiuYan
# @Time : 2021/9/14 17:51
import time
import pymysql
from typing import Optional

from base.config.base_config import db_config
class DatabaseMySQL(object):
def __init__(self):
super(DatabaseMySQL, self).__init__()
self._conn = None
self._cursor = None
self._connect()
def _connect(self) -> None:
self._conn = pymysql.connect(**db_config)
self._cursor = self._conn.cursor()
def query(self, id_model_process: str) -> list:
        # Fetch the model-process record for the given id. Note: the id is
        # formatted directly into the SQL string; a parameterized sketch
        # follows this module.
sql_query = 'select * from brpa_algorithm_model_process where id={};'.format(id_model_process)
print('SQL: {}'.format(sql_query))
self._cursor.execute(sql_query)
list_result = self._cursor.fetchall()
return list_result
    def update(self, id_model_process: str, process_result: str, model_path: str or None, status: int,
               update_by="'yan'", update_time=None) -> None:
        # Replace single quotes inside process_result with double quotes so the
        # SQL string literal below stays well formed.
        process_result = process_result.replace("'", '"')
        # A default argument would be frozen at import time, so the timestamp
        # is computed per call.
        update_time = time.strftime('%Y-%m-%d %H:%M:%S')
sql_update = '''update brpa_algorithm_model_process
set process_result = '{}', model_path = '{}', status = {}, update_by = {}, update_time = '{}'
where id = {};'''.format(
process_result, model_path, status, update_by, update_time, id_model_process
) if model_path else '''update brpa_algorithm_model_process
set process_result = '{}', status = {}, update_by = {}, update_time = '{}'
where id = {};'''.format(
process_result, status, update_by, update_time, id_model_process
)
print('SQL: {}'.format(sql_update))
self._cursor.execute(sql_update)
self._conn.commit()
def close(self) -> None:
self._cursor.close()
self._conn.close()
if __name__ == '__main__':
import json
id_model_process = '1453295293008211969'
dict_result = {
'result': '训练成功!模型评测指标为: precision: {:.0f}% recall: {:.0f}% f1-score: {:.0f}%'.format(
0.91111111111111 * 100,
0.91111111111111 * 100,
0.91111111111111 * 100
)
}
dbm = DatabaseMySQL()
list_result = dbm.query(id_model_process=id_model_process)
model_path = '/home/zzsn/liuyan/zzsn_nlp_br/classification/model/model_saved/fast_text-pro_info_filter-2021_10_14-18_37_50/model.bin'
dbm.update(id_model_process=id_model_process, process_result=dict_result['result'], model_path=model_path, status=1)
dict_result = {
'result': '训练失败!'
}
dbm.update(id_model_process='1453536215885279233', process_result=dict_result['result'], model_path=None, status=2)
list_result = dbm.query(id_model_process=id_model_process)
dbm.close()
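# A parameterized variant of DatabaseMySQL.update, shown as a sketch: letting
# pymysql bind the values removes both the manual quote replacement and the
# SQL-injection risk of formatting values into the statement. The helper name
# update_safe is hypothetical.
import time
import pymysql

def update_safe(conn: pymysql.connections.Connection, id_model_process: str,
                process_result: str, model_path, status: int, update_by: str = 'yan') -> None:
    update_time = time.strftime('%Y-%m-%d %H:%M:%S')
    if model_path:
        sql = ('update brpa_algorithm_model_process '
               'set process_result=%s, model_path=%s, status=%s, update_by=%s, update_time=%s '
               'where id=%s;')
        args = (process_result, model_path, status, update_by, update_time, id_model_process)
    else:
        sql = ('update brpa_algorithm_model_process '
               'set process_result=%s, status=%s, update_by=%s, update_time=%s '
               'where id=%s;')
        args = (process_result, status, update_by, update_time, id_model_process)
    with conn.cursor() as cursor:
        cursor.execute(sql, args)  # every %s value is escaped by the driver
    conn.commit()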
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : log
# @Author : LiuYan
# @Time : 2020/6/21 21:08
import os
import logging
import logging.handlers
from pathlib import Path
__all__ = ['logger']
# User-configurable section ↓
import tqdm
LEVEL_COLOR = {
'DEBUG': 'cyan',
'INFO': 'green',
'WARNING': 'yellow',
'ERROR': 'red',
'CRITICAL': 'red,bg_white',
}
STDOUT_LOG_FMT = '%(log_color)s[%(asctime)s] [%(levelname)s] [%(threadName)s] [%(filename)s:%(lineno)d] %(message)s'
STDOUT_DATE_FMT = '%Y-%m-%d %H:%M:%S'
FILE_LOG_FMT = '[%(asctime)s] [%(levelname)s] [%(threadName)s] [%(filename)s:%(lineno)d] %(message)s'
FILE_DATE_FMT = '%Y-%m-%d %H:%M:%S'
# User-configurable section ↑
class ColoredFormatter(logging.Formatter):
COLOR_MAP = {
'black': '30',
'red': '31',
'green': '32',
'yellow': '33',
'blue': '34',
'magenta': '35',
'cyan': '36',
'white': '37',
'bg_black': '40',
'bg_red': '41',
'bg_green': '42',
'bg_yellow': '43',
'bg_blue': '44',
'bg_magenta': '45',
'bg_cyan': '46',
'bg_white': '47',
'light_black': '1;30',
'light_red': '1;31',
'light_green': '1;32',
'light_yellow': '1;33',
'light_blue': '1;34',
'light_magenta': '1;35',
'light_cyan': '1;36',
'light_white': '1;37',
'light_bg_black': '100',
'light_bg_red': '101',
'light_bg_green': '102',
'light_bg_yellow': '103',
'light_bg_blue': '104',
'light_bg_magenta': '105',
'light_bg_cyan': '106',
'light_bg_white': '107',
}
def __init__(self, fmt, datefmt):
super(ColoredFormatter, self).__init__(fmt, datefmt)
def parse_color(self, level_name):
color_name = LEVEL_COLOR.get(level_name, '')
if not color_name:
return ""
color_value = []
color_name = color_name.split(',')
for _cn in color_name:
color_code = self.COLOR_MAP.get(_cn, '')
if color_code:
color_value.append(color_code)
return '\033[' + ';'.join(color_value) + 'm'
def format(self, record):
record.log_color = self.parse_color(record.levelname)
message = super(ColoredFormatter, self).format(record) + '\033[0m'
return message
class TqdmLoggingHandler(logging.Handler):
def __init__(self, level=logging.NOTSET):
super().__init__(level)
def emit(self, record):
try:
msg = self.format(record)
tqdm.tqdm.write(msg)
self.flush()
except (KeyboardInterrupt, SystemExit):
raise
except:
self.handleError(record)
def _get_logger(log_to_file=True, log_filename='default.log', log_level='DEBUG'):
_logger = logging.getLogger(__name__)
stdout_handler = logging.StreamHandler()
stdout_handler.setFormatter(
ColoredFormatter(
fmt=STDOUT_LOG_FMT,
datefmt=STDOUT_DATE_FMT,
)
)
_logger.addHandler(stdout_handler)
# _logger.setLevel(logging.INFO)
# _logger.addHandler(TqdmLoggingHandler())
if log_to_file:
# _tmp_path = os.path.dirname(os.path.abspath(__file__))
# _tmp_path = os.path.join(_tmp_path, '../logs/{}'.format(log_filename))
_project_path = os.path.dirname(os.getcwd())
_tmp_path = os.path.join(_project_path, 'logs')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
_tmp_path = os.path.join(_tmp_path, log_filename)
file_handler = logging.handlers.TimedRotatingFileHandler(_tmp_path, when='midnight', backupCount=30)
file_formatter = logging.Formatter(
fmt=FILE_LOG_FMT,
datefmt=FILE_DATE_FMT,
)
file_handler.setFormatter(file_formatter)
_logger.addHandler(file_handler)
_logger.setLevel(log_level)
return _logger
logger = _get_logger(log_to_file=False)
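# Usage sketch (assuming this module is importable as utils.log, matching the
# utils.utils import used elsewhere in the project; file logging is opt-in via
# _get_logger(log_to_file=True)):
#   from utils.log import logger
#   logger.info('model loaded')
#   logger.warning('low disk space on %s', '/data')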
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : tool
# @Author : LiuYan
# @Time : 2021/6/21 11:22
import re
import json
def read_json(path: str) -> list:
    # Each line of the file is one JSON object (JSON Lines format).
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]
def clean_text(text: str) -> str:
    # Remove spaces, tabs and carriage returns outright (harmless for Chinese
    # text, where ASCII whitespace carries no meaning) and collapse blank lines.
    return re.sub('\n+', '\n', text.strip().replace(' ', '').replace('\t', '').replace('\r', ''))
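# Quick illustration (not in the original file):
#   clean_text('a \n\n\n b\tc\r')  ->  'a\nbc'
# Spaces and tabs are removed outright and runs of newlines collapse to one.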
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : utils
# @Author : LiuYan
# @Time : 2021/4/16 17:54
from __future__ import unicode_literals, print_function, division
import time
import xlsxwriter
def timeit(f):
def timed(*args, **kw):
ts = time.time()
print('......begin {0:8s}......'.format(f.__name__))
result = f(*args, **kw)
te = time.time()
print('......finish {0:8s}, took:{1:.4f} sec......'.format(f.__name__, te - ts))
return result
return timed
def list2xlsx(result_list: list, xlsx_path: str):
    """
    Write a list of homogeneous dicts to an xlsx file, one dict per row.
    :param result_list: [
        {
            'id': 1,
            'title': 't',
            ...
        }
        ...
    ]
    :param xlsx_path: '/home/zzsn/liuyan/result/result.xlsx'
    :return:
    """
    workbook = xlsxwriter.Workbook(xlsx_path)
    worksheet = workbook.add_worksheet('sheet1')
    # The header row comes from the keys of the first record, so result_list
    # must be non-empty and all records should share one key set.
    worksheet.write_row(row=0, col=0, data=list(result_list[0].keys()))
    for row_index, result_dict in enumerate(result_list):
        worksheet.write_row(row=row_index + 1, col=0, data=[
            ';'.join(result) if isinstance(result, (list, set)) else result
            for result in result_dict.values()
        ])
    workbook.close()
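# Usage sketch ('result.xlsx' is a hypothetical output path):
#   list2xlsx(
#       result_list=[{'id': 1, 'title': 't', 'labels': ['a', 'b']}],
#       xlsx_path='result.xlsx',
#   )
# This writes a header row (id, title, labels) and one data row in which the
# list value is joined to 'a;b'.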
Corpus-directory listing and service URL composition (a sketch assembling
these URLs follows the config block below):
The real show_file_url is: http://ip:port + operation_prefix + show_file_url
The real upload_file_url is: http://ip:port + operation_prefix + upload_file_url
The real publish_version_url is: http://ip:port + operation_prefix + publish_version_url
The real model_test_url is: http://ip:port + operation_prefix + model_test_url
The real train_url is: http://ip:port + train_url
The real application_url is: http://ip:port + application_prefix + /pred/
# The real remove_file_url is: http://ip:port + operation_prefix + remove_file_url
http://114.116.90.53:4004/new_task/
{
"port": 4004,
"ip": "114.116.90.53",
"model_name": "ssyw_column_classify",
"operation_prefix": "/platform/operation/process",
"application_prefix": "/platform/classification/ssyw_column/classify",
"train_url": "/platform/classification/ssyw_column/classify/model_train/",
"application_url": "/pred/",
"show_file_url": "/show_file/",
"remove_file_url": "/remove_file/",
"upload_file_url": "/upload_file/",
"publish_version_url": "/publish_version/",
"model_test_url": "/model_test/",
"dataset_saved_path": "../datasets/classification",
"model_saved_path": "../../../model_saved/classification",
"java_call_back_url": "http://114.115.205.50:9988/manage/algorithmModel/process/changeStatus",
"train_info": {
"modelProcessId": {
"paramter_name": "模型任务Id",
"paramter_data": "",
"paramter_description": "模型训练任务id,关联哪个模型"
},
"learning_rate": {
"paramter_name": "学习率",
"paramter_data": 0.03,
"paramter_description": "学习率"
},
"epoch": {
"paramter_name": "训练轮数",
"paramter_data": 10,
"paramter_description": "训练轮数"
},
"gpu": {
"paramter_name": "GPU",
"paramter_data": "",
"paramter_description": "是否使用GPU"
},
"data_path": {
"paramter_name": "语料版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——语料版本"
},
"model_path": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——模型版本"
}
},
"application_info": {
"title": {
"paramter_name": "文章标题",
"paramter_data": "",
"paramter_description": "文章标题"
},
"content": {
"paramter_name": "文章内容",
"paramter_data": "",
"paramter_description": "文章内容"
},
"id": {
"paramter_name": "文章id",
"paramter_data": "",
"paramter_description": "文章id"
}
},
"show_file_info": {
"file_path": {
"paramter_name": "查询文件的相对路径",
"paramter_data": "",
"paramter_description": "要查询的文件目录,注意这里是相对地址,eg: 查询语料保存根目录dataset_saved_path的语料情况可传入../datasets/classification/"
}
},
"remove_file_info": {
"file_path": {
"paramter_name": "删除文件的相对路径",
"paramter_data": "",
"paramter_description": "要删除的文件,注意这里是相对地址,eg: 删除语料保存根目录dataset_saved_path下的ssyw_column_classify语料文件夹可传入../datasets/classification/ssyw_column_classify"
},
"flag": {
"paramter_name": "文件删除标识",
"paramter_data": "",
"paramter_description": "删除文件还是文件夹的标识,删除文件时flag=“/”,删除文件夹时flag为空字符串"
}
},
"upload_file_info": {
"url_path": {
"paramter_name": "语料下载地址",
"paramter_data": "",
"paramter_description": "待上传的语料文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
},
"dataFolderName": {
"paramter_name": "语料版本名称",
"paramter_data": "",
"paramter_description": "待上传的语料版本名称,在训练的时候使用"
}
},
"publish_version": {
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待发布的模型版本"
},
"versionName": {
"paramter_name": "发布版本号",
"paramter_data": "",
"paramter_description": "待发布的版本号"
}
},
"model_test_info": {
"modelProcessId": {
"paramter_name": "模型任务Id",
"paramter_data": "",
"paramter_description": "模型训练任务id,关联哪个模型"
},
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待测试的模型版本"
},
"data_type": {
"paramter_name": "测试方式",
"paramter_data": "",
"paramter_description": "可选项:url地址解析|file文件"
},
"url_path": {
"paramter_name": "测试文件下载地址",
"paramter_data": "",
"paramter_description": "待上传的测试文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
},
"title": {
"paramter_name": "文章标题",
"paramter_data": "",
"paramter_description": "文章标题"
},
"content": {
"paramter_name": "文章内容",
"paramter_data": "",
"paramter_description": "文章内容"
}
}
}
\ No newline at end of file
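# Sketch: assembling the full operation URLs from the config above with plain
# string concatenation (values are taken verbatim from the JSON):
config = {
    "ip": "114.116.90.53",
    "port": 4004,
    "operation_prefix": "/platform/operation/process",
    "show_file_url": "/show_file/",
    "train_url": "/platform/classification/ssyw_column/classify/model_train/",
}
base = "http://{ip}:{port}".format(**config)
show_file = base + config["operation_prefix"] + config["show_file_url"]
train = base + config["train_url"]
# show_file -> http://114.116.90.53:4004/platform/operation/process/show_file/
# train     -> http://114.116.90.53:4004/platform/classification/ssyw_column/classify/model_train/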
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import socket
import os
import psutil
# CPU load, read from /proc/stat
def get_cpu():
    # The counters in /proc/stat are cumulative since boot, so a single sample
    # yields the average utilization since boot, not the instantaneous load.
    with open("/proc/stat", "r") as f:
        line = ""
        while "cpu " not in line:
            line = f.readline()
    spl = line.split()  # split on any whitespace; the "cpu" line is double-spaced
    worktime = int(spl[1]) + int(spl[2]) + int(spl[3])  # user + nice + system
    idletime = int(spl[4])
    if worktime + idletime == 0:
        return 0
    return float(worktime) / (idletime + worktime)
def get_hostname():
return socket.gethostname()
def get_uptime():
with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.readline().split()[0])
uptime_minutes, uptime_seconds = divmod(uptime_seconds, 60)
uptime_hours, uptime_minutes = divmod(uptime_minutes, 60)
uptime_days, uptime_hours = divmod(uptime_hours, 24)
return f"{int(uptime_days)} days, {int(uptime_hours)} hours, {int(uptime_minutes)} minutes, {int(uptime_seconds)} seconds"
def get_kernel_version():
return os.uname().release
# CPU usage as a percentage string
def get_cpu_info():
    cpu_usage = int(get_cpu() * 100)
    # cpu_tip = "CPU usage (max 100%): " + str(cpu_usage) + "%"
    return str(cpu_usage)
def get_memory_info():
memory_info = psutil.virtual_memory()
return f"Total memory: {memory_info.total / 1024 / 1024:.2f} MB\nUsed memory: {memory_info.used / 1024 / 1024:.2f} MB\nFree memory: {memory_info.available / 1024 / 1024:.2f} MB"
def get_disk_usage():
partitions = psutil.disk_partitions()
disk_usage = ""
for partition in partitions:
usage = psutil.disk_usage(partition.mountpoint)
disk_usage += f"{partition.mountpoint} - Total: {usage.total / 1024 / 1024:.2f} MB, Used: {usage.used / 1024 / 1024:.2f} MB, Free: {usage.free / 1024 / 1024:.2f} MB\n"
return disk_usage
def get_network_interfaces():
interfaces = psutil.net_if_addrs()
network_interfaces = ""
for interface_name, interface_addresses in interfaces.items():
network_interfaces += f"{interface_name}\n"
for address in interface_addresses:
if address.family == socket.AF_INET:
network_interfaces += f" IP address: {address.address}\n"
network_interfaces += f" Netmask: {address.netmask}\n"
elif address.family == socket.AF_PACKET:
network_interfaces += f" MAC address: {address.address}\n"
return network_interfaces
def main_pro():
hostname = get_hostname()
UpTime = get_uptime()
KN_Version = get_kernel_version()
CPU_Info = get_cpu_info()
Memory_Info = get_memory_info()
Disk_Usage = get_disk_usage()
Network_Interfaces = get_network_interfaces()
dict_result = {
"HostName": hostname,
"UpTime": UpTime,
"KN_Version": KN_Version,
"CPU_Info": CPU_Info,
"Memory_Info": Memory_Info,
"Disk_Usage": Disk_Usage,
"Network_Interfaces": Network_Interfaces
}
return dict_result
if __name__ == "__main__":
print(f"Hostname: {get_hostname()}")
print(f"Uptime: {get_uptime()}")
print(f"Kernel version: {get_kernel_version()}")
print(f"CPU information:\n{get_cpu_info()}")
print(f"Memory information:\n{get_memory_info()}")
print(f"Disk usage:\n{get_disk_usage()}")
print(f"Network interfaces:\n{get_network_interfaces()}")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Model-task creation service
http://114.116.90.53:4004/new_task/
"""
import os
import sys, json
import logging
import requests
import argparse
import queue
from pathlib import Path
from flask import Flask, jsonify, request
from main_model import main_info
import re
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
app = Flask(__name__)
# TODO: the model name is represented by its directory name under root_path
root_path = "../"
# Cross-origin (CORS) support
from flask_cors import CORS
CORS(app, supports_credentials=True)
@app.route('/', methods=['POST'])
def hello_world():
app.logger.info('请选择正确的方式上传!')
return '请选择正确的方式上传!'
@app.route(f'/get_server_info/', methods=['GET', 'POST'])
def get_server_info():
dict_result = main_info()
app.logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route(f'/new_task/', methods=['POST'])
def build_task():
try:
params = json.loads(request.data.decode('utf-8'))
modelName = params["modelName"]
modelPath = os.path.join(root_path, modelName)
if modelName:
            # Return the contents of config.json from the model's directory.
            config_path = os.path.join(modelPath, "config.json")
            with open(config_path, 'r', encoding='utf-8') as f:
                config_json = json.load(f)
dict_result = {
"code": 200,
'handleMsg': 'Success',
'logs': None,
"resultData": config_json
}
else:
dict_result = {
"code": 500,
'handleMsg': 'Failure',
'logs': None,
"resultData": "请选择模型管理中存在的模型来进行创建模型任务!"
}
except Exception as e:
dict_result = {
'code': 500,
'success': 'false',
'message': "操作失败" + str(e),
'result': None
}
app.logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
if __name__ == '__main__':
app.config['JSON_AS_ASCII'] = False
app.config['JSONIFY_MIMETYPE'] = "application/json;charset=utf-8"
app.run(host='0.0.0.0', port=4004, debug=False)
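# Client-side sketch for the /new_task/ endpoint above ('ssyw_column_classify'
# is the example model name from the config shown earlier):
#   import json, requests
#   resp = requests.post('http://114.116.90.53:4004/new_task/',
#                        headers={'Content-Type': 'application/json'},
#                        data=json.dumps({'modelName': 'ssyw_column_classify'}))
#   print(resp.json())  # -> {'code': 200, 'handleMsg': 'Success', ...}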
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import socket
import os
import psutil
import platform
# CPU load, read from /proc/stat
def get_cpu():
    # The counters in /proc/stat are cumulative since boot, so a single sample
    # yields the average utilization since boot, not the instantaneous load.
    with open("/proc/stat", "r") as f:
        line = ""
        while "cpu " not in line:
            line = f.readline()
    spl = line.split()  # split on any whitespace; the "cpu" line is double-spaced
    worktime = int(spl[1]) + int(spl[2]) + int(spl[3])  # user + nice + system
    idletime = int(spl[4])
    if worktime + idletime == 0:
        return 0
    return float(worktime) / (idletime + worktime)
def get_hostname():
return socket.gethostname()
def get_uptime():
with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.readline().split()[0])
uptime_minutes, uptime_seconds = divmod(uptime_seconds, 60)
uptime_hours, uptime_minutes = divmod(uptime_minutes, 60)
uptime_days, uptime_hours = divmod(uptime_hours, 24)
return f"{int(uptime_days)} days, {int(uptime_hours)} hours, {int(uptime_minutes)} minutes, {int(uptime_seconds)} seconds"
def get_kernel_version():
return os.uname().release
# CPU usage as a percentage string
def get_cpu_info():
    cpu_usage = int(get_cpu() * 100)
    # cpu_tip = "CPU usage (max 100%): " + str(cpu_usage) + "%"
    return str(cpu_usage)
def get_memory_info():
memory_info = psutil.virtual_memory()
return f"Total memory: {memory_info.total / 1024 / 1024:.2f} MB\nUsed memory: {memory_info.used / 1024 / 1024:.2f} MB\nFree memory: {memory_info.available / 1024 / 1024:.2f} MB"
def get_disk_usage():
partitions = psutil.disk_partitions()
disk_usage = ""
for partition in partitions:
usage = psutil.disk_usage(partition.mountpoint)
disk_usage += f"{partition.mountpoint} - Total: {usage.total / 1024 / 1024:.2f} MB, Used: {usage.used / 1024 / 1024:.2f} MB, Free: {usage.free / 1024 / 1024:.2f} MB\n"
return disk_usage
def get_network_interfaces():
interfaces = psutil.net_if_addrs()
network_interfaces = ""
for interface_name, interface_addresses in interfaces.items():
network_interfaces += f"{interface_name}\n"
for address in interface_addresses:
if address.family == socket.AF_INET:
network_interfaces += f" IP address: {address.address}\n"
network_interfaces += f" Netmask: {address.netmask}\n"
elif address.family == socket.AF_PACKET:
network_interfaces += f" MAC address: {address.address}\n"
return network_interfaces
def get_public_ip():
"""
    Return the local IP address used for outbound traffic (the source address
    of the default route); despite the name, this is usually a private IP.
"""
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(("8.8.8.8", 80))
ip = s.getsockname()[0]
s.close()
return ip
def main_pro():
hostname = get_hostname()
# UpTime = get_uptime()
KN_Version = get_kernel_version()
CPU_Info = get_cpu_info()
Memory_Info = get_memory_info()
Disk_Usage = get_disk_usage()
ip = get_public_ip()
dict_result = {
"HostName": hostname,
# "UpTime": UpTime,
"KN_Version": KN_Version,
"CPU_Info": CPU_Info,
"Memory_Info": Memory_Info,
"Disk_Usage": Disk_Usage,
"Network_Interfaces": ip
}
return dict_result
def main_info():
    # OS information
    os_info = platform.platform()
    # Processor information
    processor_info = platform.processor()
    # Available memory
    mem_info = psutil.virtual_memory()
    available_mem = round(mem_info.available / 1024 / 1024, 2)
    # Available disk space
    disk_info = psutil.disk_usage('/')
    available_disk = round(disk_info.free / 1024 / 1024, 2)
    # Local (private) IP: get_public_ip() returns the outbound source address
    ip = get_public_ip()
    # Print the machine information
    print("操作系统:", os_info)
    print("处理器型号:", processor_info)
    print("可用内存大小:", available_mem, "MB")
    print("可用硬盘大小:", available_disk, "MB")
    print("ip地址:", ip)
    dict_result = {
        "操作系统:": os_info,
        "处理器型号:": processor_info,
        "可用内存大小:": available_mem,
        "可用硬盘大小:": available_disk,
        # The known public address is hard-coded here because get_public_ip()
        # yields only the private/outbound address.
        "ip地址:": "114.116.90.53"
    }
return dict_result
if __name__ == "__main__":
main_info()
# import requests
#
# response = requests.get('https://api.ipify.org')
# public_ip = response.text
#
# print(public_ip)
# dict_result = main_pro()
# print(dict_result)