Commit 91d93313 Author: bruxellse_li

Platform model management

Parent 6291eec9
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="python@180.76.177.55:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.130.239:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.9.59:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="49">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="tqdm" />
<item index="2" class="java.lang.String" itemvalue="transformers" />
<item index="3" class="java.lang.String" itemvalue="sentencepiece" />
<item index="4" class="java.lang.String" itemvalue="keras" />
<item index="5" class="java.lang.String" itemvalue="gevent" />
<item index="6" class="java.lang.String" itemvalue="torch" />
<item index="7" class="java.lang.String" itemvalue="numpy" />
<item index="8" class="java.lang.String" itemvalue="Flask" />
<item index="9" class="java.lang.String" itemvalue="thulac" />
<item index="10" class="java.lang.String" itemvalue="beautifulsoup4" />
<item index="11" class="java.lang.String" itemvalue="fdfs_client" />
<item index="12" class="java.lang.String" itemvalue="pymysql" />
<item index="13" class="java.lang.String" itemvalue="selenium" />
<item index="14" class="java.lang.String" itemvalue="matplotlib" />
<item index="15" class="java.lang.String" itemvalue="pyecharts" />
<item index="16" class="java.lang.String" itemvalue="requests" />
<item index="17" class="java.lang.String" itemvalue="docx" />
<item index="18" class="java.lang.String" itemvalue="flask_sqlalchemy" />
<item index="19" class="java.lang.String" itemvalue="scikit_learn" />
<item index="20" class="java.lang.String" itemvalue="gensim" />
<item index="21" class="java.lang.String" itemvalue="sentence_transformers" />
<item index="22" class="java.lang.String" itemvalue="elasticsearch" />
<item index="23" class="java.lang.String" itemvalue="nltk" />
<item index="24" class="java.lang.String" itemvalue="symspellpy" />
<item index="25" class="java.lang.String" itemvalue="wordcloud" />
<item index="26" class="java.lang.String" itemvalue="concurrent_log_handler" />
<item index="27" class="java.lang.String" itemvalue="setuptools" />
<item index="28" class="java.lang.String" itemvalue="gunicorn" />
<item index="29" class="java.lang.String" itemvalue="jieba" />
<item index="30" class="java.lang.String" itemvalue="flask" />
<item index="31" class="java.lang.String" itemvalue="flak_cors" />
<item index="32" class="java.lang.String" itemvalue="paddle" />
<item index="33" class="java.lang.String" itemvalue="bert_serving" />
<item index="34" class="java.lang.String" itemvalue="certifi" />
<item index="35" class="java.lang.String" itemvalue="SQLAlchemy" />
<item index="36" class="java.lang.String" itemvalue="xlrd" />
<item index="37" class="java.lang.String" itemvalue="bert_serving_client" />
<item index="38" class="java.lang.String" itemvalue="pytime" />
<item index="39" class="java.lang.String" itemvalue="goose3" />
<item index="40" class="java.lang.String" itemvalue="Flask_Cors" />
<item index="41" class="java.lang.String" itemvalue="paddlepaddle" />
<item index="42" class="java.lang.String" itemvalue="trustai" />
<item index="43" class="java.lang.String" itemvalue="paddle_serving_client" />
<item index="44" class="java.lang.String" itemvalue="tritonclient" />
<item index="45" class="java.lang.String" itemvalue="paddle_serving_server" />
<item index="46" class="java.lang.String" itemvalue="paddlenlp" />
<item index="47" class="java.lang.String" itemvalue="openai" />
<item index="48" class="java.lang.String" itemvalue="feedparser" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Model-Management.iml" filepath="$PROJECT_DIR$/.idea/Model-Management.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Remote Python 3.9.5 (sftp://root@114.116.90.53:22/home/python/anaconda3/envs/JXYQ@py39/bin/python3.9)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="On explicit save action" serverName="FastText-Model" remoteFilesAllowedToDisappearOnAutoupload="false" autoUploadExternalChanges="true">
<serverData>
<paths name="FastText-Model">
<serverdata>
<mappings>
<mapping deploy="/" local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="python@180.76.177.55:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.130.239:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.9.59:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.90.53:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ON_EXPLICIT_SAVE" />
</component>
</project>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="49">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="tqdm" />
<item index="2" class="java.lang.String" itemvalue="transformers" />
<item index="3" class="java.lang.String" itemvalue="sentencepiece" />
<item index="4" class="java.lang.String" itemvalue="keras" />
<item index="5" class="java.lang.String" itemvalue="gevent" />
<item index="6" class="java.lang.String" itemvalue="torch" />
<item index="7" class="java.lang.String" itemvalue="numpy" />
<item index="8" class="java.lang.String" itemvalue="Flask" />
<item index="9" class="java.lang.String" itemvalue="thulac" />
<item index="10" class="java.lang.String" itemvalue="beautifulsoup4" />
<item index="11" class="java.lang.String" itemvalue="fdfs_client" />
<item index="12" class="java.lang.String" itemvalue="pymysql" />
<item index="13" class="java.lang.String" itemvalue="selenium" />
<item index="14" class="java.lang.String" itemvalue="matplotlib" />
<item index="15" class="java.lang.String" itemvalue="pyecharts" />
<item index="16" class="java.lang.String" itemvalue="requests" />
<item index="17" class="java.lang.String" itemvalue="docx" />
<item index="18" class="java.lang.String" itemvalue="flask_sqlalchemy" />
<item index="19" class="java.lang.String" itemvalue="scikit_learn" />
<item index="20" class="java.lang.String" itemvalue="gensim" />
<item index="21" class="java.lang.String" itemvalue="sentence_transformers" />
<item index="22" class="java.lang.String" itemvalue="elasticsearch" />
<item index="23" class="java.lang.String" itemvalue="nltk" />
<item index="24" class="java.lang.String" itemvalue="symspellpy" />
<item index="25" class="java.lang.String" itemvalue="wordcloud" />
<item index="26" class="java.lang.String" itemvalue="concurrent_log_handler" />
<item index="27" class="java.lang.String" itemvalue="setuptools" />
<item index="28" class="java.lang.String" itemvalue="gunicorn" />
<item index="29" class="java.lang.String" itemvalue="jieba" />
<item index="30" class="java.lang.String" itemvalue="flask" />
<item index="31" class="java.lang.String" itemvalue="flak_cors" />
<item index="32" class="java.lang.String" itemvalue="paddle" />
<item index="33" class="java.lang.String" itemvalue="bert_serving" />
<item index="34" class="java.lang.String" itemvalue="certifi" />
<item index="35" class="java.lang.String" itemvalue="SQLAlchemy" />
<item index="36" class="java.lang.String" itemvalue="xlrd" />
<item index="37" class="java.lang.String" itemvalue="bert_serving_client" />
<item index="38" class="java.lang.String" itemvalue="pytime" />
<item index="39" class="java.lang.String" itemvalue="goose3" />
<item index="40" class="java.lang.String" itemvalue="Flask_Cors" />
<item index="41" class="java.lang.String" itemvalue="paddlepaddle" />
<item index="42" class="java.lang.String" itemvalue="trustai" />
<item index="43" class="java.lang.String" itemvalue="paddle_serving_client" />
<item index="44" class="java.lang.String" itemvalue="tritonclient" />
<item index="45" class="java.lang.String" itemvalue="paddle_serving_server" />
<item index="46" class="java.lang.String" itemvalue="paddlenlp" />
<item index="47" class="java.lang.String" itemvalue="openai" />
<item index="48" class="java.lang.String" itemvalue="feedparser" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.9.5 (sftp://root@114.116.90.53:22/home/python/anaconda3/envs/JXYQ@py39/bin/python3.9)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/FastText-Model.iml" filepath="$PROJECT_DIR$/.idea/FastText-Model.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteMappingsManager">
<list>
<list>
<remote-mappings server-id="python@sftp://root@114.116.90.53:22/home/python/anaconda3/envs/JXYQ@py39/bin/python3.9">
<settings>
<list>
<mapping local-root="$PROJECT_DIR$" remote-root="/home/python/lzc/新平台模型管理/FastText-Model" />
</list>
</settings>
</remote-mappings>
</list>
</list>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="WebServers">
<option name="servers">
<webServer id="89b44d2f-6e3e-40a6-8aa0-e1bc3fbcfd0f" name="FastText-Model">
<fileTransfer rootFolder="/home/python/lzc/新平台模型管理/FastText-Model" accessType="SFTP" host="114.116.90.53" port="22" sshConfigId="c0166359-81ab-467c-838f-8c7ee48db0f2" sshConfig="root@114.116.90.53:22 password">
<advancedOptions>
<advancedOptions dataProtectionLevel="Private" passiveMode="true" shareSSLContext="true" />
</advancedOptions>
</fileTransfer>
</webServer>
</option>
</component>
</project>
\ No newline at end of file
# -*- coding: utf-8 -*-
# Intelligent extraction request
# 1. Considered: stop using the entity class when requesting intelligent extraction
#    a. Still in use: pass the HTML source file directly in the raw HTTP request body, and pass lang-code and link-text as query parameters
#    b. Reason: testing in Postman is inconvenient otherwise, since a pasted HTML source file cannot be used
# 2. Rejected: dropping the entity class; its advantages outweigh the drawbacks
#    a. An entity class makes it easy to extend parameter fields
#    b. It is easy to render API docs: calling json_parameter_utility.get_json_parameters can display the request entity class
class ExtractionRequest:
    # Language code
    # 1. Required when extracting non-Chinese articles
    lang_code = ""
    # Link text
    # 1. Used for title extraction; without it, title accuracy drops
    link_text = ""
    # Article page source file
    # 1. Used to extract the title, publish time, content, etc.
    article_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        extraction_request = ExtractionRequest()
        # Alternative approach:
        # 1. update the internal __dict__ with the dictionary
        # extraction_request.__dict__.update(dictionary)
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(extraction_request, key, dictionary[key])
        return extraction_request

    def to_dict(self):
        # Convert to a dict:
        # 1. this method is needed when serializing to JSON
        # 2. to get a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. merge the internal __dict__ into a new dict
        data.update(self.__dict__)
        return data


# Extraction result
class ExtractionResult:
    # Title
    title = ""
    # Publish date
    publish_date = ""
    # Body text (keeps all HTML tags, e.g. br, img)
    text = ""
    # URL
    url = ""
    # Abstract
    meta_description = ""
    # Clean body text (without HTML)
    cleaned_text = ""
    # Source (currently only supported when extracting the "source" field of Chinese sites)
    # source = ""
    # Top image (top_image: never yields anything, attribute no longer used)
    # top_image = ""

    def to_dict(self):
        # Convert to a dict:
        # 1. this method is needed when serializing to JSON
        # 2. to get a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. merge the internal __dict__ into a new dict
        data.update(self.__dict__)
        return data


class UrlPickingRequest:
    # Response URL of the list page
    # 1. used as the base URL to join extracted relative URLs
    # 2. the base URL must be the response URL
    # 3. example: in Python, after requests.get(url), use resp.url as the base URL
    list_page_resp_url = ""
    # List page source file
    # 1. used to extract article URLs
    list_page_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        url_picking_request = UrlPickingRequest()
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(url_picking_request, key, dictionary[key])
        return url_picking_request

    def to_dict(self):
        # Convert to a dict:
        # 1. this method is needed when serializing to JSON
        # 2. to get a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. merge the internal __dict__ into a new dict
        data.update(self.__dict__)
        return data
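# Usage sketch (added for illustration; the payload values are hypothetical,
# while the json.dumps(default=...) call follows the comments above):
# import json
# payload = {"lang_code": "en", "link_text": "Example", "article_html": "<html>...</html>"}
# extraction_request = ExtractionRequest.from_dict(payload)
# extraction_result = ExtractionResult()
# extraction_result.title = "Example"
# print(json.dumps(extraction_result, default=ExtractionResult.to_dict, ensure_ascii=False))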
# FastText-Model

#### Introduction
Model repository of the new-platform NLP algorithm group

#### Installation
1. Create a conda environment with the required Python version
2. Install requirement.txt
3. Alternatively, point to a runtime environment created on the host machine in advance

#### Usage
1. xxxx
2. xxxx
3. xxxx

#### Contributing
1. Fork this repository
2. Create a Feat_xxx branch
3. Commit your code
4. Open a Pull Request
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 10:21
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : app_config.py
# @Time    : 2023/4/1 10:31
# @Author  : bruxelles_li
# @Software: PyCharm
import os
import multiprocessing
from pathlib import Path

bind = '0.0.0.0:4005'  # bound IP and port
backlog = 512          # listen queue size
# chdir = '/home/zzsn/liuyan/bin'  # working directory gunicorn switches to
timeout = 300  # timeout -> extended for now to accommodate the ZZSN_NLP platform's Belt-and-Road element extraction (file) requirement
# worker_class = 'gevent'  # use gevent mode; sync mode is also available and is the default
# workers = multiprocessing.cpu_count()  # number of worker processes (12)
workers = 1    # low-resource 13G server: keep this at 1 if the machine is overloaded
threads = 50   # number of threads per worker process
loglevel = 'error'  # log level; this only applies to the error log, the access log level cannot be set
access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"'  # gunicorn access log format; the error log format cannot be set
"""
The meaning of each placeholder:
h  remote address
l  '-'
u  currently '-', may be user name in future releases
t  date of the request
r  status line (e.g. ``GET / HTTP/1.1``)
s  status
b  response length or '-'
f  referer
a  user agent
T  request time in seconds
D  request time in microseconds
L  request time in decimal seconds
p  process ID
"""
_tmp_path = os.path.dirname(os.path.abspath(__file__))
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
accesslog = os.path.join(_tmp_path, 'gunicorn_access.log')  # access log file
errorlog = os.path.join(_tmp_path, 'gunicorn_error.log')    # error log file
# gunicorn -c app_config.py app_run:app -D --daemon
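# Illustrative sketch only: the app_run module referenced by the launch command
# above is not part of this commit, so this minimal stand-in is an assumption
# about its shape, not the actual implementation.
# from flask import Flask
# app = Flask(__name__)
#
# @app.route('/health')
# def health():
#     return {'status': 'ok'}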
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : 资源检测程序.py
# @Time    : 2022/9/30 10:39
# @Author  : bruxelles_li
# @Software: PyCharm
import logging
import os, time, re, subprocess

# Previous CPU counters. These must persist between calls (module level) so
# that get_cpu() can compute a usage rate over the interval since the last
# call; as locals they were always 0, which was a bug.
last_worktime = 0
last_idletime = 0


# Get CPU load information
def get_cpu():
    global last_worktime, last_idletime
    f = open("/proc/stat", "r")
    line = ""
    while "cpu " not in line:
        line = f.readline()
    f.close()
    spl = line.split(" ")
    worktime = int(spl[2]) + int(spl[3]) + int(spl[4])
    idletime = int(spl[5])
    dworktime = (worktime - last_worktime)
    didletime = (idletime - last_idletime)
    rate = float(dworktime) / (didletime + dworktime)
    first_call = (last_worktime == 0)
    last_worktime = worktime
    last_idletime = idletime
    if first_call:
        return 0
    return rate


# Get memory load information
def get_mem_usage_percent():
    # Pre-initialize so a /proc/meminfo missing one of the keys cannot cause
    # an UnboundLocalError below.
    mem_total = mem_free = mem_buffer = mem_cache = 0
    vmem_total = vmem_free = 0
    try:
        f = open('/proc/meminfo', 'r')
        for line in f:
            if line.startswith('MemTotal:'):
                mem_total = int(line.split()[1])
            elif line.startswith('MemFree:'):
                mem_free = int(line.split()[1])
            elif line.startswith('Buffers:'):
                mem_buffer = int(line.split()[1])
            elif line.startswith('Cached:'):
                mem_cache = int(line.split()[1])
            elif line.startswith('SwapTotal:'):
                vmem_total = int(line.split()[1])
            elif line.startswith('SwapFree:'):
                vmem_free = int(line.split()[1])
            else:
                continue
        f.close()
    except OSError:
        return None
    physical_percent = usage_percent(mem_total - (mem_free + mem_buffer + mem_cache), mem_total)
    virtual_percent = 0
    if vmem_total > 0:
        virtual_percent = usage_percent((vmem_total - vmem_free), vmem_total)
    return physical_percent, virtual_percent


def usage_percent(use, total):
    try:
        ret = (float(use) / total) * 100
    except ZeroDivisionError:
        raise Exception("ERROR - zero division error")
    return ret


# Get usage of the root directory
def disk_info():
    statvfs = os.statvfs('/')  # root filesystem; adjust as needed
    total_disk_space = statvfs.f_frsize * statvfs.f_blocks
    free_disk_space = statvfs.f_frsize * statvfs.f_bfree
    disk_usage = (total_disk_space - free_disk_space) * 100.0 / total_disk_space
    disk_usage = int(disk_usage)
    # disk_tip = "Disk usage (max 100%): " + str(disk_usage) + "%"
    # print(str(disk_usage))
    return str(disk_usage)


# Get memory usage
def mem_info():
    mem_usage = get_mem_usage_percent()
    mem_usage = int(mem_usage[0])
    # mem_tip = "Physical memory usage (max 100%): " + str(mem_usage) + "%"
    # print(str(mem_usage))
    return str(mem_usage)


# Get CPU usage
def cpu_info():
    cpu_usage = int(get_cpu() * 100)
    # cpu_tip = "CPU usage (max 100%): " + str(cpu_usage) + "%"
    # print(str(cpu_usage))
    return str(cpu_usage)


# Get system load information
def sys_info():
    load_average = os.getloadavg()
    # load_tip = "System load (high if any of the three values exceeds 3): " + str(load_average)
    # Return the highest of the 1/5/15-minute load averages; the original
    # returned len(load_average), which is always 3 and made the check useless.
    return max(load_average)


# Get the current host time
def time_info():
    now_time = time.strftime('%Y-%m-%d %H:%M:%S')
    return "Current host time: %s" % now_time


# Get the hostname
def hostname_info():
    hostnames = os.popen("hostname").read().strip()
    return "Your hostname is: %s" % hostnames


# Get the IP address
def ip_info():
    ipadd = os.popen("ip a| grep ens192 | grep inet | awk '{print $2}'").read().strip()
    return ipadd


# Get usage of the root volume via df
def disk_info_root():
    child = subprocess.Popen(["df", "-h"], stdout=subprocess.PIPE)
    out = child.stdout.readlines()
    content = ''
    for item in out:
        # df output is bytes; decode before comparing against str
        line = item.decode().strip().split()
        # only the CentOS root volume is checked here
        if '/dev/mapper/centos-root' in line:
            title = [u'-Filesystem-', u'--Size-', u'-Used-', u'-Avail-', u'-Use%-', u'-Mount point--']
            content = "\t".join(title)
            if int(line[4][0:-1]) > 60:
                line[0] = 'centos-root'
                content += '\r\n' + '\t'.join(line)
    return content


# Test program
# if __name__ == "__main__":
#     disk_information = disk_info()
#     disk_usage = [int(s) for s in re.findall(r'\b\d+\b', disk_information)]
#     infomation = [hostname_info(), time_info(), disk_information]
#     print(disk_usage)
#     # Send an alert email if disk usage exceeds 60%
#     if disk_usage[0] > 60:
#         print("Disk usage is above 60%; consider freeing disk space!")
#
#     # print(hostname_info())
#     # print(time_info())
#     # print(ip_info())
#     print(sys_info())
#     print(cpu_info())
#     print(mem_info())
#     print(disk_info())
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : main_server.py
# @Time    : 2023/3/31 10:31
# @Author  : bruxelles_li
# @Software: PyCharm
import logging
import requests
import threading
import sys
import time, os
import json
import pandas as pd
import glob
from pathlib import Path

sys.path.append('../')

# Close redundant connections (disable keep-alive)
s = requests.session()
s.keep_alive = False

from classification.runner.runner_fast_text import FastTextRunner_train
from detector_source import sys_info, cpu_info, mem_info
from classification.data.data_process import pro_data

# Log output format
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a logger and set its level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Handler that writes logs to the console, with its own level
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
# File handlers for the error and info logs, with their own levels
_tmp_path = os.path.dirname(os.path.abspath(__file__))
# print(_tmp_path)
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
fh = logging.FileHandler(os.path.join(_tmp_path, "main_server_error.log"))
fh1 = logging.FileHandler(os.path.join(_tmp_path, "main_server_info.log"))
fh.setLevel(level=logging.ERROR)
fh1.setLevel(level=logging.INFO)
fh.setFormatter(formatter)
fh1.setFormatter(formatter)
# Send logs to both the console and the files
logger.addHandler(ch)
logger.addHandler(fh)
logger.addHandler(fh1)

# Training config file
train_config_path = '../classification/config/fasttext_config_train.yml'
# todo: paths used for data preprocessing
root_path = r'../word2vec/doc_similarity/'
stop_words_path = os.path.join(root_path, 'stop_words.txt')
save_data_path = r'../datasets/classification/{}/{}/{}.txt'
file_types = ['xls', 'xlsx']
# Java callback endpoint
java_call_back_url = "http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus"
# Port number
port = 4005
modelName = "FastText-Model"
# TODO: list that keeps the worker threads
all_thread = []
def merge_df(dataset_path):
    all_files = []
    for file_type in file_types:
        all_files.extend(glob.glob(os.path.join(dataset_path, f'*.{file_type}')))
    # Merge all files into a single DataFrame
    combined_df = pd.concat([pd.read_excel(f) for f in all_files], ignore_index=True)
    # Drop duplicate rows
    combined_df.drop_duplicates(keep='first', inplace=True)
    return combined_df
def train_model4FastText(data_path, model_path, modelProcessId, root_dataset):
    """
    train
    :return:
    """
    combined_df = merge_df(dataset_path=root_dataset)
    # Preprocess the data
    pro_data(dataFolderName=data_path, data_df=combined_df, stop_words_path=stop_words_path,
             save_data_path=save_data_path, modelName=modelName)
    logger.info("==== Data preprocessing succeeded, entering the training stage ===")
    # Start training
    runner_train = FastTextRunner_train(config_path=train_config_path, model_train=True)
    runner_train.train(data_path=data_path, model_path=model_path, auto_tune_duration=300)
    dict_result = runner_train.test(data_path=data_path, model_path=model_path)
    str_dict_result = json.dumps(dict_result, ensure_ascii=False)
    logger.info(str_dict_result)
    # todo: call the Java status-update endpoint with the training result
    payload = json.dumps({
        "id": modelProcessId,
        "result": str_dict_result
    })
    # todo: call the parameter-generation function to produce currentTime and appId
    headers = {
        'Content-Type': 'application/json'
    }
    r1 = requests.post(url=f"{java_call_back_url}",
                       headers=headers, data=payload)
    r1_json = json.loads(r1.text)
    # print(r1_json)
    logger.info(r1_json)
    return str_dict_result
def env_eval(modelProcessId):
    # todo: collect resource info (disk usage, system load, CPU usage, physical memory usage)
    # disk_usage = disk_info()
    sys_usage = sys_info()
    cpu_usage = cpu_info()
    men_usage = mem_info()
    # todo: return False when resources are insufficient
    # cpu_info()/mem_info() return strings, so compare numerically; the
    # original compared strings (e.g. cpu_usage > str(95)), which misorders
    # values such as "100". The load threshold follows the message below.
    if float(sys_usage) > 10 or int(cpu_usage) > 95 or int(men_usage) > 95:
        # todo: call the Java status-update endpoint to report excessive resource usage
        str_dict_result = {
            'handleMsg': 'failure',
            'isHandleSuccess': False,
            'logs': '模型训练失败!当前模型训练资源占用率过高,请检查系统占用信息【超过10个为高】、CPU占用率【超过85%为高】、物理内存占用率【超过85%为高】',
            'resultData': None
        }
        logger.info(str_dict_result)
        payload = json.dumps({
            "id": modelProcessId,
            "result": str_dict_result
        })
        headers = {
            'Content-Type': 'application/json'
        }
        r1 = requests.post(
            url=f"{java_call_back_url}",
            headers=headers, data=payload)
        r1_json = json.loads(r1.text)
        # print(r1_json)
        logger.info(r1_json)
        return False
    # todo: return True when resources are sufficient
    return True
def system_start():
    while True:
        # print("===== training service is running =====")
        headers = {
            'Content-Type': 'application/json'
        }
        r1 = requests.post(url=f'http://localhost:{int(port)}/queue_size', headers=headers)
        r1_json = json.loads(r1.text)
        # print(r1_json)
        queue_left_number = r1_json['queue_left_number']
        logger.info("Current number of queued tasks: " + str(queue_left_number))
        if queue_left_number == 0:
            # logger.warning("Queue is empty! Nothing to process.")
            time.sleep(30)
        else:
            for i in range(queue_left_number):
                r2 = requests.post(url=f'http://localhost:{int(port)}/subject_consumer', headers=headers)
                r2_json = json.loads(r2.text)
                config_info = r2_json['data']
                logger.info(config_info)
                modelProcessId = config_info["modelProcessId"]
                model_path = config_info["model_path"]
                data_path = config_info["data_path"]
                root_dataset = config_info["root_dataset"]
                logger.info('##########FastText-Model###############')
                t = threading.Thread(target=train_model4FastText,
                                     args=(data_path, model_path, modelProcessId, root_dataset),
                                     daemon=True)
                while True:
                    if env_eval(modelProcessId):
                        break
                    else:
                        time.sleep(600)
                # Start the worker thread
                t.start()
                all_thread.append(t)
def system_resume():
    """
    Restore the state of the model-training service
    :return:
    """
    headers = {
        'Content-Type': 'application/json'
    }
    # Drain the service queue to avoid starting the same training job twice
    r1 = requests.post(url=f'http://localhost:{int(port)}/queue_size', headers=headers)
    r1_json = r1.json()
    logger.info('Current queue size: %d' % r1_json['queue_left_number'])
    if r1_json['queue_left_number'] > 0:
        logger.info('Consuming the queue until it is empty!')
        while True:
            r2 = requests.post(url=f'http://localhost:{int(port)}/subject_consumer', headers=headers)
            r2_json = r2.json()
            if r2_json['queue_left_number'] == 0:
                logger.info('Queue drained! Safe to start model training ...')
                break
    else:
        logger.info('Queue is empty! Safe to start model training ...')
def start_up_check():
    """
    Pre-start check
    :return:
    """
    while True:
        try:
            headers = {
                'Content-Type': 'application/json'
            }
            r0 = requests.post(url=f'http://localhost:{int(port)}/queue_size', headers=headers)
            server_started = True
        except requests.exceptions.ConnectionError as e:
            server_started = False
            logger.error("Error: ConnectionError")
            logger.warning('Server not started; start the server first! Exiting.')
            exit(123)
            # logger.info('Trying to restart the server ...')
            # time.sleep(3)
        if server_started:
            logger.info("Server is up! The model-training service has started...")
            break
if __name__ == '__main__':
    # root_path = "../datasets/classification/zcjd_column_classify/zcjd_V0"
    # data_df = merge_df(root_path)
    # print(len(data_df))
    # print(data_df)
    # Start the model-training service
    start_up_check()
    logger.info('Restoring the model-training service ...')
    system_resume()
    time.sleep(30)
    logger.info('Model-training service restored!')
    logger.info('Model-training service running ...')
    system_start()
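# Illustrative sketch only: the /queue_size and /subject_consumer endpoints
# polled above belong to the Flask server process, which is not part of this
# file; the queue-backed shape below is an assumption, not the real server.
# import queue
# from flask import Flask, jsonify
#
# app = Flask(__name__)
# task_queue = queue.Queue()
#
# @app.route('/queue_size', methods=['POST'])
# def queue_size():
#     return jsonify({'queue_left_number': task_queue.qsize()})
#
# @app.route('/subject_consumer', methods=['POST'])
# def subject_consumer():
#     data = None if task_queue.empty() else task_queue.get()
#     return jsonify({'data': data, 'queue_left_number': task_queue.qsize()})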
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 10:21
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/21 9:30
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_app
# @Author : LiuYan
# @Time : 2021/4/21 9:30
import json
from flask import Flask, Blueprint, request
from utils.log import logger
app = Flask(__name__)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:03
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_config
# @Author : LiuYan
# @Time : 2021/4/16 18:06
import os
import pymysql
from abc import abstractmethod, ABC
# root_dir = '/data/lzc/zzsn_nlp_br'
# root_dir = '/data/lzc'
root_dir = '..' # deploy
db_config = {
    # os.environ.get already returns None when the variable is absent, so the
    # original "... if key in os.environ else None" guards were redundant
    'host': os.environ.get('brpa_tidb_host'),
    'port': int(os.environ['brpa_tidb_port']) if 'brpa_tidb_port' in os.environ else None,
    'user': os.environ.get('brpa_tidb_user'),
    'password': os.environ.get('brpa_tidb_password'),
    'database': os.environ.get('brpa_tidb_database'),
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor
}
class BaseConfig(ABC):
    @abstractmethod
    def __init__(self):
        super(BaseConfig, self).__init__()

    @abstractmethod
    def load_config(self):
        """
        Add the config you need.
        :return: config(YamlDict)
        """
        pass
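# Usage sketch (added for illustration; assumes the brpa_tidb_* environment
# variables are set, and uses only the db_config dict defined above):
# with pymysql.connect(**db_config) as conn:
#     with conn.cursor() as cursor:
#         cursor.execute('SELECT 1 AS ok')
#         print(cursor.fetchone())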
home:
  dir: '/data/lzc'

# Please set the GPU or CPU to be used for your model training in the LoadConfig object
device: "cuda:0"

# shared for multiple projects in this machine, raw data, read only
data:
  # base: '/data'
  base: 'd:/data'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:03
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_data_loader
# @Author : LiuYan
# @Time : 2021/4/19 9:37
from abc import ABC, abstractmethod
class BaseDataLoader(ABC):
    @abstractmethod
    def __init__(self):
        super(BaseDataLoader, self).__init__()

    @abstractmethod
    def _load_data(self):
        """
        load raw data according to data config
        :return:
        """
        pass

    @abstractmethod
    def load_train(self):
        pass

    @abstractmethod
    def load_valid(self):
        pass

    @abstractmethod
    def load_test(self):
        pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_data_process
# @Author : LiuYan
# @Time : 2021/4/19 9:37
from abc import ABC, abstractmethod
class BaseDataProcess(ABC):
    """
    data processing
    """
    @abstractmethod
    def __init__(self):
        super(BaseDataProcess, self).__init__()

    @abstractmethod
    def process(self):
        pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_data_reader
# @Author : LiuYan
# @Time : 2021/4/19 9:37
from abc import ABC, abstractmethod
class BaseDataReader(ABC):
    @abstractmethod
    def __init__(self):
        super(BaseDataReader, self).__init__()

    @abstractmethod
    def reade(self):
        pass

    @abstractmethod
    def save(self):
        pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_evaluator
# @Author : LiuYan
# @Time : 2021/4/19 10:39
from abc import ABC, abstractmethod
class BaseEvaluator(ABC):
    @abstractmethod
    def __init__(self):
        super(BaseEvaluator, self).__init__()

    @abstractmethod
    def evaluate(self, dict_inputs: dict) -> tuple:
        pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_loss
# @Author : LiuYan
# @Time : 2021/4/19 10:41
from abc import abstractmethod
import torch.nn as nn
class BaseLoss(nn.Module):
    def __init__(self, loss_config):
        super(BaseLoss, self).__init__()
        self._config = loss_config

    @abstractmethod
    def forward(self, dict_outputs: dict) -> dict:
        pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_model
# @Author : LiuYan
# @Time : 2021/4/19 10:42
from abc import ABC, abstractmethod
import torch.nn as nn
class BaseModel(nn.Module, ABC):
    def __init__(self):
        super(BaseModel, self).__init__()

    @abstractmethod
    def forward(self, dict_inputs: dict) -> dict:
        pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 18:04
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : base_runner
# @Author : LiuYan
# @Time : 2021/4/19 10:42
from abc import ABC, abstractmethod
from utils.utils import timeit
class BaseRunner(ABC):
    """
    Abstract definition for runner
    """
    @abstractmethod
    def __init__(self):
        pass

    @timeit
    @abstractmethod
    def _build_config(self):
        pass

    @timeit
    @abstractmethod
    def _build_data(self):
        pass

    @timeit
    @abstractmethod
    def _build_model(self):
        pass

    @timeit
    @abstractmethod
    def _build_loss(self):
        pass

    @timeit
    @abstractmethod
    def _build_optimizer(self):
        pass

    @timeit
    @abstractmethod
    def _build_evaluator(self):
        pass

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def _train_epoch(self, epoch: int):
        pass

    @abstractmethod
    def _valid(self, epoch: int):
        pass

    @abstractmethod
    def test(self):
        pass

    @abstractmethod
    def pred(self, title: str, content: str) -> str or dict:
        pass

    @abstractmethod
    def _display_result(self, dict_result: dict):
        pass

    @abstractmethod
    def _save_model(self):
        pass

    @abstractmethod
    def _load_model(self):
        pass


class train_BaseRunner(ABC):
    """
    Abstract definition for runner
    """
    @abstractmethod
    def __init__(self):
        pass

    @timeit
    @abstractmethod
    def _build_config(self):
        pass

    @timeit
    @abstractmethod
    def _build_data(self):
        pass

    @timeit
    @abstractmethod
    def _build_model(self):
        pass

    @timeit
    @abstractmethod
    def _build_loss(self):
        pass

    @timeit
    @abstractmethod
    def _build_optimizer(self):
        pass

    @timeit
    @abstractmethod
    def _build_evaluator(self):
        pass

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def _train_epoch(self, epoch: int):
        pass

    @abstractmethod
    def _valid(self, data_path, model_path, epoch: int):
        pass

    @abstractmethod
    def test(self):
        pass

    @abstractmethod
    def pred(self, title: str, content: str) -> str or dict:
        pass

    @abstractmethod
    def _display_result(self, dict_result: dict):
        pass

    @abstractmethod
    def _save_model(self, model_path):
        pass

    @abstractmethod
    def _load_model(self):
        pass
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 17:24
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/21 9:59
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:31
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : fast_text_config
# @Author : LiuYan
# @Time : 2021/4/19 10:46
import dynamic_yaml
import torch
from base.config.base_config import BaseConfig
class FastTextConfig(BaseConfig):
    def __init__(self, config_path):
        super(FastTextConfig, self).__init__()
        self._config_path = config_path
        pass

    def load_config(self):
        with open(self._config_path, mode='r', encoding='UTF-8') as f:
            config = dynamic_yaml.load(f)
        config.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')
        return config
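# Usage sketch (added for illustration; the path is the training config used
# by main_server in this commit, and dynamic_yaml resolves '{home.dir}/...'
# style references like those in the YAML files below on access):
# config = FastTextConfig('../classification/config/fasttext_config_train.yml').load_config()
# print(config.device, config.learn.dir.saved)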
home:
  # dir: '/home/zzsn/liuyan' # train or test
  dir: '../../..' # deploy

# Shared for multiple modules in the project
project:
  name: 'platform_project'
  dir:
    work: '{home.dir}/{project.name}'

# Please set the GPU or CPU to be used for your model training in the LoadConfig object
device: 'cpu'
status: 'pred' # pred / test / train

# shared for multiple projects in this machine, raw data, read only
data:
  dir: ''
  name: 'FastText-Model'
  num_vocab: ~
  num_tag: ~

model:
  name: 'Origin-Model'

loss:
  name: 'ft_loss'

learn:
  time: '2023_03_31-12_15_17'
  dir:
    work: '{home.dir}/model_saved/classification/{data.name}'
    logs: '{learn.dir.work}/log'
    saved: '{learn.dir.work}/{model.name}'
    result: '{learn.dir.work}/data/result'
    # save_model: '{learn.dir.saved}-{learn.time}/model.bin'
    load_model: '{learn.dir.saved}-{learn.time}/model.bin'
home:
  # dir: '/data/lzc' # train or test
  dir: '../../..' # deploy

# Shared for multiple modules in the project
project:
  name: 'platform_project'
  dir:
    work: '{home.dir}/{project.name}'

# Please set the GPU or CPU to be used for your model training in the LoadConfig object
# device: 'cpu'
device: 'cuda:0'
status: 'train' # pred / test / train

# shared for multiple projects in this machine, raw data, read only
data:
  dir: '../datasets/classification'
  name: 'FastText-Model'
  path0: '{data.dir}/{data.name}%s'
  train_path: '{data.dir}/{data.name}%s/train.txt'
  valid_path: '{data.dir}/{data.name}%s/valid.txt'
  test_path: '{data.dir}/{data.name}%s/valid.txt'
  batch_size: 4
  num_vocab: ~
  num_tag: ~

model:
  name: 'Origin-Model'

loss:
  name: 'ft_loss'

learn:
  time: '2023_03_31-12_15_17'
  dir:
    work: '{home.dir}/model_saved/classification'
    logs: '{learn.dir.work}/log'
    saved0: '{learn.dir.work}%s'
    saved: '{learn.dir.work}/{data.name}%s'
    result: '{learn.dir.work}/data/result'
    # save_model: '{learn.dir.saved}-{learn.time}/model.bin'
    load_model: '{learn.dir.saved}-{learn.time}/model.bin'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:31
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : data_process
# @Author  : bruxellse_li
# @Time    : 2023/3/31 08:39
import os
import pandas as pd
import sys
from pathlib import Path
from pandas import DataFrame
from sklearn.model_selection import train_test_split

# Append the working path
sys.path.append('../../')
from classification.utils.utils import *
def process_txt(data_loader: DataFrame, train_file_path: str, valid_file_path: str, stop_words_path: str):
    articles = data_loader['article']
    labels = data_loader['label']
    article_list = []
    for article, label in zip(articles, labels):
        if type(article) is str:
            text = article.replace('\n', '').replace('\r', '').replace('\t', '')
        else:
            print('{} is not str!'.format(article))
            continue
        text = seg(text=text, sw=stop_words(path=stop_words_path))
        text = '__label__{} {}'.format(label, text)
        article_list.append(text)
    train_data, valid_data = train_test_split(
        article_list, train_size=0.8, random_state=2021, shuffle=True
    )
    with open(
            train_file_path, 'w', encoding='utf-8'
    ) as train_file, open(
        valid_file_path, 'w', encoding='utf-8'
    ) as valid_file:
        for train in train_data:
            train_file.write(train + '\n')
        for valid in valid_data:
            valid_file.write(valid + '\n')
    pass
def process(data_loader, train_file_path: str, valid_file_path: str, stop_words_path: str):
    # Create the corpus directory
    # Path(os.path.abspath(os.path.join(train_file_path, os.path.pardir))).mkdir(parents=True, exist_ok=True)
    # data_loader = pd.read_excel(path, keep_default_na=False).astype(str)
    data_loader['article'] = data_loader['title'] + '。' + data_loader['content']
    data_loader['article'] = data_loader.article.apply(clean_tag).apply(clean_txt)
    process_txt(
        data_loader=data_loader,
        train_file_path=train_file_path,
        valid_file_path=valid_file_path,
        stop_words_path=stop_words_path
    )
    return None
# Corpus-processing entry point
def pro_data(modelName, dataFolderName, data_df, stop_words_path, save_data_path):
    # save_data_path = '/home/python/lzc/datasets/classification/{}/{}/{}.txt'
    process(
        data_loader=data_df,
        train_file_path=save_data_path.format(modelName, dataFolderName, 'train'),
        valid_file_path=save_data_path.format(modelName, dataFolderName, 'valid'),
        stop_words_path=stop_words_path
    )
    return None
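# The files written above are in fastText supervised format, one
# "__label__<label> <segmented text>" example per line. A training sketch
# with the official fasttext package (assumed to be the consumer of these
# files; the input path is illustrative):
# import fasttext
# model = fasttext.train_supervised(input='../../datasets/classification/gzdt_dataset/gzdt_V1/train.txt')
# print(model.predict('预处理 后 的 文本'))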
if __name__ == '__main__':
    modelName, dataFolderName, data_path = "gzdt_dataset", "gzdt_V1", "../../datasets/Receive_File/测试数据.xlsx"
    save_data_path = r'../../datasets/classification/{}/{}/{}.txt'
    root_path = r'../../word2vec/doc_similarity/'
    stop_words_path = os.path.join(root_path, 'stop_words.txt')
    # pro_data expects a DataFrame, so read the Excel file first (the original
    # passed the path itself, which would fail inside process())
    data_df = pd.read_excel(data_path, keep_default_na=False).astype(str)
    pro_data(modelName, dataFolderName, data_df, stop_words_path, save_data_path)

    # date = '20230329'
    # path = '../datasets/{}_total_{}.xlsx'
    #
    # save_data_path = '/home/zzsn/liuyan/datasets/the_belt_and_road/classification/{}/{}_{}.txt'
    # # Machinery public opinion: current-affairs column classification
    # ssyw_name = 'ssyw_column_classify'
    # # Machinery public opinion: state-owned assets news column classification
    # gzdt_name = 'gzdt_column_classify'
    # # Machinery public opinion: upstream/downstream column classification
    # sxy_name = 'sxy_column_classify'
    # # Machinery public opinion: industry sentiment column classification
    # hyyq_name = 'hyyq_column_classify'
    # # Machinery public opinion: management news column classification
    # gldt_name = 'gldt_column_classify'
    # # Machinery public opinion: leading enterprises column classification
    # ltqy_name = 'ltqy_column_classify'
    # # Machinery public opinion: emerging fields column classification
    # xxly_name = 'xxly_column_classify'
    # # Machinery public opinion: comprehensive news column classification
    # zhzx_name = 'zhzx_column_classify'
    # # Machinery public opinion: negative sentiment column classification
    # fmyq_name = 'fmyq_column_classify'
    #
    # process(
    #     path=path.format(gzdt_name, date),
    #     train_file_path=save_data_path.format(gzdt_name, 'train', date),
    #     valid_file_path=save_data_path.format(gzdt_name, 'valid', date)
    # )
    # pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : data_stats
# @Author : LiuYan
# @Time : 2021/4/15 16:52
import pandas as pd
from collections import Counter
if __name__ == '__main__':
    pass
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:33
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : eval_classification
# @Author : LiuYan
# @Time : 2021/4/20 21:19
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score
from base.evaluation.base_evaluator import BaseEvaluator
class ClassifyEvaluator(BaseEvaluator):
    # def __init__(self, label_dict: dict):
    def __init__(self):
        super(ClassifyEvaluator, self).__init__()
        # self._label_dict = label_dict
        # self._count_dict = {'TP': 0}
        pass

    def evaluate(self, true_list: list, pred_list: list) -> tuple:
        dict_result = {}
        true_labels = Counter(true_list)
        pred_labels = Counter(pred_list)
        print(true_labels)
        print(pred_labels)
        for true_label in true_labels:
            # print(true_labels[true_label], pred_labels[true_label])
            dict_result[true_label] = {
                'precision': 0,
                'recall': 0,
                'f1-score': 0,
                'true_num': 0,
                'pred_num': pred_labels[true_label],
                'total_num': true_labels[true_label]
            }
        for true, pred in zip(true_list, pred_list):
            if true == pred:
                dict_result[true]['true_num'] += 1
        print('\n' + ''.join('-' for i in range(89)))
        print('label_type\t\t\tp\t\t\tr\t\t\tf1\t\t\ttrue_num\t\t\tpred_num\ttotal_num')
        string = '{0}{1:<12.4f}{2:<12.4f}{3:<12.4f}{4:<12}{5:<12}{6:<12}'
        true_nums, pred_nums, total_nums = 0, 0, 0
        for label_type in dict_result:
            true_nums += dict_result[label_type]['true_num']
            pred_nums += dict_result[label_type]['pred_num']
            total_nums += dict_result[label_type]['total_num']
            p = dict_result[label_type]['true_num'] / dict_result[label_type]['pred_num'] if dict_result[label_type]['pred_num'] != 0 else 0
            r = dict_result[label_type]['true_num'] / dict_result[label_type]['total_num'] if dict_result[label_type]['total_num'] != 0 else 0
            f1 = 2 * p * r / (p + r) if p + r != 0 else 0
            chunk_type_out = label_type + ''.join(
                ' ' for i in range(20 - (((len(label_type.encode('utf-8')) - len(label_type)) // 2) + len(label_type)))
            )
            print(string.format(chunk_type_out, p, r, f1, dict_result[label_type]['true_num'],
                                dict_result[label_type]['pred_num'], dict_result[label_type]['total_num']), chr(12288))
            dict_result[label_type]['precision'] = p
            dict_result[label_type]['recall'] = r
            dict_result[label_type]['f1-score'] = f1
        p = true_nums / pred_nums if pred_nums != 0 else 0
        r = true_nums / total_nums if total_nums != 0 else 0
        f1 = 2 * p * r / (p + r) if p + r != 0 else 0
        print(string.format('average{}'.format(''.join(' ' for i in range(13))), p, r, f1,
                            true_nums, pred_nums, total_nums), chr(12288))
        print(''.join('-' for i in range(89)) + '\n')
        dict_result['average'] = {
            'precision': p,
            'recall': r,
            'f1-score': f1,
            'true_num': true_nums,
            'pred_num': pred_nums,
            'total_num': total_nums
        }
        return p, r, f1, dict_result
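# Usage sketch with toy labels (added for illustration; the labels are made up):
# evaluator = ClassifyEvaluator()
# p, r, f1, dict_result = evaluator.evaluate(
#     true_list=['体育', '财经', '体育'],
#     pred_list=['体育', '财经', '财经']
# )
# print(p, r, f1)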
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/8/2 15:47
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/15 10:31
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : __init__.py.py
# @Time : 2022/1/5 18:09
# @Author : Mr.Ygg
# @Software: PyCharm
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : test_br_pro_risk_recognition.py
# @Time : 2022/1/5 18:09
# @Author : Mr.Ygg
# @Software: PyCharm
from base.app.base_app import *
from classification.runner.runner_fast_text import FastTextRunner
from classification.utils.utils import load_risk_keywords, is_include_compound_words

# Risk categories
risk_info = [
    '外部政治风险',
    '主权政治风险',
    '社会动荡风险',
    '对华关系风险',
    '资金风险',
    '财政风险',
    '汇率风险',  # a comma was missing here in the original, silently concatenating this item with the next
    '通货膨胀风险',
    '环保风险',
    '法律风险',
    '突发事件风险',
    '项目实施风险',
    '企业风险',
    '其他风险'
]
ft_config_path = '../config/config_br_pro_risk_recognition.yml'
runner = FastTextRunner(config_path=ft_config_path)
# Job-posting / stock filtering model
ft_config_path_rc_f_zp_gp = '../config/config_rc_f_zp_gp.yml'
runner_rc_f_zp_gp = FastTextRunner(config_path=ft_config_path_rc_f_zp_gp)
# Positive/negative sentiment analysis model for project news
ft_config_path_psa = '../config/config_br_pro_sentiment_analysis.yml'
runner_psa = FastTextRunner(config_path=ft_config_path_psa)
list_country = []
with open('../config/country.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        list_country.append(line.strip().split('(')[0].split('(')[0])
# Risk types the model can recognize
risk_model_info = [
    '社会动荡风险',
    '突发事件风险'
]
# Risk-category keywords
dict_risk_keywords = load_risk_keywords('../config/risk_keywords.xlsx')
def pred(title: str, content: str) -> dict:
    dict_result = {
        '风险类别1': '',
        '风险类别2': '',
        '风险类别3': '',
        '风险类别4': ''
    }
    # Job-posting / stock filtering model
    result_rc_f_zp_gp = runner_rc_f_zp_gp.pred(title=title, content=content)
    # 0: neither job posting nor stock news  1: job posting  2: stock news
    bool_rc_f_zp_gp = False if result_rc_f_zp_gp == '1' else True
    logger.info('job/stock filter: {}'.format(result_rc_f_zp_gp))
    logger.info('job/stock filter: {}'.format(bool_rc_f_zp_gp))
    # Positive/negative sentiment filter
    result_psa = runner_psa.pred(title=title, content=content)
    bool_psa = True if result_psa == '项目负面资讯信息' else False
    logger.info('sentiment filter: {}'.format(result_psa))
    logger.info('sentiment filter: {}'.format(bool_psa))
    # Country-recognition filter
    bool_country = False
    text = title + '。' + content[: len(content) // 5]
    for country in list_country:
        if country in text:
            bool_country = True
            logger.info('country filter: {}'.format(country))
            break
    logger.info('country filter: {}'.format(bool_country))
    text = title + '。' + content
    if bool_country and bool_psa:
        """
        1. job/stock filter -> not a job posting or stock article
        2. country filter -> a Belt-and-Road-related country
        3. sentiment filter -> negative news
        """
        # Risk-recognition model
        result = runner.pred(
            title=title,
            content=content
        )
        dict_result['风险类别1'] = result
        dict_result['风险类别2'] = result
        dict_result['风险类别3'] = result
        dict_result['风险类别4'] = result
        logger.info('risk-recognition model: {}'.format(result))
        # Keyword-based filtering
        if type(result) is str and result in risk_model_info:
            # Categories in risk_model_info need keyword filtering to drop noisy hits
            bool_risk_keyword = False
            for risk_keyword in dict_risk_keywords[result]:
                compound_words = risk_keyword.split('+')
                if is_include_compound_words(text=text, compound_words=compound_words):
                    bool_risk_keyword = True
                    break
            result = result if bool_risk_keyword else '无风险'
            dict_result['风险类别3'] = result
            dict_result['风险类别4'] = result
            logger.info('keyword filter: {}'.format(bool_risk_keyword))
            if result == '无风险':
                dict_risk_keywords_num = {
                    risk_keywords_key: 0 for risk_keywords_key in dict_risk_keywords
                }
                bool_risk_keyword, risk_category = False, result
                for risk_keywords_key in dict_risk_keywords_num:
                    for risk_keyword in dict_risk_keywords[risk_keywords_key]:
                        compound_words = risk_keyword.split('+')
                        if is_include_compound_words(text=text, compound_words=compound_words):
                            bool_risk_keyword = True
                            dict_risk_keywords_num[risk_keywords_key] += 1
                if bool_risk_keyword:
                    risk_category = max(dict_risk_keywords_num, key=dict_risk_keywords_num.get)
                dict_result['风险类别3'] = risk_category
                logger.info('risk recalled by keywords after filtering: {}'.format(risk_category))
        elif type(result) is str and result == '无风险':
            # For articles the model marks as risk-free, recall useful risk info via keywords
            dict_risk_keywords_num = {
                risk_keywords_key: 0 for risk_keywords_key in dict_risk_keywords
            }
            # Do not recall the categories the model itself can recognize? √
            for risk_keywords_key in risk_model_info:
                dict_risk_keywords_num.pop(risk_keywords_key) if risk_keywords_key in dict_risk_keywords_num else None
            bool_risk_keyword, risk_category = False, result
            for risk_keywords_key in dict_risk_keywords_num:
                for risk_keyword in dict_risk_keywords[risk_keywords_key]:
                    compound_words = risk_keyword.split('+')
                    if is_include_compound_words(text=text, compound_words=compound_words):
                        bool_risk_keyword = True
                        dict_risk_keywords_num[risk_keywords_key] += 1
            if bool_risk_keyword:
                risk_category = max(dict_risk_keywords_num, key=dict_risk_keywords_num.get)
            dict_result['风险类别2'] = risk_category
            dict_result['风险类别3'] = risk_category
            dict_result['风险类别4'] = risk_category
            logger.info('risk recalled by keywords: {}'.format(risk_category))
        else:
            result = result if type(result) is str else 'error'
            dict_result['风险类别3'] = result
            dict_result['风险类别4'] = result
            logger.info('ELSE risk info: {}'.format(result))
    else:
        dict_result['风险类别1'] = '无风险'
        dict_result['风险类别2'] = '无风险'
        dict_result['风险类别3'] = '无风险'
        dict_result['风险类别4'] = '无风险'
        logger.info('job-stock/country filter: no risk')
    return dict_result
if __name__ == '__main__':
    import os
    import pandas

    root_dir = '../data/datasource/test'
    # file_name = 'br总资讯'
    file_name = '境外快讯_1.4'
    df = pandas.read_excel(os.path.join(root_dir, 'input_file/{}.xlsx'.format(file_name)))
    df.drop_duplicates(subset='标题', keep='first', inplace=True)
    list_title = df['标题']
    list_content = df['正文']
    dict_risk_result = {
        '风险类别1': [],
        '风险类别2': [],
        '风险类别3': [],
        '风险类别4': []
    }
    list_risk, list_risk_old = [], []
    for index, (title, content) in enumerate(zip(list_title, list_content)):
        dict_result = pred(title=title, content=content)
        for key in dict_risk_result:
            dict_risk_result[key].append(dict_result[key] if key in dict_result else 'error')
        result_old = runner.pred(title=title, content=content)
        list_risk_old.append(result_old)
        logger.info('{} / {}\n'.format(index + 1, len(list_title)))
    df['风险类别_old'] = list_risk_old
    for key in dict_risk_result:
        df[key] = dict_risk_result[key]
    df.to_excel(os.path.join(root_dir, 'output_file/{}_result_20220112_s.xlsx'.format(file_name)))
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : test_label.py
# @Time : 2022/1/7 18:28
# @Author : Mr.Ygg
# @Software: PyCharm
import os
import pandas as pd
from classification.utils.utils import load_risk_keywords, is_include_compound_words

root_dir = '../data/datasource/test'
# file_name = '项目风险模型数据集_总'
file_name = '去重_F_ZP_GP'
df = pd.read_excel(os.path.join(root_dir, 'input_file/{}.xlsx'.format(file_name)))
list_title = df['标题']
list_content = df['正文']
list_country = []
with open('../config/country.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        list_country.append(line.strip().split('(')[0].split('(')[0])
# Risk-category keywords
dict_risk_keywords = load_risk_keywords('../config/risk_keywords.xlsx')
list_bool_yiqing = []
list_bool_country = []
list_risk_key_words_category = []
for title, content in zip(list_title, list_content):
    if type(title) is float:
        title = ''
    if type(content) is float:
        content = ''
    # Country-recognition filter
    bool_country = False
    text = title + '。' + content[: len(content) // 5]
    for country in list_country:
        if country in text:
            bool_country = True
            list_bool_country.append('是')
            break
    if not bool_country:
        list_bool_country.append('否')
    text = title + '。' + content
    # Keyword: 疫情 (epidemic)
    if '疫情' in text:
        list_bool_yiqing.append('是')
    else:
        list_bool_yiqing.append('否')
    # Risk keywords
    dict_risk_keywords_num = {
        risk_keywords_key: 0 for risk_keywords_key in dict_risk_keywords
    }
    bool_risk_keyword = False
    risk_category = '无风险'
    for risk_keywords_key in dict_risk_keywords_num:
        for risk_keyword in dict_risk_keywords[risk_keywords_key]:
            compound_words = risk_keyword.split('+')
            if is_include_compound_words(text=text, compound_words=compound_words):
                bool_risk_keyword = True
                dict_risk_keywords_num[risk_keywords_key] += 1
    if bool_risk_keyword:
        risk_category = max(dict_risk_keywords_num, key=dict_risk_keywords_num.get)
    list_risk_key_words_category.append(risk_category)
df['是否含"疫情"关键词'] = list_bool_yiqing
df['是否含一带一路相关国家'] = list_bool_country
df['关键词分类'] = list_risk_key_words_category
df.to_excel(os.path.join(root_dir, 'output_file/{}_result.xlsx'.format(file_name)))
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : test_label_merge.py
# @Time : 2022/1/10 10:32
# @Author : Mr.Ygg
# @Software: PyCharm
import os
import pandas as pd

root_dir = '../data/datasource/test'
df = pd.read_excel(os.path.join(root_dir, 'input_file/br风险模型数据集_总_20220110.xlsx'))
list_label_1 = df['风险类别'].to_list()
list_label_2 = df['修正风险类别'].to_list()
list_label_3 = df['雪珂终审'].to_list()
list_label = []
for label_1, label_2, label_3 in zip(
        list_label_1, list_label_2, list_label_3
):
    # later review columns take precedence over earlier ones
    label = ''
    if type(label_1) is str:
        label = label_1
    if type(label_2) is str:
        label = label_2
    if type(label_3) is str:
        label = label_3
    list_label.append(label)
df['label'] = list_label
df.to_excel(os.path.join(root_dir, 'output_file/br风险模型数据集_总_20220110.xlsx'))
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : test_merge.py
# @Time : 2022/1/7 17:23
# @Author : Mr.Ygg
# @Software: PyCharm
import os
import pandas as pd

dict_df = {
    '标题': [],
    '正文': [],
    '状态': [],
    '类型': []
}
root_dir = '../data/datasource/test/input_file'
list_file = os.listdir(root_dir)
for file_name in list_file:
    file_path = os.path.join(root_dir, file_name)
    print(file_path)
    df = pd.read_excel(file_path)
    list_title = df['标题'].to_list()
    list_content = df['正文'].to_list()
    list_status = df['审核状态'].to_list()
    list_type = df['资讯类型'].to_list()
    dict_df['标题'].extend(list_title)
    dict_df['正文'].extend(list_content)
    dict_df['状态'].extend(list_status)
    dict_df['类型'].extend(list_type)
df = pd.DataFrame(dict_df)
df.to_excel(os.path.join(root_dir, 'br总资讯.xlsx'))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/4/16 16:40
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : utils
# @Author : LiuYan
# @Time : 2021/4/16 16:40
import re
import jieba
import pandas
from bs4 import BeautifulSoup


def clean_tag(text):
    """
    Strip HTML tags
    :param text:
    :return:
    """
    bs = BeautifulSoup(str(text), 'html.parser')
    return bs.text


def clean_txt(raw):
    """
    Remove emoji characters
    :param raw:
    :return:
    """
    res = re.compile(u'[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]')
    return res.sub('', raw)


def seg(text, sw):
    """
    Tokenize; NLPTokenizer would segment based on full named-entity recognition and POS tagging
    :param text:
    :param sw:
    :return:
    """
    # text = ' '.join([i.word for i in NLPTokenizer.segment(text) if i.word.strip() and i.word not in sw])
    text = ' '.join([i.strip() for i in jieba.cut(text) if i.strip() and i not in sw])
    return text


def stop_words(path: str) -> list:
    """
    Load the stop-word list
    :return:
    """
    with open(path, 'r', encoding='utf-8') as swf:
        return [line.strip() for line in swf]
def segment_para(text):
"""
:param text:
:return:
"""
split_pattern = re.compile(r'\n|。|?|!|\?|\!|\s')
global_sentences = split_pattern.split(text)
global_sentences = ''.join([str(i).strip() + '。' for i in global_sentences if len(i) >= 13])
return global_sentences
def cut_sent(para):
    """
    Split a paragraph into sentences on Chinese/English terminators, keeping
    closing quotes attached to the sentence they end.
    :param para: paragraph text
    :return: list of sentences
    """
    para = re.sub('([。!?\?])([^”’])', r"\1\n\2", para)  # single-character terminators
    para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
    para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
    para = re.sub('([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
    # A closing quote only ends a sentence when preceded by a terminator, so
    # the split newline goes after the quote; the rules above preserve quotes.
    para = para.rstrip()  # drop trailing newlines at the end of the paragraph
    return para.split("\n")
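# Usage sketch (illustrative sample text): quoted speech splits after the
# closing quote, per the rules above.
# >>> cut_sent('今天天气不错。我们出去走走吧!“好啊?”他说。')
# ['今天天气不错。', '我们出去走走吧!', '“好啊?”', '他说。']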
def transform_data(text, label):
    """
    Format one fastText training line: '__label__<label> <text>'.
    :param text: tokenized text
    :param label: class label
    :return: fastText-formatted line
    """
    fasttext_line = '__label__{} {}'.format(label, text)
    return fasttext_line
def load_risk_keywords(path: str) -> dict:
    """
    Load risk-category keywords from an Excel file (one column per category).
    :param path: xlsx path
    :return: {category: [keyword, ...]}
    """
df = pandas.read_excel(path)
dict_risk_keywords = dict()
for key in df:
list_risk_keywords = []
list_df = df[key].to_list()
for keyword in list_df:
if type(keyword) is str:
list_risk_keywords.append(keyword.strip())
dict_risk_keywords[key] = list_risk_keywords
return dict_risk_keywords
def is_include_compound_words(text: str, compound_words: list) -> bool:
    """
    Check whether the text contains every part of a compound word, in order:
    each part must occur after the end of the previous match.
    :param text: text to search
    :param compound_words: ordered list of word parts
    :return: True if all parts match in order, else False
    """
for compound_word in compound_words:
if compound_word not in text:
return False
else:
text = text[text.find(compound_word) + len(compound_word):]
return True
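# A minimal usage sketch (hypothetical sample text) showing the ordered
# matching described above:
if __name__ == '__main__':
    sample_text = '出口管制再度升级,多家企业被列入实体清单'
    print(is_include_compound_words(sample_text, '出口+实体清单'.split('+')))  # True: parts appear in order
    print(is_include_compound_words(sample_text, '实体清单+出口'.split('+')))  # False: order reversed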
{
"port": 4005,
"ip": "114.116.90.53",
"model_name": "FastText-Model",
"train_url": "/platform/classification/FastText-Model/model_train/",
"application_url": "/platform/classification/FastText-Model/pred/",
"show_file_url": "/platform/operation/process/show_file/",
"remove_file_url": "/platform/operation/process/remove_file/",
"upload_file_url": "/platform/operation/process/upload_file/",
"publish_version_url": "/platform/operation/process/publish_version/",
"model_test_url": "/platform/operation/process/model_test/",
"dataset_saved_path": "../datasets/classification/FastText-Model",
"model_saved_path": "../../../model_saved/classification/FastText-Model",
"java_call_back_url": "http://114.115.205.50:9988/manage/algorithmModel/process/changeStatus",
"train_info": {
"modelProcessId": {
"paramter_name": "训练日志Id",
"paramter_data": "",
"paramter_description": "模型训练日志id"
},
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
},
"learning_rate": {
"paramter_name": "学习率",
"paramter_data": 0.03,
"paramter_description": "学习率"
},
"gpu": {
"paramter_name": "GPU",
"paramter_data": "",
"paramter_description": "是否使用GPU"
},
"data_path": {
"paramter_name": "语料版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——语料版本"
},
"model_path": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——模型版本"
}
},
"application_info": {
"title": {
"paramter_name": "文章标题",
"paramter_data": "",
"paramter_description": "文章标题"
},
"content": {
"paramter_name": "文章内容",
"paramter_data": "",
"paramter_description": "文章内容"
},
"id": {
"paramter_name": "文章id",
"paramter_data": "",
"paramter_description": "文章id"
}
},
"show_file_info": {
"file_path": {
"paramter_name": "查询文件的相对路径",
"paramter_data": "",
"paramter_description": "要查询的文件目录,注意这里是相对地址,eg: 查询语料保存根目录dataset_saved_path的语料情况可传入../datasets/classification/"
}
},
"remove_file_info": {
"file_path": {
"paramter_name": "删除文件的相对路径",
"paramter_data": "",
"paramter_description": "要删除的文件,注意这里是相对地址,eg: 删除语料保存根目录dataset_saved_path下的ssyw_column_classify语料文件夹可传入../datasets/classification/ssyw_column_classify"
},
"flag": {
"paramter_name": "文件删除标识",
"paramter_data": "",
"paramter_description": "删除文件还是文件夹的标识,删除文件时flag=“/”,删除文件夹时flag为空字符串"
}
},
"upload_file_info": {
"request_url": {
"paramter_name": "语料下载地址",
"paramter_data": "",
"paramter_description": "待上传的语料文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
},
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
}
},
"publish_version": {
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待发布的模型版本"
},
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
}
},
"model_test_info": {
"task_id": {
"paramter_name": "模型训练任务id",
"paramter_data": "",
"paramter_description": "模型训练任务id"
},
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待测试的模型版本"
},
"data_type": {
"paramter_name": "测试方式",
"paramter_data": "",
"paramter_description": "可选项:url地址解析标题正文|file文件"
},
"request_url": {
"paramter_name": "测试文件下载地址",
"paramter_data": "",
"paramter_description": "待上传的测试文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
}
}
}
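#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A hedged client sketch (not part of the original service) showing how the
# config above could drive a training request: the full endpoint is assumed
# to be http://ip:port + train_url, and the payload is built from the
# "paramter_data" fields of "train_info". The file name 'config.json' and
# the learning-rate value are illustrative assumptions.
import json
import requests

with open('config.json', 'r', encoding='utf-8') as f:
    cfg = json.load(f)
train_url = 'http://{}:{}{}'.format(cfg['ip'], cfg['port'], cfg['train_url'])
# One payload field per train_info entry, pre-filled with its paramter_data
payload = {name: info['paramter_data'] for name, info in cfg['train_info'].items()}
payload['learning_rate'] = 0.03  # illustrative override
response = requests.post(train_url, json=payload, timeout=30)
print(response.status_code, response.text)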
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import requests
import json
# Java callback endpoint
java_call_back_url = "http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus"
# Log output format
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a logger and set its level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
url = "http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus"
payload = json.dumps({
"result": "{'code': 200, 'result': '模型训练成功!模型评测指标为: precision: 100% recall: 100% f1-score: 100%', 'model_path': '../../../model_saved/classification/FastText-Model/11111/V0-2023_06_11-15_33_15/model.bin'}",
"id": "1455372078906662913"
})
headers = {
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
# dict_result = {'code': 200, 'result': '模型训练成功!模型评测指标为: precision: 100% recall: 100% f1-score: 100%', 'model_path': '../../../model_saved/classification/FastText-Model/11111/V0-2023_06_11-15_33_15/model.bin'}
# modelProcessId = "1455372078906662913"
# str_dict_result = json.dumps(dict_result, ensure_ascii=False)
# print(str_dict_result)
# # todo: call the Java status-update endpoint with the post-training result
# payload = json.dumps({
# "id": modelProcessId,
# "result": str_dict_result
# })
# print(payload)
# # todo: call the runtime parameter-generation helper to produce currentTime, appId
# headers = {
# 'Content-Type': 'application/json'
# }
# r1 = requests.post(url="http://192.168.1.82:9988/manage/algorithmModel/process/changeStatus",
# headers=headers, data=payload)
#
# r1_json = json.loads(r1.text)
# # print(r1_json)
# print(r1_json)
# python3.9.5
gunicorn==20.1.0
beautifulsoup4==4.11.1
datasketch==1.5.3
dynamic_yaml==1.2.3
emoji==1.4.2
Flask==2.0.1
hanlp==2.1.0b3
jieba==0.42.1
jionlp_py39==1.3.45
keras_bert==0.88.0
matplotlib==3.3.4
numpy==1.19.5
pandas==1.1.5
psutil==5.8.0
PyMySQL==1.0.2
python_Levenshtein==0.20.5
pytorch_pretrained_bert==0.6.2
PyYAML==5.3.1
rarfile==4.0
requests==2.28.1
scikit_learn==1.1.2
seaborn==0.11.2
simhash==2.0.0
tensorflow==2.6.0
torch==1.9.0
tqdm==4.62.2
Werkzeug==2.2.2
xlrd==1.1.0
XlsxWriter==3.0.1
protobuf==3.19.5
Levenshtein==0.20.5
sklearn==0.0
fasttext==0.9.2
#!/bin/sh
# Start the gunicorn-managed app and the standalone main server in the
# background; 'exec' was dropped because it would replace the shell before
# the second command could start.
nohup gunicorn -c app/app_config.py app/app_run:app --timeout 1200 >service.log 2>&1 &
nohup python app/main_server.py --timeout 300 >>service.log 2>&1 &
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : __init__.py
# @Author : LiuYan
# @Time : 2021/7/31 17:36
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : build_word2vec_weights
# @Author : LiuYan
# @Time : 2020/6/24 14:46
from itertools import islice
import numpy as np
import torch
from utils.utils import timeit
@timeit
def load_word2vec(path=None, word_vocab=None, embedding_dim=None):
"""
loading word vector
:param path: None
:param word_vocab: None
:param embedding_dim: 768/100 bert/glove.6B.100d
:return: a vector corresponding to word_vocab.
"""
    word_vocab_dict = word_vocab.stoi
    vectors_vocab = load_vec(path, embedding_dim=embedding_dim)
    # Fall back to zero vectors when the file has no pad/unk entry (the
    # original if/elif chain left these names unbound in that case)
    pad = vectors_vocab.get('[PAD]', vectors_vocab.get('pad', [0.0] * embedding_dim))
    unk = vectors_vocab.get('[UNK]', vectors_vocab.get('unk', [0.0] * embedding_dim))
vocab_size = len(word_vocab)
embed_weights = torch.zeros(vocab_size, embedding_dim)
for word, index in word_vocab_dict.items(): # word and index
if word in vectors_vocab:
em = vectors_vocab[word]
elif word == '<pad>':
em = pad
else:
em = unk
embed_weights[index, :] = torch.from_numpy(np.array(em))
return embed_weights
@timeit
def load_vec(path=None, embedding_dim=None):
"""
loading word vector
:param path: None
:param embedding_dim: 768/100 bert/glove.6B.100d
:return: a dictionary of word vectors
"""
vectors_vocab = {}
with open(path, 'r', encoding='utf-8') as f:
for line in islice(f, 1, None): # skip the first row
items = line.split()
char, vectors = items[0], items[-embedding_dim:]
vectors = [float(vector) for vector in vectors]
vectors_vocab[char] = vectors
return vectors_vocab
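# A minimal usage sketch, assuming a GloVe-style vector file exists at the
# (hypothetical) path below: a header line, then '<token> v1 ... vN' rows.
# vectors = load_vec(path='../data/glove.6B.100d.txt', embedding_dim=100)
# print('loaded {} tokens'.format(len(vectors)))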
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : database_mysql
# @Author : LiuYan
# @Time : 2021/9/14 17:51
import time
import base64
import pymysql
from base.config.base_config import db_config
class DatabaseMySQL(object):
def __init__(self):
super(DatabaseMySQL, self).__init__()
self._conn = None
self._cursor = None
self._connect()
def _connect(self) -> None:
self._conn = pymysql.connect(**db_config)
self._cursor = self._conn.cursor()
def query(self, id_model_process: str) -> list:
        # Fetch the task record (note: id is interpolated directly into the
        # SQL, so callers must pass trusted values)
        sql_query = 'select * from brpa_algorithm_model_process where id={};'.format(id_model_process)
print('SQL: {}'.format(sql_query))
self._cursor.execute(sql_query)
list_result = self._cursor.fetchall()
return list_result
    def update(self, id_model_process: str, process_result: str, model_path: str or None, status: int,
               update_by="'yan'", update_time=None) -> None:
        # Replace single quotes inside process_result so it embeds safely in the SQL literal
        process_result = process_result.replace("'", '"')
        # Update (timestamp taken at call time; a strftime default in the
        # signature would be evaluated once at function definition)
        update_time = time.strftime('%Y-%m-%d %H:%M:%S')
sql_update = '''update brpa_algorithm_model_process
set process_result = '{}', model_path = '{}', status = {}, update_by = {}, update_time = '{}'
where id = {};'''.format(
process_result, model_path, status, update_by, update_time, id_model_process
) if model_path else '''update brpa_algorithm_model_process
set process_result = '{}', status = {}, update_by = {}, update_time = '{}'
where id = {};'''.format(
process_result, status, update_by, update_time, id_model_process
)
print('SQL: {}'.format(sql_update))
self._cursor.execute(sql_update)
self._conn.commit()
def close(self) -> None:
self._cursor.close()
self._conn.close()
if __name__ == '__main__':
import json
id_model_process = '1453295293008211969'
dict_result = {
'result': '训练成功!模型评测指标为: precision: {:.0f}% recall: {:.0f}% f1-score: {:.0f}%'.format(
0.91111111111111 * 100,
0.91111111111111 * 100,
0.91111111111111 * 100
)
}
dbm = DatabaseMySQL()
list_result = dbm.query(id_model_process=id_model_process)
model_path = '/home/zzsn/liuyan/zzsn_nlp_br/classification/model/model_saved/fast_text-pro_info_filter-2021_10_14-18_37_50/model.bin'
dbm.update(id_model_process=id_model_process, process_result=dict_result['result'], model_path=model_path, status=1)
dict_result = {
'result': '训练失败!'
}
dbm.update(id_model_process='1453536215885279233', process_result=dict_result['result'], model_path=None, status=2)
list_result = dbm.query(id_model_process=id_model_process)
dbm.close()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : log
# @Author : LiuYan
# @Time : 2020/6/21 21:08
import os
import logging
import logging.handlers
from pathlib import Path
__all__ = ['logger']
# User configuration section ↓
import tqdm
LEVEL_COLOR = {
'DEBUG': 'cyan',
'INFO': 'green',
'WARNING': 'yellow',
'ERROR': 'red',
'CRITICAL': 'red,bg_white',
}
STDOUT_LOG_FMT = '%(log_color)s[%(asctime)s] [%(levelname)s] [%(threadName)s] [%(filename)s:%(lineno)d] %(message)s'
STDOUT_DATE_FMT = '%Y-%m-%d %H:%M:%S'
FILE_LOG_FMT = '[%(asctime)s] [%(levelname)s] [%(threadName)s] [%(filename)s:%(lineno)d] %(message)s'
FILE_DATE_FMT = '%Y-%m-%d %H:%M:%S'
# User configuration section ↑
class ColoredFormatter(logging.Formatter):
COLOR_MAP = {
'black': '30',
'red': '31',
'green': '32',
'yellow': '33',
'blue': '34',
'magenta': '35',
'cyan': '36',
'white': '37',
'bg_black': '40',
'bg_red': '41',
'bg_green': '42',
'bg_yellow': '43',
'bg_blue': '44',
'bg_magenta': '45',
'bg_cyan': '46',
'bg_white': '47',
'light_black': '1;30',
'light_red': '1;31',
'light_green': '1;32',
'light_yellow': '1;33',
'light_blue': '1;34',
'light_magenta': '1;35',
'light_cyan': '1;36',
'light_white': '1;37',
'light_bg_black': '100',
'light_bg_red': '101',
'light_bg_green': '102',
'light_bg_yellow': '103',
'light_bg_blue': '104',
'light_bg_magenta': '105',
'light_bg_cyan': '106',
'light_bg_white': '107',
}
def __init__(self, fmt, datefmt):
super(ColoredFormatter, self).__init__(fmt, datefmt)
def parse_color(self, level_name):
color_name = LEVEL_COLOR.get(level_name, '')
if not color_name:
return ""
color_value = []
color_name = color_name.split(',')
for _cn in color_name:
color_code = self.COLOR_MAP.get(_cn, '')
if color_code:
color_value.append(color_code)
return '\033[' + ';'.join(color_value) + 'm'
def format(self, record):
record.log_color = self.parse_color(record.levelname)
message = super(ColoredFormatter, self).format(record) + '\033[0m'
return message
class TqdmLoggingHandler(logging.Handler):
def __init__(self, level=logging.NOTSET):
super().__init__(level)
def emit(self, record):
try:
msg = self.format(record)
tqdm.tqdm.write(msg)
self.flush()
except (KeyboardInterrupt, SystemExit):
raise
except:
self.handleError(record)
def _get_logger(log_to_file=True, log_filename='default.log', log_level='DEBUG'):
_logger = logging.getLogger(__name__)
stdout_handler = logging.StreamHandler()
stdout_handler.setFormatter(
ColoredFormatter(
fmt=STDOUT_LOG_FMT,
datefmt=STDOUT_DATE_FMT,
)
)
_logger.addHandler(stdout_handler)
# _logger.setLevel(logging.INFO)
# _logger.addHandler(TqdmLoggingHandler())
if log_to_file:
# _tmp_path = os.path.dirname(os.path.abspath(__file__))
# _tmp_path = os.path.join(_tmp_path, '../logs/{}'.format(log_filename))
_project_path = os.path.dirname(os.getcwd())
_tmp_path = os.path.join(_project_path, 'logs')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
_tmp_path = os.path.join(_tmp_path, log_filename)
file_handler = logging.handlers.TimedRotatingFileHandler(_tmp_path, when='midnight', backupCount=30)
file_formatter = logging.Formatter(
fmt=FILE_LOG_FMT,
datefmt=FILE_DATE_FMT,
)
file_handler.setFormatter(file_formatter)
_logger.addHandler(file_handler)
_logger.setLevel(log_level)
return _logger
logger = _get_logger(log_to_file=False)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : tool
# @Author : LiuYan
# @Time : 2021/6/21 11:22
import re
import json
def read_json(path: str) -> list:
    # Read a JSON-lines file: one JSON object per line
    examples = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            examples.append(json.loads(line))
    return examples
def clean_text(text: str) -> str:
return re.sub('\n+', '\n', text.strip().replace(' ', '').replace('\t', '').replace('\r', ''))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : utils
# @Author : LiuYan
# @Time : 2021/4/16 17:54
from __future__ import unicode_literals, print_function, division
import time
import xlsxwriter
def timeit(f):
def timed(*args, **kw):
ts = time.time()
print('......begin {0:8s}......'.format(f.__name__))
result = f(*args, **kw)
te = time.time()
print('......finish {0:8s}, took:{1:.4f} sec......'.format(f.__name__, te - ts))
return result
return timed
def list2xlsx(result_list: list, xlsx_path: str):
"""
:param result_list: [
{
'id': 1,
'title': 't',
...
}
...
]
:param xlsx_path: '/home/zzsn/liuyan/result/result.xlsx'
:return:
"""
workbook = xlsxwriter.Workbook(xlsx_path)
worksheet = workbook.add_worksheet('sheet1')
worksheet.write_row(row=0, col=0, data=list(result_list[0].keys()))
for row_index, result_dict in enumerate(result_list):
worksheet.write_row(row=row_index + 1, col=0, data=list(
';'.join(result) if type(result) in [list, set] else result for result in result_dict.values()
))
workbook.close()
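# Usage sketch (hypothetical output path): writes a header row from the first
# dict's keys and one row per dict, joining list/set values with ';'.
# list2xlsx([{'id': 1, 'title': 't', 'tags': ['a', 'b']}], '/tmp/result.xlsx')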
Get the corpus directory listing:
Actual show_file_url address: http://ip:port + operation_prefix + show_file_url
Actual upload_file_url address: http://ip:port + operation_prefix + upload_file_url
Actual publish_version_url address: http://ip:port + operation_prefix + publish_version_url
Actual model_test_url address: http://ip:port + operation_prefix + model_test_url
Actual train_url address: http://ip:port + train_url
Actual application_url address: http://ip:port + application_prefix + /pred/
# Actual remove_file_url address: http://ip:port + operation_prefix + remove_file_url
http://114.116.90.53:4004/new_task/
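A small Python sketch of the address rules above (the helper name and the
'config.json' file name are illustrative; field names come from the config
that follows):
import json

def full_url(cfg: dict, key: str, prefix_key: str = 'operation_prefix') -> str:
    # Compose http://ip:port + prefix + relative path
    return 'http://{}:{}{}{}'.format(cfg['ip'], cfg['port'], cfg.get(prefix_key, ''), cfg[key])

with open('config.json', encoding='utf-8') as f:
    cfg = json.load(f)
print(full_url(cfg, 'show_file_url'))  # http://114.116.90.53:4004/platform/operation/process/show_file/
print(full_url(cfg, 'application_url', 'application_prefix'))  # .../ssyw_column/classify/pred/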
{
"port": 4004,
"ip": "114.116.90.53",
"model_name": "ssyw_column_classify",
"operation_prefix": "/platform/operation/process",
"application_prefix": "/platform/classification/ssyw_column/classify",
"train_url": "/platform/classification/ssyw_column/classify/model_train/",
"application_url": "/pred/",
"show_file_url": "/show_file/",
"remove_file_url": "/remove_file/",
"upload_file_url": "/upload_file/",
"publish_version_url": "/publish_version/",
"model_test_url": "/model_test/",
"dataset_saved_path": "../datasets/classification",
"model_saved_path": "../../../model_saved/classification",
"java_call_back_url": "http://114.115.205.50:9988/manage/algorithmModel/process/changeStatus",
"train_info": {
"modelProcessId": {
"paramter_name": "模型任务Id",
"paramter_data": "",
"paramter_description": "模型训练任务id,关联哪个模型"
},
"learning_rate": {
"paramter_name": "学习率",
"paramter_data": 0.03,
"paramter_description": "学习率"
},
"epoch": {
"paramter_name": "训练轮数",
"paramter_data": 10,
"paramter_description": "训练轮数"
},
"gpu": {
"paramter_name": "GPU",
"paramter_data": "",
"paramter_description": "是否使用GPU"
},
"data_path": {
"paramter_name": "语料版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——语料版本"
},
"model_path": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "模型训练时用户填入参数——模型版本"
}
},
"application_info": {
"title": {
"paramter_name": "文章标题",
"paramter_data": "",
"paramter_description": "文章标题"
},
"content": {
"paramter_name": "文章内容",
"paramter_data": "",
"paramter_description": "文章内容"
},
"id": {
"paramter_name": "文章id",
"paramter_data": "",
"paramter_description": "文章id"
}
},
"show_file_info": {
"file_path": {
"paramter_name": "查询文件的相对路径",
"paramter_data": "",
"paramter_description": "要查询的文件目录,注意这里是相对地址,eg: 查询语料保存根目录dataset_saved_path的语料情况可传入../datasets/classification/"
}
},
"remove_file_info": {
"file_path": {
"paramter_name": "删除文件的相对路径",
"paramter_data": "",
"paramter_description": "要删除的文件,注意这里是相对地址,eg: 删除语料保存根目录dataset_saved_path下的ssyw_column_classify语料文件夹可传入../datasets/classification/ssyw_column_classify"
},
"flag": {
"paramter_name": "文件删除标识",
"paramter_data": "",
"paramter_description": "删除文件还是文件夹的标识,删除文件时flag=“/”,删除文件夹时flag为空字符串"
}
},
"upload_file_info": {
"url_path": {
"paramter_name": "语料下载地址",
"paramter_data": "",
"paramter_description": "待上传的语料文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
},
"dataFolderName": {
"paramter_name": "语料版本名称",
"paramter_data": "",
"paramter_description": "待上传的语料版本名称,在训练的时候使用"
}
},
"publish_version": {
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待发布的模型版本"
},
"versionName": {
"paramter_name": "发布版本号",
"paramter_data": "",
"paramter_description": "待发布的版本号"
}
},
"model_test_info": {
"modelProcessId": {
"paramter_name": "模型任务Id",
"paramter_data": "",
"paramter_description": "模型训练任务id,关联哪个模型"
},
"trainModelName": {
"paramter_name": "模型版本",
"paramter_data": "",
"paramter_description": "待测试的模型版本"
},
"data_type": {
"paramter_name": "测试方式",
"paramter_data": "",
"paramter_description": "可选项:url地址解析|file文件"
},
"url_path": {
"paramter_name": "测试文件下载地址",
"paramter_data": "",
"paramter_description": "待上传的测试文件下载地址,当前仅支持xlsx和xls文件,且文件内容需要包含title、content、label三个字段"
},
"title": {
"paramter_name": "文章标题",
"paramter_data": "",
"paramter_description": "文章标题"
},
"content": {
"paramter_name": "文章内容",
"paramter_data": "",
"paramter_description": "文章内容"
}
}
}
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import socket
import os
import psutil
# Get CPU load (cumulative fraction of busy time, read from /proc/stat)
def get_cpu():
    with open("/proc/stat", "r") as f:
        line = ""
        while "cpu " not in line:
            line = f.readline()
    # The aggregate "cpu" line has a double space, so values start at index 2
    spl = line.split(" ")
    worktime = int(spl[2]) + int(spl[3]) + int(spl[4])  # user + nice + system
    idletime = int(spl[5])
    if worktime + idletime == 0:
        return 0
    return float(worktime) / (idletime + worktime)
def get_hostname():
return socket.gethostname()
def get_uptime():
with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.readline().split()[0])
uptime_minutes, uptime_seconds = divmod(uptime_seconds, 60)
uptime_hours, uptime_minutes = divmod(uptime_minutes, 60)
uptime_days, uptime_hours = divmod(uptime_hours, 24)
return f"{int(uptime_days)} days, {int(uptime_hours)} hours, {int(uptime_minutes)} minutes, {int(uptime_seconds)} seconds"
def get_kernel_version():
return os.uname().release
# Get CPU usage as a percentage string
def get_cpu_info():
    cpu_usage = int(get_cpu() * 100)
    # cpu_tip = "CPU usage (max 100%): " + str(cpu_usage) + "%"
    # print(str(cpu_usage))
    return str(cpu_usage)
def get_memory_info():
memory_info = psutil.virtual_memory()
return f"Total memory: {memory_info.total / 1024 / 1024:.2f} MB\nUsed memory: {memory_info.used / 1024 / 1024:.2f} MB\nFree memory: {memory_info.available / 1024 / 1024:.2f} MB"
def get_disk_usage():
partitions = psutil.disk_partitions()
disk_usage = ""
for partition in partitions:
usage = psutil.disk_usage(partition.mountpoint)
disk_usage += f"{partition.mountpoint} - Total: {usage.total / 1024 / 1024:.2f} MB, Used: {usage.used / 1024 / 1024:.2f} MB, Free: {usage.free / 1024 / 1024:.2f} MB\n"
return disk_usage
def get_network_interfaces():
interfaces = psutil.net_if_addrs()
network_interfaces = ""
for interface_name, interface_addresses in interfaces.items():
network_interfaces += f"{interface_name}\n"
for address in interface_addresses:
if address.family == socket.AF_INET:
network_interfaces += f" IP address: {address.address}\n"
network_interfaces += f" Netmask: {address.netmask}\n"
elif address.family == socket.AF_PACKET:
network_interfaces += f" MAC address: {address.address}\n"
return network_interfaces
def main_pro():
hostname = get_hostname()
UpTime = get_uptime()
KN_Version = get_kernel_version()
CPU_Info = get_cpu_info()
Memory_Info = get_memory_info()
Disk_Usage = get_disk_usage()
Network_Interfaces = get_network_interfaces()
dict_result = {
"HostName": hostname,
"UpTime": UpTime,
"KN_Version": KN_Version,
"CPU_Info": CPU_Info,
"Memory_Info": Memory_Info,
"Disk_Usage": Disk_Usage,
"Network_Interfaces": Network_Interfaces
}
return dict_result
if __name__ == "__main__":
print(f"Hostname: {get_hostname()}")
print(f"Uptime: {get_uptime()}")
print(f"Kernel version: {get_kernel_version()}")
print(f"CPU information:\n{get_cpu_info()}")
print(f"Memory information:\n{get_memory_info()}")
print(f"Disk usage:\n{get_disk_usage()}")
print(f"Network interfaces:\n{get_network_interfaces()}")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
创建模型任务功能
http://114.116.90.53:4004/new_task/
"""
import os
import json
import logging
from flask import Flask, request
from main_model import main_info
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
app = Flask(__name__)
# todo: the model name is derived from its directory name under root_path
root_path = "../"
# Cross-origin (CORS) support
from flask_cors import CORS
CORS(app, supports_credentials=True)
@app.route('/', methods=['POST'])
def hello_world():
app.logger.info('请选择正确的方式上传!')
return '请选择正确的方式上传!'
@app.route(f'/get_server_info/', methods=['GET', 'POST'])
def get_server_info():
dict_result = main_info()
app.logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route(f'/new_task/', methods=['POST'])
def build_task():
try:
params = json.loads(request.data.decode('utf-8'))
modelName = params["modelName"]
modelPath = os.path.join(root_path, modelName)
        if modelName:
            # Return the contents of the model directory's config.json
            config_path = os.path.join(modelPath, "config.json")
            with open(config_path, 'r', encoding='utf-8') as f:
                config_json = json.load(f)
dict_result = {
"code": 200,
'handleMsg': 'Success',
'logs': None,
"resultData": config_json
}
else:
dict_result = {
"code": 500,
'handleMsg': 'Failure',
'logs': None,
"resultData": "请选择模型管理中存在的模型来进行创建模型任务!"
}
except Exception as e:
dict_result = {
'code': 500,
'success': 'false',
'message': "操作失败" + str(e),
'result': None
}
app.logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
if __name__ == '__main__':
app.config['JSON_AS_ASCII'] = False
app.config['JSONIFY_MIMETYPE'] = "application/json;charset=utf-8"
app.run(host='0.0.0.0', port=4004, debug=False)
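# A hedged client sketch for the /new_task/ endpoint above (the modelName
# value is illustrative and must name a directory under root_path that
# contains a config.json):
# import json, requests
# resp = requests.post(
#     'http://114.116.90.53:4004/new_task/',
#     headers={'Content-Type': 'application/json'},
#     data=json.dumps({'modelName': 'FastText-Model'}).encode('utf-8')
# )
# print(resp.json())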
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import socket
import os
import psutil
import platform
# Get CPU load (cumulative fraction of busy time, read from /proc/stat)
def get_cpu():
    with open("/proc/stat", "r") as f:
        line = ""
        while "cpu " not in line:
            line = f.readline()
    # The aggregate "cpu" line has a double space, so values start at index 2
    spl = line.split(" ")
    worktime = int(spl[2]) + int(spl[3]) + int(spl[4])  # user + nice + system
    idletime = int(spl[5])
    if worktime + idletime == 0:
        return 0
    return float(worktime) / (idletime + worktime)
def get_hostname():
return socket.gethostname()
def get_uptime():
with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.readline().split()[0])
uptime_minutes, uptime_seconds = divmod(uptime_seconds, 60)
uptime_hours, uptime_minutes = divmod(uptime_minutes, 60)
uptime_days, uptime_hours = divmod(uptime_hours, 24)
return f"{int(uptime_days)} days, {int(uptime_hours)} hours, {int(uptime_minutes)} minutes, {int(uptime_seconds)} seconds"
def get_kernel_version():
return os.uname().release
# Get CPU usage as a percentage string
def get_cpu_info():
    cpu_usage = int(get_cpu() * 100)
    # cpu_tip = "CPU usage (max 100%): " + str(cpu_usage) + "%"
    # print(str(cpu_usage))
    return str(cpu_usage)
def get_memory_info():
memory_info = psutil.virtual_memory()
return f"Total memory: {memory_info.total / 1024 / 1024:.2f} MB\nUsed memory: {memory_info.used / 1024 / 1024:.2f} MB\nFree memory: {memory_info.available / 1024 / 1024:.2f} MB"
def get_disk_usage():
partitions = psutil.disk_partitions()
disk_usage = ""
for partition in partitions:
usage = psutil.disk_usage(partition.mountpoint)
disk_usage += f"{partition.mountpoint} - Total: {usage.total / 1024 / 1024:.2f} MB, Used: {usage.used / 1024 / 1024:.2f} MB, Free: {usage.free / 1024 / 1024:.2f} MB\n"
return disk_usage
def get_network_interfaces():
interfaces = psutil.net_if_addrs()
network_interfaces = ""
for interface_name, interface_addresses in interfaces.items():
network_interfaces += f"{interface_name}\n"
for address in interface_addresses:
if address.family == socket.AF_INET:
network_interfaces += f" IP address: {address.address}\n"
network_interfaces += f" Netmask: {address.netmask}\n"
elif address.family == socket.AF_PACKET:
network_interfaces += f" MAC address: {address.address}\n"
return network_interfaces
def get_public_ip():
    """
    Get the host's outbound IP via a UDP socket trick. Note this yields the
    local (often private) interface address, not necessarily a public IP.
    """
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(("8.8.8.8", 80))
ip = s.getsockname()[0]
s.close()
return ip
def main_pro():
hostname = get_hostname()
# UpTime = get_uptime()
KN_Version = get_kernel_version()
CPU_Info = get_cpu_info()
Memory_Info = get_memory_info()
Disk_Usage = get_disk_usage()
ip = get_public_ip()
dict_result = {
"HostName": hostname,
# "UpTime": UpTime,
"KN_Version": KN_Version,
"CPU_Info": CPU_Info,
"Memory_Info": Memory_Info,
"Disk_Usage": Disk_Usage,
"Network_Interfaces": ip
}
return dict_result
def main_info():
    # OS information
    os_info = platform.platform()
    # Processor information
    processor_info = platform.processor()
    # Available memory
    mem_info = psutil.virtual_memory()
    available_mem = round(mem_info.available / 1024 / 1024, 2)
    # Available disk space
    disk_info = psutil.disk_usage('/')
    available_disk = round(disk_info.free / 1024 / 1024, 2)
    # Local (private) IP
    ip = get_public_ip()
    # Print machine information
    print("操作系统:", os_info)
    print("处理器型号:", processor_info)
    print("可用内存大小:", available_mem, "MB")
    print("可用硬盘大小:", available_disk, "MB")
    print("ip地址:", ip)
    dict_result = {
        "操作系统:": os_info,
        "处理器型号:": processor_info,
        "可用内存大小:": available_mem,
        "可用硬盘大小:": available_disk,
        "ip地址:": ip  # was hardcoded to "114.116.90.53"; use the detected address
}
return dict_result
if __name__ == "__main__":
main_info()
# import requests
#
# response = requests.get('https://api.ipify.org')
# public_ip = response.text
#
# print(public_ip)
# dict_result = main_pro()
# print(dict_result)