Commit c4e5365c (author: ctt)

Natural Language Platform, version V1.0

Start the Django development server in the background:
nohup python manage.py runserver --noreload 0.0.0.0:7004 >> app.log 2>&1 &
from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class BaseConfig(AppConfig):
name = 'base'
from django.db import models
from datetime import datetime
# Create your models here.
class User(models.Model):
username = models.CharField(max_length=30, unique=True)
true_name = models.CharField(max_length=30)
sex = models.CharField(max_length=2)
mobile_number = models.CharField(max_length=20)
mail = models.CharField(max_length=20)
id_card = models.CharField(max_length=20)
password = models.CharField(max_length=40)
account_number = models.CharField(max_length=20)
def toDict(self):
return {'id':self.id,
'username':self.username,
'true_name':self.true_name,
'sex':self.sex,
'mobile_number':self.mobile_number,
'mail':self.mail,
'id_card':self.id_card,
'password':self.password,
'account_number':self.account_number,
# 'update_at':self.update_at.strftime('%Y-%m-%d %H:%M:%S')
}
class Meta:
db_table = 'user'
class ServiceManage(models.Model):
name = models.CharField(max_length=15)
username = models.CharField(max_length=30)
filenames = models.CharField(max_length=200)
create_date = models.DateTimeField(default=None)
end_date = models.DateTimeField(default=None)
state = models.CharField(max_length=10)
path = models.CharField(max_length=20)
def toDict(self):
return {'name': self.name,
'username': self.username,
'filenames': self.filenames,
'create_date': self.create_date.strftime('%Y-%m-%d %H:%M:%S'),
'end_date': self.end_date.strftime('%Y-%m-%d %H:%M:%S'),
'state': self.state,
'path': self.path,
}
class Meta:
db_table = 'service_manage'
class SubjectManage(models.Model):
sid = models.CharField(max_length=10, unique=True)
name = models.CharField(max_length=30)
def toDict(self):
return {'sid': self.sid,
'name': self.name,
}
class Meta:
db_table = 'subject_manage'
class ModelManage(models.Model):
task_name = models.CharField(max_length=30)
function_type = models.CharField(max_length=20)
model_type = models.CharField(max_length=20)
version_num = models.IntegerField()
create_date = models.DateTimeField(default=None)
def toDict(self):
return {'id': self.id,
'task_name': self.task_name,
'function_type': self.function_type,
'model_type': self.model_type,
'version_num': self.version_num,
'create_date': self.create_date.strftime('%Y-%m-%d %H:%M:%S'),
}
class Meta:
db_table = 'model_manage'
class VersionManage(models.Model):
model = models.ForeignKey(ModelManage, related_name='version_model', on_delete=models.CASCADE)
version = models.CharField(max_length=20)
create_date = models.DateTimeField(default=None)
end_date = models.DateTimeField(default=None)
state = models.CharField(max_length=20)
creator = models.CharField(max_length=30)
path = models.CharField(max_length=20, unique=True)
def toDict(self):
return {'id': self.id,
'version': self.version,
'create_date': self.create_date.strftime('%Y-%m-%d %H:%M:%S'),
'end_date': self.end_date.strftime('%Y-%m-%d %H:%M:%S'),
'state': self.state,
'creator': self.creator,
'path': self.path,
}
class Meta:
db_table = 'version_manage'
from django.test import TestCase
# Create your tests here.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/12 18:05
# @Author : 程婷婷
# @FileName: urls.py
# @Software: PyCharm
from django.urls import path
from django.conf.urls import url
from base.views import views as base_views
urlpatterns = [
url(r'^register-account', base_views.register_account, name='register_account'),
url(r'^verify-username', base_views.verify_username, name='verify_username'),
url(r'^login', base_views.login, name='login'),
url(r'^reset-password', base_views.reset_password, name='reset_password'),
url(r'^show-config-file', base_views.show_config_file, name='show_config_file'),
url(r'^show-service-file', base_views.show_service_file, name='show_service_file'),
url(r'^delete-file-row-manage', base_views.delete_file_row_manage, name='delete_file_row_manage'),
url(r'^delete-file-row-service', base_views.delete_file_row_service, name='delete_file_row_service'),
url(r'^file-upload', base_views.file_upload, name='file_upload'),
url(r'^show-log-file', base_views.show_log_file, name='show_log_file'),
url(r'^validate-code', base_views.validate_code, name='validate_code'),
url(r'^download-zip', base_views.download_zip, name='download_zip'),
url(r'^download-xlsx', base_views.download_xlsx, name='download_xlsx'),
url(r'^query-manage', base_views.query_manage, name='query_manage'),
url(r'^forget-password', base_views.forget_password, name='forget_password'),
url(r'^train', base_views.run_train, name='train'),
url(r'^query-service-manage', base_views.query_service_manage, name='query_service_manage'),
url(r'^query-subject', base_views.query_subject, name='query_subject'),
url(r'^query-version', base_views.query_version, name='query_version'),
url(r'^query-task-name', base_views.query_task_name, name='query_task_name')
]
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/13 11:51
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/10 14:34
# @Author : 程婷婷
# @FileName: BaseConfig.py
# @Software: PyCharm
import yaml
class BaseConfig:
def __init__(self, config_path):
self._config_path = config_path
self._parsed_file = self.load_config()
def load_config(self):
print(self._config_path)
with open(self._config_path) as yaml_file:
parsed_file = yaml.load(yaml_file, Loader=yaml.FullLoader)
return parsed_file
# if __name__ == '__main__':
# bc = BaseConfig()
# print(bc._parsed_file)
# print(bc.load_config()['data_path'])
# print(bc.load_config()['embedding'])
# print(bc.load_config()['model'])
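# A hedged illustration of the config.yaml structure this class is expected to
# parse: the section and key names below are the ones read elsewhere in this
# repository, while the values are examples only.
example_parsed_config = {
    'data_loader': {'dataset_path': 'train.xlsx', 'stopwords_path': 'baidu_stopwords.txt'},
    'data_process': {'tokenizer': 'jieba', 'use_stopwords': True,
                     'train_size': 0.8, 'test_size': 0.5, 'random_state': 42},
    'embedding': {'embedding_path': 'embedding', 'norm': 'l2', 'use_idf': True,
                  'smooth_idf': True, 'with_feature_selection': False,
                  'size': 100, 'window': 5, 'min_count': 5, 'workers': 4,
                  'sg': 1, 'iter': 5, 'use_Tencent': False},
    'model': {'model_name': 'model.pkl', 'model_path': ''},
    'evaluate': {'average': 'binary'},
    'runner': {'save_fname': 'result.xlsx'},
}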
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 9:58
# @Author : 程婷婷
# @FileName: BaseDataLoader.py
# @Software: PyCharm
import pandas as pd
from base.views.config.BaseConfig import BaseConfig
class BaseDataLoader:
def __init__(self, config_path):
self.data_loader_config = BaseConfig(config_path)._parsed_file['data_loader']
def read_file(self):
symbol = self.data_loader_config['dataset_path'].split('.')[-1]
if (symbol == 'xlsx') or (symbol == 'xls'):
df = pd.read_excel(r''+self.data_loader_config['dataset_path'])
elif symbol == 'csv':
df = pd.read_csv(r''+self.data_loader_config['dataset_path'], sep='\t')
else:
print('数据类型错误')
return '数据类型错误'
df.drop_duplicates(subset='content', keep='first', inplace=True)
df.dropna(subset=['content', 'label'], inplace=True)
df = df.reset_index(drop=True)
print('=================执行正文去重和去空之后共有%d条数据=============' % len(df['content']))
return df
def read_stopwords(self):
# read in the stopword list
stopword_list = [k.strip() for k in open(self.data_loader_config['stopwords_path'], encoding='utf8').readlines() if
k.strip() != '']
return stopword_list
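# A hedged sketch of the dataset layout read_file() expects: an .xlsx/.xls file
# (or a tab-separated .csv) with at least 'content' and 'label' columns. The
# file name below is illustrative only.
if __name__ == '__main__':
    demo = pd.DataFrame({'content': ['今天股市大涨。', '公司发布年度财报。'],
                         'label': ['正面', '中性']})
    demo.to_excel('train.xlsx', index=False)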
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/10 15:28
# @Author : 程婷婷
# @FileName: BaseDataProcess.py
# @Software: PyCharm
import os
import re
import jieba
import pickle
import gensim
import logging
import numpy as np
import pandas as pd
from pyhanlp import *
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, SelectPercentile
from base.views.config.BaseConfig import BaseConfig
from base.views.data.BaseDataLoader import BaseDataLoader
from platform_zzsn.settings import BASE_DIR
format = '%(asctime)s %(levelname)s %(pathname)s %(funcName)s %(message)s'
logging.basicConfig(format=format, level=logging.INFO)
class BaseDataProcess:
def __init__(self, config_path):
self.embedding_config = BaseConfig(config_path)._parsed_file['embedding']
self.process_config = BaseConfig(config_path)._parsed_file['data_process']
PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
self.pla_segment = PerceptronLexicalAnalyzer()
self.bdl = BaseDataLoader(config_path)
def clean_content(self, content):
bs = BeautifulSoup(content, 'html.parser')
return bs.text
def remove_char(self, content):
# keep Chinese characters, Latin letters, digits, whitespace and basic punctuation
graph_filter = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\s,。\.,?\?!!;;]')
content = graph_filter.sub('', content)
return content
def jieba_tokenizer(self, content):
if self.process_config['use_stopwords']:
stopwords = self.bdl.read_stopwords()
else:
stopwords = []
return ' '.join([word for word in jieba.lcut(content) if word not in stopwords])
def pla_tokenizer(self, content):
words = list(self.pla_segment.analyze(content).toWordArray())
if self.process_config['use_stopwords']:
stopwords = self.bdl.read_stopwords()
else:
stopwords = []
return ' '.join([word for word in words if word not in stopwords])
def save(self, voc, path):
with open(path, 'wb') as voc_file:
pickle.dump(voc, voc_file)
def process(self, data, min_content=0):
processed_data = []
for record in data:
record = self.clean_content(str(record))
record = self.remove_char(record)
if len(record) > min_content:
methods = self.process_config['tokenizer']
if methods == 'PerceptronLexicalAnalyzer':
record = self.pla_tokenizer(record)
record = [row.strip() for row in record if row.strip() != '']
else:
record = self.jieba_tokenizer(record)
record = [row.strip() for row in record if row.strip() != '']
processed_data.append(' '.join(record))
else:
pass
return processed_data
def split_dataset(self, data, use_dev):
if use_dev:
train_data_set, test_dev_set = train_test_split(data,
train_size=self.process_config['train_size'],
random_state=self.process_config['random_state'],
shuffle=True)
test_data_set, dev_data_set = train_test_split(test_dev_set,
test_size=self.process_config['test_size'],
random_state=self.process_config['random_state'],
shuffle=True)
print(len(train_data_set) + len(test_data_set) + len(dev_data_set))
return train_data_set, test_data_set, dev_data_set
else:
train_data_set, test_data_set = train_test_split(data,
train_size=self.process_config['train_size'],
random_state=self.process_config['random_state'],
shuffle=True)
return train_data_set, test_data_set
def bag_of_words(self, data, label):
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=5)
x = vectorizer.fit_transform(data)
transformer = TfidfTransformer(norm=self.embedding_config['norm'], use_idf=self.embedding_config['use_idf'],
smooth_idf=self.embedding_config['smooth_idf'])
x = transformer.fit_transform(x).toarray()
if self.embedding_config['with_feature_selection']:
transformed_data = SelectPercentile(mutual_info_classif, percentile=20).fit_transform(x, label)
else:
transformed_data = x
os.makedirs(self.embedding_config['embedding_path'], exist_ok=True)
self.save(voc=vectorizer.vocabulary_, path=os.path.join(self.embedding_config['embedding_path'], 'tfidf.pkl'))
return transformed_data, vectorizer.get_feature_names()
def word2vec(self, data, feature_words):
model = gensim.models.word2vec.Word2Vec(sentences=data,
size=self.embedding_config['size'],
window=self.embedding_config['window'],
min_count=self.embedding_config['min_count'],
workers=self.embedding_config['workers'],
sg=self.embedding_config['sg'],
iter=self.embedding_config['iter'])
vocabulary_w2v = model.wv.vocab.keys()
count = 0
if self.embedding_config['use_Tencent']:
model_tencent = gensim.models.KeyedVectors.load_word2vec_format(
os.path.join(BASE_DIR, 'static/base/Tencent_AILab_ChineseEmbedding.bin'), binary=True)
vocabulary_tencent = model_tencent.wv.vocab.keys()
vector_matrix = np.zeros((len(feature_words), int(self.embedding_config['size']) + 200))
for word in feature_words:
if word in vocabulary_tencent:
vector_tencent = model_tencent.wv.word_vec(word)
else:
vector_tencent = np.random.randn(200)
if word in vocabulary_w2v:
vector_w2v = model.wv.word_vec(word)
else:
vector_w2v = np.random.randn(self.embedding_config['size'])
vector = np.concatenate((vector_tencent, vector_w2v))
vector_matrix[count] = vector
count += 1
else:
vector_matrix = np.zeros((len(feature_words), self.embedding_config['size']))
for word in feature_words:
if word in vocabulary_w2v:
vector_w2v = model.wv.word_vec(word)
else:
vector_w2v = np.random.randn(self.embedding_config['size'])
vector_matrix[count] = vector_w2v
count += 1
os.makedirs(self.embedding_config['embedding_path'], exist_ok=True)
model.save(os.path.join(self.embedding_config['embedding_path'], 'word2vec.model'))
return vector_matrix
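# A hedged end-to-end sketch of how the classes above combine (the config path
# is hypothetical, and it assumes process() drops no rows so labels stay aligned):
if __name__ == '__main__':
    processor = BaseDataProcess('config.yaml')
    df = processor.bdl.read_file()                # load and deduplicate the dataset
    texts = processor.process(df['content'])      # clean, tokenize and filter stopwords
    X, feature_words = processor.bag_of_words(texts, df['label'])  # TF-IDF feature matrix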
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:30
# @Author : 程婷婷
# @FileName: BaseEvaluator.py
# @Software: PyCharm
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report
import logging
from base.views.config.BaseConfig import BaseConfig
formats = '%(asctime)s %(levelname)s %(pathname)s %(funcName)s %(message)s'
logging.basicConfig(format=formats, level=logging.INFO)
class BaseEvaluator:
def __init__(self, config_path):
self.evaluate_config = BaseConfig(config_path)._parsed_file['evaluate']
def evaluate(self, y_true, y_pred, label_mapping, logger):
result = []
y_true = list(map(str, y_true))
y_pred = list(map(str, y_pred))
logger.info('模型评估结果如下:')
if not label_mapping:
result.append(classification_report(y_true, y_pred))
logger.info(classification_report(y_true, y_pred))
else:
for value in label_mapping.values():
print([k for k,v in label_mapping.items() if v == value])
p = precision_score(y_true, y_pred, average=self.evaluate_config['average'], pos_label=str(value))
r = recall_score(y_true, y_pred, average=self.evaluate_config['average'], pos_label=str(value))
f1 = f1_score(y_true, y_pred, average=self.evaluate_config['average'], pos_label=str(value))
print({'value': value,'召回率为': r, '精确率为': p, 'F1': f1})
logger.info('标签为%s' % [k for k,v in label_mapping.items() if v == value][0])
logger.info('精确率为%.2f' %p)
logger.info('召回率为%.2f' %r)
logger.info('F1值为%.2f' % f1)
result.append(str({'label': value,'recall': r, 'precision': p, 'F1': f1}))
return ' '.join(result)
# y_true = [0, 1, 2, 0, 1, 2]
# y_pred = [0, 2, 1, 0, 0, 1]
# print(BaseEvaluator())
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
import os
import yaml
import random
import smtplib
from email.mime.text import MIMEText
from django.core.paginator import Paginator
from email.mime.multipart import MIMEMultipart
from PIL import Image,ImageFont,ImageDraw,ImageFilter
from base.models import ModelManage, ServiceManage, VersionManage
from platform_zzsn.settings import BASE_DIR
class Picture:
def __init__(self):
self.size = (240,60)
self.mode='RGB'
self.color='white'
self.font = ImageFont.truetype(os.path.join(BASE_DIR,
'static/common/font/arial.ttf'), 36) #设置字体大小
def randChar(self):
basic='23456789abcdefghijklmnpqrstwxyzABCDEFGHIJKLMNPQRSTWXYZ'
return basic[random.randint(0,len(basic)-1)] #随机字符
def randBdColor(self):
return (random.randint(64,255),random.randint(64,255),random.randint(64,255)) #背景
def randTextColor(self):
return (random.randint(32, 127), random.randint(32, 127), random.randint(32, 127)) #随机颜色
def proPicture(self):
new_image=Image.new(self.mode,self.size,self.color) #创建新图像有三个默认参数:尺寸,颜色,模式
drawObject=ImageDraw.Draw(new_image) #创建一个可以对image操作的对象
line_num = random.randint(4,6) # 干扰线条数
for i in range(line_num):
#size=(240,60)
begin = (random.randint(0, self.size[0]), random.randint(0, self.size[1]))
end = (random.randint(0, self.size[0]), random.randint(0, self.size[1]))
drawObject.line([begin, end], self.randTextColor())
for x in range(240):
for y in range(60):
tmp = random.randint(0,50)
if tmp>30: #调整干扰点数量
drawObject.point((x,y),self.randBdColor())
randchar=''
for i in range(5):
rand=self.randChar()
randchar+=rand
drawObject.text([50*i+10,10],rand,self.randTextColor(),font=self.font) #写入字符
new_image = new_image.filter(ImageFilter.SHARPEN) # 滤镜
return new_image,randchar
def update_config_file(config_path, config_file):
data = yaml.load(config_file, Loader=yaml.FullLoader)
data['data_loader'] = {}
model_path = data['model']['model_path']
model_name = data['model']['model_name']
if data['model']['model_path']:
data['model']['model_path'] = os.path.join(config_path, model_path)
else:
data['model']['model_path'] = os.path.join(config_path, model_name)
print(data['model']['model_path'])
embedding_path = data['embedding']['embedding_path']
if embedding_path:
data['embedding']['embedding_path'] = os.path.join(config_path, data['embedding']['embedding_path'])
else:
if data['embedding']['name']:
data['embedding']['embedding_path'] = os.path.join(config_path, data['embedding']['name'])
tokenizer_path = data['embedding']['tokenizer_path']
if tokenizer_path:
data['embedding']['tokenizer_path'] = os.path.join(config_path, data['embedding']['tokenizer_path'])
try:
test_file_path = data['data_process']['test_file_path']
train_file_path = data['data_process']['train_file_path']
except KeyError:
pass
else:
data['data_process']['test_file_path'] = os.path.join(config_path, test_file_path)
data['data_process']['train_file_path'] = os.path.join(config_path, train_file_path)
for file in os.listdir(config_path):
if ('.xls' == file[-4:]) or ('.xlsx' == file[-5:]):
xlsx_path = os.path.join(config_path, file)
data['data_loader']['dataset_path'] = xlsx_path
if 'save_fname' in data['runner'].keys():
data['runner']['save_fpath'] = os.path.join(config_path, data['runner']['save_fname'])
data['data_loader']['stopwords_path'] = os.path.join(BASE_DIR, 'static/base/baidu_stopwords.txt')
file_path = os.path.join(config_path, 'config.yaml')
with open(file_path, 'w') as yaml_file:
yaml.safe_dump(data, yaml_file, default_flow_style=False)
return file_path
def select_manage(task_name, function_type, model_type, begin_cdate, end_cdate, page_size, current_page):
condition = {'task_name': task_name, 'function_type': function_type, 'model_type': model_type,
'create_date__range': (begin_cdate, end_cdate,)
}
del_keys = []
for key in condition.keys():
if not condition[key]:
del_keys.append(key)
if not condition['create_date__range'][0]:
del_keys.append('create_date__range')
for key in del_keys:
condition.pop(key)
managers = ModelManage.objects.filter(**condition).order_by('-create_date')
len_managers = len(managers)
page = Paginator(managers, page_size)
maxpages = page.num_pages # total number of pages
pIndex = int(current_page)
# clamp the requested page number to the valid range
if pIndex > maxpages:
pIndex = maxpages
manager_list = page.page(pIndex) # records on the current page
return list(manager_list), len_managers
def select_version(model_id, begin_cdate, end_cdate, page_size, current_page):
condition = {'model_id': model_id,
'create_date__range': (begin_cdate, end_cdate,)
}
del_keys = []
if not condition['create_date__range'][0]:
del_keys.append('create_date__range')
for key in del_keys:
condition.pop(key)
versions = VersionManage.objects.filter(**condition).order_by('-create_date')
len_versions = len(versions)
page = Paginator(versions, page_size)
maxpages = page.num_pages # total number of pages
pIndex = int(current_page)
# clamp the requested page number to the valid range
if pIndex > maxpages:
pIndex = maxpages
version_list = page.page(pIndex) # records on the current page
return list(version_list), len_versions
def select_service_manage(name, begin_cdate, end_cdate, state, username, page_size, current_page):
condition = {
'name': name,
'state': state,
'create_date__range': (begin_cdate, end_cdate),
'username': username,
}
del_keys = []
for key in condition.keys():
if not condition[key]:
del_keys.append(key)
if not condition['create_date__range'][0]:
del_keys.append('create_date__range')
for key in del_keys:
condition.pop(key)
print(condition)
service_managers = ServiceManage.objects.filter(**condition).order_by('-create_date')
len_service_managers = len(service_managers)
page = Paginator(service_managers, page_size)
maxpages = page.num_pages
pIndex = int(current_page)
# clamp the requested page number to the valid range
if pIndex > maxpages:
pIndex = maxpages
manager_list = page.page(pIndex) # records on the current page
return list(manager_list), len_service_managers
def sendMail(user,pwd,sender,receiver,msg_title):
mail_host = "smtp.163.com" #163的SMTP服务器
message = MIMEMultipart('alternative')
#设置邮件的发送者
message["From"] = sender
#设置邮件的接收方
message["To"] = ",".join(receiver)
#4.设置邮件的标题
message["Subject"] = msg_title
# optionally attach a plain-text body
# message.attach(MIMEText('您好,\n'
# ' 您当前的密码为%s, 为了保证您的账号安全,请尽快登陆重置您的密码'%msg_content, 'plain', 'utf-8'))
# attach the HTML body
message.attach(MIMEText('<html>'
'<body>'
'<h1>Hello </h1><br> '
'<h3>To ensure the security of your account, please log in and reset your password as soon as possible.</h3>'
'<h2><a href="http://192.168.1.149:8020/reset_password/">点此重置</a></h2>'
'</body>'
'</html>', 'html', 'utf-8'))
# 1. connect to the SMTP server over SSL
smtpObj = smtplib.SMTP_SSL(mail_host,465)
# 2. log in to the mailbox
smtpObj.login(user,pwd)
# 3. send the mail
# arguments: sender, recipients, message body
smtpObj.sendmail(sender,receiver,message.as_string())
return True
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:29
# @Author : 程婷婷
# @FileName: BaseLoss.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:18
# @Author : 程婷婷
# @FileName: BaseModel.py
# @Software: PyCharm
from base.views.config.BaseConfig import BaseConfig
import os
import pickle
class BaseModel:
def __init__(self,config_path):
self.model_config = BaseConfig(config_path)._parsed_file['model']
def building_model(self, *params):
pass
def save(self, model):
dir = os.path.dirname(self.model_config['model_path'])
if not os.path.exists(dir):
os.makedirs(dir)
with open(self.model_config['model_path'], 'wb') as model_file:
pickle.dump(model, model_file)
def predict(self, model, X):
proba = model.predict_proba(X)
y_predict = model.predict(X)
return {'proba': proba, 'y_predict': y_predict}
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:36
# @Author : 程婷婷
# @FileName: BaseRunner.py
# @Software: PyCharm
from base.views.config.BaseConfig import BaseConfig
class BaseRunner:
def __init__(self,config_path):
self.runner_config = BaseConfig(config_path)._parsed_file['runner']
def train(self, logger):
pass
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 9:24
# @Author : 程婷婷
# @FileName: test.py
# @Software: PyCharm
import jieba
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2
X, y = load_digits(return_X_y=True)
print(X.shape)
print(X[:10], y[:100])
X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
print(X_new.shape)
print(X_new[:10])
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/20 16:58
# @Author : 程婷婷
# @FileName: token_authorize.py
# @Software: PyCharm
import jwt
import time
import functools
from jwt import exceptions
from django.http import JsonResponse
from platform_zzsn.settings import *
# SECRET_KEY (imported from settings above) is used to sign and verify JWTs
def create_token(user):
'''Create a JWT token for the given user.'''
headers = {
"alg": "HS256",
"typ": "JWT"
}
exp = int(time.time() + 3*60*60)
payload = {
"id": user.id,
"name": user.username,
"exp": exp
}
token = jwt.encode(payload=payload, key=SECRET_KEY, algorithm='HS256', headers=headers).decode('utf-8')
return token
def login_required(view_func):
@functools.wraps(view_func)
def validate_token(request, *args, **kwargs):
'''Validate the request token; if it passes, execute the wrapped view.'''
payload = None
msg = None
try:
token = request.META.get("HTTP_AUTHORIZATION")
payload = jwt.decode(token, SECRET_KEY, algorithms=['HS256'])
print(payload)
return view_func(request, *args, **kwargs)
# JWT validity and legitimacy checks
except exceptions.ExpiredSignatureError:
return JsonResponse({
'handle_msg': 'failure',
'is_handle_success': False,
'logs': '登录已过期'
})
except jwt.DecodeError:
return JsonResponse({
'handle_msg': 'failure',
'is_handle_success': False,
'logs': '缺少参数token'
# token missing or could not be decoded
})
except jwt.InvalidTokenError:
return JsonResponse({
'handle_msg': 'failure',
'is_handle_success': False,
'logs': '非法的token'
# token is invalid
})
return validate_token
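# A hedged client-side sketch of how callers are expected to present the token
# (host, port and URL prefix are assumptions; the decorator above reads the raw
# token from the Authorization header, without a "Bearer " prefix).
if __name__ == '__main__':
    import requests
    login = requests.post('http://127.0.0.1:7004/base/login',
                          data={'username': 'demo', 'password': 'secret'})
    token = login.json()['token']
    requests.post('http://127.0.0.1:7004/base/query-subject',
                  data={'current_page': 1, 'page_size': 10},
                  headers={'Authorization': token})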
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/9 11:19
# @Author : 程婷婷
# @FileName: utils.py
# @Software: PyCharm
import os
import re
import jieba
import zipfile
import pandas as pd
from docx import Document
from platform_zzsn.settings import *
def read_txt(path):
with open(path, 'r', encoding='utf8') as file:
lines = file.readlines()
return lines
def read_docx(pending_file, user_file):
jieba.load_userdict(user_file)
document = Document(pending_file)
doc_text_list = []
for para in document.paragraphs:
para_text = re.sub(r'\s', '', para.text)
if para_text:
doc_text_list.append(para_text)
return doc_text_list
def read_excel(pending_file, user_file):
jieba.load_userdict(user_file)
doc_text_list = pd.read_excel(pending_file)['content']
doc_text_list.dropna(inplace=True)
return doc_text_list
def merge_para(paras):
new_paras = []
for i, para in enumerate(paras):
if not new_paras:
new_paras.append(para)
elif (len(new_paras[-1]) < 500):
new_paras[-1] += para
else:
new_paras.append(para)
return new_paras
def filter_stopwords(para):
path = os.path.join(BASE_DIR, 'static/base/baidu_stopwords.txt')
stopword_list = [k.strip() for k in read_txt(path) if
k.strip() != '']
words = [word for word in jieba.lcut(para) if word not in stopword_list]
return words
# sort key: return the second element of a pair
def takeSecond(elem):
return elem[1]
def takeFirst_len(elem):
return len(elem[0])
def make_zip(file_dir: str, zip_path: str) -> None:
zip_f = zipfile.ZipFile(zip_path, 'w')
pre_len = len(os.path.dirname(file_dir))
for parent, dir_names, filenames in os.walk(file_dir):
for filename in filenames:
path_file = os.path.join(parent, filename)
arc_name = path_file[pre_len:].strip(os.path.sep)
zip_f.write(path_file, arc_name)
zip_f.close()
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/14 10:32
# @Author : 程婷婷
# @FileName: views.py
# @Software: PyCharm
import os
import base64
import shutil
import logging
import datetime
import tempfile
import zipfile
from io import BytesIO
from django.db import transaction
from wsgiref.util import FileWrapper
from django.core.paginator import Paginator
from werkzeug.utils import secure_filename
from django.forms.models import model_to_dict
from django.http import JsonResponse, HttpResponse
from django.core.files.storage import default_storage
from django.views.decorators.http import require_POST
from base.views import interaction, utils
from base.views.token_authorize import *
from base.models import User, ModelManage, ServiceManage, SubjectManage, VersionManage
from classify.views.textcnn_classify.TextcnnClassifyRunner import TextcnnClassifyRunner
from classify.views.xgboost_classify.XgboostClassifyRunner import XgboostClassifyRunner
from classify.views.logistic_classify.LogisticClassifyRunner import LogisticClassifyRunner
from classify.views.fasttext_classify.FastTextRunner import FastTextRunner
# from classify.flair_classify.FlairClassifyRunner import FlairClassifyRunner
from clustering.views.KMeans.KmeansRunner import KmeansRunner
from platform_zzsn.settings import BASE_DIR
print('-----------')
print(BASE_DIR)
UPLOAD_FOLDER = os.path.join(BASE_DIR, 'media/')
ALLOWED_EXTENSIONS = set(['yaml', 'xlsx', 'xls', 'doc', 'docx', 'txt'])
# Log in
@require_POST
def login(request):
username = request.POST['username']
password = request.POST['password']
try:
user = User.objects.filter(username=username)
if not user:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '用户不存在!',
'resultData': False,
})
elif user[0].password == password:
token = create_token(user[0])
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '登陆成功!',
'resultData': 'customer',
})
else:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '账号或密码不正确!',
'resultData': False,
})
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '登陆失败!',
'resultData': False,
})
# Register an account
@require_POST
def register_account(request):
try:
username = request.POST['username']
true_name = request.POST['true_name']
sex = request.POST['sex']
mobile_number = request.POST['mobile_number']
mail = request.POST['mail']
id_card = request.POST['id_card']
password = request.POST['password']
account_number = username + '@zzsn.cn'
user = User.objects.create(
username=username,
true_name=true_name,
sex=sex,
mobile_number=mobile_number,
mail=mail,
id_card=id_card,
password=password,
account_number=account_number,
)
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '注册失败!',
'resultData': False
})
else:
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '注册成功!',
'resultData': True,
})
# Check whether the username is available
@require_POST
def verify_username(request):
try:
username = request.POST['username']
usernames = User.objects.values_list('username', flat=True)
if username in usernames:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '该用户名已存在!',
'resultData': False,
})
else:
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '此用户名可用!',
'resultData': True
})
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '用户名对比失败!',
'resultData': False
})
# Reset password
@require_POST
def reset_password(request):
username = request.POST['username']
password = request.POST['password']
try:
user = User.objects.get(username=username)
user.password = password
user.save()
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '重置密码失败!',
'resultData': False
})
else:
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '重置密码成功!',
'resultData': True,
})
@require_POST
@login_required
def show_config_file(request):
token = request.META.get("HTTP_AUTHORIZATION")
model_type = request.POST['model_type']
try:
path = os.path.join(BASE_DIR, r'static/common/config_data/'+ model_type + '.yaml')
data = utils.read_txt(path)
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '配置文件加载失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '配置文件加载成功!',
'resultData': ''.join(data),
})
@require_POST
@login_required
def show_service_file(request):
# token = request.META.get("HTTP_AUTHORIZATION")
service_type = request.POST['service_type']
service_name = request.POST['service_name']
example_dir = os.path.join(BASE_DIR, 'static/common/', service_type, service_name)
temp = tempfile.TemporaryFile()
archive = zipfile.ZipFile(temp, 'w', zipfile.ZIP_DEFLATED)
print(example_dir)
filenames = os.listdir(example_dir)
for filename in filenames:
archive.write(os.path.join(example_dir, filename), filename)
archive.close()
lenth = temp.tell()
temp.seek(0)
wrapper = FileWrapper(temp)
response = HttpResponse(wrapper, content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=example.zip'
response['Content-Length'] = lenth
return response
@require_POST
@login_required
@transaction.atomic
def delete_file_row_manage(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
try:
path = os.path.join(UPLOAD_FOLDER, str(path_timestamp) + '/')
if os.path.exists(path):
shutil.rmtree(path)
version_manage = VersionManage.objects.get(path=path_timestamp)
if version_manage:
model_id = version_manage.model.id
print(model_id)
all_version = VersionManage.objects.filter(model_id=model_id)
if len(all_version) == 1:
version_manage.delete()
version_manage.model.delete()
elif version_manage.state == '训练成功':
model_manage = version_manage.model
version_manage.delete()
model_manage.version_num = max(0, model_manage.version_num - 1)
model_manage.save()
else:
version_manage.delete()
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '删除失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '删除成功!',
'resultData': True,
})
@require_POST
@login_required
def file_upload(request):
token = request.META.get("HTTP_AUTHORIZATION")
files = request.FILES.getlist('files')
path_timestamp = request.POST['path_timestamp']
if not path_timestamp:
path_timestamp = int(round(time.time() * 1000000))
path = os.path.join(UPLOAD_FOLDER, str(path_timestamp))
try:
for file in files:
print('上传文件名称为%s' % file.name)
if file and (file.name.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS):
filename = secure_filename(file.name)
os.makedirs(path, exist_ok=True)
default_storage.save(os.path.join(path, filename), file)
else:
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '数据文件格式错误!',
'resultData': False,
})
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '文件上传失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '文件上传成功!',
'resultData': str(path_timestamp),
})
@require_POST
@login_required
def download_zip(request):
"""
Working implementation: package the uploaded directory as a zip and return it for download.
:param request: POST request carrying path_timestamp
:return: HttpResponse containing the zip archive, or a JSON error response
"""
path_timestamp = request.POST['path_timestamp']
token = request.META.get("HTTP_AUTHORIZATION")
print(path_timestamp)
file_dir = os.path.join(UPLOAD_FOLDER, path_timestamp)
try:
if not os.path.exists(file_dir):
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '该文件夹不存在!',
'resultData': False
})
else:
temp = tempfile.TemporaryFile()
archive = zipfile.ZipFile(temp, 'w', zipfile.ZIP_DEFLATED)
num = 0
sub_dirs = []
for cur_dir, dirs, files in os.walk(file_dir):
if num == 0:
sub_dirs = dirs
num += 1
for file in files:
sub_dir = os.path.split(cur_dir)[-1]
if sub_dir in sub_dirs:
archive.write(os.path.join(cur_dir, file), os.path.join(sub_dir, file))
else:
archive.write(os.path.join(cur_dir, file), file)
archive.close()
lenth = temp.tell()
temp.seek(0)
wrapper = FileWrapper(temp)
response = HttpResponse(wrapper, content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=archive.zip'
response['Content-Length'] = lenth
return response
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '下载失败!',
'resultData': False
})
@require_POST
def forget_password(request):
try:
username = request.POST['username']
mobile_number = request.POST['mobile_number']
mail = request.POST['mail']
user = User.objects.get(username=username)
if user.mobile_number == mobile_number:
if user.mail == mail:
mail_username = "15617380221@163.com"
mail_pwd = "2698641198cjh"
mail_sender = "15617380221@163.com"
mail_receiver = [mail]
email_title = "郑州数能AI算法小组"
interaction.sendMail(mail_username, mail_pwd, mail_sender, mail_receiver, email_title)
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '个人信息验证成功,密码已发至邮箱!',
'resultData': True
})
else:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '邮箱账号填写错误!',
'resultData': False
})
else:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '手机号填写错误!',
'resultData': False
})
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '信息对比失败!',
'resultData': False
})
@require_POST
@login_required
def show_log_file(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
try:
path = UPLOAD_FOLDER + path_timestamp
files = [filename for filename in os.listdir(path) if 'log' in filename]
log_path = os.path.join(path, files[0])
data = utils.read_txt(log_path)
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '日志文件加载失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '日志文件加载成功!',
'resultData': ''.join(data),
})
@require_POST
def validate_code(request):
pic = interaction.Picture()
img, code = pic.proPicture()
output_buffer = BytesIO()
img.save(output_buffer, format='JPEG')
byte_data = output_buffer.getvalue()
base64_str = base64.b64encode(byte_data)
base64_str = 'data:image/jpg;base64,' + str(base64_str, 'utf-8')
data = {'img': base64_str, 'code': code}
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '生成成功!',
'resultData': data,
})
@require_POST
@login_required
@transaction.atomic
def run_train(request):
token = request.META.get("HTTP_AUTHORIZATION")
task_name = request.POST['task_name']
function_type = request.POST['function_type']
model_type = request.POST['model_type']
path_timestamp = request.POST['path_timestamp']
config_file = request.POST['config_file']
version_num = request.POST['version_num']
model_id = request.POST['model_id']
creator = request.POST['creator']
create_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
config_path = os.path.join(UPLOAD_FOLDER, path_timestamp)
logger = logging.getLogger(path_timestamp)
logger.setLevel(logging.INFO)
fh = logging.FileHandler(os.path.join(config_path,'train.log'), encoding='utf8')
ch = logging.StreamHandler()
# formatter = logging.Formatter(
# '[%(asctime)s][%(thread)d][%(filename)s][line: %(lineno)d][%(levelname)s] ## %(message)s')
# fh.setFormatter(formatter)
# ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)
if not version_num:
version_num = 0
new_version = 1
else:
new_version = 0
if not model_id:
model_manage = ModelManage.objects.create(
task_name=task_name,
function_type=function_type,
model_type=model_type,
version_num=int(version_num),
create_date=create_time,
)
model_id = model_manage.id
else:
model_manage = ModelManage.objects.get(id=model_id)
if not new_version:
versions = VersionManage.objects.filter(model_id=model_id)
new_version = max([int(version.version.replace('V', '')) for version in versions])+1
version_manage = VersionManage.objects.create(model_id=model_id,
version='V'+str(new_version),
create_date=create_time,
state='正在训练',
creator=creator,
path=path_timestamp,
)
try:
config_path = interaction.update_config_file(config_path, config_file)
print(config_path)
train_dict = {'fasttext': FastTextRunner(config_path),
'xgboost': XgboostClassifyRunner(config_path),
'logistic': LogisticClassifyRunner(config_path),
# 'flair': FlairClassifyRunner(config_path),
'textcnn': TextcnnClassifyRunner(config_path),
'kmeans': KmeansRunner(config_path)}
train_dict[model_type].train(logger)
end_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
version_manage.end_date = end_time
version_manage.state = '训练成功'
version_manage.save()
model_manage.version_num = int(version_num) + 1
model_manage.save()
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '训练成功!',
'resultData': True,
})
except Exception as e:
print(e)
end_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
version_manage.end_date = end_time
version_manage.state = '训练失败'
version_manage.save()
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': str(e),
'resultData': False,
})
finally:
logging.Logger.manager.loggerDict.pop(path_timestamp)
logger.manager = None
logger.handlers = []
@require_POST
@login_required
def query_manage(request):
token = request.META.get("HTTP_AUTHORIZATION")
try:
task_name = request.POST['task_name']
function_type = request.POST['function_type']
model_type = request.POST['model_type']
begin_cdate = request.POST['begin_date']
end_cdate = request.POST['end_date']
page_size = request.POST['page_size']
current_page = request.POST['current_page']
manager_list, len_managers = interaction.select_manage(task_name, function_type, model_type, begin_cdate, end_cdate, page_size, current_page)
# manager_list = [model_to_dict(manager) for manager in manager_list]
manager_list = [ModelManage.toDict(manager) for manager in manager_list]
result = {'current_page': int(current_page), 'page_size': int(page_size), 'data': manager_list, 'total': len_managers}
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '查询失败',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功',
'resultData': result,
})
@require_POST
@login_required
def query_version(request):
token = request.META.get("HTTP_AUTHORIZATION")
try:
begin_cdate = request.POST['begin_date']
end_cdate = request.POST['end_date']
model_id = request.POST['model_id']
page_size = request.POST['page_size']
current_page = request.POST['current_page']
version_list, len_versions = interaction.select_version(model_id, begin_cdate, end_cdate, page_size, current_page)
# manager_list = [model_to_dict(manager) for manager in manager_list]
manager_list = [VersionManage.toDict(version) for version in version_list]
print(manager_list)
result = {'current_page': int(current_page), 'page_size': int(page_size), 'data': manager_list, 'total': len_versions}
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '查询失败',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功',
'resultData': result,
})
@require_POST
@login_required
def query_service_manage(request):
token = request.META.get("HTTP_AUTHORIZATION")
try:
name = request.POST['name']
begin_cdate = request.POST['begin_date']
end_cdate = request.POST['end_date']
state = request.POST['state']
page_size = request.POST['page_size']
current_page = request.POST['current_page']
username = request.POST['username']
manager_list, len_managers = interaction.select_service_manage(
name, begin_cdate, end_cdate, state,
username, page_size, current_page)
# manager_list = [model_to_dict(manager) for manager in manager_list]
manager_list = [ServiceManage.toDict(manager) for manager in manager_list]
result = {'current_page': int(current_page), 'page_size': int(page_size), 'data': manager_list, 'total': len_managers}
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '查询失败',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功',
'resultData': result,
})
@require_POST
@login_required
@transaction.atomic
def delete_file_row_service(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
try:
path = os.path.join(UPLOAD_FOLDER, str(path_timestamp))
if os.path.exists(path):
shutil.rmtree(path)
ServiceManage.objects.filter(path=path_timestamp).delete()
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '删除失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '删除成功!',
'resultData': True,
})
@require_POST
@login_required
def download_xlsx(request):
path_timestamp = request.POST['path_timestamp']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
xls_path = os.path.join(path, 'result.xlsx')
with open(xls_path, 'rb') as file:
data = file.readlines()
response = HttpResponse(data, content_type='application/vnd.ms-excel')
response['Content-Disposition'] = 'attachment; filename=result.xlsx'
return response
@require_POST
@login_required
def query_subject(request):
token = request.META.get("HTTP_AUTHORIZATION")
current_page = request.POST['current_page']
page_size = request.POST['page_size']
try:
subjects = SubjectManage.objects.all()
len_subjects = len(subjects)
page = Paginator(subjects, page_size)
maxpages = page.num_pages # total number of pages
pIndex = int(current_page)
# clamp the requested page number to the valid range
if pIndex > maxpages:
pIndex = maxpages
subject_list = [SubjectManage.toDict(subject) for subject in list(page.page(pIndex))] # records on the current page
result_data = {'current_page': int(current_page),
'page_size': int(page_size),
'data': subject_list,
'total': len_subjects}
print(result_data)
except Exception as e:
return JsonResponse({
'token': token,
'handleMsg': 'fail',
'isHandleSuccess': False,
'logs': str(e),
'resultData': None,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功!',
'resultData': result_data,
})
@require_POST
@login_required
def query_task_name(request):
token = request.META.get("HTTP_AUTHORIZATION")
task_name = request.POST['task_name']
try:
model_manages = ModelManage.objects.filter(task_name__contains=task_name)[:20]
task_names = [ModelManage.toDict(i)['task_name'] for i in model_manages]
except Exception as e:
return JsonResponse({
'token': token,
'handleMsg': 'fail',
'isHandleSuccess': False,
'logs': str(e),
'resultData': None,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功!',
'resultData': task_names,
})
from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class BasicServiceConfig(AppConfig):
name = 'basic_service'
from django.db import models
# Create your models here.
#-*- coding:utf-8 -*-
from django.test import TestCase
# Create your tests here.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/12 18:05
# @Author : 程婷婷
# @FileName: urls.py
# @Software: PyCharm
from django.urls import path
from django.conf.urls import url
from basic_service.views import views
urlpatterns = [
url(r'^ner_single', views.ner_single, name='ner_single'),
url(r'^doc-similarity-single', views.doc_similarity_single, name='doc_similarity_single'),
url(r'^associated-word-single', views.associated_word_single, name='associated_word_single'),
url(r'^word_cut', views.word_cut, name='word_cut'),
url(r'^word_pos', views.word_pos, name='word_pos'),
url(r'^new_word_find', views.new_word_find, name='new_word_find'),
url(r'^show_srl', views.show_srl, name='show_srl'),
url(r'^show_dep', views.show_dep, name='show_dep'),
url(r'^create_keywords', views.create_keywords, name='create_keywords'),
url(r'^get_summary', views.get_summary, name='get_summary'),
url(r'^word_co_occurrence', views.word_co_occurrence, name='word_co_occurrence')
]
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/13 10:02
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/12 19:54
# @Author : 程婷婷
# @FileName: basic.py
# @Software: PyCharm
import os
import jieba
import json
import requests
import jionlp as jio
from ltp import LTP
import jieba.analyse
import ahocorasick
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from platform_zzsn.settings import BASE_DIR
from model.base.views import utils
General_dict = utils.read_txt(os.path.join(BASE_DIR, 'static/base/dict_sogou.txt'))
General_dict_ = ''
for key in General_dict:
General_dict_ += ' ' + str(key.strip())
def word_cut(text):
ltp = LTP()
sentences = ltp.sent_split([text])
segment, _ = ltp.seg(sentences)
return segment
def word_pos(text):
ltp = LTP()
sentences = ltp.sent_split([text])
segment, hidden = ltp.seg(sentences)
pos = ltp.pos(hidden)
return segment, pos
class AC_Unicode:
"""稍微封装一下,弄个支持unicode的AC自动机
"""
def __init__(self):
self.ac = ahocorasick.Automaton()
def add_word(self, k, v):
# k = k.encode('utf-8')
return self.ac.add_word(k, v)
def make_automaton(self):
return self.ac.make_automaton()
def iter(self, s):
# find all added words occurring in the text s
# s = s.encode('utf-8')
return self.ac.iter(s)
def new_words_find(text):
words = list(jieba.cut(text, HMM=True))
words_copy = words.copy()
ac = AC_Unicode()
sign = [0] * len(words_copy)
for word in words:
if len(word) >= 2:
ac.add_word(word, word)
ac.make_automaton()
result_ac = ac.iter(General_dict_)
for index, key in result_ac:
try:
words.remove(key)
except:
continue
for index, word in enumerate(words_copy):
if (len(word) >= 2) and (word in words):
sign[index] = 1
return words_copy, sign
def show_srl(text):
ltp = LTP()
sentences = ltp.sent_split([text])
sentences_srl_dict, sentences_seg_dict = {}, {}
for i, sentence in enumerate(sentences):
seg, hidden = ltp.seg([sentence])
srl = ltp.srl(hidden, keep_empty=False)
sentences_seg_dict['句子' + str(i+1)+':'+str(sentence)] = seg[0]
sentences_srl_dict['句子'+str(i+1)+':'+str(sentence)] = srl[0]
return sentences_seg_dict, sentences_srl_dict
def show_dep(text):
ltp = LTP()
sentences = ltp.sent_split([text])
sentences_dep_dict, sentences_seg_dict = {}, {}
for i, sentence in enumerate(sentences):
seg, hidden = ltp.seg([sentence])
dep = ltp.dep(hidden)
sentences_seg_dict['句子'+str(i+1)+':'+str(sentence)] = seg[0]
sentences_dep_dict['句子'+str(i+1)+':'+str(sentence)] = dep[0]
return sentences_seg_dict, sentences_dep_dict
def create_keywords(text:str, topK:int, with_weight:bool)->list:
print(type(topK))
keywords = jio.keyphrase.extract_keyphrase(text, top_k=topK, with_weight=with_weight)
print(keywords)
return keywords
def ner(text):
ltp = LTP()
seg, hidden = ltp.seg([text])
entity = ltp.ner(hidden)
return seg[0], entity[0]
def related_word_recommendation(words, word_num):
# print(model.wv.most_similar(words))
# print(words.split(','), word_num)
print(words)
result = model.most_similar_cosmul(words.split(','), topn=int(word_num)) # 余弦相似度
print(result)
return result
def post_similarity(url, text_1, text_2, sim_algorithm_name):
payload = {'text_1': text_1, 'text_2': text_2, 'sim_algorithm_name': sim_algorithm_name}
headers = {
'Content-Type': 'application/json'
}
response = requests.request('POST', url, headers=headers, data=json.dumps(payload))
data = json.loads(response.text)
return data
def summary(text, summary_length):
summaries = jio.summary.extract_summary(text, summary_length)
return summaries
# zh_nlp = stanza.Pipeline('zh-hans')
# en_nlp = stanza.Pipeline('en')
# nlp_dict = {'zh': zh_nlp, 'en': en_nlp}
#model = KeyedVectors.load_word2vec_format(os.path.join(BASE_DIR, 'static/base/Tencent_AILab_ChineseEmbedding.bin'), binary=True)
# if __name__ == '__main__':
# print(word_cut('汤姆生病了。他去了医院。'))
# print(word_pos('汤姆生病了。他去了医院。'))
# print(new_words_find('白月光,形容的是一种可望不可即的人或者事物,虽然一直在心上,却从不在身边。'))
# print(new_words_find('爷青回,表示爷的青春又回来了,爷表示的是自己,将自己的身份地位抬高一个档次,像我是你大爷一样,通常用来形容那些知名的人、经典的动画、影视、游戏剧等重新复出或者是回归。'))
# show_srl('他叫汤姆去拿外衣。')
# print(show_dep('他叫汤姆去拿外衣。'))
# -*- coding: utf-8 -*-
# @Time : 2021/10/13 17:07
# @Author : ctt
# @File : co
# @Project : platform_zzsn
from basic_service.views.basic import create_keywords
import pandas as pd
import numpy as np
def Get_file_keywords(filepath, topK):
data_array = [] # per-article keyword lists (stored as strings)
set_word = [] # collection of all keywords
df = pd.read_excel(filepath)
sentences = df['内容'].tolist()
for sentence in sentences:
words = create_keywords(sentence, topK=topK, with_weight=False)
data_array.append(str(words))
for word in words:
if word not in set_word:
set_word.append(str(word))
set_word = list(set(set_word)) # deduplicated set of all keywords
return data_array, set_word
# Initialise the co-occurrence matrix
def build_matirx(set_word):
edge = len(set_word) + 1 # matrix height and width = number of keywords + 1
# build the rows independently; `[[''] * edge] * edge` would alias every row onto the same list object
matrix = [[''] * edge for _ in range(edge)]
# print(matrix.shape)
print(matrix)
print(set_word)
matrix[0][1:] = np.array(set_word)
print(matrix)
matrix = list(map(list, zip(*matrix)))
print(set_word)
matrix[0][1:] = np.array(set_word) # fill the first row and the first column with the keywords
return matrix
# Count how often each pair of keywords co-occurs
def count_matrix(matrix, formated_data):
for row in range(1, len(matrix)):
# iterate over the header row, skipping index 0
for col in range(1, len(matrix)):
# iterate over the header column, skipping index 0
# (this skips matrix[0][0], which is empty and is not a keyword)
if matrix[0][row] == matrix[col][0]:
# if the row keyword equals the column keyword, the count is 0, i.e. the diagonal is 0
matrix[col][row] = str(0)
else:
counter = 0 # initialise the counter
for ech in formated_data:
# combine the row keyword with the column keyword and look the pair up in every formatted source record
if matrix[0][row] in ech and matrix[col][0] in ech:
counter += 1
else:
continue
matrix[col][row] = str(counter)
return matrix
def main(filepath, topK):
formated_data, set_word = Get_file_keywords(filepath, topK)
matrix = build_matirx(set_word)
matrix = count_matrix(matrix, formated_data)
# data = pd.DataFrame(matrix)
return matrix
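# A hedged usage sketch (file names are hypothetical; the input Excel file must
# contain a '内容' column, as read in Get_file_keywords above):
if __name__ == '__main__':
    matrix = main('articles.xlsx', topK=10)
    pd.DataFrame(matrix).to_excel('co_occurrence.xlsx', index=False, header=False)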
import re
import pandas as pd
from collections import defaultdict, Counter
import numpy as np
import ahocorasick
import math
def read_text(file_articles, encoding='utf8'):
texts = set()
with open(file_articles, encoding=encoding) as f:
for line in f.readlines():
line = re.split(u'[^\u4e00-\u9fa50-9a-zA-Z]+', line)
for s in line:
if len(s) > 1:
texts.add(s)
print('文章数(即文本行数):{}'.format(len(texts)))
return texts
def get_ngrams_counts(texts, n, min_count):
'''
Return the occurrence counts of all n-grams in the corpus.
:param n: maximum gram length
:param min_count: minimum occurrence count; grams occurring fewer times are discarded
:return: (ngram count dict, total number of single characters)
'''
ngrams = defaultdict(int)
for t in list(texts):
for i in range(len(t)):
for j in range(1, n+1):
if i+j <= len(t):
ngrams[t[i:i+j]] += 1
ngrams = {i:j for i,j in ngrams.items() if j >= min_count}
total = 1.*sum([j for i,j in ngrams.items() if len(i) == 1])
print('字数:{}'.format(total))
return ngrams, total
def filter_with_porba(s, min_proba, total, ngrams):
'''
Compute the internal cohesion of a candidate string and keep it only if the score exceeds the length-specific threshold.
:param s: candidate string
:param min_proba: dict mapping word length to the minimum cohesion threshold
:return: True if the candidate passes, False otherwise
'''
if len(s) >= 2:
score = min([total*ngrams[s]/(ngrams[s[:i+1]]*ngrams[s[i+1:]]) for i in range(len(s)-1)])
if score > min_proba[len(s)]:
return True
else:
return False
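# In formula form, the cohesion score computed above for a candidate s of
# length n, with N the total character count and c(.) the raw n-gram counts, is
#     score(s) = min over 1 <= i < n of  N * c(s) / (c(s[:i]) * c(s[i:]))
# i.e. the worst-case ratio between the observed frequency of s and the
# frequency expected if its two parts were independent; s is kept only when
# this minimum exceeds the length-specific threshold in min_proba.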
def cut(s, n, ngrams):
'''
Segment a text using the n-gram set, preferring to leave text unsplit rather than split it wrongly.
:param s: a piece of text
:param ngrams: the filtered set of grams
:return: list of segments
'''
# count, at each position, how many substrings of length >= 2 covering it appear in the gram set
r = np.array([0]*(len(s)-1)) # coverage counts for fragments of length >= 2
for i in range(len(s)-1):
for j in range(2, n+1):
if s[i:i+j] in ngrams:
r[i:i+j-1] += 1
# splitting rule: as long as some covering substring is in the gram set, do not split; split only at positions where the coverage count in r is 0.
w = [s[0]]
for i in range(1, len(s)):
if r[i-1] > 0:
w[-1] += s[i]
else:
w.append(s[i])
return w
def is_real(s, n, ngrams):
if len(s) >= 4:
for i in range(4, n+1):
for j in range(len(s)-i+1):
if s[j:j+i] not in ngrams:
return False
return True
else:
return True
def cal_entropy(dict_gram,key):
'''
Compute the boundary entropy of a gram, separately for its left and right boundaries.
:param dict_gram: dict with 'left' and 'right' lists of neighbouring characters
:param key: the gram itself
:return: the smaller of the two boundary entropies
'''
left = dict_gram['left']
if len(set(left)) ==1 and left[0] ==' ' :
entropy_left = -1 # if the left boundary is empty, mark it with -1
else:
list_left = list(Counter(left).values())
sum_left = sum(list_left)
entropy_left = sum([-(i / sum_left) * math.log(i / sum_left) for i in list_left])
right = dict_gram['right']
if len(set(right)) ==1 and right[0] ==' ' :
entropy_right = -1 # if the right boundary is empty, mark it with -1
else:
list_right = list(Counter(right).values())
sum_right = sum(list_right)
entropy_right = sum([ -(i/sum_right)*math.log(i/sum_right) for i in list_right])
if entropy_left==-1 and entropy_right==-1:
entropy = -2 # if both boundary entropies are empty, mark the word with -2
else:
entropy = min(entropy_left, entropy_right)
return entropy
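# In formula form: for each candidate word w, with L(w) and R(w) the multisets
# of characters seen immediately to its left and right,
#     H_left(w)  = - sum_c p(c | L(w)) * log p(c | L(w))
#     H_right(w) = - sum_c p(c | R(w)) * log p(c | R(w))
# and the function returns min(H_left, H_right), using -1 for an empty side and
# -2 when both sides are empty, exactly as in the branches above.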
class AC_Unicode:
"""稍微封装一下,弄个支持unicode的AC自动机
"""
def __init__(self):
self.ac = ahocorasick.Automaton()
def add_word(self, k, v):
# k = k.encode('utf-8')
return self.ac.add_word(k, v)
def make_automaton(self):
return self.ac.make_automaton()
def iter(self, s):
# find all added words occurring in the text s
# s = s.encode('utf-8')
return self.ac.iter(s)
def get_ngrams_neighbor_ac(texts, w):
'''
Collect the characters adjacent to each candidate word: concatenate all texts into one line and match every candidate in a single pass with the Aho-Corasick automaton,
then use the match positions to record each word's left/right neighbour characters, from which the boundary entropy is computed.
'''
neighbors = {}
text_line = ''
for line in texts:
text_line += ' '+ line
print('构建AC自动机...')
ac = AC_Unicode()
for gram in w.keys():
if len(gram)>1:
ac.add_word(gram, gram)
ac.make_automaton()
result_ac = ac.iter(text_line)
print('迭代匹配结果...')
for item in result_ac:
index, key = item
if key not in neighbors.keys():
neighbors[key] = {'left': [], 'right': []}
index_left = index - len(key) + 1
if index_left - 1 >= 0:
neighbors[key]['left'].append(text_line[index_left - 1: index_left])
index_right = index
if index_right + 1 < len(text_line):
neighbors[key]['right'].append(text_line[index_right + 1: index_right + 2])
print('计算边界熵...')
ngrams_entropy = defaultdict(int)
for key in neighbors.keys():
entropy = cal_entropy(neighbors[key], key)
ngrams_entropy[key] = entropy
return ngrams_entropy
def remove_general_words_ac(dict_general_words, ws):
'''
Remove common words using a common-word dictionary: the dictionary is concatenated into one long text,
candidate words are matched against it with the Aho-Corasick automaton, and any candidate found there is deleted.
:param dict_general_words: path to the common-word dictionary (CSV)
:param ws: dict mapping candidate words to their boundary entropy
:return: remaining candidates sorted by boundary entropy, descending
'''
print('移除常用词...')
ac = AC_Unicode()
for gram in ws.keys():
if len(gram)>1:
ac.add_word(gram, gram)
General_dict = pd.read_csv(dict_general_words)
General_dict = list(General_dict['0'].values)
General_dict_ = ''
for key in General_dict:
General_dict_ += ' ' + str(key)
ac.make_automaton()
result_ac = ac.iter(General_dict_)
for index, key in result_ac:
try:
del ws[key]
except: continue
final_w = sorted(ws.items(), key=lambda item: item[1],reverse=True)
return final_w
def get_new_words( file_in, file_dict, file_out, min_count, min_proba):
'''
Discover new words.
:param file_in: input document, one article per line, utf8-encoded
:param file_dict: dictionary of common words, one word per line
:param file_out: output file, one word per line together with its boundary entropy, sorted by entropy in descending order, written as utf-8
:param min_count: minimum ngram frequency
:param min_proba: dict of minimum cohesion thresholds per word length (lengths 2, 3 and 4 are enough)
:return:
'''
import time
import pandas as pd
start = time.time()
n = 4 # default maximum gram length
df = pd.read_excel(file_in)['摘要'] # load the data (the '摘要' / abstract column)
df.dropna(inplace=True)
texts = []
for text in df:
if len(str(text)) > 10:
print(text)
texts.append(''.join(text.split()))
ngrams, total = get_ngrams_counts(texts, n, min_count) # collect the ngrams
ngrams_filter = set(i for i, j in ngrams.items() if filter_with_porba(i, min_proba, total, ngrams)) # compute cohesion and filter the ngrams by threshold
# segment the texts with the filtered ngrams
words = defaultdict(int)
for t in texts:
for i in cut(t, n, ngrams_filter):
words[i] += 1
w = {i: j for i, j in words.items() if j >= min_count} # keep only words whose frequency reaches the threshold
# Note: words and ngrams_filter (the cohesion-filtered set) are not completely identical, because segmentation can produce words that are not in ngrams.
# w = {i: j for i, j in words.items() if is_real(i, n, ngrams_filter)}
print('凝固度筛选词的长度:{}'.format(len(w)))
ws = get_ngrams_neighbor_ac(texts, w) # compute the boundary entropy of each candidate (sorted later)
final_w = remove_general_words_ac(file_dict, ws)
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”?,!【】()、。:;’‘……¥·↓/"""
count_num = 0
with open(file_out, 'w', encoding='utf-8') as writer:
for value in final_w:
word = value[0]
sign = 0
for i in word:
if i in punctuation:
sign = 1
break
print(sign)
if (len(word) >= 2) and (sign==0):
writer.write('{},{}\n'.format(word, value[1]))
count_num += 1
end = time.time()
print('新词个数:{}'.format(count_num))
print('花费时间:{}分钟'.format(round((end - start) / 60, 2)))
if __name__ == '__main__':
min_count = 1
min_proba = {2: 500, 3: 1000, 4: 500}
file_in = r'D:\临时工作\临时工作代码\企业资讯八方面-附关键词\风险管理.xlsx' # utf8
file_dict = './dict_sogou_vec.txt' # utf8
file_out = './find_words_.csv' # utf8 (written with encoding='utf-8' above)
# import pdfplumber
#
# file_path = r'C:\xxxx\practice.PDF'
#
# with pdfplumber.open(file_path) as pdf:
# page = pdf.pages[11]
# print(page.extract_text())
get_new_words(file_in, file_dict, file_out, min_count, min_proba)
from tkinter import _flatten
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from basic_service.views import basic, co_occurrence
from model.base.views.token_authorize import *
import shutil
UPLOAD_FOLDER = '/home/zzsn/ctt/platform_zzsn/media/'
# Create your views here.
@require_POST
@login_required
def doc_similarity_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
text_1 = request.POST['text_1']
text_2 = request.POST['text_2']
sim_algorithm_name = request.POST['sim_algorithm_name']
print(text_1)
print(text_2)
url = 'http://localhost:7005/doc_sim/calculate_similarity'
result = basic.post_similarity(url, text_1, text_2, sim_algorithm_name)
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def ner_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
words, entity = basic.ner(text)
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'words': words, 'pos': entity},
})
@require_POST
@login_required
def associated_word_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
word_num = request.POST['word_num']
try:
related_words = basic.related_word_recommendation(text, word_num)
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': str(e),
'resultData': None,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': related_words,
})
@require_POST
@login_required
def word_cut(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
words = basic.word_cut(text)
words = list(_flatten(words))
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': words,
})
@require_POST
@login_required
def word_pos(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
words, pos = basic.word_pos(text)
words = list(_flatten(words))
pos = list(_flatten(pos))
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'words': words, 'pos': pos},
})
@require_POST
@login_required
def new_word_find(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
words, sign = basic.new_words_find(text)
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'words': words, 'sign': sign},
})
@require_POST
@login_required
def show_srl(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
words, srl = basic.show_srl(text)
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'words': words, 'srl': srl},
})
@require_POST
@login_required
def show_dep(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
words, dep = basic.show_dep(text)
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'words': words, 'dep': dep},
})
@require_POST
@login_required
def create_keywords(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
topK = int(request.POST['topK'])
with_weight = bool(request.POST['with_weight'])
key_words = basic.create_keywords(text=text, topK=topK, with_weight=with_weight)
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'keywords': key_words},
})
@require_POST
@login_required
def get_summary(request):
token = request.META.get("HTTP_AUTHORIZATION")
text = request.POST['text']
summary_length = request.POST['summary_length']
summaries = basic.summary(text, summary_length)
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'summaries': summaries},
})
@require_POST
@login_required
def word_co_occurrence(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
pending_file = request.POST['pending_file']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
filepath = os.path.join(path, pending_file)
topK = int(request.POST['topK'])
word_matric = co_occurrence.main(filepath, topK)
if os.path.exists(path):
shutil.rmtree(path)
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': {'word_matric': word_matric},
})
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'platform_zzsn.settings')
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)
if __name__ == '__main__':
main()
from model.base.views.config import BaseConfig
from model.base.views.data import BaseDataLoader
from model.base.views.data import BaseDataProcess
from model.base.views.evaluator import BaseEvaluator
from model.base.views.loss import BaseLoss
from model.base.views.model import BaseModel
from model.base.views.runner import BaseRunner
\ No newline at end of file
from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class BaseConfig(AppConfig):
name = 'base'
from django.db import models
from datetime import datetime
# Create your models here.
class User(models.Model):
username = models.CharField(max_length=30, unique=True)
true_name = models.CharField(max_length=30)
sex = models.CharField(max_length=2)
mobile_number = models.CharField(max_length=20)
mail = models.CharField(max_length=20)
id_card = models.CharField(max_length=20)
password = models.CharField(max_length=40)
account_number = models.CharField(max_length=20)
def toDict(self):
return {'id':self.id,
'username':self.username,
'true_name':self.true_name,
'sex':self.sex,
'mobile_number':self.mobile_number,
'mail':self.mail,
'id_card':self.id_card,
'password':self.password,
'account_number':self.account_number,
# 'update_at':self.update_at.strftime('%Y-%m-%d %H:%M:%S')
}
class Meta:
db_table = 'user'
class ServiceManage(models.Model):
name = models.CharField(max_length=15)
username = models.CharField(max_length=30)
filenames = models.CharField(max_length=200)
create_date = models.DateTimeField(default=None)
end_date = models.DateTimeField(default=None)
state = models.CharField(max_length=10)
path = models.CharField(max_length=20)
def toDict(self):
return {'name': self.name,
'username': self.username,
'filenames': self.filenames,
'create_date': self.create_date.strftime('%Y-%m-%d %H:%M:%S'),
'end_date': self.end_date.strftime('%Y-%m-%d %H:%M:%S'),
'state': self.state,
'path': self.path,
}
class Meta:
db_table = 'service_manage'
class SubjectManage(models.Model):
sid = models.CharField(max_length=10, unique=True)
name = models.CharField(max_length=30)
def toDict(self):
return {'sid': self.sid,
'name': self.name,
}
class Meta:
db_table = 'subject_manage'
class ModelManage(models.Model):
task_name = models.CharField(max_length=30)
function_type = models.CharField(max_length=20)
model_type = models.CharField(max_length=20)
version_num = models.IntegerField()
create_date = models.DateTimeField(default=None)
def toDict(self):
return {'id': self.id,
'task_name': self.task_name,
'function_type': self.function_type,
'model_type': self.model_type,
'version_num': self.version_num,
'create_date': self.create_date.strftime('%Y-%m-%d %H:%M:%S'),
}
class Meta:
db_table = 'model_manage'
class VersionManage(models.Model):
model = models.ForeignKey(ModelManage, related_name='version_model', on_delete=models.CASCADE)
version = models.CharField(max_length=20)
create_date = models.DateTimeField(default=None)
end_date = models.DateTimeField(default=None)
state = models.CharField(max_length=20)
creator = models.CharField(max_length=30)
path = models.CharField(max_length=20, unique=True)
def toDict(self):
return {'id': self.id,
'version': self.version,
'create_date': self.create_date.strftime('%Y-%m-%d %H:%M:%S'),
'end_date': self.end_date.strftime('%Y-%m-%d %H:%M:%S'),
'state': self.state,
'creator': self.creator,
'path': self.path,
}
class Meta:
db_table = 'version_manage'
\ No newline at end of file
from django.test import TestCase
# Create your tests here.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/12 18:05
# @Author : 程婷婷
# @FileName: urls.py
# @Software: PyCharm
from model.base.views import views as base_views
from django.conf.urls import url
urlpatterns = [
url(r'^register-account', base_views.register_account, name='register_account'),
url(r'^verify-username', base_views.verify_username, name='verify_username'),
url(r'^login', base_views.login, name='login'),
url(r'^reset-password', base_views.reset_password, name='reset_password'),
url(r'^show-config-file', base_views.show_config_file, name='show_config_file'),
url(r'^show-service-file', base_views.show_service_file, name='show_service_file'),
url(r'^delete-file-row-manage', base_views.delete_file_row_manage, name='delete_file_row_manage'),
url(r'^delete-file-row-service', base_views.delete_file_row_service, name='delete_file_row_service'),
url(r'^file-upload', base_views.file_upload, name='file_upload'),
url(r'^show-log-file', base_views.show_log_file, name='show_log_file'),
url(r'^validate-code', base_views.validate_code, name='validate_code'),
url(r'^download-zip', base_views.download_zip, name='download_zip'),
url(r'^download-xlsx', base_views.download_xlsx, name='download_xlsx'),
url(r'^query-manage', base_views.query_manage, name='query_manage'),
url(r'^forget-password', base_views.forget_password, name='forget_password'),
url(r'^train', base_views.run_train, name='train'),
url(r'^query-service-manage', base_views.query_service_manage, name='query_service_manage'),
url(r'^query-subject', base_views.query_subject, name='query_subject'),
url(r'^query-version', base_views.query_version, name='query_version'),
url(r'^query-task-name', base_views.query_task_name, name='query_task_name')
]
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/13 11:51
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/10 14:34
# @Author : 程婷婷
# @FileName: BaseConfig.py
# @Software: PyCharm
import yaml
class BaseConfig:
def __init__(self, config_path):
self._config_path = config_path
self._parsed_file = self.load_config()
def load_config(self):
print(self._config_path)
with open(self._config_path) as yaml_file:
parsed_file = yaml.load(yaml_file, Loader=yaml.FullLoader)
return parsed_file
# if __name__ == '__main__':
# bc = BaseConfig()
# print(bc._parsed_file)
# print(bc.load_config()['data_path'])
# print(bc.load_config()['embedding'])
# print(bc.load_config()['model'])
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 9:58
# @Author : 程婷婷
# @FileName: BaseDataLoader.py
# @Software: PyCharm
import pandas as pd
from model.base.views.config.BaseConfig import BaseConfig
class BaseDataLoader:
def __init__(self, config_path):
self.data_loader_config = BaseConfig(config_path)._parsed_file['data_loader']
def read_file(self):
symbol = self.data_loader_config['dataset_path'].split('.')[-1]
if (symbol == 'xlsx') or (symbol == 'xls'):
df = pd.read_excel(r''+self.data_loader_config['dataset_path'])
elif symbol == 'csv':
df = pd.read_csv(r''+self.data_loader_config['dataset_path'], sep='\t')
else:
print('数据类型错误')
return '数据类型错误'
df.drop_duplicates(subset='content', keep='first', inplace=True)
df.dropna(subset=['content', 'title'], inplace=True)
df = df.reset_index(drop=True)
print('=================执行正文去重和去空之后共有%d条数据=============' % len(df['content']))
return df
def read_stopwords(self):
# read the stopword list
stopword_list = [k.strip() for k in open(self.data_loader_config['stopwords_path'], encoding='utf8').readlines() if
k.strip() != '']
return stopword_list
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/10 15:28
# @Author : 程婷婷
# @FileName: BaseDataProcess.py
# @Software: PyCharm
import re
import jieba
import pickle
import gensim
import logging
import numpy as np
from pyhanlp import *
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, SelectPercentile
from model.base import BaseConfig
from model.base import BaseDataLoader
from platform_zzsn.settings import BASE_DIR
format = '%(asctime)s %(levelname)s %(pathname)s %(funcName)s %(message)s'
logging.basicConfig(format=format, level=logging.INFO)
class BaseDataProcess:
def __init__(self, config_path):
self.embedding_config = BaseConfig.BaseConfig(config_path)._parsed_file['embedding']
self.process_config = BaseConfig.BaseConfig(config_path)._parsed_file['data_process']
PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
self.pla_segment = PerceptronLexicalAnalyzer()
self.bdl = BaseDataLoader.BaseDataLoader(config_path)
def clean_content(self, content):
bs = BeautifulSoup(content, 'html.parser')
return bs.text
def remove_char(self, content):
# keep Chinese characters, English letters, digits and basic punctuation
graph_filter = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\s,。\.,?\?!!;;]')
content = graph_filter.sub('', content)
return content
def jieba_tokenizer(self, content):
if self.process_config['use_stopwords']:
stopwords = self.bdl.read_stopwords()
else:
stopwords = []
return ' '.join([word for word in jieba.lcut(content) if word not in stopwords])
def pla_tokenizer(self, content):
words = list(self.pla_segment.analyze(content).toWordArray())
if self.process_config['use_stopwords']:
stopwords = self.bdl.read_stopwords()
else:
stopwords = []
return ' '.join([word for word in words if word not in stopwords])
def save(self, voc, path):
with open(path, 'wb') as voc_file:
pickle.dump(voc, voc_file)
def process(self, data, min_content=0):
processed_data = []
for record in data:
record = self.clean_content(str(record))
record = self.remove_char(record)
if len(record) > min_content:
methods = self.process_config['tokenizer']
if methods == 'PerceptronLexicalAnalyzer':
record = self.pla_tokenizer(record)
record = [w for w in record.split() if w.strip() != ''] # split the tokenizer output back into words rather than characters
else:
record = self.jieba_tokenizer(record)
record = [w for w in record.split() if w.strip() != '']
processed_data.append(' '.join(record))
else:
pass
return processed_data
def split_dataset(self, data, use_dev):
if use_dev:
train_data_set, test_dev_set = train_test_split(data,
train_size=self.process_config['train_size'],
random_state=self.process_config['random_state'],
shuffle=True)
train_data_set, test_data_set, dev_data_set = train_test_split(test_dev_set,
test_size=self.process_config['test_size'],
random_state=self.process_config['random_state'],
shuffle=True)
print(len(train_data_set) + len(test_data_set) + len(dev_data_set))
return train_data_set, test_data_set, dev_data_set
else:
train_data_set, test_data_set = train_test_split(data,
train_size=self.process_config['train_size'],
random_state=self.process_config['random_state'],
shuffle=True)
return train_data_set, test_data_set
def bag_of_words(self, data, label):
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=5)
x = vectorizer.fit_transform(data)
transformer = TfidfTransformer(norm=self.embedding_config['norm'], use_idf=self.embedding_config['use_idf'],
smooth_idf=self.embedding_config['smooth_idf'])
x = transformer.fit_transform(x).toarray()
if self.embedding_config['with_feature_selection']:
transformed_data = SelectPercentile(mutual_info_classif, percentile=20).fit_transform(x, label)
else:
transformed_data = x
os.makedirs(self.embedding_config['embedding_path'], exist_ok=True)
self.save(voc=vectorizer.vocabulary_, path=os.path.join(self.embedding_config['embedding_path'], 'tfidf.pkl'))
return transformed_data, vectorizer.get_feature_names()
def word2vec(self, data, feature_words):
model = gensim.models.word2vec.Word2Vec(sentences=data,
size=self.embedding_config['size'],
window=self.embedding_config['window'],
min_count=self.embedding_config['min_count'],
workers=self.embedding_config['workers'],
sg=self.embedding_config['sg'],
iter=self.embedding_config['iter'])
vocabulary_w2v = model.wv.vocab.keys()
count = 0
if self.embedding_config['use_Tencent']:
model_tencent = gensim.models.KeyedVectors.load_word2vec_format(
os.path.join(BASE_DIR, 'static/base/Tencent_AILab_ChineseEmbedding.bin'), binary=True)
vocabulary_tencent = model_tencent.wv.vocab.keys()
vector_matrix = np.zeros((len(feature_words), int(self.embedding_config['size']) + 200))
for word in feature_words:
if word in vocabulary_tencent:
vector_tencent = model_tencent.wv.word_vec(word)
else:
vector_tencent = np.random.randn(200)
if word in vocabulary_w2v:
vector_w2v = model.wv.word_vec(word)
else:
vector_w2v = np.random.randn(self.embedding_config['size'])
vector = np.concatenate((vector_tencent, vector_w2v))
vector_matrix[count] = vector
count += 1
else:
vector_matrix = np.zeros((len(feature_words), self.embedding_config['size']))
for word in feature_words:
if word in vocabulary_w2v:
vector_w2v = model.wv.word_vec(word)
else:
vector_w2v = np.random.randn(self.embedding_config['size'])
vector_matrix[count] = vector_w2v
count += 1
os.makedirs(self.embedding_config['embedding_path'], exist_ok=True)
model.save(os.path.join(self.embedding_config['embedding_path'], 'word2vec.model'))
return vector_matrix
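# --- Added illustration (hedged): bag_of_words above pickles the fitted vocabulary as
# tfidf.pkl; a downstream consumer can rebuild an equivalent CountVectorizer from that
# file to vectorise new, already-tokenised text consistently. The path below is a
# placeholder, not the project's real embedding directory.
def _demo_reuse_tfidf_vocabulary():
    import pickle
    from sklearn.feature_extraction.text import CountVectorizer
    with open('/path/to/embedding_dir/tfidf.pkl', 'rb') as f:
        vocab = pickle.load(f)
    vectorizer = CountVectorizer(ngram_range=(1, 1), vocabulary=vocab)
    counts = vectorizer.transform(['机器 学习 很 有趣'])
    print(counts.shape)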
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:30
# @Author : 程婷婷
# @FileName: BaseEvaluator.py
# @Software: PyCharm
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report
import logging
from model.base.views.config.BaseConfig import BaseConfig
formats = '%(asctime)s %(levelname)s %(pathname)s %(funcName)s %(message)s'
logging.basicConfig(format=formats, level=logging.INFO)
class BaseEvaluator:
def __init__(self, config_path):
self.evaluate_config = BaseConfig(config_path)._parsed_file['evaluate']
def evaluate(self, y_true, y_pred, label_mapping, logger):
result = []
y_true = list(map(str, y_true))
y_pred = list(map(str, y_pred))
logger.info('模型评估结果如下:')
if not label_mapping:
result.append(classification_report(y_true, y_pred))
logger.info(classification_report(y_true, y_pred))
else:
for value in label_mapping.values():
print([k for k,v in label_mapping.items() if v == value])
p = precision_score(y_true, y_pred, average=self.evaluate_config['average'], pos_label=str(value))
r = recall_score(y_true, y_pred, average=self.evaluate_config['average'], pos_label=str(value))
f1 = f1_score(y_true, y_pred, average=self.evaluate_config['average'], pos_label=str(value))
print({'value': value,'召回率为': r, '精确率为': p, 'F1': f1})
logger.info('标签为%s' % [k for k,v in label_mapping.items() if v == value][0])
logger.info('精确率为%.2f' %p)
logger.info('召回率为%.2f' %r)
logger.info('F1为%.2f' %f1)
result.append(str({'label': value,'recall': r, 'precision': p, 'F1': f1}))
return ' '.join(result)
# y_true = [0, 1, 2, 0, 1, 2]
# y_pred = [0, 2, 1, 0, 0, 1]
# print(BaseEvaluator())
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
import os
import yaml
import random
import smtplib
from email.mime.text import MIMEText
from django.core.paginator import Paginator
from email.mime.multipart import MIMEMultipart
from PIL import Image,ImageFont,ImageDraw,ImageFilter
from model.base.models import ModelManage, ServiceManage, VersionManage
from platform_zzsn.settings import BASE_DIR
class Picture:
def __init__(self):
self.size = (240,60)
self.mode='RGB'
self.color='white'
self.font = ImageFont.truetype(os.path.join(BASE_DIR,
'static/common/font/arial.ttf'), 36) # font face and size
def randChar(self):
basic='23456789abcdefghijklmnpqrstwxyzABCDEFGHIJKLMNPQRSTWXYZ'
return basic[random.randint(0,len(basic)-1)] # pick one random character
def randBdColor(self):
return (random.randint(64,255),random.randint(64,255),random.randint(64,255)) # background noise colour
def randTextColor(self):
return (random.randint(32, 127), random.randint(32, 127), random.randint(32, 127)) # random text colour
def proPicture(self):
new_image=Image.new(self.mode,self.size,self.color) # new image from the default mode, size and colour
drawObject=ImageDraw.Draw(new_image) # drawing handle for the image
line_num = random.randint(4,6) # number of interference lines
for i in range(line_num):
#size=(240,60)
begin = (random.randint(0, self.size[0]), random.randint(0, self.size[1]))
end = (random.randint(0, self.size[0]), random.randint(0, self.size[1]))
drawObject.line([begin, end], self.randTextColor())
for x in range(240):
for y in range(60):
tmp = random.randint(0,50)
if tmp>30: # controls how many noise points are drawn
drawObject.point((x,y),self.randBdColor())
randchar=''
for i in range(5):
rand=self.randChar()
randchar+=rand
drawObject.text([50*i+10,10],rand,self.randTextColor(),font=self.font) # draw the character
new_image = new_image.filter(ImageFilter.SHARPEN) # sharpen filter
return new_image,randchar
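# --- Added illustration (hedged): generating one captcha image and its ground-truth
# string with the Picture class above. Assumes the bundled arial.ttf is available;
# the output path is only an example.
def _demo_captcha():
    pic = Picture()
    image, code = pic.proPicture()
    image.save('/tmp/captcha_demo.jpg')
    print(code)  # the five random characters drawn into the image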
def update_config_file(config_path, config_file):
data = yaml.load(config_file, Loader=yaml.FullLoader)
data['data_loader'] = {}
model_path = data['model']['model_path']
model_name = data['model']['model_name']
if data['model']['model_path']:
data['model']['model_path'] = os.path.join(config_path, model_path)
else:
data['model']['model_path'] = os.path.join(config_path, model_name)
print(data['model']['model_path'])
embedding_path = data['embedding']['embedding_path']
if embedding_path:
data['embedding']['embedding_path'] = os.path.join(config_path, data['embedding']['embedding_path'])
else:
if data['embedding']['name']:
data['embedding']['embedding_path'] = os.path.join(config_path, data['embedding']['name'])
tokenizer_path = data['embedding']['tokenizer_path']
if tokenizer_path:
data['embedding']['tokenizer_path'] = os.path.join(config_path, data['embedding']['tokenizer_path'])
try:
test_file_path = data['data_process']['test_file_path']
train_file_path = data['data_process']['train_file_path']
except KeyError:
pass
else:
data['data_process']['test_file_path'] = os.path.join(config_path, test_file_path)
data['data_process']['train_file_path'] = os.path.join(config_path, train_file_path)
for file in os.listdir(config_path):
if ('.xls' == file[-4:]) or ('.xlsx' == file[-5:]):
xlsx_path = os.path.join(config_path, file)
data['data_loader']['dataset_path'] = xlsx_path
if 'save_fname' in data['runner'].keys():
data['runner']['save_fpath'] = os.path.join(config_path, data['runner']['save_fname'])
data['data_loader']['stopwords_path'] = os.path.join(BASE_DIR, 'static/base/baidu_stopwords.txt')
file_path = os.path.join(config_path, 'config.yaml')
with open(file_path, 'w') as yaml_file:
yaml.safe_dump(data, yaml_file, default_flow_style=False)
return file_path
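# --- Added illustration (hedged): a minimal config_file payload of the shape
# update_config_file expects (it reads the 'model', 'embedding' and 'runner' sections
# above). The field names mirror the keys accessed in the function; the values are
# illustrative assumptions, not the project's real configuration.
_EXAMPLE_CONFIG_YAML = """
model:
  model_name: model.pkl
  model_path: ''
embedding:
  name: tfidf
  embedding_path: ''
  tokenizer_path: ''
runner:
  save_fname: result.xlsx
"""
# update_config_file('/tmp/upload_dir', _EXAMPLE_CONFIG_YAML) would rewrite the relative
# names into absolute paths under /tmp/upload_dir and dump config.yaml there.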
def select_manage(task_name, function_type, model_type, begin_cdate, end_cdate, page_size, current_page):
condition = {'task_name': task_name, 'function_type': function_type, 'model_type': model_type,
'create_date__range': (begin_cdate, end_cdate,)
}
del_keys = []
for key in condition.keys():
if not condition[key]:
del_keys.append(key)
if not condition['create_date__range'][0]:
del_keys.append('create_date__range')
for key in del_keys:
condition.pop(key)
managers = ModelManage.objects.filter(**condition).order_by('-create_date')
len_managers = len(managers)
page = Paginator(managers, page_size)
maxpages = page.num_pages # total number of pages
pIndex = int(current_page)
# clamp the requested page to the valid range
if pIndex > maxpages:
pIndex = maxpages
manager_list = page.page(pIndex) # data for the current page
return list(manager_list), len_managers
def select_version(model_id, begin_cdate, end_cdate, page_size, current_page):
condition = {'model_id': model_id,
'create_date__range': (begin_cdate, end_cdate,)
}
del_keys = []
if not condition['create_date__range'][0]:
del_keys.append('create_date__range')
for key in del_keys:
condition.pop(key)
versions = VersionManage.objects.filter(**condition).order_by('-create_date')
len_versions = len(versions)
page = Paginator(versions, page_size)
maxpages = page.num_pages # total number of pages
pIndex = int(current_page)
# clamp the requested page to the valid range
if pIndex > maxpages:
pIndex = maxpages
version_list = page.page(pIndex) # data for the current page
return list(version_list), len_versions
def select_service_manage(name, begin_cdate, end_cdate, state, username, page_size, current_page):
condition = {
'name': name,
'state': state,
'create_date__range': (begin_cdate, end_cdate),
'username': username,
}
del_keys = []
for key in condition.keys():
if not condition[key]:
del_keys.append(key)
if not condition['create_date__range'][0]:
del_keys.append('create_date__range')
for key in del_keys:
condition.pop(key)
print(condition)
service_managers = ServiceManage.objects.filter(**condition).order_by('-create_date')
len_service_managers = len(service_managers)
page = Paginator(service_managers, page_size)
maxpages = page.num_pages
pIndex = int(current_page)
# clamp the requested page to the valid range
if pIndex > maxpages:
pIndex = maxpages
manager_list = page.page(pIndex) # data for the current page
return list(manager_list), len_service_managers
def sendMail(user,pwd,sender,receiver,msg_title):
mail_host = "smtp.163.com" # the 163 SMTP server
message = MIMEMultipart('alternative')
# sender of the mail
message["From"] = sender
# recipients of the mail
message["To"] = ",".join(receiver)
# subject of the mail
message["Subject"] = msg_title
# plain-text body (disabled)
# message.attach(MIMEText('您好,\n'
# ' 您当前的密码为%s, 为了保证您的账号安全,请尽快登陆重置您的密码'%msg_content, 'plain', 'utf-8'))
# HTML body
message.attach(MIMEText('<html>'
'<body>'
'<h1>Hello </h1><br> '
'<h3>To ensure the security of your account, please log in and reset your password as soon as possible.</h3>'
'<h2><a href="http://192.168.1.149:8020/reset_password/">点此重置</a></h2>'
'</body>'
'</html>', 'html', 'utf-8'))
# connect to the SMTP server over SSL
smtpObj = smtplib.SMTP_SSL(mail_host,465)
# log in to authenticate
smtpObj.login(user,pwd)
# send the mail
# arguments: sender, recipients, message body
smtpObj.sendmail(sender,receiver,message.as_string())
return True
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:29
# @Author : 程婷婷
# @FileName: BaseLoss.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:18
# @Author : 程婷婷
# @FileName: BaseModel.py
# @Software: PyCharm
from model.base.views.config.BaseConfig import BaseConfig
import os
import pickle
class BaseModel:
def __init__(self,config_path):
self.model_config = BaseConfig(config_path)._parsed_file['model']
def building_model(self, *params):
pass
def save(self, model):
dir = os.path.dirname(self.model_config['model_path'])
if not os.path.exists(dir):
os.makedirs(dir)
with open(self.model_config['model_path'], 'wb') as model_file:
pickle.dump(model, model_file)
def predict(self, model, X):
proba = model.predict_proba(X)
y_predict = model.predict(X)
return {'proba': proba, 'y_predict': y_predict}
\ No newline at end of file
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 16:36
# @Author : 程婷婷
# @FileName: BaseRunner.py
# @Software: PyCharm
from model.base.views.config.BaseConfig import BaseConfig
class BaseRunner:
def __init__(self,config_path):
self.runner_config = BaseConfig(config_path)._parsed_file['runner']
def train(self, logger):
pass
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 17:04
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 9:24
# @Author : 程婷婷
# @FileName: test.py
# @Software: PyCharm
import jieba
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2
X, y = load_digits(return_X_y=True)
print(X.shape)
print(X[:10], y[:100])
X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
print(X_new.shape)
print(X_new[:10])
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/20 16:58
# @Author : 程婷婷
# @FileName: token_authorize.py
# @Software: PyCharm
import jwt
import time
import functools
from jwt import exceptions
from django.http import JsonResponse
from platform_zzsn.settings import *
# SECRET_KEY (imported from the settings above) is the signing key used to verify that a JWT is valid and legitimate
def create_token(user):
'''Create a token based on JWT.'''
headers = {
"alg": "HS256",
"typ": "JWT"
}
exp = int(time.time() + 3*60*60)
payload = {
"id": user.id,
"name": user.username,
"exp": exp
}
token = jwt.encode(payload=payload, key=SECRET_KEY, algorithm='HS256', headers=headers).decode('utf-8')
return token
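# --- Added illustration (hedged): round-tripping a token produced by create_token.
# Assumes the PyJWT 1.x API used elsewhere in this module; the fake user object only
# needs the two attributes create_token reads (id and username).
def _demo_verify_token():
    class _FakeUser:
        id = 1
        username = 'demo'
    token = create_token(_FakeUser())
    payload = jwt.decode(token, SECRET_KEY, algorithms=['HS256'])
    print(payload['name'], payload['exp'])  # 'demo' plus the expiry timestamp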
def login_required(view_func):
@functools.wraps(view_func)
def validate_token(request, *args, **kwargs):
'''Validate the token; if it passes, the decoded payload is accepted and the wrapped view is called.'''
payload = None
msg = None
try:
token = request.META.get("HTTP_AUTHORIZATION")
payload = jwt.decode(token, SECRET_KEY, True, algorithms=['HS256'])
print(payload)
return view_func(request, *args, **kwargs)
# JWT validity / integrity checks
except exceptions.ExpiredSignatureError:
return JsonResponse({
'handle_msg': 'failure',
'is_handle_success': False,
'logs': '登录已过期'
})
except jwt.DecodeError:
return JsonResponse({
'handle_msg': 'failure',
'is_handle_success': False,
'logs': '缺少参数token'
# token failed verification
})
except jwt.InvalidTokenError:
return JsonResponse({
'handle_msg': 'failure',
'is_handle_success': False,
'logs': '缺少参数token'
# invalid token
})
return validate_token
\ No newline at end of file
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/9 11:19
# @Author : 程婷婷
# @FileName: utils.py
# @Software: PyCharm
import os
import re
import jieba
import zipfile
import pandas as pd
from docx import Document
from platform_zzsn.settings import *
def read_txt(path):
with open(path, 'r', encoding='utf8') as file:
lines = file.readlines()
return lines
def read_docx(pending_file, user_file):
jieba.load_userdict(user_file)
document = Document(pending_file)
doc_text_list = []
for para in document.paragraphs:
para_text = re.sub(r'\s', '', para.text)
if para_text:
doc_text_list.append(para_text)
return doc_text_list
def read_excel(pending_file, user_file):
jieba.load_userdict(user_file)
doc_text_list = pd.read_excel(pending_file)['content']
doc_text_list.dropna(inplace=True)
return doc_text_list
def merge_para(paras):
new_paras = []
for i, para in enumerate(paras):
if not new_paras:
new_paras.append(para)
elif (len(new_paras[-1]) < 500):
new_paras[-1] += para
else:
new_paras.append(para)
return new_paras
def filter_stopwords(para):
path = os.path.join(BASE_DIR, 'static/base/baidu_stopwords.txt')
stopword_list = [k.strip() for k in read_txt(path) if
k.strip() != '']
words = [word for word in jieba.lcut(para) if word not in stopword_list]
return words
# sort key: take the second element of a pair
def takeSecond(elem):
return elem[1]
def takeFirst_len(elem):
return len(elem[0])
def make_zip(file_dir: str, zip_path: str) -> None:
zip_f = zipfile.ZipFile(zip_path, 'w')
pre_len = len(os.path.dirname(file_dir))
for parent, dir_names, filenames in os.walk(file_dir):
for filename in filenames:
path_file = os.path.join(parent, filename)
arc_name = path_file[pre_len:].strip(os.path.sep)
zip_f.write(path_file, arc_name)
zip_f.close()
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/14 10:32
# @Author : 程婷婷
# @FileName: views.py
# @Software: PyCharm
import base64
import shutil
import logging
import tempfile
import zipfile
from io import BytesIO
from django.db import transaction
from wsgiref.util import FileWrapper
from django.core.paginator import Paginator
from werkzeug.utils import secure_filename
from django.http import JsonResponse, HttpResponse
from django.core.files.storage import default_storage
from django.views.decorators.http import require_POST
from model.base.views import utils, interaction
from model.base.views.token_authorize import *
from model.base.models import User, ModelManage, ServiceManage, SubjectManage, VersionManage
from model.classify.views.textcnn_classify.TextcnnClassifyRunner import TextcnnClassifyRunner
from model.classify.views.xgboost_classify.XgboostClassifyRunner import XgboostClassifyRunner
from model.classify.views.logistic_classify.LogisticClassifyRunner import LogisticClassifyRunner
from model.classify.views.fasttext_classify.FastTextRunner import FastTextRunner
# from classify.flair_classify.FlairClassifyRunner import FlairClassifyRunner
from model.clustering.views.KMeans.KmeansRunner import KmeansRunner
from platform_zzsn.settings import MEDIA_ROOT
print('-----------')
print(MEDIA_ROOT)
UPLOAD_FOLDER = MEDIA_ROOT
ALLOWED_EXTENSIONS = set(['yaml', 'xlsx', 'xls', 'doc', 'docx', 'txt'])
# login
@require_POST
def login(request):
username = request.POST['username']
password = request.POST['password']
try:
user = User.objects.filter(username=username)
if not user:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '用户不存在!',
'resultData': False,
})
elif user[0].password == password:
token = create_token(user[0])
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '登陆成功!',
'resultData': 'zzsn',
})
else:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '账号或密码不正确!',
'resultData': False,
})
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '登陆失败!',
'resultData': False,
})
# register a new account
@require_POST
def register_account(request):
try:
username = request.POST['username']
true_name = request.POST['true_name']
sex = request.POST['sex']
mobile_number = request.POST['mobile_number']
mail = request.POST['mail']
id_card = request.POST['id_card']
password = request.POST['password']
account_number = username + '@zzsn.cn'
user = User.objects.create(
username=username,
true_name=true_name,
sex=sex,
mobile_number=mobile_number,
mail=mail,
id_card=id_card,
password=password,
account_number=account_number,
)
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '注册失败!',
'resultData': False
})
else:
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '注册成功!',
'resultData': True,
})
# check whether the username is already taken
@require_POST
def verify_username(request):
try:
username = request.POST['username']
print(username)
usernames = User.objects.values_list('username', flat=True)
if username in usernames:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '该用户名已存在!',
'resultData': False,
})
else:
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '此用户名可用!',
'resultData': True
})
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '用户名对比失败!',
'resultData': False
})
# reset the password
@require_POST
def reset_password(request):
username = request.POST['username']
password = request.POST['password']
try:
user = User.objects.get(username=username)
user.password = password
user.save()
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '重置密码失败!',
'resultData': False
})
else:
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '重置密码成功!',
'resultData': True,
})
@require_POST
@login_required
def show_config_file(request):
token = request.META.get("HTTP_AUTHORIZATION")
model_type = request.POST['model_type']
try:
path = os.path.join(BASE_DIR, r'static/common/config_data/'+ model_type + '.yaml')
data = utils.read_txt(path)
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '配置文件加载失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '配置文件加载成功!',
'resultData': ''.join(data),
})
@require_POST
@login_required
def show_service_file(request):
# token = request.META.get("HTTP_AUTHORIZATION")
service_type = request.POST['service_type']
service_name = request.POST['service_name']
example_dir = os.path.join(BASE_DIR, 'static/common/', service_type, service_name)
temp = tempfile.TemporaryFile()
archive = zipfile.ZipFile(temp, 'w', zipfile.ZIP_DEFLATED)
print(example_dir)
filenames = os.listdir(example_dir)
for filename in filenames:
archive.write(os.path.join(example_dir, filename), filename)
archive.close()
length = temp.tell()
temp.seek(0)
wrapper = FileWrapper(temp)
response = HttpResponse(wrapper, content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=example.zip'
response['Content-Length'] = length
return response
@require_POST
@login_required
@transaction.atomic
def delete_file_row_manage(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
try:
path = os.path.join(UPLOAD_FOLDER, str(path_timestamp) + '/')
if os.path.exists(path):
shutil.rmtree(path)
version_manage = VersionManage.objects.get(path=path_timestamp)
if version_manage:
model_id = version_manage.model.id
print(model_id)
all_version = VersionManage.objects.filter(model_id=model_id)
if len(all_version) == 1:
version_manage.delete()
version_manage.model.delete()
elif version_manage.state == '训练成功':
model_manage = version_manage.model
version_manage.delete()
model_manage.version_num = max(0, model_manage.version_num - 1)
model_manage.save()
else:
version_manage.delete()
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '删除失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '删除成功!',
'resultData': True,
})
@require_POST
@login_required
def file_upload(request):
token = request.META.get("HTTP_AUTHORIZATION")
files = request.FILES.getlist('files')
path_timestamp = request.POST['path_timestamp']
if not path_timestamp:
path_timestamp = int(round(time.time() * 1000000))
path = os.path.join(UPLOAD_FOLDER, str(path_timestamp))
try:
for file in files:
print('上传文件名称为%s' % file.name)
if file and (file.name.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS):
filename = secure_filename(file.name)
os.makedirs(path, exist_ok=True)
#default_storage.save(os.path.join(path, filename), file)
with open(os.path.join(path, filename), 'wb') as f:
for chunk in file.chunks():
f.write(chunk)
else:
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '数据文件格式错误!',
'resultData': False,
})
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '文件上传失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '文件上传成功!',
'resultData': str(path_timestamp),
})
@require_POST
@login_required
def download_zip(request):
"""
Final working version, implementing: zip packaging, download, deletion.
:param filename:
:return:
"""
path_timestamp = request.POST['path_timestamp']
token = request.META.get("HTTP_AUTHORIZATION")
print(path_timestamp)
file_dir = os.path.join(UPLOAD_FOLDER, path_timestamp)
try:
if not os.path.exists(file_dir):
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '该文件夹不存在!',
'resultData': False
})
else:
temp = tempfile.TemporaryFile()
archive = zipfile.ZipFile(temp, 'w', zipfile.ZIP_DEFLATED)
num = 0
sub_dirs = []
for cur_dir, dirs, files in os.walk(file_dir):
if num == 0:
sub_dirs = dirs
num += 1
for file in files:
sub_dir = os.path.split(cur_dir)[-1]
if sub_dir in sub_dirs:
archive.write(os.path.join(cur_dir, file), os.path.join(sub_dir, file))
else:
archive.write(os.path.join(cur_dir, file), file)
archive.close()
length = temp.tell()
temp.seek(0)
wrapper = FileWrapper(temp)
response = HttpResponse(wrapper, content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=archive.zip'
response['Content-Length'] = length
return response
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '下载失败!',
'resultData': False
})
@require_POST
def forget_password(request):
try:
username = request.POST['username']
mobile_number = request.POST['mobile_number']
mail = request.POST['mail']
user = User.objects.get(username=username)
if user.mobile_number == mobile_number:
if user.mail == mail:
mail_username = "15617380221@163.com"
mail_pwd = "2698641198cjh"
mail_sender = "15617380221@163.com"
mail_receiver = [mail]
email_title = "郑州数能AI算法小组"
interaction.sendMail(mail_username, mail_pwd, mail_sender, mail_receiver, email_title)
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '个人信息验证成功,密码已发至邮箱!',
'resultData': True
})
else:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '邮箱账号填写错误!',
'resultData': False
})
else:
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '手机号填写错误!',
'resultData': False
})
except Exception as e:
print(e)
return JsonResponse({
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '信息对比失败!',
'resultData': False
})
@require_POST
@login_required
def show_log_file(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
try:
path = UPLOAD_FOLDER + path_timestamp
files = [filename for filename in os.listdir(path) if 'log' in filename]
log_path = os.path.join(path, files[0])
data = utils.read_txt(log_path)
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '日志文件加载失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '日志文件加载成功!',
'resultData': ''.join(data),
})
@require_POST
def validate_code(request):
pic = interaction.Picture()
img, code = pic.proPicture()
output_buffer = BytesIO()
img.save(output_buffer, format='JPEG')
byte_data = output_buffer.getvalue()
base64_str = base64.b64encode(byte_data)
base64_str = 'data:image/jpg;base64,' + str(base64_str, 'utf-8')
data = {'img': base64_str, 'code': code}
return JsonResponse({
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '生成成功!',
'resultData': data,
})
@require_POST
@login_required
@transaction.atomic
def run_train(request):
token = request.META.get("HTTP_AUTHORIZATION")
task_name = request.POST['task_name']
function_type = request.POST['function_type']
model_type = request.POST['model_type']
path_timestamp = request.POST['path_timestamp']
config_file = request.POST['config_file']
version_num = request.POST['version_num']
model_id = request.POST['model_id']
creator = request.POST['creator']
create_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
config_path = os.path.join(UPLOAD_FOLDER, path_timestamp)
logger = logging.getLogger(path_timestamp)
logger.setLevel(logging.INFO)
fh = logging.FileHandler(os.path.join(config_path,'train.log'), encoding='utf8')
ch = logging.StreamHandler()
# formatter = logging.Formatter(
# '[%(asctime)s][%(thread)d][%(filename)s][line: %(lineno)d][%(levelname)s] ## %(message)s')
# fh.setFormatter(formatter)
# ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)
if not version_num:
version_num = 0
new_version = 1
else:
new_version = 0
if not model_id:
model_manage = ModelManage.objects.create(
task_name=task_name,
function_type=function_type,
model_type=model_type,
version_num=int(version_num),
create_date=create_time,
)
model_id = max(ModelManage.objects.values_list('id', flat=True))
else:
model_manage = ModelManage.objects.get(id=model_id)
if not new_version:
versions = VersionManage.objects.filter(model_id=model_id)
new_version = max([int(version.version.replace('V', '')) for version in versions])+1
version_manage = VersionManage.objects.create(model_id=model_id,
version='V'+str(new_version),
create_date=create_time,
state='正在训练',
creator=creator,
path=path_timestamp,
)
try:
config_path = interaction.update_config_file(config_path, config_file)
print(config_path)
train_dict = {
# 'fasttext': FastTextRunner(config_path),
# 'xgboost': XgboostClassifyRunner(config_path),
# 'logistic': LogisticClassifyRunner(config_path),
# 'flair': FlairClassifyRunner(config_path),
# 'textcnn': TextcnnClassifyRunner(config_path),
'kmeans': KmeansRunner(config_path)}
train_dict[model_type].train(logger)
end_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
version_manage.end_date = end_time
version_manage.state = '训练成功'
version_manage.save()
model_manage.version_num = int(version_num) + 1
model_manage.save()
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '训练成功!',
'resultData': True,
})
except Exception as e:
print(e)
end_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
version_manage.end_date = end_time
version_manage.state = '训练失败'
version_manage.save()
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': str(e),
'resultData': False,
})
finally:
logging.Logger.manager.loggerDict.pop(path_timestamp)
logger.manager = None
logger.handlers = []
@require_POST
@login_required
def query_manage(request):
token = request.META.get("HTTP_AUTHORIZATION")
try:
task_name = request.POST['task_name']
function_type = request.POST['function_type']
model_type = request.POST['model_type']
begin_cdate = request.POST['begin_date']
end_cdate = request.POST['end_date']
page_size = request.POST['page_size']
current_page = request.POST['current_page']
manager_list, len_managers = interaction.select_manage(task_name, function_type, model_type, begin_cdate, end_cdate, page_size, current_page)
# manager_list = [model_to_dict(manager) for manager in manager_list]
manager_list = [ModelManage.toDict(manager) for manager in manager_list]
result = {'current_page': int(current_page), 'page_size': int(page_size), 'data': manager_list, 'total': len_managers}
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '查询失败',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功',
'resultData': result,
})
@require_POST
@login_required
def query_version(request):
token = request.META.get("HTTP_AUTHORIZATION")
try:
begin_cdate = request.POST['begin_date']
end_cdate = request.POST['end_date']
model_id = request.POST['model_id']
page_size = request.POST['page_size']
current_page = request.POST['current_page']
version_list, len_versions = interaction.select_version(model_id, begin_cdate, end_cdate, page_size, current_page)
# manager_list = [model_to_dict(manager) for manager in manager_list]
manager_list = [VersionManage.toDict(version) for version in version_list]
print(manager_list)
result = {'current_page': int(current_page), 'page_size': int(page_size), 'data': manager_list, 'total': len_versions}
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '查询失败',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功',
'resultData': result,
})
@require_POST
@login_required
def query_service_manage(request):
token = request.META.get("HTTP_AUTHORIZATION")
try:
name = request.POST['name']
begin_cdate = request.POST['begin_date']
end_cdate = request.POST['end_date']
state = request.POST['state']
page_size = request.POST['page_size']
current_page = request.POST['current_page']
username = request.POST['username']
manager_list, len_managers = interaction.select_service_manage(
name, begin_cdate, end_cdate, state,
username, page_size, current_page)
# manager_list = [model_to_dict(manager) for manager in manager_list]
manager_list = [ServiceManage.toDict(manager) for manager in manager_list]
result = {'current_page': int(current_page), 'page_size': int(page_size), 'data': manager_list, 'total': len_managers}
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': e,
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功',
'resultData': result,
})
@require_POST
@login_required
@transaction.atomic
def delete_file_row_service(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
try:
path = os.path.join(UPLOAD_FOLDER, str(path_timestamp))
if os.path.exists(path):
shutil.rmtree(path)
ServiceManage.objects.filter(path=path_timestamp).delete()
except Exception as e:
print(e)
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '删除失败!',
'resultData': False,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '删除成功!',
'resultData': True,
})
@require_POST
@login_required
def download_xlsx(request):
path_timestamp = request.POST['path_timestamp']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
xls_path = os.path.join(path, 'result.xlsx')
with open(xls_path, 'rb') as file:
data = file.readlines()
response = HttpResponse(data, content_type='application/vnd.ms-excel')
response['Content-Disposition'] = 'attachment; filename=result.xlsx'
return response
@require_POST
@login_required
def query_subject(request):
token = request.META.get("HTTP_AUTHORIZATION")
current_page = request.POST['current_page']
page_size = request.POST['page_size']
try:
subjects = SubjectManage.objects.all()
len_subjects = len(subjects)
page = Paginator(subjects, page_size)
maxpages = page.num_pages # total number of pages
pIndex = int(current_page)
# clamp the requested page to the valid range
if pIndex > maxpages:
pIndex = maxpages
subject_list = [SubjectManage.toDict(subject) for subject in list(page.page(pIndex))] # data for the current page
result_data = {'current_page': int(current_page),
'page_size': int(page_size),
'data': subject_list,
'total': len_subjects}
print(result_data)
except Exception as e:
return JsonResponse({
'token': token,
'handleMsg': 'fail',
'isHandleSuccess': False,
'logs': str(e),
'resultData': None,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功!',
'resultData': result_data,
})
@require_POST
@login_required
def query_task_name(request):
token = request.META.get("HTTP_AUTHORIZATION")
task_name = request.POST['task_name']
try:
model_manages = ModelManage.objects.filter(task_name__contains=task_name)[:20]
task_names = [ModelManage.toDict(i)['task_name'] for i in model_manages]
except Exception as e:
return JsonResponse({
'token': token,
'handleMsg': 'fail',
'isHandleSuccess': False,
'logs': str(e),
'resultData': None,
})
else:
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '查询成功!',
'resultData': task_names,
})
from model.classify.views.fasttext_classify import FastTextConfig
from model.classify.views.fasttext_classify.data import FastTextDataLoader
from model.classify.views.fasttext_classify.data import FastTextProcess
from model.classify.views.fasttext_classify import FastTextModel
from model.classify.views.fasttext_classify import FastTextEvaluator
from model.classify.views.fasttext_classify import FastTextRunner
from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class ClassifyConfig(AppConfig):
name = 'classify'
from django.db import models
# Create your models here.
from django.test import TestCase
# Create your tests here.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/12 18:05
# @Author : 程婷婷
# @FileName: urls.py
# @Software: PyCharm
from django.urls import path
from basic_service import views
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/13 11:24
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @Author : 程婷婷
# @FileName: XgboostClassifyConfig.py
# @Software: PyCharm
from model.base import BaseConfig
class FastTextConfig(BaseConfig.BaseConfig):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @Author : 程婷婷
# @FileName: XgboostClassifyEvaluator.py
# @Software: PyCharm
from model.base import BaseEvaluator
class FastTextEvaluator(BaseEvaluator.BaseEvaluator):
def __init__(self, config_path):
super().__init__(config_path)
\ No newline at end of file
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:18
# @Author : 程婷婷
# @FileName: XgboostClassifyModel.py
# @Software: PyCharm
import fasttext
from model.base import BaseModel
class FastTextModel(BaseModel.BaseModel):
def __init__(self, config_path):
super().__init__(config_path)
def building_model(self, input, autotuneValidationFile):
model = fasttext.train_supervised(input=input,
autotuneValidationFile=autotuneValidationFile,
autotuneDuration=self.model_config['autotuneDuration'],
autotuneModelSize=self.model_config['autotuneModelSize'])
model.save_model(self.model_config['model_path'])
return model
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:33
# @Author : 程婷婷
# @FileName: XgboostClassifyRunner.py
# @Software: PyCharm
from model.base import BaseRunner
from model.classify import FastTextProcess
from model.classify import FastTextModel
from model.classify import FastTextEvaluator
class FastTextRunner(BaseRunner.BaseRunner):
def __init__(self, config_path):
super().__init__(config_path)
self.ftp = FastTextProcess.FastTextProcess(config_path)
self.ftm = FastTextModel.FastTextModel(config_path)
self.fte = FastTextEvaluator.FastTextEvaluator(config_path)
def train(self, logger):
train_path, test_path = self.ftp.runner_process(logger)
model = self.ftm.building_model(input=train_path, autotuneValidationFile=test_path)
with open(test_path, encoding='utf8') as file:
test_data = file.readlines()
true_labels, predict_labels = [], []
for text in test_data:
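# NOTE: the parsing below assumes fastText-formatted lines of the form '__label__<digit> <text>';
# the [0] / [1:-1] slicing would truncate labels longer than one character.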
label = text.replace('__label__', '')[0]
text = text.replace('__label__', '')[1:-1]
true_labels.append(int(label))
predict_label = model.predict(text)[0][0].replace('__label__', '')
# print(pre_label)
predict_labels.append(int(predict_label))
evaluate_result = self.fte.evaluate(true_labels, predict_labels, label_mapping=None, logger=logger)
print(evaluate_result)
return 'success'
# if __name__ == '__main__':
# state = FastTextRunner().train()
# print(state)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 10:28
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 11:19
# @Author : 程婷婷
# @FileName: FastTextDataLoader.py
# @Software: PyCharm
from model.base import BaseDataLoader
class FastTextDataLoader(BaseDataLoader.BaseDataLoader):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:14
# @Author : 程婷婷
# @FileName: XgboostClassifyProcess.py
# @Software: PyCharm
import re
import time
from model.base import BaseDataProcess
from model.classify import FastTextDataLoader
class FastTextProcess(BaseDataProcess.BaseDataProcess):
def __init__(self, config_path):
super().__init__(config_path)
self.ftdl = FastTextDataLoader.FastTextDataLoader(config_path)
def remove_char(self, content):
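# Strips the characters matched by graph_filter: astral-plane symbols (e.g. emoji), stray surrogate
# halves, lowercase ASCII letters, newlines and other whitespace.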
graph_filter = re.compile(u'[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFFa-z\n\s]')
content = graph_filter.sub('', content)
return content
def process(self, data, min_content):
processed_data = []
i = 0
for record in data:
record = self.remove_char(record)
if len(record) > min_content:
methods = self.process_config['tokenizer']
if methods == 'PerceptronLexicalAnalyzer':
record = self.pla_tokenizer(record)
else:
record = self.jieba_tokenizer(record)
processed_data.append(record)
i += 1
else:
i += 1
pass
if (i+1)%100 == 0 or i+1 == len(data):
print(time.strftime('%Y-%m-%d %H:%M:%S'),'第',i+1,'条文本分词完毕')
return processed_data
def transform_data(self, data, labels):
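# Prepends fastText's label prefix to every tokenized document, one sample per line,
# e.g. transform_data(['经济 数据 发布'], [2]) -> ['__label__2 经济 数据 发布'].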
format_data = []
for i in range(len(data)):
fasttext_line = "__label__{} {}".format(labels[i], data[i])
format_data.append(fasttext_line)
return format_data
def runner_process(self, logger):
df = self.ftdl.read_file()
processed_data = self.process(df['content'], min_content=10)
# if self.process_config['label_encode']:
if type(df['label'][0]) == int:
labels = df['label']
else:
all_label = list(set(df['label']))
self.label_mapping = {v: k for k, v in dict(enumerate(all_label)).items()}
labels = df['label'].map(self.label_mapping)
print(labels)
format_data = self.transform_data(processed_data, labels)
if self.process_config['use_dev']:
train_data_set, test_data_set, dev_data_set = self.split_dataset(format_data, use_dev=self.process_config['use_dev'])
else:
train_data_set, test_data_set = self.split_dataset(format_data, use_dev=self.process_config['use_dev'])
with open(self.process_config['train_file_path'], 'w', encoding='utf-8') as trainf, \
open(self.process_config['test_file_path'], 'w', encoding='utf-8') as testf:
for train_row in train_data_set:
trainf.write(train_row + '\n')
for test_row in test_data_set:
testf.write(test_row + '\n')
logger.info('处理后的数据量为 %d 条' % len(format_data))
logger.info('训练集的数据量为 %d 条' % len(train_data_set))
logger.info('测试集的数据量为 %d 条' % len(test_data_set))
return self.process_config['train_file_path'], self.process_config['test_file_path']
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 11:18
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @Author : 程婷婷
# @FileName: XgboostClassifyConfig.py
# @Software: PyCharm
from model.base.views.config import BaseConfig
class FastTextConfig(BaseConfig):
def __init__(self):
super().__init__()
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @Author : 程婷婷
# @FileName: XgboostClassifyEvaluator.py
# @Software: PyCharm
from model.base.views.evaluator.BaseEvaluator import BaseEvaluator
class FlairClassifyEvaluator(BaseEvaluator):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:18
# @Author : 程婷婷
# @FileName: XgboostClassifyModel.py
# @Software: PyCharm
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from model.base.views.model.BaseModel import BaseModel
class FlairClassifyModel(BaseModel):
def __init__(self, config_path):
super().__init__(config_path)
def building_model(self, corpus, document_embeddings, label_dict, loss_weights):
# downstream classifier
classifier = TextClassifier(
document_embeddings,
label_dictionary=label_dict,
loss_weights=loss_weights
)
# model trainer
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)
model_save_path = self.model_config['model_path']
trainer.train(str(model_save_path),
learning_rate=3e-5, # use very small learning rate
mini_batch_size=16,
scheduler=OneCycleLR,
mini_batch_chunk_size=2, # optionally set this if transformer is too much for your machine
max_epochs=3, # terminate after X epochs
monitor_train=True,
monitor_test=True,
checkpoint=True
)
return classifier, trainer
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:33
# @Author : 程婷婷
# @FileName: XgboostClassifyRunner.py
# @Software: PyCharm
import os
import numpy as np
import torch
import random
from model.base.views.runner.BaseRunner import BaseRunner
from model.classify.views.flair_classify.data.FlairClassifyProcess import FlairClassifyProcess
from model.classify.views.flair_classify.FlairClassifyModel import FlairClassifyModel
from model.classify.views.flair_classify.FlairClassifyEvaluator import FlairClassifyEvaluator
class FlairClassifyRunner(BaseRunner):
def __init__(self, config_path):
super().__init__(config_path)
self.fcp = FlairClassifyProcess(config_path)
self.fcm = FlairClassifyModel(config_path)
self.fce = FlairClassifyEvaluator(config_path)
@staticmethod
def reproducibility(seed):
'''
Fix the random seeds for reproducibility
:param seed:
:return:
'''
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
def train(self):
corpus, document_embeddings, label_dict, loss_weights = self.fcp.runner_process()
model = self.fcm.building_model(
corpus=corpus,
document_embeddings=document_embeddings,
label_dict=label_dict,
loss_weights=loss_weights
)
#self.fce.evaluate(true_labels, predict_labels)
return 'success'
# if __name__ == '__main__':
# state = FlairClassifyRunner().train()
# print(state)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 10:28
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 11:46
# @Author : 程婷婷
# @FileName: FlairClassifyDataLoader.py
# @Software: PyCharm
from model.base.views.data.BaseDataLoader import BaseDataLoader
class FlairClassifyDataLoader(BaseDataLoader):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:14
# @Author : 程婷婷
# @FileName: XgboostClassifyProcess.py
# @Software: PyCharm
from flair.data import Sentence, Corpus
import re
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from flair.embeddings import TransformerDocumentEmbeddings
from model.base.views.data.BaseDataProcess import BaseDataProcess
from model.classify.views.flair_classify.data.FlairClassifyDataLoader import FlairClassifyDataLoader
class DataSet(Dataset):
def __init__(
self, data_df, tokenizer,
):
df = data_df.copy()
sep_token = tokenizer.special_tokens_map['sep_token']
self.samples = df.content.apply(lambda s: re.sub("<sep>", sep_token, s)).values
self.labels = df.label.values
self.tokenizer = tokenizer
def __len__(self):
return len(self.samples)
def __getitem__(self, index):
sample, label = self.samples[index], self.labels[index]
sentence = Sentence(sample, use_tokenizer=self.tokenizer.tokenize)
if not len(sentence):
sentence = Sentence(self.tokenizer.unk_token, use_tokenizer=self.tokenizer.tokenize)
print(sample)
print(sentence)
sentence.add_label('class', str(label))
return sentence
class FlairClassifyProcess(BaseDataProcess):
def __init__(self, config_path):
super().__init__(config_path)
self.fcdl = FlairClassifyDataLoader(config_path)
@staticmethod
def add_sep_token(content):
return re.sub('。', '。<sep>', content)
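# The literal '<sep>' marker inserted here is replaced with the tokenizer's real sep_token in
# DataSet above, so each full stop becomes a segment boundary for the transformer encoder.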
def runner_process(self):
df = self.fcdl.read_file()
df = df[df.content.apply(lambda s: s.strip()).apply(len) > 10]
df = df.reset_index(drop=True)
df['content'] = df['content'].apply(lambda s: self.add_sep_token(str(s)))
pos = df.label.value_counts()
loss_weights = (pos.sum() - pos) / pos
self.loss_weights = loss_weights.to_dict()
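# Inverse-frequency class weights: weight(label) = (N_total - N_label) / N_label, so
# under-represented classes contribute more to the classification loss.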
if self.process_config['label_encode']:
all_label = list(set(df['label']))
self.label_mapping = {v: k for k, v in dict(enumerate(all_label)).items()}
labels = df['label'].map(self.label_mapping)
print(labels)
tokenizer = AutoTokenizer.from_pretrained(self.embedding_config['pretrained_name'])
if self.process_config['use_dev']:
train_data_set, test_data_set, dev_data_set = self.split_dataset(df, use_dev=self.process_config['use_dev'])
train_set = DataSet(train_data_set, tokenizer)
test_set = DataSet(test_data_set, tokenizer)
val_set = DataSet(dev_data_set, tokenizer)
corpus = Corpus(train=train_set, dev=val_set, test=test_set)
else:
train_data_set, test_data_set = self.split_dataset(df, use_dev=self.process_config['use_dev'])
train_set = DataSet(train_data_set, tokenizer)
test_set = DataSet(test_data_set, tokenizer)
corpus = Corpus(train=train_set, test=test_set)
label_dict = corpus.make_label_dictionary()
document_embeddings = TransformerDocumentEmbeddings(
self.embedding_config['pretrained_name'], fine_tune=True
)
return corpus, document_embeddings, label_dict, loss_weights
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 11:43
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @Author : 程婷婷
# @FileName: XgboostClassifyConfig.py
# @Software: PyCharm
from model.base.views.config import BaseConfig
class LogisticClassifyConfig(BaseConfig):
def __init__(self):
super().__init__()
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @Author : 程婷婷
# @FileName: XgboostClassifyEvaluator.py
# @Software: PyCharm
from model.base.views.evaluator.BaseEvaluator import BaseEvaluator
class LogisticClassifyEvaluator(BaseEvaluator):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:18
# @Author : 程婷婷
# @FileName: XgboostClassifyModel.py
# @Software: PyCharm
import os
from sklearn import linear_model
import joblib
import heapq
import numpy as np
from sklearn import metrics
from model.base.views.model.BaseModel import BaseModel
class ensemble:
def __init__(self, name, r, data, labels, model_save_path):
self.Name = name
self.Data = data
self.Labels = labels
self.model_save_path = model_save_path
self.Num = len(labels)
self.Index = [i for i in range(self.Num)]
print(self.Name + ' | Train | Title | Number of Data | ' + str(self.Num))
self.Num_Positive = self.Labels.count(1)
self.Num_Negative = self.Labels.count(0)
print(self.Name + ' | Train | Title | Number of Positive | ' + str(self.Num_Positive))
print(self.Name + ' | Train | Title | Number of Negative | ' + str(self.Num_Negative))
print(self.Name + ' | Train | Title | Data Loaded' + '\n')
self.Ite = 1
self.Index_Retain_Train = [i for i in range(self.Num)]
self.Index_Retain_Predict = [i for i in range(self.Num)]
self.Index_Delete = {}
self.Recall = []
self.Precision = []
self.F1 = []
self.Threshold = {}
self.recall = r
self.config = True
def classifier(self, data, labels):
clf = linear_model.SGDClassifier(loss='log', penalty='l1', alpha=1e-3, class_weight='balanced',
learning_rate='optimal', eta0=0.0)
clf.fit(data, labels)
probabilities = []
probabilities_positive = []
probabilities_negative = []
tmp = clf.predict_proba(data)
for i in range(len(data)):
if labels[i] == 1:
probabilities.append(tmp[i][1])
probabilities_positive.append(tmp[i][1])
else:
probabilities.append(tmp[i][1])
probabilities_negative.append(tmp[i][1])
return clf, probabilities, probabilities_positive, probabilities_negative
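# unit() performs one filtering iteration: fit an SGD logistic model on the currently retained
# samples, set the threshold at roughly the 1st percentile of positive-class probabilities (so
# about 99% of positives stay above it), drop confidently negative samples from the retained
# pools, and record recall/precision/F1; train_title() keeps iterating until positive recall
# falls below the target self.recall.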
def unit(self):
data_train = [self.Data[idx] for idx in self.Index_Retain_Train]
labels_train = [self.Labels[idx] for idx in self.Index_Retain_Train]
num_positive = labels_train.count(1)
num_negative = labels_train.count(0)
print(self.Name + ' | Train | Title | iteration | ' + str(self.Ite) + ' | Logistic Regression ... ...')
clf_lr, probabilities_train, probabilities_positive_train, probabilities_negative_train = self.classifier(
data=data_train, labels=labels_train)
print(self.Name + ' | Train | Title | iteration | ' + str(self.Ite) + ' | Adjust Threshold ... ...')
print(heapq.nsmallest(max(int(0.01 * self.Num_Positive), 1), probabilities_positive_train))
threshold = heapq.nsmallest(max(int(0.01 * self.Num_Positive), 1), probabilities_positive_train)[-1]
Index_Retain_Train = []
for i in range(num_positive + num_negative):
if labels_train[i] == 1:
Index_Retain_Train.append(self.Index_Retain_Train[i])
elif probabilities_train[i] > threshold:
Index_Retain_Train.append(self.Index_Retain_Train[i])
self.Index_Retain_Train = Index_Retain_Train
data_predict = [self.Data[idx] for idx in self.Index_Retain_Predict]
tmp = clf_lr.predict_proba(data_predict).tolist()
probabilities_predict = list(map(list, zip(*tmp)))[1]
Predictions = [0 for i in range(self.Num)]
Index_Retain_Predict = []
self.Index_Delete[self.Ite] = []
for i in range(len(data_predict)):
if probabilities_predict[i] >= threshold:
Index_Retain_Predict.append(self.Index_Retain_Predict[i])
Predictions[self.Index_Retain_Predict[i]] = 1
else:
self.Index_Delete[self.Ite].append(self.Index_Retain_Predict[i])
self.Index_Retain_Predict = Index_Retain_Predict
recall = metrics.recall_score(self.Labels, Predictions, pos_label=1)
precision = metrics.precision_score(self.Labels, Predictions, pos_label=1)
f1 = metrics.f1_score(self.Labels, Predictions, pos_label=1)
if recall >= self.recall:
self.f1 = f1
print(self.Name + ' | Train | Title | iteration | ' + str(
self.Ite) + ' | Positive Recall | ' + '%.4f' % recall)
print(self.Name + ' | Train | Title | iteration | ' + str(
self.Ite) + ' | Positive Precision | ' + '%.4f' % precision)
print(self.Name + ' | Train | Title | iteration | ' + str(
self.Ite) + ' | Positive F1 | ' + '%.4f' % f1 + '\n')
self.Recall.append(recall)
self.Precision.append(precision)
self.F1.append(f1)
joblib.dump(clf_lr,os.path.join(
self.model_save_path ,self.Name + '_iteration_' + str(self.Ite) + '_train_title_classifier.m'))
self.Threshold[self.Ite] = threshold
self.Ite += 1
else:
print(self.Name + ' | Train | Title | iteration | ' + str(
self.Ite) + ' | Positive Recall Less Than Given Recall' + '\n')
self.Index_Retain_Predict += self.Index_Delete[self.Ite]
del self.Index_Delete[self.Ite]
self.config = False
def train_title(self):
while self.config == True:
self.unit()
return self.Threshold, self.Index_Retain_Predict, self.Index_Delete
def train_content(self, data, Index_Retain_Predict_Title, r, logger):
data_train = data
labels_train = [self.Labels[idx] for idx in Index_Retain_Predict_Title]
print(self.Name + ' | Train | Content | Number of Data | ' + str(len(labels_train)))
num_positive = labels_train.count(1)
num_negative = labels_train.count(0)
print(self.Name + ' | Train | Content | Number of Positive | ' + str(num_positive))
print(self.Name + ' | Train | Content | Number of Negative | ' + str(num_negative) + '\n')
clf_xg = linear_model.SGDClassifier(loss='log', penalty='l1', alpha=1e-3, class_weight='balanced',
learning_rate='optimal', eta0=0.0)
clf_xg.fit(data_train, labels_train)
joblib.dump(clf_xg, os.path.join(
self.model_save_path , self.Name + '_train_content_classifier.m'))
tmp = clf_xg.predict_proba(np.array(data_train)).tolist()
probabilities_predict = list(map(list, zip(*tmp)))[1]
Recall = []
Precision = []
F1 = []
Threshold = []
for t in [x / 1000 for x in range(1001)]:
Predictions = [0 for i in range(self.Num)]
for i in range(len(data_train)):
if probabilities_predict[i] >= t:
Predictions[Index_Retain_Predict_Title[i]] = 1
recall = metrics.recall_score(self.Labels, Predictions, pos_label=1)
precision = metrics.precision_score(self.Labels, Predictions, pos_label=1)
f1 = metrics.f1_score(self.Labels, Predictions, pos_label=1)
Recall.append(recall)
Precision.append(precision)
F1.append(f1)
Threshold.append(t)
if recall < r:
break
print(self.Name + ' | Train | Content | Finally | Threshold | ' + '%.4f' % Threshold[-1] + '\n')
print(self.Name + ' | Train | Content | Finally | Positive Recall | ' + '%.4f' % Recall[-1])
print(self.Name + ' | Train | Content | Finally | Positive Precision | ' + '%.4f' % Precision[-1])
print(self.Name + ' | Train | Content | Finally | Positive F1 | ' + '%.4f' % F1[-1] + '\n')
logger.info('模型评估结果如下:')
logger.info('精确率为%.2f' % Precision[-1])
logger.info('召回率为%.2f' % Recall[-1])
logger.info('F1值为%.2f' % F1[-1])
Index_Retain_Predict = []
Index_Delete = []
for i in range(len(data_train)):
if probabilities_predict[i] >= Threshold[-1]:
Index_Retain_Predict.append(Index_Retain_Predict_Title[i])
else:
Index_Delete.append(Index_Retain_Predict_Title[i])
return Threshold[-1], Index_Retain_Predict, Index_Delete
class LogisticClassifyModel(BaseModel):
def __init__(self, config_path):
super().__init__(config_path)
def building_model(self, tfidf_title=None, tfidf_content=None, labels=None, r=None, logger=None):
if not os.path.exists(self.model_config['model_path']):
os.makedirs(self.model_config['model_path'])
lr = ensemble(name=self.model_config['name'],
r=self.model_config['r'],
data=tfidf_title,
labels=labels,
model_save_path=self.model_config['model_path'])  # r is tunable: title-stage filtering stops once positive recall drops below r, then moves to the content stage.
if tfidf_title:
Threshold, self.Index_Retain_Predict_Title, Index_Delete_Title = lr.train_title()
joblib.dump(Threshold, os.path.join(self.model_config['model_path'], self.model_config['name'] + '_title_threshold.pkl'))
return Threshold, self.Index_Retain_Predict_Title, Index_Delete_Title
elif tfidf_content:
threshold, Index_Retain_Predict_Content, Index_Delete_Content = lr.train_content(
data=tfidf_content,
Index_Retain_Predict_Title=self.Index_Retain_Predict_Title,
r=0.8,
logger=logger)  # r is tunable: content-stage training finally stops once positive recall drops below r.
joblib.dump(threshold, os.path.join(self.model_config['model_path'],
self.model_config['name'] + '_content_threshold.pkl'))
return threshold, Index_Retain_Predict_Content, Index_Delete_Content
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:33
# @Author : 程婷婷
# @FileName: XgboostClassifyRunner.py
# @Software: PyCharm
from model.base.views.runner.BaseRunner import BaseRunner
from model.classify.views.logistic_classify.data.LogisticClassifyProcess import LogisticClassifyProcess
from model.classify.views.logistic_classify.LogisticClassifyModel import LogisticClassifyModel
from model.classify.views.logistic_classify.LogisticClassifyEvaluator import LogisticClassifyEvaluator
class LogisticClassifyRunner(BaseRunner):
def __init__(self, config_path):
super().__init__(config_path)
self.lcp = LogisticClassifyProcess(config_path)
self.lcm = LogisticClassifyModel(config_path)
self.lce = LogisticClassifyEvaluator(config_path)
def train(self, logger):
tfidf_title, idf_title, labels = self.lcp.title_process(logger)
Threshold,Index_Retain_Predict_Title,Index_Delete_Title = self.lcm.building_model(
tfidf_title=tfidf_title,
labels=labels,
logger=logger
)
tfidf_content, idf_content = self.lcp.content_process(Index_Retain_Predict_Title)
threshold, Index_Retain_Predict_Content, Index_Delete_Content = self.lcm.building_model(
labels = labels,
tfidf_content=tfidf_content,
r=0.8,
logger=logger
)  # r is tunable: training finally stops once recall drops below r.
return 'success'
# if __name__ == '__main__':
# state = LogisticClassifyRunner().train()
# print(state)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/17 9:08
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 14:53
# @Author : 程婷婷
# @FileName: LogisticClassifyDataLoader.py
# @Software: PyCharm
from model.base.views.data.BaseDataLoader import BaseDataLoader
class LogisticClassifyDataLoader(BaseDataLoader):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:14
# @Author : 程婷婷
# @FileName: XgboostClassifyProcess.py
# @Software: PyCharm
import re
import os
import jieba
import joblib
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from model.base.views.data.BaseDataProcess import BaseDataProcess
from model.classify.views.logistic_classify.data.LogisticClassifyDataLoader import LogisticClassifyDataLoader
class LogisticClassifyProcess(BaseDataProcess):
def __init__(self, config_path):
super().__init__(config_path)
self.lcdl = LogisticClassifyDataLoader(config_path)
def document2sentences(self, document, key_words):
symbols = frozenset(u",。!?\n:;“”|)\u3000")
out_sentences = ''
for symbol in symbols:
document = document.replace(symbol, '。')
document = document.replace('\t', '').replace('\n', '')
sentences = document.split('。')
for sentence in sentences:
for key in key_words:
weight = sentence.count(key)
sentence += '。'
out_sentences += sentence * weight
return out_sentences
def filtrate_words(self, words):
find_chinese = re.compile(u"[\u4e00-\u9fa5]+")
symbols = "[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\\@\#\\\&\*\%]"
stopwords = self.lcdl.read_stopwords()
filtrated_words = []
for j in range(len(words)):
if re.findall(find_chinese, words[j]) == []:
continue
elif re.sub(symbols, "", re.findall(find_chinese, words[j])[0]) == '':
continue
elif re.sub(symbols, "", re.findall(find_chinese, words[j])[0]) in stopwords:
continue
else:
filtrated_words.append(re.sub(symbols, "", re.findall(find_chinese, words[j])[0]))
return ' '.join(filtrated_words)
def get_chi(self, data, labels):
num = len(data)
length = len(data[0])
# print(type(labels[0]))
# print(labels[0])
print('================')
print(len(labels))
print(len(data))
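# Chi-square feature scoring over a 2x2 term/label contingency table:
#   a = positive docs containing the term, b = positive docs without it,
#   c = negative docs containing the term, d = negative docs without it,
#   chi2 = N * (a*d - b*c)^2 / ((a+b)(c+d)(a+c)(b+d)),
# where num_p = a + b and num_n = c + d in the code below.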
data_p = [data[i] for i in range(num) if int(labels[i]) == 1]
data_n = [data[i] for i in range(num) if int(labels[i]) == 0]
num_p = len(data_p)
num_n = len(data_n)
print('正样本为%s', str(num_p))
print('负样本为%s', str(num_n))
data_p_t = list(map(list, zip(*data_p)))
data_n_t = list(map(list, zip(*data_n)))
chi_square = []
for i in range(length):
b = data_p_t[i].count(0)
d = data_n_t[i].count(0)
a = num_p - b
c = num_n - d
if num_p * num_n * (a + c) * (b + d) == 0:
chi_square.append(0)
else:
chi_square.append((num * pow(a * d - b * c, 2)) / (num_p * num_n * (a + c) * (b + d)))
return chi_square
def get_vocabulary_title(self, titles_tokenized_filtered, contents_tokenized_filtered, labels):
data = [
self.embedding_config['title_weight'] * (titles_tokenized_filtered[i] + ' ') + contents_tokenized_filtered[i]
for i in range(len(labels))]
cv = CountVectorizer(ngram_range=(1, 3), min_df=2)
tf = cv.fit_transform(data)
vocabulary_list = cv.get_feature_names()
print(' | Train | Title | Vocabulary | Original Length | ' + str(len(vocabulary_list)))
num_key_words = int(len(vocabulary_list) * self.embedding_config['title_feature_ratio'])
print(' | Train | Title | Vocabulary | Length | ' + str(num_key_words))
print(tf.toarray())
tf_weights = tf.toarray().tolist()
chi_square = self.get_chi(tf_weights, labels)
print(' | Train | Title | Vocabulary | Complete by CHI ......')
original_vocabulary_chi_square = [(vocabulary_list[i], chi_square[i]) for i in range(len(vocabulary_list))]
sorted_original_vocabulary_chi_square = sorted(original_vocabulary_chi_square, key=lambda x: x[1], reverse=True)
vocabulary_list = [sorted_original_vocabulary_chi_square[i][0] for i in range(num_key_words)]
vocabulary_title = {}
k = 0
for word in vocabulary_list:
vocabulary_title[word] = k
k += 1
return vocabulary_title
def get_tfidf_title(self, titles_tokenized_filtered, contents_tokenized_filtered, vocabulary_title):
data = [
self.embedding_config['title_weight'] * (titles_tokenized_filtered[i] + ' ') + contents_tokenized_filtered[i]
for i in range(len(self.labels))]
cv = CountVectorizer(ngram_range=(1, 3), vocabulary=vocabulary_title)
train_tf = cv.fit_transform(data)
print(' | Train | Title | TF | Completed ......')
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
train_tfidf = tfidf_transformer.fit_transform(train_tf)
train_tfidf_weights = train_tfidf.toarray().tolist()
print(' | Train | Title | TFIDF | Completed ......')
idf = tfidf_transformer.idf_.tolist()
return train_tfidf_weights, idf
def get_vocabulary_content(self, contents_tokenized_filtered, labels, index):
data = [contents_tokenized_filtered[idx] for idx in index]
labels = [labels[idx] for idx in index]
tf_transformer = CountVectorizer(ngram_range=(1, 3), min_df=2)
tf = tf_transformer.fit_transform(data)
vocabulary_list = tf_transformer.get_feature_names()
print(' | Train | Content | Vocabulary | Original Length | ' + str(len(vocabulary_list)))
num_key_words = int(len(vocabulary_list) * self.embedding_config['content_feature_ratio'])
print(' | Train | Content | Vocabulary | Length | ' + str(num_key_words))
tf_weights = tf.toarray().tolist()
chi_square = self.get_chi(tf_weights, labels)
print(' | Train | Content | Vocabulary | Complete by CHI ......')
original_vocabulary_chi_square = [(vocabulary_list[i], chi_square[i]) for i in range(len(vocabulary_list))]
sorted_original_vocabulary_chi_square = sorted(original_vocabulary_chi_square, key=lambda x: x[1], reverse=True)
vocabulary_list = [sorted_original_vocabulary_chi_square[i][0] for i in range(num_key_words)]
self.vocabulary_content = {}
k = 0
for word in vocabulary_list:
self.vocabulary_content[word] = k
k += 1
return self.vocabulary_content
def get_tfidf_content(self, contents_tokenized_filtered, vocabulary_content, index):
data = [contents_tokenized_filtered[idx] for idx in index]
tf_transformer = CountVectorizer(ngram_range=(1, 3), vocabulary=vocabulary_content)
train_tf = tf_transformer.fit_transform(data)
print(' | Train | Content | TF | Completed ......')
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
train_tfidf = tfidf_transformer.fit_transform(train_tf)
train_tfidf_weights = train_tfidf.toarray().tolist()
print(' | Train | Content | TFIDF | Completed ......')
idf = tfidf_transformer.idf_.tolist()
return train_tfidf_weights, idf
def title_process(self, logger):
df = self.lcdl.read_file()
key_words = []
for word in list(set(df['key_words'])):
if str(word) and str(word) != 'nan':
key_words.append(word)
jieba.add_word(str(word))
df.dropna(subset=['content', 'label'], inplace=True)
df = shuffle(df)
df = df.reset_index(drop=True)
all_label = list(set(df['label']))
self.label_mapping = {v: k for k, v in dict(enumerate(all_label)).items()}
df['label'] = df['label'].map(self.label_mapping)
print('有用的数据共%d条' % len(df))
logger.info('处理后的数据量为 %d 条' %len(df))
train_set, test_set = self.split_dataset(df, use_dev=self.process_config['use_dev'])
logger.info('训练集的数据量为 %d 条' % len(train_set))
logger.info('测试集的数据量为 %d 条' % len(test_set))
train_set = train_set.reset_index(drop=True)
self.labels = train_set['label']
train_set['content'] = [self.document2sentences(content, key_words) for content in train_set['content']]
titles_tokenized = [jieba.lcut(sentences) for sentences in train_set['title']]
contents_tokenized = [jieba.lcut(sentences) for sentences in train_set['content']]
titles_tokenized_filtered = [self.filtrate_words(words) for words in titles_tokenized]
print(' | Train | Content | Filtered ......')
self.contents_tokenized_filtered = [self.filtrate_words(words) for words in contents_tokenized]
vocabulary_title = self.get_vocabulary_title(titles_tokenized_filtered,
self.contents_tokenized_filtered,
self.labels)
# joblib.dump(vocabulary_title, filename=)
tfidf_title, idf_title = self.get_tfidf_title(titles_tokenized_filtered,
self.contents_tokenized_filtered,
vocabulary_title)
labels = self.labels.tolist()
if not os.path.exists(self.embedding_config['embedding_path']):
os.makedirs(self.embedding_config['embedding_path'])
joblib.dump(vocabulary_title, filename=os.path.join(
self.embedding_config['embedding_path'] ,self.embedding_config['name']+'_vocabulary_title.pkl'))
joblib.dump(idf_title, filename=os.path.join(
self.embedding_config['embedding_path'] ,self.embedding_config['name']+'_idf_title.pkl'))
return tfidf_title, idf_title, labels
def content_process(self, Index_Retain_Predict_Title):
vocabulary_content = self.get_vocabulary_content(self.contents_tokenized_filtered,
self.labels,
Index_Retain_Predict_Title)  # feature_ratio is tunable: it caps the vocabulary size to avoid excessive runtime or memory use.
tfidf_content, idf_content = self.get_tfidf_content(self.contents_tokenized_filtered,
vocabulary_content,
Index_Retain_Predict_Title)
if not os.path.exists(self.embedding_config['embedding_path']):
os.makedirs(self.embedding_config['embedding_path'])
joblib.dump(vocabulary_content, os.path.join(
self.embedding_config['embedding_path'] ,self.embedding_config['name']+'_vocabulary_content.pkl'))
joblib.dump(idf_content, os.path.join(
self.embedding_config['embedding_path'], self.embedding_config['name'] + '_idf_content.pkl'))
return tfidf_content, idf_content
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 14:47
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @Author : 程婷婷
# @FileName: XgboostClassifyConfig.py
# @Software: PyCharm
from model.base.views.config import BaseConfig
class TextcnnConfig(BaseConfig):
def __init__(self):
super().__init__()
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @Author : 程婷婷
# @FileName: XgboostClassifyEvaluator.py
# @Software: PyCharm
from model.base.views.evaluator.BaseEvaluator import BaseEvaluator
class TextcnnClassifyEvaluator(BaseEvaluator):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:18
# @Author : 程婷婷
# @FileName: XgboostClassifyModel.py
# @Software: PyCharm
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from keras.layers.merge import concatenate
from keras.utils import np_utils
from tensorflow.python.keras.regularizers import l2
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout, Dense, Input
from keras.models import Model
from model.base.views.model.BaseModel import BaseModel
class TextcnnClassifyModel(BaseModel):
def __init__(self, config_path):
super().__init__(config_path)
def building_model(self,
x_train_padded_seqs,
y_train,
x_test_padded_seqs,
y_test,
embedding_matrix,
classes_weight,
vocab):
# Build the TextCNN model
main_input = Input(shape=(self.model_config['input_shape'],), dtype='float32')
# Word embedding using the pre-trained vectors; 768 is the embedding dimension
embedder = Embedding(len(vocab) + 1, 768, input_length=self.model_config['input_shape'], weights=[embedding_matrix], trainable=False)
# embedder = Embedding(len(vocab) + 1, 300, input_length=50, trainable=False)
embed = embedder(main_input)
# kernel_size: an integer (or a list/tuple of a single integer), the window length of the convolution kernel
# padding='same' zero-pads the borders so the output length matches the input length
cnn1 = Conv1D(filters=256,
kernel_size=3,
padding='same',
strides=1,
activation=self.model_config['activation'],
kernel_regularizer=l2(0.05))(embed)
# pool_size: size of the pooling window
cnn1 = MaxPooling1D(pool_size=int(self.model_config['input_shape'])-2)(cnn1)
cnn2 = Conv1D(filters=256,
kernel_size=4,
padding='same',
strides=1,
activation=self.model_config['activation'],
kernel_regularizer=l2(0.05))(embed)
cnn2 = MaxPooling1D(pool_size=int(self.model_config['input_shape'])-3)(cnn2)
cnn3 = Conv1D(filters=256,
kernel_size=5,
padding='same',
strides=1,
activation=self.model_config['activation'],
kernel_regularizer=l2(0.05))(embed)
cnn3 = MaxPooling1D(pool_size=int(self.model_config['input_shape'])-4)(cnn3)
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
# units: the output dimension of this layer
main_output = Dense(units=2, activation='softmax')(drop)
model = Model(inputs=main_input, outputs=main_output)
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
one_hot_labels = np_utils.to_categorical(y_train, num_classes=2)
model.fit(x_train_padded_seqs,
one_hot_labels,
batch_size=self.model_config['batch_size'],
epochs=self.model_config['epochs'],
shuffle=self.model_config['shuffle'],
class_weight=classes_weight)
model.save(self.model_config['model_path'])
return model
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:33
# @Author : 程婷婷
# @FileName: XgboostClassifyRunner.py
# @Software: PyCharm
import numpy as np
from model.base.views.runner.BaseRunner import BaseRunner
from model.classify.views.textcnn_classify.data.TextcnnClassifyProcess import TextcnnClassifyProcess
from model.classify.views.textcnn_classify.TextcnnClassifyModel import TextcnnClassifyModel
from model.classify.views.textcnn_classify.TextcnnClassifyEvaluator import TextcnnClassifyEvaluator
class TextcnnClassifyRunner(BaseRunner):
def __init__(self, config_path):
super().__init__(config_path)
self.tcp = TextcnnClassifyProcess(config_path)
self.tcm = TextcnnClassifyModel(config_path)
self.tce = TextcnnClassifyEvaluator(config_path)
def train(self, logger):
x_train_padded_seqs, train_label, x_test_padded_seqs, test_label = self.tcp.runner_process(logger)
classes_weight = self.tcp.class_weight(train_label)
print(classes_weight)
model = self.tcm.building_model(
x_train_padded_seqs=x_train_padded_seqs,
y_train=train_label,
x_test_padded_seqs=x_test_padded_seqs,
y_test=test_label,
embedding_matrix=self.tcp.embedding_matrix,
classes_weight=classes_weight,
vocab=self.tcp.vocab
)
result = model.predict(x_test_padded_seqs) # predicted probability of each class for every test sample
predict_label = np.argmax(result, axis=1) # take the label with the highest probability
self.tce.evaluate(test_label, predict_label, self.tcp.label_mapping, logger)
return 'success'
# if __name__ == '__main__':
# state = TextcnnClassifyRunner().train()
# print(state)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/17 9:08
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 15:53
# @Author : 程婷婷
# @FileName: TextcnnClassifyDataLoader.py
# @Software: PyCharm
from model.base.views.data.BaseDataLoader import BaseDataLoader
class TextcnnClassifyDataLoader(BaseDataLoader):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:14
# @Author : 程婷婷
# @FileName: XgboostClassifyProcess.py
# @Software: PyCharm
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from bert_serving.client import BertClient
import joblib
from sklearn.utils import class_weight
from model.base.views.data.BaseDataProcess import BaseDataProcess
from model.classify.views.textcnn_classify.data.TextcnnClassifyDataLoader import TextcnnClassifyDataLoader
class TextcnnClassifyProcess(BaseDataProcess):
def __init__(self, config_path):
super().__init__(config_path)
self.tcdl = TextcnnClassifyDataLoader(config_path)
def tokenier(self, data, label):
tokenizer = Tokenizer() # create a Tokenizer object
tokenizer.fit_on_texts(data) # assign word indices, ordered by word frequency
self.vocab = tokenizer.word_index # the index assigned to each word
df = pd.DataFrame(columns=['content', 'label'])
df['content'] = data
df['label'] = label
train_set, test_set = self.split_dataset(df, use_dev=self.process_config['use_dev'])
x_train_word_ids = tokenizer.texts_to_sequences(train_set['content'])
x_test_word_ids = tokenizer.texts_to_sequences(test_set['content']) # convert texts to integer sequences
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=3500)
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=3500)
# with open(tokenizer_path, 'wb') as file:
# pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)
joblib.dump(tokenizer, filename=self.embedding_config['tokenizer_path'])
return x_train_padded_seqs, train_set['label'], x_test_padded_seqs, test_set['label']
def get_embeddingMatrix(self, vocab):
# Initialize the big matrix that stores all word vectors; note the extra first row of zeros, reserved for padding.
embedding_matrix = np.zeros((len(vocab) + 1, 768))
bert_client = BertClient(port=5558, port_out=5559)
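# Assumes a bert-serving-server instance is already listening on ports 5558/5559; each vocabulary
# word is encoded to a 768-dimensional vector and written into its row of the matrix.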
for word, i in vocab.items():
try:
# print(word)
embedding_vector = bert_client.encode(word.split(' '))
if embedding_vector.shape == (1, 768):
embedding_vector = embedding_vector.mean(axis=0)
embedding_matrix[i] = embedding_vector
else:
print(embedding_vector.shape)
print('----------类型错误----------')
except KeyError:
continue
return embedding_matrix
def class_weight(self, y_train):
weight = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
classes_weight = dict(enumerate(weight))
return classes_weight
def runner_process(self, logger):
df = self.tcdl.read_file()
all_label = list(set(df['label']))
self.label_mapping = {v: k for k, v in dict(enumerate(all_label)).items()}
labels = df['label'].map(self.label_mapping)
processed_data = self.process(df['content'], min_content=self.process_config['min_content'])
print(processed_data)
x_train_padded_seqs, train_label, x_test_padded_seqs, test_label = self.tokenier(processed_data,
labels)
logger.info('处理后的数据量为 %d 条' % (len(train_label) + len(test_label)))
logger.info('训练集的数据量为 %d 条' % len(train_label))
logger.info('测试集的数据量为 %d 条' % len(test_label))
self.embedding_matrix = self.get_embeddingMatrix(self.vocab)
joblib.dump(self.embedding_matrix, filename=self.embedding_config['embedding_path'])
return x_train_padded_seqs, train_label, x_test_padded_seqs, test_label
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 15:52
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
from django.shortcuts import render
# Create your views here.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @Author : 程婷婷
# @FileName: XgboostClassifyConfig.py
# @Software: PyCharm
from model.base.views.config import BaseConfig
class XgboostClassifyConfig(BaseConfig):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @Author : 程婷婷
# @FileName: XgboostClassifyEvaluator.py
# @Software: PyCharm
from model.base.views.evaluator.BaseEvaluator import BaseEvaluator
class XgboostClassifyEvaluator(BaseEvaluator):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:18
# @Author : 程婷婷
# @FileName: XgboostClassifyModel.py
# @Software: PyCharm
import scipy.sparse.csr
import scipy.sparse.csc
import pickle
import numpy as np
from xgboost import XGBClassifier
from model.base.views.model.BaseModel import BaseModel
class XgboostClassify(object):
def __init__(self, label_dict, signature, lr=0.1, reg_alpha=0, reg_lambda=1, objective='binary:logitraw', \
with_sample_weight=True, subsample=1, min_child_weight=1, scale_pos_weight=1, thres=0.5):
self.lr = lr
self.label_dict = label_dict
self.signature = signature
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.objective = objective
self.with_sample_weight = with_sample_weight
self.min_child_weight = min_child_weight
self.scale_pos_weight = scale_pos_weight
self.thres = thres
self.clf = None
def set_signature(self, new_signature):
self.signature = new_signature
def train(self, X, Y, save_to=None):
print(len(self.label_dict))
assert len(self.label_dict) == 2, 'It should have exactly two classes.'
if isinstance(X, scipy.sparse.csr.csr_matrix):
data = X.tocsc()
elif isinstance(X, np.ndarray):
data = X
else:
data = np.array(X, copy=False)
if isinstance(Y, scipy.sparse.csr.csr_matrix):
label = Y.todense()
else:
label = np.array(Y, copy=False)
if len(np.unique(label)) == 1:
print('Only contains one label, training stopped.')
return
N_0 = np.sum(label == 0)
N_1 = np.sum(label == 1)
w_0 = (N_0 + N_1) / (2. * N_0)
w_1 = (N_0 + N_1) / (2. * N_1)
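# Balanced sample weights: w_c = N / (2 * N_c), so both classes contribute equally to the loss
# when with_sample_weight is enabled.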
self.clf = XGBClassifier(reg_alpha=self.reg_alpha, reg_lambda=self.reg_lambda, objective=self.objective, \
min_child_weight=self.min_child_weight, scale_pos_weight=self.scale_pos_weight,
learning_rate=self.lr)
if self.with_sample_weight:
self.clf.fit(data, label, sample_weight=[w_0 if l == 0 else w_1 for l in label])
else:
self.clf.fit(data, label)
# print('Finished.')
if save_to:
self.save(save_to)
def save(self, save_to):
file_name = save_to + ('-%s.xgb' % self.signature)
with open(file_name, 'wb') as f:
pickle.dump((self.clf, self.label_dict, self.signature), f)
@staticmethod
def load(file_path):
with open(file_path, 'rb') as f:
clf, label_dict, signature = pickle.load(f)
xgb = XgboostClassify(label_dict, signature)
xgb.clf = clf
return xgb
def predict(self, X, thres=0.5, return_real_label=False):
prob = self.predict_pro(X)
label = np.zeros((prob.shape[0],))
label[prob[:, 1] >= thres] = 1
if return_real_label:
return [self.label_dict[l] for l in label]
else:
return label.astype(np.int64)
def sigmoid(self, x):
return 1 / (1 + np.exp(-x))
def predict_pro(self, X):
if not (isinstance(X, scipy.sparse.csr.csr_matrix) or isinstance(X, np.ndarray) or isinstance(X,
scipy.sparse.csc.csc_matrix)):
X = np.array(X, copy=False)
if isinstance(X, scipy.sparse.csr.csr_matrix):
X = X.tocsc()
if self.clf and X.shape[0] > 0:
if len(X.shape) == 1:
X = [X]
prob = self.clf.predict_proba(X)
prob = np.array([self.sigmoid(i) for i in prob[:]])
return prob
else:
if not self.clf:
print('模型还没训练,请先训练模型')
else:
print('数据不能为空')
class XgboostClassifyModel(BaseModel):
def __init__(self, config_path):
super().__init__(config_path)
def building_model(self, label_dict, signature, X_train, Y_train):
xgb = XgboostClassify(label_dict,
signature,
lr=self.model_config['lr'],
reg_alpha=self.model_config['reg_alpha'],
reg_lambda=self.model_config['reg_lambda'],
objective=self.model_config['objective'],
with_sample_weight=self.model_config['with_sample_weight'],
subsample=self.model_config['subsample'],
thres=self.model_config['thres'],
min_child_weight=self.model_config['min_child_weight'],
scale_pos_weight=self.model_config['scale_pos_weight'])
clf_save_to = self.model_config['model_path']
print('开始训练')
xgb.train(X_train, Y_train, save_to=clf_save_to)
print('训练结束')
return xgb
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:33
# @Author : 程婷婷
# @FileName: XgboostClassifyRunner.py
# @Software: PyCharm
import time
import numpy as np
from model.base.views.runner.BaseRunner import BaseRunner
from model.classify.views.xgboost_classify.data.XgboostClassifyProcess import XgboostClassifyProcess
from model.classify.views.xgboost_classify.XgboostClassifyModel import XgboostClassifyModel
from model.classify.views.xgboost_classify.XgboostClassifyEvaluator import XgboostClassifyEvaluator
class XgboostClassifyRunner(BaseRunner):
def __init__(self, config_path):
super().__init__(config_path)
self.signature = int(time.time())
self.xcp = XgboostClassifyProcess(config_path)
self.xcm = XgboostClassifyModel(config_path)
self.xce = XgboostClassifyEvaluator(config_path)
def train(self, logger):
train_set, test_set = self.xcp.runner_process(signature=self.signature)
print(self.xcp.label_mapping)
label_dict = self.xcp.label_mapping
X_train = np.delete(train_set, -1, axis=1)
Y_train = train_set[:, -1].astype(np.int64)
print(X_train.shape)
print(Y_train)
print(list(set(Y_train)))
logger.info('处理后的数据量为 %d 条' %(len(train_set)+len(test_set)))
logger.info('训练集的数据量为 %d 条'%len(train_set))
logger.info('测试集的数据量为 %d 条'%len(test_set))
print('==========训练集有%d条数据==========' %len(X_train))
model = self.xcm.building_model(
label_dict,
self.signature,
X_train,
Y_train
)
# xg = XgboostClassify(label_dict=self.xcp.label_mapping, signature=self.signature)
X_test = np.delete(test_set, -1, axis=1)
true_label = test_set[:, -1].astype(np.int64)
print(list(set(true_label)))
predict_label = model.predict(X_test, thres=self.runner_config['thres'])
predict_label = predict_label.tolist()
print(list(set(predict_label)))
self.xce.evaluate(true_label, predict_label, label_dict, logger)
return 'success'
# if __name__ == '__main__':
# state = XgboostClassifyRunner().train()
# print(state)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/17 9:08
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:14
# @Author : 程婷婷
# @FileName: XgboostClassifyProcess.py
# @Software: PyCharm
import numpy as np
from sklearn.utils import class_weight
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import mutual_info_classif, SelectPercentile
import scipy.linalg
import jieba
from sklearn.base import BaseEstimator, TransformerMixin
from model.base.views.data.BaseDataProcess import BaseDataProcess
from model.classify.views.xgboost_classify.data.XgoostClassifyDataLoader import XgboostClassifyDataLoader
class Vocabulary:
def __init__(self, signature, min_word_len=2, name='voc'):
self.signature = signature
self.min_word_len = min_word_len
self.name = name
self.voc = dict()
self.freq = dict()
self.doc_freq = dict()
self.oov = None
self.size = 0
self._fixed_voc = False
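# Vocabulary maps word -> integer id (self.voc) and tracks term frequency (self.freq) and document
# frequency (self.doc_freq) keyed by id; once fixed, it is passed to CountVectorizer via to_dict()
# in DataProcessor.transform().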
def set_state(self, fixed=False):
assert fixed in [True, False, 0, 1]
self._fixed_voc = fixed
def get_state(self):
state = 'Fixed' if self._fixed_voc else 'Not fixed'
return state
def shuffle(self):
self.check_state()
idx = np.random.permutation(self.size)
shuffled_voc = dict()
shuffled_freq = dict()
shuffled_doc_freq = dict()
for key, id in self.voc.items():
shuffled_voc[key] = idx[id]
shuffled_freq[idx[id]] = self.freq[id]
shuffled_doc_freq[idx[id]] = self.doc_freq[id]
del self.voc, self.freq, self.doc_freq
self.voc, self.freq, self.doc_freq = shuffled_voc, shuffled_freq, shuffled_doc_freq
def _is_useless(self, x):
if len(x) < self.min_word_len:
return True
if x.strip(
'''#&$_%^*-+=<>`~!@(())??/\\[]{}—"';::;,。,.‘’“”|…\n abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890''') == '':
return True
return False
def update(self, words):
if self._fixed_voc:
raise Exception('Fixed vocabulary does not support update.')
for word in words:
if not self._is_useless(word):
id = self.voc.get(word, None)
if id is None: # new word
self.voc[word] = self.size
self.freq[self.size] = 1
self.doc_freq[self.size] = 0 # create doc_freq item
self.size += 1
else:
self.freq[id] += 1
for word in set(words):
if not self._is_useless(word):
id = self.voc.get(word, None)
if id is not None:
self.doc_freq[id] += 1 # update doc_freq
def get(self, word):
return self.voc.get(word, self.oov)
def __getitem__(self, word):
return self.voc.get(word, self.oov)
def __contains__(self, word):
return self.voc.__contains__(word)
def __iter__(self):
return iter(self.voc)
def __sizeof__(self):
return self.voc.__sizeof__() + self.freq.__sizeof__() + self.signature.__sizeof__() + self.size.__sizeof__() + \
self.name.__sizeof__() + self._fixed_voc.__sizeof__() + self.oov.__sizeof__() + self.doc_freq.__sizeof__()
def __delitem__(self, word): # delete would destroy the inner representation
if self._fixed_voc:
raise Exception('Fixed vocabulary does not support deletion.')
else:
raise NotImplementedError
def get_size(self):
return self.size
def clear(self):
del self.voc, self.freq, self.doc_freq
self.voc = dict()
self.freq = dict()
self.doc_freq = dict()
self.size = 0
self._fixed_voc = False
def check_state(self):
return len(self.voc) == self.size and len(self.freq) == self.size and len(self.doc_freq) == self.size
def to_dict(self):
return self.voc
def set_signature(self, new_signature):
self.signature = new_signature
def save(self, file_name=None):
save_to = (file_name if file_name else self.name) + '-%s.voc' % self.signature
with open(save_to, 'wb') as f:
pickle.dump([self.voc,
self.freq,
self.doc_freq,
self.size,
self.min_word_len,
self.oov,
self._fixed_voc,
self.name,
self.signature], f)
@classmethod
def load(cls, file_name):
with open(file_name, 'rb') as f:
[voc, freq, doc_freq, size, min_word_len, oov, _fixed, name, signature] = pickle.load(f)
voc_from_file = cls(signature, name=name)
voc_from_file.voc = voc
voc_from_file.freq = freq
voc_from_file.doc_freq = doc_freq
voc_from_file.size = size
voc_from_file.min_word_len = min_word_len
voc_from_file.oov = oov
voc_from_file._fixed_voc = _fixed
voc_from_file.signature = signature
return voc_from_file
class DataProcessor:
def __init__(self, data, transformer='tf', transformer_norm='l2'):
self.data = data
transformer = transformer.lower()
assert transformer in ['tf', 'tfidf']
self.transformer_type = transformer
self.transformer_norm = transformer_norm
self.transformer = None
def reset(self):
self.transformer = None
self.cv = None
def preprocess(self, label_dict, _all=False, _emotion=False):
processed_data = {}
processed_label = {}
processed_label_dict = {}
# only_have_one_label_key = []
for key in self.data:
print(key)
if not _emotion: # _all=False, _emotion=False
processed_data[key] = [' '.join(jieba.lcut(str(record[0]))) for record in self.data[key]]
label = [record[1] for record in self.data[key]]
processed_label[key] = label
processed_label_dict[key] = label_dict
processed_data[key] = np.array(processed_data[key])
print(processed_label_dict)
return processed_data, processed_label, processed_label_dict
def update_vocab(self, vocab, processed_data):
if type(processed_data) == dict:
for key in processed_data:
for record in processed_data[key]:
vocab.update(record.split(' '))
else:
for record in processed_data:
vocab.update(record.split(' '))
assert vocab.check_state(), 'Something wrong with vocabulary.'
def transform(self, vocab, data, label, with_feature_selection=False, feature_selection_method='FDA', binary=False):
vocab.set_state(fixed=True)
assert feature_selection_method in ['FDA', 'SelectPercentile']
if not self.transformer:
self.cv = CountVectorizer(decode_error='replace', vocabulary=vocab.to_dict(), binary=binary)
if self.transformer_type == 'tf':
self.transformer = TfidfTransformer(norm=self.transformer_norm, use_idf=False)
else:
self.transformer = TfidfTransformer(norm=self.transformer_norm, use_idf=True)
if type(data) == dict:
transformed_data = {}
for key in data:
if with_feature_selection:
if feature_selection_method == 'FDA':
transformed_data[key] = FDA().fit_transform(
self.transformer.transform(self.cv.transform(data[key])), label[key]
)
else:
transformed_data[key] = SelectPercentile(mutual_info_classif, 20).fit_transform(
self.transformer.transform(self.cv.transform(data[key])), label[key]
)
else:
transformed_data[key] = self.transformer.transform(self.cv.transform(data[key]))
else:
if with_feature_selection:
if feature_selection_method == 'FDA':
transformed_data = FDA().fit_transform(
self.transformer.transform(self.cv.transform(data)), label
)
else:
transformed_data = SelectPercentile(mutual_info_classif, 20).fit_transform(
self.transformer.transform(self.cv.transform(data)), label
)
else:
transformed_data = self.transformer.transform(self.cv.transform(data))
return transformed_data
class FDA(BaseEstimator, TransformerMixin):
def __init__(self, alpha=1e-4):
'''Fisher discriminant analysis
Arguments:
----------
alpha : float
Regularization parameter
'''
self.alpha = alpha
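# fit() accumulates the within-class scatter matrix (regularized with alpha * I) and the
# between-class scatter matrix, then solves the generalized eigenproblem S_b v = lambda S_w v;
# transform() projects samples onto the resulting eigenvectors (self.components_).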
def fit(self, X, Y):
'''Fit the LDA model
Parameters
----------
X : array-like, shape [n_samples, n_features]
Training data
Y : array-like, shape [n_samples]
Training labels
Returns
-------
self : object
'''
n, d_orig = X.shape
classes = np.unique(Y)
assert (len(Y) == n)
if isinstance(X, scipy.sparse.csr.csr_matrix):
mean_global = X.mean(axis=0)
else:
mean_global = np.mean(X, axis=0, keepdims=True)
scatter_within = self.alpha * np.eye(d_orig)
scatter_between = np.zeros_like(scatter_within)
for c in classes:
n_c = np.sum(Y == c)
if n_c < 2:
continue
if isinstance(X, scipy.sparse.csr.csr_matrix):
mu_diff = X[Y == c].mean(axis=0) - mean_global
else:
mu_diff = np.mean(X[Y == c], axis=0, keepdims=True) - mean_global
scatter_between = scatter_between + n_c * np.dot(mu_diff.T, mu_diff)
if isinstance(X, scipy.sparse.csr.csr_matrix):
scatter_within = scatter_within + n_c * np.cov(X[Y == c].todense(), rowvar=0)
else:
scatter_within = scatter_within + n_c * np.cov(X[Y == c], rowvar=0)
e_vals, e_vecs = scipy.linalg.eig(scatter_between, scatter_within)
self.e_vals_ = e_vals
self.e_vecs_ = e_vecs
self.components_ = e_vecs.T
return self
def transform(self, X):
'''Transform data by FDA
Parameters
----------
X : array-like, shape [n_samples, n_features]
Data to be transformed
Returns
-------
X_new : array, shape (n_samples, n_atoms)
'''
return X.dot(self.components_.T)
def fit_transform(self, X, Y):
self.fit(X, Y)
return self.transform(X)
class XgboostClassifyProcess(BaseDataProcess):
def __init__(self, config_path):
super().__init__(config_path)
self.xcdl = XgboostClassifyDataLoader(config_path)
def class_weight(self, y_train):
weight = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
classes_weight = dict(enumerate(weight))
return classes_weight
def runner_process(self, signature):
df = self.xcdl.read_file()
all_label = list(set(df['label']))
self.label_mapping = {v: k for k, v in dict(enumerate(all_label)).items()}
labels = df['label'].map(self.label_mapping)
processed_data = df['content'].map(lambda x: ' '.join(jieba.lcut(x)))
dp = DataProcessor(processed_data,
transformer=self.embedding_config['transformer'],
transformer_norm=self.embedding_config['transformer_norm'])
dp.reset()
vocab = Vocabulary(signature=signature, name='vocab-%s' % self.embedding_config['name'], min_word_len=2)
dp.update_vocab(vocab, processed_data)
print('%s, after updating, %s' % (self.embedding_config['name'], vocab.get_size()))
transformed_data = dp.transform(vocab, processed_data, labels)
vocab_save_to = self.embedding_config['embedding_path']
print(vocab.to_dict())
vocab.save(vocab_save_to)
merged_data = np.append(transformed_data.toarray(), labels.values.reshape((-1, 1)), axis=1)
print(merged_data.shape)
train_set, test_set = self.split_dataset(merged_data, self.process_config['use_dev'])
return train_set, test_set
# import time
# signature = int(time.time())
# XgboostClassifyProcess().runner_process(signature)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 16:23
# @Author : 程婷婷
# @FileName: XgoostClassifyDataLoader.py
# @Software: PyCharm
from model.base.views.data.BaseDataLoader import BaseDataLoader
class XgboostClassifyDataLoader(BaseDataLoader):
def __init__(self, config_path):
super().__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 16:23
# @Author : 程婷婷
# @FileName: __init__.py.py
# @Software: PyCharm
from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class ClusteringConfig(AppConfig):
name = 'clustering'
from django.db import models
# Create your models here.
from django.test import TestCase
# Create your tests here.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/12 18:05
# @Author : 程婷婷
# @FileName: urls.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @Author : 程婷婷
# @FileName: XgboostClassifyConfig.py
# @Software: PyCharm
from model.base.views.config import BaseConfig
class KMeansConfig(BaseConfig):
def __init__(self):
super().__init__()
# print(KmeansConfig()._parsed_file)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @Author : 程婷婷
# @FileName: XgboostClassifyEvaluator.py
# @Software: PyCharm
from model.base.views.evaluator.BaseEvaluator import BaseEvaluator
from sklearn import metrics
class KmeansEvaluator(BaseEvaluator):
def __init__(self, config_path):
super().__init__(config_path)
def compute_silhouette(self, X, labels):
score = metrics.silhouette_score(X, labels, metric='euclidean')
return score
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:18
# @Author : 程婷婷
# @FileName: XgboostClassifyModel.py
# @Software: PyCharm
from model.base.views.model.BaseModel import BaseModel
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
class KmeansModel(BaseModel):
def __init__(self, config_path):
super().__init__(config_path)
def chose_k(self, data):
silhouette_int = -1 # initial threshold for the mean silhouette score
for n_clusters in range(3, 20):
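# For each candidate K, fit K-Means and keep the model with the highest mean silhouette score.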
kmeans = KMeans(n_clusters=n_clusters,
init=self.model_config['init'],
n_init=self.model_config['n_init'],
max_iter=self.model_config['max_iter'])
cluster_labels_tmp = kmeans.fit_predict(data) # 训练聚类模型
silhouette_tmp = metrics.silhouette_score(data, cluster_labels_tmp) # 得到每个K下的平均轮廓系数
if silhouette_tmp > silhouette_int: # 如果平均轮廓系数更高
best_k = n_clusters # 将最好的K存储下来
silhouette_int = silhouette_tmp # 将最好的平均轮廓得分存储下来
best_kmeans = kmeans # 将最好的模型存储下来
print('=========已获得最优模型,共分为%d类========='%best_k)
return best_kmeans
def building_model(self, data):
if not self.model_config['n_clusters']:
model = self.chose_k(data)
else:
kmeans = KMeans(n_clusters=self.model_config['n_clusters'],
init=self.model_config['init'],
n_init=self.model_config['n_init'],
max_iter=self.model_config['max_iter'])
model = kmeans.fit(data)
print('=========共分为%s类=========' % str(self.model_config['n_clusters']))
classes = model.labels_
print(list(set(classes)))
data_cluster = [[] for i in range(max(classes)+1)]
result = [[] for j in range(max(classes)+1)]
for i in range(np.array(data).shape[0]):
for j in range(max(classes)+1):
if classes[i] == j:
result[j].append(i)
data_cluster[j].append(data[i])
self.save(model)
return model, data_cluster, result
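# 补充说明(仅作参考,数据为假设值):building_model中按簇归类样本的双重循环
# 也可以用numpy向量化写法表达,二者结果一致:
#   classes = np.array([0, 1, 0, 2, 1, 0])                       # 假设的聚类标签
#   data = np.arange(18).reshape(6, 3)                           # 假设的特征矩阵
#   result = [np.where(classes == j)[0].tolist() for j in range(classes.max() + 1)]
#   data_cluster = [data[idx].tolist() for idx in result]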
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:33
# @Author : 程婷婷
# @FileName: KmeansRunner.py
# @Software: PyCharm
import numpy as np
import pandas as pd
from model.base.views.runner.BaseRunner import BaseRunner
from model.clustering.views.KMeans.KmeansModel import KmeansModel
from model.clustering.views.KMeans.data.KMeansDataLoader import KMeansDataLoader
from model.clustering.views.KMeans.data.KmeansProcess import KmeansProcess
from model.clustering.views.KMeans.KmeansEvaluator import KmeansEvaluator
import logging
format = '%(asctime)s %(levelname)s %(pathname)s %(funcName)s %(message)s'
logging.basicConfig(format=format, level=logging.INFO)
class KmeansRunner(BaseRunner):
def __init__(self, config_path):
super().__init__(config_path)
self.km = KmeansModel(config_path)
self.kdl = KMeansDataLoader(config_path)
self.kp = KmeansProcess(config_path)
self.ke = KmeansEvaluator(config_path)
def single_train(self, logger):
df = self.kdl.read_file()
if 'labels' not in df.columns:
df['labels'] = ''
logger.info('处理后的数据量为 %d 条' %(len(df)))
transformed_data = self.kp.runner_process(df['content'], df['labels'])
model, data_cluster, result = self.km.building_model(transformed_data)
centroids = model.cluster_centers_
self.labels_ = model.labels_
result_sorted = []
similarity = []
for j in range(max(self.labels_)+1):
distances = [(np.linalg.norm(centroids[j] - data_cluster[j][i]), result[j][i]) for i in
range(len(result[j]))]
distances_sorted = sorted(distances, key=lambda x: x[0])
result_sorted.append([value[1] for value in distances_sorted])
similarity.append([value[0] for value in distances_sorted])
score = self.ke.compute_silhouette(X=transformed_data, labels=self.labels_)
print('====================轮廓系数为%.4f====================' %score)
logger.info('轮廓系数为 %.4f ' %score)
return model, result_sorted, similarity, df
def train(self, logger):
model, result_sorted, similarity, df = self.single_train(logger)
columns = list(df.columns)
columns.append('distance')
writer = pd.ExcelWriter(self.runner_config['save_fpath'])
for j in range(max(self.labels_)+1):
df_out = pd.DataFrame(columns=columns)
if len(result_sorted[j]):
for i in range(len(result_sorted[j])):
row = list(df.iloc[result_sorted[j][i]])
row.append(float(similarity[j][i]))
df_out.loc[i] = row
print('第%s类有%d条数据' %(j, len(result_sorted[j])))
logger.info('第%s类有%d条数据' %(j, len(result_sorted[j])))
df_out.to_excel(writer, sheet_name='sheet' + str(j + 1), index=False)
writer.close()
return 'success'
#if __name__ == '__main__':
# state = KmeansRunner().write_file()
# print(state)
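# 补充说明(仅作参考,数据为假设值):single_train中按到质心距离升序排列簇内样本的逻辑:
#   centroid = np.array([0.0, 0.0])                              # 假设的簇质心
#   members = [np.array([1.0, 1.0]), np.array([0.1, 0.1])]       # 假设的簇内样本向量
#   member_ids = [10, 42]                                        # 样本在原DataFrame中的行号
#   distances = sorted(zip([np.linalg.norm(centroid - m) for m in members], member_ids))
#   print([idx for _, idx in distances])   # 距质心最近的样本排在最前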
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 10:28
# @Author : 程婷婷
# @FileName: __init__.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 17:39
# @Author : 程婷婷
# @FileName: KMeansDataLoader.py
# @Software: PyCharm
from model.base.views.data.BaseDataLoader import BaseDataLoader
class KMeansDataLoader(BaseDataLoader):
def __init__(self, config_path):
super().__init__(config_path)
\ No newline at end of file
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:14
# @Author : 程婷婷
# @FileName: KmeansProcess.py
# @Software: PyCharm
import numpy as np
import re
import time
from model.base.views.data.BaseDataProcess import BaseDataProcess
class KmeansProcess(BaseDataProcess):
def __init__(self, config_path):
super().__init__(config_path)
def remove_char(self, content):
# 仅保留中文字符和常见标点(逗号、句号、问号、感叹号、分号)
graph_filter = re.compile(r'[^\u4e00-\u9fa5,。\.,?\?!!;;]')
content = graph_filter.sub('', content)
return content
def process(self, data, min_content):
processed_data = []
i = 0
for record in data:
record = self.remove_char(record)
if len(record) > min_content:
methods = self.process_config['tokenizer']
if methods == 'PerceptronLexicalAnalyzer':
record = self.pla_tokenizer(record)
else:
record = self.jieba_tokenizer(record)
processed_data.append(record)
i += 1
else:
i += 1
pass
if (i+1)%100 == 0 or i+1 == len(data):
print(time.strftime('%Y-%m-%d %H:%M:%S'),'第',i+1,'条文本分词完毕')
return processed_data
def runner_process(self, data, labels):
# all_label = list(set(labels))
# label_mapping = {v: k for k, v in dict(enumerate(all_label)).items()}
processed_data = self.process(data, min_content=10)
transformed_data1, feature_words = self.bag_of_words(processed_data, labels)
processed_data2 = []
for i in processed_data:
record = i.split(' ')
processed_data2.append(record)
transformed_data2 = self.word2vec(processed_data2, feature_words=feature_words)
transformed_data = np.dot(transformed_data1, transformed_data2)
return transformed_data
# import pandas as pd
# df = pd.read_excel(r'E:\working\model_train\KMeans\data\test.xlsx')
# kp = KmeansProcess()
# kp.runner_process()
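# 补充示例(仅作参考):remove_char会过滤掉中文及常见标点之外的字符,例如:
#   kp.remove_char('今天天气真好,Hello 123!')   # 返回:'今天天气真好,!'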
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 17:38
# @Author : 程婷婷
# @FileName: __init__.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/13 11:23
# @Author : 程婷婷
# @FileName: __init__.py
# @Software: PyCharm
from django.shortcuts import render
# Create your views here.
## /基础服务之短文本相似度计算
```text
短文本相似度计算提供了两个短文本之间的语义相似度计算能力,支持多种计算方式,包括cos_sim、lev_sim、min_hash、sim_hash等。
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/doc-similarity-single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDE1MzQzNH0.lTaat1GCB1pffWu1pmTJrpPGW8O_KEsy8QvuefLs6Lo | Text | 是 | []
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text_1 | 内蒙古没有下大雨了 | Text | 是 | 文本内容1
text_2 | 内蒙古下大雨了 | Text | 是 | 与文本内容1进行对比的文本内容2
sim_algorithm_name | sim_hash | Text | 是 | 计算相似度的方法
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
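#### 调用示例(参考)
下面给出一个调用该接口的参考示例(仅作示意,token取值为假设值,返回结构以实际接口为准):
```python
import requests

url = 'http://192.168.1.149:8020/basic/doc-similarity-single/'
headers = {'Authorization': '<登录后获取的JWT token>'}
data = {
    'text_1': '内蒙古没有下大雨了',
    'text_2': '内蒙古下大雨了',
    'sim_algorithm_name': 'sim_hash',
}
# 接口要求form-data方式提交,requests的data参数默认即为表单编码
response = requests.post(url, headers=headers, data=data)
print(response.json())
```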
## /场景化服务之关键词挖掘
```text
关键词挖掘功能
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/extraction-keywords/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
pending_file | 测试.docx | Text | 是 | 上传的docx文档或者xlsx文档的名称
user_file | user_dict.txt | Text | 是 | 用户上传的自定义词典txt
path_timestamp | 1629770804649253 | Text | 是 | 文件夹名称
username | ctt | Text | 是 | 登录的用户名称
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /场景化服务之文件上传
```text
文件上传功能,需注意的是path_timestamp。为保证同一页面内上传的文件在同一文件夹内,故第一次上传时path_timestamp为空,接口返回时会自动创建文件夹并返回path_timestamp的值;此后该页面上传时需将path_timestamp的具体值传入。
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/file-upload/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDMyMDQ2OX0.Q2VNzrTMU1G8VG3E3PMRcwlkJ5K0RqGCshzIz1htFgM | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
files | [] | File | 是 | 用户上传的文件
path_timestamp | 1629704627130134 | Text | 是 | 文件夹名称(同页面内第一次上传文件,此参数为空,返回值中会包含此参数的值;第二次以及更多次上传文件时需要带上第一次返回此参数的值)
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
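#### 调用示例(参考)
下面给出两次上传的参考示例,演示path_timestamp的传递方式(仅作示意;示例中假设返回的文件夹标识位于resultData字段,实际以接口返回为准):
```python
import requests

url = 'http://192.168.1.149:8020/base/file-upload/'
headers = {'Authorization': '<登录后获取的JWT token>'}

# 第一次上传:path_timestamp传空,接口自动创建文件夹并返回其标识
with open('测试.docx', 'rb') as f:
    first = requests.post(url, headers=headers,
                          files={'files': f},
                          data={'path_timestamp': ''}).json()
path_timestamp = first['resultData']  # 假设:文件夹标识位于resultData字段

# 同一页面的后续上传:带上第一次返回的path_timestamp
with open('user_dict.txt', 'rb') as f:
    requests.post(url, headers=headers,
                  files={'files': f},
                  data={'path_timestamp': path_timestamp})
```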
## /用户注册
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/register-account/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg5NDE1NH0.wpTJ5W25A502WPKIDDQeC_NNlIV3Of56bTheLjbkABg | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
username | ly | Text | 是 | 用户名
true_name | ly | Text | 是 | 用户真实姓名
sex | 女 | Text | 是 | 用户性别
mobile_number | 15617380221 | Text | 是 | 用户电话号码
mail | 2698641198@qq.com | Text | 是 | 用户邮箱
id_card | 410527199811565698 | Text | 是 | 用户身份证号码
password | 123456 | Text | 是 | 用户设置的登录密码
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /核查用户名是否存在
```text
核查数据库中是否存在用户输入的用户名称。
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/verify-username/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg5NDE1NH0.wpTJ5W25A502WPKIDDQeC_NNlIV3Of56bTheLjbkABg | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
username | ctt | Text | 是 | 用户注册时填入的用户名称
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
#### 成功响应示例
```javascript
{
"handleMsg": "success",
"isHandleSuccess": true,
"logs": "此用户名可用!",
"resultData": true
}
```
参数名 | 示例值 | 参数类型 | 参数描述
--- | --- | --- | ---
handleMsg | success | Text |
isHandleSuccess | true | Text |
logs | 此用户名可用! | Text |
resultData | true | Text |
#### 失败响应示例
```javascript
{
"handleMsg": "failure",
"isHandleSuccess": false,
"logs": "该用户名已存在!",
"resultData": false
}
```
参数名 | 示例值 | 参数类型 | 参数描述
--- | --- | --- | ---
handleMsg | failure | Text |
isHandleSuccess | false | Text |
logs | 该用户名已存在! | Text |
resultData | false | Text |
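#### 调用示例(参考)
下面给出一个调用该接口并根据返回值判断用户名是否可用的参考示例(仅作示意,token取值为假设值):
```python
import requests

url = 'http://192.168.1.149:8020/base/verify-username/'
headers = {'Authorization': '<登录后获取的JWT token>'}
result = requests.post(url, headers=headers, data={'username': 'ctt'}).json()

# resultData为true表示用户名可用,为false表示用户名已存在
if result['resultData']:
    print(result['logs'])   # 此用户名可用!
else:
    print(result['logs'])   # 该用户名已存在!
```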
## /重置密码
```text
重置密码功能,根据用户名和用户输入的密码,更新数据库中的密码
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/reset-password/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
username | ctt | Text | 是 | 用户名
password | 123456 | Text | 是 | 用户新密码
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
#### 成功响应示例
```javascript
{
"handleMsg": "success",
"isHandleSuccess": true,
"logs": "重置密码成功!",
"resultData": true
}
```
参数名 | 示例值 | 参数类型 | 参数描述
--- | --- | --- | ---
handleMsg | success | Text |
isHandleSuccess | true | Text |
logs | 重置密码成功! | Text |
resultData | true | Text |
#### 失败响应示例
```javascript
{
"handleMsg": "failure",
"isHandleSuccess": false,
"logs": "重置密码失败!",
"resultData": false
}
```
参数名 | 示例值 | 参数类型 | 参数描述
--- | --- | --- | ---
handleMsg | failure | Text |
isHandleSuccess | false | Text |
logs | 重置密码失败! | Text |
resultData | false | Text |
## /测试 展示配置文件/base/show-config-file/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/show-config-file/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg5NjA3Nn0.KGp7HWhb61EP-1w6X0y1t9pIDuKmObWlj5muWNJbvIA | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
model_type | textcnn | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 删除文件夹和记录 /base/delete-file-row-manage/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/delete-file-row/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg5NjA3Nn0.KGp7HWhb61EP-1w6X0y1t9pIDuKmObWlj5muWNJbvIA | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
path_timestamp | 1626320056405440 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 显示日志文件/base/show-log-file/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/show-log-file/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg5NjA3Nn0.KGp7HWhb61EP-1w6X0y1t9pIDuKmObWlj5muWNJbvIA | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
id | 3 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 生成验证码/base/validate-code/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/validate-code/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 验证用户登录/base/login/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/login
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
username | ctt | Text | 否 |
password | 123456 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 下载zip /base/download-zip/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/download-zip/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
path_timestamp | 1628935910315627 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 查询manage /base/query-manage/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/query-manager/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg5NjA3Nn0.KGp7HWhb61EP-1w6X0y1t9pIDuKmObWlj5muWNJbvIA | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
task_name | | Text | 否 |
function_type | | Text | 否 |
model_type | | Text | 否 |
begin_date | | Text | 否 |
page_size | 10 | Text | 否 |
end_date | | Text | 否 |
current_page | 1 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 发送邮件重新设置密码/base/forget-password/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/forget-password/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
username | ctt | Text | 否 |
mobile_number | 15617380221 | Text | 否 |
mail | 2698641198@qq.com | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 服务结果文件下载/base/download-xlsx/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/download-xls/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTcxNTMwN30.sJMIlptQoHhqBeGMdgxdJ7WN0PgbqhYRjPde39scj98 | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
pending_file | /home/zzsn/ctt/platform_temporary/test0810.xlsx | Text | 否 |
user_file | /home/zzsn/ctt/platform_temporary/user_dict.txt | Text | 否 |
path_timestamp | 1629770804649253 | Text | 否 |
username | ctt | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 查询service_manage /base/query-service-manage/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/query-service-manage/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDY2MzQ5Mn0.bEHfjDAKo5qoWa1dFSIqtz0fhFWmMRWdoqYPYCZ8Nd0 | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
name | 关键词挖掘 | Text | 否 |
begin_date | 2021-09-01 | Text | 否 |
end_date | 2021-09-12 | Text | 否 |
state | 已完成 | Text | 否 |
page_size | 10 | Text | 否 |
current_page | 1 | Text | 否 |
username | ctt | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 公司名称提取/scenario/extraction-company/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/extraction-company/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg2NjI4NH0.iOxrDWPASgMoIholJybpZ7wQs92EyJ3c952HdKIlvcc | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
path_timestamp | 1629855525880127 | Text | 否 |
pending_file | 负面信息---350.xlsx | Text | 否 |
user_file | 监控企业信息1.xls | Text | 否 |
username | ctt | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 股票招聘识别/scenario/stock-recruitment-filter/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/stock-recruitment-filter/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg2NjI4NH0.iOxrDWPASgMoIholJybpZ7wQs92EyJ3c952HdKIlvcc | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
path_timestamp | 110 | Text | 否 |
pending_file | test_file.xlsx | Text | 否 |
user_file | 监控企业信息1.xls | Text | 否 |
username | ctt | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 删除service_manage /base/delete-file-row-service/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/delete-file-row-service/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTg5NDE1NH0.wpTJ5W25A502WPKIDDQeC_NNlIV3Of56bTheLjbkABg | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
path_timestamp | 1629855525880127 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 关联词汇推荐/basic/associated-word-single/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/associated-word-single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzNjY5MDgxOX0.BkyvWgsS5iVK8rLAde01w8QJh1UbGD4f39FgtocSyc8 | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text | 混改,数字化转型 | Text | 否 | 多个词以英文逗号隔开,
word_num | 5 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 下载样例文件/base/show-service-file/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/show-service-file/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTk2MTE2OH0.b8EXACgZZbqXCdyTehOLtRbfiyO1RZP_GlVbau_Gm9A | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
service_type | scenario_service | Text | 否 |
service_name | extraction_company_name | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 相似度-去重 /scenario/doc-similarity-duplicate-single/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/doc-similarity-duplicate-single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDE1MjQzN30.PFqUOsYoRqQpLvtE5_xkbYgbpD72MZvRGh24L4xGONc | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text_1 | 徐东华(机械工业经济管理研究院院长、书记) | Text | 否 |
text_2 | 天空没有下雨 | Text | 否 |
sim | 0.6 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 自动生成报告/scenario/report-generator-single/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/report-generator-single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYyOTk4MTgzNn0.V_int4T1l-txK0-q5NGlV_-NhdDzjYvJI72hzmrc5gs | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
title | 千亿民营房企泰禾集团巨额债务违约 | Text | 否 |
sid | 4544 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 专家发言观点/scenario/extraction-speech-single/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/extraction-speech-single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDMyMDQ2OX0.Q2VNzrTMU1G8VG3E3PMRcwlkJ5K0RqGCshzIz1htFgM | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
title | 华为自主研发 | Text | 否 |
content | 随着华为透露自主研发的操作系统“鸿蒙OS”后,网友对国产操作系统的热情与期待一天比一天旺盛。除了鸿蒙OS频频上头条以外,中兴公司研发的新支点OS也已经出货超两亿套了。不过目前消费者市场上,仍旧很少见到国产操作系统。
在桌面有Windows、移动有iOS、安卓、服务器有Linux系统的情况下,国产操作系统应该如何突围呢?对于这个问题,全球移动通信协会高级顾问、中国移动原董事长王建宙给出了他的看法。
中国移动原董事长王建宙在近日举办的中国“科”公司峰会上表示,5G时代要重视移动操作系统的创新。“目前的5G手机仅仅是加快了上网速度,但这远远不够,应该努力去开发新的功能。”
王建宙认为,现有的5G手机在功能上没有特别的创新,因为本身操作系统还是用的4G操作系统,没有什么变化。2G和3G不一样,2G的操作系统是塞班操作系统,是Windows操作系统,3G时完全不一样,是iOS操作系统,是安卓操作系统。
“现有的移动设备操作系统都是在桌面机操作系统基础上修改和延伸的,不断修改、不断延伸、不断增加功能,但操作系统的功能是管理硬件和软件资源”,王建宙表示“5G带来的是万物互联,这就使得移动设备所面临的功能和环境都发生了非常大的变化。这种变化不仅跟原来的桌面机完全不一样,而且跟原有的手机也不一样,所以我们迫切需要一个更加实时、更加适合于万物互联的移动设备的操作系统。”
王建宙认为,“新的5G操作系统在功能上一定要超过现有操作系统。如果只是跟现有操作系统差不多,也很好,但很难形成一个新的生态系统。所以新操作系统一定要高起点开发,要有新的操作系统,特别是在物联网和人工智能方面。 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 一带一路信息过滤/scenario/project-info-filter-single/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/project-info-filter-single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDE0MDQ2OX0.XH-1-PmTI4ScKaTTPowXYvVG0WSz8xcQ8qrrx3tNmAQ | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
title | 华为自主研发 | Text | 否 |
content | 随着华为透露自主研发的操作系统“鸿蒙OS”后,网友对国产操作系统的热情与期待一天比一天旺盛。除了鸿蒙OS频频上头条以外,中兴公司研发的新支点OS也已经出货超两亿套了。不过目前消费者市场上,仍旧很少见到国产操作系统。
在桌面有Windows、移动有iOS、安卓、服务器有Linux系统的情况下,国产操作系统应该如何突围呢?对于这个问题,全球移动通信协会高级顾问、中国移动原董事长王建宙给出了他的看法。
中国移动原董事长王建宙在近日举办的中国“科”公司峰会上表示,5G时代要重视移动操作系统的创新。“目前的5G手机仅仅是加快了上网速度,但这远远不够,应该努力去开发新的功能。”
王建宙认为,现有的5G手机在功能上没有特别的创新,因为本身操作系统还是用的4G操作系统,没有什么变化。2G和3G不一样,2G的操作系统是塞班操作系统,是Windows操作系统,3G时完全不一样,是iOS操作系统,是安卓操作系统。
“现有的移动设备操作系统都是在桌面机操作系统基础上修改和延伸的,不断修改、不断延伸、不断增加功能,但操作系统的功能是管理硬件和软件资源”,王建宙表示“5G带来的是万物互联,这就使得移动设备所面临的功能和环境都发生了非常大的变化。这种变化不仅跟原来的桌面机完全不一样,而且跟原有的手机也不一样,所以我们迫切需要一个更加实时、更加适合于万物互联的移动设备的操作系统。”
王建宙认为,“新的5G操作系统在功能上一定要超过现有操作系统。如果只是跟现有操作系统差不多,也很好,但很难形成一个新的生态系统。所以新操作系统一定要高起点开发,要有新的操作系统,特别是在物联网和人工智能方面。 | Text | 否 |
content | 5月10日,山东电建三公司海外市场开发再传捷报!公司与ACWAPower签订沙特朱拜勒3A独立海水淡化项目EPC合同,海水淡化业务板块又添新业绩。沙特朱拜勒3A独立海水淡化项目,位于朱拜勒市达曼法赫德国王国际机场以北约65公里处,项目采用海水反渗透技术,日产水量达60万吨。项目建设成后,对于缓解沙特东部省沿海岸日益增长的用水需求具有重要意义。该项目是公司与ACWAPower签约的第三个海水淡化项目,充分体现了ACWAPower对公司综合实力,以及公司在中东非洲区域项目执行过程中所展现出的卓越管理水平和勇于担当的企业精神的高度认可。同时,该项目也是公司在沙特EPC总承包的第10个大型工程项目 | Text | 否 |
title | 山东电建三公司再传捷报 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 一带一路项目要素抽取/scenario/project-info-extraction-single/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/project-info-extraction-single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDMxODQyNn0.w9NDZgWbk5oWfYHmdHWfmKrlAcWErBBbnzgx39PuCrs | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
title | 山东电建三公司再传捷报 | Text | 否 |
content | 5月10日,山东电建三公司海外市场开发再传捷报!公司与ACWAPower签订沙特朱拜勒3A独立海水淡化项目EPC合同,海水淡化业务板块又添新业绩。沙特朱拜勒3A独立海水淡化项目,位于朱拜勒市达曼法赫德国王国际机场以北约65公里处,项目采用海水反渗透技术,日产水量达60万吨。项目建设成后,对于缓解沙特东部省沿海岸日益增长的用水需求具有重要意义。该项目是公司与ACWAPower签约的第三个海水淡化项目,充分体现了ACWAPower对公司综合实力,以及公司在中东非洲区域项目执行过程中所展现出的卓越管理水平和勇于担当的企业精神的高度认可。同时,该项目也是公司在沙特EPC总承包的第10个大型工程项目 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 讲话提取/scenario/extraction-speech
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/extraction-speech/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDE0MDQ2OX0.XH-1-PmTI4ScKaTTPowXYvVG0WSz8xcQ8qrrx3tNmAQ | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
path_timestamp | 1630135538675714 | Text | 否 |
pending_file | 发言内容_案例.xlsx | Text | 否 |
username | ctt | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 股票招聘/scenario/stock_recruitment_filter_single
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/stock_recruitment_filter_single
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDE0MDQ2OX0.XH-1-PmTI4ScKaTTPowXYvVG0WSz8xcQ8qrrx3tNmAQ | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
title | 山东电建三公司再传捷报 | Text | 否 |
content | 5月10日,山东电建三公司海外市场开发再传捷报!公司与ACWAPower签订沙特朱拜勒3A独立海水淡化项目EPC合同,海水淡化业务板块又添新业绩。沙特朱拜勒3A独立海水淡化项目,位于朱拜勒市达曼法赫德国王国际机场以北约65公里处,项目采用海水反渗透技术,日产水量达60万吨。项目建设成后,对于缓解沙特东部省沿海岸日益增长的用水需求具有重要意义。该项目是公司与ACWAPower签约的第三个海水淡化项目,充分体现了ACWAPower对公司综合实力,以及公司在中东非洲区域项目执行过程中所展现出的卓越管理水平和勇于担当的企业精神的高度认可。同时,该项目也是公司在沙特EPC总承包的第10个大型工程项目 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 一带一路要素抽取/scenario/project-info-extraction/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/project-info-extraction/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDMyMDQ2OX0.Q2VNzrTMU1G8VG3E3PMRcwlkJ5K0RqGCshzIz1htFgM | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
title | 华为自主研发 | Text | 否 |
content | 随着华为透露自主研发的操作系统“鸿蒙OS”后,网友对国产操作系统的热情与期待一天比一天旺盛。除了鸿蒙OS频频上头条以外,中兴公司研发的新支点OS也已经出货超两亿套了。不过目前消费者市场上,仍旧很少见到国产操作系统。
在桌面有Windows、移动有iOS、安卓、服务器有Linux系统的情况下,国产操作系统应该如何突围呢?对于这个问题,全球移动通信协会高级顾问、中国移动原董事长王建宙给出了他的看法。
中国移动原董事长王建宙在近日举办的中国“科”公司峰会上表示,5G时代要重视移动操作系统的创新。“目前的5G手机仅仅是加快了上网速度,但这远远不够,应该努力去开发新的功能。”
王建宙认为,现有的5G手机在功能上没有特别的创新,因为本身操作系统还是用的4G操作系统,没有什么变化。2G和3G不一样,2G的操作系统是塞班操作系统,是Windows操作系统,3G时完全不一样,是iOS操作系统,是安卓操作系统。
“现有的移动设备操作系统都是在桌面机操作系统基础上修改和延伸的,不断修改、不断延伸、不断增加功能,但操作系统的功能是管理硬件和软件资源”,王建宙表示“5G带来的是万物互联,这就使得移动设备所面临的功能和环境都发生了非常大的变化。这种变化不仅跟原来的桌面机完全不一样,而且跟原有的手机也不一样,所以我们迫切需要一个更加实时、更加适合于万物互联的移动设备的操作系统。”
王建宙认为,“新的5G操作系统在功能上一定要超过现有操作系统。如果只是跟现有操作系统差不多,也很好,但很难形成一个新的生态系统。所以新操作系统一定要高起点开发,要有新的操作系统,特别是在物联网和人工智能方面。 | Text | 否 |
content | 5月10日,山东电建三公司海外市场开发再传捷报!公司与ACWAPower签订沙特朱拜勒3A独立海水淡化项目EPC合同,海水淡化业务板块又添新业绩。沙特朱拜勒3A独立海水淡化项目,位于朱拜勒市达曼法赫德国王国际机场以北约65公里处,项目采用海水反渗透技术,日产水量达60万吨。项目建设成后,对于缓解沙特东部省沿海岸日益增长的用水需求具有重要意义。该项目是公司与ACWAPower签约的第三个海水淡化项目,充分体现了ACWAPower对公司综合实力,以及公司在中东非洲区域项目执行过程中所展现出的卓越管理水平和勇于担当的企业精神的高度认可。同时,该项目也是公司在沙特EPC总承包的第10个大型工程项目 | Text | 否 |
title | 山东电建三公司再传捷报 | Text | 否 |
path_timestamp | 1630309688929082 | Text | 否 |
pending_file | 一带一路项目要素抽取_案例.xlsx | Text | 否 |
username | ctt | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 一带一路信息过滤/scenario/project-info-filter/
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/project-info-filter/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDE1MzQzNH0.lTaat1GCB1pffWu1pmTJrpPGW8O_KEsy8QvuefLs6Lo | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
title | 华为自主研发 | Text | 否 |
content | 随着华为透露自主研发的操作系统“鸿蒙OS”后,网友对国产操作系统的热情与期待一天比一天旺盛。除了鸿蒙OS频频上头条以外,中兴公司研发的新支点OS也已经出货超两亿套了。不过目前消费者市场上,仍旧很少见到国产操作系统。
在桌面有Windows、移动有iOS、安卓、服务器有Linux系统的情况下,国产操作系统应该如何突围呢?对于这个问题,全球移动通信协会高级顾问、中国移动原董事长王建宙给出了他的看法。
中国移动原董事长王建宙在近日举办的中国“科”公司峰会上表示,5G时代要重视移动操作系统的创新。“目前的5G手机仅仅是加快了上网速度,但这远远不够,应该努力去开发新的功能。”
王建宙认为,现有的5G手机在功能上没有特别的创新,因为本身操作系统还是用的4G操作系统,没有什么变化。2G和3G不一样,2G的操作系统是塞班操作系统,是Windows操作系统,3G时完全不一样,是iOS操作系统,是安卓操作系统。
“现有的移动设备操作系统都是在桌面机操作系统基础上修改和延伸的,不断修改、不断延伸、不断增加功能,但操作系统的功能是管理硬件和软件资源”,王建宙表示“5G带来的是万物互联,这就使得移动设备所面临的功能和环境都发生了非常大的变化。这种变化不仅跟原来的桌面机完全不一样,而且跟原有的手机也不一样,所以我们迫切需要一个更加实时、更加适合于万物互联的移动设备的操作系统。”
王建宙认为,“新的5G操作系统在功能上一定要超过现有操作系统。如果只是跟现有操作系统差不多,也很好,但很难形成一个新的生态系统。所以新操作系统一定要高起点开发,要有新的操作系统,特别是在物联网和人工智能方面。 | Text | 否 |
content | 5月10日,山东电建三公司海外市场开发再传捷报!公司与ACWAPower签订沙特朱拜勒3A独立海水淡化项目EPC合同,海水淡化业务板块又添新业绩。沙特朱拜勒3A独立海水淡化项目,位于朱拜勒市达曼法赫德国王国际机场以北约65公里处,项目采用海水反渗透技术,日产水量达60万吨。项目建设成后,对于缓解沙特东部省沿海岸日益增长的用水需求具有重要意义。该项目是公司与ACWAPower签约的第三个海水淡化项目,充分体现了ACWAPower对公司综合实力,以及公司在中东非洲区域项目执行过程中所展现出的卓越管理水平和勇于担当的企业精神的高度认可。同时,该项目也是公司在沙特EPC总承包的第10个大型工程项目 | Text | 否 |
title | 山东电建三公司再传捷报 | Text | 否 |
path_timestamp | 12011 | Text | 否 |
pending_file | 一带一路项目资讯筛选_样例.xlsx | Text | 否 |
username | ctt | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 报告生成-查询专题sid和专题名称
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/base/query-subject/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMDM4MzQ1OX0.wJS8vxk7yYRZuRyGWX3VfOBAvQkVYMHZjcvkAiUxM8A | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
current_page | 2 | Text | 否 |
page_size | 10 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 分词
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/word_cut/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMzg2Njk0M30.oR78Gt5va302elpcZqUB2srPwm9wd1UmsSTszFO0p7o | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text | 今天天气真好 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 词性
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/word_pos/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMzg2Njk0M30.oR78Gt5va302elpcZqUB2srPwm9wd1UmsSTszFO0p7o | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text | 今天天气真好 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 新词发现
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/new_word_find/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMzg2Njk0M30.oR78Gt5va302elpcZqUB2srPwm9wd1UmsSTszFO0p7o | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text | 白月光,形容的是一种可望不可即的人或者事物,虽然一直在心上,却从不在身边。 最早出自张爱玲小说《红玫瑰与白玫瑰》,后来变成网络流行语并被大家所熟知是源于一部热播电视剧《延禧攻略》,剧中秦岚饰演的富察皇后被剧迷们形容为乾隆皇帝心中的“白月光”, 生当复来归,死亦长相思。 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 语义角色标注
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/show_srl/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzNzA2OTM0Nn0.rcyJ5aKlUIYoVnQA2YMhlOHjlJb9By1RZtMf5SLEKzE | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text | 他叫汤姆去拿外衣。 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 依存分析
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/show_dep/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzMzg2Njk0M30.oR78Gt5va302elpcZqUB2srPwm9wd1UmsSTszFO0p7o | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text | 白月光,形容的是一种可望不可即的人或者事物,虽然一直在心上,却从不在身边。 最早出自张爱玲小说《红玫瑰与白玫瑰》,后来变成网络流行语并被大家所熟知是源于一部热播电视剧《延禧攻略》,剧中秦岚饰演的富察皇后被剧迷们形容为乾隆皇帝心中的“白月光”, 生当复来归,死亦长相思。 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /positive_negative_judgment
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/scenario/positive_negative_judgment/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzNjM3Mjk5MH0.Zd0au3XyIFJKgv1cAMhd9E5Vede-OyDgfcjiaYzoayg | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
username | ctt | Text | 否 |
industry_code | 36 | Text | 否 |
start_year | 2016 | Text | 否 |
stop_year | 2020 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 关键短语挖掘
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/create_keywords/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzNzA3NDAyMX0.Rofx64XlQWXwYonhrle0hSAIlrACXT-oVsn5vB4HGrs | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
topK | 5 | Text | 否 | 选取多少个关键短语返回,默认为 5
with_weight | True | Text | 否 | 指定返回关键短语是否需要短语权重
text | 法国媒体最新披露,巴黎圣母院火灾当晚,第一次消防警报响起时,负责查验的保安找错了位置,因而可能贻误了救火的最佳时机。 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
## /测试 中文命名实体识别
```text
暂无描述
```
#### 接口状态
> 开发中
#### 接口URL
> http://192.168.1.149:8020/basic/ner_single/
#### 请求方式
> POST
#### Content-Type
> form-data
#### 请求Header参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
Authorization | eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MSwibmFtZSI6ImN0dCIsImV4cCI6MTYzNzEyMzE2OX0.uUmDv9XOfWcgLXg9IkoE5X6ISQK67gH4lumJBGR2X7E | Text | 否 |
#### 请求Body参数
参数名 | 示例值 | 参数类型 | 是否必填 | 参数描述
--- | --- | --- | --- | ---
text | 中国进出口银行与中国银行加强合作 | Text | 否 |
#### 预执行脚本
```javascript
暂无预执行脚本
```
#### 后执行脚本
```javascript
暂无后执行脚本
```
\ No newline at end of file
import pymysql
pymysql.install_as_MySQLdb()
"""
Django settings for platform_zzsn project.
Generated by 'django-admin startproject' using Django 2.2.
For more information on this file, see
https://docs.djangoproject.com/en/2.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/2.2/ref/settings/
"""
import os
import datetime
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))+'/'
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/2.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = '2kzp8f76mp1^-)ko+^ji%+m*@g#i)005v0^vq5*zy0g7bcbo0*'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
APPEND_SLASH = False
ALLOWED_HOSTS = ['*']
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'model.base',
'basic_service',
'model.classify',
'model.clustering',
'scenario_service',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
# 'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'platform_zzsn.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'platform_zzsn.wsgi.application'
# Database
# https://docs.djangoproject.com/en/2.2/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.mysql', # 数据库引擎
'NAME': 'platform_ctt', # 数据库名,先前创建的
'USER': 'root', # 用户名,可以自己创建用户
'PASSWORD': 'ydyl123456', # 密码
'HOST': '114.115.151.73', # mysql服务所在的主机ip
'PORT': '3306', # mysql服务端口
}
}
# Password validation
# https://docs.djangoproject.com/en/2.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/2.2/topics/i18n/
LANGUAGE_CODE = 'zh-hans'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_L10N = True
USE_TZ = False
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.2/howto/static-files/
STATIC_URL = '/static/'
MEDIA_ROOT = os.path.join(BASE_DIR, 'media')
\ No newline at end of file
"""platform_zzsn URL Configuration
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/2.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.urls import include, path
import basic_service.urls
import scenario_service.urls
import model.base.urls
urlpatterns = [
# path('admin/', admin.site.urls),
path('basic/', include(basic_service.urls)),
# path('classify/', include(classify.urls)),
# path('clustering/', include(clustering.urls)),
path('base/', include(model.base.urls)),
path('scenario/', include(scenario_service.urls)),
]
"""
WSGI config for platform_zzsn project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/2.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'platform_zzsn.settings')
application = get_wsgi_application()
from django.contrib import admin
# Register your models here.
from django.apps import AppConfig
class ScenarioServiceConfig(AppConfig):
name = 'scenario_service'
from django.db import models
# Create your models here.
from django.test import TestCase
import requests
import json
from requests.adapters import HTTPAdapter
def post_br_single_file(url, file_name):
payload = {'file_path': file_name}
# headers = {
# 'Content-Type': 'application/json'
# }
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
'Content-Type': 'application/json',
# 'Connection': 'close'
}
requests.adapters.DEFAULT_RETRIES = 3
response = requests.request('POST', url, headers=headers, data=json.dumps(payload),timeout=200)
data = json.loads(response.text)
return data
import time
start_time = time.time()
result = post_br_single_file('http://192.168.1.149:7000/br/doc_event/project_info/extraction/pred_file', '/home/zzsn/ctt/platform_zzsn/media/1201/0830.xlsx')
print(result)
end_time = time.time()
print(end_time-start_time)
\ No newline at end of file
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/12 18:05
# @Author : 程婷婷
# @FileName: urls.py
# @Software: PyCharm
from django.urls import path
from django.conf.urls import url
from scenario_service.views import views
urlpatterns = [
url(r'^project-info-filter-single', views.project_info_filter_single, name='project_info_filter_single'),
url(r'^project-info-filter', views.project_info_filter, name='project_info_filter'),
url(r'^project-info-extraction-single', views.project_info_extraction_single, name='project_info_extraction_single'),
url(r'^project-info-extraction', views.project_info_extraction, name='project_info_extraction'),
url(r'^doc-similarity-duplicate-single', views.doc_similarity_duplicate_single, name='doc_similarity_duplicate_single'),
url(r'^report-generator-single', views.report_generator_single, name='report_generator_single'),
url(r'^extraction-speech-single', views.extraction_speech_single, name='extraction_speech_single'),
url(r'^extraction-speech', views.extraction_speech, name='extraction_speech'),
url(r'^extraction-keywords', views.extraction_keywords, name='extraction_keywords'),
# url(r'^download_xls', views.download_xls, name='download_xls'),
url(r'^extraction-company', views.extraction_company, name='extraction_company'),
url(r'^stock-recruitment-filter', views.stock_recruitment_filter, name='stock_recruitment_filter'),
url(r'^stock_recruitment_filter_single', views.stock_recruitment_filter_single, name='stock_recruitment_filter_single'),
url(r'^positive_negative_judgment', views.positive_negative_judgment, name='positive_negative_judgment')
]
\ No newline at end of file
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/13 10:08
# @Author : 程婷婷
# @FileName: __init__.py
# @Software: PyCharm
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/6 15:44
# @Author : 程婷婷
# @FileName: cv_tfidf.py
# @Software: PyCharm
# coding:utf-8
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from model.base.views.utils import *
def cv_tfidf(corpus):
vectorizer = CountVectorizer() # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频
transformer = TfidfTransformer() # 该类会统计每个词语的tf-idf权值
X = vectorizer.fit_transform(corpus) # 将文本转为词频矩阵
tfidf = transformer.fit_transform(X) # 计算tf-idf,
word = vectorizer.get_feature_names() # 获取词袋模型中的所有词语
weight = tfidf.toarray() # 将tf-idf矩阵抽取出来,元素a[i][j]表示j词在i类文本中的tf-idf权重
return word, weight
def get_word_tf_frequency(word, weight, word2count):
word_tf, keyword, word_weight = [], [], []
for i in range(len(weight)): # 遍历每类文本的tf-idf词语权重,取权重最高的词
temp = list(zip(word, weight[i]))
temp.sort(key=takeSecond, reverse=True)
result = temp[0: 3]
result.sort(key=takeFirst_len, reverse=True)
for index, data in enumerate(result):
if data[0] not in word2count:
continue
word_weight.append(word2count[data[0]])
keyword.append(data[0])
word_tf.append(data[1])
return word_tf, keyword, word_weight
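# 使用示例(仅作参考,语料与词频统计为假设值):
#   corpus = ['数字 转型 加速', '公司 营收 增长', '数字 业务 增长']
#   word2count = {'数字': 2, '转型': 1, '增长': 2, '营收': 1, '加速': 1, '业务': 1, '公司': 1}
#   word, weight = cv_tfidf(corpus)
#   word_tf, keyword, word_weight = get_word_tf_frequency(word, weight, word2count)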
"""
Author: Tao Zhang
Desc: 基于字典的情感极性分析,用于年报情感分析。
2021-10-29: 实现分行业的数字化转型正负面趋势分析。
2021-11-05: 实现echart折线分析图的嵌入,API开发完成。
"""
import cx_Oracle
import pandas as pd
import os
import re
from tqdm import tqdm
import jieba
import json
import zipfile
from flask import Flask, request, make_response
from urllib.parse import quote
import io
from platform_zzsn.settings import BASE_DIR
# 开始加载情感词典
print('开始加载情感词典 ...')
reverse_words = ['车道偏离'] # 屏蔽词
negdict = [] # 消极情感词典
posdict = [] # 积极情感词典
nodict = [] # 否定词词典
plusdict = [] # 程度副词词典
sentiment_base_dir = os.path.join(BASE_DIR, 'static/base/sentiment_dict')
sl = pd.read_csv(os.path.join(sentiment_base_dir, '中文金融词典/dict/formal_neg.txt'), header=None, encoding='utf-8')
for i in range(len(sl[0])):
negdict.append(sl[0][i])
# sl = pd.read_csv('情感极性词典/正面情绪词.txt', header=None, encoding='utf-8')
sl = pd.read_csv(os.path.join(sentiment_base_dir, '中文金融词典/dict/formal_pos.txt'), header=None, encoding='utf-8')
for i in range(len(sl[0])):
posdict.append(sl[0][i])
sl = pd.read_csv(os.path.join(sentiment_base_dir, '情感极性词典/否定词.txt'), header=None, encoding='utf-8')
for i in range(len(sl[0])):
nodict.append(sl[0][i])
sl = pd.read_csv(os.path.join(sentiment_base_dir, '情感极性词典/程度副词.txt'), header=None, encoding='utf-8')
for i in range(len(sl[0])):
plusdict.append(sl[0][i])
print('情感词典加载完成!')
# 加载情感词典结束
for w in ['非公开', '非流动', '车联网', '网联化', '智能网联化', '智能网联', '新能源', '共享化']:
jieba.add_word(w)
def clean_blank_lines(text):
"""
清理多余空行
:param text:
:return:
"""
text_ = re.sub('[\n]+', '\n', text.replace('\t', '').replace('\r', ''))
return text_
def repaire_table_of_content(content):
"""
修复目录格式
eg:
第一节重要提示、 第一节 重要提示、
第四节九、公司未 第四节 九、公司未
第一节重要提示、 第一节 重要提示、
第二节公司简介和 第二节 公司简介和
第三节公司业务概 第三节 公司业务概
"""
chapter = re.findall(r'第\S{1,2}节\S{,5}', content)
for i in chapter:
i_s = i.split('节')
new_cha = i_s[0] + '节' + ' ' + i_s[-1]
# print(i, new_cha)
content = content.replace(i, new_cha)
return content
def filter4sentences(filter_keywords, sentences):
"""
:param filter_keywords: 过滤词
:param sentences: 待过滤的句子
:return:
"""
sentences_success = []
total = 0
for i in tqdm(sentences):
total += len(i['句子'])
for sent in i['句子']:
for w in filter_keywords:
if w in sent:
# print('success +1')
sentences_success.append({'年报文件名称': i['年报文件名称'],
'年份': i['年份'],
'股票简称': i['股票简称'],
'句子': sent})
break
print('句子总数:' + str(total) + ',筛选出的句子数量:' + str(len(sentences_success)))
return sentences_success
# 预测方法
def predict(s, negdict, posdict, nodict, plusdict):
p = 0
for rw in reverse_words: # 去掉文本中的屏蔽词
if rw in s:
s = s.replace(rw, '')
sd = list(jieba.cut(s)) # 分词
temp = {'积极词': [], '消极词': [], '副词': [], '否定词': []}
for i in range(len(sd)):
if sd[i] in negdict:
if i > 0 and sd[i - 1] in nodict:
p = p + 1
temp['消极词'].append((i, sd[i]))
temp['否定词'].append((i - 1, sd[i - 1]))
elif i > 0 and sd[i - 1] in plusdict:
p = p - 2
temp['消极词'].append((i, sd[i]))
temp['副词'].append((i - 1, sd[i - 1]))
else:
p = p - 1
temp['消极词'].append((i, sd[i]))
elif sd[i] in posdict:
if i > 0 and sd[i - 1] in nodict:
p = p - 1
temp['积极词'].append((i, sd[i]))
temp['否定词'].append((i - 1, sd[i - 1]))
elif i > 0 and sd[i - 1] in plusdict:
p = p + 2
temp['积极词'].append((i, sd[i]))
temp['副词'].append((i - 1, sd[i - 1]))
elif i > 0 and sd[i - 1] in negdict:
p = p - 1
temp['积极词'].append((i, sd[i]))
temp['消极词'].append((i - 1, sd[i - 1]))
elif i < len(sd) - 1 and sd[i + 1] in negdict:
p = p - 1
temp['积极词'].append((i, sd[i]))
temp['消极词'].append((i + 1, sd[i + 1]))
else:
p = p + 1
temp['积极词'].append((i, sd[i]))
elif sd[i] in nodict:
p = p - 0.5
temp['否定词'].append((i, sd[i]))
temp_u = {}
for k, v in temp.items():
temp_u[k] = list(set(v))
return p, sd, temp_u
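# 使用示例(仅作参考):predict返回(得分, 分词结果, 命中的情感词信息),
# 得分大于0判定为正面,小于0判定为负面,等于0判定为无情感,例如:
#   score, words, hits = predict('公司营业收入大幅增长', negdict, posdict, nodict, plusdict)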
def get_echart_line_map(years_range_list, positive_count_list, negative_count_list, title='数字化转型正负面趋势分析'):
"""
生成折线图
:param years_range_list: 年份区间
:param positive_count_list: 正面数量
:param negative_count_list: 负面数量
:param title 标题
:return:
"""
with open('input_data/echart_line_template.html', 'r') as file:
html = file.read()
html = html.replace('years_range_list', str(years_range_list))
html = html.replace('positive_count_list', str(positive_count_list))
html = html.replace('negative_count_list', str(negative_count_list))
with open(os.path.join('outputs', title + '_' + 'index.html'), 'w') as file2:
file2.write(html)
def sentiment_analysis(years_range_list, sentences, path):
"""
:param years_range_list: 分析年份区间
:param sentences: 多个句子
:return:
"""
mydata = pd.DataFrame(data=sentences)
len1 = len(mydata)
mydata.drop_duplicates(subset=['句子'], inplace=True)
mydata.reset_index(drop=True, inplace=True)
print('去重数量为:' + str(len1 - len(mydata)) + ',剩余条数:' + str(len(mydata)))
tol = 0
# mydata['pred'] = 0
for i in tqdm(range(len(mydata))):
tol = tol + 1
score, sd, info = predict(mydata.loc[i, '句子'], negdict, posdict, nodict, plusdict)
mydata.loc[i, '分词'] = ' '.join(sd)
if score > 0:
mydata.loc[i, 'pred'] = 1 # 积极
mydata.loc[i, 'info'] = json.dumps(info, ensure_ascii=False)
elif score < 0:
mydata.loc[i, 'pred'] = 0 # 消极
mydata.loc[i, 'info'] = json.dumps(info, ensure_ascii=False)
else:
mydata.loc[i, 'pred'] = -2 # 无情感
mydata.loc[i, 'info'] = json.dumps(info, ensure_ascii=False)
print(mydata.head(10))
mydata.to_excel(os.path.join(path, '分析结果_test.xlsx'), index=False, columns=['年报文件名称', '年份', '股票简称',
'句子', '分词', 'pred', 'info'])
mydata_year_set = set(mydata['年份'].tolist())
positive_count = []
negative_count = []
for year in years_range_list:
if year in mydata_year_set:
# 取数据
df_current = mydata[mydata['年份'] == year]
positive_count.append(len(df_current[df_current['pred'] == 1]))
negative_count.append(len(df_current[df_current['pred'] == 0]))
else:
positive_count.append(0)
negative_count.append(0)
# get_echart_line_map(years_range_list, positive_count, negative_count)
return True
def process_v2(IndustryCode='36', start_year=2016, stop_year=2020, path='./'):
"""
从数据库里检索符合条件的内容
:param IndustryCode: 行业大类代码
:param start_year: 起始年份
:param stop_year: 终止年份
:return:
"""
year_range = [str(y) for y in range(int(start_year), int(stop_year) + 1)]
guanliceng_content = [] # 存储管理层文本
success = 0 # 统计成功提取数量
# data_root_path = '../../../东方财富网/' # .txt格式年报数据根路径
filter_keywords_path = os.path.join(sentiment_base_dir, '数字化转型_词库.xlsx')
types = {'行业大类代码': str, '上市公司代码': str}
df = pd.read_excel(os.path.join(sentiment_base_dir, '截至2021年2季度上市公司_4352家【From证监会】.xlsx'), dtype=types)
IndustryCode2info = {}
for idx, row in df.iterrows():
if row['行业大类代码'] not in IndustryCode2info:
IndustryCode2info[row['行业大类代码']] = dict(row)
print('行业大类数量:' + str(len(IndustryCode2info)))
# print('数据库中,行业大类数量:' + str(len(os.listdir(data_root_path))))
connect = cx_Oracle.connect('cis', 'cis_zzsn9988', '114.116.91.1:1521/ORCL')
cursor = connect.cursor()
if IndustryCode in df['行业大类代码'].tolist():
df_useful = df[df['行业大类代码'] == IndustryCode]
print(df_useful.head(10))
IndustryName = IndustryCode2info[IndustryCode]['行业大类名称']
print('正在分析:' + IndustryName + ' ...')
sql_str = "SELECT TITLE, COMPLETE_SENTENCES, YEAR, STOCK_NAME FROM COMPANY_ANNUAL_REPORT WHERE INDUSTRY_CODE='%s' AND YEAR BETWEEN '%s' AND '%s'" % (
str(IndustryCode), str(start_year), str(stop_year))
print('\n' + sql_str + '\n\n')
cursor.execute(sql_str)
data = cursor.fetchall()
cursor.execute('commit')
print(data[0: 10])
print('step2:完整句子提取【开始】 ...')
complete_sentences = []
for i in tqdm(data):
sentences_bytes = i[1].read()
sentences_str = sentences_bytes.decode('utf8') # 对字节解码,转为str类型
# content_bytes2 = content_str.encode('utf8') # 对str类型编码,转为字节类型
temp_dict = {'年报文件名称': i[0],
'年份': i[2],
'股票简称': i[3],
'句子': sentences_str.split('<sep>')}
complete_sentences.append(temp_dict)
# print(json.dumps(temp_dict, ensure_ascii=False, indent=2))
# json.dump(complete_sentences, open(save_path, 'w'),
# ensure_ascii=False, indent=2)
print('step2:完整句子提取【完成】')
print('step3:符合条件的句子筛选【开始】 ...')
keywords = []
for sheet_name in ['来源【政策库】', '来源【模型推荐】']:
df1 = pd.read_excel(filter_keywords_path, sheet_name=sheet_name)
if sheet_name == '来源【政策库】':
df1 = df1[df1['label'] == 1]
keywords.extend(df1['关键词'].to_list())
sentences_useful = filter4sentences(keywords, complete_sentences)
print(sentences_useful[0: 5])
# for item in sentences_useful:
# print(item)
print('step3:符合条件的句子筛选【完成】')
print('step4: 正负面预测开始 ...')
results = sentiment_analysis(year_range, sentences_useful, path)
print('step4: 正负面预测完成!')
# print('step5: 打包结果 ...')
# zip_directory('outputs', 'results_zip/分析结果.zip')
# print('step5: 结果打包完成')
return {"success": 1, "msg": "{} 行业分析完成!".format(IndustryCode)}
else:
print('Sorry! IndustryCode 不在“行业大类代码”中,请检查!')
return {"success": 0, "msg": "IndustryCode {} 不在行业大类代码中,请检查!".format(IndustryCode)}
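# 使用示例(仅作参考,参数取值为假设值):
#   process_v2(IndustryCode='36', start_year=2016, stop_year=2020, path='/tmp')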
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/8/13 9:26
# @Author : 程婷婷
# @FileName: scenario.py
# @Software: PyCharm
import os
import json
import requests
import pandas as pd
from collections import Counter
from requests.adapters import HTTPAdapter
from scenario_service.views import cv_tfidf
from model.base.views import utils
def post_project_info(url, title, content):
payload={'title':title,'content':content}
headers = {
'Content-Type': 'application/json'
}
response = requests.request('POST', url, headers=headers, data=json.dumps(payload))
data = json.loads(response.text)
print(data)
return data
def post_speech(url, title, content):
payload = {'title': title,
'content': content}
files = [
]
headers = {}
response = requests.request("POST", url, headers=headers, data=payload, files=files)
data = json.loads(response.text)
return data
def post_report(url):
response = requests.request("GET", url)
data = json.loads(response.text)
return data
def post_similarity_duplicate(url, text_1, text_2, sim):
payload = {'text_1': text_1, 'text_2': text_2, 'sim': sim}
headers = {
'Content-Type': 'application/json'
}
response = requests.request('POST', url, headers=headers, data=json.dumps(payload))
data = json.loads(response.text)
return data
def post_extraction_company(url, file_name, company_file_name):
payload = {'file_name': file_name, 'company_file_name': company_file_name}
files = [
]
headers = {
'token': '1',
}
response = requests.request('POST', url, headers=headers, data=payload, files=files)
data = json.loads(response.text)
return data
def post_br_single_file(url, file_name):
payload = {'file_path': file_name}
headers = {
'Content-Type': 'application/json'
}
s = requests.session()
s.mount('http://', HTTPAdapter(max_retries=3))
response = s.request('POST', url, headers=headers, data=json.dumps(payload), timeout=60*60*10)
data = json.loads(response.text)
return data
def post_stock_recruitment_predict(url, file_name):
payload = {'file_name': file_name}
files = [
]
headers = {
'token': '1',
}
response = requests.request('POST', url, headers=headers, data=payload, files=files)
data = json.loads(response.text)
return data
def cv_tfidf_keywords(download_path, pending_file, user_file):
file_type = pending_file.split('.')[-1]
if (file_type == 'docx') or (file_type == 'doc'):
doc_text_list = utils.read_docx(pending_file, user_file)
doc_text_list = utils.merge_para(doc_text_list)
else:
# print('运行xlsx文件')
doc_text_list = utils.read_excel(pending_file, user_file)
# print(doc_text_list)
corpus, all_words, = [], []
for para in doc_text_list:
words = utils.filter_stopwords(para)
all_words.extend(words)
corpus.append(' '.join(words))
print("len(corpus):" + str(len(corpus)))
word2count = Counter(all_words)
word, weight = cv_tfidf.cv_tfidf(corpus)
word_tf, keyword, word_weight = cv_tfidf.get_word_tf_frequency(word, weight, word2count)
out_df = pd.DataFrame(columns=['词', '词频', 'tfidf'])
out_df['词'] = keyword
out_df['词频'] = word_weight
out_df['tfidf'] = word_tf
out_df.drop_duplicates(subset=['词', '词频'], inplace=True)
out_df.to_excel(os.path.join(download_path, 'result.xlsx'), index=False, encoding='utf-8')
return out_df
import os
import time
import datetime
from django.http import JsonResponse
from django.views.decorators.http import require_POST
import pandas as pd
from scenario_service.views import scenario, positive_negative_judgment_base_emotion_words
from model.base.views.token_authorize import *
from model.base.models import ServiceManage
from platform_zzsn.settings import MEDIA_ROOT
UPLOAD_FOLDER = MEDIA_ROOT
# Create your views here.
@require_POST
@login_required
def project_info_filter_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
title = request.POST['title']
content = request.POST['content']
url = 'http://114.116.49.86:7000/br/classification/project_info/filter/pred'
result = scenario.post_project_info(url, title, content)
if result['resultData']['label']:
result['resultData']['label'] = '一带一路项目信息'
else:
result['resultData']['label'] = '非一带一路项目信息'
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def stock_recruitment_filter_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
title = request.POST['title']
content = request.POST['content']
url = 'http://localhost:7005/classification/rc/f_zp_gp/pred'
result = scenario.post_project_info(url, title, content)
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def project_info_extraction_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
title = request.POST['title']
content = request.POST['content']
url = 'http://114.116.49.86:7000/br/doc_event/project_info/extraction/pred'
result = scenario.post_project_info(url, title, content)
# map the English field names in the extraction result to their Chinese display names
field_name_map = {
'pro_name': '项目名称',
'pro_country': '项目国别',
'pro_build_unit': '承建单位',
'pro_related_unit': '相关方',
'pro_money': '项目投资',
'pro_start_date': '开工日期',
'pro_location': '项目地点',
'pro_state': '项目状态',
'pro_capacity': '项目产能',
'pro_background': '项目背景',
'pro_brief': '项目简介',
'pro_content': '项目内容',
'pro_significance': '项目意义',
'pro_city': '项目城市',
'pro_owner_unit': '业主单位',
'pro_run_mode': '执行方式',
'pro_time_limit': '建设周期',
'pro_end_date': '完工日期',
}
for en_key, cn_key in field_name_map.items():
result['resultData'][cn_key] = result['resultData'].pop(en_key)
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def doc_similarity_duplicate_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
text_1 = request.POST['text_1']
text_2 = request.POST['text_2']
sim = request.POST['sim']
url = 'http://localhost:7005/doc_sim/similarity'
result = scenario.post_similarity_duplicate(url, text_1, text_2, sim)
if result['resultData']['is_repetition']:
result['resultData']['is_repetition'] = '重复'
else:
result['resultData']['is_repetition'] = '不重复'
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def report_generator_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
title = request.POST['name']
sid = str(request.POST['sid'])
url = "http://114.116.99.6:1811/report_generator?title=%s&sid=%s" %(title, sid)
result = scenario.post_report(url)
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def extraction_speech_single(request):
token = request.META.get("HTTP_AUTHORIZATION")
title = request.POST['title']
content = request.POST['content']
url = 'http://localhost:1812/speech/'
result = scenario.post_speech(url, title, content)
regular = []
for index,row in enumerate(result['resultData']['抽取内容']):
regular.append({'person_name': list(row.keys())[0], 'person_speech': list(row.values())[0]})
result['resultData']['抽取内容'] = regular
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def extraction_speech(request):
token = request.META.get("HTTP_AUTHORIZATION")
username = request.POST['username']
pending_file = request.POST['pending_file']
path_timestamp = request.POST['path_timestamp']
url = 'http://192.168.1.149:1812/speech/'
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
create_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager = ServiceManage.objects.create(
name='专家观点、领导讲话、组织发言提取',
username=username,
create_date=create_date,
end_date=None,
state='进行中',
filenames=pending_file + ';',
path=str(path_timestamp),
)
pending_file = os.path.join(path, pending_file)
df = pd.read_excel(pending_file)
result_type, result_speech = [], []
try:
for index in range(len(df['title'])):
speech = scenario.post_speech(url, df['title'][index], df['content'][index])
result_type.append(speech['resultData']['type'])
result_speech.append(speech['resultData']['抽取内容'])
df['讲话类型'] = result_type
df['讲话内容'] = result_speech
df.to_excel(os.path.join(path, 'result.xlsx'), index=False)
except Exception as e:
print(e)
service_manager.state = '失败'
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager.end_date = end_date
service_manager.save()
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': str(e),
'resultData': False,
})
else:
service_manager.state = '已完成'
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager.end_date = end_date
service_manager.save()
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': True,
})
@require_POST
@login_required
def extraction_keywords(request):
path_timestamp = request.POST['path_timestamp']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
pending_file = request.POST['pending_file']
user_file = request.POST['user_file']
username = request.POST['username']
token = request.META.get("HTTP_AUTHORIZATION")
create_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(pending_file, user_file)
service_manager = ServiceManage.objects.create(
name='关键词挖掘',
username=username,
create_date=create_date,
end_date=None,
state='进行中',
filenames=pending_file+';'+user_file,
path=str(path_timestamp),
)
pending_file = os.path.join(path, pending_file)
user_file = os.path.join(path, user_file)
try:
scenario.cv_tfidf_keywords(path, pending_file, user_file)
except Exception as e:
print(e)
service_manager.state = '失败'
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager.end_date = end_date
service_manager.save()
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': str(e),
'resultData': False,
})
else:
service_manager.state = '已完成'
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager.end_date = end_date
service_manager.save()
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': True,
})
@require_POST
@login_required
def extraction_company(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
pending_file = request.POST['pending_file']
company_file_name = request.POST['user_file']
username = request.POST['username']
create_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(pending_file, company_file_name)
service_manager = ServiceManage.objects.create(
name='公司名称提取',
username=username,
create_date=create_date,
end_date=None,
state='进行中',
filenames=pending_file+';'+company_file_name,
path=str(path_timestamp),
)
pending_file = os.path.join(path, pending_file)
company_file_name = os.path.join(path, company_file_name)
url = 'http://localhost:7005/zzsn_platform/liyan/ex_company_name/test_file'
print(pending_file)
print(company_file_name)
result = scenario.post_extraction_company(url, file_name=pending_file, company_file_name=company_file_name)
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(result)
if result['isHandleSuccess']:
service_manager.end_date = end_date
service_manager.state = '已完成'
service_manager.save()
result['resultData'] = True
else:
service_manager.end_date = end_date
service_manager.state = '失败'
service_manager.save()
result['resultData'] = False
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def stock_recruitment_filter(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
pending_file = request.POST['pending_file']
username = request.POST['username']
create_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(pending_file)
service_manager = ServiceManage.objects.create(
name='股票招聘筛选',
username=username,
create_date=create_date,
end_date=None,
state='进行中',
filenames=pending_file+';',
path=str(path_timestamp),
)
pending_file = os.path.join(path, pending_file)
url = 'http://192.168.1.149:7001/classification/rc/f_zp_gp/test_file'
print(pending_file)
result = scenario.post_stock_recruitment_predict(url, file_name=pending_file)
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(result)
if result['isHandleSuccess']:
service_manager.end_date = end_date
service_manager.state = '已完成'
service_manager.save()
result['resultData'] = True
else:
service_manager.end_date = end_date
service_manager.state = '失败'
service_manager.save()
result['resultData'] = False
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def project_info_filter(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
pending_file = request.POST['pending_file']
username = request.POST['username']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
create_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(pending_file)
service_manager = ServiceManage.objects.create(
name='一带一路项目信息筛选',
username=username,
create_date=create_date,
end_date=None,
state='进行中',
filenames=pending_file+';',
path=str(path_timestamp),
)
pending_file = os.path.join(path, pending_file)
url = 'http://114.116.49.86:7000/br/doc_event/project_info/extraction/pred_file'
print(pending_file)
result = scenario.post_br_single_file(url, file_name=pending_file)
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(result)
if result['isHandleSuccess']:
service_manager.end_date = end_date
service_manager.state = '已完成'
service_manager.save()
result['resultData'] = True
else:
service_manager.end_date = end_date
service_manager.state = '失败'
service_manager.save()
result['resultData'] = False
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def project_info_extraction(request):
token = request.META.get("HTTP_AUTHORIZATION")
path_timestamp = request.POST['path_timestamp']
pending_file = request.POST['pending_file']
username = request.POST['username']
path = os.path.join(UPLOAD_FOLDER, path_timestamp)
create_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(pending_file)
service_manager = ServiceManage.objects.create(
name='一带一路项目要素抽取',
username=username,
create_date=create_date,
end_date=None,
state='进行中',
filenames=pending_file+';',
path=str(path_timestamp),
)
pending_file = os.path.join(path, pending_file)
url = 'http://114.116.49.86:7000/br/doc_event/project_info/extraction/pred_file'
print(pending_file)
result = scenario.post_br_single_file(url, file_name=pending_file)
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
print(result)
if result['isHandleSuccess']:
service_manager.end_date = end_date
service_manager.state = '已完成'
service_manager.save()
result['resultData'] = True
else:
service_manager.end_date = end_date
service_manager.state = '失败'
service_manager.save()
result['resultData'] = False
result['token'] = token
return JsonResponse(result)
@require_POST
@login_required
def positive_negative_judgment(request):
token = request.META.get('HTTP_AUTHORIZATION')
username = request.POST['username']
industry_code = str(request.POST['industry_code'])  # industry category code (required)
start_year = request.POST['start_year']  # start year
stop_year = request.POST['stop_year']  # end year
path_timestamp = int(round(time.time() * 1000000))
path = os.path.join(UPLOAD_FOLDER, str(path_timestamp))
if not os.path.exists(path):
os.mkdir(path)
create_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager = ServiceManage.objects.create(
name='基于情感词的正负面分析',
username=username,
create_date=create_date,
end_date=None,
state='进行中',
filenames=str(industry_code),
path=str(path_timestamp),
)
try:
result = positive_negative_judgment_base_emotion_words.process_v2(industry_code, start_year, stop_year, path)
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager.end_date = end_date
service_manager.state = '成功'
service_manager.save()
return JsonResponse({
'token': token,
'handleMsg': 'success',
'isHandleSuccess': True,
'logs': '处理成功',
'resultData': True,
})
except Exception as e:
print(e)
end_date = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
service_manager.end_date = end_date
service_manager.state = '失败'
service_manager.save()
return JsonResponse({
'token': token,
'handleMsg': 'failure',
'isHandleSuccess': False,
'logs': '处理失败',
'resultData': False,
})
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/6/15 12:10
# @Author : bruxellse_li
# @FileName: word_count.py
# @Software: PyCharm
import os
import pandas as pd
import re
import jieba
import jieba.posseg # 词性获取
import collections # 词频统计库
from base.views import utils
from platform_zzsn.settings import *
class WordCount:
def __init__(self):
# 英文词性转中文词性字典:简洁版
self.En2Cn = {
'zg': '状态词',
'a': '形容词',
'ad': '形容词',
'ag': '形容词',
'al': '形容词',
'an': '形容词',
'b': '区别词',
'bl': '区别词',
'c': '连词',
'cc': '连词',
'd': '副词',
'e': '叹词',
'eng': '英文',
'f': '方位词',
'g': '语素',
'h': '前缀',
'i': '成语',
'j': '简称略语',
'k': '后缀',
'l': '习用语',
'm': '数词',
'mq': '数量词',
'n': '名词',
'ng': '名词',
'nl': '名词',
'nr': '名词',
'nr1': '名词',
'nr2': '名词',
'nrf': '名词',
'nrfg': '名词',
'nrj': '名词',
'ns': '名词',
'nsf': '名词',
'nt': '名词',
'nz': '名词',
'o': '拟声词',
'p': '介词',
'pba': '介词',
'pbei': '介词',
'q': '量词',
'qt': '量词',
'qv': '量词',
'r': '代词',
'rg': '代词',
'rr': '代词',
'rz': '代词',
'rzs': '代词',
'rzt': '代词',
'rzv': '代词',
'ry': '代词',
'rys': '代词',
'ryt': '代词',
'ryv': '代词',
's': '处所词',
't': '时间词',
'tg': '时间词',
'u': '助词',
'ude1': '助词',
'ude2': '助词',
'ude3': '助词',
'udeng': '助词',
'udh': '助词',
'uguo': '助词',
'ule': '助词',
'ulian': '助词',
'uls': '助词',
'usuo': '助词',
'uyy': '助词',
'uzhe': '助词',
'uzhi': '助词',
'v': '动词',
'vd': '动词',
'vf': '动词',
'vg': '动词',
'vi': '动词',
'vl': '动词',
'vn': '动词',
'vshi': '动词',
'vx': '动词',
'vyou': '动词',
'w': '标点符号',
'wb': '标点符号',
'wd': '标点符号',
'wf': '标点符号',
'wj': '标点符号',
'wh': '标点符号',
'wkz': '标点符号',
'wky': '标点符号',
'wm': '标点符号',
'wn': '标点符号',
'wp': '标点符号',
'ws': '标点符号',
'wt': '标点符号',
'ww': '标点符号',
'wyz': '标点符号',
'wyy': '标点符号',
'x': '字符串',
'xu': '字符串',
'xx': '字符串',
'y': '语气词',
'z': '状态词',
'un': '未知词'
}
# 英文词性转中文词性字典:详细版
self.En2Cn_Pro = {
'a': '形容词',
'ad': '形容词-副形词',
'ag': '形容词-形容词性语素',
'al': '形容词-形容词性惯用语',
'an': '形容词-名形词',
'b': '区别词',
'bl': '区别词-区别词性惯用语',
'c': '连词',
'cc': '连词-并列连词',
'd': '副词',
'e': '叹词',
'eng': '英文',
'f': '方位词',
'g': '语素',
'h': '前缀',
'i': '成语',
'j': '简称略语',
'k': '后缀',
'l': '习用语',
'm': '数词',
'mq': '数量词',
'n': '名词',
'ng': '名词-名词性语素',
'nl': '名词-名词性惯用语',
'nr': '名词-人名',
'nr1': '名词-汉语姓氏',
'nr2': '名词-汉语名字',
'nrf': '名词-音译人名',
'nrfg': '名词-人名',
'nrj': '名词-日语人名',
'ns': '名词-地名',
'nsf': '名词-音译地名',
'nt': '名词-机构团体名',
'nz': '名词-其他专名',
'o': '拟声词',
'p': '介词',
'pba': '介词-“把”',
'pbei': '介词-“被”',
'q': '量词',
'qt': '量词-动量词',
'qv': '量词-时量词',
'r': '代词',
'rg': '代词-代词性语素',
'rr': '代词-人称代词',
'rz': '代词-指示代词',
'rzs': '代词-处所指示代词',
'rzt': '代词-时间指示代词',
'rzv': '代词-谓词性指示代词',
'ry': '代词-疑问代词',
'rys': '代词-处所疑问代词',
'ryt': '代词-时间疑问代词',
'ryv': '代词-谓词性疑问代词',
's': '处所词',
't': '时间词',
'tg': '时间词-时间词性语素',
'u': '助词',
'ude1': '助词-“的”“底”',
'ude2': '助词-“地”',
'ude3': '助词-“得”',
'udeng': '助词-“等”“等等”“云云”',
'udh': '助词-“的话”',
'uguo': '助词-“过”',
'ule': '助词-“了”“喽”',
'ulian': '助词-“连”',
'uls': '助词-“来讲”“来说”“而言”“说来”',
'usuo': '助词-“所”',
'uyy': '助词-“一样”“一般”“似的”“般”',
'uzhe': '助词-“着”',
'uzhi': '助词-“之”',
'v': '动词',
'vd': '动词-副动词',
'vf': '动词-趋向动词',
'vg': '动词-动词性语素',
'vi': '动词-不及物动词(内动词)',
'vl': '动词-动词性惯用语',
'vn': '动词-名动词',
'vshi': '动词-“是”',
'vx': '动词-形式动词',
'vyou': '动词-“有”',
'w': '标点符号',
'wb': '标点符号-百分号千分号,全角:% ‰ 半角:%',
'wd': '标点符号-逗号,全角:, 半角:,',
'wf': '标点符号-分号,全角:; 半角: ; ',
'wj': '标点符号-句号,全角:。',
'wh': '标点符号-单位符号,全角:¥ $ £ ° ℃ 半角 $',
'wkz': '标点符号-左括号,全角:( 〔 [ { 《 【 〖 〈 半角:( [ { <',
'wky': '标点符号-右括号,全角:) 〕 ] } 》 】 〗 〉 半角: ) ] { >',
'wm': '标点符号-冒号,全角:: 半角: :',
'wn': '标点符号-顿号,全角:、',
'wp': '标点符号-破折号,全角:—— -- ——- 半角:—',
'ws': '标点符号-省略号,全角:…… …',
'wt': '标点符号-叹号,全角:! 半角:!',
'ww': '标点符号-问号,全角:? 半角:?',
'wyz': '标点符号-左引号,全角:“ ‘ 『',
'wyy': '标点符号-右引号,全角:” ’ 』',
'x': '字符串',
'xu': '字符串-网址URL',
'xx': '字符串-非语素字',
'y': '语气词',
'z': '状态词',
'un': '未知词'
}
def add_customer_word(self, user_words: str):
pattern = re.compile('[,,]')  # split the user word list on Chinese or ASCII commas
user_words_list = pattern.split(user_words)
for word in user_words_list:
jieba.suggest_freq(word, tune=True)
# def combine_text(self, file_path):
# df = pd.read_excel(file_path)
# text = '。'.join(df['content'])
# return text
def analysis_data(self, string_data):
seg_list_exact = jieba.posseg.cut(string_data, HMM=True)  # exact-mode segmentation with POS tags (HMM enabled)
object_list = []
# load and apply the stopword list
stopwords_path = os.path.join(BASE_DIR, 'static/base/baidu_stopwords.txt')
with open(stopwords_path, 'r', encoding='UTF-8') as meaninglessFile:
stopwords = set(meaninglessFile.read().split('\n'))
stopwords.add(' ')
for word in seg_list_exact:  # iterate over the segmented (word, flag) pairs
if word.word not in stopwords:  # compare the surface form, not the pair object, against the stopword set
object_list.append(word)  # keep the pair so the POS flag is still available below
number = 1000
word_counts = collections.Counter(object_list)  # count the frequency of each (word, flag) pair
word_counts_top = word_counts.most_common(number)  # keep the `number` most frequent items
index = 0
out_df = pd.DataFrame(columns=['词语', '词频', '词性'])
for top_word, frequency in word_counts_top:  # unpack each pair and its frequency
try:
out_df.loc[index] = [top_word.word, frequency, self.En2Cn_Pro[top_word.flag]]
index += 1
except KeyError:
pass
return out_df
if __name__ == '__main__':
pending_file = r'C:\Users\EDZ\Desktop\data1104.xlsx'
user_file = r'C:\Users\EDZ\Desktop\用户自定义词典_样例.txt'
doc_text_list = utils.read_excel(pending_file, user_file)
# print(doc_text_list)
text = '。'.join(doc_text_list)
print("len(corpus):" + str(len(text)))
wc = WordCount()
out_df = wc.analysis_data(string_data=text)
out_df.to_excel('结果.xlsx', index=False)
\ No newline at end of file
data_loader:
dataset_path: E:\working\model_train\KMeans\dataset\test.xlsx
stopwords_path: E:\working\model_train\base\dataset\stopwords.txt
data_process:
use_stopwords: True
tokenizer: PerceptronLexicalAnalyzer
use_dev: False
train_size: 0.8
test_size: 0.1
random_state: 2021
embedding:
size: 100
window: 5
min_count: 5
workers: 5
sg: 0
iter: 20
norm: l2
use_idf: False
smooth_idf: False
model:
model_save: E:\working\model_train\kmeans_model
n_clusters: 30
init: k-means++
n_init: 5
max_iter: 100
evaluate:
average: micro
runner:
\ No newline at end of file
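A minimal sketch of how the `model` block of this KMeans training config could be consumed, assuming PyYAML and scikit-learn are installed and the config is saved as `kmeans_config.yaml` (hypothetical file name; the project's actual training runner is not shown here):
```python
import yaml
from sklearn.cluster import KMeans

# Load the training configuration (hypothetical file name).
with open('kmeans_config.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# Map the `model` section onto scikit-learn's KMeans estimator.
m = cfg['model']
kmeans = KMeans(
    n_clusters=m['n_clusters'],  # 30 in the config above
    init=m['init'],              # 'k-means++'
    n_init=m['n_init'],          # 5
    max_iter=m['max_iter'],      # 100
)
print(kmeans)
```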
"""
"""
年报正式用语词典
dict/formal_pos.txt 正式用语正面情绪词典
dict/formal_neg.txt 正式用语负面情绪词典
"""
from cnsenti import Sentiment
senti = Sentiment(pos='dict/formal_pos.txt',  # relative path of the positive-word txt dictionary
neg='dict/formal_neg.txt',  # relative path of the negative-word txt dictionary
merge=False,  # whether to merge cnsenti's built-in dictionary with the user-supplied one
encoding='utf-8')  # both txt files are UTF-8 encoded
# test_text = '这家公司是行业的引领者,是中流砥柱。今年的业绩非常好。'
text1 = '公司加强数字化制造顶层策划,助推核心制造能力升级,通过对内外部资源的充分调研、分析和论证,研究并制定了《航天晨光智能制造方案》,提出了公司智能信息化方案规划和智能制造产业发展路径,制定了公司智能制造及信息化建设工作计划,明确通过“三步走”的方式,实现设备资源内部优化和外部拓展 '
text2 = '1、转型升级风险:转型升级成效尚未呈现,主业市场需求持续下降公司虽大力推进转型升级工作,但成效尚未呈现,目前仍以传统装备制造业为主,产品以单件、元件为主,信息化、智能化程度不高,缺乏成套、成组、高端化、集成化的新产品'
result = senti.sentiment_count(text2)
print('sentiment_count', result)
The custom-dictionary feature of the cnsenti library can be used to measure the sentiment of annual-report text and of posts on financial social media.
> 姚加权,冯绪,王赞钧,纪荣嵘,张维. 语调、情绪及市场影响:基于金融情绪词典. 管理科学学报,2021. 24(5), 26-46.
The paper behind this project develops a Chinese financial sentiment lexicon. Existing Chinese financial sentiment dictionaries have the following shortcomings:
- most rely on general emotion adjectives, which transfer poorly to financial text
- some simply localize the English LM dictionary into a Chinese financial sentiment dictionary
- most are built largely by hand
Starting from two data sources, annual reports and social media, and using data-mining and deep-learning methods, the paper builds two families of sentiment dictionaries: one for formal language and one for informal language.
## Labeling approach
Dictionaries are usually built either by merging several existing dictionaries or by training on manually labeled data. The paper instead uses a labeling trick that achieves close to manually labeled quality without any human annotation.
### Labeling the formal-language dictionary
For the formal-language dictionary, each annual report is labeled as positive or negative according to the sign of the cumulative return over the three trading days after the report is published (a minimal sketch of this rule is shown below).
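A minimal sketch of the return-sign labeling rule, assuming a hypothetical DataFrame with a `cum_return_3d` column holding each report's 3-day post-publication cumulative return (the column and values below are illustrative, not from the paper):
```python
import pandas as pd

# Hypothetical input: one row per annual report, with its cumulative return over the
# three trading days after publication.
reports = pd.DataFrame({
    'report_id': ['A2020', 'B2020', 'C2020'],
    'cum_return_3d': [0.031, -0.012, 0.004],
})

# Positive cumulative return -> positive label (1), otherwise negative label (0).
reports['label'] = (reports['cum_return_3d'] > 0).astype(int)
print(reports)
```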
### Labeling the informal-language dictionary
The informal corpus consists of all posts about Chinese listed companies on the Xueqiu forum and the East Money stock bars, 81.3 million posts in total.
On online stock forums users often attach emoticons when posting an opinion, so many posts carry an explicit sentiment marker. Posts with such markers remove the need for manual sentiment annotation (a minimal sketch of the idea is shown below).
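A minimal sketch of emoticon-based labeling, with hypothetical marker sets (the actual emoticons used in the paper are not reproduced here):
```python
# Hypothetical emoticon marker sets; the idea is that the emoticons users attach to
# their posts act as sentiment labels, so no manual annotation is needed.
POSITIVE_MARKERS = {'[牛]', '[加油]', '[赞]'}
NEGATIVE_MARKERS = {'[跌]', '[哭]', '[亏]'}

def label_post(text):
    """Return 1 for a positive post, 0 for a negative post, None if no marker is found."""
    if any(m in text for m in POSITIVE_MARKERS):
        return 1
    if any(m in text for m in NEGATIVE_MARKERS):
        return 0
    return None

print(label_post('今天继续加仓 [牛]'))  # 1
print(label_post('又跌停了 [哭]'))      # 0
```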
<br>
For the detailed construction procedure, see the original paper. The authors have released the Chinese sentiment dictionary, which I have organized into four txt files:
- formal_pos.txt   **positive** sentiment words, formal language
- formal_neg.txt   **negative** sentiment words, formal language
- unformal_pos.txt **positive** sentiment words, informal language
- unformal_neg.txt **negative** sentiment words, informal language
<br>
## Using the Chinese financial dictionary
cnsenti supports custom dictionaries: loading different txt dictionary files lets you count sentiment words of different registers.
### Formal-language dictionary for annual reports
- dict/formal_pos.txt **positive** sentiment words, formal language
- dict/formal_neg.txt **negative** sentiment words, formal language
```python
from cnsenti import Sentiment
senti = Sentiment(pos='dict/formal_pos.txt',  # relative path of the positive-word txt dictionary
neg='dict/formal_neg.txt',  # relative path of the negative-word txt dictionary
merge=False,  # whether to merge cnsenti's built-in dictionary with the custom one
encoding='utf-8')  # both txt files are UTF-8 encoded
test_text = '这家公司是行业的引领者,是中流砥柱。今年的业绩非常好。'
result = senti.sentiment_count(test_text)
print('sentiment_count',result)
```
Run
```
sentiment_count {'words': 16, 'sentences': 2, 'pos': 3, 'neg': 0}
```
<br>
### Informal-language dictionary for financial social media
- dict/unformal_pos.txt **positive** sentiment words, informal language
- dict/unformal_neg.txt **negative** sentiment words, informal language
```python
from cnsenti import Sentiment
senti = Sentiment(pos='dict/unformal_pos.txt',  # relative path of the positive-word txt dictionary
neg='dict/unformal_neg.txt',  # relative path of the negative-word txt dictionary
merge=False,  # do not merge cnsenti's built-in dictionary with the custom one
encoding='utf-8')  # both txt files are UTF-8 encoded
test_text = '这个股票前期走势承压,现在阴跌,散户只能割肉离场,这股票真垃圾'
result = senti.sentiment_count(test_text)
print('sentiment_count',result)
```
Run
```
sentiment_count {'words': 18, 'sentences': 1, 'pos': 0, 'neg': 2}
```
<br>
## Notes
If you use the dictionaries in this project, please cite the following reference:
> 姚加权,冯绪,王赞钧,纪荣嵘,张维. 语调、情绪及市场影响:基于金融情绪词典. 管理科学学报,2021. 24(5), 26-46.
data_process:
label_encode: true
random_state: 2021
test_file_path: test.txt
test_size: 0.1
tokenizer: jieba
train_file_path: train.txt
train_size: 0.8
use_dev: false
use_stopwords: true
embedding:
name: null
embedding_path: null
tokenizer_path: null
evaluate:
average: binary
model:
autotuneDuration: 100
autotuneModelSize: 200M
model_name: fxxl_model.bin
model_path: null
runner:
thres: null
data_process:
use_dev: False
train_size: 0.7
test_size: 0.2
random_state: 2021
label_encode: False
embedding:
pretrained_name: bert-base-chinese
embedding_path: null
tokenizer_path: null
model:
model_name: fxxl_model
model_path: null
evaluate:
average: micro
runner:
thres: null
\ No newline at end of file
data_process:
use_stopwords: True
tokenizer: PerceptronLexicalAnalyzer
random_state: 2021
embedding:
use_Tencent: True
size: 100
window: 5
min_count: 5
workers: 5
sg: 0
iter: 20
norm: l2
use_idf: True
smooth_idf: True
with_feature_selection: False
embedding_path: voc/
tokenizer_path: null
model:
model_path: null
model_name: kmeans_model.pkl
n_clusters: False
init: k-means++
n_init: 5
max_iter: 100
evaluate:
runner:
save_fname: results2.xlsx
data_process:
use_stopwords: True
use_dev: False
train_size: 0.8
test_size: 0.1
random_state: 2021
embedding:
name: fxxl
title_weight: 5
title_feature_ratio: 0.1
content_feature_ratio: 0.2
tokenizer_path: null
embedding_path: vocab/
model:
name: fxxl
r: 0.95
model_name: null
model_path: model/
evaluate:
average: binary
runner:
thres: 0.55
\ No newline at end of file
data_process:
use_stopwords: True
tokenizer: jieba
use_dev: False
train_size: 0.8
test_size: 0.1
random_state: 2021
min_content: 50
embedding:
embedding_path: bert_CNNVectorize_data
tokenizer_path: tokenizer_data.pkl
model:
input_shape: 3500
batch_size: 32
epochs: 2
shuffle: True
activation: relu
model_name: fxxl.h5
model_path: null
evaluate:
average: binary
runner:
thres: null
\ No newline at end of file
data_process:
use_stopwords: True
tokenizer: jieba
use_dev: False
train_size: 0.8
test_size: 0.1
random_state: 2021
min_content: 50
embedding:
name: fxxl
transformer: tf
transformer_norm: l2
embedding_path: null
tokenizer_path: null
model:
lr: 0.1
reg_alpha: 0
reg_lambda: 1
objective: binary:logitraw
with_sample_weight: True
subsample: 1
thres: 0.55
min_child_weight: 1
scale_pos_weight: 1
model_name: fxxl
model_path: null
evaluate:
average: binary
runner:
thres: null
\ No newline at end of file
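The hyperparameters in the `model` block above (reg_alpha, reg_lambda, objective: binary:logitraw, scale_pos_weight) read like xgboost settings. A minimal illustrative sketch, assuming they are indeed meant for `xgboost.XGBClassifier` and that the config is saved as `fxxl_xgb.yaml` (hypothetical file name; the project's actual runner is not shown here):
```python
import yaml
from xgboost import XGBClassifier

# Load the config (hypothetical file name) and map the `model` section onto XGBClassifier.
with open('fxxl_xgb.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

m = cfg['model']
clf = XGBClassifier(
    learning_rate=m['lr'],                   # 0.1
    reg_alpha=m['reg_alpha'],                # L1 regularization, 0
    reg_lambda=m['reg_lambda'],              # L2 regularization, 1
    objective=m['objective'],                # 'binary:logitraw'
    subsample=m['subsample'],                # 1
    min_child_weight=m['min_child_weight'],  # 1
    scale_pos_weight=m['scale_pos_weight'],  # 1
)
# `model.thres` (0.55) would be applied afterwards as a decision threshold on the scores.
print(clf)
```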