#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2021/5/11 20:33
# @Author  : 程婷婷
# @FileName: XgboostClassifyRunner.py
# @Software: PyCharm
import numpy as np
import pandas as pd
from model.base.views.runner.BaseRunner import BaseRunner
from model.clustering.views.KMeans.KmeansModel import KmeansModel
from model.clustering.views.KMeans.data.KMeansDataLoader import KMeansDataLoader
from model.clustering.views.KMeans.data.KmeansProcess import KmeansProcess
from model.clustering.views.KMeans.KmeansEvaluator import KmeansEvaluator
import logging

# Log record layout: timestamp, level, source file path, function name, message.
# NOTE(review): the name `format` shadows the built-in `format()` inside this
# module; it is kept as-is because other modules may import it.
format = '%(asctime)s %(levelname)s %(pathname)s %(funcName)s %(message)s'
# Runs at import time: configures the root logger with INFO as its threshold.
logging.basicConfig(format=format, level=logging.INFO)


class KmeansRunner(BaseRunner):
    """Drive the K-Means pipeline: load data, process it, cluster, score, export.

    The runner wires together four project components (data loader,
    pre-processor, model and evaluator), each built from the same
    ``config_path``.
    """

    def __init__(self, config_path):
        """Build every pipeline component from a single configuration path.

        Args:
            config_path: Passed unchanged to ``BaseRunner.__init__`` and to the
                constructor of each component.
        """
        super().__init__(config_path)
        self.km = KmeansModel(config_path)
        self.kdl = KMeansDataLoader(config_path)
        self.kp = KmeansProcess(config_path)
        self.ke = KmeansEvaluator(config_path)

    def single_train(self, logger):
        """Fit the model once and rank each cluster's members by distance.

        Args:
            logger: Object exposing ``info(message)``; receives the row count
                and the evaluation score.

        Returns:
            A 4-tuple ``(model, result_sorted, similarity, df)``:

            * ``model`` - the fitted model returned by ``building_model``.
            * ``result_sorted`` - one list per cluster holding the entries of
              ``result[j]`` ordered by ascending distance to the centroid.
            * ``similarity`` - one list per cluster holding those distances in
              the same order (despite the name, smaller means closer).
            * ``df`` - the loaded DataFrame, with a ``labels`` column added
              when the input had none.

        Side effects:
            Stores the fitted labels on ``self.labels_`` and prints the score.
        """
        df = self.kdl.read_file()
        if 'labels' not in df.columns:
            # The processing step is always handed a labels column, so supply
            # an empty one when the input has none.
            df['labels'] = ''
        logger.info('处理后的数据量为 %d 条' % len(df))

        transformed_data = self.kp.runner_process(df['content'], df['labels'])
        # NOTE(review): from the way they are used below, data_cluster[j][i]
        # appears to be the feature vector of the i-th member of cluster j and
        # result[j][i] its positional row index in df - confirm against
        # KmeansModel.building_model.
        model, data_cluster, result = self.km.building_model(transformed_data)
        centroids = model.cluster_centers_
        self.labels_ = model.labels_

        result_sorted = []
        similarity = []
        for j in range(max(self.labels_) + 1):
            centroid = centroids[j]
            members = data_cluster[j]
            # Sort on the distance only, so that ties keep their original
            # relative order (sorted() is stable).
            ranked = sorted(
                ((np.linalg.norm(centroid - members[i]), row_id)
                 for i, row_id in enumerate(result[j])),
                key=lambda pair: pair[0],
            )
            result_sorted.append([row_id for _, row_id in ranked])
            similarity.append([dist for dist, _ in ranked])

        score = self.ke.compute_silhouette(X=transformed_data, labels=self.labels_)
        print('====================轮廓系数为%.4f====================' % score)
        logger.info('轮廓系数为 %.4f ' % score)
        return model, result_sorted, similarity, df

    def train(self, logger):
        """Cluster the data and write one Excel sheet per cluster.

        Sheet ``sheet<j+1>`` holds the rows of cluster ``j`` ordered by
        ascending distance to the centroid, with a trailing ``distance``
        column. The workbook path is read from
        ``self.runner_config['save_fpath']``.

        Args:
            logger: Object exposing ``info(message)``; also forwarded to
                ``single_train``.

        Returns:
            The string ``'success'`` once the workbook has been written.
        """
        _, result_sorted, similarity, df = self.single_train(logger)

        # The context manager closes the workbook even if a sheet write fails,
        # so the file handle is not leaked on error.
        with pd.ExcelWriter(self.runner_config['save_fpath']) as writer:
            for j, (row_ids, distances) in enumerate(zip(result_sorted, similarity)):
                # Select all rows of the cluster in one positional lookup
                # instead of appending them to an empty frame one at a time.
                df_out = df.iloc[list(row_ids)].reset_index(drop=True)
                # insert() with allow_duplicates appends the column at the end
                # even if df already has a column named 'distance'.
                df_out.insert(
                    len(df_out.columns),
                    'distance',
                    [float(dist) for dist in distances],
                    allow_duplicates=True,
                )
                print('第%s类有%d条数据' % (j, len(row_ids)))
                logger.info('第%s类有%d条数据' % (j, len(row_ids)))
                df_out.to_excel(writer, sheet_name='sheet' + str(j + 1), index=False)
        return 'success'


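# NOTE: stale example kept for reference - KmeansRunner requires a config_path
# argument, and this class defines no write_file method (train(logger) is the
# method that writes the output workbook).
#if __name__ == '__main__':
#    state = KmeansRunner().write_file()
#    print(state)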
