#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2021/6/1 9:58
# @Author  : 程婷婷
# @FileName: BaseDataLoader.py
# @Software: PyCharm
import pandas as pd
from base.views.config.BaseConfig import BaseConfig
class BaseDataLoader:
    """Load the dataset and the stop-word list named in a configuration file.

    The ``data_loader`` section of the parsed configuration is stored in
    ``self.data_loader_config``. The methods below read its ``dataset_path``
    and ``stopwords_path`` entries.
    """

    def __init__(self, config_path):
        """Parse the configuration and keep its ``data_loader`` section.

        Args:
            config_path: Passed unchanged to ``BaseConfig``.
        """
        self.data_loader_config = BaseConfig(config_path)._parsed_file['data_loader']

    def read_file(self):
        """Read the dataset, then drop duplicated and incomplete rows.

        The reader is chosen from the extension of ``dataset_path``
        (compared case-insensitively):

        * ``xlsx`` / ``xls`` -> ``pandas.read_excel``
        * ``csv``            -> ``pandas.read_csv`` with a TAB separator

        Returns:
            A ``DataFrame`` with rows de-duplicated on ``content`` (first
            occurrence kept), rows lacking ``content`` or ``label`` removed,
            and a fresh 0..n-1 index. For any other extension the error
            message string is printed and returned instead of a frame.
        """
        dataset_path = self.data_loader_config['dataset_path']
        # split('.')[-1] gives the extension WITHOUT its leading dot, so the
        # comparisons below must use 'csv', not '.csv'.
        extension = dataset_path.split('.')[-1].lower()
        if extension in ('xlsx', 'xls'):
            df = pd.read_excel(dataset_path)
        elif extension == 'csv':
            # The file is read as tab-separated despite the .csv extension.
            df = pd.read_csv(dataset_path, sep='\t')
        else:
            print('数据类型错误')
            return '数据类型错误'
        df = df.drop_duplicates(subset='content', keep='first')
        df = df.dropna(subset=['content', 'label'])
        df = df.reset_index(drop=True)
        print('=================执行正文去重和去空之后共有%d条数据=============' % len(df['content']))
        return df

    def read_stopwords(self):
        """Return the stop words listed in ``stopwords_path``.

        The file is read as UTF-8, one entry per line. Surrounding whitespace
        is stripped and lines that are empty after stripping are skipped.

        Returns:
            A list of stop-word strings in file order.
        """
        # The context manager closes the file handle once reading is done.
        with open(self.data_loader_config['stopwords_path'], encoding='utf8') as stopwords_file:
            stripped_lines = (line.strip() for line in stopwords_file)
            stopword_list = [word for word in stripped_lines if word != '']
        return stopword_list
