#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2021/8/9 11:19
# @Author  : 程婷婷
# @FileName: utils.py
# @Software: PyCharm
import os
import re
import jieba
import zipfile
import pandas as pd
from docx import Document
from platform_zzsn.settings import *


def read_txt(path):
    """Read a UTF-8 text file and return its lines.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(path, mode='r', encoding='utf8') as handle:
        # Iterating a text file yields the same lines as readlines().
        return list(handle)

def read_docx(pending_file, user_file):
    """Extract the non-empty paragraphs of a .docx document.

    As a side effect, ``user_file`` is loaded into jieba as a user
    dictionary before the document is read.

    Args:
        pending_file: The document handed to ``docx.Document``.
        user_file: User dictionary handed to ``jieba.load_userdict``.

    Returns:
        A list of paragraph texts with every whitespace character removed;
        paragraphs that end up empty are left out.
    """
    jieba.load_userdict(user_file)
    paragraphs = Document(pending_file).paragraphs
    # Remove all whitespace (not only leading/trailing) from each paragraph.
    compacted = (re.sub(r'\s', '', paragraph.text) for paragraph in paragraphs)
    return [text for text in compacted if text]

def read_excel(pending_file, user_file):
    """Read the ``content`` column of an Excel file, dropping missing rows.

    As a side effect, ``user_file`` is loaded into jieba as a user
    dictionary before the spreadsheet is read.

    Args:
        pending_file: The spreadsheet handed to ``pandas.read_excel``.
        user_file: User dictionary handed to ``jieba.load_userdict``.

    Returns:
        The ``content`` column as a pandas Series without missing values.
    """
    jieba.load_userdict(user_file)
    sheet = pd.read_excel(pending_file)
    return sheet['content'].dropna()

def merge_para(paras, min_len=500):
    """Concatenate consecutive paragraphs into chunks of at least ``min_len``.

    Each paragraph is appended to the current chunk for as long as that
    chunk is shorter than ``min_len`` characters; once the chunk has reached
    the threshold, the next paragraph starts a new chunk. The final chunk
    may therefore be shorter than ``min_len``.

    Args:
        paras: Iterable of paragraph strings.
        min_len: Length a chunk must reach before a new chunk is started.
            Defaults to 500.

    Returns:
        A new list of merged paragraph strings, in the original order.
    """
    merged = []
    for para in paras:
        if merged and len(merged[-1]) < min_len:
            # The current chunk is still too short: keep growing it.
            merged[-1] += para
        else:
            merged.append(para)
    return merged

def filter_stopwords(para):
    """Tokenize ``para`` with jieba and drop every stopword token.

    The stopwords are read on each call from
    ``static/base/baidu_stopwords.txt`` under ``BASE_DIR``, one stopword per
    line; surrounding whitespace is stripped and blank lines are ignored.

    Args:
        para: Text to tokenize.

    Returns:
        The list of tokens from ``jieba.lcut(para)`` that are not stopwords,
        in their original order.
    """
    path = os.path.join(BASE_DIR, 'static/base/baidu_stopwords.txt')
    # A set makes each membership test O(1) instead of scanning a list.
    stopwords = {line.strip() for line in read_txt(path)}
    stopwords.discard('')
    return [word for word in jieba.lcut(para) if word not in stopwords]

def takeSecond(elem):
    """Return the second element (index 1) of ``elem``."""
    second = elem[1]
    return second

def takeFirst_len(elem):
    """Return the length of the first element (index 0) of ``elem``."""
    first = elem[0]
    return len(first)

def make_zip(file_dir: str, zip_path: str) -> None:
    """Write every file found under ``file_dir`` into a new zip archive.

    Archive member names are the file paths with the parent directory of
    ``file_dir`` cut off, so members are normally prefixed with the base
    name of ``file_dir``. If ``file_dir`` ends with a path separator,
    ``os.path.dirname`` returns the directory itself and member names are
    relative to ``file_dir`` instead. Only files are written; empty
    directories produce no entry.

    Args:
        file_dir: Directory whose files are archived recursively.
        zip_path: Path of the archive to create; an existing file is
            overwritten.
    """
    pre_len = len(os.path.dirname(file_dir))
    # The context manager closes the archive even if walking or writing fails.
    with zipfile.ZipFile(zip_path, 'w') as zip_f:
        zip_real = os.path.realpath(zip_path)
        for parent, _dir_names, filenames in os.walk(file_dir):
            for filename in filenames:
                path_file = os.path.join(parent, filename)
                # Never add the archive to itself when it lives in file_dir.
                if os.path.realpath(path_file) == zip_real:
                    continue
                arc_name = path_file[pre_len:].strip(os.path.sep)
                zip_f.write(path_file, arc_name)

