import os
import re
import time
import requests
from bs4 import BeautifulSoup

# Project-local helper object. This file uses its `headers`, `db_storage`,
# `paserUrl`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local core object. This file uses it to obtain the logger
# (`getLogger`) and to handle attachments (`uptoOBS`, `tableUpdate`).
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Inner Mongolia
# Prefix joined with the part of a list-page href that follows the leading '..'.
_NMG_DETAIL_PREFIX = 'https://gzw.nmg.gov.cn/zfxxgk'
# Matched as lower-case substrings of an attachment href ('.doc' also covers '.docx').
_NMG_ATTACHMENT_MARKERS = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt')
# Seconds before an HTTP request is abandoned, so one stalled connection
# cannot hang the whole crawl.
_NMG_REQUEST_TIMEOUT = 30


def _nmg_upload_attachments(i_soup, real_href, title, num, pub_time):
    """Upload the attachments linked from one article page and return their ids.

    Args:
        i_soup: Parsed detail page.
        real_href: URL of the detail page; relative attachment links are
            resolved against it.
        title: Article title, used as the base of each uploaded file name.
        num: Running counter forwarded unchanged to ``baseCore.tableUpdate``.
        pub_time: Publish time forwarded unchanged to ``baseCore.tableUpdate``.

    Returns:
        List of attachment ids returned by ``baseCore.tableUpdate``; empty when
        the page has no matching attachment links or every upload failed.
    """
    from urllib.parse import urljoin

    id_list = []
    # NOTE: per the original author, the attachments are not part of the article
    # body (they are loaded separately), so their tags cannot be rewritten in
    # the content; they are uploaded and referenced by id instead.
    fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
    for fu_jian_re in re.findall('href="(.*?)"', str(fujian)):
        lowered = fu_jian_re.lower()
        if not any(marker in lowered for marker in _NMG_ATTACHMENT_MARKERS):
            continue
        # urljoin resolves './name.pdf' against the article's directory and
        # leaves already-absolute links untouched (the previous string
        # splitting raised IndexError for any href without './').
        fu_jian_href = urljoin(real_href, fu_jian_re)
        category = os.path.splitext(fu_jian_href)[1]
        # Always define the file name; previously it was left unset (or stale
        # from the prior attachment) when the title already held the extension.
        file_name = title if category in title else title + category
        # Upload the attachment to the file server.
        retData = baseCore.uptoOBS(fu_jian_href, '1669', file_name)
        if not retData['state']:
            continue
        att_id, _ = baseCore.tableUpdate(retData, '内蒙古自治区国资委', file_name, num, pub_time)
        id_list.append(att_id)
    return id_list


def nei_meng_gu():
    """Crawl the Inner Mongolia SASAC policy list page and forward new articles.

    For every list entry whose URL is not yet found in ``baseTool.db_storage``
    the detail page is fetched and parsed, its attachments are uploaded, and
    the resulting record is passed to ``baseTool.sendKafka``; the record is
    stored with ``baseTool.save_data`` only when that call returns a truthy
    value.

    A failure while handling one article is logged and the crawl continues
    with the next entry. A failure outside the per-article handling (list page
    fetch, duplicate lookup) is logged and ends the crawl. Nothing is returned;
    the final counter and the elapsed time are logged.
    """
    start = time.time()
    # Counts already-stored entries as well as newly processed ones; it is
    # also forwarded to baseCore.tableUpdate for each attachment.
    num = 0
    url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
    try:
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept from the original - confirm it is still required for this site.
        resp_text = requests.get(url=url, headers=baseTool.headers, verify=False,
                                 timeout=_NMG_REQUEST_TIMEOUT)
        resp_text.encoding = 'utf-8'
        soup = BeautifulSoup(resp_text.text, 'html.parser')
        result = soup.find(class_='right_two')
        if result is None:
            log.error(f'{url}------list container "right_two" not found')
            li_list = []
        else:
            li_list = result.find_all(class_='font14wr')

        for a in li_list:
            a_text = str(a)
            real_href = _NMG_DETAIL_PREFIX + a_text.split('href="..')[-1].split('" target="_blank')[0]
            # Skip entries that have already been crawled.
            if baseTool.db_storage.find_one({'网址': real_href}):
                num += 1
                continue
            title = a_text.split('target="_blank">')[-1].split('</a>')[0]
            try:
                # Fetch and parse the detail page.
                href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False,
                                         timeout=_NMG_REQUEST_TIMEOUT)
                href_text.encoding = 'utf-8'
                i_soup = BeautifulSoup(href_text.text, 'html.parser')
                # Per the original author's note: rewrite relative <a> paths
                # in the page to absolute ones.
                i_soup = baseTool.paserUrl(i_soup, real_href)

                i_result = i_soup.find('div', id='d_laiyuan')
                spans = i_result.find_all('span')
                # Publish time
                pub_time = str(spans[0]).split('<span>')[1].split('</span>')[0].replace('发布时间：', '')
                # Publishing source; also used as the issuing organ. The raw
                # '<span>...</span>' markup used to be sent in both fields.
                pub_source = str(spans[1]).split('<span>')[1].split('</span>')[0].replace('来源：', '')

                tds = i_soup.find_all('td')
                # Document number: kept only when the cell contains a
                # bracketed part ('〔...〕') and the character '内'.
                fwzh_html = str(tds[7])
                if re.search('〔(.*?)〕', fwzh_html) and '内' in fwzh_html:
                    pub_hao = fwzh_html.split('<td>')[1].split('</td>')[0]
                else:
                    pub_hao = ''
                # Written date
                writtenDate = tds[9].text
                topicClassification = tds[3].text

                i_content = i_soup.find(class_='d_show')
                if i_content is None:
                    i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
                # A missing content node is treated like empty content instead
                # of failing with an AttributeError.
                if i_content is None or i_content.text == '' or i_content.text == 'None':
                    log.info(f'{real_href}------{title}----内容为空-----')
                    continue
                content = str(i_content)

                id_list = _nmg_upload_attachments(i_soup, real_href, title, num, pub_time)

                log.info(title)
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

                # Fields sent to Kafka.
                dic_news = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': i_content.text,
                    'contentWithTag': content,
                    'createDate': time_now,
                    'deleteFlag': 0,
                    'id': '',
                    'labels': [{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}],
                    'origin': pub_source,
                    'organ': pub_source,
                    'topicClassification': topicClassification,
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': writtenDate,
                    'sid': '1697458829758697473',
                    'sourceAddress': real_href,
                    'summary': '',
                    'title': title
                }
                flag = baseTool.sendKafka(dic_news)

                if flag:
                    baseTool.save_data(dic_news)
                num = num + 1

            except Exception as e:
                # Best effort per article: report the failure and move on.
                log.error(f'{real_href}------{title}----failed: {e!r}')
    except Exception as e:
        log.error(f'{url}------crawl aborted: {e!r}')

    end = time.time()
    log.info(f'共抓取{num}条数据,共耗时{end - start}')

# Run the Inner Mongolia crawl only when this file is executed as a script.
if __name__ == "__main__":
    nei_meng_gu()