import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper object. This file uses its `headers`, `paserUrl`,
# `db_storage`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local helper object. This file uses its `uptoOBS`, `tableUpdate`
# and `getLogger` members.
from BaseCore import BaseCore
baseCore = BaseCore()
# Shared logger used by every collector function below.
log = baseCore.getLogger()

# Qinghai (青海) policy-document collector
def qing_hai():
    """Collect policy documents from gxgz.qinghai.gov.cn (label "青海省国资委").

    Two passes are executed in order:

    * ``qing_hai1`` reads the single list page ``lmid=604``; its detail pages
      keep the article in ``div.nxgz-detail-con``.
    * ``qing_hai2`` walks every page of five further list columns; their
      detail pages keep the article in ``td.yhhei15``.

    For every list row that has not been stored yet (looked up by URL in
    ``baseTool.db_storage``) and whose link contains ``.html``, the detail
    page is downloaded, attachment links are passed to ``baseCore.uptoOBS`` /
    ``baseCore.tableUpdate``, and the resulting record is handed to
    ``baseTool.sendKafka``; when that call returns a truthy value the record
    is also passed to ``baseTool.save_data``.

    The crawl is best-effort: a failing row, list page or column is logged and
    skipped instead of aborting the run. Returns ``None``.
    """
    relation_id = '1681'
    relation_name = '青海省国资委'
    sid = '1697458829758697473'
    # Seconds to wait for the server before giving up on one request; without
    # a timeout a stalled connection would block the whole run indefinitely.
    request_timeout = 60
    # NOTE(review): this prefix has no "//" after "http:" and looks malformed.
    # It is kept byte-for-byte because consumers of the stored links may
    # depend on the exact value -- confirm before changing.
    attachment_url_prefix = 'http:zzsn.luyuen.com/'
    # Substrings that mark a link as a downloadable attachment.
    # NOTE(review): the matching is a plain substring test and the set is
    # uneven ('xlsx' without a dot, no lower-case '.xls', no upper-case
    # '.PPT'); preserved as-is -- confirm whether that is intended.
    attachment_markers = (
        '.pdf', '.docx', '.doc', 'xlsx', '.zip', '.rar', '.ppt',
        '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR',
    )
    # Outcomes reported by collect_row to its caller.
    duplicate, saved, ignored = 'duplicate', 'saved', 'ignored'

    def fetch_html(url):
        """Download ``url`` and return its text, always releasing the connection."""
        res = requests.get(url=url, headers=baseTool.headers, timeout=request_timeout)
        try:
            # Let requests guess the charset from the body instead of trusting
            # the response headers.
            res.encoding = res.apparent_encoding
            return res.text
        finally:
            res.close()

    def parse_detail_div(html, durl):
        """Return ``(content_node, origin)`` for the ``lmid=604`` detail layout."""
        page = BeautifulSoup(html, 'html.parser')
        body = page.find('div', attrs={'class': 'nxgz-detail-con'})
        # NOTE(review): origin is the stringified <div class="foot-fb"> markup
        # (or the text 'None' when the div is absent), not its plain text --
        # kept unchanged; confirm this is what downstream expects.
        origin = str(page.find('div', attrs={'class': 'foot-fb'}))
        return body, origin

    def parse_detail_table(html, durl):
        """Return ``(content_node, origin)`` for the table-based detail layout."""
        page = baseTool.paserUrl(html, durl)
        body = page.find('td', attrs={'class': 'yhhei15'})
        try:
            # First whitespace-separated token of the info cell, with the
            # "来源：" label removed.
            origin = str(page.find('td', attrs={'class': 'heizi12'}).text).split()[0].replace('来源：', '')
        except (AttributeError, IndexError):
            # Info cell missing or empty.
            origin = ''
        return body, origin

    def upload_attachments(soup, num, publish_date):
        """Upload attachment links found in ``soup`` and rewrite their hrefs.

        Returns the list of attachment ids returned by ``baseCore.tableUpdate``.
        ``soup`` is modified in place.
        """
        id_list = []
        for link in soup.find_all('a'):
            file_href = link.get('href')
            if file_href is None:
                continue
            if not any(marker in file_href for marker in attachment_markers):
                continue
            file_name = link.text.strip()
            category = os.path.splitext(file_href)[1]
            # Make sure the stored file name carries the extension.
            if category not in file_name:
                file_name = file_name + category
            ret_data = baseCore.uptoOBS(file_href, relation_id, file_name)
            if not ret_data['state']:
                continue
            att_id, full_path = baseCore.tableUpdate(ret_data, relation_name, file_name, num, publish_date)
            id_list.append(att_id)
            # Point the link in the stored markup at the uploaded copy.
            link['href'] = attachment_url_prefix + str(full_path)
        return id_list

    def collect_row(tr, num, parse_detail):
        """Process one list row and report ``duplicate``, ``saved`` or ``ignored``."""
        anchor = tr.find('a')
        durl = anchor.get('href')
        if baseTool.db_storage.find_one({'网址': durl}):
            log.info('已采集----------跳过')
            return duplicate
        title = anchor.text
        # The last cell of the row holds the publish date, e.g. "2023年01月31日".
        parsed = time.strptime(tr.findAll('td')[-1].text, "%Y年%m月%d日")
        publish_date = time.strftime("%Y-%m-%d %H:%M:%S", parsed)
        if '.html' not in durl:
            return ignored
        body, origin = parse_detail(fetch_html(durl), durl)
        # A missing container used to be stringified to 'None' and published
        # as if it were content; treat it as empty instead.
        if body is None:
            log.info(f'-----{durl}----{title}----内容为空-----')
            return ignored
        # Re-parse only the content subtree so the stored markup contains the
        # article and nothing else from the page.
        soup = BeautifulSoup(str(body), 'html.parser')
        content = soup.text
        if not content:
            log.info(f'-----{durl}----{title}----内容为空-----')
            return ignored
        id_list = upload_attachments(soup, num, publish_date)
        content_with_tag = str(soup.prettify())
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_news = {
            'attachmentIds': id_list,
            'author': '',
            'content': str(content),
            'contentWithTag': str(content_with_tag),
            'createDate': time_now,
            'deleteFlag': 0,
            'id': '',
            'labels': [{'relationId': "1681", 'relationName': "青海省国资委", 'labelMark': "policy"}],
            'origin': origin,
            'organ': "",
            'topicClassification': "",
            'issuedNumber': '',
            'publishDate': publish_date,
            'writtenDate': None,
            'sid': sid,
            'sourceAddress': durl,
            'summary': '',
            'title': title
        }
        if baseTool.sendKafka(dic_news):
            baseTool.save_data(dic_news)
            return saved
        return ignored

    def collect_rows(tr_list, num, parse_detail):
        """Process every row; return the updated ``num`` and the saved count.

        ``num`` grows by one for each duplicate and each saved row and is the
        running value passed on to ``baseCore.tableUpdate``.
        """
        saved_count = 0
        for tr in tr_list:
            try:
                outcome = collect_row(tr, num, parse_detail)
            except Exception as e:
                # Best effort: one broken row must not stop the others, but
                # the failure is reported instead of silently dropped.
                # Assumes `log` is a standard logging.Logger -- TODO confirm.
                log.error(f'qing_hai: failed to process row: {e!r}')
                continue
            if outcome == duplicate:
                num += 1
            elif outcome == saved:
                num += 1
                saved_count += 1
        return num, saved_count

    def qing_hai1():
        """Crawl the single list page ``lmid=604``."""
        count = 0
        start_time = time.time()
        url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604'
        try:
            soup = baseTool.paserUrl(fetch_html(url_mode), url_mode)
            tr_list = soup.findAll('tr', attrs={'class': 'yhhei13'})
            _, count = collect_rows(tr_list, 0, parse_detail_div)
        except Exception as e:
            log.error(f'qing_hai1: failed to crawl {url_mode}: {e!r}')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def qing_hai2():
        """Crawl every page of the five paginated list columns."""
        count = 0
        start_time = time.time()
        urls = [
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=559',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=64',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=65',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=558', ]
        for url_mode in urls:
            try:
                first_page = BeautifulSoup(fetch_html(url_mode), 'html.parser')
                # The span with id "center_wj1_Label5" carries the page count.
                page_numbers = int(str(first_page.find('span', attrs={'id': 'center_wj1_Label5'}).text))
                # The running number restarts for every column.
                num = 0
                for page_number in range(1, page_numbers + 1):
                    url = url_mode + f'&pages={page_number}'
                    page = baseTool.paserUrl(fetch_html(url), url)
                    tr_list = page.findAll('tr', attrs={'class': 'yhhei13'})
                    num, saved_count = collect_rows(tr_list, num, parse_detail_table)
                    count += saved_count
            except Exception as e:
                # A failing list page abandons the rest of this column only.
                log.error(f'qing_hai2: failed to crawl {url_mode}: {e!r}')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    qing_hai1()
    qing_hai2()

# Run the Qinghai collector only when this file is executed as a script.
if __name__ == "__main__":
    qing_hai()