import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project helper object. In this file it supplies the request headers
# (`headers`), the lookup used to skip already-stored URLs
# (`db_storage.find_one`), HTML parsing (`paserUrl`), and the delivery /
# persistence calls (`sendKafka`, `save_data`).
from ClassTool import ClassTool
baseTool = ClassTool()

# Project core object. In this file it supplies the logger (`getLogger`),
# attachment upload (`uptoOBS`) and attachment bookkeeping (`tableUpdate`).
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# 河北
def he_bei():
    """Crawl the Hebei SASAC ('河北省国资委') document feed and forward new records.

    The JSON feed is fetched once. For every record whose detail URL is not
    yet present in ``baseTool.db_storage``, the embedded HTML is parsed,
    attachment links are uploaded and rewritten, an issued number is
    extracted from the text, and the assembled record is sent through
    ``baseTool.sendKafka`` and, on success, stored with
    ``baseTool.save_data``.

    Errors are logged instead of being silently discarded: a failed feed
    request ends the run with zero records, while a failure on a single
    record only skips that record.

    Returns:
        None. The number of stored records and the elapsed time are logged.
    """
    # `num` counts skipped + stored records and is also handed to
    # baseCore.tableUpdate for each attachment; `count` counts stored records.
    num = 0
    count = 0
    start_time = time.time()
    url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
    try:
        # Headers must be passed by keyword: the second positional parameter
        # of requests.get is `params` (the query string), not `headers`.
        res = requests.get(url, headers=baseTool.headers)
        records = res.json()
    except Exception as e:
        log.error(f'-----{url}----failed to fetch document list: {e!r}-----')
        records = []

    for info in records:
        href = ''
        try:
            title = info['title']
            href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(info['id'])
            # Skip documents that were already stored in a previous run.
            if baseTool.db_storage.find_one({'网址': href}):
                num += 1
                continue
            # 'updated' is divided by 1000 before conversion, i.e. it is
            # treated as a millisecond timestamp.
            publishDate = time.strftime("%Y-%m-%d", time.localtime(round(info['updated'] / 1000)))
            origin = ''
            soup = baseTool.paserUrl(str(info['content']), href)
            links = soup.find_all('a')
            id_list = _upload_attachments(links, num, publishDate)
            # Serialise after the attachment hrefs have been rewritten.
            contentWithTag = str(soup.prettify())
            if not contentWithTag and not links:
                continue
            content = soup.text
            if not content:
                log.info(f'-----{href}----{title}----内容为空-----')
                continue
            issuedNumber = _extract_issued_number(content)
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Fields of the record passed to baseTool.sendKafka.
            dic_news = {
                'attachmentIds': id_list,
                'author': '',
                'content': str(content),
                'contentWithTag': str(contentWithTag),
                'createDate': time_now,
                'deleteFlag': 0,
                'id': '',
                'labels': [{'relationId': "1668", 'relationName': "河北省国资委", 'labelMark': "policy"}],
                'origin': origin,
                'organ': "",
                'topicClassification': "",
                'issuedNumber': issuedNumber,
                'publishDate': publishDate,
                'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
                'title': title
            }
            if baseTool.sendKafka(dic_news):
                baseTool.save_data(dic_news)
                num += 1
                count += 1
        except Exception as e:
            # One malformed record must not abort the remaining ones.
            log.error(f'-----{href}----failed to process record: {e!r}-----')
            continue
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')


# Substrings that mark a link as a downloadable attachment; compared against
# the lower-cased href. '.doc' also covers '.docx' and '.xls' covers '.xlsx'.
# The dot-less 'xlsx' is kept so every link the previous check accepted is
# still accepted.
_ATTACHMENT_MARKERS = ('.pdf', '.doc', '.xls', 'xlsx', '.zip', '.rar', '.ppt')

# First alternative: numbers starting with '冀国'; second: starting with '国资'.
_ISSUED_NUMBER_RE = re.compile(r'(冀国.{1,}?号)|(国资.{1,}?号)')


def _upload_attachments(links, order, publish_date):
    """Upload attachment links via baseCore and rewrite their href in place.

    Args:
        links: the ``<a>`` tags found in the parsed document.
        order: value forwarded unchanged to ``baseCore.tableUpdate``.
        publish_date: publish date string forwarded to ``baseCore.tableUpdate``.

    Returns:
        list: attachment ids returned by ``baseCore.tableUpdate``, in link order.
    """
    att_ids = []
    for link in links:
        file_href = link.get('href')
        if not file_href:
            continue
        lowered = file_href.lower()
        if not any(marker in lowered for marker in _ATTACHMENT_MARKERS):
            continue
        file_name = link.text.strip()
        # Make sure the stored name carries the extension of the linked file.
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '1668', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, order, publish_date)
        att_ids.append(att_id)
        # NOTE(review): the prefix has no '//' after the scheme, which looks
        # like a typo for 'http://obs.ciglobal.cn/'. Left unchanged because
        # consumers of the stored HTML are not visible here -- confirm and fix.
        link['href'] = 'http:obs.ciglobal.cn/' + str(full_path)
    return att_ids


def _extract_issued_number(content):
    """Return the first issued number found in *content*, or '' if none fits.

    The whole match is used, so a number matched by either alternative of the
    pattern is returned. Matches longer than 20 characters are treated as
    false positives and discarded.
    """
    match = _ISSUED_NUMBER_RE.search(content)
    if match is None:
        return ''
    issued_number = match.group(0)
    return issued_number if len(issued_number) <= 20 else ''

# Run the Hebei crawl only when this file is executed directly as a script.
if __name__ == '__main__':
    he_bei()