import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project helper object. This module reads its `headers` and `db_storage`
# attributes and calls its `paserUrl`, `sendKafka` and `save_data` methods.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project core object; used in this module only to obtain the shared logger.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Zhejiang Province
# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would hang the whole run.
_ZJ_REQUEST_TIMEOUT = 30

# Exceptions raised when a page does not match the layout a parser expects
# (missing tag -> AttributeError/TypeError, missing marker text -> IndexError).
_ZJ_LAYOUT_ERRORS = (AttributeError, IndexError, TypeError)


def _zj_cell_html(cell):
    """Return the inner HTML of a ``g_xxgk_td`` cell, up to its first ``</div>``."""
    return str(cell).split('"g_xxgk_td">')[1].split('</div>')[0]


def _zj_parse_xxgk(i_soup, i_info, pub_time):
    """Parse a page that carries a ``g_xxgk_td`` metadata table."""
    try:
        pub_source = _zj_cell_html(i_info[4])
        pub_hao = _zj_cell_html(i_info[2])
    except IndexError:
        # Shorter table variant: the source sits in the third cell and there
        # is no document number.
        pub_source = _zj_cell_html(i_info[2])
        pub_hao = ''
    body = i_soup.find(class_='g_content')
    return pub_source, pub_hao, pub_time, body.text, str(body)


def _zj_parse_wechat(i_soup, pub_time):
    """Parse a page using the ``rich_media_meta`` markup; it carries its own publish time."""
    source = i_soup.find('span', class_='rich_media_meta rich_media_meta_nickname')
    pub_source = source.find('a').text
    page_time = i_soup.find('em', id='publish_time').text
    body = i_soup.find(
        class_='zh_CN wx_wap_page wx_wap_desktop_fontsize_2 mm_appmsg comment_feature discuss_tab appmsg_skin_default appmsg_style_default pages_skin_pc not_in_mm')
    return pub_source, '', page_time, body.text, str(body)


def _zj_parse_ant(i_soup, pub_time):
    """Parse a page whose metadata sits in ``ant-space-item`` elements."""
    items = i_soup.find_all(class_='ant-space-item')
    pub_source = str(items[0]).split('<span>')[1].split('</span>')[0].replace('来源：', '')
    body = i_soup.find(class_='index_wrapper__L_zqV')
    return pub_source, '', pub_time, body.text, str(body)


def _zj_parse_zsy(i_soup, pub_time):
    """Parse a page with ``zsy_cotitle`` / ``zsy_comain`` containers."""
    header = i_soup.find('div', class_='zsy_cotitle').find('p').text
    pub_source = header.split('文章来源：')[1].split('发布时间：')[0]
    body = i_soup.find('div', class_='zsy_comain')
    # The replacement must run on the extracted text: calling ``.replace`` on
    # the Tag itself is treated by BeautifulSoup as a child-tag lookup and fails.
    content = body.text.replace('扫一扫在手机打开当前页', '').strip()
    contentWithTag = str(body).replace('扫一扫在手机打开当前页', '').strip()
    return pub_source, '', pub_time, content, contentWithTag


def _zj_extract_article(i_soup, pub_time):
    """Return ``(pub_source, pub_hao, pub_time, content, contentWithTag)`` for a detail page.

    ``pub_time`` is the date taken from the list page; it is returned unchanged
    unless the matching layout provides its own publish time. Raises one of
    ``_ZJ_LAYOUT_ERRORS`` when no known layout matches.
    """
    i_info = i_soup.find_all(class_='g_xxgk_td')
    if i_info:
        return _zj_parse_xxgk(i_soup, i_info, pub_time)
    for parser in (_zj_parse_wechat, _zj_parse_ant):
        try:
            return parser(i_soup, pub_time)
        except _ZJ_LAYOUT_ERRORS:
            continue
    # Last known layout: let its error propagate so the caller can log the page.
    return _zj_parse_zsy(i_soup, pub_time)


def zhe_jiang():
    """Scrape the gzw.zj.gov.cn list page and forward new articles.

    Each record found in the list page's CDATA sections is resolved to an
    article URL. URLs already present in ``baseTool.db_storage`` are skipped;
    the rest are downloaded, parsed, sent through ``baseTool.sendKafka`` and,
    when that returns a truthy flag, stored with ``baseTool.save_data``.
    A failure on one record is logged and does not stop the remaining records.
    Returns ``None``; a summary line is written to the log.
    """
    start = time.time()
    num = 0
    url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        res = requests.get(url, headers=baseTool.headers, timeout=_ZJ_REQUEST_TIMEOUT).content
        soup = BeautifulSoup(res, 'html.parser')
        list_li = re.findall(r'CDATA\[\n(.*?)\]\]></record>', str(soup))
    except Exception as err:
        log.error(f'{url}-----list page request/parse failed: {err!r}')
        list_li = []

    for raw_item in list_li:
        href = ''
        try:
            link = BeautifulSoup(raw_item, 'lxml').find('a')
            href = link['href']
            pub_time = link.find('span').text
            title = link.text.replace(pub_time, '').strip()
            if 'http' not in href:
                href = 'http://gzw.zj.gov.cn/' + href
            if baseTool.db_storage.find_one({'网址': href}):
                # Already stored earlier; counted in the summary, not re-fetched.
                num += 1
                continue

            href_text = requests.get(url=href, headers=baseTool.headers, verify=False,
                                     timeout=_ZJ_REQUEST_TIMEOUT)
            href_text.encoding = href_text.apparent_encoding
            i_soup = BeautifulSoup(href_text.text, 'html.parser')
            # Convert relative links in the page to absolute ones.
            i_soup = baseTool.paserUrl(i_soup, href)
            pub_source, pub_hao, pub_time, content, contentWithTag = _zj_extract_article(i_soup, pub_time)

            log.info(title)
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            if content == '' or content == 'None':
                log.info(f'{href}-----{title}----内容为空')
                continue
            # Fields of the message handed to Kafka.
            dic_news = {
                'attachmentIds': [],
                'author': '',
                'content': content,
                'contentWithTag': str(contentWithTag),
                'createDate': time_now,
                'deleteFlag': 0,
                'id': '',
                'labels': [{'relationId': "1672", 'relationName': "浙江省国资委", 'labelMark': "policy"}],
                'origin': pub_source,
                'organ': pub_source,
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'publishDate': pub_time,
                'writtenDate': None,
                'sid': '1697458829758697473',
                'sourceAddress': href,
                'summary': '',
                'title': title
            }
            if baseTool.sendKafka(dic_news):
                baseTool.save_data(dic_news)
                num += 1
        except Exception as err:
            # Best-effort scraping: record the failure and move on to the next item.
            log.error(f'{href}-----article processing failed: {err!r}')

    end = time.time()
    log.info(f'共抓取{num}条数据,共耗时{end - start}')

# Run the scraper once when this file is executed as a script.
if __name__ == "__main__":
    zhe_jiang()