import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper object, instantiated once at import time.
# NOTE(review): `baseTool` is not referenced anywhere else in the visible
# code — confirm whether the scraper is meant to go through it.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local core object; in this file it is only used to obtain the
# module-wide logger `log`.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Shandong
_SD_CONTENT_CLASS = "wz_zoom scroll_cont ScrollStyle"
# Seconds before an HTTP request is abandoned, so a stalled server cannot
# hang the whole crawl.
_SD_REQUEST_TIMEOUT = 30


def _sd_has_bracketed_number(text):
    """Return True when *text* contains both '〔' and '〕'."""
    return '〔' in text and '〕' in text


def _sd_table_field(label, table_html):
    """Return the cell text that follows ``<strong>label：</strong>``.

    All matches are concatenated; an empty string is returned when the
    label does not occur in *table_html*.
    """
    return ''.join(re.findall(f'<strong>{label}：</strong>(.*?)</td>', table_html))


def _sd_parse_detail(i_soup, body_tag):
    """Extract the metadata fields of one detail page.

    Two page layouts are handled:

    * a metadata table (first ``<tbody>`` containing the title label), from
      which title, publish date, document number, issuing organ and written
      date are read;
    * otherwise, a plain layout where the title comes from ``div.wz_title``
      (or the ``<h1>`` elements of ``div#nr``) and the document number is
      searched for in ``<span>`` elements.

    :param i_soup: parsed detail page (BeautifulSoup object).
    :param body_tag: the article body tag, or None when the page has none.
    :return: dict with keys ``title``, ``pub_time``, ``pub_hao``,
        ``pub_source`` and ``writtenDate``.
    """
    tables = i_soup.find_all('tbody')
    table = tables[0] if tables else None

    if table is not None:
        table_html = str(table)
        parts = table_html.split('标　　题：</strong>')
        if len(parts) > 1:
            title = parts[1].split('</td>')[0].replace('\r', '').replace('\n', '')
            pub_hao = _sd_table_field('发文字号', table_html)
            if pub_hao == '无' and body_tag is not None:
                # The table says "none": look for the number in the body
                # paragraphs instead (must be done on the tag, not its text).
                for p in body_tag.find_all('p'):
                    p_text = p.text
                    if _sd_has_bracketed_number(p_text):
                        pub_hao = p_text
                        break
            return {
                'title': title,
                'pub_time': _sd_table_field('发布日期', table_html),
                'pub_hao': pub_hao,
                'pub_source': _sd_table_field('发文机关', table_html),
                'writtenDate': _sd_table_field('成文日期', table_html),
            }

    # Plain layout: no usable metadata table.
    nr_div = i_soup.find('div', attrs={'id': 'nr'})
    title_div = i_soup.find('div', attrs={'class': 'wz_title'})
    if title_div is not None:
        title = str(title_div.text).strip()
        # Prefer the table when one exists, as the original code did.
        if table is not None:
            scope = table
        elif nr_div is not None:
            scope = nr_div
        else:
            scope = i_soup
    else:
        scope = nr_div if nr_div is not None else i_soup
        title = ''.join(str(h1.text) for h1 in scope.find_all('h1')).strip()

    pub_hao = ''
    span_list = scope.find_all('span')
    for idx, span in enumerate(span_list):
        span_text = span.text
        if _sd_has_bracketed_number(span_text) or '鲁国' in span_text or '国办发' in span_text:
            pub_hao = str(span_text)
            # The number may be split across two adjacent spans.
            if '号' not in pub_hao and idx + 1 < len(span_list):
                pub_hao = pub_hao + str(span_list[idx + 1].text)
            break

    return {
        'title': title,
        'pub_time': None,
        'pub_hao': pub_hao,
        'pub_source': '',
        'writtenDate': '',
    }


def _sd_process_item(li, headers):
    """Fetch, parse and store the article linked from one list entry.

    :param li: list-page element that contains the ``<a>`` link.
    :param headers: HTTP headers to send with the detail request.
    :return: True when the record was sent and saved, False when the entry
        was skipped (no link, already stored, empty body, or send failed).
    """
    link = li.find('a')
    if link is None or not link.get('href'):
        return False
    href = link['href']

    # NOTE(review): db_storage, sendKafka and save_data are not defined or
    # imported in the visible part of this module — confirm where they come
    # from (possibly attributes of `baseTool`).
    if db_storage.find_one({'网址': href}):
        return False

    # verify=False is kept from the original code; it disables TLS
    # certificate verification.
    href_resp = requests.get(url=href, headers=headers, verify=False,
                             timeout=_SD_REQUEST_TIMEOUT)
    href_resp.encoding = href_resp.apparent_encoding
    i_soup = BeautifulSoup(href_resp.text, 'html.parser')

    contentwithtag = i_soup.find(class_=_SD_CONTENT_CLASS)
    fields = _sd_parse_detail(i_soup, contentwithtag)
    title = fields['title']

    content = contentwithtag.text if contentwithtag is not None else ''
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return False

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # TODO: fields passed on to Kafka
    dic_news = {
        'attachmentIds': [],
        'author': '',
        'content': content,
        'contentWithTag': str(contentwithtag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1674", 'relationName': "山东省国资委", 'labelMark': "policy"}],
        'origin': '',
        'organ': fields['pub_source'],
        'topicClassification': '',
        'issuedNumber': fields['pub_hao'],
        'publishDate': fields['pub_time'],
        'writtenDate': fields['writtenDate'],
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }
    if not sendKafka(dic_news):
        return False
    save_data(dic_news)
    log.info(title)
    return True


def shan_dong():
    """Crawl the two gzw.shandong.gov.cn channel pages and store new articles.

    For every ``pagedContent`` entry on each list page the linked detail
    page is fetched and parsed, and the resulting record is handed to
    ``sendKafka``; when that returns a truthy value the record is also
    passed to ``save_data``. Entries whose URL is already found through
    ``db_storage`` are skipped.

    Failures are logged and the crawl continues: a failing list page skips
    that page, a failing entry skips only that entry. A summary line with
    the number of stored records and the elapsed time is logged at the end.

    Assumes ``log`` behaves like a standard ``logging.Logger`` (``info`` and
    ``error`` methods) — TODO confirm against BaseCore.getLogger.
    """
    headers = {
        'Cookie': 'COLLCK=2502513302; COLLCK=2493627587',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183'
    }
    start = time.time()
    count = 0
    url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
    for url in url_list:
        try:
            resp = requests.get(url=url, headers=headers, verify=False,
                                timeout=_SD_REQUEST_TIMEOUT)
            resp.encoding = 'utf-8'
            soup = BeautifulSoup(resp.text, 'html.parser')
            result = soup.find_all(class_='pagedContent')
        except Exception as e:
            # Best-effort crawl: report the failure and move to the next page.
            log.error(f'list page failed: {url}: {e!r}')
            continue

        for li in result:
            try:
                if _sd_process_item(li, headers):
                    count += 1
            except Exception as e:
                # One broken entry must not abort the rest of the page.
                log.error(f'entry failed on {url}: {e!r}')
    end = time.time()
    log.info(f'共 {count} 条 ........... 共耗时 {end - start} 秒')

# Run the Shandong crawl only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    shan_dong()