# 中央全面深化改革委员会会议
import json
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup
from datetime import datetime

from kafka import KafkaProducer

# Extend the import search path so that `BaseCore` can be imported below.
# The directory is a hard-coded Windows path (presumably where BaseCore.py
# lives on the author's machine -- TODO confirm / make configurable).
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
# Shared BaseCore instance; in the code visible here it is only used to
# obtain the logger that the crawl loop writes to.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Browser-like request headers for the listing page on www.12371.cn
# (desktop Chrome 120 on Windows, as advertised by the User-Agent below).
header = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
    'Host': 'www.12371.cn',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'sec-ch-ua': (
        '"Not_A Brand";v="8", '
        '"Chromium";v="120", '
        '"Google Chrome";v="120"'
    ),
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Browser-like request headers for article pages on news.12371.cn
# (desktop Edge 121 on Windows, as advertised by the User-Agent below).
headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
    'Host': 'news.12371.cn',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'
    ),
    'sec-ch-ua': (
        '"Not A(Brand";v="99", '
        '"Microsoft Edge";v="121", '
        '"Chromium";v="121"'
    ),
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

if __name__ == "__main__":
    # Crawl the meeting listing page on www.12371.cn and publish every article
    # dated 2023 or later that has not been sent before to Kafka.
    #
    # De-duplication: each page section has its own Redis set (keyed by
    # info_code) holding the article URLs that were already delivered.
    #
    # NOTE(review): the Redis/Kafka endpoints and the Redis password are
    # hard-coded here; consider moving them to configuration.
    r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
    url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'

    # A timeout keeps an unresponsive server from hanging the job forever.
    request = requests.get(url=url, headers=header, timeout=30)
    # The soup is built from the raw bytes, so BeautifulSoup detects the
    # document encoding on its own.
    soup = BeautifulSoup(request.content, 'html.parser')
    info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')

    # The section number decides which sid / info_code an article is filed
    # under.  It must stay independent of the per-article "already sent"
    # result, otherwise later sections can be filed under the first
    # section's ids.
    for section_no, info_html in enumerate(info_html_list, start=1):
        if section_no == 1:
            info_code = 'IN-20230816-0004'
            sid = '1691633319715676162'
        else:
            sid = '1691633869186277378'
            info_code = 'IN-20230816-0005'

        ul_html = info_html.find('ul', class_='ul_list')
        if ul_html is None:
            log.error(f'No article list found in section {section_no} of {url}')
            continue

        # Process the entries in the reverse of their order on the page.
        for ul in reversed(ul_html.find_all('li')):
            date_tag = ul.find('span')
            try:
                date_obj = datetime.strptime(str(date_tag.text).strip(), "%Y年%m月%d日")
            except (AttributeError, ValueError):
                # Missing <span> or a date in an unexpected format.
                log.error(f'Skipping list entry without a parsable date: {ul}')
                continue
            if date_obj.year < 2023:
                continue
            publishDate = date_obj.strftime('%Y-%m-%d')

            link = ul.find('a')
            if link is None or not link.get('href'):
                log.error(f'Skipping list entry without a link: {ul}')
                continue
            newsUrl = link['href']
            summary = link.text

            # URL de-duplication against the Redis set of this section.
            try:
                already_sent = r.sismember(info_code, newsUrl)
            except Exception as e:
                # Best effort: if the check cannot be made, skip the article
                # rather than risk sending a duplicate -- but leave a trace.
                log.error(f'Redis duplicate check failed for {newsUrl}: {e}')
                continue
            if already_sent:
                log.info('信息已采集入库过')
                continue

            try:
                news_request = requests.get(url=newsUrl, headers=headers,
                                            allow_redirects=False, timeout=30)
            except requests.RequestException as e:
                log.error(f'Request failed for {newsUrl}: {e}')
                continue
            news_soup = BeautifulSoup(news_request.content, 'html.parser')

            try:
                title = news_soup.find('h1', class_='big_title').text
                source = news_soup.find('div', class_='title_bottom').find('i').text
                contentwithTag = news_soup.find('div', class_='word')
                content = contentwithTag.text
            except AttributeError:
                # One of the expected elements is missing (find() returned None).
                log.error(f'解析网页出错{newsUrl}')
                continue

            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            dic_info = {
                'id': '1681549361661489154' + str(int(time.time() * 1000)),
                'title': title,
                'origin': source,
                'contentWithTag': str(contentwithTag),
                'content': content,
                'summary': summary,
                'publishDate': publishDate,
                'sid': sid,
                'subjectId': '1681549361661489154',
                'sourceAddress': newsUrl,
                'checkStatus': 1,
                'deleteFlag': 0,
                'createDate': time_now,
            }

            producer = None
            try:
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("research_center_fourth",
                                             json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
                # send() is asynchronous; block on the future so a delivery
                # failure is raised here instead of being lost.
                kafka_result.get(timeout=10)
            except Exception as e:
                print(e)
                print('发送kafka异常！')
            else:
                # Only remember the URL once the message was really delivered,
                # so a failed send is retried on the next run.
                r.sadd(info_code, newsUrl)
                print('发送kafka结束')
            finally:
                if producer is not None:
                    producer.close()