# -*- coding:utf-8 -*-
import datetime
import time

import redis
import requests
import urllib3
from pyquery import PyQuery as pq
import json
from kafka import KafkaProducer

import sys
# Put the directory that holds the BaseCore helper module on the import path.
# NOTE(review): this is an absolute, machine-specific Windows path; the import
# below presumably fails on any host where this directory does not exist.
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
# Project helper object; in the code visible here it is only used to obtain
# the shared logger.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# The HTTP calls in this file pass verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Collects the latest published entries from the State Council policy Q&A platform (国务院政策问答平台).

def reqHtml(url,data,header):
    """POST ``data`` as a JSON body to ``url`` and return the response text.

    Args:
        url: Endpoint to send the POST request to.
        data: JSON-serialisable object; it is encoded with ``json.dumps``
            and sent as the raw request body.
        header: Mapping of HTTP request headers.

    Returns:
        The response body as text, or ``''`` when anything goes wrong
        (serialisation, connection, timeout, ...). Callers rely on the
        empty string as the failure signal, so no exception escapes.
    """
    try:
        json_data = json.dumps(data)
        # verify=False skips TLS certificate validation; 10 s overall timeout.
        response = requests.post(url, data=json_data, headers=header, verify=False, timeout=10)
        log.info(response.status_code)
        return response.text
    except Exception as e:
        # Best-effort fetch: keep the '' contract, but record why it failed
        # instead of discarding the error silently.
        log.info(f'request to {url} failed: {e!r}')
        return ''

def page_list(start_page=1, end_page=452):
    """Walk the policy list pages and pass every new entry to ``detailpaser``.

    For each page number from ``start_page`` to ``end_page`` (inclusive) the
    list endpoint is queried with a page size of 15. Each returned entry is
    checked against the Redis set ``'IN-20230829-0146-test'`` (through the
    module-level client ``r``); entries whose source address is already in
    the set are skipped. The remaining entries get ``'url'`` and
    ``'sourceAddress'`` keys added and are handed to ``detailpaser``.

    Args:
        start_page: First page number to request (default 1).
        end_page: Last page number to request, inclusive (default 452).

    Returns:
        None. A page that cannot be fetched or decoded is logged, followed
        by a 60 second pause, and then skipped.
    """
    # Header values imitate a WeChat mini-program client (see the User-Agent
    # and Referer strings).
    # NOTE(review): the x-tif-* values are presumably identifiers captured
    # from a live client session - confirm they do not expire.
    # NOTE(review): the fixed Content-Length does not match the bodies sent
    # here; requests appears to recompute it from the body - confirm.
    header = {
        'Host': 'xcx.www.gov.cn',
        'Connection': 'keep-alive',
        'Content-Length': '25',
        'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
        'x-tif-did': 'u8Ajuqdyap',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
        'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
        'Content-Type': 'application/json',
        'xweb_xhr': '1',
        'dgd-pre-release': '0',
        'x-yss-page': 'publicService/pages/policyQALibrary/index/index',
        'x-yss-city-code': '4400',
        'Accept': '*/*',
        'Accept-Language': '*',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
        'Accept-Encoding': 'gzip, deflate, br'
    }
    url = 'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
    # The detail endpoint is the same for every entry; the id travels in the
    # POST body built by detailpaser.
    durl = 'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicy'
    for page in range(start_page, end_page + 1):
        log.info(f'采集第{page}页数据')
        payload = {"filterType": "", "departmentid": "", "keyword": "", "page_size": 15, "page": page}
        try:
            lhtml = reqHtml(url, payload, header)
            # reqHtml returns '' on failure, which makes json.loads raise and
            # sends us to the back-off branch below.
            entries = json.loads(lhtml)['data']['list'] or []
        except Exception as e:
            log.info(e)
            time.sleep(60)
            continue
        for entry in entries:
            entry_id = entry['id']
            sourceAddress = f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={entry_id}'
            try:
                if r.sismember('IN-20230829-0146-test', sourceAddress):
                    log.info('信息已采集入库过')
                    continue
            except Exception as e:
                # Without a working dedup check we cannot tell whether the
                # entry is new, so skip it - but say why instead of hiding it.
                log.info(f'redis dedup check failed for {sourceAddress}: {e!r}')
                continue
            entry['url'] = durl
            entry['sourceAddress'] = sourceAddress
            detailpaser(entry)

def detailpaser(dmsg):
    """Fetch the detail record for one list entry and forward it to ``sendTokafka``.

    Args:
        dmsg: Mapping that provides ``'url'`` (the detail endpoint) and
            ``'id'`` (the entry id, sent as a string in the JSON body).

    Returns:
        None. Any exception raised while reading ``dmsg``, performing the
        request, decoding the response or publishing it is logged and
        swallowed.
    """
    request_headers = {
        'Host': 'xcx.www.gov.cn',
        'Connection': 'keep-alive',
        'Content-Length': '25',
        'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
        'x-tif-did': 'u8Ajuqdyap',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
        'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
        'Content-Type': 'application/json',
        'xweb_xhr': '1',
        'dgd-pre-release': '0',
        'x-yss-page': 'publicService/pages/policyQALibrary/index/index',
        'x-yss-city-code': '4400',
        'Accept': '*/*',
        'Accept-Language': '*',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
        'Accept-Encoding': 'gzip, deflate, br'
    }
    try:
        detail_url = dmsg['url']
        # The endpoint receives the id as a string inside a JSON object.
        body = json.dumps({"id": str(dmsg['id'])})
        reply = requests.post(
            detail_url,
            data=body,
            headers=request_headers,
            verify=False,
            timeout=10,
        )
        sendTokafka(json.loads(reply.text))
    except Exception as err:
        log.info(err)

def sendTokafka(ddata):
    """Publish one policy detail record to Kafka and mark it as collected.

    The record is sent to the ``crawlerInfo`` topic. Only after the broker
    has acknowledged it is the source address added to the Redis set
    ``'IN-20230829-0146-test'`` (through the module-level client ``r``), so
    an undelivered record is not treated as collected on a later run.

    Args:
        ddata: Decoded detail response. ``ddata['data']`` must contain the
            keys ``title``, ``id``, ``content``, ``publishTime`` and
            ``departmentName``.

    Returns:
        None. Failures while sending or while updating Redis are logged and
        swallowed.

    Raises:
        KeyError: A required key is missing from ``ddata``.
        ValueError: ``publishTime`` is non-empty but does not match the
            ``%Y年%m月%d日`` format.
        Exception: Whatever ``KafkaProducer(...)`` raises while connecting
            (it is created outside the ``try`` block).
    """
    dd = ddata['data']
    title = dd['title']
    record_id = dd['id']
    content = dd['content']
    # The payload carries the same text under both content keys.
    contentWithTag = dd['content']
    publishTime = dd['publishTime']
    if publishTime:
        time_format = '%Y年%m月%d日'
        publishDate = str(datetime.datetime.strptime(publishTime, time_format))
    else:
        # Placeholder used when the record has no publish time.
        publishDate = '1900-01-01'
    origin = dd['departmentName']
    sourceAddress = f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={record_id}'
    sid = '1696404919115825153'
    info_code = 'IN-20230829-0146'
    aa_dict = {
        'content': content,
        'contentWithTag': contentWithTag,
        'id': '',
        'sid': sid,
        'origin': origin,
        'publishDate': publishDate,
        'sourceAddress': sourceAddress,
        'title': title,
        'source': 'python定制采集',
        'type': ''
    }
    log.info(aa_dict)
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
        # send() only queues the record and returns a future. Wait for the
        # broker acknowledgement so that a failed delivery raises here and
        # is not recorded in the dedup set below.
        kafka_result.get(timeout=10)
        r.sadd(info_code+'-test', sourceAddress)
        log.info('发送kafka成功！')
    except Exception as e:
        log.info(e)
    finally:
        producer.close()

if __name__ == '__main__':
    # ``r`` is bound here as a module-level global; page_list() and
    # sendTokafka() read it for de-duplication, so it only exists when this
    # file is run as a script.
    # NOTE(review): the Redis host and password are hard-coded in source -
    # consider loading them from configuration or the environment.
    r = redis.Redis(host='114.115.236.206', port=6379,password='clbzzsn', db=5)
    page_list()
