# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium

"""
import json

import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore
# Shared project helper object; it supplies the logger and the `r` handle used below.
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()

# `rr` is used below through sismember/sadd to remember already-collected URLs
# (presumably a Redis client created inside BaseCore — TODO confirm).
rr=baseCore.r

def getList(pagenum=10):
    """Crawl the Henan government search API and export the new documents.

    For every list page the JSON response is decoded, each entry is turned
    into a ``detailmsg`` dict, entries whose URL is already in the
    ``reis_henangov`` set are skipped, the detail page is fetched through
    ``paserdetail`` and the enriched dict is collected. After all pages are
    processed the collected dicts are handed to ``reqbase.pdwriterXLS``.

    A failure on one entry (missing field, detail page problem, ...) is
    logged and only that entry is skipped; the rest of the page is still
    processed. A URL is added to the de-duplication set only after its
    detail page was parsed successfully, so failed entries are retried on
    the next run.

    Args:
        pagenum: Exclusive upper bound of the page range; pages
            ``1 .. pagenum - 1`` are requested. Defaults to 10.
            NOTE(review): the exclusive bound is kept from the original
            code — confirm whether page ``pagenum`` itself should be fetched.
    """
    header={
        'Accept':'*/*',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Host':'searchapi.henan.gov.cn',
        'Origin':'https://www.henan.gov.cn',
        'Pragma':'no-cache',
        'Referer':'https://www.henan.gov.cn/',
        'Sec-Fetch-Dest':'empty',
        'Sec-Fetch-Mode':'cors',
        'Sec-Fetch-Site':'same-site',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"',
    }
    siteweb='河南省人民政府'
    dlist=[]
    for i in range(1,pagenum):
        log.info(f'henan采集第{i}页列表')
        lurl=f'https://searchapi.henan.gov.cn/open/api/external?keywords=&siteId=4500000001&allKeyword=&anyKeyword=&noKeyword=&searchRange=-1000&sortType=200&beginTime=&endTime=&pageNumber={i}&pageSize=15&fileType=3&channelMarkId=45000000010115416542055691'
        lcont=reqbase.reqGetHtml(lurl,header)
        if not lcont:
            # Nothing usable came back for this page; move on to the next one.
            continue

        try:
            # A null "datas" value is treated as an empty page.
            datas=json.loads(lcont)['data']['datas'] or []
        except (ValueError, KeyError, TypeError) as e:
            log.error(f'请求异常{e}-异常页码{i}')
            continue

        for lmsg in datas:
            durl=None
            try:
                durl=lmsg['selfUrl']
                # Key order is kept as in the original dict in case the
                # exporter derives its column order from it.
                detailmsg={
                    'title':lmsg['title'],
                    'subtitle':lmsg['subtitle'],
                    'summary':lmsg['summary'],
                    'createDate':lmsg['createDate'],
                    'writeDate':lmsg['writeDate'],
                    'pubDate':lmsg['pubDate'],
                    'source':lmsg['source'],
                    'durl':durl,
                    'content':lmsg['content'],
                    'siteweb':siteweb,
                    'docNumberStr':lmsg['docNumberStr'],
                    'reNum':lmsg['reNum'],
                }
                if rr.sismember('reis_henangov', durl):
                    continue
                paserdetail(detailmsg)
                dlist.append(detailmsg)
                # Mark as seen only after the detail page was handled.
                rr.sadd('reis_henangov',durl)
            except Exception as e:
                # Per-item boundary: one bad entry must not drop the
                # remaining entries of the page.
                log.error(f'请求异常{e}-异常页码{i}-{durl}')

    reqbase.pdwriterXLS(dlist,'河南省人民政府')

def paserdetail(detailmsg):
    """Fetch the detail page of one document and attach its body to the dict.

    The page at ``detailmsg['durl']`` is requested, parsed, passed through
    ``reqbase.paserUrl`` and the first ``div`` with ``id="content"`` is
    extracted.

    Args:
        detailmsg: Dict describing one document; must contain ``'durl'``.
            It is modified in place: ``'contentWithTag'`` receives the
            matched element and ``'content'`` is overwritten with its text.

    Returns:
        The same ``detailmsg`` dict.

    Raises:
        ValueError: If the detail page response is empty or the page has
            no ``div`` with ``id="content"``.
    """
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Cookie':'zh_choose=n; yfx_c_g_u_id_10000001=_ck23110818022219777515353379336; yfx_f_l_v_t_10000001=f_t_1699437742968__r_t_1699437742968__v_t_1699437742968__r_c_0',
        'Host':'www.henan.gov.cn',
        'Pragma':'no-cache',
        'Referer':'https://www.henan.gov.cn/zwgk/fgwj/szfl/',
        'Sec-Fetch-Dest':'document',
        'Sec-Fetch-Mode':'navigate',
        'Sec-Fetch-Site':'same-origin',
        'Sec-Fetch-User':'?1',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"'
    }
    durl=detailmsg['durl']
    dhmsg=reqbase.reqGetHtml(durl,headers)
    if not dhmsg:
        # Fail with a clear message instead of an opaque parser TypeError.
        raise ValueError(f'empty detail page response: {durl}')

    soup = BeautifulSoup(dhmsg, 'html.parser')
    # presumably rewrites relative links against durl — confirm in reqbase.paserUrl
    soup = reqbase.paserUrl(str(soup), durl)
    matches=soup.select('div[id="content"]')
    if not matches:
        raise ValueError(f'no div#content found in detail page: {durl}')

    contentWithTag=matches[0]
    content = contentWithTag.text  # body text without tags
    # NOTE(review): the element object itself (not a string) is stored here;
    # verify the exporter can serialize it.
    detailmsg['contentWithTag']=contentWithTag
    detailmsg['content']=content
    return detailmsg
# Script entry point: run the list crawl when the file is executed directly.
if __name__ == '__main__':
    getList()











