# _*_ coding:utf-8 _*_
# https://www.cq.gov.cn/
"""
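Information collection workflow
1. Assemble the list request and fetch the list links
2. Parse and clean the content of each detail page
3. De-duplicate the collected items by their link
4. Output fields written to the result file
5. Ways of requesting page content: requests, selenium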

"""
import json

import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore
# Shared project helper object; it supplies the logger and the `r` client below.
baseCore=BaseCore.BaseCore()
# Module-wide logger used by every function in this file.
log=baseCore.getLogger()

# `rr` is used for the set operations `sismember` / `sadd` on the 'reis_cqgov'
# key (presumably a Redis client -- confirm in BaseCore).
rr=baseCore.r

def _cq_list_headers():
    """Return the request headers sent with each list-API POST.

    The values imitate a Chrome 117 XHR issued from the www.cq.gov.cn
    search page (see the 'Referer' and 'User-Agent' entries).
    """
    # NOTE(review): 'Content-Length' and the 'Cookie' SESSION value are fixed
    # literals. Whether reqbase.reqPostHtml overrides them is not visible
    # here -- confirm they never need refreshing.
    return {
        'Accept':'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Content-Length':'663',
        'Content-Type':'application/json',
        'Cookie':'SESSION=MGFhMGQxNDItM2MyOS00NjU5LWI2MTgtZjdiM2UxNjFkMGI3; _trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false',
        'Host':'www.cq.gov.cn',
        'Origin':'https://www.cq.gov.cn',
        'Pragma':'no-cache',
        'Referer':'https://www.cq.gov.cn/zwgk/search.html?DOCTITLE=REITs&DEPT=&gte=&lte=&REFERENCENO=&nh=&number=',
        'Sec-Fetch-Dest':'empty',
        'Sec-Fetch-Mode':'cors',
        'Sec-Fetch-Site':'same-origin',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest',
        'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"'
    }


def _cq_list_query(page_no):
    """Build the search payload for one page of the list API.

    Args:
        page_no: Value sent as "pageNo".

    Returns:
        dict: The filter definition posted to the list endpoint, with
        "pageSize" fixed at 10.
    """
    # Search term matched (operator "eq") against the two fields that
    # getList later reads as title and summary.
    keyword = "REITs"
    # Lower bound used with the "gte" operator on the two remaining fields.
    # NOTE(review): fixed timestamp literal -- confirm it should not move.
    time_floor = "2023-11-09 16:14:20"
    return {
        "customFilter": {
            "operator": "and",
            "properties": [],
            "filters": [
                {
                    "operator": "or",
                    "properties": [
                        {
                            "property": "f_202121500898",
                            "operator": "eq",
                            "value": keyword
                        },
                        {
                            "property": "f_202142777829",
                            "operator": "eq",
                            "value": keyword
                        }
                    ],
                    "filters": []
                },
                {
                    "operator": "or",
                    "properties": [
                        {
                            "property": "f_202146838317",
                            "operator": "gte",
                            "value": time_floor
                        },
                        {
                            "property": "f_202146235090",
                            "operator": "gte",
                            "value": time_floor
                        }
                    ],
                    "filters": [
                        {
                            # Nested alternative: both fields equal to null.
                            "operator": "and",
                            "properties": [
                                {
                                    "property": "f_202146838317",
                                    "operator": "eq",
                                    "value": None
                                },
                                {
                                    "property": "f_202146235090",
                                    "operator": "eq",
                                    "value": None
                                }
                            ]
                        }
                    ]
                }
            ]
        },
        "sorts": [],
        "tableName": "t_1775cd018c6",
        "tenantId": "7",
        "pageSize": 10,
        "pageNo": page_no
    }


def getList():
    """Collect REITs search results from www.cq.gov.cn and export them.

    For every list page the search payload is posted to the list API and one
    record is built per returned entry. Entries whose detail URL is already in
    the 'reis_cqgov' set are skipped; the others are completed by
    ``paserdetail``, collected, and their URL is added to the set. All
    collected records are finally passed to ``reqbase.pdwriterXLS``.

    Returns:
        None
    """
    header = _cq_list_headers()
    lurl = 'https://www.cq.gov.cn/irs/front/list'
    dlist = []
    # NOTE(review): range(1, pagenum) requests pages 1 and 2 only; confirm
    # whether page 3 was meant to be included.
    pagenum = 3
    for i in range(1, pagenum):
        log.info(f'cqgov采集第{i}页列表')
        lcont = reqbase.reqPostHtml(lurl, header, _cq_list_query(i))
        if not lcont:
            continue
        try:
            entries = json.loads(lcont)['data']['list']
            for lmsg in entries:
                try:
                    detailmsg = {
                        'title': lmsg['f_202121500898'],
                        'subtitle': '',
                        'summary': lmsg['f_202142777829'],
                        'createDate': '',
                        'writeDate': '',
                        'pubDate': lmsg['save_time'],
                        'source': lmsg['f_202121437464'],
                        'durl': lmsg['doc_pub_url'],
                        'content': '',
                        'siteweb': '重庆市人民政府',
                        'wenjianhao': lmsg['f_202121837479'],
                        'suoyihao': lmsg['f_202121273539'],
                    }
                except (KeyError, TypeError) as e:
                    # An entry without the expected fields is still skipped,
                    # but no longer silently.
                    log.info(f'列表条目字段缺失，跳过：{e}')
                    continue
                durl = detailmsg['durl']
                if rr.sismember('reis_cqgov', durl):
                    continue
                detailmsg = paserdetail(detailmsg)
                dlist.append(detailmsg)
                # Mark the URL as seen only after its record was collected.
                rr.sadd('reis_cqgov', durl)
        except Exception as e:
            # Best effort per page: log the failure and go on to the next page.
            log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(dlist, '重庆市人民政府')


def paserdetail(detailmsg):
    """Fetch a detail page and attach its main content to ``detailmsg``.

    The page is requested with browser-like headers, then three CSS
    selectors are tried in order until one yields non-empty text.

    Args:
        detailmsg: Record dict; its 'durl' entry is the detail-page URL.

    Returns:
        dict: The same ``detailmsg`` object. After successful parsing,
        'contentWithTag' holds the element returned by ``soupPaserHtml`` for
        the last selector tried and 'content' holds its text. If parsing
        raises, the error is logged and the dict is returned without those
        updates.
    """
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Cookie':'_trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false; _trs_user=',
        'Host':'www.cq.gov.cn',
        'Pragma':'no-cache',
        'Sec-Fetch-Dest':'document',
        'Sec-Fetch-Mode':'navigate',
        'Sec-Fetch-Site':'same-origin',
        'Sec-Fetch-User':'?1',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"'
    }
    # Candidate content containers, tried in this order.
    selectors=(
        'div[class="zcwjk-xlcon"]',
        'div[class="document mt-1 mt-12"]',
        'div[class="view TRS_UEDITOR trs_paper_default trs_word"]',
    )
    durl=detailmsg['durl']
    dhmsg=reqbase.reqGetHtml(durl,headers)
    try:
        log.info(f'详情请求地址：{durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        # presumably resolves relative links against durl -- confirm in reqbase
        soup = reqbase.paserUrl(str(soup), durl)
        for csstag in selectors:
            contentWithTag,content=soupPaserHtml(soup,csstag)
            if content:
                break
        else:
            # Reported only after every selector came back empty; previously
            # this was logged before the last selector had been tried.
            log.info(f'详情内容为空：{durl}')
        detailmsg['contentWithTag']=contentWithTag
        detailmsg['content']=content
    except Exception as e:
        # Use the module logger like the rest of the file instead of print().
        log.info(f'详情解析异常{e}')
    return detailmsg

def soupPaserHtml(soup,csstag):
    """Return the first element matching ``csstag`` together with its text.

    Args:
        soup: Object exposing ``select(css)``.
        csstag: CSS selector string passed to ``soup.select``.

    Returns:
        tuple: ``(element, element.text)`` for the first match. If anything
        raises (for example when there is no match), the error is logged and
        ``('', '')`` is returned.
    """
    try:
        first_match = soup.select(csstag)[0]
        return first_match, first_match.text
    except Exception as err:
        log.info(f'标签解析异常{err}')
        return '', ''

# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == '__main__':
    getList()











