# _*_ coding:utf-8 _*_
# https://www.hainan.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium

"""
import json

import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore
# Shared project helper object; it supplies both the logger and the
# storage handle used by the functions below.
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()

# Handle used for URL de-duplication (getList calls sismember/sadd on it);
# presumably a redis client created inside BaseCore — confirm there.
rr=baseCore.r

def getList(pagenum=5):
    """Crawl the Hainan government REITs search listing and export new records.

    For every listing page the search URL is requested, the entries under
    ``div#showPage`` are walked, links already stored in the ``reis_hainangov``
    set are skipped, the remaining records are enriched by ``paserdetail`` and
    all collected records are finally passed to ``reqbase.pdwriterXLS``.

    Args:
        pagenum: Exclusive upper bound of the ``pageNum`` query values, i.e.
            pages ``1 .. pagenum - 1`` are requested. The default of 5 keeps
            the previously hard-coded behaviour.
            NOTE(review): the captured Referer uses ``pageNum=0`` — confirm
            whether page 0 should be crawled as well.

    Returns:
        None. Results are written out through ``reqbase.pdwriterXLS``.
    """
    header={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Cookie':'4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153159&searchUseTime-349',
        'Host':'www.hainan.gov.cn',
        'Pragma':'no-cache',
        'Referer':'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest':'document',
        'Sec-Fetch-Mode':'navigate',
        'Sec-Fetch-Site':'same-origin',
        'Sec-Fetch-User':'?1',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"',
    }
    dlist=[]
    for i in range(1,pagenum):
        log.info(f'采集第{i}页列表')
        lurl=f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={i}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
        lcont=reqbase.reqGetHtml(lurl,header)
        if not lcont:
            # Nothing came back for this page; move on to the next one.
            continue
        try:
            soup = BeautifulSoup(lcont, 'html.parser')
            # paserUrl is handed the page URL; presumably it rewrites relative
            # links to absolute ones — confirm in reqbase.
            soup = reqbase.paserUrl(str(soup), lurl)
            divlist=soup.select('div[id="showPage"]>div')
        except Exception as e:
            log.info(f'列表解析异常{e}')
            continue
        for lmsg in divlist:
            # Each entry is handled in isolation so that one malformed entry
            # no longer discards the remaining entries of the page.
            try:
                linktag,title=soupPaserHtml(lmsg,'h3>a')
                try:
                    durl=linktag.get('href')
                except AttributeError:
                    # soupPaserHtml yields '' when no anchor matched.
                    durl=None
                if not durl:
                    # Without a link there is nothing to fetch or to use as
                    # the de-duplication key (and the store rejects None).
                    continue
                if rr.sismember('reis_hainangov', durl):
                    continue
                detailmsg={
                    'title':title,
                    'subtitle':'',
                    'summary':'',
                    'createDate':'',
                    'writeDate':'',
                    'pubDate':soupPaserHtml(lmsg,'span[class="quily-con"]')[1],
                    'source':soupPaserHtml(lmsg,'a[class="address-con permitU"]')[1],
                    'durl':durl,
                    'content':'',
                    'siteweb':'海南省人民政府',
                    'docNumberStr':'',
                    'reNum':'',
                }
                detailmsg=paserdetail(detailmsg)
                dlist.append(detailmsg)
                # Mark the link as seen only after the record was collected.
                rr.sadd('reis_hainangov',durl)
            except Exception as e:
                log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(dlist,'海南省人民政府')


def paserdetail(detailmsg):
    """Fetch the detail page of a record and add the parsed fields to it.

    The page at ``detailmsg['durl']`` is requested and the metadata cells of
    ``div.zwgk_comr1`` plus the article body (``div#zoom``, falling back to
    ``div.zw``) are extracted with ``soupPaserHtml``.

    Args:
        detailmsg: Record dict containing at least the key ``'durl'``. It is
            updated in place.

    Returns:
        The same dict, extended with ``contentWithTag`` (the matched element
        as returned by ``soupPaserHtml``, or ``''``), ``content``,
        ``souyihao``, ``fenlei``, ``fawenjiguan``, ``wenhao``, ``pub_data``
        and ``write_data``. When the page cannot be fetched or parsed the
        missing keys are filled with ``''`` so every record carries the same
        set of keys.
    """
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Cookie':'HttpOnly=true; 4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153247&searchUseTime-337; HA_STICKY_apps=apps.srv34; Hm_lvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515700; yfx_c_g_u_id_10005682=_ck23110915414012919174127333485; yfx_f_l_v_t_10005682=f_t_1699515700292__r_t_1699515700292__v_t_1699515700292__r_c_0; _trs_uv=loqvrn5x_4549_5u3r; _trs_ua_s_1=loqvrn5x_4549_1lnl; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515718',
        'Host':'www.hainan.gov.cn',
        'Pragma':'no-cache',
        'Referer':'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=1&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest':'document',
        'Sec-Fetch-Mode':'navigate',
        'Sec-Fetch-Site':'same-origin',
        'Sec-Fetch-User':'?1',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"',
    }
    # Keys this function contributes to the record.
    detailkeys=('contentWithTag','content','souyihao','fenlei',
                'fawenjiguan','wenhao','pub_data','write_data')
    # Metadata cells are addressed by list row and span position.
    metacss='div[class="zwgk_comr1"]>ul>li:nth-child({row})>span:nth-child({col})'

    durl=detailmsg['durl']
    dhmsg=reqbase.reqGetHtml(durl,headers)
    log.info(f'解析详情地址：{durl}')
    parsed={}
    if dhmsg:
        try:
            soup = BeautifulSoup(dhmsg, 'html.parser')
            soup = reqbase.paserUrl(str(soup), durl)
            # Field names are pinyin; presumably index number, category,
            # issuing body, written date, document number and publish date —
            # verify against the live page layout.
            souyihao=soupPaserHtml(soup,metacss.format(row=1,col=1))[1]
            fenlei=soupPaserHtml(soup,metacss.format(row=1,col=2))[1]
            fawenjiguan=soupPaserHtml(soup,metacss.format(row=2,col=1))[1]
            write_data=soupPaserHtml(soup,metacss.format(row=2,col=2))[1]
            wenhao=soupPaserHtml(soup,metacss.format(row=4,col=1))[1]
            pub_data=soupPaserHtml(soup,metacss.format(row=4,col=2))[1]
            contentWithTag,content=soupPaserHtml(soup,'div[id="zoom"]')
            if not content:
                # Alternative body container used when div#zoom has no text.
                contentWithTag,content=soupPaserHtml(soup,'div[class="zw"]')
            parsed={
                'contentWithTag':contentWithTag,
                'content':content,
                'souyihao':souyihao,
                'fenlei':fenlei,
                'fawenjiguan':fawenjiguan,
                'wenhao':wenhao,
                'pub_data':pub_data,
                'write_data':write_data,
            }
        except Exception as e:
            # Report through the shared logger instead of stdout so the
            # failure ends up next to the other crawl messages.
            log.info(f'详情解析异常{e}')
    else:
        log.info(f'详情页面无内容：{durl}')
    detailmsg.update(parsed)
    # Keep the record schema stable even when nothing could be parsed.
    for key in detailkeys:
        detailmsg.setdefault(key,'')
    return detailmsg

def soupPaserHtml(soup,csstag):
    """Return the first element matching a CSS selector together with its text.

    Args:
        soup: Object exposing ``select(css)`` (a BeautifulSoup tree or tag).
        csstag: CSS selector string.

    Returns:
        A ``(element, text)`` tuple for the first match. If the lookup fails
        for any reason (for example no match), the failure is logged and
        ``('', '')`` is returned instead.
    """
    node,nodetext='',''
    try:
        first=soup.select(csstag)[0]
        # Assign both values together so a failure leaves the '' defaults.
        node,nodetext=first,first.text
    except Exception as err:
        log.info(f'标签解析异常{err}')
    return node,nodetext

# Run the listing crawl only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    getList()











