# _*_ coding:utf-8 _*_
# http://www.hubei.gov.cn/
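"""
Information collection workflow
1. Build the list-page URLs to fetch
2. Parse and clean the content of each detail page
3. De-duplicate the collected records by link
4. Define the output fields of the exported file
5. Content is requested via requests or selenium

"""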
import json

import redis
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

import reqbase
import BaseCore
# Shared project helper object; the logger and the `r` client below come from it.
baseCore=BaseCore.BaseCore()
# Module-wide logger used by the functions in this file.
log=baseCore.getLogger()

# Client used below through sismember/sadd for URL de-duplication
# (presumably a Redis connection exposed by BaseCore -- TODO confirm).
rr=baseCore.r

def getList():
    """Collect search results from the Hubei government site and export them.

    Requests the JSON search endpoint (search word ``REITS``) page by page,
    builds one record dict per result, skips results whose URL is already a
    member of the ``reis_hubeigov`` set, enriches new records through
    ``paserdetail`` and finally hands the collected list to
    ``reqbase.pdwriterXLS``.

    Failures are logged and skipped rather than raised: a page that cannot
    be parsed is dropped as a whole, a single malformed record only drops
    that record.

    Returns:
        None. The result is the exported file and the URLs added to the
        de-duplication set.
    """
    header = {
        'Accept':'application/json, text/plain, */*',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Cookie':'JSESSIONID=B970564BAAD37BB8E9EF19F78FF45618; Hm_lvt_5544783ae3e1427d6972d9e77268f25d=1699492684; token=43cfe913-ec04-4aa3-b037-96903ccaa188; uuid=43cfe913-ec04-4aa3-b037-96903ccaa188; Hm_lpvt_5544783ae3e1427d6972d9e77268f25d=1699492955; 924omrTVcFchT=0IZtGVa8M20F2wJXy_C6l9PPFZOO1SBDdB3qZtsaLbLGaQ5t4l6Vt8HF9dIwhxtBcLdkdRZwlK42NCaEUjZZoPsXZAZ1o.tgK50mj8FJZTM5zCxcVg3w4cOCSM4BvYApzj7YMWHycK14.NY6Y.AP6bW6g0jDIqZlbp2hKSpDfZYBhjsgwJJraXKf2S4sgG6swjXFVVUHGngt2GMQPUZQRsE0_tL9Pz3_h6JeSD9qHWLOVKJWz0z8hdC_F4kiGZj2FRjjSUZp0VLUS8pjkrJdGYrKhC5xwGy8xSFYBE_trVuCFjr8.vhLqBONYkoWvZM2qNX_WZXg_3wTLMqCMrjoCmkvHf7B9.MMVu8tMC4hwDT4wjeyoNoRjIlgKuwE.aGhn',
        'Host':'www.hubei.gov.cn',
        'Pragma':'no-cache',
        'Referer':'http://www.hubei.gov.cn/site/hubei/search.html',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    # range(1, pagenum) is exclusive at the top, so pages 1 .. pagenum-1 are fetched.
    pagenum = 3
    for i in range(1, pagenum):
        log.info(f'湖北采集第{i}页列表')
        lurl = f'http://www.hubei.gov.cn/igs/front/search.jhtml?position=&timeOrder=&code=872801132c71495bbe5a938f6acff5aa&orderBy=all&pageSize=10&type=%E6%96%87%E4%BB%B6&time=&chnldesc=&pageNumber={i}&aggrFieldName=chnldesc&sortByFocus=true&siteId=50&name=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitename=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitetype=%E7%9C%81%E6%94%BF%E5%BA%9C&searchWord=REITS&6LDjm9Ls=0t3_jtGlqEJQLVtYPg5o4LE8KRsDcOrdhcQJ2gpgbWwP9rQyfChv7ADuy_hXWgy2abOG9jq8_hKyrFekh7IWmLmb9VBbEQh7tULy0_6L3zqkGOSoDWEcli5Ympa58KVMviSIxe_LiYGE'
        lcont = reqbase.reqGetHtml(lurl, header)
        if not lcont:
            continue
        try:
            datas = json.loads(lcont)['page']['content']
            for lmsg in datas:
                # Isolate each record so one malformed entry does not discard
                # the remaining entries of the page.
                try:
                    durl = lmsg['url']
                    # Skip early: no need to build the record for a known URL.
                    if rr.sismember('reis_hubeigov', durl):
                        continue
                    detailmsg = {
                        'title': lmsg['DOCTITLE'],
                        'subtitle': lmsg['FileName'],
                        'summary': lmsg['DOCCONTENT'],
                        'createDate': '',
                        'writeDate': '',
                        'pubDate': lmsg['PUBDATE'],
                        'source': lmsg['publisher'],
                        'durl': durl,
                        'content': '',
                        'siteweb': '湖北省人民政府',
                        'docNumberStr': '',
                        'reNum': lmsg['IdxID'],
                    }
                    # paserdetail fills in the detail-page fields in place.
                    paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    # Mark as collected only after the record was stored.
                    rr.sadd('reis_hubeigov', durl)
                except Exception as e:
                    log.info(f'记录处理异常{e}-异常页码{i}')
        except Exception as e:
            # Page-level failure (invalid JSON, unexpected payload shape).
            log.info(f'请求异常{e}-异常页码{i}')

    # The export name matches the site being collected (Hubei); it was
    # previously the Henan name, a leftover from a copied script.
    reqbase.pdwriterXLS(dlist, '湖北省人民政府')

def paserdetail(detailmsg):
    """Fetch the detail page of one record and add the parsed fields to it.

    Args:
        detailmsg: Record dict; must contain the detail page URL under
            ``'durl'``. It is updated in place.

    Returns:
        The same ``detailmsg`` dict. When the page was fetched it gains the
        keys ``contentWithTag``, ``content``, ``idx_num``, ``class_type``,
        ``dplay_gov``, ``pub_date`` and ``fileNum``; elements that are not
        found yield empty strings. On a failed fetch or an unexpected error
        the dict is returned without those keys and the error is logged.

    Raises:
        KeyError: If ``detailmsg`` has no ``'durl'`` entry.
    """
    headers = {
        'Host':'www.hubei.gov.cn',
        'Proxy-Connection':'keep-alive',
        'Pragma':'no-cache',
        'Cache-Control':'no-cache',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Referer':'http://www.hubei.gov.cn/zfwj/ezbf/202303/t20230303_4569220.shtml',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cookie':'924omrTVcFchT=0ydF5mX9FkIQDMfAAr4A60Yt6sHsXZOzTlm30NLRHm2OwX_YgXaMFBUe3WeNORSf0ZqYHjvBxVL5CXNSWoCOOThArMBpBDzXVdWxVIoA5YBGBLbPUN4CbcQQLZEty.w1MZkgI1pn30uv5STvyCsHLoYGTDHDSIbaURf4XIXzC3fNhxDX.nR5ZWV_HBo9ZAyC5I93.otc4vf7nD6v3Tympw6h2ZUuyAJ0Q7Nes3n0dIB_BIhwCkjyvJibUZt04ggU6XeXnS.qXr2CaM8BJQQ4mdLJ5apGqInkYuNv2GJP1AvL',

    }
    durl = detailmsg['durl']
    try:
        dhmsg = reqbase.reqGetHtml(durl, headers)
        if dhmsg:
            soup = BeautifulSoup(dhmsg, 'html.parser')
            # presumably resolves relative links against durl -- verify in reqbase
            soup = reqbase.paserUrl(str(soup), durl)
            # NOTE(review): all five metadata fields are read from the same
            # element (first child of the meta block), so they always carry
            # the same text. The per-field selectors look like an unfinished
            # copy-paste; confirm the page layout and give each field its own
            # selector. Until then the lookup is done once.
            meta_text = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(1)')[1]
            # soupPaserHtml already returns the element's text; it returns
            # ('', '') when nothing matches, so no further .text access is
            # made here (that would fail on the empty-string placeholder).
            contentWithTag, content = soupPaserHtml(soup, 'div[class="hbgov-article-content"]')
            detailmsg['contentWithTag'] = contentWithTag
            detailmsg['content'] = content
            detailmsg['idx_num'] = meta_text
            detailmsg['class_type'] = meta_text
            detailmsg['dplay_gov'] = meta_text
            detailmsg['pub_date'] = meta_text
            detailmsg['fileNum'] = meta_text
    except Exception as e:
        # Best effort: a broken detail page must not stop the list crawl.
        log.info(f'详情解析异常{e}-{durl}')

    return detailmsg

def soupPaserHtml(soup, csstag):
    """Return the first element matching a CSS selector together with its text.

    Args:
        soup: Parsed document offering a ``select(css)`` method.
        csstag: CSS selector string.

    Returns:
        A ``(element, text)`` tuple for the first match. If the lookup fails
        for any reason (for example no match), the failure is logged and
        ``('', '')`` is returned instead.
    """
    try:
        first_match = soup.select(csstag)[0]
        match_text = first_match.text
    except Exception as e:
        # Fall back to empty placeholders so callers can store them directly.
        log.info(f'标签解析异常{e}')
        return '', ''
    return first_match, match_text


# Run the list collection only when this file is executed as a script.
if __name__ == '__main__':
    getList()











