# _*_ coding:utf-8 _*_
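# https://www.gd.gov.cn/
"""
Information-collection workflow
1. Build the list-page requests
2. Parse and clean the detail-page content
3. De-duplicate the collected items by link
4. Define the fields written to the output file
5. Page content is requested via requests or selenium

"""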
import json

import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore
# Shared project helper object; it supplies both the logger and the `r` handle used below.
baseCore=BaseCore.BaseCore()
log=baseCore.getLogger()

# Handle used for the 'reis_gdgov' set operations (sismember / sadd) that de-duplicate
# detail URLs across runs. Presumably a Redis client created by BaseCore -- confirm there.
rr=baseCore.r

def _gd_search_headers():
    """Return a fresh dict of the headers sent with every search-API request.

    The Cookie and X-XSRF-TOKEN values are captured session values.
    NOTE(review): they presumably expire -- confirm how they are refreshed.
    """
    return {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # NOTE(review): this fixed length is sent with payloads of different
        # sizes. Presumably the HTTP client inside reqbase recomputes it --
        # confirm, otherwise this header should be dropped.
        'Content-Length': '147',
        'Content-Type': 'application/json',
        'Cookie': 'cmssearch_session=ppktmj4Iyt337q8j2yiQBGvfwpRlLe44ifEPusj2; SEARCH_LIST=%5B%22REITS%22%2C%22REITs%22%5D; XSRF-TOKEN=eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'Host': 'search.gd.gov.cn',
        'Origin': 'https://search.gd.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://search.gd.gov.cn/search/file/2?page=1&position=all&keywords=REITs&filterType=localSite&filterId=undefined',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-XSRF-TOKEN': 'eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }


def _gd_tag_payload(tag, page):
    """Build the search payload for one page of results filtered by *tag*.

    *tag* is sent as both ``label`` and ``tag_name``; *page* is sent as an int.
    """
    return {
        "label": tag,
        "position": "all",
        "keywords": "REITs",
        "sort": "smart",
        "site_id": "2",
        "range": "site",
        "page": page,
        "tag_name": tag,
        "recommand": 1,
        "gdbsDivision": "440000",
        "service_area": 1,
    }


def _gd_detailmsg(lmsg):
    """Map one search-result record onto the output record.

    Raises KeyError when *lmsg* lacks one of the fields read here.
    """
    return {
        'title': lmsg['title'],
        'subtitle': '',
        'summary': lmsg['content'],
        'createDate': '',
        'writeDate': '',
        'pubDate': lmsg['pub_time'],
        'source': lmsg['source'],
        'durl': lmsg['url'],
        'content': '',
        'siteweb': '广东省人民政府',
        'docNumberStr': lmsg['document_number'],
        'reNum': lmsg['identifier'],
    }


def _gd_collect(build_payload, pagenum, outname):
    """Crawl the search API page by page and write the new records to *outname*.

    Args:
        build_payload: callable taking the page number and returning the POST payload.
        pagenum: exclusive upper bound; pages 1 .. pagenum-1 are requested.
        outname: name passed to ``reqbase.pdwriterXLS`` together with the records.

    A record whose URL is already in the 'reis_gdgov' set is skipped. Every other
    record is enriched by ``paserdetail``, collected, and its URL is then added
    to that set.
    """
    lurl = 'https://search.gd.gov.cn/api/search/file'
    header = _gd_search_headers()
    dlist = []
    for i in range(1, pagenum):
        log.info(f'gdgov采集第{i}页列表')
        lcont = reqbase.reqPostHtml(lurl, header, build_payload(i))
        if not lcont:
            continue
        try:
            # list() makes a non-iterable 'list' value fail here, inside the handler.
            datas = list(json.loads(lcont)['data']['list'])
        except Exception as e:
            # Best-effort boundary: an unreadable page is logged and skipped.
            log.info(f'列表解析异常{e}')
            continue
        for lmsg in datas:
            # Per-record handling, so one malformed record or failed detail fetch
            # no longer discards the remaining records of the page.
            try:
                detailmsg = _gd_detailmsg(lmsg)
                durl = detailmsg['durl']
                if rr.sismember('reis_gdgov', durl):
                    continue
                dlist.append(paserdetail(detailmsg))
                rr.sadd('reis_gdgov', durl)
            except Exception as e:
                log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(dlist, outname)


def getList(pagenum=3):
    """Collect the unfiltered "REITs" search results into '广东省人民政府-政策文件'.

    Args:
        pagenum: exclusive upper bound of the page numbers requested (default 3,
            i.e. pages 1 and 2).
    """
    def build_payload(page):
        # This variant sends the page number as a string and has no label/tag filter.
        return {
            "page": str(page),
            "position": "all",
            "keywords": "REITs",
            "sort": "smart",
            "site_id": "2",
            "range": "site",
            "recommand": 1,
            "gdbsDivision": "440000",
            "service_area": 1,
        }

    _gd_collect(build_payload, pagenum, '广东省人民政府-政策文件')


def getList2(pagenum=3):
    """Collect the "REITs" results tagged '政策解读' into '广东省人民政府-政策解读'.

    Args:
        pagenum: exclusive upper bound of the page numbers requested (default 3,
            i.e. pages 1 and 2).
    """
    _gd_collect(
        lambda page: _gd_tag_payload("政策解读", page),
        pagenum,
        '广东省人民政府-政策解读',
    )


def getList3(pagenum=2):
    """Collect the "REITs" results tagged '计划规划' into '广东省人民政府-计划规划'.

    Args:
        pagenum: exclusive upper bound of the page numbers requested (default 2,
            i.e. page 1 only).
    """
    _gd_collect(
        lambda page: _gd_tag_payload("计划规划", page),
        pagenum,
        '广东省人民政府-计划规划',
    )

def paserdetail(detailmsg):
    """Fetch the page at ``detailmsg['durl']`` and add the parsed fields to the record.

    On success the keys 'contentWithTag', 'content', 'class_type', 'pub_jigou'
    and 'write_data' are set on *detailmsg*. If parsing raises, the failure is
    logged and the record is returned without those updates.

    The fetch itself runs outside the ``try`` block, so anything
    ``reqbase.reqGetHtml`` raises propagates to the caller.

    Args:
        detailmsg: dict with at least the key 'durl'; it is modified in place.

    Returns:
        The same *detailmsg* dict.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'gkmlfront_session=eyJpdiI6InRLSFR6VDc2bVRFcDY5ZlN1aW9yWnc9PSIsInZhbHVlIjoiaU9pdGVDWkxMRitQWFNJK2dEbVdVRDg5bXo2RDg5SEhMSTZWaWpYRGh1XC83NzJEVDFaNFQyUjhHWk5MVUFQTVIiLCJtYWMiOiJiNjdmZmQ1YTY3ZGE4OGQwZWI4OTJiZjQ2NDRjMTJjZjhlNjJiODIzZjMxY2Q2ODFhZGRlMWMyODI0YmMyZTI0In0%3D; front_uc_session=eyJpdiI6ImhaZ3E3VmxkUnFwT0hjbUw4cSs4d3c9PSIsInZhbHVlIjoiTlhhME1jWkVXTUtzK285cmYzWHlSZDV2c3p6M2ZIaEI5NjVJZnVwMDBsVmFXMDI5MWk3bGU0b0NSZHA1WGZobSIsIm1hYyI6IjZjZTMzOThhMGRjNWFjZDdmMzA2Njc0N2UxNThlOGQ0ZDU5OWJjMGIxOTY5ZGRjOWYzZDczZTk4OTFjMTBkNzYifQ%3D%3D',
        'Host': 'www.gd.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gd.gov.cn/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        soup = BeautifulSoup(dhmsg, 'html.parser')
        # The serialized page and its URL go through reqbase.paserUrl; the result
        # is what the selectors below are run against.
        soup = reqbase.paserUrl(str(soup), durl)

        # Metadata cells of the "classify" table; only the text part is kept.
        class_type = soupPaserHtml(soup, 'div[class="classify"]>table>tbody:nth-child(1)>td:nth-child(4)')[1]
        pub_jigou = soupPaserHtml(soup, 'div[class="classify"]>table>tbody:nth-child(2)>td:nth-child(2)')[1]
        write_data = soupPaserHtml(soup, 'div[class="classify"]>table>tbody:nth-child(2)>td:nth-child(4)')[1]

        # Main body: try the "article-content" container first, then "zw".
        contentWithTag, content = soupPaserHtml(soup, 'div[class="article-content"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="zw"]')

        # Apply all fields together, in the same key order as before.
        detailmsg.update({
            'contentWithTag': contentWithTag,
            'content': content,
            'class_type': class_type,
            'pub_jigou': pub_jigou,
            'write_data': write_data,
        })
    except Exception as e:
        # Reported through the module logger, like every other handler in this file.
        log.info(f'详情解析异常{e}')
    return detailmsg

def soupPaserHtml(soup,csstag):
    """Return the first element matching *csstag* together with its text.

    Args:
        soup: object whose ``select(csstag)`` returns the matching elements.
        csstag: CSS selector string.

    Returns:
        ``(element, element.text)`` for the first match, or ``('', '')`` after
        logging the error when the lookup raises (for example, no match).
    """
    try:
        first_match = soup.select(csstag)[0]
        match_text = first_match.text
    except Exception as e:
        log.info(f'标签解析异常{e}')
        return '', ''
    return first_match, match_text

if __name__ == '__main__':
    # Script entry point. Only getList3 is currently enabled; the other two
    # crawls are left commented out and can be re-enabled here.
    # getList()
    # getList2()
    getList3()











