Commit 650c704a  Author: 刘伟刚

REITs website collection

Parent 497088ef
# _*_ coding:utf-8 _*_
# https://www.cq.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '663',
        'Content-Type': 'application/json',
        'Cookie': 'SESSION=MGFhMGQxNDItM2MyOS00NjU5LWI2MTgtZjdiM2UxNjFkMGI3; _trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false',
        'Host': 'www.cq.gov.cn',
        'Origin': 'https://www.cq.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.cq.gov.cn/zwgk/search.html?DOCTITLE=REITs&DEPT=&gte=&lte=&REFERENCENO=&nh=&number=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    dlist = []
    pagenum = 3
    for i in range(1, pagenum):
        log.info(f'Chongqing: fetching list page {i}')
        lurl = 'https://www.cq.gov.cn/irs/front/list'
        data = {
            "customFilter": {
                "operator": "and",
                "properties": [],
                "filters": [
                    {
                        "operator": "or",
                        "properties": [
                            {"property": "f_202121500898", "operator": "eq", "value": "REITs"},
                            {"property": "f_202142777829", "operator": "eq", "value": "REITs"}
                        ],
                        "filters": []
                    },
                    {
                        "operator": "or",
                        "properties": [
                            {"property": "f_202146838317", "operator": "gte", "value": "2023-11-09 16:14:20"},
                            {"property": "f_202146235090", "operator": "gte", "value": "2023-11-09 16:14:20"}
                        ],
                        "filters": [
                            {
                                "operator": "and",
                                "properties": [
                                    {"property": "f_202146838317", "operator": "eq", "value": None},
                                    {"property": "f_202146235090", "operator": "eq", "value": None}
                                ]
                            }
                        ]
                    }
                ]
            },
            "sorts": [],
            "tableName": "t_1775cd018c6",
            "tenantId": "7",
            "pageSize": 10,
            "pageNo": i
        }
        lcont = reqbase.reqPostHtml(lurl, header, data)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['list']
                for lmsg in datas:
                    try:
                        title = lmsg['f_202121500898']
                        subtitle = ''
                        summary = lmsg['f_202142777829']
                        createDate = ''
                        writeDate = ''
                        pubDate = lmsg['save_time']
                        source = lmsg['f_202121437464']
                        durl = lmsg['doc_pub_url']
                        wenjianhao = lmsg['f_202121837479']
                        suoyihao = lmsg['f_202121273539']
                        content = ''
                        siteweb = '重庆市人民政府'
                    except Exception:
                        continue
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'wenjianhao': wenjianhao,
                        'suoyihao': suoyihao,
                    }
                    is_member = rr.sismember('reis_cqgov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_cqgov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '重庆市人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '_trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false; _trs_user=',
        'Host': 'www.cq.gov.cn',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        log.info(f'Detail request URL: {durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        # Try the known content containers in order until one yields text
        contentWithTag, content = soupPaserHtml(soup, 'div[class="zcwjk-xlcon"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="document mt-1 mt-12"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="view TRS_UEDITOR trs_paper_default trs_word"]')
        if not content:
            log.info(f'Detail content empty: {durl}')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# http://www.gxzf.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '385',
        'Content-Type': 'application/json',
        'Cookie': 'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699513657; SEARCHHISTORY=[%22REiTs%22]',
        'Host': 'www.gxzf.gov.cn',
        'Origin': 'http://www.gxzf.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    pagenum = 5
    for i in range(1, pagenum):
        log.info(f'Guangxi: fetching list page {i}')
        lurl = 'http://www.gxzf.gov.cn/irs/front/search'
        data = {
            "code": "181aedaa542",
            "dataTypeId": "241",
            "configCode": "",
            "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7",
            "searchWord": "REiTs",
            "orderBy": "related",
            "searchBy": "all",
            "appendixType": "",
            "granularity": "ALL",
            "isSearchForced": "0",
            "filters": [],
            "pageNo": i,
            "pageSize": 10,
            "isAdvancedSearch": None,
            "isDefaultAdvanced": None,
            "advancedFilters": None,
            "advancedFilters ": None,  # key with trailing space mirrors the site's own request payload
            "historySearchWords": [
                "REiTs"
            ]
        }
        lcont = reqbase.reqPostHtml(lurl, header, data)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['middle']['listAndBox']
                for lmsgg in datas:
                    lmsg = lmsgg['data']
                    title = lmsg['title']
                    subtitle = ''
                    summary = lmsg['table-7']
                    createDate = ''
                    writeDate = lmsg['table-4']
                    pubDate = lmsg['time']
                    source = lmsg['table-3']
                    durl = lmsg['url']
                    docNumberStr = lmsg['table-5']
                    reNum = lmsg['table-1']
                    content = ''
                    siteweb = '广西壮族自治区人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_gxgov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_gxgov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '广西壮族自治区人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; SEARCHHISTORY=[%22REiTs%22]; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699514234',
        'Host': 'www.gxzf.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        # class_type=soupPaserHtml(soup,'div[class="people-desc"]>table>tbody>tr:nth-child(1)>td:nth-child(4)')[1]
        pub_jigou = soupPaserHtml(soup, 'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(1)')[1]
        write_data = soupPaserHtml(soup, 'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(2)')[1]
        # file_num=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(2)')[1]
        # pub_data=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(4)')[1]
        contentWithTag, content = soupPaserHtml(soup, 'div[class="article-con"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="zw"]')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
        # detailmsg['class_type']=class_type
        detailmsg['pub_jigou'] = pub_jigou
        detailmsg['write_data'] = write_data
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# https://www.hainan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153159&searchUseTime-349',
        'Host': 'www.hainan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    dlist = []
    pagenum = 5
    for i in range(1, pagenum):
        log.info(f'Hainan: fetching list page {i}')
        lurl = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={i}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
        lcont = reqbase.reqGetHtml(lurl, header)
        if lcont:
            try:
                soup = BeautifulSoup(lcont, 'html.parser')
                soup = reqbase.paserUrl(str(soup), lurl)
                divlist = soup.select('div[id="showPage"]>div')
                for lmsg in divlist:
                    title = soupPaserHtml(lmsg, 'h3>a')[1]
                    subtitle = ''
                    summary = ''
                    createDate = ''
                    writeDate = ''
                    pubDate = soupPaserHtml(lmsg, 'span[class="quily-con"]')[1]
                    source = soupPaserHtml(lmsg, 'a[class="address-con permitU"]')[1]
                    try:
                        durl = soupPaserHtml(lmsg, 'h3>a')[0].get('href')
                    except Exception:
                        continue
                    docNumberStr = ''
                    reNum = ''
                    content = ''
                    siteweb = '海南省人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_hainangov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_hainangov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '海南省人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'HttpOnly=true; 4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153247&searchUseTime-337; HA_STICKY_apps=apps.srv34; Hm_lvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515700; yfx_c_g_u_id_10005682=_ck23110915414012919174127333485; yfx_f_l_v_t_10005682=f_t_1699515700292__r_t_1699515700292__v_t_1699515700292__r_c_0; _trs_uv=loqvrn5x_4549_5u3r; _trs_ua_s_1=loqvrn5x_4549_1lnl; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515718',
        'Host': 'www.hainan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=1&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        log.info(f'Parsing detail URL: {durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        souyihao = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(1)')[1]
        fenlei = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(2)')[1]
        fawenjiguan = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(1)')[1]
        write_data = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(2)')[1]
        wenhao = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(1)')[1]
        pub_data = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(2)')[1]
        contentWithTag, content = soupPaserHtml(soup, 'div[id="zoom"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="zw"]')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
        detailmsg['souyihao'] = souyihao
        detailmsg['fenlei'] = fenlei
        detailmsg['fawenjiguan'] = fawenjiguan
        detailmsg['wenhao'] = wenhao
        detailmsg['pub_data'] = pub_data
        detailmsg['write_data'] = write_data
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'searchapi.henan.gov.cn',
        'Origin': 'https://www.henan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.henan.gov.cn/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    dlist = []
    pagenum = 10
    for i in range(1, pagenum):
        log.info(f'Henan: fetching list page {i}')
        lurl = f'https://searchapi.henan.gov.cn/open/api/external?keywords=&siteId=4500000001&allKeyword=&anyKeyword=&noKeyword=&searchRange=-1000&sortType=200&beginTime=&endTime=&pageNumber={i}&pageSize=15&fileType=3&channelMarkId=45000000010115416542055691'
        lcont = reqbase.reqGetHtml(lurl, header)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['datas']
                for lmsg in datas:
                    title = lmsg['title']
                    subtitle = lmsg['subtitle']
                    summary = lmsg['summary']
                    createDate = lmsg['createDate']
                    writeDate = lmsg['writeDate']
                    pubDate = lmsg['pubDate']
                    source = lmsg['source']
                    durl = lmsg['selfUrl']
                    docNumberStr = lmsg['docNumberStr']
                    reNum = lmsg['reNum']
                    content = lmsg['content']
                    siteweb = '河南省人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_henangov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_henangov', durl)
            except Exception as e:
                log.info(f'Request error: {e} on page {i}')
    reqbase.pdwriterXLS(dlist, '河南省人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'zh_choose=n; yfx_c_g_u_id_10000001=_ck23110818022219777515353379336; yfx_f_l_v_t_10000001=f_t_1699437742968__r_t_1699437742968__v_t_1699437742968__r_c_0',
        'Host': 'www.henan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.henan.gov.cn/zwgk/fgwj/szfl/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        contentWithTag = soup.select('div[id="content"]')[0]
        content = contentWithTag.text  # plain-text body without tags
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# http://www.hubei.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'JSESSIONID=B970564BAAD37BB8E9EF19F78FF45618; Hm_lvt_5544783ae3e1427d6972d9e77268f25d=1699492684; token=43cfe913-ec04-4aa3-b037-96903ccaa188; uuid=43cfe913-ec04-4aa3-b037-96903ccaa188; Hm_lpvt_5544783ae3e1427d6972d9e77268f25d=1699492955; 924omrTVcFchT=0IZtGVa8M20F2wJXy_C6l9PPFZOO1SBDdB3qZtsaLbLGaQ5t4l6Vt8HF9dIwhxtBcLdkdRZwlK42NCaEUjZZoPsXZAZ1o.tgK50mj8FJZTM5zCxcVg3w4cOCSM4BvYApzj7YMWHycK14.NY6Y.AP6bW6g0jDIqZlbp2hKSpDfZYBhjsgwJJraXKf2S4sgG6swjXFVVUHGngt2GMQPUZQRsE0_tL9Pz3_h6JeSD9qHWLOVKJWz0z8hdC_F4kiGZj2FRjjSUZp0VLUS8pjkrJdGYrKhC5xwGy8xSFYBE_trVuCFjr8.vhLqBONYkoWvZM2qNX_WZXg_3wTLMqCMrjoCmkvHf7B9.MMVu8tMC4hwDT4wjeyoNoRjIlgKuwE.aGhn',
        'Host': 'www.hubei.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.hubei.gov.cn/site/hubei/search.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    pagenum = 3
    for i in range(1, pagenum):
        log.info(f'Hubei: fetching list page {i}')
        lurl = f'http://www.hubei.gov.cn/igs/front/search.jhtml?position=&timeOrder=&code=872801132c71495bbe5a938f6acff5aa&orderBy=all&pageSize=10&type=%E6%96%87%E4%BB%B6&time=&chnldesc=&pageNumber={i}&aggrFieldName=chnldesc&sortByFocus=true&siteId=50&name=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitename=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitetype=%E7%9C%81%E6%94%BF%E5%BA%9C&searchWord=REITS&6LDjm9Ls=0t3_jtGlqEJQLVtYPg5o4LE8KRsDcOrdhcQJ2gpgbWwP9rQyfChv7ADuy_hXWgy2abOG9jq8_hKyrFekh7IWmLmb9VBbEQh7tULy0_6L3zqkGOSoDWEcli5Ympa58KVMviSIxe_LiYGE'
        lcont = reqbase.reqGetHtml(lurl, header)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['page']['content']
                for lmsg in datas:
                    title = lmsg['DOCTITLE']
                    subtitle = lmsg['FileName']
                    summary = lmsg['DOCCONTENT']
                    createDate = ''
                    writeDate = ''
                    pubDate = lmsg['PUBDATE']
                    source = lmsg['publisher']
                    durl = lmsg['url']
                    docNumberStr = ''
                    reNum = lmsg['IdxID']
                    content = ''
                    siteweb = '湖北省人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_hubeigov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_hubeigov', durl)
            except Exception as e:
                log.info(f'Request error: {e} on page {i}')
    reqbase.pdwriterXLS(dlist, '湖北省人民政府')
def paserdetail(detailmsg):
    # opt = webdriver.ChromeOptions()
    # opt.add_argument("--ignore-certificate-errors")
    # opt.add_argument("--ignore-ssl-errors")
    # opt.add_experimental_option("excludeSwitches", ["enable-automation"])
    # opt.add_experimental_option('excludeSwitches', ['enable-logging'])
    # opt.add_experimental_option('useAutomationExtension', False)
    # opt.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
    # chromedriver = r'D:\chrome62\cmdvip\chromedriver.exe'
    # driver = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    headers = {
        'Host': 'www.hubei.gov.cn',
        'Proxy-Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Referer': 'http://www.hubei.gov.cn/zfwj/ezbf/202303/t20230303_4569220.shtml',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '924omrTVcFchT=0ydF5mX9FkIQDMfAAr4A60Yt6sHsXZOzTlm30NLRHm2OwX_YgXaMFBUe3WeNORSf0ZqYHjvBxVL5CXNSWoCOOThArMBpBDzXVdWxVIoA5YBGBLbPUN4CbcQQLZEty.w1MZkgI1pn30uv5STvyCsHLoYGTDHDSIbaURf4XIXzC3fNhxDX.nR5ZWV_HBo9ZAyC5I93.otc4vf7nD6v3Tympw6h2ZUuyAJ0Q7Nes3n0dIB_BIhwCkjyvJibUZt04ggU6XeXnS.qXr2CaM8BJQQ4mdLJ5apGqInkYuNv2GJP1AvL',
    }
    durl = detailmsg['durl']
    try:
        dhmsg = reqbase.reqGetHtml(durl, headers)
        if dhmsg:
            soup = BeautifulSoup(dhmsg, 'html.parser')
            soup = reqbase.paserUrl(str(soup), durl)
            # Note: the original read all five meta fields from div:nth-child(1);
            # the indices below assume the meta items appear in document order.
            idx_num = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(1)')[1]
            class_type = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(2)')[1]
            dplay_gov = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(3)')[1]
            pub_date = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(4)')[1]
            fileNum = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(5)')[1]
            contentWithTag, content = soupPaserHtml(soup, 'div[class="hbgov-article-content"]')
            detailmsg['contentWithTag'] = contentWithTag
            detailmsg['content'] = content
            detailmsg['idx_num'] = idx_num
            detailmsg['class_type'] = class_type
            detailmsg['dplay_gov'] = dplay_gov
            detailmsg['pub_date'] = pub_date
            detailmsg['fileNum'] = fileNum
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# Shared request and parsing helpers used by the site-specific collectors
import json
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import xlsxwriter
import openpyxl

import BaseCore

baseCore = BaseCore.BaseCore()
def reqGetHtml(url, header):
    # Retry up to three times, taking a fresh proxy on each attempt
    hcont = ''
    for i in range(0, 3):
        try:
            proxy = baseCore.get_proxy()
            response = requests.get(url=url, headers=header, proxies=proxy, verify=False, timeout=10)
            response.encoding = response.apparent_encoding
            hcont = response.text
            if hcont:
                break
        except Exception:
            hcont = ''
    return hcont


def reqPostHtml(url, header, data):
    # POST either a raw string body or a dict serialized as JSON
    hcont = ''
    for i in range(0, 3):
        try:
            proxy = baseCore.get_proxy()
            if isinstance(data, str):
                res = requests.post(url=url, data=data, headers=header, proxies=proxy, verify=False, timeout=10)
            else:
                res = requests.post(url=url, data=json.dumps(data), headers=header, verify=False, timeout=10)
            hcont = res.text
            if hcont:
                break
        except Exception:
            hcont = ''
    return hcont


def reqPostStrHtml(url, header, data):
    hcont = ''
    for i in range(0, 3):
        try:
            res = requests.post(url=url, data=data, headers=header, verify=False, timeout=10)
            hcont = res.text
            if hcont:
                break
        except Exception:
            hcont = ''
    return hcont
def createDriver():
    # Note: the original assigned these two paths the other way around;
    # Service() needs the chromedriver executable, binary_location the browser.
    chrome_driver = r'D:\chrome\chromedriver.exe'
    path = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    # Optional proxy settings
    # proxy = "127.0.0.1:8080"  # proxy host and port
    # chrome_options.add_argument('--proxy-server=http://' + proxy)
    driver = webdriver.Chrome(service=path, options=chrome_options)
    return driver
# Convert relative URLs in the HTML into absolute URLs
def paserUrl(html, listurl):
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')
    # Collect all <a> and <img> tags
    links = html.find_all(['a', 'img'])
    # Rewrite relative href/src attributes against the page URL
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
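# Usage sketch for paserUrl (hypothetical URLs, for illustration only):
#
#     html = '<a href="/zwgk/doc.html">doc</a><img src="img/x.png"/>'
#     fixed = paserUrl(html, 'https://www.example.gov.cn/list/')
#     # href -> https://www.example.gov.cn/zwgk/doc.html
#     # src  -> https://www.example.gov.cn/list/img/x.png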
def pdwriterXLS(dlist, siteName):
    # Dump the collected records to <siteName>.xlsx via pandas/xlsxwriter
    df_out = pd.DataFrame(data=dlist)
    df_out.to_excel(siteName + '.xlsx', engine='xlsxwriter', index=False)
# _*_ coding:utf-8 _*_
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Content-Length': '185',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin': 'https://sheng.so-gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://sheng.so-gov.cn/',
        'Sec-Ch-Ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'Suid': 'cf354a807a13d634f76bf167610f9c07',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    pagenum = 4
    for i in range(3, pagenum):
        log.info(f'Yunnan: fetching list page {i}')
        lurl = 'https://api.so-gov.cn/s'
        data = f'siteCode=5300000033&tab=zcwj&timestamp=1699525503095&wordToken=72df37fd2f1058524e0c7467610d9ab7&page={i}&pageSize=20&qt=REITs&timeOption=0&sort=dateDesc&keyPlace=0&fileType=&toolsStatus=1'
        lcont = reqbase.reqPostHtml(lurl, header, data)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['search']['searchs']
                for myValues in datas:
                    lmsg = myValues['myValues']
                    try:
                        title = lmsg['DRETITLEO']
                        subtitle = ''
                        summary = lmsg['QUICKDESCRIPTION']
                        createDate = ''
                        writeDate = ''
                        pubDate = ''
                        source = lmsg['WEBSITENAME']
                        durl = lmsg['URL']
                        wenjianhao = lmsg['C3']
                        suoyinhao = ''
                        content = ''
                        siteweb = '云南省人民政府'
                    except Exception:
                        continue
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'wenjianhao': wenjianhao,
                        'suoyinhao': suoyinhao,
                    }
                    is_member = rr.sismember('reis_yngov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_yngov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '云南人民政府-政策2')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '_gscu_802487706=99519044sn0ozi20; _gscbrs_802487706=1; Hm_lvt_b9099e95d08017e30f6285a8b55eb822=1699519045; TrsAccessMonitor=TrsAccessMonitor-1699519056000-2819180807; _gscs_802487706=995190442fewym20|pv:2; Hm_lpvt_b9099e95d08017e30f6285a8b55eb822=1699519714',
        'Host': 'www.yn.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://sheng.so-gov.cn/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        log.info(f'Detail page title: {detailmsg["title"]}')
        log.info(f'Detail request URL: {durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        suoyinhao = soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(1)>dd')[1]
        wenjianhao = soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(2)>dd')[1]
        pubDate = soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(4)>dd')[1]
        contentWithTag, content = soupPaserHtml(soup, 'div[class="trs_editor_view TRS_UEDITOR trs_paper_default trs_web"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="content"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="view TRS_UEDITOR trs_paper_default trs_external trs_web trs_key4format"]')
        if not content:
            log.info(f'Detail content empty: {durl}')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
        detailmsg['suoyinhao'] = suoyinhao
        detailmsg['wenjianhao'] = wenjianhao
        detailmsg['pubDate'] = pubDate
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
Index
An index is a collection of documents of the same kind, comparable to a table in a relational database. An index is identified by its name, which must be lowercase.
Document
A single record in an index is called a document, equivalent to a row in a relational table.
Field
A field of the JSON structure, equivalent to a column in a relational table.
Mapping
A mapping sets the rules and constraints for how data is handled: a field's data type, default value, analyzer, whether it is indexed, and so on.
Shards
An index can hold more data than a single node's hardware allows, analogous to splitting a table.
ES can divide an index into multiple pieces; each piece is called a shard.
You can specify the number of shards when creating an index.
Sharding lets the content volume be split and scaled horizontally, and lets operations run distributed and in parallel across shards, raising performance and throughput.
Replicas
Replicas provide high availability when a shard or node fails.
Crucially, a replica shard is never placed on the same node as its primary shard.
Replicas also scale search volume and throughput, since searches can run on all replicas in parallel.
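To see shards and replicas in practice, a small sketch (assuming an elasticsearch-py 7.x client against the same node used below) reads an index's settings and the cluster health:

from elasticsearch import Elasticsearch

es = Elasticsearch(["192.168.1.90:9200"])
# Primary shard / replica counts configured on the index
settings = es.indices.get_settings(index='test_data')
print(settings['test_data']['settings']['index']['number_of_shards'])
print(settings['test_data']['settings']['index']['number_of_replicas'])
# Cluster health summarises active and unassigned shards across nodes
print(es.cluster.health())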
from elasticsearch import Elasticsearch

# Connect to ES
es = Elasticsearch(["192.168.1.90:9200"],
                   sniff_on_start=True,            # sniff the cluster before first use
                   sniff_on_connection_fail=True,  # refresh nodes when one stops responding
                   sniffer_timeout=60)             # sniffer timeout in seconds
index_name = 'test_data'


def main():
    # Connect to ES
    es = Elasticsearch(["192.168.1.90:9200"],
                       sniff_on_start=True,            # sniff the cluster before first use
                       sniff_on_connection_fail=True,  # refresh nodes when one stops responding
                       sniffer_timeout=60)             # sniffer timeout in seconds


if __name__ == '__main__':
    main()
# Create an index
def create_index():
    # Define the mapping body
    body_index = {
        'mappings': {
            'properties': {
                'name': {
                    'type': 'keyword'
                },
                'age': {
                    'type': 'long'
                },
                'tags': {
                    'type': 'text'
                }
            }
        },
        'settings': {
            'index': {
                'number_of_shards': '3',
                'number_of_replicas': '0'
            }
        }
    }
    # Create the index (ignore 400 if it already exists)
    res = es.indices.create(index=index_name, body=body_index, ignore=400)
def insert_data():
    person1 = {
        'name': '张三',
        'age': 18,
        'tags': '勤奋学习十载寒窗,凿壁借光,囊萤映雪,手不释卷,有良好的表达能力。有耐心心态好,善于维系客户关系。果断热情勇敢孤僻活力,思想成熟能够独立工作。'
    }
    res = es.index(index=index_name, body=person1)


from elasticsearch import helpers


def insert_batch():
    insert_infos = []
    person2 = {
        '_index': index_name,
        'name': '李四',
        'age': 20,
        'tags': '有极强的领导艺术,公正严明铁面无私,公私分明。关心他人无微不至,体贴入微。精力充沛,并有很强的事业心。气吞山河正气凛然,善于同各种人员打交道。'
    }
    person3 = {
        '_index': index_name,
        'name': '王五',
        'age': 19,
        'tags': '尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。'
    }
    insert_infos.append(person2)
    insert_infos.append(person3)
    helpers.bulk(client=es, actions=insert_infos)
def del_index():
    # Delete the index
    res = es.indices.delete(index=index_name, ignore=[400])


def del_doc_byid():
    # Delete by id
    res = es.delete(index=index_name, id='bKTgXYUBfH4USN9RFMOh')


def del_by_condition():
    # Delete by query
    body = {
        'query': {
            'match': {
                'name': '张三'
            }
        }
    }
    res = es.delete_by_query(index=index_name, body=body, ignore=[400, 404])
# index() does two things: it inserts the document if it does not exist, otherwise it updates it.
# When updating via index(), the body must contain every field; any field left out is cleared.
def index_update_doc():
    body = {
        'name': '王五',
        'age': 19,
        'tags': '尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。'
    }
    res = es.index(index=index_name, id='baTgXYUBfH4USN9RFMOh', body=body)


def update_doc():
    # update() patches only the fields listed under 'doc'
    body = {
        'doc': {
            'name': '王五'
        }
    }
    es.update(index=index_name, id='baTgXYUBfH4USN9RFMOh', body=body)


def select_info():
    # Inspect index metadata in ES
    index_info = es.indices.get('*')
    # Index names
    index_names = index_info.keys()
    index_name = 'es_index'
    print(es.indices.exists(index_name))
    doc_count = es.count(index=index_name)
def query_by_id():
    body = {
        'query': {
            'match': {
                '_id': 'baTgXYUBfH4USN9RFMOh'
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_field():
    body = {
        'query': {
            'match': {
                'age': 20
            }
        },
        '_source': ['name', 'tags']
    }
    res = es.search(index=index_name, body=body)


def query_by_sort():
    body = {
        'sort': {
            'age': {
                'order': 'desc'  # asc: ascending, desc: descending
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_range():
    body = {
        'query': {
            'range': {
                'age': {
                    'gt': 18,
                    'lte': 20
                }
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_page():
    body = {
        'sort': {
            'age': {
                'order': 'desc'  # asc: ascending, desc: descending
            }
        },
        'from': 0,
        'size': 1
    }
    res = es.search(index=index_name, body=body)


def query_by_phrase():
    body = {
        "query": {
            "match_phrase": {
                "tags": "耐心"
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_mult():
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "name": "张三"
                        }
                    },
                    {
                        "match_phrase": {
                            "tags": "耐心"
                        }
                    }
                ]
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_not():
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "name": "王五"
                        }
                    }
                ],
                'must_not': [
                    {
                        "match_phrase": {
                            "tags": "耐心"
                        }
                    }
                ]
            }
        }
    }
    res = es.search(index=index_name, body=body)
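Each query helper above leaves the raw response in res without reading it; a minimal sketch of pulling the hits back out (assuming the ES 7.x response shape):

def print_hits(res):
    # Total number of matching documents
    print(res['hits']['total']['value'])
    # Each hit carries its id, score and the stored _source document
    for hit in res['hits']['hits']:
        print(hit['_id'], hit['_score'], hit['_source'])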