Commit 497088ef  Author: 刘伟刚

Code update 11

Parent 3154b028
# -*- coding:utf-8 -*-
import datetime
import time
@@ -26,11 +26,11 @@ def page_list():
header = {
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
- 'Content-Length':'72',
+ 'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
- 'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
+ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
+ 'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
@@ -40,9 +40,9 @@ def page_list():
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
- 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
+ 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
- 'Accept-Language':'zh-CN,zh'
+ 'Accept-Language':'zh-CN,zh;q=0.9'
}
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,445):
@@ -79,23 +79,23 @@ def detailpaser(dmsg):
hh={
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
- 'Content-Length':'14',
+ 'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
- 'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
+ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
+ 'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
- 'x-yss-page':'publicService/pages/policyQALibrary/detail/detail',
+ 'x-yss-page':'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
- 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
+ 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
- 'Accept-Language':'zh-CN,zh'
+ 'Accept-Language':'zh-CN,zh;q=0.9'
}
try:
durl=dmsg['url']
......
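For orientation: both hunks above only refresh session-bound header fields (the x-tif-sid, the XWEB build in the User-Agent, and the package version in the Referer). A minimal sketch of how such a header dict drives the paging request follows; the payload shape and response envelope are assumptions, since the request body itself sits in the collapsed part of the diff.

# Hypothetical sketch of the page_list request loop; payload and response
# field names are assumed, not taken from this commit.
import requests

header = {
    'Host': 'xcx.www.gov.cn',
    'Content-Type': 'application/json',
    'xweb_xhr': '1',
    # ... plus the x-tif-*, User-Agent and Referer fields from the hunk above
}
url = 'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1, 445):
    resp = requests.post(url, json={'page': i}, headers=header, timeout=20)
    resp.raise_for_status()
    rows = resp.json().get('data', [])  # assumed response envelope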
@@ -74,19 +74,7 @@ class HgDownFile(object):
return cookie
# request the file download
def reqDownFile(self,data):
- header={
- 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
- 'Accept-Encoding':'gzip, deflate',
- 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
- 'Cache-Control':'max-age=0',
- 'Content-Type':'application/x-www-form-urlencoded',
- 'Host':'stats.customs.gov.cn',
- 'Origin':'http://stats.customs.gov.cn',
- 'Proxy-Connection':'keep-alive',
- 'Upgrade-Insecure-Requests':'1',
- 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
- 'Cookie': self.getcookie()
- }
data=data
proxy={}
# response=requests.post(url=self.downUrl,data=data,headers=header,verify=False,timeout=20)
@@ -95,19 +83,36 @@ class HgDownFile(object):
while statuscode != 200:
# time.sleep(5)
try:
+ # header={
+ # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+ # 'Accept-Encoding':'gzip, deflate',
+ # 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+ # 'Cache-Control':'max-age=0',
+ # 'Content-Type':'application/x-www-form-urlencoded',
+ # 'Host':'stats.customs.gov.cn',
+ # 'Origin':'http://stats.customs.gov.cn',
+ # 'Proxy-Connection':'keep-alive',
+ # 'Upgrade-Insecure-Requests':'1',
+ # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
+ # 'Cookie': self.getcookie()
+ # }
+ header={
+ 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+ 'Accept-Encoding':'gzip, deflate',
+ 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+ 'Cache-Control':'max-age=0',
+ 'Content-Length':'306',
+ 'Content-Type':'application/x-www-form-urlencoded',
+ 'Host':'stats.customs.gov.cn',
+ 'Origin':'http://stats.customs.gov.cn',
+ 'Proxy-Connection':'keep-alive',
+ #'Referer':'http://stats.customs.gov.cn/queryData/queryDataList?pageNum=1&codeLength=8&currentStartTime=202203&currentEndTime=202309&currentDateBySource=202309&selectTableState=3&orderType=CODE%20ASC%20DEFAULT&iEType=0&currencyType=usd&year=2022&startMonth=1&endMonth=11&monthFlag=&unitFlag=false&unitFlag1=false&outerField1=&outerField2=CODE_TS&outerField3=&outerField4=&outerValue1=&outerValue2=&outerValue3=&outerValue4=',
+ 'Upgrade-Insecure-Requests':'1',
+ 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64',
+ 'Cookie': self.getcookie()
+ }
+ data_str = '&'.join([f"{key}={value}" for key, value in data.items()])
response=requests.post(url=self.downUrl,data=data,headers=header,verify=False,timeout=20)
# response.encoding = response.apparent_encoding
response.encoding = 'GB2312'
@@ -218,6 +223,7 @@ class HgDownFile(object):
# 2022 months 1-1  202202  2   data up to Jan 2022 uses 2
# 2022 months 1-2  202202  3   2022 cumulative data uses 3
# 2022 months 2-2  202202  1   data after Jan 2022 uses 1
+ selectTableState=2
if year<2022:
selectTableState= 2 # data before 202202 uses 2, after uses 1
else:
......@@ -225,9 +231,11 @@ class HgDownFile(object):
e=int(endMonth)
if year==2022 and s<e: # 2022 cumulative data needs its own parameters
selectTableState= 3
+ if e==2:
+ selectTableState= 2
elif year==2022 and e==1:
selectTableState= 2
- else:
+ elif year==2022 and s==e:
selectTableState= 1 # data before 202202 uses 2, after uses 1
param={
'pageSize': 10,
@@ -237,9 +245,9 @@ class HgDownFile(object):
'startMonth': startMonth,
'endMonth': endMonth,
'monthFlag':'',
- 'unitFlag': False,
- 'unitFlag1': False,
- 'codeLength': '8',
+ 'unitFlag': True,
+ 'unitFlag1': True,
+ 'codeLength': 8,
'outerField1': outerField1,
'outerField2':'',
'outerField3':'',
@@ -250,24 +258,26 @@ class HgDownFile(object):
'outerValue4':'',
'orderType': 'CODE ASC DEFAULT',
'selectTableState': selectTableState, # data before 202201 uses 2, after uses 1
- 'currentStartTime': '202202',
+ 'currentStartTime': 202203,
}
return param
# parameter setup for joint-query fields
def setcodesAndProductparam(self,iEType,currencyType,year,startMonth,endMonth,outerField1,filedCode):
selectTableState= 1 # default is 1
if year<2022:
- selectTableState= 2 # data before 202202 uses 2, after uses 1
+ selectTableState= 2 # data before 202203 uses 2
else:
s=int(startMonth)
e=int(endMonth)
- if year==2022 and s<e: # 2022 cumulative data needs its own parameters
+ if year==2022 and s<e: # 2022 cumulative data uses 3
selectTableState= 3
+ if e==2:
+ selectTableState= 2
elif year==2022 and e==1:
- selectTableState= 2
- else:
- selectTableState= 1 # data before 202202 uses 2, after uses 1
+ selectTableState= 2 # under the 202203 tables, January as a single month uses 2
+ elif year==2022 and s==e:
+ selectTableState= 1 # under the 202203 tables, single months other than January use 1
param={
'pageSize': 10,
'iEType': iEType,
@@ -276,8 +286,8 @@ class HgDownFile(object):
'startMonth': startMonth,
'endMonth': endMonth,
'monthFlag':'',
- 'unitFlag': False,
- 'unitFlag1': False,
+ 'unitFlag': True,
+ 'unitFlag1': True,
'codeLength': '8',
'outerField1': outerField1,
'outerField2':'CODE_TS',
@@ -289,7 +299,7 @@ class HgDownFile(object):
'outerValue4':'',
'orderType': 'CODE ASC DEFAULT',
'selectTableState': selectTableState,
- 'currentStartTime': '202202',
+ 'currentStartTime': 202203,
}
return param
......
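Taken together, the selectTableState changes in both builders encode the dating rules from the comments: the customs tables were re-cut at 202203, 2022 cumulative ranges get their own state, and January 2022 is special-cased. Below is a hedged restatement of the post-diff branch logic in setparam as a standalone helper; this function does not exist in the repo, and note that setcodesAndProductparam defaults to 1 rather than 2, so the two builders still disagree for years after 2022.

# Illustrative mirror of the selectTableState branches in setparam after this commit.
def select_table_state(year: int, start_month: int, end_month: int) -> int:
    state = 2                                   # default introduced by this diff
    if year < 2022:
        state = 2                               # pre-202203 data lives in the old table
    elif year == 2022 and start_month < end_month:
        state = 3                               # 2022 cumulative ranges use 3 ...
        if end_month == 2:
            state = 2                           # ... except 1-2, which still counts as old data
    elif year == 2022 and end_month == 1:
        state = 2                               # January 2022 as a single month
    elif year == 2022 and start_month == end_month:
        state = 1                               # other single months of 2022
    return state                                # later years fall through to the default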
@@ -6,7 +6,7 @@ import json
import pymysql
from pyquery import PyQuery as pq
from flask_cors import cross_origin
+ from urllib.parse import unquote
'''
@@ -71,12 +71,15 @@ def index():
def get_news():
data=request.form
- @app.route('/task/setCookie', methods=['GET'])
- # @cross_origin()
+ @app.route('/task/setCookie', methods=['GET','POST'])
+ @cross_origin()
def setCookie():
try:
- cookie = request.args.get('cookie')
- r.sadd('hgcookie',cookie)
+ # cookie = request.args.get('cookie')
+ hgcookie = request.form.get('cookie')
+ hgcookie = unquote(hgcookie)
+ r.sadd('hgcookie',hgcookie)
+ print(f'setCookie添加cookie成功到redis{hgcookie}')
except Exception as e:
print('error')
return 'succes'
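Because the endpoint now reads the cookie from POST form data and unquote()s it, a matching client call looks roughly like this; host, port, and the sample cookie value are assumptions.

# Hypothetical client for the revised /task/setCookie endpoint.
from urllib.parse import quote
import requests

cookie = 'ASP.NET_SessionId=abc123'            # placeholder, not a real cookie
resp = requests.post(
    'http://127.0.0.1:5000/task/setCookie',    # assumed Flask host/port
    data={'cookie': quote(cookie)},            # the server runs unquote() on this field
    timeout=10,
)
print(resp.text)                               # returns 'succes' on success and failure alike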
@@ -100,7 +103,7 @@ def getCookieSize():
return jsonify(data)
@app.route('/task/getHtml', methods=['POST'])
- # @cross_origin()
+ @cross_origin()
def getnewMonth():
try:
html = request.form.get('html')
......
import os
+ from urllib.parse import unquote
import redis
from flask import Flask, request, send_file, render_template, jsonify
@@ -71,12 +72,14 @@ def index():
def get_news():
data=request.form
- @app.route('/ws/setCookie', methods=['GET'])
+ @app.route('/ws/setCookie', methods=['GET','POST'])
# @cross_origin()
def setCookie():
try:
- cookie = request.args.get('cookie')
- r.sadd('wscookie',cookie)
+ # cookie = request.args.get('cookie')
+ wscookie = request.form.get('cookie')
+ wscookie = unquote(wscookie)
+ r.sadd('wscookie',wscookie)
except Exception as e:
print('error')
return 'succes'
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Changes to the 裁判文书网 (China Judgements Online) spider:
1. Previously, incomplete header information meant list requests returned none of the needed data.
2. Request parameters are generated from the site's JS code, derived from the current time.
3. Returned data is decrypted with 3DES (Triple DES).
4. Collection flow reworked:
a browser refreshes the page on a timer to harvest cookie information into Redis; each request pulls one cookie from Redis.
5. The account-ban policy is unknown and still needs testing:
a machine may only make requests with its own local cookies; switching to another account's cookies can get that account banned.
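A minimal sketch of points 3-5 follows, assuming pycryptodome for the 3DES step and the wscookie Redis set that the /ws/setCookie endpoint above populates; the 24-byte key, the IV, and the payload field name are placeholders, since the real key material is derived from the site's JS.

# Sketch of the cookie-pull + 3DES decryption flow; key, IV and field names are assumed.
import base64

import redis
from Crypto.Cipher import DES3            # pycryptodome
from Crypto.Util.Padding import unpad

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def pick_cookie() -> str:
    # Draw one cookie at random from the set filled by the browser-refresh harvester.
    raw = r.srandmember('wscookie')
    return raw.decode('utf-8') if raw else ''

def des3_decrypt(payload_b64: str, key24: bytes, iv8: bytes) -> str:
    # Decrypt a base64-encoded 3DES/CBC payload; the real key and IV come from the site's JS.
    cipher = DES3.new(key24, DES3.MODE_CBC, iv8)
    plain = unpad(cipher.decrypt(base64.b64decode(payload_b64)), DES3.block_size)
    return plain.decode('utf-8')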
@@ -486,8 +486,8 @@ class JrttnewsSpider(object):
def extractorMsg(self,url,title):
content=''
contentWithTag=''
- lang=''
- lang=self.detect_language(title)
+ lang='cn'
+ # lang=self.detect_language(title)
sm=SmartExtractor(lang)
try:
# raw_html=self.detailHtml(url)
......
@@ -48,6 +48,7 @@ class SougouSpider(object):
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue()
self.qurl = Queue()
@@ -373,9 +374,9 @@ class SougouSpider(object):
def extractorMsg(self,url,title):
content=''
contentWithTag=''
- lang=''
+ lang='cn'
try:
- lang=self.detect_language(title)
+ # lang=self.detect_language(title)
raw_html=self.webDriver(url)
sm=SmartExtractor(lang)
article=sm.extract_by_html(raw_html)
......
@@ -235,7 +235,7 @@ if __name__ == '__main__':
# create a thread pool (max_workers below caps it at 1)
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# submit one task per data item to the pool
- results = [executor.submit(sougouTaskJob.runSpider, data) for data in kwList]
+ results = [executor.submit(sougouTaskJob.runLocSpider, data) for data in kwList]
# collect results as tasks complete
for future in concurrent.futures.as_completed(results):
try:
......
@@ -7,7 +7,7 @@ import openpyxl
from urllib.parse import urlparse
# open the Excel workbook
- workbook = openpyxl.load_workbook('2500url.xlsx')
+ workbook = openpyxl.load_workbook(r'C:\Users\WIN10\Desktop\aa\qiye.xlsx')
# get the active worksheet
worksheet = workbook.active
@@ -16,8 +16,9 @@ qiyedatas=[]
# iterate over the worksheet rows
for row in worksheet.iter_rows(values_only=True):
qiyemsg={
- 'url':row[0],
- 'exist':row[1],
+ '序号':row[0],
+ '企业名称':row[0],
+ '网址':row[1],
}
qiyedatas.append(qiyemsg)
@@ -31,8 +32,9 @@ sql1 = """select id, info_source_code, web_site_name, site_name , site_uri from
cont=1;
qynot=[]
qyin=[]
+ qynn=[]
for qy in qiyedatas:
- name=qy['url']
+ name=qy['网址']
if name is None:
qy['exist']=0
qyin.append(qy)
@@ -42,6 +44,10 @@ for qy in qiyedatas:
qyin.append(qy)
continue
try:
+ parsed_url = urlparse(name)
+ domain = parsed_url.netloc
+ if domain.startswith("www."):
+ name = domain[4:]
sql2=sql1.replace("[name]",name)
cursor.execute(sql2)
except Exception as e:
@@ -54,14 +60,29 @@ for qy in qiyedatas:
qy['exist']=0
qyin.append(qy)
else:
result_data
+ # qyin.append(qy)
+ for row2 in tqdm(result_data):
+ try:
+ rd = {'id': row2[0],
+ '编码': row2[1],
+ '网站名称': row2[2],
+ '栏目名称': row2[3],
+ '栏目地址': row2[4],
+ '企业名称': qy['企业名称']
+ }
+ qynn.append(rd)
+ except Exception as e:
+ print(e)
+ print("查询失败!!"+sql2)
cont+=1
print(cont)
qy['exist']=1
qyin.append(qy)
df_out = pd.DataFrame(data=qyin)
- df_out.to_excel('url企业情况在平台中有数据.xlsx', engine='xlsxwriter', index=False)
+ df_out.to_excel('url企业名单.xlsx', engine='xlsxwriter', index=False)
- df_out = pd.DataFrame(data=qynot)
+ df_out = pd.DataFrame(data=qynn)
df_out.to_excel('url企业情况在平台中没有数据.xlsx', engine='xlsxwriter', index=False)
......
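One caveat on the urlparse normalization added above: netloc is only populated when the cell value includes a scheme, so bare domains skip the www-stripping entirely. A standard-library illustration:

# urlparse puts a bare domain in .path, not .netloc, so the added
# www-stripping only fires for values that start with http(s)://.
from urllib.parse import urlparse

print(urlparse('http://www.example.com/x').netloc)  # 'www.example.com'
print(urlparse('www.example.com/x').netloc)         # '' -- bare domain, netloc empty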
#coding=utf-8
@@ -280,7 +280,7 @@ class BaiduSpider(object):
hasnext = html.xpath('//div[@id="page"]//a[last()]//text()')[0]
hasnext = hasnext.strip()
timeFlag=False
- while hasnext == '下一页 >':
+ while '下一页' in hasnext:
try:
if self.page_num==5:
break
@@ -451,32 +451,6 @@ class BaiduSpider(object):
break
# time.sleep(5)
- # def getDetailmsg(self,detailhtml,detailmsg):
- # try:
- # detailurl=detailmsg['detailUrl']
- # article_content=self.paserDetail(detailhtml,detailurl)
- # content=article_content['content']
- # contentWithTag=article_content['body_html']
- # except Exception as e:
- # self.logger.info('内容抽取失败')
- # content=''
- # contentWithTag=''
- # currentdate=self.getNowDate()
- # kword=self.searchkw
- # publishtime=detailmsg['publishTag']
- # publishtime=self.paserTime(publishtime)
- # publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
- # detailmsg={
- # 'title':detailmsg['title'],
- # 'source':detailmsg['sourceTag'],
- # 'detailurl':detailurl,
- # 'content':content,
- # 'contentHtml':contentWithTag,
- # 'publishtime':publishDate,
- # 'currentdate':currentdate,
- # 'kword':kword
- # }
- # return detailmsg
def getProcessitem(self,bdetail):
nowDate=self.getNowDate()
......
114.116.108.171: runs 8 dynamic-collection services
114.115.234.116: runs 4 dynamic collectors and 4 overseas-site collectors
114.115.218.248: runs 4 dynamic collectors and 3 verification services
114.115.162.99: runs 8 services for special-topic collection
114.115.221.202: services for the 中科软 city-collection work
HK 159.138.150.155: runs 3 overseas-site verification services
HK 94.74.96.195: runs Yahoo Finance financial-data collection
114.115.153.6: old platform, research center and central-SOE public-opinion collection
114.116.122.247: old-platform central-SOE opinion, evaluation-center collection, and evaluation-center central-SOE opinion collection on the new platform
49.4.24.191: server environment does not allow copy/paste
Expiring: 114.116.48.72, 4 overseas-collection services to migrate (move to .116; .116's dynamic collectors move to .171)
Expiring: 114.115.235.92, test collectors for building-materials and machinery opinion (move to the 152.6 server)
192.168.1.239: runs 4 static-collection services
192.168.1.240: runs 4 static-collection services