提交 71a0e996 作者: 薛凌堃

维护

上级 1ec2de55
......@@ -224,8 +224,8 @@ def BaseInfoEnterprise_task():
#企业核心人员
def CorPerson():
cnx, cursor = connectSql()
# gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
gn_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=13 AND a.Place=1"
gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
# gn_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=13 AND a.Place=1"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
cnx.commit()
......@@ -273,6 +273,19 @@ def FinanceFromEase_task():
print('定时采集异常', e)
pass
def JingyingfenxiFromEase():
    """Queue listed-company social credit codes for the Jingyingfenxi crawler.

    Reads codes from the IPO enterprise table and pushes each onto the
    Redis list 'Jingyingfenxi:finance_socialCode'.
    """
    cnx_, cursor_ = cnn11()
    # Read from the listed-enterprise table.
    # BUGFIX: the original WHERE clause was
    #     exchange = '1' or exchange = '2' or exchange = '3' and listed = '1'
    # SQL gives AND higher precedence than OR, so listed='1' only filtered
    # the exchange='3' branch.  IN + AND applies the listed filter to all
    # three exchanges as intended.
    sql_sel = '''select social_credit_code from sys_base_enterprise_ipo where exchange in ('1', '2', '3') and listed = '1' '''
    cursor_.execute(sql_sel)
    finance = cursor_.fetchall()
    cnx_.commit()
    finance_list = [item[0] for item in finance]
    print('=======')
    for item in finance_list:
        # Push each code onto the work queue consumed by the crawler.
        r.rpush('Jingyingfenxi:finance_socialCode', item)
    close11(cnx_, cursor_)
#微信公众号
def WeiXingetFromSql():
cnx_,cursor_=cnn11()
......@@ -645,7 +658,7 @@ if __name__ == "__main__":
# shuangbaiqiye()
# zhuangjingtexind()
# NoticeEnterprise()
NoticeDF()
# NoticeDF()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterprise()
......@@ -670,5 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task()
# FinanceFromEast()
# ipo_code()
JingyingfenxiFromEase()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
......@@ -39,24 +39,23 @@ class EsMethod(object):
body = {
"query": {
"bool": {
"must": [
{
"match": {
"type": "0"
}
},
{
"range": {
"createDate": {
"gte": "2023-12-13T00:00:00",
"lte": "2023-12-15T00:00:00"
}
"bool": {
"must_not": [
{
"exists": {
"field": "content"
}
}
],
"must": [
{
"match": {
"type": "1"
}
}
]
}
}
]
}
},
},
"sort": [
{
"createDate": {
......@@ -74,6 +73,7 @@ class EsMethod(object):
'hits.hits._source.title',
'hits.hits._source.sourceAddress',
'hits.hits._source.createDate',
'hits.hits._source.origin'
] # 字段2
result = self.es.search(index=index_name
, doc_type='_doc'
......@@ -86,9 +86,9 @@ def main(page, p, esMethod):
redis_conn = redis.Redis(connection_pool=pool)
result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
total = result['hits']['total']['value']
if total == 0:
log.info('++++已没有数据+++++')
return
# if total == 0:
# log.info('++++已没有数据+++++')
# return
try:
msglist = result['hits']['hits']
except:
......@@ -100,13 +100,15 @@ def main(page, p, esMethod):
id = mms['_id']
title = mms['_source']['title']
sourceAddress = mms['_source']['sourceAddress']
log.info(f'{id}--{title}--{sourceAddress}---')
if redis_conn.lrem('YanBao:id', 0, id) == 0:
redis_conn.lpush('YanBao:id', id)
origin = mms['_source']['origin']
log.info(f'{id}--{title}--{origin}--{sourceAddress}---')
if origin == 'SEC美国证券交易委员会':
redis_conn.lrem('NianbaoUS:id', 0, id)
redis_conn.lpush('NianbaoUS:id', id)
else:
continue
redis_conn.lrem(f'NianbaoOT_{origin}:id', 0, id)
redis_conn.lpush(f'NianbaoOT_{origin}:id', id)
def run_threads(num_threads,esMethod,j):
threads = []
......@@ -126,7 +128,7 @@ def run_threads(num_threads,esMethod,j):
if __name__ == "__main__":
j = 0
for i in range(2):
for i in range(10):
esMethod = EsMethod()
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value']
......
# -*- coding: utf-8 -*-
import json
import re
import threading
import time
import uuid
from urllib.parse import urljoin
import fitz
import redis
import requests
from bs4 import BeautifulSoup
from obs import ObsClient
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
# Shared module-level resources.
# BUGFIX: BaseCore was instantiated twice in a row; the second instance
# silently replaced the first (whose logger was already handed out) and
# doubled whatever connections BaseCore opens.  One instance is enough.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
# Redis pool backing the 'NianbaoUS:id' work queue (db 6).
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
class EsMethod(object):
    """Thin wrapper around the research-report Elasticsearch index."""

    def __init__(self):
        # Authenticated client with a generous timeout for large documents.
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'researchreportdata'

    def queryatt(self, index_name, id):
        """Look up the document whose `id` field matches and return a trimmed hit."""
        query_body = {
            "query": {
                "match": {
                    "id": id
                }
            }
        }
        # Restrict the response to the handful of fields the caller reads.
        wanted_fields = [
            'hits.hits._id',
            'hits.total.value',
            'hits.hits._source.title',
            'hits.hits._source.socialCreditCode',
            'hits.hits._source.sourceAddress'
            # 'hits.hits._source.createDate',
            # 'hits.hits._source.publishDate',
        ]
        response = self.es.search(index=index_name,
                                  doc_type='_doc',
                                  filter_path=wanted_fields,
                                  body=query_body)
        # log.info(response)
        return response

    def updateaunn(self, index_name, id, content, contentWithTag):
        """Write the extracted plain text and tagged HTML back onto the document."""
        update_body = {
            'doc': {
                'content': content,
                'contentWithTag': contentWithTag
            }
        }
        outcome = self.es.update(index=index_name,
                                 id=id,
                                 body=update_body)
        log.info('更新结果:%s' % outcome)
def paserUrl(html, listurl):
    """Rewrite relative href/src attributes in *html* to absolute URLs using *listurl* as base.

    Mutates the parsed tree in place and returns it.
    """
    # Only anchors and images carry the links we need to absolutize.
    for tag in html.find_all(['a', 'img']):
        attributes = tag.attrs
        if 'href' in attributes:
            tag['href'] = urljoin(listurl, tag['href'])
        elif 'src' in attributes:
            tag['src'] = urljoin(listurl, tag['src'])
    return html
def get_news(news_url, ip_dic):
    """Download a SEC filing page and return it parsed with BeautifulSoup.

    Returns '' (falsy) when the response is not HTTP 200 so callers can
    truth-test the result.  `ip_dic` is a proxy mapping; the proxied request
    is currently commented out but kept for quick switch-over.
    """
    header = {
        'Host': 'www.sec.gov',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
    }
    response = requests.get(url=news_url, headers=header, verify=False, timeout=30)
    # response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
    if response.status_code == 200:
        # Success: hand back the parsed document.
        result = BeautifulSoup(response.content, 'html.parser')
    else:
        # BUGFIX: the original called log.info('请求失败:', code, text) —
        # print-style positional args with no % placeholders, which makes the
        # logging module raise a formatting error instead of recording the
        # failure.  Build one message string instead.
        log.info(f'请求失败:{response.status_code} {response.text}')
        result = ''
    return result
def main(esMethod):
    """Pop one document id from the 'NianbaoUS:id' queue, fetch its SEC page
    and backfill content / contentWithTag into Elasticsearch.

    Returns early (without raising) when the queue is empty or the page
    cannot be fetched.
    """
    redis_conn = redis.Redis(connection_pool=pool)
    id_ = redis_conn.lpop('NianbaoUS:id')
    # BUGFIX: lpop returns None when the queue is empty; the original decoded
    # first and checked afterwards, so an empty queue raised AttributeError
    # instead of logging '已无数据' and returning.
    if id_ is None:
        log.info('已无数据')
        return
    id = id_.decode()
    # id = "23101317164"
    result_ = esMethod.queryatt(index_name=esMethod.index_name, id=id)
    result = result_['hits']['hits'][0]
    title = result['_source']['title']
    social_code = result['_source']['socialCreditCode']
    # origin = result['_source']['origin']
    log.info(f'====={title}=={social_code}===正在更新===')
    sourceAddress = result['_source']['sourceAddress']
    ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    soup = get_news(sourceAddress, ip_dic)
    if not soup:
        # Download failed; leave the document untouched.
        return
    # Convert relative links to absolute before extracting the text.
    soup = paserUrl(soup, sourceAddress)
    content = soup.text.strip()
    esMethod.updateaunn(esMethod.index_name, str(id), content, str(soup))
def run_threads(num_threads, esMethod):
    """Spawn *num_threads* workers running main(esMethod) and wait for them all."""
    workers = [threading.Thread(target=main, args=(esMethod,)) for _ in range(num_threads)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Entry point: loop forever; each pass spins up 5 worker threads, each of
# which consumes one id from the 'NianbaoUS:id' queue and backfills its
# content in Elasticsearch.  NOTE(review): there is no sleep or exit
# condition, so an empty queue busy-loops — confirm this is intended.
if __name__ == '__main__':
    while True:
        esMethod = EsMethod()
        start = time.time()
        num_threads = 5
        run_threads(num_threads,esMethod)
        log.info(f'5线程 总耗时{time.time()-start}秒')
\ No newline at end of file
......@@ -8,6 +8,13 @@ import urllib3
from pyquery import PyQuery as pq
import json
from kafka import KafkaProducer
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
#国务院政策问答平台最新发布信息采集
......@@ -16,37 +23,58 @@ def reqHtml(url,data,header):
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
json_data=json.dumps(data)
response = requests.post(url,data=json_data,headers=header,verify=False,timeout=10)
print(response.status_code)
log.info(response.status_code)
html=response.text
except Exception as e:
html=''
return html
def page_list():
# header = {
# 'Host':'xcx.www.gov.cn',
# 'Connection':'keep-alive',
# 'Content-Length':'25',
# 'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
# 'x-tif-did':'pb5XUGL1Zm',
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
# 'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
# 'Content-Type':'application/json',
# 'xweb_xhr':'1',
# 'dgd-pre-release':'0',
# 'x-yss-page':'publicService/pages/policyQALibrary/index/index',
# 'x-yss-city-code':'4400',
# 'Accept':'*/*',
# 'Sec-Fetch-Site':'cross-site',
# 'Sec-Fetch-Mode':'cors',
# 'Sec-Fetch-Dest':'empty',
# 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
# 'Accept-Encoding':'gzip, deflate, br',
# 'Accept-Language':'zh-CN,zh;q=0.9'
# }
header = {
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
'x-yss-page':'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9'
'Host': 'xcx.www.gov.cn',
'Connection': 'keep-alive',
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
'x-yss-page': 'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code': '4400',
'Accept': '*/*',
'Accept-Language': '*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,445):
print(f'采集第{i}页数据')
for i in range(1,453):
log.info(f'采集第{i}页数据')
k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
data=da.replace('[k]',str(k))
......@@ -56,7 +84,7 @@ def page_list():
hjson=json.loads(lhtml)
data=hjson['data']['list']
except Exception as e:
print(e)
log.info(e)
time.sleep(60)
continue
for ss in data:
......@@ -64,9 +92,9 @@ def page_list():
durl=f'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicy'
sourceAddress=f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
try:
flag=r.sismember('IN-20230829-0146',sourceAddress)
flag=r.sismember('IN-20230829-0146-test',sourceAddress)
if flag:
print('信息已采集入库过')
log.info('信息已采集入库过')
continue
except Exception as e:
continue
......@@ -77,25 +105,25 @@ def page_list():
def detailpaser(dmsg):
hh={
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
'Content-Length':'25',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
'x-yss-page':'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9'
'Host': 'xcx.www.gov.cn',
'Connection': 'keep-alive',
'Content-Length': '25',
'x-tif-openid': 'ojyj-40u1IEK5a2CSK7_Pg31ySks',
'x-tif-did': 'u8Ajuqdyap',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8501',
'x-tif-sid': '755e67ddc8f86552d3f8d356fe22721cc5',
'Content-Type': 'application/json',
'xweb_xhr': '1',
'dgd-pre-release': '0',
'x-yss-page': 'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code': '4400',
'Accept': '*/*',
'Accept-Language': '*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br'
}
try:
durl=dmsg['url']
......@@ -107,8 +135,8 @@ def detailpaser(dmsg):
dd=json.loads(dhtml)
sendTokafka(dd)
except Exception as e:
print(e)
print(dhtml)
log.info(e)
# log.info(dhtml)
def sendTokafka(ddata):
dd=ddata['data']
......@@ -117,8 +145,11 @@ def sendTokafka(ddata):
content=dd['content']
contentWithTag=dd['content']
publishTime=dd['publishTime']
time_format='%Y年%m月%d日'
publishDate=str(datetime.datetime.strptime(publishTime, time_format))
if publishTime:
time_format='%Y年%m月%d日'
publishDate=str(datetime.datetime.strptime(publishTime, time_format))
else:
publishDate = '1900-01-01'
origin=dd['departmentName']
sourceAddress=f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
sid='1696404919115825153'
......@@ -135,13 +166,14 @@ def sendTokafka(ddata):
'source': 'python定制采集',
'type': ''
}
log.info(aa_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code,sourceAddress)
print('发送kafka成功!')
r.sadd(info_code+'-test',sourceAddress)
log.info('发送kafka成功!')
except Exception as e:
print(e)
log.info(e)
finally:
producer.close()
# r.close()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论