王景浩 / zzsn_spider · Commits · 14c4b90a

Commit 14c4b90a, authored Apr 08, 2024 by XveLingKun
Commit message: SEC annual report date fix (美国证券交易委员会年报日期修改)
Parent commit: 07decca3

Showing 4 changed files with 305 additions and 5 deletions.
comData/annualReport_US/annualreportUS.py    +9    -5
comData/annualReport_US/es_search_ct.py      +152  -0   (new file)
comData/annualReport_US/test_re.py           +11   -0   (new file)
comData/annualReport_US/update_report.py     +133  -0   (new file)
comData/annualReport_US/annualreportUS.py  (+9 -5)

@@ -56,12 +56,15 @@ def get_news(news_url, ip_dic):
         'Accept-Language': 'zh-CN,zh;q=0.9',
         'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
     }
-    response = requests.get(url=news_url, headers=header, verify=False, timeout=30)
+    # response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
+    # Test:
+    news_url = "https://www.sec.gov/Archives/edgar/data/104169/000010416923000020/wmt-20230131.htm"
+    # response = requests.get(url=news_url, headers=header, verify=False, timeout=30)
+    response = requests.get(url=news_url, headers=header, verify=False, proxies=ip_dic, timeout=30)
     if response.status_code == 200:
         # Request succeeded; process the response data
         # print(response.text)
         result = BeautifulSoup(response.content, 'html.parser')
         # print(result)
         pass
     else:

@@ -103,7 +106,7 @@ def spider(com_name, cik, up_okCount):
     # Parse the page
     for nnn in range(0, 4):
         try:
             req = requests.get(url=url_json, headers=header, proxies=ip_dic, verify=False, timeout=30)
             # req = requests.get(url=url_json, headers=header, verify=False, timeout=30)
             break
         except Exception as e:

@@ -114,6 +117,7 @@ def spider(com_name, cik, up_okCount):
     except:
         baseCore.rePutIntoR('Sec_cik_US:uscik_annualReport', social_code)
         return
     req.close()
     info = data['filings']['recent']
     form_type_list = info['form']
     accessionNumber_list = info['accessionNumber']

@@ -157,7 +161,7 @@ def spider(com_name, cik, up_okCount):
             continue
         else:
             pass
         soup = get_news(news_url, ip_dic)
         if soup:
             pass
         else:
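
The net effect of this diff is that both fetches in annualreportUS.py now go through the ip_dic proxy mapping, with spider() retrying the JSON fetch up to four times. A minimal sketch of that bounded-retry-through-proxy pattern, factored into a helper; fetch_with_retry is a hypothetical name, and only the proxy-dict shape and the four-attempt limit come from the diff:

import requests

def fetch_with_retry(url, headers, ip_dic, attempts=4, timeout=30):
    # ip_dic is a requests-style proxy mapping, e.g.
    # {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}
    last_err = None
    for _ in range(attempts):
        try:
            return requests.get(url=url, headers=headers, proxies=ip_dic,
                                verify=False, timeout=timeout)
        except Exception as e:  # proxy failures, timeouts, connection resets
            last_err = e
    raise last_err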
comData/annualReport_US/es_search_ct.py  (new file, +152 -0)

# -*- coding: utf-8 -*-
"""
Query the contentWithTag field from ES.
"""
import json
import threading
import time
import uuid

import redis
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Use the connection pool
# cnx_ = baseCore.pool_11.connection()
# cursor_ = cnx_.cursor()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()

lock = threading.Lock()
pathType = 'QYNotice/'
taskType = '企业研报/东方财富网'
pool = redis.ConnectionPool(host='114.116.90.53', port=6380, password='clbzzsn', db=6)


class EsMethod(object):
    def __init__(self):
        # Create the Elasticsearch client with credentials
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'researchreportdata'

    def queryatt(self, index_name, pnum):
        # Page through SEC annual-report documents, newest first
        body = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"type.keyword": {"value": "1"}}},
                        {"term": {"origin.keyword": {"value": "SEC美国证券交易委员会"}}}
                    ]
                }
            },
            "sort": [{"createDate": {"order": "desc"}}],
            "track_total_hits": True,
            "size": 200,
            "from": pnum
        }
        # Only pull back the fields needed downstream
        filter_path = ['hits.hits._id',
                       'hits.total.value',
                       'hits.hits._source.title',
                       'hits.hits._source.sourceAddress',
                       'hits.hits._source.year',
                       'hits.hits._source.origin',
                       'hits.hits._source.labels',
                       ]
        result = self.es.search(index=index_name, doc_type='_doc', filter_path=filter_path, body=body)
        # log.info(result)
        return result


def main(page, p, esMethod):
    redis_conn = redis.Redis(connection_pool=pool)
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    time.sleep(2)
    total = result['hits']['total']['value']
    # if total == 0:
    #     log.info('++++已没有数据+++++')
    #     return
    try:
        msglist = result['hits']['hits']
    except:
        log.info(f'error-----{result}')
        return
    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
    for mms in msglist:
        id = mms['_id']
        title = mms['_source']['title']
        sourceAddress = mms['_source']['sourceAddress']
        origin = mms['_source']['origin']
        year = mms['_source']['year']
        socialCode = mms['_source']['labels'][0]['relationId']
        log.info(f'{id}--{title}--{origin}--{sourceAddress}---')
        # De-duplicate, then queue "id|title|url|year|socialCode" for update_report.py
        if origin == 'SEC美国证券交易委员会':
            redis_conn.lrem('NianbaoUS:id', 0, id + "|" + title + "|" + sourceAddress + "|" + year + "|" + socialCode)
            redis_conn.lpush('NianbaoUS:id', id + "|" + title + "|" + sourceAddress + "|" + year + "|" + socialCode)
        else:
            redis_conn.lrem(f'NianbaoOT_{origin}:id', 0, id + "|" + title + "|" + sourceAddress + "|" + year + "|" + socialCode)
            redis_conn.lpush(f'NianbaoOT_{origin}:id', id + "|" + title + "|" + sourceAddress + "|" + year + "|" + socialCode)


def run_threads(num_threads, esMethod, j):
    threads = []
    for i in range(num_threads):
        page = j + i + 1
        p = j + i * 200
        thread = threading.Thread(target=main, args=(page, p, esMethod))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    j = 0
    for i in range(40):
        esMethod = EsMethod()
        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
        # total = result['hits']['total']['value']
        # if total == 0:
        #     log.info('++++已没有数据+++++')
        #     break
        start = time.time()
        num_threads = 5
        run_threads(num_threads, esMethod, j)
        j += 1000
        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
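
A caveat on the paging scheme: with size=200 and a from offset that climbs by 1000 per outer iteration, the offset approaches 40,000, while Elasticsearch rejects from + size beyond the index's max_result_window (10,000 by default). A hedged sketch of search_after paging, which avoids that window; the client, index, and query mirror the file above, and the tie-breaker caveat in the comment is an assumption about the mapping:

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
body = {
    "query": {"term": {"origin.keyword": {"value": "SEC美国证券交易委员会"}}},
    # search_after needs a deterministic sort; appending a unique tie-breaker
    # field to createDate avoids skipped or duplicated hits on ties
    "sort": [{"createDate": {"order": "desc"}}],
    "size": 200,
}
while True:
    resp = es.search(index='researchreportdata', body=body)
    hits = resp['hits']['hits']
    if not hits:
        break
    for hit in hits:
        pass  # process hit['_id'] and hit['_source'], as in main() above
    body["search_after"] = hits[-1]['sort']  # cursor for the next page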
comData/annualReport_US/test_re.py  (new file, +11 -0)

import re

# Define the string
text = "Paramount Group, Inc.:2017年年度报告"
# Use a regular expression to find all four-digit years
numbers = re.findall(r'\d{4}年', text)
# Print the results
for number in numbers:
    print(number)
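
Run as-is, the script prints 2017年: the pattern \d{4}年 matches exactly four digits followed by 年. update_report.py relies on the same pattern to rewrite the year in a report title, roughly:

import re

title = "Paramount Group, Inc.:2017年年度报告"
numbers = re.findall(r'\d{4}年', title)      # ['2017年']
print(title.replace(numbers[0], '2016年'))   # Paramount Group, Inc.:2016年年度报告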
comData/annualReport_US/update_report.py  (new file, +133 -0)

import re
from datetime import datetime

import redis
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from base import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
index_name = 'researchreportdata'


def get_news(news_url, ip_dic):
    header = {
        'Host': 'www.sec.gov',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
    }
    # response = requests.get(url=news_url, headers=header, verify=False, timeout=30)
    try:
        response = requests.get(url=news_url, headers=header, verify=False, proxies=ip_dic, timeout=30)
    except Exception:
        # Network/proxy failure: return '' so the caller's retry loop can try again
        return ''
    if response.status_code == 200:
        # Request succeeded; process the response data
        # print(response.text)
        result = BeautifulSoup(response.content, 'html.parser')
        # print(result)
        # with open('wmt-20230131.html', 'w', encoding='utf-8') as f:
        #     f.write(str(result))
    else:
        # Request failed; print the error
        print('请求失败:', response.status_code, response.text)
        state = 0
        result = ''
    return result


def updateaunn(index_name, id, publishDate: str = None, year: str = None, title: str = None):
    # Partial update: always set publishDate; also set year/title when the title changed
    if title:
        body = {
            'doc': {
                'publishDate': publishDate,
                'year': year,
                'title': title
            }
        }
    else:
        body = {
            'doc': {
                'publishDate': publishDate,
            }
        }
    result = es.update(index=index_name, id=id, body=body)
    log.info('更新结果:%s' % result)


if __name__ == "__main__":
    # Test:
    # Fetch id, title, url from redis and update the publish date by id
    r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
    while True:
        item = r.lpop('NianbaoUS:id')
        if item:
            item = item.decode()
        else:
            break
        # news_url = "https://www.sec.gov/Archives/edgar/data/104169/000010416923000020/wmt-20230131.htm"
        # Test:
        # item = "23101317365|MARSH & MCLENNAN COMPANIES, INC.:2021年年度报告|https://www.sec.gov/Archives/edgar/data/0000062709/000006270922000009/mmc-20211231.htm|2021|ZZSN230711140539905"
        id = item.split("|")[0]
        title = item.split("|")[1]
        news_url = item.split("|")[2]
        year = item.split("|")[3]
        socialCode = item.split("|")[4]
        ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
        count = 0  # retry counter; it must persist across iterations of the loop below
        while True:
            count += 1
            result = get_news(news_url, ip_dic)
            if count <= 10 and result:
                break
            elif count > 10:
                # Record annual reports whose page could not be fetched
                r.lpush('NianbaoUS:request_error', item)
                break
        if result:
            # with open('./wmt-20230131.html', 'r', encoding='utf-8') as f:
            #     html = f.read()
            # soup = BeautifulSoup(html, 'html.parser')
            try:
                # Cover date from the inline-XBRL tag, e.g. "January 31, 2023"
                publishDate = result.find('ix:nonnumeric', attrs={'format': 'ixt:date-monthname-day-year-en'}).text
                print(publishDate)
                # Parse the date
                publishDate = datetime.strptime(publishDate, '%B %d, %Y').strftime('%Y-%m-%d')
                new_month = publishDate[5:7]
                if int(new_month) != 12:
                    new_year = str(int(publishDate[:4]) - 1)
                    # todo: update the publish date
                    numbers = re.findall(r'\d{4}年', title)
                    title = title.replace(numbers[0], new_year + '年')
                    updateaunn(index_name, id, publishDate, new_year, title)
                else:
                    # The year is unchanged; only the publish date needs updating
                    updateaunn(index_name=index_name, id=id, publishDate=publishDate)
                # Record annual reports updated successfully
                r.lpush('NianbaoUS:success', f"{id}|{title}|{publishDate}|{socialCode}|{year}")
            except Exception as e:
                r.lpush('NianbaoUS:upodate_error', item)
        else:
            continue
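
The core rule here: the cover date is read from the filing's inline-XBRL tag, and when the publish month is not December the report is assumed to cover the prior fiscal year. A worked example of just that date arithmetic, using the cover date implied by the wmt-20230131.htm test URL (the literal date below is illustrative, not fetched):

from datetime import datetime

publishDate = datetime.strptime("January 31, 2023", '%B %d, %Y').strftime('%Y-%m-%d')
print(publishDate)                    # 2023-01-31
if int(publishDate[5:7]) != 12:
    # a January-dated annual report covers the previous fiscal year
    print(int(publishDate[:4]) - 1)   # 2022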