Commit 257631f1  Author: 薛凌堃

9/8

Parent 7dcd1a4c
@@ -19,6 +19,9 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 # from selenium import webdriver
+from base.BaseCore import BaseCore
+baseCore = BaseCore()
+log = baseCore.getLogger()
 def paserUrl(html,listurl):
     # soup = BeautifulSoup(html, 'html.parser')
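The body of paserUrl is collapsed in this hunk. As a reading aid, here is a minimal sketch of what a relative-to-absolute link rewriter of this shape typically does, assuming BeautifulSoup plus urllib.parse.urljoin; the actual helper may differ:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paser_url_sketch(html, listurl):
    # Hypothetical stand-in for paserUrl: rewrite every relative
    # href/src in the fetched page against the page's own URL.
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(href=True):
        tag['href'] = urljoin(listurl, tag['href'])
    for tag in soup.find_all(src=True):
        tag['src'] = urljoin(listurl, tag['src'])
    return soup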
@@ -108,13 +111,13 @@ def spider(com_name,cik):
     for form in form_type_list:
         i += 1
         if form == '10-K' or form == '20-F':
-            print(form,i)
+            log.info(form,i)
             accessionNumber = accessionNumber_list[i]
             # publication date
             filingDate = filingDate_list[i]
             year = filingDate[:4]
-            # u_1 = cik
-            u_1 = '1395064'
+            u_1 = cik
+            # u_1 = '1395064'
             u_2 = accessionNumber.replace('-','')
             u_3 = primaryDocument_list[i]
             news_url = 'https://www.sec.gov/Archives/edgar/data/' + u_1 + '/' + u_2 + '/' + u_3
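The substantive change in this hunk swaps the hard-coded test CIK '1395064' for the cik passed into spider, so the filing URL is now built per company. A worked example of the same construction, using illustrative sample values that are not from this diff:

# Illustrative values only (Apple's public EDGAR identifiers, as a sample).
cik = '320193'
accessionNumber = '0000320193-22-000108'
primaryDocument = 'aapl-20220924.htm'

u_1 = cik
u_2 = accessionNumber.replace('-', '')   # '000032019322000108'
u_3 = primaryDocument
news_url = 'https://www.sec.gov/Archives/edgar/data/' + u_1 + '/' + u_2 + '/' + u_3
# -> https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/aapl-20220924.htm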
@@ -126,57 +129,13 @@ def spider(com_name,cik):
             # convert relative paths to absolute
             soup = paserUrl(soup,news_url)
             content = soup.text.strip()
-            # url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
-            # browser.get(url)
-            # time.sleep(3)
-            # page_source = browser.page_source
-            # soup = BeautifulSoup(page_source, 'html.parser')
-            # # print(soup)
-            # select_ann = soup.find_all('tr', class_='odd')
-            #
-            # for tr in select_ann:
-            #     form_type = tr.find('td').text
-            #     if form_type == '20-F':
-            #         # print(tr)
-            #         # get the link to the original filing
-            #         href = tr.find('a', class_='document-link')['href']
-            #         print(href)
-            #         if 'ix?doc' in href:
-            #             href = 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
-            #         else:
-            #             href = 'https://www.sec.gov' + href
-            #         print(href)
-            #         # get the publication date
-            #         a_list = tr.find_all('a')
-            #         # print(a_list)
-            #         for a in a_list:
-            #             text = a.text
-            #             match = re.search(pattern, text)
-            #             if match:
-            #                 pub_date = match.group(0)
-            #                 # print(pub_date)
-            #                 year = pub_date[:4]
-            #                 break
-            #             else:
-            #                 pub_date = ''
-            #                 year = ''
-            #         # fetch the report body from its link; no upload to the file server, send straight to Kafka
-            #         browser.get(href)
-            #         time.sleep(3)
-            #         i_page_source = browser.page_source
-            #         i_soup = BeautifulSoup(i_page_source, 'html.parser')
-            #         # print(i_page_source)
-            #         content = i_soup.text
-            # the scraped body text is sent straight to Kafka
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             title = f'{com_name}:{year}年年度报告'
             dic_news = {
                 'attachmentIds': '',
                 'author': '',
                 'content': content,
-                'contentWithTag': soup,
+                'contentWithTag': str(soup),
                 'createDate': time_now,
                 'deleteFlag': '0',
                 'id': '',
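The contentWithTag change from soup to str(soup) matters for the Kafka step below: json.dumps cannot serialize a BeautifulSoup object and raises a TypeError, whereas str(soup) is the markup as plain text. A quick check:

import json
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>hi</p>', 'html.parser')
# json.dumps({'contentWithTag': soup})     # raises TypeError: not JSON serializable
json.dumps({'contentWithTag': str(soup)})  # ok: '{"contentWithTag": "<p>hi</p>"}'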
@@ -192,21 +151,21 @@ def spider(com_name,cik):
                 'socialCreditCode': '',
                 'year': year
             }
-            print(dic_news)
+            # print(dic_news)
             # send the fields to Kafka for persistence
             try:
                 producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                 kafka_result = producer.send("researchReportTopic",
                                              json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
-                print(kafka_result.get(timeout=10))
+                log.info(kafka_result.get(timeout=10))
                 dic_result = {
                     'success': 'ture',
                     'message': '操作成功',
                     'code': '200',
                 }
-                print(dic_result)
+                log.info(dic_result)
             except Exception as e:
                 dic_result = {
@@ -215,6 +174,7 @@ def spider(com_name,cik):
                     'code': '204',
                     'e': e
                 }
+                log.info(f'{dic_result}-----{e}')
 def getrequest(social_code,url,headers,data):
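One thing this commit leaves as-is: a new KafkaProducer is created for every report. A minimal sketch of the reuse pattern, assuming the same broker and topic shown above and the kafka-python API:

import json
from kafka import KafkaProducer

# Create the producer once and reuse it across reports; broker address
# and topic are the ones that appear in the diff.
producer = KafkaProducer(
    bootstrap_servers=['114.115.159.144:9092'],
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf8'))

def send_report(dic_news):
    future = producer.send('researchReportTopic', dic_news)
    return future.get(timeout=10)   # blocks until the broker acks, or raises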
...@@ -261,8 +221,8 @@ if __name__ == '__main__': ...@@ -261,8 +221,8 @@ if __name__ == '__main__':
while True: while True:
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode') # social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
# social_code = '' social_code = 'ZZSN22080900000025'
if not social_code: if not social_code:
time.sleep(20) time.sleep(20)
continue continue
@@ -273,7 +233,7 @@ if __name__ == '__main__':
             time.sleep(20)
             continue
         dic_info = baseCore.getInfomation(social_code)
-        count = dic_info[15]
+        count = dic_info[16]
         code = dic_info[3]
         com_name = dic_info[4]
         cik = dic_info[13]
@@ -289,26 +249,10 @@ if __name__ == '__main__':
         takeTime = baseCore.getTimeCost(start_time, time.time())
         baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
         continue
-        # code = 'BP'
-        # com_name = '英国石油公司'
-        # cik = ''
-        # "MNSO" POST request to fetch the company CIK (production)
-        # payload = {"keysTyped":f"{code}","narrow":True}
-        # # test
-        # # payload = {"keysTyped": "BP", "narrow":True}
-        # data = json.dumps(payload)
-        # result = getrequest(social_code,url,headers,data)
-        # # print(result)
-        # # find which record the API returned for this company, matched by ticker
-        # tickers = result['hits']['hits']
-        # for ticker in tickers:
-        #     i_t = ticker['_source']['tickers']
-        #     if i_t == code:
-        #         cik = ticker['_id']
-        #         print(cik)
-        #         break
-        #     break
         spider(com_name,cik)
+        count += 1
+        runType = 'AnnualReportCount'
+        baseCore.updateRun(social_code, runType, count)
         # break
...
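Taken together with the count = dic_info[16] change, the added lines close the loop on per-company crawl counts. Schematically, the main loop after this commit looks as follows; the baseCore helper behavior is assumed from its call sites in the diff, not from its source:

import time
from base.BaseCore import BaseCore   # project-local module, as imported above

baseCore = BaseCore()
while True:
    # pull the next company code from the Redis queue; if empty, wait and retry
    social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
    if not social_code:
        time.sleep(20)
        continue
    dic_info = baseCore.getInfomation(social_code)
    count = dic_info[16]                  # crawl counter, moved from index 15
    com_name, cik = dic_info[4], dic_info[13]
    spider(com_name, cik)                 # scrape the filings and push to Kafka
    count += 1
    baseCore.updateRun(social_code, 'AnnualReportCount', count)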
# -*- coding: utf-8 -*-
@@ -251,7 +251,7 @@ if __name__ == '__main__':
             time.sleep(20)
             continue
         dic_info = baseCore.getInfomation(social_code)
-        count = dic_info[15]
+        count = dic_info[16]
         code = dic_info[3]
         com_name = dic_info[4]
         if code is None:
...