Merge remote-tracking branch 'origin/master'

# Conflicts: # comData/annualReport_XQW/annualreportUS.py

Merge remote-tracking branch 'origin/master'
# Conflicts: # comData/annualReport_XQW/annualreportUS.py
f5fc57ce · 薛凌堃 · 257631f1 · da835fd7 · f5fc57ce · f5fc57ce
--- a/base/BaseCore.py
+++ b/base/BaseCore.py
@@ -678,9 +678,19 @@ class BaseCore:
                id = selects[0]
                return id
+    # Update the CIK stored for an enterprise.
+    def updateCIK(self,social_code,cik):
+        """Write ``cik`` into the EnterpriseInfo row matching ``social_code``.
+
+        The update is best-effort: any failure is logged and swallowed, and
+        nothing is returned to the caller.
+
+        :param social_code: value matched against ``EnterpriseInfo.SocialCode``
+        :param cik: value written to ``EnterpriseInfo.CIK``
+        """
+        # Bind the values instead of interpolating them into the statement so
+        # that quotes in either value cannot break or alter the SQL.
+        # NOTE(review): assumes the pooled driver uses the '%s' paramstyle
+        # (e.g. pymysql) - confirm against the pool_caiji configuration.
+        sql = "UPDATE EnterpriseInfo SET CIK = %s WHERE SocialCode = %s"
+        cnn = None
+        cursor = None
+        try:
+            cnn = self.pool_caiji.connection()
+            cursor = cnn.cursor()
+            cursor.execute(sql, (cik, social_code))
+            cnn.commit()
+        except Exception:
+            log = self.getLogger()
+            log.info('======保存企业CIK失败=====')
+        finally:
+            # Release the cursor and connection even when execute/commit
+            # failed; previously they were leaked on any exception.
+            for resource in (cursor, cnn):
+                if resource is None:
+                    continue
+                try:
+                    resource.close()
+                except Exception:
+                    # A failing close was reported with this same message
+                    # before, so keep reporting it the same way.
+                    self.getLogger().info('======保存企业CIK失败=====')

--- a/comData/annualReport_XQW/annualreportUS.py
+++ b/comData/annualReport_XQW/annualreportUS.py
@@ -129,13 +129,57 @@ def spider(com_name,cik):
            #相对路径转化为绝对路径
            soup = paserUrl(soup,news_url)
            content = soup.text.strip()
+    # url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
+    # browser.get(url)
+    # time.sleep(3)
+    # page_source = browser.page_source
+    # soup = BeautifulSoup(page_source, 'html.parser')
+    # # print(soup)
+    # select_ann = soup.find_all('tr', class_='odd')
+    #
+    # for tr in select_ann:
+    #     form_type = tr.find('td').text
+    #     if form_type == '20-F':
+    #         # print(tr)
+    #         # 获取原文链接
+    #         href = tr.find('a', class_='document-link')['href']
+    #         print(href)
+    #         if 'ix?doc' in href:
+    #             href = 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
+    #         else:
+    #             href = 'https://www.sec.gov' + href
+    #         print(href)
+    #         # 获取发布时间
+    #         a_list = tr.find_all('a')
+    #         # print(a_list)
+    #         for a in a_list:
+    #             text = a.text
+    #             match = re.search(pattern, text)
+    #             if match:
+    #                 pub_date = match.group(0)
+    #                 # print(pub_date)
+    #                 year = pub_date[:4]
+    #                 break
+    #             else:
+    #                 pub_date = ''
+    #                 year = ''
+    #         # 根据年报的链接，请求年报内容，不需要上传文件服务器，直接发送kafka
+    #         browser.get(href)
+    #         time.sleep(3)
+    #         i_page_source = browser.page_source
+    #         i_soup = BeautifulSoup(i_page_source, 'html.parser')
+    #         # print(i_page_source)
+    #         content = i_soup.text
+            # 采集下来正文内容，直接传输kafka
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            title = f'{com_name}:{year}年年度报告'
            dic_news = {
                'attachmentIds': '',
                'author': '',
                'content': content,
-                'contentWithTag': str(soup),
+                'contentWithTag': soup,
                'createDate': time_now,
                'deleteFlag': '0',
                'id': '',
@@ -174,7 +218,6 @@ def spider(com_name,cik):
                    'code': '204',
                    'e': e
                }
-                log.info(f'{dic_result}-----{e}')
 def getrequest(social_code,url,headers,data):
@@ -197,6 +240,39 @@ def getrequest(social_code,url,headers,data):
        result = ''
    return result
+def getCIK(social_code,code):
+    """Look up the CIK of a company by its ticker symbol.
+
+    Posts ``code`` as the search key through ``getrequest`` (using the
+    module-level ``url`` and ``headers``), then scans the returned hits for
+    one whose comma-separated ``tickers`` field contains ``code`` exactly.
+
+    :param social_code: enterprise identifier, passed to ``getrequest``,
+        used in log messages and when persisting the CIK
+    :param code: ticker symbol to search for (e.g. "MNSO")
+    :return: the matching hit's ``_id`` (zero-padded to 10 digits when it is
+        shorter than 10 characters), or ``''`` when nothing matched
+    """
+    cik = ''
+    # POST request keyed by the ticker symbol to fetch the company's CIK.
+    payload = {"keysTyped":f"{code}","narrow":True}
+    data = json.dumps(payload)
+    result = getrequest(social_code,url,headers,data)
+    # getrequest falls back to '' when the request fails; indexing that (or
+    # a response without the expected keys) used to raise, so treat it as
+    # an empty hit list instead.
+    try:
+        tickers = result['hits']['hits']
+    except (KeyError, TypeError):
+        tickers = []
+    if not tickers:
+        log.error(f'{code}....{social_code}....无hits')
+        return cik
+    # Decide which returned hit is this company by comparing ticker symbols.
+    for ticker in tickers:
+        try:
+            symbols = ticker['_source']['tickers'].split(', ')
+        except (KeyError, TypeError, AttributeError):
+            # Hit without a usable tickers field - skip it.
+            continue
+        if code not in symbols:
+            continue
+        cik = ticker['_id']
+        if len(cik) < 10:
+            # Left-pad with zeros to the 10-digit CIK form, then persist it.
+            # NOTE(review): an _id that is already 10+ characters is returned
+            # but not persisted, same as before - confirm this is intended.
+            cik = format(int(cik),'0>10d')
+            baseCore.updateCIK(social_code,cik)
+        if cik != '':
+            break
+    if cik == '':
+        log.error(f'{code}....{social_code}....无CIK')
+    else:
+        log.info(f'{code}....{social_code}....cik为{cik}')
+    return cik
 if __name__ == '__main__':
    headers = {
        'authority': 'efts.sec.gov',
@@ -221,8 +297,8 @@ if __name__ == '__main__':
    while True:
        start_time = time.time()
        # 获取企业信息
-        # social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
+        social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
-        social_code = 'ZZSN22080900000025'
+        # social_code = ''
        if not social_code:
            time.sleep(20)
            continue
@@ -233,7 +309,7 @@ if __name__ == '__main__':
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
-        count = dic_info[16]
+        count = dic_info[15]
        code = dic_info[3]
        com_name = dic_info[4]
        cik = dic_info[13]
@@ -249,10 +325,26 @@ if __name__ == '__main__':
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
            continue
+        # code = 'BP'
+        # com_name = '英国石油公司'
+        # cik = ''
+        #"MNSO" post请求 获取企业CIK 正式
+        # payload = {"keysTyped":f"{code}","narrow":True}
+        # #测试
+        # # payload = {"keysTyped": "BP", "narrow":True}
+        # data = json.dumps(payload)
+        # result = getrequest(social_code,url,headers,data)
+        # # print(result)
+        # #判断接口返回的数据哪一条是该企业 根据股票代码
+        # tickers = result['hits']['hits']
+        # for ticker in tickers:
+        #     i_t = ticker['_source']['tickers']
+        #     if i_t == code:
+        #         cik = ticker['_id']
+        #         print(cik)
+        #         break
+        # break
        spider(com_name,cik)
-        count += 1
-        runType = 'AnnualReportCount'
-        baseCore.updateRun(social_code, runType, count)
        # break