9/8

61beba6a · 薛凌堃 · f5fc57ce · 61beba6a
--- a/comData/annualReport_XQW/annualreportUS.py
+++ b/comData/annualReport_XQW/annualreportUS.py
@@ -130,56 +130,14 @@ def spider(com_name,cik):
            soup = paserUrl(soup,news_url)
            content = soup.text.strip()

-    # url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
-    # browser.get(url)
-    # time.sleep(3)
-    # page_source = browser.page_source
-    # soup = BeautifulSoup(page_source, 'html.parser')
-    # # print(soup)
-    # select_ann = soup.find_all('tr', class_='odd')
-    #
-    # for tr in select_ann:
-    #     form_type = tr.find('td').text
-    #     if form_type == '20-F':
-    #         # print(tr)
-    #         # 获取原文链接
-    #         href = tr.find('a', class_='document-link')['href']
-    #         print(href)
-    #         if 'ix?doc' in href:
-    #             href = 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
-    #         else:
-    #             href = 'https://www.sec.gov' + href
-    #         print(href)
-    #         # 获取发布时间
-    #         a_list = tr.find_all('a')
-    #         # print(a_list)
-    #         for a in a_list:
-    #             text = a.text
-    #             match = re.search(pattern, text)
-    #             if match:
-    #                 pub_date = match.group(0)
-    #                 # print(pub_date)
-    #                 year = pub_date[:4]
-    #                 break
-    #             else:
-    #                 pub_date = ''
-    #                 year = ''
-    #         # 根据年报的链接，请求年报内容，不需要上传文件服务器，直接发送kafka
-    #         browser.get(href)
-    #         time.sleep(3)
-    #         i_page_source = browser.page_source
-    #         i_soup = BeautifulSoup(i_page_source, 'html.parser')
-    #         # print(i_page_source)
-    #         content = i_soup.text
-
-            # 采集下来正文内容，直接传输kafka
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            title = f'{com_name}:{year}年年度报告'
+            log.info(f'---{title}----采集完成----发送数据----')
            dic_news = {
                'attachmentIds': '',
                'author': '',
                'content': content,
-                'contentWithTag': soup,
+                'contentWithTag': str(soup),
                'createDate': time_now,
                'deleteFlag': '0',
                'id': '',
@@ -218,6 +176,7 @@ def spider(com_name,cik):
                    'code': '204',
                    'e': e
                }
+                log.info(f'{dic_result}---{e}')

 def getrequest(social_code,url,headers,data):

@@ -320,11 +279,13 @@ if __name__ == '__main__':
            baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
            continue
        if cik is None:
-            exeception = 'cik为空'
-            state = 0
-            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
-            continue
+            cik = getCIK(social_code,code)
+            if cik == '':
+                exeception = 'cik为空'
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
+                continue
        # code = 'BP'
        # com_name = '英国石油公司'
        # cik = ''