Commit b127603f by 薛凌堃

Annual report deployment

Parent 3ed701a1
import json
@@ -6,11 +6,13 @@ from kafka import KafkaProducer
 from bs4 import BeautifulSoup
 import requests, re, time, pymysql, fitz
 import urllib3
-from base import BaseCore
+import sys
+sys.path.append('D:\\kkwork\\zzsn_spider\\base')
+import BaseCore
+baseCore = BaseCore.BaseCore()
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
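The BaseCore import switches from a package import to a path-based import, apparently so the deployed copy can run outside the package root. A minimal sketch of the pattern, assuming BaseCore.py lives in the appended directory:

import sys

# Make the directory that holds BaseCore.py importable. The path is
# machine-specific (the two files in this commit append different paths),
# so this is deployment-time configuration, not a portable import.
sys.path.append('D:\\kkwork\\zzsn_spider\\base')

import BaseCore                   # top-level module found via the appended path

baseCore = BaseCore.BaseCore()    # class BaseCore inside module BaseCore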
@@ -66,7 +68,11 @@ def RequestUrl(url, payload, item_id, start_time):
 def SpiderByZJH(url, payload, dic_info, num, start_time):
     item_id = dic_info[2]
     # years = dic_info['call_year']
-    short_name = dic_info[4]
+    short_name_ = dic_info[4]
+    if short_name_:
+        short_name = short_name_
+    else:
+        short_name = dic_info[1]
     soup = RequestUrl(url, payload, item_id, start_time)
     if soup == '':
         return
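The new branch falls back to dic_info[1] when the short name is empty. The same intent fits in one line with Python's or; a sketch, assuming dic_info[1] is the full company name:

# dic_info[4] is the short name; dic_info[1] is presumably the full name.
# `or` falls through to dic_info[1] when dic_info[4] is None or ''.
short_name = dic_info[4] or dic_info[1]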
@@ -96,7 +102,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
         pdf_url_info = td_list[2]
         # print(pdf_url)
         pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
-        name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
+        name_pdf_ = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
         pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
         # todo: check whether the publish date is actually in a date format
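Note that str.strip('downloadPdf1(') removes any of those characters from both ends rather than the literal prefix, so the chained strip/split calls only hold up while the onclick payload keeps its current shape. A more explicit sketch, assuming onclick values like downloadPdf1('url','name','2023-04-28'):

import re

def parse_onclick(onclick):
    # Extract the three quoted arguments of downloadPdf1('url','name','date'),
    # the same positions the spider reaches via split(',')[0..2].
    m = re.search(r"downloadPdf1\('([^']*)'\s*,\s*'([^']*)'\s*,\s*'([^']*)'", onclick)
    if m is None:
        return None
    pdf_url, name_pdf_, pub_time = m.groups()
    return pdf_url, name_pdf_ + '.pdf', pub_time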
@@ -118,18 +124,18 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
         report_type = td_list[4].text.strip()
         # print(report_type)
         if report_type == '年报':
-            if '摘要' in name_pdf:
+            if '摘要' in name_pdf_:
                 continue
             # the year is still extracted from the pdf name
             try:
-                year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
+                year = re.findall('\d{4}\s*年', name_pdf_)[0].replace('年', '')
             except Exception as e:
                 # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
                 year = int(pub_time[:4]) - 1
-                # year = str(year)
+                year = str(year)
             # page_size = 0
             name_pdf = f'{short_name}:{year}年年度报告.pdf'
             sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
             cursor_.execute(sel_sql, (item_id, year))
             selects = cursor_.fetchone()
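The year logic now reads name_pdf_ (the name parsed from onclick) and always normalizes to a string, since re.findall yields text while the fallback arithmetic yields an int. The same flow as a standalone sketch:

import re

def extract_year(name_pdf_, pub_time):
    # Prefer a four-digit year from the pdf title (e.g. '...2022年年度报告');
    # if the title has none, assume the report covers the year before
    # publication. Either way the result is a str.
    try:
        return re.findall(r'\d{4}\s*年', name_pdf_)[0].replace('年', '')
    except IndexError:
        return str(int(pub_time[:4]) - 1)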
@@ -322,7 +328,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # fetch the next company to process
         social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
-        # social_code = '91210800765420138L'
+        # social_code = '91100000100003962T'
         if not social_code:
             time.sleep(20)
             continue
......
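Both scripts drain their work queue through baseCore.redicPullData and sleep when it comes back empty. The diff only shows the call sites; a minimal sketch of what such a pull might look like, assuming the social credit codes sit in a Redis list (the connection details and body here are illustrative, not the project's actual implementation):

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # placeholder connection

def redicPullData(key):
    # Pop one social credit code off the list; None signals an empty queue,
    # which the main loop answers with time.sleep(20) and continue.
    item = r.lpop(key)
    return item.decode('utf-8') if item else None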
# -*- coding: utf-8 -*-
@@ -11,9 +11,10 @@ import json
 from datetime import datetime
 from kafka import KafkaProducer
-from base.BaseCore import BaseCore
-baseCore = BaseCore()
+import sys
+sys.path.append('D:\\KK\\zzsn_spider\\base')
+import BaseCore
+baseCore = BaseCore.BaseCore()
 import requests, re, time, pymysql, fitz
 from bs4 import BeautifulSoup as bs
 from selenium import webdriver
@@ -35,6 +36,7 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
 browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
 log = baseCore.getLogger()
 requests.adapters.DEFAULT_RETRIES = 3
+# the 11 database
 cnx = baseCore.cnx_
 cursor = baseCore.cursor_
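webdriver.Chrome(chrome_options=..., executable_path=...) works on the Selenium 3 line this script appears to target, but both keyword arguments were removed in Selenium 4. A sketch of the equivalent setup under Selenium 4, keeping the same chromedriver path:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

opt = webdriver.ChromeOptions()
# opt.add_argument('--headless')        # common on a display-less deploy host

service = Service(executable_path=r'D:/cmd100/chromedriver.exe')
browser = webdriver.Chrome(service=service, options=opt)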
......@@ -259,10 +261,20 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '91330000734507783B'
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '91440300192176077R'
if not social_code:
time.sleep(20)
if not baseCore.check_mysql_conn(cnx):
# 11数据库
cnx = baseCore.cnx_
cursor = baseCore.cursor_
log.info('===11数据库重新连接成功===')
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log.info('===144数据库重新连接成功===')
continue
if social_code == 'None':
time.sleep(20)
......
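The new branch re-checks both MySQL connections whenever the queue is idle, so long sleeps don't leave the loop holding timed-out handles. check_mysql_conn lives in BaseCore and is not shown in this diff; a minimal sketch of the idea using pymysql's built-in ping (the body here is an assumption, not the project's code):

import pymysql

def check_mysql_conn(conn):
    # Returns False once the server has dropped the handle, so the caller
    # can swap in a fresh connection, as the loop above does.
    try:
        conn.ping(reconnect=False)
        return True
    except pymysql.MySQLError:
        return False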