Commit b127603f by 薛凌堃

Annual report deployment

Parent 3ed701a1
import json
@@ -6,11 +6,13 @@ from kafka import KafkaProducer
 from bs4 import BeautifulSoup
 import requests, re, time, pymysql, fitz
 import urllib3
-from base import BaseCore
+import sys
+sys.path.append('D:\\kkwork\\zzsn_spider\\base')
+import BaseCore
+baseCore = BaseCore.BaseCore()
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
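The BaseCore import switches from a package import to a path-based import, apparently so the deployed copy can run outside the package root. A minimal sketch of the pattern, assuming BaseCore.py lives in the appended directory:

import sys

# Make the directory that holds BaseCore.py importable. The path is
# machine-specific (the two files in this commit append different paths),
# so this is deployment-time configuration, not a portable import.
sys.path.append('D:\\kkwork\\zzsn_spider\\base')

import BaseCore                   # top-level module found via the appended path

baseCore = BaseCore.BaseCore()    # class BaseCore inside module BaseCore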
@@ -66,7 +68,11 @@ def RequestUrl(url, payload, item_id, start_time):
 def SpiderByZJH(url, payload, dic_info, num, start_time):
     item_id = dic_info[2]
     # years = dic_info['call_year']
-    short_name = dic_info[4]
+    short_name_ = dic_info[4]
+    if short_name_:
+        short_name = short_name_
+    else:
+        short_name = dic_info[1]
     soup = RequestUrl(url, payload, item_id, start_time)
     if soup == '':
         return
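The new branch falls back to dic_info[1] when the short name is empty. The same intent fits in one line with Python's or; a sketch, assuming dic_info[1] is the full company name:

# dic_info[4] is the short name; dic_info[1] is presumably the full name.
# `or` falls through to dic_info[1] when dic_info[4] is None or ''.
short_name = dic_info[4] or dic_info[1]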
@@ -96,7 +102,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
         pdf_url_info = td_list[2]
         # print(pdf_url)
         pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
-        name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
+        name_pdf_ = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
         pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
         # todo: check whether the publish date is actually in a date format
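Note that str.strip('downloadPdf1(') removes any of those characters from both ends rather than the literal prefix, so the chained strip/split calls only hold up while the onclick payload keeps its current shape. A more explicit sketch, assuming onclick values like downloadPdf1('url','name','2023-04-28'):

import re

def parse_onclick(onclick):
    # Extract the three quoted arguments of downloadPdf1('url','name','date'),
    # the same positions the spider reaches via split(',')[0..2].
    m = re.search(r"downloadPdf1\('([^']*)'\s*,\s*'([^']*)'\s*,\s*'([^']*)'", onclick)
    if m is None:
        return None
    pdf_url, name_pdf_, pub_time = m.groups()
    return pdf_url, name_pdf_ + '.pdf', pub_time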
@@ -118,18 +124,18 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
         report_type = td_list[4].text.strip()
         # print(report_type)
         if report_type == '年报':
-            if '摘要' in name_pdf:
+            if '摘要' in name_pdf_:
                 continue
             # the year is still extracted from the pdf name
             try:
-                year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
+                year = re.findall('\d{4}\s*年', name_pdf_)[0].replace('年', '')
             except Exception as e:
                 # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
                 year = int(pub_time[:4]) - 1
-                # year = str(year)
+                year = str(year)
             # page_size = 0
             name_pdf = f'{short_name}:{year}年年度报告.pdf'
             sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
             cursor_.execute(sel_sql, (item_id, year))
             selects = cursor_.fetchone()
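The year logic now reads name_pdf_ (the name parsed from onclick) and always normalizes to a string, since re.findall yields text while the fallback arithmetic yields an int. The same flow as a standalone sketch:

import re

def extract_year(name_pdf_, pub_time):
    # Prefer a four-digit year from the pdf title (e.g. '...2022年年度报告');
    # if the title has none, assume the report covers the year before
    # publication. Either way the result is a str.
    try:
        return re.findall(r'\d{4}\s*年', name_pdf_)[0].replace('年', '')
    except IndexError:
        return str(int(pub_time[:4]) - 1)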
@@ -322,7 +328,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # fetch the next company to process
         social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
-        # social_code = '91210800765420138L'
+        # social_code = '91100000100003962T'
         if not social_code:
             time.sleep(20)
             continue
......
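Both scripts drain their work queue through baseCore.redicPullData and sleep when it comes back empty. The diff only shows the call sites; a minimal sketch of what such a pull might look like, assuming the social credit codes sit in a Redis list (the connection details and body here are illustrative, not the project's actual implementation):

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # placeholder connection

def redicPullData(key):
    # Pop one social credit code off the list; None signals an empty queue,
    # which the main loop answers with time.sleep(20) and continue.
    item = r.lpop(key)
    return item.decode('utf-8') if item else None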
# -*- coding: utf-8 -*-
@@ -11,9 +11,10 @@ import json
 from datetime import datetime
 from kafka import KafkaProducer
-from base.BaseCore import BaseCore
-baseCore = BaseCore()
+import sys
+sys.path.append('D:\\KK\\zzsn_spider\\base')
+import BaseCore
+baseCore = BaseCore.BaseCore()
 import requests, re, time, pymysql, fitz
 from bs4 import BeautifulSoup as bs
 from selenium import webdriver
@@ -35,6 +36,7 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
 browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
 log = baseCore.getLogger()
 requests.adapters.DEFAULT_RETRIES = 3
+# the 11 database
 cnx = baseCore.cnx_
 cursor = baseCore.cursor_
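webdriver.Chrome(chrome_options=..., executable_path=...) works on the Selenium 3 line this script appears to target, but both keyword arguments were removed in Selenium 4. A sketch of the equivalent setup under Selenium 4, keeping the same chromedriver path:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

opt = webdriver.ChromeOptions()
# opt.add_argument('--headless')        # common on a display-less deploy host

service = Service(executable_path=r'D:/cmd100/chromedriver.exe')
browser = webdriver.Chrome(service=service, options=opt)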
......@@ -259,10 +261,20 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '91330000734507783B'
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '91440300192176077R'
if not social_code:
time.sleep(20)
if not baseCore.check_mysql_conn(cnx):
# 11数据库
cnx = baseCore.cnx_
cursor = baseCore.cursor_
log.info('===11数据库重新连接成功===')
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log.info('===144数据库重新连接成功===')
continue
if social_code == 'None':
time.sleep(20)
......
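The new branch re-checks both MySQL connections whenever the queue is idle, so long sleeps don't leave the loop holding timed-out handles. check_mysql_conn lives in BaseCore and is not shown in this diff; a minimal sketch of the idea using pymysql's built-in ping (the body here is an assumption, not the project's code):

import pymysql

def check_mysql_conn(conn):
    # Returns False once the server has dropped the handle, so the caller
    # can swap in a fresh connection, as the loop above does.
    try:
        conn.ping(reconnect=False)
        return True
    except pymysql.MySQLError:
        return False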