提交 b127603f 作者: 薛凌堃

年报部署

上级 3ed701a1
import json import json
...@@ -6,11 +6,13 @@ from kafka import KafkaProducer ...@@ -6,11 +6,13 @@ from kafka import KafkaProducer
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import requests, re, time, pymysql, fitz import requests, re, time, pymysql, fitz
import urllib3 import urllib3
from base import BaseCore import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
...@@ -66,7 +68,11 @@ def RequestUrl(url, payload, item_id, start_time): ...@@ -66,7 +68,11 @@ def RequestUrl(url, payload, item_id, start_time):
def SpiderByZJH(url, payload, dic_info, num, start_time): def SpiderByZJH(url, payload, dic_info, num, start_time):
item_id = dic_info[2] item_id = dic_info[2]
# years = dic_info['call_year'] # years = dic_info['call_year']
short_name = dic_info[4] short_name_ = dic_info[4]
if short_name_:
short_name = short_name_
else:
short_name = dic_info[1]
soup = RequestUrl(url, payload, item_id, start_time) soup = RequestUrl(url, payload, item_id, start_time)
if soup == '': if soup == '':
return return
...@@ -96,7 +102,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -96,7 +102,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
pdf_url_info = td_list[2] pdf_url_info = td_list[2]
# print(pdf_url) # print(pdf_url)
pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'') pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf' name_pdf_ = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'') pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
# todo:判断发布日期是否是日期格式 # todo:判断发布日期是否是日期格式
...@@ -118,18 +124,18 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -118,18 +124,18 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
report_type = td_list[4].text.strip() report_type = td_list[4].text.strip()
# print(report_type) # print(report_type)
if report_type == '年报': if report_type == '年报':
if '摘要' in name_pdf: if '摘要' in name_pdf_:
continue continue
# 年份还从pdf名称里抽取 # 年份还从pdf名称里抽取
try: try:
year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '') year = re.findall('\d{4}\s*年', name_pdf_)[0].replace('年', '')
except Exception as e: except Exception as e:
# pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4] # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
year = int(pub_time[:4]) - 1 year = int(pub_time[:4]) - 1
# year = str(year) year = str(year)
# page_size = 0 # page_size = 0
name_pdf = f'{short_name}:{year}年年度报告.pdf'
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id=1''' sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
cursor_.execute(sel_sql, (item_id, year)) cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone() selects = cursor_.fetchone()
...@@ -322,7 +328,7 @@ if __name__ == '__main__': ...@@ -322,7 +328,7 @@ if __name__ == '__main__':
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode') social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
# social_code = '91210800765420138L' # social_code = '91100000100003962T'
if not social_code: if not social_code:
time.sleep(20) time.sleep(20)
continue continue
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -11,9 +11,10 @@ import json ...@@ -11,9 +11,10 @@ import json
from datetime import datetime from datetime import datetime
from kafka import KafkaProducer from kafka import KafkaProducer
from base.BaseCore import BaseCore import sys
sys.path.append('D:\\KK\\zzsn_spider\\base')
baseCore = BaseCore() import BaseCore
baseCore = BaseCore.BaseCore()
import requests, re, time, pymysql, fitz import requests, re, time, pymysql, fitz
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
from selenium import webdriver from selenium import webdriver
...@@ -35,6 +36,7 @@ chromedriver = r'D:/cmd100/chromedriver.exe' ...@@ -35,6 +36,7 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver) browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
log = baseCore.getLogger() log = baseCore.getLogger()
requests.adapters.DEFAULT_RETRIES = 3 requests.adapters.DEFAULT_RETRIES = 3
#11数据库 #11数据库
cnx = baseCore.cnx_ cnx = baseCore.cnx_
cursor = baseCore.cursor_ cursor = baseCore.cursor_
...@@ -259,10 +261,20 @@ if __name__ == '__main__': ...@@ -259,10 +261,20 @@ if __name__ == '__main__':
while True: while True:
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode') social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '91330000734507783B' # social_code = '91440300192176077R'
if not social_code: if not social_code:
time.sleep(20) time.sleep(20)
if not baseCore.check_mysql_conn(cnx):
# 11数据库
cnx = baseCore.cnx_
cursor = baseCore.cursor_
log.info('===11数据库重新连接成功===')
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log.info('===144数据库重新连接成功===')
continue continue
if social_code == 'None': if social_code == 'None':
time.sleep(20) time.sleep(20)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论