提交 63cac106 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

...@@ -5,22 +5,18 @@ import socket ...@@ -5,22 +5,18 @@ import socket
import sys import sys
import time import time
import fitz
import logbook import logbook
import logbook.more import logbook.more
import pandas as pd import pandas as pd
import requests import requests
import zhconv import zhconv
import pymysql
import redis import redis
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid import langid
#创建连接池 #创建连接池
import pymysql import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB from DBUtils.PooledDB import PooledDB
# import sys # import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client') # sys.path.append('D://zzsn_spider//base//fdfs_client')
...@@ -28,6 +24,15 @@ from DBUtils.PooledDB import PooledDB ...@@ -28,6 +24,15 @@ from DBUtils.PooledDB import PooledDB
from fdfs_client.client import get_tracker_conf, Fdfs_client from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf') tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
client = Fdfs_client(tracker_conf) client = Fdfs_client(tracker_conf)
from obs import ObsClient
import fitz
from urllib.parse import unquote
# Module-level Huawei Cloud OBS client used by the upload/exists helpers in this file.
# NOTE(review): the access key and secret key below are hard-coded and committed with the
# source — presumably they should be rotated and loaded from configuration or the
# environment instead; confirm with the owner of this bucket.
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # your Huawei Cloud secret key (SK)
server='https://obs.cn-north-1.myhuaweicloud.com' # address (endpoint) of your bucket
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
...@@ -659,12 +664,10 @@ class BaseCore: ...@@ -659,12 +664,10 @@ class BaseCore:
create_time = retData['create_time'] create_time = retData['create_time']
order_by = num order_by = num
selects = self.secrchATT(item_id,year,type_id) selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects: if selects:
self.getLogger().info(f'com_name:{com_name}已存在') self.getLogger().info(f'com_name:{com_name}--{year}已存在')
id = selects[0] id = ''
return id return id
else: else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
...@@ -695,6 +698,80 @@ class BaseCore: ...@@ -695,6 +698,80 @@ class BaseCore:
log = self.getLogger() log = self.getLogger()
log.info('======保存企业CIK失败=====') log.info('======保存企业CIK失败=====')
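# Upload to the Huawei Cloud OBS server and parse the PDF's content and page count
# Convert a file size in bytes into a human-readable string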
def convert_size(self,size_bytes):
    """Format a byte count as a human-readable string, e.g. ``"1.50 KB"``.

    The value is divided by 1024 for each unit step until it is no longer
    >= 1024 or the largest unit (TB) is reached, then rendered with two
    decimal places followed by the unit name.
    """
    unit_names = ('bytes', 'KB', 'MB', 'GB', 'TB')
    # Try every unit except the last; stop at the first one the value fits in.
    for unit in unit_names[:-1]:
        if not size_bytes >= 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024
    # Anything still this large is reported in the biggest unit.
    return f"{size_bytes:.2f} {unit_names[-1]}"
def obsexist(self,file_path):
    """Check whether an object exists in the 'zzsn' OBS bucket.

    Fetches the object's metadata through the module-level ``obsClient`` and
    treats any response status below 300 as "exists". The outcome is logged
    and also returned so callers can branch on it.

    :param file_path: object key inside the bucket, e.g.
        'XQWAnnualReport/2023-10/<file name>.doc'
    :return: True if the metadata request succeeded (status < 300), else False
    """
    response = obsClient.getObjectMetadata('zzsn', file_path)
    exists = response.status < 300
    if exists:
        self.getLogger().info(f'=====文件存在obs========{file_path}')
    else:
        self.getLogger().info('=====文件不存在obs=====')
    return exists
def uptoOBS(self,pdf_url, name_pdf,type_id, social_code,pathType,taskType,start_time):
    """Download a PDF, upload it to the 'zzsn' OBS bucket and extract its text.

    :param pdf_url: URL the PDF is downloaded from
    :param name_pdf: file name without extension; '.pdf' is appended for the object key
    :param type_id: stored unchanged in the result under 'type_id'
    :param social_code: stored in the result under 'item_id' and used for the run log
    :param pathType: key prefix inside the bucket (e.g. 'XQWAnnualReport/')
    :param taskType: task label passed to ``recordLog`` on failure
    :param start_time: task start timestamp, used to compute the elapsed time for the log
    :return: dict with keys state, type_id, item_id, group_name, path, full_path,
        category, file_size, status, create_by, create_time, page_size, content.
        'state' is True only when download, upload, parsing and the final
        bookkeeping all succeeded; otherwise the dict keeps its initial values.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Step 1: download the file, retrying up to three times.
    response = None
    for _ in range(3):
        try:
            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
    if response is None:
        # Nothing was downloaded, so there is nothing to upload or parse.
        return retData

    # The Content-Length header may be absent or malformed; fall back to the
    # size of the downloaded body instead of failing the whole download.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(response.content)

    # Step 2: upload to OBS and parse the PDF, retrying up to three times.
    result = None
    page_size = 0
    content = ''
    for _ in range(3):
        try:
            if result is None:
                # Upload only once; a later retry is only for the parsing step.
                name = name_pdf + '.pdf'
                now_time = time.strftime("%Y-%m")
                result = obsClient.putContent('zzsn', f'{pathType}{now_time}/' + name, content=response.content)
            with fitz.open(stream=response.content, filetype='pdf') as doc:
                pages = doc.page_count
                # Build the text locally so a failed attempt cannot leave
                # partial or duplicated text in the result.
                text = ''.join(page.get_text() for page in doc.pages())
            page_size, content = pages, text
            break
        except Exception:
            time.sleep(3)
    if page_size < 1:
        # PDF parsing failed
        return retData

    # Step 3: fill in the result; 'state' is only set once everything worked.
    try:
        object_url = result['body']['objectUrl']
        path = object_url.split('.com')[1]
        full_path = unquote(object_url)
        size_text = self.convert_size(file_size)
    except Exception as e:
        state = 0
        takeTime = self.getTimeCost(start_time, time.time())
        self.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
        return retData
    retData['path'] = path
    retData['full_path'] = full_path
    retData['file_size'] = size_text
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData['page_size'] = page_size
    retData['content'] = content
    retData['state'] = True
    return retData
......
...@@ -475,7 +475,14 @@ def kegaishifan(): ...@@ -475,7 +475,14 @@ def kegaishifan():
#双百企业 #双百企业
def shuangbaiqiye():
    """Queue every company name from the ``Hundred`` table onto Redis.

    Reads the CompanyName column and pushes each value onto the
    'hundred:baseinfo' list through the module-level Redis client ``r``.
    """
    cnx, cursor = connectSql()
    cursor.execute("SELECT CompanyName FROM Hundred")
    rows = cursor.fetchall()
    cnx.commit()
    # Collect all names first, then push them in table order.
    company_names = [row[0] for row in rows]
    for company_name in company_names:
        r.rpush('hundred:baseinfo', company_name)
#专精特新 #专精特新
def zhuangjingtexind(): def zhuangjingtexind():
...@@ -484,7 +491,8 @@ def zhuangjingtexind(): ...@@ -484,7 +491,8 @@ def zhuangjingtexind():
if __name__ == "__main__": if __name__ == "__main__":
start = time.time() start = time.time()
# danxiangguanjun() # danxiangguanjun()
kegaishifan() # kegaishifan()
shuangbaiqiye()
# NoticeEnterprise() # NoticeEnterprise()
# AnnualEnterpriseIPO() # AnnualEnterpriseIPO()
# AnnualEnterprise() # AnnualEnterprise()
......
import json import json
import random
import requests, time, pymysql import requests, time, pymysql
import jieba import jieba
import sys import sys
...@@ -45,24 +47,21 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -45,24 +47,21 @@ def beinWork(tyc_code, social_code,start_time):
retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0} retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
t = time.time() t = time.time()
url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100' url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
for m in range(0, 3):
try: try:
for m in range(0, 3):
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent() headers['User-Agent'] = baseCore.getRandomUserAgent()
response = requests.get(url=url, headers=headers, proxies=ip, verify=False) response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
# time.sleep(random.randint(3, 5)) time.sleep(random.randint(3, 5))
break break
except Exception as e:
pass
if (response.status_code == 200): if (response.status_code == 200):
pass pass
else: except Exception as e:
log.error(f"{tyc_code}-----获取总数接口失败") log.error(f"{tyc_code}-----获取总数接口失败")
e = '获取总数接口失败' error = '获取总数接口失败'
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e) baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
return retData return retData
try: try:
json_1 = json.loads(response.content.decode('utf-8')) json_1 = json.loads(response.content.decode('utf-8'))
...@@ -177,7 +176,7 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -177,7 +176,7 @@ def beinWork(tyc_code, social_code,start_time):
pass pass
continue continue
try: try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())''' insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
# 动态信息列表 # 动态信息列表
up_okCount = up_okCount + 1 up_okCount = up_okCount + 1
list_info = [ list_info = [
...@@ -185,6 +184,7 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -185,6 +184,7 @@ def beinWork(tyc_code, social_code,start_time):
link, link,
'天眼查', '天眼查',
'2', '2',
time_format
] ]
cursor_.execute(insert_sql, tuple(list_info)) cursor_.execute(insert_sql, tuple(list_info))
cnx_.commit() cnx_.commit()
...@@ -214,10 +214,10 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -214,10 +214,10 @@ def beinWork(tyc_code, social_code,start_time):
} }
except Exception as e: except Exception as e:
log.info(f'传输失败:{social_code}----{link}') log.info(f'传输失败:{social_code}----{link}')
e = '数据库传输失败' error = '数据库传输失败'
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e) baseCore.recordLog(social_code, taskType, state, takeTime, link, f'{error}----{e}')
continue continue
# print(dic_news) # print(dic_news)
# 将相应字段通过kafka传输保存 # 将相应字段通过kafka传输保存
......
import json import json
...@@ -21,6 +21,7 @@ tracker_conf = get_tracker_conf('./client.conf') ...@@ -21,6 +21,7 @@ tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf) client = Fdfs_client(tracker_conf)
taskType = '企业年报/证监会' taskType = '企业年报/证监会'
pathType = 'ZJHAnnualReport/'
def RequestUrl(url, payload, item_id, start_time): def RequestUrl(url, payload, item_id, start_time):
# ip = get_proxy()[random.randint(0, 3)] # ip = get_proxy()[random.randint(0, 3)]
...@@ -43,26 +44,26 @@ def RequestUrl(url, payload, item_id, start_time): ...@@ -43,26 +44,26 @@ def RequestUrl(url, payload, item_id, start_time):
return soup return soup
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, # def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, page_size): # create_by, create_time, page_size):
#
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1''' # sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
cursor_.execute(sel_sql, (item_id, year)) # cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone() # selects = cursor_.fetchone()
if selects: # if selects:
print(f'{name_pdf},{year}已存在') # print(f'{name_pdf},{year}已存在')
#
else: # else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' # Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
#
values = ( # values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, # year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, # create_by,
create_time, page_size) # create_time, page_size)
#
cursor_.execute(Upsql, values) # 插入 # cursor_.execute(Upsql, values) # 插入
cnx.commit() # 提交 # cnx.commit() # 提交
print("更新完成:{}".format(Upsql)) # print("更新完成:{}".format(Upsql))
# 采集信息 # 采集信息
def SpiderByZJH(url, payload, dic_info, num, start_time): def SpiderByZJH(url, payload, dic_info, num, start_time):
...@@ -121,19 +122,24 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -121,19 +122,24 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
cursor_.execute(sel_sql, (item_id, year)) cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone() selects = cursor_.fetchone()
if selects: if selects:
print(f'com_name:{short_name}、{year}已存在') log.info(f'com_name:{short_name}、{year}已存在')
continue continue
else: else:
retData = baseCore.upLoadToServe(pdf_url, 1, social_code) retData = baseCore.uptoOBS(pdf_url,name_pdf, 1, social_code,pathType,taskType,start_time)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
return False
#插入数据库获取att_id #插入数据库获取att_id
num = num + 1 num = num + 1
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num) att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
content = retData['content'] if att_id:
if retData['state']:
pass pass
else: else:
log.info(f'====pdf解析失败====')
return False return False
content = retData['content']
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
'attachmentIds': att_id, 'attachmentIds': att_id,
...@@ -169,7 +175,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -169,7 +175,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
'message': '操作成功', 'message': '操作成功',
'code': '200', 'code': '200',
} }
print(dic_result) log.info(dic_result)
return True return True
except Exception as e: except Exception as e:
dic_result = { dic_result = {
...@@ -181,7 +187,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -181,7 +187,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败') baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
print(dic_result) log.info(dic_result)
return False return False
else: else:
continue continue
...@@ -311,7 +317,8 @@ if __name__ == '__main__': ...@@ -311,7 +317,8 @@ if __name__ == '__main__':
time.sleep(20) time.sleep(20)
continue continue
dic_info = baseCore.getInfomation(social_code) dic_info = baseCore.getInfomation(social_code)
count = dic_info[15] count = dic_info[16]
log.info(f'====正在采集{social_code}=====')
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html # 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html # url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
url_parms = ['101111', '101811', '102611'] url_parms = ['101111', '101811', '102611']
...@@ -322,7 +329,7 @@ if __name__ == '__main__': ...@@ -322,7 +329,7 @@ if __name__ == '__main__':
dic_parms = getUrl(code, url_parms, Catagory2_parms) dic_parms = getUrl(code, url_parms, Catagory2_parms)
SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time) SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
end_time = time.time() end_time = time.time()
print(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}') log.info(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
count += 1 count += 1
runType = 'AnnualReportCount' runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count) baseCore.updateRun(social_code, runType, count)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -152,24 +152,23 @@ def spider_annual_report(dict_info,num): ...@@ -152,24 +152,23 @@ def spider_annual_report(dict_info,num):
cursor.execute(sel_sql, (social_code, int(year))) cursor.execute(sel_sql, (social_code, int(year)))
selects = cursor.fetchone() selects = cursor.fetchone()
if selects: if selects:
print(f'com_name:{com_name}、{year}已存在') log.info(f'com_name:{com_name}、{year}已存在')
continue continue
else: else:
page_size = 0 #上传文件至obs服务器
#上传文件至文件服务器 retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time)
retData = baseCore.upLoadToServe(pdf_url,1,social_code)
num = num + 1
try:
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
content = retData['content']
if retData['state']: if retData['state']:
pass pass
else: else:
log.info(f'====pdf解析失败====') log.info(f'====pdf解析失败====')
return False return False
num = num + 1
try:
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
content = retData['content']
state = 1 state = 1
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '') baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功')
except: except:
exception = '数据库传输失败' exception = '数据库传输失败'
state = 0 state = 0
...@@ -236,6 +235,7 @@ def spider_annual_report(dict_info,num): ...@@ -236,6 +235,7 @@ def spider_annual_report(dict_info,num):
if __name__ == '__main__': if __name__ == '__main__':
num = 0 num = 0
taskType = '企业年报/雪球网' taskType = '企业年报/雪球网'
pathType = 'XQWAnnualReport/'
while True: while True:
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
......
...@@ -14,6 +14,12 @@ def conn11(): ...@@ -14,6 +14,12 @@ def conn11():
cursor = conn.cursor() cursor = conn.cursor()
return conn,cursor return conn,cursor
def conn144():
    """Open a MySQL connection to the 'caiji' database on 114.115.159.144.

    :return: tuple ``(connection, cursor)``; the caller is responsible for
        closing both.
    """
    connection = pymysql.Connect(
        host='114.115.159.144',
        port=3306,
        user='caiji',
        passwd='zzsn9988',
        db='caiji',
        charset='utf8',
    )
    return connection, connection.cursor()
#企业公告 #企业公告
def shizhiCodeFromSql(): def shizhiCodeFromSql():
conn,cursor=conn11() conn,cursor=conn11()
...@@ -31,6 +37,7 @@ def shizhiCodeFromSql(): ...@@ -31,6 +37,7 @@ def shizhiCodeFromSql():
finally: finally:
cursor.close() cursor.close()
conn.close() conn.close()
#企业公告 #企业公告
def yahooCodeFromSql(): def yahooCodeFromSql():
conn,cursor=conn11() conn,cursor=conn11()
...@@ -49,6 +56,25 @@ def yahooCodeFromSql(): ...@@ -49,6 +56,25 @@ def yahooCodeFromSql():
cursor.close() cursor.close()
conn.close() conn.close()
#新浪纽交所股票对应的代码
def sinausstockCodeFromSql():
    """Load NYSE tickers from MySQL and queue them on Redis for the Sina spider.

    Selects ``ticker`` from ``mgzqyjwyh_list`` where ``state=2`` and
    ``exchange='NYSE'`` and pushes each one onto the Redis list
    'sina_usstock:securities_code'. Any error is logged (with its detail)
    rather than raised, and the database connection is always closed.
    """
    conn,cursor=conn144()
    try:
        gn_query = "select ticker from mgzqyjwyh_list where state=2 and exchange='NYSE'; "
        cursor.execute(gn_query)
        tickers = [row[0] for row in cursor.fetchall()]
        print('sinausstockCodeFromSql开始将股票代码放入redis=======')
        for ticker in tickers:
            r.rpush('sina_usstock:securities_code', ticker)
        print('sinausstockCodeFromSql将股票代码放入redis结束')
    except Exception as e:
        # Include the error itself; the fixed message alone hides whether the
        # query or the Redis push failed.
        log.info(f"数据查询异常----{e}")
    finally:
        cursor.close()
        conn.close()
def yahooCode_task(): def yahooCode_task():
# 实例化一个调度器 # 实例化一个调度器
scheduler = BlockingScheduler() scheduler = BlockingScheduler()
...@@ -58,9 +84,12 @@ def yahooCode_task(): ...@@ -58,9 +84,12 @@ def yahooCode_task():
scheduler.add_job(yahooCodeFromSql, 'cron', day='*/3', hour=0, minute=0) scheduler.add_job(yahooCodeFromSql, 'cron', day='*/3', hour=0, minute=0)
# 每天执行一次 # 每天执行一次
scheduler.add_job(shizhiCodeFromSql, 'cron', hour=10,minute=0) scheduler.add_job(shizhiCodeFromSql, 'cron', hour=10,minute=0)
# Runs at 00:00 on every third day of the month (cron day='*/3'), not daily
scheduler.add_job(sinausstockCodeFromSql, 'cron', day='*/3', hour=0, minute=0)
try: try:
yahooCodeFromSql() # 定时开始前执行一次 # yahooCodeFromSql() # 定时开始前执行一次
shizhiCodeFromSql() # 定时开始前执行一次 # shizhiCodeFromSql() # 定时开始前执行一次
sinausstockCodeFromSql() # 定时开始前执行一次
scheduler.start() scheduler.start()
except Exception as e: except Exception as e:
print('定时采集异常', e) print('定时采集异常', e)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -373,6 +373,28 @@ class YahooCaiwu(object): ...@@ -373,6 +373,28 @@ class YahooCaiwu(object):
currency='' currency=''
return currency return currency
#对比指标计算
def calculateIndexReq(self):
    """Call the comparison-index calculation endpoint (``type=2``).

    Sends a GET request to the calculateIndex service and prints the raw
    response. If the returned JSON ``code`` is -200, waits 600 seconds and
    sends the same request once more. Errors are printed, not raised.
    """
    get_url = 'http://114.115.236.206:8088/sync/calculateIndex'
    params={
        'type':2
    }
    try:
        resp = requests.get(get_url,params=params)
        print(resp.text)
        codee=json.loads(resp.text)['code']
        if codee==-200:
            # The original loop exited after one retry regardless of the
            # outcome, so this is written as a single retry. The retry now
            # sends the same params as the first request.
            # NOTE(review): presumably -200 means "not ready"; confirm whether
            # more retries are wanted and whether a second -200 should still
            # be reported as success below.
            time.sleep(600)
            resp = requests.get(get_url,params=params)
            print(resp.text)
            codee=json.loads(resp.text)['code']
        print('调用接口成功!!')
    except Exception as e:
        # Exception (not a bare except) so Ctrl+C during the long sleep still works.
        print('调用失败!', e)
if __name__ == '__main__': if __name__ == '__main__':
# parse_excel() # parse_excel()
#get_content1() #get_content1()
...@@ -383,8 +405,11 @@ if __name__ == '__main__': ...@@ -383,8 +405,11 @@ if __name__ == '__main__':
securitiescode=yahoo.getCodeFromRedis() securitiescode=yahoo.getCodeFromRedis()
yahoo.get_content2(securitiescode) yahoo.get_content2(securitiescode)
except Exception as e: except Exception as e:
print('没有数据暂停5分钟')
yahoo.calculateIndexReq()
if securitiescode: if securitiescode:
yahoo.r.rpush('NoticeEnterprise:securities_code',securitiescode) yahoo.r.rpush('NoticeEnterprise:securities_code',securitiescode)
else: else:
time.sleep(300) time.sleep(300)
print('没有数据暂停5分钟')
import configparser import configparser
...@@ -20,6 +20,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) ...@@ -20,6 +20,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from operator import itemgetter from operator import itemgetter
from itertools import groupby from itertools import groupby
import datetime import datetime
from decimal import Decimal
class SinaUsstock(object): class SinaUsstock(object):
...@@ -54,13 +55,19 @@ class SinaUsstock(object): ...@@ -54,13 +55,19 @@ class SinaUsstock(object):
seriesValue=tddoc.find('td').text().split(' ') seriesValue=tddoc.find('td').text().split(' ')
for i in range(0,len(pdate)): for i in range(0,len(pdate)):
value=seriesValue[i] value=seriesValue[i]
try:
if '亿' in value: if '亿' in value:
value = value.replace("亿", "*100000000") value = value.replace("亿", "").replace(",", "")
value = eval(value) value = Decimal(value) * Decimal('100000000')
# value = eval(value)
elif '万' in value: elif '万' in value:
value = value.replace("万", "*10000") value = value.replace("万", "").replace(",", "")
value = eval(value) value = Decimal(value) * Decimal('10000')
vvla=str(value) # value = eval(value)
except Exception as e:
print(e)
print(value)
vvla=str(value).replace(",", "")
serisemsg={ serisemsg={
'name':seriesName, 'name':seriesName,
'value':vvla, 'value':vvla,
...@@ -71,6 +78,31 @@ class SinaUsstock(object): ...@@ -71,6 +78,31 @@ class SinaUsstock(object):
return seriesList return seriesList
# 判断股票代码是否存在
def check_code(self,com_code):
    """Tell whether a stock code is being collected for the first time.

    Looks up the Redis key 'com_sinacaiwushuju_code::<com_code>' (db 3).

    :return: True when the key does not exist (first collection),
        False when it already exists.
    """
    client = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
    key_exists = client.exists('com_sinacaiwushuju_code::'+com_code)
    # An existing key means this company has been collected before.
    return not key_exists
def check_date(self,com_code,info_date):
    """Tell whether a report date is already recorded for a stock code.

    Checks membership of ``info_date`` in the Redis set
    'com_sinacaiwushuju_code::<com_code>' (db 3).

    :return: True if the date is a member of the set, otherwise False.
    """
    client = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
    # The dates are stored as a Redis set, hence SISMEMBER.
    is_member = client.sismember('com_sinacaiwushuju_code::'+com_code, info_date)
    return bool(is_member)
# 将采集后的股票代码对应的报告期保存进redis
def add_date(self,com_code,date_list):
    """Record collected report dates for a stock code in Redis.

    Each date is added to the set 'com_sinacaiwushuju_code::<com_code>' (db 3).

    :param com_code: stock code used to build the Redis key
    :param date_list: iterable of report dates; a single ``str``/``bytes``
        date is also accepted and stored as one member
    """
    if isinstance(date_list, (str, bytes)):
        # A lone date string would otherwise be iterated character by
        # character, storing single characters instead of the date.
        date_list = [date_list]
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
    key = 'com_sinacaiwushuju_code::'+com_code
    # Add every date to the set for this stock code.
    for date in date_list:
        r.sadd(key, date)
def getCodeFromRedis(self): def getCodeFromRedis(self):
securitiescode=self.r.lpop('sina_usstock:securities_code') securitiescode=self.r.lpop('sina_usstock:securities_code')
securitiescode = securitiescode.decode('utf-8') securitiescode = securitiescode.decode('utf-8')
...@@ -209,7 +241,7 @@ class SinaUsstock(object): ...@@ -209,7 +241,7 @@ class SinaUsstock(object):
#转换数据格式发送接口 #转换数据格式发送接口
annualzb=zbl1+zbl3+zbl5 annualzb=zbl1+zbl3+zbl5
annualzb=self.groupZbData(annualzb,stock,social_credit_code,'annual') annualzb=self.groupZbData(annualzb,stock,social_credit_code,'year')
self.sendToFinance(annualzb) self.sendToFinance(annualzb)
quarterzb=zbl2+zbl4+zbl6 quarterzb=zbl2+zbl4+zbl6
quarterzb=self.groupZbData(quarterzb,stock,social_credit_code,'quarter') quarterzb=self.groupZbData(quarterzb,stock,social_credit_code,'quarter')
...@@ -228,15 +260,26 @@ class SinaUsstock(object): ...@@ -228,15 +260,26 @@ class SinaUsstock(object):
def sendToFinance(self,zbmsg): def sendToFinance(self,zbmsg):
for zbb in zbmsg: for zbb in zbmsg:
com_code=zbb['securitiesCode']
com_date=zbb['date']
#判断股票代码是否采集过
if self.check_code(com_code):
zbb['ynFirst']=True
if len(zbb) != 0: if len(zbb) != 0:
# 调凯歌接口存储数据 # 调凯歌接口存储数据
data = json.dumps(zbb) data = json.dumps(zbb)
#暂无接口 #暂无接口
url_baocun = '' url_baocun = 'http://114.115.236.206:8088/sync/finance/sina'
# url_baocun = 'http://114.115.236.206:8088/sync/finance/df' # url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
for nnn in range(0, 3): for nnn in range(0, 3):
try: try:
res_baocun = requests.post(url_baocun, data=data) res_baocun = requests.post(url_baocun, data=data)
#将采集到的股票代码和日期进行记录用来标记是否采集过
com_date_list=[]
com_date_list.append(com_date)
self.add_date(com_code,com_date)
self.logger.info(res_baocun.text) self.logger.info(res_baocun.text)
break break
except: except:
...@@ -309,7 +352,7 @@ class SinaUsstock(object): ...@@ -309,7 +352,7 @@ class SinaUsstock(object):
if __name__ == '__main__': if __name__ == '__main__':
sinaUsstock=SinaUsstock() sinaUsstock=SinaUsstock()
# securitiescode= sinaUsstock.r.lpop('sina_usstock:securities_code') # securitiescode= sinaUsstock.r.lpop('sina_usstock:securities_code')
securitiescode= sinaUsstock.getCodeFromRedis() # securitiescode= sinaUsstock.getCodeFromRedis()
securitiescode='AAPL' securitiescode='AAPL'
try: try:
sinaUsstock.get_content2(securitiescode) sinaUsstock.get_content2(securitiescode)
......
...@@ -541,7 +541,10 @@ class BaseCore: ...@@ -541,7 +541,10 @@ class BaseCore:
self.cursor.execute(query) self.cursor.execute(query)
token_list = self.cursor.fetchall() token_list = self.cursor.fetchall()
self.cnx.commit() self.cnx.commit()
try:
token = token_list[random.randint(0, len(token_list)-1)][0] token = token_list[random.randint(0, len(token_list)-1)][0]
except:
token = ''
return token return token
# 删除失效的token # 删除失效的token
......
# 核心工具包
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
#创建连接池
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('/base/client.conf')
client = Fdfs_client(tracker_conf)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'ozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
    """Open every backing connection used by this helper.

    Creates two direct MySQL connections (each with a cursor), a Redis
    client and a pooled connection factory for the ``caiji`` database.
    """
    # NOTE(review): hosts and credentials are hard-coded here; consider
    # moving them to configuration or environment variables.
    caiji_host = '114.115.159.144'

    # Direct connection to the `caiji` database.
    self.cnx = pymysql.connect(host=caiji_host, user='caiji', password='zzsn9988', db='caiji',
                               charset='utf8mb4')
    self.cursor = self.cnx.cursor()

    # Direct connection to the `clb_project` database on the second server.
    self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
                                charset='utf8mb4')
    self.cursor_ = self.cnx_.cursor()

    # Redis client.
    self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

    # Connection pool for the `caiji` database.
    pool_settings = {
        'creator': pymysql,
        'maxconnections': 5,
        'mincached': 2,
        'maxcached': 5,
        'blocking': True,
        'host': caiji_host,
        'port': 3306,
        'user': 'caiji',
        'password': 'zzsn9988',
        'database': 'caiji',
        'charset': 'utf8mb4',
    }
    self.pool_caiji = PooledDB(**pool_settings)
def close(self):
    """Release every resource opened by ``__init__``.

    Each resource is closed independently so that a failure while closing
    one of them does not prevent the remaining ones from being released.
    Attributes that were never set (or are ``None``) are skipped.
    """
    for attr in ('cursor', 'cnx', 'cursor_', 'cnx_', 'r', 'pool_caiji'):
        resource = getattr(self, attr, None)
        if resource is None:
            continue
        try:
            resource.close()
        except Exception:
            # Best-effort shutdown: an already-closed or broken handle
            # must not stop the other resources from being closed.
            pass
# 计算耗时
def getTimeCost(self, start, end):
    """Format the elapsed time between two timestamps (in seconds).

    The largest non-zero unit decides the layout: hours/minutes/seconds,
    minutes/seconds, seconds only, or milliseconds for sub-second spans.
    """
    elapsed = end - start
    seconds = int(elapsed)
    whole_minutes, s = divmod(seconds, 60)
    h, m = divmod(whole_minutes, 60)
    if h > 0:
        return "%d小时%d分钟%d秒" % (h, m, s)
    if m > 0:
        return "%d分钟%d秒" % (m, s)
    if seconds > 0:
        return "%d秒" % (s)
    # Less than one full second: report milliseconds instead.
    return "%d毫秒" % (int(elapsed * 1000))
# Format the current time
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 3 : 1690179526555 Unix timestamp in milliseconds
def getNowTime(self, type):
    """Return the current local time in the form selected by ``type``.

    1 -> ``"%Y-%m-%d %H:%M:%S"`` string, 2 -> ``"%y%m%d%H%M%S"`` string,
    3 -> integer Unix timestamp in milliseconds. Any other value yields
    an empty string.
    """
    if type == 1:
        return time.strftime("%Y-%m-%d %H:%M:%S")
    if type == 2:
        return time.strftime("%y%m%d%H%M%S")
    if type == 3:
        return int(time.time() * 1000)
    return ""
# 获取流水号
def getNextSeq(self):
    """Return the next serial number: a ``%y%m%d%H%M%S`` timestamp plus a 3-digit counter.

    The counter wraps back to 0 after 999 so the suffix is always exactly
    three digits (the previous ``> 1000`` check let it reach 1000, which
    produced a four-digit suffix).
    """
    self.__seq += 1
    if self.__seq > 999:
        self.__seq = 0
    return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
    """Return a generated credit code: ``"ZZSN"`` + ``%y%m%d%H%M%S`` timestamp + 3-digit counter.

    The counter wraps back to 0 after 999 so the suffix is always exactly
    three digits (the previous ``> 1000`` check let it reach 1000, which
    produced a four-digit suffix).
    """
    self.__seq += 1
    if self.__seq > 999:
        self.__seq = 0
    return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self, record, handler):
    """Render a log record as ``[time] [level] [file] [function] [line] message``.

    ``handler`` is accepted to satisfy the formatter signature and is not used.
    """
    template = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}"
    fields = {
        'date': record.time,                                # log time
        'level': record.level_name,                         # log level
        'filename': os.path.split(record.filename)[-1],     # file name without directories
        'func_name': record.func_name,                      # function name
        'lineno': record.lineno,                            # line number
        'msg': record.message,                              # log message
    }
    return template.format(**fields)
# 获取logger
def getLogger(self, fileLogFlag=True, stdOutFlag=True):
    """Build a logbook logger named after the running script.

    Log files go to a ``logs`` directory next to the script (``sys.argv[0]``).

    :param fileLogFlag: attach a daily-rotating file handler when true.
    :param stdOutFlag: attach a colorized stderr handler when true.
    :return: the configured ``logbook.Logger``.
    """
    script_dir, script_name = os.path.split(os.path.abspath(sys.argv[0]))
    dirname = os.path.join(script_dir, "logs")
    filename = script_name.replace(".py", "") + ".log"
    # exist_ok avoids the check-then-create race when several processes
    # start from the same directory at the same time.
    os.makedirs(dirname, exist_ok=True)
    logbook.set_datetime_format('local')
    logger = logbook.Logger(filename)
    logger.handlers = []
    if fileLogFlag:  # write log records to a file
        logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
                                                   bubble=True, encoding='utf-8')
        logFile.formatter = self.logFormate
        logger.handlers.append(logFile)
    if stdOutFlag:  # print log records to the console
        logStd = logbook.more.ColorizedStderrHandler(bubble=True)
        logStd.formatter = self.logFormate
        logger.handlers.append(logStd)
    return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
    """Pick one entry at random from the desktop user-agent pool."""
    agent_pool = self.__USER_AGENT_LIST
    return random.choice(agent_pool)
# 获取代理
def get_proxy(self):
    """Return a random proxy mapping built from the ``clb_proxy`` table.

    Each row's ``proxy`` column is expected to look like ``host-port``.
    Rows that do not contain both parts are skipped.

    :return: ``{"HTTP": "http://host:port", "HTTPS": "http://host:port"}``.
    :raises IndexError: when no usable proxy row exists.
    """
    sql = "select proxy from clb_proxy"
    self.cursor.execute(sql)
    rows = self.cursor.fetchall()

    proxy_list = []
    for row in rows:
        # Read the column value directly instead of parsing the tuple's repr.
        parts = str(row[0]).split('-')
        if len(parts) < 2:
            continue
        proxyMeta = "http://%(host)s:%(port)s" % {
            "host": parts[0],
            "port": parts[1],
        }
        # NOTE(review): keys are kept upper-case for backward compatibility;
        # confirm the consumers of this mapping expect that spelling.
        proxy_list.append({
            "HTTP": proxyMeta,
            "HTTPS": proxyMeta
        })
    # Choose among all available proxies; the previous fixed index range
    # 0..3 failed with fewer than four rows and never used later ones.
    return random.choice(proxy_list)
#字符串截取
def getSubStr(self, str, beginStr, endStr):
    """Cut ``str`` down to the span between two markers.

    The text is first trimmed to start at the last occurrence of
    ``beginStr`` (kept in the result; no trimming when it is absent or
    empty). It is then cut just after the first character of the last
    occurrence of ``endStr`` (no cut when it is absent or empty).
    """
    if beginStr != '':
        start = max(str.rfind(beginStr), 0)
        str = str[start:]
    if endStr != '':
        stop = str.rfind(endStr)
        if stop != -1:
            str = str[:stop + 1]
    return str
# 繁体字转简体字
def hant_2_hans(self, hant_str: str):
    """Convert ``hant_str`` to the ``zh-hans`` variant via ``zhconv.convert``."""
    simplified = zhconv.convert(hant_str, 'zh-hans')
    return simplified
# 判断字符串里是否含数字
def str_have_num(self, str_num):
    """Return True when at least one character of ``str_num`` is a digit."""
    return any(ch.isdigit() for ch in str_num)
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
    """Pop the head element of the Redis list ``key`` and return it decoded.

    Returns ``None`` when nothing (or an empty value) was popped.
    """
    popped = self.r.lpop(key)
    if not popped:
        return None
    return popped.decode()
# 获得脚本进程PID
def getPID(self):
    """Return the process id of the running script."""
    return os.getpid()
# 获取本机IP
def getIP(self):
    """Return the address that the local host name resolves to."""
    host_name = socket.gethostname()
    return socket.gethostbyname(host_name)
def mkPath(self, path):
    """Create directory ``path`` (including missing parents) if nothing exists there.

    An existing path of any kind is left untouched.
    """
    if os.path.exists(path):
        return
    # exist_ok covers the case where another process creates the directory
    # between the existence check above and this call.
    os.makedirs(path, exist_ok=True)
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
    """Create a Chrome WebDriver with automation hints disabled.

    :param path: location of the chromedriver executable.
    :param headless: when true (the default) the browser is started with
        ``--headless`` and ``--disable-gpu``.
    :return: the started ``webdriver.Chrome`` instance.
    """
    driver_service = Service(path)
    opts = webdriver.ChromeOptions()
    startup_flags = ['--headless', '--disable-gpu'] if headless else []
    for flag in startup_flags:
        opts.add_argument(flag)
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)
    opts.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
    # Use a randomly chosen user agent from the pool.
    opts.add_argument('user-agent=' + self.getRandomUserAgent())
    return webdriver.Chrome(options=opts, service=driver_service)
# Look up a company's row in the Hundred table by company name
def getInfomation(self, com_name):
    """Fetch the ``Hundred`` row whose ``CompanyName`` equals ``com_name``.

    The query is parameterized, and the pooled connection is always handed
    back, including when no row matches or the query fails. Previously
    those paths leaked a connection from the blocking pool.

    :param com_name: company name to look up.
    :return: the row as a list; ``None`` when no row matches; ``[]`` when
        the database operation fails.
    """
    data = []
    conn = None
    cursor = None
    try:
        sql = "SELECT * FROM Hundred WHERE CompanyName = %s"
        conn = self.pool_caiji.connection()
        cursor = conn.cursor()
        cursor.execute(sql, (com_name,))
        row = cursor.fetchone()
        conn.commit()
        data = list(row) if row is not None else None
    except Exception:
        log = self.getLogger()
        log.info('=========数据库操作失败========')
    finally:
        for handle in (cursor, conn):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                # Closing is best-effort; the result is already decided.
                pass
    return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
    """Set column ``runType`` to ``count`` for the given company in ``EnterpriseInfo``.

    Values are bound as query parameters, and the pooled connection is
    always returned, even when the statement fails. Failures are logged
    and not raised.

    :param social_code: value matched against ``SocialCode``.
    :param runType: column name to update. It is interpolated into the SQL
        text (identifiers cannot be bound), so it must come from trusted code.
    :param count: new value for that column.
    """
    conn = None
    cursor = None
    try:
        sql_update = f"UPDATE EnterpriseInfo SET {runType} = %s WHERE SocialCode = %s"
        conn = self.pool_caiji.connection()
        cursor = conn.cursor()
        cursor.execute(sql_update, (count, social_code))
        conn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======更新数据库失败======')
    finally:
        for handle in (cursor, conn):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                # Closing is best-effort.
                pass
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
    """Insert one row into ``LogTable`` describing a task run.

    The row also records the current time, this host's IP and the process
    id. The pooled connection is always returned, even when the insert
    fails. Failures are logged and not raised.

    :param xydm: stored in ``SocialCode``.
    :param taskType: stored in ``TaskType``.
    :param state: stored in ``state``.
    :param takeTime: stored in ``TakeTime``.
    :param url: stored in ``url``.
    :param e: stored in ``Exception``.
    """
    cnn = None
    cursor = None
    try:
        createTime = self.getNowTime(1)
        ip = self.getIP()
        pid = self.getPID()
        sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
        cnn = self.pool_caiji.connection()
        cursor = cnn.cursor()
        cursor.execute(sql, values)
        cnn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======保存日志失败=====')
    finally:
        for handle in (cursor, cnn):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                # Closing is best-effort.
                pass
#获取企查查token
def GetToken(self):
    """Return a randomly chosen token from ``QCC_token``, or ``''`` when the table is empty.

    The empty-table case is checked explicitly instead of relying on a
    bare ``except`` around the random pick.
    """
    query = "select token from QCC_token "
    self.cursor.execute(query)
    token_list = self.cursor.fetchall()
    self.cnx.commit()
    if not token_list:
        return ''
    return random.choice(token_list)[0]
# 删除失效的token
def delete_token(self, token):
    """Delete ``token`` from ``QCC_token`` and commit.

    The token is bound as a query parameter rather than pasted into the
    SQL text, so quotes inside it cannot alter the statement.
    """
    deletesql = "delete from QCC_token where token=%s "
    self.cursor.execute(deletesql, (token,))
    self.cnx.commit()
#获取天眼查token
def GetTYCToken(self):
    """Return the first token from ``TYC_token``, or ``''`` when the table is empty.

    An empty table previously raised ``TypeError`` on ``fetchone()[0]``;
    it now yields ``''``, matching ``GetToken``.
    """
    query = 'select token from TYC_token'
    self.cursor.execute(query)
    row = self.cursor.fetchone()
    self.cnx.commit()
    if not row:
        return ''
    return row[0]
#检测语言
def detect_language(self, text):
    """Classify ``text`` with ``langid`` and return the first element of its result.

    Falls back to ``'cn'`` when the result, or its first element, is an
    empty string.
    """
    guess = langid.classify(text)
    if guess == '' or guess[0] == '':
        return 'cn'
    return guess[0]
#追加接入excel
def writerToExcel(self, detailList, filename):
    """Append ``detailList`` as new rows to the existing xlsx file ``filename``.

    The file is read with every column as ``str``, the new rows are added
    at the end, and the combined table is written back to the same path
    without the index column.

    :param detailList: data accepted by ``pandas.DataFrame(data=...)``.
    :param filename: path of an xlsx file that already exists.
    """
    # Read the rows already stored in the workbook.
    existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
    new_data = pd.DataFrame(data=detailList)
    # DataFrame.append was removed in pandas 2.0; concat is the supported
    # way to stack the new rows under the existing ones.
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    combined_data.to_excel(filename, index=False)
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item):
    """Push ``item`` back onto the tail of the Redis list ``key``."""
    redis_client = self.r
    redis_client.rpush(key, item)
#增加计数器的值并返回增加后的值
def incrSet(self, key):
    """Increment the Redis counter ``key``, print the new value and return it."""
    bumped = self.r.incr(key)
    print("增加后的值:", bumped)
    return bumped
#获取key剩余的过期时间
def getttl(self, key):
    """Print the remaining TTL of ``key`` and reset the key when the TTL is negative.

    On a negative TTL the key is set to 0, given a 3600-second expiry, and
    the call then sleeps for two seconds.
    """
    remaining = self.r.ttl(key)
    print("剩余过期时间:", remaining)
    if remaining >= 0:
        return
    # Negative TTL: start a fresh counter window.
    self.r.set(key, 0)
    self.r.expire(key, 3600)
    time.sleep(2)
#上传至文件服务器,并解析pdf的内容和页数
def upLoadToServe(self, pdf_url, type_id, social_code):
    """Download a PDF, upload it to the file server and extract its text.

    The download and the upload/parse step are each attempted up to three
    times, with a three-second pause after a failure.

    Fixes over the previous version:
    - a download that fails every time now returns immediately instead of
      hitting an undefined variable in the upload step;
    - a retry after a parse error no longer uploads the file again;
    - extracted text is rebuilt on each attempt rather than accumulated;
    - a failed parse can no longer be reported as a success.

    :param pdf_url: URL of the PDF to fetch.
    :param type_id: copied into the result as ``type_id``.
    :param social_code: copied into the result as ``item_id``.
    :return: dict describing the attachment. ``state`` is ``True`` only when
        the file was uploaded and at least one page was parsed.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
               'full_path': '', 'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Step 1: download the PDF bytes.
    resp_content = None
    for _ in range(3):
        try:
            resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
            break
        except Exception:
            time.sleep(3)
    if resp_content is None:
        # Nothing was downloaded, so there is nothing to upload or parse.
        print('======pdf解析失败=====')
        return retData

    # Step 2: upload once, then parse the page count and text.
    page_size = 0
    result = None
    for _ in range(3):
        try:
            if result is None:
                result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
            with fitz.open(stream=resp_content, filetype='pdf') as doc:
                pages_text = [page.get_text() for page in doc.pages()]
                page_size = doc.page_count
            retData['content'] = ''.join(pages_text)
            break
        except Exception:
            # Discard partial results so a failed attempt cannot look successful.
            page_size = 0
            time.sleep(3)

    if page_size < 1:
        # PDF parsing failed.
        print('======pdf解析失败=====')
        return retData

    remote_id = bytes.decode(result['Remote file_id'])
    retData['state'] = True
    retData['path'] = remote_id.replace('group1', '')
    retData['full_path'] = remote_id
    retData['file_size'] = result['Uploaded size']
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData['page_size'] = page_size
    return retData
def secrchATT(self, item_id, year, type_id):
    """Return the first ``clb_sys_attachment`` id row matching item, year and type.

    Yields whatever ``fetchone()`` returns for the query.
    """
    lookup_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
    params = (item_id, year, type_id)
    self.cursor_.execute(lookup_sql, params)
    return self.cursor_.fetchone()
#插入到att表 返回附件id
def tableUpdate(self, retData, com_name, year, pdf_name, num):
    """Insert an attachment row into ``clb_sys_attachment`` unless one exists; return its id.

    Args:
        retData: dict with the keys read below (item_id, type_id, group_name,
            path, full_path, category, file_size, status, create_by,
            page_size, create_time).
        com_name: company name, used only in the "already exists" log line.
        year: year stored on the row and used for the existence lookup.
        pdf_name: stored in the ``name`` column.
        num: stored in the ``order_by`` column.

    Returns:
        The first column of the row found by ``secrchATT`` — the existing row
        when there is one, otherwise the row looked up after the insert.
    """
    # Read every field up front, in the original order, so a missing key
    # raises before any query runs.
    (item_id, type_id, group_name, path, full_path, category,
     file_size, status, create_by, page_size, create_time) = (
        retData[field] for field in (
            'item_id', 'type_id', 'group_name', 'path', 'full_path', 'category',
            'file_size', 'status', 'create_by', 'page_size', 'create_time'))

    existing = self.secrchATT(item_id, year, type_id)
    if existing:
        self.getLogger().info(f'com_name:{com_name}已存在')
        return existing[0]

    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    row = (year, pdf_name, type_id, item_id, group_name, path, full_path,
           category, file_size, num, status, create_by, create_time, page_size)
    self.cursor_.execute(Upsql, row)  # insert
    self.cnx_.commit()  # commit
    self.getLogger().info("更新完成:{}".format(Upsql))
    # Look the row up again to obtain the id of what was just inserted.
    return self.secrchATT(item_id, year, type_id)[0]
# 更新企业的CIK
def updateCIK(self, social_code, cik):
    """Store ``cik`` on the ``EnterpriseInfo`` row whose ``SocialCode`` matches.

    Best-effort: any failure is logged and swallowed, so the method always
    returns None. The values are passed as query parameters rather than
    interpolated into the SQL text, and the cursor and connection are
    released even when the statement fails.
    """
    sql = "UPDATE EnterpriseInfo SET CIK = %s WHERE SocialCode = %s"
    cnn = None
    cursor = None
    try:
        cnn = self.pool_caiji.connection()
        cursor = cnn.cursor()
        cursor.execute(sql, (cik, social_code))
        cnn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======保存企业CIK失败=====')
    finally:
        # Release in reverse order of acquisition; a failing close must not
        # turn this best-effort update into an exception for the caller.
        for resource in (cursor, cnn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                pass
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
import json
from kafka import KafkaProducer
from BaseCore import BaseCore
from getQccId import find_id_by_name
# Module-level handles shared by info_by_id() and the __main__ loop in this script.
baseCore = BaseCore()
# Database connection and cursor taken from the BaseCore instance
# (note: read from its `cnx` / `cursor` attributes, bound here with a trailing underscore).
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Logger used for all progress and failure messages in this script.
log = baseCore.getLogger()
# 通过企查查id获取企业基本信息
def _dig(data, *path, default=''):
    """Follow ``path`` (dict keys / list indices) through ``data``; return ``default`` if any step fails."""
    try:
        for step in path:
            data = data[step]
    except (KeyError, IndexError, TypeError):
        return default
    return data


def _blank_if_none(value):
    """Return '' for None and every other value unchanged."""
    return '' if value is None else value


def info_by_id(com_id,com_name):
    """Fetch a company's base information from QCC by its QCC id.

    Two endpoints are queried (``v1/ent/detail`` and ``v6/base/getEntDetail``)
    and their fields are merged into one record.

    Relies on the module-level names ``headers`` and ``token`` (bound by the
    ``__main__`` section of this script) as well as ``baseCore`` and ``log``.
    ``headers['Qcc-Timestamp']`` is overwritten before each request.

    Args:
        com_id: QCC company id, sent as the ``unique`` query parameter.
        com_name: company name; used for logging and for re-queuing on failure.

    Returns:
        A list holding one dict of company fields. An empty list is returned
        when the first response has no ``result.Company`` entry; in that case
        ``com_name`` is pushed back onto the ``hundred:baseinfo`` redis list.

    Raises:
        KeyError / TypeError: when the company record lacks ``Name``,
            ``CreditCode`` or ``Address`` (these are read without a fallback).
        Whatever ``requests.get(...).json()`` raises is propagated unchanged.
    """
    aa_dict_list = []

    # ---- first endpoint: company detail --------------------------------
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
    resp_dict = requests.get(url=url, headers=headers, verify=False).json()
    time.sleep(2)
    com_jc_name = ''  # short name: not available from these endpoints
    try:
        result_dict = resp_dict['result']['Company']
    except Exception:
        log.info(com_name + ":获取失败===========重新放入redis")
        baseCore.rePutIntoR('hundred:baseinfo',com_name)
        return aa_dict_list

    # Name, CreditCode and Address are mandatory: a missing key raises here,
    # before the second request is made.
    company_name = result_dict['Name']
    CreditCode = _blank_if_none(result_dict['CreditCode'])

    OperName = _blank_if_none(_dig(result_dict, 'Oper', 'Name'))
    if baseCore.str_have_num(OperName):
        # A representative name containing digits is discarded.
        OperName = ''

    Status = _blank_if_none(_dig(result_dict, 'ShortStatus'))
    StartDate = _blank_if_none(_dig(result_dict, 'StartDate'))
    RegistCapi = _blank_if_none(_dig(result_dict, 'RegistCapi'))
    EconKind = _blank_if_none(_dig(result_dict, 'EconKind'))

    # Area parts are used as returned (None is not mapped to '').
    Province = _dig(result_dict, 'Area', 'Province')
    City = _dig(result_dict, 'Area', 'City')
    County = _dig(result_dict, 'Area', 'County')
    try:
        region = Province + City + County
    except TypeError:
        # One of the parts is None or not a string.
        region = ''

    # Former names: space-separated, '' if the list or any entry is malformed.
    # The original computed .strip() but dropped its result, leaving a
    # trailing space; the stripped value is now actually used.
    try:
        OriginalName = ''.join(
            former['Name'] + ' ' for former in result_dict['OriginalName']
        ).strip()
    except (KeyError, IndexError, TypeError):
        OriginalName = ''

    Address = _blank_if_none(result_dict['Address'])

    PhoneNumber = _blank_if_none(_dig(result_dict, 'companyExtendInfo', 'Tel'))
    # Website: prefer companyExtendInfo, fall back to the first ContactInfo entry.
    WebSite = _dig(result_dict, 'companyExtendInfo', 'WebSite', default=None)
    if WebSite is None:
        WebSite = _dig(result_dict, 'ContactInfo', 'WebSite', 0, 'Url')
    Email = _blank_if_none(_dig(result_dict, 'companyExtendInfo', 'Email'))
    Desc = _blank_if_none(_dig(result_dict, 'companyExtendInfo', 'Desc'))
    Info = _blank_if_none(_dig(result_dict, 'companyExtendInfo', 'Info'))

    company_name = baseCore.hant_2_hans(company_name)

    # ---- second endpoint: registration detail --------------------------
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
                                                                                                      com_id)
    resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
    time.sleep(1)
    # '' when the response has no company; every lookup below then falls
    # back to its default.
    com2 = _dig(resp_dict2, 'result', 'Company')

    Scope = _dig(com2, 'Scope')
    CheckDate = _blank_if_none(_dig(com2, 'CheckDate'))
    TaxpayerType = _blank_if_none(_dig(com2, 'TaxpayerType'))  # taxpayer qualification
    # The original looked `No` up twice; the second, un-normalised lookup
    # overwrote the None -> '' handling. It is normalised once here.
    No = _blank_if_none(_dig(com2, 'No'))
    # The remaining fields are passed through as returned (None is kept).
    IxCode = _dig(com2, 'IxCode')
    OrgNo = _dig(com2, 'OrgNo')
    TermStart = _dig(com2, 'TermStart')
    TeamEnd = _dig(com2, 'TeamEnd')
    RecCap = _dig(com2, 'RecCap')
    SubIndustry = _dig(com2, 'IndustryArray', -1)
    BelongOrg = _dig(com2, 'BelongOrg')
    EnglishName = _dig(com2, 'EnglishName')

    # Insured head-count: value of the CommonList entry labelled '参保人数'.
    # NOTE(review): the original blanked a value of '0', but did so before
    # this lookup ran, so it never took effect; that (non-)behaviour is kept —
    # confirm whether '0' should become ''.
    can_bao = ''
    try:
        for Common_t in com2['CommonList']:
            try:
                if Common_t['KeyDesc'] == '参保人数':
                    can_bao = Common_t['Value']
            except (KeyError, IndexError, TypeError):
                # Entry without the expected keys: skip it.
                pass
    except (KeyError, IndexError, TypeError):
        can_bao = ''

    aa_dict = {
        'qccId': com_id,  # QCC company id
        'name': company_name,  # company name
        'shortName': com_jc_name,  # company short name
        'socialCreditCode': CreditCode,  # unified social credit code
        'legalPerson': OperName,  # legal representative
        'officialPhone': PhoneNumber,  # phone
        'officialUrl': WebSite,  # official website
        'officialEmail': Email,  # e-mail
        'briefInfo': Desc,  # introduction
        'registerStatus': Status,  # registration status
        'incorporationDate': StartDate,  # date of establishment
        'capital': RegistCapi,  # registered capital
        'paidCapital': RecCap,  # paid-in capital
        'approvalDate': CheckDate,  # approval date
        'organizationCode': OrgNo,  # organisation code
        'registerNo': No,  # business registration number
        'taxpayerNo': CreditCode,  # taxpayer identification number (the credit code is used)
        'type': EconKind,  # company type
        'businessStartDate': TermStart,  # business term start
        'businessEndDate': TeamEnd,  # business term end
        'taxpayerQualification': TaxpayerType,  # taxpayer qualification
        'industry': SubIndustry,  # industry
        'region': region,
        'province': Province,  # province
        'city': City,  # city
        'county': County,  # county
        'registerDepartment': BelongOrg,  # registration authority
        'scale': Info,  # staff size
        'insured': can_bao,  # insured head-count
        'beforeName': OriginalName,  # former names
        'englishName': EnglishName,  # English name
        'importExportEnterpriseCode': IxCode,  # import/export enterprise code
        'address': Address,  # address
        'businessRange': Scope,  # business scope
        'status': 0,  # status
    }
    aa_dict_list.append(aa_dict)
    log.info(company_name + ":爬取完成")
    return aa_dict_list
if __name__ == '__main__':
# Script entry point: repeatedly pulls a company name from the redis list
# 'hundred:baseinfo', resolves its QCC id, fetches its base information with
# info_by_id() and sends each record to Kafka.
# NOTE(review): indentation is lost in this copy of the file, so the nesting of
# the branches below cannot be confirmed from here.
taskType = '基本信息/企查查/单项双百企业冠军'
# Request headers; info_by_id() reads this module-level name and overwrites
# 'Qcc-Timestamp' before each request.
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
# Failure messages collected for the "failed companies" spreadsheet
list_weicha = []
# {'yuan_name', 'get_name'} pairs collected for the name-comparison spreadsheet
name_list = []
# Pull work items from redis
while True:
# TODO: the token has to be re-captured roughly every two hours; it is read from the database
token = baseCore.GetToken()
if token:
pass
else:
log.info('==========已无token==========')
time.sleep(30)
continue
# list_all_info = []
start_time = time.time()
# Fetch the next company to process
com_name = baseCore.redicPullData('hundred:baseinfo')
# com_name = '卓新市万达铸业有限公司'
if com_name == '' or com_name is None:
time.sleep(20)
continue
dic_info = baseCore.getInfomation(com_name)
log.info(f'----当前企业{com_name}--开始处理---')
social_code = dic_info[5]
# QCC (Qichacha) company id
company_id = dic_info[6]
# With no stored id: search by credit code when there is one, otherwise by name
if company_id == None:
if social_code:
company_id = find_id_by_name(start_time,token,social_code)
else:
company_id = find_id_by_name(start_time,token,com_name)
# find_id_by_name returns the string 'null' when the search has no usable hit
if company_id == 'null':
log.info('=====搜索不到该企业====')
# TODO: companies that cannot be found have no credit code and cannot be sent on; generate one
baseCore.rePutIntoR('hundred:baseinfo', com_name + ':搜索不到')
continue
# A falsy id (find_id_by_name returns False on failure) is handled as an invalid token:
# the company is re-queued and the token deleted
if not company_id:
log.info(com_name + ":企业ID获取失败===重新放入redis")
list_weicha.append(com_name + ":企业ID获取失败")
baseCore.rePutIntoR('hundred:baseinfo',com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
time.sleep(20)
continue
else:
log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
# TODO: write to the database
# NOTE(review): the SQL text is built by interpolating company_id and com_name;
# consider passing them as query parameters instead.
updateqccid = f"update Hundred set qccid = '{company_id}' where CompanyName = '{com_name}'"
cursor_.execute(updateqccid)
cnx_.commit()
# Any exception raised by info_by_id is handled like an invalid token
try:
post_data_list = info_by_id(company_id, com_name)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
# NOTE(review): this key is spelled 'hundred:baseInfo' (capital I) while every other
# push/pull in this script uses 'hundred:baseinfo' — confirm which is intended.
baseCore.rePutIntoR('hundred:baseInfo', com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
continue
# An empty list means info_by_id found no company record (it has already re-queued the name)
if post_data_list:
pass
else:
# log.info(f'======{social_code}====企查查token失效====')
time.sleep(20)
continue
for post_data in post_data_list:
# list_all_info.append(post_data)
if post_data is None:
print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
# TODO: write the credit code back to the table
# NOTE(review): SQL text built by string interpolation, as above.
updatesocialcode = f"update Hundred set SocialCode = '{get_socialcode}' where CompanyName = '{com_name}'"
cursor_.execute(updatesocialcode)
cnx_.commit()
# Record the queued name next to the name QCC returned
name_compile = {
'yuan_name':com_name,
'get_name':get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
# Send the record to the Kafka topic "regionInfo"; a failure is recorded via recordLog
try:
# NOTE(review): a new KafkaProducer is constructed each time this line runs and is
# not closed anywhere in the visible code.
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# break
# Export the name comparison and the failure list to dated spreadsheets under ./data
# NOTE(review): with indentation lost it is unclear whether this export runs inside the
# `while True` loop or after it (where it would never be reached) — confirm.
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
# -*- coding: utf-8 -*-
import time
from urllib.parse import quote
import requests
import urllib3
from BaseCore import BaseCore
# Module-level BaseCore instance and its logger, used by find_id_by_name().
baseCore = BaseCore()
log = baseCore.getLogger()
# Earlier header set, kept for reference only (commented out).
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
# Request headers sent with every search request made by find_id_by_name().
# 'Qcc-Timestamp' below is only a placeholder: find_id_by_name() overwrites it
# before each request.
# NOTE(review): 'authMini' carries a hard-coded bearer value — confirm whether it
# should be loaded from configuration instead of living in source.
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'x-request-device-type': 'Android',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
'Content-Type': 'application/json',
'Qcc-Version': '1.0.0',
'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
'xweb_xhr': '1',
'xcx-version': '2023.09.27',
'Qcc-Platform': 'mp-weixin',
'Qcc-CurrentPage': '/company-subpackages/business/index',
'Qcc-Timestamp': '1696661787803',
'Qcc-RefPage': '/company-subpackages/detail/index',
'Accept': '*/*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh'
}
# 通过企业名称或信用代码获取企查查id
def find_id_by_name(start,token,name):
    """Search QCC for a company and return the id (``KeyNo``) of the first hit.

    Args:
        start: start timestamp (``time.time()``), used only for elapsed-time logging.
        token: QCC session token placed in the query string.
        name: search key — a company name or a credit code.

    Returns:
        * the ``KeyNo`` string of the first search result;
        * the string ``'null'`` when the search returned no result or the
          first result has an empty name;
        * ``False`` when the request could not be completed, the token was
          rejected (status 40101), the account was rate-limited (status 401),
          or the response did not have the expected shape.

    Uses the module-level ``headers`` dict and overwrites its
    ``'Qcc-Timestamp'`` entry.
    """
    urllib3.disable_warnings()
    qcc_key = name
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="

    # Up to five attempts, five seconds apart.
    resp_dict = None
    for lll in range(1, 6):
        try:
            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
            break
        except Exception as e:
            print(f'{e}-------------重试')
            time.sleep(5)
    if not isinstance(resp_dict, dict):
        # Every attempt failed (or the body was not a JSON object). Previously
        # this path hit an unbound variable; report it as a failure instead.
        log.info(f'====请求失败,重试次数已用尽====时间{baseCore.getTimeCost(start, time.time())}')
        return False
    time.sleep(2)

    # Known error payloads:
    # {'status': 40101, 'message': '无效的sessionToken!'}
    # {'status': 401, 'message': '您的账号访问超频,请升级小程序版本'}
    status = resp_dict.get('status')
    if status == 40101:
        log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
        return False
    if status == 401:
        log.info(f'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
        return False

    try:
        hits = resp_dict['result']['Result']
        if hits:
            first_hit = hits[0]
            KeyNo = first_hit['KeyNo']
            # The name may carry <em> highlight tags; a hit whose name is
            # empty once they are removed counts as "not found".
            Name = first_hit['Name'].replace('<em>', '').replace('</em>', '').strip()
            if Name == '':
                KeyNo = 'null'
        else:
            KeyNo = 'null'
    except (KeyError, IndexError, TypeError, AttributeError):
        # Unexpected response shape; the original logs this as a token failure.
        log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
        return False
    log.info("{},企业代码为:{}".format(qcc_key, KeyNo))
    return KeyNo
\ No newline at end of file
...@@ -541,7 +541,10 @@ class BaseCore: ...@@ -541,7 +541,10 @@ class BaseCore:
self.cursor.execute(query) self.cursor.execute(query)
token_list = self.cursor.fetchall() token_list = self.cursor.fetchall()
self.cnx.commit() self.cnx.commit()
try:
token = token_list[random.randint(0, len(token_list)-1)][0] token = token_list[random.randint(0, len(token_list)-1)][0]
except:
token = ''
return token return token
# 删除失效的token # 删除失效的token
......
import json import json
...@@ -18,12 +18,23 @@ cnx_ = baseCore.cnx_ ...@@ -18,12 +18,23 @@ cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_ cursor_ = baseCore.cursor_
taskType = '企业公告/证监会' taskType = '企业公告/证监会'
obsClient = ObsClient( obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码 access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址 server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
) )
#获取文件大小
def convert_size(size_bytes):
    """Format a byte count as a string with two decimals and a unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, e.g. ``1536`` -> ``'1.50 KB'``.
    """
    units = ('bytes', 'KB', 'MB', 'GB', 'TB')
    # Walk every unit but the last; stop at the first one the value fits in.
    for unit in units[:-1]:
        if not size_bytes >= 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024
    return f"{size_bytes:.2f} {units[-1]}"
def uptoOBS(pdf_url,pdf_name,type_id,social_code): def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {} headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '', retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
...@@ -33,7 +44,8 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code): ...@@ -33,7 +44,8 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers['User-Agent'] = baseCore.getRandomUserAgent() headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3): for i in range(0, 3):
try: try:
resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break break
except: except:
time.sleep(3) time.sleep(3)
...@@ -42,8 +54,9 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code): ...@@ -42,8 +54,9 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
for i in range(0, 3): for i in range(0, 3):
try: try:
name = pdf_name + '.pdf' name = pdf_name + '.pdf'
result = obsClient.putContent('zzsn', 'ZJH/'+name, content=resp_content) now_time = time.strftime("%Y-%m")
with fitz.open(stream=resp_content, filetype='pdf') as doc: result = obsClient.putContent('zzsn', f'ZJH/{now_time}/'+name, content=response.content)
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count page_size = doc.page_count
for page in doc.pages(): for page in doc.pages():
retData['content'] += page.get_text() retData['content'] += page.get_text()
...@@ -60,23 +73,25 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code): ...@@ -60,23 +73,25 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
try: try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('/ZJH')[0] retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl']) retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = result['Uploaded size'] retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now retData['create_time'] = time_now
retData['page_size'] = page_size retData['page_size'] = page_size
except: except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData return retData
return retData return retData
def secrchATT(item_id, name, type_id): def secrchATT(item_id, name, type_id,order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s ''' sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
cursor_.execute(sel_sql, (item_id, name, type_id)) cursor_.execute(sel_sql, (item_id, name, type_id,order_by))
selects = cursor_.fetchone() selects = cursor_.fetchone()
return selects return selects
# 插入到att表 返回附件id # 插入到att表 返回附件id
def tableUpdate(retData, com_name, year, pdf_name, num): def tableUpdate(retData, com_name, year, pdf_name, num):
item_id = retData['item_id'] item_id = retData['item_id']
...@@ -91,13 +106,13 @@ def tableUpdate(retData, com_name, year, pdf_name, num): ...@@ -91,13 +106,13 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
page_size = retData['page_size'] page_size = retData['page_size']
create_time = retData['create_time'] create_time = retData['create_time']
order_by = num order_by = num
selects = secrchATT(item_id, pdf_name, type_id) # selects = secrchATT(item_id, pdf_name, type_id)
#
if selects: # if selects:
log.info(f'com_name:{com_name}已存在') # log.info(f'pdf_name:{pdf_name}已存在')
id = selects[0] # id = ''
return id # return id
else: # else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = ( values = (
...@@ -108,7 +123,7 @@ def tableUpdate(retData, com_name, year, pdf_name, num): ...@@ -108,7 +123,7 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
cursor_.execute(Upsql, values) # 插入 cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交 cnx_.commit() # 提交
log.info("更新完成:{}".format(Upsql)) log.info("更新完成:{}".format(Upsql))
selects = secrchATT(item_id, pdf_name, type_id) selects = secrchATT(item_id, pdf_name, type_id,order_by)
id = selects[0] id = selects[0]
return id return id
...@@ -125,6 +140,7 @@ def RequestUrl(url, payload, social_code,start_time): ...@@ -125,6 +140,7 @@ def RequestUrl(url, payload, social_code,start_time):
pass pass
# 检查响应状态码 # 检查响应状态码
try:
if response.status_code == 200: if response.status_code == 200:
# 请求成功,处理响应数据 # 请求成功,处理响应数据
# print(response.text) # print(response.text)
...@@ -137,6 +153,12 @@ def RequestUrl(url, payload, social_code,start_time): ...@@ -137,6 +153,12 @@ def RequestUrl(url, payload, social_code,start_time):
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败') baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
soup = '' soup = ''
except:
log.error('请求失败:', url)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
soup = ''
return soup return soup
def getUrl(code, url_parms, Catagory2_parms): def getUrl(code, url_parms, Catagory2_parms):
...@@ -215,7 +237,6 @@ def getUrl(code, url_parms, Catagory2_parms): ...@@ -215,7 +237,6 @@ def getUrl(code, url_parms, Catagory2_parms):
} }
return dic_parms return dic_parms
def ifInstert(short_name, social_code, pdf_url): def ifInstert(short_name, social_code, pdf_url):
ifexist = True ifexist = True
...@@ -229,16 +250,19 @@ def ifInstert(short_name, social_code, pdf_url): ...@@ -229,16 +250,19 @@ def ifInstert(short_name, social_code, pdf_url):
return ifexist return ifexist
else: else:
return ifexist return ifexist
def InsterInto(short_name, social_code, pdf_url):
def InsterInto(social_code, pdf_url,pub_time):
insert = False
# 信息插入数据库 # 信息插入数据库
try: try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())''' insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
list_info = [ list_info = [
social_code, social_code,
pdf_url, pdf_url,
'证监会', '证监会',
'1', '1',
pub_time,
] ]
#144数据库 #144数据库
cursor.execute(insert_sql, tuple(list_info)) cursor.execute(insert_sql, tuple(list_info))
...@@ -251,8 +275,18 @@ def InsterInto(short_name, social_code, pdf_url): ...@@ -251,8 +275,18 @@ def InsterInto(short_name, social_code, pdf_url):
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败') baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
return insert return insert
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num): def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
#判断文件是否已经存在obs服务器中
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
now_time = time.strftime("%Y-%m")
file_path = 'ZJH/'+now_time+'/'+pdf_name+'.pdf'
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
log.info('=====文件不存在obs=====')
pass
else:
log.info(f'=====文件存在obs========{file_path}')
return False
#上传至华为云服务器 #上传至华为云服务器
retData = uptoOBS(pdf_url,pdf_name,8,social_code) retData = uptoOBS(pdf_url,pdf_name,8,social_code)
#附件插入att数据库 #附件插入att数据库
...@@ -263,12 +297,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -263,12 +297,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
return False return False
num = num + 1 num = num + 1
att_id = tableUpdate(retData,com_name,year,pdf_name,num) att_id = tableUpdate(retData,com_name,year,pdf_name,num)
content = retData['content'] if att_id:
if retData['state']:
pass pass
else: else:
log.info(f'====pdf解析失败====')
return False return False
content = retData['content']
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
...@@ -304,7 +337,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -304,7 +337,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
'message': '操作成功', 'message': '操作成功',
'code': '200', 'code': '200',
} }
print(dic_result) log.info(dic_result)
return True return True
except Exception as e: except Exception as e:
dic_result = { dic_result = {
...@@ -316,14 +349,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na ...@@ -316,14 +349,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败') baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
print(dic_result) log.info(dic_result)
return False return False
# 采集信息 # 采集信息
def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库中获取到的基本信息 def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库中获取到的基本信息
okCount = 0
errorCount = 0
social_code = dic_info[2] social_code = dic_info[2]
short_name = dic_info[4] short_name = dic_info[4]
com_name = dic_info[1] com_name = dic_info[1]
...@@ -335,26 +365,26 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库 ...@@ -335,26 +365,26 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
try: try:
is_exist = soup.find('div',class_='con').text is_exist = soup.find('div',class_='con').text
if is_exist == '没有查询到数据': if is_exist == '没有查询到数据':
state = 1 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, '') baseCore.recordLog(social_code, taskType, state, takeTime, url, '没有查询到数据')
return return
except: except:
pass pass
# 先获取页数 # # 先获取页数
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text # page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
#
total = re.findall(r'\d+', page)[0] # total = re.findall(r'\d+', page)[0]
#
r_page = int(total) % 15 # r_page = int(total) % 15
if r_page == 0: # if r_page == 0:
Maxpage = int(total) // 15 # Maxpage = int(total) // 15
else: # else:
Maxpage = int(total) // 15 + 1 # Maxpage = int(total) // 15 + 1
log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页') # log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
# 首页和其他页不同,遍历 如果是首页 修改一下链接 # # 首页和其他页不同,遍历 如果是首页 修改一下链接
for i in range(1, Maxpage + 1): for i in range(1,51):
log.info(f'==========正在采集第{i}页=========') log.info(f'==========正在采集第{i}页=========')
if i == 1: if i == 1:
href = url href = url
...@@ -366,9 +396,9 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库 ...@@ -366,9 +396,9 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
if soup == '': if soup == '':
continue continue
tr_list = soup.find('div', id='txt').find_all('tr') tr_list = soup.find('div', id='txt').find_all('tr')
pageIndex = 0 # pageIndex = 0
for tr in tr_list[1:]: for tr in tr_list[1:]:
pageIndex += 1 # pageIndex += 1
td_list = tr.find_all('td') td_list = tr.find_all('td')
pdf_url_info = td_list[2] pdf_url_info = td_list[2]
# print(pdf_url) # print(pdf_url)
...@@ -376,6 +406,12 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库 ...@@ -376,6 +406,12 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[1].strip('\'') name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[1].strip('\'')
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'') pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
#todo:判断发布日期是否是日期格式
pattern = r"^\d{4}-\d{2}-\d{2}$" # 正则表达式匹配YYYY-MM-DD格式的日期
if re.match(pattern, pub_time):
pass
else:
continue
year = pub_time[:4] year = pub_time[:4]
report_type = td_list[4].text.strip() report_type = td_list[4].text.strip()
...@@ -383,30 +419,22 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库 ...@@ -383,30 +419,22 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
ifexist = ifInstert(short_name, social_code, pdf_url) ifexist = ifInstert(short_name, social_code, pdf_url)
#如果不存在 ifexist = True #如果不存在 ifexist = True
if ifexist: if ifexist:
# # 公告信息列表
# okCount = okCount + 1
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败 # 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
log.info(f'======={short_name}========{code}===插入公告库成功')
result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num) result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num)
if result: if result:
# 公告信息列表 # 公告信息列表
okCount = okCount + 1
log.info(f'{short_name}==============解析传输操作成功') log.info(f'{short_name}==============解析传输操作成功')
state = 1 state = 1
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '') baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
#发送kafka成功之后 再插入数据库
insert = InsterInto(social_code,pdf_url,pub_time)
if insert:
log.info(f'===={social_code}========{name_pdf}=====插入库成功')
pass pass
else: else:
errorCount += 1
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.error(f'{short_name}=============解析或传输操作失败')
# try:
# insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex,type) values('{social_code}','证监会','{pdf_url}','{name_pdf}','{pub_time}',' ',now(),1,{i},{pageIndex},'1')"
# cursor_.execute(insert_err_sql)
# cnx_.commit()
# except:
# pass
continue continue
else: else:
log.info(f'======={short_name}========{code}===已存在') log.info(f'======={short_name}========{code}===已存在')
...@@ -449,14 +477,15 @@ if __name__ == '__main__': ...@@ -449,14 +477,15 @@ if __name__ == '__main__':
while True: while True:
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode') # social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
# social_code = '9110000071092841XX' social_code = '91440500617540496Q'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
continue continue
dic_info = baseCore.getInfomation(social_code) dic_info = baseCore.getInfomation(social_code)
count = dic_info[16]
count = dic_info[17]
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html # 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html # url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
...@@ -474,11 +503,14 @@ if __name__ == '__main__': ...@@ -474,11 +503,14 @@ if __name__ == '__main__':
com_name = dic_info[1] com_name = dic_info[1]
dic_parms = getUrl(code, url_parms, Catagory2_parms) dic_parms = getUrl(code, url_parms, Catagory2_parms)
dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls) dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
if dic_parms: if dic_parms:
start_time_cj = time.time() start_time_cj = time.time()
log.info(f'======开始处理{com_name}=====发行公告=======')
SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time,num) SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time,num)
log.info(f'{code}==========={short_name},{com_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}') log.info(f'{code}==========={short_name},{com_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
start_time_ls = time.time() start_time_ls = time.time()
log.info(f'======开始处理{com_name}=====临时报告=======')
SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time,num) SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time,num)
log.info(f'{code}==========={short_name},{com_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}') log.info(f'{code}==========={short_name},{com_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
# UpdateInfoSql(retData,retData_ls,social_code) # UpdateInfoSql(retData,retData_ls,social_code)
...@@ -487,11 +519,7 @@ if __name__ == '__main__': ...@@ -487,11 +519,7 @@ if __name__ == '__main__':
log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------') log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------')
count += 1 count += 1
runType = 'NoticeReportCount' runType = 'NoticeReportCount'
baseCore.updateRun(code, runType, count) baseCore.updateRun(social_code, runType, count)
cursor.close() cursor.close()
cnx.close() cnx.close()
# cursor_.close()
# cnx_.close()
# 释放资源
baseCore.close() baseCore.close()
...@@ -11,24 +11,28 @@ import logbook.more ...@@ -11,24 +11,28 @@ import logbook.more
import pandas as pd import pandas as pd
import requests import requests
import zhconv import zhconv
import pymysql
import redis import redis
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid import langid
#创建连接池 #创建连接池
import pymysql import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB from DBUtils.PooledDB import PooledDB
# import sys # import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client') # sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf') tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
client = Fdfs_client(tracker_conf) client = Fdfs_client(tracker_conf)
from obs import ObsClient
import fitz
from urllib.parse import unquote
# Module-level Huawei Cloud OBS client used for uploads to the object store.
# NOTE(review): the access key and secret key below are hard-coded in source
# control. Consider loading them from configuration or environment variables
# and rotating these values -- confirm with the service owner.
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # your Huawei Cloud secret key (SK)
server='https://obs.cn-north-1.myhuaweicloud.com' # address of your bucket's OBS endpoint
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
...@@ -437,9 +441,9 @@ class BaseCore: ...@@ -437,9 +441,9 @@ class BaseCore:
#解析word文件页数 #解析word文件页数
def doc_page(self,file_path): # def doc_page(self,file_path):
doc = Document(file_path) # doc = Document(file_path)
return len(doc.sections) # return len(doc.sections)
def pdf_content(self,resp_content): def pdf_content(self,resp_content):
# 解析pdf文件内容 # 解析pdf文件内容
content = '' content = ''
...@@ -507,9 +511,9 @@ class BaseCore: ...@@ -507,9 +511,9 @@ class BaseCore:
# retData['page_size'] = page_size # retData['page_size'] = page_size
return retData return retData
def secrchATT(self,item_id,file_name,type_id): def secrchATT(self,item_id,file_name,type_id,order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s ''' sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id)) self.cursor_.execute(sel_sql, (item_id, file_name, type_id,order_by))
selects = self.cursor_.fetchone() selects = self.cursor_.fetchone()
return selects return selects
...@@ -527,13 +531,8 @@ class BaseCore: ...@@ -527,13 +531,8 @@ class BaseCore:
page_size = retData['page_size'] page_size = retData['page_size']
create_time = retData['create_time'] create_time = retData['create_time']
order_by = num order_by = num
selects = self.secrchATT(item_id,file_name,type_id)
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id,full_path
else:
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = ( values = (
...@@ -544,11 +543,71 @@ class BaseCore: ...@@ -544,11 +543,71 @@ class BaseCore:
self.cursor_.execute(Upsql, values) # 插入 self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交 self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql)) self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,file_name,type_id) selects = self.secrchATT(item_id,file_name,type_id,order_by)
id = selects[0] id = selects[0]
return id,full_path return id,full_path
# 获取文件大小
def convert_size(self, size_bytes):
    """Render a byte count as a human-readable string.

    The value is divided by 1024 for as long as it is at least 1024 and a
    larger unit is still available, then formatted with two decimals
    followed by the unit name ('bytes', 'KB', 'MB', 'GB' or 'TB').

    Args:
        size_bytes: Size in bytes.

    Returns:
        str: For example ``'1.50 KB'``.
    """
    unit_names = ('bytes', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    unit = unit_names[0]
    # Step up one unit at a time; TB is the ceiling, so larger values stay in TB.
    for bigger_unit in unit_names[1:]:
        if value >= 1024:
            value = value / 1024
            unit = bigger_unit
        else:
            break
    return f"{value:.2f} {unit}"
def uptoOBS(self, file_href, item_id, pathType, file_name):
    """Download an attachment and upload it to the 'zzsn' OBS bucket.

    The file at ``file_href`` is fetched over HTTP (up to three attempts)
    and its bytes are uploaded with ``obsClient.putContent`` under the
    object key ``pathType + file_name`` (up to three attempts).

    Args:
        file_href: URL of the attachment to download.
        item_id: Stored unchanged under ``retData['item_id']``.
        pathType: Object-key prefix inside the bucket, e.g.
            ``'policy/beijing/'``.
        file_name: Name used for the stored object. The extension taken
            from ``file_href`` is appended when the name does not already
            end with it.

    Returns:
        dict: The attachment record. ``state`` is True only when the
        download and the upload both succeeded and ``path``,
        ``full_path``, ``file_size`` and ``create_time`` were filled in.
        Otherwise the record is returned with ``state`` False and its
        default (empty) values. ``page_size`` and ``content`` keep their
        defaults because no document parsing is performed here.
    """
    category = os.path.splitext(file_href)[1]
    retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
               'full_path': '',
               'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Download with retries. Any failure is treated as "try again" so a
    # flaky source site never aborts the calling crawler.
    response = None
    for _ in range(3):
        try:
            response = requests.get(file_href, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
    if response is None:
        # Every download attempt failed; nothing to upload.
        return retData

    # Content-Length is optional (e.g. chunked responses); fall back to the
    # size of the downloaded body instead of discarding a good response.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(response.content)

    # splitext() already returns the extension with its leading dot, so it
    # is appended as-is; the check is case-insensitive ('.PDF' vs '.pdf').
    if category and not file_name.lower().endswith(category.lower()):
        file_name = file_name + category

    # Upload with retries.
    result = None
    for _ in range(3):
        try:
            result = obsClient.putContent('zzsn', f'{pathType}' + file_name, content=response.content)
            break
        except Exception:
            time.sleep(3)
    if result is None:
        # Every upload attempt failed.
        return retData

    try:
        object_url = result['body']['objectUrl']
        path = object_url.split('.com')[1]
        full_path = unquote(object_url)
        readable_size = self.convert_size(file_size)
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    except Exception as e:
        # The upload response did not carry a usable object URL.
        print(f'error:{e}')
        return retData

    # Success depends on the upload result only; mark it last so a
    # half-filled record is never reported as successful.
    retData['path'] = path
    retData['full_path'] = full_path
    retData['file_size'] = readable_size
    retData['create_time'] = time_now
    retData['state'] = True
    return retData
......
...@@ -224,6 +224,7 @@ def get_content1(): ...@@ -224,6 +224,7 @@ def get_content1():
# 判断是否已经爬取过 # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
try: try:
...@@ -383,6 +384,7 @@ def get_content2(): ...@@ -383,6 +384,7 @@ def get_content2():
# # 判断是否已经爬取过 # # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
try: try:
...@@ -563,6 +565,7 @@ def get_content3(): ...@@ -563,6 +565,7 @@ def get_content3():
pub_time = li.split('<span>[')[1].split(']</span>')[0] pub_time = li.split('<span>[')[1].split(']</span>')[0]
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
sendContent(href, headers,title,pub_time,num) sendContent(href, headers,title,pub_time,num)
...@@ -591,6 +594,7 @@ def get_content3(): ...@@ -591,6 +594,7 @@ def get_content3():
# 判断是否已经爬取过 # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
title = doc_item('a').attr('title') title = doc_item('a').attr('title')
...@@ -612,6 +616,7 @@ def get_content3(): ...@@ -612,6 +616,7 @@ def get_content3():
def bei_jing(): def bei_jing():
num = 0 num = 0
start_time = time.time() start_time = time.time()
pathType = 'policy/beijing/'
# 有反爬需要使用selenium # 有反爬需要使用selenium
# service = Service(r'D:/chrome/113/chromedriver.exe') # service = Service(r'D:/chrome/113/chromedriver.exe')
# 配置selenium # 配置selenium
...@@ -664,6 +669,7 @@ def bei_jing(): ...@@ -664,6 +669,7 @@ def bei_jing():
# 判断是否已经爬取过 # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href[0]}) is_href = db_storage.find_one({'网址': href[0]})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
# 对获取信息页面发送请求 # 对获取信息页面发送请求
...@@ -712,7 +718,7 @@ def bei_jing(): ...@@ -712,7 +718,7 @@ def bei_jing():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1667') retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -721,7 +727,7 @@ def bei_jing(): ...@@ -721,7 +727,7 @@ def bei_jing():
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -754,7 +760,7 @@ def bei_jing(): ...@@ -754,7 +760,7 @@ def bei_jing():
# id_list.append(id) # id_list.append(id)
num += 1 num += 1
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
bro.quit() bro.quit()
except Exception as e: except Exception as e:
log.info(e) log.info(e)
...@@ -763,6 +769,7 @@ def bei_jing(): ...@@ -763,6 +769,7 @@ def bei_jing():
# 内蒙古 # 内蒙古
def nei_meng_gu(): def nei_meng_gu():
start = time.time() start = time.time()
pathType = 'policy/neimenggu/'
num = 0 num = 0
url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html' url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
try: try:
...@@ -780,6 +787,7 @@ def nei_meng_gu(): ...@@ -780,6 +787,7 @@ def nei_meng_gu():
# todo:测试用 注释掉判重 # todo:测试用 注释掉判重
is_href = db_storage.find_one({'网址': real_href}) is_href = db_storage.find_one({'网址': real_href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# 获取所需信息 # 获取所需信息
...@@ -831,16 +839,16 @@ def nei_meng_gu(): ...@@ -831,16 +839,16 @@ def nei_meng_gu():
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1] fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re fu_jian_href = fu_jian_re
# print(fu_jian_href)
# todo:附件上传至文件服务器 # todo:附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1669') retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,title)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num) att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
id_list.append(att_id) id_list.append(att_id)
# # todo:将返回的地址更新到soup
# fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
print(title) print(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -881,6 +889,7 @@ def nei_meng_gu(): ...@@ -881,6 +889,7 @@ def nei_meng_gu():
# 吉林 # 吉林
def ji_lin(): def ji_lin():
pathType = 'policy/jilin/'
start = time.time() start = time.time()
num = 0 num = 0
url = 'http://gzw.jl.gov.cn/zwgk/zcwj/' url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
...@@ -902,6 +911,7 @@ def ji_lin(): ...@@ -902,6 +911,7 @@ def ji_lin():
title = a.find('a').text.replace('\n', '') title = a.find('a').text.replace('\n', '')
is_href = db_storage.find_one({'网址': real_href}) is_href = db_storage.find_one({'网址': real_href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html' # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
...@@ -972,16 +982,17 @@ def ji_lin(): ...@@ -972,16 +982,17 @@ def ji_lin():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = fu_jian_href.text.strip() file_name = fu_jian_href.text.strip()
retData = baseCore.uploadToserver(fu_jian_href, '1670') # print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
#
# todo:将返回的地址更新到soup # # todo:将返回的地址更新到soup
li.find('a')['href'] = 'http://114.115.215.96/' + full_path li.find('a')['href'] = full_path
else: else:
continue continue
else: else:
...@@ -1009,16 +1020,17 @@ def ji_lin(): ...@@ -1009,16 +1020,17 @@ def ji_lin():
if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \ if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
retData = baseCore.uploadToserver(fj_href, '1670') # print(fj_href)
retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
#
# todo:将返回的地址更新到soup # # todo:将返回的地址更新到soup
fu_jian_href['href'] = 'http://114.115.215.96/' + full_path fu_jian_href['href'] = full_path
else: else:
continue continue
...@@ -1062,7 +1074,7 @@ def ji_lin(): ...@@ -1062,7 +1074,7 @@ def ji_lin():
save_data(dic_news) save_data(dic_news)
num = num + 1 num = num + 1
except Exception as e: except Exception as e:
print(e) log.info(e)
pass pass
except: except:
pass pass
...@@ -1073,6 +1085,7 @@ def ji_lin(): ...@@ -1073,6 +1085,7 @@ def ji_lin():
def shang_hai(): def shang_hai():
start = time.time() start = time.time()
pathType = 'policy/shanghai/'
num = 0 num = 0
for page in range(1, 7): for page in range(1, 7):
...@@ -1095,6 +1108,7 @@ def shang_hai(): ...@@ -1095,6 +1108,7 @@ def shang_hai():
href = 'https://www.gzw.sh.gov.cn' + href href = 'https://www.gzw.sh.gov.cn' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html' href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
...@@ -1154,7 +1168,7 @@ def shang_hai(): ...@@ -1154,7 +1168,7 @@ def shang_hai():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
retData = baseCore.uploadToserver(fu_jian_href, '1671') retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1163,7 +1177,7 @@ def shang_hai(): ...@@ -1163,7 +1177,7 @@ def shang_hai():
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
a['href'] = 'http://114.115.215.96/' + full_path a['href'] = full_path
else: else:
continue continue
...@@ -1205,6 +1219,7 @@ def shang_hai(): ...@@ -1205,6 +1219,7 @@ def shang_hai():
# 浙江 # 浙江
def zhe_jiang(): def zhe_jiang():
start = time.time() start = time.time()
pathType = 'policy/zhejiang/'
num = 0 num = 0
url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html' url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
try: try:
...@@ -1227,6 +1242,7 @@ def zhe_jiang(): ...@@ -1227,6 +1242,7 @@ def zhe_jiang():
href = 'http://gzw.zj.gov.cn/' + href href = 'http://gzw.zj.gov.cn/' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -1325,6 +1341,7 @@ def zhe_jiang(): ...@@ -1325,6 +1341,7 @@ def zhe_jiang():
# 福建 # 福建
def fu_jian(): def fu_jian():
error_tag = str(404) error_tag = str(404)
pathType = 'policy/fujian/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/' url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
...@@ -1373,6 +1390,7 @@ def fu_jian(): ...@@ -1373,6 +1390,7 @@ def fu_jian():
# print(real_href) # print(real_href)
is_href = db_storage.find_one({'网址': real_href}) is_href = db_storage.find_one({'网址': real_href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# 文章是远程pdf # 文章是远程pdf
...@@ -1384,7 +1402,7 @@ def fu_jian(): ...@@ -1384,7 +1402,7 @@ def fu_jian():
content = baseCore.pdf_content(resp_content) content = baseCore.pdf_content(resp_content)
contentwithtag = '' contentwithtag = ''
# 文件上传至服务器 # 文件上传至服务器
retData = baseCore.uploadToserver(real_href, '1673') retData = baseCore.uptoOBS(real_href, '1673',pathType,title)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1420,7 +1438,7 @@ def fu_jian(): ...@@ -1420,7 +1438,7 @@ def fu_jian():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# 找到附件后 上传至文件服务器 # 找到附件后 上传至文件服务器
retData = baseCore.uploadToserver(fj_href, '1673') retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1428,7 +1446,7 @@ def fu_jian(): ...@@ -1428,7 +1446,7 @@ def fu_jian():
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text) source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip() pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
...@@ -1499,6 +1517,7 @@ def shan_dong(): ...@@ -1499,6 +1517,7 @@ def shan_dong():
href = li.find('a')['href'] href = li.find('a')['href']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -1593,6 +1612,7 @@ def shan_dong(): ...@@ -1593,6 +1612,7 @@ def shan_dong():
# 广东 # 广东
def guang_dong(): def guang_dong():
start = time.time() start = time.time()
pathType = 'policy/guangdong/'
num = 0 num = 0
url = 'http://gzw.gd.gov.cn/zcfg/index.html' url = 'http://gzw.gd.gov.cn/zcfg/index.html'
try: try:
...@@ -1620,6 +1640,7 @@ def guang_dong(): ...@@ -1620,6 +1640,7 @@ def guang_dong():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -1644,7 +1665,7 @@ def guang_dong(): ...@@ -1644,7 +1665,7 @@ def guang_dong():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fj_href, '1676') retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1652,7 +1673,7 @@ def guang_dong(): ...@@ -1652,7 +1673,7 @@ def guang_dong():
att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -1692,6 +1713,7 @@ def guang_dong(): ...@@ -1692,6 +1713,7 @@ def guang_dong():
# 海南 # 海南
def hai_nan(): def hai_nan():
pathType = 'policy/hainan/'
def hai_nan1(): def hai_nan1():
# 部门文件 # 部门文件
num = 0 num = 0
...@@ -1717,6 +1739,7 @@ def hai_nan(): ...@@ -1717,6 +1739,7 @@ def hai_nan():
href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/') href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
try: try:
...@@ -1759,7 +1782,7 @@ def hai_nan(): ...@@ -1759,7 +1782,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1767,7 +1790,7 @@ def hai_nan(): ...@@ -1767,7 +1790,7 @@ def hai_nan():
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
try: try:
# print(href) # print(href)
...@@ -1801,7 +1824,7 @@ def hai_nan(): ...@@ -1801,7 +1824,7 @@ def hai_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# print(f'----附件:{fu_jian_href}-----filename:{file_name}') # print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1809,7 +1832,7 @@ def hai_nan(): ...@@ -1809,7 +1832,7 @@ def hai_nan():
# 更新到数据库 # 更新到数据库
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
...@@ -1888,6 +1911,7 @@ def hai_nan(): ...@@ -1888,6 +1911,7 @@ def hai_nan():
# print(title,href) # print(title,href)
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -1959,6 +1983,7 @@ def hai_nan(): ...@@ -1959,6 +1983,7 @@ def hai_nan():
# print(title,href) # print(title,href)
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -2007,7 +2032,7 @@ def hai_nan(): ...@@ -2007,7 +2032,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2015,7 +2040,7 @@ def hai_nan(): ...@@ -2015,7 +2040,7 @@ def hai_nan():
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
# print(f'附件:{fu_jian_href}') # print(f'附件:{fu_jian_href}')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -2065,6 +2090,7 @@ def hai_nan(): ...@@ -2065,6 +2090,7 @@ def hai_nan():
# print(title,href) # print(title,href)
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -2113,14 +2139,14 @@ def hai_nan(): ...@@ -2113,14 +2139,14 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
print(f'----附件:{fu_jian_href}') print(f'----附件:{fu_jian_href}')
else: else:
pass pass
...@@ -2175,10 +2201,13 @@ def hai_nan(): ...@@ -2175,10 +2201,13 @@ def hai_nan():
try: try:
is_href = db_storage.find_one({'网址': i_href}) is_href = db_storage.find_one({'网址': i_href})
if is_href: if is_href:
num+=1
continue continue
if i_href == 'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm': if i_href == 'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm':
num+=1
continue continue
if i_href == 'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm': if i_href == 'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm':
num+=1
continue continue
# print(f'中央----{i_href}----') # print(f'中央----{i_href}----')
href_text = requests.get(url=i_href, headers=headers, verify=False) href_text = requests.get(url=i_href, headers=headers, verify=False)
...@@ -2330,6 +2359,7 @@ def hai_nan(): ...@@ -2330,6 +2359,7 @@ def hai_nan():
# 四川 # 四川
def si_chuan(): def si_chuan():
num = 0 num = 0
pathType = 'policy/sichuan/'
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
if page == 1: if page == 1:
...@@ -2349,9 +2379,10 @@ def si_chuan(): ...@@ -2349,9 +2379,10 @@ def si_chuan():
href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href') href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
print(href) # print(href)
href_text = requests.get(url=href, headers=headers, verify=False).text href_text = requests.get(url=href, headers=headers, verify=False).text
doc_href = pq(href_text) doc_href = pq(href_text)
title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '') title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '')
...@@ -2374,14 +2405,14 @@ def si_chuan(): ...@@ -2374,14 +2405,14 @@ def si_chuan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 对附件上传至文件服务器 # 对附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1678') retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
if retData['stste']: if retData['stste']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
# fu_jian_href_list.append(fu_jian_href) # fu_jian_href_list.append(fu_jian_href)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -2423,6 +2454,7 @@ def si_chuan(): ...@@ -2423,6 +2454,7 @@ def si_chuan():
# 广西 # 广西
def guang_xi(): def guang_xi():
num = 0 num = 0
pathType = 'policy/guangxi/'
start_time = time.time() start_time = time.time()
url_all = """ url_all = """
http://gzw.gxzf.gov.cn/wjzx/2023nwj/ 1 http://gzw.gxzf.gov.cn/wjzx/2023nwj/ 1
...@@ -2463,6 +2495,7 @@ def guang_xi(): ...@@ -2463,6 +2495,7 @@ def guang_xi():
href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '') href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -2498,7 +2531,7 @@ def guang_xi(): ...@@ -2498,7 +2531,7 @@ def guang_xi():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1692') retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2507,7 +2540,7 @@ def guang_xi(): ...@@ -2507,7 +2540,7 @@ def guang_xi():
att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -2550,6 +2583,7 @@ def gui_zhou(): ...@@ -2550,6 +2583,7 @@ def gui_zhou():
http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ 11 http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ 11
http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/ 1 http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/ 1
""" """
pathType = 'policy/guizhou/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(0, 11): for page in range(0, 11):
...@@ -2566,6 +2600,7 @@ def gui_zhou(): ...@@ -2566,6 +2600,7 @@ def gui_zhou():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -2606,7 +2641,7 @@ def gui_zhou(): ...@@ -2606,7 +2641,7 @@ def gui_zhou():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1694') retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2615,7 +2650,7 @@ def gui_zhou(): ...@@ -2615,7 +2650,7 @@ def gui_zhou():
att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -2655,6 +2690,7 @@ def gui_zhou(): ...@@ -2655,6 +2690,7 @@ def gui_zhou():
# 云南 # 云南
def yun_nan(): def yun_nan():
pathType = 'policy/yunnan/'
def yun_nan1(): def yun_nan1():
""" """
http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml 9 http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml 9
...@@ -2679,6 +2715,7 @@ def yun_nan(): ...@@ -2679,6 +2715,7 @@ def yun_nan():
href = 'http://gzw.yn.gov.cn' + doc_item('a').attr('href') href = 'http://gzw.yn.gov.cn' + doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
fu_jian_href_list = [] fu_jian_href_list = []
...@@ -2710,7 +2747,7 @@ def yun_nan(): ...@@ -2710,7 +2747,7 @@ def yun_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try: try:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1679') retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2719,7 +2756,7 @@ def yun_nan(): ...@@ -2719,7 +2756,7 @@ def yun_nan():
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
href_resp.close() href_resp.close()
...@@ -2788,6 +2825,7 @@ def yun_nan(): ...@@ -2788,6 +2825,7 @@ def yun_nan():
href = 'http://gzw.yn.gov.cn' + li.find('a').get('href').replace(' ', '') href = 'http://gzw.yn.gov.cn' + li.find('a').get('href').replace(' ', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
print(href) print(href)
...@@ -2822,7 +2860,7 @@ def yun_nan(): ...@@ -2822,7 +2860,7 @@ def yun_nan():
print(fu_jian_href) print(fu_jian_href)
try: try:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1679') retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2831,7 +2869,7 @@ def yun_nan(): ...@@ -2831,7 +2869,7 @@ def yun_nan():
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
res_.close() res_.close()
...@@ -2890,6 +2928,7 @@ def chong_qing(): ...@@ -2890,6 +2928,7 @@ def chong_qing():
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2 http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
""" """
num = 0 num = 0
pathType = 'policy/chongqing/'
start_time = time.time() start_time = time.time()
for page in range(0, 4): for page in range(0, 4):
if page == 0: if page == 0:
...@@ -2913,6 +2952,7 @@ def chong_qing(): ...@@ -2913,6 +2952,7 @@ def chong_qing():
href = url.split('index')[0] + title_item('a').attr('href').replace('./', '') href = url.split('index')[0] + title_item('a').attr('href').replace('./', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
print(href) print(href)
...@@ -2960,7 +3000,7 @@ def chong_qing(): ...@@ -2960,7 +3000,7 @@ def chong_qing():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try: try:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1693') retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2969,7 +3009,7 @@ def chong_qing(): ...@@ -2969,7 +3009,7 @@ def chong_qing():
att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -3011,6 +3051,7 @@ def chong_qing(): ...@@ -3011,6 +3051,7 @@ def chong_qing():
# 天津 # 天津
def tian_jin(): def tian_jin():
pathType = 'policy/tianjin/'
def tian_jin1(): def tian_jin1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -3038,6 +3079,7 @@ def tian_jin(): ...@@ -3038,6 +3079,7 @@ def tian_jin():
href = i_href href = i_href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8') # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
...@@ -3082,7 +3124,7 @@ def tian_jin(): ...@@ -3082,7 +3124,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1683') retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3090,7 +3132,7 @@ def tian_jin(): ...@@ -3090,7 +3132,7 @@ def tian_jin():
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3160,6 +3202,7 @@ def tian_jin(): ...@@ -3160,6 +3202,7 @@ def tian_jin():
href = url.split('index')[0] + href.replace('./', '') href = url.split('index')[0] + href.replace('./', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8') # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
...@@ -3205,7 +3248,7 @@ def tian_jin(): ...@@ -3205,7 +3248,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1683') retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3213,7 +3256,7 @@ def tian_jin(): ...@@ -3213,7 +3256,7 @@ def tian_jin():
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3284,6 +3327,7 @@ def tian_jin(): ...@@ -3284,6 +3327,7 @@ def tian_jin():
href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/') href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
...@@ -3332,7 +3376,7 @@ def tian_jin(): ...@@ -3332,7 +3376,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1683') retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3340,7 +3384,7 @@ def tian_jin(): ...@@ -3340,7 +3384,7 @@ def tian_jin():
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3388,6 +3432,7 @@ def tian_jin(): ...@@ -3388,6 +3432,7 @@ def tian_jin():
# 新疆 # 新疆
def xin_jiang(): def xin_jiang():
pathType = 'policy/xinjiang/'
def xin_jiang1(): def xin_jiang1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -3407,6 +3452,7 @@ def xin_jiang(): ...@@ -3407,6 +3452,7 @@ def xin_jiang():
continue continue
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
# href = 'http://gzw.xinjiang.gov.cn/gzw/zcwj/201909/559cf77b5a954d028bd187d6c6e46747.shtml' # href = 'http://gzw.xinjiang.gov.cn/gzw/zcwj/201909/559cf77b5a954d028bd187d6c6e46747.shtml'
try: try:
...@@ -3432,7 +3478,7 @@ def xin_jiang(): ...@@ -3432,7 +3478,7 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1682') retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3440,7 +3486,7 @@ def xin_jiang(): ...@@ -3440,7 +3486,7 @@ def xin_jiang():
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3509,6 +3555,7 @@ def xin_jiang(): ...@@ -3509,6 +3555,7 @@ def xin_jiang():
href = 'http://gyzc.xjbt.gov.cn' + href href = 'http://gyzc.xjbt.gov.cn' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
...@@ -3530,7 +3577,7 @@ def xin_jiang(): ...@@ -3530,7 +3577,7 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1682') retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3538,7 +3585,7 @@ def xin_jiang(): ...@@ -3538,7 +3585,7 @@ def xin_jiang():
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3594,6 +3641,7 @@ def xin_jiang(): ...@@ -3594,6 +3641,7 @@ def xin_jiang():
# 山西 # 山西
def shan_xi(): def shan_xi():
pathType = 'policy/shanxi/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(1, 7): for page in range(1, 7):
...@@ -3618,6 +3666,7 @@ def shan_xi(): ...@@ -3618,6 +3666,7 @@ def shan_xi():
publishDate = tr.xpath('./td[2]/span/text()')[0] publishDate = tr.xpath('./td[2]/span/text()')[0]
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
if ".pdf" in href: if ".pdf" in href:
...@@ -3648,7 +3697,7 @@ def shan_xi(): ...@@ -3648,7 +3697,7 @@ def shan_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1684') retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3656,7 +3705,7 @@ def shan_xi(): ...@@ -3656,7 +3705,7 @@ def shan_xi():
att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3707,6 +3756,7 @@ def shan_xi(): ...@@ -3707,6 +3756,7 @@ def shan_xi():
# 辽宁 # 辽宁
def liao_ning(): def liao_ning():
pathType = 'policy/liaoning/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
...@@ -3727,6 +3777,7 @@ def liao_ning(): ...@@ -3727,6 +3777,7 @@ def liao_ning():
href = 'https://gzw.ln.gov.cn/' + href href = 'https://gzw.ln.gov.cn/' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -3758,7 +3809,7 @@ def liao_ning(): ...@@ -3758,7 +3809,7 @@ def liao_ning():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1685') retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3766,7 +3817,7 @@ def liao_ning(): ...@@ -3766,7 +3817,7 @@ def liao_ning():
att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3816,6 +3867,7 @@ def liao_ning(): ...@@ -3816,6 +3867,7 @@ def liao_ning():
# 黑龙江 # 黑龙江
def hei_long_jiang(): def hei_long_jiang():
pathType = 'policy/heilongjiang/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
...@@ -3837,6 +3889,7 @@ def hei_long_jiang(): ...@@ -3837,6 +3889,7 @@ def hei_long_jiang():
pub_hao = '' pub_hao = ''
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
contentWithTag = text['data']['results'][row]['contentHtml'] contentWithTag = text['data']['results'][row]['contentHtml']
...@@ -3861,7 +3914,7 @@ def hei_long_jiang(): ...@@ -3861,7 +3914,7 @@ def hei_long_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1687') retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3869,7 +3922,7 @@ def hei_long_jiang(): ...@@ -3869,7 +3922,7 @@ def hei_long_jiang():
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -3912,6 +3965,7 @@ def hei_long_jiang(): ...@@ -3912,6 +3965,7 @@ def hei_long_jiang():
# 江苏 # 江苏
def jiang_su(): def jiang_su():
num = 0 num = 0
pathType = 'policy/jiangsu/'
start_time = time.time() start_time = time.time()
pagestart = 1 pagestart = 1
pageend = 45 pageend = 45
...@@ -3940,6 +3994,7 @@ def jiang_su(): ...@@ -3940,6 +3994,7 @@ def jiang_su():
title = a.text title = a.text
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -3967,7 +4022,7 @@ def jiang_su(): ...@@ -3967,7 +4022,7 @@ def jiang_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1687') retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3975,7 +4030,7 @@ def jiang_su(): ...@@ -3975,7 +4030,7 @@ def jiang_su():
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4022,6 +4077,7 @@ def jiang_su(): ...@@ -4022,6 +4077,7 @@ def jiang_su():
# 安徽 # 安徽
def an_hui(): def an_hui():
pathType = 'policy/anhui/'
def an_hui1(): def an_hui1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -4037,6 +4093,7 @@ def an_hui(): ...@@ -4037,6 +4093,7 @@ def an_hui():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -4068,7 +4125,7 @@ def an_hui(): ...@@ -4068,7 +4125,7 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1688') retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4076,7 +4133,7 @@ def an_hui(): ...@@ -4076,7 +4133,7 @@ def an_hui():
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4164,7 +4221,7 @@ def an_hui(): ...@@ -4164,7 +4221,7 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1688') retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4172,7 +4229,7 @@ def an_hui(): ...@@ -4172,7 +4229,7 @@ def an_hui():
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4223,6 +4280,7 @@ def jiang_xi(): ...@@ -4223,6 +4280,7 @@ def jiang_xi():
121-164 121-164
""" """
num = 0 num = 0
pathType = 'policy/jiangxi/'
start_time = time.time() start_time = time.time()
startrecord = 1 startrecord = 1
endrecord = 60 endrecord = 60
...@@ -4248,6 +4306,7 @@ def jiang_xi(): ...@@ -4248,6 +4306,7 @@ def jiang_xi():
for href in href_list: for href in href_list:
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
...@@ -4289,7 +4348,7 @@ def jiang_xi(): ...@@ -4289,7 +4348,7 @@ def jiang_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1689') retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4297,7 +4356,7 @@ def jiang_xi(): ...@@ -4297,7 +4356,7 @@ def jiang_xi():
att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4346,6 +4405,7 @@ def jiang_xi(): ...@@ -4346,6 +4405,7 @@ def jiang_xi():
# 河南 # 河南
def he_nan(): def he_nan():
num = 0 num = 0
pathType = 'policy/henan/'
start_time = time.time() start_time = time.time()
for page in range(0, 7): for page in range(0, 7):
if page == 0: if page == 0:
...@@ -4361,6 +4421,7 @@ def he_nan(): ...@@ -4361,6 +4421,7 @@ def he_nan():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
href_res.encoding = href_res.apparent_encoding href_res.encoding = href_res.apparent_encoding
...@@ -4383,7 +4444,7 @@ def he_nan(): ...@@ -4383,7 +4444,7 @@ def he_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1690') retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4391,7 +4452,7 @@ def he_nan(): ...@@ -4391,7 +4452,7 @@ def he_nan():
att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4438,6 +4499,7 @@ def he_nan(): ...@@ -4438,6 +4499,7 @@ def he_nan():
# 湖南 # 湖南
def hu_nan(): def hu_nan():
num = 0 num = 0
pathType = 'policy/hunan/'
start_time = time.time() start_time = time.time()
for page in range(1, 7): for page in range(1, 7):
if page == 1: if page == 1:
...@@ -4454,6 +4516,7 @@ def hu_nan(): ...@@ -4454,6 +4516,7 @@ def hu_nan():
publishDate = doc_item('td:nth-child(3)').text() publishDate = doc_item('td:nth-child(3)').text()
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
# href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html' # href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html'
try: try:
...@@ -4490,7 +4553,7 @@ def hu_nan(): ...@@ -4490,7 +4553,7 @@ def hu_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1691') retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4498,7 +4561,7 @@ def hu_nan(): ...@@ -4498,7 +4561,7 @@ def hu_nan():
att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4538,6 +4601,7 @@ def hu_nan(): ...@@ -4538,6 +4601,7 @@ def hu_nan():
# 甘肃 # 甘肃
def gan_su(): def gan_su():
pathType = 'policy/gansu/'
def gan_su1(): def gan_su1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -4581,6 +4645,7 @@ def gan_su(): ...@@ -4581,6 +4645,7 @@ def gan_su():
publishDate = dd['publishDate'] publishDate = dd['publishDate']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
for i in range(0, 4): for i in range(0, 4):
bro.get(href) bro.get(href)
...@@ -4609,7 +4674,7 @@ def gan_su(): ...@@ -4609,7 +4674,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1696') retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4617,7 +4682,7 @@ def gan_su(): ...@@ -4617,7 +4682,7 @@ def gan_su():
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4688,6 +4753,7 @@ def gan_su(): ...@@ -4688,6 +4753,7 @@ def gan_su():
publishDate = dd['publishDate'] publishDate = dd['publishDate']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
bro.get(href) bro.get(href)
try: try:
...@@ -4743,7 +4809,7 @@ def gan_su(): ...@@ -4743,7 +4809,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1696') retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4751,7 +4817,7 @@ def gan_su(): ...@@ -4751,7 +4817,7 @@ def gan_su():
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4849,6 +4915,7 @@ def gan_su(): ...@@ -4849,6 +4915,7 @@ def gan_su():
publishDate = dd['publishDate'] publishDate = dd['publishDate']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
bro.get(href) bro.get(href)
...@@ -4900,7 +4967,7 @@ def gan_su(): ...@@ -4900,7 +4967,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1696') retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4908,7 +4975,7 @@ def gan_su(): ...@@ -4908,7 +4975,7 @@ def gan_su():
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4958,6 +5025,7 @@ def gan_su(): ...@@ -4958,6 +5025,7 @@ def gan_su():
# 宁夏 # 宁夏
def ning_xia(): def ning_xia():
num = 0 num = 0
pathType = 'policy/ningxia/'
start_time = time.time() start_time = time.time()
for page in range(0, 3): for page in range(0, 3):
if page == 0: if page == 0:
...@@ -4976,6 +5044,7 @@ def ning_xia(): ...@@ -4976,6 +5044,7 @@ def ning_xia():
publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
...@@ -5001,7 +5070,7 @@ def ning_xia(): ...@@ -5001,7 +5070,7 @@ def ning_xia():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1697') retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5009,7 +5078,7 @@ def ning_xia(): ...@@ -5009,7 +5078,7 @@ def ning_xia():
att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -5052,6 +5121,7 @@ def ning_xia(): ...@@ -5052,6 +5121,7 @@ def ning_xia():
# 陕西 # 陕西
def shanxi(): def shanxi():
num = 0 num = 0
pathType = 'policy/shan_xi/'
start_time = time.time() start_time = time.time()
url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127' url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
# url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127' # url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
...@@ -5072,6 +5142,7 @@ def shanxi(): ...@@ -5072,6 +5142,7 @@ def shanxi():
href = 'https://sxgz.shaanxi.gov.cn/' + href href = 'https://sxgz.shaanxi.gov.cn/' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
res_href = requests.get(url=href, headers=headers) res_href = requests.get(url=href, headers=headers)
...@@ -5101,7 +5172,7 @@ def shanxi(): ...@@ -5101,7 +5172,7 @@ def shanxi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1680') retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5109,7 +5180,7 @@ def shanxi(): ...@@ -5109,7 +5180,7 @@ def shanxi():
att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -5152,6 +5223,7 @@ def shanxi(): ...@@ -5152,6 +5223,7 @@ def shanxi():
# 西藏 # 西藏
def xi_zang(): def xi_zang():
start_time = time.time() start_time = time.time()
pathType = 'policy/xizang/'
url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml', url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml',
'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ] 'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
for url in url_list: for url in url_list:
...@@ -5169,6 +5241,7 @@ def xi_zang(): ...@@ -5169,6 +5241,7 @@ def xi_zang():
title = li.find('a').text title = li.find('a').text
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
res_href = requests.get(url=href, headers=headers) res_href = requests.get(url=href, headers=headers)
...@@ -5194,7 +5267,7 @@ def xi_zang(): ...@@ -5194,7 +5267,7 @@ def xi_zang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1695') retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5202,7 +5275,7 @@ def xi_zang(): ...@@ -5202,7 +5275,7 @@ def xi_zang():
att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器 # todo:替换完成之后,将附件上传至文件服务器
...@@ -5242,6 +5315,7 @@ def xi_zang(): ...@@ -5242,6 +5315,7 @@ def xi_zang():
# 青海 # 青海
def qing_hai(): def qing_hai():
pathType = 'policy/qinghai/'
def qing_hai1(): def qing_hai1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -5259,6 +5333,7 @@ def qing_hai(): ...@@ -5259,6 +5333,7 @@ def qing_hai():
durl = tr.find('a').get('href') durl = tr.find('a').get('href')
is_href = db_storage.find_one({'网址': durl}) is_href = db_storage.find_one({'网址': durl})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
title = tr.find('a').text title = tr.find('a').text
...@@ -5297,7 +5372,7 @@ def qing_hai(): ...@@ -5297,7 +5372,7 @@ def qing_hai():
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器 # todo:替换完成之后,将附件上传至文件服务器
...@@ -5659,41 +5734,41 @@ def hu_bei(): ...@@ -5659,41 +5734,41 @@ def hu_bei():
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__': if __name__ == '__main__':
get_content1() # get_content1()
get_content2() # get_content2()
get_content3() # get_content3()
bei_jing() # bei_jing()
nei_meng_gu() # nei_meng_gu()
ji_lin() ji_lin()
shang_hai() # shang_hai()
zhe_jiang() # zhe_jiang()
fu_jian() # fu_jian()
shan_dong() # shan_dong()
guang_dong() # guang_dong()
hai_nan() # hai_nan()
si_chuan() # si_chuan()
guang_xi() # guang_xi()
gui_zhou() # gui_zhou()
yun_nan() # yun_nan()
chong_qing() # chong_qing()
tian_jin() # tian_jin()
xin_jiang() # xin_jiang()
shan_xi() # shan_xi()
liao_ning() # liao_ning()
hei_long_jiang() # hei_long_jiang()
jiang_su() # jiang_su()
an_hui() # an_hui()
jiang_xi() # jiang_xi()
he_nan() # he_nan()
hu_nan() # hu_nan()
gan_su() # gan_su()
ning_xia() # ning_xia()
xi_zang() # xi_zang()
shanxi() # shanxi()
qing_hai() # qing_hai()
he_bei() # he_bei()
qing_hai() # qing_hai()
current_time = datetime.datetime.now() # current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1) # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds() # sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds) # time.sleep(sleep_seconds)
/**
 * Build a random alphanumeric string.
 *
 * @param {number} size Number of characters to generate.
 * @returns {string} A string of `size` characters drawn from 0-9, a-z, A-Z
 *                   (empty when size is 0).
 */
function r(size){
    var alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    var str = "";
    for (var i = 0; i < size; i++) {
        // Math.floor over the full alphabet length keeps every character
        // equally likely; Math.round over (length - 1) gave the first and
        // last characters only half the probability of the others.
        str += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
    }
    return str;
}
/**
 * Convert a string to the base-2 representation of its UTF-16 code units.
 *
 * @param {string} str Text to convert.
 * @returns {string} The binary form of each code unit, separated by single
 *                   spaces (empty string for empty input).
 */
function strTobinary(str) {
    var chars = str.split("");
    var pieces = [];
    for (var idx = 0; idx < chars.length; idx += 1) {
        pieces[idx] = chars[idx].charCodeAt(0).toString(2);
    }
    // Joining with a space replaces pushing an explicit " " between items.
    return pieces.join(" ");
}
/**
 * Produce the random 24-character salt used as the encryption key.
 *
 * Only the salt is returned. Earlier revisions also computed a timestamp and
 * a yyyymmdd IV here, but those values were never used or returned (and the
 * month branch was off by one for October-December), so they were removed.
 * Callers have to build the IV themselves.
 *
 * @returns {string} A 24-character alphanumeric salt.
 */
function cipher() {
    return r(24);
}
/**
 * Assemble the request ciphertext from its three parts.
 *
 * The parts are concatenated as salt + iv + enc and the result is rendered
 * as space-separated binary by strTobinary. `enc` is expected to be computed
 * by the caller; this function performs no encryption itself.
 *
 * @param {string} salt Key material.
 * @param {string} iv   Initialisation vector string.
 * @param {string} enc  Already-encrypted payload.
 * @returns {string} Binary-text encoding of the concatenation.
 */
function des(salt,iv,enc) {
    return strTobinary(salt + iv + enc);
}
/**
 * Generate a random 24-character alphanumeric request token.
 *
 * Delegates to r() instead of carrying a second copy of the same loop and
 * alphabet.
 *
 * @returns {string} A 24-character string drawn from 0-9, a-z, A-Z.
 */
function token(){
    return r(24);
}
/**
 * Generate a random 32-character alphanumeric page id.
 *
 * @returns {string} 32 characters drawn uniformly from A-Z, a-z, 0-9.
 */
function pageid() {
    var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    var wanted = 32;
    var picked = [];
    while (picked.length < wanted) {
        var position = Math.floor(Math.random() * alphabet.length);
        picked.push(alphabet.charAt(position));
    }
    return picked.join("");
}
// console.log(cipher());
\ No newline at end of file
import base64
import base64
import json
import random
import time
import execjs
import requests
import urllib3
from Crypto.Cipher import DES3
from base.BaseCore import BaseCore
# Silence urllib3's InsecureRequestWarning (getDoc posts with verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Shared project helper; the logger, DB connection and cursor below all come from it.
baseCore=BaseCore()
log=baseCore.getLogger()
# Module-wide database connection and cursor used by every SQL helper in this file.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Save an error record
def insertBadSql(error):
    """Insert one row into ``cpwsw_log`` and commit.

    ``error`` is an iterable of six values, in order: code, description,
    success, user, keyword, msg. ``create_time`` is filled in by the
    database with ``now()``.
    """
    params = tuple(error)
    cursor_.execute(
        "insert into cpwsw_log (code,description,success,create_time,user,keyword,msg) "
        "values (%s,%s,%s,now(),%s,%s,%s)",
        params,
    )
    cnx_.commit()
# Cookie bookkeeping
def updateCookie(cookie,type):
    """Update the ``cpwsw_user`` row matching ``cookie`` and commit.

    ``type`` selects the action:
      1 - normal use: refresh ``update_time``
      2 - session expired: delete the row
      3 - unknown failure: stamp ``fenghao_time``
    Any other value runs no statement; the commit is still issued.
    """
    statements = {
        1: "update cpwsw_user set update_time=now() where cookie=%s",
        2: "delete from cpwsw_user where cookie=%s",
        3: "update cpwsw_user set fenghao_time=now() where cookie=%s",
    }
    sql = statements.get(type)
    if sql is not None:
        cursor_.execute(sql, [cookie])
    cnx_.commit()
# DES3 encryption/decryption wrapped in a class
class EncryptDate:
    """Triple-DES (CBC mode) helper with base64 text encoding.

    ``pianyi`` is the IV as a string (encoded to bytes as UTF-8) and ``key``
    is the secret key handed to ``DES3.new``.

    A CBC cipher object is stateful and cannot switch between encrypting and
    decrypting, so every ``encrypt``/``decrypt`` call builds its own cipher
    from the stored key and IV. This makes an instance safe to reuse for
    several independent messages in either direction.
    """

    def __init__(self, pianyi, key):
        self.key = key  # secret key
        self.iv = bytes(pianyi, encoding='utf8')  # initialisation vector
        self.length = DES3.block_size  # cipher block size in bytes
        # Kept as an attribute for backward compatibility; creating it here
        # also surfaces an invalid key/IV at construction time.
        self.des3 = self._new_cipher()

    def _new_cipher(self):
        """Return a fresh DES3-CBC cipher for the stored key and IV."""
        return DES3.new(self.key, DES3.MODE_CBC, self.iv)

    @staticmethod
    def unpad(date):
        """Strip the padding: the last character's code point is the pad length."""
        return date[0:-ord(date[-1])]

    def pad(self, text):
        """Pad ``text`` so its UTF-8 byte length is a multiple of the block size.

        The pad character is ``chr(n)`` repeated ``n`` times, where ``n`` is
        the number of bytes missing (a full block when already aligned).
        """
        count = len(text.encode('utf-8'))
        add = self.length - (count % self.length)
        return text + (chr(add) * add)

    def encrypt(self, encrData):
        """Pad and encrypt ``encrData``; return the ciphertext as base64 text."""
        res = self._new_cipher().encrypt(self.pad(encrData).encode("utf8"))
        return str(base64.b64encode(res), encoding="utf8")

    def decrypt(self, decrData):
        """Decode base64 ``decrData``, decrypt it and return the unpadded text."""
        res = base64.decodebytes(decrData.encode("utf8"))
        msg = self._new_cipher().decrypt(res).decode("utf8")
        return self.unpad(msg)
# Read the companion JavaScript source; the path is relative to the working directory.
with open('裁判文书网.js', 'r', encoding='utf-8') as f:
    jstext = f.read()
# Compile the JS so its functions can be called from Python via ctx.call(...)
ctx = execjs.compile(jstext)
print("ok")
# Endpoint that both getDoc and getList post to.
url = 'https://wenshu.court.gov.cn/website/parse/rest.q4w'
# Fetch a login cookie
def getCookie():
    """Return one ``(user, cookie)`` row from ``cpwsw_user``, or ``False``.

    Only rows whose ``fenghao_time`` is more than two hours old are eligible;
    the row with the oldest ``update_time`` is picked.
    """
    cursor_.execute(
        "select user,cookie from cpwsw_user where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
    rows = cursor_.fetchall()
    if not rows:
        # no usable token found
        log.info("没有拿到token")
        return False
    return rows[0]
# Fetch the document body
def getDoc(info_id,userCookie):
    """Request one document by id and return its decrypted body.

    Args:
        info_id: document id, sent as ``docId``.
        userCookie: value for the ``Cookie`` request header.

    Returns:
        The decrypted ``result`` string, or ``""`` when the response code is
        not 1 or the payload cannot be decrypted.
    """
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': userCookie,
        'Host': 'wenshu.court.gov.cn',
        'Referer': 'https://wenshu.court.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    }
    salt = ctx.call('cipher')
    date_now = time.strftime("%Y%m%d",time.localtime())
    t = time.time()
    # IV is today's date, key is the random salt
    eg = EncryptDate(date_now,salt)
    des = eg.encrypt(str(t))  # DES3-encrypt the current timestamp
    ciphertext = ctx.call("des",salt,date_now,des)
    token = ctx.call("token")
    data_info = {
        'docId':info_id,
        'ciphertext':ciphertext,
        'cfg':'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch',
        '__RequestVerificationToken':token,
        'wh':'250',
        'ww':'1536',
        'cs':'0'
    }
    ip = baseCore.get_proxy()
    res_info = requests.post(url=url,headers=headers,data=data_info,proxies=ip, verify=False)
    # Observed failure payloads:
    # {'code': -12, 'description': None, 'secretKey': None, 'result': None, 'success': False}  -> bad SESSION value
    # {'code': 9, 'description': '没有权限请求接口,cfg=com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch', 'secretKey': None, 'result': None, 'success': False}
    # '{"code":1,"description":"权限已失效","secretKey":null,"result":null,"success":true}'
    payload = res_info.json()  # parse the body once and reuse it
    if payload["code"] != 1:
        log.error(f"正文获取失败:----{payload}")
        # the server did not return a normal response
        return ""
    try:
        eg_jie = EncryptDate(date_now,payload['secretKey'])
        res_jie = eg_jie.decrypt(payload['result'])  # DES3-decrypt the body
    except Exception as e:
        # Log before returning; the log call used to sit after the return
        # and was therefore never executed.
        log.error(f"正文获取失败:----{e}")
        return ""
    return res_jie
#
def insertCpwsList(keyword,page,list_info,userCookie):
    """Store the not-yet-seen entries of one result page in ``cpwsw_list``.

    For every entry that is not already stored for ``keyword``, the document
    body is fetched with ``getDoc`` and the row is inserted.

    Args:
        keyword: search keyword the page belongs to.
        page: page number (used for logging only).
        list_info: list of result dicts from the search response.
        userCookie: cookie used to fetch document bodies.

    Returns:
        ``True`` when paging should stop (empty page, or at least half of
        the entries were already stored), ``False`` otherwise.
    """
    listCount = 0
    repetCount = 0
    insertCount = 0
    for one_info in list_info:
        listCount = listCount + 1
        info_title = one_info['1']
        info_time = one_info['31']
        info_address = one_info['2']
        info_yuanyou = one_info['26']
        info_bianhao = one_info['7']
        info_id = one_info['rowkey']
        selectCountSql = "select count(1) from cpwsw_list where keyword=%s and rowkey=%s"
        cursor_.execute(selectCountSql,[keyword,info_id])
        count = cursor_.fetchone()[0]
        if count > 0:
            repetCount = repetCount + 1
            continue
        try:
            # fetch the document body
            log.info("开始采集正文")
            content = getDoc(info_id,userCookie)
            log.info("结束采集正文,开始休眠")
            time.sleep(random.randint(60, 180))
            if content=='':
                log.info("采集到的正文为空")
                continue
            insertSql = "insert into cpwsw_list (keyword,title,time,address,yuanyou,bianhao,rowkey,state,create_time,content) " \
                        "values (%s,%s,%s,%s,%s,%s,%s,0,now(),%s)"
            cursor_.execute(insertSql, [keyword,info_title,info_time,info_address,info_yuanyou,info_bianhao,info_id,content])
            cnx_.commit()
            # Count only rows that were really committed; counting before the
            # insert also included empty-body skips and failed inserts.
            insertCount = insertCount + 1
            updateCookie(userCookie, 1)
        except Exception as e:
            log.error(f"保存数据库失败:{e}")
    log.info(f"---{keyword}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
    if listCount == 0:
        # an empty list means there is nothing more to fetch
        return True
    if repetCount >= listCount / 2:
        # half or more duplicates: treat the keyword as finished
        return True
    # not finished yet
    return False
def getList(keyword,page):
    """Query one result page for ``keyword`` and store its entries.

    A cookie is taken from ``getCookie`` (polling every 60 seconds until one
    is available). When the server answers with a code other than 1, the
    failure is logged to the database, the cookie is stamped via
    ``updateCookie(..., 3)`` and the request is retried with a new cookie.
    The retry is a loop rather than a recursive call, so a long run of
    failing cookies cannot exhaust the interpreter's recursion limit.

    Args:
        keyword: search keyword.
        page: page number to request.

    Returns:
        The result of ``insertCpwsList`` for the page: ``True`` when paging
        should stop, ``False`` otherwise.
    """
    while True:
        userAndCookie = getCookie()
        if not userAndCookie:
            log.info("没有拿到token,开始递归")
            while True:
                log.info("没有拿到token,开始休眠")
                time.sleep(60)
                log.info("没有拿到token,结束休眠")
                userAndCookie = getCookie()
                if userAndCookie:
                    break
        user = userAndCookie[0]
        userCookie = userAndCookie[1]
        log.info(f"获取到user----{user}")
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': userCookie,
            'Host': 'wenshu.court.gov.cn',
            'Referer': 'https://wenshu.court.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        }
        salt = ctx.call('cipher')
        date_now = time.strftime("%Y%m%d", time.localtime())
        t = time.time()
        # IV is today's date, key is the random salt
        eg = EncryptDate(date_now, salt)
        des = eg.encrypt(str(t))  # DES3-encrypt the current timestamp
        ciphertext = ctx.call("des", salt, date_now, des)
        pageId = ctx.call("pageid")
        token = ctx.call("token")
        search_key = [{"key": "s21", "value": f"{keyword}"}]
        data = {
            'pageId':pageId,
            's21': keyword,
            'sortFields': 's51:desc',  # sort by judgment date
            'ciphertext': ciphertext,
            'pageNum': page,
            'pageSize': '5',
            # NOTE(review): str() of a list yields a Python repr (single
            # quotes), not JSON - confirm the endpoint accepts this form.
            'queryCondition': str(search_key),
            'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@queryDoc',
            '__RequestVerificationToken': token,
            'wh': '403',
            'ww': '1531',
            'cs': '0'
        }
        res = requests.post(url=url, headers=headers, data=data)
        payload = res.json()  # parse the body once and reuse it
        if payload["code"] != 1:
            # Abnormal response: record it, stamp the cookie, and retry
            # with whatever cookie getCookie hands out next.
            error = [payload["code"], payload["description"], payload["success"], user, keyword,'']
            insertBadSql(tuple(error))
            updateCookie(userCookie, 3)
            continue
        eg_jie = EncryptDate(date_now, payload['secretKey'])
        res_jie = eg_jie.decrypt(payload['result'])
        res_json = json.loads(res_jie)  # decrypted payload is JSON text
        list_info = res_json['queryResult']['resultList']
        return insertCpwsList(keyword, page,list_info,userCookie)
#
def doJob(keyword):
    """Crawl up to five result pages for ``keyword``.

    After every page the function sleeps for a random 60-180 seconds, then
    stops early if ``getList`` reported that paging is finished.
    """
    log.info(f"======{keyword}----开始采集=======")
    page = 1
    while page <= 5:
        finished = getList(keyword, page)
        time.sleep(random.randint(60,180))
        if finished:
            # done: stop paging for this keyword
            break
        # not finished: move on to the next page
        page += 1
    log.info(f"======{keyword}---------结束采集=======")
def test():
    """Placeholder; intentionally does nothing."""
    return None
if __name__=="__main__":
    # Pull keywords from the 'cpwsqy' redis queue until it is empty.
    # try/finally guarantees baseCore.close() runs even when a keyword's
    # crawl raises, so the shared resources are always released.
    try:
        while True:
            keyword = baseCore.redicPullData('cpwsqy')
            if keyword is None or keyword == 'None':
                log.info("redis已经没有数据了,重新放置数据")
                break
            doJob(keyword)
    finally:
        baseCore.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论