提交 d82da41e 作者: 丁双波

Merge remote-tracking branch 'origin/master'

...@@ -5,22 +5,18 @@ import socket ...@@ -5,22 +5,18 @@ import socket
import sys import sys
import time import time
import fitz
import logbook import logbook
import logbook.more import logbook.more
import pandas as pd import pandas as pd
import requests import requests
import zhconv import zhconv
import pymysql
import redis import redis
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid import langid
#创建连接池 #创建连接池
import pymysql import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB from DBUtils.PooledDB import PooledDB
# import sys # import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client') # sys.path.append('D://zzsn_spider//base//fdfs_client')
...@@ -28,6 +24,15 @@ from DBUtils.PooledDB import PooledDB ...@@ -28,6 +24,15 @@ from DBUtils.PooledDB import PooledDB
from fdfs_client.client import get_tracker_conf, Fdfs_client from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf') tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
client = Fdfs_client(tracker_conf) client = Fdfs_client(tracker_conf)
from obs import ObsClient
import fitz
from urllib.parse import unquote
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
...@@ -659,12 +664,10 @@ class BaseCore: ...@@ -659,12 +664,10 @@ class BaseCore:
create_time = retData['create_time'] create_time = retData['create_time']
order_by = num order_by = num
selects = self.secrchATT(item_id,year,type_id) selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects: if selects:
self.getLogger().info(f'com_name:{com_name}已存在') self.getLogger().info(f'com_name:{com_name}--{year}已存在')
id = selects[0] id = ''
return id return id
else: else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
...@@ -695,6 +698,80 @@ class BaseCore: ...@@ -695,6 +698,80 @@ class BaseCore:
log = self.getLogger() log = self.getLogger()
log.info('======保存企业CIK失败=====') log.info('======保存企业CIK失败=====')
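# Upload a file to Huawei Cloud OBS and parse the PDF's text content and page count
# Convert a size in bytes into a human-readable string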
def convert_size(self,size_bytes):
    """Render a byte count as a two-decimal string with a binary unit suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, e.g. ``1536`` becomes ``"1.50 KB"``.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    # Every unit except the last one can still be "promoted" to the next.
    for unit in units[:-1]:
        if not size_bytes >= 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024
    # Anything that is still large stays expressed in the biggest unit.
    return f"{size_bytes:.2f} {units[-1]}"
def obsexist(self,file_path):
    """Log whether ``file_path`` exists as an object in the ``zzsn`` OBS bucket.

    The object's metadata is requested from OBS; a response status of 300 or
    above is reported as "not found". Nothing is returned.
    """
    # Example key:
    # 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
    meta = obsClient.getObjectMetadata('zzsn', file_path)
    if meta.status >= 300:
        message = '=====文件不存在obs====='
    else:
        message = f'=====文件存在obs========{file_path}'
    self.getLogger().info(message)
def uptoOBS(self,pdf_url, name_pdf,type_id, social_code,pathType,taskType,start_time):
    """Download a PDF, store it in the ``zzsn`` OBS bucket and extract its text.

    Args:
        pdf_url: URL the PDF is downloaded from.
        name_pdf: Object name without extension; ``.pdf`` is appended.
        type_id: Copied into the result as ``type_id``.
        social_code: Copied into the result as ``item_id`` and used for logging.
        pathType: Key prefix inside the bucket; the current ``YYYY-MM`` and a
            slash are appended to it.
        taskType: Passed through to ``recordLog`` when the result cannot be assembled.
        start_time: ``time.time()`` value taken by the caller, used to compute
            the elapsed time reported to ``recordLog``.

    Returns:
        dict: Attachment metadata. ``state`` is True only when download,
        upload and parsing all succeeded; in that case ``path``,
        ``full_path``, ``file_size``, ``create_time``, ``page_size`` and
        ``content`` are filled in. On any failure the dict is returned with
        ``state`` False and those fields left empty.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # 1) Download, retrying only on transport errors.
    response = None
    for _ in range(3):
        try:
            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            break
        except requests.RequestException:
            time.sleep(3)
    if response is None:
        return retData
    pdf_bytes = response.content
    # Content-Length may be absent or malformed; fall back to the number of
    # bytes actually received instead of failing.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(pdf_bytes)

    # 2) Upload to OBS, retrying up to three times.
    now_time = time.strftime("%Y-%m")
    object_key = f'{pathType}{now_time}/' + name_pdf + '.pdf'
    result = None
    for _ in range(3):
        try:
            result = obsClient.putContent('zzsn', object_key, content=pdf_bytes)
            break
        except Exception:
            # The exception types raised by the OBS client are not visible
            # here, so every failure is treated as retryable.
            time.sleep(3)
    if result is None:
        return retData

    # 3) Parse once: the same bytes cannot parse differently on a retry, and
    # re-running would upload the object again.
    try:
        with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
            page_size = doc.page_count
            content = ''.join(page.get_text() for page in doc.pages())
    except Exception:
        # Unreadable PDF: report failure instead of returning partial text.
        return retData
    if page_size < 1:
        return retData

    # 4) Assemble the result; `state` flips to True only if every field could
    # be computed.
    try:
        object_url = result['body']['objectUrl']
        filled = {
            'path': object_url.split('.com')[1],
            'full_path': unquote(object_url),
            'file_size': self.convert_size(file_size),
            'create_time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'page_size': page_size,
            'content': content,
            'state': True,
        }
        retData.update(filled)
    except Exception as e:
        state = 0
        takeTime = self.getTimeCost(start_time, time.time())
        self.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
    return retData
......
...@@ -475,7 +475,14 @@ def kegaishifan(): ...@@ -475,7 +475,14 @@ def kegaishifan():
#双百企业 #双百企业
def shuangbaiqiye(): def shuangbaiqiye():
pass cnx, cursor = connectSql()
query = "SELECT CompanyName FROM Hundred"
cursor.execute(query)
result = cursor.fetchall()
cnx.commit()
com_namelist = [item[0] for item in result]
for item in com_namelist:
r.rpush('hundred:baseinfo', item)
#专精特新 #专精特新
def zhuangjingtexind(): def zhuangjingtexind():
...@@ -484,7 +491,8 @@ def zhuangjingtexind(): ...@@ -484,7 +491,8 @@ def zhuangjingtexind():
if __name__ == "__main__": if __name__ == "__main__":
start = time.time() start = time.time()
# danxiangguanjun() # danxiangguanjun()
kegaishifan() # kegaishifan()
shuangbaiqiye()
# NoticeEnterprise() # NoticeEnterprise()
# AnnualEnterpriseIPO() # AnnualEnterpriseIPO()
# AnnualEnterprise() # AnnualEnterprise()
......
...@@ -541,7 +541,10 @@ class BaseCore: ...@@ -541,7 +541,10 @@ class BaseCore:
self.cursor.execute(query) self.cursor.execute(query)
token_list = self.cursor.fetchall() token_list = self.cursor.fetchall()
self.cnx.commit() self.cnx.commit()
try:
token = token_list[random.randint(0, len(token_list)-1)][0] token = token_list[random.randint(0, len(token_list)-1)][0]
except:
token = ''
return token return token
# 删除失效的token # 删除失效的token
......
# 核心工具包
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
#创建连接池
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
# Module-level file-storage client, created once at import time from the
# tracker settings in /base/client.conf (presumably a FastDFS client, going by
# the fdfs_client package name — confirm).
tracker_conf = get_tracker_conf('/base/client.conf')
client = Fdfs_client(tracker_conf)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'ozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
    """Open the database and cache handles used by the helper methods.

    Creates, in order: a direct MySQL connection and cursor for the ``caiji``
    schema, a second MySQL connection and cursor for ``clb_project``, a Redis
    client on db 6, and a pooled set of ``caiji`` connections.

    NOTE(review): hosts and credentials are hard-coded in this method.
    """
    # Settings shared by the direct `caiji` connection and the pool.
    caiji_account = {
        'host': '114.115.159.144',
        'user': 'caiji',
        'password': 'zzsn9988',
        'charset': 'utf8mb4',
    }
    self.cnx = pymysql.connect(db='caiji', **caiji_account)
    self.cursor = self.cnx.cursor()

    # Second database server ("11" database in the original comment).
    project_account = {
        'host': '114.116.44.11',
        'user': 'caiji',
        'password': 'f7s0&7qqtK',
        'db': 'clb_project',
        'charset': 'utf8mb4',
    }
    self.cnx_ = pymysql.connect(**project_account)
    self.cursor_ = self.cnx_.cursor()

    # Redis connection.
    self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

    # Connection pool for the `caiji` schema.
    self.pool_caiji = PooledDB(
        creator=pymysql,
        maxconnections=5,
        mincached=2,
        maxcached=5,
        blocking=True,
        port=3306,
        database='caiji',
        **caiji_account
    )
def close(self):
    """Release every connection-like resource held by this instance.

    Each of ``cursor``, ``cnx``, ``cursor_``, ``cnx_``, ``pool_caiji`` and
    ``r`` is closed independently, so a failure while closing one of them no
    longer prevents the others from being released. Missing attributes and
    objects without a ``close`` method are skipped. Never raises.
    """
    for attr in ('cursor', 'cnx', 'cursor_', 'cnx_', 'pool_caiji', 'r'):
        resource = getattr(self, attr, None)
        closer = getattr(resource, 'close', None)
        if not callable(closer):
            continue
        try:
            closer()
        except Exception:
            # Shutdown is best-effort: keep going so the remaining handles
            # still get a chance to close.
            pass
# 计算耗时
def getTimeCost(self,start, end):
    """Describe the span between two timestamps (in seconds) as Chinese text.

    The largest non-zero unit decides the format: hours/minutes/seconds,
    minutes/seconds, seconds only, or milliseconds when the span is
    shorter than one whole second.
    """
    elapsed = end - start
    whole_seconds = int(elapsed)
    minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    if hours > 0:
        return "%d小时%d分钟%d秒" % (hours, minutes, secs)
    if minutes > 0:
        return "%d分钟%d秒" % (minutes, secs)
    if whole_seconds > 0:
        return "%d秒" % (secs)
    # Below one second: fall back to millisecond resolution.
    return "%d毫秒" % (int(elapsed * 1000))
# Format the current time
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 3 : 1690179526555 (Unix timestamp in milliseconds)
def getNowTime(self, type):
    """Return the current time in the representation selected by ``type``.

    ``1`` gives ``YYYY-MM-DD HH:MM:SS``, ``2`` gives the compact
    ``yymmddHHMMSS`` form, ``3`` gives an integer Unix timestamp in
    milliseconds. Any other value yields an empty string.
    """
    if type == 1:
        return time.strftime("%Y-%m-%d %H:%M:%S")
    if type == 2:
        return time.strftime("%y%m%d%H%M%S")
    if type == 3:
        return int(time.time() * 1000)
    return ""
# 获取流水号
def getNextSeq(self):
    """Return the next serial number: ``yymmddHHMMSS`` plus a 3-digit counter.

    The counter is incremented on every call and wraps back to 0 after 999,
    so the suffix is always exactly three digits.
    """
    self.__seq += 1
    # Wrap before the counter needs a fourth digit; the previous limit of
    # 1000 let "1000" through and broke the fixed-width suffix.
    if self.__seq > 999:
        self.__seq = 0
    return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
    """Return a generated credit code: ``ZZSN`` + ``yymmddHHMMSS`` + 3-digit counter.

    Shares the instance counter with the serial-number generator; the counter
    wraps back to 0 after 999 so the suffix is always exactly three digits.
    """
    self.__seq += 1
    # Wrap before the counter needs a fourth digit; the previous limit of
    # 1000 let "1000" through and broke the fixed-width suffix.
    if self.__seq > 999:
        self.__seq = 0
    return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
    """Build one log line from a log record.

    Layout: ``[time] [level] [file name] [function] [line] message``. Only
    the last path component of the record's file name is shown. ``handler``
    is accepted for the formatter call signature and is not used.
    """
    fields = {
        'date': record.time,                              # log time
        'level': record.level_name,                       # log level
        'filename': os.path.basename(record.filename),    # file name only
        'func_name': record.func_name,                    # function name
        'lineno': record.lineno,                          # line number
        'msg': record.message,                            # log message
    }
    return "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(**fields)
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
    """Build a logbook logger named after the running script.

    Log files go to a ``logs`` directory next to the script
    (``sys.argv[0]``), in a file named ``<script>.log``.

    Args:
        fileLogFlag: Attach a daily-rotating file handler when true.
        stdOutFlag: Attach a colorized stderr handler when true.

    Returns:
        The configured ``logbook.Logger``.

    NOTE(review): a new logger and new handlers are created on every call.
    """
    script_dir, script_name = os.path.split(os.path.abspath(sys.argv[0]))
    log_dir = os.path.join(script_dir, "logs")
    log_name = script_name.replace(".py", "") + ".log"
    # exist_ok avoids the check-then-create race when several processes
    # start at once and all find the directory missing.
    os.makedirs(log_dir, exist_ok=True)
    logbook.set_datetime_format('local')
    logger = logbook.Logger(log_name)
    logger.handlers = []
    if fileLogFlag:  # write log records to a file
        logFile = logbook.TimedRotatingFileHandler(os.path.join(log_dir, log_name), date_format='%Y-%m-%d',
                                                   bubble=True, encoding='utf-8')
        logFile.formatter = self.logFormate
        logger.handlers.append(logFile)
    if stdOutFlag:  # print log records to the screen
        logStd = logbook.more.ColorizedStderrHandler(bubble=True)
        logStd.formatter = self.logFormate
        logger.handlers.append(logStd)
    return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
    """Return one User-Agent string chosen at random from the class's agent pool."""
    agent_pool = self.__USER_AGENT_LIST
    return random.choice(agent_pool)
# 获取代理
def get_proxy(self):
    """Return one randomly chosen proxy from the ``clb_proxy`` table.

    Each row's ``proxy`` column is expected to look like ``host-port``.

    Returns:
        dict: ``{"HTTP": "http://host:port", "HTTPS": "http://host:port"}``.

    Raises:
        IndexError: if the table is empty or a row has no ``-`` separator.

    NOTE(review): `requests` looks proxies up by lower-case scheme
    ('http'/'https'); the upper-case keys are kept unchanged for existing
    callers — confirm they are actually honoured where this dict is used.
    """
    sql = "select proxy from clb_proxy"
    self.cursor.execute(sql)
    rows = self.cursor.fetchall()
    proxy_list = []
    for row in rows:
        # Read the column value directly instead of stripping the tuple's
        # text representation.
        parts = str(row[0]).split('-')
        proxyMeta = "http://%(host)s:%(port)s" % {
            "host": parts[0],
            "port": parts[1],
        }
        proxy_list.append({
            "HTTP": proxyMeta,
            "HTTPS": proxyMeta
        })
    # Pick among all configured proxies; the former fixed index range 0..3
    # failed with fewer than four rows and never used any beyond the fourth.
    return random.choice(proxy_list)
#字符串截取
def getSubStr(self,str,beginStr,endStr):
    """Cut ``str`` down to the part between two markers.

    The text is first trimmed to start at the last occurrence of
    ``beginStr`` (kept in the result; unchanged if the marker is absent).
    The remainder is then trimmed to end at the first character of the last
    occurrence of ``endStr`` (unchanged if the marker is absent). An empty
    marker disables the corresponding trim.
    """
    text = str
    if beginStr != '':
        start_at = text.rfind(beginStr)
        # rfind gives -1 when the marker is missing: keep the whole text.
        text = text[max(start_at, 0):]
    if endStr != '':
        stop_at = text.rfind(endStr)
        if stop_at != -1:
            text = text[:stop_at + 1]
    return text
# 繁体字转简体字
def hant_2_hans(self, hant_str: str):
    """Convert ``hant_str`` from Traditional to Simplified Chinese.

    Args:
        hant_str: Text to convert.

    Returns:
        The result of ``zhconv.convert`` with the ``zh-hans`` locale.
    """
    simplified = zhconv.convert(hant_str, 'zh-hans')
    return simplified
# 判断字符串里是否含数字
def str_have_num(self, str_num):
    """Report whether any character of ``str_num`` is a digit.

    Args:
        str_num: String to inspect.

    Returns:
        ``True`` if at least one character satisfies ``str.isdigit``,
        otherwise ``False`` (including for the empty string).
    """
    return any(char.isdigit() for char in str_num)
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
    """Pop the head element of the Redis list ``key`` and decode it.

    Args:
        key: Name of the Redis list.

    Returns:
        The decoded element, or ``None`` when nothing (or an empty value)
        was popped.
    """
    raw = self.r.lpop(key)
    if not raw:
        return None
    return raw.decode()
# 获得脚本进程PID
def getPID(self):
    """Return the process id of the running script."""
    return os.getpid()
# 获取本机IP
def getIP(self):
    """Return the IP address that the local host name resolves to."""
    host_name = socket.gethostname()
    return socket.gethostbyname(host_name)
def mkPath(self, path):
    """Create directory ``path`` (with missing parents) unless it exists.

    Args:
        path: Directory path to ensure.
    """
    if os.path.exists(path):
        return
    # makedirs also creates any missing intermediate directories.
    os.makedirs(path)
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
# Build and return a Chrome WebDriver.
#   path:     location of the chromedriver executable, passed to Service().
#   headless: when true, '--headless' is added to the Chrome options.
# The options also exclude the "enable-automation" switch, turn off the
# automation extension, set the language list and use a random User-Agent
# from getRandomUserAgent().
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
# NOTE(review): confirm whether '--disable-gpu' is meant to apply only in headless mode.
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# Disabled: injection of stealth.min.js into every new document via CDP.
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
# 根据社会信用代码获取企业信息
def getInfomation(self, com_name):
    """Fetch the ``Hundred`` row for a company name.

    The name is bound as a query parameter rather than interpolated into
    the SQL text, so names containing quotes are handled safely. The pooled
    connection and cursor are always released.

    Args:
        com_name: Value matched against ``Hundred.CompanyName``.

    Returns:
        The row as a list of column values, or an empty list when no row
        matches or the database operation fails (failures are logged).
    """
    data = []
    conn = None
    cursor = None
    try:
        sql = "SELECT * FROM Hundred WHERE CompanyName = %s"
        conn = self.pool_caiji.connection()
        cursor = conn.cursor()
        cursor.execute(sql, (com_name,))
        row = cursor.fetchone()
        conn.commit()
        if row is not None:
            data = list(row)
    except Exception:
        # Best effort: callers receive an empty list on any database error.
        log = self.getLogger()
        log.info('=========数据库操作失败========')
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
    """Set a run-counter column of one ``EnterpriseInfo`` row.

    ``count`` and ``social_code`` are bound as query parameters. The pooled
    connection and cursor are always released; failures are logged and
    swallowed.

    Args:
        social_code: Value matched against ``EnterpriseInfo.SocialCode``.
        runType: Name of the column to update. Column names cannot be bound
            as parameters, so this is inserted into the SQL text as-is and
            must come from trusted code.
        count: New value for the column.
    """
    conn = None
    cursor = None
    try:
        sql_update = f"UPDATE EnterpriseInfo SET {runType} = %s WHERE SocialCode = %s"
        conn = self.pool_caiji.connection()
        cursor = conn.cursor()
        cursor.execute(sql_update, (count, social_code))
        conn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======更新数据库失败======')
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
    """Insert one task-execution record into ``LogTable``.

    The creation time, host IP and process id are filled in from
    ``getNowTime(1)``, ``getIP()`` and ``getPID()``. The pooled connection
    and cursor are always released; failures are logged and swallowed so
    that logging never interrupts the calling task.

    Args:
        xydm: Social credit code of the company the task ran for.
        taskType: Task type label.
        state: Task state value.
        takeTime: Elapsed time of the task.
        url: URL associated with the task.
        e: Exception text to store.
    """
    cnn = None
    cursor = None
    try:
        createTime = self.getNowTime(1)
        ip = self.getIP()
        pid = self.getPID()
        sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
        cnn = self.pool_caiji.connection()
        cursor = cnn.cursor()
        cursor.execute(sql, values)
        cnn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======保存日志失败=====')
    finally:
        if cursor is not None:
            cursor.close()
        if cnn is not None:
            cnn.close()
#获取企查查token
def GetToken(self):
    """Return a random Qichacha token from the ``QCC_token`` table.

    Returns:
        One token chosen at random, or ``''`` when the table is empty.
    """
    query = "select token from QCC_token "
    self.cursor.execute(query)
    token_list = self.cursor.fetchall()
    self.cnx.commit()
    # An explicit emptiness check replaces the former catch-all `except:`,
    # which also hid unrelated errors.
    if not token_list:
        return ''
    return random.choice(token_list)[0]
# 删除失效的token
def delete_token(self, token):
    """Delete an invalid token from the ``QCC_token`` table.

    The token is bound as a query parameter instead of being interpolated
    into the SQL text.

    Args:
        token: Token value to remove.
    """
    deletesql = "delete from QCC_token where token=%s "
    self.cursor.execute(deletesql, (token,))
    self.cnx.commit()
#获取天眼查token
def GetTYCToken(self):
    """Return the token from the first row of the ``TYC_token`` table.

    Returns:
        The first column of the first row returned by the query.
    """
    self.cursor.execute('select token from TYC_token')
    first_row = self.cursor.fetchone()
    token = first_row[0]
    self.cnx.commit()
    return token
#检测语言
def detect_language(self, text):
    """Classify the language of ``text`` with ``langid``.

    Args:
        text: Text to classify.

    Returns:
        The first element of the ``langid.classify`` result, or ``'cn'``
        when the result (or its first element) is an empty string.
    """
    verdict = langid.classify(text)
    if verdict == '' or verdict[0] == '':
        return 'cn'
    return verdict[0]
#追加接入excel
def writerToExcel(self, detailList, filename):
    """Append rows to an existing ``.xlsx`` file.

    The file is read (all columns as strings), the new rows are added at
    the end, and the combined table is written back to the same path
    without the index column. Uses ``pd.concat`` because
    ``DataFrame.append`` is no longer available in pandas 2.x.

    Args:
        detailList: Row data accepted by ``pd.DataFrame(data=...)``.
        filename: Path of the existing workbook to extend.
    """
    # Load what is already in the workbook.
    existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
    # Frame for the rows being added.
    new_data = pd.DataFrame(data=detailList)
    # Stack the new rows under the existing ones with a fresh index.
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    combined_data.to_excel(filename, index=False)
# return combined_data
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item):
    """Push ``item`` onto the tail of the Redis list ``key``.

    Used to re-queue companies whose processing failed or was interrupted.
    """
    redis_client = self.r
    redis_client.rpush(key, item)
#增加计数器的值并返回增加后的值
def incrSet(self, key):
    """Increment the Redis counter ``key`` and return its new value.

    The new value is also printed.
    """
    updated = self.r.incr(key)
    print("增加后的值:", updated)
    return updated
#获取key剩余的过期时间
def getttl(self,key):
# Print the remaining TTL of `key`; when the TTL is negative, set the key
# to 0 and give it a 3600-second expiry. Nothing is returned.
# Read the key's remaining time-to-live
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# A negative TTL is treated as "expired" here
if ttl < 0:
# Expired: reset the key's value to 0 and start a new one-hour window
self.r.set(key, 0)
self.r.expire(key, 3600)
# NOTE(review): confirm whether this 2 s pause is meant to run only after a reset.
time.sleep(2)
#上传至文件服务器,并解析pdf的内容和页数
def upLoadToServe(self, pdf_url, type_id, social_code):
    """Download a PDF, upload it to the file server and extract its text.

    The download is attempted up to three times. The upload/parse step is
    also attempted up to three times, but the file is uploaded at most once
    and the extracted text is rebuilt from scratch on every attempt, so a
    retry can neither duplicate the stored file nor duplicate content.

    Args:
        pdf_url: URL of the PDF to fetch.
        type_id: Attachment type id copied into the result.
        social_code: Company identifier copied into the result as
            ``item_id``.

    Returns:
        A dict describing the attachment. ``state`` is ``True`` only when
        the file was uploaded and at least one page was parsed; in that
        case ``path``, ``full_path``, ``file_size``, ``create_time``,
        ``page_size`` and ``content`` are filled in.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '', 'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Step 1: download, with up to three attempts.
    resp_content = None
    for _ in range(3):
        try:
            resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
            break
        except Exception:
            time.sleep(3)
    if resp_content is None:
        # Nothing was downloaded, so there is nothing to upload or parse.
        print(f'======pdf解析失败=====')
        return retData

    # Step 2: upload once, then parse; retry the remaining work on failure.
    page_size = 0
    result = None
    for _ in range(3):
        try:
            if result is None:
                result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
            with fitz.open(stream=resp_content, filetype='pdf') as doc:
                pages = doc.page_count
                text = ''.join(page.get_text() for page in doc.pages())
            # Only publish the parse result once the whole document was read.
            page_size = pages
            retData['content'] = text
            break
        except Exception:
            time.sleep(3)

    if page_size < 1:
        # Upload or PDF parsing failed.
        print(f'======pdf解析失败=====')
        return retData

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    remote_id = bytes.decode(result['Remote file_id'])
    retData['state'] = True
    retData['path'] = remote_id.replace('group1', '')
    retData['full_path'] = remote_id
    retData['file_size'] = result['Uploaded size']
    retData['create_time'] = time_now
    retData['page_size'] = page_size
    return retData
def secrchATT(self, item_id, year, type_id):
    """Look up an attachment id in ``clb_sys_attachment``.

    Args:
        item_id: Owning item id.
        year: Year of the attachment.
        type_id: Attachment type id.

    Returns:
        The first matching row as returned by ``fetchone`` (its first
        column is the attachment id), or ``None`` when nothing matches.
    """
    query = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
    params = (item_id, year, type_id)
    self.cursor_.execute(query, params)
    return self.cursor_.fetchone()
#插入到att表 返回附件id
def tableUpdate(self, retData, com_name, year, pdf_name, num):
    """Insert an attachment row unless one already exists; return its id.

    An attachment is considered existing when ``secrchATT`` finds a row for
    the same item, year and type.

    Args:
        retData: Attachment description as produced by the upload helper;
            all of its fields are read up front.
        com_name: Company name, used only in the "already exists" log line.
        year: Year of the attachment.
        pdf_name: File name stored in the ``name`` column.
        num: Ordering value stored in the ``order_by`` column.

    Returns:
        The attachment id (existing or newly inserted).
    """
    field_names = ('item_id', 'type_id', 'group_name', 'path', 'full_path', 'category',
                   'file_size', 'status', 'create_by', 'page_size', 'create_time')
    (item_id, type_id, group_name, path, full_path, category,
     file_size, status, create_by, page_size, create_time) = [retData[name] for name in field_names]
    order_by = num

    found = self.secrchATT(item_id, year, type_id)
    if found:
        self.getLogger().info(f'com_name:{com_name}已存在')
        return found[0]

    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    row_values = (
        year, pdf_name, type_id, item_id, group_name, path, full_path, category,
        file_size, order_by, status, create_by, create_time, page_size,
    )
    self.cursor_.execute(Upsql, row_values)  # insert
    self.cnx_.commit()  # commit
    self.getLogger().info("更新完成:{}".format(Upsql))
    # Read the row back to learn the id that was assigned.
    inserted = self.secrchATT(item_id, year, type_id)
    return inserted[0]
# 更新企业的CIK
def updateCIK(self, social_code, cik):
    """Store the CIK of a company in ``EnterpriseInfo``.

    Both values are bound as query parameters. The pooled connection and
    cursor are always released; failures are logged and swallowed.

    Args:
        social_code: Value matched against ``EnterpriseInfo.SocialCode``.
        cik: CIK value to store.
    """
    cnn = None
    cursor = None
    try:
        sql = "UPDATE EnterpriseInfo SET CIK = %s WHERE SocialCode = %s"
        cnn = self.pool_caiji.connection()
        cursor = cnn.cursor()
        cursor.execute(sql, (cik, social_code))
        cnn.commit()
    except Exception:
        log = self.getLogger()
        log.info('======保存企业CIK失败=====')
    finally:
        if cursor is not None:
            cursor.close()
        if cnn is not None:
            cnn.close()
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
import json
from kafka import KafkaProducer
from BaseCore import BaseCore
from getQccId import find_id_by_name
# Shared BaseCore instance used by this script for queue access, tokens and logging.
baseCore = BaseCore()
# Module-level aliases for the connection and cursor held by baseCore.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Script-wide logger.
log = baseCore.getLogger()
# 通过企查查id获取企业基本信息
def info_by_id(com_id, com_name):
    """Fetch a company's basic information from Qichacha by its Qichacha id.

    Two endpoints are queried (``v1/ent/detail`` and
    ``v6/base/getEntDetail``) using the module-level ``headers`` and
    ``token``. Optional fields that are missing from a response default to
    ``''``. When the first response carries no company payload, the company
    name is pushed back onto the ``hundred:baseinfo`` Redis list and an
    empty list is returned.

    Compared with the previous version, the former-name string is now
    actually stripped (the old ``OriginalName.strip()`` discarded its
    result), and values that were computed only to be overwritten later
    are no longer computed.

    Args:
        com_id: Qichacha company id (the ``unique`` query parameter).
        com_name: Company name as queued; used for logging and re-queueing.

    Returns:
        A list holding one dict of company fields, or an empty list when
        the company payload could not be read.

    Raises:
        KeyError, TypeError: If the company payload lacks the mandatory
            ``Name``, ``CreditCode`` or ``Address`` fields.
    """
    lookup_errors = (KeyError, IndexError, TypeError)

    def pick(source, *path, default=''):
        """Follow ``path`` through nested containers; ``default`` on a miss."""
        node = source
        try:
            for step in path:
                node = node[step]
        except lookup_errors:
            return default
        return node

    def text(source, *path):
        """Like ``pick``, but an explicit null also becomes ``''``."""
        value = pick(source, *path)
        return '' if value is None else value

    aa_dict_list = []

    # ---- first request: company overview ----
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
    resp_dict = requests.get(url=url, headers=headers, verify=False).json()
    time.sleep(2)
    com_jc_name = ''
    try:
        result_dict = resp_dict['result']['Company']
    except lookup_errors:
        log.info(com_name + ":获取失败===========重新放入redis")
        baseCore.rePutIntoR('hundred:baseinfo', com_name)
        return aa_dict_list

    # Mandatory fields: a missing key propagates to the caller.
    company_name = result_dict['Name']
    CreditCode = result_dict['CreditCode']
    if CreditCode is None:
        CreditCode = ''
    Address = result_dict['Address']
    if Address is None:
        Address = ''

    OperName = text(result_dict, 'Oper', 'Name')
    if baseCore.str_have_num(OperName):
        # A legal-representative name containing digits is discarded.
        OperName = ''
    Status = text(result_dict, 'ShortStatus')
    StartDate = text(result_dict, 'StartDate')
    RegistCapi = text(result_dict, 'RegistCapi')
    EconKind = text(result_dict, 'EconKind')

    Province = pick(result_dict, 'Area', 'Province')
    City = pick(result_dict, 'Area', 'City')
    County = pick(result_dict, 'Area', 'County')
    try:
        region = Province + City + County
    except TypeError:
        region = ''

    # Former names, space separated.
    try:
        OriginalName = ''.join(item['Name'] + ' ' for item in result_dict['OriginalName'])
    except lookup_errors:
        OriginalName = ''
    OriginalName = OriginalName.strip()

    PhoneNumber = text(result_dict, 'companyExtendInfo', 'Tel')
    WebSite = pick(result_dict, 'companyExtendInfo', 'WebSite', default=None)
    if WebSite is None:
        # Fall back to the first website listed in the contact info.
        WebSite = pick(result_dict, 'ContactInfo', 'WebSite', 0, 'Url')
    Email = text(result_dict, 'companyExtendInfo', 'Email')
    Desc = text(result_dict, 'companyExtendInfo', 'Desc')
    Info = text(result_dict, 'companyExtendInfo', 'Info')
    company_name = baseCore.hant_2_hans(company_name)

    # ---- second request: registration details ----
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
                                                                                                       com_id)
    resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
    time.sleep(1)
    com2 = pick(resp_dict2, 'result', 'Company')

    CheckDate = text(com2, 'CheckDate')
    TaxpayerType = text(com2, 'TaxpayerType')  # taxpayer qualification
    # The fields below keep an explicit null from the response unchanged,
    # exactly as before; only a missing key becomes ''.
    Scope = pick(com2, 'Scope')
    No = pick(com2, 'No')
    IxCode = pick(com2, 'IxCode')
    OrgNo = pick(com2, 'OrgNo')
    TermStart = pick(com2, 'TermStart')
    TeamEnd = pick(com2, 'TeamEnd')
    RecCap = pick(com2, 'RecCap')
    SubIndustry = pick(com2, 'IndustryArray', -1)
    BelongOrg = pick(com2, 'BelongOrg')
    EnglishName = pick(com2, 'EnglishName')

    # Insured headcount: taken from the matching entry of CommonList.
    can_bao = ''
    try:
        for entry in com2['CommonList']:
            try:
                if entry['KeyDesc'] == '参保人数':
                    can_bao = entry['Value']
            except lookup_errors:
                pass
    except lookup_errors:
        can_bao = ''

    aa_dict = {
        'qccId': com_id,  # Qichacha company id
        'name': company_name,  # company name
        'shortName': com_jc_name,  # short name
        'socialCreditCode': CreditCode,  # unified social credit code
        'legalPerson': OperName,  # legal representative
        'officialPhone': PhoneNumber,  # phone
        'officialUrl': WebSite,  # website
        'officialEmail': Email,  # e-mail
        'briefInfo': Desc,  # introduction
        'registerStatus': Status,  # registration status
        'incorporationDate': StartDate,  # date of establishment
        'capital': RegistCapi,  # registered capital
        'paidCapital': RecCap,  # paid-in capital
        'approvalDate': CheckDate,  # approval date
        'organizationCode': OrgNo,  # organization code
        'registerNo': No,  # business registration number
        'taxpayerNo': CreditCode,  # taxpayer id (the credit code is used)
        'type': EconKind,  # company type
        'businessStartDate': TermStart,  # business term start
        'businessEndDate': TeamEnd,  # business term end
        'taxpayerQualification': TaxpayerType,  # taxpayer qualification
        'industry': SubIndustry,  # industry
        'region': region,
        'province': Province,  # province
        'city': City,  # city
        'county': County,  # county
        'registerDepartment': BelongOrg,  # registration authority
        'scale': Info,  # staff size
        'insured': can_bao,  # insured headcount
        'beforeName': OriginalName,  # former names
        'englishName': EnglishName,  # English name
        'importExportEnterpriseCode': IxCode,  # import/export enterprise code
        'address': Address,  # address
        'businessRange': Scope,  # business scope
        'status': 0,  # status
    }
    aa_dict_list.append(aa_dict)
    log.info(company_name + ":爬取完成")
    return aa_dict_list
# Script entry point: repeatedly takes a company name from the Redis list
# 'hundred:baseinfo', resolves its Qichacha id, fetches its basic information
# with info_by_id(), writes ids and credit codes back to the Hundred table and
# sends each result to the Kafka topic "regionInfo".
if __name__ == '__main__':
taskType = '基本信息/企查查/单项双百企业冠军'
# Request headers shared (as a module global) with info_by_id().
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
# Companies that could not be collected, and original-vs-fetched name pairs.
list_weicha = []
name_list = []
# Pull the companies to process from Redis
while True:
# TODO: tokens must be re-captured roughly every two hours; they are read from the database
token = baseCore.GetToken()
if token:
pass
else:
log.info('==========已无token==========')
time.sleep(30)
continue
# list_all_info = []
start_time = time.time()
# Take the next company name from the queue
com_name = baseCore.redicPullData('hundred:baseinfo')
# com_name = '卓新市万达铸业有限公司'
if com_name == '' or com_name is None:
time.sleep(20)
continue
# NOTE(review): getInfomation() may not return a full row for an unknown name; the indexing below assumes it does.
dic_info = baseCore.getInfomation(com_name)
log.info(f'----当前企业{com_name}--开始处理---')
social_code = dic_info[5]
# Qichacha company id
company_id = dic_info[6]
# Search by the credit code when there is one, otherwise by the company name
if company_id == None:
if social_code:
company_id = find_id_by_name(start_time,token,social_code)
else:
company_id = find_id_by_name(start_time,token,com_name)
if company_id == 'null':
log.info('=====搜索不到该企业====')
# TODO: companies that cannot be found have no credit code to send downstream; generate one
baseCore.rePutIntoR('hundred:baseinfo', com_name + ':搜索不到')
continue
if not company_id:
log.info(com_name + ":企业ID获取失败===重新放入redis")
list_weicha.append(com_name + ":企业ID获取失败")
baseCore.rePutIntoR('hundred:baseinfo',com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
time.sleep(20)
continue
else:
log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
# TODO: persist to the database
# NOTE(review): this statement is built by string interpolation; consider bound parameters.
updateqccid = f"update Hundred set qccid = '{company_id}' where CompanyName = '{com_name}'"
cursor_.execute(updateqccid)
cnx_.commit()
try:
post_data_list = info_by_id(company_id, com_name)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
# NOTE(review): this key is spelled 'hundred:baseInfo' while every other use is 'hundred:baseinfo' — confirm; items pushed here may never be read back.
baseCore.rePutIntoR('hundred:baseInfo', com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
continue
if post_data_list:
pass
else:
# log.info(f'======{social_code}====企查查token失效====')
time.sleep(20)
continue
for post_data in post_data_list:
# list_all_info.append(post_data)
if post_data is None:
print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
# TODO: write the credit code back to the table
# NOTE(review): this statement is built by string interpolation; consider bound parameters.
updatesocialcode = f"update Hundred set SocialCode = '{get_socialcode}' where CompanyName = '{com_name}'"
cursor_.execute(updatesocialcode)
cnx_.commit()
name_compile = {
'yuan_name':com_name,
'get_name':get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
try:
# Send the collected record to Kafka and wait up to 10 s for the result.
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
# Record the Kafka failure in the log table.
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# break
# Export the name comparison and the failure list to dated Excel files.
# NOTE(review): confirm where these statements sit relative to the `while True` loop; the loop has no active break.
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
# -*- coding: utf-8 -*-
import time
from urllib.parse import quote
import requests
import urllib3
from BaseCore import BaseCore
# Shared BaseCore instance and logger for this module.
baseCore = BaseCore()
log = baseCore.getLogger()
# Previous header set, kept for reference:
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
# Request headers sent with every search request. find_id_by_name() overwrites
# 'Qcc-Timestamp' before each call.
# NOTE(review): 'authMini' carries a hard-coded bearer value — confirm it should live in source control.
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'x-request-device-type': 'Android',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
'Content-Type': 'application/json',
'Qcc-Version': '1.0.0',
'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
'xweb_xhr': '1',
'xcx-version': '2023.09.27',
'Qcc-Platform': 'mp-weixin',
'Qcc-CurrentPage': '/company-subpackages/business/index',
'Qcc-Timestamp': '1696661787803',
'Qcc-RefPage': '/company-subpackages/detail/index',
'Accept': '*/*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh'
}
# 通过企业名称或信用代码获取企查查id
def find_id_by_name(start, token, name):
    """Look up a company's Qichacha id by company name or credit code.

    The search request is attempted up to five times. Status ``40101``
    (invalid session token) and ``401`` (rate limited) are reported as
    failures.

    Args:
        start: Start timestamp of the current task, used for elapsed-time
            log output.
        token: Qichacha session token.
        name: Company name or social credit code to search for.

    Returns:
        The ``KeyNo`` of the first search hit; the string ``'null'`` when
        the search returned no hit (or a hit with an empty name); ``False``
        when the request failed, the token was rejected, or the response
        could not be parsed.
    """
    urllib3.disable_warnings()
    qcc_key = name
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="

    resp_dict = None
    for _ in range(5):
        try:
            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
            break
        except Exception as e:
            print(f'{e}-------------重试')
            time.sleep(5)
    time.sleep(2)

    if resp_dict is None:
        # Every attempt failed; previously this fell through to an unbound
        # variable error.
        log.info(f'====请求失败,已达最大重试次数====时间{baseCore.getTimeCost(start, time.time())}')
        return False

    # Known error payloads:
    # {'status': 40101, 'message': '无效的sessionToken!'}
    # {'status': 401, 'message': '您的账号访问超频,请升级小程序版本'}
    status = resp_dict.get('status') if isinstance(resp_dict, dict) else None
    if status == 40101:
        log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
        return False
    if status == 401:
        log.info(f'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
        return False

    try:
        hits = resp_dict['result']['Result']
        if hits:
            first_hit = hits[0]
            KeyNo = first_hit['KeyNo']
            hit_name = first_hit['Name'].replace('<em>', '').replace('</em>', '').strip()
            if hit_name == '':
                KeyNo = 'null'
        else:
            KeyNo = 'null'
    except (KeyError, IndexError, TypeError, AttributeError):
        # An unexpected response shape is treated like a rejected token.
        log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
        return False
    log.info("{},企业代码为:{}".format(qcc_key, KeyNo))
    return KeyNo
\ No newline at end of file
...@@ -541,7 +541,10 @@ class BaseCore: ...@@ -541,7 +541,10 @@ class BaseCore:
self.cursor.execute(query) self.cursor.execute(query)
token_list = self.cursor.fetchall() token_list = self.cursor.fetchall()
self.cnx.commit() self.cnx.commit()
try:
token = token_list[random.randint(0, len(token_list)-1)][0] token = token_list[random.randint(0, len(token_list)-1)][0]
except:
token = ''
return token return token
# 删除失效的token # 删除失效的token
......
...@@ -11,24 +11,28 @@ import logbook.more ...@@ -11,24 +11,28 @@ import logbook.more
import pandas as pd import pandas as pd
import requests import requests
import zhconv import zhconv
import pymysql
import redis import redis
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid import langid
#创建连接池 #创建连接池
import pymysql import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB from DBUtils.PooledDB import PooledDB
# import sys # import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client') # sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf') tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
client = Fdfs_client(tracker_conf) client = Fdfs_client(tracker_conf)
from obs import ObsClient
import fitz
from urllib.parse import unquote
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
...@@ -437,9 +441,9 @@ class BaseCore: ...@@ -437,9 +441,9 @@ class BaseCore:
#解析word文件页数 #解析word文件页数
def doc_page(self,file_path): # def doc_page(self,file_path):
doc = Document(file_path) # doc = Document(file_path)
return len(doc.sections) # return len(doc.sections)
def pdf_content(self,resp_content): def pdf_content(self,resp_content):
# 解析pdf文件内容 # 解析pdf文件内容
content = '' content = ''
...@@ -507,9 +511,9 @@ class BaseCore: ...@@ -507,9 +511,9 @@ class BaseCore:
# retData['page_size'] = page_size # retData['page_size'] = page_size
return retData return retData
def secrchATT(self,item_id,file_name,type_id): def secrchATT(self,item_id,file_name,type_id,order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s ''' sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id)) self.cursor_.execute(sel_sql, (item_id, file_name, type_id,order_by))
selects = self.cursor_.fetchone() selects = self.cursor_.fetchone()
return selects return selects
...@@ -527,13 +531,8 @@ class BaseCore: ...@@ -527,13 +531,8 @@ class BaseCore:
page_size = retData['page_size'] page_size = retData['page_size']
create_time = retData['create_time'] create_time = retData['create_time']
order_by = num order_by = num
selects = self.secrchATT(item_id,file_name,type_id)
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id,full_path
else:
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = ( values = (
...@@ -544,11 +543,71 @@ class BaseCore: ...@@ -544,11 +543,71 @@ class BaseCore:
self.cursor_.execute(Upsql, values) # 插入 self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交 self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql)) self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,file_name,type_id) selects = self.secrchATT(item_id,file_name,type_id,order_by)
id = selects[0] id = selects[0]
return id,full_path return id,full_path
# 获取文件大小
def convert_size(self,size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(self,file_href,item_id,pathType,file_name):
headers = {}
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = self.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(file_href, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
for i in range(0, 3):
try:
# name = file_name
if category in file_name:
pass
else:
file_name = file_name + '.' + category
result = obsClient.putContent('zzsn', f'{pathType}' + file_name, content=response.content)
break
except:
time.sleep(3)
continue
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
......
...@@ -224,6 +224,7 @@ def get_content1(): ...@@ -224,6 +224,7 @@ def get_content1():
# 判断是否已经爬取过 # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
try: try:
...@@ -383,6 +384,7 @@ def get_content2(): ...@@ -383,6 +384,7 @@ def get_content2():
# # 判断是否已经爬取过 # # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
try: try:
...@@ -563,6 +565,7 @@ def get_content3(): ...@@ -563,6 +565,7 @@ def get_content3():
pub_time = li.split('<span>[')[1].split(']</span>')[0] pub_time = li.split('<span>[')[1].split(']</span>')[0]
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
sendContent(href, headers,title,pub_time,num) sendContent(href, headers,title,pub_time,num)
...@@ -591,6 +594,7 @@ def get_content3(): ...@@ -591,6 +594,7 @@ def get_content3():
# 判断是否已经爬取过 # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
title = doc_item('a').attr('title') title = doc_item('a').attr('title')
...@@ -612,6 +616,7 @@ def get_content3(): ...@@ -612,6 +616,7 @@ def get_content3():
def bei_jing(): def bei_jing():
num = 0 num = 0
start_time = time.time() start_time = time.time()
pathType = 'policy/beijing/'
# 有反爬需要使用selenium # 有反爬需要使用selenium
# service = Service(r'D:/chrome/113/chromedriver.exe') # service = Service(r'D:/chrome/113/chromedriver.exe')
# 配置selenium # 配置selenium
...@@ -664,6 +669,7 @@ def bei_jing(): ...@@ -664,6 +669,7 @@ def bei_jing():
# 判断是否已经爬取过 # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href[0]}) is_href = db_storage.find_one({'网址': href[0]})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
# 对获取信息页面发送请求 # 对获取信息页面发送请求
...@@ -712,7 +718,7 @@ def bei_jing(): ...@@ -712,7 +718,7 @@ def bei_jing():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1667') retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -721,7 +727,7 @@ def bei_jing(): ...@@ -721,7 +727,7 @@ def bei_jing():
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -754,7 +760,7 @@ def bei_jing(): ...@@ -754,7 +760,7 @@ def bei_jing():
# id_list.append(id) # id_list.append(id)
num += 1 num += 1
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
bro.quit() bro.quit()
except Exception as e: except Exception as e:
log.info(e) log.info(e)
...@@ -763,6 +769,7 @@ def bei_jing(): ...@@ -763,6 +769,7 @@ def bei_jing():
# 内蒙古 # 内蒙古
def nei_meng_gu(): def nei_meng_gu():
start = time.time() start = time.time()
pathType = 'policy/neimenggu/'
num = 0 num = 0
url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html' url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
try: try:
...@@ -780,6 +787,7 @@ def nei_meng_gu(): ...@@ -780,6 +787,7 @@ def nei_meng_gu():
# todo:测试用 注释掉判重 # todo:测试用 注释掉判重
is_href = db_storage.find_one({'网址': real_href}) is_href = db_storage.find_one({'网址': real_href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# 获取所需信息 # 获取所需信息
...@@ -831,16 +839,16 @@ def nei_meng_gu(): ...@@ -831,16 +839,16 @@ def nei_meng_gu():
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1] fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re fu_jian_href = fu_jian_re
# print(fu_jian_href)
# todo:附件上传至文件服务器 # todo:附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1669') retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,title)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num) att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
id_list.append(att_id) id_list.append(att_id)
# # todo:将返回的地址更新到soup
# fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
print(title) print(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -881,6 +889,7 @@ def nei_meng_gu(): ...@@ -881,6 +889,7 @@ def nei_meng_gu():
# 吉林 # 吉林
def ji_lin(): def ji_lin():
pathType = 'policy/jilin/'
start = time.time() start = time.time()
num = 0 num = 0
url = 'http://gzw.jl.gov.cn/zwgk/zcwj/' url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
...@@ -902,6 +911,7 @@ def ji_lin(): ...@@ -902,6 +911,7 @@ def ji_lin():
title = a.find('a').text.replace('\n', '') title = a.find('a').text.replace('\n', '')
is_href = db_storage.find_one({'网址': real_href}) is_href = db_storage.find_one({'网址': real_href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html' # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
...@@ -972,16 +982,17 @@ def ji_lin(): ...@@ -972,16 +982,17 @@ def ji_lin():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = fu_jian_href.text.strip() file_name = fu_jian_href.text.strip()
retData = baseCore.uploadToserver(fu_jian_href, '1670') # print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
#
# todo:将返回的地址更新到soup # # todo:将返回的地址更新到soup
li.find('a')['href'] = 'http://114.115.215.96/' + full_path li.find('a')['href'] = full_path
else: else:
continue continue
else: else:
...@@ -1009,16 +1020,17 @@ def ji_lin(): ...@@ -1009,16 +1020,17 @@ def ji_lin():
if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \ if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
retData = baseCore.uploadToserver(fj_href, '1670') # print(fj_href)
retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
#
# todo:将返回的地址更新到soup # # todo:将返回的地址更新到soup
fu_jian_href['href'] = 'http://114.115.215.96/' + full_path fu_jian_href['href'] = full_path
else: else:
continue continue
...@@ -1062,7 +1074,7 @@ def ji_lin(): ...@@ -1062,7 +1074,7 @@ def ji_lin():
save_data(dic_news) save_data(dic_news)
num = num + 1 num = num + 1
except Exception as e: except Exception as e:
print(e) log.info(e)
pass pass
except: except:
pass pass
...@@ -1073,6 +1085,7 @@ def ji_lin(): ...@@ -1073,6 +1085,7 @@ def ji_lin():
def shang_hai(): def shang_hai():
start = time.time() start = time.time()
pathType = 'policy/shanghai/'
num = 0 num = 0
for page in range(1, 7): for page in range(1, 7):
...@@ -1095,6 +1108,7 @@ def shang_hai(): ...@@ -1095,6 +1108,7 @@ def shang_hai():
href = 'https://www.gzw.sh.gov.cn' + href href = 'https://www.gzw.sh.gov.cn' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html' href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
...@@ -1154,7 +1168,7 @@ def shang_hai(): ...@@ -1154,7 +1168,7 @@ def shang_hai():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
retData = baseCore.uploadToserver(fu_jian_href, '1671') retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1163,7 +1177,7 @@ def shang_hai(): ...@@ -1163,7 +1177,7 @@ def shang_hai():
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
a['href'] = 'http://114.115.215.96/' + full_path a['href'] = full_path
else: else:
continue continue
...@@ -1205,6 +1219,7 @@ def shang_hai(): ...@@ -1205,6 +1219,7 @@ def shang_hai():
# 浙江 # 浙江
def zhe_jiang(): def zhe_jiang():
start = time.time() start = time.time()
pathType = 'policy/zhejiang/'
num = 0 num = 0
url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html' url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
try: try:
...@@ -1227,6 +1242,7 @@ def zhe_jiang(): ...@@ -1227,6 +1242,7 @@ def zhe_jiang():
href = 'http://gzw.zj.gov.cn/' + href href = 'http://gzw.zj.gov.cn/' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -1325,6 +1341,7 @@ def zhe_jiang(): ...@@ -1325,6 +1341,7 @@ def zhe_jiang():
# 福建 # 福建
def fu_jian(): def fu_jian():
error_tag = str(404) error_tag = str(404)
pathType = 'policy/fujian/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/' url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
...@@ -1373,6 +1390,7 @@ def fu_jian(): ...@@ -1373,6 +1390,7 @@ def fu_jian():
# print(real_href) # print(real_href)
is_href = db_storage.find_one({'网址': real_href}) is_href = db_storage.find_one({'网址': real_href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# 文章是远程pdf # 文章是远程pdf
...@@ -1384,7 +1402,7 @@ def fu_jian(): ...@@ -1384,7 +1402,7 @@ def fu_jian():
content = baseCore.pdf_content(resp_content) content = baseCore.pdf_content(resp_content)
contentwithtag = '' contentwithtag = ''
# 文件上传至服务器 # 文件上传至服务器
retData = baseCore.uploadToserver(real_href, '1673') retData = baseCore.uptoOBS(real_href, '1673',pathType,title)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1420,7 +1438,7 @@ def fu_jian(): ...@@ -1420,7 +1438,7 @@ def fu_jian():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# 找到附件后 上传至文件服务器 # 找到附件后 上传至文件服务器
retData = baseCore.uploadToserver(fj_href, '1673') retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1428,7 +1446,7 @@ def fu_jian(): ...@@ -1428,7 +1446,7 @@ def fu_jian():
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text) source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip() pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
...@@ -1499,6 +1517,7 @@ def shan_dong(): ...@@ -1499,6 +1517,7 @@ def shan_dong():
href = li.find('a')['href'] href = li.find('a')['href']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -1593,6 +1612,7 @@ def shan_dong(): ...@@ -1593,6 +1612,7 @@ def shan_dong():
# 广东 # 广东
def guang_dong(): def guang_dong():
start = time.time() start = time.time()
pathType = 'policy/guangdong/'
num = 0 num = 0
url = 'http://gzw.gd.gov.cn/zcfg/index.html' url = 'http://gzw.gd.gov.cn/zcfg/index.html'
try: try:
...@@ -1620,6 +1640,7 @@ def guang_dong(): ...@@ -1620,6 +1640,7 @@ def guang_dong():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -1644,7 +1665,7 @@ def guang_dong(): ...@@ -1644,7 +1665,7 @@ def guang_dong():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fj_href, '1676') retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1652,7 +1673,7 @@ def guang_dong(): ...@@ -1652,7 +1673,7 @@ def guang_dong():
att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -1692,6 +1713,7 @@ def guang_dong(): ...@@ -1692,6 +1713,7 @@ def guang_dong():
# 海南 # 海南
def hai_nan(): def hai_nan():
pathType = 'policy/hainan/'
def hai_nan1(): def hai_nan1():
# 部门文件 # 部门文件
num = 0 num = 0
...@@ -1717,6 +1739,7 @@ def hai_nan(): ...@@ -1717,6 +1739,7 @@ def hai_nan():
href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/') href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
try: try:
...@@ -1759,7 +1782,7 @@ def hai_nan(): ...@@ -1759,7 +1782,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1767,7 +1790,7 @@ def hai_nan(): ...@@ -1767,7 +1790,7 @@ def hai_nan():
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
try: try:
# print(href) # print(href)
...@@ -1801,7 +1824,7 @@ def hai_nan(): ...@@ -1801,7 +1824,7 @@ def hai_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# print(f'----附件:{fu_jian_href}-----filename:{file_name}') # print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -1809,7 +1832,7 @@ def hai_nan(): ...@@ -1809,7 +1832,7 @@ def hai_nan():
# 更新到数据库 # 更新到数据库
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
...@@ -1888,6 +1911,7 @@ def hai_nan(): ...@@ -1888,6 +1911,7 @@ def hai_nan():
# print(title,href) # print(title,href)
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -1959,6 +1983,7 @@ def hai_nan(): ...@@ -1959,6 +1983,7 @@ def hai_nan():
# print(title,href) # print(title,href)
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -2007,7 +2032,7 @@ def hai_nan(): ...@@ -2007,7 +2032,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2015,7 +2040,7 @@ def hai_nan(): ...@@ -2015,7 +2040,7 @@ def hai_nan():
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
# print(f'附件:{fu_jian_href}') # print(f'附件:{fu_jian_href}')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -2065,6 +2090,7 @@ def hai_nan(): ...@@ -2065,6 +2090,7 @@ def hai_nan():
# print(title,href) # print(title,href)
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -2113,14 +2139,14 @@ def hai_nan(): ...@@ -2113,14 +2139,14 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1677') retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
print(f'----附件:{fu_jian_href}') print(f'----附件:{fu_jian_href}')
else: else:
pass pass
...@@ -2175,10 +2201,13 @@ def hai_nan(): ...@@ -2175,10 +2201,13 @@ def hai_nan():
try: try:
is_href = db_storage.find_one({'网址': i_href}) is_href = db_storage.find_one({'网址': i_href})
if is_href: if is_href:
num+=1
continue continue
if i_href == 'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm': if i_href == 'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm':
num+=1
continue continue
if i_href == 'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm': if i_href == 'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm':
num+=1
continue continue
# print(f'中央----{i_href}----') # print(f'中央----{i_href}----')
href_text = requests.get(url=i_href, headers=headers, verify=False) href_text = requests.get(url=i_href, headers=headers, verify=False)
...@@ -2330,6 +2359,7 @@ def hai_nan(): ...@@ -2330,6 +2359,7 @@ def hai_nan():
# 四川 # 四川
def si_chuan(): def si_chuan():
num = 0 num = 0
pathType = 'policy/sichuan/'
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
if page == 1: if page == 1:
...@@ -2349,9 +2379,10 @@ def si_chuan(): ...@@ -2349,9 +2379,10 @@ def si_chuan():
href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href') href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
print(href) # print(href)
href_text = requests.get(url=href, headers=headers, verify=False).text href_text = requests.get(url=href, headers=headers, verify=False).text
doc_href = pq(href_text) doc_href = pq(href_text)
title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '') title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '')
...@@ -2374,14 +2405,14 @@ def si_chuan(): ...@@ -2374,14 +2405,14 @@ def si_chuan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 对附件上传至文件服务器 # 对附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1678') retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
                            if retData['state']:                             if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
# fu_jian_href_list.append(fu_jian_href) # fu_jian_href_list.append(fu_jian_href)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -2423,6 +2454,7 @@ def si_chuan(): ...@@ -2423,6 +2454,7 @@ def si_chuan():
# 广西 # 广西
def guang_xi(): def guang_xi():
num = 0 num = 0
pathType = 'policy/guangxi/'
start_time = time.time() start_time = time.time()
url_all = """ url_all = """
http://gzw.gxzf.gov.cn/wjzx/2023nwj/ 1 http://gzw.gxzf.gov.cn/wjzx/2023nwj/ 1
...@@ -2463,6 +2495,7 @@ def guang_xi(): ...@@ -2463,6 +2495,7 @@ def guang_xi():
href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '') href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -2498,7 +2531,7 @@ def guang_xi(): ...@@ -2498,7 +2531,7 @@ def guang_xi():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1692') retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2507,7 +2540,7 @@ def guang_xi(): ...@@ -2507,7 +2540,7 @@ def guang_xi():
att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -2550,6 +2583,7 @@ def gui_zhou(): ...@@ -2550,6 +2583,7 @@ def gui_zhou():
http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ 11 http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ 11
http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/ 1 http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/ 1
""" """
pathType = 'policy/guizhou/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(0, 11): for page in range(0, 11):
...@@ -2566,6 +2600,7 @@ def gui_zhou(): ...@@ -2566,6 +2600,7 @@ def gui_zhou():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# print(href) # print(href)
...@@ -2606,7 +2641,7 @@ def gui_zhou(): ...@@ -2606,7 +2641,7 @@ def gui_zhou():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1694') retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2615,7 +2650,7 @@ def gui_zhou(): ...@@ -2615,7 +2650,7 @@ def gui_zhou():
att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
...@@ -2655,6 +2690,7 @@ def gui_zhou(): ...@@ -2655,6 +2690,7 @@ def gui_zhou():
# 云南 # 云南
def yun_nan(): def yun_nan():
pathType = 'policy/yunnan/'
def yun_nan1(): def yun_nan1():
""" """
http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml 9 http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml 9
...@@ -2679,6 +2715,7 @@ def yun_nan(): ...@@ -2679,6 +2715,7 @@ def yun_nan():
href = 'http://gzw.yn.gov.cn' + doc_item('a').attr('href') href = 'http://gzw.yn.gov.cn' + doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
fu_jian_href_list = [] fu_jian_href_list = []
...@@ -2710,7 +2747,7 @@ def yun_nan(): ...@@ -2710,7 +2747,7 @@ def yun_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try: try:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1679') retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2719,7 +2756,7 @@ def yun_nan(): ...@@ -2719,7 +2756,7 @@ def yun_nan():
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
href_resp.close() href_resp.close()
...@@ -2788,6 +2825,7 @@ def yun_nan(): ...@@ -2788,6 +2825,7 @@ def yun_nan():
href = 'http://gzw.yn.gov.cn' + li.find('a').get('href').replace(' ', '') href = 'http://gzw.yn.gov.cn' + li.find('a').get('href').replace(' ', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
print(href) print(href)
...@@ -2822,7 +2860,7 @@ def yun_nan(): ...@@ -2822,7 +2860,7 @@ def yun_nan():
print(fu_jian_href) print(fu_jian_href)
try: try:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1679') retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2831,7 +2869,7 @@ def yun_nan(): ...@@ -2831,7 +2869,7 @@ def yun_nan():
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
res_.close() res_.close()
...@@ -2890,6 +2928,7 @@ def chong_qing(): ...@@ -2890,6 +2928,7 @@ def chong_qing():
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2 http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
""" """
num = 0 num = 0
pathType = 'policy/chongqing/'
start_time = time.time() start_time = time.time()
for page in range(0, 4): for page in range(0, 4):
if page == 0: if page == 0:
...@@ -2913,6 +2952,7 @@ def chong_qing(): ...@@ -2913,6 +2952,7 @@ def chong_qing():
href = url.split('index')[0] + title_item('a').attr('href').replace('./', '') href = url.split('index')[0] + title_item('a').attr('href').replace('./', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
print(href) print(href)
...@@ -2960,7 +3000,7 @@ def chong_qing(): ...@@ -2960,7 +3000,7 @@ def chong_qing():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try: try:
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uploadToserver(fu_jian_href, '1693') retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -2969,7 +3009,7 @@ def chong_qing(): ...@@ -2969,7 +3009,7 @@ def chong_qing():
att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = 'http://114.115.215.96/' + full_path fu_jian['href'] = full_path
except: except:
continue continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -3011,6 +3051,7 @@ def chong_qing(): ...@@ -3011,6 +3051,7 @@ def chong_qing():
# 天津 # 天津
def tian_jin(): def tian_jin():
pathType = 'policy/tianjin/'
def tian_jin1(): def tian_jin1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -3038,6 +3079,7 @@ def tian_jin(): ...@@ -3038,6 +3079,7 @@ def tian_jin():
href = i_href href = i_href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8') # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
...@@ -3082,7 +3124,7 @@ def tian_jin(): ...@@ -3082,7 +3124,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1683') retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3090,7 +3132,7 @@ def tian_jin(): ...@@ -3090,7 +3132,7 @@ def tian_jin():
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3160,6 +3202,7 @@ def tian_jin(): ...@@ -3160,6 +3202,7 @@ def tian_jin():
href = url.split('index')[0] + href.replace('./', '') href = url.split('index')[0] + href.replace('./', '')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8') # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
...@@ -3205,7 +3248,7 @@ def tian_jin(): ...@@ -3205,7 +3248,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1683') retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3213,7 +3256,7 @@ def tian_jin(): ...@@ -3213,7 +3256,7 @@ def tian_jin():
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3284,6 +3327,7 @@ def tian_jin(): ...@@ -3284,6 +3327,7 @@ def tian_jin():
href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/') href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
...@@ -3332,7 +3376,7 @@ def tian_jin(): ...@@ -3332,7 +3376,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1683') retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3340,7 +3384,7 @@ def tian_jin(): ...@@ -3340,7 +3384,7 @@ def tian_jin():
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3388,6 +3432,7 @@ def tian_jin(): ...@@ -3388,6 +3432,7 @@ def tian_jin():
# 新疆 # 新疆
def xin_jiang(): def xin_jiang():
pathType = 'policy/xinjiang/'
def xin_jiang1(): def xin_jiang1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -3407,6 +3452,7 @@ def xin_jiang(): ...@@ -3407,6 +3452,7 @@ def xin_jiang():
continue continue
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
# href = 'http://gzw.xinjiang.gov.cn/gzw/zcwj/201909/559cf77b5a954d028bd187d6c6e46747.shtml' # href = 'http://gzw.xinjiang.gov.cn/gzw/zcwj/201909/559cf77b5a954d028bd187d6c6e46747.shtml'
try: try:
...@@ -3432,7 +3478,7 @@ def xin_jiang(): ...@@ -3432,7 +3478,7 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1682') retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3440,7 +3486,7 @@ def xin_jiang(): ...@@ -3440,7 +3486,7 @@ def xin_jiang():
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3509,6 +3555,7 @@ def xin_jiang(): ...@@ -3509,6 +3555,7 @@ def xin_jiang():
href = 'http://gyzc.xjbt.gov.cn' + href href = 'http://gyzc.xjbt.gov.cn' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
...@@ -3530,7 +3577,7 @@ def xin_jiang(): ...@@ -3530,7 +3577,7 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1682') retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3538,7 +3585,7 @@ def xin_jiang(): ...@@ -3538,7 +3585,7 @@ def xin_jiang():
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3594,6 +3641,7 @@ def xin_jiang(): ...@@ -3594,6 +3641,7 @@ def xin_jiang():
# 山西 # 山西
def shan_xi(): def shan_xi():
pathType = 'policy/shanxi/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(1, 7): for page in range(1, 7):
...@@ -3618,6 +3666,7 @@ def shan_xi(): ...@@ -3618,6 +3666,7 @@ def shan_xi():
publishDate = tr.xpath('./td[2]/span/text()')[0] publishDate = tr.xpath('./td[2]/span/text()')[0]
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
if ".pdf" in href: if ".pdf" in href:
...@@ -3648,7 +3697,7 @@ def shan_xi(): ...@@ -3648,7 +3697,7 @@ def shan_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1684') retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3656,7 +3705,7 @@ def shan_xi(): ...@@ -3656,7 +3705,7 @@ def shan_xi():
att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3707,6 +3756,7 @@ def shan_xi(): ...@@ -3707,6 +3756,7 @@ def shan_xi():
# 辽宁 # 辽宁
def liao_ning(): def liao_ning():
pathType = 'policy/liaoning/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
...@@ -3727,6 +3777,7 @@ def liao_ning(): ...@@ -3727,6 +3777,7 @@ def liao_ning():
href = 'https://gzw.ln.gov.cn/' + href href = 'https://gzw.ln.gov.cn/' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -3758,7 +3809,7 @@ def liao_ning(): ...@@ -3758,7 +3809,7 @@ def liao_ning():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1685') retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3766,7 +3817,7 @@ def liao_ning(): ...@@ -3766,7 +3817,7 @@ def liao_ning():
att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
...@@ -3816,6 +3867,7 @@ def liao_ning(): ...@@ -3816,6 +3867,7 @@ def liao_ning():
# 黑龙江 # 黑龙江
def hei_long_jiang(): def hei_long_jiang():
pathType = 'policy/heilongjiang/'
num = 0 num = 0
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
...@@ -3837,6 +3889,7 @@ def hei_long_jiang(): ...@@ -3837,6 +3889,7 @@ def hei_long_jiang():
pub_hao = '' pub_hao = ''
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
contentWithTag = text['data']['results'][row]['contentHtml'] contentWithTag = text['data']['results'][row]['contentHtml']
...@@ -3861,7 +3914,7 @@ def hei_long_jiang(): ...@@ -3861,7 +3914,7 @@ def hei_long_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1687') retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3869,7 +3922,7 @@ def hei_long_jiang(): ...@@ -3869,7 +3922,7 @@ def hei_long_jiang():
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -3912,6 +3965,7 @@ def hei_long_jiang(): ...@@ -3912,6 +3965,7 @@ def hei_long_jiang():
# 江苏 # 江苏
def jiang_su(): def jiang_su():
num = 0 num = 0
pathType = 'policy/jiangsu/'
start_time = time.time() start_time = time.time()
pagestart = 1 pagestart = 1
pageend = 45 pageend = 45
...@@ -3940,6 +3994,7 @@ def jiang_su(): ...@@ -3940,6 +3994,7 @@ def jiang_su():
title = a.text title = a.text
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -3967,7 +4022,7 @@ def jiang_su(): ...@@ -3967,7 +4022,7 @@ def jiang_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1687') retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -3975,7 +4030,7 @@ def jiang_su(): ...@@ -3975,7 +4030,7 @@ def jiang_su():
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4022,6 +4077,7 @@ def jiang_su(): ...@@ -4022,6 +4077,7 @@ def jiang_su():
# 安徽 # 安徽
def an_hui(): def an_hui():
pathType = 'policy/anhui/'
def an_hui1(): def an_hui1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -4037,6 +4093,7 @@ def an_hui(): ...@@ -4037,6 +4093,7 @@ def an_hui():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_text = requests.get(url=href, headers=headers, verify=False) href_text = requests.get(url=href, headers=headers, verify=False)
...@@ -4068,7 +4125,7 @@ def an_hui(): ...@@ -4068,7 +4125,7 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1688') retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4076,7 +4133,7 @@ def an_hui(): ...@@ -4076,7 +4133,7 @@ def an_hui():
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4164,7 +4221,7 @@ def an_hui(): ...@@ -4164,7 +4221,7 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1688') retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4172,7 +4229,7 @@ def an_hui(): ...@@ -4172,7 +4229,7 @@ def an_hui():
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4223,6 +4280,7 @@ def jiang_xi(): ...@@ -4223,6 +4280,7 @@ def jiang_xi():
121-164 121-164
""" """
num = 0 num = 0
pathType = 'policy/jiangxi/'
start_time = time.time() start_time = time.time()
startrecord = 1 startrecord = 1
endrecord = 60 endrecord = 60
...@@ -4248,6 +4306,7 @@ def jiang_xi(): ...@@ -4248,6 +4306,7 @@ def jiang_xi():
for href in href_list: for href in href_list:
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
...@@ -4289,7 +4348,7 @@ def jiang_xi(): ...@@ -4289,7 +4348,7 @@ def jiang_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1689') retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4297,7 +4356,7 @@ def jiang_xi(): ...@@ -4297,7 +4356,7 @@ def jiang_xi():
att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4346,6 +4405,7 @@ def jiang_xi(): ...@@ -4346,6 +4405,7 @@ def jiang_xi():
# 河南 # 河南
def he_nan(): def he_nan():
num = 0 num = 0
pathType = 'policy/henan/'
start_time = time.time() start_time = time.time()
for page in range(0, 7): for page in range(0, 7):
if page == 0: if page == 0:
...@@ -4361,6 +4421,7 @@ def he_nan(): ...@@ -4361,6 +4421,7 @@ def he_nan():
href = doc_item('a').attr('href') href = doc_item('a').attr('href')
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
href_res.encoding = href_res.apparent_encoding href_res.encoding = href_res.apparent_encoding
...@@ -4383,7 +4444,7 @@ def he_nan(): ...@@ -4383,7 +4444,7 @@ def he_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1690') retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4391,7 +4452,7 @@ def he_nan(): ...@@ -4391,7 +4452,7 @@ def he_nan():
att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4438,6 +4499,7 @@ def he_nan(): ...@@ -4438,6 +4499,7 @@ def he_nan():
# 湖南 # 湖南
def hu_nan(): def hu_nan():
num = 0 num = 0
pathType = 'policy/hunan/'
start_time = time.time() start_time = time.time()
for page in range(1, 7): for page in range(1, 7):
if page == 1: if page == 1:
...@@ -4454,6 +4516,7 @@ def hu_nan(): ...@@ -4454,6 +4516,7 @@ def hu_nan():
publishDate = doc_item('td:nth-child(3)').text() publishDate = doc_item('td:nth-child(3)').text()
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
# href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html' # href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html'
try: try:
...@@ -4490,7 +4553,7 @@ def hu_nan(): ...@@ -4490,7 +4553,7 @@ def hu_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1691') retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4498,7 +4561,7 @@ def hu_nan(): ...@@ -4498,7 +4561,7 @@ def hu_nan():
att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4538,6 +4601,7 @@ def hu_nan(): ...@@ -4538,6 +4601,7 @@ def hu_nan():
# 甘肃 # 甘肃
def gan_su(): def gan_su():
pathType = 'policy/gansu/'
def gan_su1(): def gan_su1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -4581,6 +4645,7 @@ def gan_su(): ...@@ -4581,6 +4645,7 @@ def gan_su():
publishDate = dd['publishDate'] publishDate = dd['publishDate']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
for i in range(0, 4): for i in range(0, 4):
bro.get(href) bro.get(href)
...@@ -4609,7 +4674,7 @@ def gan_su(): ...@@ -4609,7 +4674,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1696') retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4617,7 +4682,7 @@ def gan_su(): ...@@ -4617,7 +4682,7 @@ def gan_su():
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4688,6 +4753,7 @@ def gan_su(): ...@@ -4688,6 +4753,7 @@ def gan_su():
publishDate = dd['publishDate'] publishDate = dd['publishDate']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
bro.get(href) bro.get(href)
try: try:
...@@ -4743,7 +4809,7 @@ def gan_su(): ...@@ -4743,7 +4809,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1696') retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4751,7 +4817,7 @@ def gan_su(): ...@@ -4751,7 +4817,7 @@ def gan_su():
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4849,6 +4915,7 @@ def gan_su(): ...@@ -4849,6 +4915,7 @@ def gan_su():
publishDate = dd['publishDate'] publishDate = dd['publishDate']
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
bro.get(href) bro.get(href)
...@@ -4900,7 +4967,7 @@ def gan_su(): ...@@ -4900,7 +4967,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1696') retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -4908,7 +4975,7 @@ def gan_su(): ...@@ -4908,7 +4975,7 @@ def gan_su():
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -4958,6 +5025,7 @@ def gan_su(): ...@@ -4958,6 +5025,7 @@ def gan_su():
# 宁夏 # 宁夏
def ning_xia(): def ning_xia():
num = 0 num = 0
pathType = 'policy/ningxia/'
start_time = time.time() start_time = time.time()
for page in range(0, 3): for page in range(0, 3):
if page == 0: if page == 0:
...@@ -4976,6 +5044,7 @@ def ning_xia(): ...@@ -4976,6 +5044,7 @@ def ning_xia():
publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
href_res = requests.get(url=href, headers=headers, verify=False) href_res = requests.get(url=href, headers=headers, verify=False)
...@@ -5001,7 +5070,7 @@ def ning_xia(): ...@@ -5001,7 +5070,7 @@ def ning_xia():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1697') retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5009,7 +5078,7 @@ def ning_xia(): ...@@ -5009,7 +5078,7 @@ def ning_xia():
att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -5052,6 +5121,7 @@ def ning_xia(): ...@@ -5052,6 +5121,7 @@ def ning_xia():
# 陕西 # 陕西
def shanxi(): def shanxi():
num = 0 num = 0
pathType = 'policy/shan_xi/'
start_time = time.time() start_time = time.time()
url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127' url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
# url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127' # url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
...@@ -5072,6 +5142,7 @@ def shanxi(): ...@@ -5072,6 +5142,7 @@ def shanxi():
href = 'https://sxgz.shaanxi.gov.cn/' + href href = 'https://sxgz.shaanxi.gov.cn/' + href
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
res_href = requests.get(url=href, headers=headers) res_href = requests.get(url=href, headers=headers)
...@@ -5101,7 +5172,7 @@ def shanxi(): ...@@ -5101,7 +5172,7 @@ def shanxi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1680') retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5109,7 +5180,7 @@ def shanxi(): ...@@ -5109,7 +5180,7 @@ def shanxi():
att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
...@@ -5152,6 +5223,7 @@ def shanxi(): ...@@ -5152,6 +5223,7 @@ def shanxi():
# 西藏 # 西藏
def xi_zang(): def xi_zang():
start_time = time.time() start_time = time.time()
pathType = 'policy/xizang/'
url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml', url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml',
'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ] 'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
for url in url_list: for url in url_list:
...@@ -5169,6 +5241,7 @@ def xi_zang(): ...@@ -5169,6 +5241,7 @@ def xi_zang():
title = li.find('a').text title = li.find('a').text
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
res_href = requests.get(url=href, headers=headers) res_href = requests.get(url=href, headers=headers)
...@@ -5194,7 +5267,7 @@ def xi_zang(): ...@@ -5194,7 +5267,7 @@ def xi_zang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1695') retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5202,7 +5275,7 @@ def xi_zang(): ...@@ -5202,7 +5275,7 @@ def xi_zang():
att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器 # todo:替换完成之后,将附件上传至文件服务器
...@@ -5242,6 +5315,7 @@ def xi_zang(): ...@@ -5242,6 +5315,7 @@ def xi_zang():
# 青海 # 青海
def qing_hai(): def qing_hai():
pathType = 'policy/qinghai/'
def qing_hai1(): def qing_hai1():
num = 0 num = 0
start_time = time.time() start_time = time.time()
...@@ -5259,6 +5333,7 @@ def qing_hai(): ...@@ -5259,6 +5333,7 @@ def qing_hai():
durl = tr.find('a').get('href') durl = tr.find('a').get('href')
is_href = db_storage.find_one({'网址': durl}) is_href = db_storage.find_one({'网址': durl})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
title = tr.find('a').text title = tr.find('a').text
...@@ -5297,7 +5372,7 @@ def qing_hai(): ...@@ -5297,7 +5372,7 @@ def qing_hai():
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器 # todo:替换完成之后,将附件上传至文件服务器
...@@ -5659,41 +5734,41 @@ def hu_bei(): ...@@ -5659,41 +5734,41 @@ def hu_bei():
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__': if __name__ == '__main__':
get_content1() # get_content1()
get_content2() # get_content2()
get_content3() # get_content3()
bei_jing() # bei_jing()
nei_meng_gu() # nei_meng_gu()
ji_lin() ji_lin()
shang_hai() # shang_hai()
zhe_jiang() # zhe_jiang()
fu_jian() # fu_jian()
shan_dong() # shan_dong()
guang_dong() # guang_dong()
hai_nan() # hai_nan()
si_chuan() # si_chuan()
guang_xi() # guang_xi()
gui_zhou() # gui_zhou()
yun_nan() # yun_nan()
chong_qing() # chong_qing()
tian_jin() # tian_jin()
xin_jiang() # xin_jiang()
shan_xi() # shan_xi()
liao_ning() # liao_ning()
hei_long_jiang() # hei_long_jiang()
jiang_su() # jiang_su()
an_hui() # an_hui()
jiang_xi() # jiang_xi()
he_nan() # he_nan()
hu_nan() # hu_nan()
gan_su() # gan_su()
ning_xia() # ning_xia()
xi_zang() # xi_zang()
shanxi() # shanxi()
qing_hai() # qing_hai()
he_bei() # he_bei()
qing_hai() # qing_hai()
current_time = datetime.datetime.now() # current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1) # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds() # sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds) # time.sleep(sleep_seconds)
"""
新浪财经国内企业动态
"""
import json
import re
import time
import jieba
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from base.smart import smart_extractor
from base.BaseCore import BaseCore
# Initialisation. The original note says jieba must be loaded up front for
# Chinese word segmentation.
# NOTE(review): jieba.cut appears to return a lazy generator that is never
# consumed here - confirm this call really triggers the dictionary load.
jieba.cut("必须加载jieba")
# Article extractor created with the 'cn' language code; used by getDic.
smart = smart_extractor.SmartExtractor('cn')
# Shared project helper. The logger, database connection/cursor and `r`
# below are all taken from this single instance.
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx        # committed by insertMysql
cursor = baseCore.cursor  # executes the SQL in insertMysql / selectUrl
r = baseCore.r            # doJob calls r.rpush(); presumably a Redis client - confirm
# HTTP headers sent with every request made by getrequests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}
# Task label passed to baseCore.recordLog with every log record.
taskType = '企业动态/新浪财经'
# Matches "YYYY-MM-DD HH:MM" timestamps; doJob applies it to the news list markup.
pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
@retry(tries=3, delay=1)
def getrequests(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Whatever ``baseCore.get_proxy()`` returns is passed to ``requests.get``
    as the ``proxies`` argument, together with the module-level ``headers``.
    The response encoding is replaced by ``apparent_encoding`` before the
    text is handed to the ``html.parser`` backend.  The call is wrapped by
    the ``@retry(tries=3, delay=1)`` decorator.
    """
    response = requests.get(url, headers=headers, proxies=baseCore.get_proxy())
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')
def getDic(social_code, title, href, pub_time):
    """Extract one news article, publish it to Kafka and record it in MySQL.

    Args:
        social_code: social credit code of the company the article belongs to.
        title: article title taken from the list page.
        href: article link; a link without ``http`` in it is prefixed with
            ``https://finance.sina.com.cn``.
        pub_time: publish time string; its first four characters are used as
            the ``year`` field.

    Returns:
        1 when the article was parsed and sent to Kafka (even if the
        follow-up MySQL insert failed - that failure is only logged),
        0 when parsing failed, the content was empty, or the Kafka send
        failed.
    """
    start_time = time.time()
    if 'http' not in href:
        href = 'https://finance.sina.com.cn' + href
    # The extractor is pointed at the plain-http variant of the link.
    href_ = href.replace('https', 'http')

    def _record_failure(reason):
        # Log the failure and write a state=0 record for this article.
        message = f'{href}==={reason}'
        log.error(message)
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, 0, takeTime, href, message)

    try:
        # Fetch and parse the page once; both representations come from the
        # same extraction result instead of downloading the article twice.
        article = smart.extract_by_url(href_)
        contentText = article.text          # body with markup
        content = article.cleaned_text      # body without markup
    except Exception:
        _record_failure('页面解析失败')
        return 0
    if content == '':
        _record_failure('页面解析失败')
        return 0

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
        'attachmentIds': '',
        'author': '',
        'content': content,
        'contentWithTag': contentText,
        'createDate': time_now,
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': '新浪财经',
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': href,  # link to the original article
        'summary': '',
        'title': title,
        'type': 2,
        'socialCreditCode': social_code,
        'year': pub_time[:4]
    }
    # print(dic_news)
    try:
        sendKafka(dic_news, start_time)
    except Exception:
        # A failed send is reported as a failure so the caller's
        # success/error counters match the recorded log state.
        _record_failure('发送Kafka失败')
        return 0
    log.info(f'Kafka发送成功')
    try:
        # The row is only used for de-duplication on later runs.
        insertMysql(social_code, href)
        log.info(f'数据库保存成功')
    except Exception:
        _record_failure('数据入库失败')
    return 1
@retry(tries=3, delay=1)
def sendKafka(dic_news, start_time):
    """Send one article dict to the ``researchReportTopic`` Kafka topic.

    Args:
        dic_news: article dict; serialised as UTF-8 JSON. Its
            ``socialCreditCode`` and ``sourceAddress`` entries are also used
            for the success log record.
        start_time: ``time.time()`` value from which the elapsed time written
            to the log record is computed.

    Raises:
        Whatever the Kafka client raises when the send is not acknowledged
        within 10 seconds; the ``@retry`` decorator re-runs the whole
        function before the error reaches the caller.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("researchReportTopic",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        # Block until the broker acknowledges the record (or 10 s pass).
        print(kafka_result.get(timeout=10))
    finally:
        # A producer is created for every call and every retry; close it so
        # its connections are released instead of piling up.
        producer.close(timeout=10)
    dic_result = {
        'success': 'ture',
        'message': '操作成功',
        'code': '200',
    }
    log.info(dic_result)
    # Send succeeded: write a state=1 record to the log table.
    state = 1
    takeTime = baseCore.getTimeCost(start_time, time.time())
    baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
    """Store a collected article link so later runs can detect duplicates.

    Inserts one row into ``brpa_source_article`` with the given social
    credit code and link, origin ``新浪财经`` and type ``'2'``, then commits.
    """
    statement = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
    # Values for the four placeholders, in column order.
    row = (social_code, link, '新浪财经', '2')
    cursor.execute(statement, row)
    cnx.commit()
@retry(tries=3, delay=1)
def selectUrl(url, social_code):
    """Look up whether *url* was already collected for *social_code*.

    Returns the first matching ``brpa_source_article`` row of type ``'2'``
    as given by ``cursor.fetchone()``.
    """
    query = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
    cursor.execute(query, (url, social_code))
    return cursor.fetchone()
def doJob():
    """Collect Sina Finance news for every company code pulled from Redis.

    Runs forever. Each round pulls one social credit code from the
    ``NewsEnterprise:gnqy_nyse_socialCode`` queue, looks the company up,
    builds the exchange-prefixed stock symbol (bj/sh/sz) and walks the
    company's news list, handing every not-yet-collected article to
    ``getDic``. A summary line with success/failure counts is logged per
    company.
    """
    # Exchange id (data[10]) -> symbol prefix expected by the list URL.
    exchange_prefix = {1: 'bj', 2: 'sh', 3: 'sz'}
    while True:
        start_time = time.time()
        social_code = baseCore.redicPullData('NewsEnterprise:gnqy_nyse_socialCode')
        # social_code = '914403007261824992'
        if not social_code or social_code == 'None':
            print(f'============已没有数据============等待===============')
            time.sleep(1800)
            # Nothing was pulled: poll the queue again rather than looking
            # up company data for an empty code.
            continue
        data = baseCore.getInfomation(social_code)
        gpdm = data[3]
        log.info(f'{social_code}==={gpdm}===开始采集')
        exchange = data[10]
        if not gpdm:
            log.error(f'{social_code}===股票代码为空')
            continue
        # The stock code is prefixed according to the exchange it trades on.
        prefix = exchange_prefix.get(exchange)
        if prefix is None:
            log.info(f'{social_code}==={gpdm}===不在北京、上海、深圳交易所')
            continue
        gpdm_ = prefix + gpdm
        page = 1
        num_ok = 0
        num_error = 0
        while True:
            url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={gpdm_}&Page={page}'
            soup = getrequests(url)
            if '拒绝访问' in soup.text:
                log.error(f'{social_code}===ip封禁')
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
                # Put the code back on the queue so it is retried after the
                # cool-down.
                r.rpush('NewsEnterprise:gnqy_nyse_socialCode', social_code)
                time.sleep(1800)
                break
            try:
                ul = soup.find('div', class_='datelist').find('ul')
                a_list = ul.find_all('a')
                # Publish times are scraped from the same <ul>; entry i is
                # paired with link i.
                time_list = re.findall(pattern, str(ul))
                for i, a in enumerate(a_list):
                    try:
                        title = a.text.strip()
                        if title == '':
                            continue
                        href = a.get('href')
                        # Normalise before the duplicate check: the stored
                        # address is the absolute link, so a relative href
                        # would never match.
                        if 'http' not in href:
                            href = 'https://finance.sina.com.cn' + href
                        selects = selectUrl(href, social_code)
                        if selects:
                            log.info(f'{href}===已采集')
                            continue
                        pub_time = time_list[i].replace('\xa0', ' ') + ":00"
                        flg = getDic(social_code, title, href, pub_time)
                        if flg == 0:
                            num_error += 1
                        else:
                            num_ok += 1
                        time.sleep(0.5)
                    except Exception as e:
                        ee = e.__traceback__.tb_lineno
                        log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
                        state = 0
                        takeTime = baseCore.getTimeCost(start_time, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
                        # Stop processing this page after the first failure.
                        break
            except Exception:
                log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
            try:
                next_flg = soup.select('#con02-7 > table > tr')[1].select('div')[2].text
            except IndexError:
                # Pagination markup missing (typically the same page whose
                # list could not be parsed): treat it as the last page
                # instead of crashing the whole job.
                log.error(f"{social_code}==={gpdm}===第{page}页未找到翻页信息")
                break
            if '下一页' not in next_flg:
                break
            page += 1
            # NOTE(review): the original ends the paging loop here
            # unconditionally, so only the first page is collected (looks
            # like an incremental-run shortcut). The source's indentation
            # was lost - confirm this placement before removing it.
            break
        log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
if __name__ == "__main__":
    try:
        doJob()
    finally:
        # BaseCore asks callers to close() it before the program exits so
        # its resources are released; do so even when doJob() raises or the
        # process is interrupted.
        baseCore.close()
"""
新浪财经香港企业动态
"""
from datetime import datetime
import json
import re
import time
import jieba
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from base.smart import smart_extractor
from base.BaseCore import BaseCore
# Initialisation. The original note says jieba must be loaded up front for
# Chinese word segmentation.
# NOTE(review): jieba.cut appears to return a lazy generator that is never
# consumed here - confirm this call really triggers the dictionary load.
jieba.cut("必须加载jieba")
# Article extractor created with the 'cn' language code; used by getDic.
smart = smart_extractor.SmartExtractor('cn')
# Shared project helper. The logger, database connection/cursor and `r`
# below are all taken from this single instance.
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx        # committed by insertMysql
cursor = baseCore.cursor  # executes the SQL in insertMysql / selectUrl
r = baseCore.r            # only referenced by a commented-out rpush() in doJob
# HTTP headers sent with every request made by getrequests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}
# Task label passed to baseCore.recordLog with every log record.
taskType = '企业动态/新浪财经'
def format_time(time_str):
    """Normalise a timestamp string to ``YYYY-MM-DD HH:MM:SS``.

    Accepts a full timestamp (zero-padded or not), a timestamp without
    seconds, or a bare date; missing time parts become zero. Surrounding
    whitespace is ignored.

    Args:
        time_str: the timestamp text to normalise.

    Returns:
        The timestamp formatted as ``%Y-%m-%d %H:%M:%S``.

    Raises:
        ValueError: if the text matches none of the supported layouts.
    """
    time_str = time_str.strip()
    # The previous fallback re-parsed with the very same format, so it could
    # never succeed where the first attempt had failed; try progressively
    # shorter layouts instead.
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d"):
        try:
            parsed = datetime.strptime(time_str, fmt)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    raise ValueError(f"time data {time_str!r} does not match any supported format")
@retry(tries=3, delay=1)
def getrequests(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Whatever ``baseCore.get_proxy()`` returns is passed to ``requests.get``
    as the ``proxies`` argument, together with the module-level ``headers``.
    The response encoding is replaced by ``apparent_encoding`` before the
    text is handed to the ``html.parser`` backend.  The call is wrapped by
    the ``@retry(tries=3, delay=1)`` decorator.
    """
    proxy = baseCore.get_proxy()
    page = requests.get(url, headers=headers, proxies=proxy)
    page.encoding = page.apparent_encoding
    return BeautifulSoup(page.text, 'html.parser')
def getDic(social_code, title, href, pub_time):
    """Extract one news article and build its record.

    NOTE: the publish step (sendKafka followed by insertMysql) is commented
    out in this revision, so a successfully parsed article currently falls
    off the end of the function and yields ``None`` rather than 1.

    Args:
        social_code: social credit code of the company the article belongs to.
        title: article title taken from the list page.
        href: article link; a link without ``http`` in it is prefixed with
            ``https://finance.sina.com.cn``.
        pub_time: publish time string; its first four characters are used as
            the ``year`` field.

    Returns:
        0 when the page could not be parsed or its content is empty,
        otherwise ``None`` (see the note above).
    """
    start_time = time.time()
    if 'http' not in href:
        href = 'https://finance.sina.com.cn' + href
    # The extractor is pointed at the plain-http variant of the link.
    href_ = href.replace('https', 'http')

    def _record_failure(reason):
        # Log the failure and write a state=0 record for this article.
        message = f'{href}==={reason}'
        log.error(message)
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, 0, takeTime, href, message)

    try:
        # Fetch and parse the page once; both representations come from the
        # same extraction result instead of downloading the article twice.
        article = smart.extract_by_url(href_)
        contentText = article.text          # body with markup
        content = article.cleaned_text      # body without markup
    except Exception:
        _record_failure('页面解析失败')
        return 0
    if content == '':
        _record_failure('页面解析失败')
        return 0

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
        'attachmentIds': '',
        'author': '',
        'content': content,
        'contentWithTag': contentText,
        'createDate': time_now,
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': '新浪财经',
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': href,  # link to the original article
        'summary': '',
        'title': title,
        'type': 2,
        'socialCreditCode': social_code,
        'year': pub_time[:4]
    }
    # print(dic_news)
    # Disabled publish step, kept verbatim for re-enabling:
    # try:
    #     sendKafka(dic_news, start_time)
    #     log.info(f'Kafka发送成功')
    #     try:
    #         insertMysql(social_code, href)
    #         log.info(f'数据库保存成功')
    #     except:
    #         log.error(f'{href}===数据入库失败')
    #         state = 0
    #         takeTime = baseCore.getTimeCost(start_time, time.time())
    #         baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===数据入库失败')
    # except:
    #     log.error(f'{href}===发送Kafka失败')
    #     state = 0
    #     takeTime = baseCore.getTimeCost(start_time, time.time())
    #     baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
    # return 1
@retry(tries=3, delay=1)
def sendKafka(dic_news, start_time):
    """Send one article dict to the ``researchReportTopic`` Kafka topic.

    Args:
        dic_news: article dict; serialised as UTF-8 JSON. Its
            ``socialCreditCode`` and ``sourceAddress`` entries are also used
            for the success log record.
        start_time: ``time.time()`` value from which the elapsed time written
            to the log record is computed.

    Raises:
        Whatever the Kafka client raises when the send is not acknowledged
        within 10 seconds; the ``@retry`` decorator re-runs the whole
        function before the error reaches the caller.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("researchReportTopic",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        # Block until the broker acknowledges the record (or 10 s pass).
        print(kafka_result.get(timeout=10))
    finally:
        # A producer is created for every call and every retry; close it so
        # its connections are released instead of piling up.
        producer.close(timeout=10)
    dic_result = {
        'success': 'ture',
        'message': '操作成功',
        'code': '200',
    }
    log.info(dic_result)
    # Send succeeded: write a state=1 record to the log table.
    state = 1
    takeTime = baseCore.getTimeCost(start_time, time.time())
    baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
    """Store a collected article link so later runs can detect duplicates.

    Inserts one row into ``brpa_source_article`` with the given social
    credit code and link, origin ``新浪财经`` and type ``'2'``, then commits.
    """
    statement = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
    # Values for the four placeholders, in column order.
    values = (social_code, link, '新浪财经', '2')
    cursor.execute(statement, values)
    cnx.commit()
@retry(tries=3, delay=1)
def selectUrl(url, social_code):
    """Look up whether *url* was already collected for *social_code*.

    Returns the first matching ``brpa_source_article`` row of type ``'2'``
    as given by ``cursor.fetchone()``.
    """
    lookup = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
    cursor.execute(lookup, (url, social_code))
    return cursor.fetchone()
def doJob():
    """Collect Sina Finance Hong Kong company news for one hard-coded company.

    This revision is in a single-run state: the outer ``while True`` loop and
    the Redis pull are commented out and the social credit code is fixed.
    The stock code is reduced to the part before the first dot and
    left-padded with zeros to five characters, then the company news pages
    are walked until a page reports ``暂无数据`` or access is refused.
    Every not-yet-collected article is handed to ``getDic``; a summary line
    with the counters is logged at the end.
    """
    # while True:
    start_time = time.time()
    # social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
    social_code = '91330000747735638J'
    if social_code == 'None' or not social_code:
        time.sleep(20)
    company = baseCore.getInfomation(social_code)
    gpdm = company[3]
    log.info(f'{social_code}==={gpdm}===开始采集')
    # if gpdm == '' or not gpdm:
    #     log.error(f'{social_code}===股票代码为空')
    #     continue
    gpdm_ = gpdm.split('.')[0]
    if len(gpdm_) != 5:
        gpdm_ = gpdm_.zfill(5)
    num_ok = 0
    num_error = 0
    page = 1
    while True:
        url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
        soup = getrequests(url)
        if '拒绝访问' in soup.text:
            log.error(f'{social_code}===ip封禁')
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, 0, takeTime, url, f'{social_code}===ip封禁')
            # r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
            time.sleep(1800)
            break
        # The page body announces when there are no (more) entries.
        if '暂无数据' in soup.find('div', class_='part02').text:
            break
        try:
            for li in soup.find('ul', class_='list01').find_all('li'):
                try:
                    a = li.find('a')
                    if not a:
                        continue
                    title = a.text
                    if title == '':
                        continue
                    href = a.get('href')
                    selects = selectUrl(href, social_code)
                    if selects:
                        log.info(f'{href}===已采集过')
                        continue
                    pub_time = format_time(li.find('span').text)
                    print(title)
                    if getDic(social_code, title, href, pub_time) == 1:
                        num_ok += 1
                    else:
                        num_error += 1
                    time.sleep(0.5)
                except Exception as e:
                    # One bad entry is logged and skipped; the rest of the
                    # page is still processed.
                    ee = e.__traceback__.tb_lineno
                    log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, 0, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
                # For incremental runs:
                # if selects:
                #     break
        except:
            log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, 0, takeTime, url, f'获取信息列表失败')
        page += 1
    log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
if __name__ == "__main__":
    try:
        doJob()
    finally:
        # Release BaseCore's resources even when doJob() raises; previously
        # close() was skipped on any uncaught error.
        baseCore.close()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论