Commit a5dbff9a  Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

@@ -4,9 +4,12 @@ import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
@@ -20,16 +23,21 @@ from pymysql import connections
from DBUtils.PooledDB import PooledDB
import pymysql
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
# Note: call BaseCore.close() before the program exits to release the related resources
class BaseCore:
# Serial number
__seq = 0
# Proxy pool database connection
-__cnx_proxy =None
+# __cnx_proxy =None
-__cursor_proxy = None
+# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# User-Agent pool
__USER_AGENT_LIST = [
@@ -228,13 +236,18 @@ class BaseCore:
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
-self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
+# self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
-charset='utf8mb4')
+# charset='utf8mb4')
-self.__cursor_proxy = self.__cnx_proxy.cursor()
+# self.__cursor_proxy = self.__cnx_proxy.cursor()
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
# the "11" database (host 114.116.44.11)
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# Connect to Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
@@ -246,7 +259,7 @@ class BaseCore:
blocking=True,
host='114.115.159.144',
port=3306,
-user='root',
+user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
@@ -254,8 +267,6 @@ class BaseCore:
def close(self):
try:
-self.__cursor_proxy.close()
-self.__cnx_proxy.close()
self.cursor.close()
self.cnx.close()
except :
@@ -345,8 +356,8 @@ class BaseCore:
# Get proxies
def get_proxy(self):
sql = "select proxy from clb_proxy"
-self.__cursor_proxy.execute(sql)
+self.cursor.execute(sql)
-proxy_lists = self.__cursor_proxy.fetchall()
+proxy_lists = self.cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
@@ -369,7 +380,7 @@ class BaseCore:
if beginStr=='':
pass
else:
-begin=str.find(beginStr)
+begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
@@ -425,11 +436,18 @@ class BaseCore:
IP = socket.gethostbyname(socket.gethostname())
return IP
def mkPath(self,path):
folder = os.path.exists(path)
if not folder:  # check whether the directory exists; create it if it does not
os.makedirs(path)  # makedirs also creates any missing intermediate directories
else:
pass
# Build a simulated Google Chrome browser; the required argument is the chromedriver path
# headless decides whether the browser runs headless; the default is headless
# a visible browser helps when first working out how to parse a page, and some sites cannot be crawled headless
# a headless browser keeps windows from constantly popping up during later crawling
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
@@ -442,7 +460,7 @@ class BaseCore:
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
-driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
+driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
@@ -468,6 +486,7 @@ class BaseCore:
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# Update the company's crawl count
@@ -520,6 +539,13 @@ class BaseCore:
token = self.cursor.fetchone()[0]
return token
# Get the Tianyancha (天眼查) token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
# Detect language
def detect_language(self, text):
# Use langid.py to determine the language of the text
@@ -565,6 +591,91 @@ class BaseCore:
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
# Upload to the file server, and parse the PDF's content and page count
def upLoadToServe(self,pdf_url,type_id,social_code):
headers = {}
retData = {'state':False,'type_id':type_id,'item_id':social_code,'group_name':'group1','path':'','full_path':'',
'category':'pdf','file_size':'','status':1,'create_by':'XueLingKun',
'create_time':'','page_size':'','content':''}
headers['User-Agent'] = self.getRandomUserAgent()
for i in range(0, 3):
try:
resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
page_size = 0
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
with fitz.open(stream=resp_content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
except:
time.sleep(3)
continue
if page_size < 1:
# PDF parsing failed
print(f'======pdf解析失败=====')
return retData
else:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
retData['page_size'] = page_size
return retData
def secrchATT(self,item_id,year,type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
# Insert into the attachment (att) table and return the attachment id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,year,type_id)
id = selects[0]
return id
......
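For orientation: the new upLoadToServe and tableUpdate methods are intended to be used as a pair, fetching a PDF, pushing its bytes to FastDFS, then recording the attachment row. A minimal usage sketch; the URL, social code and names below are purely hypothetical placeholders:

baseCore = BaseCore()
retData = baseCore.upLoadToServe('http://example.com/report.pdf', type_id=1, social_code='91110000EXAMPLE000X')
if retData['state']:
    att_id = baseCore.tableUpdate(retData, com_name='Example Co.', year='2022', pdf_name='2022 Annual Report', num=1)
baseCore.close()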
@@ -301,9 +301,9 @@ def BaseInfoAbroad_task():
def FBS():
cnx,cursor = connectSql()
# TODO: switch to fetching from the Forbes database
-gw_query = "select a.SocialCode from EnterpriseInfo a,EnterpriseType b where a.SocialCode=b.SocialCode and b.type=3 and a.Place=2"
+# gw_query = "select a.SocialCode from EnterpriseInfo a,EnterpriseType b where a.SocialCode=b.SocialCode and b.type=3 and a.Place=2"
-cursor.execute(gw_query)
+# cursor.execute(gw_query)
-gw_result = cursor.fetchall()
+# gw_result = cursor.fetchall()
# Get domestic companies
gn_query = "select a.SocialCode from EnterpriseInfo a,EnterpriseType b where a.SocialCode=b.SocialCode and b.type=3 and a.Place=1 "
@@ -311,16 +311,18 @@ def FBS():
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
-gw_social_list = [item[0] for item in gw_result]
+# gw_social_list = [item[0] for item in gw_result]
-for item in gw_social_list:
+# for item in gw_social_list:
-r.rpush('NewsEnterpriseFbs:gwqy_socialCode', item)
+# r.rpush('NewsEnterpriseFbs:gwqy_socialCode', item)
-r.rpush('BaseInfoEnterpriseFbs:gwqy_social_code',item)
+# r.rpush('BaseInfoEnterpriseFbs:gwqy_social_code',item)
for item in gn_social_list:
if not r.exists(item):
-r.rpush('NewsEnterpriseFbs:gnqy_socialCode', item)
+# r.rpush('NewsEnterpriseFbs:gnqy_socialCode', item)
# r.rpush('CorPersonEnterpriseFbs:gnqy_socialCode', item)
r.rpush('NoticeEnterpriseFbs:gnqy_socialCode',item)
-r.rpush('BaseInfoEnterpriseFbs:gnqy_social_code',item)
+# r.rpush('BaseInfoEnterpriseFbs:gnqy_social_code',item)
# r.rpush('FinanceFromEast:eastfinance_socialCode',item)
closeSql(cnx,cursor)
# Put the foreign IPO stock codes into Redis
......
# connect timeout in seconds
# default value is 30s
connect_timeout=300
# network timeout in seconds
# default value is 30s
network_timeout=600
# the base path to store log files
#base_path=/home/tarena/django-project/cc_shop1/cc_shop1/logs
# tracker_server can ocur more than once, and tracker_server format is
# "host:port", host can be hostname or ip address
tracker_server=114.115.215.96:22122
#standard log level as syslog, case insensitive, value list:
### emerg for emergency
### alert
### crit for critical
### error
### warn for warning
### notice
### info
### debug
log_level=info
# if use connection pool
# default value is false
# since V4.05
use_connection_pool = false
# connections whose the idle time exceeds this time will be closed
# unit: second
# default value is 3600
# since V4.05
connection_pool_max_idle_time = 3600
# if load FastDFS parameters from tracker server
# since V4.05
# default value is false
load_fdfs_parameters_from_tracker=false
# if use storage ID instead of IP address
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# default value is false
# since V4.05
use_storage_id = false
# specify storage ids filename, can use relative or absolute path
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# since V4.05
storage_ids_filename = storage_ids.conf
#HTTP settings
http.tracker_server_port=80
#use "#include" directive to include HTTP other settiongs
##include http.conf
\ No newline at end of file
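The client.conf above is the file parsed by get_tracker_conf('./client.conf') at the top of the BaseCore module. A minimal sketch of what that yields with these settings (values taken from the file above):

from fdfs_client.client import get_tracker_conf, Fdfs_client

tracker_conf = get_tracker_conf('./client.conf')
# -> {'host_tuple': ('114.115.215.96',), 'port': 22122, 'timeout': 300, 'name': 'Tracker Pool'}
client = Fdfs_client(tracker_conf)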
# __init__.py
__version__ = '2.2.0'
VERSION = tuple(map(int, __version__.split('.')))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: client.py
'''
Client module for Fastdfs 3.08
author: scott yuan scottzer8@gmail.com
date: 2012-06-21
'''
import os
import sys
from fdfs_client.utils import *
from fdfs_client.tracker_client import *
from fdfs_client.storage_client import *
from fdfs_client.exceptions import *
def get_tracker_conf(conf_path='client.conf'):
cf = Fdfs_ConfigParser()
tracker = {}
try:
cf.read(conf_path)
timeout = cf.getint('__config__', 'connect_timeout')
tracker_list = cf.get('__config__', 'tracker_server')
if isinstance(tracker_list, str):
tracker_list = [tracker_list]
tracker_ip_list = []
for tr in tracker_list:
tracker_ip, tracker_port = tr.split(':')
tracker_ip_list.append(tracker_ip)
tracker['host_tuple'] = tuple(tracker_ip_list)
tracker['port'] = int(tracker_port)
tracker['timeout'] = timeout
tracker['name'] = 'Tracker Pool'
except:
raise
return tracker
class Fdfs_client(object):
'''
Class Fdfs_client implements the FastDFS client protocol, version 3.08.
It is used to upload, download and delete files to or from an fdfs server, and it uses a
connection pool to manage connections to the server.
'''
def __init__(self, trackers, poolclass=ConnectionPool):
self.trackers = trackers
self.tracker_pool = poolclass(**self.trackers)
self.timeout = self.trackers['timeout']
return None
def __del__(self):
try:
self.tracker_pool.destroy()
self.tracker_pool = None
except:
pass
def upload_by_filename(self, filename, meta_dict=None):
'''
Upload a file to Storage server.
arguments:
@filename: string, name of file that will be uploaded
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} meta_dict can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : local_file_name,
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_filename(tc, store_serv, filename, meta_dict)
def upload_by_file(self, filename, meta_dict=None):
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_file(tc, store_serv, filename, meta_dict)
def upload_by_buffer(self, filebuffer, file_ext_name=None, meta_dict=None):
'''
Upload a buffer to Storage server.
arguments:
@filebuffer: string, buffer
@file_ext_name: string, file extend name
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_buffer(tc, store_serv, filebuffer, file_ext_name, meta_dict)
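In this vendored client the 'Remote file_id' in the result is returned as bytes, which is why BaseCore.upLoadToServe decodes it before storing the path. A small usage sketch, assuming client is the Fdfs_client instance built from client.conf above and the buffer is a dummy payload:

ret = client.upload_by_buffer(b'hello fastdfs', file_ext_name='txt')
remote_file_id = ret['Remote file_id'].decode()  # e.g. 'group1/M00/00/00/xxxx.txt'
print(ret['Uploaded size'], ret['Storage IP'])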
def upload_slave_by_filename(self, filename, remote_file_id, prefix_name, meta_dict=None):
'''
Upload slave file to Storage server.
arguments:
@filename: string, local file name
@remote_file_id: string, remote file id
@prefix_name: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading slave)')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
if not prefix_name:
raise DataError('[-] Error: prefix_name can not be null.')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_with_group(group_name)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
try:
ret_dict = store.storage_upload_slave_by_filename(tc, store_serv, filename, prefix_name, remote_filename,
meta_dict=None)
except:
raise
ret_dict['Status'] = 'Upload slave file successed.'
return ret_dict
def upload_slave_by_file(self, filename, remote_file_id, prefix_name, meta_dict=None):
'''
Upload slave file to Storage server.
arguments:
@filename: string, local file name
@remote_file_id: string, remote file id
@prefix_name: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading slave)')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
if not prefix_name:
raise DataError('[-] Error: prefix_name can not be null.')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_with_group(group_name)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
try:
ret_dict = store.storage_upload_slave_by_file(tc, store_serv, filename, prefix_name, remote_filename,
meta_dict=None)
except:
raise
ret_dict['Status'] = 'Upload slave file successed.'
return ret_dict
def upload_slave_by_buffer(self, filebuffer, remote_file_id, meta_dict=None, file_ext_name=None):
'''
Upload slave file by buffer
arguments:
@filebuffer: string
@remote_file_id: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_slave_by_buffer(tc, store_serv, filebuffer, remote_filename, meta_dict,
file_ext_name)
def upload_appender_by_filename(self, local_filename, meta_dict=None):
'''
Upload an appender file by filename.
arguments:
@local_filename: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} Notice: it can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(uploading appender)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_filename(tc, store_serv, local_filename, meta_dict)
def upload_appender_by_file(self, local_filename, meta_dict=None):
'''
Upload an appender file by file.
arguments:
@local_filename: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} Notice: it can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(uploading appender)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_file(tc, store_serv, local_filename, meta_dict)
def upload_appender_by_buffer(self, filebuffer, file_ext_name=None, meta_dict=None):
'''
Upload a buffer to Storage server.
arguments:
@filebuffer: string
@file_ext_name: string, can be null
@meta_dict: dictionary, can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_buffer(tc, store_serv, filebuffer, meta_dict, file_ext_name)
def delete_file(self, remote_file_id):
'''
Delete a file from Storage server.
arguments:
@remote_file_id: string, file_id of file that is on storage server
@return tuple ('Delete file successed.', remote_file_id, storage_ip)
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in delete file)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_delete_file(tc, store_serv, remote_filename)
def download_to_file(self, local_filename, remote_file_id, offset=0, down_bytes=0):
'''
Download a file from Storage server.
arguments:
@local_filename: string, local name of file
@remote_file_id: string, file_id of file that is on storage server
@offset: long
@downbytes: long
@return dict {
'Remote file_id' : remote_file_id,
'Content' : local_filename,
'Download size' : downloaded_size,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in download file)')
group_name, remote_filename = tmp
file_offset = int(offset)
download_bytes = int(down_bytes)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_fetch(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_download_to_file(tc, store_serv, local_filename, file_offset, download_bytes,
remote_filename)
def download_to_buffer(self, remote_file_id, offset=0, down_bytes=0):
'''
Download a file from Storage server and store in buffer.
arguments:
@remote_file_id: string, file_id of file that is on storage server
@offset: long
@down_bytes: long
@return dict {
'Remote file_id' : remote_file_id,
'Content' : file_buffer,
'Download size' : downloaded_size,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in download file)')
group_name, remote_filename = tmp
file_offset = int(offset)
download_bytes = int(down_bytes)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_fetch(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
file_buffer = None
return store.storage_download_to_buffer(tc, store_serv, file_buffer, file_offset, download_bytes,
remote_filename)
def list_one_group(self, group_name):
'''
List one group information.
arguments:
@group_name: string, group name will be list
@return Group_info, instance
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_one_group(group_name)
def list_servers(self, group_name, storage_ip=None):
'''
List all storage servers information in a group
arguments:
@group_name: string
@return dictionary {
'Group name' : group_name,
'Servers' : server list,
}
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_servers(group_name, storage_ip)
def list_all_groups(self):
'''
List all group information.
@return dictionary {
'Groups count' : group_count,
'Groups' : list of groups
}
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_all_groups()
def get_meta_data(self, remote_file_id):
'''
Get meta data of remote file.
arguments:
@remote_fileid: string, remote file id
@return dictionary, meta data
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in get meta data)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_get_metadata(tc, store_serv, remote_filename)
def set_meta_data(self, remote_file_id, meta_dict, op_flag=STORAGE_SET_METADATA_FLAG_OVERWRITE):
'''
Set meta data of remote file.
arguments:
@remote_file_id: string
@meta_dict: dictionary
@op_flag: char, 'O' for overwrite, 'M' for merge
@return dictionary {
'Status' : status,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in set meta data)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
try:
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
status = store.storage_set_metadata(tc, store_serv, remote_filename, meta_dict)
except (ConnectionError, ResponseError, DataError):
raise
# if status == 2:
# raise DataError('[-] Error: remote file %s is not exist.' % remote_file_id)
if status != 0:
raise DataError('[-] Error: %d, %s' % (status, os.strerror(status)))
ret_dict = {}
ret_dict['Status'] = 'Set meta data success.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def append_by_filename(self, local_filename, remote_fileid):
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(append)')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_filename(tc, store_serv, local_filename, appended_filename)
def append_by_file(self, local_filename, remote_fileid):
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(append)')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_file(tc, store_serv, local_filename, appended_filename)
def append_by_buffer(self, file_buffer, remote_fileid):
if not file_buffer:
raise DataError('[-] Error: file_buffer can not be null.')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_buffer(tc, store_serv, file_buffer, appended_filename)
def truncate_file(self, truncated_filesize, appender_fileid):
'''
Truncate file in Storage server.
arguments:
@truncated_filesize: long
@appender_fileid: remote_fileid
@return: dictionary {
'Status' : 'Truncate successed.',
'Storage IP' : storage_ip
}
'''
trunc_filesize = int(truncated_filesize)
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: appender_fileid is invalid.(truncate)')
group_name, appender_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_truncate_file(tc, store_serv, trunc_filesize, appender_filename)
def modify_by_filename(self, filename, appender_fileid, offset=0):
'''
Modify a file in Storage server by file.
arguments:
@filename: string, local file name
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(modify)')
filesize = os.stat(filename).st_size
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_filename(tc, store_serv, filename, file_offset, filesize, appender_filename)
def modify_by_file(self, filename, appender_fileid, offset=0):
'''
Modify a file in Storage server by file.
arguments:
@filename: string, local file name
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(modify)')
filesize = os.stat(filename).st_size
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_file(tc, store_serv, filename, file_offset, filesize, appender_filename)
def modify_by_buffer(self, filebuffer, appender_fileid, offset=0):
'''
Modify a file in Storage server by buffer.
arguments:
@filebuffer: string, file buffer
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
if not filebuffer:
raise DataError('[-] Error: filebuffer can not be null.(modify)')
filesize = len(filebuffer)
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_buffer(tc, store_serv, filebuffer, file_offset, filesize, appender_filename)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: connection.py
import socket
import os
import sys
import time
import random
from itertools import chain
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# start class Connection
class Connection(object):
'''Manage TCP communication to and from the FastDFS server.'''
def __init__(self, **conn_kwargs):
self.pid = os.getpid()
self.host_tuple = conn_kwargs['host_tuple']
self.remote_port = conn_kwargs['port']
self.remote_addr = None
self.timeout = conn_kwargs['timeout']
self._sock = None
def __del__(self):
try:
self.disconnect()
except:
pass
def connect(self):
'''Connect to fdfs server.'''
if self._sock:
return
try:
sock = self._connect()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = sock
# print '[+] Create a connection success.'
# print '\tLocal address is %s:%s.' % self._sock.getsockname()
# print '\tRemote address is %s:%s' % (self.remote_addr, self.remote_port)
def _connect(self):
'''Create TCP socket. The host is random one of host_tuple.'''
self.remote_addr = random.choice(self.host_tuple)
# print '[+] Connecting... remote: %s:%s' % (self.remote_addr, self.remote_port)
# sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# sock.settimeout(self.timeout)
sock = socket.create_connection((self.remote_addr, self.remote_port), self.timeout)
return sock
def disconnect(self):
'''Disconnect from fdfs server.'''
if self._sock is None:
return
try:
self._sock.close()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = None
def get_sock(self):
return self._sock
def _errormessage(self, exception):
# args for socket.error can either be (errno, "message")
# or just "message"
if len(exception.args) == 1:
return "[-] Error: connect to %s:%s. %s." % (self.remote_addr, self.remote_port, exception.args[0])
else:
return "[-] Error: %s connect to %s:%s. %s." % \
(exception.args[0], self.remote_addr, self.remote_port, exception.args[1])
# end class Connection
# start ConnectionPool
class ConnectionPool(object):
'''Generic Connection Pool'''
def __init__(self, name='', conn_class=Connection,
max_conn=None, **conn_kwargs):
self.pool_name = name
self.pid = os.getpid()
self.conn_class = conn_class
self.max_conn = max_conn or 2 ** 31
self.conn_kwargs = conn_kwargs
self._conns_created = 0
self._conns_available = []
self._conns_inuse = set()
# print '[+] Create a connection pool success, name: %s.' % self.pool_name
def _check_pid(self):
if self.pid != os.getpid():
self.destroy()
self.__init__(self.pool_name, self.conn_class, self.max_conn, **self.conn_kwargs)
def make_conn(self):
'''Create a new connection.'''
if self._conns_created >= self.max_conn:
raise ConnectionError('[-] Error: Too many connections.')
num_try = 10
while True:
try:
if num_try <= 0:
sys.exit()
conn_instance = self.conn_class(**self.conn_kwargs)
conn_instance.connect()
self._conns_created += 1
break
except ConnectionError as e:
print(e)
num_try -= 1
conn_instance = None
return conn_instance
def get_connection(self):
'''Get a connection from pool.'''
self._check_pid()
try:
conn = self._conns_available.pop()
# print '[+] Get a connection from pool %s.' % self.pool_name
# print '\tLocal address is %s:%s.' % conn._sock.getsockname()
# print '\tRemote address is %s:%s' % (conn.remote_addr, conn.remote_port)
except IndexError:
conn = self.make_conn()
self._conns_inuse.add(conn)
return conn
def remove(self, conn):
'''Remove connection from pool.'''
if conn in self._conns_inuse:
self._conns_inuse.remove(conn)
self._conns_created -= 1
if conn in self._conns_available:
self._conns_available.remove(conn)
self._conns_created -= 1
def destroy(self):
'''Disconnect all connections in the pool.'''
all_conns = chain(self._conns_inuse, self._conns_available)
for conn in all_conns:
conn.disconnect()
# print '[-] Destroy connection pool %s.' % self.pool_name
def release(self, conn):
'''Release the connection back to the pool.'''
self._check_pid()
if conn.pid == self.pid:
self._conns_inuse.remove(conn)
self._conns_available.append(conn)
# print '[-] Release connection back to pool %s.' % self.pool_name
# end ConnectionPool class
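A rough sketch of how Fdfs_client drives this pool (tracker address taken from the client.conf above; a reachable tracker is assumed, since get_connection dials on demand):

pool = ConnectionPool(name='Tracker Pool', host_tuple=('114.115.215.96',), port=22122, timeout=300)
conn = pool.get_connection()   # reuse an idle connection or create a new one
try:
    pass                       # send a tracker/storage command over conn._sock here
finally:
    pool.release(conn)         # hand the connection back for reuse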
def tcp_recv_response(conn, bytes_size, buffer_size=4096):
'''Receive response from server.
It does not include the tracker header.
arguments:
@conn: connection
@bytes_size: int, will be received byte_stream size
@buffer_size: int, receive buffer size
@Return: tuple,(response, received_size)
'''
recv_buff = []
total_size = 0
try:
while bytes_size > 0:
resp = conn._sock.recv(buffer_size)
recv_buff.append(resp)
total_size += len(resp)
bytes_size -= len(resp)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: (%s)' % e.args)
return (b''.join(recv_buff), total_size)
def tcp_send_data(conn, bytes_stream):
'''Send buffer to server.
It does not include the tracker header.
arguments:
@conn: connection
@bytes_stream: trasmit buffer
@Return bool
'''
try:
conn._sock.sendall(bytes_stream)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while writting to socket: (%s)' % e.args)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: exceptions.py
'''Core exceptions raised by fdfs client'''
class FDFSError(Exception):
pass
class ConnectionError(FDFSError):
pass
class ResponseError(FDFSError):
pass
class InvaildResponse(FDFSError):
pass
class DataError(FDFSError):
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_protol.py
import struct
import socket
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# define FDFS protol constans
TRACKER_PROTO_CMD_STORAGE_JOIN = 81
FDFS_PROTO_CMD_QUIT = 82
TRACKER_PROTO_CMD_STORAGE_BEAT = 83 # storage heart beat
TRACKER_PROTO_CMD_STORAGE_REPORT_DISK_USAGE = 84 # report disk usage
TRACKER_PROTO_CMD_STORAGE_REPLICA_CHG = 85 # repl new storage servers
TRACKER_PROTO_CMD_STORAGE_SYNC_SRC_REQ = 86 # src storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_REQ = 87 # dest storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_NOTIFY = 88 # sync done notify
TRACKER_PROTO_CMD_STORAGE_SYNC_REPORT = 89 # report src last synced time as dest server
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_QUERY = 79 # dest storage query sync src storage server
TRACKER_PROTO_CMD_STORAGE_REPORT_IP_CHANGED = 78 # storage server report it's ip changed
TRACKER_PROTO_CMD_STORAGE_CHANGELOG_REQ = 77 # storage server request storage server's changelog
TRACKER_PROTO_CMD_STORAGE_REPORT_STATUS = 76 # report specified storage server status
TRACKER_PROTO_CMD_STORAGE_PARAMETER_REQ = 75 # storage server request parameters
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FREE = 74 # storage report trunk free space
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FID = 73 # storage report current trunk file id
TRACKER_PROTO_CMD_STORAGE_FETCH_TRUNK_FID = 72 # storage get current trunk file id
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START = 61 # start of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END = 62 # end of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE = 63 # tracker get a system data file
TRACKER_PROTO_CMD_TRACKER_GET_STATUS = 64 # tracker get status of other tracker
TRACKER_PROTO_CMD_TRACKER_PING_LEADER = 65 # tracker ping leader
TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER = 66 # notify next leader to other trackers
TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER = 67 # commit next leader to other trackers
TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP = 90
TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS = 91
TRACKER_PROTO_CMD_SERVER_LIST_STORAGE = 92
TRACKER_PROTO_CMD_SERVER_DELETE_STORAGE = 93
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE = 101
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE = 102
TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE = 103
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE = 104
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ALL = 105
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ALL = 106
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ALL = 107
TRACKER_PROTO_CMD_RESP = 100
FDFS_PROTO_CMD_ACTIVE_TEST = 111 # active test, tracker and storage both support since V1.28
STORAGE_PROTO_CMD_REPORT_CLIENT_IP = 9 # ip as tracker client
STORAGE_PROTO_CMD_UPLOAD_FILE = 11
STORAGE_PROTO_CMD_DELETE_FILE = 12
STORAGE_PROTO_CMD_SET_METADATA = 13
STORAGE_PROTO_CMD_DOWNLOAD_FILE = 14
STORAGE_PROTO_CMD_GET_METADATA = 15
STORAGE_PROTO_CMD_SYNC_CREATE_FILE = 16
STORAGE_PROTO_CMD_SYNC_DELETE_FILE = 17
STORAGE_PROTO_CMD_SYNC_UPDATE_FILE = 18
STORAGE_PROTO_CMD_SYNC_CREATE_LINK = 19
STORAGE_PROTO_CMD_CREATE_LINK = 20
STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE = 21
STORAGE_PROTO_CMD_QUERY_FILE_INFO = 22
STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE = 23 # create appender file
STORAGE_PROTO_CMD_APPEND_FILE = 24 # append file
STORAGE_PROTO_CMD_SYNC_APPEND_FILE = 25
STORAGE_PROTO_CMD_FETCH_ONE_PATH_BINLOG = 26 # fetch binlog of one store path
STORAGE_PROTO_CMD_RESP = TRACKER_PROTO_CMD_RESP
STORAGE_PROTO_CMD_UPLOAD_MASTER_FILE = STORAGE_PROTO_CMD_UPLOAD_FILE
STORAGE_PROTO_CMD_TRUNK_ALLOC_SPACE = 27 # since V3.00
STORAGE_PROTO_CMD_TRUNK_ALLOC_CONFIRM = 28 # since V3.00
STORAGE_PROTO_CMD_TRUNK_FREE_SPACE = 29 # since V3.00
STORAGE_PROTO_CMD_TRUNK_SYNC_BINLOG = 30 # since V3.00
STORAGE_PROTO_CMD_TRUNK_GET_BINLOG_SIZE = 31 # since V3.07
STORAGE_PROTO_CMD_TRUNK_DELETE_BINLOG_MARKS = 32 # since V3.07
STORAGE_PROTO_CMD_TRUNK_TRUNCATE_BINLOG_FILE = 33 # since V3.07
STORAGE_PROTO_CMD_MODIFY_FILE = 34 # since V3.08
STORAGE_PROTO_CMD_SYNC_MODIFY_FILE = 35 # since V3.08
STORAGE_PROTO_CMD_TRUNCATE_FILE = 36 # since V3.08
STORAGE_PROTO_CMD_SYNC_TRUNCATE_FILE = 37 # since V3.08
# for overwrite all old metadata
STORAGE_SET_METADATA_FLAG_OVERWRITE = 'O'
STORAGE_SET_METADATA_FLAG_OVERWRITE_STR = "O"
# for replace, insert when the meta item not exist, otherwise update it
STORAGE_SET_METADATA_FLAG_MERGE = 'M'
STORAGE_SET_METADATA_FLAG_MERGE_STR = "M"
FDFS_RECORD_SEPERATOR = '\x01'
FDFS_FIELD_SEPERATOR = '\x02'
# common constants
FDFS_GROUP_NAME_MAX_LEN = 16
IP_ADDRESS_SIZE = 16
FDFS_PROTO_PKG_LEN_SIZE = 8
FDFS_PROTO_CMD_SIZE = 1
FDFS_PROTO_STATUS_SIZE = 1
FDFS_PROTO_IP_PORT_SIZE = (IP_ADDRESS_SIZE + 6)
FDFS_MAX_SERVERS_EACH_GROUP = 32
FDFS_MAX_GROUPS = 512
FDFS_MAX_TRACKERS = 16
FDFS_DOMAIN_NAME_MAX_LEN = 128
FDFS_MAX_META_NAME_LEN = 64
FDFS_MAX_META_VALUE_LEN = 256
FDFS_FILE_PREFIX_MAX_LEN = 16
FDFS_LOGIC_FILE_PATH_LEN = 10
FDFS_TRUE_FILE_PATH_LEN = 6
FDFS_FILENAME_BASE64_LENGTH = 27
FDFS_TRUNK_FILE_INFO_LEN = 16
FDFS_FILE_EXT_NAME_MAX_LEN = 6
FDFS_SPACE_SIZE_BASE_INDEX = 2 # storage space size based (MB)
FDFS_UPLOAD_BY_BUFFER = 1
FDFS_UPLOAD_BY_FILENAME = 2
FDFS_UPLOAD_BY_FILE = 3
FDFS_DOWNLOAD_TO_BUFFER = 1
FDFS_DOWNLOAD_TO_FILE = 2
FDFS_NORMAL_LOGIC_FILENAME_LENGTH = (
FDFS_LOGIC_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_FILE_EXT_NAME_MAX_LEN + 1)
FDFS_TRUNK_FILENAME_LENGTH = (
FDFS_TRUE_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_TRUNK_FILE_INFO_LEN + 1 + FDFS_FILE_EXT_NAME_MAX_LEN)
FDFS_TRUNK_LOGIC_FILENAME_LENGTH = (FDFS_TRUNK_FILENAME_LENGTH + (FDFS_LOGIC_FILE_PATH_LEN - FDFS_TRUE_FILE_PATH_LEN))
FDFS_VERSION_SIZE = 6
TRACKER_QUERY_STORAGE_FETCH_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE)
TRACKER_QUERY_STORAGE_STORE_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE + 1)
# status code, order is important!
FDFS_STORAGE_STATUS_INIT = 0
FDFS_STORAGE_STATUS_WAIT_SYNC = 1
FDFS_STORAGE_STATUS_SYNCING = 2
FDFS_STORAGE_STATUS_IP_CHANGED = 3
FDFS_STORAGE_STATUS_DELETED = 4
FDFS_STORAGE_STATUS_OFFLINE = 5
FDFS_STORAGE_STATUS_ONLINE = 6
FDFS_STORAGE_STATUS_ACTIVE = 7
FDFS_STORAGE_STATUS_RECOVERY = 9
FDFS_STORAGE_STATUS_NONE = 99
class Storage_server(object):
'''Class storage server for upload.'''
def __init__(self):
self.ip_addr = None
self.port = None
self.group_name = ''
self.store_path_index = 0
# Class tracker_header
class Tracker_header(object):
'''
Class for Pack or Unpack tracker header
struct tracker_header{
char pkg_len[FDFS_PROTO_PKG_LEN_SIZE],
char cmd,
char status,
}
'''
def __init__(self):
self.fmt = '!QBB' # pkg_len[FDFS_PROTO_PKG_LEN_SIZE] + cmd + status
self.st = struct.Struct(self.fmt)
self.pkg_len = 0
self.cmd = 0
self.status = 0
def _pack(self, pkg_len=0, cmd=0, status=0):
return self.st.pack(pkg_len, cmd, status)
def _unpack(self, bytes_stream):
self.pkg_len, self.cmd, self.status = self.st.unpack(bytes_stream)
return True
def header_len(self):
return self.st.size
def send_header(self, conn):
'''Send Tracker header to server.'''
header = self._pack(self.pkg_len, self.cmd, self.status)
try:
conn._sock.sendall(header)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while writting to socket: %s' % (e.args,))
def recv_header(self, conn):
'''Receive response from server.
If successful, the class members (pkg_len, cmd, status) hold the response.
'''
try:
header = conn._sock.recv(self.header_len())
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: %s' % (e.args,))
self._unpack(header)
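Every request and response starts with this fixed 10-byte header: an 8-byte big-endian package length, a 1-byte command and a 1-byte status ('!QBB'). A tiny sketch of packing one:

th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS  # 91, "list all groups"
header = th._pack(th.pkg_len, th.cmd, th.status)
assert len(header) == th.header_len()  # 10 bytes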
def fdfs_pack_metadata(meta_dict):
ret = ''
for key in meta_dict:
ret += '%s%c%s%c' % (key, FDFS_FIELD_SEPERATOR, meta_dict[key], FDFS_RECORD_SEPERATOR)
return ret[0:-1]
def fdfs_unpack_metadata(bytes_stream):
li = bytes_stream.split(FDFS_RECORD_SEPERATOR)
return dict([item.split(FDFS_FIELD_SEPERATOR) for item in li])
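These two helpers just join key/value pairs with the field (\x02) and record (\x01) separators defined above; a quick round-trip sketch:

meta = {'ext_name': 'jpg', 'width': '160px'}
packed = fdfs_pack_metadata(meta)      # 'ext_name\x02jpg\x01width\x02160px'
assert fdfs_unpack_metadata(packed) == meta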
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_test.py
import os
import sys
import time
try:
from fdfs_client.client import *
from fdfs_client.exceptions import *
except ImportError:
import_path = os.path.abspath('../')
sys.path.append(import_path)
from fdfs_client.client import *
from fdfs_client.exceptions import *
def usage():
s = 'Usage: python fdfs_test.py {options} [{local_filename} [{remote_file_id}]]\n'
s += 'options: upfile, upbuffer, downfile, downbuffer, delete, listgroup, listserv\n'
s += ' upslavefile, upslavebuffer, upappendfile, upappendbuffer\n'
s += '\tupfile {local_filename}\n'
s += '\tupbuffer {local_filename}\n'
s += '\tdownfile {local_filename} {remote_file_id}\n'
s += '\tdownbuffer {remote_file_id}\n'
s += '\tdelete {remote_file_id}\n'
s += '\tlistgroup {group_name}\n'
s += '\tlistall \n'
s += '\tlistsrv {group_name} [storage_ip]\n'
s += '\tsetmeta {remote_file_id}\n'
s += '\tgetmeta {remote_file_id}\n'
s += '\tupslavefile {local_filename} {remote_fileid} {prefix_name}\n'
s += '\tupappendfile {local_filename}\n'
s += '\ttruncate {truncate_filesize} {remote_fileid}\n'
s += '\tmodifyfile {local_filename} {remote_fileid} {file_offset}\n'
s += '\tmodifybuffer {local_filename} {remote_fileid} {file_offset}\n'
s += 'e.g.: python fdfs_test.py upfile test'
print(s)
sys.exit(0)
if len(sys.argv) < 2:
usage()
client = Fdfs_client(get_tracker_conf('client.conf'))
def upfile_func():
# Upload by filename
# usage: python fdfs_test.py upfile {local_filename}
if len(sys.argv) < 3:
usage()
return None
try:
local_filename = sys.argv[2]
file_size = os.stat(local_filename).st_size
# meta_buffer can be null.
meta_dict = {
'ext_name': 'py',
'file_size': str(file_size) + 'B'
}
t1 = time.time()
ret_dict = client.upload_by_filename(local_filename, meta_dict)
t2 = time.time()
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
print('[+] time consume: %fs' % (t2 - t1))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upfileex_func():
# Upload by file
# usage: python fdfs_test.py upfileex {local_filename}
if len(sys.argv) < 3:
usage()
return None
try:
local_filename = sys.argv[2]
t1 = time.time()
ret_dict = client.upload_by_file(local_filename)
t2 = time.time()
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
print('[+] time consume: %fs' % (t2 - t1))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upslavefile_func():
# upload slave file
# usage: python fdfs_test.py upslavefile {local_filename} {remote_fileid} {prefix_name}
if len(sys.argv) < 5:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
prefix_name = sys.argv[4]
ret_dict = client.upload_slave_by_file(local_filename, remote_fileid, \
prefix_name)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upslavebuffer_func():
# upload slave by buffer
# usage: python fdfs_test.py upslavebuffer {local_filename} {remote_fileid} {prefix_name}
if len(sys.argv) < 5:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
prefix_name = sys.argv[4]
with open(local_filename, 'rb') as f:
filebuffer = f.read()
ret_dict = client.upload_slave_by_buffer(filebuffer, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def del_func():
# delete file
# usage: python fdfs_test.py delete {remote_fileid}
if len(sys.argv) < 3:
usage()
return None
try:
remote_file_id = sys.argv[2]
ret_tuple = client.delete_file(remote_file_id)
print('[+] %s' % ret_tuple[0])
print('[+] remote_fileid: %s' % ret_tuple[1])
print('[+] Storage IP: %s' % ret_tuple[2])
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def downfile_func():
# Download to file
# usage: python fdfs_test.py downfile {local_filename} {remote_fileid}
if len(sys.argv) < 4:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
ret_dict = client.download_to_file(local_filename, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def list_group_func():
# List one group info
# usage: python fdfs_test.py listgroup {group_name}
if len(sys.argv) < 3:
usage()
return None
try:
group_name = sys.argv[2]
ret = client.list_one_group(group_name)
print(ret)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def listall_func():
# List all group info
# usage: python fdfs_test.py listall
if len(sys.argv) < 2:
usage()
return None
try:
ret_dict = client.list_all_groups()
print('=' * 80)
print('Groups count:', ret_dict['Groups count'])
for li in ret_dict['Groups']:
print('-' * 80)
print(li)
print('-' * 80)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def list_server_func():
# List all servers info of group
# usage: python fdfs_test.py listsrv {group_name} [storage_ip]
if len(sys.argv) < 3:
usage()
return None
try:
group_name = sys.argv[2]
if len(sys.argv) > 3:
storage_ip = sys.argv[3]
else:
storage_ip = None
ret_dict = client.list_servers(group_name, storage_ip)
print('=' * 80)
print('Group name: %s' % ret_dict['Group name'])
print('=' * 80)
i = 1
for serv in ret_dict['Servers']:
print('Storage server %d:' % i)
print('=' * 80)
print(serv)
i += 1
print('=' * 80)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upbuffer_func():
# Upload by buffer
# usage: python fdfs_test.py upbuffer {local_filename} [remote_file_ext_name]
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
if len(sys.argv) > 3:
ext_name = sys.argv[3]
else:
ext_name = None
# meta_buffer can be null.
meta_buffer = {
'ext_name': 'gif',
'width': '150px',
'height': '80px'
}
try:
with open(local_filename, 'rb') as f:
file_buffer = f.read()
ret_dict = client.upload_by_buffer(file_buffer, ext_name, meta_buffer)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def downbuffer_func():
# Download to buffer
# usage: python fdfs_test.py downbuffer {remote_file_id}
# e.g.: 'group1/M00/00/00/wKjzhU_rLNmjo2-1AAAamGDONEA5818.py'
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
try:
ret_dict = client.download_to_buffer(remote_fileid)
print('Downloaded content:')
print(ret_dict['Content'])
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def get_meta_data_func():
# Get meta data of remote file
# usage python fdfs_test.py getmeta {remote_file_id}
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
try:
ret_dict = client.get_meta_data(remote_fileid)
print(ret_dict)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def set_meta_data_func():
# Set meta data of remote file
# usage python fdfs_test.py setmeta {remote_file_id}
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
meta_dict = {
'ext_name': 'jgp',
'width': '160px',
'hight': '80px',
}
try:
ret_dict = client.set_meta_data(remote_fileid, meta_dict)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upappendfile_func():
# Upload an appender file by filename
# usage: python fdfs_test.py upappendfile {local_filename}
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
try:
ret_dict = client.upload_appender_by_file(local_filename)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upappendbuffer_func():
# Upload an appender file by buffer
# usage: python fdfs_test.py upappendbuffer {local_filename}
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
try:
with open(local_filename, 'rb') as f:
file_buffer = f.read()
ret_dict = client.upload_appender_by_buffer(file_buffer)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def appendfile_func():
# Append a remote file
# usage: python fdfs_test.py appendfile {local_filename} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
try:
ret_dict = client.append_by_file(local_filename, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def appendbuffer_func():
# Append a remote file by buffer
# usage: python fdfs_test.py appendbuffer {local_filename} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
try:
with open(local_filename, 'rb') as f:
filebuffer = f.read()
ret_dict = client.append_by_buffer(filebuffer, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def truncate_func():
# Truncate file
# usage: python fdfs_test.py truncate {truncate_filesize} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
truncate_filesize = int(sys.argv[2])
remote_fileid = sys.argv[3]
try:
ret_dict = client.truncate_file(truncate_filesize, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def modifyfile_func():
# Modify file by filename
# usage: python fdfs_test.py modifyfile {local_filename} {remote_fileid} [file_offset]
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
if len(sys.argv) > 4:
file_offset = int(sys.argv[4])
else:
file_offset = 0
try:
ret_dict = client.modify_by_filename(local_filename, remote_fileid, file_offset)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def modifybuffer_func():
# Modify file by buffer
# usage: python fdfs_test.py modifybuffer {local_filename} {remote_fileid} [file_offset]
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
if len(sys.argv) > 4:
file_offset = int(sys.argv[4])
else:
file_offset = 0
try:
with open(local_filename, 'rb') as f:
filebuffer = f.read()
ret_dict = client.modify_by_buffer(filebuffer, remote_fileid, file_offset)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
result = {
'upfile': lambda: upfile_func(),
'upfileex': lambda: upfileex_func(),
'upbuffer': lambda: upbuffer_func(),
'delete': lambda: del_func(),
'downfile': lambda: downfile_func(),
'downbuffer': lambda: downbuffer_func(),
'listgroup': lambda: list_group_func(),
'listall': lambda: listall_func(),
'listsrv': lambda: list_server_func(),
'getmeta': lambda: get_meta_data_func(),
'setmeta': lambda: set_meta_data_func(),
'upslavefile': lambda: upslavefile_func(),
'upappendfile': lambda: upappendfile_func(),
'upappendbuffer': lambda: upappendbuffer_func(),
'appendfile': lambda: appendfile_func(),
'appendbuffer': lambda: appendbuffer_func(),
'truncate': lambda: truncate_func(),
'modifyfile': lambda: modifyfile_func(),
'modifybuffer': lambda: modifybuffer_func(),
'-h': lambda: usage(),
}[sys.argv[1].lower()]()
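# Illustrative invocations of the dispatcher above (the file names and remote
# file ids shown are placeholders, not real paths):
#
#   python fdfs_test.py upfile ./local_file.jpg
#   python fdfs_test.py getmeta group1/M00/00/00/example.jpg
#   python fdfs_test.py delete group1/M00/00/00/example.jpg
#   python fdfs_test.py -h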
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: storage_client.py
import os
import stat
import errno
import struct
import socket
import sys
import datetime
import platform
from fdfs_client.fdfs_protol import *
from fdfs_client.connection import *
# from test_fdfs.sendfile import *
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
from fdfs_client.utils import *
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def tcp_send_file(conn, filename, buffer_size=1024):
'''
Send file to server, and split into multiple pkgs while sending.
arguments:
@conn: connection
@filename: string
@buffer_size: int ,send buffer size
@Return int: file size if success else raise ConnectionError.
'''
file_size = 0
with open(filename, 'rb') as f:
while 1:
try:
send_buffer = f.read(buffer_size)
send_size = len(send_buffer)
if send_size == 0:
break
tcp_send_data(conn, send_buffer)
file_size += send_size
except ConnectionError as e:
raise ConnectionError('[-] Error while uploading file(%s).' % e.args)
except IOError as e:
raise DataError('[-] Error while reading local file(%s).' % e.args)
return file_size
def tcp_send_file_ex(conn, filename, buffer_size=4096):
'''
Send file to server. Using linux system call 'sendfile'.
arguments:
@conn: connection
@filename: string
    @return long, number of bytes sent
'''
if 'linux' not in sys.platform.lower():
raise DataError('[-] Error: \'sendfile\' system call only available on linux.')
nbytes = 0
offset = 0
sock_fd = conn.get_sock().fileno()
with open(filename, 'rb') as f:
in_fd = f.fileno()
        # NOTE: the sendfile-based fast path is disabled because its import is
        # commented out above; raise instead of spinning forever over an empty body.
        while 1:
            try:
                raise DataError('[-] Error: sendfile support is disabled, use tcp_send_file instead.')
                # sent = sendfile(sock_fd, in_fd, offset, buffer_size)
                # if 0 == sent:
                #     break
                # nbytes += sent
                # offset += sent
            except OSError as e:
                if e.errno == errno.EAGAIN:
                    continue
                raise
return nbytes
def tcp_recv_file(conn, local_filename, file_size, buffer_size=1024):
'''
    Receive a file from the server, writing it to disk chunk by chunk as it arrives.
arguments:
@conn: connection
@local_filename: string
@file_size: int, remote file size
@buffer_size: int, receive buffer size
@Return int: file size if success else raise ConnectionError.
'''
total_file_size = 0
flush_size = 0
remain_bytes = file_size
with open(local_filename, 'wb+') as f:
while remain_bytes > 0:
try:
if remain_bytes >= buffer_size:
file_buffer, recv_size = tcp_recv_response(conn, buffer_size, buffer_size)
else:
file_buffer, recv_size = tcp_recv_response(conn, remain_bytes, buffer_size)
f.write(file_buffer)
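                # on the final chunk recv_size < buffer_size, so subtracting the
                # full buffer_size still drives remain_bytes to <= 0 and ends the loop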
remain_bytes -= buffer_size
total_file_size += recv_size
flush_size += recv_size
if flush_size >= 4096:
f.flush()
flush_size = 0
except ConnectionError as e:
raise ConnectionError('[-] Error: while downloading file(%s).' % e.args)
except IOError as e:
                raise DataError('[-] Error: while writing local file(%s).' % e.args)
return total_file_size
class Storage_client(object):
'''
    The Storage_client class talks to a storage server.
    Note: host_tuple holds the storage server ip address and must contain a single element.
'''
def __init__(self, *kwargs):
conn_kwargs = {
'name': 'Storage Pool',
'host_tuple': (kwargs[0],),
'port': kwargs[1],
'timeout': kwargs[2]
}
self.pool = ConnectionPool(**conn_kwargs)
return None
def __del__(self):
try:
self.pool.destroy()
self.pool = None
except:
pass
def update_pool(self, old_store_serv, new_store_serv, timeout=30):
'''
        Update the connection pool of the storage client.
        The pool must be rebuilt when the storage server changes; if the
        server is unchanged, nothing is done.
'''
if old_store_serv.ip_addr == new_store_serv.ip_addr:
return None
self.pool.destroy()
conn_kwargs = {
'name': 'Storage_pool',
'host_tuple': (new_store_serv.ip_addr,),
'port': new_store_serv.port,
'timeout': timeout
}
self.pool = ConnectionPool(**conn_kwargs)
return True
def _storage_do_upload_file(self, tracker_client, store_serv, file_buffer, file_size=None, upload_type=None,
meta_dict=None, cmd=None, master_filename=None, prefix_name=None, file_ext_name=None):
'''
        Core of uploading a file.
        arguments:
        @tracker_client: Tracker_client, used to talk to the tracker server
        @store_serv: Storage_server, as returned by the tracker query
        @file_buffer: string, file name or file buffer to send
        @file_size: int
        @upload_type: int, one of FDFS_UPLOAD_BY_FILE, FDFS_UPLOAD_BY_FILENAME,
                      FDFS_UPLOAD_BY_BUFFER
        @meta_dict: dictionary, metadata to store with the file
        @cmd: int, command code, see the fdfs protocol definitions
        @master_filename: string, required when uploading a slave file
        @prefix_name: string
        @file_ext_name: string
        @Return dictionary
{
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : status,
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
}
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
master_filename_len = len(master_filename) if master_filename else 0
prefix_name_len = len(prefix_name) if prefix_name else 0
upload_slave = len(store_serv.group_name) and master_filename_len
file_ext_name = str(file_ext_name) if file_ext_name else ''
# non_slave_fmt |-store_path_index(1)-file_size(8)-file_ext_name(6)-|
non_slave_fmt = '!B Q %ds' % FDFS_FILE_EXT_NAME_MAX_LEN
# slave_fmt |-master_len(8)-file_size(8)-prefix_name(16)-file_ext_name(6)
# -master_name(master_filename_len)-|
slave_fmt = '!Q Q %ds %ds %ds' % (FDFS_FILE_PREFIX_MAX_LEN, FDFS_FILE_EXT_NAME_MAX_LEN, master_filename_len)
th.pkg_len = struct.calcsize(slave_fmt) if upload_slave else struct.calcsize(non_slave_fmt)
th.pkg_len += file_size
th.cmd = cmd
th.send_header(store_conn)
if upload_slave:
send_buffer = struct.pack(
slave_fmt, master_filename_len, file_size, prefix_name, file_ext_name, master_filename)
else:
send_buffer = struct.pack(non_slave_fmt, store_serv.store_path_index, file_size, file_ext_name.encode())
try:
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
send_file_size = tcp_send_file(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
send_file_size = tcp_send_file_ex(store_conn, file_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
if recv_size <= FDFS_GROUP_NAME_MAX_LEN:
                errmsg = '[-] Error: Storage response length does not match, '
errmsg += 'expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
# recv_fmt: |-group_name(16)-remote_file_name(recv_size - 16)-|
recv_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, th.pkg_len - FDFS_GROUP_NAME_MAX_LEN)
(group_name, remote_name) = struct.unpack(recv_fmt, recv_buffer)
remote_filename = remote_name.strip(b'\x00')
if meta_dict and len(meta_dict) > 0:
status = self.storage_set_metadata(tracker_client, store_serv, remote_filename, meta_dict)
if status != 0:
# rollback
self.storage_delete_file(tracker_client, store_serv, remote_filename)
raise DataError('[-] Error: %d, %s' % (status, os.strerror(status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dic = {
'Group name': group_name.strip(b'\x00'),
'Remote file_id': group_name.strip(b'\x00') + __os_sep__.encode() + remote_filename,
            'Status': 'Upload succeeded.',
'Local file name': file_buffer if (upload_type == FDFS_UPLOAD_BY_FILENAME
or upload_type == FDFS_UPLOAD_BY_FILE
) else '',
'Uploaded size': appromix(send_file_size) if (upload_type == FDFS_UPLOAD_BY_FILENAME
or upload_type == FDFS_UPLOAD_BY_FILE
) else appromix(len(file_buffer)),
'Storage IP': store_serv.ip_addr
}
return ret_dic
def storage_upload_by_filename(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_by_file(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_by_buffer(self, tracker_client, store_serv, file_buffer, file_ext_name=None, meta_dict=None):
buffer_size = len(file_buffer)
return self._storage_do_upload_file(tracker_client, store_serv, file_buffer, buffer_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_slave_by_filename(self, tracker_client, store_serv, filename, prefix_name, remote_filename,
meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, remote_filename,
prefix_name, file_ext_name)
def storage_upload_slave_by_file(self, tracker_client, store_serv, filename, prefix_name, remote_filename,
meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, remote_filename,
prefix_name, file_ext_name)
def storage_upload_slave_by_buffer(self, tracker_client, store_serv, filebuffer, remote_filename, meta_dict,
file_ext_name):
file_size = len(filebuffer)
return self._storage_do_upload_file(tracker_client, store_serv, filebuffer, file_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, None, remote_filename,
file_ext_name)
def storage_upload_appender_by_filename(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_upload_appender_by_file(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_upload_appender_by_buffer(self, tracker_client, store_serv, file_buffer, meta_dict=None,
file_ext_name=None):
file_size = len(file_buffer)
return self._storage_do_upload_file(tracker_client, store_serv, file_buffer, file_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_delete_file(self, tracker_client, store_serv, remote_filename):
'''
Delete file from storage server.
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_DELETE_FILE
file_name_len = len(remote_filename)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + file_name_len
try:
th.send_header(store_conn)
# del_fmt: |-group_name(16)-filename(len)-|
del_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, file_name_len)
send_buffer = struct.pack(del_fmt, store_serv.group_name, remote_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: remote file %s is not exist.'
# % (store_serv.group_name + __os_sep__.encode() + remote_filename))
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
# recv_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
remote_filename = store_serv.group_name + __os_sep__.encode() + remote_filename
        return ('Delete file succeeded.', remote_filename, store_serv.ip_addr)
def _storage_do_download_file(self, tracker_client, store_serv, file_buffer, offset, download_size,
download_type, remote_filename):
'''
Core of download file from storage server.
        The download type can be FDFS_DOWNLOAD_TO_FILE or FDFS_DOWNLOAD_TO_BUFFER,
        and a file offset may be specified.
@Return dictionary
'Remote file name' : remote_filename,
'Content' : local_filename or buffer,
'Download size' : download_size,
'Storage IP' : storage_ip
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
remote_filename_len = len(remote_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + FDFS_GROUP_NAME_MAX_LEN + remote_filename_len
th.cmd = STORAGE_PROTO_CMD_DOWNLOAD_FILE
try:
th.send_header(store_conn)
# down_fmt: |-offset(8)-download_bytes(8)-group_name(16)-remote_filename(len)-|
down_fmt = '!Q Q %ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len)
send_buffer = struct.pack(down_fmt, offset, download_size, store_serv.group_name, remote_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: remote file %s is not exist.' %
# (store_serv.group_name + __os_sep__.encode() + remote_filename))
if th.status != 0:
raise DataError('Error: %d %s' % (th.status, os.strerror(th.status)))
if download_type == FDFS_DOWNLOAD_TO_FILE:
total_recv_size = tcp_recv_file(store_conn, file_buffer, th.pkg_len)
elif download_type == FDFS_DOWNLOAD_TO_BUFFER:
recv_buffer, total_recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
ret_dic = {
'Remote file_id': store_serv.group_name + __os_sep__.encode() + remote_filename,
'Content': file_buffer if download_type == FDFS_DOWNLOAD_TO_FILE else recv_buffer,
'Download size': appromix(total_recv_size),
'Storage IP': store_serv.ip_addr
}
return ret_dic
def storage_download_to_file(self, tracker_client, store_serv, local_filename, file_offset, download_bytes,
remote_filename):
return self._storage_do_download_file(tracker_client, store_serv, local_filename, file_offset, download_bytes,
FDFS_DOWNLOAD_TO_FILE, remote_filename)
def storage_download_to_buffer(self, tracker_client, store_serv, file_buffer, file_offset, download_bytes,
remote_filename):
return self._storage_do_download_file(tracker_client, store_serv, file_buffer, file_offset, download_bytes,
FDFS_DOWNLOAD_TO_BUFFER, remote_filename)
def storage_set_metadata(self, tracker_client, store_serv, remote_filename, meta_dict,
op_flag=STORAGE_SET_METADATA_FLAG_OVERWRITE):
ret = 0
conn = self.pool.get_connection()
remote_filename_len = len(remote_filename)
meta_buffer = fdfs_pack_metadata(meta_dict)
meta_len = len(meta_buffer)
th = Tracker_header()
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + 1 + FDFS_GROUP_NAME_MAX_LEN + remote_filename_len + meta_len
th.cmd = STORAGE_PROTO_CMD_SET_METADATA
try:
th.send_header(conn)
# meta_fmt: |-filename_len(8)-meta_len(8)-op_flag(1)-group_name(16)
# -filename(remote_filename_len)-meta(meta_len)|
meta_fmt = '!Q Q c %ds %ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len, meta_len)
send_buffer = struct.pack(meta_fmt, remote_filename_len, meta_len, op_flag, store_serv.group_name,
remote_filename, meta_buffer)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
ret = th.status
except:
raise
finally:
self.pool.release(conn)
return ret
def storage_get_metadata(self, tracker_client, store_serv, remote_file_name):
store_conn = self.pool.get_connection()
th = Tracker_header()
remote_filename_len = len(remote_file_name)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + remote_filename_len
th.cmd = STORAGE_PROTO_CMD_GET_METADATA
try:
th.send_header(store_conn)
# meta_fmt: |-group_name(16)-filename(remote_filename_len)-|
meta_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len)
send_buffer = struct.pack(meta_fmt, store_serv.group_name, remote_file_name.encode())
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: Remote file %s has no meta data.'
# % (store_serv.group_name + __os_sep__.encode() + remote_file_name))
if th.status != 0:
raise DataError('[-] Error:%d, %s' % (th.status, os.strerror(th.status)))
if th.pkg_len == 0:
ret_dict = {}
meta_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = fdfs_unpack_metadata(meta_buffer)
return ret_dict
def _storage_do_append_file(self, tracker_client, store_serv, file_buffer, file_size, upload_type,
appended_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
appended_filename_len = len(appended_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + appended_filename_len + file_size
th.cmd = STORAGE_PROTO_CMD_APPEND_FILE
try:
th.send_header(store_conn)
# append_fmt: |-appended_filename_len(8)-file_size(8)-appended_filename(len)
# -filecontent(filesize)-|
append_fmt = '!Q Q %ds' % appended_filename_len
send_buffer = struct.pack(append_fmt, appended_filename_len, file_size, appended_filename)
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
tcp_send_file(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
tcp_send_file_ex(store_conn, file_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
        ret_dict['Status'] = 'Append file succeeded.'
ret_dict['Appender file name'] = store_serv.group_name + __os_sep__.encode() + appended_filename
ret_dict['Appended size'] = appromix(file_size)
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_append_by_filename(self, tracker_client, store_serv, local_filename, appended_filename):
file_size = os.stat(local_filename).st_size
return self._storage_do_append_file(tracker_client, store_serv, local_filename, file_size,
FDFS_UPLOAD_BY_FILENAME, appended_filename)
def storage_append_by_file(self, tracker_client, store_serv, local_filename, appended_filename):
file_size = os.stat(local_filename).st_size
return self._storage_do_append_file(tracker_client, store_serv, local_filename, file_size, FDFS_UPLOAD_BY_FILE,
appended_filename)
def storage_append_by_buffer(self, tracker_client, store_serv, file_buffer, appended_filename):
file_size = len(file_buffer)
return self._storage_do_append_file(tracker_client, store_serv, file_buffer, file_size, FDFS_UPLOAD_BY_BUFFER,
appended_filename)
def _storage_do_truncate_file(self, tracker_client, store_serv, truncated_filesize, appender_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_TRUNCATE_FILE
appender_filename_len = len(appender_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + appender_filename_len
try:
th.send_header(store_conn)
# truncate_fmt:|-appender_filename_len(8)-truncate_filesize(8)
# -appender_filename(len)-|
truncate_fmt = '!Q Q %ds' % appender_filename_len
send_buffer = struct.pack(truncate_fmt, appender_filename_len, truncated_filesize, appender_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
        ret_dict['Status'] = 'Truncate succeeded.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_truncate_file(self, tracker_client, store_serv, truncated_filesize, appender_filename):
return self._storage_do_truncate_file(tracker_client, store_serv, truncated_filesize, appender_filename)
def _storage_do_modify_file(self, tracker_client, store_serv, upload_type, filebuffer, offset, filesize,
appender_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_MODIFY_FILE
appender_filename_len = len(appender_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 3 + appender_filename_len + filesize
try:
th.send_header(store_conn)
# modify_fmt: |-filename_len(8)-offset(8)-filesize(8)-filename(len)-|
modify_fmt = '!Q Q Q %ds' % appender_filename_len
send_buffer = struct.pack(modify_fmt, appender_filename_len, offset, filesize, appender_filename)
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
upload_size = tcp_send_file(store_conn, filebuffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, filebuffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
upload_size = tcp_send_file_ex(store_conn, filebuffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
        ret_dict['Status'] = 'Modify succeeded.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_modify_by_filename(self, tracker_client, store_serv, filename, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_FILENAME, filename, offset,
filesize, appender_filename)
def storage_modify_by_file(self, tracker_client, store_serv, filename, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_FILE, filename, offset, filesize,
appender_filename)
def storage_modify_by_buffer(self, tracker_client, store_serv, filebuffer, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_BUFFER, filebuffer, offset,
filesize, appender_filename)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: tracker_client.py
import os
import struct
import socket
from datetime import datetime
from fdfs_client.fdfs_protol import *
from fdfs_client.connection import *
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
from fdfs_client.utils import *
def parse_storage_status(status_code):
try:
ret = {
FDFS_STORAGE_STATUS_INIT: lambda: 'INIT',
FDFS_STORAGE_STATUS_WAIT_SYNC: lambda: 'WAIT_SYNC',
FDFS_STORAGE_STATUS_SYNCING: lambda: 'SYNCING',
FDFS_STORAGE_STATUS_IP_CHANGED: lambda: 'IP_CHANGED',
FDFS_STORAGE_STATUS_DELETED: lambda: 'DELETED',
FDFS_STORAGE_STATUS_OFFLINE: lambda: 'OFFLINE',
FDFS_STORAGE_STATUS_ONLINE: lambda: 'ONLINE',
FDFS_STORAGE_STATUS_ACTIVE: lambda: 'ACTIVE',
FDFS_STORAGE_STATUS_RECOVERY: lambda: 'RECOVERY'
}[status_code]()
except KeyError:
        ret = 'UNKNOWN'
return ret
class Storage_info(object):
def __init__(self):
self.status = 0
self.id = ''
self.ip_addr = ''
self.domain_name = ''
self.src_id = ''
self.version = ''
self.join_time = datetime.fromtimestamp(0).isoformat()
self.up_time = datetime.fromtimestamp(0).isoformat()
self.totalMB = ''
self.freeMB = ''
self.upload_prio = 0
self.store_path_count = 0
self.subdir_count_per_path = 0
self.curr_write_path = 0
self.storage_port = 23000
self.storage_http_port = 80
self.alloc_count = 0
self.current_count = 0
self.max_count = 0
self.total_upload_count = 0
self.success_upload_count = 0
self.total_append_count = 0
self.success_append_count = 0
self.total_modify_count = 0
self.success_modify_count = 0
self.total_truncate_count = 0
self.success_truncate_count = 0
self.total_setmeta_count = 0
self.success_setmeta_count = 0
self.total_del_count = 0
self.success_del_count = 0
self.total_download_count = 0
self.success_download_count = 0
self.total_getmeta_count = 0
self.success_getmeta_count = 0
self.total_create_link_count = 0
self.success_create_link_count = 0
self.total_del_link_count = 0
self.success_del_link_count = 0
self.total_upload_bytes = 0
self.success_upload_bytes = 0
self.total_append_bytes = 0
self.success_append_bytes = 0
self.total_modify_bytes = 0
self.success_modify_bytes = 0
self.total_download_bytes = 0
self.success_download_bytes = 0
self.total_sync_in_bytes = 0
self.success_sync_in_bytes = 0
self.total_sync_out_bytes = 0
self.success_sync_out_bytes = 0
self.total_file_open_count = 0
self.success_file_open_count = 0
self.total_file_read_count = 0
self.success_file_read_count = 0
self.total_file_write_count = 0
self.success_file_write_count = 0
self.last_source_sync = datetime.fromtimestamp(0).isoformat()
self.last_sync_update = datetime.fromtimestamp(0).isoformat()
self.last_synced_time = datetime.fromtimestamp(0).isoformat()
self.last_heartbeat_time = datetime.fromtimestamp(0).isoformat()
self.if_trunk_server = ''
# fmt = |-status(1)-ipaddr(16)-domain(128)-srcipaddr(16)-ver(6)-52*8-|
self.fmt = '!B 16s 16s 128s 16s 6s 10Q 4s4s4s 42Q?'
def set_info(self, bytes_stream):
(self.status, self.id, ip_addr, domain_name, self.src_id, version, join_time, up_time, totalMB, freeMB,
self.upload_prio, self.store_path_count, self.subdir_count_per_path, self.curr_write_path, self.storage_port,
self.storage_http_port, self.alloc_count, self.current_count, self.max_count, self.total_upload_count,
self.success_upload_count, self.total_append_count, self.success_append_count, self.total_modify_count,
self.success_modify_count, self.total_truncate_count, self.success_truncate_count, self.total_setmeta_count,
self.success_setmeta_count, self.total_del_count, self.success_del_count, self.total_download_count,
self.success_download_count, self.total_getmeta_count, self.success_getmeta_count,
self.total_create_link_count, self.success_create_link_count, self.total_del_link_count,
self.success_del_link_count, self.total_upload_bytes, self.success_upload_bytes, self.total_append_bytes,
         self.success_append_bytes, self.total_modify_bytes, self.success_modify_bytes, self.total_download_bytes,
self.success_download_bytes, self.total_sync_in_bytes, self.success_sync_in_bytes, self.total_sync_out_bytes,
self.success_sync_out_bytes, self.total_file_open_count, self.success_file_open_count,
self.total_file_read_count, self.success_file_read_count, self.total_file_write_count,
self.success_file_write_count, last_source_sync, last_sync_update, last_synced_time, last_heartbeat_time,
self.if_trunk_server,) = struct.unpack(self.fmt, bytes_stream)
try:
self.ip_addr = ip_addr.strip(b'\x00')
self.domain_name = domain_name.strip(b'\x00')
self.version = version.strip(b'\x00')
self.totalMB = appromix(totalMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.freeMB = appromix(freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
except ValueError as e:
            raise ResponseError('[-] Error: disk space overrun, cannot represent it.')
self.join_time = datetime.fromtimestamp(join_time).isoformat()
self.up_time = datetime.fromtimestamp(up_time).isoformat()
self.last_source_sync = datetime.fromtimestamp(last_source_sync).isoformat()
self.last_sync_update = datetime.fromtimestamp(last_sync_update).isoformat()
self.last_synced_time = datetime.fromtimestamp(last_synced_time).isoformat()
self.last_heartbeat_time = datetime.fromtimestamp(last_heartbeat_time).isoformat()
return True
def __str__(self):
'''Transform to readable string.'''
s = 'Storage information:\n'
s += '\tip_addr = %s (%s)\n' % (self.ip_addr, parse_storage_status(self.status))
s += '\thttp domain = %s\n' % self.domain_name
s += '\tversion = %s\n' % self.version
s += '\tjoin time = %s\n' % self.join_time
s += '\tup time = %s\n' % self.up_time
s += '\ttotal storage = %s\n' % self.totalMB
s += '\tfree storage = %s\n' % self.freeMB
s += '\tupload priority = %d\n' % self.upload_prio
s += '\tstore path count = %d\n' % self.store_path_count
s += '\tsubdir count per path = %d\n' % self.subdir_count_per_path
s += '\tstorage port = %d\n' % self.storage_port
s += '\tstorage HTTP port = %d\n' % self.storage_http_port
s += '\tcurrent write path = %d\n' % self.curr_write_path
s += '\tsource ip_addr = %s\n' % self.ip_addr
s += '\tif_trunk_server = %d\n' % self.if_trunk_server
s += '\ttotal upload count = %ld\n' % self.total_upload_count
s += '\tsuccess upload count = %ld\n' % self.success_upload_count
s += '\ttotal download count = %ld\n' % self.total_download_count
s += '\tsuccess download count = %ld\n' % self.success_download_count
s += '\ttotal append count = %ld\n' % self.total_append_count
s += '\tsuccess append count = %ld\n' % self.success_append_count
s += '\ttotal modify count = %ld\n' % self.total_modify_count
s += '\tsuccess modify count = %ld\n' % self.success_modify_count
s += '\ttotal truncate count = %ld\n' % self.total_truncate_count
s += '\tsuccess truncate count = %ld\n' % self.success_truncate_count
s += '\ttotal delete count = %ld\n' % self.total_del_count
s += '\tsuccess delete count = %ld\n' % self.success_del_count
s += '\ttotal set_meta count = %ld\n' % self.total_setmeta_count
s += '\tsuccess set_meta count = %ld\n' % self.success_setmeta_count
s += '\ttotal get_meta count = %ld\n' % self.total_getmeta_count
s += '\tsuccess get_meta count = %ld\n' % self.success_getmeta_count
s += '\ttotal create link count = %ld\n' % self.total_create_link_count
s += '\tsuccess create link count = %ld\n' % self.success_create_link_count
s += '\ttotal delete link count = %ld\n' % self.total_del_link_count
s += '\tsuccess delete link count = %ld\n' % self.success_del_link_count
s += '\ttotal upload bytes = %ld\n' % self.total_upload_bytes
s += '\tsuccess upload bytes = %ld\n' % self.success_upload_bytes
s += '\ttotal download bytes = %ld\n' % self.total_download_bytes
s += '\tsuccess download bytes = %ld\n' % self.success_download_bytes
s += '\ttotal append bytes = %ld\n' % self.total_append_bytes
s += '\tsuccess append bytes = %ld\n' % self.success_append_bytes
s += '\ttotal modify bytes = %ld\n' % self.total_modify_bytes
s += '\tsuccess modify bytes = %ld\n' % self.success_modify_bytes
s += '\ttotal sync_in bytes = %ld\n' % self.total_sync_in_bytes
s += '\tsuccess sync_in bytes = %ld\n' % self.success_sync_in_bytes
s += '\ttotal sync_out bytes = %ld\n' % self.total_sync_out_bytes
s += '\tsuccess sync_out bytes = %ld\n' % self.success_sync_out_bytes
s += '\ttotal file open count = %ld\n' % self.total_file_open_count
s += '\tsuccess file open count = %ld\n' % self.success_file_open_count
s += '\ttotal file read count = %ld\n' % self.total_file_read_count
s += '\tsuccess file read count = %ld\n' % self.success_file_read_count
s += '\ttotal file write count = %ld\n' % self.total_file_write_count
        s += '\tsuccess file write count = %ld\n' % self.success_file_write_count
s += '\tlast heartbeat time = %s\n' % self.last_heartbeat_time
s += '\tlast source update = %s\n' % self.last_source_sync
s += '\tlast sync update = %s\n' % self.last_sync_update
s += '\tlast synced time = %s\n' % self.last_synced_time
return s
def get_fmt_size(self):
return struct.calcsize(self.fmt)
class Group_info(object):
def __init__(self):
self.group_name = ''
self.totalMB = ''
self.freeMB = ''
self.trunk_freeMB = ''
self.count = 0
self.storage_port = 0
self.store_http_port = 0
self.active_count = 0
self.curr_write_server = 0
self.store_path_count = 0
self.subdir_count_per_path = 0
self.curr_trunk_file_id = 0
self.fmt = '!%ds 11Q' % (FDFS_GROUP_NAME_MAX_LEN + 1)
return None
def __str__(self):
s = 'Group information:\n'
s += '\tgroup name = %s\n' % self.group_name
s += '\ttotal disk space = %s\n' % self.totalMB
s += '\tdisk free space = %s\n' % self.freeMB
s += '\ttrunk free space = %s\n' % self.trunk_freeMB
s += '\tstorage server count = %d\n' % self.count
s += '\tstorage port = %d\n' % self.storage_port
s += '\tstorage HTTP port = %d\n' % self.store_http_port
s += '\tactive server count = %d\n' % self.active_count
s += '\tcurrent write server index = %d\n' % self.curr_write_server
s += '\tstore path count = %d\n' % self.store_path_count
s += '\tsubdir count per path = %d\n' % self.subdir_count_per_path
s += '\tcurrent trunk file id = %d\n' % self.curr_trunk_file_id
return s
def set_info(self, bytes_stream):
(group_name, totalMB, freeMB, trunk_freeMB, self.count, self.storage_port, self.store_http_port,
self.active_count, self.curr_write_server, self.store_path_count, self.subdir_count_per_path,
self.curr_trunk_file_id) = struct.unpack(self.fmt, bytes_stream)
try:
self.group_name = group_name.strip(b'\x00')
self.freeMB = appromix(freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.totalMB = appromix(totalMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.trunk_freeMB = appromix(trunk_freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
except ValueError:
            raise DataError('[-] Error: disk space overrun, cannot represent it.')
def get_fmt_size(self):
return struct.calcsize(self.fmt)
class Tracker_client(object):
'''Class Tracker client.'''
def __init__(self, pool):
self.pool = pool
def tracker_list_servers(self, group_name, storage_ip=None):
'''
List servers in a storage group
'''
conn = self.pool.get_connection()
th = Tracker_header()
ip_len = len(storage_ip) if storage_ip else 0
if ip_len >= IP_ADDRESS_SIZE:
ip_len = IP_ADDRESS_SIZE - 1
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + ip_len
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_STORAGE
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
store_ip_addr = storage_ip or ''
storage_ip_fmt = '!%ds' % ip_len
try:
th.send_header(conn)
send_buffer = struct.pack(group_fmt, group_name) + struct.pack(storage_ip_fmt, store_ip_addr)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
si = Storage_info()
si_fmt_size = si.get_fmt_size()
recv_size = len(recv_buffer)
if recv_size % si_fmt_size != 0:
errinfo = '[-] Error: response size not match, expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errinfo)
except ConnectionError:
raise
finally:
self.pool.release(conn)
        num_storage = recv_size // si_fmt_size
si_list = []
i = 0
while num_storage:
si.set_info(recv_buffer[(i * si_fmt_size): ((i + 1) * si_fmt_size)])
si_list.append(si)
si = Storage_info()
num_storage -= 1
i += 1
ret_dict = {}
ret_dict['Group name'] = group_name
ret_dict['Servers'] = si_list
return ret_dict
def tracker_list_one_group(self, group_name):
conn = self.pool.get_connection()
th = Tracker_header()
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP
# group_fmt: |-group_name(16)-|
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
try:
th.send_header(conn)
send_buffer = struct.pack(group_fmt, group_name)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
group_info = Group_info()
group_info.set_info(recv_buffer)
except ConnectionError:
raise
finally:
self.pool.release(conn)
return group_info
def tracker_list_all_groups(self):
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS
try:
th.send_header(conn)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
except:
raise
finally:
self.pool.release(conn)
gi = Group_info()
gi_fmt_size = gi.get_fmt_size()
if recv_size % gi_fmt_size != 0:
            errmsg = '[-] Error: Response size mismatch, expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
        num_groups = recv_size // gi_fmt_size
ret_dict = {}
ret_dict['Groups count'] = num_groups
gi_list = []
i = 0
while num_groups:
gi.set_info(recv_buffer[i * gi_fmt_size: (i + 1) * gi_fmt_size])
gi_list.append(gi)
gi = Group_info()
i += 1
num_groups -= 1
ret_dict['Groups'] = gi_list
return ret_dict
def tracker_query_storage_stor_without_group(self):
'''Query storage server for upload, without group name.
Return: Storage_server object'''
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE
try:
th.send_header(conn)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_STORE_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
errmsg += 'expect: %d, actual: %d' % (TRACKER_QUERY_STORAGE_STORE_BODY_LEN, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt |-group_name(16)-ipaddr(16-1)-port(8)-store_path_index(1)|
recv_fmt = '!%ds %ds Q B' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group_name, ip_addr, store_serv.port, store_serv.store_path_index) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group_name.strip(b'\x00')
store_serv.ip_addr = ip_addr.strip(b'\x00')
return store_serv
def tracker_query_storage_stor_with_group(self, group_name):
        '''Query storage server for upload, based on group name.
arguments:
@group_name: string
@Return Storage_server object
'''
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN
th.send_header(conn)
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
send_buffer = struct.pack(group_fmt, group_name)
try:
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_STORE_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
errmsg += 'expect: %d, actual: %d' % (TRACKER_QUERY_STORAGE_STORE_BODY_LEN, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt: |-group_name(16)-ipaddr(16-1)-port(8)-store_path_index(1)-|
recv_fmt = '!%ds %ds Q B' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group, ip_addr, store_serv.port, store_serv.store_path_index) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group.strip(b'\x00')
store_serv.ip_addr = ip_addr.strip(b'\x00')
return store_serv
def _tracker_do_query_storage(self, group_name, filename, cmd):
'''
        Core of querying a storage server, based on group name and filename.
        Used for the download, delete and set_meta operations.
arguments:
@group_name: string
@filename: string. remote file_id
@Return: Storage_server object
'''
conn = self.pool.get_connection()
th = Tracker_header()
file_name_len = len(filename)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + file_name_len
th.cmd = cmd
th.send_header(conn)
# query_fmt: |-group_name(16)-filename(file_name_len)-|
query_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, file_name_len)
send_buffer = struct.pack(query_fmt, group_name, filename)
try:
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_FETCH_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
errmsg += 'expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt: |-group_name(16)-ip_addr(16)-port(8)-|
recv_fmt = '!%ds %ds Q' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group_name, ipaddr, store_serv.port) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group_name.strip(b'\x00')
store_serv.ip_addr = ipaddr.strip(b'\x00')
return store_serv
def tracker_query_storage_update(self, group_name, filename):
'''
Query storage server to update(delete and set_meta).
'''
return self._tracker_do_query_storage(group_name, filename, TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE)
def tracker_query_storage_fetch(self, group_name, filename):
'''
Query storage server to download.
'''
return self._tracker_do_query_storage(group_name, filename, TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE)
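# A minimal sketch (not part of the library) of how Tracker_client and the
# Storage_client from storage_client.py cooperate on an upload. The tracker
# address and timeout are placeholders; nothing here runs at import time.
def _example_upload_by_buffer(tracker_host='192.168.1.100', tracker_port=22122, timeout=30):
    from fdfs_client.storage_client import Storage_client
    # Build a connection pool for the tracker and wrap it in a Tracker_client.
    tracker_pool = ConnectionPool(name='Tracker Pool', host_tuple=(tracker_host,), port=tracker_port, timeout=timeout)
    tracker_client = Tracker_client(tracker_pool)
    # Ask the tracker which storage server should receive the upload.
    store_serv = tracker_client.tracker_query_storage_stor_without_group()
    # Point a Storage_client at that server and push a small in-memory buffer.
    store_client = Storage_client(store_serv.ip_addr, store_serv.port, timeout)
    return store_client.storage_upload_by_buffer(tracker_client, store_serv, b'hello fdfs', file_ext_name='txt')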
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: utils.py
import io
import os
import sys
import stat
import platform
import configparser
SUFFIX = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def appromix(size, base=0):
    '''Convert a byte-stream size to a human-readable format.
Keyword arguments:
size: int, bytes stream size
base: int, suffix index
Return: string
'''
multiples = 1024
if size < 0:
raise ValueError('[-] Error: number must be non-negative.')
if size < multiples:
return '{0:d}{1}'.format(size, SUFFIX[base])
for suffix in SUFFIX[base:]:
if size < multiples:
return '{0:.2f}{1}'.format(size, suffix)
size = size / float(multiples)
raise ValueError('[-] Error: number too big.')
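# Illustrative values for appromix (base 0 means the size is given in bytes):
#   appromix(512)            -> '512B'
#   appromix(1536)           -> '1.50KB'
#   appromix(10 * 1024 ** 2) -> '10.00MB'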
def get_file_ext_name(filename, double_ext=True):
li = filename.split(os.extsep)
if len(li) <= 1:
return ''
else:
if li[-1].find(__os_sep__) != -1:
return ''
if double_ext:
if len(li) > 2:
if li[-2].find(__os_sep__) == -1:
return '%s.%s' % (li[-2], li[-1])
return li[-1]
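# Illustrative behaviour of get_file_ext_name:
#   get_file_ext_name('photo.jpg')   -> 'jpg'
#   get_file_ext_name('logs.tar.gz') -> 'tar.gz'  (double_ext keeps both parts)
#   get_file_ext_name('README')      -> ''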
class Fdfs_ConfigParser(configparser.RawConfigParser):
"""
Extends ConfigParser to allow files without sections.
This is done by wrapping read files and prepending them with a placeholder
section, which defaults to '__config__'
"""
def __init__(self, default_section=None, *args, **kwargs):
configparser.RawConfigParser.__init__(self, *args, **kwargs)
self._default_section = None
self.set_default_section(default_section or '__config__')
def get_default_section(self):
return self._default_section
def set_default_section(self, section):
self.add_section(section)
# move all values from the previous default section to the new one
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
except configparser.NoSectionError:
pass
else:
for (key, value) in default_section_items:
self.set(section, key, value)
self._default_section = section
def read(self, filenames):
if isinstance(filenames, str):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
with open(filename) as fp:
self.readfp(fp)
except IOError:
continue
else:
read_ok.append(filename)
return read_ok
def readfp(self, fp, *args, **kwargs):
stream = io.StringIO()
try:
stream.name = fp.name
except AttributeError:
pass
stream.write('[' + self._default_section + ']\n')
stream.write(fp.read())
stream.seek(0, 0)
return self._read(stream, stream.name)
def write(self, fp):
# Write the items from the default section manually and then remove them
# from the data. They'll be re-added later.
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
for (key, value) in default_section_items:
fp.write("{0} = {1}\n".format(key, value))
fp.write("\n")
except configparser.NoSectionError:
pass
configparser.RawConfigParser.write(self, fp)
self.add_section(self._default_section)
for (key, value) in default_section_items:
self.set(self._default_section, key, value)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
The sections in setup file contains a title line at the top,
indicated by a name in square brackets (`[]'), plus key/value
options lines, indicated by `name: value' format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' or line[0] in '#;':
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname] = "%s\n%s" % (cursect[optname], value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
                    elif sectname == configparser.DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
                    raise configparser.MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self.OPTCRE.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos - 1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
optname = self.optionxform(optname.rstrip())
if optname in cursect:
if not isinstance(cursect[optname], list):
cursect[optname] = [cursect[optname]]
cursect[optname].append(optval)
else:
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
                            e = configparser.ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
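# A small usage sketch for Fdfs_ConfigParser: FastDFS client.conf files carry no
# [section] headers, so every key lands in the default '__config__' section
# (the key name below is the one a typical client.conf uses):
#   parser = Fdfs_ConfigParser()
#   parser.read('client.conf')
#   tracker_addr = parser.get('__config__', 'tracker_server')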
def split_remote_fileid(remote_file_id):
'''
    Split remote_file_id into (group_name, remote_file_name)
arguments:
@remote_file_id: string
@return tuple, (group_name, remote_file_name)
'''
index = remote_file_id.find(b'/')
if -1 == index:
return None
return (remote_file_id[0:index], remote_file_id[(index + 1):])
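# e.g. split_remote_fileid(b'group1/M00/00/00/example.jpg')
#      -> (b'group1', b'M00/00/00/example.jpg'); returns None when no '/' is found.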
def fdfs_check_file(filename):
ret = True
errmsg = ''
if not os.path.isfile(filename):
ret = False
errmsg = '[-] Error: %s is not a file.' % filename
elif not stat.S_ISREG(os.stat(filename).st_mode):
ret = False
errmsg = '[-] Error: %s is not a regular file.' % filename
return (ret, errmsg)
if __name__ == '__main__':
print(get_file_ext_name('/bc.tar.gz'))
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
import json
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from getQccId import find_id_by_name
baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
# Fetch a company's basic information from Qichacha (QCC) by its QCC id
def info_by_id(com_id,com_name,social_code):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
com_jc_name = ''
try:
result_dict = resp_dict['result']['Company']
except:
log.info(com_name + ":获取失败===========重新放入redis")
baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gnqy_social_code',social_code)
return aa_dict_list
company_name = result_dict['Name']
CreditCode = result_dict['CreditCode']
if CreditCode is None:
CreditCode = ''
try:
OperName = result_dict['Oper']['Name']
except:
OperName = ''
if OperName is None:
OperName = ''
if baseCore.str_have_num(OperName):
OperName = ''
try:
Status = result_dict['ShortStatus']
except:
Status = ''
if Status is None:
Status = ''
try:
StartDate = result_dict['StartDate']
except:
StartDate = ''
if StartDate is None:
StartDate = ''
try:
RegistCapi = result_dict['RegistCapi']
except:
RegistCapi = ''
if RegistCapi is None:
RegistCapi = ''
    RecCap = ''  # result_dict['RecCap']  # paid-in capital, no longer returned by the API
if RecCap is None:
RecCap = ''
try:
        OrgNo = result_dict['CreditCode'][8:-2] + '-' + result_dict['CreditCode'][-2]  # organization code, no longer returned directly
except:
OrgNo = ''
if OrgNo is None:
OrgNo = ''
try:
TaxNo = result_dict['TaxNo']
except:
TaxNo = ''
if TaxNo is None:
TaxNo = ''
try:
EconKind = result_dict['EconKind']
except:
EconKind = ''
if EconKind is None:
EconKind = ''
    TermStart = ''  # result_dict['TermStart']  # business term start date, no longer returned
if TermStart is None:
TermStart = ''
    TeamEnd = ''  # result_dict['TeamEnd']  # business term end date, no longer returned
if TeamEnd is None:
TeamEnd = ''
try:
SubIndustry = result_dict['Industry']['SubIndustry']
except:
SubIndustry = ''
if SubIndustry is None:
SubIndustry = ''
try:
Province = result_dict['Area']['Province']
except:
Province = ''
try:
City = result_dict['Area']['City']
except:
City = ''
try:
County = result_dict['Area']['County']
except:
County = ''
try:
region = Province + City + County
except:
region = ''
    BelongOrg = ''  # result_dict['BelongOrg']  # registration authority, no longer returned
can_bao = ''
    CommonList = []  # result_dict['CommonList']  # insured head-count entries, no longer returned
for Common_dict in CommonList:
try:
KeyDesc = Common_dict['KeyDesc']
except:
continue
if KeyDesc == '参保人数':
can_bao = Common_dict['Value']
if can_bao == '0':
can_bao = ''
OriginalName = ''
try:
OriginalName_lists = result_dict['OriginalName']
for OriginalName_dict in OriginalName_lists:
OriginalName += OriginalName_dict['Name'] + ' '
except:
OriginalName = ''
try:
        OriginalName = OriginalName.strip()
except:
OriginalName = ''
    EnglishName = ''  # result_dict['EnglishName']  # company English name, no longer returned
if EnglishName is None:
EnglishName = ''
    IxCode = ''  # result_dict['IxCode']  # import/export enterprise code, no longer returned
if IxCode is None:
IxCode = ''
Address = result_dict['Address']
if Address is None:
Address = ''
    Scope = ''  # result_dict['Scope']  # business scope, no longer returned
if Scope is None:
Scope = ''
try:
PhoneNumber = result_dict['companyExtendInfo']['Tel']
except:
PhoneNumber = ''
if PhoneNumber is None:
PhoneNumber = ''
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
try:
Email = result_dict['companyExtendInfo']['Email']
except:
Email = ''
if Email is None:
Email = ''
try:
Desc = result_dict['companyExtendInfo']['Desc']
except:
Desc = ''
if Desc is None:
Desc = ''
try:
Info = result_dict['companyExtendInfo']['Info']
except:
Info = ''
if Info is None:
Info = ''
company_name = baseCore.hant_2_hans(company_name)
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
com_id)
resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(1)
try:
com2 = resp_dict2['result']['Company']
except:
com2 = ''
try:
Scope = com2['Scope']
except:
Scope = ''
try:
CheckDate = com2['CheckDate']
except:
CheckDate = ''
if CheckDate is None:
CheckDate = ''
try:
        TaxpayerType = com2['TaxpayerType']  # taxpayer qualification
except:
TaxpayerType = ''
if TaxpayerType is None:
TaxpayerType = ''
try:
No = com2['No']
except:
No = ''
if No is None:
No = ''
try:
IxCode = com2['IxCode']
except:
IxCode = ''
try:
OrgNo = com2['OrgNo']
except:
OrgNo = ''
try:
for Common_t in com2['CommonList']:
try:
if Common_t['KeyDesc'] == '参保人数':
can_bao = Common_t['Value']
except:
pass
except:
can_bao = ''
try:
TermStart = com2['TermStart']
except:
TermStart = ''
try:
TeamEnd = com2['TeamEnd']
except:
TeamEnd = ''
try:
RecCap = com2['RecCap']
except:
RecCap = ''
try:
No = com2['No']
except:
No = ''
try:
SubIndustry = com2['IndustryArray'][-1]
except:
SubIndustry = ''
try:
BelongOrg = com2['BelongOrg']
except:
BelongOrg = ''
try:
EnglishName = com2['EnglishName']
except:
EnglishName = ''
aa_dict = {
        'qccId': com_id, # QCC company id
        'name': company_name, # company name
        'shortName': com_jc_name, # company short name
        'socialCreditCode': CreditCode, # unified social credit code
        'legalPerson': OperName, # legal representative
        'officialPhone': PhoneNumber, # telephone
        'officialUrl': WebSite, # official website
        'officialEmail': Email, # email
        'briefInfo': Desc, # brief introduction
        'registerStatus': Status, # registration status
        'incorporationDate': StartDate, # date of establishment
        'capital': RegistCapi, # registered capital
        'paidCapital': RecCap, # paid-in capital
        'approvalDate': CheckDate, # approval date
        'organizationCode': OrgNo, # organization code
        'registerNo': No, # business registration number
        'taxpayerNo': CreditCode, # taxpayer identification number
        'type': EconKind, # enterprise type
        'businessStartDate': TermStart, # business term from
        'businessEndDate': TeamEnd, # business term to
        'taxpayerQualification': TaxpayerType, # taxpayer qualification
        'industry': SubIndustry, # industry
        'region': region,
        'province': Province, # province
        'city': City, # city
        'county': County, # county
        'registerDepartment': BelongOrg, # registration authority
        'scale': Info, # staff size
        'insured': can_bao, # number of insured employees
        'beforeName': OriginalName, # former names
        'englishName': EnglishName, # English name
        'importExportEnterpriseCode': IxCode, # import/export enterprise code
        'address': Address, # address
        'businessRange': Scope, # business scope
        'status': 0, # status
}
aa_dict_list.append(aa_dict)
print(company_name + ":爬取完成")
return aa_dict_list
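# A hypothetical direct call for reference (the __main__ loop below is the real
# driver and also sets up the global token/headers this function relies on;
# the QCC id here is a placeholder):
#   records = info_by_id('g1234567890abcdef', 'Example Co.', '91110000802100433B')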
if __name__ == '__main__':
taskType = '基本信息/企查查'
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
    # pull task data from redis
while True:
        # TODO: the token expires roughly every two hours and must be re-captured; it is read from the database
token = baseCore.GetToken()
list_weicha = []
list_all_info = []
name_list = []
start_time = time.time()
        # fetch company information
# social_code = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
social_code = '91110000802100433B'
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
log.info(f'----当前企业{social_code}--开始处理---')
count = dic_info[13]
com_name = dic_info[1]
social_code = dic_info[2]
        # QCC company id
company_id = dic_info[12]
        # if there is no credit code, search by company name; otherwise search by the credit code
if company_id == '' or company_id == None:
if social_code:
company_id = find_id_by_name(start_time,token,social_code)
else:
company_id = find_id_by_name(start_time,token,com_name)
if not company_id:
log.info(com_name + ":企业ID获取失败===重新放入redis")
list_weicha.append(com_name + ":企业ID获取失败")
baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_socialCode',social_code)
time.sleep(20)
continue
else:
log.info(f'====={social_code}===={company_id}=====获取企业id成功=====')
            # todo: write back to the database
updateSql = f"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
cursor_.execute(updateSql)
cnx_.commit()
try:
post_data_list = info_by_id(company_id, com_name,social_code)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_social_code', social_code)
continue
if post_data_list:
pass
else:
log.info(f'======{social_code}====企查查token失效====')
time.sleep(20)
continue
for post_data in post_data_list:
list_all_info.append(post_data)
if post_data is None:
print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
name_compile = {
'yuan_name':com_name,
'get_name':get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
        # after the info is collected, update this company's crawl count
runType = 'BaseInfoRunCount'
count += 1
baseCore.updateRun(social_code, runType, count)
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
...@@ -37,6 +37,11 @@ def find_id_by_name(start,token,name):
time.sleep(5)
continue
time.sleep(2)
#{'status': 40101, 'message': '无效的sessionToken!'}
if resp_dict['status']==40101:
KeyNo = False
log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
return KeyNo
try:
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
...
"""
...@@ -27,7 +27,7 @@ def check_code(com_code):
def check_date(com_code,info_date):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
res = r.sismember('com_caiwushuju_date::'+com_code, info_date) # 注意是 保存set的方式 res = r.sismember('com_caiwushuju_code::'+com_code, info_date) # 注意是 保存set的方式
if res: if res:
return True return True
else: else:
...@@ -437,15 +437,16 @@ def getReportTime(): ...@@ -437,15 +437,16 @@ def getReportTime():
# timeNow = baseCore.getNowTime(1)[:10] # timeNow = baseCore.getNowTime(1)[:10]
list_date = [] list_date = []
# 2023-04-01 # 2023-04-01
#todo:正式任务
# 获取当前日期和时间 # 获取当前日期和时间
current_date = datetime.now() # current_date = datetime.now()
# 计算昨天的日期 # 计算昨天的日期
yesterday = current_date - timedelta(days=1) # yesterday = current_date - timedelta(days=1)
# 格式化昨天的日期 # 格式化昨天的日期
report_date = yesterday.strftime('%Y-%m-%d') # report_date = yesterday.strftime('%Y-%m-%d')
list_date.append(report_date) # list_date.append(report_date)
year = int(current_date.strftime('%Y')) # year = int(current_date.strftime('%Y'))
# list_date = ['2023-03-31'] list_date = ['2023-03-31']
list_month = ['-12-31', '-09-30', '-06-30', '-03-31'] list_month = ['-12-31', '-09-30', '-06-30', '-03-31']
for year in range(2022, 2018, -1): for year in range(2022, 2018, -1):
...@@ -459,7 +460,8 @@ def job(taskType): ...@@ -459,7 +460,8 @@ def job(taskType):
# 需要提供股票代码、企业信用代码 # 需要提供股票代码、企业信用代码
while True: while True:
#从redis中获取企业信用代码 #从redis中获取企业信用代码
social_code = baseCore.redicPullData('FinanceFromEast:finance_socialCode') social_code = baseCore.redicPullData('FinanceFromEast:eastfinance_socialCode')
# social_code = '91100000100003962T'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
...@@ -468,8 +470,12 @@ def job(taskType): ...@@ -468,8 +470,12 @@ def job(taskType):
sql_sel = f'''select securities_code,exchange from sys_base_enterprise_ipo where category = '1' and social_credit_code='{social_code}' ''' sql_sel = f'''select securities_code,exchange from sys_base_enterprise_ipo where category = '1' and social_credit_code='{social_code}' '''
cursor.execute(sql_sel) cursor.execute(sql_sel)
row = cursor.fetchone() row = cursor.fetchone()
try:
securities_code = row[0] securities_code = row[0]
pass
except:
log.info(f'======{social_code}没有股票代码======')
continue
exchange = row[1] exchange = row[1]
# for code in list_code: # for code in list_code:
# social_code = rows[0] # social_code = rows[0]
...@@ -510,18 +516,17 @@ def job(taskType): ...@@ -510,18 +516,17 @@ def job(taskType):
time.sleep(1) time.sleep(1)
print(res_baocun.text) print(res_baocun.text)
if len(info_date_list) != 0:
for date in info_date_list:
date_list.append(date)
print(date_list)
date_list = str(date_list)
for nnn in range(0, 3): for nnn in range(0, 3):
try: try:
add_date(com_code,date_list) add_date(com_code, date_list)
break break
except: except:
time.sleep(1) time.sleep(1)
if len(info_date_list) != 0:
for date in info_date_list:
date_list.append(date)
print(date_list)
# date_list = str(date_list)
end_time = time.time() end_time = time.time()
log.info(f'===={com_code}====该企业耗时{end_time-start_time}===') log.info(f'===={com_code}====该企业耗时{end_time-start_time}===')
cnx.close() cnx.close()
...@@ -529,7 +534,7 @@ def job(taskType): ...@@ -529,7 +534,7 @@ def job(taskType):
baseCore.close() baseCore.close()
if __name__=='__main__': if __name__=='__main__':
task_type = '财务数据/东方财富网' task_type = '财务数据/东方财富网/福布斯'
job(task_type) job(task_type)
......
""" """
...@@ -15,20 +15,18 @@ from bs4 import BeautifulSoup ...@@ -15,20 +15,18 @@ from bs4 import BeautifulSoup
from kafka import KafkaProducer from kafka import KafkaProducer
from datetime import datetime from datetime import datetime
from base import BaseCore from base import BaseCore
from fdfs_client.client import get_tracker_conf, Fdfs_client # from fdfs_client.client import get_tracker_conf, Fdfs_client
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cnx_ip = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
cursor_ = cnx_.cursor()
tracker_conf = get_tracker_conf('./client.conf') cnx = baseCore.cnx
client = Fdfs_client(tracker_conf) cursor = baseCore.cursor
taskType = '企业公告/证监会' # tracker_conf = get_tracker_conf('./client.conf')
# client = Fdfs_client(tracker_conf)
taskType = '企业公告/证监会/福布斯'
def RequestUrl(url, payload, social_code,start_time): def RequestUrl(url, payload, social_code,start_time):
# ip = get_proxy()[random.randint(0, 3)] # ip = get_proxy()[random.randint(0, 3)]
...@@ -138,30 +136,25 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type ...@@ -138,30 +136,25 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type
inster = False inster = False
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s''' sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s'''
cursor_.execute(sel_sql, (social_code, pdf_url)) cursor.execute(sel_sql, (social_code, pdf_url))
selects = cursor_.fetchone() selects = cursor.fetchone()
if selects: if selects:
print(f'com_name:{short_name}、{pdf_url}已存在') print(f'com_name:{short_name}、{pdf_url}已存在')
return inster return inster
# 信息插入数据库 # 信息插入数据库
try: try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
list_info = [ list_info = [
social_code, social_code,
name_pdf,
'', # 摘要
'', # 正文
pub_time, # 发布时间
pdf_url, pdf_url,
'证监会', '证监会',
report_type,
'1', '1',
'zh'
] ]
cursor_.execute(insert_sql, tuple(list_info)) #144数据库
cnx_.commit() cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
insert = True insert = True
return insert return insert
except: except:
...@@ -171,34 +164,42 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type ...@@ -171,34 +164,42 @@ def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type
return insert return insert
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time): def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
sel_sql = "select article_id from brpa_source_article where source_address = %s" #上传至文件服务器
cursor_.execute(sel_sql, pdf_url) retData = baseCore.upLoadToServe(pdf_url,8,social_code)
row = cursor_.fetchone() #附件插入att数据库
id = row[0] num = num + 1
# 先获取PDF链接下载pdf,在解析内容 att_id = baseCore.tableUpdate(retData,com_name,year,pdf_name,num)
try: content = retData['content']
res = requests.get(pdf_url) if retData['state']:
content = '' pass
# 读取文件内容, else:
with fitz.open(stream=res.content, filetype='pdf') as doc: log.info(f'====pdf解析失败====')
for page in doc.pages():
content += page.get_text()
except:
# print('解析失败')
dic_result = {
'success': 'false',
'message': 'PDF解析失败',
'code': '204',
}
print(dic_result)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, dic_result['message'])
return False return False
# 先获取PDF链接下载pdf,在解析内容
# try:
# res = requests.get(pdf_url)
# content = ''
# # 读取文件内容,解析内容
# with fitz.open(stream=res.content, filetype='pdf') as doc:
# for page in doc.pages():
# content += page.get_text()
# except:
# # print('解析失败')
# dic_result = {
# 'success': 'false',
# 'message': 'PDF解析失败',
# 'code': '204',
# }
# log.info(dic_result)
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, dic_result['message'])
# return False
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
'attachmentIds': id, 'attachmentIds': att_id,
'author': '', 'author': '',
'content': content, 'content': content,
'contentWithTag': '', 'contentWithTag': '',
...@@ -247,11 +248,12 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time): ...@@ -247,11 +248,12 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time):
# 采集信息 # 采集信息
def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获取到的基本信息 def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库中获取到的基本信息
okCount = 0 okCount = 0
errorCount = 0 errorCount = 0
social_code = dic_info[2] social_code = dic_info[2]
short_name = dic_info[4] short_name = dic_info[4]
com_name = dic_info[1]
soup = RequestUrl(url, payload, social_code, start_time) soup = RequestUrl(url, payload, social_code, start_time)
if soup == '': if soup == '':
...@@ -298,9 +300,9 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获 ...@@ -298,9 +300,9 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
pdf_url_info = td_list[2] pdf_url_info = td_list[2]
# print(pdf_url) # print(pdf_url)
pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'') pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[1].strip('\'')
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'') pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
year = pub_time[:4] year = pub_time[:4]
report_type = td_list[4].text.strip() report_type = td_list[4].text.strip()
...@@ -311,7 +313,7 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获 ...@@ -311,7 +313,7 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
# # 公告信息列表 # # 公告信息列表
# okCount = okCount + 1 # okCount = okCount + 1
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败 # 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time) result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num)
if result: if result:
# 公告信息列表 # 公告信息列表
...@@ -335,6 +337,7 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获 ...@@ -335,6 +337,7 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
if __name__ == '__main__': if __name__ == '__main__':
num = 0
headers = { headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate', 'Accept-Encoding': 'gzip, deflate',
...@@ -370,7 +373,8 @@ if __name__ == '__main__': ...@@ -370,7 +373,8 @@ if __name__ == '__main__':
while True: while True:
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode') # social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
social_code = '9110000071092841XX'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
...@@ -391,15 +395,16 @@ if __name__ == '__main__': ...@@ -391,15 +395,16 @@ if __name__ == '__main__':
# 股票代码0、2、3开头的为深圳交易所,6、9开头的为上海交易所,4、8开头的为北京交易所 # 股票代码0、2、3开头的为深圳交易所,6、9开头的为上海交易所,4、8开头的为北京交易所
code = dic_info[3] code = dic_info[3]
short_name = dic_info[4] short_name = dic_info[4]
com_name = dic_info[1]
dic_parms = getUrl(code, url_parms, Catagory2_parms) dic_parms = getUrl(code, url_parms, Catagory2_parms)
dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls) dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
if len(dic_parms) > 0: if len(dic_parms) > 0:
start_time_cj = time.time() start_time_cj = time.time()
SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time) SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time,num)
log.info(f'{code}==========={short_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}') log.info(f'{code}==========={short_name},{com_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
start_time_ls = time.time() start_time_ls = time.time()
SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time) SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time,num)
log.info(f'{code}==========={short_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}') log.info(f'{code}==========={short_name},{com_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
# UpdateInfoSql(retData,retData_ls,social_code) # UpdateInfoSql(retData,retData_ls,social_code)
# log.info(f'{code}================更新成功') # log.info(f'{code}================更新成功')
end_time = time.time() end_time = time.time()
...@@ -410,7 +415,7 @@ if __name__ == '__main__': ...@@ -410,7 +415,7 @@ if __name__ == '__main__':
cursor.close() cursor.close()
cnx.close() cnx.close()
cursor_.close() # cursor_.close()
cnx_.close() # cnx_.close()
# 释放资源 # 释放资源
baseCore.close() baseCore.close()
#补充剩余核心人员信息
#先采集天眼查id,再通过id采集核心人员信息
import datetime
import json
import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
'Cookie':'HWWAFSESID=0e10b77869899be8365; HWWAFSESTIME=1688781923708; csrfToken=VeTF4UIZKJ0q6yWmgfC_FLqv; TYCID=e7cec7501d3311eea9dcb9fb7af79aad; ssuid=3142278034; sajssdk_2015_cross_new_user=1; bannerFlag=true; _ga=GA1.2.1006597844.1688781929; _gid=GA1.2.146077413.1688781929; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1688781929; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103123002%22}; tyc-user-info-save-time=1688781977329; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyMzAwMiIsImlhdCI6MTY4ODc4MTk3NiwiZXhwIjoxNjkxMzczOTc2fQ.Luw0DCFul8WxRNOM8X5-NCmy_z3BwJC5JBvofWqWkSQOleJ6zJU0SRbqwAobPfOfVyGFDUBqmxxWd4YKCeCWeQ; tyc-user-phone=%255B%252217103123002%2522%255D; searchSessionId=1688778331.16177575; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22302953956%22%2C%22first_id%22%3A%22189333f38cb947-0fb9b252742a6c-26031d51-921600-189333f38cdcdd%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg5MzMzZjM4Y2I5NDctMGZiOWIyNTI3NDJhNmMtMjYwMzFkNTEtOTIxNjAwLTE4OTMzM2YzOGNkY2RkIiwiJGlkZW50aXR5X2xvZ2luX2lkIjoiMzAyOTUzOTU2In0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22302953956%22%7D%2C%22%24device_id%22%3A%22189333f38cb947-0fb9b252742a6c-26031d51-921600-189333f38cdcdd%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1688781980',
# 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
def doJob():
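    # Overall flow: pull a social credit code from Redis, look up its base record,
    # resolve the Tianyancha company id (writing it back to EnterpriseInfo when missing),
    # then try three Tianyancha executive endpoints in turn and push the collected
    # records to the sync/executive API.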
while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
# social_code = 'ZZSN23011300000004'
if social_code == None:
time.sleep(20)
continue
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
#数据重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode',social_code)
continue
id = data[0]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData:
tycid = retData['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
else:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
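            # Only the first results page is fetched here (pageSize=20); widen the range if a company has more executives.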
for page in range(1,2):
t = int(time.time()*1000)
#https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
res = requests.get(url,headers=headers,proxies=ip) # ,verify=False
time.sleep(1)
list_all = res.json()['data']['dataList']
if list_all:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
#todo:获取当前年份
now = datetime.datetime.now()
year = now.year
try:
birthYear = year - int(one_info['age'])
except:
birthYear = ''
StockKeepings = one_info['numberOfShares']
currentTerm = one_info['term']
personInfo = one_info['resume']
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"sort":str(num)
}
dic_json_img = {
"socialCreditCode":social_code,
"name":name,
"sex":sex,
"education":education,
"position":position,
"salary":Salary,
"birthYear":birthYear,
"shareNum":StockKeepings,
"shareRatio":'',
"benefitShare":'',
"currentTerm":currentTerm,
"personInfo":personInfo,
"头像":person_img,
"sort":str(num)
}
num = num+1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
else:
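                    # Fallback 1: the listed-executive endpoint returned nothing, so try the HK listed-executive endpoint.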
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
time.sleep(1)
list_all = res.json()['data']['dataList']
if list_all:
for one_info in list_all:
name = one_info['personal_name']
sex = one_info['gender2']
education = ''
position = one_info['position_name']
Salary = ''
birthYear = ''
personInfo = one_info['resume_cn']
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
else:
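                        # Fallback 2: neither listed-company endpoint had data, so fall back to the generic key-staff endpoint (cloud-company-background/company/dim/staff).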
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
time.sleep(1)
list_all = res.json()['data']['result']
# todo:增加一种情况
if list_all:
for one_info in list_all:
name = one_info['name']
try:
sex = one_info['sex']
except:
sex = ''
try:
education = one_info['education']
except:
education = ''
try:
position = one_info['typeSore']
except:
position = ''
try:
Salary = one_info['salary']
except:
Salary = ''
birthYear = ''
try:
shareRatio = one_info['percent']
except:
shareRatio = ''
try:
benefitShare = one_info['finalBenefitShares']
except:
benefitShare = ''
try:
currentTerm = one_info['term']
except:
currentTerm = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
person_res = requests.get(person_url, headers=headers, proxies=ip)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
json_updata = json.dumps(list_one_info)
            if json_updata == '[]':
                continue
response = requests.post('http://114.115.236.206:8088/sync/executive',data=json_updata,timeout=300, verify=False)
print(response.text)
log.info('=========成功======')
except Exception as e:
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
# 重新塞入redis
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
# break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
doJob()
\ No newline at end of file
...@@ -19,7 +19,7 @@ jieba.cut("必须加载jieba") ...@@ -19,7 +19,7 @@ jieba.cut("必须加载jieba")
smart =smart_extractor.SmartExtractor('cn') smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor= cnx.cursor() cursor= cnx.cursor()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
...@@ -37,7 +37,7 @@ headers = { ...@@ -37,7 +37,7 @@ headers = {
'Referer': 'https://www.tianyancha.com/', 'Referer': 'https://www.tianyancha.com/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51' 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
} }
taskType = '企业动态/天眼查/福布斯' taskType = '企业动态/天眼查'
def beinWork(tyc_code, social_code): def beinWork(tyc_code, social_code):
start_time = time.time() start_time = time.time()
time.sleep(3) time.sleep(3)
...@@ -154,11 +154,14 @@ def beinWork(tyc_code, social_code): ...@@ -154,11 +154,14 @@ def beinWork(tyc_code, social_code):
# 开始进行智能解析 # 开始进行智能解析
# lang = baseCore.detect_language(title) # lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang) # smart = smart_extractor.SmartExtractor(lang)
contentText = smart.extract_by_url(link).text #带标签正文
contentWithTag = smart.extract_by_url(link).text
#不带标签正文
content = smart.extract_by_url(link).cleaned_text
# time.sleep(3) # time.sleep(3)
except Exception as e: except Exception as e:
contentText = '' contentWithTag = ''
if contentText == '': if contentWithTag == '':
log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}') log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败' e = '获取正文失败'
state = 0 state = 0
...@@ -174,7 +177,7 @@ def beinWork(tyc_code, social_code): ...@@ -174,7 +177,7 @@ def beinWork(tyc_code, social_code):
continue continue
try: try:
#todo:更换插入的库 #todo:更换插入的库
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,author,type) values(%s,%s,%s,%s,%s)''' insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
# 动态信息列表 # 动态信息列表
up_okCount = up_okCount + 1 up_okCount = up_okCount + 1
...@@ -182,14 +185,73 @@ def beinWork(tyc_code, social_code): ...@@ -182,14 +185,73 @@ def beinWork(tyc_code, social_code):
social_code, social_code,
link, link,
'天眼查', '天眼查',
source,
'2', '2',
] ]
cursor_.execute(insert_sql, tuple(list_info)) cursor_.execute(insert_sql, tuple(list_info))
cnx_.commit() cnx_.commit()
# 采集一条资讯记录一条,记录该企业采到了多少的资讯 # 采集一条资讯记录一条,记录该企业采到了多少的资讯
log.info(f'{social_code}----{link}:新增一条') log.info(f'{social_code}----{link}:新增一条')
# 采集一条资讯记录一条,记录该企业采到了多少的资讯
log.info(f'{social_code}----{link}:新增一条')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:插入一条数据,并传入kafka
dic_news = {
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': contentWithTag,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '天眼查',
'publishDate': time_format,
'sid': '1684032033495392257',
'sourceAddress': link, # 原文链接
'summary': info_page['abstracts'],
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': time_format[:4]
}
except Exception as e:
log.info(f'传输失败:{social_code}----{link}')
# e = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
continue
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# 传输成功,写入日志中
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
# return True
except Exception as e: except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
log.error(dic_result)
e = 'Kafka操作失败'
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e) baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
...@@ -205,8 +267,9 @@ def doJob(): ...@@ -205,8 +267,9 @@ def doJob():
while True: while True:
start = time.time() start = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 天眼查ID19276488
social_code = baseCore.redicPullData('NewsEnterpriseFbs:gnqy_socialCode') # social_code = baseCore.redicPullData('NewsEnterpriseFbs:gnqy_socialCode')
social_code = '912301001275921118'
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
continue continue
...@@ -222,19 +285,25 @@ def doJob(): ...@@ -222,19 +285,25 @@ def doJob():
id = data[0] id = data[0]
xydm = data[2] xydm = data[2]
tycid = data[11] tycid = data[11]
if tycid == None: if tycid == None or tycid== '':
try: try:
retData = getTycIdByXYDM(xydm) retData = getTycIdByXYDM(xydm)
tycid = retData['tycData']['id'] if retData:
tycid = retData['id']
# todo:写入数据库 # todo:写入数据库
updateSql = f"update Enterprise set TYCID = '{tycid}' where SocialCode = '{xydm}'" updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql) cursor_.execute(updateSql)
cnx_.commit() cnx_.commit()
else:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterpriseFbs:gnqy_socialCode', social_code)
except: except:
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败') baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code) baseCore.rePutIntoR('NewsEnterpriseFbs:gnqy_socialCode', social_code)
continue continue
count = data[17] count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理") log.info(f"{id}---{xydm}----{tycid}----开始处理")
...@@ -242,8 +311,10 @@ def doJob(): ...@@ -242,8 +311,10 @@ def doJob():
# 开始采集企业动态 # 开始采集企业动态
retData = beinWork(tycid, xydm) retData = beinWork(tycid, xydm)
# 信息采集完成后将该企业的采集次数更新
# baseCore.updateRun(xydm, runType, count) runType = 'NewsRunCount'
count += 1
baseCore.updateRun(xydm, runType, count)
total = retData['total'] total = retData['total']
up_okCount = retData['up_okCount'] up_okCount = retData['up_okCount']
up_errorCount = retData['up_errorCount'] up_errorCount = retData['up_errorCount']
...@@ -257,7 +328,7 @@ def doJob(): ...@@ -257,7 +328,7 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}') baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5) time.sleep(5)
# break
cursor.close() cursor.close()
cnx.close() cnx.close()
# 释放资源 # 释放资源
......
...@@ -10,9 +10,15 @@ from base.BaseCore import BaseCore ...@@ -10,9 +10,15 @@ from base.BaseCore import BaseCore
requests.adapters.DEFAULT_RETRIES = 5 requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
headers={ # headers={
'X-AUTH-TOKEN':'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzY4MzgxNjk4NCIsImlhdCI6MTY5MDE3ODYyOCwiZXhwIjoxNjkyNzcwNjI4fQ.VV3Zoa4RM5nVN8UXBc0-81KMGqLzTOme6rButeETGfFQi7p5h4ydg8CFrEsizr_iFwB3_BVaKR2o2xR-M4ipbQ', # 'X-AUTH-TOKEN':'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzY4MzgxNjk4NCIsImlhdCI6MTY5MDE3ODYyOCwiZXhwIjoxNjkyNzcwNjI4fQ.VV3Zoa4RM5nVN8UXBc0-81KMGqLzTOme6rButeETGfFQi7p5h4ydg8CFrEsizr_iFwB3_BVaKR2o2xR-M4ipbQ',
'X-TYCID':'77e997401d5f11ee9e91d5a0fd3c0b83', # 'X-TYCID':'77e997401d5f11ee9e91d5a0fd3c0b83',
# 'version':'TYC-Web',
# 'Content-Type':'application/json;charset=UTF-8'
# }
headers = {
'X-TYCID':'30c1289042f511ee9182cd1e1bcaa517',
# 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5MjkzMzIxMiwiZXhwIjoxNjk1NTI1MjEyfQ.BKxDem8fpgeDHrIgm3qCoF76ueHtQSG1DggiTl4FAaoNKt4gem6NTX1XYndPXqVj9TXfl-8yp2kKE3jY66dyig',
'version':'TYC-Web', 'version':'TYC-Web',
'Content-Type':'application/json;charset=UTF-8' 'Content-Type':'application/json;charset=UTF-8'
} }
...@@ -27,6 +33,7 @@ def getTycIdByXYDM(xydm): ...@@ -27,6 +33,7 @@ def getTycIdByXYDM(xydm):
paramJsonData = {'keyword':xydm} paramJsonData = {'keyword':xydm}
try: try:
headers['User-Agent'] = baseCore.getRandomUserAgent() headers['User-Agent'] = baseCore.getRandomUserAgent()
headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
response = requests.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip) response = requests.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
time.sleep(random.randint(3, 5)) time.sleep(random.randint(3, 5))
retJsonData =json.loads(response.content.decode('utf-8')) retJsonData =json.loads(response.content.decode('utf-8'))
...@@ -35,14 +42,14 @@ def getTycIdByXYDM(xydm): ...@@ -35,14 +42,14 @@ def getTycIdByXYDM(xydm):
retData['state'] = True retData['state'] = True
retData['tycData'] = retJsonData['data'][0] retData['tycData'] = retJsonData['data'][0]
response.close() response.close()
return retData return retData['tycData']
else: else:
log.error(f"{xydm}------{retJsonData}") log.error(f"{xydm}------{retJsonData}")
response.close() response.close()
return retData return retData['tycData']
except Exception as e: except:
log.error(f"{xydm}---exception---{e}") log.error(f"---{xydm}--天眼查token失效---")
return retData return retData['tycData']
# 更新天眼查企业基本信息 # 更新天眼查企业基本信息
......
...@@ -3,7 +3,6 @@ import json ...@@ -3,7 +3,6 @@ import json
import requests, time, pymysql import requests, time, pymysql
import jieba import jieba
import sys import sys
from kafka import KafkaProducer from kafka import KafkaProducer
from getTycId import getTycIdByXYDM from getTycId import getTycIdByXYDM
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
...@@ -12,15 +11,15 @@ from base.smart import smart_extractor ...@@ -12,15 +11,15 @@ from base.smart import smart_extractor
# import BaseCore # import BaseCore
# from smart import smart_extractor # from smart import smart_extractor
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# 初始化,设置中文分词 # 初始化,设置中文分词
jieba.cut("必须加载jieba") jieba.cut("必须加载jieba")
smart =smart_extractor.SmartExtractor('cn') smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor() cursor = cnx.cursor()
pageSize = 10 pageSize = 10
headers = { headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
...@@ -134,10 +133,10 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -134,10 +133,10 @@ def beinWork(tyc_code, social_code,start_time):
link = info_page['uri'] link = info_page['uri']
try: try:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' ''' sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (link, social_code)) cursor_.execute(sel_sql, (link, social_code))
except Exception as e: except Exception as e:
print(e) print(e)
selects = cursor.fetchone() selects = cursor_.fetchone()
if selects: if selects:
log.info(f'{tyc_code}-----{social_code}----{link}:已经存在') log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
...@@ -156,7 +155,10 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -156,7 +155,10 @@ def beinWork(tyc_code, social_code,start_time):
# 开始进行智能解析 # 开始进行智能解析
# lang = baseCore.detect_language(title) # lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang) # smart = smart_extractor.SmartExtractor(lang)
#带标签正文
contentText = smart.extract_by_url(link).text contentText = smart.extract_by_url(link).text
#不带标签正文
content = smart.extract_by_url(link).cleaned_text
# time.sleep(3) # time.sleep(3)
except Exception as e: except Exception as e:
contentText = '' contentText = ''
...@@ -175,36 +177,25 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -175,36 +177,25 @@ def beinWork(tyc_code, social_code,start_time):
pass pass
continue continue
try: try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
# 动态信息列表 # 动态信息列表
up_okCount = up_okCount + 1 up_okCount = up_okCount + 1
list_info = [ list_info = [
social_code, social_code,
title,
info_page['abstracts'], # 摘要
contentText, # 正文
time_format, # 发布时间
link, link,
'天眼查', '天眼查',
source,
'2', '2',
'zh'
] ]
cursor.execute(insert_sql, tuple(list_info)) cursor_.execute(insert_sql, tuple(list_info))
cnx.commit() cnx_.commit()
# 采集一条资讯记录一条,记录该企业采到了多少的资讯 # 采集一条资讯记录一条,记录该企业采到了多少的资讯
log.info(f'{social_code}----{link}:新增一条') log.info(f'{social_code}----{link}:新增一条')
sel_sql = "select article_id from brpa_source_article where source_address = %s and social_credit_code = %s"
cursor.execute(sel_sql, (link, social_code))
row = cursor.fetchone()
id = row[0]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:插入一条数据,并传入kafka # todo:插入一条数据,并传入kafka
dic_news = { dic_news = {
'attachmentIds': id, 'attachmentIds': '',
'author': '', 'author': '',
'content': contentText, 'content': content,
'contentWithTag': contentText, 'contentWithTag': contentText,
'createDate': time_now, 'createDate': time_now,
'deleteFlag': '0', 'deleteFlag': '0',
...@@ -222,7 +213,6 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -222,7 +213,6 @@ def beinWork(tyc_code, social_code,start_time):
'year': time_format[:4] 'year': time_format[:4]
} }
except Exception as e: except Exception as e:
log.info(f'传输失败:{social_code}----{link}') log.info(f'传输失败:{social_code}----{link}')
e = '数据库传输失败' e = '数据库传输失败'
state = 0 state = 0
...@@ -237,7 +227,6 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -237,7 +227,6 @@ def beinWork(tyc_code, social_code,start_time):
json.dumps(dic_news, ensure_ascii=False).encode('utf8')) json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10)) print(kafka_result.get(timeout=10))
dic_result = { dic_result = {
'success': 'ture', 'success': 'ture',
'message': '操作成功', 'message': '操作成功',
...@@ -250,7 +239,6 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -250,7 +239,6 @@ def beinWork(tyc_code, social_code,start_time):
baseCore.recordLog(social_code, taskType, state, takeTime, link, '') baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
# return True # return True
except Exception as e: except Exception as e:
dic_result = { dic_result = {
'success': 'false', 'success': 'false',
'message': '操作失败', 'message': '操作失败',
...@@ -269,12 +257,12 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -269,12 +257,12 @@ def beinWork(tyc_code, social_code,start_time):
retData['up_repetCount'] = up_repetCount retData['up_repetCount'] = up_repetCount
return retData return retData
# 日志信息保存至现已创建好数据库中,因此并没有再对此前保存日志信息数据库进行保存 # 日志信息保存至现已创建好数据库中,因此并没有再对此前保存日志信息数据库进行保存
def doJob(): def doJob():
while True: while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode') # social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
social_code = '912301001275921118'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
time.sleep(20) time.sleep(20)
...@@ -291,28 +279,31 @@ def doJob(): ...@@ -291,28 +279,31 @@ def doJob():
id = data[0] id = data[0]
xydm = data[2] xydm = data[2]
tycid = data[11] tycid = data[11]
if tycid == None: if tycid == None or tycid == '':
try: try:
retData = getTycIdByXYDM(xydm) retData = getTycIdByXYDM(xydm)
tycid = retData['tycData']['id'] if retData:
#todo:写入数据库 tycid = retData['id']
updateSql = f"update Enterprise set TYCID = '{tycid}' where SocialCode = '{xydm}'" # todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql) cursor_.execute(updateSql)
cnx_.commit() cnx_.commit()
else:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
except: except:
state = 0 state = 0
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败') baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode',social_code) baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue continue
count = data[17] count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理") log.info(f"{id}---{xydm}----{tycid}----开始处理")
start_time = time.time() start_time = time.time()
# updateBeginSql = f"update ssqy_tyc set update_state=2,date_time=now() where id={id}"
# cursor.execute(updateBeginSql)
# cnx.commit()
# 开始采集企业动态 # 开始采集企业动态
retData = beinWork(tycid, xydm,start_time) retData = beinWork(tycid, xydm,start_time)
# 信息采集完成后将该企业的采集次数更新 # 信息采集完成后将该企业的采集次数更新
......
#下载pdf文件
import os
import pymysql
import requests
import urllib3
from pymysql.converters import escape_string
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
baseCore = BaseCore()
log =baseCore.getLogger()
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
def get_file_name(headers):
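    # Parse the download filename from the Content-Disposition response header;
    # fall back to a sequence-based .pdf name when the header is absent or malformed.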
filename = ''
if 'Content-Disposition' in headers and headers['Content-Disposition']:
disposition_split = headers['Content-Disposition'].split(';')
if len(disposition_split) > 1:
if disposition_split[1].strip().lower().startswith('filename='):
file_name = disposition_split[1].split('=')
if len(file_name) > 1:
filename = file_name[1]
if not filename:
return baseCore.getNextSeq()+".pdf"
return filename
def downFile(url,path):
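    # Download the file through the local proxy (127.0.0.1:1080), streaming it to `path`
    # in 1KB chunks; returns the saved filename, or False if the request or write fails.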
try:
baseCore.mkPath(path)
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
response = requests.get(url, proxies=proxy, headers=headers, verify=False,timeout=10)
fileName = get_file_name(response.headers)
with open(os.path.join(path, fileName), "wb") as pyFile:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
pyFile.write(chunk)
except Exception as e:
log.error(f"出错了----------{e}")
return False
return fileName
def getPath(str):
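    # Strip a few characters (colons, quotes, slashes, spaces) that are awkward in
    # Windows directory names so the value can be used directly as a path segment.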
str = str.replace(':', '')
str = str.replace(': ', '')
str = str.replace(' ', '')
str = str.replace('"', '')
str = str.replace("'", '')
str = str.replace("/", '')
return str
if __name__ == '__main__':
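    # Main loop: pull one unprocessed row (state=0) at a time, build the target directory
    # from website/ftype/stype/ttype, download the PDF, then mark the row state=1 on success
    # or state=2 on failure; exits when no pending rows remain.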
while True :
selectSql = f"select id,url,website,ftype,stype,ttype from usvsrussia where state=0 order by id asc limit 1"
cursor.execute(selectSql)
data = cursor.fetchone()
if data:
id=data[0]
url=data[1]
website=data[2]
ftype=data[3]
stype=data[4]
ttype=data[5]
path=r'D:\美国VS俄罗斯制裁'
log.info(f"开始处理{url}----")
if website:
path = os.path.join(path, getPath(website))
if ftype:
path = os.path.join(path, getPath(ftype))
if stype:
path = os.path.join(path, getPath(stype))
if ttype:
path = os.path.join(path, getPath(ttype))
fileName = downFile(url,path)
if fileName:
updateSql = f"update usvsrussia set state=1,pdf_name='{fileName}' ,pdf_path='{escape_string(path)}' where id={id}"
log.info(f"开始处理{url}----处理ok")
else:
updateSql = f"update usvsrussia set state=2 where id={id}"
log.info(f"开始处理{url}----处理error")
cursor.execute(updateSql)
cnx.commit()
else:
log.info("数据处理完毕,程序退出")
break
baseCore.close()
cursor.close()
cnx.close()
\ No newline at end of file
#OFAC:美国财政部外国资产控制办公室 (OFAC),数量在200左右,四个类型里的所有带黑点、PDF文件都要。https://ofac.treasury.gov/
# 美国对俄罗斯相关制裁
# 俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions
# 乌克兰/俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions
# 2017年制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/countering-americas-adversaries-through-sanctions-act-related-sanctions
# 马格尼茨基制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/the-magnitsky-sanctions
import os
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from pymysql.converters import escape_string
from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore
baseCore = BaseCore()
log =baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# usvsrussia
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
def job1():
log.info("开始采集----俄罗斯有害外国活动制裁")
path=r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent = baseCore.buildDriver(path,headless=False)
url='https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions'
driverContent.get(url)
ftype="Russian Harmful Foreign Activities Sanctions"
# IMPORTANT ADVISORIES AND INFORMATION 重要建议和信息
stype='IMPORTANT ADVISORIES AND INFORMATION'
log.info(f"开始采集栏目---{stype}")
# //*[@id="node-35986"]/div/ul[1]/li
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[1]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME,'a') #a标签
text= aEle.text
href = aEle.get_attribute('href')
time = liEle.text.replace(text,'')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql=f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
#log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#Price Cap Policies //*[@id="node-35986"]/div/ul[2]/li
stype = 'Price Cap Policies'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[2]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text,'(',')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# INTERPRETIVE GUIDANCE 解释指导
#INTERPRETIVE GUIDANCE 单独处理
#FREQUENTLY ASKED QUESTIONS 单独处理
#RUSSIAN HARMFUL FOREIGN ACTIVITIES SANCTIONS DIRECTIVES
liEles = driverContent.find_elements(By.XPATH, '//*[@id="directives"]/ul/li')
stype = 'RUSSIAN HARMFUL FOREIGN ACTIVITIES SANCTIONS DIRECTIVES'
log.info(f"开始采集栏目---{stype}")
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text,'(',')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#APPLYING FOR A SPECIFIC OFAC LICENSE
#GUIDANCE ON OFAC LICENSING POLICY
stype = 'GUIDANCE ON OFAC LICENSING POLICY'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[6]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time =''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#GENERAL LICENSES
stype = 'GENERAL LICENSES'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[7]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text,'(',')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#Executive Orders
stype = 'Executive Orders'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[8]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text,'(',')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#Determinations
stype = 'Determinations'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[9]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#Statutes
stype = 'Statutes'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[10]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#Code of Federal Regulations
#Federal Register Notices
stype = 'Federal Register Notices'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-35986"]/div/ul[12]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text =liEle.text
href = aEle.get_attribute('href')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{ftype}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
driverContent.close()
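# ---------------------------------------------------------------------------
# Sketch (not the original implementation): the per-column blocks above repeat the
# same find -> dedup-check -> insert pattern, so they could in principle be driven
# by one helper. The name collect_section and its parameters are assumptions; it
# uses pymysql parameterized queries (%s placeholders) instead of f-string SQL,
# which avoids manual escape_string calls and keeps the stored and queried url
# values identical.
# ---------------------------------------------------------------------------
def collect_section(driver, xpath, ftype, stype, ttype='', parse_time=True):
    """Collect every <li> link under one <ul> column and insert new rows
    (assumes the module-level cursor/cnx/log/baseCore objects used elsewhere in this script)."""
    log.info(f"开始采集栏目---{stype}")
    for liEle in driver.find_elements(By.XPATH, xpath):
        aEle = liEle.find_element(By.TAG_NAME, 'a')
        text = liEle.text
        href = aEle.get_attribute('href')
        pub_time = baseCore.getSubStr(text, '(', ')') if parse_time else ''
        cursor.execute(
            "select count(1) from usvsrussia where ftype=%s and stype=%s and url=%s",
            (ftype, stype, href))
        if cursor.fetchone()[0] > 0:
            log.info("已采集,跳过")
            continue
        cursor.execute(
            "insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) "
            "values (%s,%s,%s,%s,%s,%s,%s,0)",
            ('美国财政部外国资产控制办公室', ftype, stype, ttype, href, text, pub_time))
        cnx.commit()
# usage sketch: collect_section(driverContent, '//*[@id="node-35986"]/div/ul[8]/li', ftype, 'Executive Orders')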
def job2():
log.info("开始采集----乌克兰-俄罗斯有害外国活动制裁")
path = r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent = baseCore.buildDriver(path, headless=False)
url='https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions'
driverContent.get(url)
ftype="Ukraine-/Russia-related Sanctions"
# IMPORTANT ADVISORIES
stype = 'IMPORTANT ADVISORIES'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[1]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# SANCTIONS BROCHURES
stype = 'SANCTIONS BROCHURES'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[1]/li')  # note: same XPath (ul[1]) as the IMPORTANT ADVISORIES column above; the ul index may need verification
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = aEle.text
href = aEle.get_attribute('href')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#ADDITIONAL UKRAINE-/RUSSIA-RELATED SANCTIONS INFORMATION
#FREQUENTLY ASKED QUESTIONS
#SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST
stype = 'SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST'
log.info(f"开始采集栏目---{stype}")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="directives"]/ul[1]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Archived Directives
stype = 'SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST'
log.info(f"开始采集栏目---{stype}---Archived Directives")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="directives"]/ul[2]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','Archived Directives'," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#INTERPRETIVE GUIDANCE
stype = 'INTERPRETIVE GUIDANCE'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[5]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#GUIDANCE ON OFAC LICENSING POLICY
stype = 'GUIDANCE ON OFAC LICENSING POLICY'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[7]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
#time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# GENERAL LICENSES
stype = 'GENERAL LICENSES'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[8]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
#time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Executive Orders
stype = 'Executive Orders'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[9]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Determinations
stype = 'Determinations'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[10]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
#time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Statutes
stype = 'Statutes'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[11]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
# time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Federal Register Notices
stype = 'Federal Register Notices'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6416"]/div/ul[13]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
# time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
driverContent.close()
def job3():
log.info("开始采集----2017年制裁")
path = r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent = baseCore.buildDriver(path, headless=False)
url='https://ofac.treasury.gov/sanctions-programs-and-country-information/countering-americas-adversaries-through-sanctions-act-related-sanctions'
driverContent.get(url)
ftype="Countering America's Adversaries Through Sanctions Act of 2017 (CAATSA)"
stype = 'Countering Americas Adversaries Through Sanctions Act-Related Sanctions'
href="https://congress.gov/115/plaws/publ44/PLAW-115publ44.pdf"
text="Countering America’s Adversaries Through Sanctions Act” (Public Law 115-44) (CAATSA)"
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
else:
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','August 2, 2017',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#Other Documents Related to the Implementation of Section 105
stype = 'Other Documents Related to the Implementation of Section 105'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-7161"]/div/ul[2]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
#time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Ukraine-/Russia-related Directives
stype = 'Ukraine-/Russia-related Directives'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-7161"]/div/ul[4]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# ADDITIONAL CAATSA GUIDANCE AND INFORMATION
stype = 'ADDITIONAL CAATSA GUIDANCE AND INFORMATION'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-7161"]/div/ul[6]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Executive Orders
stype = 'Executive Orders'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-7161"]/div/ul[8]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Statutes
stype = 'Statutes'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-7161"]/div/ul[9]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
#time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
driverContent.close()
def job4():
log.info("开始采集----马格尼茨基制裁")
path = r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent = baseCore.buildDriver(path, headless=False)
url='https://ofac.treasury.gov/sanctions-programs-and-country-information/the-magnitsky-sanctions'
driverContent.get(url)
ftype = "Magnitsky Sanctions"
# INTERPRETIVE GUIDANCE
stype = 'INTERPRETIVE GUIDANCE'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6306"]/div/ul[2]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
#time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
#GUIDANCE ON OFAC LICENSING POLICY
stype = 'GUIDANCE ON OFAC LICENSING POLICY'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6306"]/div/ul[4]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
# time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Statutes
stype = 'Statutes'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6306"]/div/ul[5]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
# time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# Federal Register Notices
stype = 'Federal Register Notices'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-6306"]/div/ul[7]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
# time = baseCore.getSubStr(text, '(', ')')
time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
driverContent.close()
def job5():
log.info("开始采集----第二层数据采集")
path = r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent = baseCore.buildDriver(path, headless=False)
url='https://ofac.treasury.gov/sanctions-programs-and-country-information/non-english-translations-of-advisories-and-other-documents#ru_food_security'
driverContent.get(url)
ftype = "Russian Harmful Foreign Activities Sanctions"
# TRANSLATIONS OF OFAC FOOD SECURITY FACT SHEET: RUSSIA SANCTIONS AND AGRICULTURAL TRADE
stype = 'Non-English Translations of Advisories and Other Documents'
ttype='TRANSLATIONS OF OFAC FOOD SECURITY FACT SHEET: RUSSIA SANCTIONS AND AGRICULTURAL TRADE'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="ru_food_security"]/ul/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
#time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and ttype='{ttype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','{ttype}'," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# TRANSLATIONS OF OFAC FOOD SECURITY FACT SHEET: RUSSIA SANCTIONS AND AGRICULTURAL TRADE
stype = 'Non-English Translations of Advisories and Other Documents'
ttype = 'TRANSLATIONS OF NORTH KOREAN INFORMATION TECHNOLOGY WORKERS FACT SHEET'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[1]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and ttype='{ttype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','{ttype}'," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# TRANSLATIONS OF NORTH KOREAN INFORMATION TECHNOLOGY WORKERS ADVISORY
stype = 'Non-English Translations of Advisories and Other Documents'
ttype = 'TRANSLATIONS OF NORTH KOREAN INFORMATION TECHNOLOGY WORKERS ADVISORY'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[2]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and ttype='{ttype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','{ttype}'," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# TRANSLATIONS OF GLOBAL SHIPPING ADVISORY
stype = 'Non-English Translations of Advisories and Other Documents'
ttype = 'TRANSLATIONS OF GLOBAL SHIPPING ADVISORY'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[3]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and ttype='{ttype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','{ttype}'," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# TRANSLATIONS OF NORTH KOREAN SHIPPING ADVISORIES
stype = 'Non-English Translations of Advisories and Other Documents'
ttype = 'Translations of North Korean Shipping Advisories'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[4]/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and ttype='{ttype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','{ttype}'," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
# TRANSLATIONS OF NORTH KOREAN CYBER ADVISORY
stype = 'Non-English Translations of Advisories and Other Documents'
ttype = 'TRANSLATIONS OF NORTH KOREAN CYBER ADVISORY'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-27626"]/div/div/div[3]/div/div/div[8]/div/ul/li')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = liEle.text
href = aEle.get_attribute('href')
time = baseCore.getSubStr(text, '(', ')')
# time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and ttype='{ttype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','{ttype}'," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
driverContent.close()
def job6():
log.info("开始采集----第二层数据采集")
path = r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent = baseCore.buildDriver(path, headless=False)
url='https://ofac.treasury.gov/sanctions-programs-and-country-information/iran-sanctions/interpretative-rulings-on-ofac-policy'
driverContent.get(url)
ftype = "Russian Harmful Foreign Activities Sanctions"
stype = 'Interpretative Rulings on OFAC Policy'
log.info(f"开始采集栏目---{stype}---")
liEles = driverContent.find_elements(By.XPATH, '//*[@id="node-11996"]/div/div/div[3]/div/div/table/tbody/tr')
for liEle in liEles:
aEle = liEle.find_element(By.TAG_NAME, 'a') # a标签
text = aEle.text
href = aEle.get_attribute('href')
time = liEle.find_element(By.TAG_NAME, 'th').text  # date column (<th> cell)
#time = ''
selectCountSql = f"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor.execute(selectCountSql)
count = cursor.fetchone()[0]
if count > 0:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values (" \
f"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}',''," \
f"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
driverContent.close()
if __name__ == '__main__':
log.info("美国财政部外国资产控制办公室 (OFAC)网站开始采集")
#job1()
#job2()
#job3()
#job4()
#job5()
job6()
baseCore.close()
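# note: if cursor/cnx here are baseCore.cursor/baseCore.cnx, baseCore.close() above has already
# closed them, and calling cnx.close() a second time may raise pymysql "Already closed"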
cursor.close()
cnx.close()
\ No newline at end of file
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from pymysql.converters import escape_string
from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
cnx = baseCore.cnx
cursor = baseCore.cursor
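# Sketch (assumption, not the original author's code): a small fetch helper for the
# requests + BeautifulSoup path used in job_2 below; the name fetch_soup and the
# timeout/raise_for_status handling are added here for illustration only.
def fetch_soup(url, timeout=30):
    """Fetch a page with the shared headers and return a parsed BeautifulSoup document, or None on failure."""
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        log.error(f"请求失败 {url}: {e}")
        return None
    return BeautifulSoup(resp.content, 'html.parser')
# usage sketch: soup = fetch_soup('http://publication.pravo.gov.ru/documents/block/president')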
def job_2():
log.info('----开始采集---俄罗斯国家杂志----')
path = 'D:/chrome/chromedriver.exe'  # assumed path; the original 'D:chrome/...' is a drive-relative path and likely a typo
driverContent = baseCore.buildDriver(path, headless=False)  # note: this driver is never used below (the page is fetched with requests) and is not closed
url = 'http://publication.pravo.gov.ru/documents/block/president'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content,'html.parser')
container = soup.find('div',class_='documents-container')
web_list = container.find_all('div',class_='documents-table-row')
for web in web_list[:1]:
web_href = web.find('a')['href']
web_url = 'http://publication.pravo.gov.ru/' + web_href
title = web.find('a').text
print(title)