Commit 88707bfa  Author: 薛凌堃

9/18

Parent d27ed8e5
@@ -55,9 +55,9 @@ def closeSql(cnx,cursor):
def NewsEnterprise():
    cnx,cursor = connectSql()
    # #获取国内企业
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
-    cursor.execute(gn_query)
-    gn_result = cursor.fetchall()
+    # gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
+    # cursor.execute(gn_query)
+    # gn_result = cursor.fetchall()
    #获取国外企业
    gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
    cursor.execute(gw_query)
@@ -66,11 +66,11 @@ def NewsEnterprise():
    gw_social_list = [item[0] for item in gw_result]
    #todo:打印长度
    # print(len(gw_social_list))
-    gn_social_list = [item[0] for item in gn_result]
+    # gn_social_list = [item[0] for item in gn_result]
    print('=======')
    #将数据插入到redis中
-    for item in gn_social_list:
-        r.rpush('NewsEnterprise:gnqy_socialCode', item)
+    # for item in gn_social_list:
+    #     r.rpush('NewsEnterprise:gnqy_socialCode', item)
    for item in gw_social_list:
        r.rpush('NewsEnterprise:gwqy_socialCode', item)
@@ -275,7 +275,7 @@ def AnnualEnterpriseXueQ_task():
def AnnualEnterpriseUS():
    cnx,cursor = connectSql()
    # 获取美股企业
-    us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null"
+    us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'"
    # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
    #ZZSN22080900000025
    cursor.execute(us_query)
@@ -387,6 +387,44 @@ def yahooCode_task():
        print('定时采集异常', e)
        pass
+#新三板
+def NQEnterprise():
+    cnx, cursor = connectSql()
+    nq_query = "select gpdm from NQEnterprise where QCCID is not null "
+    cursor.execute(nq_query)
+    nq_result = cursor.fetchall()
+    nq_social_list = [item[0] for item in nq_result]
+    for item in nq_social_list:
+        # r.rpush('NQEnterprise:nq_Ipo', item)
+        r.rpush('NQEnterprise:nq_finance',item)
+        # r.rpush('NQEnterprise:nq_notice',item)
+    closeSql(cnx, cursor)
+def omeng():
+    cnx, cursor = connectSql()
+    # om_query = " SELECT A.SocialCode FROM EnterpriseInfo A , EnterpriseType B WHERE B.TYPE=6 AND A.SocialCode=B.SocialCode AND A.Place=1 order by A.id desc "
+    om_query = " SELECT A.SocialCode FROM EnterpriseInfo A , EnterpriseType B WHERE B.TYPE=6 AND A.SocialCode=B.SocialCode AND A.Place=2 "
+    cursor.execute(om_query)
+    om_result = cursor.fetchall()
+    om_social_list = [item[0] for item in om_result]
+    for item in om_social_list:
+        #企业基本信息
+        r.rpush('gwOMEnterprise_socialcode:BaseInfo', item)
+        #企业高管信息
+        # r.rpush('gnOMEnterprise_socialcode:TYCid', item)
+        # r.rpush('gnOMEnterprise_socialcode:CenterPerson', item)
+        #企业动态
+        r.rpush('gwOMEnterprise_socialcode:News', item)
+        #企业年报
+        # r.rpush('gnOMEnterprise_socialcode:Report', item)
+        #企业公告
+        # r.rpush('gnOMEnterprise_socialcode:Notice', item)
+    closeSql(cnx, cursor)
if __name__ == "__main__":
    start = time.time()
@@ -399,7 +437,9 @@ if __name__ == "__main__":
    # BaseInfoEnterprise()
    # FBS()
    # MengZhi()
-    AnnualEnterpriseUS()
+    NQEnterprise()
+    # omeng()
+    # AnnualEnterpriseUS()
    # NoticeEnterprise_task()
    # AnnualEnterprise_task()
    # NoticeEnterprise()
......
@@ -16,7 +16,7 @@ from fdfs_client.storage_client import *
from fdfs_client.exceptions import *
-def get_tracker_conf(conf_path='client.conf'):
+def get_tracker_conf(conf_path='../../client.conf'):
    cf = Fdfs_ConfigParser()
    tracker = {}
    try:
......
# -*- coding: utf-8 -*-
import redis
import time
from urllib.parse import quote
import pymongo
import requests
from bson.objectid import ObjectId
import json
from pyquery import PyQuery as pq
import urllib3
import hashlib
from kafka import KafkaProducer
import pandas as pd
import zhconv
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 通过企业名称或信用代码获取企查查id
def find_id_by_name(name):
urllib3.disable_warnings()
qcc_key = name
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
for lll in range(1, 6):
try:
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
break
except:
print('重试')
time.sleep(5)
continue
time.sleep(2)
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
KeyNo = ''
else:
KeyNo = ''
print("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo
# 判断字符串里是否含数字
def str_have_num(str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# 通过企查查id获取企业官网
def info_by_id(com_id,com_name):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
try:
result_dict = resp_dict['result']['Company']
except:
print(com_name + ":获取失败")
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
print(com_name + ":爬取完成")
return WebSite
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
#TODO:需要隔两个小时左右抓包修改
token = '1dcc61d85177733298e5827653706f1a' # 需要隔两个小时左右抓包修改
start = time.time()
list_weicha = []
#待采集企业文件
filename = 'data/内蒙古市属国有企业_官网.xlsx'
df_all = pd.read_excel('data/内蒙古市属国有企业.xls',dtype=str)
list_all_info = []
for num_df in range(162,len(df_all)):
#企业社会信用代码
id_code = str(df_all['本企业代码'][num_df])
#企业名称
com_name = str(df_all['企业名称'][num_df])
#行次
line = str(df_all['行次'][num_df])
dic_com = {
'line': line,
'social_code': id_code,
'com_name': com_name,
'website':''
}
company_id = find_id_by_name(id_code)
if company_id == "":
print(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
continue
WebSite = info_by_id(company_id,com_name)
dic_com['website'] = WebSite
log.info(f'---{num_df}-------{com_name}----------耗时{baseCore.getTimeCost(start,time.time())}')
list_all_info.append(dic_com)
baseCore.writerToExcel(list_all_info,filename)
# -*- coding: utf-8 -*-
import redis
import time
from urllib.parse import quote
import pymongo
import requests
from bson.objectid import ObjectId
import json
from pyquery import PyQuery as pq
import urllib3
import hashlib
from kafka import KafkaProducer
import pandas as pd
import zhconv
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 通过企业名称或信用代码获取企查查id
def find_id_by_name(name):
urllib3.disable_warnings()
qcc_key = name
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
for lll in range(1, 6):
try:
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
break
except:
print('重试')
time.sleep(5)
continue
time.sleep(2)
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
KeyNo = ''
else:
KeyNo = ''
print("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo
# 判断字符串里是否含数字
def str_have_num(str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# 通过企查查id获取企业官网
def info_by_id(com_id,com_name):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
try:
result_dict = resp_dict['result']['Company']
except:
print(com_name + ":获取失败")
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
print(com_name + ":爬取完成")
return WebSite
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
#TODO:需要隔两个小时左右抓包修改
token = '06de5b73cb9d6f9fcae27edaf94749cf' # 需要隔两个小时左右抓包修改
start = time.time()
list_weicha = []
#待采集企业文件
filename = 'data/钢铁企业_官网.xlsx'
df_all = pd.read_excel('data/钢铁企业.xlsx',dtype=str)
list_all_info = []
for num_df in range(len(df_all)):
#企业社会信用代码
id_code = str(df_all['信用代码'][num_df])
#企业名称
com_name = str(df_all['企业名称'][num_df])
#行次
# line = str(df_all['行次'][num_df])
dic_com = {
# 'line': line,
'social_code': id_code,
'com_name': com_name,
'website':''
}
company_id = find_id_by_name(id_code)
if company_id == "":
print(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
continue
WebSite = info_by_id(company_id,com_name)
dic_com['website'] = WebSite
log.info(f'---{num_df}-------{com_name}----------耗时{baseCore.getTimeCost(start,time.time())}')
list_all_info.append(dic_com)
baseCore.writerToExcel(list_all_info,filename)
@@ -9,7 +9,10 @@ import json
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from getQccId import find_id_by_name
+from base.BaseCore import BaseCore
+baseCore = BaseCore()
+cnx_ = baseCore.cnx
+cursor_ = baseCore.cursor
baseCore = BaseCore()
cnx = baseCore.cnx
@@ -310,7 +313,7 @@ def info_by_id(com_id,com_name):
    return aa_dict_list
if __name__ == '__main__':
-    taskType = '基本信息/企查查'
+    taskType = '基本信息/企查查/福布斯'
    headers = {
        'Host': 'xcx.qcc.com',
        'Connection': 'keep-alive',
@@ -325,75 +328,64 @@ if __name__ == '__main__':
    #从redis里拿数据
    while True:
-        start = time.time()
+        # TODO:需要隔两个小时左右抓包修改,token从数据库中获得
        token = baseCore.GetToken()
        list_weicha = []
        list_all_info = []
        name_list = []
        start_time = time.time()
        # 获取企业信息
-        query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state1=1 limit 1 "
-        #兴业银行
-        # query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
-        cursor.execute(query)
-        row = cursor.fetchone()
-        if row:
-            pass
-        else:
-            print('没有数据了,结束脚本')
-            break
-        com_name = row[6]
-        social_code = row[4]
-        code = row[7]
-        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        updateBeginSql = f"update Tfbs set state1=0,date2='{time_now}' where col3='{social_code}' "
-        # print(updateBeginSql)
-        cursor.execute(updateBeginSql)
-        cnx.commit()
-        company_id = find_id_by_name(start,token,social_code)
-        if company_id == False:
-            #表示token失效
-            time.sleep(10)
-            updateBeginSql = f"update Tfbs set state1=1,date2='{time_now}' where col3='{social_code}' "
-            # print(updateBeginSql)
-            cursor.execute(updateBeginSql)
-            cnx.commit()
+        social_code = baseCore.redicPullData('BaseInfoEnterpriseFbs:gnqy_social_code')
+        # social_code = '91110000710924945A'
+        if social_code is None:
+            time.sleep(20)
            continue
-        if company_id == "":
-            log.info(com_name + ":企业ID获取失败")
+        log.info(f'----当前企业{social_code}-----')
+        dic_info = baseCore.getInfomation(social_code)
+        #
+        count = dic_info[13]
+        com_name = dic_info[1]
+        social_code = dic_info[2]
+        # 企查查id
+        company_id = dic_info[12]
+        # 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
+        if company_id == None:
+            if social_code:
+                company_id = find_id_by_name(start_time, token, social_code)
+            else:
+                company_id = find_id_by_name(start_time, token, com_name)
+            # todo:写入数据库
+            updateSql = f"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
+            cursor_.execute(updateSql)
+            cnx_.commit()
+            post_data_list = info_by_id(company_id, com_name)
+        if company_id == "":
+            print(com_name + ":企业ID获取失败")
            list_weicha.append(com_name + ":企业ID获取失败")
-            #400表示企业更新失败
-            updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
-            # print(updateBeginSql)
-            cursor.execute(updateBeginSql)
-            cnx.commit()
            continue
        else:
-            post_data_list = info_by_id(company_id,social_code)
+            log.info(f'====={social_code}===={company_id}=====获取企业id成功=====')
+            try:
+                post_data_list = info_by_id(company_id, com_name)
+            except:
+                log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
+                baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gnqy_social_code', social_code)
+                continue
            for post_data in post_data_list:
                list_all_info.append(post_data)
                if post_data is None:
-                    log.info(com_name + ":企业信息获取失败")
+                    print(com_name + ":企业信息获取失败")
                    list_weicha.append(com_name + ":企业信息获取失败")
-                    # 400表示企业更新失败
-                    updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
-                    # print(updateBeginSql)
-                    cursor.execute(updateBeginSql)
-                    cnx.commit()
                    continue
                get_name = post_data['name']
                get_socialcode = post_data['socialCreditCode']
                name_compile = {
-                    'yuan_name':com_name,
-                    'get_name':get_name
+                    'yuan_name': com_name,
+                    'get_name': get_name
                }
                name_list.append(name_compile)
-                log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
+                log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
                try:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
                    kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
@@ -404,16 +396,17 @@ if __name__ == '__main__':
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
                    log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
-                #200表示成功
-                updateBeginSql = f"update Tfbs set state1=200,date2='{time_now}' where col3='{social_code}' "
-                # print(updateBeginSql)
-                cursor.execute(updateBeginSql)
-                cnx.commit()
-        nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
+                # break
+        # 信息采集完成后将该企业的采集次数更新
+        runType = 'BaseInfoRunCount'
+        count += 1
+        baseCore.updateRun(social_code, runType, count)
+        nowtime = baseCore.getNowTime(1).replace('-', '_')[:10]
        companyName = pd.DataFrame(name_list)
-        companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
+        companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx', index=False)
        false_com = pd.DataFrame(list_weicha)
-        false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
+        false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx', index=False)
+    baseCore.close()
......
import redis
# 连接到Redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# 列表名称
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# 获取列表中的所有元素
elements = r.lrange(list_name, 0, -1)
# 遍历列表中的元素,去除重复值
for element in elements:
    # lrem(count=0) 会删除该元素的所有出现,并返回删除的数量
    removed = r.lrem(list_name, 0, element)
    # 只要原列表中存在该元素,就重新放回一份,保证去重后只保留一个
    if removed > 0:
        r.rpush(list_name, element)
# 打印处理后的列表
print(r.lrange(list_name, 0, -1))
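# Note: the in-place LREM fix above keeps one copy of each code but appends the surviving
# copies to the tail, so the original queue order is lost. Where order matters, rebuilding
# the list through a temporary key is an alternative. This is only a sketch under the same
# connection settings; the ':dedup_tmp' scratch key name is illustrative, and the swap is
# not safe against producers pushing concurrently.
tmp_name = list_name + ':dedup_tmp'
seen = set()
# Copy the first occurrence of each element, in order, into the scratch list
for element in r.lrange(list_name, 0, -1):
    if element not in seen:
        seen.add(element)
        r.rpush(tmp_name, element)
# Replace the original list with the rebuilt one
r.delete(list_name)
if seen:
    r.rename(tmp_name, list_name)
print(r.lrange(list_name, 0, -1))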
@@ -300,7 +300,7 @@ if __name__ == '__main__':
        start_time = time.time()
        # 获取企业信息
        # social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
-        social_code = 'ZZSN22080900000025'
+        social_code = 'ZZSN230912210643024'
        if not social_code:
            time.sleep(20)
            continue
......
# import redis
#
#
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
#
# # 获取所有键
# keys = r.keys('*')
# # print(keys)
# for key in keys:
# f_key = key.decode()
# print(f_key)
# print("----------")
# res = r.exists(f_key)
# value = list(r.smembers(f_key))
# # 对列表进行排序
# value.sort()
# # 遍历排序后的列表
# list_data = []
# for member in value:
# member = member.decode()
# members = member.strip('[').strip(']').replace('\'','').strip().split(',')
# #获取每一个报告期
# for date in members:
# data = date.strip()
# # print(date.strip())
# list_data.append(data)
# # 放入redis
# for item in list_data:
# r.sadd(key, item)
#
# # 获取Set中的所有元素
# items = r.smembers(key)
# # print(items)
# print("======================================")
import datetime
timestamp = 1688054400 # 示例时间戳
date = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
print(date)
import redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
# 获取所有键
# keys = r.keys('*')
# # print(keys)
# for key in keys:
# f_key = key.decode()
# print(f_key)
# print("----------")
# r.srem(f_key,'[]')
# r.srem(f_key,'')
def check_date(com_code,info_date):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
res = r.sismember('com_caiwushuju_code::'+com_code, info_date) # 注意是 保存set的方式
if res:
return True
else:
return False
def add_date():
date_list = ['2023-06-30']
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
#遍历date_list 放入redis
for date in date_list:
res = r.sadd('com_caiwushuju_code::'+'123456',date)
# check_date('sh601398','2023-03-31')
add_date()
#com_caiwushuju_code::bj430418
# ['2018-12-31', '2018-09-30', '2018-06-30', '2018-03-31']
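# check_date / add_date above are the two halves of an "already collected this report
# period?" guard backed by one Redis set per company. A minimal sketch of the intended
# usage follows; the company code and report date are placeholders, and the sadd call
# mirrors what add_date does rather than calling it (add_date hard-codes its own key).
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
com_code = 'bj430418'        # placeholder company code
report_date = '2023-06-30'   # placeholder report period
if check_date(com_code, report_date):
    print(f'{com_code} {report_date} already collected, skip')
else:
    # ... fetch and store the financial data for this report period here ...
    r.sadd('com_caiwushuju_code::' + com_code, report_date)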
import redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
f_key = 'com_caiwushuju_code::bj851834'
r.set(f_key,3)
res = r.exists(f_key)
print(res)
# list = ['2023-03-31', '2022-12-31', '2022-09-30', '2022-06-30', '2022-03-31', '2021-12-31', '2021-09-30', '2021-06-30', '2021-03-31', '2020-12-31', '2020-09-30', '2020-06-30', '2020-03-31', '2019-12-31', '2019-09-30', '2019-06-30', '2019-03-31']
#
# for a in list:
# r.sadd(f_key,a)
# from base.BaseCore import BaseCore
# baseCore = BaseCore()
# from datetime import datetime,timedelta
# timeNow = baseCore.getNowTime(1)[:10]
# # list_date = []
# #
# # list_date.append(timeNow)
# # print(timeNow)
# # print(list_date)
#
# current_date = datetime.now()
# # 计算昨天的日期
# yesterday = current_date - timedelta(days=1)
# # 格式化昨天的日期
# report_date = yesterday.strftime('%Y-%m-%d')
# # list_date.append(report_date)
# year = int(current_date.strftime('%Y'))
# print(year)
# print(type(year))
\ No newline at end of file
# 核心工具包
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
#创建连接池
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\base\\client.conf')
client = Fdfs_client(tracker_conf)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
# self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
# charset='utf8mb4')
# self.__cursor_proxy = self.__cnx_proxy.cursor()
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
#11数据库
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self):
try:
self.cursor.close()
self.cnx.close()
except :
pass
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 时间戳 3:1690179526555 精确到秒
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
# 获取流水号
def getNextSeq(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
#字符串截取
def getSubStr(self,str,beginStr,endStr):
if beginStr=='':
pass
else:
begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
if endStr=='':
pass
else:
end=str.rfind(endStr)
if end==-1:
pass
else:
str = str[0:end+1]
return str
# 繁体字转简体字
def hant_2_hans(self,hant_str: str):
'''
Function: 将 hant_str 由繁体转化为简体
'''
return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字
def str_have_num(self,str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self,key):
item = self.r.lpop(key)
return item.decode() if item else None
# 获得脚本进程PID
def getPID(self):
PID = os.getpid()
return PID
# 获取本机IP
def getIP(self):
IP = socket.gethostbyname(socket.gethostname())
return IP
def mkPath(self,path):
folder = os.path.exists(path)
if not folder: # 判断是否存在文件夹如果不存在则创建为文件夹
os.makedirs(path) # makedirs 创建文件时如果路径不存在会创建这个路径
else:
pass
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
# 根据社会信用代码获取企业信息
def getInfomation(self, gpdm):
data = []
try:
sql = f"SELECT * FROM NQEnterprise WHERE gpdm = '{gpdm}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql,values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
#获取企查查token
def GetToken(self):
#获取企查查token
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
#获取天眼查token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
#检测语言
def detect_language(self, text):
# 使用langid.py判断文本的语言
result = langid.classify(text)
if result == '':
return 'cn'
if result[0] == '':
return 'cn'
return result[0]
#追加接入excel
def writerToExcel(self,detailList,filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
combined_data = existing_data.append(new_data, ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
# return combined_data
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self,key,item):
self.r.rpush(key, item)
#增加计数器的值并返回增加后的值
def incrSet(self,key):
# 增加计数器的值并返回增加后的值
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
#获取key剩余的过期时间
def getttl(self,key):
# 获取key的剩余过期时间
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# 判断key是否已过期
if ttl < 0:
# key已过期,将key的值重置为0
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
#上传至文件服务器,并解析pdf的内容和页数
def upLoadToServe(self,pdf_url,type_id,social_code):
headers = {}
retData = {'state':False,'type_id':type_id,'item_id':social_code,'group_name':'group1','path':'','full_path':'',
'category':'pdf','file_size':'','status':1,'create_by':'XueLingKun',
'create_time':'','page_size':'','content':''}
headers['User-Agent'] = self.getRandomUserAgent()
for i in range(0, 3):
try:
resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
page_size = 0
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
with fitz.open(stream=resp_content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
except:
time.sleep(3)
continue
if page_size < 1:
# pdf解析失败
print(f'======pdf解析失败=====')
return retData
else:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
retData['page_size'] = page_size
return retData
def secrchATT(self,item_id,year,type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,year,type_id)
id = selects[0]
return id
# 更新企业的CIK
def updateCIK(self,social_code,cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存企业CIK失败=====')
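# The collector scripts in this commit all follow the same lifecycle around BaseCore:
# build one instance, take a logger, pop work items from a Redis list, requeue on failure,
# and close the shared connections on exit. A minimal sketch of that pattern, assuming
# BaseCore is importable as base.BaseCore; the queue name is one of the lists filled by
# NQEnterprise() above, and the processing step is left as a placeholder.
import time
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
QUEUE = 'NQEnterprise:nq_finance'

try:
    while True:
        gpdm = baseCore.redicPullData(QUEUE)
        if gpdm is None:
            break  # queue drained; the real workers sleep and retry instead
        start = time.time()
        try:
            data = baseCore.getInfomation(gpdm)  # row from the NQEnterprise table
            # ... collect and push the result downstream here ...
            log.info(f'{gpdm} done, took {baseCore.getTimeCost(start, time.time())}')
        except Exception as e:
            log.info(f'{gpdm} failed ({e}), putting back into redis')
            baseCore.rePutIntoR(QUEUE, gpdm)
finally:
    baseCore.close()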
@@ -327,13 +327,14 @@ if __name__ == '__main__':
    #从redis里拿数据
    while True:
        # TODO:需要隔两个小时左右抓包修改,token从数据库中获得
-        token = '027ea02da6d901a724ecca47930379b4'
+        token = 'b4eb43143abdcf395f1335f322ca29e5'
        list_weicha = []
        list_all_info = []
        name_list = []
        start_time = time.time()
        # 获取企业信息
-        com_code = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
+        # com_code = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
+        com_code = '873349'
        if '.NQ' in com_code:
            com_code1 = com_code
        else:
@@ -358,7 +359,7 @@ if __name__ == '__main__':
                post_data_list = info_by_id(company_id, '',com_code1)
            except:
                log.info(f'====={com_code}=====获取基本信息失败,重新放入redis=====')
-                baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_social_code', com_code)
+                baseCore.rePutIntoR('EnterpriseIpo:nq_gpdm', com_code)
                continue
            if post_data_list:
                pass
......
...@@ -7,7 +7,7 @@ import requests ...@@ -7,7 +7,7 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from selenium import webdriver from selenium import webdriver
import urllib3 import urllib3
from base.BaseCore import BaseCore from BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# from gpdm import Gpdm # from gpdm import Gpdm
baseCore = BaseCore() baseCore = BaseCore()
...@@ -20,15 +20,17 @@ log = baseCore.getLogger() ...@@ -20,15 +20,17 @@ log = baseCore.getLogger()
error_list = [] error_list = []
list_all_info = [] list_all_info = []
# 需要提供股票代码、企业信用代码 # 需要提供股票代码、企业信用代码
flag = 0
while True: while True:
#从表中读取企业 #从表中读取企业
flag = 0 com_code = baseCore.redicPullData('NQEnterprise:nq_Ipo')
com_code1 = baseCore.redicPullData('EnterpriseIpo:nq_gpdm') # com_code = baseCore.redicPullData('NQEnterprise:nq_Ipo_test')
if com_code1 is None: if com_code is None:
time.sleep(20) if flag==0:
if flag==1: time.sleep(20)
log.info('已没有数据----------等待')
continue continue
elif flag==0: elif flag==1:
# 通过接口将数据保存进数据库 # 通过接口将数据保存进数据库
for num in range(0, len(list_all_info), 100): for num in range(0, len(list_all_info), 100):
...@@ -42,15 +44,18 @@ while True: ...@@ -42,15 +44,18 @@ while True:
print(e) print(e)
print("{}:到:{}".format(num, num + 100)) print("{}:到:{}".format(num, num + 100))
print(response.text) print(response.text)
log.info('-----------数据发送接口完毕----------')
flag = 0
continue continue
break
com_code = '838616' #从数据库中查询到其他信息
short_name = '' log.info(f'========正在采集{com_code}===========')
social_code = '' data = baseCore.getInfomation(com_code)
social_code = data[1]
short_name = data[3]
start = time.time() start = time.time()
log.info(f'======开始采集{com_code}======') log.info(f'======开始采集{com_code}======')
url = f'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/833658.html' url = f'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/{com_code}.html'
headers = { headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br', 'Accept-Encoding': 'gzip, deflate, br',
...@@ -95,12 +100,9 @@ while True: ...@@ -95,12 +100,9 @@ while True:
"eastIndustry": industry, "eastIndustry": industry,
"csrcIndustry": '' "csrcIndustry": ''
} }
print(dic_cwsj) # print(dic_cwsj)
list_all_info.append(dic_cwsj) list_all_info.append(dic_cwsj)
log.info(f'======{com_code}====采集成功=====') log.info(f'======{com_code}====采集成功=====')
flag = 1 flag = 1
break
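When the queue runs dry (flag == 1), the records gathered in list_all_info are pushed to the save interface in slices of 100. A minimal sketch of just that batching step; the URL and payload shape are placeholders, only the slicing mirrors the loop above.
import json
import requests

def send_in_batches(list_all_info, url='http://example.invalid/sync/ipo', size=100):
    # Hedged sketch of the 100-record flush used above; the endpoint is not the real one.
    for num in range(0, len(list_all_info), size):
        batch = list_all_info[num:num + size]
        try:
            response = requests.post(url, data=json.dumps(batch), timeout=30)
            print('{}:到:{}'.format(num, num + size))
            print(response.text)
        except Exception as e:
            print(e)                   # keep going with the next batch, as the loop above does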
...@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup ...@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
import datetime import datetime
from selenium import webdriver from selenium import webdriver
from base.BaseCore import BaseCore from BaseCore import BaseCore
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
...@@ -84,7 +84,7 @@ def getdetail(reportInfodata,name_map,listinfo,url_name): ...@@ -84,7 +84,7 @@ def getdetail(reportInfodata,name_map,listinfo,url_name):
listinfo.append(dic_info) listinfo.append(dic_info)
return listinfo return listinfo
def getinfo(com_code): def getinfo(com_code,social_code):
dic_info = {} dic_info = {}
for nnn in range(0, 3): for nnn in range(0, 3):
try: try:
...@@ -127,10 +127,11 @@ def getinfo(com_code): ...@@ -127,10 +127,11 @@ def getinfo(com_code):
break break
except: except:
time.sleep(1) time.sleep(1)
log.info(f'======正在采集:{report_date}=======') log.info(f'======正在采集:{com_code}---{report_date}=======')
#利润表 #利润表
list_Lrb = getdetail(reportLrbdata,lrb_name_map,listLrb,lrb_name) list_Lrb = getdetail(reportLrbdata,lrb_name_map,listLrb,lrb_name)
print(list_Lrb) log.info(f'利润表数据:{len(list_Lrb)}个')
# print(list_Lrb)
#资产负债表 #资产负债表
reportZcfzbdata = b_infoData[i] reportZcfzbdata = b_infoData[i]
list_Zcfzb = getdetail(reportZcfzbdata,zcfzb_name_map,listZcfzb,zcfzb_name) list_Zcfzb = getdetail(reportZcfzbdata,zcfzb_name_map,listZcfzb,zcfzb_name)
...@@ -148,27 +149,30 @@ def getinfo(com_code): ...@@ -148,27 +149,30 @@ def getinfo(com_code):
"cash": list_Xjllb, "cash": list_Xjllb,
"ynFirst": ynFirst, "ynFirst": ynFirst,
} }
print(dic_info) # print(dic_info)
#一个报告期结束 #一个报告期结束
for nnn in range(0, 3):
try:
add_date(com_code, report_date)
break
except:
time.sleep(1)
log.info(f'----{com_code}--{report_date}----结束') log.info(f'----{com_code}--{report_date}----结束')
# if dic_info: if dic_info:
# # 调凯歌接口存储数据 # 调凯歌接口存储数据
# data = json.dumps(dic_info) data = json.dumps(dic_info)
# # print(data) # print(data)
# url_baocun = 'http://114.115.236.206:8088/sync/finance/df' url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
# for nnn in range(0, 3): for nnn in range(0, 3):
# try: try:
# res_baocun = requests.post(url_baocun, data=data) res_baocun = requests.post(url_baocun, data=data)
# break break
# except: except:
# time.sleep(1) time.sleep(1)
# print(res_baocun.text) print(res_baocun.text)
log.info('------------数据发送接口完毕------------')
for nnn in range(0, 3):
try:
add_date(com_code, report_date)
break
except:
time.sleep(1)
else:
log.error(f'---{com_code}--{report_date}--')
return dic_info return dic_info
if __name__ == '__main__': if __name__ == '__main__':
...@@ -178,9 +182,27 @@ if __name__ == '__main__': ...@@ -178,9 +182,27 @@ if __name__ == '__main__':
browser = webdriver.Chrome(chromedriver) browser = webdriver.Chrome(chromedriver)
except Exception as e: except Exception as e:
print(e) print(e)
headers = {
'authority': 'stock.xueqiu.com',
social_code = '' 'method': 'GET',
'path': '/v5/stock/finance/cn/income.json?symbol=NQ873286&type=all&is_detail=true&count=5&timestamp=1694414063178',
'scheme': 'https',
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'device_id=84ced64554d8060750b1528dc22a3696; s=bt110kz0n8; cookiesu=661693188384462; u=661693188384462; Hm_lvt_1db88642e346389874251b5a1eded6e3=1693188388; xq_a_token=29bdb37dee2432c294425cc9e8f45710a62643a5; xqat=29bdb37dee2432c294425cc9e8f45710a62643a5; xq_r_token=3a35db27fcf5471898becda7aa5dab6afeafe471; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTY5NjgxMTc5NCwiY3RtIjoxNjk0NDEzNTQ2ODU4LCJjaWQiOiJkOWQwbjRBWnVwIn0.Xxu329nQq4bMtwKFJWlScnUUSWrky4T5SWkMum46c2G8la2z4g0d4nyvsO08WP-7moMffId6P3bGWuELULkbv6EHvIZgqge9-fAD4-hmLOjeRh96NsoGfyTAQK7tbnt9LhKz1fDg6SUi8loMqYgM7l-4g-ZM4B6zrZ5hKWdQJFLy0-V8Wzx7HTFYZSX3FNSsbgGqHlW4vykIpsRaNeOOX1M6LYdt6BhbAi1Iv4TflB08LIdu6F1n4dTRbmPq1KCndb2LsLR2HrJZmqmHJB9WMzwlVcIGdz778_CutNrwuWgJbWtb-s3dSESzO0WWw1uIIGZUvRl1D0KSl0P_GQLw9w; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1694414056',
'Origin': 'https://xueqiu.com',
'Pragma': 'no-cache',
'Referer': 'https://xueqiu.com/snowman/S/NQ873286/detail',
'Sec-Ch-Ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
#中英文名称映射 #中英文名称映射
lrb_name_map = { lrb_name_map = {
'营业总收入':'total_revenue', '营业总收入':'total_revenue',
...@@ -347,32 +369,39 @@ if __name__ == '__main__': ...@@ -347,32 +369,39 @@ if __name__ == '__main__':
'加:期初现金及现金等价物余额':'final_balance_of_cce', '加:期初现金及现金等价物余额':'final_balance_of_cce',
'期末现金及现金等价物余额':'final_balance_of_cce' '期末现金及现金等价物余额':'final_balance_of_cce'
} }
table_type = ['income','balance'] table_type = ['income','balance']
flag = 0
while True:
# com_code = baseCore.redicPullData('NQEnterprise:nq_finance')
com_code = baseCore.redicPullData('NQEnterprise:nq_finance_test')
if com_code is None:
if flag==0:
log.info('已没有数据----------等待')
time.sleep(20)
continue
elif flag==1:
log.info('=============调用对比指标接口======')
time.sleep(5400)
url_ = 'http://114.115.236.206:8088/sync/calculateIndex?type=1'
for nnn in range(0, 3):
try:
res_ = requests.get(url_)
break
except:
time.sleep(1)
print(res_.text)
log.info('-----------数据触发对比指标接口完毕----------')
flag = 0
continue
log.info(f'========正在采集{com_code}===========')
data = baseCore.getInfomation(com_code)
social_code = data[1]
short_name = data[3]
start = time.time()
com_code = 'NQ' + com_code
dic_info = getinfo(com_code,social_code)
flag =1
com_code = 'NQ' + '873286'
headers = {
'authority': 'stock.xueqiu.com',
'method': 'GET',
'path': '/v5/stock/finance/cn/income.json?symbol=NQ873286&type=all&is_detail=true&count=5&timestamp=1694414063178',
'scheme': 'https',
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'device_id=84ced64554d8060750b1528dc22a3696; s=bt110kz0n8; cookiesu=661693188384462; u=661693188384462; Hm_lvt_1db88642e346389874251b5a1eded6e3=1693188388; xq_a_token=29bdb37dee2432c294425cc9e8f45710a62643a5; xqat=29bdb37dee2432c294425cc9e8f45710a62643a5; xq_r_token=3a35db27fcf5471898becda7aa5dab6afeafe471; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTY5NjgxMTc5NCwiY3RtIjoxNjk0NDEzNTQ2ODU4LCJjaWQiOiJkOWQwbjRBWnVwIn0.Xxu329nQq4bMtwKFJWlScnUUSWrky4T5SWkMum46c2G8la2z4g0d4nyvsO08WP-7moMffId6P3bGWuELULkbv6EHvIZgqge9-fAD4-hmLOjeRh96NsoGfyTAQK7tbnt9LhKz1fDg6SUi8loMqYgM7l-4g-ZM4B6zrZ5hKWdQJFLy0-V8Wzx7HTFYZSX3FNSsbgGqHlW4vykIpsRaNeOOX1M6LYdt6BhbAi1Iv4TflB08LIdu6F1n4dTRbmPq1KCndb2LsLR2HrJZmqmHJB9WMzwlVcIGdz778_CutNrwuWgJbWtb-s3dSESzO0WWw1uIIGZUvRl1D0KSl0P_GQLw9w; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1694414056',
'Origin': 'https://xueqiu.com',
'Pragma': 'no-cache',
'Referer': 'https://xueqiu.com/snowman/S/NQ873286/detail',
'Sec-Ch-Ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
dic_info = getinfo(com_code)
......
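The finance collector above repeats the same three-attempt pattern (for nnn in range(0, 3): try ... break except: time.sleep(1)) around browser navigation, the save interface and add_date. A small helper capturing that pattern, shown only as a sketch:
import time

def try_three_times(func, *args, **kwargs):
    # Hedged sketch of the retry pattern used above: up to three attempts, one second
    # between them, and a silent None if every attempt fails (matching the original loops).
    for nnn in range(0, 3):
        try:
            return func(*args, **kwargs)
        except Exception:
            time.sleep(1)
    return None

# usage sketch: try_three_times(requests.post, url_baocun, data=data)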
import json
import time
import requests
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
from urllib import parse
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'qgqp_b_id=92f470109c2462c6c6aa5115d15f7b35; emshistory=%5B%22sz007sz%22%5D; qRecords=%5B%7B%22name%22%3A%22%u5929%u98CE%u8BC1%u5238%22%2C%22code%22%3A%22SH601162%22%7D%5D; HAList=ty-1-601766-%u4E2D%u56FD%u4E2D%u8F66%2Cty-116-03690-%u7F8E%u56E2-W%2Cty-0-002828-%u8D1D%u80AF%u80FD%u6E90%2Cty-1-601162-%u5929%u98CE%u8BC1%u5238%2Cty-0-000001-%u5E73%u5B89%u94F6%u884C%2Cty-0-002070-%u4F17%u548C%u9000%2Cty-1-600723-%u9996%u5546%u80A1%u4EFD%2Cty-0-300106-%u897F%u90E8%u7267%u4E1A%2Cty-116-00992-%u8054%u60F3%u96C6%u56E2%2Cty-0-300362-%u5929%u7FD4%u9000; st_si=06778540617554; st_pvi=44810095342512; st_sp=2023-07-18%2013%3A55%3A09; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=1; st_psi=20230901152305423-113300304201-4533354410; st_asi=delete',
'Host': 'data.eastmoney.com',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
file_name = '研报--产业链.xlsx'
for page in range(100,101):
log.info(f'----开始采集第{page}页-------')
param = {"uid":"",
"keyword":"产业链",
"type":["researchReport"],
"client":"web",
"clientVersion":"curr",
"clientType":"web",
"param":{"researchReport":{"client":"web","pageSize":10,"pageIndex":page}}
}
param_url = parse.quote(str(param).replace(" ", ""))
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
res = requests.get(url).text[1:-1]
res_json = json.loads(res)
list_all = res_json['result']['researchReport']
# print(list_all)
if list_all:
pass
else:
continue
num = 1
for one_news in list_all:
log.info(f'---------开始采集第{num}条---------')
dataList = []
com_name = one_news['stockName']
title = str(one_news['title']).replace('<em>','').replace('</em>','')
date = one_news['date'][:10]
code = one_news['code']
href = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={code}'
# print(date,href)
#newsContent
req = requests.get(href,headers=headers,verify=False,timeout=30)
soup = BeautifulSoup(req.content,'html.parser')
content = soup.find('div',class_='newsContent')
try:
pdf_url = soup.find('div',class_='report-infos').find_all('span')[4].find('a')['href']
except:
log.info(f'-----{href}-----')
continue
#.find_all('span')[-1].find('a')['href']
log.info(pdf_url)
dic_info = {
'公司名称':com_name,
'标题':title,
'正文':content.text.strip(),
'附件链接':pdf_url,
'发布时间':date,
'contentwithTag':content.prettify()
}
# print(dic_info)
dataList.append(dic_info)
baseCore.writerToExcel(dataList,file_name)
num+=1
# break
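The search request above serializes a param dict, URL-quotes it, strips one character from each end of the jsonp response and parses the remaining JSON. The same steps as a standalone function, shown only as a sketch; the request timeout is an added assumption, the endpoint and payload mirror the loop above.
import json
import time
from urllib import parse

import requests

def search_reports(keyword, page, page_size=10):
    # Hedged sketch of the eastmoney research-report search call used above.
    param = {"uid": "",
             "keyword": keyword,
             "type": ["researchReport"],
             "client": "web",
             "clientVersion": "curr",
             "clientType": "web",
             "param": {"researchReport": {"client": "web", "pageSize": page_size, "pageIndex": page}}
             }
    param_url = parse.quote(str(param).replace(" ", ""))
    t = int(time.time() * 1000)
    url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
    res = requests.get(url, timeout=30).text[1:-1]       # drop the jsonp wrapper
    return json.loads(res)['result']['researchReport']

# usage sketch: for one_news in search_reports('产业链', 100): print(one_news['title'])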
# connect timeout in seconds
# default value is 30s
connect_timeout=300
# network timeout in seconds
# default value is 30s
network_timeout=600
# the base path to store log files
#base_path=/home/tarena/django-project/cc_shop1/cc_shop1/logs
# tracker_server can occur more than once, and tracker_server format is
# "host:port", host can be hostname or ip address
tracker_server=114.115.215.96:22122
#standard log level as syslog, case insensitive, value list:
### emerg for emergency
### alert
### crit for critical
### error
### warn for warning
### notice
### info
### debug
log_level=info
# if use connection pool
# default value is false
# since V4.05
use_connection_pool = false
# connections whose the idle time exceeds this time will be closed
# unit: second
# default value is 3600
# since V4.05
connection_pool_max_idle_time = 3600
# if load FastDFS parameters from tracker server
# since V4.05
# default value is false
load_fdfs_parameters_from_tracker=false
# if use storage ID instead of IP address
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# default value is false
# since V4.05
use_storage_id = false
# specify storage ids filename, can use relative or absolute path
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# since V4.05
storage_ids_filename = storage_ids.conf
#HTTP settings
http.tracker_server_port=80
#use "#include" directive to include other HTTP settings
##include http.conf
\ No newline at end of file
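A minimal sketch of how this client.conf is consumed by the client module that follows: get_tracker_conf reads connect_timeout and tracker_server and builds the dict handed to Fdfs_client. The 'client.conf' path is an assumption; the address and timeout are the ones in the file above.
from fdfs_client.client import Fdfs_client, get_tracker_conf

tracker = get_tracker_conf('client.conf')   # assumed path to the file above
# tracker is now roughly:
# {'host_tuple': ('114.115.215.96',), 'port': 22122, 'timeout': 300, 'name': 'Tracker Pool'}
client = Fdfs_client(tracker)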
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: client.py
'''
Client module for Fastdfs 3.08
author: scott yuan scottzer8@gmail.com
date: 2012-06-21
'''
import os
import sys
from fdfs_client.utils import *
from fdfs_client.tracker_client import *
from fdfs_client.storage_client import *
from fdfs_client.exceptions import *
def get_tracker_conf(conf_path='../../client.conf'):
cf = Fdfs_ConfigParser()
tracker = {}
try:
cf.read(conf_path)
timeout = cf.getint('__config__', 'connect_timeout')
tracker_list = cf.get('__config__', 'tracker_server')
if isinstance(tracker_list, str):
tracker_list = [tracker_list]
tracker_ip_list = []
for tr in tracker_list:
tracker_ip, tracker_port = tr.split(':')
tracker_ip_list.append(tracker_ip)
tracker['host_tuple'] = tuple(tracker_ip_list)
tracker['port'] = int(tracker_port)
tracker['timeout'] = timeout
tracker['name'] = 'Tracker Pool'
except:
raise
return tracker
class Fdfs_client(object):
'''
Class Fdfs_client implements the Fastdfs client protocol ver 3.08.
It is used to upload, download and delete files to or from the fdfs server, and
uses a connection pool to manage connections to the server.
'''
def __init__(self, trackers, poolclass=ConnectionPool):
self.trackers = trackers
self.tracker_pool = poolclass(**self.trackers)
self.timeout = self.trackers['timeout']
return None
def __del__(self):
try:
self.tracker_pool.destroy()
self.tracker_pool = None
except:
pass
def upload_by_filename(self, filename, meta_dict=None):
'''
Upload a file to Storage server.
arguments:
@filename: string, name of file that will be uploaded
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} meta_dict can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : local_file_name,
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_filename(tc, store_serv, filename, meta_dict)
def upload_by_file(self, filename, meta_dict=None):
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_file(tc, store_serv, filename, meta_dict)
def upload_by_buffer(self, filebuffer, file_ext_name=None, meta_dict=None):
'''
Upload a buffer to Storage server.
arguments:
@filebuffer: string, buffer
@file_ext_name: string, file extend name
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_buffer(tc, store_serv, filebuffer, file_ext_name, meta_dict)
def upload_slave_by_filename(self, filename, remote_file_id, prefix_name, meta_dict=None):
'''
Upload slave file to Storage server.
arguments:
@filename: string, local file name
@remote_file_id: string, remote file id
@prefix_name: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading slave)')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
if not prefix_name:
raise DataError('[-] Error: prefix_name can not be null.')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_with_group(group_name)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
try:
ret_dict = store.storage_upload_slave_by_filename(tc, store_serv, filename, prefix_name, remote_filename,
meta_dict=None)
except:
raise
ret_dict['Status'] = 'Upload slave file successed.'
return ret_dict
def upload_slave_by_file(self, filename, remote_file_id, prefix_name, meta_dict=None):
'''
Upload slave file to Storage server.
arguments:
@filename: string, local file name
@remote_file_id: string, remote file id
@prefix_name: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading slave)')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
if not prefix_name:
raise DataError('[-] Error: prefix_name can not be null.')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_with_group(group_name)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
try:
ret_dict = store.storage_upload_slave_by_file(tc, store_serv, filename, prefix_name, remote_filename,
meta_dict=None)
except:
raise
ret_dict['Status'] = 'Upload slave file successed.'
return ret_dict
def upload_slave_by_buffer(self, filebuffer, remote_file_id, meta_dict=None, file_ext_name=None):
'''
Upload slave file by buffer
arguments:
@filebuffer: string
@remote_file_id: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_slave_by_buffer(tc, store_serv, filebuffer, remote_filename, meta_dict,
file_ext_name)
def upload_appender_by_filename(self, local_filename, meta_dict=None):
'''
Upload an appender file by filename.
arguments:
@local_filename: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} Notice: it can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(uploading appender)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_filename(tc, store_serv, local_filename, meta_dict)
def upload_appender_by_file(self, local_filename, meta_dict=None):
'''
Upload an appender file by file.
arguments:
@local_filename: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} Notice: it can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(uploading appender)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_file(tc, store_serv, local_filename, meta_dict)
def upload_appender_by_buffer(self, filebuffer, file_ext_name=None, meta_dict=None):
'''
Upload a buffer to Storage server.
arguments:
@filebuffer: string
@file_ext_name: string, can be null
@meta_dict: dictionary, can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_buffer(tc, store_serv, filebuffer, meta_dict, file_ext_name)
def delete_file(self, remote_file_id):
'''
Delete a file from Storage server.
arguments:
@remote_file_id: string, file_id of file that is on storage server
@return tuple ('Delete file successed.', remote_file_id, storage_ip)
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in delete file)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_delete_file(tc, store_serv, remote_filename)
def download_to_file(self, local_filename, remote_file_id, offset=0, down_bytes=0):
'''
Download a file from Storage server.
arguments:
@local_filename: string, local name of file
@remote_file_id: string, file_id of file that is on storage server
@offset: long
@downbytes: long
@return dict {
'Remote file_id' : remote_file_id,
'Content' : local_filename,
'Download size' : downloaded_size,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in download file)')
group_name, remote_filename = tmp
file_offset = int(offset)
download_bytes = int(down_bytes)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_fetch(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_download_to_file(tc, store_serv, local_filename, file_offset, download_bytes,
remote_filename)
def download_to_buffer(self, remote_file_id, offset=0, down_bytes=0):
'''
Download a file from Storage server and store in buffer.
arguments:
@remote_file_id: string, file_id of file that is on storage server
@offset: long
@down_bytes: long
@return dict {
'Remote file_id' : remote_file_id,
'Content' : file_buffer,
'Download size' : downloaded_size,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in download file)')
group_name, remote_filename = tmp
file_offset = int(offset)
download_bytes = int(down_bytes)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_fetch(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
file_buffer = None
return store.storage_download_to_buffer(tc, store_serv, file_buffer, file_offset, download_bytes,
remote_filename)
def list_one_group(self, group_name):
'''
List one group information.
arguments:
@group_name: string, group name will be list
@return Group_info, instance
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_one_group(group_name)
def list_servers(self, group_name, storage_ip=None):
'''
List all storage servers information in a group
arguments:
@group_name: string
@return dictionary {
'Group name' : group_name,
'Servers' : server list,
}
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_servers(group_name, storage_ip)
def list_all_groups(self):
'''
List all group information.
@return dictionary {
'Groups count' : group_count,
'Groups' : list of groups
}
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_all_groups()
def get_meta_data(self, remote_file_id):
'''
Get meta data of remote file.
arguments:
@remote_fileid: string, remote file id
@return dictionary, meta data
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in get meta data)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_get_metadata(tc, store_serv, remote_filename)
def set_meta_data(self, remote_file_id, meta_dict, op_flag=STORAGE_SET_METADATA_FLAG_OVERWRITE):
'''
Set meta data of remote file.
arguments:
@remote_file_id: string
@meta_dict: dictionary
@op_flag: char, 'O' for overwrite, 'M' for merge
@return dictionary {
'Status' : status,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in set meta data)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
try:
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
status = store.storage_set_metadata(tc, store_serv, remote_filename, meta_dict)
except (ConnectionError, ResponseError, DataError):
raise
# if status == 2:
# raise DataError('[-] Error: remote file %s is not exist.' % remote_file_id)
if status != 0:
raise DataError('[-] Error: %d, %s' % (status, os.strerror(status)))
ret_dict = {}
ret_dict['Status'] = 'Set meta data success.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def append_by_filename(self, local_filename, remote_fileid):
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(append)')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_filename(tc, store_serv, local_filename, appended_filename)
def append_by_file(self, local_filename, remote_fileid):
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(append)')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_file(tc, store_serv, local_filename, appended_filename)
def append_by_buffer(self, file_buffer, remote_fileid):
if not file_buffer:
raise DataError('[-] Error: file_buffer can not be null.')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_buffer(tc, store_serv, file_buffer, appended_filename)
def truncate_file(self, truncated_filesize, appender_fileid):
'''
Truncate file in Storage server.
arguments:
@truncated_filesize: long
@appender_fileid: remote_fileid
@return: dictionary {
'Status' : 'Truncate successed.',
'Storage IP' : storage_ip
}
'''
trunc_filesize = int(truncated_filesize)
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: appender_fileid is invalid.(truncate)')
group_name, appender_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_truncate_file(tc, store_serv, trunc_filesize, appender_filename)
def modify_by_filename(self, filename, appender_fileid, offset=0):
'''
Modify a file in Storage server by file.
arguments:
@filename: string, local file name
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(modify)')
filesize = os.stat(filename).st_size
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_filename(tc, store_serv, filename, file_offset, filesize, appender_filename)
def modify_by_file(self, filename, appender_fileid, offset=0):
'''
Modify a file in Storage server by file.
arguments:
@filename: string, local file name
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(modify)')
filesize = os.stat(filename).st_size
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_file(tc, store_serv, filename, file_offset, filesize, appender_filename)
def modify_by_buffer(self, filebuffer, appender_fileid, offset=0):
'''
Modify a file in Storage server by buffer.
arguments:
@filebuffer: string, file buffer
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
if not filebuffer:
raise DataError('[-] Error: filebuffer can not be null.(modify)')
filesize = len(filebuffer)
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_buffer(tc, store_serv, filebuffer, file_offset, filesize, appender_filename)
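A short, hedged usage sketch of the client defined above, chaining the documented methods upload_by_buffer, download_to_buffer and delete_file; the tracker settings come from the sample client.conf earlier in this commit and the uploaded bytes are a placeholder.
from fdfs_client.client import Fdfs_client, get_tracker_conf

tracker = get_tracker_conf('client.conf')            # the sample configuration above
client = Fdfs_client(tracker)

ret = client.upload_by_buffer(b'hello fastdfs', file_ext_name='txt')
remote_file_id = ret['Remote file_id']               # note: returned as bytes in this version

downloaded = client.download_to_buffer(remote_file_id)
print(downloaded['Download size'])

client.delete_file(remote_file_id)                   # returns ('Delete file successed.', file_id, storage_ip)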
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: exceptions.py
'''Core exceptions raised by fdfs client'''
class FDFSError(Exception):
pass
class ConnectionError(FDFSError):
pass
class ResponseError(FDFSError):
pass
class InvaildResponse(FDFSError):
pass
class DataError(FDFSError):
pass
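Every exception above derives from FDFSError, so callers of the client can catch the base class or the individual subclasses. A minimal sketch:
from fdfs_client.exceptions import FDFSError, ConnectionError, DataError

def safe_upload(client, filename):
    # Hedged sketch: DataError covers bad local files or invalid remote ids,
    # ConnectionError covers socket trouble, FDFSError catches anything else from the client.
    try:
        return client.upload_by_filename(filename)
    except DataError as e:
        print('data error:', e)
    except ConnectionError as e:
        print('connection error:', e)
    except FDFSError as e:
        print('fdfs error:', e)
    return None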
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_protol.py
import struct
import socket
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# define FDFS protol constans
TRACKER_PROTO_CMD_STORAGE_JOIN = 81
FDFS_PROTO_CMD_QUIT = 82
TRACKER_PROTO_CMD_STORAGE_BEAT = 83 # storage heart beat
TRACKER_PROTO_CMD_STORAGE_REPORT_DISK_USAGE = 84 # report disk usage
TRACKER_PROTO_CMD_STORAGE_REPLICA_CHG = 85 # repl new storage servers
TRACKER_PROTO_CMD_STORAGE_SYNC_SRC_REQ = 86 # src storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_REQ = 87 # dest storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_NOTIFY = 88 # sync done notify
TRACKER_PROTO_CMD_STORAGE_SYNC_REPORT = 89 # report src last synced time as dest server
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_QUERY = 79 # dest storage query sync src storage server
TRACKER_PROTO_CMD_STORAGE_REPORT_IP_CHANGED = 78 # storage server report it's ip changed
TRACKER_PROTO_CMD_STORAGE_CHANGELOG_REQ = 77 # storage server request storage server's changelog
TRACKER_PROTO_CMD_STORAGE_REPORT_STATUS = 76 # report specified storage server status
TRACKER_PROTO_CMD_STORAGE_PARAMETER_REQ = 75 # storage server request parameters
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FREE = 74 # storage report trunk free space
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FID = 73 # storage report current trunk file id
TRACKER_PROTO_CMD_STORAGE_FETCH_TRUNK_FID = 72 # storage get current trunk file id
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START = 61 # start of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END = 62 # end of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE = 63 # tracker get a system data file
TRACKER_PROTO_CMD_TRACKER_GET_STATUS = 64 # tracker get status of other tracker
TRACKER_PROTO_CMD_TRACKER_PING_LEADER = 65 # tracker ping leader
TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER = 66 # notify next leader to other trackers
TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER = 67 # commit next leader to other trackers
TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP = 90
TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS = 91
TRACKER_PROTO_CMD_SERVER_LIST_STORAGE = 92
TRACKER_PROTO_CMD_SERVER_DELETE_STORAGE = 93
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE = 101
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE = 102
TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE = 103
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE = 104
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ALL = 105
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ALL = 106
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ALL = 107
TRACKER_PROTO_CMD_RESP = 100
FDFS_PROTO_CMD_ACTIVE_TEST = 111 # active test, tracker and storage both support since V1.28
STORAGE_PROTO_CMD_REPORT_CLIENT_IP = 9 # ip as tracker client
STORAGE_PROTO_CMD_UPLOAD_FILE = 11
STORAGE_PROTO_CMD_DELETE_FILE = 12
STORAGE_PROTO_CMD_SET_METADATA = 13
STORAGE_PROTO_CMD_DOWNLOAD_FILE = 14
STORAGE_PROTO_CMD_GET_METADATA = 15
STORAGE_PROTO_CMD_SYNC_CREATE_FILE = 16
STORAGE_PROTO_CMD_SYNC_DELETE_FILE = 17
STORAGE_PROTO_CMD_SYNC_UPDATE_FILE = 18
STORAGE_PROTO_CMD_SYNC_CREATE_LINK = 19
STORAGE_PROTO_CMD_CREATE_LINK = 20
STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE = 21
STORAGE_PROTO_CMD_QUERY_FILE_INFO = 22
STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE = 23 # create appender file
STORAGE_PROTO_CMD_APPEND_FILE = 24 # append file
STORAGE_PROTO_CMD_SYNC_APPEND_FILE = 25
STORAGE_PROTO_CMD_FETCH_ONE_PATH_BINLOG = 26 # fetch binlog of one store path
STORAGE_PROTO_CMD_RESP = TRACKER_PROTO_CMD_RESP
STORAGE_PROTO_CMD_UPLOAD_MASTER_FILE = STORAGE_PROTO_CMD_UPLOAD_FILE
STORAGE_PROTO_CMD_TRUNK_ALLOC_SPACE = 27 # since V3.00
STORAGE_PROTO_CMD_TRUNK_ALLOC_CONFIRM = 28 # since V3.00
STORAGE_PROTO_CMD_TRUNK_FREE_SPACE = 29 # since V3.00
STORAGE_PROTO_CMD_TRUNK_SYNC_BINLOG = 30 # since V3.00
STORAGE_PROTO_CMD_TRUNK_GET_BINLOG_SIZE = 31 # since V3.07
STORAGE_PROTO_CMD_TRUNK_DELETE_BINLOG_MARKS = 32 # since V3.07
STORAGE_PROTO_CMD_TRUNK_TRUNCATE_BINLOG_FILE = 33 # since V3.07
STORAGE_PROTO_CMD_MODIFY_FILE = 34 # since V3.08
STORAGE_PROTO_CMD_SYNC_MODIFY_FILE = 35 # since V3.08
STORAGE_PROTO_CMD_TRUNCATE_FILE = 36 # since V3.08
STORAGE_PROTO_CMD_SYNC_TRUNCATE_FILE = 37 # since V3.08
# for overwrite all old metadata
STORAGE_SET_METADATA_FLAG_OVERWRITE = 'O'
STORAGE_SET_METADATA_FLAG_OVERWRITE_STR = "O"
# for replace, insert when the meta item not exist, otherwise update it
STORAGE_SET_METADATA_FLAG_MERGE = 'M'
STORAGE_SET_METADATA_FLAG_MERGE_STR = "M"
FDFS_RECORD_SEPERATOR = '\x01'
FDFS_FIELD_SEPERATOR = '\x02'
# common constants
FDFS_GROUP_NAME_MAX_LEN = 16
IP_ADDRESS_SIZE = 16
FDFS_PROTO_PKG_LEN_SIZE = 8
FDFS_PROTO_CMD_SIZE = 1
FDFS_PROTO_STATUS_SIZE = 1
FDFS_PROTO_IP_PORT_SIZE = (IP_ADDRESS_SIZE + 6)
FDFS_MAX_SERVERS_EACH_GROUP = 32
FDFS_MAX_GROUPS = 512
FDFS_MAX_TRACKERS = 16
FDFS_DOMAIN_NAME_MAX_LEN = 128
FDFS_MAX_META_NAME_LEN = 64
FDFS_MAX_META_VALUE_LEN = 256
FDFS_FILE_PREFIX_MAX_LEN = 16
FDFS_LOGIC_FILE_PATH_LEN = 10
FDFS_TRUE_FILE_PATH_LEN = 6
FDFS_FILENAME_BASE64_LENGTH = 27
FDFS_TRUNK_FILE_INFO_LEN = 16
FDFS_FILE_EXT_NAME_MAX_LEN = 6
FDFS_SPACE_SIZE_BASE_INDEX = 2 # storage space size based (MB)
FDFS_UPLOAD_BY_BUFFER = 1
FDFS_UPLOAD_BY_FILENAME = 2
FDFS_UPLOAD_BY_FILE = 3
FDFS_DOWNLOAD_TO_BUFFER = 1
FDFS_DOWNLOAD_TO_FILE = 2
FDFS_NORMAL_LOGIC_FILENAME_LENGTH = (
FDFS_LOGIC_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_FILE_EXT_NAME_MAX_LEN + 1)
FDFS_TRUNK_FILENAME_LENGTH = (
FDFS_TRUE_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_TRUNK_FILE_INFO_LEN + 1 + FDFS_FILE_EXT_NAME_MAX_LEN)
FDFS_TRUNK_LOGIC_FILENAME_LENGTH = (FDFS_TRUNK_FILENAME_LENGTH + (FDFS_LOGIC_FILE_PATH_LEN - FDFS_TRUE_FILE_PATH_LEN))
FDFS_VERSION_SIZE = 6
TRACKER_QUERY_STORAGE_FETCH_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE)
TRACKER_QUERY_STORAGE_STORE_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE + 1)
# status code, order is important!
FDFS_STORAGE_STATUS_INIT = 0
FDFS_STORAGE_STATUS_WAIT_SYNC = 1
FDFS_STORAGE_STATUS_SYNCING = 2
FDFS_STORAGE_STATUS_IP_CHANGED = 3
FDFS_STORAGE_STATUS_DELETED = 4
FDFS_STORAGE_STATUS_OFFLINE = 5
FDFS_STORAGE_STATUS_ONLINE = 6
FDFS_STORAGE_STATUS_ACTIVE = 7
FDFS_STORAGE_STATUS_RECOVERY = 9
FDFS_STORAGE_STATUS_NONE = 99
class Storage_server(object):
'''Class storage server for upload.'''
def __init__(self):
self.ip_addr = None
self.port = None
self.group_name = ''
self.store_path_index = 0
# Class tracker_header
class Tracker_header(object):
'''
Class for Pack or Unpack tracker header
struct tracker_header{
char pkg_len[FDFS_PROTO_PKG_LEN_SIZE],
char cmd,
char status,
}
'''
def __init__(self):
self.fmt = '!QBB' # pkg_len[FDFS_PROTO_PKG_LEN_SIZE] + cmd + status
self.st = struct.Struct(self.fmt)
self.pkg_len = 0
self.cmd = 0
self.status = 0
def _pack(self, pkg_len=0, cmd=0, status=0):
return self.st.pack(pkg_len, cmd, status)
def _unpack(self, bytes_stream):
self.pkg_len, self.cmd, self.status = self.st.unpack(bytes_stream)
return True
def header_len(self):
return self.st.size
def send_header(self, conn):
'''Send Tracker header to server.'''
header = self._pack(self.pkg_len, self.cmd, self.status)
try:
conn._sock.sendall(header)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while writting to socket: %s' % (e.args,))
def recv_header(self, conn):
'''Receive response from server.
if success, class members (pkg_len, cmd, status) hold the response.
'''
try:
header = conn._sock.recv(self.header_len())
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: %s' % (e.args,))
self._unpack(header)
def fdfs_pack_metadata(meta_dict):
ret = ''
for key in meta_dict:
ret += '%s%c%s%c' % (key, FDFS_FIELD_SEPERATOR, meta_dict[key], FDFS_RECORD_SEPERATOR)
return ret[0:-1]
def fdfs_unpack_metadata(bytes_stream):
li = bytes_stream.split(FDFS_RECORD_SEPERATOR)
return dict([item.split(FDFS_FIELD_SEPERATOR) for item in li])
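A small sketch exercising the helpers defined above: Tracker_header packs and unpacks the 10-byte '!QBB' header (8-byte package length, 1-byte command, 1-byte status), and fdfs_pack_metadata / fdfs_unpack_metadata round-trip a metadata dict through the \x01 record and \x02 field separators.
from fdfs_client.fdfs_protol import (Tracker_header, fdfs_pack_metadata,
                                     fdfs_unpack_metadata, STORAGE_PROTO_CMD_UPLOAD_FILE)

th = Tracker_header()
th.pkg_len = 128
th.cmd = STORAGE_PROTO_CMD_UPLOAD_FILE               # 11
print(th.header_len())                               # 10 bytes: 8 + 1 + 1
packed = th._pack(th.pkg_len, th.cmd, th.status)
th._unpack(packed)                                   # restores pkg_len=128, cmd=11, status=0

meta = {'ext_name': 'pdf', 'file_size': '10240B'}
packed_meta = fdfs_pack_metadata(meta)               # 'ext_name\x02pdf\x01file_size\x0210240B'
print(fdfs_unpack_metadata(packed_meta) == meta)     # True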
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: storage_client.py
import os
import stat
import errno
import struct
import socket
import datetime
import platform
from fdfs_client.fdfs_protol import *
from fdfs_client.connection import *
# from test_fdfs.sendfile import *
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
from fdfs_client.utils import *
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def tcp_send_file(conn, filename, buffer_size=1024):
'''
Send file to server, and split into multiple pkgs while sending.
arguments:
@conn: connection
@filename: string
@buffer_size: int ,send buffer size
@Return int: file size if success else raise ConnectionError.
'''
file_size = 0
with open(filename, 'rb') as f:
while 1:
try:
send_buffer = f.read(buffer_size)
send_size = len(send_buffer)
if send_size == 0:
break
tcp_send_data(conn, send_buffer)
file_size += send_size
except ConnectionError as e:
raise ConnectionError('[-] Error while uploading file(%s).' % e.args)
except IOError as e:
raise DataError('[-] Error while reading local file(%s).' % e.args)
return file_size
def tcp_send_file_ex(conn, filename, buffer_size=4096):
'''
Send file to server. Using linux system call 'sendfile'.
arguments:
@conn: connection
@filename: string
@return long, sended size
'''
if 'linux' not in sys.platform.lower():
raise DataError('[-] Error: \'sendfile\' system call only available on linux.')
nbytes = 0
offset = 0
sock_fd = conn.get_sock().fileno()
with open(filename, 'rb') as f:
in_fd = f.fileno()
while 1:
try:
pass
# sent = sendfile(sock_fd, in_fd, offset, buffer_size)
# if 0 == sent:
# break
# nbytes += sent
# offset += sent
except OSError as e:
if e.errno == errno.EAGAIN:
continue
raise
return nbytes
def tcp_recv_file(conn, local_filename, file_size, buffer_size=1024):
'''
Receive a file from the server, fragmenting it while receiving and writing it to disk.
arguments:
@conn: connection
@local_filename: string
@file_size: int, remote file size
@buffer_size: int, receive buffer size
@Return int: file size if success else raise ConnectionError.
'''
total_file_size = 0
flush_size = 0
remain_bytes = file_size
with open(local_filename, 'wb+') as f:
while remain_bytes > 0:
try:
if remain_bytes >= buffer_size:
file_buffer, recv_size = tcp_recv_response(conn, buffer_size, buffer_size)
else:
file_buffer, recv_size = tcp_recv_response(conn, remain_bytes, buffer_size)
f.write(file_buffer)
remain_bytes -= buffer_size
total_file_size += recv_size
flush_size += recv_size
if flush_size >= 4096:
f.flush()
flush_size = 0
except ConnectionError as e:
raise ConnectionError('[-] Error: while downloading file(%s).' % e.args)
except IOError as e:
raise DataError('[-] Error: while writting local file(%s).' % e.args)
return total_file_size
class Storage_client(object):
'''
The Class Storage_client for storage server.
Note: the host_tuple argument holds the storage server ip address and should contain a single element.
'''
def __init__(self, *kwargs):
conn_kwargs = {
'name': 'Storage Pool',
'host_tuple': (kwargs[0],),
'port': kwargs[1],
'timeout': kwargs[2]
}
self.pool = ConnectionPool(**conn_kwargs)
return None
def __del__(self):
try:
self.pool.destroy()
self.pool = None
except:
pass
def update_pool(self, old_store_serv, new_store_serv, timeout=30):
'''
Update connection pool of storage client.
We need to update the connection pool of the storage client when the storage server changes;
if the server has not changed, we do nothing.
'''
if old_store_serv.ip_addr == new_store_serv.ip_addr:
return None
self.pool.destroy()
conn_kwargs = {
'name': 'Storage_pool',
'host_tuple': (new_store_serv.ip_addr,),
'port': new_store_serv.port,
'timeout': timeout
}
self.pool = ConnectionPool(**conn_kwargs)
return True
def _storage_do_upload_file(self, tracker_client, store_serv, file_buffer, file_size=None, upload_type=None,
meta_dict=None, cmd=None, master_filename=None, prefix_name=None, file_ext_name=None):
'''
Core of uploading a file.
arguments:
@tracker_client: Tracker_client, used to connect to the tracker server
@store_serv: Storage_server, returned from querying the tracker server
@file_buffer: string, file name or file buffer for send
@file_size: int
@upload_type: int, optional: FDFS_UPLOAD_BY_FILE, FDFS_UPLOAD_BY_FILENAME,
FDFS_UPLOAD_BY_BUFFER
@meta_dic: dictionary, metadata to store with the file
@cmd: int, see the fdfs protocol command constants
@master_filename: string, used when uploading a slave file
@prefix_name: string
@file_ext_name: string
@Return dictionary
{
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : status,
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
}
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
master_filename_len = len(master_filename) if master_filename else 0
prefix_name_len = len(prefix_name) if prefix_name else 0
upload_slave = len(store_serv.group_name) and master_filename_len
file_ext_name = str(file_ext_name) if file_ext_name else ''
# non_slave_fmt |-store_path_index(1)-file_size(8)-file_ext_name(6)-|
non_slave_fmt = '!B Q %ds' % FDFS_FILE_EXT_NAME_MAX_LEN
# slave_fmt |-master_len(8)-file_size(8)-prefix_name(16)-file_ext_name(6)
# -master_name(master_filename_len)-|
slave_fmt = '!Q Q %ds %ds %ds' % (FDFS_FILE_PREFIX_MAX_LEN, FDFS_FILE_EXT_NAME_MAX_LEN, master_filename_len)
th.pkg_len = struct.calcsize(slave_fmt) if upload_slave else struct.calcsize(non_slave_fmt)
th.pkg_len += file_size
th.cmd = cmd
th.send_header(store_conn)
if upload_slave:
send_buffer = struct.pack(
slave_fmt, master_filename_len, file_size, prefix_name, file_ext_name, master_filename)
else:
send_buffer = struct.pack(non_slave_fmt, store_serv.store_path_index, file_size, file_ext_name.encode())
try:
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
send_file_size = tcp_send_file(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
send_file_size = tcp_send_file_ex(store_conn, file_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
if recv_size <= FDFS_GROUP_NAME_MAX_LEN:
errmsg = '[-] Error: Storage response length is not match, '
errmsg += 'expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
# recv_fmt: |-group_name(16)-remote_file_name(recv_size - 16)-|
recv_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, th.pkg_len - FDFS_GROUP_NAME_MAX_LEN)
(group_name, remote_name) = struct.unpack(recv_fmt, recv_buffer)
remote_filename = remote_name.strip(b'\x00')
if meta_dict and len(meta_dict) > 0:
status = self.storage_set_metadata(tracker_client, store_serv, remote_filename, meta_dict)
if status != 0:
# rollback
self.storage_delete_file(tracker_client, store_serv, remote_filename)
raise DataError('[-] Error: %d, %s' % (status, os.strerror(status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dic = {
'Group name': group_name.strip(b'\x00'),
'Remote file_id': group_name.strip(b'\x00') + __os_sep__.encode() + remote_filename,
'Status': 'Upload successed.',
'Local file name': file_buffer if (upload_type == FDFS_UPLOAD_BY_FILENAME
or upload_type == FDFS_UPLOAD_BY_FILE
) else '',
'Uploaded size': appromix(send_file_size) if (upload_type == FDFS_UPLOAD_BY_FILENAME
or upload_type == FDFS_UPLOAD_BY_FILE
) else appromix(len(file_buffer)),
'Storage IP': store_serv.ip_addr
}
return ret_dic
def storage_upload_by_filename(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_by_file(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_by_buffer(self, tracker_client, store_serv, file_buffer, file_ext_name=None, meta_dict=None):
buffer_size = len(file_buffer)
return self._storage_do_upload_file(tracker_client, store_serv, file_buffer, buffer_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_slave_by_filename(self, tracker_client, store_serv, filename, prefix_name, remote_filename,
meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, remote_filename,
prefix_name, file_ext_name)
def storage_upload_slave_by_file(self, tracker_client, store_serv, filename, prefix_name, remote_filename,
meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, remote_filename,
prefix_name, file_ext_name)
def storage_upload_slave_by_buffer(self, tracker_client, store_serv, filebuffer, remote_filename, meta_dict,
file_ext_name):
file_size = len(filebuffer)
return self._storage_do_upload_file(tracker_client, store_serv, filebuffer, file_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, None, remote_filename,
file_ext_name)
def storage_upload_appender_by_filename(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_upload_appender_by_file(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_upload_appender_by_buffer(self, tracker_client, store_serv, file_buffer, meta_dict=None,
file_ext_name=None):
file_size = len(file_buffer)
return self._storage_do_upload_file(tracker_client, store_serv, file_buffer, file_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_delete_file(self, tracker_client, store_serv, remote_filename):
'''
Delete file from storage server.
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_DELETE_FILE
file_name_len = len(remote_filename)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + file_name_len
try:
th.send_header(store_conn)
# del_fmt: |-group_name(16)-filename(len)-|
del_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, file_name_len)
send_buffer = struct.pack(del_fmt, store_serv.group_name, remote_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: remote file %s is not exist.'
# % (store_serv.group_name + __os_sep__.encode() + remote_filename))
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
# recv_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
remote_filename = store_serv.group_name + __os_sep__.encode() + remote_filename
return ('Delete file successed.', remote_filename, store_serv.ip_addr)
def _storage_do_download_file(self, tracker_client, store_serv, file_buffer, offset, download_size,
download_type, remote_filename):
'''
Core of downloading a file from the storage server.
You can choose the download type, either FDFS_DOWNLOAD_TO_FILE or
FDFS_DOWNLOAD_TO_BUFFER, and you can choose the file offset.
@Return dictionary
'Remote file name' : remote_filename,
'Content' : local_filename or buffer,
'Download size' : download_size,
'Storage IP' : storage_ip
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
remote_filename_len = len(remote_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + FDFS_GROUP_NAME_MAX_LEN + remote_filename_len
th.cmd = STORAGE_PROTO_CMD_DOWNLOAD_FILE
try:
th.send_header(store_conn)
# down_fmt: |-offset(8)-download_bytes(8)-group_name(16)-remote_filename(len)-|
down_fmt = '!Q Q %ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len)
send_buffer = struct.pack(down_fmt, offset, download_size, store_serv.group_name, remote_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: remote file %s is not exist.' %
# (store_serv.group_name + __os_sep__.encode() + remote_filename))
if th.status != 0:
raise DataError('Error: %d %s' % (th.status, os.strerror(th.status)))
if download_type == FDFS_DOWNLOAD_TO_FILE:
total_recv_size = tcp_recv_file(store_conn, file_buffer, th.pkg_len)
elif download_type == FDFS_DOWNLOAD_TO_BUFFER:
recv_buffer, total_recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
ret_dic = {
'Remote file_id': store_serv.group_name + __os_sep__.encode() + remote_filename,
'Content': file_buffer if download_type == FDFS_DOWNLOAD_TO_FILE else recv_buffer,
'Download size': appromix(total_recv_size),
'Storage IP': store_serv.ip_addr
}
return ret_dic
def storage_download_to_file(self, tracker_client, store_serv, local_filename, file_offset, download_bytes,
remote_filename):
return self._storage_do_download_file(tracker_client, store_serv, local_filename, file_offset, download_bytes,
FDFS_DOWNLOAD_TO_FILE, remote_filename)
def storage_download_to_buffer(self, tracker_client, store_serv, file_buffer, file_offset, download_bytes,
remote_filename):
return self._storage_do_download_file(tracker_client, store_serv, file_buffer, file_offset, download_bytes,
FDFS_DOWNLOAD_TO_BUFFER, remote_filename)
def storage_set_metadata(self, tracker_client, store_serv, remote_filename, meta_dict,
op_flag=STORAGE_SET_METADATA_FLAG_OVERWRITE):
ret = 0
conn = self.pool.get_connection()
remote_filename_len = len(remote_filename)
meta_buffer = fdfs_pack_metadata(meta_dict)
meta_len = len(meta_buffer)
th = Tracker_header()
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + 1 + FDFS_GROUP_NAME_MAX_LEN + remote_filename_len + meta_len
th.cmd = STORAGE_PROTO_CMD_SET_METADATA
try:
th.send_header(conn)
# meta_fmt: |-filename_len(8)-meta_len(8)-op_flag(1)-group_name(16)
# -filename(remote_filename_len)-meta(meta_len)|
meta_fmt = '!Q Q c %ds %ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len, meta_len)
send_buffer = struct.pack(meta_fmt, remote_filename_len, meta_len, op_flag, store_serv.group_name,
remote_filename, meta_buffer)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
ret = th.status
except:
raise
finally:
self.pool.release(conn)
return ret
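    # Sketch of the metadata shape this call expects (it mirrors the meta_dict
    # example in the upload docstring of this client); the return value is 0 on
    # success, otherwise the storage server's status code:
    #
    #   meta_dict = {'ext_name': 'jpg', 'file_size': '10240B', 'width': '160px'}
    #   ret = store.storage_set_metadata(tc, store_serv, remote_filename, meta_dict)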
def storage_get_metadata(self, tracker_client, store_serv, remote_file_name):
store_conn = self.pool.get_connection()
th = Tracker_header()
remote_filename_len = len(remote_file_name)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + remote_filename_len
th.cmd = STORAGE_PROTO_CMD_GET_METADATA
try:
th.send_header(store_conn)
# meta_fmt: |-group_name(16)-filename(remote_filename_len)-|
meta_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len)
send_buffer = struct.pack(meta_fmt, store_serv.group_name, remote_file_name.encode())
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: Remote file %s has no meta data.'
# % (store_serv.group_name + __os_sep__.encode() + remote_file_name))
if th.status != 0:
raise DataError('[-] Error:%d, %s' % (th.status, os.strerror(th.status)))
if th.pkg_len == 0:
ret_dict = {}
meta_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = fdfs_unpack_metadata(meta_buffer)
return ret_dict
def _storage_do_append_file(self, tracker_client, store_serv, file_buffer, file_size, upload_type,
appended_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
appended_filename_len = len(appended_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + appended_filename_len + file_size
th.cmd = STORAGE_PROTO_CMD_APPEND_FILE
try:
th.send_header(store_conn)
# append_fmt: |-appended_filename_len(8)-file_size(8)-appended_filename(len)
# -filecontent(filesize)-|
append_fmt = '!Q Q %ds' % appended_filename_len
send_buffer = struct.pack(append_fmt, appended_filename_len, file_size, appended_filename)
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
tcp_send_file(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
tcp_send_file_ex(store_conn, file_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
ret_dict['Status'] = 'Append file successed.'
ret_dict['Appender file name'] = store_serv.group_name + __os_sep__.encode() + appended_filename
ret_dict['Appended size'] = appromix(file_size)
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_append_by_filename(self, tracker_client, store_serv, local_filename, appended_filename):
file_size = os.stat(local_filename).st_size
return self._storage_do_append_file(tracker_client, store_serv, local_filename, file_size,
FDFS_UPLOAD_BY_FILENAME, appended_filename)
def storage_append_by_file(self, tracker_client, store_serv, local_filename, appended_filename):
file_size = os.stat(local_filename).st_size
return self._storage_do_append_file(tracker_client, store_serv, local_filename, file_size, FDFS_UPLOAD_BY_FILE,
appended_filename)
def storage_append_by_buffer(self, tracker_client, store_serv, file_buffer, appended_filename):
file_size = len(file_buffer)
return self._storage_do_append_file(tracker_client, store_serv, file_buffer, file_size, FDFS_UPLOAD_BY_BUFFER,
appended_filename)
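    # Hedged sketch of appending to a remote file (assumption: the remote file was
    # originally created as a FastDFS appender file, which APPEND_FILE requires):
    #
    #   ret = store.storage_append_by_buffer(tc, store_serv, b'more bytes', appender_filename)
    #   print(ret['Status'], ret['Appended size'], ret['Storage IP'])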
def _storage_do_truncate_file(self, tracker_client, store_serv, truncated_filesize, appender_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_TRUNCATE_FILE
appender_filename_len = len(appender_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + appender_filename_len
try:
th.send_header(store_conn)
# truncate_fmt:|-appender_filename_len(8)-truncate_filesize(8)
# -appender_filename(len)-|
truncate_fmt = '!Q Q %ds' % appender_filename_len
send_buffer = struct.pack(truncate_fmt, appender_filename_len, truncated_filesize, appender_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
ret_dict['Status'] = 'Truncate successed.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_truncate_file(self, tracker_client, store_serv, truncated_filesize, appender_filename):
return self._storage_do_truncate_file(tracker_client, store_serv, truncated_filesize, appender_filename)
def _storage_do_modify_file(self, tracker_client, store_serv, upload_type, filebuffer, offset, filesize,
appender_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_MODIFY_FILE
appender_filename_len = len(appender_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 3 + appender_filename_len + filesize
try:
th.send_header(store_conn)
# modify_fmt: |-filename_len(8)-offset(8)-filesize(8)-filename(len)-|
modify_fmt = '!Q Q Q %ds' % appender_filename_len
send_buffer = struct.pack(modify_fmt, appender_filename_len, offset, filesize, appender_filename)
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
upload_size = tcp_send_file(store_conn, filebuffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, filebuffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
upload_size = tcp_send_file_ex(store_conn, filebuffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
ret_dict['Status'] = 'Modify successed.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_modify_by_filename(self, tracker_client, store_serv, filename, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_FILENAME, filename, offset,
filesize, appender_filename)
def storage_modify_by_file(self, tracker_client, store_serv, filename, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_FILE, filename, offset, filesize,
appender_filename)
def storage_modify_by_buffer(self, tracker_client, store_serv, filebuffer, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_BUFFER, filebuffer, offset,
filesize, appender_filename)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: tracker_client.py
import struct
import socket
from datetime import datetime
from fdfs_client.fdfs_protol import *
from fdfs_client.connection import *
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
from fdfs_client.utils import *
def parse_storage_status(status_code):
try:
ret = {
FDFS_STORAGE_STATUS_INIT: lambda: 'INIT',
FDFS_STORAGE_STATUS_WAIT_SYNC: lambda: 'WAIT_SYNC',
FDFS_STORAGE_STATUS_SYNCING: lambda: 'SYNCING',
FDFS_STORAGE_STATUS_IP_CHANGED: lambda: 'IP_CHANGED',
FDFS_STORAGE_STATUS_DELETED: lambda: 'DELETED',
FDFS_STORAGE_STATUS_OFFLINE: lambda: 'OFFLINE',
FDFS_STORAGE_STATUS_ONLINE: lambda: 'ONLINE',
FDFS_STORAGE_STATUS_ACTIVE: lambda: 'ACTIVE',
FDFS_STORAGE_STATUS_RECOVERY: lambda: 'RECOVERY'
}[status_code]()
except KeyError:
        ret = 'UNKNOWN'
return ret
class Storage_info(object):
def __init__(self):
self.status = 0
self.id = ''
self.ip_addr = ''
self.domain_name = ''
self.src_id = ''
self.version = ''
self.join_time = datetime.fromtimestamp(0).isoformat()
self.up_time = datetime.fromtimestamp(0).isoformat()
self.totalMB = ''
self.freeMB = ''
self.upload_prio = 0
self.store_path_count = 0
self.subdir_count_per_path = 0
self.curr_write_path = 0
self.storage_port = 23000
self.storage_http_port = 80
self.alloc_count = 0
self.current_count = 0
self.max_count = 0
self.total_upload_count = 0
self.success_upload_count = 0
self.total_append_count = 0
self.success_append_count = 0
self.total_modify_count = 0
self.success_modify_count = 0
self.total_truncate_count = 0
self.success_truncate_count = 0
self.total_setmeta_count = 0
self.success_setmeta_count = 0
self.total_del_count = 0
self.success_del_count = 0
self.total_download_count = 0
self.success_download_count = 0
self.total_getmeta_count = 0
self.success_getmeta_count = 0
self.total_create_link_count = 0
self.success_create_link_count = 0
self.total_del_link_count = 0
self.success_del_link_count = 0
self.total_upload_bytes = 0
self.success_upload_bytes = 0
self.total_append_bytes = 0
self.success_append_bytes = 0
self.total_modify_bytes = 0
self.success_modify_bytes = 0
self.total_download_bytes = 0
self.success_download_bytes = 0
self.total_sync_in_bytes = 0
self.success_sync_in_bytes = 0
self.total_sync_out_bytes = 0
self.success_sync_out_bytes = 0
self.total_file_open_count = 0
self.success_file_open_count = 0
self.total_file_read_count = 0
self.success_file_read_count = 0
self.total_file_write_count = 0
self.success_file_write_count = 0
self.last_source_sync = datetime.fromtimestamp(0).isoformat()
self.last_sync_update = datetime.fromtimestamp(0).isoformat()
self.last_synced_time = datetime.fromtimestamp(0).isoformat()
self.last_heartbeat_time = datetime.fromtimestamp(0).isoformat()
self.if_trunk_server = ''
        # fmt: |-status(1)-id(16)-ipaddr(16)-domain(128)-src_id(16)-ver(6)-10*8-3*4-42*8-trunk(1)-|
self.fmt = '!B 16s 16s 128s 16s 6s 10Q 4s4s4s 42Q?'
def set_info(self, bytes_stream):
(self.status, self.id, ip_addr, domain_name, self.src_id, version, join_time, up_time, totalMB, freeMB,
self.upload_prio, self.store_path_count, self.subdir_count_per_path, self.curr_write_path, self.storage_port,
self.storage_http_port, self.alloc_count, self.current_count, self.max_count, self.total_upload_count,
self.success_upload_count, self.total_append_count, self.success_append_count, self.total_modify_count,
self.success_modify_count, self.total_truncate_count, self.success_truncate_count, self.total_setmeta_count,
self.success_setmeta_count, self.total_del_count, self.success_del_count, self.total_download_count,
self.success_download_count, self.total_getmeta_count, self.success_getmeta_count,
self.total_create_link_count, self.success_create_link_count, self.total_del_link_count,
self.success_del_link_count, self.total_upload_bytes, self.success_upload_bytes, self.total_append_bytes,
        self.success_append_bytes, self.total_modify_bytes, self.success_modify_bytes, self.total_download_bytes,
self.success_download_bytes, self.total_sync_in_bytes, self.success_sync_in_bytes, self.total_sync_out_bytes,
self.success_sync_out_bytes, self.total_file_open_count, self.success_file_open_count,
self.total_file_read_count, self.success_file_read_count, self.total_file_write_count,
self.success_file_write_count, last_source_sync, last_sync_update, last_synced_time, last_heartbeat_time,
self.if_trunk_server,) = struct.unpack(self.fmt, bytes_stream)
try:
self.ip_addr = ip_addr.strip(b'\x00')
self.domain_name = domain_name.strip(b'\x00')
self.version = version.strip(b'\x00')
self.totalMB = appromix(totalMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.freeMB = appromix(freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
except ValueError as e:
            raise ResponseError('[-] Error: disk space overrun, cannot represent it.')
self.join_time = datetime.fromtimestamp(join_time).isoformat()
self.up_time = datetime.fromtimestamp(up_time).isoformat()
self.last_source_sync = datetime.fromtimestamp(last_source_sync).isoformat()
self.last_sync_update = datetime.fromtimestamp(last_sync_update).isoformat()
self.last_synced_time = datetime.fromtimestamp(last_synced_time).isoformat()
self.last_heartbeat_time = datetime.fromtimestamp(last_heartbeat_time).isoformat()
return True
def __str__(self):
'''Transform to readable string.'''
s = 'Storage information:\n'
s += '\tip_addr = %s (%s)\n' % (self.ip_addr, parse_storage_status(self.status))
s += '\thttp domain = %s\n' % self.domain_name
s += '\tversion = %s\n' % self.version
s += '\tjoin time = %s\n' % self.join_time
s += '\tup time = %s\n' % self.up_time
s += '\ttotal storage = %s\n' % self.totalMB
s += '\tfree storage = %s\n' % self.freeMB
s += '\tupload priority = %d\n' % self.upload_prio
s += '\tstore path count = %d\n' % self.store_path_count
s += '\tsubdir count per path = %d\n' % self.subdir_count_per_path
s += '\tstorage port = %d\n' % self.storage_port
s += '\tstorage HTTP port = %d\n' % self.storage_http_port
s += '\tcurrent write path = %d\n' % self.curr_write_path
s += '\tsource ip_addr = %s\n' % self.ip_addr
s += '\tif_trunk_server = %d\n' % self.if_trunk_server
s += '\ttotal upload count = %ld\n' % self.total_upload_count
s += '\tsuccess upload count = %ld\n' % self.success_upload_count
s += '\ttotal download count = %ld\n' % self.total_download_count
s += '\tsuccess download count = %ld\n' % self.success_download_count
s += '\ttotal append count = %ld\n' % self.total_append_count
s += '\tsuccess append count = %ld\n' % self.success_append_count
s += '\ttotal modify count = %ld\n' % self.total_modify_count
s += '\tsuccess modify count = %ld\n' % self.success_modify_count
s += '\ttotal truncate count = %ld\n' % self.total_truncate_count
s += '\tsuccess truncate count = %ld\n' % self.success_truncate_count
s += '\ttotal delete count = %ld\n' % self.total_del_count
s += '\tsuccess delete count = %ld\n' % self.success_del_count
s += '\ttotal set_meta count = %ld\n' % self.total_setmeta_count
s += '\tsuccess set_meta count = %ld\n' % self.success_setmeta_count
s += '\ttotal get_meta count = %ld\n' % self.total_getmeta_count
s += '\tsuccess get_meta count = %ld\n' % self.success_getmeta_count
s += '\ttotal create link count = %ld\n' % self.total_create_link_count
s += '\tsuccess create link count = %ld\n' % self.success_create_link_count
s += '\ttotal delete link count = %ld\n' % self.total_del_link_count
s += '\tsuccess delete link count = %ld\n' % self.success_del_link_count
s += '\ttotal upload bytes = %ld\n' % self.total_upload_bytes
s += '\tsuccess upload bytes = %ld\n' % self.success_upload_bytes
s += '\ttotal download bytes = %ld\n' % self.total_download_bytes
s += '\tsuccess download bytes = %ld\n' % self.success_download_bytes
s += '\ttotal append bytes = %ld\n' % self.total_append_bytes
s += '\tsuccess append bytes = %ld\n' % self.success_append_bytes
s += '\ttotal modify bytes = %ld\n' % self.total_modify_bytes
s += '\tsuccess modify bytes = %ld\n' % self.success_modify_bytes
s += '\ttotal sync_in bytes = %ld\n' % self.total_sync_in_bytes
s += '\tsuccess sync_in bytes = %ld\n' % self.success_sync_in_bytes
s += '\ttotal sync_out bytes = %ld\n' % self.total_sync_out_bytes
s += '\tsuccess sync_out bytes = %ld\n' % self.success_sync_out_bytes
s += '\ttotal file open count = %ld\n' % self.total_file_open_count
s += '\tsuccess file open count = %ld\n' % self.success_file_open_count
s += '\ttotal file read count = %ld\n' % self.total_file_read_count
s += '\tsuccess file read count = %ld\n' % self.success_file_read_count
s += '\ttotal file write count = %ld\n' % self.total_file_write_count
        s += '\tsuccess file write count = %ld\n' % self.success_file_write_count
s += '\tlast heartbeat time = %s\n' % self.last_heartbeat_time
s += '\tlast source update = %s\n' % self.last_source_sync
s += '\tlast sync update = %s\n' % self.last_sync_update
s += '\tlast synced time = %s\n' % self.last_synced_time
return s
def get_fmt_size(self):
return struct.calcsize(self.fmt)
class Group_info(object):
def __init__(self):
self.group_name = ''
self.totalMB = ''
self.freeMB = ''
self.trunk_freeMB = ''
self.count = 0
self.storage_port = 0
self.store_http_port = 0
self.active_count = 0
self.curr_write_server = 0
self.store_path_count = 0
self.subdir_count_per_path = 0
self.curr_trunk_file_id = 0
self.fmt = '!%ds 11Q' % (FDFS_GROUP_NAME_MAX_LEN + 1)
return None
def __str__(self):
s = 'Group information:\n'
s += '\tgroup name = %s\n' % self.group_name
s += '\ttotal disk space = %s\n' % self.totalMB
s += '\tdisk free space = %s\n' % self.freeMB
s += '\ttrunk free space = %s\n' % self.trunk_freeMB
s += '\tstorage server count = %d\n' % self.count
s += '\tstorage port = %d\n' % self.storage_port
s += '\tstorage HTTP port = %d\n' % self.store_http_port
s += '\tactive server count = %d\n' % self.active_count
s += '\tcurrent write server index = %d\n' % self.curr_write_server
s += '\tstore path count = %d\n' % self.store_path_count
s += '\tsubdir count per path = %d\n' % self.subdir_count_per_path
s += '\tcurrent trunk file id = %d\n' % self.curr_trunk_file_id
return s
def set_info(self, bytes_stream):
(group_name, totalMB, freeMB, trunk_freeMB, self.count, self.storage_port, self.store_http_port,
self.active_count, self.curr_write_server, self.store_path_count, self.subdir_count_per_path,
self.curr_trunk_file_id) = struct.unpack(self.fmt, bytes_stream)
try:
self.group_name = group_name.strip(b'\x00')
self.freeMB = appromix(freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.totalMB = appromix(totalMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.trunk_freeMB = appromix(trunk_freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
except ValueError:
            raise DataError('[-] Error: disk space overrun, cannot represent it.')
def get_fmt_size(self):
return struct.calcsize(self.fmt)
class Tracker_client(object):
'''Class Tracker client.'''
def __init__(self, pool):
self.pool = pool
def tracker_list_servers(self, group_name, storage_ip=None):
'''
List servers in a storage group
'''
conn = self.pool.get_connection()
th = Tracker_header()
ip_len = len(storage_ip) if storage_ip else 0
if ip_len >= IP_ADDRESS_SIZE:
ip_len = IP_ADDRESS_SIZE - 1
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + ip_len
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_STORAGE
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
store_ip_addr = storage_ip or ''
storage_ip_fmt = '!%ds' % ip_len
try:
th.send_header(conn)
send_buffer = struct.pack(group_fmt, group_name) + struct.pack(storage_ip_fmt, store_ip_addr)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
si = Storage_info()
si_fmt_size = si.get_fmt_size()
recv_size = len(recv_buffer)
if recv_size % si_fmt_size != 0:
errinfo = '[-] Error: response size not match, expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errinfo)
except ConnectionError:
raise
finally:
self.pool.release(conn)
        num_storage = recv_size // si_fmt_size
si_list = []
i = 0
while num_storage:
si.set_info(recv_buffer[(i * si_fmt_size): ((i + 1) * si_fmt_size)])
si_list.append(si)
si = Storage_info()
num_storage -= 1
i += 1
ret_dict = {}
ret_dict['Group name'] = group_name
ret_dict['Servers'] = si_list
return ret_dict
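    # Minimal sketch: list the storage servers of one group and print the readable
    # block built by Storage_info.__str__ (defined earlier in this file):
    #
    #   tc = Tracker_client(tracker_pool)
    #   ret = tc.tracker_list_servers(b'group1')
    #   print('group:', ret['Group name'])
    #   for si in ret['Servers']:
    #       print(si)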
def tracker_list_one_group(self, group_name):
conn = self.pool.get_connection()
th = Tracker_header()
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP
# group_fmt: |-group_name(16)-|
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
try:
th.send_header(conn)
send_buffer = struct.pack(group_fmt, group_name)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
group_info = Group_info()
group_info.set_info(recv_buffer)
except ConnectionError:
raise
finally:
self.pool.release(conn)
return group_info
def tracker_list_all_groups(self):
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS
try:
th.send_header(conn)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
except:
raise
finally:
self.pool.release(conn)
gi = Group_info()
gi_fmt_size = gi.get_fmt_size()
if recv_size % gi_fmt_size != 0:
            errmsg = '[-] Error: Response size mismatch, expect: %d, actual: %d' % (th.pkg_len, recv_size)
            raise ResponseError(errmsg)
        num_groups = recv_size // gi_fmt_size
ret_dict = {}
ret_dict['Groups count'] = num_groups
gi_list = []
i = 0
while num_groups:
gi.set_info(recv_buffer[i * gi_fmt_size: (i + 1) * gi_fmt_size])
gi_list.append(gi)
gi = Group_info()
i += 1
num_groups -= 1
ret_dict['Groups'] = gi_list
return ret_dict
def tracker_query_storage_stor_without_group(self):
'''Query storage server for upload, without group name.
Return: Storage_server object'''
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE
try:
th.send_header(conn)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_STORE_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
errmsg += 'expect: %d, actual: %d' % (TRACKER_QUERY_STORAGE_STORE_BODY_LEN, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt |-group_name(16)-ipaddr(16-1)-port(8)-store_path_index(1)|
recv_fmt = '!%ds %ds Q B' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group_name, ip_addr, store_serv.port, store_serv.store_path_index) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group_name.strip(b'\x00')
store_serv.ip_addr = ip_addr.strip(b'\x00')
return store_serv
def tracker_query_storage_stor_with_group(self, group_name):
        '''Query storage server for upload, based on group name.
arguments:
@group_name: string
@Return Storage_server object
'''
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN
th.send_header(conn)
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
send_buffer = struct.pack(group_fmt, group_name)
try:
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_STORE_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
errmsg += 'expect: %d, actual: %d' % (TRACKER_QUERY_STORAGE_STORE_BODY_LEN, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt: |-group_name(16)-ipaddr(16-1)-port(8)-store_path_index(1)-|
recv_fmt = '!%ds %ds Q B' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group, ip_addr, store_serv.port, store_serv.store_path_index) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group.strip(b'\x00')
store_serv.ip_addr = ip_addr.strip(b'\x00')
return store_serv
def _tracker_do_query_storage(self, group_name, filename, cmd):
'''
        Core of query storage, based on group name and filename.
        It is used for download, delete and set_meta.
arguments:
@group_name: string
@filename: string. remote file_id
@Return: Storage_server object
'''
conn = self.pool.get_connection()
th = Tracker_header()
file_name_len = len(filename)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + file_name_len
th.cmd = cmd
th.send_header(conn)
# query_fmt: |-group_name(16)-filename(file_name_len)-|
query_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, file_name_len)
send_buffer = struct.pack(query_fmt, group_name, filename)
try:
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_FETCH_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
errmsg += 'expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt: |-group_name(16)-ip_addr(16)-port(8)-|
recv_fmt = '!%ds %ds Q' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group_name, ipaddr, store_serv.port) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group_name.strip(b'\x00')
store_serv.ip_addr = ipaddr.strip(b'\x00')
return store_serv
def tracker_query_storage_update(self, group_name, filename):
'''
Query storage server to update(delete and set_meta).
'''
return self._tracker_do_query_storage(group_name, filename, TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE)
def tracker_query_storage_fetch(self, group_name, filename):
'''
Query storage server to download.
'''
return self._tracker_do_query_storage(group_name, filename, TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: utils.py
import io
import os
import sys
import stat
import platform
import configparser
SUFFIX = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def appromix(size, base=0):
    '''Convert a byte-stream size to human-readable format.
Keyword arguments:
size: int, bytes stream size
base: int, suffix index
Return: string
'''
multiples = 1024
if size < 0:
raise ValueError('[-] Error: number must be non-negative.')
if size < multiples:
return '{0:d}{1}'.format(size, SUFFIX[base])
for suffix in SUFFIX[base:]:
if size < multiples:
return '{0:.2f}{1}'.format(size, suffix)
size = size / float(multiples)
raise ValueError('[-] Error: number too big.')
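# Worked values for appromix (illustrative only, not part of the original file):
#
#   appromix(512)            -> '512B'
#   appromix(1536)           -> '1.50KB'
#   appromix(3 * 1024 ** 3)  -> '3.00GB'
#
# A non-zero base treats the input as already being in a larger unit; the tracker
# statistics elsewhere in this client pass MB counts with FDFS_SPACE_SIZE_BASE_INDEX.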
def get_file_ext_name(filename, double_ext=True):
li = filename.split(os.extsep)
if len(li) <= 1:
return ''
else:
if li[-1].find(__os_sep__) != -1:
return ''
if double_ext:
if len(li) > 2:
if li[-2].find(__os_sep__) == -1:
return '%s.%s' % (li[-2], li[-1])
return li[-1]
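# Example behaviour (hypothetical filenames):
#
#   get_file_ext_name('photo.tar.gz')                    -> 'tar.gz'
#   get_file_ext_name('photo.tar.gz', double_ext=False)  -> 'gz'
#   get_file_ext_name('README')                          -> ''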
class Fdfs_ConfigParser(configparser.RawConfigParser):
"""
Extends ConfigParser to allow files without sections.
This is done by wrapping read files and prepending them with a placeholder
section, which defaults to '__config__'
"""
def __init__(self, default_section=None, *args, **kwargs):
configparser.RawConfigParser.__init__(self, *args, **kwargs)
self._default_section = None
self.set_default_section(default_section or '__config__')
def get_default_section(self):
return self._default_section
def set_default_section(self, section):
self.add_section(section)
# move all values from the previous default section to the new one
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
except configparser.NoSectionError:
pass
else:
for (key, value) in default_section_items:
self.set(section, key, value)
self._default_section = section
def read(self, filenames):
if isinstance(filenames, str):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
with open(filename) as fp:
self.readfp(fp)
except IOError:
continue
else:
read_ok.append(filename)
return read_ok
def readfp(self, fp, *args, **kwargs):
stream = io.StringIO()
try:
stream.name = fp.name
except AttributeError:
pass
stream.write('[' + self._default_section + ']\n')
stream.write(fp.read())
stream.seek(0, 0)
return self._read(stream, stream.name)
def write(self, fp):
# Write the items from the default section manually and then remove them
# from the data. They'll be re-added later.
        default_section_items = []
        try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
for (key, value) in default_section_items:
fp.write("{0} = {1}\n".format(key, value))
fp.write("\n")
except configparser.NoSectionError:
pass
configparser.RawConfigParser.write(self, fp)
self.add_section(self._default_section)
for (key, value) in default_section_items:
self.set(self._default_section, key, value)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
The sections in setup file contains a title line at the top,
indicated by a name in square brackets (`[]'), plus key/value
options lines, indicated by `name: value' format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' or line[0] in '#;':
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname] = "%s\n%s" % (cursect[optname], value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
                    elif sectname == configparser.DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
                    raise configparser.MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self.OPTCRE.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos - 1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
optname = self.optionxform(optname.rstrip())
if optname in cursect:
if not isinstance(cursect[optname], list):
cursect[optname] = [cursect[optname]]
cursect[optname].append(optval)
else:
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
                        e = configparser.ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
def split_remote_fileid(remote_file_id):
'''
    Split remote_file_id into (group_name, remote_file_name)
arguments:
@remote_file_id: string
@return tuple, (group_name, remote_file_name)
'''
index = remote_file_id.find(b'/')
if -1 == index:
return None
return (remote_file_id[0:index], remote_file_id[(index + 1):])
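# Example (file ids are handled as bytes in this client, so b'/' is searched):
#
#   split_remote_fileid(b'group1/M00/00/00/example.jpg')
#   # -> (b'group1', b'M00/00/00/example.jpg')
#   split_remote_fileid(b'no-separator-here')  # -> None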
def fdfs_check_file(filename):
ret = True
errmsg = ''
if not os.path.isfile(filename):
ret = False
errmsg = '[-] Error: %s is not a file.' % filename
elif not stat.S_ISREG(os.stat(filename).st_mode):
ret = False
errmsg = '[-] Error: %s is not a regular file.' % filename
return (ret, errmsg)
if __name__ == '__main__':
print(get_file_ext_name('/bc.tar.gz'))
# link = "../path/to/file/../folder/../"
#
# count = link.count("../")
#
# print(count) # 输出:4
#
# my_list = [1, 2, 3]
#
# result = ','.join(map(str, my_list))
#
# print(type(result)) # 输出:1,2,3
import io
import os
import requests
import win32com.client as win32
import win32com
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'Secure HttpOnly; __jsluid_h=622c9569b7d6db50280612dfbe457ec4; __jsluid_s=b76555cb5e11504bde71924ab7ff780a; _va_ref=%5B%22%22%2C%22%22%2C1693534257%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dm-wPghSYa5m0rKj3FjcXWqp8gfXcvYwljt9y1RdmIrHXd1lAl4QhbIbXDjZR9hZY%26wd%3D%26eqid%3Da0f738ed0005f760000000036459cd31%22%5D; _va_id=afdeea114ed19176.1683278397.11.1693534257.1693534257.; _va_ses=*',
'Host': 'gzw.beijing.gov.cn',
'If-Modified-Since': 'Mon, 31 Jul 2023 13:33:44 GMT',
'If-None-Match': 'W/"64c7b838-9676"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'}
# def doc_page(doc_bytes):
# doc = Document(doc_bytes)
# num_pages = len(doc.sections)
# print("文档页数:", num_pages)
# return num_pages
#
# file_href = 'https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc'
# resp_content = requests.get(file_href,headers=headers,verify=False, timeout=20).content
# doc_bytes = BytesIO(resp_content)
# category = '.doc'
# if category == '.doc' or category == '.docx':
# page_size = doc_page(doc_bytes)
# print(page_size)
# from docx import Document
# import requests
# from io import BytesIO
# import docx2txt
# import docx
# # 获取远程Word文档内容
# url = "https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc"
# response = requests.get(url,headers=headers,verify=False, timeout=20)
# doc_bytes = BytesIO(response.content)
#
# # 解析文档页数
# # doc = Document(doc_bytes)
# # num_pages = len(doc.sections)
# # print("文档页数:", num_pages)
#
# doc_file = docx.Document('E:\kkwork\P020230106624599701195.docx')
# # text = doc_file[0].getText()
# # print("文档内容:", text)
#
# # 解析文档页数
# num_pages = len(doc_file.sections)
# print("文档页数:", num_pages)
# from docx import Document
# import requests
#
# # # 下载远程Word文档
url = "https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc"
# response = requests.get(url,headers=headers,verify=False, timeout=20)
# # with open("remote_document.doc", "wb") as file:
# # file.write(response.content)
#
# # 将doc转为docx格式
# import os
# os.system("textutil -convert docx remote_document.doc")
#
# # 解析文档页数
# doc = Document("remote_document.docx")
# num_pages = len(doc.sections)
# print("文档页数:", num_pages)
# def get_file_stream(file_url):
# # 发送GET请求获取文件内容
# response = requests.get(url,headers=headers,verify=False,timeout=10)
# # 将文件内容保存为文件流
# file_stream = io.BytesIO(response.content)
# return file_stream
# def read_file_stream(file_stream):
# # 创建一个Word应用程序对象
# word_app = win32com.client.Dispatch("Word.Application")
# # 将文件流保存为临时文件
# temp_file = "E:/temp.doc"
# with open(temp_file, "wb") as f:
# f.write(file_stream.read())
# # 打开临时文件
# doc = word_app.Documents.Open(temp_file)
# # 读取文件内容
# # content = doc.Content.Text
# # print(content)
# # 计算文档的统计信息
# doc.ComputeStatistics(2) # 参数2表示统计页数
# # 获取文档的页数
# page_count = doc.BuiltInDocumentProperties("Number of Pages").Value
# # 输出页数
# print(f"页数:{page_count}")
# # 关闭文件和应用程序
# doc.Close()
# word_app.Quit()
# # 删除临时文件
# os.remove(temp_file)
# return page_count
# file_stream = get_file_stream(url)
# page_count = read_file_stream(file_stream)
# print(f"页数22:{page_count}")
import io
import inspect
import requests
import os
import win32com.client as win32
import win32com
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
header={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'Secure HttpOnly; __jsluid_s=8e1b254e40166ac092b3e137cacf4f09; _va_ses=*; _va_id=0c66e4a68ee344ba.1693453315.1.1693453317.1693453315.',
'Host':'gzw.beijing.gov.cn',
'Pragma':'no-cache',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
url='https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc'
def get_file_stream(file_url):
    # send a GET request to fetch the file content
    response = requests.get(file_url, headers=header, verify=False, timeout=10)
    # wrap the file content in a byte stream
    file_stream = io.BytesIO(response.content)
    return file_stream
def read_file_stream(file_stream):
    # create a Word application object
    word_app = win32com.client.Dispatch("Word.Application")
    # save the stream to a temporary file
    temp_file = "D:/temp.doc"
    with open(temp_file, "wb") as f:
        f.write(file_stream.read())
    # open the temporary file
    doc = word_app.Documents.Open(temp_file)
    # read the file content
    # content = doc.Content.Text
    # print(content)
    # compute the document statistics
    doc.ComputeStatistics(2)  # argument 2 means "count pages"
    # get the page count of the document
    page_count = doc.BuiltInDocumentProperties("Number of Pages").Value
    # print the page count
    print(f"page count: {page_count}")
    # close the document and the Word application
    doc.Close()
    word_app.Quit()
    # delete the temporary file
    os.remove(temp_file)
    return page_count
file_stream = get_file_stream(url)
page_count = read_file_stream(file_stream)
print(f"page count (2): {page_count}")
print('Done!')
import json
import re
import time
import fitz
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from urllib.parse import urljoin
from BaseCore import BaseCore
baseCore = BaseCore()
urllib3.disable_warnings()
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
from random import choice
def paserUrl(html, listurl):
    # soup = BeautifulSoup(html, 'html.parser')
    # collect all <a> and <img> tags
    links = html.find_all(['a', 'img'])
    # walk the tags and convert relative addresses to absolute ones
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
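# Quick illustration with a made-up snippet (the soup is modified in place and
# also returned):
#
#   soup = BeautifulSoup('<a href="../a.doc">attachment</a>', 'html.parser')
#   paserUrl(soup, 'http://gzw.fujian.gov.cn/ztzl/file/')
#   # -> <a href="http://gzw.fujian.gov.cn/ztzl/a.doc">attachment</a>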
real_href = 'http://gzw.fujian.gov.cn/ztzl/gzjgfzjs/gfxwj_7426/201809/t20180911_4492105.htm'
href_text = requests.get(url=real_href, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# convert relative paths in the page to absolute paths
i_soup = paserUrl(i_soup,real_href)
# print(i_soup)
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
content = i_soup.find('div', attrs={'class': 'xl_con1'})
pub_hao = ''
print(real_href)
id_list = []
# todo: collect the attachment addresses
fu_jian_list = content.find('ul',class_='clearflx myzj_xl_list').find_all('a')
for fu_jian in fu_jian_list:
fj_href = fu_jian['href']
if '.doc' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
        # after locating an attachment, upload it to the file server
retData = baseCore.uploadToserver(fj_href,'1673')
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', 'aaa', 1)
id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path
print(f'-----{content}')
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
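# A minimal sketch (file and variable names are placeholders) of reading the
# settings above with the standard-library configparser:
#
#   import configparser
#   cfg = configparser.ConfigParser()
#   cfg.read('config.ini', encoding='utf-8')
#   redis_host = cfg.get('redis', 'host')
#   redis_port = cfg.getint('redis', 'port')
#   kafka_servers = cfg.get('kafka', 'bootstrap_servers')
#   chrome_driver = cfg.get('selenium', 'chrome_driver')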
import redis
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
# fetch all redis keys
keys = r.keys('*')
for key in keys:
f_key = key.decode()
if 'wx_url' in f_key:
list_info = []
print(f_key)
print("----------")
        # extract the sid from the key
        sid = f_key.split('url_')[1]
        # read the members stored under this WeChat official-account key
value = list(r.smembers(f_key))
list_data = []
for member in value:
member = member.decode()
# print(member)
data = (sid,member)
# print(data)
# print(list_info)
select_sql = f"select sid from wx_link where link = '{member}'"
cursor.execute(select_sql)
select = cursor.fetchone()
if select:
continue
else:
list_info.append(data)
if len(list_info)!=0:
pass
else:
continue
        # bulk insert
sql = "INSERT INTO wx_link (sid, link,state) VALUES (%s, %s,'99')"
cursor.executemany(sql, list_info)
cnx.commit()
        log.info(f'{sid}---insert succeeded--{len(list_info)}----')
baseCore.close()
import time
import pandas as pd
# workbook.save(filename)
# writeaa()
# gpdm = '01109.HK'
# if 'HK' in str(gpdm):
#     tmp_g = str(gpdm).split('.')[0]
#     if len(tmp_g) == 5:
#         gpdm = str(gpdm)[1:]
#     print(gpdm)
# else:
#     pass
import redis
from base.BaseCore import BaseCore
baseCore = BaseCore()
r = baseCore.r
key = 'counter'
expiration_time = 10  # expiration time in seconds
# # 设置自增
# r.incr(key)
# # #自增并设置过期时间
# while True:
# # 设置自增
# r.incr(key)
# value = int(r.get(key).decode())
#
# if value > 10:
# print(value)
# # 设置过期时间
# r.expire(key, expiration_time)
# time.sleep(20)
# print('------------------')
# continue
# # print(value)
# time.sleep(5)
# print(value)
# print("==========")
def check_url():
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=6)
res = r.exists('WeiXinGZH:infoSourceCode','IN-20220418-0001')
print(res)
if res == 1:
print('True')
    else:
        print('False')
check_url()
import time
import pandas as pd
# def writeaa():
# detailList=[]
# aa={
# 'id':3,
# 'name':'qqqwe'
# }
# detailList.append(aa)
# writerToExcel(detailList)
# 将数据追加到excel
# def writerToExcel(detailList):
# # filename='baidu搜索.xlsx'
# # 读取已存在的xlsx文件
# existing_data = pd.read_excel(filename,engine='openpyxl')
# # 创建新的数据
# new_data = pd.DataFrame(data=detailList)
# # 将新数据添加到现有数据的末尾
# combined_data = existing_data.append(new_data, ignore_index=True)
# # 将结果写入到xlsx文件
# combined_data.to_excel(filename, index=False)
#
# from openpyxl import Workbook
#
# if __name__ == '__main__':
# filename='test1.xlsx'
# # # 创建一个工作簿
# workbook = Workbook(filename)
# workbook.save(filename)
# writeaa()
# gpdm = '01109.HK'
# if 'HK' in str(gpdm):
# tmp_g = str(gpdm).split('.')[0]
# if len(tmp_g) == 5:
# gpdm = str(gpdm)[1:]
# print(gpdm)
# else:
# pass
from base.BaseCore import BaseCore
baseCore = BaseCore()
r = baseCore.r
# #自增并设置过期时间
# while True:
# key = 'mykey'
# expiration_time = 60 # 设置过期时间 60秒
# #设置自增
# r.incr(key)
#
#
#
# value = int(r.get(key).decode())
#
# if value > 10:
# print(value)
# # 设置过期时间
# r.expire(key, expiration_time)
#
# time.sleep(70)
# print('------------------')
# continue
# # print(value)
#
# print("==========")
# expiration_time = 60
# # 创建PubSub对象
# p = r.pubsub()
#
# # 订阅过期事件
# p.psubscribe('__keyevent@6__:expired')
# aa = p.listen()
# # 监听过期事件
# for message in p.listen():
# if message['type'] == 'pmessage':
# expired_key = message['data'].decode()
# print('过期的key:', expired_key)
# if expired_key == 'counter':
# # 执行重置操作
# r.set('counter', 0)
# print('计数器已重置为0')
# # 设置自增
# r.incr('counter')
# # 设置过期时间
# r.expire('counter', expiration_time)
for i in range(0, 24, 5):
print(i)
import pandas as pd
import numpy as np
import pymysql
import time
import requests
import certifi
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log= baseCore.getLogger()
# cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
# cursor = cnx.cursor()
cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
curosr = cnx.cursor()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
#'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
def getInfo(gpdm,xydm):
print('开始')
gpdm_ = gpdm
if 'HK' in gpdm_:
gpdm_ = gpdm_[1:]
start = time.time()
retData={}
retData['base_info'] = {
'公司名称': '',
'英文名':'',
'信用代码': xydm,
'股票代码': gpdm_,
'地址': '',
'电话': '',
'公司网站': '',
'部门': '',
'行业': '',
'员工人数': '',
'公司简介': ''
}
retData['people_info']=[]
# https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
url = f'https://finance.yahoo.com/quote/{gpdm}/profile?p={gpdm}'
time.sleep(3)
for i in range(0,3):
try:
response = requests.get(url, headers=headers, verify=False)
time.sleep(1)
if (response.status_code == 200):
break
else:
log.error(f"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}")
except :
continue
if (response.status_code == 200):
pass
else:
log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}")
return retData
soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
name = page.find('h3',{'class':'Fz(m) Mb(10px)'})
try:
com_info = page.find('div', {'class': 'Mb(25px)'})
except:
com_info = ''
try:
com_phone = com_info.find_all('p')[0].find('a').text
except:
com_phone = ''
try:
com_url = com_info.find_all('p')[0].find('a', {'target': '_blank'}).text
except:
com_url = ''
try:
com_address = com_info.find_all('p')[0].text.replace(com_phone, '').replace(com_url, '')
except:
com_address = ''
try:
com_bumen = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[0].text
except:
com_bumen = ''
try:
com_hangye = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[1].text
except:
com_hangye = ''
try:
com_people = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[2].text
except:
com_people = ''
try:
com_jianjie = page.find('p', {'class': 'Mt(15px) Lh(1.6)'}).text
except:
com_jianjie = ''
dic_com_info = {
'公司名称':'',
'英文名':name,
'信用代码': xydm,
'股票代码': gpdm_,
'地址': com_address,
'电话': com_phone,
'公司网站': com_url,
'部门': com_bumen,
'行业': com_hangye,
'员工人数': com_people,
'公司简介': com_jianjie
}
retData['base_info']=dic_com_info
    # executive information
retPeople = []
try:
list_people = page.find('table', {'class': 'W(100%)'}).find_all('tr')[1:]
except:
list_people = []
for one_people in list_people:
try:
p_name = one_people.find_all('td')[0].text
except:
p_name = ''
continue
try:
p_zhiwu = one_people.find_all('td')[1].text
except:
p_zhiwu = ''
try:
p_money = one_people.find_all('td')[2].text
except:
p_money = ''
try:
p_xingshi = one_people.find_all('td')[3].text
except:
p_xingshi = ''
try:
p_year = one_people.find_all('td')[4].text
except:
p_year = ''
if(p_zhiwu=="N/A"):
p_zhiwu=""
if (p_money == "N/A"):
p_money = ""
if (p_xingshi == "N/A"):
p_xingshi = ""
if (p_year == "N/A"):
p_year = ""
dic_main_people = {
'公司名称': name,
'股票代码': gpdm_,
'信用代码': xydm,
'姓名': p_name,
'职务': p_zhiwu,
'薪资': p_money,
'行使': p_xingshi,
'出生年份': p_year
}
retPeople.append(dic_main_people)
retData['people_info'] = retPeople
# df_a = pd.DataFrame(retData['base_info'])
log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
return retData
# fetch the enterprise gpdm and xydm from the database
sql_select = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col6 is not null and state1=1 and col3 like 'ZZSN%' order by date1 ,id LIMIT 1"
curosr.execute(sql_select)
data = curosr.fetchone()
id = data[0]
# update to mark the enterprise's collection state
# sql_update = f"UPDATE Tfbs set state1 = 2 WHERE id = {id}"
# curosr.execute(sql_update)
# cnx.commit()
xydm = data[4]
gpdm = data[7]
# fetch the enterprise's basic information and executive information
retData = getInfo(gpdm,xydm)
print(retData)
curosr.close()
cnx.close()
url = 'https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE'
req = requests.get(url=url,headers=headers,verify=False)
print(req.status_code)
from kafka import KafkaConsumer
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
from selenium import webdriver
import datetime
import time
import redis
import hashlib
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import random
from kafka import KafkaProducer
r = redis.Redis(host="localhost", port=6379)
# 将数据转换成hash值,用来对文章url进行去重,实现增量爬虫
def get_md5(val):
"""把目标数据进行哈希,用哈希值去重更快"""
md5 = hashlib.md5()
md5.update(val.encode('utf-8'))
return md5.hexdigest()
# 使用redis的set对uid进行去重,对爬取过的uid不再全部爬取
def add_uid(name_uid):
res = r.sadd('name_uid', name_uid) # 注意是 保存set的方式
if res == 0: # 若返回0,说明插入不成功,表示有重复
return True
else:
return False
# 使用redis的set对文章url进行去重
def add_url(article_url):
res = r.sadd('article_url', get_md5(article_url), 3) # 注意是 保存set的方式
if res == 0: # 若返回0,说明插入不成功,表示有重复
return True
else:
return False
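# Minimal illustration of the de-dup flow used below (the URL is a placeholder):
#
#   url = 'https://weibo.com/ttarticle/p/show?id=123'
#   if add_url(url):      # True -> md5 of this url was already in the set
#       pass              # already crawled, skip it
#   else:
#       pass              # first time seen, crawl it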
# use a headless browser to obtain cookie values
def get_cookie():
    executable_path = r"D:\chrome\chromedriver.exe"
    opt = webdriver.ChromeOptions()
    opt.add_argument('--headless')
    browser = webdriver.Chrome(chrome_options=opt, executable_path=executable_path)
    browser.get("https://weibo.com/")
    # wait for the page to render before reading the cookies
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"app\"]")))
    cookie_list = browser.get_cookies()
    browser.quit()
    # normalize the cookies
    cookies = {}
    # take name/value from each cookie and convert it to the form requests can use
    for cookie in cookie_list:
        cookies[cookie['name']] = cookie['value']
    r.set("cookies", str(cookies), ex=600)
    # print(cookies)
# main routine: fetch the articles a user has published, starting from the given profile url
def get_content_by_user_uid(url, sid):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
# "accept": "application/json, text/plain, */*",
}
s = requests.session()
cookies_str = r.get("cookies")
if cookies_str == None:
get_cookie()
cookies = json.loads('{' + re.findall("{(.*?)}", str(r.get("cookies")).replace("\'", "\""))[0] + '}')
else:
cookies = json.loads('{' + re.findall("{(.*?)}", str(cookies_str).replace("\'", "\""))[0] + '}')
s.cookies.update(cookies)
    list_all_info = []  # holds every article dict collected below
    url_get_uid = ''
    # normalize the profile url so the Weibo uid can be queried
if url[-1] == "/":
url = url[:-1]
if "?" not in url:
url_get_uid = "https://weibo.com/ajax/profile/info?custom=" + url.split('/')[-1]
else:
if "%" not in url:
url_get_uid = "https://weibo.com/ajax/profile/info?custom=" + url.split('/')[-1].split('?')[0]
else:
url_get_uid = "https://weibo.com/ajax/profile/info?screen_name=" + url.split('/')[-1].split('?')[0]
try:
res_get_uid_json = s.get(url_get_uid, headers=headers).json()
weibo_name = res_get_uid_json['data']['user']['screen_name'] # 微博号名称
uid = res_get_uid_json['data']['user']['id'] # 微博uid
origin = "微博-" + weibo_name
except:
print(f"{url}:uid获取失败")
return
# print(uid)
    if add_uid(uid):  # uid already present in redis: only crawl the author's first page
        num_page = 2
    else:  # uid not in redis yet: crawl up to 1000 pages
        num_page = 1000
    # crawl entry point
for page in range(1, num_page): # 对后面进行无限翻页,直到无内容显示后跳出
try:
url_all_con = f"https://weibo.com/ajax/statuses/mymblog?uid={uid}&page={page}&feature=10" # 使用uid找到每个微博的所有文章
res_all_con_json = s.get(url_all_con, headers=headers).json()
list_all_con = res_all_con_json['data']['list'] # 每页微博文章为json类,取出需要的数据
if list_all_con: # 当页面有内容时进行获取数据,无内容则跳出
for one_con in list_all_con:
list_news_ = one_con['url_struct']
try:
for weibo_news in list_news_:
if "weibo.com" in weibo_news['long_url']:
title_one_con = weibo_news['url_title'] # 文章标题
url_one_con = weibo_news['long_url'] # 文章链接URL
except:
title_one_con = list_news_[0]['url_title'] # 文章标题
url_one_con = "https://weibo.com/ttarticle/p/show?id=" + list_news_[0]['page_id'] # 文章链接URL
if add_url(url_one_con): # 若url已存在,则返回TRUE,跳出本次循环
continue
for num_res in range(0, 3): # url若访问失败可以最多访问3次
try:
res_one_con = s.get(url_one_con, headers=headers) # 对具体文章页面进行请求,获得文章内容
break
except:
time.sleep(2)
continue
soup_one_con = BeautifulSoup(res_one_con.content, 'html.parser')
try:
try:
one_time = soup_one_con.find('span', {'class': 'time'}).text.split(' ')[0] # 文章发表时间
except:
opt = webdriver.ChromeOptions() # 微博的头条文章用request不能获取到,在这里用模拟浏览器获取
opt.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=opt)
browser.get(url_one_con)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable(
(By.XPATH, "/html/body/div/div[2]/div[1]/div[2]/div/span[1]")))
page_source = browser.page_source # 获取页面信息
soup_one_con = BeautifulSoup(page_source, 'html.parser')
one_time = \
soup_one_con.find('div', {'class': 'm-box-col m-box-center-a'}).find('span').text.split(
' ')[0]
one_content = soup_one_con.find('div', {'class': 'art-con-new'}).text
one_content_html = str(soup_one_con.find('div', {'class': 'art-con-new'}))
if len(one_time.split('-')) == 2: # 2022年的微博时间只显示月和日,在这里手动添加年
one_time = str(time.localtime(time.time())[0]) + '-' + one_time
dic_one_news = { # 将获取到的每条文章设置为dic,再对dic进行储存
"author": weibo_name, # 微博名称
"title": title_one_con, # 文章标题
"publishDate": one_time, # 发布时间
"sourceAddress": url_one_con, # 文章链接
"content": one_content, # 文章正文
"contentWithTag": one_content_html, # 带标签的正文
"sid": sid, # kafka传过来的id
"lang": "zh_cn", # 语言
"origin": origin, # 来源
"originalDate": "", # 原文时间
"summary": "", # 摘要
"source": "15", # 采集来源(如通用、定制、微信公众号等)
}
list_all_info.append(dic_one_news) # 储存所有的dic
time.sleep(random.uniform(1.5, 2))
continue
try:
one_content = soup_one_con.find('div', {'class': 'WB_editor_iframe_new'}).text # 文章内容
one_content_html = str(soup_one_con.find('div', {'class': 'WB_editor_iframe_new'}))
except:
one_content = soup_one_con.find('div', {'class': 'WB_editor_iframe_word'}).text
one_content_html = str(soup_one_con.find('div', {'class': 'WB_editor_iframe_word'}))
if len(one_time.split('-')) == 2: # 2022年的微博时间只显示月和日,在这里手动添加年
one_time = str(time.localtime(time.time())[0]) + '-' + one_time
while True: # 将\n空格变为\n
one_content_n = one_content.replace('\n ', '\n').replace('\n ', '\n')
if one_content_n == one_content:
break
else:
one_content = one_content_n
while True: # 将\n空格变为\n
one_content_html_n = one_content_html.replace('\n ', '\n').replace('\n ', '\n')
if one_content_html_n == one_content_html:
break
else:
one_content_html = one_content_html_n
one_content_n = re.sub('\n+', '\n', one_content) # 将连续的\n改为单个\n
one_content_html_n = re.sub('\n+', '\n', one_content_html)
dic_one_news = { # 将获取到的每条文章设置为dic,再对dic进行储存
"author": weibo_name, # 微博名称
"title": title_one_con, # 文章标题
"publishDate": one_time, # 发布时间
"sourceAddress": url_one_con, # 文章链接
"content": one_content_n, # 文章正文
"contentWithTag": one_content_html_n, # 带标签的正文
"sid": sid, # kafka传过来的id
"lang": "zh_cn", # 语言
"origin": origin, # 来源
"originalDate": "", # 原文时间
"summary": "", # 摘要
"source": "15", # 采集来源(如通用、定制、微信公众号等)
}
list_all_info.append(dic_one_news)
time.sleep(random.uniform(1.5, 2)) # 一个文章爬取过后随机暂停1.5-2秒
except:
print("{}:的:{}:获取失败".format(weibo_name, url_one_con))
print("{}:的:{}:页获取成功".format(weibo_name, page))
else: # 页面无内容后表示已到最后一页,退出循环
break
except:
print("{}:的:{}:页获取失败".format(weibo_name, page))
break
# df_con = pd.DataFrame(list_all_info)
# df_con.to_excel(f'{uid}.xlsx')
for one_news_info in list_all_info: # 将每一个文章数据转换为json格式,把json文件用kafka发送出去
for num_pro in range(0, 3):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("crawlerInfo",
json.dumps(one_news_info, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
# time.sleep(1)
# print(json.dumps(one_news_info, ensure_ascii=False))
break
except:
time.sleep(5)
                print('Failed to send to kafka! Retrying...')
continue
# print(list_all_info[0]['title'])
return
def consume():
"""auto_commit_enable=True, auto_commit_interval_ms=3000"""
consumer = KafkaConsumer("weiBoCrawl", auto_offset_reset='earliest', group_id="python_weibo",
bootstrap_servers=['114.115.159.144:9092'])
# consumer = KafkaConsumer("pythonInfo", auto_offset_reset='earliest', bootstrap_servers=['114.115.159.144:9092'])
for message in consumer:
mes_dict = json.loads(message.value.decode('utf-8'))
# print(message.value.decode('utf-8'))
url = mes_dict['siteUri']
sid = mes_dict['id']
# print(url)
get_content_by_user_uid(url, sid)
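# Hedged illustration (not part of the original source): each message consumed
# from the 'weiBoCrawl' topic above is expected to be a JSON object carrying at
# least the two fields read by consume(), e.g.
#   {"siteUri": "https://weibo.com/<uid>?refer_flag=...", "id": "<sid>"}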
if __name__ == "__main__":
# r = redis.Redis(host="localhost",port=6379)
# consume()
get_content_by_user_uid('https://weibo.com/2656274875?refer_flag=1001030103_','1571698920447193090')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: client.py
'''
Client module for Fastdfs 3.08
author: scott yuan scottzer8@gmail.com
date: 2012-06-21
'''
import os
import sys
from fdfs_client.utils import *
from fdfs_client.tracker_client import *
from fdfs_client.storage_client import *
from fdfs_client.exceptions import *
def get_tracker_conf(conf_path='client.conf'):
cf = Fdfs_ConfigParser()
tracker = {}
try:
cf.read(conf_path)
timeout = cf.getint('__config__', 'connect_timeout')
tracker_list = cf.get('__config__', 'tracker_server')
if isinstance(tracker_list, str):
tracker_list = [tracker_list]
tracker_ip_list = []
for tr in tracker_list:
tracker_ip, tracker_port = tr.split(':')
tracker_ip_list.append(tracker_ip)
tracker['host_tuple'] = tuple(tracker_ip_list)
tracker['port'] = int(tracker_port)
tracker['timeout'] = timeout
tracker['name'] = 'Tracker Pool'
except:
raise
return tracker
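# Illustrative note (not from the original source): get_tracker_conf() expects a
# FastDFS-style client.conf; the values below are placeholders, e.g.
#
#   connect_timeout = 30
#   tracker_server = 192.168.0.1:22122
#
# which would be returned as {'host_tuple': ('192.168.0.1',), 'port': 22122,
# 'timeout': 30, 'name': 'Tracker Pool'}.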
class Fdfs_client(object):
    '''
    Class Fdfs_client implements the FastDFS client protocol, version 3.08.
    It can upload, download and delete files to or from an fdfs server, and it
    uses a connection pool to manage connections to the server.
    '''
def __init__(self, trackers, poolclass=ConnectionPool):
self.trackers = trackers
self.tracker_pool = poolclass(**self.trackers)
self.timeout = self.trackers['timeout']
return None
    def __del__(self):
        try:
            # the pool created in __init__ is tracker_pool, not pool
            self.tracker_pool.destroy()
            self.tracker_pool = None
        except:
            pass
def upload_by_filename(self, filename, meta_dict=None):
'''
Upload a file to Storage server.
arguments:
@filename: string, name of file that will be uploaded
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} meta_dict can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : local_file_name,
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_filename(tc, store_serv, filename, meta_dict)
def upload_by_file(self, filename, meta_dict=None):
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_file(tc, store_serv, filename, meta_dict)
def upload_by_buffer(self, filebuffer, file_ext_name=None, meta_dict=None):
'''
Upload a buffer to Storage server.
arguments:
@filebuffer: string, buffer
@file_ext_name: string, file extend name
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_by_buffer(tc, store_serv, filebuffer, file_ext_name, meta_dict)
def upload_slave_by_filename(self, filename, remote_file_id, prefix_name, meta_dict=None):
'''
Upload slave file to Storage server.
arguments:
@filename: string, local file name
@remote_file_id: string, remote file id
@prefix_name: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading slave)')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
if not prefix_name:
raise DataError('[-] Error: prefix_name can not be null.')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_with_group(group_name)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
        try:
            ret_dict = store.storage_upload_slave_by_filename(tc, store_serv, filename, prefix_name, remote_filename,
                                                              meta_dict=meta_dict)
except:
raise
ret_dict['Status'] = 'Upload slave file successed.'
return ret_dict
def upload_slave_by_file(self, filename, remote_file_id, prefix_name, meta_dict=None):
'''
Upload slave file to Storage server.
arguments:
@filename: string, local file name
@remote_file_id: string, remote file id
@prefix_name: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(uploading slave)')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
if not prefix_name:
raise DataError('[-] Error: prefix_name can not be null.')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_with_group(group_name)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
        try:
            ret_dict = store.storage_upload_slave_by_file(tc, store_serv, filename, prefix_name, remote_filename,
                                                          meta_dict=meta_dict)
except:
raise
ret_dict['Status'] = 'Upload slave file successed.'
return ret_dict
def upload_slave_by_buffer(self, filebuffer, remote_file_id, meta_dict=None, file_ext_name=None):
'''
Upload slave file by buffer
arguments:
@filebuffer: string
@remote_file_id: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
}
@return dictionary {
'Status' : 'Upload slave successed.',
'Local file name' : local_filename,
'Uploaded size' : upload_size,
'Remote file id' : remote_file_id,
'Storage IP' : storage_ip
}
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(uploading slave)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_slave_by_buffer(tc, store_serv, filebuffer, remote_filename, meta_dict,
file_ext_name)
def upload_appender_by_filename(self, local_filename, meta_dict=None):
'''
Upload an appender file by filename.
arguments:
@local_filename: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} Notice: it can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(uploading appender)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_filename(tc, store_serv, local_filename, meta_dict)
def upload_appender_by_file(self, local_filename, meta_dict=None):
'''
Upload an appender file by file.
arguments:
@local_filename: string
@meta_dict: dictionary e.g.:{
'ext_name' : 'jpg',
'file_size' : '10240B',
'width' : '160px',
'hight' : '80px'
} Notice: it can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(uploading appender)')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_file(tc, store_serv, local_filename, meta_dict)
def upload_appender_by_buffer(self, filebuffer, file_ext_name=None, meta_dict=None):
'''
Upload a buffer to Storage server.
arguments:
@filebuffer: string
@file_ext_name: string, can be null
@meta_dict: dictionary, can be null
@return dict {
'Group name' : group_name,
'Remote file_id' : remote_file_id,
'Status' : 'Upload successed.',
'Local file name' : '',
'Uploaded size' : upload_size,
'Storage IP' : storage_ip
} if success else None
'''
if not filebuffer:
raise DataError('[-] Error: argument filebuffer can not be null.')
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_stor_without_group()
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_upload_appender_by_buffer(tc, store_serv, filebuffer, meta_dict, file_ext_name)
def delete_file(self, remote_file_id):
'''
Delete a file from Storage server.
arguments:
@remote_file_id: string, file_id of file that is on storage server
@return tuple ('Delete file successed.', remote_file_id, storage_ip)
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in delete file)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_delete_file(tc, store_serv, remote_filename)
def download_to_file(self, local_filename, remote_file_id, offset=0, down_bytes=0):
'''
Download a file from Storage server.
arguments:
@local_filename: string, local name of file
@remote_file_id: string, file_id of file that is on storage server
@offset: long
@downbytes: long
@return dict {
'Remote file_id' : remote_file_id,
'Content' : local_filename,
'Download size' : downloaded_size,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in download file)')
group_name, remote_filename = tmp
        file_offset = int(offset)
        download_bytes = int(down_bytes)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_fetch(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_download_to_file(tc, store_serv, local_filename, file_offset, download_bytes,
remote_filename)
def download_to_buffer(self, remote_file_id, offset=0, down_bytes=0):
'''
Download a file from Storage server and store in buffer.
arguments:
@remote_file_id: string, file_id of file that is on storage server
@offset: long
@down_bytes: long
@return dict {
'Remote file_id' : remote_file_id,
'Content' : file_buffer,
'Download size' : downloaded_size,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in download file)')
group_name, remote_filename = tmp
        file_offset = int(offset)
        download_bytes = int(down_bytes)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_fetch(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
file_buffer = None
return store.storage_download_to_buffer(tc, store_serv, file_buffer, file_offset, download_bytes,
remote_filename)
def list_one_group(self, group_name):
'''
List one group information.
arguments:
@group_name: string, group name will be list
@return Group_info, instance
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_one_group(group_name)
def list_servers(self, group_name, storage_ip=None):
'''
List all storage servers information in a group
arguments:
@group_name: string
@return dictionary {
'Group name' : group_name,
'Servers' : server list,
}
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_servers(group_name, storage_ip)
def list_all_groups(self):
'''
List all group information.
@return dictionary {
'Groups count' : group_count,
'Groups' : list of groups
}
'''
tc = Tracker_client(self.tracker_pool)
return tc.tracker_list_all_groups()
def get_meta_data(self, remote_file_id):
'''
Get meta data of remote file.
arguments:
@remote_fileid: string, remote file id
@return dictionary, meta data
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in get meta data)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_get_metadata(tc, store_serv, remote_filename)
def set_meta_data(self, remote_file_id, meta_dict, op_flag=STORAGE_SET_METADATA_FLAG_OVERWRITE):
'''
Set meta data of remote file.
arguments:
@remote_file_id: string
@meta_dict: dictionary
@op_flag: char, 'O' for overwrite, 'M' for merge
@return dictionary {
'Status' : status,
'Storage IP' : storage_ip
}
'''
tmp = split_remote_fileid(remote_file_id)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(in set meta data)')
group_name, remote_filename = tmp
tc = Tracker_client(self.tracker_pool)
try:
store_serv = tc.tracker_query_storage_update(group_name, remote_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
status = store.storage_set_metadata(tc, store_serv, remote_filename, meta_dict)
except (ConnectionError, ResponseError, DataError):
raise
# if status == 2:
# raise DataError('[-] Error: remote file %s is not exist.' % remote_file_id)
        if status != 0:
            raise DataError('[-] Error: %d, %s' % (status, os.strerror(status)))
ret_dict = {}
ret_dict['Status'] = 'Set meta data success.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def append_by_filename(self, local_filename, remote_fileid):
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(append)')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_filename(tc, store_serv, local_filename, appended_filename)
def append_by_file(self, local_filename, remote_fileid):
isfile, errmsg = fdfs_check_file(local_filename)
if not isfile:
raise DataError(errmsg + '(append)')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_file(tc, store_serv, local_filename, appended_filename)
def append_by_buffer(self, file_buffer, remote_fileid):
if not file_buffer:
raise DataError('[-] Error: file_buffer can not be null.')
tmp = split_remote_fileid(remote_fileid)
if not tmp:
raise DataError('[-] Error: remote_file_id is invalid.(append)')
group_name, appended_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appended_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_append_by_buffer(tc, store_serv, file_buffer, appended_filename)
def truncate_file(self, truncated_filesize, appender_fileid):
'''
Truncate file in Storage server.
arguments:
@truncated_filesize: long
@appender_fileid: remote_fileid
@return: dictionary {
'Status' : 'Truncate successed.',
'Storage IP' : storage_ip
}
'''
trunc_filesize = int(truncated_filesize)
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: appender_fileid is invalid.(truncate)')
group_name, appender_filename = tmp
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_truncate_file(tc, store_serv, trunc_filesize, appender_filename)
def modify_by_filename(self, filename, appender_fileid, offset=0):
'''
Modify a file in Storage server by file.
arguments:
@filename: string, local file name
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(modify)')
filesize = os.stat(filename).st_size
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
        if not offset:
            file_offset = 0
        else:
            file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_filename(tc, store_serv, filename, file_offset, filesize, appender_filename)
def modify_by_file(self, filename, appender_fileid, offset=0):
'''
Modify a file in Storage server by file.
arguments:
@filename: string, local file name
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
isfile, errmsg = fdfs_check_file(filename)
if not isfile:
raise DataError(errmsg + '(modify)')
filesize = os.stat(filename).st_size
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
        if not offset:
            file_offset = 0
        else:
            file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_file(tc, store_serv, filename, file_offset, filesize, appender_filename)
def modify_by_buffer(self, filebuffer, appender_fileid, offset=0):
'''
Modify a file in Storage server by buffer.
arguments:
@filebuffer: string, file buffer
@offset: long, file offset
@appender_fileid: string, remote file id
@return: dictionary {
'Status' : 'Modify successed.',
'Storage IP' : storage_ip
}
'''
if not filebuffer:
raise DataError('[-] Error: filebuffer can not be null.(modify)')
filesize = len(filebuffer)
tmp = split_remote_fileid(appender_fileid)
if not tmp:
raise DataError('[-] Error: remote_fileid is invalid.(modify)')
group_name, appender_filename = tmp
        if not offset:
            file_offset = 0
        else:
            file_offset = int(offset)
tc = Tracker_client(self.tracker_pool)
store_serv = tc.tracker_query_storage_update(group_name, appender_filename)
store = Storage_client(store_serv.ip_addr, store_serv.port, self.timeout)
return store.storage_modify_by_buffer(tc, store_serv, filebuffer, file_offset, filesize, appender_filename)
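# Hedged usage sketch (added for illustration; the conf path, file names and the
# returned ids are placeholders, not part of the original module):
if __name__ == '__main__':
    # typical call sequence documented by the method docstrings above
    example_trackers = get_tracker_conf('client.conf')
    example_client = Fdfs_client(example_trackers)
    upload_ret = example_client.upload_by_filename('example_local_file.txt')
    print(upload_ret)  # contains 'Group name', 'Remote file_id', 'Storage IP', ...
    example_remote_id = upload_ret['Remote file_id']
    print(example_client.download_to_file('example_downloaded.txt', example_remote_id))
    print(example_client.delete_file(example_remote_id))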
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: connection.py
import socket
import os
import sys
import time
import random
from itertools import chain
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# start class Connection
class Connection(object):
    '''Manage TCP communication to and from the FastDFS server.'''
def __init__(self, **conn_kwargs):
self.pid = os.getpid()
self.host_tuple = conn_kwargs['host_tuple']
self.remote_port = conn_kwargs['port']
self.remote_addr = None
self.timeout = conn_kwargs['timeout']
self._sock = None
def __del__(self):
try:
self.disconnect()
except:
pass
def connect(self):
'''Connect to fdfs server.'''
if self._sock:
return
try:
sock = self._connect()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = sock
# print '[+] Create a connection success.'
# print '\tLocal address is %s:%s.' % self._sock.getsockname()
# print '\tRemote address is %s:%s' % (self.remote_addr, self.remote_port)
def _connect(self):
'''Create TCP socket. The host is random one of host_tuple.'''
self.remote_addr = random.choice(self.host_tuple)
# print '[+] Connecting... remote: %s:%s' % (self.remote_addr, self.remote_port)
# sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# sock.settimeout(self.timeout)
sock = socket.create_connection((self.remote_addr, self.remote_port), self.timeout)
return sock
def disconnect(self):
'''Disconnect from fdfs server.'''
if self._sock is None:
return
try:
self._sock.close()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = None
def get_sock(self):
return self._sock
    def _errormessage(self, exception):
        # args for socket.error can either be (errno, "message")
        # or just "message"
if len(exception.args) == 1:
return "[-] Error: connect to %s:%s. %s." % (self.remote_addr, self.remote_port, exception.args[0])
else:
return "[-] Error: %s connect to %s:%s. %s." % \
(exception.args[0], self.remote_addr, self.remote_port, exception.args[1])
# end class Connection
# start ConnectionPool
class ConnectionPool(object):
'''Generic Connection Pool'''
def __init__(self, name='', conn_class=Connection,
max_conn=None, **conn_kwargs):
self.pool_name = name
self.pid = os.getpid()
self.conn_class = conn_class
self.max_conn = max_conn or 2 ** 31
self.conn_kwargs = conn_kwargs
self._conns_created = 0
self._conns_available = []
self._conns_inuse = set()
# print '[+] Create a connection pool success, name: %s.' % self.pool_name
    def _check_pid(self):
        if self.pid != os.getpid():
            self.destroy()
            self.__init__(name=self.pool_name, conn_class=self.conn_class, max_conn=self.max_conn, **self.conn_kwargs)
def make_conn(self):
'''Create a new connection.'''
if self._conns_created >= self.max_conn:
raise ConnectionError('[-] Error: Too many connections.')
num_try = 10
while True:
try:
if num_try <= 0:
sys.exit()
conn_instance = self.conn_class(**self.conn_kwargs)
conn_instance.connect()
self._conns_created += 1
break
except ConnectionError as e:
print(e)
num_try -= 1
conn_instance = None
return conn_instance
def get_connection(self):
'''Get a connection from pool.'''
self._check_pid()
try:
conn = self._conns_available.pop()
# print '[+] Get a connection from pool %s.' % self.pool_name
# print '\tLocal address is %s:%s.' % conn._sock.getsockname()
# print '\tRemote address is %s:%s' % (conn.remote_addr, conn.remote_port)
except IndexError:
conn = self.make_conn()
self._conns_inuse.add(conn)
return conn
def remove(self, conn):
'''Remove connection from pool.'''
if conn in self._conns_inuse:
self._conns_inuse.remove(conn)
self._conns_created -= 1
if conn in self._conns_available:
self._conns_available.remove(conn)
self._conns_created -= 1
def destroy(self):
'''Disconnect all connections in the pool.'''
all_conns = chain(self._conns_inuse, self._conns_available)
for conn in all_conns:
conn.disconnect()
# print '[-] Destroy connection pool %s.' % self.pool_name
def release(self, conn):
'''Release the connection back to the pool.'''
self._check_pid()
if conn.pid == self.pid:
self._conns_inuse.remove(conn)
self._conns_available.append(conn)
# print '[-] Release connection back to pool %s.' % self.pool_name
# end ConnectionPool class
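# Hedged illustration (not in the original source) of the pool's intended
# lifecycle; the address below is a placeholder:
#
#   pool = ConnectionPool(name='Tracker Pool',
#                         host_tuple=('192.168.0.1',), port=22122, timeout=30)
#   conn = pool.get_connection()   # pops an idle connection or dials a new one
#   try:
#       ...                        # tcp_send_data(conn, ...) / tcp_recv_response(conn, ...)
#   finally:
#       pool.release(conn)         # put the connection back on the idle list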
def tcp_recv_response(conn, bytes_size, buffer_size=4096):
    '''Receive response from server.
    The tracker header is not included.
arguments:
@conn: connection
@bytes_size: int, will be received byte_stream size
@buffer_size: int, receive buffer size
@Return: tuple,(response, received_size)
'''
recv_buff = []
total_size = 0
try:
while bytes_size > 0:
resp = conn._sock.recv(buffer_size)
recv_buff.append(resp)
total_size += len(resp)
bytes_size -= len(resp)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: (%s)' % e.args)
return (b''.join(recv_buff), total_size)
def tcp_send_data(conn, bytes_stream):
    '''Send buffer to server.
    The tracker header is not included.
arguments:
@conn: connection
@bytes_stream: trasmit buffer
@Return bool
'''
try:
conn._sock.sendall(bytes_stream)
    except (socket.error, socket.timeout) as e:
        raise ConnectionError('[-] Error: while writing to socket: (%s)' % e.args)
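# Hedged example (illustration only) of how the two helpers above are paired by
# the tracker/storage clients; 'conn' is assumed to be an already-connected
# Connection and the byte values are placeholders:
#
#   tcp_send_data(conn, request_bytes)                        # write the request
#   body, received = tcp_recv_response(conn, expected_len)    # read exactly expected_len bytes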
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: exceptions.py
'''Core exceptions raised by fdfs client'''
class FDFSError(Exception):
pass
class ConnectionError(FDFSError):
pass
class ResponseError(FDFSError):
pass
class InvaildResponse(FDFSError):
pass
class DataError(FDFSError):
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_protol.py
import struct
import socket
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# define FDFS protocol constants
TRACKER_PROTO_CMD_STORAGE_JOIN = 81
FDFS_PROTO_CMD_QUIT = 82
TRACKER_PROTO_CMD_STORAGE_BEAT = 83 # storage heart beat
TRACKER_PROTO_CMD_STORAGE_REPORT_DISK_USAGE = 84 # report disk usage
TRACKER_PROTO_CMD_STORAGE_REPLICA_CHG = 85 # repl new storage servers
TRACKER_PROTO_CMD_STORAGE_SYNC_SRC_REQ = 86 # src storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_REQ = 87 # dest storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_NOTIFY = 88 # sync done notify
TRACKER_PROTO_CMD_STORAGE_SYNC_REPORT = 89 # report src last synced time as dest server
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_QUERY = 79 # dest storage query sync src storage server
TRACKER_PROTO_CMD_STORAGE_REPORT_IP_CHANGED = 78 # storage server report it's ip changed
TRACKER_PROTO_CMD_STORAGE_CHANGELOG_REQ = 77 # storage server request storage server's changelog
TRACKER_PROTO_CMD_STORAGE_REPORT_STATUS = 76 # report specified storage server status
TRACKER_PROTO_CMD_STORAGE_PARAMETER_REQ = 75 # storage server request parameters
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FREE = 74 # storage report trunk free space
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FID = 73 # storage report current trunk file id
TRACKER_PROTO_CMD_STORAGE_FETCH_TRUNK_FID = 72 # storage get current trunk file id
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START = 61 # start of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END = 62 # end of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE = 63 # tracker get a system data file
TRACKER_PROTO_CMD_TRACKER_GET_STATUS = 64 # tracker get status of other tracker
TRACKER_PROTO_CMD_TRACKER_PING_LEADER = 65 # tracker ping leader
TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER = 66 # notify next leader to other trackers
TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER = 67 # commit next leader to other trackers
TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP = 90
TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS = 91
TRACKER_PROTO_CMD_SERVER_LIST_STORAGE = 92
TRACKER_PROTO_CMD_SERVER_DELETE_STORAGE = 93
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE = 101
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE = 102
TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE = 103
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE = 104
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ALL = 105
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ALL = 106
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ALL = 107
TRACKER_PROTO_CMD_RESP = 100
FDFS_PROTO_CMD_ACTIVE_TEST = 111 # active test, tracker and storage both support since V1.28
STORAGE_PROTO_CMD_REPORT_CLIENT_IP = 9 # ip as tracker client
STORAGE_PROTO_CMD_UPLOAD_FILE = 11
STORAGE_PROTO_CMD_DELETE_FILE = 12
STORAGE_PROTO_CMD_SET_METADATA = 13
STORAGE_PROTO_CMD_DOWNLOAD_FILE = 14
STORAGE_PROTO_CMD_GET_METADATA = 15
STORAGE_PROTO_CMD_SYNC_CREATE_FILE = 16
STORAGE_PROTO_CMD_SYNC_DELETE_FILE = 17
STORAGE_PROTO_CMD_SYNC_UPDATE_FILE = 18
STORAGE_PROTO_CMD_SYNC_CREATE_LINK = 19
STORAGE_PROTO_CMD_CREATE_LINK = 20
STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE = 21
STORAGE_PROTO_CMD_QUERY_FILE_INFO = 22
STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE = 23 # create appender file
STORAGE_PROTO_CMD_APPEND_FILE = 24 # append file
STORAGE_PROTO_CMD_SYNC_APPEND_FILE = 25
STORAGE_PROTO_CMD_FETCH_ONE_PATH_BINLOG = 26 # fetch binlog of one store path
STORAGE_PROTO_CMD_RESP = TRACKER_PROTO_CMD_RESP
STORAGE_PROTO_CMD_UPLOAD_MASTER_FILE = STORAGE_PROTO_CMD_UPLOAD_FILE
STORAGE_PROTO_CMD_TRUNK_ALLOC_SPACE = 27 # since V3.00
STORAGE_PROTO_CMD_TRUNK_ALLOC_CONFIRM = 28 # since V3.00
STORAGE_PROTO_CMD_TRUNK_FREE_SPACE = 29 # since V3.00
STORAGE_PROTO_CMD_TRUNK_SYNC_BINLOG = 30 # since V3.00
STORAGE_PROTO_CMD_TRUNK_GET_BINLOG_SIZE = 31 # since V3.07
STORAGE_PROTO_CMD_TRUNK_DELETE_BINLOG_MARKS = 32 # since V3.07
STORAGE_PROTO_CMD_TRUNK_TRUNCATE_BINLOG_FILE = 33 # since V3.07
STORAGE_PROTO_CMD_MODIFY_FILE = 34 # since V3.08
STORAGE_PROTO_CMD_SYNC_MODIFY_FILE = 35 # since V3.08
STORAGE_PROTO_CMD_TRUNCATE_FILE = 36 # since V3.08
STORAGE_PROTO_CMD_SYNC_TRUNCATE_FILE = 37 # since V3.08
# for overwrite all old metadata
STORAGE_SET_METADATA_FLAG_OVERWRITE = 'O'
STORAGE_SET_METADATA_FLAG_OVERWRITE_STR = "O"
# for replace, insert when the meta item not exist, otherwise update it
STORAGE_SET_METADATA_FLAG_MERGE = 'M'
STORAGE_SET_METADATA_FLAG_MERGE_STR = "M"
FDFS_RECORD_SEPERATOR = '\x01'
FDFS_FIELD_SEPERATOR = '\x02'
# common constants
FDFS_GROUP_NAME_MAX_LEN = 16
IP_ADDRESS_SIZE = 16
FDFS_PROTO_PKG_LEN_SIZE = 8
FDFS_PROTO_CMD_SIZE = 1
FDFS_PROTO_STATUS_SIZE = 1
FDFS_PROTO_IP_PORT_SIZE = (IP_ADDRESS_SIZE + 6)
FDFS_MAX_SERVERS_EACH_GROUP = 32
FDFS_MAX_GROUPS = 512
FDFS_MAX_TRACKERS = 16
FDFS_DOMAIN_NAME_MAX_LEN = 128
FDFS_MAX_META_NAME_LEN = 64
FDFS_MAX_META_VALUE_LEN = 256
FDFS_FILE_PREFIX_MAX_LEN = 16
FDFS_LOGIC_FILE_PATH_LEN = 10
FDFS_TRUE_FILE_PATH_LEN = 6
FDFS_FILENAME_BASE64_LENGTH = 27
FDFS_TRUNK_FILE_INFO_LEN = 16
FDFS_FILE_EXT_NAME_MAX_LEN = 6
FDFS_SPACE_SIZE_BASE_INDEX = 2 # storage space size based (MB)
FDFS_UPLOAD_BY_BUFFER = 1
FDFS_UPLOAD_BY_FILENAME = 2
FDFS_UPLOAD_BY_FILE = 3
FDFS_DOWNLOAD_TO_BUFFER = 1
FDFS_DOWNLOAD_TO_FILE = 2
FDFS_NORMAL_LOGIC_FILENAME_LENGTH = (
FDFS_LOGIC_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_FILE_EXT_NAME_MAX_LEN + 1)
FDFS_TRUNK_FILENAME_LENGTH = (
FDFS_TRUE_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_TRUNK_FILE_INFO_LEN + 1 + FDFS_FILE_EXT_NAME_MAX_LEN)
FDFS_TRUNK_LOGIC_FILENAME_LENGTH = (FDFS_TRUNK_FILENAME_LENGTH + (FDFS_LOGIC_FILE_PATH_LEN - FDFS_TRUE_FILE_PATH_LEN))
FDFS_VERSION_SIZE = 6
TRACKER_QUERY_STORAGE_FETCH_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE)
TRACKER_QUERY_STORAGE_STORE_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE + 1)
# status code, order is important!
FDFS_STORAGE_STATUS_INIT = 0
FDFS_STORAGE_STATUS_WAIT_SYNC = 1
FDFS_STORAGE_STATUS_SYNCING = 2
FDFS_STORAGE_STATUS_IP_CHANGED = 3
FDFS_STORAGE_STATUS_DELETED = 4
FDFS_STORAGE_STATUS_OFFLINE = 5
FDFS_STORAGE_STATUS_ONLINE = 6
FDFS_STORAGE_STATUS_ACTIVE = 7
FDFS_STORAGE_STATUS_RECOVERY = 9
FDFS_STORAGE_STATUS_NONE = 99
class Storage_server(object):
'''Class storage server for upload.'''
def __init__(self):
self.ip_addr = None
self.port = None
self.group_name = ''
self.store_path_index = 0
# Class tracker_header
class Tracker_header(object):
'''
Class for Pack or Unpack tracker header
struct tracker_header{
char pkg_len[FDFS_PROTO_PKG_LEN_SIZE],
char cmd,
char status,
}
'''
def __init__(self):
self.fmt = '!QBB' # pkg_len[FDFS_PROTO_PKG_LEN_SIZE] + cmd + status
self.st = struct.Struct(self.fmt)
self.pkg_len = 0
self.cmd = 0
self.status = 0
def _pack(self, pkg_len=0, cmd=0, status=0):
return self.st.pack(pkg_len, cmd, status)
def _unpack(self, bytes_stream):
self.pkg_len, self.cmd, self.status = self.st.unpack(bytes_stream)
return True
def header_len(self):
return self.st.size
def send_header(self, conn):
'''Send Tracker header to server.'''
header = self._pack(self.pkg_len, self.cmd, self.status)
try:
conn._sock.sendall(header)
except (socket.error, socket.timeout) as e:
            raise ConnectionError('[-] Error: while writing to socket: %s' % (e.args,))
def recv_header(self, conn):
        '''Receive response from server.
        If successful, the class members (pkg_len, cmd, status) hold the response.
        '''
try:
header = conn._sock.recv(self.header_len())
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: %s' % (e.args,))
self._unpack(header)
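# Illustrative note (not in the original source): the '!QBB' header above is
# 10 bytes on the wire, e.g.
#   Tracker_header()._pack(pkg_len=16, cmd=TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS, status=0)
# returns b'\x00\x00\x00\x00\x00\x00\x00\x10[\x00' (8-byte length, 1-byte cmd, 1-byte status).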
def fdfs_pack_metadata(meta_dict):
ret = ''
for key in meta_dict:
ret += '%s%c%s%c' % (key, FDFS_FIELD_SEPERATOR, meta_dict[key], FDFS_RECORD_SEPERATOR)
return ret[0:-1]
def fdfs_unpack_metadata(bytes_stream):
li = bytes_stream.split(FDFS_RECORD_SEPERATOR)
return dict([item.split(FDFS_FIELD_SEPERATOR) for item in li])
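# Illustrative example (not in the original source) of the metadata wire format
# produced by the helpers above: fields are joined with FDFS_FIELD_SEPERATOR
# ('\x02') and records with FDFS_RECORD_SEPERATOR ('\x01'), so
#   fdfs_pack_metadata({'ext_name': 'jpg', 'width': '160px'})
# returns 'ext_name\x02jpg\x01width\x02160px', and fdfs_unpack_metadata()
# reverses the transformation.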
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_test.py
import os
import sys
import time
try:
from fdfs_client.client import *
from fdfs_client.exceptions import *
except ImportError:
import_path = os.path.abspath('../')
sys.path.append(import_path)
from fdfs_client.client import *
from fdfs_client.exceptions import *
def usage():
s = 'Usage: python fdfs_test.py {options} [{local_filename} [{remote_file_id}]]\n'
s += 'options: upfile, upbuffer, downfile, downbuffer, delete, listgroup, listserv\n'
s += ' upslavefile, upslavebuffer, upappendfile, upappendbuffer\n'
s += '\tupfile {local_filename}\n'
s += '\tupbuffer {local_filename}\n'
s += '\tdownfile {local_filename} {remote_file_id}\n'
s += '\tdownbuffer {remote_file_id}\n'
s += '\tdelete {remote_file_id}\n'
s += '\tlistgroup {group_name}\n'
s += '\tlistall \n'
s += '\tlistsrv {group_name} [storage_ip]\n'
s += '\tsetmeta {remote_file_id}\n'
s += '\tgetmeta {remote_file_id}\n'
s += '\tupslavefile {local_filename} {remote_fileid} {prefix_name}\n'
s += '\tupappendfile {local_filename}\n'
s += '\ttruncate {truncate_filesize} {remote_fileid}\n'
s += '\tmodifyfile {local_filename} {remote_fileid} {file_offset}\n'
s += '\tmodifybuffer {local_filename} {remote_fileid} {file_offset}\n'
s += 'e.g.: python fdfs_test.py upfile test'
print(s)
sys.exit(0)
if len(sys.argv) < 2:
usage()
client = Fdfs_client(get_tracker_conf('client.conf'))
def upfile_func():
# Upload by filename
# usage: python fdfs_test.py upfile {local_filename}
if len(sys.argv) < 3:
usage()
return None
try:
local_filename = sys.argv[2]
file_size = os.stat(local_filename).st_size
# meta_buffer can be null.
meta_dict = {
'ext_name': 'py',
'file_size': str(file_size) + 'B'
}
t1 = time.time()
ret_dict = client.upload_by_filename(local_filename, meta_dict)
t2 = time.time()
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
print('[+] time consume: %fs' % (t2 - t1))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upfileex_func():
# Upload by file
# usage: python fdfs_test.py upfileex {local_filename}
if len(sys.argv) < 3:
usage()
return None
try:
local_filename = sys.argv[2]
t1 = time.time()
ret_dict = client.upload_by_file(local_filename)
t2 = time.time()
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
print('[+] time consume: %fs' % (t2 - t1))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upslavefile_func():
# upload slave file
# usage: python fdfs_test.py upslavefile {local_filename} {remote_fileid} {prefix_name}
if len(sys.argv) < 5:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
prefix_name = sys.argv[4]
ret_dict = client.upload_slave_by_file(local_filename, remote_fileid, \
prefix_name)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upslavebuffer_func():
# upload slave by buffer
# usage: python fdfs_test.py upslavebuffer {local_filename} {remote_fileid} {prefix_name}
if len(sys.argv) < 5:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
prefix_name = sys.argv[4]
        with open(local_filename, 'rb') as f:
            filebuffer = f.read()
        # pass the buffer that was just read (the original passed the filename by
        # mistake); this client's upload_slave_by_buffer takes no prefix_name argument
        ret_dict = client.upload_slave_by_buffer(filebuffer, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def del_func():
# delete file
# usage: python fdfs_test.py delete {remote_fileid}
if len(sys.argv) < 3:
usage()
return None
try:
remote_file_id = sys.argv[2]
ret_tuple = client.delete_file(remote_file_id)
print('[+] %s' % ret_tuple[0])
print('[+] remote_fileid: %s' % ret_tuple[1])
print('[+] Storage IP: %s' % ret_tuple[2])
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def downfile_func():
# Download to file
# usage: python fdfs_test.py downfile {local_filename} {remote_fileid}
    if len(sys.argv) < 4:
usage()
return None
try:
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
ret_dict = client.download_to_file(local_filename, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def list_group_func():
# List one group info
# usage: python fdfs_test.py listgroup {group_name}
if len(sys.argv) < 3:
usage()
return None
try:
group_name = sys.argv[2]
ret = client.list_one_group(group_name)
print(ret)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def listall_func():
# List all group info
# usage: python fdfs_test.py listall
if len(sys.argv) < 2:
usage()
return None
try:
ret_dict = client.list_all_groups()
print('=' * 80)
print('Groups count:', ret_dict['Groups count'])
for li in ret_dict['Groups']:
print('-' * 80)
print(li)
print('-' * 80)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def list_server_func():
# List all servers info of group
# usage: python fdfs_test.py listsrv {group_name} [storage_ip]
if len(sys.argv) < 3:
usage()
return None
try:
group_name = sys.argv[2]
if len(sys.argv) > 3:
storage_ip = sys.argv[3]
else:
storage_ip = None
ret_dict = client.list_servers(group_name, storage_ip)
print('=' * 80)
print('Group name: %s' % ret_dict['Group name'])
print('=' * 80)
i = 1
for serv in ret_dict['Servers']:
print('Storage server %d:' % i)
print('=' * 80)
print(serv)
i += 1
print('=' * 80)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upbuffer_func():
# Upload by buffer
# usage: python fdfs_test.py upbuffer {local_filename} [remote_file_ext_name]
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
if len(sys.argv) > 3:
ext_name = sys.argv[3]
else:
ext_name = None
# meta_buffer can be null.
meta_buffer = {
'ext_name': 'gif',
'width': '150px',
'height': '80px'
}
try:
with open(local_filename, 'rb') as f:
file_buffer = f.read()
ret_dict = client.upload_by_buffer(file_buffer, ext_name, meta_buffer)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def downbuffer_func():
# Download to buffer
# usage: python fdfs_test.py downbuffer {remote_file_id}
# e.g.: 'group1/M00/00/00/wKjzhU_rLNmjo2-1AAAamGDONEA5818.py'
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
try:
ret_dict = client.download_to_buffer(remote_fileid)
print('Downloaded content:')
print(ret_dict['Content'])
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def get_meta_data_func():
# Get meta data of remote file
# usage python fdfs_test.py getmeta {remote_file_id}
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
try:
ret_dict = client.get_meta_data(remote_fileid)
print(ret_dict)
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def set_meta_data_func():
# Set meta data of remote file
# usage python fdfs_test.py setmeta {remote_file_id}
if len(sys.argv) < 3:
usage()
return None
remote_fileid = sys.argv[2]
meta_dict = {
'ext_name': 'jgp',
'width': '160px',
'hight': '80px',
}
try:
ret_dict = client.set_meta_data(remote_fileid, meta_dict)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upappendfile_func():
# Upload an appender file by filename
# usage: python fdfs_test.py upappendfile {local_filename}
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
try:
ret_dict = client.upload_appender_by_file(local_filename)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def upappendbuffer_func():
# Upload an appender file by buffer
# usage: python fdfs_test.py upappendbuffer {local_filename}
if len(sys.argv) < 3:
usage()
return None
local_filename = sys.argv[2]
try:
with open(local_filename, 'rb') as f:
file_buffer = f.read()
ret_dict = client.upload_appender_by_buffer(file_buffer)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def appendfile_func():
# Append a remote file
# usage: python fdfs_test.py appendfile {local_filename} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
try:
ret_dict = client.append_by_file(local_filename, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def appendbuffer_func():
# Append a remote file by buffer
# usage: python fdfs_test.py appendbuffer {local_filename} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
try:
with open(local_filename, 'rb') as f:
filebuffer = f.read()
ret_dict = client.append_by_buffer(filebuffer, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def truncate_func():
# Truncate file
# usage: python fdfs_test.py truncate {truncate_filesize} {remote_file_id}
if len(sys.argv) < 4:
usage()
return None
truncate_filesize = int(sys.argv[2])
remote_fileid = sys.argv[3]
try:
ret_dict = client.truncate_file(truncate_filesize, remote_fileid)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def modifyfile_func():
# Modify file by filename
# usage: python fdfs_test.py modifyfile {local_filename} {remote_fileid} [file_offset]
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
if len(sys.argv) > 4:
file_offset = int(sys.argv[4])
else:
file_offset = 0
try:
ret_dict = client.modify_by_filename(local_filename, remote_fileid, file_offset)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
def modifybuffer_func():
# Modify file by buffer
# usage: python fdfs_test.py modifybuffer {local_filename} {remote_fileid} [file_offset]
if len(sys.argv) < 4:
usage()
return None
local_filename = sys.argv[2]
remote_fileid = sys.argv[3]
if len(sys.argv) > 4:
file_offset = int(sys.argv[4])
else:
file_offset = 0
try:
with open(local_filename, 'rb') as f:
filebuffer = f.read()
ret_dict = client.modify_by_buffer(filebuffer, remote_fileid, file_offset)
for key in ret_dict:
print('[+] %s : %s' % (key, ret_dict[key]))
except (ConnectionError, ResponseError, DataError) as e:
print(e)
result = {
'upfile': lambda: upfile_func(),
'upfileex': lambda: upfileex_func(),
'upbuffer': lambda: upbuffer_func(),
'delete': lambda: del_func(),
'downfile': lambda: downfile_func(),
'downbuffer': lambda: downbuffer_func(),
'listgroup': lambda: list_group_func(),
'listall': lambda: listall_func(),
'listsrv': lambda: list_server_func(),
'getmeta': lambda: get_meta_data_func(),
'setmeta': lambda: set_meta_data_func(),
    'upslavefile': lambda: upslavefile_func(),
    'upslavebuffer': lambda: upslavebuffer_func(),
'upappendfile': lambda: upappendfile_func(),
'upappendbuffer': lambda: upappendbuffer_func(),
'appendfile': lambda: appendfile_func(),
'appendbuffer': lambda: appendbuffer_func(),
'truncate': lambda: truncate_func(),
'modifyfile': lambda: modifyfile_func(),
'modifybuffer': lambda: modifybuffer_func(),
'-h': lambda: usage(),
}[sys.argv[1].lower()]()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: storage_client.py
import os
import stat
import errno
import struct
import socket
import datetime
import platform
from fdfs_client.fdfs_protol import *
from fdfs_client.connection import *
# from test_fdfs.sendfile import *
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
from fdfs_client.utils import *
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def tcp_send_file(conn, filename, buffer_size=1024):
'''
Send file to server, and split into multiple pkgs while sending.
arguments:
@conn: connection
@filename: string
@buffer_size: int ,send buffer size
@Return int: file size if success else raise ConnectionError.
'''
file_size = 0
with open(filename, 'rb') as f:
while 1:
try:
send_buffer = f.read(buffer_size)
send_size = len(send_buffer)
if send_size == 0:
break
tcp_send_data(conn, send_buffer)
file_size += send_size
except ConnectionError as e:
raise ConnectionError('[-] Error while uploading file(%s).' % e.args)
except IOError as e:
raise DataError('[-] Error while reading local file(%s).' % e.args)
return file_size
def tcp_send_file_ex(conn, filename, buffer_size=4096):
'''
Send file to server. Using linux system call 'sendfile'.
arguments:
@conn: connection
@filename: string
    @return long, sent size
'''
if 'linux' not in sys.platform.lower():
raise DataError('[-] Error: \'sendfile\' system call only available on linux.')
    # The 'sendfile'-based implementation is disabled in this copy (its import is
    # commented out above), and the original loop body was an empty 'pass' that
    # would spin forever; fall back to the plain buffered send instead.
    return tcp_send_file(conn, filename, buffer_size)
def tcp_recv_file(conn, local_filename, file_size, buffer_size=1024):
'''
Receive file from server, fragmented it while receiving and write to disk.
arguments:
@conn: connection
@local_filename: string
@file_size: int, remote file size
@buffer_size: int, receive buffer size
@Return int: file size if success else raise ConnectionError.
'''
total_file_size = 0
flush_size = 0
remain_bytes = file_size
with open(local_filename, 'wb+') as f:
while remain_bytes > 0:
try:
if remain_bytes >= buffer_size:
file_buffer, recv_size = tcp_recv_response(conn, buffer_size, buffer_size)
else:
file_buffer, recv_size = tcp_recv_response(conn, remain_bytes, buffer_size)
f.write(file_buffer)
remain_bytes -= buffer_size
total_file_size += recv_size
flush_size += recv_size
if flush_size >= 4096:
f.flush()
flush_size = 0
except ConnectionError as e:
raise ConnectionError('[-] Error: while downloading file(%s).' % e.args)
            except IOError as e:
                raise DataError('[-] Error: while writing local file(%s).' % e.args)
return total_file_size
class Storage_client(object):
    '''
    Class Storage_client for talking to a storage server.
    Note: the host_tuple argument holds the storage server ip address and should contain a single element.
    '''
def __init__(self, *kwargs):
conn_kwargs = {
'name': 'Storage Pool',
'host_tuple': (kwargs[0],),
'port': kwargs[1],
'timeout': kwargs[2]
}
self.pool = ConnectionPool(**conn_kwargs)
return None
def __del__(self):
try:
self.pool.destroy()
self.pool = None
except:
pass
def update_pool(self, old_store_serv, new_store_serv, timeout=30):
        '''
        Update the connection pool of the storage client.
        The pool must be rebuilt when the storage server changes;
        if the server has not changed, nothing is done.
        '''
if old_store_serv.ip_addr == new_store_serv.ip_addr:
return None
self.pool.destroy()
conn_kwargs = {
'name': 'Storage_pool',
'host_tuple': (new_store_serv.ip_addr,),
'port': new_store_serv.port,
'timeout': timeout
}
self.pool = ConnectionPool(**conn_kwargs)
return True
def _storage_do_upload_file(self, tracker_client, store_serv, file_buffer, file_size=None, upload_type=None,
meta_dict=None, cmd=None, master_filename=None, prefix_name=None, file_ext_name=None):
        '''
        Core of uploading a file.
        arguments:
        @tracker_client: Tracker_client, used to talk to the tracker server
        @store_serv: Storage_server, as returned by the tracker query
        @file_buffer: string, file name or file buffer to send
        @file_size: int
        @upload_type: int, optional: FDFS_UPLOAD_BY_FILE, FDFS_UPLOAD_BY_FILENAME,
                      FDFS_UPLOAD_BY_BUFFER
        @meta_dict: dictionary, metadata to store with the file
        @cmd: int, see the fdfs protocol definitions
        @master_filename: string, used when uploading a slave file
        @prefix_name: string
        @file_ext_name: string
        @Return dictionary
        {
            'Group name' : group_name,
            'Remote file_id' : remote_file_id,
            'Status' : status,
            'Local file name' : local_filename,
            'Uploaded size' : upload_size,
            'Storage IP' : storage_ip
        }
        '''
store_conn = self.pool.get_connection()
th = Tracker_header()
master_filename_len = len(master_filename) if master_filename else 0
prefix_name_len = len(prefix_name) if prefix_name else 0
upload_slave = len(store_serv.group_name) and master_filename_len
file_ext_name = str(file_ext_name) if file_ext_name else ''
# non_slave_fmt |-store_path_index(1)-file_size(8)-file_ext_name(6)-|
non_slave_fmt = '!B Q %ds' % FDFS_FILE_EXT_NAME_MAX_LEN
# slave_fmt |-master_len(8)-file_size(8)-prefix_name(16)-file_ext_name(6)
# -master_name(master_filename_len)-|
slave_fmt = '!Q Q %ds %ds %ds' % (FDFS_FILE_PREFIX_MAX_LEN, FDFS_FILE_EXT_NAME_MAX_LEN, master_filename_len)
th.pkg_len = struct.calcsize(slave_fmt) if upload_slave else struct.calcsize(non_slave_fmt)
th.pkg_len += file_size
th.cmd = cmd
th.send_header(store_conn)
if upload_slave:
send_buffer = struct.pack(
slave_fmt, master_filename_len, file_size, prefix_name, file_ext_name, master_filename)
else:
send_buffer = struct.pack(non_slave_fmt, store_serv.store_path_index, file_size, file_ext_name.encode())
try:
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
send_file_size = tcp_send_file(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
send_file_size = tcp_send_file_ex(store_conn, file_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
if recv_size <= FDFS_GROUP_NAME_MAX_LEN:
errmsg = '[-] Error: Storage response length is not match, '
errmsg += 'expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
# recv_fmt: |-group_name(16)-remote_file_name(recv_size - 16)-|
recv_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, th.pkg_len - FDFS_GROUP_NAME_MAX_LEN)
(group_name, remote_name) = struct.unpack(recv_fmt, recv_buffer)
remote_filename = remote_name.strip(b'\x00')
if meta_dict and len(meta_dict) > 0:
status = self.storage_set_metadata(tracker_client, store_serv, remote_filename, meta_dict)
if status != 0:
# rollback
self.storage_delete_file(tracker_client, store_serv, remote_filename)
raise DataError('[-] Error: %d, %s' % (status, os.strerror(status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dic = {
'Group name': group_name.strip(b'\x00'),
'Remote file_id': group_name.strip(b'\x00') + __os_sep__.encode() + remote_filename,
'Status': 'Upload successed.',
'Local file name': file_buffer if (upload_type == FDFS_UPLOAD_BY_FILENAME
or upload_type == FDFS_UPLOAD_BY_FILE
) else '',
'Uploaded size': appromix(send_file_size) if (upload_type == FDFS_UPLOAD_BY_FILENAME
or upload_type == FDFS_UPLOAD_BY_FILE
) else appromix(len(file_buffer)),
'Storage IP': store_serv.ip_addr
}
return ret_dic
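    # Illustrative note (not in the original source): for a regular (non-slave)
    # upload the fixed-size part packed above is '!B Q 6s' = 1 + 8 + 6 = 15 bytes
    # (store path index, file size, padded extension name), so th.pkg_len is
    # 15 + file_size; the storage server replies with the 16-byte group name
    # followed by the generated remote file name.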
def storage_upload_by_filename(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_by_file(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_by_buffer(self, tracker_client, store_serv, file_buffer, file_ext_name=None, meta_dict=None):
buffer_size = len(file_buffer)
return self._storage_do_upload_file(tracker_client, store_serv, file_buffer, buffer_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_FILE, None, None, file_ext_name)
def storage_upload_slave_by_filename(self, tracker_client, store_serv, filename, prefix_name, remote_filename,
meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, remote_filename,
prefix_name, file_ext_name)
def storage_upload_slave_by_file(self, tracker_client, store_serv, filename, prefix_name, remote_filename,
meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, remote_filename,
prefix_name, file_ext_name)
def storage_upload_slave_by_buffer(self, tracker_client, store_serv, filebuffer, remote_filename, meta_dict,
file_ext_name):
file_size = len(filebuffer)
return self._storage_do_upload_file(tracker_client, store_serv, filebuffer, file_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE, None, remote_filename,
file_ext_name)
def storage_upload_appender_by_filename(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILENAME,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_upload_appender_by_file(self, tracker_client, store_serv, filename, meta_dict=None):
file_size = os.stat(filename).st_size
file_ext_name = get_file_ext_name(filename)
return self._storage_do_upload_file(tracker_client, store_serv, filename, file_size, FDFS_UPLOAD_BY_FILE,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_upload_appender_by_buffer(self, tracker_client, store_serv, file_buffer, meta_dict=None,
file_ext_name=None):
file_size = len(file_buffer)
return self._storage_do_upload_file(tracker_client, store_serv, file_buffer, file_size, FDFS_UPLOAD_BY_BUFFER,
meta_dict, STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE, None, None,
file_ext_name)
def storage_delete_file(self, tracker_client, store_serv, remote_filename):
'''
Delete file from storage server.
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_DELETE_FILE
file_name_len = len(remote_filename)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + file_name_len
try:
th.send_header(store_conn)
# del_fmt: |-group_name(16)-filename(len)-|
del_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, file_name_len)
send_buffer = struct.pack(del_fmt, store_serv.group_name, remote_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: remote file %s is not exist.'
# % (store_serv.group_name + __os_sep__.encode() + remote_filename))
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
# recv_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
remote_filename = store_serv.group_name + __os_sep__.encode() + remote_filename
        return ('Delete file succeeded.', remote_filename, store_serv.ip_addr)
def _storage_do_download_file(self, tracker_client, store_serv, file_buffer, offset, download_size,
download_type, remote_filename):
'''
        Core of downloading a file from a storage server.
        You can choose the download type: FDFS_DOWNLOAD_TO_FILE or
        FDFS_DOWNLOAD_TO_BUFFER, and you can choose the file offset.
@Return dictionary
'Remote file name' : remote_filename,
'Content' : local_filename or buffer,
'Download size' : download_size,
'Storage IP' : storage_ip
'''
store_conn = self.pool.get_connection()
th = Tracker_header()
remote_filename_len = len(remote_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + FDFS_GROUP_NAME_MAX_LEN + remote_filename_len
th.cmd = STORAGE_PROTO_CMD_DOWNLOAD_FILE
try:
th.send_header(store_conn)
# down_fmt: |-offset(8)-download_bytes(8)-group_name(16)-remote_filename(len)-|
down_fmt = '!Q Q %ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len)
send_buffer = struct.pack(down_fmt, offset, download_size, store_serv.group_name, remote_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: remote file %s is not exist.' %
# (store_serv.group_name + __os_sep__.encode() + remote_filename))
if th.status != 0:
raise DataError('Error: %d %s' % (th.status, os.strerror(th.status)))
if download_type == FDFS_DOWNLOAD_TO_FILE:
total_recv_size = tcp_recv_file(store_conn, file_buffer, th.pkg_len)
elif download_type == FDFS_DOWNLOAD_TO_BUFFER:
recv_buffer, total_recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
ret_dic = {
'Remote file_id': store_serv.group_name + __os_sep__.encode() + remote_filename,
'Content': file_buffer if download_type == FDFS_DOWNLOAD_TO_FILE else recv_buffer,
'Download size': appromix(total_recv_size),
'Storage IP': store_serv.ip_addr
}
return ret_dic
def storage_download_to_file(self, tracker_client, store_serv, local_filename, file_offset, download_bytes,
remote_filename):
return self._storage_do_download_file(tracker_client, store_serv, local_filename, file_offset, download_bytes,
FDFS_DOWNLOAD_TO_FILE, remote_filename)
def storage_download_to_buffer(self, tracker_client, store_serv, file_buffer, file_offset, download_bytes,
remote_filename):
return self._storage_do_download_file(tracker_client, store_serv, file_buffer, file_offset, download_bytes,
FDFS_DOWNLOAD_TO_BUFFER, remote_filename)
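    # Illustration (not executed): per the _storage_do_download_file docstring above,
    # storage_download_to_buffer() is expected to return a dict shaped like the
    # following; the file id, size and IP shown here are hypothetical.
    #   {
    #       'Remote file_id': b'group1/M00/00/00/example.txt',
    #       'Content': b'file bytes ...',
    #       'Download size': '9B',
    #       'Storage IP': '192.168.1.103'
    #   }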
def storage_set_metadata(self, tracker_client, store_serv, remote_filename, meta_dict,
op_flag=STORAGE_SET_METADATA_FLAG_OVERWRITE):
ret = 0
conn = self.pool.get_connection()
remote_filename_len = len(remote_filename)
meta_buffer = fdfs_pack_metadata(meta_dict)
meta_len = len(meta_buffer)
th = Tracker_header()
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + 1 + FDFS_GROUP_NAME_MAX_LEN + remote_filename_len + meta_len
th.cmd = STORAGE_PROTO_CMD_SET_METADATA
try:
th.send_header(conn)
# meta_fmt: |-filename_len(8)-meta_len(8)-op_flag(1)-group_name(16)
# -filename(remote_filename_len)-meta(meta_len)|
meta_fmt = '!Q Q c %ds %ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len, meta_len)
send_buffer = struct.pack(meta_fmt, remote_filename_len, meta_len, op_flag, store_serv.group_name,
remote_filename, meta_buffer)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
ret = th.status
except:
raise
finally:
self.pool.release(conn)
return ret
def storage_get_metadata(self, tracker_client, store_serv, remote_file_name):
store_conn = self.pool.get_connection()
th = Tracker_header()
remote_filename_len = len(remote_file_name)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + remote_filename_len
th.cmd = STORAGE_PROTO_CMD_GET_METADATA
try:
th.send_header(store_conn)
# meta_fmt: |-group_name(16)-filename(remote_filename_len)-|
meta_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, remote_filename_len)
send_buffer = struct.pack(meta_fmt, store_serv.group_name, remote_file_name.encode())
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
# if th.status == 2:
# raise DataError('[-] Error: Remote file %s has no meta data.'
# % (store_serv.group_name + __os_sep__.encode() + remote_file_name))
if th.status != 0:
raise DataError('[-] Error:%d, %s' % (th.status, os.strerror(th.status)))
            if th.pkg_len == 0:
                return {}
            meta_buffer, recv_size = tcp_recv_response(store_conn, th.pkg_len)
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = fdfs_unpack_metadata(meta_buffer)
return ret_dict
def _storage_do_append_file(self, tracker_client, store_serv, file_buffer, file_size, upload_type,
appended_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
appended_filename_len = len(appended_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + appended_filename_len + file_size
th.cmd = STORAGE_PROTO_CMD_APPEND_FILE
try:
th.send_header(store_conn)
# append_fmt: |-appended_filename_len(8)-file_size(8)-appended_filename(len)
# -filecontent(filesize)-|
append_fmt = '!Q Q %ds' % appended_filename_len
send_buffer = struct.pack(append_fmt, appended_filename_len, file_size, appended_filename)
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
tcp_send_file(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, file_buffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
tcp_send_file_ex(store_conn, file_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
        ret_dict['Status'] = 'Append file succeeded.'
ret_dict['Appender file name'] = store_serv.group_name + __os_sep__.encode() + appended_filename
ret_dict['Appended size'] = appromix(file_size)
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_append_by_filename(self, tracker_client, store_serv, local_filename, appended_filename):
file_size = os.stat(local_filename).st_size
return self._storage_do_append_file(tracker_client, store_serv, local_filename, file_size,
FDFS_UPLOAD_BY_FILENAME, appended_filename)
def storage_append_by_file(self, tracker_client, store_serv, local_filename, appended_filename):
file_size = os.stat(local_filename).st_size
return self._storage_do_append_file(tracker_client, store_serv, local_filename, file_size, FDFS_UPLOAD_BY_FILE,
appended_filename)
def storage_append_by_buffer(self, tracker_client, store_serv, file_buffer, appended_filename):
file_size = len(file_buffer)
return self._storage_do_append_file(tracker_client, store_serv, file_buffer, file_size, FDFS_UPLOAD_BY_BUFFER,
appended_filename)
def _storage_do_truncate_file(self, tracker_client, store_serv, truncated_filesize, appender_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_TRUNCATE_FILE
appender_filename_len = len(appender_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 2 + appender_filename_len
try:
th.send_header(store_conn)
# truncate_fmt:|-appender_filename_len(8)-truncate_filesize(8)
# -appender_filename(len)-|
truncate_fmt = '!Q Q %ds' % appender_filename_len
send_buffer = struct.pack(truncate_fmt, appender_filename_len, truncated_filesize, appender_filename)
tcp_send_data(store_conn, send_buffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
        ret_dict['Status'] = 'Truncate succeeded.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_truncate_file(self, tracker_client, store_serv, truncated_filesize, appender_filename):
return self._storage_do_truncate_file(tracker_client, store_serv, truncated_filesize, appender_filename)
def _storage_do_modify_file(self, tracker_client, store_serv, upload_type, filebuffer, offset, filesize,
appender_filename):
store_conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = STORAGE_PROTO_CMD_MODIFY_FILE
appender_filename_len = len(appender_filename)
th.pkg_len = FDFS_PROTO_PKG_LEN_SIZE * 3 + appender_filename_len + filesize
try:
th.send_header(store_conn)
# modify_fmt: |-filename_len(8)-offset(8)-filesize(8)-filename(len)-|
modify_fmt = '!Q Q Q %ds' % appender_filename_len
send_buffer = struct.pack(modify_fmt, appender_filename_len, offset, filesize, appender_filename)
tcp_send_data(store_conn, send_buffer)
if upload_type == FDFS_UPLOAD_BY_FILENAME:
upload_size = tcp_send_file(store_conn, filebuffer)
elif upload_type == FDFS_UPLOAD_BY_BUFFER:
tcp_send_data(store_conn, filebuffer)
elif upload_type == FDFS_UPLOAD_BY_FILE:
upload_size = tcp_send_file_ex(store_conn, filebuffer)
th.recv_header(store_conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
except:
raise
finally:
self.pool.release(store_conn)
ret_dict = {}
        ret_dict['Status'] = 'Modify succeeded.'
ret_dict['Storage IP'] = store_serv.ip_addr
return ret_dict
def storage_modify_by_filename(self, tracker_client, store_serv, filename, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_FILENAME, filename, offset,
filesize, appender_filename)
def storage_modify_by_file(self, tracker_client, store_serv, filename, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_FILE, filename, offset, filesize,
appender_filename)
def storage_modify_by_buffer(self, tracker_client, store_serv, filebuffer, offset, filesize, appender_filename):
return self._storage_do_modify_file(tracker_client, store_serv, FDFS_UPLOAD_BY_BUFFER, filebuffer, offset,
filesize, appender_filename)
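# Minimal end-to-end sketch of how the upload/download/delete primitives above are
# typically driven through the package's high-level wrapper. The wrapper name, its
# import path, the 'client.conf' argument and the result-dict key are assumptions
# for illustration, not taken from this repository.
if __name__ == '__main__':
    from fdfs_client.client import Fdfs_client
    client = Fdfs_client('client.conf')
    # upload a small buffer, then download it again and clean up
    upload_ret = client.upload_by_buffer(b'hello fastdfs', file_ext_name='txt')
    file_id = upload_ret['Remote file_id']
    client.download_to_file('local_copy.txt', file_id)
    client.delete_file(file_id)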
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: tracker_client.py
import struct
import socket
from datetime import datetime
from fdfs_client.fdfs_protol import *
from fdfs_client.connection import *
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
from fdfs_client.utils import *
def parse_storage_status(status_code):
try:
ret = {
FDFS_STORAGE_STATUS_INIT: lambda: 'INIT',
FDFS_STORAGE_STATUS_WAIT_SYNC: lambda: 'WAIT_SYNC',
FDFS_STORAGE_STATUS_SYNCING: lambda: 'SYNCING',
FDFS_STORAGE_STATUS_IP_CHANGED: lambda: 'IP_CHANGED',
FDFS_STORAGE_STATUS_DELETED: lambda: 'DELETED',
FDFS_STORAGE_STATUS_OFFLINE: lambda: 'OFFLINE',
FDFS_STORAGE_STATUS_ONLINE: lambda: 'ONLINE',
FDFS_STORAGE_STATUS_ACTIVE: lambda: 'ACTIVE',
FDFS_STORAGE_STATUS_RECOVERY: lambda: 'RECOVERY'
}[status_code]()
except KeyError:
        ret = 'UNKNOWN'
return ret
class Storage_info(object):
def __init__(self):
self.status = 0
self.id = ''
self.ip_addr = ''
self.domain_name = ''
self.src_id = ''
self.version = ''
self.join_time = datetime.fromtimestamp(0).isoformat()
self.up_time = datetime.fromtimestamp(0).isoformat()
self.totalMB = ''
self.freeMB = ''
self.upload_prio = 0
self.store_path_count = 0
self.subdir_count_per_path = 0
self.curr_write_path = 0
self.storage_port = 23000
self.storage_http_port = 80
self.alloc_count = 0
self.current_count = 0
self.max_count = 0
self.total_upload_count = 0
self.success_upload_count = 0
self.total_append_count = 0
self.success_append_count = 0
self.total_modify_count = 0
self.success_modify_count = 0
self.total_truncate_count = 0
self.success_truncate_count = 0
self.total_setmeta_count = 0
self.success_setmeta_count = 0
self.total_del_count = 0
self.success_del_count = 0
self.total_download_count = 0
self.success_download_count = 0
self.total_getmeta_count = 0
self.success_getmeta_count = 0
self.total_create_link_count = 0
self.success_create_link_count = 0
self.total_del_link_count = 0
self.success_del_link_count = 0
self.total_upload_bytes = 0
self.success_upload_bytes = 0
self.total_append_bytes = 0
self.success_append_bytes = 0
self.total_modify_bytes = 0
self.success_modify_bytes = 0
self.total_download_bytes = 0
self.success_download_bytes = 0
self.total_sync_in_bytes = 0
self.success_sync_in_bytes = 0
self.total_sync_out_bytes = 0
self.success_sync_out_bytes = 0
self.total_file_open_count = 0
self.success_file_open_count = 0
self.total_file_read_count = 0
self.success_file_read_count = 0
self.total_file_write_count = 0
self.success_file_write_count = 0
self.last_source_sync = datetime.fromtimestamp(0).isoformat()
self.last_sync_update = datetime.fromtimestamp(0).isoformat()
self.last_synced_time = datetime.fromtimestamp(0).isoformat()
self.last_heartbeat_time = datetime.fromtimestamp(0).isoformat()
self.if_trunk_server = ''
# fmt = |-status(1)-ipaddr(16)-domain(128)-srcipaddr(16)-ver(6)-52*8-|
self.fmt = '!B 16s 16s 128s 16s 6s 10Q 4s4s4s 42Q?'
def set_info(self, bytes_stream):
(self.status, self.id, ip_addr, domain_name, self.src_id, version, join_time, up_time, totalMB, freeMB,
self.upload_prio, self.store_path_count, self.subdir_count_per_path, self.curr_write_path, self.storage_port,
self.storage_http_port, self.alloc_count, self.current_count, self.max_count, self.total_upload_count,
self.success_upload_count, self.total_append_count, self.success_append_count, self.total_modify_count,
self.success_modify_count, self.total_truncate_count, self.success_truncate_count, self.total_setmeta_count,
self.success_setmeta_count, self.total_del_count, self.success_del_count, self.total_download_count,
self.success_download_count, self.total_getmeta_count, self.success_getmeta_count,
self.total_create_link_count, self.success_create_link_count, self.total_del_link_count,
         self.success_del_link_count, self.total_upload_bytes, self.success_upload_bytes, self.total_append_bytes,
         self.success_append_bytes, self.total_modify_bytes, self.success_modify_bytes, self.total_download_bytes,
self.success_download_bytes, self.total_sync_in_bytes, self.success_sync_in_bytes, self.total_sync_out_bytes,
self.success_sync_out_bytes, self.total_file_open_count, self.success_file_open_count,
self.total_file_read_count, self.success_file_read_count, self.total_file_write_count,
self.success_file_write_count, last_source_sync, last_sync_update, last_synced_time, last_heartbeat_time,
self.if_trunk_server,) = struct.unpack(self.fmt, bytes_stream)
try:
self.ip_addr = ip_addr.strip(b'\x00')
self.domain_name = domain_name.strip(b'\x00')
self.version = version.strip(b'\x00')
self.totalMB = appromix(totalMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.freeMB = appromix(freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
except ValueError as e:
            raise ResponseError('[-] Error: disk space overrun, cannot represent it.')
self.join_time = datetime.fromtimestamp(join_time).isoformat()
self.up_time = datetime.fromtimestamp(up_time).isoformat()
self.last_source_sync = datetime.fromtimestamp(last_source_sync).isoformat()
self.last_sync_update = datetime.fromtimestamp(last_sync_update).isoformat()
self.last_synced_time = datetime.fromtimestamp(last_synced_time).isoformat()
self.last_heartbeat_time = datetime.fromtimestamp(last_heartbeat_time).isoformat()
return True
def __str__(self):
'''Transform to readable string.'''
s = 'Storage information:\n'
s += '\tip_addr = %s (%s)\n' % (self.ip_addr, parse_storage_status(self.status))
s += '\thttp domain = %s\n' % self.domain_name
s += '\tversion = %s\n' % self.version
s += '\tjoin time = %s\n' % self.join_time
s += '\tup time = %s\n' % self.up_time
s += '\ttotal storage = %s\n' % self.totalMB
s += '\tfree storage = %s\n' % self.freeMB
s += '\tupload priority = %d\n' % self.upload_prio
s += '\tstore path count = %d\n' % self.store_path_count
s += '\tsubdir count per path = %d\n' % self.subdir_count_per_path
s += '\tstorage port = %d\n' % self.storage_port
s += '\tstorage HTTP port = %d\n' % self.storage_http_port
s += '\tcurrent write path = %d\n' % self.curr_write_path
s += '\tsource ip_addr = %s\n' % self.ip_addr
s += '\tif_trunk_server = %d\n' % self.if_trunk_server
s += '\ttotal upload count = %ld\n' % self.total_upload_count
s += '\tsuccess upload count = %ld\n' % self.success_upload_count
s += '\ttotal download count = %ld\n' % self.total_download_count
s += '\tsuccess download count = %ld\n' % self.success_download_count
s += '\ttotal append count = %ld\n' % self.total_append_count
s += '\tsuccess append count = %ld\n' % self.success_append_count
s += '\ttotal modify count = %ld\n' % self.total_modify_count
s += '\tsuccess modify count = %ld\n' % self.success_modify_count
s += '\ttotal truncate count = %ld\n' % self.total_truncate_count
s += '\tsuccess truncate count = %ld\n' % self.success_truncate_count
s += '\ttotal delete count = %ld\n' % self.total_del_count
s += '\tsuccess delete count = %ld\n' % self.success_del_count
s += '\ttotal set_meta count = %ld\n' % self.total_setmeta_count
s += '\tsuccess set_meta count = %ld\n' % self.success_setmeta_count
s += '\ttotal get_meta count = %ld\n' % self.total_getmeta_count
s += '\tsuccess get_meta count = %ld\n' % self.success_getmeta_count
s += '\ttotal create link count = %ld\n' % self.total_create_link_count
s += '\tsuccess create link count = %ld\n' % self.success_create_link_count
s += '\ttotal delete link count = %ld\n' % self.total_del_link_count
s += '\tsuccess delete link count = %ld\n' % self.success_del_link_count
s += '\ttotal upload bytes = %ld\n' % self.total_upload_bytes
s += '\tsuccess upload bytes = %ld\n' % self.success_upload_bytes
s += '\ttotal download bytes = %ld\n' % self.total_download_bytes
s += '\tsuccess download bytes = %ld\n' % self.success_download_bytes
s += '\ttotal append bytes = %ld\n' % self.total_append_bytes
s += '\tsuccess append bytes = %ld\n' % self.success_append_bytes
s += '\ttotal modify bytes = %ld\n' % self.total_modify_bytes
s += '\tsuccess modify bytes = %ld\n' % self.success_modify_bytes
s += '\ttotal sync_in bytes = %ld\n' % self.total_sync_in_bytes
s += '\tsuccess sync_in bytes = %ld\n' % self.success_sync_in_bytes
s += '\ttotal sync_out bytes = %ld\n' % self.total_sync_out_bytes
s += '\tsuccess sync_out bytes = %ld\n' % self.success_sync_out_bytes
s += '\ttotal file open count = %ld\n' % self.total_file_open_count
s += '\tsuccess file open count = %ld\n' % self.success_file_open_count
s += '\ttotal file read count = %ld\n' % self.total_file_read_count
s += '\tsuccess file read count = %ld\n' % self.success_file_read_count
s += '\ttotal file write count = %ld\n' % self.total_file_write_count
        s += '\tsuccess file write count = %ld\n' % self.success_file_write_count
s += '\tlast heartbeat time = %s\n' % self.last_heartbeat_time
s += '\tlast source update = %s\n' % self.last_source_sync
s += '\tlast sync update = %s\n' % self.last_sync_update
s += '\tlast synced time = %s\n' % self.last_synced_time
return s
def get_fmt_size(self):
return struct.calcsize(self.fmt)
class Group_info(object):
def __init__(self):
self.group_name = ''
self.totalMB = ''
self.freeMB = ''
self.trunk_freeMB = ''
self.count = 0
self.storage_port = 0
self.store_http_port = 0
self.active_count = 0
self.curr_write_server = 0
self.store_path_count = 0
self.subdir_count_per_path = 0
self.curr_trunk_file_id = 0
self.fmt = '!%ds 11Q' % (FDFS_GROUP_NAME_MAX_LEN + 1)
return None
def __str__(self):
s = 'Group information:\n'
s += '\tgroup name = %s\n' % self.group_name
s += '\ttotal disk space = %s\n' % self.totalMB
s += '\tdisk free space = %s\n' % self.freeMB
s += '\ttrunk free space = %s\n' % self.trunk_freeMB
s += '\tstorage server count = %d\n' % self.count
s += '\tstorage port = %d\n' % self.storage_port
s += '\tstorage HTTP port = %d\n' % self.store_http_port
s += '\tactive server count = %d\n' % self.active_count
s += '\tcurrent write server index = %d\n' % self.curr_write_server
s += '\tstore path count = %d\n' % self.store_path_count
s += '\tsubdir count per path = %d\n' % self.subdir_count_per_path
s += '\tcurrent trunk file id = %d\n' % self.curr_trunk_file_id
return s
def set_info(self, bytes_stream):
(group_name, totalMB, freeMB, trunk_freeMB, self.count, self.storage_port, self.store_http_port,
self.active_count, self.curr_write_server, self.store_path_count, self.subdir_count_per_path,
self.curr_trunk_file_id) = struct.unpack(self.fmt, bytes_stream)
try:
self.group_name = group_name.strip(b'\x00')
self.freeMB = appromix(freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.totalMB = appromix(totalMB, FDFS_SPACE_SIZE_BASE_INDEX)
self.trunk_freeMB = appromix(trunk_freeMB, FDFS_SPACE_SIZE_BASE_INDEX)
except ValueError:
            raise DataError('[-] Error: disk space overrun, cannot represent it.')
def get_fmt_size(self):
return struct.calcsize(self.fmt)
class Tracker_client(object):
'''Class Tracker client.'''
def __init__(self, pool):
self.pool = pool
def tracker_list_servers(self, group_name, storage_ip=None):
'''
List servers in a storage group
'''
conn = self.pool.get_connection()
th = Tracker_header()
ip_len = len(storage_ip) if storage_ip else 0
if ip_len >= IP_ADDRESS_SIZE:
ip_len = IP_ADDRESS_SIZE - 1
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + ip_len
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_STORAGE
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
store_ip_addr = storage_ip or ''
storage_ip_fmt = '!%ds' % ip_len
try:
th.send_header(conn)
send_buffer = struct.pack(group_fmt, group_name) + struct.pack(storage_ip_fmt, store_ip_addr)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
si = Storage_info()
si_fmt_size = si.get_fmt_size()
recv_size = len(recv_buffer)
if recv_size % si_fmt_size != 0:
                errinfo = '[-] Error: response size does not match, expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errinfo)
except ConnectionError:
raise
finally:
self.pool.release(conn)
        num_storage = recv_size // si_fmt_size
si_list = []
i = 0
while num_storage:
si.set_info(recv_buffer[(i * si_fmt_size): ((i + 1) * si_fmt_size)])
si_list.append(si)
si = Storage_info()
num_storage -= 1
i += 1
ret_dict = {}
ret_dict['Group name'] = group_name
ret_dict['Servers'] = si_list
return ret_dict
def tracker_list_one_group(self, group_name):
conn = self.pool.get_connection()
th = Tracker_header()
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP
# group_fmt: |-group_name(16)-|
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
try:
th.send_header(conn)
send_buffer = struct.pack(group_fmt, group_name)
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
group_info = Group_info()
group_info.set_info(recv_buffer)
except ConnectionError:
raise
finally:
self.pool.release(conn)
return group_info
def tracker_list_all_groups(self):
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS
try:
th.send_header(conn)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
except:
raise
finally:
self.pool.release(conn)
gi = Group_info()
gi_fmt_size = gi.get_fmt_size()
if recv_size % gi_fmt_size != 0:
            errmsg = '[-] Error: Response size mismatch, expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
        num_groups = recv_size // gi_fmt_size
ret_dict = {}
ret_dict['Groups count'] = num_groups
gi_list = []
i = 0
while num_groups:
gi.set_info(recv_buffer[i * gi_fmt_size: (i + 1) * gi_fmt_size])
gi_list.append(gi)
gi = Group_info()
i += 1
num_groups -= 1
ret_dict['Groups'] = gi_list
return ret_dict
def tracker_query_storage_stor_without_group(self):
'''Query storage server for upload, without group name.
Return: Storage_server object'''
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE
try:
th.send_header(conn)
th.recv_header(conn)
if th.status != 0:
raise DataError('[-] Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_STORE_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
                errmsg += 'expect: %d, actual: %d' % (TRACKER_QUERY_STORAGE_STORE_BODY_LEN, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt |-group_name(16)-ipaddr(16-1)-port(8)-store_path_index(1)|
recv_fmt = '!%ds %ds Q B' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group_name, ip_addr, store_serv.port, store_serv.store_path_index) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group_name.strip(b'\x00')
store_serv.ip_addr = ip_addr.strip(b'\x00')
return store_serv
def tracker_query_storage_stor_with_group(self, group_name):
        '''Query storage server for upload, based on group name.
arguments:
@group_name: string
@Return Storage_server object
'''
conn = self.pool.get_connection()
th = Tracker_header()
th.cmd = TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN
th.send_header(conn)
group_fmt = '!%ds' % FDFS_GROUP_NAME_MAX_LEN
send_buffer = struct.pack(group_fmt, group_name)
try:
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_STORE_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
                errmsg += 'expect: %d, actual: %d' % (TRACKER_QUERY_STORAGE_STORE_BODY_LEN, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
# recv_fmt: |-group_name(16)-ipaddr(16-1)-port(8)-store_path_index(1)-|
recv_fmt = '!%ds %ds Q B' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group, ip_addr, store_serv.port, store_serv.store_path_index) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group.strip(b'\x00')
store_serv.ip_addr = ip_addr.strip(b'\x00')
return store_serv
def _tracker_do_query_storage(self, group_name, filename, cmd):
'''
        Core of query storage, based on group name and filename.
        It is useful for download, delete and set_meta.
arguments:
@group_name: string
@filename: string. remote file_id
@Return: Storage_server object
'''
conn = self.pool.get_connection()
th = Tracker_header()
file_name_len = len(filename)
th.pkg_len = FDFS_GROUP_NAME_MAX_LEN + file_name_len
th.cmd = cmd
th.send_header(conn)
# query_fmt: |-group_name(16)-filename(file_name_len)-|
query_fmt = '!%ds %ds' % (FDFS_GROUP_NAME_MAX_LEN, file_name_len)
send_buffer = struct.pack(query_fmt, group_name, filename)
try:
tcp_send_data(conn, send_buffer)
th.recv_header(conn)
if th.status != 0:
raise DataError('Error: %d, %s' % (th.status, os.strerror(th.status)))
recv_buffer, recv_size = tcp_recv_response(conn, th.pkg_len)
if recv_size != TRACKER_QUERY_STORAGE_FETCH_BODY_LEN:
                errmsg = '[-] Error: Tracker response length is invalid, '
                errmsg += 'expect: %d, actual: %d' % (th.pkg_len, recv_size)
raise ResponseError(errmsg)
except ConnectionError:
raise
finally:
self.pool.release(conn)
        # recv_fmt: |-group_name(16)-ip_addr(16-1)-port(8)-|
recv_fmt = '!%ds %ds Q' % (FDFS_GROUP_NAME_MAX_LEN, IP_ADDRESS_SIZE - 1)
store_serv = Storage_server()
(group_name, ipaddr, store_serv.port) = struct.unpack(recv_fmt, recv_buffer)
store_serv.group_name = group_name.strip(b'\x00')
store_serv.ip_addr = ipaddr.strip(b'\x00')
return store_serv
def tracker_query_storage_update(self, group_name, filename):
'''
Query storage server to update(delete and set_meta).
'''
return self._tracker_do_query_storage(group_name, filename, TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE)
def tracker_query_storage_fetch(self, group_name, filename):
'''
Query storage server to download.
'''
return self._tracker_do_query_storage(group_name, filename, TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE)
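# Minimal sketch of driving Tracker_client directly. The ConnectionPool keyword
# arguments below are assumed from fdfs_client.connection, and the tracker address
# is illustrative only.
if __name__ == '__main__':
    pool = ConnectionPool(name='Tracker Pool', host_tuple=('192.168.1.100',), port=22122, timeout=30)
    tracker = Tracker_client(pool)
    # ask the tracker for a storage server to upload to, then list the groups it knows
    store_serv = tracker.tracker_query_storage_stor_without_group()
    print(store_serv.group_name, store_serv.ip_addr, store_serv.port)
    print(tracker.tracker_list_all_groups()['Groups count'])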
#!/usr/bin/env python
# -*- coding = utf-8 -*-
# filename: utils.py
import io
import os
import sys
import stat
import platform
import configparser
SUFFIX = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def appromix(size, base=0):
    '''Convert a byte count into a human-readable size string.
Keyword arguments:
size: int, bytes stream size
base: int, suffix index
Return: string
'''
multiples = 1024
if size < 0:
raise ValueError('[-] Error: number must be non-negative.')
if size < multiples:
return '{0:d}{1}'.format(size, SUFFIX[base])
for suffix in SUFFIX[base:]:
if size < multiples:
return '{0:.2f}{1}'.format(size, suffix)
size = size / float(multiples)
raise ValueError('[-] Error: number too big.')
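# Examples (illustrative): appromix(512) -> '512B', appromix(1536) -> '1.50KB',
# appromix(3 * 1024 ** 3) -> '3.00GB'.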
def get_file_ext_name(filename, double_ext=True):
li = filename.split(os.extsep)
if len(li) <= 1:
return ''
else:
if li[-1].find(__os_sep__) != -1:
return ''
if double_ext:
if len(li) > 2:
if li[-2].find(__os_sep__) == -1:
return '%s.%s' % (li[-2], li[-1])
return li[-1]
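# Examples (illustrative): get_file_ext_name('report.pdf') -> 'pdf',
# get_file_ext_name('backup.tar.gz') -> 'tar.gz',
# get_file_ext_name('backup.tar.gz', double_ext=False) -> 'gz'.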
class Fdfs_ConfigParser(configparser.RawConfigParser):
"""
Extends ConfigParser to allow files without sections.
This is done by wrapping read files and prepending them with a placeholder
section, which defaults to '__config__'
"""
def __init__(self, default_section=None, *args, **kwargs):
configparser.RawConfigParser.__init__(self, *args, **kwargs)
self._default_section = None
self.set_default_section(default_section or '__config__')
def get_default_section(self):
return self._default_section
def set_default_section(self, section):
self.add_section(section)
# move all values from the previous default section to the new one
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
except configparser.NoSectionError:
pass
else:
for (key, value) in default_section_items:
self.set(section, key, value)
self._default_section = section
def read(self, filenames):
if isinstance(filenames, str):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
with open(filename) as fp:
self.readfp(fp)
except IOError:
continue
else:
read_ok.append(filename)
return read_ok
def readfp(self, fp, *args, **kwargs):
stream = io.StringIO()
try:
stream.name = fp.name
except AttributeError:
pass
stream.write('[' + self._default_section + ']\n')
stream.write(fp.read())
stream.seek(0, 0)
return self._read(stream, stream.name)
def write(self, fp):
# Write the items from the default section manually and then remove them
# from the data. They'll be re-added later.
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
for (key, value) in default_section_items:
fp.write("{0} = {1}\n".format(key, value))
fp.write("\n")
except configparser.NoSectionError:
pass
configparser.RawConfigParser.write(self, fp)
self.add_section(self._default_section)
for (key, value) in default_section_items:
self.set(self._default_section, key, value)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
        The sections in the setup file contain a title line at the top,
        indicated by a name in square brackets (`[]`), plus key/value
        option lines, indicated by `name: value` format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' or line[0] in '#;':
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname] = "%s\n%s" % (cursect[optname], value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
                    elif sectname == configparser.DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
                    raise configparser.MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self.OPTCRE.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos - 1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
optname = self.optionxform(optname.rstrip())
if optname in cursect:
if not isinstance(cursect[optname], list):
cursect[optname] = [cursect[optname]]
cursect[optname].append(optval)
else:
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
                            e = configparser.ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
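# Usage sketch (illustrative): FastDFS config files such as client.conf have no
# section headers, so keys are parsed into the default '__config__' section.
#   parser = Fdfs_ConfigParser()
#   parser.read('client.conf')
#   trackers = parser.get('__config__', 'tracker_server')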
def split_remote_fileid(remote_file_id):
'''
    Split remote_file_id into (group_name, remote_file_name).
arguments:
@remote_file_id: string
@return tuple, (group_name, remote_file_name)
'''
index = remote_file_id.find(b'/')
if -1 == index:
return None
return (remote_file_id[0:index], remote_file_id[(index + 1):])
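# Example (illustrative):
#   split_remote_fileid(b'group1/M00/00/00/example.txt')
#   -> (b'group1', b'M00/00/00/example.txt')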
def fdfs_check_file(filename):
ret = True
errmsg = ''
if not os.path.isfile(filename):
ret = False
errmsg = '[-] Error: %s is not a file.' % filename
elif not stat.S_ISREG(os.stat(filename).st_mode):
ret = False
errmsg = '[-] Error: %s is not a regular file.' % filename
return (ret, errmsg)
if __name__ == '__main__':
print(get_file_ext_name('/bc.tar.gz'))
# -*- coding: utf-8 -*-
import requests
import datetime
from pyquery import PyQuery as pq
# import cx_Oracle
import urllib3
urllib3.disable_warnings()
# conn = cx_Oracle.connect('cis', 'cis_zzsn9988', '114.116.91.1:1521/ORCL')
# cursor = conn.cursor()
result_list1 = [
    ['人民币', 'CNY'],
    ['美元', 'USD'],
    ['欧元', 'EUR'],
    ['瑞士法郎', 'CHF'],
    ['加元', 'CAD'],
    ['波兰兹罗提', 'PLN'],
    ['英镑', 'GBP'],
    ['澳元', 'AUD'],
    ['泰铢', 'THB'],
    ['沙特里亚尔', 'SAR'],
    ['巴西里亚伊', 'BRL'],
    ['新土耳其新里拉', 'TRY'],
    ['新台币', 'TWD'],
    ['印度卢比', 'INR'],
    ['墨西哥比索', 'MXN'],
    ['日元', 'JPY'],
    ['瑞典克朗', 'SEK'],
    ['韩元', 'KRW'],
    ['俄罗斯卢布', 'RUB'],
    ['新加坡元', 'SGD'],
    ['港币', 'HKD']]
result_list2 = ['USD', 'CNY']
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'ASPSESSIONIDQQRTBSSA=CJNLDGJALHNJGKBKPEBCDCOE; Hm_lvt_ecdd6f3afaa488ece3938bcdbb89e8da=1692951904; Hm_lpvt_ecdd6f3afaa488ece3938bcdbb89e8da=1692951904',
'Host':'qq.ip138.com',
'Pragma':'no-cache',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'sec-ch-ua':'"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
for result1 in result_list1:
currency_name = result1[0]
currency = result1[1]
to_USD = ''
to_CNY = ''
for i in range(len(result_list2)):
result2 = result_list2[i]
        # e.g. https://qq.ip138.com/hl.asp?from=CNY&to=USD&q=1
        url = f'''https://qq.ip138.com/hl.asp?from={currency}&to={result2}&q=1'''
        res = requests.get(url, headers=headers, verify=False)
        resp_text = res.content
doc_resp = pq(resp_text)
money = doc_resp('table tr:nth-child(3) td:nth-child(3)').text()
if money == '1':
money_result = money
else:
try:
money_result = round(float(money), 4)
except:
continue
if i == 0:
to_USD = money_result
else:
to_CNY = money_result
now = datetime.datetime.now()
now_time = now.strftime('%Y-%m-%d')
if to_USD == '' or to_CNY == '':
continue
result_dict = {
'币种': currency_name,
'币简称': currency,
'对美元': to_USD,
'对人民币': to_CNY,
'更新时间': now_time }
print(result_dict)
# sql = "insert into CIS_QCC_CURRENCY_RATE(ID, CURRENCY_NAME, CURRENCY_CODE, RATE_TO_USD, RATE_TO_CNY,CREATE_DATE) values (SEQ_CIS_QCC_CURRENCY_RATE.nextval,'{0}','{1}','{2}','{3}','{4}')".format(currency_name, currency, to_USD, to_CNY, now_time)
# cursor.execute(sql)
# conn.commit()
# Core utility toolkit
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
# Create the database connection pool
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# Note: call BaseCore.close() before the program exits to release these resources
class BaseCore:
    # Sequence number
__seq = 0
    # Proxy-pool database connection
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
    # User-Agent pool
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
# self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
# charset='utf8mb4')
# self.__cursor_proxy = self.__cnx_proxy.cursor()
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
#11数据库
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self):
try:
self.cursor.close()
self.cnx.close()
except :
pass
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 时间戳 3:1690179526555 精确到秒
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
# 获取流水号
def getNextSeq(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
        # pick a random proxy from the whole list (avoids an index error when fewer than 4 proxies exist)
        return random.choice(proxy_list)
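    # Usage sketch (hedged, not from the original file): the dict above uses upper-case scheme keys,
    # which requests matches case-sensitively against the lower-case URL scheme, so lower-case
    # 'http'/'https' keys may be needed for the proxy to actually take effect:
    #   proxy = baseCore.get_proxy()
    #   resp = requests.get(url, headers=headers, proxies=proxy, verify=False, timeout=10)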
#字符串截取
def getSubStr(self,str,beginStr,endStr):
if beginStr=='':
pass
else:
begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
if endStr=='':
pass
else:
end=str.rfind(endStr)
if end==-1:
pass
else:
str = str[0:end+1]
return str
# 繁体字转简体字
def hant_2_hans(self,hant_str: str):
'''
Function: 将 hant_str 由繁体转化为简体
'''
return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字
def str_have_num(self,str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self,key):
item = self.r.lpop(key)
return item.decode() if item else None
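    # Usage sketch: the keys mirror the task lists pushed by the scheduler scripts, e.g.
    #   social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
    #   # returns None when the list is empty, so the worker can sleep or exit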
# 获得脚本进程PID
def getPID(self):
PID = os.getpid()
return PID
# 获取本机IP
def getIP(self):
IP = socket.gethostbyname(socket.gethostname())
return IP
def mkPath(self,path):
folder = os.path.exists(path)
if not folder: # 判断是否存在文件夹如果不存在则创建为文件夹
os.makedirs(path) # makedirs 创建文件时如果路径不存在会创建这个路径
else:
pass
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
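    # Usage sketch (the chromedriver path is illustrative, not part of this repo):
    #   driver = baseCore.buildDriver(r'D:\chrome\chromedriver.exe', headless=True)
    #   driver.get('https://www.example.com')
    #   html = driver.page_source
    #   driver.quit()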
# 根据社会信用代码获取企业信息
def getInfomation(self, social_code):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql,values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
#获取企查查token
def GetToken(self):
#获取企查查token
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
#获取天眼查token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
#检测语言
    def detect_language(self, text):
        # 使用langid.py判断文本的语言
        # langid.classify returns a (language, score) tuple; fall back to 'cn' for empty input
        if not text:
            return 'cn'
        result = langid.classify(text)
        return result[0]
#追加接入excel
def writerToExcel(self,detailList,filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent, version-safe call
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
# return combined_data
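    # Note (assumption about typical usage): pd.read_excel above requires the target xlsx to exist,
    # so callers create it once before the first append, e.g.
    #   from openpyxl import Workbook
    #   Workbook().save(filename)
    #   baseCore.writerToExcel(detailList, filename)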
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self,key,item):
self.r.rpush(key, item)
#增加计数器的值并返回增加后的值
def incrSet(self,key):
# 增加计数器的值并返回增加后的值
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
#获取key剩余的过期时间
def getttl(self,key):
# 获取key的剩余过期时间
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# 判断key是否已过期
if ttl < 0:
# key已过期,将key的值重置为0
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
def secrchATT(self,item_id,year,type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,year,type_id)
id = selects[0]
return id
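# Usage sketch of a typical collection worker built on BaseCore (the Redis key and the run-count
# column name are illustrative; only the method signatures above are taken from this file):
#   baseCore = BaseCore()
#   log = baseCore.getLogger()
#   social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
#   start = time.time()
#   data = baseCore.getInfomation(social_code)
#   ... fetch pages with baseCore.getRandomUserAgent() / baseCore.get_proxy() ...
#   baseCore.updateRun(social_code, 'NewsRunCount', count)
#   baseCore.recordLog(social_code, taskType, state, baseCore.getTimeCost(start, time.time()), url, '')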
...@@ -7,7 +7,7 @@ import requests
 import urllib3
 from pymysql.converters import escape_string
-from base.BaseCore import BaseCore
+from BaseCore import BaseCore
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
...@@ -50,8 +50,9 @@ def get_file_name(headers):
 def downFile(url,path):
     try:
         baseCore.mkPath(path)
-        proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
-        response = requests.get(url, proxies=proxy, headers=headers, verify=False,timeout=10)
+        # proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
+        response = requests.get(url, headers=headers, verify=False, timeout=10)
+        # response = requests.get(url, proxies=proxy, headers=headers, verify=False,timeout=10)
         fileName = get_file_name(response.headers)
         with open(os.path.join(path, fileName), "wb") as pyFile:
             for chunk in response.iter_content(chunk_size=1024):
......
import os
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from pymysql.converters import escape_string
import downPdf
from BaseCore import BaseCore
from datetime import datetime
baseCore = BaseCore()
log =baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
cnx = baseCore.cnx
cursor = baseCore.cursor
def job_2():
log.info('----开始采集---俄罗斯国家杂志----')
# path = 'D:chrome/chromedriver.exe'
# driverContent = baseCore.buildDriver(path, headless=False)
for i in range(1,139):
if i == 1:
url = 'https://www.whitehouse.gov/?s=Russia'
else:
url = f'https://www.whitehouse.gov/?s=Russia&paged={i}'
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        # find() needs the tag name as its first argument; the article list sits in the column container div
        container = soup.find('div', class_='col-md-10 col-lg-6')
web_list = container.find_all('article')
for web in web_list:
title = web.find('h2',class_='entry-title')
newsTitle = title.find('a').text
newsUrl = title.find('a')['href']
date_string = web.find('div',class_='entry-meta shared-meta').find('time').text
#时间格式转化
date_object = datetime.strptime(date_string, "%B %d, %Y")
pub_time = date_object.strftime("%Y-%m-%d")
print(pub_time)
            pdf_name = web.find('div', class_='infoindocumentlist').find_all('div')[0].find('span', class_='info-data').text
            #下载pdf
            # pdfUrl / pdftitle were previously undefined; the list-page link and title are assumed here
            pdfUrl = newsUrl
            pdftitle = newsTitle
            path = r'D:\美国VS俄罗斯制裁'
            path = os.path.join(path, downPdf.getPath(newsTitle))
            # downFile is expected to come from the companion download helper; it is not imported in this file
            downFile(pdfUrl, path, pdf_name)
            selectCountSql = f"select * from usvsrussia where url = '{pdfUrl}' "
            cursor.execute(selectCountSql)
            exist = cursor.fetchone()
            if exist:
                log.info("已采集,跳过")
                continue
            else:
                pass
            insertSql = f"insert into usvsrussia (website,url,title,pub_time,state,pdf_name,pdf_path,create_time) values ('总统令文件','{pdfUrl}','{escape_string(pdftitle)}','{pub_time}',0,'{pdf_name}','{path}',now() )"
            # log.info(insertSql)
            cursor.execute(insertSql)
            cnx.commit()
break
\ No newline at end of file
import pandas as pd
import requests
import urllib3
from pyquery import PyQuery as pq
urllib3.disable_warnings()
def getHtml(url):
try:
# proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
response = requests.get(url,verify=False,timeout=10)
html=response.text
except Exception as e:
html=''
return html
def getList():
for i in range(17,139):
print(f'============开始采集第{i}页=========')
url=f'https://www.whitehouse.gov/?s=Russia&paged={i}'
html=getHtml(url)
if html:
pass
else:
for nnn in range(0, 3):
html = getHtml(url)
if html:
break
else:
continue
try:
doc=pq(html)
except:
return
ac=doc('div[class="col-md-10 col-lg-6"]>article')
for ii in range(0,len(ac)):
doc=pq(ac[ii])
title=doc('h2[class="entry-title"]>a').text()
url=doc('h2[class="entry-title"]>a').attr('href')
summary=doc('div[class="post-content"]').text()
try:
time=doc('time').attr('datetime').split('T')[0]
except:
time = ''
type=doc('span[class="tax-links cat-links"]>a').text()
detail={
'title':title,
'url':url,
'time':time,
'type':type,
'summary':summary,
}
getDetail(detail)
print(f'=============第{i}页===第{ii}条采集完成==========')
def getDetail(detail):
detailList=[]
url=detail['url']
html=getHtml(url)
if html:
pass
else:
for nnn in range(0,3):
html = getHtml(url)
if html:
break
else:
continue
try:
doc=pq(html)
except:
return
content=doc('section[class="body-content"]').text()
detail['content'] = content
detailList.append(detail)
writerToExcel(detailList)
return content
# 将数据追加到excel
def writerToExcel(detailList):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent, version-safe call
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
from openpyxl import Workbook
if __name__ == '__main__':
# # 创建一个工作簿
filename='./cis.xlsx'
workbook = Workbook()
workbook.save(filename)
getList()
\ No newline at end of file