提交 b05ef000 作者: 刘伟刚

Merge remote-tracking branch 'origin/master'

...@@ -241,7 +241,6 @@ class BaseCore: ...@@ -241,7 +241,6 @@ class BaseCore:
except : except :
pass pass
# 计算耗时 # 计算耗时
def getTimeCost(self,start, end): def getTimeCost(self,start, end):
seconds = int(end - start) seconds = int(end - start)
...@@ -393,7 +392,7 @@ class BaseCore: ...@@ -393,7 +392,7 @@ class BaseCore:
# 从Redis的List中获取并移除一个元素 # 从Redis的List中获取并移除一个元素
def redicPullData(self,key): def redicPullData(self,key):
item = self.r.lpop(key) item = self.r.rpop(key)
return item.decode() if item else None return item.decode() if item else None
# 获得脚本进程PID # 获得脚本进程PID
...@@ -494,4 +493,27 @@ class BaseCore: ...@@ -494,4 +493,27 @@ class BaseCore:
def rePutIntoR(self,item): def rePutIntoR(self,item):
self.r.rpush('NewsEnterprise:gwqy_socialCode', item) self.r.rpush('NewsEnterprise:gwqy_socialCode', item)
#增加计数器的值并返回增加后的值
def incrSet(self,key):
# 增加计数器的值并返回增加后的值
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
#获取key剩余的过期时间
def getttl(self,key):
# 获取key的剩余过期时间
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# 判断key是否已过期
if ttl < 0:
# key已过期,将key的值重置为0
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
...@@ -72,7 +72,7 @@ def NewsEnterprise_task(): ...@@ -72,7 +72,7 @@ def NewsEnterprise_task():
#企业公告 #企业公告
def NoticeEnterprise(): def NoticeEnterprise():
# 获取国内企业 # 获取国内企业
gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 1 " gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 10 "
cursor.execute(gn_query) cursor.execute(gn_query)
gn_result = cursor.fetchall() gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result] gn_social_list = [item[0] for item in gn_result]
...@@ -95,13 +95,14 @@ def NoticeEnterprise_task(): ...@@ -95,13 +95,14 @@ def NoticeEnterprise_task():
#企业年报 #企业年报
def AnnualEnterprise(): def AnnualEnterprise():
# 获取国内企业 # 获取国内企业
gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null" gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null limit 10"
cursor.execute(gn_query) cursor.execute(gn_query)
gn_result = cursor.fetchall() gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result] gn_social_list = [item[0] for item in gn_result]
print('=======') print('=======')
for item in gn_social_list: for item in gn_social_list:
r.rpush('AnnualEnterprise:gnqy_socialCode', item) r.rpush('AnnualEnterprise:gnqy_socialCode', item)
#企业年报定时任务 #企业年报定时任务
def AnnualEnterprise_task(): def AnnualEnterprise_task():
# 实例化一个调度器 # 实例化一个调度器
...@@ -118,7 +119,7 @@ def AnnualEnterprise_task(): ...@@ -118,7 +119,7 @@ def AnnualEnterprise_task():
#企业基本信息 #企业基本信息
def BaseInfoEnterprise(): def BaseInfoEnterprise():
# 获取国内企业 # 获取国内企业
gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 1 " gn_query = "select SocialCode from EnterpriseInfo where Place = '1' limit 10 "
cursor.execute(gn_query) cursor.execute(gn_query)
gn_result = cursor.fetchall() gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result] gn_social_list = [item[0] for item in gn_result]
...@@ -162,35 +163,66 @@ def weixin_task(): ...@@ -162,35 +163,66 @@ def weixin_task():
print('定时采集异常', e) print('定时采集异常', e)
pass pass
# 企业年报——雪球网
def AnnualEnterpriseIPO():
# 获取国内上市企业
gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null and isIPO = 1 limit 10"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
print('=======')
for item in gn_social_list:
r.rpush('AnnualEnterprise:gnshqy_socialCode', item)
#国外企业基本信息
def BaseInfoEnterpriseAbroad():
# 获取国外企业
gn_query = "select id from EnterpriseInfo where Place = '2' limit 10 "
cursor.execute(gn_query)
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
print('=======')
for item in gn_social_list:
r.rpush('BaseInfoEnterprise:gwqy_socialCode', item)
##福布斯=====从数据库中读取信息放入redis ##福布斯=====从数据库中读取信息放入redis
def FBS(): def FBS():
# todo:调整为获取福布斯的数据库 # todo:调整为获取福布斯的数据库
gw_query = "select SocialCode from EnterpriseInfo where Place = '2'" # gw_query = "select id from EnterpriseInfo where ext1='fbs2000' and ext2='1' and Place=2"
cursor.execute(gw_query) # cursor.execute(gw_query)
gw_result = cursor.fetchall() # gw_result = cursor.fetchall()
# #获取国内企业 #获取国内企业
gn_query = "select SocialCode from EnterpriseInfo where Place = '1'" gn_query = "select id from EnterpriseInfo where ext1='fbs2000' and ext2='1' and Place=1"
cursor.execute(gn_query) cursor.execute(gn_query)
gn_result = cursor.fetchall() gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result] gn_social_list = [item[0] for item in gn_result]
gw_social_list = [item[0] for item in gw_result] # gw_social_list = [item[0] for item in gw_result]
#
for item in gw_social_list: # for item in gw_social_list:
r.rpush('NewsEnterprise:gwqy_socialCode', item) # r.rpush('NewsEnterpriseFbs:gwqy_socialCode', item)
for item in gn_social_list: for item in gn_social_list:
r.rpush('NewsEnterprise:gnqy_socialCode', item) if not r.exists(item):
r.rpush('NewsEnterpriseFbs:gnqy_socialCode', item)
if __name__ == "__main__": if __name__ == "__main__":
start = time.time() start = time.time()
# NoticeEnterprise()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterpriseAbroad()
# NewsEnterprise_task() # NewsEnterprise_task()
# NewsEnterprise() # NewsEnterprise()
FBS() # BaseInfoEnterprise()
# FBS()
# NoticeEnterprise_task() # NoticeEnterprise_task()
# AnnualEnterprise_task() # AnnualEnterprise_task()
NoticeEnterprise()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===') log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
# cnx.close() # cnx.close()
# cursor.close() # cursor.close()
......
# connect timeout in seconds
# default value is 30s
connect_timeout=300
# network timeout in seconds
# default value is 30s
network_timeout=600
# the base path to store log files
#base_path=/home/tarena/django-project/cc_shop1/cc_shop1/logs
# tracker_server can ocur more than once, and tracker_server format is
# "host:port", host can be hostname or ip address
tracker_server=114.115.215.96:22122
#standard log level as syslog, case insensitive, value list:
### emerg for emergency
### alert
### crit for critical
### error
### warn for warning
### notice
### info
### debug
log_level=info
# if use connection pool
# default value is false
# since V4.05
use_connection_pool = false
# connections whose the idle time exceeds this time will be closed
# unit: second
# default value is 3600
# since V4.05
connection_pool_max_idle_time = 3600
# if load FastDFS parameters from tracker server
# since V4.05
# default value is false
load_fdfs_parameters_from_tracker=false
# if use storage ID instead of IP address
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# default value is false
# since V4.05
use_storage_id = false
# specify storage ids filename, can use relative or absolute path
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# since V4.05
storage_ids_filename = storage_ids.conf
#HTTP settings
http.tracker_server_port=80
#use "#include" directive to include HTTP other settiongs
##include http.conf
\ No newline at end of file
# __init__.py
__version__ = '2.2.0'
VERSION = tuple(map(int, __version__.split('.')))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: connection.py
import socket
import os
import sys
import time
import random
from itertools import chain
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# start class Connection
class Connection(object):
'''Manage TCP comunication to and from Fastdfs Server.'''
def __init__(self, **conn_kwargs):
self.pid = os.getpid()
self.host_tuple = conn_kwargs['host_tuple']
self.remote_port = conn_kwargs['port']
self.remote_addr = None
self.timeout = conn_kwargs['timeout']
self._sock = None
def __del__(self):
try:
self.disconnect()
except:
pass
def connect(self):
'''Connect to fdfs server.'''
if self._sock:
return
try:
sock = self._connect()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = sock
# print '[+] Create a connection success.'
# print '\tLocal address is %s:%s.' % self._sock.getsockname()
# print '\tRemote address is %s:%s' % (self.remote_addr, self.remote_port)
def _connect(self):
'''Create TCP socket. The host is random one of host_tuple.'''
self.remote_addr = random.choice(self.host_tuple)
# print '[+] Connecting... remote: %s:%s' % (self.remote_addr, self.remote_port)
# sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# sock.settimeout(self.timeout)
sock = socket.create_connection((self.remote_addr, self.remote_port), self.timeout)
return sock
def disconnect(self):
'''Disconnect from fdfs server.'''
if self._sock is None:
return
try:
self._sock.close()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = None
def get_sock(self):
return self._sock
def _errormessage(self, exception):
# args for socket.error can either be (errno, "message")
# or just "message" '''
if len(exception.args) == 1:
return "[-] Error: connect to %s:%s. %s." % (self.remote_addr, self.remote_port, exception.args[0])
else:
return "[-] Error: %s connect to %s:%s. %s." % \
(exception.args[0], self.remote_addr, self.remote_port, exception.args[1])
# end class Connection
# start ConnectionPool
class ConnectionPool(object):
'''Generic Connection Pool'''
def __init__(self, name='', conn_class=Connection,
max_conn=None, **conn_kwargs):
self.pool_name = name
self.pid = os.getpid()
self.conn_class = conn_class
self.max_conn = max_conn or 2 ** 31
self.conn_kwargs = conn_kwargs
self._conns_created = 0
self._conns_available = []
self._conns_inuse = set()
# print '[+] Create a connection pool success, name: %s.' % self.pool_name
def _check_pid(self):
if self.pid != os.getpid():
self.destroy()
self.__init__(self.conn_class, self.max_conn, **self.conn_kwargs)
def make_conn(self):
'''Create a new connection.'''
if self._conns_created >= self.max_conn:
raise ConnectionError('[-] Error: Too many connections.')
num_try = 10
while True:
try:
if num_try <= 0:
sys.exit()
conn_instance = self.conn_class(**self.conn_kwargs)
conn_instance.connect()
self._conns_created += 1
break
except ConnectionError as e:
print(e)
num_try -= 1
conn_instance = None
return conn_instance
def get_connection(self):
'''Get a connection from pool.'''
self._check_pid()
try:
conn = self._conns_available.pop()
# print '[+] Get a connection from pool %s.' % self.pool_name
# print '\tLocal address is %s:%s.' % conn._sock.getsockname()
# print '\tRemote address is %s:%s' % (conn.remote_addr, conn.remote_port)
except IndexError:
conn = self.make_conn()
self._conns_inuse.add(conn)
return conn
def remove(self, conn):
'''Remove connection from pool.'''
if conn in self._conns_inuse:
self._conns_inuse.remove(conn)
self._conns_created -= 1
if conn in self._conns_available:
self._conns_available.remove(conn)
self._conns_created -= 1
def destroy(self):
'''Disconnect all connections in the pool.'''
all_conns = chain(self._conns_inuse, self._conns_available)
for conn in all_conns:
conn.disconnect()
# print '[-] Destroy connection pool %s.' % self.pool_name
def release(self, conn):
'''Release the connection back to the pool.'''
self._check_pid()
if conn.pid == self.pid:
self._conns_inuse.remove(conn)
self._conns_available.append(conn)
# print '[-] Release connection back to pool %s.' % self.pool_name
# end ConnectionPool class
def tcp_recv_response(conn, bytes_size, buffer_size=4096):
'''Receive response from server.
It is not include tracker header.
arguments:
@conn: connection
@bytes_size: int, will be received byte_stream size
@buffer_size: int, receive buffer size
@Return: tuple,(response, received_size)
'''
recv_buff = []
total_size = 0
try:
while bytes_size > 0:
resp = conn._sock.recv(buffer_size)
recv_buff.append(resp)
total_size += len(resp)
bytes_size -= len(resp)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: (%s)' % e.args)
return (b''.join(recv_buff), total_size)
def tcp_send_data(conn, bytes_stream):
'''Send buffer to server.
It is not include tracker header.
arguments:
@conn: connection
@bytes_stream: trasmit buffer
@Return bool
'''
try:
conn._sock.sendall(bytes_stream)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while writting to socket: (%s)' % e.args)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: exceptions.py
'''Core exceptions raised by fdfs client'''
class FDFSError(Exception):
pass
class ConnectionError(FDFSError):
pass
class ResponseError(FDFSError):
pass
class InvaildResponse(FDFSError):
pass
class DataError(FDFSError):
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_protol.py
import struct
import socket
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# define FDFS protol constans
TRACKER_PROTO_CMD_STORAGE_JOIN = 81
FDFS_PROTO_CMD_QUIT = 82
TRACKER_PROTO_CMD_STORAGE_BEAT = 83 # storage heart beat
TRACKER_PROTO_CMD_STORAGE_REPORT_DISK_USAGE = 84 # report disk usage
TRACKER_PROTO_CMD_STORAGE_REPLICA_CHG = 85 # repl new storage servers
TRACKER_PROTO_CMD_STORAGE_SYNC_SRC_REQ = 86 # src storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_REQ = 87 # dest storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_NOTIFY = 88 # sync done notify
TRACKER_PROTO_CMD_STORAGE_SYNC_REPORT = 89 # report src last synced time as dest server
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_QUERY = 79 # dest storage query sync src storage server
TRACKER_PROTO_CMD_STORAGE_REPORT_IP_CHANGED = 78 # storage server report it's ip changed
TRACKER_PROTO_CMD_STORAGE_CHANGELOG_REQ = 77 # storage server request storage server's changelog
TRACKER_PROTO_CMD_STORAGE_REPORT_STATUS = 76 # report specified storage server status
TRACKER_PROTO_CMD_STORAGE_PARAMETER_REQ = 75 # storage server request parameters
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FREE = 74 # storage report trunk free space
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FID = 73 # storage report current trunk file id
TRACKER_PROTO_CMD_STORAGE_FETCH_TRUNK_FID = 72 # storage get current trunk file id
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START = 61 # start of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END = 62 # end of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE = 63 # tracker get a system data file
TRACKER_PROTO_CMD_TRACKER_GET_STATUS = 64 # tracker get status of other tracker
TRACKER_PROTO_CMD_TRACKER_PING_LEADER = 65 # tracker ping leader
TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER = 66 # notify next leader to other trackers
TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER = 67 # commit next leader to other trackers
TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP = 90
TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS = 91
TRACKER_PROTO_CMD_SERVER_LIST_STORAGE = 92
TRACKER_PROTO_CMD_SERVER_DELETE_STORAGE = 93
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE = 101
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE = 102
TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE = 103
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE = 104
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ALL = 105
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ALL = 106
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ALL = 107
TRACKER_PROTO_CMD_RESP = 100
FDFS_PROTO_CMD_ACTIVE_TEST = 111 # active test, tracker and storage both support since V1.28
STORAGE_PROTO_CMD_REPORT_CLIENT_IP = 9 # ip as tracker client
STORAGE_PROTO_CMD_UPLOAD_FILE = 11
STORAGE_PROTO_CMD_DELETE_FILE = 12
STORAGE_PROTO_CMD_SET_METADATA = 13
STORAGE_PROTO_CMD_DOWNLOAD_FILE = 14
STORAGE_PROTO_CMD_GET_METADATA = 15
STORAGE_PROTO_CMD_SYNC_CREATE_FILE = 16
STORAGE_PROTO_CMD_SYNC_DELETE_FILE = 17
STORAGE_PROTO_CMD_SYNC_UPDATE_FILE = 18
STORAGE_PROTO_CMD_SYNC_CREATE_LINK = 19
STORAGE_PROTO_CMD_CREATE_LINK = 20
STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE = 21
STORAGE_PROTO_CMD_QUERY_FILE_INFO = 22
STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE = 23 # create appender file
STORAGE_PROTO_CMD_APPEND_FILE = 24 # append file
STORAGE_PROTO_CMD_SYNC_APPEND_FILE = 25
STORAGE_PROTO_CMD_FETCH_ONE_PATH_BINLOG = 26 # fetch binlog of one store path
STORAGE_PROTO_CMD_RESP = TRACKER_PROTO_CMD_RESP
STORAGE_PROTO_CMD_UPLOAD_MASTER_FILE = STORAGE_PROTO_CMD_UPLOAD_FILE
STORAGE_PROTO_CMD_TRUNK_ALLOC_SPACE = 27 # since V3.00
STORAGE_PROTO_CMD_TRUNK_ALLOC_CONFIRM = 28 # since V3.00
STORAGE_PROTO_CMD_TRUNK_FREE_SPACE = 29 # since V3.00
STORAGE_PROTO_CMD_TRUNK_SYNC_BINLOG = 30 # since V3.00
STORAGE_PROTO_CMD_TRUNK_GET_BINLOG_SIZE = 31 # since V3.07
STORAGE_PROTO_CMD_TRUNK_DELETE_BINLOG_MARKS = 32 # since V3.07
STORAGE_PROTO_CMD_TRUNK_TRUNCATE_BINLOG_FILE = 33 # since V3.07
STORAGE_PROTO_CMD_MODIFY_FILE = 34 # since V3.08
STORAGE_PROTO_CMD_SYNC_MODIFY_FILE = 35 # since V3.08
STORAGE_PROTO_CMD_TRUNCATE_FILE = 36 # since V3.08
STORAGE_PROTO_CMD_SYNC_TRUNCATE_FILE = 37 # since V3.08
# for overwrite all old metadata
STORAGE_SET_METADATA_FLAG_OVERWRITE = 'O'
STORAGE_SET_METADATA_FLAG_OVERWRITE_STR = "O"
# for replace, insert when the meta item not exist, otherwise update it
STORAGE_SET_METADATA_FLAG_MERGE = 'M'
STORAGE_SET_METADATA_FLAG_MERGE_STR = "M"
FDFS_RECORD_SEPERATOR = '\x01'
FDFS_FIELD_SEPERATOR = '\x02'
# common constants
FDFS_GROUP_NAME_MAX_LEN = 16
IP_ADDRESS_SIZE = 16
FDFS_PROTO_PKG_LEN_SIZE = 8
FDFS_PROTO_CMD_SIZE = 1
FDFS_PROTO_STATUS_SIZE = 1
FDFS_PROTO_IP_PORT_SIZE = (IP_ADDRESS_SIZE + 6)
FDFS_MAX_SERVERS_EACH_GROUP = 32
FDFS_MAX_GROUPS = 512
FDFS_MAX_TRACKERS = 16
FDFS_DOMAIN_NAME_MAX_LEN = 128
FDFS_MAX_META_NAME_LEN = 64
FDFS_MAX_META_VALUE_LEN = 256
FDFS_FILE_PREFIX_MAX_LEN = 16
FDFS_LOGIC_FILE_PATH_LEN = 10
FDFS_TRUE_FILE_PATH_LEN = 6
FDFS_FILENAME_BASE64_LENGTH = 27
FDFS_TRUNK_FILE_INFO_LEN = 16
FDFS_FILE_EXT_NAME_MAX_LEN = 6
FDFS_SPACE_SIZE_BASE_INDEX = 2 # storage space size based (MB)
FDFS_UPLOAD_BY_BUFFER = 1
FDFS_UPLOAD_BY_FILENAME = 2
FDFS_UPLOAD_BY_FILE = 3
FDFS_DOWNLOAD_TO_BUFFER = 1
FDFS_DOWNLOAD_TO_FILE = 2
FDFS_NORMAL_LOGIC_FILENAME_LENGTH = (
FDFS_LOGIC_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_FILE_EXT_NAME_MAX_LEN + 1)
FDFS_TRUNK_FILENAME_LENGTH = (
FDFS_TRUE_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_TRUNK_FILE_INFO_LEN + 1 + FDFS_FILE_EXT_NAME_MAX_LEN)
FDFS_TRUNK_LOGIC_FILENAME_LENGTH = (FDFS_TRUNK_FILENAME_LENGTH + (FDFS_LOGIC_FILE_PATH_LEN - FDFS_TRUE_FILE_PATH_LEN))
FDFS_VERSION_SIZE = 6
TRACKER_QUERY_STORAGE_FETCH_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE)
TRACKER_QUERY_STORAGE_STORE_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE + 1)
# status code, order is important!
FDFS_STORAGE_STATUS_INIT = 0
FDFS_STORAGE_STATUS_WAIT_SYNC = 1
FDFS_STORAGE_STATUS_SYNCING = 2
FDFS_STORAGE_STATUS_IP_CHANGED = 3
FDFS_STORAGE_STATUS_DELETED = 4
FDFS_STORAGE_STATUS_OFFLINE = 5
FDFS_STORAGE_STATUS_ONLINE = 6
FDFS_STORAGE_STATUS_ACTIVE = 7
FDFS_STORAGE_STATUS_RECOVERY = 9
FDFS_STORAGE_STATUS_NONE = 99
class Storage_server(object):
'''Class storage server for upload.'''
def __init__(self):
self.ip_addr = None
self.port = None
self.group_name = ''
self.store_path_index = 0
# Class tracker_header
class Tracker_header(object):
'''
Class for Pack or Unpack tracker header
struct tracker_header{
char pkg_len[FDFS_PROTO_PKG_LEN_SIZE],
char cmd,
char status,
}
'''
def __init__(self):
self.fmt = '!QBB' # pkg_len[FDFS_PROTO_PKG_LEN_SIZE] + cmd + status
self.st = struct.Struct(self.fmt)
self.pkg_len = 0
self.cmd = 0
self.status = 0
def _pack(self, pkg_len=0, cmd=0, status=0):
return self.st.pack(pkg_len, cmd, status)
def _unpack(self, bytes_stream):
self.pkg_len, self.cmd, self.status = self.st.unpack(bytes_stream)
return True
def header_len(self):
return self.st.size
def send_header(self, conn):
'''Send Tracker header to server.'''
header = self._pack(self.pkg_len, self.cmd, self.status)
try:
conn._sock.sendall(header)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while writting to socket: %s' % (e.args,))
def recv_header(self, conn):
'''Receive response from server.
if sucess, class member (pkg_len, cmd, status) is response.
'''
try:
header = conn._sock.recv(self.header_len())
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: %s' % (e.args,))
self._unpack(header)
def fdfs_pack_metadata(meta_dict):
ret = ''
for key in meta_dict:
ret += '%s%c%s%c' % (key, FDFS_FIELD_SEPERATOR, meta_dict[key], FDFS_RECORD_SEPERATOR)
return ret[0:-1]
def fdfs_unpack_metadata(bytes_stream):
li = bytes_stream.split(FDFS_RECORD_SEPERATOR)
return dict([item.split(FDFS_FIELD_SEPERATOR) for item in li])
#!/usr/bin/env python
# -*- coding = utf-8 -*-
# filename: utils.py
import io
import os
import sys
import stat
import platform
import configparser
SUFFIX = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def appromix(size, base=0):
'''Conver bytes stream size to human-readable format.
Keyword arguments:
size: int, bytes stream size
base: int, suffix index
Return: string
'''
multiples = 1024
if size < 0:
raise ValueError('[-] Error: number must be non-negative.')
if size < multiples:
return '{0:d}{1}'.format(size, SUFFIX[base])
for suffix in SUFFIX[base:]:
if size < multiples:
return '{0:.2f}{1}'.format(size, suffix)
size = size / float(multiples)
raise ValueError('[-] Error: number too big.')
def get_file_ext_name(filename, double_ext=True):
li = filename.split(os.extsep)
if len(li) <= 1:
return ''
else:
if li[-1].find(__os_sep__) != -1:
return ''
if double_ext:
if len(li) > 2:
if li[-2].find(__os_sep__) == -1:
return '%s.%s' % (li[-2], li[-1])
return li[-1]
class Fdfs_ConfigParser(configparser.RawConfigParser):
"""
Extends ConfigParser to allow files without sections.
This is done by wrapping read files and prepending them with a placeholder
section, which defaults to '__config__'
"""
def __init__(self, default_section=None, *args, **kwargs):
configparser.RawConfigParser.__init__(self, *args, **kwargs)
self._default_section = None
self.set_default_section(default_section or '__config__')
def get_default_section(self):
return self._default_section
def set_default_section(self, section):
self.add_section(section)
# move all values from the previous default section to the new one
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
except configparser.NoSectionError:
pass
else:
for (key, value) in default_section_items:
self.set(section, key, value)
self._default_section = section
def read(self, filenames):
if isinstance(filenames, str):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
with open(filename) as fp:
self.readfp(fp)
except IOError:
continue
else:
read_ok.append(filename)
return read_ok
def readfp(self, fp, *args, **kwargs):
stream = io.StringIO()
try:
stream.name = fp.name
except AttributeError:
pass
stream.write('[' + self._default_section + ']\n')
stream.write(fp.read())
stream.seek(0, 0)
return self._read(stream, stream.name)
def write(self, fp):
# Write the items from the default section manually and then remove them
# from the data. They'll be re-added later.
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
for (key, value) in default_section_items:
fp.write("{0} = {1}\n".format(key, value))
fp.write("\n")
except configparser.NoSectionError:
pass
configparser.RawConfigParser.write(self, fp)
self.add_section(self._default_section)
for (key, value) in default_section_items:
self.set(self._default_section, key, value)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
The sections in setup file contains a title line at the top,
indicated by a name in square brackets (`[]'), plus key/value
options lines, indicated by `name: value' format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' or line[0] in '#;':
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname] = "%s\n%s" % (cursect[optname], value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
elif sectname == DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
raise MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self.OPTCRE.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos - 1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
optname = self.optionxform(optname.rstrip())
if optname in cursect:
if not isinstance(cursect[optname], list):
cursect[optname] = [cursect[optname]]
cursect[optname].append(optval)
else:
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
e = ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
def split_remote_fileid(remote_file_id):
'''
Splite remote_file_id to (group_name, remote_file_name)
arguments:
@remote_file_id: string
@return tuple, (group_name, remote_file_name)
'''
index = remote_file_id.find(b'/')
if -1 == index:
return None
return (remote_file_id[0:index], remote_file_id[(index + 1):])
def fdfs_check_file(filename):
ret = True
errmsg = ''
if not os.path.isfile(filename):
ret = False
errmsg = '[-] Error: %s is not a file.' % filename
elif not stat.S_ISREG(os.stat(filename).st_mode):
ret = False
errmsg = '[-] Error: %s is not a regular file.' % filename
return (ret, errmsg)
if __name__ == '__main__':
print(get_file_ext_name('/bc.tar.gz'))
import requests, re, time, pymysql
import requests, re, time, pymysql
from bs4 import BeautifulSoup as bs
from fdfs_client.client import get_tracker_conf, Fdfs_client
from base import BaseCore
baseCore = BaseCore.BaseCore()
requests.adapters.DEFAULT_RETRIES = 3
log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业年报/雪球网'
def tableUpdate(year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time):
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s'''
cursor.execute(sel_sql, (item_id, year))
selects = cursor.fetchone()
if selects:
print(f'{com_name},{year}已存在')
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by,
create_time)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
print("更新完成:{}".format(Upsql))
def getContent(social_code, com_name, code,start_time):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}
comp = re.compile('-?[1-9]\d*')
num = 1
ip = baseCore.get_proxy()
url_1 = f'https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{code}/page_type/ndbg.phtml'
res_1 = requests.get(url_1, proxies=ip)
soup = bs(res_1.content, 'html.parser',from_encoding='gb2312')
# 获取年度报告列表
try:
list_all = soup.find('div', {'class': 'datelist'}).find_all('a')
except:
log.info(f'{social_code}.........年度报告列表为空')
exception = '年度报告列表为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
return
# 获取年报详细信息
for href in list_all:
ip = baseCore.get_proxy()
year_url = 'https://vip.stock.finance.sina.com.cn' + href.get('href')
year_name = href.text
res_2 = requests.get(year_url, proxies=ip)
soup_2 = bs(res_2.content, 'html.parser',from_encoding='gb2312')
try:
pdf_url = soup_2.find('th', {'style': 'text-align:center'}).find('a').get('href')
except:
log.error(f'{social_code}....{year_url}....无下载链接')
exception = '无下载链接'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
continue
for i in range(0, 3):
try:
resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
try:
year = comp.findall(year_name)[0]
except:
continue
name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
result = ''
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
break
except Exception as e:
log.error(f'{social_code}...年报上传服务器出错:{e}')
time.sleep(3)
continue
if result == '':
exception = '上传服务器失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
type_id = '1'
item_id = social_code
group_name = 'group1'
path = bytes.decode(result['Remote file_id']).replace('group1', '')
full_path = bytes.decode(result['Remote file_id'])
category = 'pdf'
file_size = result['Uploaded size']
order_by = num
status = 1
create_by = 'XueLingKun'
create_time = time_now
try:
tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size,
order_by, status, create_by, create_time)
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '')
except:
exception = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
def begin():
while True:
start_time = time.time()
# 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
count = dic_info[15]
code = dic_info[3]
com_name = dic_info[4]
if code is None:
exeception = '股票代码为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
continue
getContent(social_code, com_name, code,start_time)
count += 1
runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count)
if __name__ == '__main__':
begin()
cursor.close()
cnx.close()
baseCore.close()
...@@ -331,6 +331,12 @@ if __name__ == '__main__': ...@@ -331,6 +331,12 @@ if __name__ == '__main__':
start_time = time.time() start_time = time.time()
# 获取企业信息 # 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode') social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '': if social_code == '':
time.sleep(20) time.sleep(20)
continue continue
......
""" """
...@@ -256,8 +256,18 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获 ...@@ -256,8 +256,18 @@ def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获
soup = RequestUrl(url, payload, social_code, start_time) soup = RequestUrl(url, payload, social_code, start_time)
if soup == '': if soup == '':
return return
# 先获取页数 # 判断查找内容是否存在
try:
is_exist = soup.find('div',class_='con').text
if is_exist == '没有查询到数据':
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, '')
return
except:
pass
# 先获取页数
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
total = re.findall(r'\d+', page)[0] total = re.findall(r'\d+', page)[0]
......
...@@ -14,9 +14,9 @@ from base.smart import smart_extractor ...@@ -14,9 +14,9 @@ from base.smart import smart_extractor
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
jieba.cut("必须加载jieba")
# 初始化,设置中文分词 # 初始化,设置中文分词
jieba.cut("必须加载jieba")
smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
...@@ -154,8 +154,8 @@ def beinWork(tyc_code, social_code,start_time): ...@@ -154,8 +154,8 @@ def beinWork(tyc_code, social_code,start_time):
time_format = baseCore.getNowTime(1) time_format = baseCore.getNowTime(1)
try: try:
# 开始进行智能解析 # 开始进行智能解析
lang = baseCore.detect_language(title) # lang = baseCore.detect_language(title)
smart = smart_extractor.SmartExtractor(lang) # smart = smart_extractor.SmartExtractor(lang)
contentText = smart.extract_by_url(link).text contentText = smart.extract_by_url(link).text
# time.sleep(3) # time.sleep(3)
except Exception as e: except Exception as e:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论