Commit 88707bfa  Author: 薛凌堃

9/18

Parent d27ed8e5
......@@ -55,9 +55,9 @@ def closeSql(cnx,cursor):
def NewsEnterprise():
cnx,cursor = connectSql()
# # Fetch domestic enterprises
gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
# gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
# cursor.execute(gn_query)
# gn_result = cursor.fetchall()
# Fetch overseas enterprises
gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
cursor.execute(gw_query)
......@@ -66,11 +66,11 @@ def NewsEnterprise():
gw_social_list = [item[0] for item in gw_result]
#todo: print the list length
# print(len(gw_social_list))
gn_social_list = [item[0] for item in gn_result]
# gn_social_list = [item[0] for item in gn_result]
print('=======')
# Push the codes into redis
for item in gn_social_list:
r.rpush('NewsEnterprise:gnqy_socialCode', item)
# for item in gn_social_list:
# r.rpush('NewsEnterprise:gnqy_socialCode', item)
for item in gw_social_list:
r.rpush('NewsEnterprise:gwqy_socialCode', item)
......@@ -275,7 +275,7 @@ def AnnualEnterpriseXueQ_task():
def AnnualEnterpriseUS():
cnx,cursor = connectSql()
# Fetch US-listed enterprises
us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null"
us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'"
# us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
#ZZSN22080900000025
cursor.execute(us_query)
......@@ -387,6 +387,44 @@ def yahooCode_task():
print('定时采集异常', e)
pass
# NEEQ (新三板) enterprises
def NQEnterprise():
cnx, cursor = connectSql()
nq_query = "select gpdm from NQEnterprise where QCCID is not null "
cursor.execute(nq_query)
nq_result = cursor.fetchall()
nq_social_list = [item[0] for item in nq_result]
for item in nq_social_list:
# r.rpush('NQEnterprise:nq_Ipo', item)
r.rpush('NQEnterprise:nq_finance',item)
# r.rpush('NQEnterprise:nq_notice',item)
closeSql(cnx, cursor)
def omeng():
cnx, cursor = connectSql()
# om_query = " SELECT A.SocialCode FROM EnterpriseInfo A , EnterpriseType B WHERE B.TYPE=6 AND A.SocialCode=B.SocialCode AND A.Place=1 order by A.id desc "
om_query = " SELECT A.SocialCode FROM EnterpriseInfo A , EnterpriseType B WHERE B.TYPE=6 AND A.SocialCode=B.SocialCode AND A.Place=2 "
cursor.execute(om_query)
om_result = cursor.fetchall()
om_social_list = [item[0] for item in om_result]
for item in om_social_list:
# Basic company information
r.rpush('gwOMEnterprise_socialcode:BaseInfo', item)
# Company executives
# r.rpush('gnOMEnterprise_socialcode:TYCid', item)
# r.rpush('gnOMEnterprise_socialcode:CenterPerson', item)
# Company news
r.rpush('gwOMEnterprise_socialcode:News', item)
# Annual reports
# r.rpush('gnOMEnterprise_socialcode:Report', item)
# Announcements
# r.rpush('gnOMEnterprise_socialcode:Notice', item)
closeSql(cnx, cursor)
if __name__ == "__main__":
start = time.time()
......@@ -399,7 +437,9 @@ if __name__ == "__main__":
# BaseInfoEnterprise()
# FBS()
# MengZhi()
AnnualEnterpriseUS()
NQEnterprise()
# omeng()
# AnnualEnterpriseUS()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
# NoticeEnterprise()
......
......@@ -16,7 +16,7 @@ from fdfs_client.storage_client import *
from fdfs_client.exceptions import *
def get_tracker_conf(conf_path='client.conf'):
def get_tracker_conf(conf_path='../../client.conf'):
cf = Fdfs_ConfigParser()
tracker = {}
try:
......
# -*- coding: utf-8 -*-
import redis
import time
from urllib.parse import quote
import pymongo
import requests
from bson.objectid import ObjectId
import json
from pyquery import PyQuery as pq
import urllib3
import hashlib
from kafka import KafkaProducer
import pandas as pd
import zhconv
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# Get the Qichacha (企查查) company id by enterprise name or credit code
def find_id_by_name(name):
urllib3.disable_warnings()
qcc_key = name
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
for lll in range(1, 6):
try:
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
break
except:
print('重试')
time.sleep(5)
continue
time.sleep(2)
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
KeyNo = ''
else:
KeyNo = ''
print("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo
# Check whether a string contains any digit
def str_have_num(str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# Get the company's official website by Qichacha id
def info_by_id(com_id,com_name):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
try:
result_dict = resp_dict['result']['Company']
except:
print(com_name + ":获取失败")
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
print(com_name + ":爬取完成")
return WebSite
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
# TODO: the token has to be re-captured and updated roughly every two hours
token = '1dcc61d85177733298e5827653706f1a'  # re-capture and update roughly every two hours
start = time.time()
list_weicha = []
# File of enterprises to be collected
filename = 'data/内蒙古市属国有企业_官网.xlsx'
df_all = pd.read_excel('data/内蒙古市属国有企业.xls',dtype=str)
list_all_info = []
for num_df in range(162,len(df_all)):
# Unified social credit code
id_code = str(df_all['本企业代码'][num_df])
# Enterprise name
com_name = str(df_all['企业名称'][num_df])
# Row number
line = str(df_all['行次'][num_df])
dic_com = {
'line': line,
'social_code': id_code,
'com_name': com_name,
'website':''
}
company_id = find_id_by_name(id_code)
if company_id == "":
print(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
continue
WebSite = info_by_id(company_id,com_name)
dic_com['website'] = WebSite
log.info(f'---{num_df}-------{com_name}----------耗时{baseCore.getTimeCost(start,time.time())}')
list_all_info.append(dic_com)
baseCore.writerToExcel(list_all_info,filename)
# -*- coding: utf-8 -*-
import redis
import time
from urllib.parse import quote
import pymongo
import requests
from bson.objectid import ObjectId
import json
from pyquery import PyQuery as pq
import urllib3
import hashlib
from kafka import KafkaProducer
import pandas as pd
import zhconv
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# Get the Qichacha (企查查) company id by enterprise name or credit code
def find_id_by_name(name):
urllib3.disable_warnings()
qcc_key = name
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
for lll in range(1, 6):
try:
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
break
except:
print('重试')
time.sleep(5)
continue
time.sleep(2)
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
KeyNo = ''
else:
KeyNo = ''
print("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo
# Check whether a string contains any digit
def str_have_num(str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# Get the company's official website by Qichacha id
def info_by_id(com_id,com_name):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
try:
result_dict = resp_dict['result']['Company']
except:
print(com_name + ":获取失败")
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
print(com_name + ":爬取完成")
return WebSite
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
# TODO: the token has to be re-captured and updated roughly every two hours
token = '06de5b73cb9d6f9fcae27edaf94749cf'  # re-capture and update roughly every two hours
start = time.time()
list_weicha = []
# File of enterprises to be collected
filename = 'data/钢铁企业_官网.xlsx'
df_all = pd.read_excel('data/钢铁企业.xlsx',dtype=str)
list_all_info = []
for num_df in range(len(df_all)):
# Unified social credit code
id_code = str(df_all['信用代码'][num_df])
# Enterprise name
com_name = str(df_all['企业名称'][num_df])
# Row number
# line = str(df_all['行次'][num_df])
dic_com = {
# 'line': line,
'social_code': id_code,
'com_name': com_name,
'website':''
}
company_id = find_id_by_name(id_code)
if company_id == "":
print(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
continue
WebSite = info_by_id(company_id,com_name)
dic_com['website'] = WebSite
log.info(f'---{num_df}-------{com_name}----------耗时{baseCore.getTimeCost(start,time.time())}')
list_all_info.append(dic_com)
baseCore.writerToExcel(list_all_info,filename)
......@@ -9,7 +9,10 @@ import json
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from getQccId import find_id_by_name
from base.BaseCore import BaseCore
baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
baseCore = BaseCore()
cnx = baseCore.cnx
......@@ -310,7 +313,7 @@ def info_by_id(com_id,com_name):
return aa_dict_list
if __name__ == '__main__':
taskType = '基本信息/企查查'
taskType = '基本信息/企查查/福布斯'
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
......@@ -325,75 +328,64 @@ if __name__ == '__main__':
# Pull tasks from redis
while True:
start = time.time()
# TODO: the token has to be re-captured roughly every two hours; it is read from the database
token = baseCore.GetToken()
list_weicha = []
list_all_info = []
name_list = []
start_time = time.time()
# Fetch enterprise info
query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state1=1 limit 1 "
# 兴业银行 (Industrial Bank)
# query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
updateBeginSql = f"update Tfbs set state1=0,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
company_id = find_id_by_name(start,token,social_code)
if company_id == False:
# the token has expired
time.sleep(10)
updateBeginSql = f"update Tfbs set state1=1,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
social_code = baseCore.redicPullData('BaseInfoEnterpriseFbs:gnqy_social_code')
# social_code = '91110000710924945A'
if social_code is None:
time.sleep(20)
continue
if company_id == "":
log.info(com_name + ":企业ID获取失败")
log.info(f'----当前企业{social_code}-----')
dic_info = baseCore.getInfomation(social_code)
#
count = dic_info[13]
com_name = dic_info[1]
social_code = dic_info[2]
# Qichacha id
company_id = dic_info[12]
# If there is no credit code, search by company name; otherwise search by the credit code
if company_id == None:
if social_code:
company_id = find_id_by_name(start_time, token, social_code)
else:
company_id = find_id_by_name(start_time, token, com_name)
# todo: write back to the database
updateSql = f"update EnterpriseInfo set QCCID = '{company_id}' where SocialCode = '{social_code}'"
cursor_.execute(updateSql)
cnx_.commit()
post_data_list = info_by_id(company_id, com_name)
if company_id == "":
print(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
# state 400 means the enterprise update failed
updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
else:
post_data_list = info_by_id(company_id,social_code)
log.info(f'====={social_code}===={company_id}=====获取企业id成功=====')
try:
post_data_list = info_by_id(company_id, com_name)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gnqy_social_code', social_code)
continue
for post_data in post_data_list:
list_all_info.append(post_data)
if post_data is None:
log.info(com_name + ":企业信息获取失败")
print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
# state 400 means the enterprise update failed
updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
name_compile = {
'yuan_name':com_name,
'get_name':get_name
'yuan_name': com_name,
'get_name': get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
......@@ -404,16 +396,17 @@ if __name__ == '__main__':
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# state 200 means success
updateBeginSql = f"update Tfbs set state1=200,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
# break
# After collection finishes, update this enterprise's collection count
runType = 'BaseInfoRunCount'
count += 1
baseCore.updateRun(social_code, runType, count)
nowtime = baseCore.getNowTime(1).replace('-', '_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx', index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx', index=False)
baseCore.close()
......
import redis
# Connect to Redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# List name
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# Fetch all elements of the list
elements = r.lrange(list_name, 0, -1)
# Walk over the distinct elements
for element in set(elements):
# Count how often the element appears in the snapshot
count = elements.count(element)
# If it appears more than once, remove the extra copies and keep one
if count > 1:
r.lrem(list_name, count - 1, element)
# Print the de-duplicated list
print(r.lrange(list_name, 0, -1))
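# An alternative, order-preserving sketch, kept commented out; it assumes it is
# acceptable to rewrite the whole list in one pipeline instead of lrem per element:
# seen = set()
# unique = [e for e in elements if not (e in seen or seen.add(e))]
# pipe = r.pipeline()
# pipe.delete(list_name)
# if unique:
#     pipe.rpush(list_name, *unique)
# pipe.execute()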
......@@ -300,7 +300,7 @@ if __name__ == '__main__':
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
social_code = 'ZZSN22080900000025'
social_code = 'ZZSN230912210643024'
if not social_code:
time.sleep(20)
continue
......
# import redis
#
#
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
#
# # 获取所有键
# keys = r.keys('*')
# # print(keys)
# for key in keys:
# f_key = key.decode()
# print(f_key)
# print("----------")
# res = r.exists(f_key)
# value = list(r.smembers(f_key))
# # 对列表进行排序
# value.sort()
# # 遍历排序后的列表
# list_data = []
# for member in value:
# member = member.decode()
# members = member.strip('[').strip(']').replace('\'','').strip().split(',')
# #获取每一个报告期
# for date in members:
# data = date.strip()
# # print(date.strip())
# list_data.append(data)
# # 放入redis
# for item in list_data:
# r.sadd(key, item)
#
# # 获取Set中的所有元素
# items = r.smembers(key)
# # print(items)
# print("======================================")
import datetime
timestamp = 1688054400  # example timestamp
date = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
print(date)
import redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
# 获取所有键
# keys = r.keys('*')
# # print(keys)
# for key in keys:
# f_key = key.decode()
# print(f_key)
# print("----------")
# r.srem(f_key,'[]')
# r.srem(f_key,'')
def check_date(com_code,info_date):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
res = r.sismember('com_caiwushuju_code::'+com_code, info_date)  # note: the dates are stored as a redis set
if res:
return True
else:
return False
def add_date():
date_list = ['2023-06-30']
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
# Iterate over date_list and push each date into redis
for date in date_list:
res = r.sadd('com_caiwushuju_code::'+'123456',date)
# check_date('sh601398','2023-03-31')
add_date()
#com_caiwushuju_code::bj430418
# ['2018-12-31', '2018-09-30', '2018-06-30', '2018-03-31']
import redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
f_key = 'com_caiwushuju_code::bj851834'
r.set(f_key,3)
res = r.exists(f_key)
print(res)
# list = ['2023-03-31', '2022-12-31', '2022-09-30', '2022-06-30', '2022-03-31', '2021-12-31', '2021-09-30', '2021-06-30', '2021-03-31', '2020-12-31', '2020-09-30', '2020-06-30', '2020-03-31', '2019-12-31', '2019-09-30', '2019-06-30', '2019-03-31']
#
# for a in list:
# r.sadd(f_key,a)
# from base.BaseCore import BaseCore
# baseCore = BaseCore()
# from datetime import datetime,timedelta
# timeNow = baseCore.getNowTime(1)[:10]
# # list_date = []
# #
# # list_date.append(timeNow)
# # print(timeNow)
# # print(list_date)
#
# current_date = datetime.now()
# # 计算昨天的日期
# yesterday = current_date - timedelta(days=1)
# # 格式化昨天的日期
# report_date = yesterday.strftime('%Y-%m-%d')
# # list_date.append(report_date)
# year = int(current_date.strftime('%Y'))
# print(year)
# print(type(year))
\ No newline at end of file
......@@ -327,13 +327,14 @@ if __name__ == '__main__':
#从redis里拿数据
while True:
# TODO: the token has to be re-captured roughly every two hours; it is read from the database
token = '027ea02da6d901a724ecca47930379b4'
token = 'b4eb43143abdcf395f1335f322ca29e5'
list_weicha = []
list_all_info = []
name_list = []
start_time = time.time()
# Fetch enterprise info
com_code = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
# com_code = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
com_code = '873349'
if '.NQ' in com_code:
com_code1 = com_code
else:
......@@ -358,7 +359,7 @@ if __name__ == '__main__':
post_data_list = info_by_id(company_id, '',com_code1)
except:
log.info(f'====={com_code}=====获取基本信息失败,重新放入redis=====')
baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_social_code', com_code)
baseCore.rePutIntoR('EnterpriseIpo:nq_gpdm', com_code)
continue
if post_data_list:
pass
......
......@@ -7,7 +7,7 @@ import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib3
from base.BaseCore import BaseCore
from BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# from gpdm import Gpdm
baseCore = BaseCore()
......@@ -20,15 +20,17 @@ log = baseCore.getLogger()
error_list = []
list_all_info = []
# Requires the stock code and the enterprise credit code
flag = 0
while True:
# Read enterprises from the table
flag = 0
com_code1 = baseCore.redicPullData('EnterpriseIpo:nq_gpdm')
if com_code1 is None:
time.sleep(20)
if flag==1:
com_code = baseCore.redicPullData('NQEnterprise:nq_Ipo')
# com_code = baseCore.redicPullData('NQEnterprise:nq_Ipo_test')
if com_code is None:
if flag==0:
time.sleep(20)
log.info('已没有数据----------等待')
continue
elif flag==0:
elif flag==1:
# Save the data to the database through the API
for num in range(0, len(list_all_info), 100):
......@@ -42,15 +44,18 @@ while True:
print(e)
print("{}:到:{}".format(num, num + 100))
print(response.text)
log.info('-----------数据发送接口完毕----------')
flag = 0
continue
break
com_code = '838616'
short_name = ''
social_code = ''
# Look up the remaining fields from the database
log.info(f'========正在采集{com_code}===========')
data = baseCore.getInfomation(com_code)
social_code = data[1]
short_name = data[3]
start = time.time()
log.info(f'======开始采集{com_code}======')
url = f'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/833658.html'
url = f'https://xinsanban.eastmoney.com/F10/CompanyInfo/Introduction/{com_code}.html'
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -95,12 +100,9 @@ while True:
"eastIndustry": industry,
"csrcIndustry": ''
}
print(dic_cwsj)
# print(dic_cwsj)
list_all_info.append(dic_cwsj)
log.info(f'======{com_code}====采集成功=====')
flag = 1
break
......@@ -10,7 +10,7 @@ from bs4 import BeautifulSoup
import datetime
from selenium import webdriver
from base.BaseCore import BaseCore
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
......@@ -84,7 +84,7 @@ def getdetail(reportInfodata,name_map,listinfo,url_name):
listinfo.append(dic_info)
return listinfo
def getinfo(com_code):
def getinfo(com_code,social_code):
dic_info = {}
for nnn in range(0, 3):
try:
......@@ -127,10 +127,11 @@ def getinfo(com_code):
break
except:
time.sleep(1)
log.info(f'======正在采集:{report_date}=======')
log.info(f'======正在采集:{com_code}---{report_date}=======')
# Income statement
list_Lrb = getdetail(reportLrbdata,lrb_name_map,listLrb,lrb_name)
print(list_Lrb)
log.info(f'利润表数据:{len(list_Lrb)}个')
# print(list_Lrb)
# Balance sheet
reportZcfzbdata = b_infoData[i]
list_Zcfzb = getdetail(reportZcfzbdata,zcfzb_name_map,listZcfzb,zcfzb_name)
......@@ -148,27 +149,30 @@ def getinfo(com_code):
"cash": list_Xjllb,
"ynFirst": ynFirst,
}
print(dic_info)
# print(dic_info)
# End of one reporting period
for nnn in range(0, 3):
try:
add_date(com_code, report_date)
break
except:
time.sleep(1)
log.info(f'----{com_code}--{report_date}----结束')
# if dic_info:
# # 调凯歌接口存储数据
# data = json.dumps(dic_info)
# # print(data)
# url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
# for nnn in range(0, 3):
# try:
# res_baocun = requests.post(url_baocun, data=data)
# break
# except:
# time.sleep(1)
# print(res_baocun.text)
if dic_info:
# Call the 凯歌 API to store the data
data = json.dumps(dic_info)
# print(data)
url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
for nnn in range(0, 3):
try:
res_baocun = requests.post(url_baocun, data=data)
break
except:
time.sleep(1)
print(res_baocun.text)
log.info('------------数据发送接口完毕------------')
for nnn in range(0, 3):
try:
add_date(com_code, report_date)
break
except:
time.sleep(1)
else:
log.error(f'---{com_code}--{report_date}--')
return dic_info
if __name__ == '__main__':
......@@ -178,9 +182,27 @@ if __name__ == '__main__':
browser = webdriver.Chrome(chromedriver)
except Exception as e:
print(e)
social_code = ''
headers = {
'authority': 'stock.xueqiu.com',
'method': 'GET',
'path': '/v5/stock/finance/cn/income.json?symbol=NQ873286&type=all&is_detail=true&count=5&timestamp=1694414063178',
'scheme': 'https',
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'device_id=84ced64554d8060750b1528dc22a3696; s=bt110kz0n8; cookiesu=661693188384462; u=661693188384462; Hm_lvt_1db88642e346389874251b5a1eded6e3=1693188388; xq_a_token=29bdb37dee2432c294425cc9e8f45710a62643a5; xqat=29bdb37dee2432c294425cc9e8f45710a62643a5; xq_r_token=3a35db27fcf5471898becda7aa5dab6afeafe471; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTY5NjgxMTc5NCwiY3RtIjoxNjk0NDEzNTQ2ODU4LCJjaWQiOiJkOWQwbjRBWnVwIn0.Xxu329nQq4bMtwKFJWlScnUUSWrky4T5SWkMum46c2G8la2z4g0d4nyvsO08WP-7moMffId6P3bGWuELULkbv6EHvIZgqge9-fAD4-hmLOjeRh96NsoGfyTAQK7tbnt9LhKz1fDg6SUi8loMqYgM7l-4g-ZM4B6zrZ5hKWdQJFLy0-V8Wzx7HTFYZSX3FNSsbgGqHlW4vykIpsRaNeOOX1M6LYdt6BhbAi1Iv4TflB08LIdu6F1n4dTRbmPq1KCndb2LsLR2HrJZmqmHJB9WMzwlVcIGdz778_CutNrwuWgJbWtb-s3dSESzO0WWw1uIIGZUvRl1D0KSl0P_GQLw9w; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1694414056',
'Origin': 'https://xueqiu.com',
'Pragma': 'no-cache',
'Referer': 'https://xueqiu.com/snowman/S/NQ873286/detail',
'Sec-Ch-Ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# Chinese-to-English field name mapping
lrb_name_map = {
'营业总收入':'total_revenue',
......@@ -347,32 +369,39 @@ if __name__ == '__main__':
'加:期初现金及现金等价物余额':'final_balance_of_cce',
'期末现金及现金等价物余额':'final_balance_of_cce'
}
table_type = ['income','balance']
flag = 0
while True:
# com_code = baseCore.redicPullData('NQEnterprise:nq_finance')
com_code = baseCore.redicPullData('NQEnterprise:nq_finance_test')
if com_code is None:
if flag==0:
log.info('已没有数据----------等待')
time.sleep(20)
continue
elif flag==1:
log.info('=============调用对比指标接口======')
time.sleep(5400)
url_ = 'http://114.115.236.206:8088/sync/calculateIndex?type=1'
for nnn in range(0, 3):
try:
res_ = requests.get(url_)
break
except:
time.sleep(1)
print(res_.text)
log.info('-----------数据触发对比指标接口完毕----------')
flag = 0
continue
log.info(f'========正在采集{com_code}===========')
data = baseCore.getInfomation(com_code)
social_code = data[1]
short_name = data[3]
start = time.time()
com_code = 'NQ' + com_code
dic_info = getinfo(com_code,social_code)
flag =1
com_code = 'NQ' + '873286'
headers = {
'authority': 'stock.xueqiu.com',
'method': 'GET',
'path': '/v5/stock/finance/cn/income.json?symbol=NQ873286&type=all&is_detail=true&count=5&timestamp=1694414063178',
'scheme': 'https',
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'device_id=84ced64554d8060750b1528dc22a3696; s=bt110kz0n8; cookiesu=661693188384462; u=661693188384462; Hm_lvt_1db88642e346389874251b5a1eded6e3=1693188388; xq_a_token=29bdb37dee2432c294425cc9e8f45710a62643a5; xqat=29bdb37dee2432c294425cc9e8f45710a62643a5; xq_r_token=3a35db27fcf5471898becda7aa5dab6afeafe471; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTY5NjgxMTc5NCwiY3RtIjoxNjk0NDEzNTQ2ODU4LCJjaWQiOiJkOWQwbjRBWnVwIn0.Xxu329nQq4bMtwKFJWlScnUUSWrky4T5SWkMum46c2G8la2z4g0d4nyvsO08WP-7moMffId6P3bGWuELULkbv6EHvIZgqge9-fAD4-hmLOjeRh96NsoGfyTAQK7tbnt9LhKz1fDg6SUi8loMqYgM7l-4g-ZM4B6zrZ5hKWdQJFLy0-V8Wzx7HTFYZSX3FNSsbgGqHlW4vykIpsRaNeOOX1M6LYdt6BhbAi1Iv4TflB08LIdu6F1n4dTRbmPq1KCndb2LsLR2HrJZmqmHJB9WMzwlVcIGdz778_CutNrwuWgJbWtb-s3dSESzO0WWw1uIIGZUvRl1D0KSl0P_GQLw9w; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1694414056',
'Origin': 'https://xueqiu.com',
'Pragma': 'no-cache',
'Referer': 'https://xueqiu.com/snowman/S/NQ873286/detail',
'Sec-Ch-Ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
dic_info = getinfo(com_code)
......
import json
import time
import requests
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
from urllib import parse
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'qgqp_b_id=92f470109c2462c6c6aa5115d15f7b35; emshistory=%5B%22sz007sz%22%5D; qRecords=%5B%7B%22name%22%3A%22%u5929%u98CE%u8BC1%u5238%22%2C%22code%22%3A%22SH601162%22%7D%5D; HAList=ty-1-601766-%u4E2D%u56FD%u4E2D%u8F66%2Cty-116-03690-%u7F8E%u56E2-W%2Cty-0-002828-%u8D1D%u80AF%u80FD%u6E90%2Cty-1-601162-%u5929%u98CE%u8BC1%u5238%2Cty-0-000001-%u5E73%u5B89%u94F6%u884C%2Cty-0-002070-%u4F17%u548C%u9000%2Cty-1-600723-%u9996%u5546%u80A1%u4EFD%2Cty-0-300106-%u897F%u90E8%u7267%u4E1A%2Cty-116-00992-%u8054%u60F3%u96C6%u56E2%2Cty-0-300362-%u5929%u7FD4%u9000; st_si=06778540617554; st_pvi=44810095342512; st_sp=2023-07-18%2013%3A55%3A09; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=1; st_psi=20230901152305423-113300304201-4533354410; st_asi=delete',
'Host': 'data.eastmoney.com',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
file_name = '研报--产业链.xlsx'
for page in range(100,101):
log.info(f'----开始采集第{page}页-------')
param = {"uid":"",
"keyword":"产业链",
"type":["researchReport"],
"client":"web",
"clientVersion":"curr",
"clientType":"web",
"param":{"researchReport":{"client":"web","pageSize":10,"pageIndex":page}}
}
param_url = parse.quote(str(param).replace(" ", ""))
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
res = requests.get(url).text[1:-1]
res_json = json.loads(res)
list_all = res_json['result']['researchReport']
# print(list_all)
if list_all:
pass
else:
continue
num = 1
for one_news in list_all:
log.info(f'---------开始采集第{num}条---------')
dataList = []
com_name = one_news['stockName']
title = str(one_news['title']).replace('<em>','').replace('</em>','')
date = one_news['date'][:10]
code = one_news['code']
href = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={code}'
# print(date,href)
#newsContent
req = requests.get(href,headers=headers,verify=False,timeout=30)
soup = BeautifulSoup(req.content,'html.parser')
content = soup.find('div',class_='newsContent')
try:
pdf_url = soup.find('div',class_='report-infos').find_all('span')[4].find('a')['href']
except:
log.info(f'-----{href}-----')
continue
#.find_all('span')[-1].find('a')['href']
log.info(pdf_url)
dic_info = {
'公司名称':com_name,
'标题':title,
'正文':content.text.strip(),
'附件链接':pdf_url,
'发布时间':date,
'contentwithTag':content.prettify()
}
# print(dic_info)
dataList.append(dic_info)
baseCore.writerToExcel(dataList,file_name)
num+=1
# break
# connect timeout in seconds
# default value is 30s
connect_timeout=300
# network timeout in seconds
# default value is 30s
network_timeout=600
# the base path to store log files
#base_path=/home/tarena/django-project/cc_shop1/cc_shop1/logs
# tracker_server can ocur more than once, and tracker_server format is
# "host:port", host can be hostname or ip address
tracker_server=114.115.215.96:22122
#standard log level as syslog, case insensitive, value list:
### emerg for emergency
### alert
### crit for critical
### error
### warn for warning
### notice
### info
### debug
log_level=info
# if use connection pool
# default value is false
# since V4.05
use_connection_pool = false
# connections whose the idle time exceeds this time will be closed
# unit: second
# default value is 3600
# since V4.05
connection_pool_max_idle_time = 3600
# if load FastDFS parameters from tracker server
# since V4.05
# default value is false
load_fdfs_parameters_from_tracker=false
# if use storage ID instead of IP address
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# default value is false
# since V4.05
use_storage_id = false
# specify storage ids filename, can use relative or absolute path
# same as tracker.conf
# valid only when load_fdfs_parameters_from_tracker is false
# since V4.05
storage_ids_filename = storage_ids.conf
#HTTP settings
http.tracker_server_port=80
#use "#include" directive to include HTTP other settings
##include http.conf
\ No newline at end of file
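# A minimal sketch of reading this section-less client.conf with the bundled
# Fdfs_ConfigParser shown later in this commit; the import path
# fdfs_client.utils is an assumption based on the "filename: utils.py" header.
from fdfs_client.utils import Fdfs_ConfigParser

cf = Fdfs_ConfigParser()
cf.read('client.conf')
# Values land in the implicit '__config__' section the parser prepends.
print(cf.get('__config__', 'tracker_server'))       # 114.115.215.96:22122
print(cf.getint('__config__', 'connect_timeout'))   # 300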
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: exceptions.py
'''Core exceptions raised by fdfs client'''
class FDFSError(Exception):
pass
class ConnectionError(FDFSError):
pass
class ResponseError(FDFSError):
pass
class InvaildResponse(FDFSError):
pass
class DataError(FDFSError):
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_protol.py
import struct
import socket
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# define FDFS protol constans
TRACKER_PROTO_CMD_STORAGE_JOIN = 81
FDFS_PROTO_CMD_QUIT = 82
TRACKER_PROTO_CMD_STORAGE_BEAT = 83 # storage heart beat
TRACKER_PROTO_CMD_STORAGE_REPORT_DISK_USAGE = 84 # report disk usage
TRACKER_PROTO_CMD_STORAGE_REPLICA_CHG = 85 # repl new storage servers
TRACKER_PROTO_CMD_STORAGE_SYNC_SRC_REQ = 86 # src storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_REQ = 87 # dest storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_NOTIFY = 88 # sync done notify
TRACKER_PROTO_CMD_STORAGE_SYNC_REPORT = 89 # report src last synced time as dest server
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_QUERY = 79 # dest storage query sync src storage server
TRACKER_PROTO_CMD_STORAGE_REPORT_IP_CHANGED = 78 # storage server report it's ip changed
TRACKER_PROTO_CMD_STORAGE_CHANGELOG_REQ = 77 # storage server request storage server's changelog
TRACKER_PROTO_CMD_STORAGE_REPORT_STATUS = 76 # report specified storage server status
TRACKER_PROTO_CMD_STORAGE_PARAMETER_REQ = 75 # storage server request parameters
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FREE = 74 # storage report trunk free space
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FID = 73 # storage report current trunk file id
TRACKER_PROTO_CMD_STORAGE_FETCH_TRUNK_FID = 72 # storage get current trunk file id
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START = 61 # start of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END = 62 # end of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE = 63 # tracker get a system data file
TRACKER_PROTO_CMD_TRACKER_GET_STATUS = 64 # tracker get status of other tracker
TRACKER_PROTO_CMD_TRACKER_PING_LEADER = 65 # tracker ping leader
TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER = 66 # notify next leader to other trackers
TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER = 67 # commit next leader to other trackers
TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP = 90
TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS = 91
TRACKER_PROTO_CMD_SERVER_LIST_STORAGE = 92
TRACKER_PROTO_CMD_SERVER_DELETE_STORAGE = 93
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE = 101
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE = 102
TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE = 103
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE = 104
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ALL = 105
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ALL = 106
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ALL = 107
TRACKER_PROTO_CMD_RESP = 100
FDFS_PROTO_CMD_ACTIVE_TEST = 111 # active test, tracker and storage both support since V1.28
STORAGE_PROTO_CMD_REPORT_CLIENT_IP = 9 # ip as tracker client
STORAGE_PROTO_CMD_UPLOAD_FILE = 11
STORAGE_PROTO_CMD_DELETE_FILE = 12
STORAGE_PROTO_CMD_SET_METADATA = 13
STORAGE_PROTO_CMD_DOWNLOAD_FILE = 14
STORAGE_PROTO_CMD_GET_METADATA = 15
STORAGE_PROTO_CMD_SYNC_CREATE_FILE = 16
STORAGE_PROTO_CMD_SYNC_DELETE_FILE = 17
STORAGE_PROTO_CMD_SYNC_UPDATE_FILE = 18
STORAGE_PROTO_CMD_SYNC_CREATE_LINK = 19
STORAGE_PROTO_CMD_CREATE_LINK = 20
STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE = 21
STORAGE_PROTO_CMD_QUERY_FILE_INFO = 22
STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE = 23 # create appender file
STORAGE_PROTO_CMD_APPEND_FILE = 24 # append file
STORAGE_PROTO_CMD_SYNC_APPEND_FILE = 25
STORAGE_PROTO_CMD_FETCH_ONE_PATH_BINLOG = 26 # fetch binlog of one store path
STORAGE_PROTO_CMD_RESP = TRACKER_PROTO_CMD_RESP
STORAGE_PROTO_CMD_UPLOAD_MASTER_FILE = STORAGE_PROTO_CMD_UPLOAD_FILE
STORAGE_PROTO_CMD_TRUNK_ALLOC_SPACE = 27 # since V3.00
STORAGE_PROTO_CMD_TRUNK_ALLOC_CONFIRM = 28 # since V3.00
STORAGE_PROTO_CMD_TRUNK_FREE_SPACE = 29 # since V3.00
STORAGE_PROTO_CMD_TRUNK_SYNC_BINLOG = 30 # since V3.00
STORAGE_PROTO_CMD_TRUNK_GET_BINLOG_SIZE = 31 # since V3.07
STORAGE_PROTO_CMD_TRUNK_DELETE_BINLOG_MARKS = 32 # since V3.07
STORAGE_PROTO_CMD_TRUNK_TRUNCATE_BINLOG_FILE = 33 # since V3.07
STORAGE_PROTO_CMD_MODIFY_FILE = 34 # since V3.08
STORAGE_PROTO_CMD_SYNC_MODIFY_FILE = 35 # since V3.08
STORAGE_PROTO_CMD_TRUNCATE_FILE = 36 # since V3.08
STORAGE_PROTO_CMD_SYNC_TRUNCATE_FILE = 37 # since V3.08
# for overwrite all old metadata
STORAGE_SET_METADATA_FLAG_OVERWRITE = 'O'
STORAGE_SET_METADATA_FLAG_OVERWRITE_STR = "O"
# for replace, insert when the meta item not exist, otherwise update it
STORAGE_SET_METADATA_FLAG_MERGE = 'M'
STORAGE_SET_METADATA_FLAG_MERGE_STR = "M"
FDFS_RECORD_SEPERATOR = '\x01'
FDFS_FIELD_SEPERATOR = '\x02'
# common constants
FDFS_GROUP_NAME_MAX_LEN = 16
IP_ADDRESS_SIZE = 16
FDFS_PROTO_PKG_LEN_SIZE = 8
FDFS_PROTO_CMD_SIZE = 1
FDFS_PROTO_STATUS_SIZE = 1
FDFS_PROTO_IP_PORT_SIZE = (IP_ADDRESS_SIZE + 6)
FDFS_MAX_SERVERS_EACH_GROUP = 32
FDFS_MAX_GROUPS = 512
FDFS_MAX_TRACKERS = 16
FDFS_DOMAIN_NAME_MAX_LEN = 128
FDFS_MAX_META_NAME_LEN = 64
FDFS_MAX_META_VALUE_LEN = 256
FDFS_FILE_PREFIX_MAX_LEN = 16
FDFS_LOGIC_FILE_PATH_LEN = 10
FDFS_TRUE_FILE_PATH_LEN = 6
FDFS_FILENAME_BASE64_LENGTH = 27
FDFS_TRUNK_FILE_INFO_LEN = 16
FDFS_FILE_EXT_NAME_MAX_LEN = 6
FDFS_SPACE_SIZE_BASE_INDEX = 2 # storage space size based (MB)
FDFS_UPLOAD_BY_BUFFER = 1
FDFS_UPLOAD_BY_FILENAME = 2
FDFS_UPLOAD_BY_FILE = 3
FDFS_DOWNLOAD_TO_BUFFER = 1
FDFS_DOWNLOAD_TO_FILE = 2
FDFS_NORMAL_LOGIC_FILENAME_LENGTH = (
FDFS_LOGIC_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_FILE_EXT_NAME_MAX_LEN + 1)
FDFS_TRUNK_FILENAME_LENGTH = (
FDFS_TRUE_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_TRUNK_FILE_INFO_LEN + 1 + FDFS_FILE_EXT_NAME_MAX_LEN)
FDFS_TRUNK_LOGIC_FILENAME_LENGTH = (FDFS_TRUNK_FILENAME_LENGTH + (FDFS_LOGIC_FILE_PATH_LEN - FDFS_TRUE_FILE_PATH_LEN))
FDFS_VERSION_SIZE = 6
TRACKER_QUERY_STORAGE_FETCH_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE)
TRACKER_QUERY_STORAGE_STORE_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE + 1)
# status code, order is important!
FDFS_STORAGE_STATUS_INIT = 0
FDFS_STORAGE_STATUS_WAIT_SYNC = 1
FDFS_STORAGE_STATUS_SYNCING = 2
FDFS_STORAGE_STATUS_IP_CHANGED = 3
FDFS_STORAGE_STATUS_DELETED = 4
FDFS_STORAGE_STATUS_OFFLINE = 5
FDFS_STORAGE_STATUS_ONLINE = 6
FDFS_STORAGE_STATUS_ACTIVE = 7
FDFS_STORAGE_STATUS_RECOVERY = 9
FDFS_STORAGE_STATUS_NONE = 99
class Storage_server(object):
'''Class storage server for upload.'''
def __init__(self):
self.ip_addr = None
self.port = None
self.group_name = ''
self.store_path_index = 0
# Class tracker_header
class Tracker_header(object):
'''
Class for Pack or Unpack tracker header
struct tracker_header{
char pkg_len[FDFS_PROTO_PKG_LEN_SIZE],
char cmd,
char status,
}
'''
def __init__(self):
self.fmt = '!QBB' # pkg_len[FDFS_PROTO_PKG_LEN_SIZE] + cmd + status
self.st = struct.Struct(self.fmt)
self.pkg_len = 0
self.cmd = 0
self.status = 0
def _pack(self, pkg_len=0, cmd=0, status=0):
return self.st.pack(pkg_len, cmd, status)
def _unpack(self, bytes_stream):
self.pkg_len, self.cmd, self.status = self.st.unpack(bytes_stream)
return True
def header_len(self):
return self.st.size
def send_header(self, conn):
'''Send Tracker header to server.'''
header = self._pack(self.pkg_len, self.cmd, self.status)
try:
conn._sock.sendall(header)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while writting to socket: %s' % (e.args,))
def recv_header(self, conn):
'''Receive response from server.
If successful, the members (pkg_len, cmd, status) hold the response.
'''
try:
header = conn._sock.recv(self.header_len())
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: %s' % (e.args,))
self._unpack(header)
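# Illustration of the header layout documented in the class docstring above
# (a sketch, not part of the library): 8-byte package length + 1-byte cmd +
# 1-byte status, packed big-endian with '!QBB'.
#   >>> import struct
#   >>> buf = struct.pack('!QBB', 1024, STORAGE_PROTO_CMD_UPLOAD_FILE, 0)
#   >>> len(buf)
#   10
#   >>> struct.unpack('!QBB', buf)
#   (1024, 11, 0)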
def fdfs_pack_metadata(meta_dict):
ret = ''
for key in meta_dict:
ret += '%s%c%s%c' % (key, FDFS_FIELD_SEPERATOR, meta_dict[key], FDFS_RECORD_SEPERATOR)
return ret[0:-1]
def fdfs_unpack_metadata(bytes_stream):
li = bytes_stream.split(FDFS_RECORD_SEPERATOR)
return dict([item.split(FDFS_FIELD_SEPERATOR) for item in li])
#!/usr/bin/env python
# -*- coding = utf-8 -*-
# filename: utils.py
import io
import os
import sys
import stat
import platform
import configparser
SUFFIX = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def appromix(size, base=0):
'''Convert a byte-stream size to human-readable format.
Keyword arguments:
size: int, bytes stream size
base: int, suffix index
Return: string
'''
multiples = 1024
if size < 0:
raise ValueError('[-] Error: number must be non-negative.')
if size < multiples:
return '{0:d}{1}'.format(size, SUFFIX[base])
for suffix in SUFFIX[base:]:
if size < multiples:
return '{0:.2f}{1}'.format(size, suffix)
size = size / float(multiples)
raise ValueError('[-] Error: number too big.')
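# Usage sketch for appromix (values follow from the loop above):
#   >>> appromix(512)
#   '512B'
#   >>> appromix(1536)
#   '1.50KB'
#   >>> appromix(3 * 1024 ** 3)
#   '3.00GB'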
def get_file_ext_name(filename, double_ext=True):
li = filename.split(os.extsep)
if len(li) <= 1:
return ''
else:
if li[-1].find(__os_sep__) != -1:
return ''
if double_ext:
if len(li) > 2:
if li[-2].find(__os_sep__) == -1:
return '%s.%s' % (li[-2], li[-1])
return li[-1]
class Fdfs_ConfigParser(configparser.RawConfigParser):
"""
Extends ConfigParser to allow files without sections.
This is done by wrapping read files and prepending them with a placeholder
section, which defaults to '__config__'
"""
def __init__(self, default_section=None, *args, **kwargs):
configparser.RawConfigParser.__init__(self, *args, **kwargs)
self._default_section = None
self.set_default_section(default_section or '__config__')
def get_default_section(self):
return self._default_section
def set_default_section(self, section):
self.add_section(section)
# move all values from the previous default section to the new one
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
except configparser.NoSectionError:
pass
else:
for (key, value) in default_section_items:
self.set(section, key, value)
self._default_section = section
def read(self, filenames):
if isinstance(filenames, str):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
with open(filename) as fp:
self.readfp(fp)
except IOError:
continue
else:
read_ok.append(filename)
return read_ok
def readfp(self, fp, *args, **kwargs):
stream = io.StringIO()
try:
stream.name = fp.name
except AttributeError:
pass
stream.write('[' + self._default_section + ']\n')
stream.write(fp.read())
stream.seek(0, 0)
return self._read(stream, stream.name)
def write(self, fp):
# Write the items from the default section manually and then remove them
# from the data. They'll be re-added later.
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
for (key, value) in default_section_items:
fp.write("{0} = {1}\n".format(key, value))
fp.write("\n")
except configparser.NoSectionError:
pass
configparser.RawConfigParser.write(self, fp)
self.add_section(self._default_section)
for (key, value) in default_section_items:
self.set(self._default_section, key, value)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
The sections in setup file contains a title line at the top,
indicated by a name in square brackets (`[]'), plus key/value
options lines, indicated by `name: value' format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' or line[0] in '#;':
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname] = "%s\n%s" % (cursect[optname], value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
elif sectname == DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
raise MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self.OPTCRE.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos - 1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
optname = self.optionxform(optname.rstrip())
if optname in cursect:
if not isinstance(cursect[optname], list):
cursect[optname] = [cursect[optname]]
cursect[optname].append(optval)
else:
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
e = ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
def split_remote_fileid(remote_file_id):
'''
Split remote_file_id into (group_name, remote_file_name)
arguments:
@remote_file_id: string
@return tuple, (group_name, remote_file_name)
'''
index = remote_file_id.find(b'/')
if -1 == index:
return None
return (remote_file_id[0:index], remote_file_id[(index + 1):])
def fdfs_check_file(filename):
ret = True
errmsg = ''
if not os.path.isfile(filename):
ret = False
errmsg = '[-] Error: %s is not a file.' % filename
elif not stat.S_ISREG(os.stat(filename).st_mode):
ret = False
errmsg = '[-] Error: %s is not a regular file.' % filename
return (ret, errmsg)
if __name__ == '__main__':
print(get_file_ext_name('/bc.tar.gz'))
# link = "../path/to/file/../folder/../"
#
# count = link.count("../")
#
# print(count) # 输出:4
#
# my_list = [1, 2, 3]
#
# result = ','.join(map(str, my_list))
#
# print(type(result)) # 输出:1,2,3
import io
import os
import requests
import win32com.client as win32
import win32com
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'Secure HttpOnly; __jsluid_h=622c9569b7d6db50280612dfbe457ec4; __jsluid_s=b76555cb5e11504bde71924ab7ff780a; _va_ref=%5B%22%22%2C%22%22%2C1693534257%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dm-wPghSYa5m0rKj3FjcXWqp8gfXcvYwljt9y1RdmIrHXd1lAl4QhbIbXDjZR9hZY%26wd%3D%26eqid%3Da0f738ed0005f760000000036459cd31%22%5D; _va_id=afdeea114ed19176.1683278397.11.1693534257.1693534257.; _va_ses=*',
'Host': 'gzw.beijing.gov.cn',
'If-Modified-Since': 'Mon, 31 Jul 2023 13:33:44 GMT',
'If-None-Match': 'W/"64c7b838-9676"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'}
# def doc_page(doc_bytes):
# doc = Document(doc_bytes)
# num_pages = len(doc.sections)
# print("文档页数:", num_pages)
# return num_pages
#
# file_href = 'https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc'
# resp_content = requests.get(file_href,headers=headers,verify=False, timeout=20).content
# doc_bytes = BytesIO(resp_content)
# category = '.doc'
# if category == '.doc' or category == '.docx':
# page_size = doc_page(doc_bytes)
# print(page_size)
# from docx import Document
# import requests
# from io import BytesIO
# import docx2txt
# import docx
# # 获取远程Word文档内容
# url = "https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc"
# response = requests.get(url,headers=headers,verify=False, timeout=20)
# doc_bytes = BytesIO(response.content)
#
# # 解析文档页数
# # doc = Document(doc_bytes)
# # num_pages = len(doc.sections)
# # print("文档页数:", num_pages)
#
# doc_file = docx.Document('E:\kkwork\P020230106624599701195.docx')
# # text = doc_file[0].getText()
# # print("文档内容:", text)
#
# # 解析文档页数
# num_pages = len(doc_file.sections)
# print("文档页数:", num_pages)
# from docx import Document
# import requests
#
# # # 下载远程Word文档
url = "https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc"
# response = requests.get(url,headers=headers,verify=False, timeout=20)
# # with open("remote_document.doc", "wb") as file:
# # file.write(response.content)
#
# # 将doc转为docx格式
# import os
# os.system("textutil -convert docx remote_document.doc")
#
# # 解析文档页数
# doc = Document("remote_document.docx")
# num_pages = len(doc.sections)
# print("文档页数:", num_pages)
# def get_file_stream(file_url):
# # 发送GET请求获取文件内容
# response = requests.get(url,headers=headers,verify=False,timeout=10)
# # 将文件内容保存为文件流
# file_stream = io.BytesIO(response.content)
# return file_stream
# def read_file_stream(file_stream):
# # 创建一个Word应用程序对象
# word_app = win32com.client.Dispatch("Word.Application")
# # 将文件流保存为临时文件
# temp_file = "E:/temp.doc"
# with open(temp_file, "wb") as f:
# f.write(file_stream.read())
# # 打开临时文件
# doc = word_app.Documents.Open(temp_file)
# # 读取文件内容
# # content = doc.Content.Text
# # print(content)
# # 计算文档的统计信息
# doc.ComputeStatistics(2) # 参数2表示统计页数
# # 获取文档的页数
# page_count = doc.BuiltInDocumentProperties("Number of Pages").Value
# # 输出页数
# print(f"页数:{page_count}")
# # 关闭文件和应用程序
# doc.Close()
# word_app.Quit()
# # 删除临时文件
# os.remove(temp_file)
# return page_count
# file_stream = get_file_stream(url)
# page_count = read_file_stream(file_stream)
# print(f"页数22:{page_count}")
import io
import inspect
import requests
import os
import win32com.client as win32
import win32com
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
header={
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'Secure HttpOnly; __jsluid_s=8e1b254e40166ac092b3e137cacf4f09; _va_ses=*; _va_id=0c66e4a68ee344ba.1693453315.1.1693453317.1693453315.',
'Host':'gzw.beijing.gov.cn',
'Pragma':'no-cache',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
url='https://gzw.beijing.gov.cn/xxfb/zcfg/202204/P020230106624599701195.doc'
def get_file_stream(file_url):
# Send a GET request to fetch the file content
response = requests.get(file_url,headers=header,verify=False,timeout=10)
# Wrap the content in an in-memory file stream
file_stream = io.BytesIO(response.content)
return file_stream
def read_file_stream(file_stream):
# Create a Word application object
word_app = win32com.client.Dispatch("Word.Application")
# Save the stream to a temporary file
temp_file = "D:/temp.doc"
with open(temp_file, "wb") as f:
f.write(file_stream.read())
# Open the temporary file
doc = word_app.Documents.Open(temp_file)
# Read the document content
# content = doc.Content.Text
# print(content)
# Compute the document statistics
doc.ComputeStatistics(2) # argument 2 means count pages
# Get the page count
page_count = doc.BuiltInDocumentProperties("Number of Pages").Value
# Print the page count
print(f"页数:{page_count}")
# Close the document and quit Word
doc.Close()
word_app.Quit()
# Delete the temporary file
os.remove(temp_file)
return page_count
file_stream = get_file_stream(url)
page_count = read_file_stream(file_stream)
print(f"页数22:{page_count}")
print('程序结束!')
import json
import re
import time
import fitz
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from urllib.parse import urljoin
from BaseCore import BaseCore
baseCore = BaseCore()
urllib3.disable_warnings()
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
from random import choice
def paserUrl(html,listurl):
# soup = BeautifulSoup(html, 'html.parser')
# Collect all <a> and <img> tags
links = html.find_all(['a', 'img'])
# Walk the tags and convert relative URLs to absolute ones
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
real_href = 'http://gzw.fujian.gov.cn/ztzl/gzjgfzjs/gfxwj_7426/201809/t20180911_4492105.htm'
href_text = requests.get(url=real_href, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# Convert relative paths to absolute paths
i_soup = paserUrl(i_soup,real_href)
# print(i_soup)
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
content = i_soup.find('div', attrs={'class': 'xl_con1'})
pub_hao = ''
print(real_href)
id_list = []
#todo: collect the attachment URLs
fu_jian_list = content.find('ul',class_='clearflx myzj_xl_list').find_all('a')
for fu_jian in fu_jian_list:
fj_href = fu_jian['href']
if '.doc' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# After finding an attachment, upload it to the file server
retData = baseCore.uploadToserver(fj_href,'1673')
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', 'aaa', 1)
id_list.append(att_id)
fu_jian['href'] = 'http://114.115.215.96/' + full_path
print(f'-----{content}')
\ No newline at end of file
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
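# A minimal sketch of reading the settings above with the standard-library
# configparser (an assumption: this is roughly what base.BaseCore does; the
# file name 'config.ini' is assumed here).
import configparser

cfg = configparser.ConfigParser(interpolation=None)
cfg.read('config.ini', encoding='utf-8')
redis_host = cfg.get('redis', 'host')                   # 114.115.236.206
kafka_servers = cfg.get('kafka', 'bootstrap_servers')   # 114.115.159.144:9092
chrome_driver = cfg.get('selenium', 'chrome_driver')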
import redis
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
# Get all keys
keys = r.keys('*')
for key in keys:
f_key = key.decode()
if 'wx_url' in f_key:
list_info = []
print(f_key)
print("----------")
# Extract the sid
sid = f_key.split('url_')[1]
# Use the official-account key to fetch its set of values
value = list(r.smembers(f_key))
list_data = []
for member in value:
member = member.decode()
# print(member)
data = (sid,member)
# print(data)
# print(list_info)
select_sql = f"select sid from wx_link where link = '{member}'"
cursor.execute(select_sql)
select = cursor.fetchone()
if select:
continue
else:
list_info.append(data)
if len(list_info)!=0:
pass
else:
continue
# Bulk insert
sql = "INSERT INTO wx_link (sid, link,state) VALUES (%s, %s,'99')"
cursor.executemany(sql, list_info)
cnx.commit()
log.info(f'{sid}---插入成功--{len(list_info)}----')
baseCore.close()
import time
import pandas as pd
......@@ -30,11 +31,52 @@ import pandas as pd
# workbook.save(filename)
# writeaa()
gpdm = '01109.HK'
if 'HK' in str(gpdm):
tmp_g = str(gpdm).split('.')[0]
if len(tmp_g) == 5:
gpdm = str(gpdm)[1:]
print(gpdm)
# gpdm = '01109.HK'
# if 'HK' in str(gpdm):
# tmp_g = str(gpdm).split('.')[0]
# if len(tmp_g) == 5:
# gpdm = str(gpdm)[1:]
# print(gpdm)
# else:
# pass
import redis
from base.BaseCore import BaseCore
baseCore = BaseCore()
r = baseCore.r
key = 'counter'
expiration_time = 10 # 设置过期时间 60秒
# # 设置自增
# r.incr(key)
# # #自增并设置过期时间
# while True:
# # 设置自增
# r.incr(key)
# value = int(r.get(key).decode())
#
# if value > 10:
# print(value)
# # 设置过期时间
# r.expire(key, expiration_time)
# time.sleep(20)
# print('------------------')
# continue
# # print(value)
# time.sleep(5)
# print(value)
# print("==========")
def check_url():
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=6)
res = r.exists('WeiXinGZH:infoSourceCode','IN-20220418-0001')
print(res)
if res == 1:
print('True')
    else:
        print('False')
check_url()
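# A minimal runnable sketch of the increment-then-expire counter that the
# commented-out blocks above are experimenting with; key name, threshold,
# TTL and round count are illustrative.
import time


def throttled_count(r, key='counter', limit=10, ttl=10, rounds=30):
    for _ in range(rounds):
        value = r.incr(key)      # atomic increment; creates the key at 1 if missing
        if value > limit:
            r.expire(key, ttl)   # let the key expire so the count restarts from 0
            time.sleep(ttl)
            continue
        time.sleep(1)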
import time
import pandas as pd
# def writeaa():
# detailList=[]
# aa={
# 'id':3,
# 'name':'qqqwe'
# }
# detailList.append(aa)
# writerToExcel(detailList)
# 将数据追加到excel
# def writerToExcel(detailList):
# # filename='baidu搜索.xlsx'
# # 读取已存在的xlsx文件
# existing_data = pd.read_excel(filename,engine='openpyxl')
# # 创建新的数据
# new_data = pd.DataFrame(data=detailList)
# # 将新数据添加到现有数据的末尾
# combined_data = existing_data.append(new_data, ignore_index=True)
# # 将结果写入到xlsx文件
# combined_data.to_excel(filename, index=False)
#
# from openpyxl import Workbook
#
# if __name__ == '__main__':
# filename='test1.xlsx'
# # # 创建一个工作簿
# workbook = Workbook(filename)
# workbook.save(filename)
# writeaa()
# gpdm = '01109.HK'
# if 'HK' in str(gpdm):
# tmp_g = str(gpdm).split('.')[0]
# if len(tmp_g) == 5:
# gpdm = str(gpdm)[1:]
# print(gpdm)
# else:
# pass
from base.BaseCore import BaseCore
baseCore = BaseCore()
r = baseCore.r
# #自增并设置过期时间
# while True:
# key = 'mykey'
# expiration_time = 60 # 设置过期时间 60秒
# #设置自增
# r.incr(key)
#
#
#
# value = int(r.get(key).decode())
#
# if value > 10:
# print(value)
# # 设置过期时间
# r.expire(key, expiration_time)
#
# time.sleep(70)
# print('------------------')
# continue
# # print(value)
#
# print("==========")
# expiration_time = 60
# # 创建PubSub对象
# p = r.pubsub()
#
# # 订阅过期事件
# p.psubscribe('__keyevent@6__:expired')
# aa = p.listen()
# # 监听过期事件
# for message in p.listen():
# if message['type'] == 'pmessage':
# expired_key = message['data'].decode()
# print('过期的key:', expired_key)
# if expired_key == 'counter':
# # 执行重置操作
# r.set('counter', 0)
# print('计数器已重置为0')
# # 设置自增
# r.incr('counter')
# # 设置过期时间
# r.expire('counter', expiration_time)
for i in range(0, 24, 5):
print(i)
\ No newline at end of file
import pandas as pd
import numpy as np
import pymysql
import time
import requests
import certifi
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log= baseCore.getLogger()
# cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
# cursor = cnx.cursor()
cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
curosr = cnx.cursor()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
#'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
def getInfo(gpdm,xydm):
print('开始')
gpdm_ = gpdm
if 'HK' in gpdm_:
gpdm_ = gpdm_[1:]
start = time.time()
retData={}
retData['base_info'] = {
'公司名称': '',
'英文名':'',
'信用代码': xydm,
'股票代码': gpdm_,
'地址': '',
'电话': '',
'公司网站': '',
'部门': '',
'行业': '',
'员工人数': '',
'公司简介': ''
}
retData['people_info']=[]
# https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
url = f'https://finance.yahoo.com/quote/{gpdm}/profile?p={gpdm}'
time.sleep(3)
    response = None
    for i in range(0, 3):
        try:
            response = requests.get(url, headers=headers, verify=False)
            time.sleep(1)
            if response.status_code == 200:
                break
            else:
                log.error(f"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}")
        except:
            continue
    if response is None or response.status_code != 200:
        log.error(f"{gpdm}------获取基本信息接口重试后依然失败")
        return retData
soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
    try:
        name = page.find('h3', {'class': 'Fz(m) Mb(10px)'}).text
    except:
        name = ''
try:
com_info = page.find('div', {'class': 'Mb(25px)'})
except:
com_info = ''
try:
com_phone = com_info.find_all('p')[0].find('a').text
except:
com_phone = ''
try:
com_url = com_info.find_all('p')[0].find('a', {'target': '_blank'}).text
except:
com_url = ''
try:
com_address = com_info.find_all('p')[0].text.replace(com_phone, '').replace(com_url, '')
except:
com_address = ''
try:
com_bumen = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[0].text
except:
com_bumen = ''
try:
com_hangye = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[1].text
except:
com_hangye = ''
try:
com_people = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[2].text
except:
com_people = ''
try:
com_jianjie = page.find('p', {'class': 'Mt(15px) Lh(1.6)'}).text
except:
com_jianjie = ''
dic_com_info = {
'公司名称':'',
'英文名':name,
'信用代码': xydm,
'股票代码': gpdm_,
'地址': com_address,
'电话': com_phone,
'公司网站': com_url,
'部门': com_bumen,
'行业': com_hangye,
'员工人数': com_people,
'公司简介': com_jianjie
}
retData['base_info']=dic_com_info
#高管信息
retPeople = []
try:
list_people = page.find('table', {'class': 'W(100%)'}).find_all('tr')[1:]
except:
list_people = []
for one_people in list_people:
try:
p_name = one_people.find_all('td')[0].text
except:
p_name = ''
continue
try:
p_zhiwu = one_people.find_all('td')[1].text
except:
p_zhiwu = ''
try:
p_money = one_people.find_all('td')[2].text
except:
p_money = ''
try:
p_xingshi = one_people.find_all('td')[3].text
except:
p_xingshi = ''
try:
p_year = one_people.find_all('td')[4].text
except:
p_year = ''
if(p_zhiwu=="N/A"):
p_zhiwu=""
if (p_money == "N/A"):
p_money = ""
if (p_xingshi == "N/A"):
p_xingshi = ""
if (p_year == "N/A"):
p_year = ""
dic_main_people = {
'公司名称': name,
'股票代码': gpdm_,
'信用代码': xydm,
'姓名': p_name,
'职务': p_zhiwu,
'薪资': p_money,
'行使': p_xingshi,
'出生年份': p_year
}
retPeople.append(dic_main_people)
retData['people_info'] = retPeople
# df_a = pd.DataFrame(retData['base_info'])
log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
return retData
# # 数据库中获取企业gpdm、xydm
sql_select = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col6 is not null and state1=1 and col3 like 'ZZSN%' order by date1 ,id LIMIT 1"
curosr.execute(sql_select)
data = curosr.fetchone()
id = data[0]
# 更新以获取企业的采集状态
# sql_update = f"UPDATE Tfbs set state1 = 2 WHERE id = {id}"
# curosr.execute(sql_update)
# cnx.commit()
xydm = data[4]
gpdm = data[7]
# 获取企业的基本信息和高管信息
retData = getInfo(gpdm,xydm)
print(retData)
curosr.close()
cnx.close()
url = 'https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE'
req = requests.get(url=url,headers=headers,verify=False)
print(req.status_code)
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: connection.py
import socket
import os
import sys
import time
import random
from itertools import chain
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# start class Connection
class Connection(object):
    '''Manage TCP communication to and from the FastDFS server.'''
def __init__(self, **conn_kwargs):
self.pid = os.getpid()
self.host_tuple = conn_kwargs['host_tuple']
self.remote_port = conn_kwargs['port']
self.remote_addr = None
self.timeout = conn_kwargs['timeout']
self._sock = None
def __del__(self):
try:
self.disconnect()
except:
pass
def connect(self):
'''Connect to fdfs server.'''
if self._sock:
return
try:
sock = self._connect()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = sock
# print '[+] Create a connection success.'
# print '\tLocal address is %s:%s.' % self._sock.getsockname()
# print '\tRemote address is %s:%s' % (self.remote_addr, self.remote_port)
def _connect(self):
'''Create TCP socket. The host is random one of host_tuple.'''
self.remote_addr = random.choice(self.host_tuple)
# print '[+] Connecting... remote: %s:%s' % (self.remote_addr, self.remote_port)
# sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# sock.settimeout(self.timeout)
sock = socket.create_connection((self.remote_addr, self.remote_port), self.timeout)
return sock
def disconnect(self):
'''Disconnect from fdfs server.'''
if self._sock is None:
return
try:
self._sock.close()
except socket.error as e:
raise ConnectionError(self._errormessage(e))
self._sock = None
def get_sock(self):
return self._sock
def _errormessage(self, exception):
# args for socket.error can either be (errno, "message")
# or just "message" '''
if len(exception.args) == 1:
return "[-] Error: connect to %s:%s. %s." % (self.remote_addr, self.remote_port, exception.args[0])
else:
return "[-] Error: %s connect to %s:%s. %s." % \
(exception.args[0], self.remote_addr, self.remote_port, exception.args[1])
# end class Connection
# start ConnectionPool
class ConnectionPool(object):
'''Generic Connection Pool'''
def __init__(self, name='', conn_class=Connection,
max_conn=None, **conn_kwargs):
self.pool_name = name
self.pid = os.getpid()
self.conn_class = conn_class
self.max_conn = max_conn or 2 ** 31
self.conn_kwargs = conn_kwargs
self._conns_created = 0
self._conns_available = []
self._conns_inuse = set()
# print '[+] Create a connection pool success, name: %s.' % self.pool_name
def _check_pid(self):
if self.pid != os.getpid():
self.destroy()
            self.__init__(self.pool_name, self.conn_class, self.max_conn, **self.conn_kwargs)
def make_conn(self):
'''Create a new connection.'''
if self._conns_created >= self.max_conn:
raise ConnectionError('[-] Error: Too many connections.')
num_try = 10
while True:
try:
if num_try <= 0:
sys.exit()
conn_instance = self.conn_class(**self.conn_kwargs)
conn_instance.connect()
self._conns_created += 1
break
except ConnectionError as e:
print(e)
num_try -= 1
conn_instance = None
return conn_instance
def get_connection(self):
'''Get a connection from pool.'''
self._check_pid()
try:
conn = self._conns_available.pop()
# print '[+] Get a connection from pool %s.' % self.pool_name
# print '\tLocal address is %s:%s.' % conn._sock.getsockname()
# print '\tRemote address is %s:%s' % (conn.remote_addr, conn.remote_port)
except IndexError:
conn = self.make_conn()
self._conns_inuse.add(conn)
return conn
def remove(self, conn):
'''Remove connection from pool.'''
if conn in self._conns_inuse:
self._conns_inuse.remove(conn)
self._conns_created -= 1
if conn in self._conns_available:
self._conns_available.remove(conn)
self._conns_created -= 1
def destroy(self):
'''Disconnect all connections in the pool.'''
all_conns = chain(self._conns_inuse, self._conns_available)
for conn in all_conns:
conn.disconnect()
# print '[-] Destroy connection pool %s.' % self.pool_name
def release(self, conn):
'''Release the connection back to the pool.'''
self._check_pid()
if conn.pid == self.pid:
self._conns_inuse.remove(conn)
self._conns_available.append(conn)
# print '[-] Release connection back to pool %s.' % self.pool_name
# end ConnectionPool class
def tcp_recv_response(conn, bytes_size, buffer_size=4096):
'''Receive response from server.
    It does not include the tracker header.
arguments:
@conn: connection
@bytes_size: int, will be received byte_stream size
@buffer_size: int, receive buffer size
@Return: tuple,(response, received_size)
'''
recv_buff = []
total_size = 0
try:
while bytes_size > 0:
resp = conn._sock.recv(buffer_size)
recv_buff.append(resp)
total_size += len(resp)
bytes_size -= len(resp)
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: (%s)' % e.args)
return (b''.join(recv_buff), total_size)
def tcp_send_data(conn, bytes_stream):
    '''Send buffer to server.
    It does not include the tracker header.
    arguments:
    @conn: connection
    @bytes_stream: transmit buffer
    @Return bool
    '''
    try:
        conn._sock.sendall(bytes_stream)
    except (socket.error, socket.timeout) as e:
        raise ConnectionError('[-] Error: while writing to socket: (%s)' % e.args)
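# A minimal sketch of how the pool above is typically driven; the tracker
# address, port and timeout are placeholders.
if __name__ == '__main__':
    pool = ConnectionPool(name='tracker_pool',
                          host_tuple=('192.168.0.1',),
                          port=22122,
                          timeout=30)
    conn = pool.get_connection()      # pops an idle connection or dials a new one
    try:
        sock = conn.get_sock()        # raw socket used by tcp_send_data / tcp_recv_response
    finally:
        pool.release(conn)            # hand the connection back to the idle list
    pool.destroy()                    # close every connection on shutdown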
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: exceptions.py
'''Core exceptions raised by fdfs client'''
class FDFSError(Exception):
pass
class ConnectionError(FDFSError):
pass
class ResponseError(FDFSError):
pass
class InvaildResponse(FDFSError):
pass
class DataError(FDFSError):
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# filename: fdfs_protol.py
import struct
import socket
from fdfs_client.exceptions import (
FDFSError,
ConnectionError,
ResponseError,
InvaildResponse,
DataError
)
# define FDFS protol constans
TRACKER_PROTO_CMD_STORAGE_JOIN = 81
FDFS_PROTO_CMD_QUIT = 82
TRACKER_PROTO_CMD_STORAGE_BEAT = 83 # storage heart beat
TRACKER_PROTO_CMD_STORAGE_REPORT_DISK_USAGE = 84 # report disk usage
TRACKER_PROTO_CMD_STORAGE_REPLICA_CHG = 85 # repl new storage servers
TRACKER_PROTO_CMD_STORAGE_SYNC_SRC_REQ = 86 # src storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_REQ = 87 # dest storage require sync
TRACKER_PROTO_CMD_STORAGE_SYNC_NOTIFY = 88 # sync done notify
TRACKER_PROTO_CMD_STORAGE_SYNC_REPORT = 89 # report src last synced time as dest server
TRACKER_PROTO_CMD_STORAGE_SYNC_DEST_QUERY = 79 # dest storage query sync src storage server
TRACKER_PROTO_CMD_STORAGE_REPORT_IP_CHANGED = 78 # storage server report it's ip changed
TRACKER_PROTO_CMD_STORAGE_CHANGELOG_REQ = 77 # storage server request storage server's changelog
TRACKER_PROTO_CMD_STORAGE_REPORT_STATUS = 76 # report specified storage server status
TRACKER_PROTO_CMD_STORAGE_PARAMETER_REQ = 75 # storage server request parameters
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FREE = 74 # storage report trunk free space
TRACKER_PROTO_CMD_STORAGE_REPORT_TRUNK_FID = 73 # storage report current trunk file id
TRACKER_PROTO_CMD_STORAGE_FETCH_TRUNK_FID = 72 # storage get current trunk file id
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START = 61 # start of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END = 62 # end of tracker get system data files
TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE = 63 # tracker get a system data file
TRACKER_PROTO_CMD_TRACKER_GET_STATUS = 64 # tracker get status of other tracker
TRACKER_PROTO_CMD_TRACKER_PING_LEADER = 65 # tracker ping leader
TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER = 66 # notify next leader to other trackers
TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER = 67 # commit next leader to other trackers
TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP = 90
TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS = 91
TRACKER_PROTO_CMD_SERVER_LIST_STORAGE = 92
TRACKER_PROTO_CMD_SERVER_DELETE_STORAGE = 93
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ONE = 101
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ONE = 102
TRACKER_PROTO_CMD_SERVICE_QUERY_UPDATE = 103
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ONE = 104
TRACKER_PROTO_CMD_SERVICE_QUERY_FETCH_ALL = 105
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITHOUT_GROUP_ALL = 106
TRACKER_PROTO_CMD_SERVICE_QUERY_STORE_WITH_GROUP_ALL = 107
TRACKER_PROTO_CMD_RESP = 100
FDFS_PROTO_CMD_ACTIVE_TEST = 111 # active test, tracker and storage both support since V1.28
STORAGE_PROTO_CMD_REPORT_CLIENT_IP = 9 # ip as tracker client
STORAGE_PROTO_CMD_UPLOAD_FILE = 11
STORAGE_PROTO_CMD_DELETE_FILE = 12
STORAGE_PROTO_CMD_SET_METADATA = 13
STORAGE_PROTO_CMD_DOWNLOAD_FILE = 14
STORAGE_PROTO_CMD_GET_METADATA = 15
STORAGE_PROTO_CMD_SYNC_CREATE_FILE = 16
STORAGE_PROTO_CMD_SYNC_DELETE_FILE = 17
STORAGE_PROTO_CMD_SYNC_UPDATE_FILE = 18
STORAGE_PROTO_CMD_SYNC_CREATE_LINK = 19
STORAGE_PROTO_CMD_CREATE_LINK = 20
STORAGE_PROTO_CMD_UPLOAD_SLAVE_FILE = 21
STORAGE_PROTO_CMD_QUERY_FILE_INFO = 22
STORAGE_PROTO_CMD_UPLOAD_APPENDER_FILE = 23 # create appender file
STORAGE_PROTO_CMD_APPEND_FILE = 24 # append file
STORAGE_PROTO_CMD_SYNC_APPEND_FILE = 25
STORAGE_PROTO_CMD_FETCH_ONE_PATH_BINLOG = 26 # fetch binlog of one store path
STORAGE_PROTO_CMD_RESP = TRACKER_PROTO_CMD_RESP
STORAGE_PROTO_CMD_UPLOAD_MASTER_FILE = STORAGE_PROTO_CMD_UPLOAD_FILE
STORAGE_PROTO_CMD_TRUNK_ALLOC_SPACE = 27 # since V3.00
STORAGE_PROTO_CMD_TRUNK_ALLOC_CONFIRM = 28 # since V3.00
STORAGE_PROTO_CMD_TRUNK_FREE_SPACE = 29 # since V3.00
STORAGE_PROTO_CMD_TRUNK_SYNC_BINLOG = 30 # since V3.00
STORAGE_PROTO_CMD_TRUNK_GET_BINLOG_SIZE = 31 # since V3.07
STORAGE_PROTO_CMD_TRUNK_DELETE_BINLOG_MARKS = 32 # since V3.07
STORAGE_PROTO_CMD_TRUNK_TRUNCATE_BINLOG_FILE = 33 # since V3.07
STORAGE_PROTO_CMD_MODIFY_FILE = 34 # since V3.08
STORAGE_PROTO_CMD_SYNC_MODIFY_FILE = 35 # since V3.08
STORAGE_PROTO_CMD_TRUNCATE_FILE = 36 # since V3.08
STORAGE_PROTO_CMD_SYNC_TRUNCATE_FILE = 37 # since V3.08
# for overwrite all old metadata
STORAGE_SET_METADATA_FLAG_OVERWRITE = 'O'
STORAGE_SET_METADATA_FLAG_OVERWRITE_STR = "O"
# for replace, insert when the meta item not exist, otherwise update it
STORAGE_SET_METADATA_FLAG_MERGE = 'M'
STORAGE_SET_METADATA_FLAG_MERGE_STR = "M"
FDFS_RECORD_SEPERATOR = '\x01'
FDFS_FIELD_SEPERATOR = '\x02'
# common constants
FDFS_GROUP_NAME_MAX_LEN = 16
IP_ADDRESS_SIZE = 16
FDFS_PROTO_PKG_LEN_SIZE = 8
FDFS_PROTO_CMD_SIZE = 1
FDFS_PROTO_STATUS_SIZE = 1
FDFS_PROTO_IP_PORT_SIZE = (IP_ADDRESS_SIZE + 6)
FDFS_MAX_SERVERS_EACH_GROUP = 32
FDFS_MAX_GROUPS = 512
FDFS_MAX_TRACKERS = 16
FDFS_DOMAIN_NAME_MAX_LEN = 128
FDFS_MAX_META_NAME_LEN = 64
FDFS_MAX_META_VALUE_LEN = 256
FDFS_FILE_PREFIX_MAX_LEN = 16
FDFS_LOGIC_FILE_PATH_LEN = 10
FDFS_TRUE_FILE_PATH_LEN = 6
FDFS_FILENAME_BASE64_LENGTH = 27
FDFS_TRUNK_FILE_INFO_LEN = 16
FDFS_FILE_EXT_NAME_MAX_LEN = 6
FDFS_SPACE_SIZE_BASE_INDEX = 2 # storage space size based (MB)
FDFS_UPLOAD_BY_BUFFER = 1
FDFS_UPLOAD_BY_FILENAME = 2
FDFS_UPLOAD_BY_FILE = 3
FDFS_DOWNLOAD_TO_BUFFER = 1
FDFS_DOWNLOAD_TO_FILE = 2
FDFS_NORMAL_LOGIC_FILENAME_LENGTH = (
FDFS_LOGIC_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_FILE_EXT_NAME_MAX_LEN + 1)
FDFS_TRUNK_FILENAME_LENGTH = (
FDFS_TRUE_FILE_PATH_LEN + FDFS_FILENAME_BASE64_LENGTH + FDFS_TRUNK_FILE_INFO_LEN + 1 + FDFS_FILE_EXT_NAME_MAX_LEN)
FDFS_TRUNK_LOGIC_FILENAME_LENGTH = (FDFS_TRUNK_FILENAME_LENGTH + (FDFS_LOGIC_FILE_PATH_LEN - FDFS_TRUE_FILE_PATH_LEN))
FDFS_VERSION_SIZE = 6
TRACKER_QUERY_STORAGE_FETCH_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE)
TRACKER_QUERY_STORAGE_STORE_BODY_LEN = (FDFS_GROUP_NAME_MAX_LEN + IP_ADDRESS_SIZE - 1 + FDFS_PROTO_PKG_LEN_SIZE + 1)
# status code, order is important!
FDFS_STORAGE_STATUS_INIT = 0
FDFS_STORAGE_STATUS_WAIT_SYNC = 1
FDFS_STORAGE_STATUS_SYNCING = 2
FDFS_STORAGE_STATUS_IP_CHANGED = 3
FDFS_STORAGE_STATUS_DELETED = 4
FDFS_STORAGE_STATUS_OFFLINE = 5
FDFS_STORAGE_STATUS_ONLINE = 6
FDFS_STORAGE_STATUS_ACTIVE = 7
FDFS_STORAGE_STATUS_RECOVERY = 9
FDFS_STORAGE_STATUS_NONE = 99
class Storage_server(object):
'''Class storage server for upload.'''
def __init__(self):
self.ip_addr = None
self.port = None
self.group_name = ''
self.store_path_index = 0
# Class tracker_header
class Tracker_header(object):
'''
Class for Pack or Unpack tracker header
struct tracker_header{
char pkg_len[FDFS_PROTO_PKG_LEN_SIZE],
char cmd,
char status,
}
'''
def __init__(self):
self.fmt = '!QBB' # pkg_len[FDFS_PROTO_PKG_LEN_SIZE] + cmd + status
self.st = struct.Struct(self.fmt)
self.pkg_len = 0
self.cmd = 0
self.status = 0
def _pack(self, pkg_len=0, cmd=0, status=0):
return self.st.pack(pkg_len, cmd, status)
def _unpack(self, bytes_stream):
self.pkg_len, self.cmd, self.status = self.st.unpack(bytes_stream)
return True
def header_len(self):
return self.st.size
def send_header(self, conn):
'''Send Tracker header to server.'''
header = self._pack(self.pkg_len, self.cmd, self.status)
try:
conn._sock.sendall(header)
except (socket.error, socket.timeout) as e:
            raise ConnectionError('[-] Error: while writing to socket: %s' % (e.args,))
def recv_header(self, conn):
'''Receive response from server.
        If successful, the class members (pkg_len, cmd, status) hold the response.
'''
try:
header = conn._sock.recv(self.header_len())
except (socket.error, socket.timeout) as e:
raise ConnectionError('[-] Error: while reading from socket: %s' % (e.args,))
self._unpack(header)
def fdfs_pack_metadata(meta_dict):
ret = ''
for key in meta_dict:
ret += '%s%c%s%c' % (key, FDFS_FIELD_SEPERATOR, meta_dict[key], FDFS_RECORD_SEPERATOR)
return ret[0:-1]
def fdfs_unpack_metadata(bytes_stream):
li = bytes_stream.split(FDFS_RECORD_SEPERATOR)
return dict([item.split(FDFS_FIELD_SEPERATOR) for item in li])
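# A small illustrative round-trip of the '!QBB' header format and the metadata
# helpers above; the command value used here is arbitrary.
if __name__ == '__main__':
    th = Tracker_header()
    th.pkg_len = 16
    th.cmd = TRACKER_PROTO_CMD_RESP
    raw = th._pack(th.pkg_len, th.cmd, th.status)   # 8-byte length + cmd + status = 10 bytes
    th2 = Tracker_header()
    th2._unpack(raw)
    print(th2.pkg_len, th2.cmd, th2.status)         # -> 16 100 0

    packed = fdfs_pack_metadata({'width': '1024', 'height': '768'})
    print(fdfs_unpack_metadata(packed))             # -> {'width': '1024', 'height': '768'}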
#!/usr/bin/env python
# -*- coding = utf-8 -*-
# filename: utils.py
import io
import os
import sys
import stat
import platform
import configparser
SUFFIX = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
__os_sep__ = "/" if platform.system() == 'Windows' else os.sep
def appromix(size, base=0):
    '''Convert a byte-stream size to human-readable format.
Keyword arguments:
size: int, bytes stream size
base: int, suffix index
Return: string
'''
multiples = 1024
if size < 0:
raise ValueError('[-] Error: number must be non-negative.')
if size < multiples:
return '{0:d}{1}'.format(size, SUFFIX[base])
for suffix in SUFFIX[base:]:
if size < multiples:
return '{0:.2f}{1}'.format(size, suffix)
size = size / float(multiples)
raise ValueError('[-] Error: number too big.')
def get_file_ext_name(filename, double_ext=True):
li = filename.split(os.extsep)
if len(li) <= 1:
return ''
else:
if li[-1].find(__os_sep__) != -1:
return ''
if double_ext:
if len(li) > 2:
if li[-2].find(__os_sep__) == -1:
return '%s.%s' % (li[-2], li[-1])
return li[-1]
class Fdfs_ConfigParser(configparser.RawConfigParser):
"""
Extends ConfigParser to allow files without sections.
This is done by wrapping read files and prepending them with a placeholder
section, which defaults to '__config__'
"""
def __init__(self, default_section=None, *args, **kwargs):
configparser.RawConfigParser.__init__(self, *args, **kwargs)
self._default_section = None
self.set_default_section(default_section or '__config__')
def get_default_section(self):
return self._default_section
def set_default_section(self, section):
self.add_section(section)
# move all values from the previous default section to the new one
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
except configparser.NoSectionError:
pass
else:
for (key, value) in default_section_items:
self.set(section, key, value)
self._default_section = section
def read(self, filenames):
if isinstance(filenames, str):
filenames = [filenames]
read_ok = []
for filename in filenames:
try:
with open(filename) as fp:
self.readfp(fp)
except IOError:
continue
else:
read_ok.append(filename)
return read_ok
def readfp(self, fp, *args, **kwargs):
stream = io.StringIO()
try:
stream.name = fp.name
except AttributeError:
pass
stream.write('[' + self._default_section + ']\n')
stream.write(fp.read())
stream.seek(0, 0)
return self._read(stream, stream.name)
def write(self, fp):
# Write the items from the default section manually and then remove them
# from the data. They'll be re-added later.
try:
default_section_items = self.items(self._default_section)
self.remove_section(self._default_section)
for (key, value) in default_section_items:
fp.write("{0} = {1}\n".format(key, value))
fp.write("\n")
except configparser.NoSectionError:
pass
configparser.RawConfigParser.write(self, fp)
self.add_section(self._default_section)
for (key, value) in default_section_items:
self.set(self._default_section, key, value)
def _read(self, fp, fpname):
"""Parse a sectioned setup file.
The sections in setup file contains a title line at the top,
indicated by a name in square brackets (`[]'), plus key/value
options lines, indicated by `name: value' format lines.
Continuations are represented by an embedded newline then
leading whitespace. Blank lines, lines beginning with a '#',
and just about everything else are ignored.
"""
cursect = None # None, or a dictionary
optname = None
lineno = 0
e = None # None, or an exception
while True:
line = fp.readline()
if not line:
break
lineno = lineno + 1
# comment or blank line?
if line.strip() == '' or line[0] in '#;':
continue
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
# no leading whitespace
continue
# continuation line?
if line[0].isspace() and cursect is not None and optname:
value = line.strip()
if value:
cursect[optname] = "%s\n%s" % (cursect[optname], value)
# a section header or option header?
else:
# is it a section header?
mo = self.SECTCRE.match(line)
if mo:
sectname = mo.group('header')
if sectname in self._sections:
cursect = self._sections[sectname]
                    elif sectname == configparser.DEFAULTSECT:
cursect = self._defaults
else:
cursect = self._dict()
cursect['__name__'] = sectname
self._sections[sectname] = cursect
# So sections can't start with a continuation line
optname = None
# no section header in the file?
elif cursect is None:
                    raise configparser.MissingSectionHeaderError(fpname, lineno, line)
# an option line?
else:
mo = self.OPTCRE.match(line)
if mo:
optname, vi, optval = mo.group('option', 'vi', 'value')
if vi in ('=', ':') and ';' in optval:
# ';' is a comment delimiter only if it follows
# a spacing character
pos = optval.find(';')
if pos != -1 and optval[pos - 1].isspace():
optval = optval[:pos]
optval = optval.strip()
# allow empty values
if optval == '""':
optval = ''
optname = self.optionxform(optname.rstrip())
if optname in cursect:
if not isinstance(cursect[optname], list):
cursect[optname] = [cursect[optname]]
cursect[optname].append(optval)
else:
cursect[optname] = optval
else:
# a non-fatal parsing error occurred. set up the
# exception but keep going. the exception will be
# raised at the end of the file and will contain a
# list of all bogus lines
if not e:
                        e = configparser.ParsingError(fpname)
e.append(lineno, repr(line))
# if any parsing errors occurred, raise an exception
if e:
raise e
def split_remote_fileid(remote_file_id):
'''
    Split remote_file_id into (group_name, remote_file_name)
arguments:
@remote_file_id: string
@return tuple, (group_name, remote_file_name)
'''
index = remote_file_id.find(b'/')
if -1 == index:
return None
return (remote_file_id[0:index], remote_file_id[(index + 1):])
def fdfs_check_file(filename):
ret = True
errmsg = ''
if not os.path.isfile(filename):
ret = False
errmsg = '[-] Error: %s is not a file.' % filename
elif not stat.S_ISREG(os.stat(filename).st_mode):
ret = False
errmsg = '[-] Error: %s is not a regular file.' % filename
return (ret, errmsg)
if __name__ == '__main__':
print(get_file_ext_name('/bc.tar.gz'))
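    # A few more illustrative calls against the helpers above; expected outputs in comments.
    print(appromix(1024))                              # -> 1.00KB
    print(appromix(10 * 1024 ** 2))                    # -> 10.00MB
    print(get_file_ext_name('report.tar.gz'))          # -> tar.gz (double extension kept)
    print(get_file_ext_name('report.tar.gz', False))   # -> gz
    print(split_remote_fileid(b'group1/M00/00/00/abc.pdf'))  # -> (b'group1', b'M00/00/00/abc.pdf')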
# -*- coding: utf-8 -*-
import requests
import datetime
from pyquery import PyQuery as pq
# import cx_Oracle
import urllib3
urllib3.disable_warnings()
# conn = cx_Oracle.connect('cis', 'cis_zzsn9988', '114.116.91.1:1521/ORCL')
# cursor = conn.cursor()
result_list1 = [
[
'人民币',
'CNY'],
[
'美元',
'USD'],
[
'欧元',
'EUR'],
[
'瑞士法郎',
'CHF'],
[
'加元',
'CAD'],
[
'波兰兹罗提',
'PLN'],
[
'英镑',
'GBP'],
[
'澳元',
'AUD'],
[
'泰铢',
'THB'],
[
'沙特里亚尔',
'SAR'],
[
'巴西里亚伊',
'BRL'],
[
'新土耳其新里拉',
'TRY'],
[
'新台币',
'TWD'],
[
'印度卢比',
'INR'],
[
'墨西哥比索',
'MXN'],
[
'日元',
'JPY'],
[
'瑞典克朗',
'SEK'],
[
'韩元',
'KRW'],
[
'俄罗斯卢布',
'RUB'],
[
'新加坡元',
'SGD'],
[
'港币',
'HKD']]
result_list2 = [
'USD',
'CNY']
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'ASPSESSIONIDQQRTBSSA=CJNLDGJALHNJGKBKPEBCDCOE; Hm_lvt_ecdd6f3afaa488ece3938bcdbb89e8da=1692951904; Hm_lpvt_ecdd6f3afaa488ece3938bcdbb89e8da=1692951904',
'Host':'qq.ip138.com',
'Pragma':'no-cache',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'sec-ch-ua':'"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
for result1 in result_list1:
currency_name = result1[0]
currency = result1[1]
to_USD = ''
to_CNY = ''
for i in range(len(result_list2)):
result2 = result_list2[i]
        # https://qq.ip138.com/hl.asp?from=CNY&to=USD&q=1
        url = f'https://qq.ip138.com/hl.asp?from={currency}&to={result2}&q=1'
        res = requests.get(url, headers=headers, verify=False)
resp_text=res.content
doc_resp = pq(resp_text)
money = doc_resp('table tr:nth-child(3) td:nth-child(3)').text()
if money == '1':
money_result = money
else:
try:
money_result = round(float(money), 4)
except:
continue
if i == 0:
to_USD = money_result
else:
to_CNY = money_result
now = datetime.datetime.now()
now_time = now.strftime('%Y-%m-%d')
if to_USD == '' or to_CNY == '':
continue
result_dict = {
'币种': currency_name,
'币简称': currency,
'对美元': to_USD,
'对人民币': to_CNY,
'更新时间': now_time }
print(result_dict)
# sql = "insert into CIS_QCC_CURRENCY_RATE(ID, CURRENCY_NAME, CURRENCY_CODE, RATE_TO_USD, RATE_TO_CNY,CREATE_DATE) values (SEQ_CIS_QCC_CURRENCY_RATE.nextval,'{0}','{1}','{2}','{3}','{4}')".format(currency_name, currency, to_USD, to_CNY, now_time)
# cursor.execute(sql)
# conn.commit()
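    # A hedged sketch of the same insert with cx_Oracle bind variables instead of
    # str.format; conn/cursor stay commented out above, so this is illustrative only.
    # insert_sql = ("insert into CIS_QCC_CURRENCY_RATE"
    #               "(ID, CURRENCY_NAME, CURRENCY_CODE, RATE_TO_USD, RATE_TO_CNY, CREATE_DATE) "
    #               "values (SEQ_CIS_QCC_CURRENCY_RATE.nextval, :name, :code, :usd, :cny, :dt)")
    # cursor.execute(insert_sql, name=currency_name, code=currency,
    #                usd=to_USD, cny=to_CNY, dt=now_time)
    # conn.commit()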
......@@ -7,7 +7,7 @@ import requests
import urllib3
from pymysql.converters import escape_string
from base.BaseCore import BaseCore
from BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
......@@ -50,8 +50,9 @@ def get_file_name(headers):
def downFile(url,path):
try:
baseCore.mkPath(path)
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
response = requests.get(url, proxies=proxy, headers=headers, verify=False,timeout=10)
# proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
response = requests.get(url, headers=headers, verify=False, timeout=10)
# response = requests.get(url, proxies=proxy, headers=headers, verify=False,timeout=10)
fileName = get_file_name(response.headers)
with open(os.path.join(path, fileName), "wb") as pyFile:
for chunk in response.iter_content(chunk_size=1024):
......
import os
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from pymysql.converters import escape_string
import downPdf
from BaseCore import BaseCore
from datetime import datetime
baseCore = BaseCore()
log =baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
cnx = baseCore.cnx
cursor = baseCore.cursor
def job_2():
log.info('----开始采集---俄罗斯国家杂志----')
# path = 'D:chrome/chromedriver.exe'
# driverContent = baseCore.buildDriver(path, headless=False)
for i in range(1,139):
if i == 1:
url = 'https://www.whitehouse.gov/?s=Russia'
else:
url = f'https://www.whitehouse.gov/?s=Russia&paged={i}'
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        container = soup.find('div', class_='col-md-10 col-lg-6')
web_list = container.find_all('article')
for web in web_list:
title = web.find('h2',class_='entry-title')
newsTitle = title.find('a').text
newsUrl = title.find('a')['href']
date_string = web.find('div',class_='entry-meta shared-meta').find('time').text
#时间格式转化
date_object = datetime.strptime(date_string, "%B %d, %Y")
pub_time = date_object.strftime("%Y-%m-%d")
print(pub_time)
            pdf_name = web.find('div', class_='infoindocumentlist').find_all('div')[0].find('span', class_='info-data').text
            # 下载pdf;pdfUrl 原文未定义,此处假定为文章链接 newsUrl
            pdfUrl = newsUrl
            path = r'D:\美国VS俄罗斯制裁'
            path = os.path.join(path, downPdf.getPath(newsTitle))
            downFile(pdfUrl, path)
selectCountSql = f"select * from usvsrussia where url = '{pdfUrl}' "
cursor.execute(selectCountSql)
url = cursor.fetchone()
if url:
log.info("已采集,跳过")
continue
else:
pass
insertSql = f"insert into usvsrussia (website,url,title,pub_time,state,pdf_name,pdf_path,create_time) values ('总统令文件','{pdfUrl}','{escape_string(pdftitle)}','{pub_time}',0,'{pdf_name}','{path}',now() )"
# log.info(insertSql)
cursor.execute(insertSql)
cnx.commit()
break
\ No newline at end of file
import pandas as pd
import requests
import urllib3
from pyquery import PyQuery as pq
urllib3.disable_warnings()
def getHtml(url):
try:
# proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
response = requests.get(url,verify=False,timeout=10)
html=response.text
except Exception as e:
html=''
return html
def getList():
for i in range(17,139):
print(f'============开始采集第{i}页=========')
url=f'https://www.whitehouse.gov/?s=Russia&paged={i}'
html=getHtml(url)
if html:
pass
else:
for nnn in range(0, 3):
html = getHtml(url)
if html:
break
else:
continue
try:
doc=pq(html)
except:
return
ac=doc('div[class="col-md-10 col-lg-6"]>article')
for ii in range(0,len(ac)):
doc=pq(ac[ii])
title=doc('h2[class="entry-title"]>a').text()
url=doc('h2[class="entry-title"]>a').attr('href')
summary=doc('div[class="post-content"]').text()
try:
time=doc('time').attr('datetime').split('T')[0]
except:
time = ''
type=doc('span[class="tax-links cat-links"]>a').text()
detail={
'title':title,
'url':url,
'time':time,
'type':type,
'summary':summary,
}
getDetail(detail)
print(f'=============第{i}页===第{ii}条采集完成==========')
def getDetail(detail):
detailList=[]
url=detail['url']
html=getHtml(url)
if html:
pass
else:
for nnn in range(0,3):
html = getHtml(url)
if html:
break
else:
continue
try:
doc=pq(html)
except:
return
content=doc('section[class="body-content"]').text()
detail['content'] = content
detailList.append(detail)
writerToExcel(detailList)
return content
# 将数据追加到excel
def writerToExcel(detailList):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
from openpyxl import Workbook
if __name__ == '__main__':
# # 创建一个工作簿
filename='./cis.xlsx'
workbook = Workbook()
workbook.save(filename)
getList()
\ No newline at end of file