提交 caa3a936 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	base/BaseCore.py
...@@ -15,15 +15,15 @@ from selenium.webdriver.chrome.service import Service ...@@ -15,15 +15,15 @@ from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook from openpyxl import Workbook
import langid import langid
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源 # 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
# 序列号 # 序列号
__seq = 0 __seq = 0
# 代理池 数据库连接 # 代理池 数据库连接
__cnx_proxy = None __cnx_proxy =None
__cursor_proxy = None __cursor_proxy = None
# agent 池 # agent 池
__USER_AGENT_LIST = [ __USER_AGENT_LIST = [
...@@ -218,9 +218,8 @@ class BaseCore: ...@@ -218,9 +218,8 @@ class BaseCore:
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5' 'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
] ]
# Android agent池 #Android agent池
__USER_PHONE_AGENT_LIST = [ __USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self): def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project',
...@@ -239,11 +238,12 @@ class BaseCore: ...@@ -239,11 +238,12 @@ class BaseCore:
self.__cnx_proxy.close() self.__cnx_proxy.close()
self.cursor.close() self.cursor.close()
self.cnx.close() self.cnx.close()
except: except :
pass pass
# 计算耗时 # 计算耗时
def getTimeCost(self, start, end): def getTimeCost(self,start, end):
seconds = int(end - start) seconds = int(end - start)
m, s = divmod(seconds, 60) m, s = divmod(seconds, 60)
h, m = divmod(m, 60) h, m = divmod(m, 60)
...@@ -256,7 +256,6 @@ class BaseCore: ...@@ -256,7 +256,6 @@ class BaseCore:
else: else:
ms = int((end - start) * 1000) ms = int((end - start) * 1000)
return "%d毫秒" % (ms) return "%d毫秒" % (ms)
# 当前时间格式化 # 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S # 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S # 2 : 010101120000 %y%m%d%H%M%S
...@@ -286,7 +285,7 @@ class BaseCore: ...@@ -286,7 +285,7 @@ class BaseCore:
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3) return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式 # 日志格式
def logFormate(self, record, handler): def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format( formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间 date=record.time, # 日志时间
level=record.level_name, # 日志等级 level=record.level_name, # 日志等级
...@@ -296,9 +295,8 @@ class BaseCore: ...@@ -296,9 +295,8 @@ class BaseCore:
msg=record.message # 日志内容 msg=record.message # 日志内容
) )
return formate return formate
# 获取logger # 获取logger
def getLogger(self, fileLogFlag=True, stdOutFlag=True): def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0])) dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs") dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log" filename = filename.replace(".py", "") + ".log"
...@@ -347,34 +345,34 @@ class BaseCore: ...@@ -347,34 +345,34 @@ class BaseCore:
proxy_list.append(proxy) proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)] return proxy_list[random.randint(0, 3)]
# 字符串截取 #字符串截取
def getSubStr(self, str, beginStr, endStr): def getSubStr(self,str,beginStr,endStr):
if beginStr == '': if beginStr=='':
pass pass
else: else:
begin = str.find(beginStr) begin=str.find(beginStr)
if begin == -1: if begin==-1:
begin = 0 begin=0
str = str[begin:] str=str[begin:]
if endStr == '': if endStr=='':
pass pass
else: else:
end = str.rfind(endStr) end=str.rfind(endStr)
if end == -1: if end==-1:
pass pass
else: else:
str = str[0:end + 1] str = str[0:end+1]
return str return str
# 繁体字转简体字 # 繁体字转简体字
def hant_2_hans(self, hant_str: str): def hant_2_hans(self,hant_str: str):
''' '''
Function: 将 hant_str 由繁体转化为简体 Function: 将 hant_str 由繁体转化为简体
''' '''
return zhconv.convert(hant_str, 'zh-hans') return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字 # 判断字符串里是否含数字
def str_have_num(self, str_num): def str_have_num(self,str_num):
panduan = False panduan = False
for str_1 in str_num: for str_1 in str_num:
...@@ -394,7 +392,7 @@ class BaseCore: ...@@ -394,7 +392,7 @@ class BaseCore:
# return gw_item.decode() if gw_item else None # return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素 # 从Redis的List中获取并移除一个元素
def redicPullData(self, key): def redicPullData(self,key):
item = self.r.lpop(key) item = self.r.lpop(key)
return item.decode() if item else None return item.decode() if item else None
...@@ -460,16 +458,16 @@ class BaseCore: ...@@ -460,16 +458,16 @@ class BaseCore:
print(e) print(e)
self.cnx.commit() self.cnx.commit()
# 获取企查查token #获取企查查token
def GetToken(self): def GetToken(self):
# 获取企查查token #获取企查查token
query = "select token from QCC_token " query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改 # token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query) self.cursor.execute(query)
token = self.cursor.fetchone()[0] token = self.cursor.fetchone()[0]
return token return token
# 检测语言 #检测语言
def detect_language(self, text): def detect_language(self, text):
# 使用langid.py判断文本的语言 # 使用langid.py判断文本的语言
result = langid.classify(text) result = langid.classify(text)
...@@ -479,11 +477,11 @@ class BaseCore: ...@@ -479,11 +477,11 @@ class BaseCore:
return 'cn' return 'cn'
return result[0] return result[0]
# 追加接入excel #追加接入excel
def writerToExcel(self, detailList, filename): def writerToExcel(self,detailList,filename):
# filename='baidu搜索.xlsx' # filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件 # 读取已存在的xlsx文件
existing_data = pd.read_excel(filename, engine='openpyxl') existing_data = pd.read_excel(filename,engine='openpyxl')
# 创建新的数据 # 创建新的数据
new_data = pd.DataFrame(data=detailList) new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾 # 将新数据添加到现有数据的末尾
...@@ -492,6 +490,8 @@ class BaseCore: ...@@ -492,6 +490,8 @@ class BaseCore:
combined_data.to_excel(filename, index=False) combined_data.to_excel(filename, index=False)
# return combined_data # return combined_data
# 对失败或者断掉的企业 重新放入redis #对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, item): def rePutIntoR(self,item):
self.r.rpush('NewsEnterprise:gwqy_socialCode', item) self.r.rpush('NewsEnterprise:gwqy_socialCode', item)
import pandas as pd import pandas as pd
def writeaa(): # def writeaa():
detailList=[] # detailList=[]
aa={ # aa={
'id':3, # 'id':3,
'name':'qqqwe' # 'name':'qqqwe'
} # }
detailList.append(aa) # detailList.append(aa)
writerToExcel(detailList) # writerToExcel(detailList)
# 将数据追加到excel # 将数据追加到excel
def writerToExcel(detailList): # def writerToExcel(detailList):
# filename='baidu搜索.xlsx' # # filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件 # # 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl') # existing_data = pd.read_excel(filename,engine='openpyxl')
# 创建新的数据 # # 创建新的数据
new_data = pd.DataFrame(data=detailList) # new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾 # # 将新数据添加到现有数据的末尾
combined_data = existing_data.append(new_data, ignore_index=True) # combined_data = existing_data.append(new_data, ignore_index=True)
# 将结果写入到xlsx文件 # # 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False) # combined_data.to_excel(filename, index=False)
#
# from openpyxl import Workbook
#
# if __name__ == '__main__':
# filename='test1.xlsx'
# # # 创建一个工作簿
# workbook = Workbook(filename)
# workbook.save(filename)
# writeaa()
from openpyxl import Workbook gpdm = '01109.HK'
if 'HK' in str(gpdm):
if __name__ == '__main__': tmp_g = str(gpdm).split('.')[0]
filename='test1.xlsx' if len(tmp_g) == 5:
# # 创建一个工作簿 gpdm = str(gpdm)[1:]
workbook = Workbook(filename) print(gpdm)
workbook.save(filename) else:
writeaa() pass
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论