提交 9d49a0cd 作者: XveLingKun

谷歌搜索

上级 252c04d3
import datetime
import os
import random
import redis
import sys
import time
import logbook
......@@ -211,12 +213,18 @@ class BaseCore:
try:
self.__cursor_proxy.close()
self.__cnx_proxy.close()
self.cursor_.close()
self.cnx_.close()
except :
pass
def __init__(self):
    """Open the Redis client and the two MySQL connections used by this object.

    Sets ``self.r`` (Redis, db 6), a private connection/cursor pair
    (``__cnx_proxy`` / ``__cursor_proxy``) and a public pair
    (``cnx_`` / ``cursor_``), both against the ``clb_project`` schema.

    NOTE(review): hosts and credentials are hard-coded literals; consider
    loading them from configuration or the environment instead.
    """
    self.r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)

    # Private connection (presumably the proxy-IP database -- confirm).
    proxy_cnx = pymysql.connect(
        host='114.115.159.144',
        user='caiji',
        password='zzsn9988',
        db='clb_project',
        charset='utf8mb4',
    )
    self.__cnx_proxy = proxy_cnx
    self.__cursor_proxy = proxy_cnx.cursor()

    # Public connection; its cursor is the one used by getSidName.
    main_cnx = pymysql.connect(
        host='114.116.44.11',
        user='caiji',
        password='f7s0&7qqtK',
        db='clb_project',
        charset='utf8mb4',
    )
    self.cnx_ = main_cnx
    self.cursor_ = main_cnx.cursor()
# 计算耗时
......@@ -348,3 +356,41 @@ class BaseCore:
ip_list.append(proxy)
return ip_list
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
    """Pop and return the left-most element of the Redis list ``key``.

    The connection is probed with ``ping()`` first; if that fails, a new
    client is created and stored on ``self.r`` before popping.

    :param key: name of the Redis list to pop from.
    :return: the popped element decoded to ``str``, or ``None`` when
        ``lpop`` yields nothing (or an empty value).
    """
    try:
        self.r.ping()
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit are not silently turned into a reconnect.
        self.r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
    item = self.r.lpop(key)
    return item.decode() if item else None
def getSidName(self, sid):
    """Look up ``words_name`` in the ``key_words`` table for the given id.

    The id is passed as a bound query parameter instead of being
    interpolated into the SQL text, so a value containing quotes cannot
    alter the statement.

    :param sid: value matched against the ``id`` column.
    :return: the ``words_name`` of the first matching row, or ``None`` when
        no row matches.
    """
    sqlSelect = "SELECT words_name FROM `key_words` WHERE id = %s"
    self.cursor_.execute(sqlSelect, (sid,))
    row = self.cursor_.fetchone()
    # fetchone() yields None when the result set is empty; do not index it.
    if row is None:
        return None
    return row[0]
# 获得脚本进程PID
def getPID(self):
    """Return the process id of the running script, as given by ``os.getpid()``."""
    return os.getpid()
def getUniqueCode(self, abbr, serverId, threadId):
    """Build a unique code string from a Redis-issued time code.

    Blocks until an element can be popped from the ``timeCode:google`` list
    (2-second ``blpop`` timeout, then a 2-second sleep between attempts).

    The result is the concatenation of: ``abbr``, today's date as
    ``yymmdd``, ``serverId``, the process id forced to exactly four
    characters (zero-padded or truncated), ``threadId`` and the time code.

    :param abbr: string prefix of the code.
    :param serverId: string identifying the server.
    :param threadId: thread identifier; converted with ``str()``.
    :return: the assembled code string.
    """
    while True:
        popped = self.r.blpop(['timeCode:google'], 2)
        if not popped:
            # Nothing available yet: wait before polling again.
            time.sleep(2)
            continue
        # blpop returns a (list name, value) pair; keep the value.
        time_code = popped[1].decode('utf-8')
        break

    # Pad short pids with zeros and cut long ones to four characters.
    pid_part = str(self.getPID()).zfill(4)[:4]
    date_part = str(datetime.datetime.now().strftime('%Y%m%d'))[2:]

    return ''.join([abbr, date_part, serverId, pid_part, str(threadId), str(time_code)])
\ No newline at end of file
......@@ -27,6 +27,9 @@ class GoogleTaskJob(object):
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.r_6 = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=6)
def getkafka(self):
# Kafka集群的地址
......@@ -108,35 +111,36 @@ class GoogleTaskJob(object):
def paserKeyMsg(self, keymsg):
    """Expand one keyword-task message into numbered per-keyword jobs.

    Reads ``wordsCode``, ``id``, ``searchEngines`` and ``keyWord`` from
    ``keymsg``. Jobs are produced only when ``'4'`` is present in the
    search-engine list (presumably the code for this engine -- confirm).

    Each keyword is emitted exactly once as a ``(num, kwmsg)`` tuple. The
    earlier code also appended the bare ``kwmsg`` dict, which produced a
    mixed list that cannot be unpacked as ``num, data`` by the caller.

    :param keymsg: mapping describing the keyword task.
    :return: list of ``(num, kwmsg)`` tuples, ``num`` counting from 1 and
        ``kwmsg`` holding the keys ``kw``, ``wordsCode`` and ``sid``;
        an empty list when the engine is not selected.
    """
    logger.info('----------')
    wordsCode = keymsg['wordsCode']
    sid = keymsg['id']

    try:
        searchEngines = keymsg['searchEngines']
        # A value of the form ['java.util.ArrayList', [...]] carries the
        # real list at index 1 (presumably Java-serialised -- confirm).
        if 'java.util.ArrayList' in searchEngines:
            searchEngines = searchEngines[1]
    except Exception:
        # Best effort: a missing or malformed field means "no engines".
        searchEngines = []

    if not searchEngines or '4' not in searchEngines:
        return []

    keywords = self.getkeywords(keymsg['keyWord'])
    return [
        (num, {'kw': kw, 'wordsCode': wordsCode, 'sid': sid})
        for num, kw in enumerate(keywords, start=1)
    ]
def runSpider(self,kwmsg):
def runSpider(self,threadId,kwmsg, item, bangdan_name):
if 'lay' in kwmsg['kw']:
com_name = item.split('|')[2]
else:
com_name = item.split('|')[1]
searchkw = com_name + ' ' + kwmsg['kw']
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
googleSpider=GoogleSpider(searchkw,wordsCode,sid)
print(f'======拼接的关键词是{searchkw}=={com_name}====')
wordsCode = kwmsg['wordsCode']
sid = kwmsg['sid']
googleSpider = GoogleSpider(threadId, searchkw, wordsCode, sid, item, bangdan_name)
try:
googleSpider.get_page_html()
......@@ -151,7 +155,28 @@ class GoogleTaskJob(object):
finally:
googleSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
import random
def get_comname(self):
    """Pop the next company record from the Redis company-name list.

    Records are pipe-separated strings, e.g.
    ``ZZSN22080900000001|<company name>|WMT|1``.

    :return: the popped record, or ``None`` (after logging) when the list
        yields nothing.
    """
    # TODO: read company names from Redis and prepend them to the keywords.
    record = baseCore.redicPullData('GOOGLE_KEYWORDS:COMPANY_NAME:2023_500')
    if not record:
        logger.info('====已无企业===')
        return None
    return record
# 从Redis的List中获取并移除一个元素
def redicPullData(key, r):
    """Pop and return the left-most element of the Redis list ``key``.

    ``r`` is probed with ``ping()`` first; if that fails, a new client is
    created locally and used for the pop (the caller's object is not
    replaced).

    :param key: name of the Redis list to pop from.
    :param r: Redis client to use.
    :return: the popped element decoded to ``str``, or ``None`` when
        ``lpop`` yields nothing (or an empty value).
    """
    try:
        r.ping()
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit are not silently turned into a reconnect.
        r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
    item = r.lpop(key)
    return item.decode() if item else None
if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
# keymsglist=getkeywords(ss)
......@@ -164,14 +189,28 @@ if __name__ == '__main__':
print('---------------')
while True:
try:
codeids=[]
# codeid='KW-20230727-0001'
codeids.append('KW-20240318-0001')
for codeid in codeids:
# try:
# googleTaskJob.r.ping()
# except:
# googleTaskJob.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# all_keys = 'GOOGLE_KEYWORDS:COMPANY_NAME*'
# keys = googleTaskJob.r.scan_iter(f"{key}*")
# for key in keys:
item = googleTaskJob.get_comname()
bangdan_name = '2023年世界500强'
if item:
pass
else:
break
codeList = [
'KW-20240516-0002'
]
for codeid in codeList:
try:
# keymsg=baiduTaskJob.getkafka()
keymsg=googleTaskJob.getkeyFromredis(codeid)
kwList=googleTaskJob.paserKeyMsg(keymsg)
#keymsg=baiduTaskJob.getkafka()
keymsg = googleTaskJob.getkeyFromredis(codeid)
kwList = googleTaskJob.paserKeyMsg(keymsg)
# kwList=reversed(kwList)
# 从列表中随机选择5个数据
# kwList = random.sample(kwList, 4)
......@@ -182,9 +221,9 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(googleTaskJob.runSpider, data) for data in kwList]
results = [executor.submit(googleTaskJob.runSpider, num, data, item, bangdan_name) for num, data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
......@@ -195,5 +234,5 @@ if __name__ == '__main__':
# 处理任务执行过程中的异常
logger.info(f"任务执行exception: {e}")
except Exception as e:
logger.info('采集异常')
logger.info(f'采集异常{e}')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论