Commit b420fb1e    Author: 刘伟刚

Sohu search code

Parent 864508c6
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_sougou
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
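These settings are read with Python's configparser, mirroring what the task job below does; a minimal sketch, assuming the file is saved as config.ini next to the script:

import configparser
import redis

config = configparser.ConfigParser()
config.read('config.ini')

# Redis connection built from the [redis] section
r = redis.Redis(host=config.get('redis', 'host'),
                port=config.getint('redis', 'port'),
                password=config.get('redis', 'pass'), db=0)

# Kafka consumer settings from the [kafka] section
bootstrap_servers = config.get('kafka', 'bootstrap_servers')
topic = config.get('kafka', 'topic')
group_id = config.get('kafka', 'groupId')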
# -*- coding: utf-8 -*-
# Intelligent extraction request
# 1. Considered: when requesting intelligent extraction, stop using the entity class
#    a. It would still work: pass the HTML source directly in the raw HTTP request body, with lang-code and link-text passed as query parameters
#    b. Reason against: it is awkward to test in Postman, where a pasted HTML source file cannot be used
# 2. Decided against that: keep using the entity class, the benefits outweigh the drawbacks
#    a. An entity class makes it easy to extend the parameter fields
#    b. It also helps when presenting the API docs: calling json_parameter_utility.get_json_parameters can display the request entity class
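# A hedged example of how a caller might send this entity as JSON; the endpoint
# path /extract, the host placeholder and the use of the requests library are
# assumptions for illustration, not part of this module:
#   import requests
#   payload = {"lang_code": "en", "link_text": "Example headline", "article_html": "<html>...</html>"}
#   resp = requests.post("http://<host>:<port>/extract", json=payload)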
class ExtractionRequest:
    # Language code
    # 1. Needed when extracting non-Chinese articles
    lang_code = ""
    # Link text
    # 1. Used for extracting the title; if not provided, title accuracy drops
    link_text = ""
    # Article page source
    # 1. Used for extracting the title, publish time, content, etc.
    article_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        extraction_request = ExtractionRequest()
        # Alternative approach:
        # 1. Update the internal __dict__ with the dictionary
        # extraction_request.__dict__.update(dictionary)
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(extraction_request, key, dictionary[key])
        return extraction_request

    def to_dict(self):
        # Convert to a dictionary:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__
        # 1. Merge the internal __dict__ into a new dictionary
        data.update(self.__dict__)
        return data
# Extraction result
class ExtractionResult:
    # Title
    title = ""
    # Publish date
    publish_date = ""
    # Body text (keeps all HTML tags, e.g. br, img)
    text = ""
    # URL
    url = ""
    # Summary
    meta_description = ""
    # Clean body text (without HTML)
    cleaned_text = ""
    # Source (currently only supported for the "source" field of Chinese sites)
    # source = ""
    # Top image (top_image: never captured anything, property no longer used)
    # top_image = ""

    def to_dict(self):
        # Convert to a dictionary:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__
        # 1. Merge the internal __dict__ into a new dictionary
        data.update(self.__dict__)
        return data
class UrlPickingRequest:
    # Response URL of the list page
    # 1. Used as the Base URL when joining extracted relative URLs
    # 2. Base URL: must be the response URL
    # 3. Example: in Python, after requesting a URL with requests.get(url), use resp.url as the Base URL
    list_page_resp_url = ""
    # List page source
    # 1. Used for extracting article URLs
    list_page_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        url_picking_request = UrlPickingRequest()
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(url_picking_request, key, dictionary[key])
        return url_picking_request

    def to_dict(self):
        # Convert to a dictionary:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__
        # 1. Merge the internal __dict__ into a new dictionary
        data.update(self.__dict__)
        return data
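The from_dict/to_dict pair above is enough to round-trip these entities through JSON; a minimal usage sketch (the sample field values are made up):

import json

request = ExtractionRequest.from_dict({
    "lang_code": "en",
    "link_text": "Example headline",
    "article_html": "<html>...</html>",
})

result = ExtractionResult()
result.title = "Example headline"
# Serialize via to_dict, as the comments above suggest
json_str = json.dumps(result, default=ExtractionResult.to_dict)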
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install redis==4.3.5 -i https://pypi.douban.com/simple
pip install kafka-python==2.0.2 -i https://pypi.douban.com/simple
pip install PyMySQL -i https://pypi.douban.com/simple
pip install gne==0.3.0 -i https://pypi.douban.com/simple
pip install selenium==4.9.1 -i https://pypi.douban.com/simple
pip install logbook -i https://pypi.douban.com/simple
pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
selenium==3.141.0
selenium-wire==5.1.0
pip install --upgrade selenium
pip install --upgrade urllib3
pip3 uninstall urllib3
ImportError: urllib3 v2.0 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'OpenSSL 1.1.0i 14 Aug 2018'. See: https://github.com/urllib3/urllib3/issues/2168
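The ImportError above appears when urllib3 2.x runs on a Python interpreter whose ssl module was built against an OpenSSL older than 1.1.1. If the interpreter or OpenSSL cannot be upgraded, a common workaround is to pin urllib3 below 2.0, for example:

pip install "urllib3<2" -i https://mirrors.aliyun.com/pypi/simple/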
# -*- coding: utf-8 -*-
"""
Task integration test
1. Connect to Redis for keyword retrieval
2. Connect to Kafka to fetch and store messages
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from souhunewspider import SouhunewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class SouhunewsTaskJob(object):
    def __init__(self):
        # Create the ConfigParser object
        self.config = configparser.ConfigParser()
        # Read the config file
        self.config.read('config.ini')
        self.r = redis.Redis(host=self.config.get('redis', 'host'),
                             port=self.config.get('redis', 'port'),
                             password=self.config.get('redis', 'pass'), db=0)
    def getkafka(self):
        # Address of the Kafka cluster
        bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
        # Topic to subscribe to
        topic = self.config.get('kafka', 'topic')
        groupId = self.config.get('kafka', 'groupId')
        consumer = KafkaConsumer(topic, group_id=groupId,
                                 bootstrap_servers=[bootstrap_servers],
                                 value_deserializer=lambda m: json.loads(m.decode('utf-8')))
        try:
            for record in consumer:
                try:
                    logger.info(f"value:{record.value}")
                    keymsg = record.value
                    if keymsg:
                        break
                    else:
                        continue
                    # print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
                except Exception as e:
                    logger.info(f"msg.value error:{e}")
        except KeyboardInterrupt as e:
            keymsg = {}
        finally:
            consumer.close()
        return keymsg
    def getkeyFromredis(self, codeid):
        kvalue = self.r.get('KEY_WORDS_TO_REDIS::' + codeid)
        kvalue = kvalue.decode('utf-8')
        kvalue = json.loads(kvalue)
        return kvalue
    def getkeywords(self, keywords):
        kwList = []
        if ')+(' in keywords:
            k1List = keywords.split('+')
            kk2 = []
            for k2 in k1List:
                k2 = k2.strip("()")
                k2List = k2.split('|')
                kk2.append(k2List)
            # Guard against expressions with more than four groups
            result = []
            if len(kk2) == 2:
                result = list(itertools.product(kk2[0], kk2[1]))
            elif len(kk2) == 3:
                result = list(itertools.product(kk2[0], kk2[1], kk2[2]))
            elif len(kk2) == 4:
                result = list(itertools.product(kk2[0], kk2[1], kk2[2], kk2[3]))
            for res in result:
                kwstr = ''
                for kw in res:
                    kwstr += kw + "+"
                kwList.append(kwstr.strip('+'))
        elif '+(' in keywords:
            k1List = keywords.split('+')
            kk2 = []
            for k2 in k1List:
                k2 = k2.strip("()")
                k2List = k2.split('|')
                kk2.append(k2List)
            result = []
            if len(kk2) == 2:
                result = list(itertools.product(kk2[0], kk2[1]))
            for res in result:
                kwstr = ''
                for kw in res:
                    kwstr += kw + "+"
                kwList.append(kwstr.strip('+'))
        else:
            k3 = keywords.split("|")
            kwList = k3
        return kwList
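    # A worked example of the expansion above (made-up keywords, not taken from task data):
    #   getkeywords('(A|B)+(C|D)')  ->  ['A+C', 'A+D', 'B+C', 'B+D']
    #   getkeywords('A|B|C')        ->  ['A', 'B', 'C']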
    def paserKeyMsg(self, keymsg):
        logger.info('----------')
        wordsCode = keymsg['wordsCode']
        id = keymsg['id']
        try:
            searchEngines = keymsg['searchEngines']
        except Exception as e:
            searchEngines = []
        kwList = []
        if searchEngines:
            if '3' in searchEngines:
                keyword = keymsg['keyWord']
                keymsglist = self.getkeywords(keyword)
                for kw in keymsglist:
                    kwmsg = {
                        'kw': kw,
                        'wordsCode': wordsCode,
                        'sid': id
                    }
                    kwList.append(kwmsg)
        else:
            pass
            # logger.info('+++++')
            # keyword=keymsg['keyWord']
            # keymsglist=self.getkeywords(keyword)
            # for kw in keymsglist:
            #     kwmsg={
            #         'kw':kw,
            #         'wordsCode':wordsCode,
            #         'sid':id
            #     }
            #     kwList.append(kwmsg)
        return kwList
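    # A hedged example of what paserKeyMsg produces (field values are made up;
    # '3' is assumed to be the engine id handled by this job):
    #   keymsg = {'id': '1001', 'wordsCode': 'W01', 'searchEngines': ['3'], 'keyWord': '(A|B)+(C|D)'}
    #   paserKeyMsg(keymsg) -> [{'kw': 'A+C', 'wordsCode': 'W01', 'sid': '1001'}, ...]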
    # def runSpider(self, kwmsg):
    #     try:
    #         searchkw = kwmsg['kw']
    #         wordsCode = kwmsg['wordsCode']
    #         sid = kwmsg['sid']
    #
    #         baiduSpider = BaiduSpider(searchkw, wordsCode, sid)
    #         baiduSpider.get_page_html()
    #         baiduSpider.get_detail_html()
    #     except Exception as e:
    #         logger.info('Baidu search exception ' + searchkw)
    #     finally:
    #         baiduSpider.driver.quit()
    #         logger.info("Keyword collection finished! " + searchkw)
    def runSpider(self, kwmsg):
        searchkw = kwmsg['kw']
        wordsCode = kwmsg['wordsCode']
        sid = kwmsg['sid']
        souhunewsSpider = SouhunewsSpider(searchkw, wordsCode, sid)
        try:
            souhunewsSpider.get_page_html()
        except Exception as e:
            # Retry the search page once before giving up
            try:
                souhunewsSpider.get_page_html()
            except Exception as e:
                logger.info('Sohu search exception ' + searchkw)
        if souhunewsSpider.detailList.qsize() != 0:
            try:
                souhunewsSpider.get_detail_html()
            except Exception as e:
                logger.info('Detail parsing exception ' + searchkw)
        logger.info("Keyword collection finished! " + searchkw)
if __name__ == '__main__':
    # ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
    # keymsglist=getkeywords(ss)
    # print(keymsglist)
    # Create the task job (which opens the Redis connection)
    souhunewsTaskJob = SouhunewsTaskJob()
    baseCore = BaseCore()
    logger = baseCore.getLogger()
    print('---------------')
    while True:
        try:
            try:
                keymsg = souhunewsTaskJob.getkafka()
                kwList = souhunewsTaskJob.paserKeyMsg(keymsg)
            except Exception as e:
                logger.info("Failed to fetch messages from Kafka!")
                time.sleep(5)
                continue
            if kwList:
                # Create a thread pool; max_workers limits concurrency (1 here)
                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                    # Submit one task per keyword message
                    results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
                    # Collect the results as tasks complete
                    for future in concurrent.futures.as_completed(results):
                        try:
                            result = future.result()
                            # Handle the task result
                            logger.info(f"Task finished: {result}")
                        except Exception as e:
                            # Handle exceptions raised inside the task
                            logger.info(f"Task raised an exception: {e}")
        except Exception as e:
            logger.info('Collection exception')
title baidu_comm
chcp 65001
cd /d %~dp0
python baidutaskJob.py