Commit b420fb1e    Author: 刘伟刚

Sohu search code

Parent 864508c6
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_sougou
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
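These settings are read with Python's configparser, mirroring what the task job below does; a minimal sketch, assuming the file is saved as config.ini next to the script:

import configparser
import redis

config = configparser.ConfigParser()
config.read('config.ini')

# Redis connection built from the [redis] section
r = redis.Redis(host=config.get('redis', 'host'),
                port=config.getint('redis', 'port'),
                password=config.get('redis', 'pass'), db=0)

# Kafka consumer settings from the [kafka] section
bootstrap_servers = config.get('kafka', 'bootstrap_servers')
topic = config.get('kafka', 'topic')
group_id = config.get('kafka', 'groupId')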
# -*- coding: utf-8 -*-
# Intelligent extraction request
# 1. Considered: when requesting intelligent extraction, stop using the entity class
#    a. It would still work: pass the HTML source directly in the raw HTTP request body, with lang-code and link-text passed as query parameters
#    b. Reason against: it is awkward to test in Postman, where a pasted HTML source file cannot be used
# 2. Decided against that: keep using the entity class, the benefits outweigh the drawbacks
#    a. An entity class makes it easy to extend the parameter fields
#    b. It also helps when presenting the API docs: calling json_parameter_utility.get_json_parameters can display the request entity class
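# A hedged example of how a caller might send this entity as JSON; the endpoint
# path /extract, the host placeholder and the use of the requests library are
# assumptions for illustration, not part of this module:
#   import requests
#   payload = {"lang_code": "en", "link_text": "Example headline", "article_html": "<html>...</html>"}
#   resp = requests.post("http://<host>:<port>/extract", json=payload)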
class ExtractionRequest:
    # Language code
    # 1. Needed when extracting non-Chinese articles
    lang_code = ""
    # Link text
    # 1. Used for extracting the title; if not provided, title accuracy drops
    link_text = ""
    # Article page source
    # 1. Used for extracting the title, publish time, content, etc.
    article_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        extraction_request = ExtractionRequest()
        # Alternative approach:
        # 1. Update the internal __dict__ with the dictionary
        # extraction_request.__dict__.update(dictionary)
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(extraction_request, key, dictionary[key])
        return extraction_request

    def to_dict(self):
        # Convert to a dictionary:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__
        # 1. Merge the internal __dict__ into a new dictionary
        data.update(self.__dict__)
        return data
# Extraction result
class ExtractionResult:
    # Title
    title = ""
    # Publish date
    publish_date = ""
    # Body text (keeps all HTML tags, e.g. br, img)
    text = ""
    # URL
    url = ""
    # Summary
    meta_description = ""
    # Clean body text (without HTML)
    cleaned_text = ""
    # Source (currently only supported for the "source" field of Chinese sites)
    # source = ""
    # Top image (top_image: never captured anything, property no longer used)
    # top_image = ""

    def to_dict(self):
        # Convert to a dictionary:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__
        # 1. Merge the internal __dict__ into a new dictionary
        data.update(self.__dict__)
        return data
class UrlPickingRequest:
    # Response URL of the list page
    # 1. Used as the Base URL when joining extracted relative URLs
    # 2. Base URL: must be the response URL
    # 3. Example: in Python, after requesting a URL with requests.get(url), use resp.url as the Base URL
    list_page_resp_url = ""
    # List page source
    # 1. Used for extracting article URLs
    list_page_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        url_picking_request = UrlPickingRequest()
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(url_picking_request, key, dictionary[key])
        return url_picking_request

    def to_dict(self):
        # Convert to a dictionary:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__
        # 1. Merge the internal __dict__ into a new dictionary
        data.update(self.__dict__)
        return data
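The from_dict/to_dict pair above is enough to round-trip these entities through JSON; a minimal usage sketch (the sample field values are made up):

import json

request = ExtractionRequest.from_dict({
    "lang_code": "en",
    "link_text": "Example headline",
    "article_html": "<html>...</html>",
})

result = ExtractionResult()
result.title = "Example headline"
# Serialize via to_dict, as the comments above suggest
json_str = json.dumps(result, default=ExtractionResult.to_dict)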
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install redis==4.3.5 -i https://pypi.douban.com/simple
pip install kafka-python==2.0.2 -i https://pypi.douban.com/simple
pip install PyMySQL -i https://pypi.douban.com/simple
pip install gne==0.3.0 -i https://pypi.douban.com/simple
pip install selenium==4.9.1 -i https://pypi.douban.com/simple
pip install logbook -i https://pypi.douban.com/simple
pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
selenium==3.141.0
selenium-wire==5.1.0
pip install --upgrade selenium
pip install --upgrade urllib3
pip3 uninstall urllib3
ImportError: urllib3 v2.0 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'OpenSSL 1.1.0i 14 Aug 2018'. See: https://github.com/urllib3/urllib3/issues/2168
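The ImportError above appears when urllib3 2.x runs on a Python interpreter whose ssl module was built against an OpenSSL older than 1.1.1. If the interpreter or OpenSSL cannot be upgraded, a common workaround is to pin urllib3 below 2.0, for example:

pip install "urllib3<2" -i https://mirrors.aliyun.com/pypi/simple/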
# -*- coding: utf-8 -*-
"""
Task integration test
1. Connect to Redis for keyword retrieval
2. Connect to Kafka to fetch and store messages
"""
import time
import redis
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from souhunewspider import SouhunewsSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
class SouhunewsTaskJob(object):
    def __init__(self):
        # Create the ConfigParser object
        self.config = configparser.ConfigParser()
        # Read the config file
        self.config.read('config.ini')
        self.r = redis.Redis(host=self.config.get('redis', 'host'),
                             port=self.config.get('redis', 'port'),
                             password=self.config.get('redis', 'pass'), db=0)
    def getkafka(self):
        # Address of the Kafka cluster
        bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
        # Topic to subscribe to
        topic = self.config.get('kafka', 'topic')
        groupId = self.config.get('kafka', 'groupId')
        consumer = KafkaConsumer(topic, group_id=groupId,
                                 bootstrap_servers=[bootstrap_servers],
                                 value_deserializer=lambda m: json.loads(m.decode('utf-8')))
        try:
            for record in consumer:
                try:
                    logger.info(f"value:{record.value}")
                    keymsg = record.value
                    if keymsg:
                        break
                    else:
                        continue
                    # print("%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
                except Exception as e:
                    logger.info(f"msg.value error:{e}")
        except KeyboardInterrupt as e:
            keymsg = {}
        finally:
            consumer.close()
        return keymsg
    def getkeyFromredis(self, codeid):
        kvalue = self.r.get('KEY_WORDS_TO_REDIS::' + codeid)
        kvalue = kvalue.decode('utf-8')
        kvalue = json.loads(kvalue)
        return kvalue
    def getkeywords(self, keywords):
        kwList = []
        if ')+(' in keywords:
            k1List = keywords.split('+')
            kk2 = []
            for k2 in k1List:
                k2 = k2.strip("()")
                k2List = k2.split('|')
                kk2.append(k2List)
            # Guard against expressions with more than four groups
            result = []
            if len(kk2) == 2:
                result = list(itertools.product(kk2[0], kk2[1]))
            elif len(kk2) == 3:
                result = list(itertools.product(kk2[0], kk2[1], kk2[2]))
            elif len(kk2) == 4:
                result = list(itertools.product(kk2[0], kk2[1], kk2[2], kk2[3]))
            for res in result:
                kwstr = ''
                for kw in res:
                    kwstr += kw + "+"
                kwList.append(kwstr.strip('+'))
        elif '+(' in keywords:
            k1List = keywords.split('+')
            kk2 = []
            for k2 in k1List:
                k2 = k2.strip("()")
                k2List = k2.split('|')
                kk2.append(k2List)
            result = []
            if len(kk2) == 2:
                result = list(itertools.product(kk2[0], kk2[1]))
            for res in result:
                kwstr = ''
                for kw in res:
                    kwstr += kw + "+"
                kwList.append(kwstr.strip('+'))
        else:
            k3 = keywords.split("|")
            kwList = k3
        return kwList
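    # A worked example of the expansion above (made-up keywords, not taken from task data):
    #   getkeywords('(A|B)+(C|D)')  ->  ['A+C', 'A+D', 'B+C', 'B+D']
    #   getkeywords('A|B|C')        ->  ['A', 'B', 'C']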
    def paserKeyMsg(self, keymsg):
        logger.info('----------')
        wordsCode = keymsg['wordsCode']
        id = keymsg['id']
        try:
            searchEngines = keymsg['searchEngines']
        except Exception as e:
            searchEngines = []
        kwList = []
        if searchEngines:
            if '3' in searchEngines:
                keyword = keymsg['keyWord']
                keymsglist = self.getkeywords(keyword)
                for kw in keymsglist:
                    kwmsg = {
                        'kw': kw,
                        'wordsCode': wordsCode,
                        'sid': id
                    }
                    kwList.append(kwmsg)
        else:
            pass
            # logger.info('+++++')
            # keyword=keymsg['keyWord']
            # keymsglist=self.getkeywords(keyword)
            # for kw in keymsglist:
            #     kwmsg={
            #         'kw':kw,
            #         'wordsCode':wordsCode,
            #         'sid':id
            #     }
            #     kwList.append(kwmsg)
        return kwList
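    # A hedged example of what paserKeyMsg produces (field values are made up;
    # '3' is assumed to be the engine id handled by this job):
    #   keymsg = {'id': '1001', 'wordsCode': 'W01', 'searchEngines': ['3'], 'keyWord': '(A|B)+(C|D)'}
    #   paserKeyMsg(keymsg) -> [{'kw': 'A+C', 'wordsCode': 'W01', 'sid': '1001'}, ...]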
    # def runSpider(self, kwmsg):
    #     try:
    #         searchkw = kwmsg['kw']
    #         wordsCode = kwmsg['wordsCode']
    #         sid = kwmsg['sid']
    #
    #         baiduSpider = BaiduSpider(searchkw, wordsCode, sid)
    #         baiduSpider.get_page_html()
    #         baiduSpider.get_detail_html()
    #     except Exception as e:
    #         logger.info('Baidu search exception ' + searchkw)
    #     finally:
    #         baiduSpider.driver.quit()
    #         logger.info("Keyword collection finished! " + searchkw)
    def runSpider(self, kwmsg):
        searchkw = kwmsg['kw']
        wordsCode = kwmsg['wordsCode']
        sid = kwmsg['sid']
        souhunewsSpider = SouhunewsSpider(searchkw, wordsCode, sid)
        try:
            souhunewsSpider.get_page_html()
        except Exception as e:
            # Retry the search page once before giving up
            try:
                souhunewsSpider.get_page_html()
            except Exception as e:
                logger.info('Sohu search exception ' + searchkw)
        if souhunewsSpider.detailList.qsize() != 0:
            try:
                souhunewsSpider.get_detail_html()
            except Exception as e:
                logger.info('Detail parsing exception ' + searchkw)
        logger.info("Keyword collection finished! " + searchkw)
if __name__ == '__main__':
    # ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
    # keymsglist=getkeywords(ss)
    # print(keymsglist)
    # Create the task job (which opens the Redis connection)
    souhunewsTaskJob = SouhunewsTaskJob()
    baseCore = BaseCore()
    logger = baseCore.getLogger()
    print('---------------')
    while True:
        try:
            try:
                keymsg = souhunewsTaskJob.getkafka()
                kwList = souhunewsTaskJob.paserKeyMsg(keymsg)
            except Exception as e:
                logger.info("Failed to fetch messages from Kafka!")
                time.sleep(5)
                continue
            if kwList:
                # Create a thread pool; max_workers limits concurrency (1 here)
                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                    # Submit one task per keyword message
                    results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
                    # Collect the results as tasks complete
                    for future in concurrent.futures.as_completed(results):
                        try:
                            result = future.result()
                            # Handle the task result
                            logger.info(f"Task finished: {result}")
                        except Exception as e:
                            # Handle exceptions raised inside the task
                            logger.info(f"Task raised an exception: {e}")
        except Exception as e:
            logger.info('Collection exception')
title baidu_comm
chcp 65001
cd /d %~dp0
python baidutaskJob.py