丁双波 / zzsn_spider / Commits / 176c0051

Commit 176c0051
Authored Jan 11, 2024 by 薛凌堃
Parent: 55d9be00

Showing 9 changed files with 242 additions and 82 deletions.
base/BaseCore.py                           +2    -0
comData/YanBao/resentYanbao.py             +12   -12
comData/shangbiao/tyc_shangbiao.py         +4    -2
comData/zhuanli/500qiang_zhuanli.py        +1    -1
comData/zhuanli/tyc_zhuanli.py             +158  -48
google_comm/test.py                        +8    -0
百度采集/baidu_comm/baiduSpider.py          +17   -13
百度采集/baidu_comm/baidutaskJob_comm.py    +36   -3
百度采集/baidu_comm/baseCore.py             +4    -3
base/BaseCore.py

@@ -663,6 +663,8 @@ class BaseCore:
             return 'cn'
+        if result[0] == '':
+            return 'cn'
         if result[0] == 'ja':
             return 'jp'
         return result[0]

     #创建excel文件
 ...
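The new guard backfills a default when the language detector comes back empty, alongside the existing 'ja' → 'jp' normalization. A minimal, self-contained sketch of the same logic, assuming the surrounding method hands in a (language, score) tuple in the style of langid (the detector itself is not visible in this diff):

    # Sketch only: `result` stands in for whatever detector BaseCore wraps.
    def normalize_language(result):
        # result is assumed to be a (lang, score) tuple, e.g. ('ja', -54.2)
        if result[0] == '':      # added in this commit: empty detection falls back to 'cn'
            return 'cn'
        if result[0] == 'ja':    # ISO code 'ja' is normalized to the site code 'jp'
            return 'jp'
        return result[0]

    print(normalize_language(('ja', -54.2)))  # -> 'jp'
    print(normalize_language(('', 0.0)))      # -> 'cn'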
comData/YanBao/resentYanbao.py

@@ -895,7 +895,7 @@ def qianyanzhishiku():
 def shijiejingjiluntan():
     allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
-    for i in range(76, 128):
+    for i in range(1, 2):
         # res = requests.get(url)
         # soup = BeautifulSoup(res.content,'html.parser')
 ...

@@ -1637,23 +1637,23 @@ if __name__ == '__main__':
     #     juliangsuanshu()
     # except Exception as e:
     #     pass
     try:
         log.info('ke36')
         ke36()
     except Exception as e:
         ke36()
         pass
     # try:
     #     log.info('qianyanzhishiku')
     #     qianyanzhishiku()
     #     log.info('ke36')
     #     ke36()
     # except Exception as e:
     #     ke36()
     #     pass
     # try:
     #     log.info('shijiejingjiluntan')
     #     shijiejingjiluntan()
     #     log.info('qianyanzhishiku')
     #     qianyanzhishiku()
     # except Exception as e:
     #     log.info(e)
     #     pass
     try:
         log.info('shijiejingjiluntan')
         shijiejingjiluntan()
     except Exception as e:
         log.info(e)
         pass
     # try:
     #     log.info('dongfangcaifu')
     #     dongfangcaifu()
 ...
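The allnum map translates Chinese-numeral months into zero-padded strings, and the loop shrink from range(76, 128) to range(1, 2) looks like a single-issue test run. A small sketch of how such a mapping is typically applied to report titles; the regex and the title format are illustrative assumptions, not taken from this file:

    import re

    allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06',
              '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}

    def month_from_title(title):
        # '十一'/'十二' must be tried before the single-character numerals
        m = re.search(r'年(十[一二]?|[一二三四五六七八九])月', title)
        return allnum[m.group(1)] if m else None

    print(month_from_title('世界经济论坛报告 二〇二三年十一月'))  # -> '11'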
comData/shangbiao/tyc_shangbiao.py

@@ -37,8 +37,8 @@ if __name__ == "__main__":
     while True:
         start_time = time.time()
         # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
-        social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
-        # social_code = '91350700856994874M'
+        # social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
+        social_code = '91130629MA0CG2DL51'
         # 判断 如果Redis中已经没有数据,则等待
         if social_code == None:
             # time.sleep(20)
 ...

@@ -149,3 +149,4 @@ if __name__ == "__main__":
             baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
             time.sleep(5)
+            break
\ No newline at end of file
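This loop consumes social credit codes from a Redis queue via baseCore.redicPullData(); the commit swaps the live pull for one hardcoded test code and breaks after a single pass. A minimal sketch of the queue hand-off that redicPullData() is assumed to wrap (redis-py list-pop semantics; the implementation is not shown in this diff):

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # connection details assumed

    def redic_pull_data(key):
        value = r.lpop(key)                  # returns None once the queue is drained
        return value.decode('utf-8') if value else None

    social_code = redic_pull_data('ShangBiao:gnshSocial_code')
    if social_code is None:                  # mirrors the `if social_code == None` branch
        print('queue empty, waiting')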
comData/zhuanli/500qiang_zhuanli.py

@@ -68,7 +68,7 @@ if __name__ == '__main__':
         place = data[6]
         if place == 1:
             log.info(f'{com_name}--国内')
-            baseCore.rePutIntoR('ZhuanLi_500:zgSocial_code', social_code)
+            baseCore.rePutIntoR('Zhuanli:gwSocial_code', social_code)
             continue
         if english_name_:
             pass
 ...
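The renamed key routes these companies into the queue that the reworked tyc_zhuanli.py below drains. A sketch of the rePutIntoR() side of that hand-off, assuming it is a plain Redis list push (only the key rename is visible in the diff):

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # connection details assumed

    def re_put_into_r(key, social_code):
        r.rpush(key, social_code)            # requeue the code for a later worker pass

    re_put_into_r('Zhuanli:gwSocial_code', '91350700856994874M')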
comData/zhuanli/tyc_zhuanli.py

-import requests, time, re, random
+import functools
+import random
+import threading
+import traceback
+import pymysql
+import requests, time
 from base import BaseCore
 import pandas as pd
 from bs4 import BeautifulSoup as bs
+import concurrent.futures
 from comData.Tyc.getTycId import getTycIdByXYDM

 baseCore = BaseCore.BaseCore()
-cnx = baseCore.cnx
-cursor = baseCore.cursor
+# cnx = baseCore.cnx
+# cursor = baseCore.cursor
 log = baseCore.getLogger()
-taskType = '天眼查专利/国内上市'
+taskType = '天眼查专利/国内榜单'
+# 需调整放入国外和国内的redis
+# 设置一个全局变量用于控制线程退出
+should_exit = False

+def connectSql():
+    cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji', charset='utf8mb4')
+    cursor = cnx.cursor()
+    return cnx, cursor

+#关闭数据库连接
+def closeSql(cnx, cursor):
+    cnx.close()
+    cursor.close()

+# 获取代理
+def get_proxy():
+    cnx, cursor = connectSql()
+    sql = "select proxy from clb_proxy"
+    cursor.execute(sql)
+    proxy_lists = cursor.fetchall()
+    cnx.commit()
+    closeSql(cnx, cursor)
+    ip_list = []
+    for proxy_ in proxy_lists:
+        ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
+    proxy_list = []
+    for str_ip in ip_list:
+        str_ip_list = str_ip.split('-')
+        proxyMeta = "http://%(host)s:%(port)s" % {
+            "host": str_ip_list[0],
+            "port": str_ip_list[1],
+        }
+        proxy = {
+            "http": proxyMeta,
+            "https": proxyMeta
+        }
+        proxy_list.append(proxy)
+    return proxy_list[random.randint(0, 4)]

-def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
-    start_time = time.time()
-    log.info(f'===正在处理第{page}页===')
+def spider_zhuanli(com_name, social_code, tycid):
+    page = 1
+    start_time = time.time()
+    # list_all_info = []
     t = int(time.time() * 1000)
     header = {
 ...

@@ -36,13 +78,14 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
         'sec-ch-ua-platform': '"Windows"',
         'version': 'TYC-Web'
     }
+    while True:
+        log.info(f'===正在处理第{page}页===')
         url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
         try:
-            ip = baseCore.get_proxy()
+            ip = get_proxy()
         except:
             time.sleep(2)
-            ip = baseCore.get_proxy()
+            ip = get_proxy()
         try:
             res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
         except:
 ...

@@ -60,21 +103,31 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
                     '企业名称': com_name,
                     '统一信用代码': social_code
                 }
+                cnx, cursor = connectSql()
                 selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
                 # lock.acquire()
                 cursor.execute(selectSql)
                 count = cursor.fetchone()[0]
+                closeSql(cnx, cursor)
                 # lock.release()
                 if count > 0:
                     log.info(f"{com_name}---{social_code}---已经存在---无专利")
-                    return 0
+                    log.info(f"---{social_code}----{tycid}--共{page-1}页--结束处理")
+                    break
                 else:
                     values_tuple = tuple(dic_info.values())
                     # log.info(f"{gpdm}-------{companyname}---新增")
+                    cnx, cursor = connectSql()
                     insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
                     # lock.acquire()
                     cursor.execute(insertSql, values_tuple)
                     cnx.commit()
                     # lock.release()
+                    closeSql(cnx, cursor)
                     log.info(f"{com_name}---{social_code}---新增---无专利")
-                    return 0
+                    # print(list_all)
+                    log.info(f"---{social_code}----{tycid}--共{page-1}页--结束处理")
+                    break
             if list_all:
                 for one_zhuanli in list_all:
                     title = one_zhuanli['title']
 ...

@@ -135,36 +188,42 @@ def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
                         '天眼查详情id': uuid,
                         '年份': shenqingri[:4]
                     }
+                    cnx, cursor = connectSql()
                     selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
                     # lock.acquire()
                     cursor.execute(selectSql)
                     count = cursor.fetchone()[0]
                     # lock.release()
+                    closeSql(cnx, cursor)
                     if count > 0:
                         log.info(f"{com_name}-------{shenqing_code}---已经存在")
                         continue
                     else:
                         values_tuple = tuple(dic_info.values())
                         # log.info(f"{gpdm}-------{companyname}---新增")
+                        cnx, cursor = connectSql()
                         insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                         cursor.execute(insertSql, values_tuple)
                         cnx.commit()
+                        closeSql(cnx, cursor)
                         log.info(f"{com_name}-------{shenqing_code}---新增")
                         time.sleep(2)
                 # list_all_info.append(dic_info)
                 log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
-                return page
+                page += 1
             else:
-                return 0
+                log.info(f"---{social_code}----{tycid}--共{page}页--结束处理")
+                break

-if __name__ == "__main__":
-    while True:
-        list_all_info = []
-        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
-        social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
-        # social_code = '91350700856994874M'
+def runSpider():
+    # 根据从Redis中拿到的社会信用代码, 在数据库中获取对应基本信息
+    # social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
+    social_code = '91360400794798498A'
     # 判断 如果Redis中已经没有数据,则等待
     if social_code == None:
         # time.sleep(20)
-        break
+        # 任务执行结束后设置should_exit为True
+        global should_exit
+        should_exit = True
     start = time.time()
     try:
         data = baseCore.getInfomation(social_code)
 ...

@@ -173,50 +232,48 @@ if __name__ == "__main__":
         else:
             # 数据重新塞入redis
             baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
-            continue
+            return False
         id = data[0]
         com_name = data[1]
         xydm = data[2]
         tycid = data[11]
+        place = data[6]
+        if place != 1:
+            baseCore.rePutIntoR('Zhuanli:gwSocial_code', social_code)
+            return False
         if tycid == None or tycid == '':
             try:
-                retData = getTycIdByXYDM(xydm)
+                retData = getTycIdByXYDM(social_code)
                 if retData['tycData'] and retData['reput']:
                     tycid = retData['tycData']['id']
                     # todo:写入数据库
-                    updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
+                    cnx, cursor = connectSql()
+                    updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{social_code}'"
                     cursor.execute(updateSql)
                     cnx.commit()
+                    closeSql(cnx, cursor)
                 elif not retData['tycData'] and retData['reput']:
                     state = 0
                     takeTime = baseCore.getTimeCost(start, time.time())
                     baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                     log.info(f'======={social_code}====重新放入redis====')
-                    baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
-                    continue
+                    baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
+                    return False
                 elif not retData['reput'] and not retData['tycData']:
-                    continue
+                    return False
             except:
                 state = 0
                 takeTime = baseCore.getTimeCost(start, time.time())
                 baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                 baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
                 continue
         count = data[17]
-        log.info(f"{id}---{xydm}----{tycid}----开始处理")
-        page = 1
-        while True:
-            page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
-            if page != 0:
-                page += 1
-            else:
-                # print(len(list_all_info))
-                # df_all_info = pd.DataFrame(list_all_info)
-                # df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
-                log.info(f"{id}---{xydm}----{tycid}----结束处理")
-                break
+        log.info(f"{id}---{social_code}----{tycid}----开始处理")
+        spider_zhuanli(com_name, social_code, tycid)
     except Exception as e:
         traceback.print_exc()
         log.info(f'==={social_code}=====获取企业信息失败==={e}=')
         # 重新塞入redis
         baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
 ...

@@ -224,3 +281,56 @@ if __name__ == "__main__":
         takeTime = baseCore.getTimeCost(start, time.time())
         baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
         time.sleep(5)
+    finally:
+        # global should_exit
+        # should_exit = True
+        return

+# if __name__ == "__main__":
+#     while True:
+#         # 创建一个线程池,指定线程数量为4
+#         with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+#             results = []
+#             while True:
+#                 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
+#                 social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
+#                 # social_code = '91350700856994874M'
+#                 # 判断 如果Redis中已经没有数据,则等待
+#                 if social_code == None:
+#                     # time.sleep(20)
+#                     break
+#
+#                 future = executor.submit(runSpider, social_code)
+#                 results.append(future)
+#             # 获取任务的执行结果
+#             for future in concurrent.futures.as_completed(results):
+#                 try:
+#                     result = future.result()
+#                     # 处理任务的执行结果
+#                     print(f"任务执行结束: {result}")
+#                 except Exception as e:
+#                     # 处理任务执行过程中的异常
+#                     # print(f"任务执行exception: {e}")
+#                     traceback.print_exc()

+def run_threads(num_threads):
+    threads = []
+    for i in range(num_threads):
+        thread = threading.Thread(target=runSpider)
+        threads.append(thread)
+        thread.start()
+    # while True:
+    #     if should_exit:
+    #         break
+    for thread in threads:
+        thread.join()

+if __name__ == '__main__':
+    while True:
+        start = time.time()
+        num_threads = 1
+        run_threads(num_threads)
+        log.info(f'5线程 总耗时{time.time()-start}秒')
google_comm/test.py (new file, mode 100644)

+import requests
+
+url = 'https://www.ctwant.com/article/308534'
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
+}
+req = requests.get(url, headers)
+print(req.text)
\ No newline at end of file
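One caveat about this new test script: requests.get(url, headers) binds the dict to the second positional parameter, which is params, so the User-Agent is sent as a query string rather than as a header. The intended call uses the keyword form:

    import requests

    url = 'https://www.ctwant.com/article/308534'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'}

    req = requests.get(url, headers=headers, timeout=10)   # timeout added for safety
    print(req.status_code, len(req.text))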
百度采集/baidu_comm/baiduSpider.py

 #coding=utf-8
 ...

@@ -25,7 +25,7 @@ from baseCore import BaseCore
 import configparser
 from smart_extractor import SmartExtractor
+# baseCore=BaseCore()

 class BaiduSpider(object):
     def __init__(self, searchkw, wordsCode, sid):
 ...

@@ -40,13 +40,15 @@ class BaiduSpider(object):
                                        port=self.config.get('redis', 'port'),
                                        password=self.config.get('redis', 'pass'), db=0)
         self.page_num = 1
-        chrome_driver = self.config.get('selenium', 'chrome_driver')
-        self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
-        path = Service(chrome_driver)
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.binary_location = self.config.get('selenium', 'binary_location')
-        self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
-        # driver = webdriver.Chrome(chrome_options=chrome_options)
+        # chrome_driver =self.config.get('selenium', 'chrome_driver')
+        # self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
+        # path = Service(chrome_driver)
+        # chrome_options = webdriver.ChromeOptions()
+        # chrome_options.binary_location = self.config.get('selenium', 'binary_location')
+        # proxy = baseCore.get_proxy()
+        # chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
+        # self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
+        # # driver = webdriver.Chrome(chrome_options=chrome_options)
         self.qtitle = Queue()
         self.qurl = Queue()
         self.detailList = Queue()
 ...

@@ -54,14 +56,16 @@ class BaiduSpider(object):
         self.wordsCode = wordsCode
         self.sid = sid

     def createDriver(self):
         chrome_driver = self.config.get('selenium', 'chrome_driver')
         self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
         path = Service(chrome_driver)
         chrome_options = webdriver.ChromeOptions()
-        chrome_options.binary_location = self.config.get('selenium', 'binary_location')
-        # 设置代理
-        # proxy = "127.0.0.1:8080" # 代理地址和端口
-        # chrome_options.add_argument('--proxy-server=http://' + proxy)
+        chrome_options.binary_location = self.config.get('selenium', 'binary_location')
+        proxy = baseCore.get_proxy()
+        chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
         self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
         # driver = webdriver.Chrome(chrome_options=chrome_options)

     #将列表数据插入到表中 meta_search_result
     def itemInsertToTable(self, items):
         try:
 ...
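Both driver changes in this file hinge on one detail: Chrome's --proxy-server flag wants a bare host:port, so the scheme is stripped from the pooled proxy URL before it is handed over. A sketch of that wiring with a made-up proxy value (note that Selenium 4 prefers the options= keyword over the chrome_options= spelling used in the commit):

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    proxy = {'http': 'http://203.0.113.10:8080', 'https': 'http://203.0.113.10:8080'}

    chrome_options = webdriver.ChromeOptions()
    # split('://')[1] drops the scheme, leaving host:port as Chrome expects
    chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])

    # service = Service('/path/to/chromedriver')   # driver path is an assumption
    # driver = webdriver.Chrome(service=service, options=chrome_options)
    print(chrome_options.arguments)                # ['--proxy-server=203.0.113.10:8080']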
百度采集/baidu_comm/baidutaskJob_comm.py

 # -*- coding: utf-8 -*-
 ...

@@ -12,12 +12,16 @@ from kafka import KafkaProducer
 from kafka import KafkaConsumer
 import json
 import itertools
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
 from baiduSpider import BaiduSpider
 import concurrent.futures
 from baseCore import BaseCore
 from queue import Queue
 import configparser
+from tqdm import tqdm

 class BaiduTaskJob(object):
     def __init__(self):
 ...

@@ -39,7 +43,7 @@ class BaiduTaskJob(object):
                                  bootstrap_servers=[bootstrap_servers],
                                  value_deserializer=lambda m: json.loads(m.decode('utf-8')))
         try:
-            for record in consumer:
+            for record in tqdm(consumer, desc="Consuming messages"):
                 try:
                     logger.info("value:", record.value)
                     keymsg = record.value
 ...

@@ -119,7 +123,15 @@ class BaiduTaskJob(object):
         kwList = []
         if searchEngines:
             if '3' in searchEngines:
-                keyword = keymsg['keyWord']
+                start_time = time.time()
+                keyword = keymsg['keyWord']
                 wordsName = keymsg['wordsName']
+                first = wordsName
+                if wordsName == first:
+                    end_time = time.time()
+                    if int(end_time - start_time) > 10:
+                        logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
                 logger.info(f"获取到关键词组:{wordsName}---{wordsCode}")
                 keymsglist = self.getkeywords(keyword)
                 for kw in keymsglist:
                     kwmsg = {
 ...

@@ -157,6 +169,25 @@ class BaiduTaskJob(object):
         # finally:
         #     baiduSpider.driver.quit()
         # logger.info("关键词采集结束!"+searchkw)

+    def createDriver(self):
+        chrome_driver = r'D:\cmd100\chromedriver.exe'
+        path = Service(chrome_driver)
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--ignore-certificate-errors')
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--start-maximized")
+        proxy = baseCore.get_proxy()
+        chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
+        chrome_options.add_argument('user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
+        # chrome_options.add_argument('--headless')
+        browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
+        return browser

     def runSpider(self, kwmsg):
         searchkw = kwmsg['kw']
         wordsCode = kwmsg['wordsCode']
 ...

@@ -166,6 +197,8 @@ class BaiduTaskJob(object):
             baiduSpider.get_page_html()
         except Exception as e:
+            try:
+                baiduSpider.driver.quit()
+                baiduSpider.driver = self.createDriver()
+                baiduSpider.get_page_html()
+            except Exception as e:
                 logger.info('百度搜索异常' + searchkw)
 ...
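The runSpider() change adds a one-shot retry: if get_page_html() fails, the wedged driver is discarded and a fresh one, with a newly drawn proxy, is created before trying again. The same pattern reduced to its shape, with hypothetical fetch() and make_driver() stand-ins for the spider call and createDriver():

    def crawl_with_retry(driver, make_driver, fetch):
        try:
            fetch(driver)
        except Exception:
            try:
                driver.quit()             # drop the possibly-wedged browser session
                driver = make_driver()    # fresh driver, fresh proxy
                fetch(driver)
            except Exception as e:
                print('百度搜索异常', e)   # second failure: log and give up
        return driver

    class FakeDriver:                     # stub so the sketch runs without Selenium
        def quit(self):
            pass

    crawl_with_retry(FakeDriver(), FakeDriver, lambda d: None)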
百度采集/baidu_comm/baseCore.py

 # -*- coding: utf-8 -*-
 ...

@@ -293,6 +293,7 @@ class BaseCore:
         sql = "select proxy from clb_proxy"
         self.__cursor_proxy.execute(sql)
         proxy_lists = self.__cursor_proxy.fetchall()
+        self.__cnx_proxy.commit()
         ip_list = []
         for proxy_ in proxy_lists:
             ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
 ...

@@ -304,8 +305,8 @@ class BaseCore:
             "port": str_ip_list[1],
         }
         proxy = {
-            "HTTP": proxyMeta,
-            "HTTPS": proxyMeta
+            "http": proxyMeta,
+            "https": proxyMeta
         }
         proxy_list.append(proxy)
     return proxy_list[random.randint(0, 3)]
 ...
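The key-case fix matters because requests matches the request URL's scheme, which is always lowercase, against the proxies dict; entries keyed "HTTP"/"HTTPS" are silently ignored and every request goes out direct. This is easy to confirm with requests' own helper:

    from requests.utils import select_proxy

    bad = {'HTTP': 'http://203.0.113.10:8080', 'HTTPS': 'http://203.0.113.10:8080'}
    ok = {'http': 'http://203.0.113.10:8080', 'https': 'http://203.0.113.10:8080'}

    print(select_proxy('https://example.com', bad))   # None: no proxy selected
    print(select_proxy('https://example.com', ok))    # http://203.0.113.10:8080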