Commit 6146dfc0 by LiuLiYuan

Merge remote-tracking branch 'origin/master'

@@ -15,6 +15,11 @@ from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
# Create the database connection pool
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB  # DBUtils 1.x path; newer releases use `from dbutils.pooled_db import PooledDB`
# Note: call BaseCore.close() before the program exits to release these resources
class BaseCore:
@@ -233,6 +238,20 @@ class BaseCore:
# Connect to Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='root',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
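# Illustrative sketch (hypothetical helper, not part of this commit) of how the
# methods below consume this pool: connection() borrows a connection, and
# closing the borrowed connection returns it to the pool instead of tearing it down.
def _example_pool_query(self):
    conn = self.pool_caiji.connection()
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT 1")
        return cursor.fetchone()
    finally:
        cursor.close()
        conn.close()  # hands the connection back to the pool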
def close(self):
try:
self.__cursor_proxy.close()
@@ -434,32 +453,66 @@ class BaseCore:
# Look up enterprise information by social credit code
def getInfomation(self, social_code):
data = []
try:
sql = "SELECT * FROM EnterpriseInfo WHERE SocialCode = %s"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql, (social_code,))
data = cursor.fetchone()
data = list(data)
cursor.close()
conn.close()
except Exception as e:
log = self.getLogger()
log.info(f'=========Database operation failed: {e}========')
return data
# Update an enterprise's crawl count
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = %s WHERE SocialCode = %s"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update, (count, social_code))
conn.commit()
cursor.close()
conn.close()
except Exception as e:
log = self.getLogger()
log.info(f'======Database update failed: {e}======')
# Persist a log record to the database
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
#     self.cursor.execute(sql, values)
# except Exception as e:
#     print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql, values)
cnn.commit()
cursor.close()
cnn.close()
except Exception as ex:
log = self.getLogger()
log.info(f'======Failed to save the log record: {ex}=====')
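# Illustrative sketch: getInfomation, updateRun and recordLog all repeat the
# same borrow/execute/commit/close dance. A hedged refactor (hypothetical
# helper, not in this commit) that centralizes the pattern:
def _run_sql(self, sql, params=None, commit=False, fetch=False):
    conn = self.pool_caiji.connection()
    cursor = conn.cursor()
    try:
        cursor.execute(sql, params or ())
        if commit:
            conn.commit()
        return cursor.fetchone() if fetch else None
    finally:
        cursor.close()
        conn.close()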
# Fetch the Qichacha token
def GetToken(self):
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0'  # captured manually; needs refreshing roughly every two hours
@@ -491,8 +544,8 @@ class BaseCore:
# return combined_data
# Re-queue failed or interrupted enterprises into Redis
def rePutIntoR(self, key, item):
self.r.rpush(key, item)
# Increment the counter and return the incremented value
def incrSet(self,key):
@@ -518,3 +571,10 @@ class BaseCore:
import time
from base import BaseCore
from apscheduler.schedulers.blocking import BlockingScheduler
import pymysql
basecore = BaseCore.BaseCore()
log = basecore.getLogger()
r = basecore.r
def conn11():
conn = pymysql.Connect(host='114.116.44.11', port=3306, user='root', passwd='f7s0&7qqtK', db='clb_project',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
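# Illustrative sketch: a hedged convenience wrapper around conn11() that
# guarantees cleanup even on exceptions (hypothetical, not in this commit):
from contextlib import contextmanager

@contextmanager
def clb_conn():
    conn, cursor = conn11()
    try:
        yield conn, cursor
    finally:
        cursor.close()
        conn.close()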
# Enterprise announcements
def yahooCodeFromSql():
conn,cursor=conn11()
try:
gn_query = "select securities_code from sys_base_enterprise_ipo where category in ('4','5','6') "
cursor.execute(gn_query)
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
print('=======')
for item in gn_social_list:
r.rpush('NoticeEnterprise:securities_code', item)
except Exception as e:
log.info(f"Database query failed: {e}")
finally:
cursor.close()
conn.close()
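# Illustrative sketch: the rpush above is the producer half of a Redis list
# queue. A hedged sketch of the matching consumer side (queue name is from this
# file; the loop itself is hypothetical):
def consume_codes():
    while True:
        code = r.lpop('NoticeEnterprise:securities_code')
        if code is None:
            break  # queue drained
        log.info(f'processing {code}')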
def yahooCode_task():
# Instantiate a scheduler
scheduler = BlockingScheduler()
# Run once a day
# scheduler.add_job(yahooCodeFromSql, 'cron', hour=0, minute=0)
# Run every 3 days
scheduler.add_job(yahooCodeFromSql, 'interval', days=3)
try:
yahooCodeFromSql()  # run once before the schedule starts
scheduler.start()
except Exception as e:
print('Scheduled collection error', e)
if __name__ == "__main__":
start = time.time()
# NoticeEnterprise()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterpriseAbroad()
# NewsEnterprise_task()
# NewsEnterprise()
# BaseInfoEnterprise()
# FBS()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
# NoticeEnterprise()
yahooCode_task()
log.info(f'====={basecore.getNowTime(1)}=====data queued successfully======took: {basecore.getTimeCost(start, time.time())}===')
# cnx.close()
# cursor.close()
# basecore.close()
[redis]
host=114.115.236.206
port=6379
pass=clbzzsn
[mysql]
host=114.115.159.144
username=root
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
[kafka]
bootstrap_servers=114.115.159.144:9092
topic=keyWordsInfo
groupId=python_baidu
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
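# Illustrative sketch of loading the INI sections above with the Python
# standard library (the file name config.ini is an assumption):
import configparser

config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')
redis_host = config.get('redis', 'host')
mysql_db = config.get('mysql', 'database')
chrome_driver = config.get('selenium', 'chrome_driver')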
"""
企业上市信息:只有上市的企业才能如企业库,未上市企业跳过采集步骤。退市企业标注为0
"""
import json
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib3
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from gpdm import Gpdm
baseCore = BaseCore()
chromedriver = "./chromedriver"
browser = webdriver.Chrome(chromedriver)
taskType = '上市信息/东方财富网'
gpdm = Gpdm()
gpdmList = gpdm.doJob()
log = baseCore.getLogger()
error_list = []
list_all_info = []
# Requires the stock code and the enterprise credit code
for com_code1 in gpdmList:
start = time.time()
# Codes starting with 0, 2 or 3 trade on the Shenzhen exchange; 6 or 9 on Shanghai; 4 or 8 on Beijing
if com_code1[0] == '2' or com_code1[0] == '0' or com_code1[0] == '3':
com_code = 'sz' + com_code1
elif com_code1[0] == '9' or com_code1[0] == '6':
com_code = 'sh' + com_code1
elif com_code1[0] == '8' or com_code1[0] == '4':
com_code = 'bj' + com_code1
elif com_code1[0] == 'A':
com_code = ''
else:
continue  # unknown prefix; otherwise com_code would leak from the previous iteration
log.info(f'======Start collecting {com_code}======')
url = f'https://quote.eastmoney.com/{com_code}.html'
url_1 = f'https://emweb.eastmoney.com/PC_HSF10/CompanySurvey/PageAjax?code={com_code}'
url_2 = f'https://emweb.eastmoney.com/PC_HSF10/BusinessAnalysis/PageAjax?code={com_code}'
browser.get(url)
time.sleep(8)
page_source = browser.page_source
soup_t = BeautifulSoup(page_source, 'html.parser')
try:
result = soup_t.find('div', class_='quote_quotenums').text
# print(f'result:{result}')
# if result == '未上市' or result == '已退市':
if result == '未上市':
continue
if result == '已退市':
tag = 0
else:
tag = 1
except Exception as e:
error_list.append(com_code)
log.info(f'={com_code}===failed to parse listing status=====')
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code} failed to parse listing status--e:{e}')
print('error')
continue  # tag would be unbound below without this
requests.adapters.DEFAULT_RETRIES = 5  # note: has no effect here; HTTPAdapter bound this default when requests was imported
json_1 = requests.get(url_1, verify=False).json()
json_2 = requests.get(url_2, verify=False).json()
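# Illustrative sketch: for retries that actually take effect, configure an
# adapter on a Session explicitly (hedged sketch; this would normally sit at
# module level, outside this loop):
#   from requests.adapters import HTTPAdapter
#   from urllib3.util.retry import Retry
#   session = requests.Session()
#   session.mount('https://', HTTPAdapter(max_retries=Retry(total=5, backoff_factor=1)))
#   json_1 = session.get(url_1, verify=False).json()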
# SECURITY_TYPE
try:
jys = json_1['jbzl'][0]['TRADE_MARKET']
except Exception as e:
log.info(f'====={com_code}=====failed to parse the exchange======')
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog('', taskType, state, takeTime, '', f'{com_code} failed to parse the exchange--e:{e}')
continue
if "上海" in jys:
jys_code = '2'
elif "深圳" in jys:
jys_code = '3'
else:
# TRADE_MARKET names neither exchange; fall back to SECURITY_TYPE for Beijing listings
jys = json_1['jbzl'][0]['SECURITY_TYPE']
if "北京" in jys:
jys_code = '1'
short_name = json_1['jbzl'][0]['STR_NAMEA']
zhengquan_type = json_1['jbzl'][0]['SECURITY_TYPE']
# print(zhengquan_type)
if 'A' in zhengquan_type:
# print(zhengquan_type)
category = '1'
if 'B' in zhengquan_type:
category = '2'
if '新三板' in zhengquan_type:
category = '3'
if 'H' in zhengquan_type:
category = '4'
id_code = json_1['jbzl'][0]['REG_NUM']
dongcai = json_1['jbzl'][0]['EM2016']
zhengjian = json_1['jbzl'][0]['INDUSTRYCSRC1']
try:
shangshishijian = json_1['fxxg'][0]['LISTING_DATE'][:10]
except:
shangshishijian = ''
zhuyingfanwei = json_2['zyfw'][0]['BUSINESS_SCOPE']
dic_cwsj = {
"exchange": jys_code,
"category": category, # 股票类型(1-A股;2-B股;3-新三板;4-H股)
'listed':tag,
"listingDate": shangshishijian,
"securitiesCode": com_code[2:],
"securitiesShortName": short_name,
"securitiesType": zhengquan_type,
"socialCreditCode": id_code,
"businessScope": zhuyingfanwei,
"eastIndustry": dongcai,
"csrcIndustry": zhengjian
}
list_all_info.append(dic_cwsj)
log.info(f'======{com_code}====collection succeeded=====')
# Save the data to the database through the sync API, 100 records per batch
for num in range(0, len(list_all_info), 100):
json_updata = json.dumps(list_all_info[num:num + 100])
# print(json_updata)
try:
response = requests.post('http://114.115.236.206:8088/sync/enterpriseIpo', data=json_updata, timeout=300,
verify=False)
except Exception as e:
print(e)
continue  # skip this batch; response is unbound on failure
print("{} to {}".format(num, num + 100))
print(response.text)
@@ -212,65 +212,18 @@ def get_info(sid,json_search,origin,url_,info_source_code,page):
continue
return list_all_info,num_caiji
def RequestUrl(dic_url, token, key):
start_ = time.time()
url_ = dic_url['url_']
origin = dic_url['name']
info_source_code = dic_url['info_source_code']
sid = dic_url['sid']
biz = dic_url['biz']
fakeid = biz + '=='
url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
ret = -1
json_search = ''
# Get the page count
try:
# ip = baseCore.get_proxy()
json_search = s.get(url_search, headers=headers,
......@@ -281,7 +234,8 @@ def job(count,key):
log.error(f'===Request for official account {origin} failed! Current time: {baseCore.getNowTime(1)}======={e}===')
rePutIntoR(info_source_code)
time.sleep(20)
return json_search, ret
ret = json_search['base_resp']['ret']
# {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
# TODO: inspect the return value to tell an account ban from a biz error
@@ -304,7 +258,7 @@ def job(count,key):
# browser_run.refresh()
r.set(key, 50)
r.expire(key, 5400)
return json_search, ret
elif ret == 200002:
# Bad account link: save to the DB, recording the error info and error type
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -320,7 +274,7 @@ def job(count,key):
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
log.info(f'Official account----{origin}----took {baseCore.getTimeCost(start_, time.time())}')
return json_search, ret
elif ret == 200003:
# Invalid session
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -336,7 +290,7 @@ def job(count,key):
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
log.info(f'Official account----{origin}----took {baseCore.getTimeCost(start_, time.time())}')
return json_search, ret
else:
log.info(f'----Other case-----{json_search}---official account {origin}------')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -351,7 +305,65 @@ def job(count,key):
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
return json_search,ret
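# The return-code contract observed in RequestUrl (values taken from the code
# above; the throttling/ban branch's exact ret value is elided in this diff):
#   ret == 0      -> json_search is usable; the caller proceeds to paging
#   ret == 200002 -> bad account link; an error row is written to WeixinGZH
#   ret == 200003 -> invalid session; an error row is written to WeixinGZH
#   other         -> logged and recorded as an unknown error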
def job(count, key):
# Refresh the browser and grab the current token and cookies
token, cookies = flushAndGetToken(list_b)
log.info('===========Fetching official account============')
start_ = time.time()
# todo: pop one record from redis
infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
if infoSourceCode == 'None' or infoSourceCode == None:
# After a full pass, re-insert the data and wait for the insert to finish
getFromSql()
time.sleep(20)
log.info(f'========This round of official accounts is finished; {count} accounts collected=========total time: {baseCore.getTimeCost(start_, time.time())}')
return count
sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
# '一带一路百人论坛'
# sql = f"-- SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = 'IN-20220609-57436' "
cursor.execute(sql)
row = cursor.fetchone()
dic_url = {
'url_': row[0],
'sid': row[1],
'name': row[2],
'info_source_code': row[3],
'biz': ''
}
log.info('===========Fetching biz==========')
s.cookies.update(cookies)
s.keep_alive = False
url_ = dic_url['url_']
origin = dic_url['name']
info_source_code = dic_url['info_source_code']
sid = dic_url['sid']
try:
biz = url_.split('__biz=')[1].split('==&')[0].split('=')[0]
dic_url['biz'] = biz
except Exception as e:
log.info(f'---Official account--{origin}---biz error')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
error = [
origin,
url_,
info_source_code,
e,
'biz错误',
time_now
]
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
return count
json_search,ret = RequestUrl(dic_url,token,key)
if ret == 0:
try:
Max_data = int(json_search['app_msg_cnt'])
Max_page = int(int(json_search['app_msg_cnt']) / 5)
@@ -364,22 +376,13 @@ def job(count,key):
Max_data = 5
log.info(f'Start collecting {origin}-----{Max_page} pages---{Max_data} items in total-----')
for i in range(0, Max_data, 5):
# url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
# https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
json_search, ret = RequestUrl(dic_url, token, key)
if ret == 0:
pass
else:
return count
if json_search != '':
# list_all = json_search['app_msg_list']
try:
# Start collecting the article info for each page
page = int(i/5+1)
@@ -422,9 +425,12 @@ def job(count,key):
insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
cursor_.execute(insertSql, tuple(false))
cnx_.commit()
log.info(f'{biz}, official account {origin}: collection failed!!!!!! took {baseCore.getTimeCost(start_, time.time())}')
count += 1
log.info(f'{biz}, official account {origin}: collection succeeded! {count} accounts collected so far, took {baseCore.getTimeCost(start_, time.time())}')
return count
else:
return count
time.sleep(2)
return count