Commit 5c05e843  Author: 薛凌堃

10/24

Parent cdc4a715
@@ -23,6 +23,7 @@ from DBUtils.PooledDB import PooledDB
 # sys.path.append('D://zzsn_spider//base//fdfs_client')
 from fdfs_client.client import get_tracker_conf, Fdfs_client
+import uuid
 tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
 client = Fdfs_client(tracker_conf)
@@ -682,12 +683,13 @@ class BaseCore:
                 id = ''
             return id
         else:
-            Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+            Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
             values = (
                 year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
                 status, create_by,
-                pub_time, page_size)
+                create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
+                pub_time)
             self.cursor_.execute(Upsql, values)  # insert
             self.cnx_.commit()  # commit
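A cheap guard against column/placeholder drift when widening an insert like this: the new statement carries 17 columns, so exactly 17 %s markers must line up with a 17-item values tuple. A standalone sketch using the statement from the diff:

    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    assert Upsql.count('%s') == 17  # one placeholder per column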
@@ -735,6 +737,12 @@ class BaseCore:
         else:
             self.getLogger().info(f'=====文件存在obs========{file_path}')

+    # uuid: generate the file name from a timestamp for upload to OBS
+    def getuuid(self):
+        get_timestamp_uuid = uuid.uuid1()  # uuid generated from the timestamp, globally unique
+        return get_timestamp_uuid
+
     def uptoOBS(self, pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time, create_by):
         headers = {}
         retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
@@ -751,7 +759,7 @@ class BaseCore:
                 time.sleep(3)
                 continue
         page_size = 0
-        name = name_pdf
+        name = str(self.getuuid()) + '.pdf'
         now_time = time.strftime("%Y-%m")
         try:
             result = self.getOBSres(pathType, now_time, name, response)
@@ -773,8 +781,8 @@ class BaseCore:
         try:
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             retData['state'] = True
-            retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
-            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['path'] = result['body']['objectUrl'].split('.com')[1]
+            retData['full_path'] = result['body']['objectUrl']
             retData['file_size'] = self.convert_size(file_size)
             retData['create_time'] = time_now
             retData['page_size'] = page_size
@@ -788,6 +796,6 @@ class BaseCore:
     @retry(tries=3, delay=1)
     def getOBSres(self, pathType, now_time, name, response):
-        result = obsClient.putContent('zzsn', f'{pathType}{now_time}/' + name, content=response.content)
-        # resp = obsClient.putFile('zzsn', f'{pathType}{now_time}/' + name, file_path='local path of the file to upload')
+        result = obsClient.putContent('zzsn', pathType + name, content=response.content)
+        # resp = obsClient.putFile('zzsn', pathType + name, file_path='local path of the file to upload')
         return result
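Taken together, these BaseCore changes stop deriving OBS object names from document titles: files now land under a timestamp-uuid key directly beneath pathType, the returned URL is stored without percent-decoding, and the object key and bucket are persisted with the attachment row. A minimal sketch of the resulting flow — the ObsClient credentials and the pathType value are placeholders; the bucket 'zzsn' and the URL prefix come from the diff:

    import uuid
    import requests
    from obs import ObsClient

    obsClient = ObsClient(access_key_id='AK', secret_access_key='SK',
                          server='https://obs.cn-north-1.myhuaweicloud.com')  # placeholder credentials

    def upload_pdf(pdf_url, pathType):
        response = requests.get(pdf_url, timeout=30)
        name = str(uuid.uuid1()) + '.pdf'  # uuid1 embeds the timestamp, so names cannot collide
        result = obsClient.putContent('zzsn', pathType + name, content=response.content)
        full_path = result['body']['objectUrl']
        object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
        return {'full_path': full_path, 'object_key': object_key, 'bucket_name': 'zzsn'}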
@@ -71,7 +71,7 @@ def NewsEnterprise():
     # Push the data into redis
     for item in gn_social_list:
         r.rpush('NewsEnterprise:gnqy_socialCode', item)
-        # r.rpush('NewsEnterprise:gnqybuchong_socialCode', item)
+        # r.rpush('NewsEnterprise:gnqybc_socialCode', item)
     # for item in gw_social_list:
     #     r.rpush('NewsEnterprise:gwqy_socialCode', item)
@@ -256,7 +256,7 @@ def AnnualEnterprise_task():
 def AnnualEnterpriseXueQ():
     cnx, cursor = connectSql()
     # Fetch domestic listed companies
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null and isIPO = 1 limit 10"
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null and isIPO = 1"
     cursor.execute(gn_query)
     gn_result = cursor.fetchall()
     cnx.commit()
@@ -518,6 +518,19 @@ def fbspdfurlinfo():
     for item in com_namelist:
         r.rpush('ompdfurlinfo:id', item)

+def dujs_1020():
+    cnx, cursor = connectSql()
+    query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=7 AND a.Place=1"
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cnx.commit()
+    com_namelist = [item[0] for item in result]
+    for item in com_namelist:
+        r.rpush('dujs_1020:baseinfo_socialcode', item)
+        # r.rpush('dujs_1020:news_socialcode', item)
+        # r.rpush('dujs_1020:person_socialcode', item)
+
 if __name__ == "__main__":
     start = time.time()
     # fbspdfurlinfo()
@@ -530,13 +543,15 @@ if __name__ == "__main__":
     # AnnualEnterprise()
     # BaseInfoEnterpriseAbroad()
     # NewsEnterprise_task()
-    # NewsEnterprise()
+    NewsEnterprise()
+    # AnnualEnterpriseXueQ()
+    # dujs_1020()
     # dujioashou()
     # BaseInfoEnterprise()
     # FBS()
     # MengZhi()
     # NQEnterprise()
-    SEC_CIK()
+    # SEC_CIK()
     # dujioashou()
     # omeng()
     # AnnualEnterpriseUS()
......
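The dispatch functions above are plain Redis producers: each pushes social credit codes onto a named list, and the spiders (via baseCore.redicPullData, which appears to wrap an lpop-style read) consume them one per loop. A minimal sketch of both sides, with placeholder connection details:

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # placeholder connection

    # producer side: one social credit code per task
    r.rpush('dujs_1020:baseinfo_socialcode', '91100000100003962T')

    # consumer side: pop one code per loop; None means the queue is drained
    social_code = r.lpop('dujs_1020:baseinfo_socialcode')
    if social_code:
        print(social_code.decode())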
 import json
@@ -316,7 +316,8 @@ if __name__ == '__main__':
     while True:
         start_time = time.time()
         # Fetch the enterprise info
-        social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
+        # social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
+        social_code = '91100000100003962T'
         if not social_code:
             time.sleep(20)
             continue
......
 # -*- coding: utf-8 -*-
@@ -16,8 +16,22 @@ import requests, re, time, pymysql, fitz
 from bs4 import BeautifulSoup as bs
 from selenium import webdriver

-chromedriver = "D:/chrome/chromedriver.exe"
-browser = webdriver.Chrome(chromedriver)
+# chromedriver = "D:/chrome/chromedriver.exe"
+# browser = webdriver.Chrome(chromedriver)
+opt = webdriver.ChromeOptions()
+opt.add_argument(
+    'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
+opt.add_argument("--ignore-certificate-errors")
+opt.add_argument("--ignore-ssl-errors")
+opt.add_experimental_option("excludeSwitches", ["enable-automation"])
+opt.add_experimental_option('excludeSwitches', ['enable-logging'])
+opt.add_experimental_option('useAutomationExtension', False)
+opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
+chromedriver = r'D:/cmd100/chromedriver.exe'
+browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+
 from fdfs_client.client import get_tracker_conf, Fdfs_client

 log = baseCore.getLogger()
 requests.adapters.DEFAULT_RETRIES = 3
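The options block above mirrors the repo's other drivers: the excludeSwitches("enable-automation") / useAutomationExtension pair suppresses Chrome's "being controlled by automated software" banner and the automation extension, two easy fingerprints for anti-bot checks. Note that chrome_options= and executable_path= are the Selenium 3 calling convention; under Selenium 4 the equivalent setup would look like this (a sketch, same local paths assumed):

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    opt = webdriver.ChromeOptions()
    opt.add_experimental_option("excludeSwitches", ["enable-automation"])  # hide the automation banner
    opt.add_experimental_option('useAutomationExtension', False)
    opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
    browser = webdriver.Chrome(service=Service(r'D:/cmd100/chromedriver.exe'), options=opt)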
@@ -73,7 +87,8 @@ def spider_annual_report(dict_info,num):
     for i in list_all:
         # ip = get_proxy()[random.randint(0, 3)]
         pdf_name_a = i.text
+        if 'H股公告' in pdf_name_a:
+            continue
         year_url = 'https://vip.stock.finance.sina.com.cn' + i.get('href')
         year_name = i.text
         browser.get(year_url)
@@ -93,7 +108,7 @@ def spider_annual_report(dict_info,num):
             baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
             continue
         # Announcement date
-        pub_time = soup_2.find('td', {'class': 'head'}).text.split('公告日期')[1]
+        pub_time = soup_2.find('td', {'class': 'head'}).text.split('公告日期:')[1]
         try:
             # The title contains a year,
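The delimiter fix matters because the cell text carries a full-width colon; splitting on '公告日期' alone leaves the colon glued to the date. Assuming the cell reads like '公告日期:2023-04-29':

    head = '公告日期:2023-04-29'
    assert head.split('公告日期')[1] == ':2023-04-29'  # old split: stray colon kept in pub_time
    assert head.split('公告日期:')[1] == '2023-04-29'  # new split: clean date string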
@@ -169,12 +184,12 @@ def spider_annual_report(dict_info,num):
             state = 1
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功')
-        except:
+        except Exception as e:
             exception = '数据库传输失败'
             state = 0
             takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
+            baseCore.recordLog(social_code, taskType, state, takeTime, year_url, f'{exception} - --{e}')
+            return False

         # Send the data to kafka
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         dic_news = {
@@ -210,7 +225,7 @@ def spider_annual_report(dict_info,num):
                 'message': '操作成功',
                 'code': '200',
             }
-            print(dic_result)
+            log.info(dic_result)
             # return True
         except Exception as e:
             dic_result = {
@@ -222,7 +237,7 @@ def spider_annual_report(dict_info,num):
             state = 0
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
-            print(dic_result)
+            log.info(dic_result)
             return False
         # num = num + 1
@@ -240,7 +255,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # Fetch the enterprise info
         # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
-        social_code = '9133060072360502XQ'
+        social_code = '91100000100003962T'
         if not social_code:
             time.sleep(20)
             continue
@@ -272,11 +287,12 @@ if __name__ == '__main__':
             'code': code,
         }
         # list_info.append(dict_info)
-        spider_annual_report(dict_info, num)
+        if spider_annual_report(dict_info, num):
             count += 1
             runType = 'AnnualReportCount'
             baseCore.updateRun(social_code, runType, count)
+        break
     # cursor.close()
     cnx_.close()
     # Release resources
......
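One thing to verify in this change: spider_annual_report now returns False on a database failure, and the runner only bumps AnnualReportCount when the call is truthy — but the `# return True` in the Kafka success branch is still commented out, so unless the function returns True somewhere later, the success path falls through to None and the counter never advances. A condensed sketch of the intended contract (the helper name is hypothetical):

    def spider_annual_report(dict_info, num):
        try:
            save_report(dict_info)  # hypothetical stand-in for the DB + Kafka steps
        except Exception:
            return False            # runner skips the counter update
        return True                 # needed for `if spider_annual_report(...)` to count successes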
@@ -4,6 +4,7 @@ import random
 import socket
 import sys
 import time
+import uuid
 import logbook
 import logbook.more
@@ -742,6 +743,12 @@ class BaseCore:
         else:
             self.getLogger().info(f'=====文件存在obs========{file_path}')

+    import uuid
+    def getuuid(self):
+        get_timestamp_uuid = uuid.uuid1()  # uuid generated from the timestamp, globally unique
+        return get_timestamp_uuid
+
     def uptoOBS(self, pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time, create_by, file_path):
         headers = {}
         retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
@@ -758,7 +765,7 @@ class BaseCore:
                 time.sleep(3)
                 continue
-        name = name_pdf + '.pdf'
+        name = str(self.getuuid()) + '.pdf'
         now_time = time.strftime("%Y-%m")
         try:
             result = self.getOBSres(pathType, now_time, name, file_path)
......
@@ -9,7 +9,7 @@ from kafka import KafkaProducer
 from obs import ObsClient
 import fitz
 from urllib.parse import unquote
+import uuid
 from retry import retry

 obsClient = ObsClient(
@@ -29,7 +29,7 @@ type_id = 1
 create_by = 'XueLingKun'
 taskType = '企业年报'
 # For 付俊雪's batch this should be 巨潮资讯网1_福布斯2000_PDF_60_付
-file_path = 'D:\\年报\\失败'
+file_path = 'D:\\年报\\欧盟记分牌2500_年报补充_87_20231020'
 log.info(f'=============当前pid为{baseCore.getPID()}==============')

 def sendKafka(dic_news):
@@ -66,6 +66,11 @@ def sendKafka(dic_news):
         log.info(dic_result)
         return False

+def getuuid():
+    get_timestamp_uuid = uuid.uuid1()  # uuid generated from the timestamp, globally unique
+    return get_timestamp_uuid
+
 def uptoOBS(retData, pathType, taskType, start_time, file_name, pdf_path):
     """
     retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
@@ -93,7 +98,8 @@ def uptoOBS(retData, pathType, taskType, start_time, file_name, pdf_path):
                'create_time': create_time, 'page_size': page_size, 'content': content}
     try:
-        result = getOBSres(pathType, file_name, pdf_path)
+        name = str(getuuid()) + '.pdf'
+        result = getOBSres(pathType, name, pdf_path)
     except:
         log = baseCore.getLogger()
         log.error(f'OBS发送失败')
@@ -101,8 +107,8 @@ def uptoOBS(retData, pathType, taskType, start_time, file_name, pdf_path):
     try:
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         retData_f['state'] = True
-        retData_f['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
-        retData_f['full_path'] = unquote(result['body']['objectUrl'])
+        retData_f['path'] = result['body']['objectUrl'].split('.com')[1]
+        retData_f['full_path'] = result['body']['objectUrl']
         retData_f['create_time'] = time_now
     except Exception as e:
         state = 0
......
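Dropping unquote() here is consistent with the uuid renaming: title-based object keys came back percent-encoded from OBS, while uuid keys are plain ASCII and can be stored verbatim. A small illustration — the 'QYYearReport' prefix is a made-up pathType:

    import uuid
    from urllib.parse import quote, unquote

    base = 'https://zzsn.obs.cn-north-1.myhuaweicloud.com/QYYearReport/'
    old_url = base + quote('2022年年度报告.pdf')            # title-based key, percent-encoded
    assert unquote(old_url).endswith('2022年年度报告.pdf')   # old code decoded it before storing
    new_url = base + str(uuid.uuid1()) + '.pdf'             # uuid key, nothing to decode
    assert new_url.split('.com')[1].startswith('/QYYearReport/')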
@@ -51,7 +51,7 @@ if __name__=="__main__":
     opt.add_experimental_option('excludeSwitches', ['enable-logging'])
     opt.add_experimental_option('useAutomationExtension', False)
     opt.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
-    chromedriver = r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
+    chromedriver = r'D:\cmd100\chromedriver.exe'
     browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
......
@@ -18,10 +18,81 @@ element.getparent()  # get the parent element of the given element
 # aa = pd.read_csv(StringIO(data), escapechar='\r')
 # print(aa)
-import pandas as pd
-# Read the txt file
-data = pd.read_csv('D:\\美国证券交易委员会\\2023q2\\pre.txt', delimiter='\t')  # choose the delimiter to match the actual data
-# Save the data as csv
-data.to_csv('D:\\美国证券交易委员会\\2023q2\\pre.csv', index=False)  # index=False: don't write the row index
+# import pandas as pd
+#
+# # Read the txt file
+# data = pd.read_csv('D:\\美国证券交易委员会\\2022q4\\sub.txt', delimiter='\t')  # choose the delimiter to match the actual data
+#
+# # Save the data as csv
+# data.to_csv('D:\\美国证券交易委员会\\2022q4\\sub.csv', index=False)  # index=False: don't write the row index
+
+"""Captcha recognition test"""
+# import ddddocr
+#
+# ocr = ddddocr.DdddOcr()
+#
+# with open("D:\\kkwork\\captchaNew (3).jfif", 'rb') as f:
+#     image = f.read()
+#
+# res = ocr.classification(image)
+# print(res)
+
+"""Simulated-browser test for zxgk.court.gov.cn (China enforcement information disclosure site)"""
+import ddddocr
+from PIL import Image
+import re
+import requests, time, random, json, pymysql, redis
+import urllib3
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from obs import ObsClient
+from kafka import KafkaProducer
+from base.BaseCore import BaseCore
+
+baseCore = BaseCore()
+log = baseCore.getLogger()
+cnx_ = baseCore.cnx
+cursor_ = baseCore.cursor
+
+def use_ocr(img):
+    ocr = ddddocr.DdddOcr()
+    with open(img, 'rb') as f:
+        image = f.read()
+    res = ocr.classification(image)
+    print(res)
+    return res
+
+if __name__ == "__main__":
+    requests.DEFAULT_RETRIES = 5
+    time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    log.info(f'开始时间为:{time_start}')
+    requests.adapters.DEFAULT_RETRIES = 3
+    headers = {
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
+    }
+    opt = webdriver.ChromeOptions()
+    opt.add_argument(
+        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
+    opt.add_argument("--ignore-certificate-errors")
+    opt.add_argument("--ignore-ssl-errors")
+    opt.add_experimental_option("excludeSwitches", ["enable-automation"])
+    opt.add_experimental_option('excludeSwitches', ['enable-logging'])
+    opt.add_experimental_option('useAutomationExtension', False)
+    opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
+    chromedriver = r'D:/cmd100/chromedriver.exe'
+    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+    url = "http://zxgk.court.gov.cn/shixin/"
+    browser.get(url)
+    # adjustable
+    time.sleep(20)
+    screen_img_path = "D:/screen/xxx.png"
+    out_img_path = "D:/out/xxx.png"
+    ele = driver.find_element(By.ID, 'XXXX')
+    code = use_ocr(out_img_path)
+    验证码输入框元素.send_keys(code)
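The last few added lines are placeholders: driver and By are never defined, and 验证码输入框元素 stands in for the captcha input element. A runnable version of the intended round-trip — screenshot the page, crop the captcha image, OCR it, type the result — might look like this (element ids are hypothetical):

    from selenium.webdriver.common.by import By
    from PIL import Image
    import ddddocr

    browser.save_screenshot('D:/screen/page.png')        # full-page screenshot
    ele = browser.find_element(By.ID, 'captchaImg')      # hypothetical captcha <img> id
    box = (ele.location['x'], ele.location['y'],
           ele.location['x'] + ele.size['width'],
           ele.location['y'] + ele.size['height'])
    Image.open('D:/screen/page.png').crop(box).save('D:/out/captcha.png')
    with open('D:/out/captcha.png', 'rb') as f:
        code = ddddocr.DdddOcr().classification(f.read())
    browser.find_element(By.ID, 'captchaCode').send_keys(code)  # hypothetical input id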