Commit c155afc4  Author: 薛凌堃

Enterprise research-report crawler: proxy IP support and automatic process restart

Parent 04c8d5f3
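This commit threads a proxy lookup (baseCore.get_proxy()) into the PDF download retry loop and, when PDF parsing keeps failing, relaunches the crawler before killing the current process. A minimal sketch of the two patterns, assuming get_proxy() returns a requests-style proxies dict (BaseCore is internal, so that shape is an assumption):

import os
import subprocess
import sys
import requests

def fetch(url, headers, get_proxy):
    # up to 3 attempts, drawing a fresh proxy each time
    for _ in range(3):
        try:
            proxies = get_proxy()  # assumed shape: {'http': 'http://host:port', 'https': 'http://host:port'}
            return requests.get(url, headers=headers, proxies=proxies, verify=False, timeout=20)
        except requests.RequestException:
            continue
    return None

def restart_self():
    # relaunch this script with the same interpreter and argv, then hard-kill
    # the current process (signal 9 falls back to TerminateProcess on Windows)
    subprocess.Popen([sys.executable] + sys.argv)
    os.kill(os.getpid(), 9)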
import os
import subprocess
import traceback
import urllib
import uuid
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib import parse
@@ -9,6 +13,8 @@ import requests, re, time, pymysql, json, redis
from kafka import KafkaProducer
import urllib3
from selenium.webdriver.support.wait import WebDriverWait
urllib3.disable_warnings()
from obs import ObsClient
import fitz
@@ -18,7 +24,7 @@ sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
@@ -26,7 +32,18 @@ obsClient = ObsClient(
)
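# Hedged sketch (the actual upload call sits in a collapsed hunk): with the
# Huawei OBS SDK, pushing downloaded PDF bytes would look roughly like this;
# the bucket name and object key below are hypothetical.
def _obs_upload_demo(pdf_bytes, object_key, bucket_name='zzsn'):
    resp = obsClient.putContent(bucket_name, object_key, content=pdf_bytes)
    return resp.status < 300  # the SDK reports success via an HTTP-style status code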
# tracker_conf = get_tracker_conf('./client.conf')
# client = Fdfs_client(tracker_conf)
# chromedriver = 'D:/chrome/113/chromedriver.exe'
opt = webdriver.ChromeOptions()
opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/cmd100/chromedriver.exe'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
@@ -72,7 +89,7 @@ def getuuid():
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size):
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, create_by,
@@ -81,7 +98,7 @@ def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, c
cursor.execute(Upsql, values)  # run the insert
cnx.commit()  # commit the transaction
querySql = '''select id from clb_sys_attachment where type_id=4 and full_path = %s'''  # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
@@ -92,14 +109,22 @@ def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, c
# Redis-based de-duplication
def add_check_url(article_url):
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
# res = r.sadd(f'report_pdf_two_history', article_url,3)
res = r.sadd(f'report_pdf_three_history', article_url, 3)  # note: stored as members of a Redis set
if res == 0:  # sadd returns the number of newly added members; 0 means article_url was already present, i.e. a duplicate
return True
else:
return False
# remove the URL from Redis when its upload fails, so it can be retried
def delete_url(article_url):
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
res = r.srem('report_pdf_three_history', article_url)
if res > 0:
return True
else:
return False
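# Hedged usage sketch (not part of the original file): how the two helpers
# above pair up in a crawl loop. A URL is marked before downloading; on
# failure the mark is rolled back so a later run can retry it. fetch_and_store
# is a hypothetical stand-in for the real download() defined below.
def _dedup_flow(urls, fetch_and_store):
    for url in urls:
        if add_check_url(url):  # True: URL already in the set, skip it
            continue
        if not fetch_and_store(url):  # failed: un-mark so it is retried later
            delete_url(url)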
def uptoOBS(pdf_url, name_pdf, type_id, pathType):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
@@ -108,6 +133,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType):
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
ip = baseCore.get_proxy()  # draw a proxy for this attempt (note: not passed to requests.get below)
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
@@ -230,6 +256,12 @@ def download(data, order_by):
pass
else:
log.info(f'====pdf解析失败====')
delete_url(sourceAddress)
# get the pid of the current process
current_pid = baseCore.getPID()
# todo: launch a fresh copy of this script, then kill the current process
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
return
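# Note on the restart above: Popen relaunches the script with the same
# interpreter and argv; os.kill(pid, 9) is a hard kill (on Windows it falls
# back to TerminateProcess), so no cleanup code in the dying process runs.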
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
page_size = retData['page_size']
@@ -296,17 +328,429 @@ def download(data, order_by):
log.info(dic_result)
return
def Mob():
url = 'https://www.mob.com/mobData/report'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_info = soup.find('span',class_='el-pagination__total').text
max_info = re.findall(r'\d{1,4}', max_info)[0]
# print(type(max_info))
max_page = int((int(max_info)/9) + 1)  # 9 reports per page, so divide and add one to cover the remainder
print(max_page)
i_id = 0
for page in range(max_page):
url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
res = requests.get(url=url, headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find('ul', class_='fix')
li_list = result.find_all('li')
# for id in range(1, 149):
id = i_id
for li in li_list:
id += 1
title = li.find('div',class_='title').text
pub_date = li.find('div',class_='date tc').text.strip()  # renamed from `time`, which shadowed the time module
year = re.findall(r'\d{4}', pub_date)[0]
# for id in range(29,178):
real_id = 178 - id
href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
# href = 'https://www.mob.com/mobdata/report/169'
res_href = requests.get(url=href,headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
fin_summary = []
for s in summary_list:
summary = s.text
fin_summary.append(summary)
summary = ''.join(fin_summary)
dic_post = {
'title': title,  # report title
'url_pdf': url_pdf,  # report PDF link
'year': year,  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'XueLingKun',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': pub_date,  # publish time
'origin': 'Mob研究院',  # source
'sourceAddress': href,  # original article link
'content': '',  # content
'summary': summary,  # summary
'sid': '1662008807781212161',  # information source id
}
order_by = 1
download(dic_post,order_by)
order_by += 1
# print(dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
i_id += 9
def yidong_guanxiangtai():
url = 'http://mi.talkingdata.com/reports.html?category=all&tag=all&page=1'
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res,'html.parser')
max_page = soup.find(class_='results-page')
max_page = max_page.find_all('li',class_='trans')[-2].text
# print((max_page))
for page in range(int(max_page)):
url = 'http://mi.talkingdata.com/reports.html?category=all&tag=all&page={}'.format(page)
res = requests.get(url=url,headers=headers).content
soup = BeautifulSoup(res, 'html.parser')
result = soup.find(class_='content-data-report clearfix')
div_list = result.find_all(class_='reports-list clearfix')
for div in div_list:
info_list = div.find_all(class_='download-book')
for info in info_list:
href = info.find('b').find('a')['href']
title = info.find('b').find('a')['title'].strip()
pub_date = info.find(class_='operate-book').find('p').text  # renamed from `time`, which shadowed the time module
year = re.findall(r'\d{4}', pub_date)[0]
# print(href,title,time)
res_href = requests.get(url=href,headers=headers).content
i_soup = BeautifulSoup(res_href,'html.parser')
i_result = i_soup.find(class_='report-content l')
p_list = []
plist = i_result.find_all('p')
for p in plist:
if 'img' in str(p) or 'TalkingData' in str(p) or 'http' in str(p) or '请填写相关信息' in str(p):
continue
else:
p = p.text.strip()
p_list.append(p)
# print(p_list)
summary = ''.join(p_list).replace('\n','')
# print(summary)
url_pdf = i_soup.find(class_='operate-verify').find('button')['data-url']
# print(url_pdf)
dic_post = {
'title': title,  # report title
'url_pdf': url_pdf,  # report PDF link
'year': year,  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'XueLingKun',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': pub_date,  # publish time
'origin': '移动观象台',  # source
'sourceAddress': href,  # original article link
'content': '',  # content
'summary': summary,  # summary
'sid': '1662008276140597250',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
# Juliang Suanshu (trendinsight.oceanengine.com)
def juliangsuanshu():
browser = webdriver.Chrome(chromedriver)
url = 'https://trendinsight.oceanengine.com/arithmetic-report'
browser.get(url)  # navigate to the target page
page_source = browser.page_source  # grab the rendered page source
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
for one_info in list_all:
info_title = one_info.a.text.strip()
info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
info_href = one_info.a.get('href')
info_url = 'https://trendinsight.oceanengine.com'+info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
list_script = soup_info.find_all('script')
for script in list_script:
if 'window._SSR_DATA' in script.text:
json_str = script.text
info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
dic_post = {
'title': info_title,  # report title
'url_pdf': info_pdf,  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '巨量算数',  # source
'sourceAddress': info_url,  # original article link
'content': '',  # content
'summary': info_zhaiyao,  # summary
'sid': '1662008524476948481',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# 36Kr
def ke36():
# browser = webdriver.Chrome(chromedriver)
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://36kr.com/academe'
browser.get(url)  # navigate to the target page
page_source = browser.page_source  # grab the rendered page source
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'report-list-wrapper'}).find_all('div',{'class':'report-card type-4'})
for one_info in list_all:
info_title = one_info.find('div',{'class':'title'}).text
info_zhaiyao = one_info.find('div',{'class':'desc'}).text
info_url = one_info.a.get('href')
browser.get(info_url)  # navigate to the detail page
page_source = browser.page_source  # grab the rendered page source
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta',{'property':'article:published_time'}).get('content')[:10]
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text
dic_post = {
'title': info_title,  # report title
'url_pdf': '',  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': '',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '36氪研究院',  # source
'sourceAddress': info_url,  # original article link
'content': info_content,  # content
'summary': info_zhaiyao,  # summary
'sid': '1662008421217378306',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# Qianyan Knowledge Base (wk.askci.com)
def qianyanzhishiku():
url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
res = requests.get(url)
soup = BeautifulSoup(res.content,'html.parser')
list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
for one_info in list_all:
info_title = one_info.a.get('title')
info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
info_href = one_info.a.get('href')
info_url = 'https://wk.askci.com'+info_href
res_info = requests.get(info_url)
soup_info = BeautifulSoup(res_info.content,'html.parser')
info_pdf_url = soup_info.find('iframe',{'scrolling':'auto'}).get('src').split('pdfpath=')[1]
info_pdf = urllib.parse.unquote(info_pdf_url)
dic_post = {
'title': info_title,  # report title
'url_pdf': info_pdf,  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '前沿知识库',  # source
'sourceAddress': info_url,  # original article link
'content': '',  # content
'summary': '',  # summary
'sid': '1662008620631367682',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
# # World Economic Forum (older version, kept commented out)
# def shijiejingjiluntan():
# allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
#
# url = f'https://cn.weforum.org/reports'
#
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
#
# list_all = soup.find('div',{'class':'collection-group collection-group--custom js-scroll'}).find_all('div',{'class':'report-listing-tout__content'})
#
# for one_info in list_all:
# info_title = one_info.find('h4').text.strip()
# info_date = one_info.find('div',{'class':'report-listing-tout__date'}).text.strip()
# try:
# info_pdf = one_info.find('a').get('href')
# except:
# info_pdf = ''
# list_date = info_date.replace('月','').split(' ')
# info_date = list_date[2]+'-'+allnum[list_date[1]]+'-'+list_date[0]
#
# info_href = one_info.find('a',{'class':'report-listing-tout__cta'}).get('href')
# info_url = 'https://cn.weforum.org'+info_href
#
# res_info = requests.get(info_url)
# soup_info = BeautifulSoup(res_info.content,'html.parser')
#
# info_zhaiyao = soup_info.find('div',{'class':'report__intro'}).text.strip()
# info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
#
# dic_post = {
# 'title': info_title, # 报告名称
# 'url_pdf': info_pdf, # 报告链接
# 'year': info_date[:4], # 报告年份
# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
# 'category': 'pdf', # 文件后缀名,如:pdf
# 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
# 'publishDate': info_date, # 时间
# 'origin': '世界经济论坛', # 来源
# 'sourceAddress': info_url, # 原文链接
# 'content': info_zhaiyao, # 内容
# 'summary': info_content, # 摘要
# 'sid': '1662008019231088642', # 信息源id
# }
# order_by = 1
# download(dic_post, order_by)
# order_by += 1
# # print(page,dic_post)
# # url = 'http://114.115.155.139:5002/report_download'
# # # report-list
# # res = requests.post(url, data=json.dumps(dic_post))
# # print(res.json())
# time.sleep(2)
# World Economic Forum
def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}  # Chinese month numerals mapped to two-digit months
for i in range(10, 128):
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = f'https://cn.weforum.org/publications/?page={i}'
browser.get(url)  # navigate to the target page
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
page_source = browser.page_source  # grab the rendered page source
soup = BeautifulSoup(page_source, 'html.parser')
time.sleep(2)
list_all = soup.find('div', {'class':'wef-qrllub'}).find_all('div',{'class':'wef-184hs11'})
time.sleep(2)
for one_info in list_all:
tag = one_info.find('div', class_='wef-wx6hgt').find_all('div',class_='wef-0')[1]
info_title = tag.find('a').text.strip()
info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
info_date = datetime_obj.strftime('%Y-%m-%d')
info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
try:
info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
except:
info_pdf = ''
info_href = tag.find('a').get('href')
res_info = requests.get(info_href)
soup_info = BeautifulSoup(res_info.content,'html.parser')
info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
dic_post = {
'title': info_title,  # report title
'url_pdf': info_pdf,  # report PDF link
'year': info_date[:4],  # report year
'type_id': '4',  # report type (annual: 1, quarterly: 2, monthly: 3, research: 4)
'item_id': 'YanBao',  # related record id, e.g. enterprise credit code
'category': 'pdf',  # file extension, e.g. pdf
'create_by': 'TangYuHang',  # creator, in CamelCase, e.g. TangYuHang
'publishDate': info_date,  # publish time
'origin': '世界经济论坛',  # source
'sourceAddress': info_href,  # original article link
'content': info_content,  # content
'summary': info_zhaiyao,  # summary
'sid': '1662008019231088642',  # information source id
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
# East Money (eastmoney.com)
def dongfangcaifu():
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'search-api-web.eastmoney.com',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
cnx2 = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
list_short_name = []
list_social_code = []
with cnx2.cursor() as cursor:
sel_sql = '''select securities_short_name,social_credit_code from sys_base_enterprise_ipo where category = 1'''
cursor.execute(sel_sql)
selects = cursor.fetchall()
@@ -336,11 +780,14 @@ def dongfangcaifu():
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url,headers=headers).text[1:-1]  # [1:-1] strips the jsonp wrapper parentheses
res_json = json.loads(res)
list_all = res_json['result']['researchReport']
@@ -418,7 +865,6 @@ def dongfangcaifu():
# if len(list_all) != 10:
# break
# East Money, variant 2
def dongfangcaifu2():
list_short_name = ['新', '的', '电', '能']
@@ -525,9 +971,10 @@ def dongfangcaifu3():
# log.info("格式化后的日期为:", formatted_date)
# for i in range(1,1349):
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/list?industryCode=*&pageSize=50&industry=*&rating=&ratingChange=&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=0&orgCode=&code=*&rcode=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
# url = 'https://reportapi.eastmoney.com/report/list?industryCode=*&pageSize=50&industry=*&rating=&ratingChange=&beginTime=2021-06-13&endTime=2023-06-13&pageNo=1&fields=&qType=0&orgCode=&code=*&rcode=&p=1&pageNum=1&pageNumber=1&_=1686645164397'
res = requests.get(url=url, headers=headers, verify=False).text
# log.info(res)
res_json = json.loads(res)
@@ -538,7 +985,7 @@ def dongfangcaifu3():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -548,7 +995,7 @@ def dongfangcaifu3():
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -604,10 +1051,15 @@ def dongfangcaifu4():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/list?&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=1&orgCode=&rcode=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
# url = "https://reportapi.eastmoney.com/report/list?&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*&beginTime=2021-06-27&endTime=2023-06-27&pageNo=6&fields=&qType=1&orgCode=&rcode=&p=6&pageNum=6&pageNumber=6&_=1687831020493"
for _ in range(0, 3):  # retry loop; `_` avoids shadowing the outer page index `i`
try:
res = requests.get(url=url,headers=headers,verify=False).text
break
except:
continue
# log.info(res)
res_json = json.loads(res)
@@ -620,7 +1072,7 @@ def dongfangcaifu4():
news_date = one_news['publishDate'][:10]
news_come = one_news['orgSName']
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -628,7 +1080,7 @@ def dongfangcaifu4():
pass
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -693,10 +1145,11 @@ def dongfangcaifu5():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 10):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/newStockList?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=4&p={i}&pageNum={i}&pageNumber={i}&_={t}'
res = requests.get(url=url,headers=headers,verify=False).text
log.info(res)
res_json = json.loads(res)
@@ -708,7 +1161,7 @@ def dongfangcaifu5():
# log.info(news_title)
news_date = one_news['publishDate'][:10]
news_come = one_news['orgSName']
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -716,7 +1169,7 @@ def dongfangcaifu5():
pass
news_href = 'https://data.eastmoney.com/report/info/' + one_news['infoCode'] + '.html'
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -773,9 +1226,15 @@ def dongfangcaifu6():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 15):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/jg?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=3&orgCode=&author=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
for _ in range(0, 3):  # retry loop; `_` avoids shadowing the outer page index `i`
try:
res = requests.get(url=url, headers=headers, verify=False).text
break
except:
continue
# log.info(res)
res_json = json.loads(res)
@@ -786,7 +1245,7 @@ def dongfangcaifu6():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -798,7 +1257,7 @@ def dongfangcaifu6():
news_href = 'https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl=' + one_news[
'encodeUrl'] + '='
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -862,9 +1321,10 @@ def dongfangcaifu7():
# log.info("格式化后的日期为:", formatted_date)
for i in range(1, 3):
# ip = baseCore.get_proxy()
url = f'https://reportapi.eastmoney.com/report/jg?pageSize=50&beginTime={pre_date}&endTime={formatted_date}&pageNo={i}&fields=&qType=2&orgCode=&author=&p={i}&pageNum={i}&pageNumber={i}&_={t}'
res = requests.get(url=url,headers=headers,verify=False).text
# log.info(res)
res_json = json.loads(res)
@@ -875,7 +1335,7 @@ def dongfangcaifu7():
news_title = one_news['title']
# log.info(news_title)
news_date = one_news['publishDate'][:10]
comparison_date = "2023-12-08"
comparison_date = "2023-12-14"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
@@ -887,7 +1347,7 @@ def dongfangcaifu7():
news_href = 'https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl=' + one_news[
'encodeUrl'] + '='
news_res = requests.get(url=news_href,headers=headers,verify=False)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
# log.info(news_soup)
try:
@@ -962,7 +1422,8 @@ if __name__ == '__main__':
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu')
@@ -974,33 +1435,38 @@ if __name__ == '__main__':
# dongfangcaifu2()
# except:
# pass
try:
log.info('dongfangcaifu3')
dongfangcaifu3()
except Exception as e:
log.info(e)
pass
# try:
# log.info('dongfangcaifu4')
# dongfangcaifu4()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu5')
# dongfangcaifu5()
# except Exception as e:
# log.info(e)
# pass
# try:
# log.info('dongfangcaifu6')
# dongfangcaifu6()
# except Exception as e:
# log.info(e)
# pass
#
# try:
# log.info('dongfangcaifu7')
# dongfangcaifu7()
# except Exception as e:
# log.info(e)
# pass