Commit be4f79be by 薛凌堃 (XueLingKun)

Commit message: 研报 (research reports)

Parent commit: 5d5dff2b
@@ -257,11 +257,11 @@ def download(data, order_by):
    else:
        log.info(f'====pdf解析失败====')
        delete_url(sourceAddress)
-        # 获取当前进程pid
-        current_pid = baseCore.getPID()
-        # todo: 重新启动新进程,杀死当前进程
-        subprocess.Popen([sys.executable] + sys.argv)
-        os.kill(current_pid, 9)
+        # # 获取当前进程pid
+        # current_pid = baseCore.getPID()
+        # # todo: 重新启动新进程,杀死当前进程
+        # subprocess.Popen([sys.executable] + sys.argv)
+        # os.kill(current_pid, 9)
        return
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    page_size = retData['page_size']
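For reference, a minimal standalone sketch of the restart-and-kill step that this hunk comments out (os.getpid() stands in here for baseCore.getPID(), which is defined elsewhere in this repository):

import os
import subprocess
import sys

def restart_self():
    # Spawn a fresh copy of the current script with the same arguments,
    # then forcibly terminate the running process (mirrors the commented-out lines).
    subprocess.Popen([sys.executable] + sys.argv)
    os.kill(os.getpid(), 9)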
@@ -328,37 +328,156 @@ def download(data, order_by):
    log.info(dic_result)
    return
+# def Mob():
+# url = 'https://www.mob.com/mobData/report'
+# res = requests.get(url=url,headers=headers).content
+# soup = BeautifulSoup(res,'html.parser')
+# max_info = soup.find('span',class_='el-pagination__total').text
+# max_info = re.findall('\d{1,4}',max_info)[0]
+# # print(type(max_info))
+# max_page = int((int(max_info)/9) + 1)
+# print(max_page)
+# i_id = 0
+# for page in range(max_page):
+# url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
+# res = requests.get(url=url, headers=headers).content
+# soup = BeautifulSoup(res, 'html.parser')
+# result = soup.find('ul', class_='fix')
+# li_list = result.find_all('li')
+# # for id in range(1, 149):
+# id = i_id
+# for li in li_list:
+# id += 1
+# title = li.find('div',class_='title').text
+# time = li.find('div',class_='date tc').text.strip()
+# year = re.findall('\d{4}',time)[0]
+# # for id in range(29,178):
+# real_id = 178 - id
+# href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
+# # href = 'https://www.mob.com/mobdata/report/169'
+# res_href = requests.get(url=href,headers=headers).content
+# i_soup = BeautifulSoup(res_href,'html.parser')
+# url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
+# summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
+# fin_summary = []
+# for s in summary_list:
+# summary = s.text
+# fin_summary.append(summary)
+# summary = ''.join(fin_summary)
+# dic_post = {
+# 'title': title, # 报告名称
+# 'url_pdf': url_pdf, # 报告链接
+# 'year': year, # 报告年份
+# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
+# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
+# 'category': 'pdf', # 文件后缀名,如:pdf
+# 'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
+# 'publishDate': time, # 时间
+# 'origin': 'Mob研究院', # 来源
+# 'sourceAddress': href, # 原文链接
+# 'content': '', # 内容
+# 'summary': summary, # 摘要
+# 'sid': '1662008807781212161', # 信息源id
+# }
+# order_by = 1
+# download(dic_post,order_by)
+# order_by += 1
+# # print(dic_post)
+# # url = 'http://114.115.155.139:5002/report_download'
+# # # report-list
+# # res = requests.post(url, data=json.dumps(dic_post))
+# # print(res.json())
+# i_id += 9
def Mob():
-    url = 'https://www.mob.com/mobData/report'
-    res = requests.get(url=url,headers=headers).content
-    soup = BeautifulSoup(res,'html.parser')
-    max_info = soup.find('span',class_='el-pagination__total').text
-    max_info = re.findall('\d{1,4}',max_info)[0]
-    # print(type(max_info))
-    max_page = int((int(max_info)/9) + 1)
-    print(max_page)
-    i_id = 0
-    for page in range(max_page):
-        url = 'https://www.mob.com/mobdata/report?page={}'.format(page+1)
-        res = requests.get(url=url, headers=headers).content
-        soup = BeautifulSoup(res, 'html.parser')
-        result = soup.find('ul', class_='fix')
-        li_list = result.find_all('li')
-        # for id in range(1, 149):
-        id = i_id
-        for li in li_list:
-            id += 1
-            title = li.find('div',class_='title').text
-            time = li.find('div',class_='date tc').text.strip()
-            year = re.findall('\d{4}',time)[0]
-            # for id in range(29,178):
-            real_id = 178 - id
-            href = 'https://www.mob.com/mobdata/report/{}'.format(real_id)
-            # href = 'https://www.mob.com/mobdata/report/169'
-            res_href = requests.get(url=href,headers=headers).content
+    # loginfo = baseCore.redicPullData('Mob:loginfo')
+    # account = loginfo.split('|')[0]
+    # password = loginfo.split('|')[1]
+    # usecount = loginfo.split('|')[2]
+    usecount = 0
+    # 测试用
+    # account = '13636711746'
+    # password = 'Zhenghao123'
+    # account = '18703752600'
+    # password = 'Axlk010208!'
+    # account = '13273737131'
+    # password = 'liu1230...'
+    # account = '15237560528'
+    # password = 'xlk123456!'
+    # account = '17103126138'
+    # password = '171BlackOne'
+    account = '17103128590'
+    password = '171BlackTwo'
+    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+    f_url = 'https://www.mob.com/developer/login'
+    browser.get(f_url)
+    browser.find_element(By.CLASS_NAME, 's1').click()
+    browser.find_element(By.CSS_SELECTOR, 'input[type="text"]').send_keys(f'{account}')
+    browser.find_element(By.CSS_SELECTOR, 'input[type="password"]').send_keys(f'{password}')
+    browser.find_element(By.XPATH, '//*[@id="app"]/section/div/div[2]/div/div[2]/section/div[3]/div/form/div[3]/div/button/span').click()
+    if usecount < 5:
+        pass
+    else:
+        return Mob()
+    # 获取登录的信息
+    # url = browser.current_url
+    # print(url)
+    url = 'https://www.mob.com/mobdata/report'
+    browser.get(url)
+    # tags = browser.find_elements(By.CLASS_NAME, 'main-title')
+    # for tag in tags:
+    # if 'Mob研究院' in tag.text:
+    # tag.click()
+    # else:
+    # continue
+    # # try:
+    # # web = tag.find_element(By.CLASS_NAME, "")
+    # # web.click()
+    # # break
+    # # except:
+    # # continue
+    cookies_list = browser.get_cookies()
+    cookies = {}
+    # 获取cookie中的name和value,转化成requests可以使用的形式
+    for cookie in cookies_list:
+        cookies[cookie['name']] = cookie['value']
+    # cookies_ = json.loads('{' + re.findall("{(.*?)}", str(cookies).replace("\'", "\""))[0] + '}')
+    # cookies_ = json.dumps(cookies)
+    session = requests.session()
+    session.cookies.update(cookies)
+    for i in range(5):
+        url = f'https://api.os.mob.com/api/academy_report/list?limit=18&page={i}&keyword=&year='
+        req = session.get(url=url, headers=headers)
+        data_json = req.json()
+        news_list = data_json['data']['list']
+        for info in news_list:
+            title = info['title']
+            publishDate = info['effective_date']
+            year = publishDate[:4]
+            report_id = info['report_id']
+            href = 'https://www.mob.com/mobdata/report/{}'.format(report_id)
+            # tf_url = add_check_url(href)
+            is_member = r.sismember('report_pdf_three_history', href)
+            if is_member:
+                continue
+            res_href = session.get(url=href, headers=headers).content
            i_soup = BeautifulSoup(res_href,'html.parser')
-            url_pdf = 'https://api.os.mob.com/api/academy_report/download/' + i_soup.find('div', class_='report-top').find('a')['href']
            summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
+            news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
+            # headers['token'] = '92b42171-7a33-4f3b-a25b-9ca689699e10'
+            # headers['token'] = '495f9714-7ea8-4987-91c0-2b0ede38238b'
+            # headers['token'] = '0dcbde4a-9aaa-4651-b886-856add4b8df9'
+            # headers['token'] = '2fcdd67b-da81-4f2f-9d6f-529fdbf6ae1f'
+            # headers['token'] = 'dd54bc77-50fa-4a25-aec7-95ec45bd17f8'
+            headers['token'] = '2fd143d3-a1ec-4d9d-9d9b-38a1d4cf8387'
+            news_req = session.get(url=news_url,headers=headers)
+            pdf_url = news_req.json()['data']
            fin_summary = []
            for s in summary_list:
                summary = s.text
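The rewritten Mob() above logs in through Selenium and then reuses the browser cookies for the report-list API calls. A minimal sketch of that cookie handoff, assuming browser is an already logged-in webdriver instance:

import requests

def session_from_browser(browser):
    # Copy each cookie's name/value from the logged-in Selenium browser into a
    # requests session so later API calls keep the login state.
    cookies = {cookie['name']: cookie['value'] for cookie in browser.get_cookies()}
    session = requests.session()
    session.cookies.update(cookies)
    return session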
@@ -366,13 +485,13 @@ def Mob():
            summary = ''.join(fin_summary)
            dic_post = {
                'title': title, # 报告名称
-                'url_pdf': url_pdf, # 报告链接
+                'url_pdf': pdf_url, # 报告链接
                'year': year, # 报告年份
                'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
                'item_id': 'YanBao', # 关联记录id,如:企业信用代码
                'category': 'pdf', # 文件后缀名,如:pdf
                'create_by': 'XueLingKun', # 创建人,使用驼峰命名,如:TangYuHang
-                'publishDate': time, # 时间
+                'publishDate': publishDate, # 时间
                'origin': 'Mob研究院', # 来源
                'sourceAddress': href, # 原文链接
                'content': '', # 内容
@@ -382,12 +501,7 @@ def Mob():
            order_by = 1
            download(dic_post,order_by)
            order_by += 1
-            # print(dic_post)
-            # url = 'http://114.115.155.139:5002/report_download'
-            # # report-list
-            # res = requests.post(url, data=json.dumps(dic_post))
-            # print(res.json())
-        i_id += 9

def yidong_guanxiangtai():
@@ -452,30 +566,83 @@ def yidong_guanxiangtai():
        # print(res.json())

-# 巨量算数
-def juliangsuanshu():
-    browser = webdriver.Chrome(chromedriver)
-    url = 'https://trendinsight.oceanengine.com/arithmetic-report'
-    browser.get(url)#跳到指定页面
-    page_source = browser.page_source#获取页面信息
+# # 巨量算数
+# def juliangsuanshu():
+# # browser = webdriver.Chrome(chromedriver)
+# browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+#
+# url = 'https://trendinsight.oceanengine.com/arithmetic-report'
+# browser.get(url)#跳到指定页面
+#
+# page_source = browser.page_source#获取页面信息
+# soup = BeautifulSoup(page_source, 'html.parser')
+#
+# list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
+# for one_info in list_all:
+# info_title = one_info.a.text.strip()
+# info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
+# info_href = one_info.a.get('href')
+# info_url = 'https://trendinsight.oceanengine.com'+info_href
+#
+# res_info = requests.get(info_url)
+# soup_info = BeautifulSoup(res_info.content,'html.parser')
+# list_script = soup_info.find_all('script')
+# for script in list_script:
+# if 'window._SSR_DATA' in script.text:
+# json_str = script.text
+# info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
+#
+# info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
+# info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
+#
+# dic_post = {
+# 'title': info_title, # 报告名称
+# 'url_pdf': info_pdf, # 报告链接
+# 'year': info_date[:4], # 报告年份
+# 'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
+# 'item_id': 'YanBao', # 关联记录id,如:企业信用代码
+# 'category': 'pdf', # 文件后缀名,如:pdf
+# 'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
+# 'publishDate': info_date, # 时间
+# 'origin': '巨量算数', # 来源
+# 'sourceAddress': info_url, # 原文链接
+# 'content': '', # 内容
+# 'summary': info_zhaiyao, # 摘要
+# 'sid': '1662008524476948481', # 信息源id
+# }
+# order_by = 1
+# download(dic_post, order_by)
+# order_by += 1
+# # print(page,dic_post)
+# # url = 'http://114.115.155.139:5002/report_download'
+# # # report-list
+# # res = requests.post(url, data=json.dumps(dic_post))
+# # print(res.json())
+# time.sleep(2)
+# browser.quit()
+# 巨量算数
+def getnews(browser):
+    page_source = browser.page_source # 获取页面信息
    soup = BeautifulSoup(page_source, 'html.parser')
-    list_all = soup.find('div',{'class':'index-module__reportList--nit0R'}).find_all('div',{'class':'card-module__cardContent--GDAoy index-module__cardContent--vRJI_'})
+    list_all = soup.find('div', {'class': 'byted-loading byted-loading-block'}).find_all('div', {
+        'class': 'commonCardContainer-TMfUEr hoverShadow-oVbBH0 reportListCard-EhYynV'})
    for one_info in list_all:
+        try:
            info_title = one_info.a.text.strip()
-            info_date = one_info.find('div',{'class':'card-module__releaseTime--MbbUa'}).text.split(':')[1]
+            info_date = one_info.find('div', {'class': 'releaseTime-MbbUaH'}).text.split(':')[1]
            info_href = one_info.a.get('href')
-            info_url = 'https://trendinsight.oceanengine.com'+info_href
+            info_url = 'https://trendinsight.oceanengine.com' + info_href
            res_info = requests.get(info_url)
-            soup_info = BeautifulSoup(res_info.content,'html.parser')
+            soup_info = BeautifulSoup(res_info.content, 'html.parser')
            list_script = soup_info.find_all('script')
            for script in list_script:
                if 'window._SSR_DATA' in script.text:
                    json_str = script.text
-                    info_json = json.loads(json_str.replace('window._SSR_DATA = ',''))
+                    info_json = json.loads(json_str.replace('window._SSR_DATA = ', ''))
            info_zhaiyao = info_json['data']['storeState']['report_detail']['report_info']['introduction']
            info_pdf = info_json['data']['storeState']['report_detail']['report_info']['post_files'][0]['file_url']
@@ -504,6 +671,26 @@ def juliangsuanshu():
            # res = requests.post(url, data=json.dumps(dic_post))
            # print(res.json())
            time.sleep(2)
+        except Exception as e:
+            continue
+    # todo:点击下一页
+    # wait = WebDriverWait(browser, 30)
+    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "byted-pager-item-group")))
+    # try:
+    # browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
+    # except:
+    # time.sleep(1)
+    # browser.find_element(By.XPATH, '//ul[@class="byted-pager-item-group"]/li[last()]').click()
+    # return getnews(browser)
+def juliangsuanshu():
+    # browser = webdriver.Chrome(chromedriver)
+    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
+    url = 'https://trendinsight.oceanengine.com/arithmetic-report'
+    browser.get(url)#跳到指定页面
+    getnews(browser)
    browser.quit()
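getnews() reads the report details from the window._SSR_DATA blob embedded in each detail page. A minimal sketch of that extraction step, assuming the page HTML has already been fetched:

import json
from bs4 import BeautifulSoup

def extract_ssr_data(html):
    # Find the <script> tag that assigns window._SSR_DATA and parse its JSON payload.
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script'):
        if 'window._SSR_DATA' in script.text:
            return json.loads(script.text.replace('window._SSR_DATA = ', ''))
    return None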
@@ -560,12 +747,15 @@ def ke36():
# 前沿知识库
def qianyanzhishiku():
-    url = 'https://wk.askci.com/Periodical/quality/index_1.shtml'
+    for i in range(40,60):
+        log.info(f'====第{i}页====')
+        url = f'https://wk.askci.com/Periodical/quality/index_{i}.shtml'
        res = requests.get(url)
        soup = BeautifulSoup(res.content,'html.parser')
-        list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
+        # list_all = soup.find('div',{'class':'quality_report pt-20 pb-40'}).find_all('li')
+        list_all = soup.find('div',{'class':'show_report_list'}).find_all('li')
        for one_info in list_all:
            info_title = one_info.a.get('title')
            info_date = one_info.find('div',{'class':'time'}).text.replace('年','-').replace('月','-01')
@@ -664,7 +854,7 @@ def qianyanzhishiku():
def shijiejingjiluntan():
    allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
-    for i in range(10, 128):
+    for i in range(76, 128):
        # res = requests.get(url)
        # soup = BeautifulSoup(res.content,'html.parser')
@@ -672,6 +862,7 @@ def shijiejingjiluntan():
        url = f'https://cn.weforum.org/publications/?page={i}'
        browser.get(url) # 跳到指定页面
+        time.sleep(5)
        wait = WebDriverWait(browser, 30)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
        page_source = browser.page_source # 获取页面信息
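The loop above loads each publications page and waits for the report cards before parsing. A minimal sketch of that wait-then-parse pattern, with the class name passed in as a parameter rather than hard-coded:

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_and_parse(browser, url, class_name):
    # Open the page, wait until an element with the given class is present,
    # then hand the rendered HTML to BeautifulSoup.
    browser.get(url)
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, class_name)))
    return BeautifulSoup(browser.page_source, 'html.parser')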
@@ -685,7 +876,12 @@ def shijiejingjiluntan():
            info_date = one_info.find('div',{'class':'wef-1nvfeoy'}).find('time')['datetime']
            datetime_obj = datetime.strptime(info_date, '%Y-%m-%dT%H:%M:%SZ')
            info_date = datetime_obj.strftime('%Y-%m-%d')
+            # if info_date >= '2022-07-21':
+            # continue
+            try:
                info_zhaiyao = one_info.find('div', {'class': 'wef-8xl60i'}).text.strip()
+            except:
+                info_zhaiyao = ''
            try:
                info_pdf = one_info.find('div',{'class':'wef-1nvfeoy'}).find('a').get('href')
            except:
@@ -1394,11 +1590,11 @@ def dongfangcaifu7():
if __name__ == '__main__':
-    # try:
-    # log.info('mob')
-    # Mob()
-    # except:
-    # pass
+    try:
+        log.info('mob')
+        Mob()
+    except Exception as e:
+        pass
    # try:
    # log.info('yidong_guanxiangtai')
    # yidong_guanxiangtai()
@@ -1407,7 +1603,7 @@ if __name__ == '__main__':
    # try:
    # log.info('juliangsuanshu')
    # juliangsuanshu()
-    # except:
+    # except Exception as e:
    # pass
    # try:
    # log.info('ke36')
@@ -1417,7 +1613,7 @@ if __name__ == '__main__':
    # try:
    # log.info('qianyanzhishiku')
    # qianyanzhishiku()
-    # except:
+    # except Exception as e:
    # pass
    # try:
    # log.info('shijiejingjiluntan')
@@ -1442,31 +1638,31 @@ if __name__ == '__main__':
    # except Exception as e:
    # log.info(e)
    # pass
+    #
    # try:
    # log.info('dongfangcaifu4')
    # dongfangcaifu4()
    # except Exception as e:
    # log.info(e)
    # pass
-    try:
-        log.info('dongfangcaifu5')
-        dongfangcaifu5()
-    except Exception as e:
-        log.info(e)
-        pass
+    #
+    # try:
+    # log.info('dongfangcaifu5')
+    # dongfangcaifu5()
+    # except Exception as e:
+    # log.info(e)
+    # pass
-    try:
-        log.info('dongfangcaifu6')
-        dongfangcaifu6()
-    except Exception as e:
-        log.info(e)
-        pass
+    #
+    # try:
+    # log.info('dongfangcaifu6')
+    # dongfangcaifu6()
+    # except Exception as e:
+    # log.info(e)
+    # pass
-    try:
-        log.info('dongfangcaifu7')
-        dongfangcaifu7()
-    except Exception as e:
-        log.info(e)
-        pass
+    #
+    # try:
+    # log.info('dongfangcaifu7')
+    # dongfangcaifu7()
+    # except Exception as e:
+    # log.info(e)
+    # pass