Commit 8f9f0213 by 丁双波

Yahoo collection script submission

Parent 42cc2e60
Yahoo Finance: news collection for overseas listed companies
# Yahoo Finance company news collection
import time
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from base.BaseCore import BaseCore
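# BaseCore supplies the shared helpers used below: getLogger(), getTimeCost() and close()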
baseCore = BaseCore()
log = baseCore.getLogger()
# Fetch the article detail page and save it to the database
def getZx(xydm, url, title, cnx):
    start_time_content = time.time()
    try:
        # Headless Chrome instance for the article page, with images disabled
        chrome_options_content = webdriver.ChromeOptions()
        chrome_options_content.add_argument('--disable-gpu')
        chrome_options_content.add_argument('--ignore-certificate-errors')
        chrome_options_content.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options_content.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options_content.add_argument("--start-maximized")
        prefs_content = {'profile.managed_default_content_settings.images': 2}
        chrome_options_content.add_experimental_option('prefs', prefs_content)
        chrome_options_content.add_argument('--headless')
        executable_path = r'E:\chromedriver_win32\chromedriver.exe'
        driverContent = webdriver.Chrome(options=chrome_options_content, executable_path=executable_path)

        driverContent.get(url)
        # Expand the collapsed article body if the "read more" button is present
        try:
            clickButton = driverContent.find_element(By.CLASS_NAME, "collapse-button")
            clickButton.click()
        except Exception:
            pass
        time.sleep(0.5)

        authorElement = driverContent.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
        timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, "time")
        contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body")

        author = authorElement.text.strip().replace("'", "''")
        pub_time = timeElement.get_attribute("datetime").strip().replace("'", "''").replace("T", " ")
        pub_time = pub_time[0:19]
        content = contentElement.text.strip().replace("'", "''")
        driverContent.close()

        # Article record: credit code, title, summary, content, publish date, url, origin, author, type, language
        list_info = [
            xydm,
            title,
            '',
            content,
            pub_time,
            url,
            '雅虎财经',
            author,
            '2',
            'zh'
        ]
        with cnx.cursor() as cursor:
            try:
                insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                cursor.execute(insert_sql, tuple(list_info))
                cnx.commit()
            except Exception as e1:
                log.error("Failed to save to database")
        log.info(f"Article fetched, took {baseCore.getTimeCost(start_time_content, time.time())}")
    except Exception as e:
        log.error("Failed to fetch article content")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs',prefs)
chrome_options.add_argument('--headless')
executable_path = r'E:\chromedriver_win32\chromedriver.exe'
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
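# MySQL connection used both for the duplicate-URL check and for inserting new articles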
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# Scroll the list page repeatedly so the lazily loaded news stream is fully rendered
def scroll(driver):
    for i in range(0, 30):
        # js = "window.scrollTo(0,document.body.scrollHeight)"
        js = "var q=document.documentElement.scrollTop=100000"
        driver.execute_script(js)
        time.sleep(0.1)
# Read the company list from Excel
df_all = pd.read_excel(r'.\data\国外企业.xlsx', sheet_name=0, keep_default_na=False)
for num in range(718, len(df_all)):
    start_time = time.time()
    country = df_all['国别'][num]  # country
    if country != '国外':  # only process foreign companies
        continue
    enname = df_all['英文名称'][num]  # English name
    gpdm = df_all['股票票代码'][num]  # stock ticker
    xydm = df_all['信用代码'][num]  # social credit code
    if gpdm == '':
        log.error(f"{num}--{gpdm}--ticker is empty, skipping")
        continue
    if xydm == '':
        log.error(f"{num}--{gpdm}--credit code is empty, skipping")
        continue
    count = int(df_all['企业动态数量(7.15)'][num])  # number of news items already collected
    # if count > 0:
    #     log.error(f"{num}--{gpdm}--already has news, skipping")
    #     continue

    # https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
    url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
    driver.get(url)
    scroll(driver)
    # if True:
    #     continue
    try:
        news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    except Exception as e:
        log.error(f"{num}--{gpdm}--news list element not found")
        continue
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    log.info(f"{num}--{gpdm}--{len(news_lis)} items")
    for i in range(0, len(news_lis)):
        try:
            a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
        except Exception:
            log.error(f"{num}--{gpdm}--{i}----link element not found")
            continue
        news_url = a_ele.get_attribute("href").strip().replace("'", "''")
        if not news_url.startswith("https://finance.yahoo.com"):
            continue
        # Skip URLs that are already in the database
        with cnx.cursor() as cursor:
            sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
            cursor.execute(sel_sql, (news_url, xydm))
            selects = cursor.fetchall()
        if selects:
            log.error(f"{num}--{gpdm}--URL already exists----{news_url}")
            continue
        title = a_ele.text.strip().replace("'", "''")
        getZx(xydm, news_url, title, cnx)
        log.info(f"{num}--{gpdm}--{i}----{news_url}")
    log.info(f"{num}--{gpdm}--company done, took {baseCore.getTimeCost(start_time, time.time())}")
# Release resources
baseCore.close()