Commit 8f9f0213 by 丁双波

Yahoo collection script submission

Parent 42cc2e60
Yahoo Finance: news collection for overseas listed companies
# Yahoo Finance company news collection
import time
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from base.BaseCore import BaseCore
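# BaseCore supplies the shared helpers used below: getLogger(), getTimeCost() and close()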
baseCore = BaseCore()
log = baseCore.getLogger()
# Fetch the article detail page and save it to the database
def getZx(xydm, url, title, cnx):
    start_time_content = time.time()
    try:
        # Headless Chrome instance for the article page, with images disabled
        chrome_options_content = webdriver.ChromeOptions()
        chrome_options_content.add_argument('--disable-gpu')
        chrome_options_content.add_argument('--ignore-certificate-errors')
        chrome_options_content.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options_content.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options_content.add_argument("--start-maximized")
        prefs_content = {'profile.managed_default_content_settings.images': 2}
        chrome_options_content.add_experimental_option('prefs', prefs_content)
        chrome_options_content.add_argument('--headless')
        executable_path = r'E:\chromedriver_win32\chromedriver.exe'
        driverContent = webdriver.Chrome(options=chrome_options_content, executable_path=executable_path)

        driverContent.get(url)
        # Expand the collapsed article body if the "read more" button is present
        try:
            clickButton = driverContent.find_element(By.CLASS_NAME, "collapse-button")
            clickButton.click()
        except Exception:
            pass
        time.sleep(0.5)

        authorElement = driverContent.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
        timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, "time")
        contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body")

        author = authorElement.text.strip().replace("'", "''")
        pub_time = timeElement.get_attribute("datetime").strip().replace("'", "''").replace("T", " ")
        pub_time = pub_time[0:19]
        content = contentElement.text.strip().replace("'", "''")
        driverContent.close()

        # Article record: credit code, title, summary, content, publish date, url, origin, author, type, language
        list_info = [
            xydm,
            title,
            '',
            content,
            pub_time,
            url,
            '雅虎财经',
            author,
            '2',
            'zh'
        ]
        with cnx.cursor() as cursor:
            try:
                insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                cursor.execute(insert_sql, tuple(list_info))
                cnx.commit()
            except Exception as e1:
                log.error("Failed to save to database")
        log.info(f"Article fetched, took {baseCore.getTimeCost(start_time_content, time.time())}")
    except Exception as e:
        log.error("Failed to fetch article content")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs',prefs)
chrome_options.add_argument('--headless')
executable_path = r'E:\chromedriver_win32\chromedriver.exe'
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
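# MySQL connection used both for the duplicate-URL check and for inserting new articles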
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# Scroll the list page repeatedly so the lazily loaded news stream is fully rendered
def scroll(driver):
    for i in range(0, 30):
        # js = "window.scrollTo(0,document.body.scrollHeight)"
        js = "var q=document.documentElement.scrollTop=100000"
        driver.execute_script(js)
        time.sleep(0.1)
# Read the company list from Excel
df_all = pd.read_excel(r'.\data\国外企业.xlsx', sheet_name=0, keep_default_na=False)
for num in range(718, len(df_all)):
    start_time = time.time()
    country = df_all['国别'][num]  # country
    if country != '国外':  # only process foreign companies
        continue
    enname = df_all['英文名称'][num]  # English name
    gpdm = df_all['股票票代码'][num]  # stock ticker
    xydm = df_all['信用代码'][num]  # social credit code
    if gpdm == '':
        log.error(f"{num}--{gpdm}--ticker is empty, skipping")
        continue
    if xydm == '':
        log.error(f"{num}--{gpdm}--credit code is empty, skipping")
        continue
    count = int(df_all['企业动态数量(7.15)'][num])  # number of news items already collected
    # if count > 0:
    #     log.error(f"{num}--{gpdm}--already has news, skipping")
    #     continue

    # https://finance.yahoo.com/quote/GOOG/press-releases?p=GOOG
    url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
    driver.get(url)
    scroll(driver)
    # if True:
    #     continue
    try:
        news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    except Exception as e:
        log.error(f"{num}--{gpdm}--news list element not found")
        continue
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    log.info(f"{num}--{gpdm}--{len(news_lis)} items")
    for i in range(0, len(news_lis)):
        try:
            a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
        except Exception:
            log.error(f"{num}--{gpdm}--{i}----link element not found")
            continue
        news_url = a_ele.get_attribute("href").strip().replace("'", "''")
        if not news_url.startswith("https://finance.yahoo.com"):
            continue
        # Skip URLs that are already in the database
        with cnx.cursor() as cursor:
            sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
            cursor.execute(sel_sql, (news_url, xydm))
            selects = cursor.fetchall()
        if selects:
            log.error(f"{num}--{gpdm}--URL already exists----{news_url}")
            continue
        title = a_ele.text.strip().replace("'", "''")
        getZx(xydm, news_url, title, cnx)
        log.info(f"{num}--{gpdm}--{i}----{news_url}")
    log.info(f"{num}--{gpdm}--company done, took {baseCore.getTimeCost(start_time, time.time())}")
# Release resources
baseCore.close()