提交 c5576d56 作者: 薛凌堃

雅虎财经企业动态

上级 5dc4e829
# 雅虎财经企业动态获取 # 雅虎财经企业动态获取
...@@ -4,13 +4,19 @@ import time ...@@ -4,13 +4,19 @@ import time
import pymysql import pymysql
from kafka import KafkaProducer from kafka import KafkaProducer
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore import sys
baseCore = BaseCore() sys.path.append('D:/zzsn_spider/base')
import BaseCore
from smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
taskType = '企业动态/雅虎财经' taskType = '企业动态/雅虎财经'
smart =smart_extractor.SmartExtractor('cn')
last_url = ''
# 获取资讯详情 # 获取资讯详情
def getZx(xydm, url, title, cnx, path): def getZx(xydm, url, title, cnx, path):
start_time_content = time.time() start_time_content = time.time()
...@@ -30,13 +36,13 @@ def getZx(xydm, url, title, cnx, path): ...@@ -30,13 +36,13 @@ def getZx(xydm, url, title, cnx, path):
timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME,
"time") "time")
contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body") contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body").get_attribute('outerHTML')
author = authorElement.text.lstrip().strip().replace("'", "''") author = authorElement.text.lstrip().strip().replace("'", "''")
pub_time = timeElement.get_attribute("datetime").lstrip().strip().replace("'", "''").replace("T", " ") pub_time = timeElement.get_attribute("datetime").lstrip().strip().replace("'", "''").replace("T", " ")
pub_time = pub_time[0:19] pub_time = pub_time[0:19]
content = contentElement.text.lstrip().strip().replace("'", "''") content = contentElement.replace("'", "''")
driverContent.close() driverContent.close()
# 动态信息列表 # 动态信息列表
...@@ -129,17 +135,52 @@ def getZx(xydm, url, title, cnx, path): ...@@ -129,17 +135,52 @@ def getZx(xydm, url, title, cnx, path):
return exception return exception
def selectUrl(news_url, xydm):
    """Return the stored rows that match an article URL for one company.

    Runs a parameterized query against ``brpa_source_article`` through the
    module-level ``cursor`` (created in the ``__main__`` section), filtering
    on both ``source_address`` and ``social_credit_code``.

    Args:
        news_url: Article link compared against ``source_address``.
        xydm: Value compared against ``social_credit_code``.

    Returns:
        The result of ``cursor.fetchall()``. ``scroll`` treats a truthy
        result as "this article is already stored" and stops scrolling.
    """
    # Uses the shared module-level cursor rather than opening a new one.
    query = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
    cursor.execute(query, (news_url, xydm))
    return cursor.fetchall()
def getLastUrl():
    """Return the link of the last item currently in the press-release list.

    Reads the list from the module-level ``driver``: the element with id
    ``summaryPressStream-0-Stream`` and its ``./ul/li`` children. The anchor
    is looked up at one of two XPath positions, because the headline sits in
    either the second or the first inner ``div`` of the item.

    Returns:
        The ``href`` of the last item's headline link, stripped of
        surrounding whitespace and with single quotes doubled.

    Raises:
        IndexError: If the list contains no ``li`` items.
        Exception: Whatever the driver raises when the list container or
            the fallback anchor cannot be found.
    """
    news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    last_item = news_lis[-1]
    # NOTE(review): the quote doubling below is kept for compatibility, but
    # the value is later passed to a parameterized query in selectUrl --
    # confirm that stored source_address values use the same escaping.
    try:
        href = last_item.find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a").get_attribute("href")
        url = href.strip().replace("'", "''")
    except Exception:
        # Fall back to the alternative layout. Catching Exception instead of
        # a bare except lets KeyboardInterrupt/SystemExit propagate.
        href = last_item.find_element(By.XPATH, "./div[1]/div[1]/div[1]/h3[1]/a").get_attribute("href")
        url = href.strip().replace("'", "''")
    return url
def scroll(xydm, name, gpdm):
    """Scroll the press-release page until no new, unseen items appear.

    Repeatedly sets ``document.documentElement.scrollTop`` to 100000 on the
    module-level ``driver``, waits one second, and re-reads the last link in
    the list. Scrolling stops when any of the following happens:

    * the last link can no longer be read,
    * the database lookup fails,
    * the last link is already stored for this company (``selectUrl``
      returns a truthy result),
    * the last link did not change after a scroll.

    Args:
        xydm: Company code passed to ``selectUrl``.
        name: Company name, used only in log messages.
        gpdm: Stock code, used only in log messages.
    """
    js = "var q=document.documentElement.scrollTop=100000"

    # Bind last_url before the try: it is a function local, so without a
    # default a failed first read left it unbound and the comparison in the
    # loop raised UnboundLocalError.
    last_url = ''
    try:
        last_url = getLastUrl()
    except Exception:
        log.error(f"{name}--{gpdm}--获取不到最后一条链接")

    while True:
        driver.execute_script(js)
        time.sleep(1)  # give the page time to append more items
        try:
            last_url_ = getLastUrl()
        except Exception:
            log.error(f"{name}--{gpdm}--获取不到最后一条链接")
            break
        try:
            selects = selectUrl(last_url_, xydm)
        except Exception:
            # Previously swallowed silently; log it so DB failures are visible.
            log.error(f"{name}--{gpdm}--查询已采集链接失败")
            break
        if selects:
            # Reached an article that is already stored.
            break
        if last_url_ == last_url:
            # Nothing new was loaded by the last scroll.
            break
        last_url = last_url_
if __name__ == "__main__": if __name__ == "__main__":
path = r'D:\chrome\chromedriver.exe' path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe'
driver = baseCore.buildDriver(path) driver = baseCore.buildDriver(path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor() cursor = cnx.cursor()
...@@ -155,6 +196,12 @@ if __name__ == "__main__": ...@@ -155,6 +196,12 @@ if __name__ == "__main__":
name = data[1] name = data[1]
enname = data[5] enname = data[5]
gpdm = data[3] gpdm = data[3]
if 'HK' in str(gpdm):
tmp_g = str(gpdm).split('.')[0]
if len(tmp_g) == 5:
gpdm = str(gpdm)[1:]
else:
pass
xydm = data[2] xydm = data[2]
# 获取该企业对应项目的采集次数 # 获取该企业对应项目的采集次数
...@@ -169,7 +216,6 @@ if __name__ == "__main__": ...@@ -169,7 +216,6 @@ if __name__ == "__main__":
continue continue
url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}" url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
driver.get(url) driver.get(url)
scroll(driver)
try: try:
news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream') news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
except Exception as e: except Exception as e:
...@@ -179,6 +225,11 @@ if __name__ == "__main__": ...@@ -179,6 +225,11 @@ if __name__ == "__main__":
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, url, exception) baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
continue continue
try:
scroll(xydm,name,gpdm)
except Exception as e:
print(e)
log.error(f"{name}--{gpdm}--拖拽出现问题")
news_lis = news_div.find_elements(By.XPATH, "./ul/li") news_lis = news_div.find_elements(By.XPATH, "./ul/li")
log.info(f"{name}--{gpdm}--{len(news_lis)}条信息") log.info(f"{name}--{gpdm}--{len(news_lis)}条信息")
for i in range(0, len(news_lis)): for i in range(0, len(news_lis)):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论