zzsn_spider / Commits / c5576d56

Commit c5576d56, authored Aug 14, 2023 by 薛凌堃
Commit message: 雅虎财经企业动态 (Yahoo Finance corporate news)
Parent: 5dc4e829

Showing 1 changed file with 64 additions and 13 deletions.

comData/yhcj/雅虎财经_企业动态.py (+64 -13) @ c5576d56
# 雅虎财经企业动态获取
...
@@ -4,13 +4,19 @@ import time
import pymysql
from kafka import KafkaProducer
from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore
import sys
baseCore = BaseCore()
sys.path.append('D:/zzsn_spider/base')
import BaseCore
from smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
taskType = '企业动态/雅虎财经'
smart = smart_extractor.SmartExtractor('cn')
last_url = ''

# 获取资讯详情
def getZx(xydm, url, title, cnx, path):
    start_time_content = time.time()
...
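The hunk above swaps the package-style import (from base.BaseCore import BaseCore) for a hard-coded sys.path.append('D:/zzsn_spider/base') followed by import BaseCore. A minimal sketch of the same bootstrap done relative to the script's own location, using only the standard library; the repo layout (comData/yhcj/ two levels below the root that contains base/) is an assumption, and BaseCore is the module from the diff:

import sys
from pathlib import Path

# Resolve the repo-local "base" directory relative to this file instead of
# hard-coding an absolute Windows path, then make it importable.
BASE_DIR = Path(__file__).resolve().parents[2] / 'base'   # assumed repo layout
sys.path.append(str(BASE_DIR))

import BaseCore  # imported after the path tweak, as in the diff

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()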
@@ -30,13 +36,13 @@ def getZx(xydm, url, title, cnx, path):
        timeElement = driverContent.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME, "time")
        contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body")
        contentElement = driverContent.find_element(By.CLASS_NAME, "caas-body").get_attribute('outerHTML')
        author = authorElement.text.lstrip().strip().replace("'", "''")
        pub_time = timeElement.get_attribute("datetime").lstrip().strip().replace("'", "''").replace("T", " ")
        pub_time = pub_time[0:19]
        content = contentElement.text.lstrip().strip().replace("'", "''")
        content = contentElement.replace("'", "''")
        driverContent.close()
        # 动态信息列表
...
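The scraped author, publish time, and body are escaped by doubling single quotes, which suggests the values are later spliced into SQL text. A minimal sketch of handing the same fields to pymysql as query parameters instead, so no manual escaping is needed; only social_credit_code and source_address are column names seen in this file, the rest are illustrative:

import pymysql

def save_article(cnx, xydm, url, title, author, pub_time, content_html):
    # Parameterized INSERT: pymysql quotes the values itself, so the
    # replace("'", "''") escaping on author/content becomes unnecessary.
    sql = (
        "insert into brpa_source_article "
        "(social_credit_code, source_address, title, author, publish_time, content) "
        "values (%s, %s, %s, %s, %s, %s)"
    )
    with cnx.cursor() as cursor:
        cursor.execute(sql, (xydm, url, title, author, pub_time, content_html))
    cnx.commit()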
@@ -129,17 +135,52 @@ def getZx(xydm, url, title, cnx, path):
        return exception

# 拖拽30次获取企业新闻
def scroll(driver):
    for i in range(0, 30):
        # js = "window.scrollTo(0,document.body.scrollHeight)"

def selectUrl(news_url, xydm):
    # with cnx.cursor() as cursor:
    sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
    cursor.execute(sel_sql, (news_url, xydm))
    selects = cursor.fetchall()
    return selects

def getLastUrl():
    news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    last = len(news_lis)
    try:
        url = news_lis[last - 1].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a").get_attribute("href").lstrip().strip().replace("'", "''")
    except:
        url = news_lis[last - 1].find_element(By.XPATH, "./div[1]/div[1]/div[1]/h3[1]/a").get_attribute("href").lstrip().strip().replace("'", "''")
    return url

def scroll(xydm, name, gpdm):
    last_url_ = ''
    try:
        last_url = getLastUrl()
    except:
        log.error(f"{name}--{gpdm}--获取不到最后一条链接")
    while True:
        js = "var q=document.documentElement.scrollTop=100000"
        driver.execute_script(js)
        time.sleep(0.1)
        time.sleep(1)
        try:
            last_url_ = getLastUrl()
        except Exception as e:
            log.error(f"{name}--{gpdm}--获取不到最后一条链接")
            break
        try:
            selects = selectUrl(last_url_, xydm)
        except:
            break
        if selects:
            break
        if last_url_ == last_url:
            break
        last_url = last_url_

if __name__ == "__main__":
    path = r'D:\chrome\chromedriver.exe'
    path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe'
    driver = baseCore.buildDriver(path)
    cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
    cursor = cnx.cursor()
...
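The rewritten scroll() keeps scrolling the page until the last visible article link either stops changing or is already recorded in brpa_source_article for that company, instead of blindly scrolling 30 times as before. A generalized sketch of that termination pattern, with the page-specific pieces passed in as callables; the function and parameter names are illustrative, and the hard cap on rounds is an added safeguard not present in the diff:

import time

def scroll_until_known(scroll_once, get_last_url, is_known, max_rounds=200):
    """Scroll until the newest visible item repeats or is already stored.

    scroll_once   -- callable that scrolls the page down one step
    get_last_url  -- callable returning the href of the last visible item
    is_known      -- callable(url) -> bool, True if the url is already in the DB
    """
    last_seen = None
    for _ in range(max_rounds):          # hard cap so the loop cannot spin forever
        scroll_once()
        time.sleep(1)                    # give the lazy-loaded list time to grow
        try:
            current = get_last_url()
        except Exception:
            break                        # list vanished or page changed: stop
        if is_known(current) or current == last_seen:
            break                        # reached already-collected or stable content
        last_seen = current

With the pieces from the diff this would be called roughly as scroll_until_known(lambda: driver.execute_script(js), getLastUrl, lambda u: bool(selectUrl(u, xydm))).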
@@ -155,6 +196,12 @@ if __name__ == "__main__":
        name = data[1]
        enname = data[5]
        gpdm = data[3]
        if 'HK' in str(gpdm):
            tmp_g = str(gpdm).split('.')[0]
            if len(tmp_g) == 5:
                gpdm = str(gpdm)[1:]
            else:
                pass
        xydm = data[2]
        # 获取该企业对应项目的采集次数
...
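The new HK branch above trims a ticker whose numeric part has five digits down to four by dropping the leading character, and leaves other codes untouched. A small self-contained sketch of that rule; the sample codes in the checks are illustrative:

def normalize_hk_ticker(gpdm: str) -> str:
    # Mirror of the branch in the diff: a five-digit HK code such as
    # '00700.HK' loses its leading character and becomes '0700.HK';
    # anything else is returned unchanged.
    if 'HK' in str(gpdm):
        tmp_g = str(gpdm).split('.')[0]
        if len(tmp_g) == 5:
            return str(gpdm)[1:]
    return str(gpdm)

# illustrative checks
assert normalize_hk_ticker('00700.HK') == '0700.HK'
assert normalize_hk_ticker('0700.HK') == '0700.HK'
assert normalize_hk_ticker('AAPL') == 'AAPL'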
@@ -169,7 +216,6 @@ if __name__ == "__main__":
            continue
        url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
        driver.get(url)
        scroll(driver)
        try:
            news_div = driver.find_element(By.ID, 'summaryPressStream-0-Stream')
        except Exception as e:
...
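For reference, the press-release listing URL built above expands, for an illustrative ticker, as follows:

gpdm = 'AAPL'   # illustrative ticker
url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
# -> https://finance.yahoo.com/quote/AAPL/press-releases?p=AAPL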
@@ -179,6 +225,11 @@ if __name__ == "__main__":
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
            continue
        try:
            scroll(xydm, name, gpdm)
        except Exception as e:
            print(e)
            log.error(f"{name}--{gpdm}--拖拽出现问题")
        news_lis = news_div.find_elements(By.XPATH, "./ul/li")
        log.info(f"{name}--{gpdm}--{len(news_lis)}条信息")
        for i in range(0, len(news_lis)):
...