Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
0a703aad
提交
0a703aad
authored
2月 19, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
中国证券报·中证网
上级
79e1222f
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
138 行增加
和
72 行删除
+138
-72
zzcx.py
comData/dingzhi/zzcx.py
+138
-72
没有找到文件。
comData/dingzhi/zzcx.py
浏览文件 @
0a703aad
...
...
@@ -2,8 +2,12 @@
中证智能财讯
"""
import
json
import
os
import
sys
import
time
import
redis
from
kafka
import
KafkaProducer
from
obs
import
ObsClient
import
fitz
import
requests
...
...
@@ -11,6 +15,10 @@ from bs4 import BeautifulSoup
from
retry
import
retry
from
selenium.webdriver.common.by
import
By
from
selenium
import
webdriver
from
tempfile
import
NamedTemporaryFile
from
selenium.webdriver.support.wait
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
sys
.
path
.
append
(
'D:
\\
kkwork
\\
zzsn_spider
\\
base'
)
import
BaseCore
baseCore
=
BaseCore
.
BaseCore
()
...
...
@@ -36,66 +44,12 @@ def create_driver():
@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
    """Upload a local file to the ``zzsn`` OBS bucket and return the SDK result.

    Args:
        pathType: Key prefix inside the bucket. It is concatenated with
            ``name`` as-is, so it must carry its own trailing separator
            (the ``__main__`` block of this script uses ``'PhotoDingzhi/'``).
        name: Object name appended to ``pathType`` to form the object key.
        response: Path of the local file to upload. The parameter name is
            historical; the screenshot callers in this file pass the path of
            a temporary ``.png`` file.

    Returns:
        Whatever ``obsClient.putFile`` returns. Callers in this file read
        ``result['body']['objectUrl']`` from it.

    Note:
        ``obsClient`` is a module-level object created outside the part of
        the file shown here. The call is wrapped by ``retry(tries=3, delay=1)``;
        an exception that survives the retries propagates to the caller.
    """
    # The former ``putContent(..., content=response.content)`` upload is gone:
    # callers hand over a file path (a str), which has no ``.content``
    # attribute, so that statement raised before ``putFile`` could ever run.
    return obsClient.putFile('zzsn', pathType + name, file_path=response)
def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time, create_by):
    """Download a PDF, store it in the ``zzsn`` OBS bucket and extract its text.

    Args:
        pdf_url: URL the PDF is downloaded from.
        name_pdf: Not used by this function; kept so existing callers still work.
        type_id: Copied into the result under ``'type_id'``.
        social_code: Copied into the result under ``'item_id'`` and passed to
            ``baseCore.recordLog`` when building the result fails.
        pathType: Key prefix of the uploaded object inside the bucket.
        taskType: Passed to ``baseCore.recordLog`` when building the result fails.
        start_time: Start timestamp handed to ``baseCore.getTimeCost`` for the
            failure log entry.
        create_by: Copied into the result under ``'create_by'``.

    Returns:
        dict: The attachment record. ``'state'`` is ``True`` only when the
        download, the upload, the PDF parse (at least one page) and the
        object-URL lookup all succeeded; on any failure the record is
        returned with ``'state'`` still ``False``.
    """
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '', 'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': baseCore.getRandomUserAgent()}

    # Up to three download attempts, pausing 3 seconds after each failure.
    response = None
    for _ in range(3):
        try:
            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            break
        except requests.RequestException:
            time.sleep(3)
    if response is None:
        # Previously this case surfaced as a NameError on ``response`` that the
        # upload's bare ``except`` happened to swallow.
        log = baseCore.getLogger()
        log.error('文件下载失败')
        return retData

    # A missing/invalid Content-Length is not a download failure: fall back to
    # the size of the body that was actually received.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(response.content)

    page_size = 0
    name = str(baseCore.getuuid()) + '.pdf'
    now_time = time.strftime("%Y-%m")

    # getOBSres() in this file takes (pathType, name, response); the old
    # four-argument call raised TypeError on every invocation. The downloaded
    # bytes are uploaded directly instead, still with three tries.
    # NOTE(review): the key layout <pathType><YYYY-MM>/<uuid>.pdf assumes
    # pathType ends with '/', as the value in this script's __main__ block
    # does - confirm against existing objects in the bucket.
    object_key = f'{pathType}{now_time}/{name}'

    @retry(tries=3, delay=1)
    def _upload():
        return obsClient.putContent('zzsn', object_key, content=response.content)

    try:
        result = _upload()
    except Exception:
        log = baseCore.getLogger()
        log.error(f'OBS发送失败')
        return retData

    # Parse the PDF from memory to obtain the page count and the plain text.
    try:
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            page_size = doc.page_count
            retData['content'] = ''.join(page.get_text() for page in doc.pages())
    except Exception:
        log = baseCore.getLogger()
        log.error(f'文件损坏')
        return retData

    if page_size < 1:
        # PDF could not be parsed into any page.
        return retData

    try:
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        object_url = result['body']['objectUrl']
        relative_path = object_url.split('.com')[1]
        readable_size = baseCore.convert_size(file_size)
    except Exception as e:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
        return retData

    # Everything needed is available; only now mark the record as successful
    # (setting the flag first left 'state' True when a lookup above failed).
    retData['path'] = relative_path
    retData['full_path'] = object_url
    retData['file_size'] = readable_size
    retData['create_time'] = time_now
    retData['page_size'] = page_size
    retData['state'] = True
    return retData
def
zzcx
():
driver
=
create_driver
()
driver
.
maximize_window
()
url
=
'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
payload
=
{
"pageNo"
:
1
,
"pageSize"
:
15
,
"statusList"
:
[
0
],
"keyword"
:
""
}
headers
=
{
...
...
@@ -119,7 +73,7 @@ def zzcx():
result_json
=
requests
.
post
(
url
=
url
,
data
=
payload
,
headers
=
headers
)
.
json
()
print
(
result_json
)
pages
=
result_json
[
'data'
][
'pages'
]
for
page
in
range
(
1
,
int
(
pages
+
1
)
):
for
page
in
range
(
1
,
int
(
pages
)
+
1
):
payload_page
=
{
"pageNo"
:
page
,
"pageSize"
:
15
,
"statusList"
:
[
0
],
"keyword"
:
""
}
payload_page
=
json
.
dumps
(
payload_page
)
datas
=
requests
.
post
(
url
=
url
,
data
=
payload_page
,
headers
=
headers
)
...
...
@@ -128,23 +82,130 @@ def zzcx():
title
=
news
[
'title'
]
news_url
=
'https://zzcx.cs.com.cn/app/zzb/detail?id='
+
news
[
'manuscriptId'
]
try
:
flag
=
r
.
sismember
(
'IN-20240129-0001'
,
news_url
)
if
flag
:
log
.
info
(
'信息已采集入库过'
)
continue
except
Exception
as
e
:
continue
# news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=2eeeb171e36b42ada02dad77b80038b1'
# 使用模拟浏览器打开
driver
=
create_driver
()
driver
.
get
(
news_url
)
div_
=
driver
.
find_element
(
By
.
ID
,
'line'
)
div
=
div_
.
find_element
(
By
.
XPATH
,
'..'
)
image_data
=
div
.
screenshot_as_base64
# todo:保存到obs链接及标签替换
baseCore
.
uptoOBS
()
html
=
driver
.
page_source
news_req
=
requests
.
get
(
url
=
news_url
,
headers
=
headers
)
news_soup
=
BeautifulSoup
(
news_req
.
content
,
'html.parser'
)
div_photo
=
driver
.
find_elements
(
By
.
ID
,
'line'
)
for
png_
in
div_photo
:
div
=
png_
.
find_element
(
By
.
XPATH
,
'.//div/div[1]/div'
)
# div = png_.find_element(By.CLASS_NAME, 'ant-col ant-col-17')
# todo:滚轮需要滑动
driver
.
execute_script
(
"arguments[0].scrollIntoView();"
,
div
)
time
.
sleep
(
1
)
#todo:保存成临时文件
temp_file
=
NamedTemporaryFile
(
delete
=
False
,
suffix
=
".png"
)
temp_file
.
close
()
div
.
screenshot
(
temp_file
.
name
)
file_path
=
temp_file
.
name
# todo:保存到obs链接及标签替换
name
=
str
(
baseCore
.
getuuid
())
result
=
getOBSres
(
pathType
,
name
,
file_path
)
path
=
result
[
'body'
][
'objectUrl'
]
.
split
(
'.com'
)[
1
]
full_path
=
result
[
'body'
][
'objectUrl'
]
#todo:替换标签 删除标签
dele_tag
=
png_
.
find_element
(
By
.
XPATH
,
'.//div/div[1]//div'
)
driver
.
execute_script
(
"arguments[0].remove()"
,
dele_tag
)
#todo:将图片塞进去 新建一个new_tag
append_tag
=
png_
.
find_element
(
By
.
XPATH
,
'.//div/div[1]'
)
driver
.
execute_script
(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com"
+
path
+
"'; arguments[0].insertBefore(newElement, arguments[0].firstChild);"
,
append_tag
)
os
.
remove
(
file_path
)
# div_undefined_line = driver.find_elements(By.ID, 'k-line-undefined')
div_undefined_line
=
driver
.
find_elements
(
By
.
ID
,
'KLineSubscription'
)
for
u_png
in
div_undefined_line
:
div_u
=
u_png
.
find_element
(
By
.
XPATH
,
'.//div'
)
# todo:滚轮需要滑动
driver
.
execute_script
(
"arguments[0].scrollIntoView();"
,
div_u
)
time
.
sleep
(
3
)
# todo:保存成临时文件
temp_file
=
NamedTemporaryFile
(
delete
=
False
,
suffix
=
".png"
)
temp_file
.
close
()
div_u
.
screenshot
(
temp_file
.
name
)
file_path
=
temp_file
.
name
# todo:保存到obs链接及标签替换
name
=
str
(
baseCore
.
getuuid
())
result
=
getOBSres
(
pathType
,
name
,
file_path
)
path
=
result
[
'body'
][
'objectUrl'
]
.
split
(
'.com'
)[
1
]
full_path
=
result
[
'body'
][
'objectUrl'
]
# todo:替换标签 删除标签
dele_tag
=
u_png
.
find_element
(
By
.
XPATH
,
'.//div'
)
driver
.
execute_script
(
"arguments[0].remove()"
,
dele_tag
)
# todo:将图片塞进去 新建一个new_tag
# append_tag = u_png.find_element(By.XPATH, './/div')
driver
.
execute_script
(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com"
+
path
+
"'; arguments[0].insertBefore(newElement, arguments[0].firstChild);"
,
u_png
)
os
.
remove
(
file_path
)
div_line_bar
=
driver
.
find_elements
(
By
.
ID
,
'bar-line-bar-line'
)
for
lin_bar_tag
in
div_line_bar
:
line_bars
=
lin_bar_tag
.
find_elements
(
By
.
XPATH
,
'.//div[contains(@class, "ant-col-11")]'
)
for
line_bar
in
line_bars
:
photo_line_bar
=
line_bar
.
find_element
(
By
.
XPATH
,
'.//div'
)
# todo:滚轮需要滑动
driver
.
execute_script
(
"arguments[0].scrollIntoView();"
,
photo_line_bar
)
time
.
sleep
(
1
)
# todo:保存成临时文件
temp_file
=
NamedTemporaryFile
(
delete
=
False
,
suffix
=
".png"
)
temp_file
.
close
()
photo_line_bar
.
screenshot
(
temp_file
.
name
)
file_path
=
temp_file
.
name
# todo:保存到obs链接及标签替换
name
=
str
(
baseCore
.
getuuid
())
result
=
getOBSres
(
pathType
,
name
,
file_path
)
path
=
result
[
'body'
][
'objectUrl'
]
.
split
(
'.com'
)[
1
]
full_path
=
result
[
'body'
][
'objectUrl'
]
# todo:替换标签 删除标签
dele_tag_
=
line_bar
.
find_element
(
By
.
XPATH
,
'.//div'
)
driver
.
execute_script
(
"arguments[0].remove()"
,
dele_tag_
)
# todo:将图片塞进去 新建一个new_tag
driver
.
execute_script
(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com"
+
path
+
"'; newElement.style.width = '50
%
'; newElement.style.position = 'relative'; newElement.style.float = 'left'; arguments[0].insertBefore(newElement, arguments[0].firstChild);"
,
line_bar
)
# #todo:创建清晰的图片标签
# driver.execute_script(f"""
# var img = new Image();
# img.src = "http://zzsn.luyuen.com{path}"; // 替换为你的图片路径
# img.onload = function() {{
# var canvas = document.createElement("canvas");
# canvas.width = img.width;
# canvas.height = img.height;
# var ctx = canvas.getContext("2d");
# ctx.drawImage(img, 0, 0);
# document.body.appendChild(canvas);
# }}; arguments[0].insertBefore(img, arguments[0].firstChild);
# """, line_bar)
os
.
remove
(
file_path
)
html
=
driver
.
page_source
news_soup
=
BeautifulSoup
(
html
,
'html.parser'
)
detail_info
=
news_soup
.
find
(
'div'
,
class_
=
'subTitle___svblj'
)
div_list
=
detail_info
.
find_all
(
'div'
)
origin
=
div_list
[
0
]
.
text
publishDate
=
div_list
[
1
]
.
text
contentWithTag
=
news_soup
.
find
(
'div'
,
class_
=
'editable___1EtCQ editor-editable'
)
# print(contentWithTag)
for
tag
in
contentWithTag
.
find_all
(
'span'
):
if
tag
.
text
==
'
\ufeff
'
:
tag
.
decompose
()
content
=
contentWithTag
.
text
info_code
=
'IN-20240129-0001'
result_dict
=
{
...
...
@@ -152,25 +213,29 @@ def zzcx():
'sid'
:
'1751787750127857666'
,
'title'
:
title
,
'organ'
:
origin
,
'origin'
:
'国务院国有资产监督管理委员会'
,
'origin'
:
origin
,
# '摘要': zhaiyao,
'source'
:
16
,
'content'
:
content
,
'contentWithTag'
:
contentWithTag
,
'contentWithTag'
:
str
(
contentWithTag
)
,
'publishDate'
:
publishDate
,
'sourceAddress'
:
news_url
,
}
log
.
info
(
f
'{page}--{title}--{href}'
)
# info_list.append(result_dict)
log
.
info
(
f
'{page}--{title}--{news_url}'
)
print
(
result_dict
)
# break
# break
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
])
try
:
kafka_result
=
producer
.
send
(
"crawlerInfo"
,
json
.
dumps
(
result_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
r
.
sadd
(
info_code
+
'-test'
,
href
)
r
.
sadd
(
info_code
,
news_url
)
log
.
info
(
'发送kafka成功!'
)
except
Exception
as
e
:
log
.
info
(
e
)
finally
:
producer
.
close
()
if __name__ == "__main__":
    # Script entry point. ``pathType`` and ``r`` stay module-level names
    # because zzcx() reads them directly rather than taking parameters.
    # NOTE(review): connection details, including the password, are
    # hard-coded here.
    pathType = 'PhotoDingzhi/'
    r = redis.Redis(
        host='114.115.236.206',
        port=6379,
        password='clbzzsn',
        db=5,
    )
    zzcx()
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论