zzsn_spider · commit 862e97ab

1/31

Authored Jan 31, 2024 by 薛凌堃
Parent: 1d1053c8
Showing 8 changed files with 453 additions and 60 deletions.
comData/Tyc/CorePerson.py        +0    -0
comData/Tyc/CorePerson2.py       +0    -0
comData/YanBao/resentYanbao.py   +37   -35
comData/dingzhi/dfsm_sasac.py    +145  -0
comData/dingzhi/gzyw_sasac.py    +157  -0
comData/dingzhi/zzcx.py          +52   -0
comData/policylaw/ClassTool.py   +2    -1
test.py                          +60   -24
comData/Tyc/CorePerson.py
(diff collapsed)

comData/Tyc/CorePerson2.py
(diff collapsed)
comData/YanBao/resentYanbao.py

@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
             break
         except Exception as e:
             time.sleep(3)
+            log.info(e)
             continue
     if page_size < 1:

@@ -206,7 +207,8 @@ def download(data, order_by,header):
         come = data['come']
     except:
         come = ''
+    if publishDate < '2024-01-29':
+        return
     tf_url = add_check_url(sourceAddress)
     if tf_url:
         dic_result = {

@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
     # qianyanzhishiku()
     # except Exception as e:
     #     pass
-    # try:
-    #     log.info('shijiejingjiluntan')
-    #     shijiejingjiluntan()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
+    try:
+        log.info('shijiejingjiluntan')
+        shijiejingjiluntan()
+    except Exception as e:
+        log.info(e)
+        pass
     # try:
     #     log.info('dongfangcaifu')
     #     dongfangcaifu()

@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
     # except Exception as e:
     #     log.info(e)
     #     pass
     #
-    # try:
-    #     log.info('dongfangcaifu4')
-    #     dongfangcaifu4()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
+    try:
+        log.info('dongfangcaifu4')
+        dongfangcaifu4()
+    except Exception as e:
+        log.info(e)
+        pass
     #
-    # try:
-    #     log.info('dongfangcaifu5')
-    #     dongfangcaifu5()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
+    try:
+        log.info('dongfangcaifu5')
+        dongfangcaifu5()
+    except Exception as e:
+        log.info(e)
+        pass
     #
-    # try:
-    #     log.info('dongfangcaifu6')
-    #     dongfangcaifu6()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
+    try:
+        log.info('dongfangcaifu6')
+        dongfangcaifu6()
+    except Exception as e:
+        log.info(e)
+        pass
     #
-    # try:
-    #     log.info('dongfangcaifu7')
-    #     dongfangcaifu7()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
+    try:
+        log.info('dongfangcaifu7')
+        dongfangcaifu7()
+    except Exception as e:
+        log.info(e)
+        pass
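The new guard in download() skips anything published before 2024-01-29 by comparing raw date strings. That only behaves correctly while publishDate stays in zero-padded 'YYYY-MM-DD' form, where lexicographic order matches chronological order; a small illustration (not part of the commit):

# Zero-padded ISO-style dates sort the same way they order in time,
# which is what the new `if publishDate < '2024-01-29'` guard relies on.
assert '2024-01-28' < '2024-01-29'   # older item, skipped
assert '2024-02-01' > '2024-01-29'   # newer item, kept
# An unpadded value would slip through even though it is older:
assert '2024-1-5' > '2024-01-29'     # string order, not date order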
comData/dingzhi/dfsm_sasac.py (new file, mode 100644)

import requests
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


def two_dfsm_mtgc():
    info_list = []
    """
    地方扫描
    """
    url_list = [
        'http://www.sasac.gov.cn/n2588025/n2588129/index.html',
        # 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
    ]
    for url in url_list:
        res = requests.get(url=url, headers=headers)
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        pages = soup.find('td', class_='pages')
        pages_tag = pages['id'].split('pag_')[1]
        pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
        # print(pages)
        # for page in range(378,int(pages)+1):
        for page in range(1, 378):
            log.info(f'==============开始采集第{page}页===============')
            if page == 1:
                url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
            else:
                url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
            try:
                res = requests.get(url=url, headers=headers)
            except:
                continue
            res.encoding = res.apparent_encoding
            res_text = res.text
            soup = BeautifulSoup(res_text, 'html.parser')
            li_list = soup.find('span', id=f'comp_{pages_tag}')
            if li_list:
                li_list = li_list.find_all('li')
            else:
                li_list = soup.find_all('li')
            for li in li_list:
                # print(type(li))
                if len(li):
                    a = li.find('a')
                    # print(a)
                    href = a['href']
                    if 'http' in href:
                        href = href
                    else:
                        href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                    # print(href)
                    try:
                        flag = r.sismember('IN-20240129-0019-test', href)
                        if flag:
                            log.info('信息已采集入库过')
                            continue
                        # else:
                        #     log.info(f'未采到----{page}-----{href}')
                        #     continue
                    except Exception as e:
                        continue
                    # href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
                    try:
                        title = a['title']
                    except:
                        title = ''
                    # print(title)
                    try:
                        res_href = requests.get(url=href, headers=headers, verify=False)
                    except:
                        continue
                    res_href.encoding = res_href.apparent_encoding
                    href_text = res_href.text
                    i_soup = BeautifulSoup(href_text, 'html.parser')
                    result = i_soup.find(class_='zsy_cotitle')
                    try:
                        if result:
                            result = result.find('p').text
                            pub_source = result.split('发布时间:')[0].replace('文章来源:', '').strip()
                            pub_time = result.split('发布时间:')[1]
                            # print(pub_source,pub_time)
                            try:
                                i_soup.find('div', id='div_div').decompose()
                                i_soup.find('div', id='qr_container').decompose()
                            except:
                                pass
                            contentWithTag = str(i_soup.find(class_='zsy_comain'))
                            content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                        else:
                            result = i_soup.find(class_='lyshijian').find_all('span')
                            try:
                                pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        if title == '':
                            log.info(f'title为空----{page}--{title}--{href}')
                            continue
                        info_code = 'IN-20240129-0019'
                        result_dict = {
                            'id': '',
                            'sid': '1751849444877144065',
                            'title': title,
                            'organ': pub_source,
                            'origin': '国务院国有资产监督管理委员会',
                            # '摘要': zhaiyao,
                            'source': 16,
                            'content': content,
                            'contentWithTag': contentWithTag,
                            'publishDate': pub_time,
                            'sourceAddress': href,
                        }
                        log.info(f'{page}--{title}--{href}')
                        # info_list.append(result_dict)
                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                        try:
                            kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                            r.sadd(info_code + '-test', href)
                            log.info('发送kafka成功!')
                        except Exception as e:
                            log.info(e)
                        finally:
                            producer.close()
                    except:
                        continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    two_dfsm_mtgc()
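dfsm_sasac.py and gzyw_sasac.py end each item with the same dedup-and-publish steps: check the URL against a per-source Redis set, send the record to the crawlerInfo Kafka topic, then add the URL to the set. A minimal sketch of that pattern pulled out into a helper (send_record is hypothetical and not part of the commit; hosts and credentials are the ones used above):

import json
import redis
from kafka import KafkaProducer

r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)

def send_record(info_code, href, record):
    # Skip URLs already recorded in the per-source Redis set.
    if r.sismember(info_code + '-test', href):
        return False
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        producer.send("crawlerInfo", json.dumps(record, ensure_ascii=False).encode('utf8'))
        # Mark the URL as seen only after the send has been issued.
        r.sadd(info_code + '-test', href)
        return True
    finally:
        producer.close()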
comData/dingzhi/gzyw_sasac.py (new file, mode 100644)

import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


# 国资要闻
def gzyw():
    info_list = []
    url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    res_text = res.text
    soup = BeautifulSoup(res_text, 'html.parser')
    # pages = soup.find('td',id='pag_4278129')
    pages = soup.find('td', class_='pages')
    pages_tag = pages['id'].split('pag_')[1]
    pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
    # print(pages)
    for page in range(1, int(pages) + 1):
        log.info(f'==============开始采集第{page}页===============')
        if page == 1:
            url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
        else:
            # http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
            url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
        try:
            res = requests.get(url=url, headers=headers)
        except:
            continue
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        li_list = soup.find('span', id=f'comp_{pages_tag}')
        if li_list:
            li_list = li_list.find_all('li')
        else:
            li_list = soup.find_all('li')
        for li in li_list:
            # print(type(li))
            if len(li):
                a = li.find('a')
                # print(a)
                href = a['href']
                if 'http' in href:
                    href = href
                else:
                    href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                # print(href)
                try:
                    flag = r.sismember('IN-20240129-0002-test', href)
                    if flag:
                        # log.info('信息已采集入库过')
                        continue
                    # else:
                    #     log.info(f'未采到----{page}-----{href}')
                except Exception as e:
                    continue
                try:
                    title = a['title']
                except:
                    title = ''
                # print(title)
                try:
                    res_href = requests.get(url=href, headers=headers, verify=False)
                except:
                    continue
                res_href.encoding = res_href.apparent_encoding
                href_text = res_href.text
                i_soup = BeautifulSoup(href_text, 'html.parser')
                result = i_soup.find(class_='zsy_cotitle')
                try:
                    if result:
                        result_ = result.find('p').text
                        pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
                        pub_time = result_.split('发布时间:')[1]
                        # print(pub_source,pub_time)
                        if title == '':
                            result.find('p').decompose()
                            title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
                        try:
                            i_soup.find('div', id='div_div').decompose()
                            i_soup.find('div', id='qr_container').decompose()
                        except:
                            pass
                        contentWithTag = str(i_soup.find(class_='zsy_comain'))
                        content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                    else:
                        result = i_soup.find(class_='lyshijian')
                        if result:
                            result_ = result.find_all('span')
                            try:
                                pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            if title == '':
                                result.find('p').decompose()
                                title = result.text.strip()
                            contentWithTag = str(i_soup.find(class_='articlecontent'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        else:
                            result = i_soup.find(class_='pages-date')
                            pub_source = result.find('span').text.replace('来源:', '').strip()
                            pub_time = result.text
                            pub_time = pub_time.split('来源')[0].strip()
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='pages_content').text)
                            # content = str(i_soup.find(class_='articlecontent').text)
                    if title == '':
                        log.info(f'title为空----{page}--{title}--{href}')
                        continue
                    # zhaiyao = HanLP.extractSummary(content,6)
                    info_code = 'IN-20240129-0002'
                    result_dict = {
                        'id': '',
                        'sid': '1751810519211053058',
                        'title': title,
                        'organ': pub_source,
                        'origin': '国务院国有资产监督管理委员会',
                        # '摘要': zhaiyao,
                        'source': 16,
                        'content': content,
                        'contentWithTag': contentWithTag,
                        'publishDate': pub_time,
                        'sourceAddress': href,
                    }
                    log.info(f'{page}--{title}--{href}')
                    # info_list.append(result_dict)
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    try:
                        kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                        r.sadd(info_code + '-test', href)
                        log.info('发送kafka成功!')
                    except Exception as e:
                        log.info(e)
                    finally:
                        producer.close()
                except:
                    continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    gzyw()
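Both SASAC spiders page backwards through a column using the numeric tag and maxPageNum scraped from the <td class="pages"> element: page 1 is index.html, and every later page maps to index_{tag}_{N}.html with N counting down. A short sketch of that URL scheme with illustrative values (the real tag and count are parsed from the listing page at runtime, as in the code above):

# Illustrative values taken from the comments in gzyw_sasac.py.
pages_tag = '4278129'   # from <td class="pages" id="pag_4278129">
max_page = 131          # from the embedded maxPageNum4278129="..." fragment
base = 'http://www.sasac.gov.cn/n2588025/n2643314'
for page in range(1, max_page + 1):
    if page == 1:
        url = f'{base}/index.html'
    else:
        # page 2 -> index_4278129_130.html, page 3 -> ..._129.html, and so on
        url = f'{base}/index_{pages_tag}_{max_page + 1 - page}.html'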
comData/dingzhi/zzcx.py (new file, mode 100644)

"""
中证智能财讯
"""
import json

import requests
from bs4 import BeautifulSoup


def zzcx():
    url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
    payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Length': '56',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Origin': 'https://zzcx.cs.com.cn',
        'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
    }
    payload = json.dumps(payload)
    result_json = requests.post(url=url, data=payload, headers=headers).json()
    print(result_json)
    pages = result_json['data']['pages']
    for page in range(1, int(pages + 1)):
        payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
        payload_page = json.dumps(payload_page)
        datas = requests.post(url=url, data=payload_page, headers=headers)
        records = datas.json()['data']['records']
        for news in records:
            title = news['title']
            news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
            news_req = requests.get(url=news_url, headers=headers)
            news_soup = BeautifulSoup(news_req.content, 'html.parser')
            detail_info = news_soup.find('div', class_='subTitle___svblj')
            div_list = detail_info.find_all('div')
            origin = div_list[0].text
            publishDate = div_list[1].text


if __name__ == "__main__":
    zzcx()
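zzcx() currently stops after extracting each item's title, origin, and publishDate; nothing is persisted or pushed downstream yet. If it is meant to end the way the other spiders in this commit do, the loop body would continue roughly like this (a hypothetical sketch, not in the committed file):

# Hypothetical continuation for zzcx(), mirroring the other spiders in this commit.
result_dict = {
    'title': title,
    'origin': origin,
    'publishDate': publishDate,
    'sourceAddress': news_url,
}
print(result_dict)  # placeholder until dedup/Kafka wiring is added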
comData/policylaw/ClassTool.py

@@ -85,7 +85,8 @@ class ClassTool():
             '来源': dic_news['labels'][0]['relationName'],
             '创建时间': dic_news['createDate'],
             '带标签内容': dic_news['contentWithTag'][:100],
-            '发布时间': dic_news['publishDate']
+            '发布时间': dic_news['publishDate'],
+            '标题': dic_news['title']
         }
         self.db_storage.insert_one(aaa_dic)
test.py

@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
 #
 # code = use_ocr(out_img_path)
 # 验证码输入框元素.send_keys(code)
-import requests
-headers = {
-    # 'Accept': '*/*',
-    # 'Accept-Encoding': 'gzip, deflate, br',
-    # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
-    # 'Cache-Control': 'no-cache',
-    # 'Connection': 'keep-alive',
-    # 'Host': 'search-api-web.eastmoney.com',
-    # 'Pragma': 'no-cache',
-    # 'Sec-Fetch-Dest': 'script',
-    # 'Sec-Fetch-Mode': 'no-cors',
-    # 'Sec-Fetch-Site': 'same-site',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
-    # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
-    # 'sec-ch-ua-mobile': '?0',
-    # 'sec-ch-ua-platform': '"Windows"'
-}
-url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
-
-
-# res = requests.get(url).text[1:-1]
-res = requests.get(url=url, headers=headers)
-with open('./a.pdf','wb') as f:
-    f.write(res.content)
+# import requests
+# headers = {
+#     # 'Accept': '*/*',
+#     # 'Accept-Encoding': 'gzip, deflate, br',
+#     # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
+#     # 'Cache-Control': 'no-cache',
+#     # 'Connection': 'keep-alive',
+#     # 'Host': 'search-api-web.eastmoney.com',
+#     # 'Pragma': 'no-cache',
+#     # 'Sec-Fetch-Dest': 'script',
+#     # 'Sec-Fetch-Mode': 'no-cors',
+#     # 'Sec-Fetch-Site': 'same-site',
+#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
+#     # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
+#     # 'sec-ch-ua-mobile': '?0',
+#     # 'sec-ch-ua-platform': '"Windows"'
+# }
+# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
+#
+#
+# # res = requests.get(url).text[1:-1]
+# res = requests.get(url=url, headers=headers)
+# with open('./a.pdf','wb') as f:
+#     f.write(res.content)
+import datetime
+import json
+import requests
+import pymongo
+from base import BaseCore
+
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+
+db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软['数据源_0504']
+datas = db_storage.find({'postCode': '2'}).limit(5)
+for data in datas:
+    title = data['titleForeign']
+    contentWithTag = data['richTextForeign']
+    summary = data['contentForeign']
+    dic_info = {
+        'title': title,
+        'summary': summary,
+        'contentWithTag': contentWithTag
+    }
+    headers = {
+        'Content-Type': 'application/json',
+    }
+    dic_info_ = json.dumps(dic_info)
+    # print(dic_info_)
+    # with open('./data.json','w') as f:
+    #     f.write(dic_info_)
+    # break
+    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
+    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
+    log.info(req.text)