Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
f434a907
提交
f434a907
authored
4月 07, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
华尔街采集
上级
a16f8aa1
全部展开
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
236 行增加
和
0 行删除
+236
-0
wsj-TECH.py
comData/dingzhi/wsj-TECH.py
+0
-0
wsj_cookie.txt
comData/dingzhi/wsj_cookie.txt
+2
-0
wsj_detail.py
comData/dingzhi/wsj_detail.py
+180
-0
wsj_getcookies.py
comData/dingzhi/wsj_getcookies.py
+54
-0
没有找到文件。
comData/dingzhi/wsj-TECH.py
浏览文件 @
f434a907
差异被折叠。
点击展开。
comData/dingzhi/wsj_cookie.txt
0 → 100644
浏览文件 @
f434a907
{"s_tp": "4333", "_ncg_domain_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1.1711521614929.1774593614929", "_ncg_sp_id.5378": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1711521615.1.1711521616.1711521615.89221ca3-0ff7-447c-bd6b-88bc0d7b9ca1", "__eoi": "ID=74957e9c589e06c8:T=1711521614:RT=1711521614:S=AA-AfjbpSmTSwP45PV1P_Sfr-2eA", "_ncg_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984", "__gpi": "UID=00000d6a82c6423a:T=1711521614:RT=1711521614:S=ALNI_Mamy4ax3m1xyG-SWHlZCu8YbWNV7w", "__gads": "ID=4e5002beb21f7800:T=1711521614:RT=1711521614:S=ALNI_MaWml862ei0DYTIgH5jH8-qiduUyA", "dicbo_id": "%7B%22dicbo_fetch%22%3A1711521614435%7D", "s_cc": "true", "_ncg_sp_ses.5378": "*", "_dj_sp_id": "f24fbbcc-3872-4f04-b628-048e0da2d503", "_uetvid": "ddc10a40ec0411ee968a03a12159858e", "s_ppv": "CWSJ_Home_Home%2520Page%2C13%2C13%2C570", "_uetsid": "ddc0cf80ec0411eea60095acfb805f07", "_gcl_au": "1.1.1409873185.1711521614", "_pin_unauth": "dWlkPU4yUTFOVEF6TXpZdFlqYzRaaTAwWXpGa0xXRTVNell0TjJJd05qaGtPVFUzTkdNMw", "AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1", "_rdt_uuid": "1711521612889.741c0512-609f-4857-9453-d2627aab72f2", "cX_P": "lu9fryu88xq1rnhv", "_meta_facebookTag_sync": "1711521612864", "_dj_id.9183": ".1711521612.1.1711521612..42d375e3-8210-4847-b37b-e2413e02fe5a..1bafccfe-479d-43d7-ae40-2c0b27f1273b.1711521612160.1", "usr_prof_v2": "eyJwIjp7InBzIjowLjg4LCJxIjowLjg2fSwiY3AiOnsiZWMiOiJTdGFibGUiLCJwYyI6MC4wMTQzMywicHNyIjowLjMyNTEsInRkIjoxNzI1LCJhZCI6MjgsInFjIjozMCwicW8iOjI3LCJzY2VuIjp7ImNoZSI6MC4wMzA0MywiY2huIjowLjAzMDgyLCJjaGEiOjAuMDE0MzMsImNocCI6MC4wMTczfX0sIm9wIjp7ImkiOiI2MTdiY2UwMCIsImoiOnsianQiOiJlbGdtIn19LCJpYyI6M30%3D", "_scid": "22e08d14-b460-4edb-8046-8b897104f696", "ResponsiveConditional_initialBreakpoint": "md", "vcdpaApplies": "false", "_pctx": "%7Bu%7DN4IgrgzgpgThIC5QDYoFYBGBjNB2AhgEwAMAzACYbHH6kCcGuhuGAZgIzEAcWhUhebFla5y5QqRzssxNPnKs0zOqwAspKKox06yYAHcIAKwC%2BiUAAcYUVgEsAHohCGjIADQgALgE8LUJwDCABogJiYekLAAyp74npBO%2BAB2APZJ7iAQtp5QAJLkTnSExaSkaEpcaMi4yGjsdKSqoUA", "cX_G": 
"cx%3Aar1n90irbdrh1nz3umsn41upp%3A12bdufvkc9frm", "ab_uuid": "5be368fe-5b01-4451-8d6c-54c8bd163f1b", "AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1585540135%7CMCIDTS%7C19810%7CMCMID%7C48333472371426108300129167066567447308%7CMCAID%7CNONE%7CMCOPTOUT-1711528813s%7CNONE%7CMCAAMLH-1712126413%7C6%7CMCAAMB-1712126413%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CMCSYNCSOP%7C411-19817%7CvVersion%7C4.4.0", "datadome": "i1JSVSww9u2lCnQC7CmZsfQQNOATMw1rMpQl2syLPun2T1iS1iZ9PP~uUddO0Tp~ABwhQ~K~x~CIeUhxxhz18uVd1dpdc5wK8gDSrOlrovmG8ozGm4KjIXF~mas7NsGe", "gdprApplies": "false", "ccpaApplies": "false", "_fbp": "fb.1.1711521612864.2074745750", "ajs_anonymous_id": "86ceb415-2de7-4a04-bbf7-b3b952852670", "utag_main": "v_id:018e7ea33c2c004ea140fb3e2df00507d0013075007e8$_sn:1$_se:1$_ss:1$_st:1711523410801$ses_id:1711521610801%3Bexp-session$_pn:1%3Bexp-session$_prevpage:CWSJ_Home_Home%20Page%3Bexp-1711525210821$vapi_domain:wsj.com", "regulationApplies": "gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse", "ca_rt": "MI50RGh7xOF_Ha5M-pfrXQ.IWGd-OI_q1g7AlutTc36h9FX7n1uQrebrg16x7dunYcPqFafWtPhVQ0ZQqvmKc6OzBixBYWSZvxCXF_gye9vCE0vT01vzuzUI8a-JK4X4LE", "_dj_ses.9183": "*", "DJSESSION": "country%3Dsg%7C%7Ccontinent%3Das%7C%7Cregion%3D", "ca_id": 
"eJxdkMtSgzAUht8la2hzIYSwkg7o4NTWqVUXjsOEXGwstB1Ixer47qYXN-7O-f7_XL_Bm_3Qm2ojWg1S0IruAAJgRGubwz-oW2Ebn36t1hjS5ArFZCS3rVf2e6u8oDVmnFETMgNVGOEkCmvMVChNLDGKGElU4t2uE3J9Kog1rSVlAkOiaggF4TXDrDYIJhJrTFktpWFKYSIpkpAKZagfYSKio5rz2Dfrto3uQfoCrhdFsShuwnKWl09l_phNvfr8cBuW91l-Di8gm-WLeZlXy2wyLZYXeDeflNMCvAZA7N2qcvZ4OWIIUYwoTwIgOy2cVpVwntMYR5gzjAJgT-DPyP2bPndnwCk5Adv7BcHKuV2fjsfDMIyG_v34ubFsrN448PMLjPFu1g.qGey7P4In7Rq_zwb3rdDqeMgXg4ctbinCQ8wcWje7hmNwg48tJ5nKGnGeVLUEIynG34nBOAARDWFeZPNyFFPQr5JS-xjPAnfuuCYxSkS8Z6C3FVwBZ0D4rkSU11Ts67PVCwzfI3f4qxYy9M8JB4WnM6PQYU0wJ_WCZAzb2pDxEASxzKfzzm_M5FdybgMHkY-4WcbF-Zp0V8RHWG7eH9OvmIYZZNq621vKCZXg6hnJmeh6FZdVxiSXPrOG4K1zGgtb-wHrKJFVu5VmGXY_ygilEFec8v27wiASm9IiMmqZ-wQ_ej0u9OS2YMIA5Fzn7kqx1mgYOWCL3eLVYR1R01oigJx4q2GdyrQDVyp0X_8Z7aaGSj4UgYR8Q2mxfrB7AWeWqeKbec8RdJPDh9kjKvcs_KVPtfNVyzHKH-f9fv7hnh2Rmot44XSUPb4WqNUJx8N1FNXV8mjOLP37oVCZZaeVW2NoNNeNbN75WYS4kSjBhuUtBN8iumYaxR5xmCk41UYbKldPN4qyFE0J1hBt64mXbhGD0clVCFdBMWpw29ZIMdkXwpqX3Ig0FVGku6QO4pXlOWeqVPb2Tc78yGqxmg-Nqc90C2-nANOTdT1AZt27-FuB6KAiadLNFlcVcYNYi8vD6ylHGZiHB8DemoFPLoKN1WH5ggodOBRyvMFO2Asry8", "_scid_r": "22e08d14-b460-4edb-8046-8b897104f696", "_ncg_g_id_": "cbeb2daa-da53-494b-a606-90a386ee0b55.3.1711521616.1774593614929", "TR": "V2-6e5bc57a203db00a39b727bf108c2e257bccf7dd23c51c05adf5279f43e4b996", "_sctr": "1%7C1711468800000", "_pcid": "%7B%22browserId%22%3A%22lu9fryu88xq1rnhv%22%7D", "wsjregion": "asia%2Ccn"}
\ No newline at end of file
comData/dingzhi/wsj_detail.py
0 → 100644
浏览文件 @
f434a907
from
bs4
import
BeautifulSoup
import
requests
,
time
,
json
import
redis
,
random
from
kafka
import
KafkaProducer
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.edge.options
import
Options
from
selenium.webdriver.edge.service
import
Service
from
apscheduler.schedulers.blocking
import
BlockingScheduler
import
sys
sys
.
path
.
append
(
"../../base"
)
from
base
import
BaseCore
log
=
BaseCore
.
BaseCore
()
.
getLogger
()
def create_driver():
    """Start an Edge WebDriver that sends its traffic through the local proxy.

    The msedgedriver executable is taken from a fixed Windows path and the
    browser is pointed at the HTTP proxy listening on 127.0.0.1:1080.

    Returns:
        The ``webdriver.Edge`` instance; the caller owns it.
    """
    proxy_by_scheme = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }

    options = Options()
    # Remove the 'enable-automation' switch and turn off the
    # AutomationControlled Blink feature -- presumably so the session looks
    # less like an automated browser (intent not stated in the code).
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_argument('--disable-blink-features=AutomationControlled')
    # Only the plain-HTTP proxy entry is handed to the browser.
    options.add_argument('--proxy-server=%s' % proxy_by_scheme['http'])

    return webdriver.Edge(
        service=Service(r'D:\soft\msedgedriver.exe'),
        options=options,
    )
def create_google():
    """Start a Chrome WebDriver from the locally installed driver and browser.

    Both the chromedriver executable and the Chrome binary are read from
    fixed Windows paths. GPU use is disabled, the 'enable-automation' switch
    is excluded and the AutomationControlled Blink feature is turned off.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    # Imported locally: the file-level ``Service`` name is the Edge service
    # class, while chromedriver should be wrapped in Chrome's own service.
    from selenium.webdriver.chrome.service import Service as ChromeService

    driver_path = r'D:\cmd100\chromedriver.exe'
    chrome_bin = r'D:\Google\Chrome\Application\chrome.exe'

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.binary_location = chrome_bin

    # Selenium 4 expects the options object under ``options=``; the legacy
    # ``chrome_options=`` keyword is not accepted by current releases.
    return webdriver.Chrome(
        service=ChromeService(driver_path),
        options=chrome_options,
    )
def login(username=None, password=None):
    """Sign in on the Dow Jones SSO page with Chrome and collect the cookies.

    The flow is: open the sign-in URL, type the user name, press the button
    that leads to the password step, type the password, submit, then read
    the browser cookies. Fixed sleeps are used between the steps.

    Args:
        username: Account e-mail. ``None`` keeps the account that was
            previously hard-coded in this function.
        password: Account password. ``None`` keeps the previous value.

    Returns:
        A tuple ``(cookies, driver)`` where ``cookies`` is whatever
        ``driver.get_cookies()`` returns and ``driver`` is the still-open
        browser, which the caller must eventually ``quit()``.

    Raises:
        Exception: any error raised by Selenium during the flow is
            re-raised after the browser has been closed.
    """
    # NOTE(review): real credentials are committed here as defaults; they
    # should be rotated and supplied by the caller instead.
    un = 'zhk2058@163.com' if username is None else username
    pw = 'ZZM205899' if password is None else password

    signin_url = (
        "https://sso.accounts.dowjones.com/login-page"
        "?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO"
        "&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth"
        "&response_type=code"
        "&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name"
        "%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at"
        "%20created_at%20offline_access"
        "&ui_locales=zh-tw-x-cwsj-27-2"
        "&nonce=beaaad3a-6919-4893-8198-c3769d6d54af"
        "&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU"
        "&resource=https%253A%252F%252Fcn.wsj.com%252F"
        "&protocol=oauth2"
        "&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO"
        "#!/signin"
    )

    driver = create_google()
    try:
        driver.get(signin_url)
        time.sleep(5)

        # Step 1: user name, then the button that moves on to the password form.
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        driver.find_element(
            By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]'
        ).click()
        time.sleep(3)

        # Step 2: password, then submit.
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(
            By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button'
        ).click()
        time.sleep(3)

        cookies = driver.get_cookies()
    except Exception:
        # Do not leave an orphaned browser behind when a step fails.
        driver.quit()
        raise
    return cookies, driver
def parser_content(href, driver, max_refreshes=1):
    """Open an article page and extract its ``div.article-content`` element.

    The page is loaded once. If the article container is missing, the page
    is refreshed and parsed again, up to ``max_refreshes`` times. The
    previous version refreshed the page but returned without looking at the
    refreshed content, so the refresh could never help.

    Args:
        href: URL of the article to open.
        driver: Selenium WebDriver used to load the page.
        max_refreshes: How many refresh-and-retry rounds to allow when the
            container is not found. Negative values are treated as 0.

    Returns:
        ``(text, element)`` where ``element`` is the BeautifulSoup tag of the
        article container and ``text`` its ``.text``; ``(None, None)`` when
        the container is still missing after all retries.
    """
    driver.get(href)
    time.sleep(2)

    for attempt in range(max(0, max_refreshes) + 1):
        if attempt:
            # Previous parse found nothing: reload and give the page time to render.
            driver.refresh()
            time.sleep(3)
        news_soup = BeautifulSoup(driver.page_source, 'html.parser')
        news_content = news_soup.find('div', class_='article-content')
        if news_content is not None:
            return news_content.text, news_content

    # The logged message means "account blocked".
    log.info('封号')
    return None, None
def getData(key):
    """Send every not-yet-forwarded news record found in Redis to Kafka.

    Each Redis hash whose key starts with ``key`` is decoded and checked
    against the set ``IN-20240403-0041`` in the second Redis handle. Records
    whose URL is not in that set are serialised as JSON and published to the
    ``crawlerInfo`` topic; after a confirmed send the URL is added to the set.

    Relies on the module-level names ``r`` (scanned for records), ``r_2``
    (dedup set) and ``log``.

    Changes compared with the previous version:
      * one Kafka producer is created lazily and reused for the whole run,
        then closed, instead of one unclosed producer per record;
      * a Redis failure during the dedup check is logged, not silently dropped;
      * a record missing a required field is skipped with a log line instead
        of aborting the run with ``KeyError``.

    Args:
        key: Prefix of the Redis keys to scan (``f"{key}*"``).

    Returns:
        Always ``True``.
    """
    sid = '1775455062911447042'
    info_code = "IN-20240403-0041"
    required_fields = ('newsUrl', 'publishDate', 'title', 'summary')

    producer = None
    try:
        for record_key in r.scan_iter(f"{key}*"):
            fields = r.hgetall(record_key)
            decode_fields = {k.decode(): v.decode() for k, v in fields.items()}

            missing = [name for name in required_fields if name not in decode_fields]
            if missing:
                log.info(f'skip {record_key!r}: missing fields {missing}')
                continue

            newsUrl = decode_fields['newsUrl']

            # Skip URLs that were already forwarded.
            try:
                already_sent = r_2.sismember(info_code, newsUrl)
            except Exception as e:
                log.info(e)
                continue
            if already_sent:
                log.info('信息已采集入库过')
                continue

            dic_news = {
                'content': '',
                'contentWithTag': '',
                'id': '',
                'summary': decode_fields['summary'],
                'origin': '华尔街日报中文网-科技',
                'publishDate': decode_fields['publishDate'],
                'sid': sid,
                'sourceAddress': newsUrl,
                'title': decode_fields['title'],
                'source': '16',
                'type': '',
            }

            try:
                # Created inside the try so a broker outage is handled per
                # record, as before, rather than aborting the whole scan.
                if producer is None:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send(
                    "crawlerInfo",
                    json.dumps(dic_news, ensure_ascii=False).encode('utf8'),
                )
                # Block until the broker acknowledges (or 10 s pass).
                log.info(kafka_result.get(timeout=10))
                dic_result = {
                    'success': 'ture',  # (sic) kept as emitted by earlier runs
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                # Mark as forwarded only after the send was confirmed.
                r_2.sadd(info_code, newsUrl)
            except Exception as e:
                log.info(e)
                log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
    finally:
        if producer is not None:
            producer.close()

    return True
if __name__ == '__main__':
    # Both handles point at the same Redis server and differ only in the
    # database index: getData() scans ``r`` for records and uses ``r_2`` for
    # the set of already-forwarded URLs.
    redis_conn = dict(host='114.116.90.53', port=6380, password='clbzzsn')
    r = redis.Redis(db=6, **redis_conn)
    r_2 = redis.Redis(db=5, **redis_conn)

    key = 'WSJ:NewsInfo'

    # A disabled experiment used to sit here: it loaded wsj_cookie.txt, added
    # the cookies to an Edge driver from create_driver() and opened a single
    # article URL directly. It is not part of the running flow.

    # Forward pending records, then wait one hour before the next pass.
    poll_interval = 60 * 60 * 1
    while True:
        getData(key)
        time.sleep(poll_interval)
comData/dingzhi/wsj_getcookies.py
0 → 100644
浏览文件 @
f434a907
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.edge.options
import
Options
from
selenium.webdriver.edge.service
import
Service
import
time
,
json
def create_driver():
    """Start an Edge WebDriver that sends its traffic through the local proxy.

    The msedgedriver executable is taken from a fixed Windows path and the
    browser is pointed at the HTTP proxy listening on 127.0.0.1:1080.

    Returns:
        The ``webdriver.Edge`` instance; the caller owns it.
    """
    proxy_by_scheme = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }

    options = Options()
    # Remove the 'enable-automation' switch and turn off the
    # AutomationControlled Blink feature -- presumably so the session looks
    # less like an automated browser (intent not stated in the code).
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_argument('--disable-blink-features=AutomationControlled')
    # Only the plain-HTTP proxy entry is handed to the browser.
    options.add_argument('--proxy-server=%s' % proxy_by_scheme['http'])

    return webdriver.Edge(
        service=Service(r'D:\soft\msedgedriver.exe'),
        options=options,
    )
def login(username=None, password=None):
    """Sign in on the Dow Jones SSO page with Edge and collect the cookies.

    The flow is: open the sign-in URL, type the user name, press the button
    that leads to the password step, type the password, submit, then read
    the browser cookies. Fixed sleeps are used between the steps.

    Args:
        username: Account e-mail. ``None`` keeps the account that was
            previously hard-coded in this function.
        password: Account password. ``None`` keeps the previous value.

    Returns:
        A tuple ``(cookie, driver)`` where ``cookie`` is whatever
        ``driver.get_cookies()`` returns and ``driver`` is the still-open
        browser, which the caller must eventually ``quit()``.

    Raises:
        Exception: any error raised by Selenium during the flow is
            re-raised after the browser has been closed.
    """
    # NOTE(review): real credentials are committed here as defaults; they
    # should be rotated and supplied by the caller instead.
    un = 'zhk2058@163.com' if username is None else username
    pw = 'ZZM205899' if password is None else password

    signin_url = (
        "https://sso.accounts.dowjones.com/login-page"
        "?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO"
        "&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth"
        "&response_type=code"
        "&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name"
        "%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at"
        "%20created_at%20offline_access"
        "&ui_locales=zh-tw-x-cwsj-27-2"
        "&nonce=beaaad3a-6919-4893-8198-c3769d6d54af"
        "&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU"
        "&resource=https%253A%252F%252Fcn.wsj.com%252F"
        "&protocol=oauth2"
        "&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO"
        "#!/signin"
    )

    driver = create_driver()
    try:
        driver.get(signin_url)
        time.sleep(5)

        # Step 1: user name, then the button that moves on to the password form.
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        driver.find_element(
            By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]'
        ).click()
        time.sleep(3)

        # Step 2: password, then submit.
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(
            By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button'
        ).click()
        time.sleep(3)

        cookie = driver.get_cookies()
    except Exception:
        # Do not leave an orphaned browser behind when a step fails.
        driver.quit()
        raise
    return cookie, driver
if __name__ == '__main__':
    # Log in, flatten the cookie list into a name -> value mapping and store
    # it as JSON in wsj_cookie.txt (relative to the working directory).
    cookie, driver = login()
    try:
        cookies = {item['name']: item['value'] for item in cookie}
        with open("wsj_cookie.txt", "w") as f:
            f.write(json.dumps(cookies))
    finally:
        # The browser is only needed to obtain the cookies; close it even if
        # writing the file fails so no driver process is left running.
        driver.quit()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论