Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
f434a907
提交
f434a907
authored
4月 07, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
华尔街采集
上级
a16f8aa1
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
388 行增加
和
15 行删除
+388
-15
wsj-TECH.py
comData/dingzhi/wsj-TECH.py
+152
-15
wsj_cookie.txt
comData/dingzhi/wsj_cookie.txt
+2
-0
wsj_detail.py
comData/dingzhi/wsj_detail.py
+180
-0
wsj_getcookies.py
comData/dingzhi/wsj_getcookies.py
+54
-0
没有找到文件。
comData/dingzhi/wsj-TECH.py
浏览文件 @
f434a907
import
requests
import
requests
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
import
re
import
re
import
json
import
redis
import
time
,
datetime
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.edge.options
import
Options
from
selenium.webdriver.edge.service
import
Service
from
apscheduler.schedulers.blocking
import
BlockingScheduler
def
create_driver
():
ip
=
{
'https'
:
'https://127.0.0.1:1080'
,
'http'
:
'http://127.0.0.1:1080'
if
__name__
==
'__main__'
:
url
=
'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
headers
=
{
'Cookie'
:
'gdprApplies=false; ccpaApplies=false; vcdpaApplies=false; regulationApplies=gdpr
%3
Afalse
%2
Ccpra
%3
Afalse
%2
Cvcdpa
%3
Afalse; _pcid=
%7
B
%22
browserId
%22%3
A
%22
ltzfvavl4ju9vgpi
%22%7
D; cX_P=ltzfvavl4ju9vgpi; dnsDisplayed=undefined; signedLspa=undefined; _sp_su=false; cX_G=cx
%3
Allui1w2zab163r7fbco37esw7
%3
A317ttgvfg79lq; AMCVS_CB68E4BA55144CAA0A4C98A5
%40
AdobeOrg=1; ajs_anonymous_id=a1fa0ab7-91e0-41f5-8659-f77686a9adc3; _gcl_au=1.1.1271150883.1710917108; s_cc=true; _pin_unauth=dWlkPU5qRTNNV0V3WlRndFpqSXlNaTAwWVdSa0xXSTVaR1F0TVdVMU56TTRPR001WTJReQ; _ncg_id_=41c19b00-1a9e-4b2d-90df-7b8344634212; _fbp=fb.1.1710917107810.1699847377; _dj_sp_id=09dfe400-9303-4f0d-ab44-e30daad2eaea; _ncg_domain_id_=41c19b00-1a9e-4b2d-90df-7b8344634212.1.1710917109623.1773989109623; _scid=6ee68aeb-d484-4800-a696-c0adb7b914a4; _ncg_g_id_=b7310b5e-1113-4f94-8ca3-7ea5b3c2ef71.3.1710917112.1773989109623; DJSESSION=country
%3
Dhk
%7
C
%7
Ccontinent
%3
Das
%7
C
%7
Cregion
%3
D; wsjregion=asia
%2
Ccn; AMCV_CB68E4BA55144CAA0A4C98A5
%40
AdobeOrg=1585540135
%7
CMCIDTS
%7
C19803
%7
CMCMID
%7
C26000677277848255171457287474499803357
%7
CMCAAMLH-1711618354
%7
C6
%7
CMCAAMB-1711618354
%7
CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y
%7
CMCOPTOUT-1711020754s
%7
CNONE
%7
CMCAID
%7
CNONE
%7
CvVersion
%7
C4.4.0; _pin_unauth=dWlkPU5ERXhNbVppWVRZdE5UTTJZaTAwTmpRMUxUazFabVl0WXpRek9XTmlZamd5TW1VMg; _fbp=fb.1.1710917107810.1699847377; _meta_facebookTag_sync=1711013559439; djcs_route=f2994554-703d-42af-a2b8-785c61193619; ca_rt=B2-Zo67DYbWomddfWX4igw.4qexK32nu-8FzO0Q-M1MtldjOaZYCKtr4ejWSn8EuXlvOgHhaJ_VHNVNCAGwqx91Dosk3YRHhQoX1YuERztjFJN5nNTJ-IkBNAHwCIdMoY8; ca_id=eJxdkMtOg0AUht9l1tDOlQFW0oAGU1tTqy6MIcNc7FhoG5iK1fjuTi8L4-6c7__P9Ru82Q-9qTai1SAFregOIABGtLY5_IO6Fbbx6ddqjSGLr1BERnLbemW_t8oLWmOecGZCbqAKKY5pWGOuQmkiiRHlJFaxd7tOyPWpINKslowLDImqIRQkqTnmtUEwllhjxmspDVcKE8mQhEwow_wIQ4mmdZJEvlm3bXQP0hdwvSiKRXETlrO8fCrzx2zq1eeH27C8z_JzeAHZLF_My7xaZpNpsbzAu_mknBbgNQBi71aVs8fLEUcIIsKSJACy08JpVQnnOYswxQnHKAD2BP4Y9efuDChlJ2B7vyBYObfr0_F4GIbR0L8fPzeWjdUbB35-AYojbss.r5GaoioJDee6VernrH2oRDgGbTxrPefp2P9CPAXjncI5Z1XbCQnkbCsJOXqTXeC92ryLYDm1dAl-B14KYwL1eAi6mBF88dkhze2ISucUCtHwe9B54d-hTMROM2GR9ifS6QV279pzV3mTHZF_7ziLtbTPiL-PFpzsbxQotpLpKwzFnnxrbF5e-5jfpWAhTg-eiPo2yBowpU-wg2echaNlmlrHxaE7j2V5rygnhAyuxGA2PJ4cjOYcG3uY39dAk4NqeMfVgPUsRLBNqxLEd6I6Y1bE2nRk88A0rLa1vtLv-ZB-4gskyyCnab1PRWN8SwemTuskXnwMhmY1-dBEXc8uNV-lRoCPOMKIS-PE4DX5JJ7CDmZdl1kUzW0FaLNRMuIrTvlz6wnS_6nXkgKSFS6fFWLkBVwUOddKcyqtOjgUHJgLmnmLTJtvoqMD-83k_AAFUy6RrFatThHRHC45yjsb2vWrCMoKnSQwmU7HHU_2zyO9sQQaXYrP2qFbLi9oa3RFYw38jfykO27ZqxAabgMktsSafI6giPW_iQGfurmY-SJUDxqf8tSmwDwhUQobochTrnmHE_sX7Tonf-0YEQUIThHSe5skrnA6RwgXMj3qnHV8urQRn8WM9DmiIw7R1zX5VjvHgIlW9bHykWLpfnlzr2KsS77ZEnPUTA9jKWxdSJs; TR=V2-6e5bc57a203db00a39b727bf108c2e257bccf7dd23c51c05adf5279f43e4b996; ab_uuid=5be368fe-5b01-4451-8d6c-54c8bd163f1b; usr_prof_v2=eyJwIjp7InBzIjowLjgsInEiOjAuODV9LCJjcCI6eyJlYyI6IlN0YWJsZSIsInBjIjowLjAxMjAxLCJwc3IiOjAuNDQ3NSwidGQiOjE3MTksImFkIjoyOCwicWMiOjIxLCJxbyI6MjMsInNjZW4iOnsiY2hlIjowLjAzMjU2LCJjaG4iOjAuMDIxNjgsImNoYSI6MC4wMTIwMSwiY2hwIjowLjAxMzQ1fX0sIm9wIjp7ImkiOiI2MTdiY2UwMCIsImoiOnsianQiOiJlbGdtIn19LCJpYyI6Nn0
%3
D; utag_main=v_id:018e5a9b3c51001f7967cb1f91690506f0014067007e8$_sn:2$_se:2$_ss:0$_st:1711015401906$vapi_domain:wsj.com$ses_id:1711013553852
%3
Bexp-session$_pn:2
%3
Bexp-session$_prevpage:CWSJ_Home_Tech
%3
Bexp-1711017201915; _pctx=
%7
Bu
%7
DN4IgrgzgpgThIC5QDYoFYBGBjNB2AhgEwAMAzACYbHH6kCcGuhuGAZgIzEAcWhUhebFla5y5QqRzssxNPnKs0zOqwAspKKox06yYAHcIAKwC
%2
BiUAAcYUVgEsAHohCGjIADQgALgE8LUJwDCABogJiYekLAAyp74npBO
%2
BAB2APZJ7iAQtp5QAJLkTnSExaSkaEpcaMi4yGjsdKSqoUA; _dj_id.9183=.1710917108.2.1711013602.1710917108.b8474262-5ed9-4de4-a0d7-deb5b8e74386.318bf4f0-fd38-4ff3-a153-dfb8e83f4842.9b7abe89-6983-437a-933e-f77e03dd4f49.1711013555174.2; _scid_r=6ee68aeb-d484-4800-a696-c0adb7b914a4; _ncg_sp_id.5378=41c19b00-1a9e-4b2d-90df-7b8344634212.1710917110.2.1711013602.1710917111.2fa95b23-4e7d-489c-b7d3-81086312ca4f; _uetsid=f3603ad0e76511eeb0a7e164836efe1a; _uetvid=64aa3700e68511eebe452957ac3861c3; datadome=xDtRNqFhkjvX5OHJjDUvZJnRfHeCdi_ysN9qG8GC4Os1S2IsutTgJXKYGM3aPEkdEkc7W~4nJuiN1y8XAP8fN81P2lfJ8BGS~JBFgavd0psSTris5e~an90PcbNgL54q; ResponsiveConditional_initialBreakpoint=lg; __gads=ID=8d962eb26c834930:T=1710917105:RT=1711015497:S=ALNI_Maw2YIR-9L0CKOx0yoGb_jgM0pcGA; __gpi=UID=00000d49491e3c9b:T=1710917105:RT=1711015497:S=ALNI_MYBakK-TvogAZ1BbqEvyt3N3t4bMg; __eoi=ID=e1c0d0848d87c017:T=1710917105:RT=1711015497:S=AA-Afjb1xilZZE2hfIUmTyUlS4bt; s_tp=3129; s_ppv=CWSJ_Home_Tech
%2
C29
%2
C29
%2
C919'
,
'Accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
}
}
ip
=
{
'http'
:
'http://127.0.0.1:1080'
,
'https'
:
'http://127.0.0.1:1080'
}
edge_service
=
Service
(
r'D:\soft\msedgedriver.exe'
)
req
=
requests
.
get
(
url
,
headers
,
proxies
=
ip
)
edge_options
=
Options
()
soup
=
BeautifulSoup
(
req
.
content
,
'html.parser'
)
# 开启开发者模式
# print(soup)
edge_options
.
add_experimental_option
(
'excludeSwitches'
,
[
'enable-automation'
])
scrip
=
soup
.
find
(
'body'
)
.
find
(
'script'
)
# 禁用启用Blink运行时的功能
edge_options
.
add_argument
(
'--disable-blink-features=AutomationControlled'
)
edge_options
.
add_argument
(
'--proxy-server=
%
s'
%
ip
[
'http'
])
# prefs = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
# edge_options.add_experimental_option("prefs", prefs)
driver
=
webdriver
.
Edge
(
service
=
edge_service
,
options
=
edge_options
)
return
driver
def get_pagesource():
    """Open the WSJ Chinese technology listing and wait for its state script.

    A new browser is created with ``create_driver`` and the listing URL is
    loaded. The page source is then polled every 3 seconds until the first
    ``<script>`` inside ``<body>`` contains a ``__STATE__ = ...;`` assignment.

    Returns:
        A ``(soup, driver)`` tuple. ``soup`` is the parsed page, or ``None``
        when the rendered page has no text at all. ``driver`` is the WebDriver
        created here; it is returned still open in both cases.
    """
    driver = create_driver()
    # No login is performed here; the listing page is requested directly.
    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
    driver.get(url)
    time.sleep(3)
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        try:
            script_text = soup.find('body').find('script').text
            # Readiness probe only: indexing raises IndexError until the
            # state assignment is present in the script text.
            re.findall('__STATE__ =(.*);', script_text)[0]
            break
        except (AttributeError, IndexError):
            # AttributeError: <body> or <script> not found yet.
            # Narrow handler so Ctrl-C / SystemExit can still stop the loop.
            if soup.text == '':
                return None, driver
            time.sleep(3)
    return soup, driver
def get_newshref(key):
    """Extract article metadata from the listing page's embedded state JSON.

    Args:
        key: Redis key prefix supplied by the caller. It is not used by the
            active code in this function.

    Returns:
        A ``(news_list, driver)`` tuple. ``news_list`` is a list of dicts with
        the keys ``title``, ``summary``, ``publishDate`` and ``newsUrl``; it is
        empty when the page could not be loaded. ``driver`` is the WebDriver
        returned by ``get_pagesource`` and is still open.
    """
    soup, driver = get_pagesource()
    if soup is None:
        # Keep the two-element shape so callers can always unpack the result.
        return [], driver

    script_text = soup.find('body').find('script').text
    state_text = re.findall('__STATE__ =(.*);', script_text)[0].strip()

    # Debug output: print the first `{"data": {...}}` fragment, if any.
    data_pattern = re.compile(r'\{\"data\": \{.*?\}\}')
    match = data_pattern.search(state_text)
    reqJson = json.loads(state_text)
    if match:
        print(match.group(0))

    # Only state entries whose name starts with "article" are news items.
    article_pattern = re.compile('article')
    keys = [name for name in reqJson['data'] if article_pattern.match(name)]
    print(keys)

    news_list = []
    for key_ in keys:
        try:
            record = reqJson['data'][key_]['data']['data']
            title = record['headline']
            summary = record['summary']
            seoId = record['seoId']
            # The stored value is divided by 1000 before being used as a
            # Unix timestamp in seconds.
            timestamp = int(record['timestamp']) / 1000
        except (KeyError, TypeError, ValueError):
            # Entry does not have the expected article fields; skip it.
            continue
        newsUrl = 'https://cn.wsj.com/articles/' + seoId
        print(newsUrl)
        publishDate = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
        news_list.append({
            'title': title,
            'summary': summary,
            'publishDate': publishDate,
            'newsUrl': newsUrl,
        })
    return news_list, driver
def caiji():
    """Collect the current article list and store each entry as a Redis hash.

    Every item returned by ``get_newshref`` is written to the hash
    ``WSJ:NewsInfo:<index>`` through a single pipeline. The browser opened
    while collecting is closed before returning.
    """
    # NOTE(review): connection details are hard-coded; consider moving them
    # to configuration.
    redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
    key = 'WSJ:NewsInfo'

    result = get_newshref(key)
    if result is None:
        # Nothing was collected and no driver handle is available.
        return
    news_list, driver = result

    try:
        # Queue all writes and send them in one round trip.
        pipeline = redis_client.pipeline()
        for idx, info in enumerate(news_list or []):
            hash_key = f'{key}:{idx}'
            pipeline.hset(hash_key, mapping=info)
        pipeline.execute()
    finally:
        # Without this each scheduled run leaves a browser process behind.
        if driver is not None:
            driver.quit()
#华尔街列表定时任务
def wsj_list_task():
    """Schedule ``caiji`` as a cron job (hour=9, minute=0) and start the scheduler.

    ``BlockingScheduler.start`` is called directly, so this function does not
    return while the scheduler is running. Any ``Exception`` raised from it is
    printed rather than propagated.
    """
    cron_settings = {'hour': 9, 'minute': 0, 'max_instances': 1}
    scheduler = BlockingScheduler()
    scheduler.add_job(caiji, 'cron', **cron_settings)
    try:
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)
if
__name__
==
'__main__'
:
wsj_list_task
()
comData/dingzhi/wsj_cookie.txt
0 → 100644
浏览文件 @
f434a907
{"s_tp": "4333", "_ncg_domain_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1.1711521614929.1774593614929", "_ncg_sp_id.5378": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1711521615.1.1711521616.1711521615.89221ca3-0ff7-447c-bd6b-88bc0d7b9ca1", "__eoi": "ID=74957e9c589e06c8:T=1711521614:RT=1711521614:S=AA-AfjbpSmTSwP45PV1P_Sfr-2eA", "_ncg_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984", "__gpi": "UID=00000d6a82c6423a:T=1711521614:RT=1711521614:S=ALNI_Mamy4ax3m1xyG-SWHlZCu8YbWNV7w", "__gads": "ID=4e5002beb21f7800:T=1711521614:RT=1711521614:S=ALNI_MaWml862ei0DYTIgH5jH8-qiduUyA", "dicbo_id": "%7B%22dicbo_fetch%22%3A1711521614435%7D", "s_cc": "true", "_ncg_sp_ses.5378": "*", "_dj_sp_id": "f24fbbcc-3872-4f04-b628-048e0da2d503", "_uetvid": "ddc10a40ec0411ee968a03a12159858e", "s_ppv": "CWSJ_Home_Home%2520Page%2C13%2C13%2C570", "_uetsid": "ddc0cf80ec0411eea60095acfb805f07", "_gcl_au": "1.1.1409873185.1711521614", "_pin_unauth": "dWlkPU4yUTFOVEF6TXpZdFlqYzRaaTAwWXpGa0xXRTVNell0TjJJd05qaGtPVFUzTkdNMw", "AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1", "_rdt_uuid": "1711521612889.741c0512-609f-4857-9453-d2627aab72f2", "cX_P": "lu9fryu88xq1rnhv", "_meta_facebookTag_sync": "1711521612864", "_dj_id.9183": ".1711521612.1.1711521612..42d375e3-8210-4847-b37b-e2413e02fe5a..1bafccfe-479d-43d7-ae40-2c0b27f1273b.1711521612160.1", "usr_prof_v2": "eyJwIjp7InBzIjowLjg4LCJxIjowLjg2fSwiY3AiOnsiZWMiOiJTdGFibGUiLCJwYyI6MC4wMTQzMywicHNyIjowLjMyNTEsInRkIjoxNzI1LCJhZCI6MjgsInFjIjozMCwicW8iOjI3LCJzY2VuIjp7ImNoZSI6MC4wMzA0MywiY2huIjowLjAzMDgyLCJjaGEiOjAuMDE0MzMsImNocCI6MC4wMTczfX0sIm9wIjp7ImkiOiI2MTdiY2UwMCIsImoiOnsianQiOiJlbGdtIn19LCJpYyI6M30%3D", "_scid": "22e08d14-b460-4edb-8046-8b897104f696", "ResponsiveConditional_initialBreakpoint": "md", "vcdpaApplies": "false", "_pctx": "%7Bu%7DN4IgrgzgpgThIC5QDYoFYBGBjNB2AhgEwAMAzACYbHH6kCcGuhuGAZgIzEAcWhUhebFla5y5QqRzssxNPnKs0zOqwAspKKox06yYAHcIAKwC%2BiUAAcYUVgEsAHohCGjIADQgALgE8LUJwDCABogJiYekLAAyp74npBO%2BAB2APZJ7iAQtp5QAJLkTnSExaSkaEpcaMi4yGjsdKSqoUA", "cX_G": 
"cx%3Aar1n90irbdrh1nz3umsn41upp%3A12bdufvkc9frm", "ab_uuid": "5be368fe-5b01-4451-8d6c-54c8bd163f1b", "AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1585540135%7CMCIDTS%7C19810%7CMCMID%7C48333472371426108300129167066567447308%7CMCAID%7CNONE%7CMCOPTOUT-1711528813s%7CNONE%7CMCAAMLH-1712126413%7C6%7CMCAAMB-1712126413%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CMCSYNCSOP%7C411-19817%7CvVersion%7C4.4.0", "datadome": "i1JSVSww9u2lCnQC7CmZsfQQNOATMw1rMpQl2syLPun2T1iS1iZ9PP~uUddO0Tp~ABwhQ~K~x~CIeUhxxhz18uVd1dpdc5wK8gDSrOlrovmG8ozGm4KjIXF~mas7NsGe", "gdprApplies": "false", "ccpaApplies": "false", "_fbp": "fb.1.1711521612864.2074745750", "ajs_anonymous_id": "86ceb415-2de7-4a04-bbf7-b3b952852670", "utag_main": "v_id:018e7ea33c2c004ea140fb3e2df00507d0013075007e8$_sn:1$_se:1$_ss:1$_st:1711523410801$ses_id:1711521610801%3Bexp-session$_pn:1%3Bexp-session$_prevpage:CWSJ_Home_Home%20Page%3Bexp-1711525210821$vapi_domain:wsj.com", "regulationApplies": "gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse", "ca_rt": "MI50RGh7xOF_Ha5M-pfrXQ.IWGd-OI_q1g7AlutTc36h9FX7n1uQrebrg16x7dunYcPqFafWtPhVQ0ZQqvmKc6OzBixBYWSZvxCXF_gye9vCE0vT01vzuzUI8a-JK4X4LE", "_dj_ses.9183": "*", "DJSESSION": "country%3Dsg%7C%7Ccontinent%3Das%7C%7Cregion%3D", "ca_id": 
"eJxdkMtSgzAUht8la2hzIYSwkg7o4NTWqVUXjsOEXGwstB1Ixer47qYXN-7O-f7_XL_Bm_3Qm2ojWg1S0IruAAJgRGubwz-oW2Ebn36t1hjS5ArFZCS3rVf2e6u8oDVmnFETMgNVGOEkCmvMVChNLDGKGElU4t2uE3J9Kog1rSVlAkOiaggF4TXDrDYIJhJrTFktpWFKYSIpkpAKZagfYSKio5rz2Dfrto3uQfoCrhdFsShuwnKWl09l_phNvfr8cBuW91l-Di8gm-WLeZlXy2wyLZYXeDeflNMCvAZA7N2qcvZ4OWIIUYwoTwIgOy2cVpVwntMYR5gzjAJgT-DPyP2bPndnwCk5Adv7BcHKuV2fjsfDMIyG_v34ubFsrN448PMLjPFu1g.qGey7P4In7Rq_zwb3rdDqeMgXg4ctbinCQ8wcWje7hmNwg48tJ5nKGnGeVLUEIynG34nBOAARDWFeZPNyFFPQr5JS-xjPAnfuuCYxSkS8Z6C3FVwBZ0D4rkSU11Ts67PVCwzfI3f4qxYy9M8JB4WnM6PQYU0wJ_WCZAzb2pDxEASxzKfzzm_M5FdybgMHkY-4WcbF-Zp0V8RHWG7eH9OvmIYZZNq621vKCZXg6hnJmeh6FZdVxiSXPrOG4K1zGgtb-wHrKJFVu5VmGXY_ygilEFec8v27wiASm9IiMmqZ-wQ_ej0u9OS2YMIA5Fzn7kqx1mgYOWCL3eLVYR1R01oigJx4q2GdyrQDVyp0X_8Z7aaGSj4UgYR8Q2mxfrB7AWeWqeKbec8RdJPDh9kjKvcs_KVPtfNVyzHKH-f9fv7hnh2Rmot44XSUPb4WqNUJx8N1FNXV8mjOLP37oVCZZaeVW2NoNNeNbN75WYS4kSjBhuUtBN8iumYaxR5xmCk41UYbKldPN4qyFE0J1hBt64mXbhGD0clVCFdBMWpw29ZIMdkXwpqX3Ig0FVGku6QO4pXlOWeqVPb2Tc78yGqxmg-Nqc90C2-nANOTdT1AZt27-FuB6KAiadLNFlcVcYNYi8vD6ylHGZiHB8DemoFPLoKN1WH5ggodOBRyvMFO2Asry8", "_scid_r": "22e08d14-b460-4edb-8046-8b897104f696", "_ncg_g_id_": "cbeb2daa-da53-494b-a606-90a386ee0b55.3.1711521616.1774593614929", "TR": "V2-6e5bc57a203db00a39b727bf108c2e257bccf7dd23c51c05adf5279f43e4b996", "_sctr": "1%7C1711468800000", "_pcid": "%7B%22browserId%22%3A%22lu9fryu88xq1rnhv%22%7D", "wsjregion": "asia%2Ccn"}
\ No newline at end of file
comData/dingzhi/wsj_detail.py
0 → 100644
浏览文件 @
f434a907
from
bs4
import
BeautifulSoup
import
requests
,
time
,
json
import
redis
,
random
from
kafka
import
KafkaProducer
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.edge.options
import
Options
from
selenium.webdriver.edge.service
import
Service
from
apscheduler.schedulers.blocking
import
BlockingScheduler
import
sys
sys
.
path
.
append
(
"../../base"
)
from
base
import
BaseCore
log
=
BaseCore
.
BaseCore
()
.
getLogger
()
def create_driver():
    """Create an Edge WebDriver that uses the local proxy on port 1080.

    Returns:
        The ``webdriver.Edge`` instance built from the configured service and
        options.
    """
    ip = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }
    proxy_argument = '--proxy-server=%s' % ip['http']

    edge_options = Options()
    # Drop the "enable-automation" switch from the browser's default switches.
    edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Disable the Blink "AutomationControlled" feature, then set the proxy.
    for argument in ('--disable-blink-features=AutomationControlled', proxy_argument):
        edge_options.add_argument(argument)

    edge_service = Service(r'D:\soft\msedgedriver.exe')
    return webdriver.Edge(service=edge_service, options=edge_options)
def create_google():
    """Create a Chrome WebDriver using the locally installed driver and browser.

    Returns:
        The ``webdriver.Chrome`` instance built from the configured service
        and options.
    """
    driver_path = r'D:\cmd100\chromedriver.exe'
    chrome_bin = r'D:\Google\Chrome\Application\chrome.exe'

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-gpu')
    # Drop the "enable-automation" switch from the browser's default switches.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.binary_location = chrome_bin

    # `options=` is the keyword that pairs with `service=`; the older
    # `chrome_options=` keyword is no longer accepted by current Selenium.
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)
def login():
    """Sign in to the Dow Jones SSO page in Chrome and return the session cookies.

    The account can be overridden with the ``WSJ_USERNAME`` and
    ``WSJ_PASSWORD`` environment variables; the previous built-in values are
    used when they are not set.

    Returns:
        A ``(cookies, driver)`` tuple: the list from ``driver.get_cookies()``
        and the still-open WebDriver.

    Raises:
        Any exception raised by a browser step is re-raised after the browser
        has been closed.
    """
    import os

    # NOTE(review): the fallback credentials are still in source control;
    # rotate them and drop the defaults once the environment is configured.
    un = os.environ.get('WSJ_USERNAME', 'zhk2058@163.com')
    pw = os.environ.get('WSJ_PASSWORD', 'ZZM205899')
    login_url = "https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=zh-tw-x-cwsj-27-2&nonce=beaaad3a-6919-4893-8198-c3769d6d54af&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU&resource=https%253A%252F%252Fcn.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin"

    driver = create_google()
    try:
        driver.get(login_url)
        time.sleep(5)
        # Step 1: enter the user name and continue to the password form.
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        driver.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]').click()
        time.sleep(3)
        # Step 2: enter the password and submit.
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
        time.sleep(3)
        cookies = driver.get_cookies()
    except Exception:
        # Do not leave a browser process behind when a step fails.
        driver.quit()
        raise
    return cookies, driver
def parser_content(href, driver):
    """Load an article page and return its body text and element.

    Args:
        href: URL of the article to open.
        driver: WebDriver used to load the page.

    Returns:
        ``(content, news_content)`` where ``news_content`` is the
        ``div.article-content`` element and ``content`` is its text, or
        ``(None, None)`` when the element is missing even after one refresh.
    """
    def _find_article_body():
        # Parse whatever the browser currently shows and locate the body div.
        page = BeautifulSoup(driver.page_source, 'html.parser')
        return page.find('div', class_='article-content')

    driver.get(href)
    time.sleep(2)
    news_content = _find_article_body()
    if news_content is None:
        # Previously the page was refreshed but never looked at again, so the
        # refresh could not help; re-check once after it.
        driver.refresh()
        time.sleep(3)
        news_content = _find_article_body()
    if news_content is None:
        log.info('封号')
        return None, None
    return news_content.text, news_content
def getData(key):
    """Send every queued news hash that has not been sent yet to Kafka.

    Redis keys matching ``<key>*`` are scanned on the module-level client
    ``r``. Entries whose ``newsUrl`` is already a member of the
    ``IN-20240403-0041`` set on ``r_2`` are skipped. The rest are published
    to the ``crawlerInfo`` topic and their URL is added to that set.

    Args:
        key: Prefix of the Redis hash keys to process.

    Returns:
        ``True`` once all matching keys have been visited.
    """
    # NOTE(review): relies on the globals `r`, `r_2` and `log`; `r` and `r_2`
    # are only created in the `__main__` section of this file.
    sid = '1775455062911447042'
    info_code = "IN-20240403-0041"
    producer = None
    try:
        for redis_key in r.scan_iter(f"{key}*"):
            fields = r.hgetall(redis_key)
            decode_fields = {k.decode(): v.decode() for k, v in fields.items()}
            newsUrl = decode_fields['newsUrl']

            # Skip items that were already delivered.
            try:
                if r_2.sismember(info_code, newsUrl):
                    log.info('信息已采集入库过')
                    continue
            except Exception as e:
                # Cannot tell whether it was sent; skip it this round but
                # leave a trace instead of failing silently.
                log.info(e)
                continue

            dic_news = {
                'content': '',
                'contentWithTag': '',
                'id': '',
                'summary': decode_fields['summary'],
                'origin': '华尔街日报中文网-科技',
                'publishDate': decode_fields['publishDate'],
                'sid': sid,
                'sourceAddress': newsUrl,
                'title': decode_fields['title'],
                'source': '16',
                'type': '',
            }

            try:
                # One producer is shared by all items of this run; it is
                # created lazily so a connection failure is handled per item.
                if producer is None:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send(
                    "crawlerInfo",
                    json.dumps(dic_news, ensure_ascii=False).encode('utf8'),
                )
                log.info(kafka_result.get(timeout=10))
                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                # Record the URL only after the send was acknowledged.
                r_2.sadd(info_code, newsUrl)
            except Exception as e:
                log.info(e)
                log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
    finally:
        if producer is not None:
            producer.close()
    return True
if __name__ == '__main__':
    # `getData` reads these module-level names: `r` (db 6) is scanned for
    # queued news hashes, `r_2` (db 5) holds the set of URLs already sent.
    redis_settings = {'host': '114.116.90.53', 'port': 6380, 'password': 'clbzzsn'}
    r = redis.Redis(db=6, **redis_settings)
    r_2 = redis.Redis(db=5, **redis_settings)
    key = 'WSJ:NewsInfo'

    # A cookie-based article fetch (load wsj_cookie.txt, add the cookies to a
    # driver, open the article URL) was sketched here but is disabled.

    pause_seconds = 60 * 60 * 1
    while True:
        getData(key)
        time.sleep(pause_seconds)
comData/dingzhi/wsj_getcookies.py
0 → 100644
浏览文件 @
f434a907
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.edge.options
import
Options
from
selenium.webdriver.edge.service
import
Service
import
time
,
json
def create_driver():
    """Create an Edge WebDriver that uses the local proxy on port 1080.

    Returns:
        The ``webdriver.Edge`` instance built from the configured service and
        options.
    """
    ip = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }
    proxy_argument = '--proxy-server=%s' % ip['http']

    edge_options = Options()
    # Drop the "enable-automation" switch from the browser's default switches.
    edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Disable the Blink "AutomationControlled" feature, then set the proxy.
    for argument in ('--disable-blink-features=AutomationControlled', proxy_argument):
        edge_options.add_argument(argument)

    edge_service = Service(r'D:\soft\msedgedriver.exe')
    return webdriver.Edge(service=edge_service, options=edge_options)
def login():
    """Sign in to the Dow Jones SSO page in Edge and return the session cookies.

    The account can be overridden with the ``WSJ_USERNAME`` and
    ``WSJ_PASSWORD`` environment variables; the previous built-in values are
    used when they are not set.

    Returns:
        A ``(cookie, driver)`` tuple: the list from ``driver.get_cookies()``
        and the still-open WebDriver.

    Raises:
        Any exception raised by a browser step is re-raised after the browser
        has been closed.
    """
    import os

    # NOTE(review): the fallback credentials are still in source control;
    # rotate them and drop the defaults once the environment is configured.
    un = os.environ.get('WSJ_USERNAME', 'zhk2058@163.com')
    pw = os.environ.get('WSJ_PASSWORD', 'ZZM205899')
    login_url = "https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=zh-tw-x-cwsj-27-2&nonce=beaaad3a-6919-4893-8198-c3769d6d54af&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU&resource=https%253A%252F%252Fcn.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin"

    driver = create_driver()
    try:
        driver.get(login_url)
        time.sleep(5)
        # Step 1: enter the user name and continue to the password form.
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        driver.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]').click()
        time.sleep(3)
        # Step 2: enter the password and submit.
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
        time.sleep(3)
        cookie = driver.get_cookies()
    except Exception:
        # Do not leave a browser process behind when a step fails.
        driver.quit()
        raise
    return cookie, driver
if __name__ == '__main__':
    # Log in, flatten the browser cookies to a name -> value mapping and save
    # them as JSON in wsj_cookie.txt.
    cookie, driver = login()
    try:
        cookies = {item['name']: item['value'] for item in cookie}
        with open("wsj_cookie.txt", "w") as f:
            f.write(json.dumps(cookies))
    finally:
        # The browser is only needed to obtain the cookies; close it so the
        # script does not leave a driver process running.
        driver.quit()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论