Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
976e2fb4
提交
976e2fb4
authored
3月 22, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
谷歌搜索脚本维护
上级
82652d12
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
30 行增加
和
5 行删除
+30
-5
wsj-TECH.py
comData/dingzhi/wsj-TECH.py
+24
-0
config.ini
google_comm/config.ini
+2
-2
googleSpider.py
google_comm/googleSpider.py
+3
-2
googletaskJob_loc.py
google_comm/googletaskJob_loc.py
+1
-1
没有找到文件。
comData/dingzhi/wsj-TECH.py
0 → 100644
浏览文件 @
976e2fb4
import
requests
from
bs4
import
BeautifulSoup
import
re
# Page whose first in-body <script> is searched for the '{"data": {...}}' blob.
WSJ_TECH_URL = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'

# NOTE(review): this looks like a captured browser session cookie (it carries
# session-looking values such as ca_rt / ca_id / datadome). It is kept
# unchanged so the script behaves as before, but it should not live in
# source control -- consider loading it from configuration or the environment.
# The literal is split at '; ' boundaries only; the concatenation is the
# original single-line value.
WSJ_COOKIE = (
    'gdprApplies=false; '
    'ccpaApplies=false; '
    'vcdpaApplies=false; '
    'regulationApplies=gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse; '
    '_pcid=%7B%22browserId%22%3A%22ltzfvavl4ju9vgpi%22%7D; '
    'cX_P=ltzfvavl4ju9vgpi; '
    'dnsDisplayed=undefined; '
    'signedLspa=undefined; '
    '_sp_su=false; '
    'cX_G=cx%3Allui1w2zab163r7fbco37esw7%3A317ttgvfg79lq; '
    'AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg=1; '
    'ajs_anonymous_id=a1fa0ab7-91e0-41f5-8659-f77686a9adc3; '
    '_gcl_au=1.1.1271150883.1710917108; '
    's_cc=true; '
    '_pin_unauth=dWlkPU5qRTNNV0V3WlRndFpqSXlNaTAwWVdSa0xXSTVaR1F0TVdVMU56TTRPR001WTJReQ; '
    '_ncg_id_=41c19b00-1a9e-4b2d-90df-7b8344634212; '
    '_fbp=fb.1.1710917107810.1699847377; '
    '_dj_sp_id=09dfe400-9303-4f0d-ab44-e30daad2eaea; '
    '_ncg_domain_id_=41c19b00-1a9e-4b2d-90df-7b8344634212.1.1710917109623.1773989109623; '
    '_scid=6ee68aeb-d484-4800-a696-c0adb7b914a4; '
    '_ncg_g_id_=b7310b5e-1113-4f94-8ca3-7ea5b3c2ef71.3.1710917112.1773989109623; '
    'DJSESSION=country%3Dhk%7C%7Ccontinent%3Das%7C%7Cregion%3D; '
    'wsjregion=asia%2Ccn; '
    'AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg=1585540135%7CMCIDTS%7C19803%7CMCMID%7C26000677277848255171457287474499803357%7CMCAAMLH-1711618354%7C6%7CMCAAMB-1711618354%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1711020754s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; '
    '_pin_unauth=dWlkPU5ERXhNbVppWVRZdE5UTTJZaTAwTmpRMUxUazFabVl0WXpRek9XTmlZamd5TW1VMg; '
    '_fbp=fb.1.1710917107810.1699847377; '
    '_meta_facebookTag_sync=1711013559439; '
    'djcs_route=f2994554-703d-42af-a2b8-785c61193619; '
    'ca_rt=B2-Zo67DYbWomddfWX4igw.4qexK32nu-8FzO0Q-M1MtldjOaZYCKtr4ejWSn8EuXlvOgHhaJ_VHNVNCAGwqx91Dosk3YRHhQoX1YuERztjFJN5nNTJ-IkBNAHwCIdMoY8; '
    'ca_id=eJxdkMtOg0AUht9l1tDOlQFW0oAGU1tTqy6MIcNc7FhoG5iK1fjuTi8L4-6c7__P9Ru82Q-9qTai1SAFregOIABGtLY5_IO6Fbbx6ddqjSGLr1BERnLbemW_t8oLWmOecGZCbqAKKY5pWGOuQmkiiRHlJFaxd7tOyPWpINKslowLDImqIRQkqTnmtUEwllhjxmspDVcKE8mQhEwow_wIQ4mmdZJEvlm3bXQP0hdwvSiKRXETlrO8fCrzx2zq1eeH27C8z_JzeAHZLF_My7xaZpNpsbzAu_mknBbgNQBi71aVs8fLEUcIIsKSJACy08JpVQnnOYswxQnHKAD2BP4Y9efuDChlJ2B7vyBYObfr0_F4GIbR0L8fPzeWjdUbB35-AYojbss.r5GaoioJDee6VernrH2oRDgGbTxrPefp2P9CPAXjncI5Z1XbCQnkbCsJOXqTXeC92ryLYDm1dAl-B14KYwL1eAi6mBF88dkhze2ISucUCtHwe9B54d-hTMROM2GR9ifS6QV279pzV3mTHZF_7ziLtbTPiL-PFpzsbxQotpLpKwzFnnxrbF5e-5jfpWAhTg-eiPo2yBowpU-wg2echaNlmlrHxaE7j2V5rygnhAyuxGA2PJ4cjOYcG3uY39dAk4NqeMfVgPUsRLBNqxLEd6I6Y1bE2nRk88A0rLa1vtLv-ZB-4gskyyCnab1PRWN8SwemTuskXnwMhmY1-dBEXc8uNV-lRoCPOMKIS-PE4DX5JJ7CDmZdl1kUzW0FaLNRMuIrTvlz6wnS_6nXkgKSFS6fFWLkBVwUOddKcyqtOjgUHJgLmnmLTJtvoqMD-83k_AAFUy6RrFatThHRHC45yjsb2vWrCMoKnSQwmU7HHU_2zyO9sQQaXYrP2qFbLi9oa3RFYw38jfykO27ZqxAabgMktsSafI6giPW_iQGfurmY-SJUDxqf8tSmwDwhUQobochTrnmHE_sX7Tonf-0YEQUIThHSe5skrnA6RwgXMj3qnHV8urQRn8WM9DmiIw7R1zX5VjvHgIlW9bHykWLpfnlzr2KsS77ZEnPUTA9jKWxdSJs; '
    'TR=V2-6e5bc57a203db00a39b727bf108c2e257bccf7dd23c51c05adf5279f43e4b996; '
    'ab_uuid=5be368fe-5b01-4451-8d6c-54c8bd163f1b; '
    'usr_prof_v2=eyJwIjp7InBzIjowLjgsInEiOjAuODV9LCJjcCI6eyJlYyI6IlN0YWJsZSIsInBjIjowLjAxMjAxLCJwc3IiOjAuNDQ3NSwidGQiOjE3MTksImFkIjoyOCwicWMiOjIxLCJxbyI6MjMsInNjZW4iOnsiY2hlIjowLjAzMjU2LCJjaG4iOjAuMDIxNjgsImNoYSI6MC4wMTIwMSwiY2hwIjowLjAxMzQ1fX0sIm9wIjp7ImkiOiI2MTdiY2UwMCIsImoiOnsianQiOiJlbGdtIn19LCJpYyI6Nn0%3D; '
    'utag_main=v_id:018e5a9b3c51001f7967cb1f91690506f0014067007e8$_sn:2$_se:2$_ss:0$_st:1711015401906$vapi_domain:wsj.com$ses_id:1711013553852%3Bexp-session$_pn:2%3Bexp-session$_prevpage:CWSJ_Home_Tech%3Bexp-1711017201915; '
    '_pctx=%7Bu%7DN4IgrgzgpgThIC5QDYoFYBGBjNB2AhgEwAMAzACYbHH6kCcGuhuGAZgIzEAcWhUhebFla5y5QqRzssxNPnKs0zOqwAspKKox06yYAHcIAKwC%2BiUAAcYUVgEsAHohCGjIADQgALgE8LUJwDCABogJiYekLAAyp74npBO%2BAB2APZJ7iAQtp5QAJLkTnSExaSkaEpcaMi4yGjsdKSqoUA; '
    '_dj_id.9183=.1710917108.2.1711013602.1710917108.b8474262-5ed9-4de4-a0d7-deb5b8e74386.318bf4f0-fd38-4ff3-a153-dfb8e83f4842.9b7abe89-6983-437a-933e-f77e03dd4f49.1711013555174.2; '
    '_scid_r=6ee68aeb-d484-4800-a696-c0adb7b914a4; '
    '_ncg_sp_id.5378=41c19b00-1a9e-4b2d-90df-7b8344634212.1710917110.2.1711013602.1710917111.2fa95b23-4e7d-489c-b7d3-81086312ca4f; '
    '_uetsid=f3603ad0e76511eeb0a7e164836efe1a; '
    '_uetvid=64aa3700e68511eebe452957ac3861c3; '
    'datadome=xDtRNqFhkjvX5OHJjDUvZJnRfHeCdi_ysN9qG8GC4Os1S2IsutTgJXKYGM3aPEkdEkc7W~4nJuiN1y8XAP8fN81P2lfJ8BGS~JBFgavd0psSTris5e~an90PcbNgL54q; '
    'ResponsiveConditional_initialBreakpoint=lg; '
    '__gads=ID=8d962eb26c834930:T=1710917105:RT=1711015497:S=ALNI_Maw2YIR-9L0CKOx0yoGb_jgM0pcGA; '
    '__gpi=UID=00000d49491e3c9b:T=1710917105:RT=1711015497:S=ALNI_MYBakK-TvogAZ1BbqEvyt3N3t4bMg; '
    '__eoi=ID=e1c0d0848d87c017:T=1710917105:RT=1711015497:S=AA-Afjb1xilZZE2hfIUmTyUlS4bt; '
    's_tp=3129; '
    's_ppv=CWSJ_Home_Tech%2C29%2C29%2C919'
)

REQUEST_HEADERS = {
    'Cookie': WSJ_COOKIE,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
}

# Both schemes are routed through the same proxy on localhost:1080.
PROXIES = {
    'http': 'http://127.0.0.1:1080',
    'https': 'http://127.0.0.1:1080',
}

# Seconds to wait on the network before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Lazy match: stops at the first '}}' after '{"data": {', and '.' does not
# cross newlines, so a payload with nested objects is truncated at the first
# inner '}}'.
DATA_PATTERN = re.compile(r'\{\"data\": \{.*?\}\}')


def extract_data_json(script_text):
    """Return the first '{"data": {...}}' substring of *script_text*.

    Args:
        script_text: Text to search (typically the body of a <script> tag).
            ``None`` and the empty string are accepted.

    Returns:
        The matched substring, or ``None`` when there is nothing to search
        or the pattern does not occur.
    """
    if not script_text:
        return None
    found = DATA_PATTERN.search(script_text)
    return found.group(0) if found else None


def first_body_script_text(html):
    """Return the text of the first <script> inside <body> of *html*.

    Args:
        html: Markup (str or bytes) to hand to BeautifulSoup.

    Returns:
        The script's text, or ``''`` when the document has no <body> or the
        body contains no <script>.
    """
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('body')
    script_tag = body.find('script') if body is not None else None
    if script_tag is None:
        return ''
    # re.search needs a str, not a bs4 Tag; .string is None when the tag has
    # more than one child, so fall back to get_text() in that case.
    return script_tag.string or script_tag.get_text()


if __name__ == '__main__':
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so passing the dict positionally would put
    # the cookie into the query string and send no custom headers at all.
    req = requests.get(
        WSJ_TECH_URL,
        headers=REQUEST_HEADERS,
        proxies=PROXIES,
        timeout=REQUEST_TIMEOUT,
    )
    data_json = extract_data_json(first_body_script_text(req.content))
    if data_json:
        print(data_json)
google_comm/config.ini
浏览文件 @
976e2fb4
[redis]
host
=
114.11
5.236.206
port
=
63
79
host
=
114.11
6.90.53
port
=
63
80
pass
=
clbzzsn
[mysql]
...
...
google_comm/googleSpider.py
浏览文件 @
976e2fb4
...
...
@@ -270,9 +270,10 @@ class GoogleSpider(object):
wait
=
WebDriverWait
(
self
.
driver
,
20
)
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
TAG_NAME
,
"body"
)))
try
:
self
.
driver
.
find_element
(
'xpath'
,
'//div[
@class="GKS7s"]/span[text()="新闻"
]'
)
.
click
()
self
.
driver
.
find_element
(
'xpath'
,
'//div[
contains(@class, "YmvwI") and contains(text(), "新闻")
]'
)
.
click
()
except
:
self
.
driver
.
find_element
(
'xpath'
,
'//*[@id="hdtb-msb"]/div[1]/div/div[2]/a/span'
)
.
click
()
self
.
logger
.
info
(
'点击新闻按钮失效'
)
return
time
.
sleep
(
3
)
self
.
driver
.
find_element
(
'xpath'
,
'//div[@id="hdtb-tls"]'
)
.
click
()
...
...
google_comm/googletaskJob_loc.py
浏览文件 @
976e2fb4
...
...
@@ -166,7 +166,7 @@ if __name__ == '__main__':
try
:
codeids
=
[]
# codeid='KW-20230727-0001'
codeids
.
append
(
'KW-202
30925-0002
'
)
codeids
.
append
(
'KW-202
40318-0001
'
)
for
codeid
in
codeids
:
try
:
# keymsg=baiduTaskJob.getkafka()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论