提交 976e2fb4 作者: 薛凌堃

谷歌搜索脚本维护

上级 82652d12
import requests
from bs4 import BeautifulSoup
import re
if __name__ == '__main__':
    # Ad-hoc probe: download the WSJ (Chinese edition) technology listing page
    # through a local proxy and print the first `{"data": {...}}` JSON fragment
    # embedded in the first <script> element of <body>.
    import os

    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    }

    # The browser session cookie is a credential and expires quickly, so it is
    # supplied via the WSJ_COOKIE environment variable instead of being pasted
    # into the source. When the variable is unset no Cookie header is sent.
    cookie = os.environ.get('WSJ_COOKIE', '').strip()
    if cookie:
        headers['Cookie'] = cookie

    # Local HTTP proxy used for both http and https traffic.
    ip = {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}

    # `headers` must be passed by keyword: the second positional argument of
    # requests.get() is `params`, which would put the dict in the query string.
    # The timeout keeps the script from hanging forever on a dead proxy.
    req = requests.get(url, headers=headers, proxies=ip, timeout=30)
    soup = BeautifulSoup(req.content, 'html.parser')

    # Either lookup may come back empty (e.g. on a block page), so guard both
    # instead of chaining .find() on a possible None.
    body = soup.find('body')
    scrip = body.find('script') if body is not None else None

    pattern = re.compile(r'\{\"data\": \{.*?\}\}')
    # The regex needs text, not a Tag object; str() yields the tag's markup
    # including the inline script source.
    match = pattern.search(str(scrip)) if scrip is not None else None
    if match:
        print(match.group(0))
[redis]
host=114.115.236.206
port=6379
host=114.116.90.53
port=6380
pass=clbzzsn
[mysql]
......
......@@ -270,9 +270,10 @@ class GoogleSpider(object):
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
try:
self.driver.find_element('xpath', '//div[@class="GKS7s"]/span[text()="新闻"]').click()
self.driver.find_element('xpath', '//div[contains(@class, "YmvwI") and contains(text(), "新闻")]').click()
except:
self.driver.find_element('xpath', '//*[@id="hdtb-msb"]/div[1]/div/div[2]/a/span').click()
self.logger.info('点击新闻按钮失效')
return
time.sleep(3)
self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click()
......
......@@ -166,7 +166,7 @@ if __name__ == '__main__':
try:
codeids=[]
# codeid='KW-20230727-0001'
codeids.append('KW-20230925-0002')
codeids.append('KW-20240318-0001')
for codeid in codeids:
try:
# keymsg=baiduTaskJob.getkafka()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论