提交 f434a907 作者: XveLingKun

华尔街采集

上级 a16f8aa1
import datetime
import json
import re
import time

import redis
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
def create_driver():
    """Build an Edge WebDriver whose traffic goes through the local proxy.

    The driver binary path and the proxy address are hard-coded below.

    Returns:
        The ``webdriver.Edge`` instance; the caller is responsible for
        quitting it.
    """
    ip = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080'
    }
    edge_service = Service(r'D:\soft\msedgedriver.exe')
    edge_options = Options()
    # Exclude the "enable-automation" switch (original note: "enable
    # developer mode").
    edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Turn off the AutomationControlled Blink feature -- presumably so the
    # session looks less like an automated browser (TODO confirm).
    edge_options.add_argument('--disable-blink-features=AutomationControlled')
    # Only the 'http' proxy entry is used; it is handed to the browser as-is.
    edge_options.add_argument('--proxy-server=%s' % ip['http'])
    # prefs = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
    # edge_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Edge(service=edge_service, options=edge_options)
    return driver
def get_pagesource():
    """Open the WSJ technology section and wait for its embedded state.

    The page is polled every 3 seconds until the first ``<script>`` under
    ``<body>`` contains a ``__STATE__ =...;`` assignment.

    Returns:
        tuple: ``(soup, driver)`` once the state script is present, or
        ``(None, driver)`` when the rendered page has no text at all. The
        driver is returned in both cases so the caller can close it.
    """
    driver = create_driver()
    # NOTE: an interactive SSO login used to run here; it is disabled and the
    # section page is fetched without signing in.
    url = 'https://cn.wsj.com/zh-hans/news/technology?mod=nav_top_section'
    driver.get(url)
    time.sleep(3)
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        body = soup.find('body')
        script = body.find('script') if body is not None else None
        # Explicit checks replace the former bare ``except:``, which also
        # swallowed KeyboardInterrupt inside this unbounded loop.
        if script is not None and re.findall('__STATE__ =(.*);', script.text):
            return soup, driver
        if soup.text == '':
            # Completely empty page: give up instead of polling forever.
            return None, driver
        time.sleep(3)
def get_newshref(key):
    """Extract the article list from the section page's ``__STATE__`` JSON.

    Every entry of ``state['data']`` whose name starts with ``article`` is
    read at ``[...]['data']['data']`` for its ``headline``, ``summary``,
    ``seoId`` and ``timestamp`` fields.

    Args:
        key: Redis key prefix. Currently unused here; kept for callers.

    Returns:
        tuple: ``(news_list, driver)``. ``news_list`` is a list of dicts with
        ``title``, ``summary``, ``publishDate`` (local time, formatted
        ``%Y-%m-%d %H:%M:%S``) and ``newsUrl``. It is empty when the page
        could not be loaded, so the caller can still unpack the result and
        close the driver (previously a bare ``None`` was returned).
    """
    soup, driver = get_pagesource()
    if not soup:
        return [], driver

    scrip = soup.find('body').find('script').text
    scrip = re.findall('__STATE__ =(.*);', scrip)[0].strip()
    reqJson = json.loads(scrip)
    articles = reqJson['data']
    keys = [name for name in articles if name.startswith('article')]
    print(keys)

    news_list = []
    for key_ in keys:
        article = articles[key_]['data']['data']
        seoId = article.get('seoId')
        if seoId is None:
            # Entries without a seoId cannot be turned into an article URL.
            continue
        newsUrl = 'https://cn.wsj.com/articles/' + seoId
        print(newsUrl)
        # The timestamp is treated as milliseconds since the epoch.
        timestamp = int(article['timestamp']) / 1000
        publishDate = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
        news_list.append({
            'title': article['headline'],
            'summary': article['summary'],
            'publishDate': publishDate,
            'newsUrl': newsUrl
        })
    return news_list, driver
def caiji():
    """Collect the current article list and store it in Redis.

    Each article dict returned by ``get_newshref`` is written as a hash named
    ``WSJ:NewsInfo:<index>`` in one pipeline round-trip. The browser is
    always closed afterwards so repeated scheduled runs do not pile up
    driver processes.
    """
    # NOTE(review): connection details and password are hard-coded; consider
    # moving them to configuration.
    redis_client = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
    key = 'WSJ:NewsInfo'
    result = get_newshref(key)
    if not result:
        # Nothing usable came back (a bare None); there is nothing to store.
        return
    news_list, driver = result
    try:
        # Queue every hash write and send them together.
        pipeline = redis_client.pipeline()
        for idx, info in enumerate(news_list or []):
            hash_key = f'{key}:{idx}'
            pipeline.hset(hash_key, mapping=info)
        pipeline.execute()
        # Article bodies are not fetched here; only the list data is stored.
    finally:
        driver.quit()
# Scheduled task for the WSJ article list
def wsj_list_task():
    """Run ``caiji`` on a blocking scheduler with a daily 09:00 cron trigger.

    ``start()`` blocks the calling thread; any exception it raises is printed
    and then dropped.
    """
    cron_at = {'hour': 9, 'minute': 0}
    sched = BlockingScheduler()
    # max_instances=1: never run two collections at the same time.
    sched.add_job(caiji, trigger='cron', max_instances=1, **cron_at)
    try:
        sched.start()
    except Exception as err:
        print('定时采集异常', err)


if __name__ == '__main__':
    wsj_list_task()
{"s_tp": "4333", "_ncg_domain_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1.1711521614929.1774593614929", "_ncg_sp_id.5378": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1711521615.1.1711521616.1711521615.89221ca3-0ff7-447c-bd6b-88bc0d7b9ca1", "__eoi": "ID=74957e9c589e06c8:T=1711521614:RT=1711521614:S=AA-AfjbpSmTSwP45PV1P_Sfr-2eA", "_ncg_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984", "__gpi": "UID=00000d6a82c6423a:T=1711521614:RT=1711521614:S=ALNI_Mamy4ax3m1xyG-SWHlZCu8YbWNV7w", "__gads": "ID=4e5002beb21f7800:T=1711521614:RT=1711521614:S=ALNI_MaWml862ei0DYTIgH5jH8-qiduUyA", "dicbo_id": "%7B%22dicbo_fetch%22%3A1711521614435%7D", "s_cc": "true", "_ncg_sp_ses.5378": "*", "_dj_sp_id": "f24fbbcc-3872-4f04-b628-048e0da2d503", "_uetvid": "ddc10a40ec0411ee968a03a12159858e", "s_ppv": "CWSJ_Home_Home%2520Page%2C13%2C13%2C570", "_uetsid": "ddc0cf80ec0411eea60095acfb805f07", "_gcl_au": "1.1.1409873185.1711521614", "_pin_unauth": "dWlkPU4yUTFOVEF6TXpZdFlqYzRaaTAwWXpGa0xXRTVNell0TjJJd05qaGtPVFUzTkdNMw", "AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1", "_rdt_uuid": "1711521612889.741c0512-609f-4857-9453-d2627aab72f2", "cX_P": "lu9fryu88xq1rnhv", "_meta_facebookTag_sync": "1711521612864", "_dj_id.9183": ".1711521612.1.1711521612..42d375e3-8210-4847-b37b-e2413e02fe5a..1bafccfe-479d-43d7-ae40-2c0b27f1273b.1711521612160.1", "usr_prof_v2": "eyJwIjp7InBzIjowLjg4LCJxIjowLjg2fSwiY3AiOnsiZWMiOiJTdGFibGUiLCJwYyI6MC4wMTQzMywicHNyIjowLjMyNTEsInRkIjoxNzI1LCJhZCI6MjgsInFjIjozMCwicW8iOjI3LCJzY2VuIjp7ImNoZSI6MC4wMzA0MywiY2huIjowLjAzMDgyLCJjaGEiOjAuMDE0MzMsImNocCI6MC4wMTczfX0sIm9wIjp7ImkiOiI2MTdiY2UwMCIsImoiOnsianQiOiJlbGdtIn19LCJpYyI6M30%3D", "_scid": "22e08d14-b460-4edb-8046-8b897104f696", "ResponsiveConditional_initialBreakpoint": "md", "vcdpaApplies": "false", "_pctx": "%7Bu%7DN4IgrgzgpgThIC5QDYoFYBGBjNB2AhgEwAMAzACYbHH6kCcGuhuGAZgIzEAcWhUhebFla5y5QqRzssxNPnKs0zOqwAspKKox06yYAHcIAKwC%2BiUAAcYUVgEsAHohCGjIADQgALgE8LUJwDCABogJiYekLAAyp74npBO%2BAB2APZJ7iAQtp5QAJLkTnSExaSkaEpcaMi4yGjsdKSqoUA", "cX_G": 
"cx%3Aar1n90irbdrh1nz3umsn41upp%3A12bdufvkc9frm", "ab_uuid": "5be368fe-5b01-4451-8d6c-54c8bd163f1b", "AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1585540135%7CMCIDTS%7C19810%7CMCMID%7C48333472371426108300129167066567447308%7CMCAID%7CNONE%7CMCOPTOUT-1711528813s%7CNONE%7CMCAAMLH-1712126413%7C6%7CMCAAMB-1712126413%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CMCSYNCSOP%7C411-19817%7CvVersion%7C4.4.0", "datadome": "i1JSVSww9u2lCnQC7CmZsfQQNOATMw1rMpQl2syLPun2T1iS1iZ9PP~uUddO0Tp~ABwhQ~K~x~CIeUhxxhz18uVd1dpdc5wK8gDSrOlrovmG8ozGm4KjIXF~mas7NsGe", "gdprApplies": "false", "ccpaApplies": "false", "_fbp": "fb.1.1711521612864.2074745750", "ajs_anonymous_id": "86ceb415-2de7-4a04-bbf7-b3b952852670", "utag_main": "v_id:018e7ea33c2c004ea140fb3e2df00507d0013075007e8$_sn:1$_se:1$_ss:1$_st:1711523410801$ses_id:1711521610801%3Bexp-session$_pn:1%3Bexp-session$_prevpage:CWSJ_Home_Home%20Page%3Bexp-1711525210821$vapi_domain:wsj.com", "regulationApplies": "gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse", "ca_rt": "MI50RGh7xOF_Ha5M-pfrXQ.IWGd-OI_q1g7AlutTc36h9FX7n1uQrebrg16x7dunYcPqFafWtPhVQ0ZQqvmKc6OzBixBYWSZvxCXF_gye9vCE0vT01vzuzUI8a-JK4X4LE", "_dj_ses.9183": "*", "DJSESSION": "country%3Dsg%7C%7Ccontinent%3Das%7C%7Cregion%3D", "ca_id": 
"eJxdkMtSgzAUht8la2hzIYSwkg7o4NTWqVUXjsOEXGwstB1Ixer47qYXN-7O-f7_XL_Bm_3Qm2ojWg1S0IruAAJgRGubwz-oW2Ebn36t1hjS5ArFZCS3rVf2e6u8oDVmnFETMgNVGOEkCmvMVChNLDGKGElU4t2uE3J9Kog1rSVlAkOiaggF4TXDrDYIJhJrTFktpWFKYSIpkpAKZagfYSKio5rz2Dfrto3uQfoCrhdFsShuwnKWl09l_phNvfr8cBuW91l-Di8gm-WLeZlXy2wyLZYXeDeflNMCvAZA7N2qcvZ4OWIIUYwoTwIgOy2cVpVwntMYR5gzjAJgT-DPyP2bPndnwCk5Adv7BcHKuV2fjsfDMIyG_v34ubFsrN448PMLjPFu1g.qGey7P4In7Rq_zwb3rdDqeMgXg4ctbinCQ8wcWje7hmNwg48tJ5nKGnGeVLUEIynG34nBOAARDWFeZPNyFFPQr5JS-xjPAnfuuCYxSkS8Z6C3FVwBZ0D4rkSU11Ts67PVCwzfI3f4qxYy9M8JB4WnM6PQYU0wJ_WCZAzb2pDxEASxzKfzzm_M5FdybgMHkY-4WcbF-Zp0V8RHWG7eH9OvmIYZZNq621vKCZXg6hnJmeh6FZdVxiSXPrOG4K1zGgtb-wHrKJFVu5VmGXY_ygilEFec8v27wiASm9IiMmqZ-wQ_ej0u9OS2YMIA5Fzn7kqx1mgYOWCL3eLVYR1R01oigJx4q2GdyrQDVyp0X_8Z7aaGSj4UgYR8Q2mxfrB7AWeWqeKbec8RdJPDh9kjKvcs_KVPtfNVyzHKH-f9fv7hnh2Rmot44XSUPb4WqNUJx8N1FNXV8mjOLP37oVCZZaeVW2NoNNeNbN75WYS4kSjBhuUtBN8iumYaxR5xmCk41UYbKldPN4qyFE0J1hBt64mXbhGD0clVCFdBMWpw29ZIMdkXwpqX3Ig0FVGku6QO4pXlOWeqVPb2Tc78yGqxmg-Nqc90C2-nANOTdT1AZt27-FuB6KAiadLNFlcVcYNYi8vD6ylHGZiHB8DemoFPLoKN1WH5ggodOBRyvMFO2Asry8", "_scid_r": "22e08d14-b460-4edb-8046-8b897104f696", "_ncg_g_id_": "cbeb2daa-da53-494b-a606-90a386ee0b55.3.1711521616.1774593614929", "TR": "V2-6e5bc57a203db00a39b727bf108c2e257bccf7dd23c51c05adf5279f43e4b996", "_sctr": "1%7C1711468800000", "_pcid": "%7B%22browserId%22%3A%22lu9fryu88xq1rnhv%22%7D", "wsjregion": "asia%2Ccn"}
\ No newline at end of file
from bs4 import BeautifulSoup
import requests, time, json
import redis,random
from kafka import KafkaProducer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from apscheduler.schedulers.blocking import BlockingScheduler
import sys
sys.path.append("../../base")
from base import BaseCore
log = BaseCore.BaseCore().getLogger()
def create_driver():
    """Return an Edge WebDriver configured to use the local proxy.

    The msedgedriver path and the proxy address are fixed in this function.
    """
    proxies = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }
    options = Options()
    # Original note: "enable developer mode".
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    launch_flags = (
        # Original note: "disable the Blink runtime feature".
        '--disable-blink-features=AutomationControlled',
        '--proxy-server=%s' % proxies['http'],
    )
    for flag in launch_flags:
        options.add_argument(flag)
    return webdriver.Edge(
        service=Service(r'D:\soft\msedgedriver.exe'),
        options=options,
    )
def create_google():
    """Start a Chrome WebDriver from the locally installed driver and browser.

    Returns:
        The ``webdriver.Chrome`` instance; the caller must quit it.
    """
    # Imported locally because the module-level ``Service`` name is the Edge
    # service class, which is the wrong wrapper for chromedriver.
    from selenium.webdriver.chrome.service import Service as ChromeService

    driver_path = r'D:\cmd100\chromedriver.exe'
    chrome_bin = r'D:\Google\Chrome\Application\chrome.exe'

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.binary_location = chrome_bin

    # ``chrome_options=`` is a deprecated alias that recent Selenium 4
    # releases no longer accept; ``options=`` is the supported keyword.
    driver = webdriver.Chrome(service=ChromeService(driver_path), options=chrome_options)
    return driver
def login():
    """Sign in to the Dow Jones SSO page in Chrome and collect the cookies.

    Returns:
        tuple: ``(cookies, driver)`` where ``cookies`` is the value of
        ``driver.get_cookies()`` after the password form was submitted.

    Raises:
        Whatever Selenium raises when a step fails; the browser is closed
        first so a failed login does not leave a driver process behind.
    """
    driver = create_google()
    # NOTE(review): credentials are hard-coded; move them to configuration.
    un = 'zhk2058@163.com'
    pw = 'ZZM205899'
    try:
        driver.get(
            "https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=zh-tw-x-cwsj-27-2&nonce=beaaad3a-6919-4893-8198-c3769d6d54af&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU&resource=https%253A%252F%252Fcn.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin")
        time.sleep(5)
        # Step 1: user name, then the second button of the basic-login form.
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        driver.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]').click()
        time.sleep(3)
        # Step 2: password, then the submit button of the password form.
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
        time.sleep(3)
        cookies = driver.get_cookies()
    except Exception:
        driver.quit()
        raise
    return cookies, driver
def parser_content(href, driver):
    """Load an article page and pull out its ``div.article-content`` element.

    Args:
        href: Article URL to open.
        driver: Selenium driver used to load the page.

    Returns:
        tuple: ``(text, tag)`` for the content element, or ``(None, None)``
        when the page has no such element.
    """
    driver.get(href)
    time.sleep(2)
    page = BeautifulSoup(driver.page_source, 'html.parser')
    article_div = page.find('div', class_='article-content')
    if article_div is not None:
        return article_div.text, article_div
    # No article body: refresh once, wait, and report it. The log text
    # means "account banned".
    driver.refresh()
    time.sleep(3)
    log.info('封号')
    return None, None
def getData(key):
    """Forward the article summaries cached in Redis to Kafka.

    Scans the module-level Redis client ``r`` for hashes whose name starts
    with *key*. URLs already present in the ``IN-20240403-0041`` set on
    ``r_2`` are skipped; the rest are sent to the ``crawlerInfo`` topic and
    then added to that set. Article bodies are not fetched, so the content
    fields are sent empty.

    Args:
        key: Redis key prefix of the cached article hashes.

    Returns:
        bool: always ``True``.
    """
    sid = '1775455062911447042'
    info_code = "IN-20240403-0041"
    # Created on first use and shared by all messages of this call; the
    # original opened a new producer per message and never closed it.
    producer = None
    try:
        for hash_key in r.scan_iter(f"{key}*"):
            fields = r.hgetall(hash_key)
            decode_fields = {k.decode(): v.decode() for k, v in fields.items()}
            newsUrl = decode_fields['newsUrl']
            # Skip items that were already collected.
            try:
                if r_2.sismember(info_code, newsUrl):
                    log.info('信息已采集入库过')
                    continue
            except Exception as e:
                # Without a dedup answer the item is skipped, but the reason
                # is logged instead of being silently dropped.
                log.info(e)
                continue
            publishDate = decode_fields['publishDate']
            title = decode_fields['title']
            summary = decode_fields['summary']
            dic_news = {
                'content': '',
                'contentWithTag': '',
                'id': '',
                'summary': summary,
                'origin': '华尔街日报中文网-科技',
                'publishDate': publishDate,
                'sid': sid,
                'sourceAddress': newsUrl,
                'title': title,
                'source': '16',
                'type': ''
            }
            # Send the record through Kafka; a failure only affects this item.
            try:
                if producer is None:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("crawlerInfo",
                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
                log.info(kafka_result.get(timeout=10))
                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                r_2.sadd(info_code, newsUrl)
            except Exception as e:
                log.info(e)
                log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
    finally:
        if producer is not None:
            # Bounded wait so an unreachable broker cannot block shutdown.
            producer.close(timeout=10)
    return True
if __name__ == '__main__':
    # Module-level clients read by getData: ``r`` (db 6) holds the cached
    # article hashes, ``r_2`` (db 5) holds the set of already-sent URLs.
    r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=6)
    r_2 = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=5)
    key = 'WSJ:NewsInfo'
    # NOTE: a disabled experiment used to load cookies from wsj_cookie.txt
    # into an Edge driver and open a single article; it is not part of the
    # run loop.
    # Forward the cached articles, then wait one hour before the next pass.
    while True:
        getData(key)
        time.sleep(60 * 60)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
import time, json
def create_driver():
    """Create the Edge WebDriver used for the login flow.

    Uses the msedgedriver binary and the local proxy address written below.
    """
    ip = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080'
    }
    proxy_flag = '--proxy-server=%s' % ip['http']

    opts = Options()
    # Original notes: "enable developer mode" / "disable the Blink runtime
    # feature".
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    opts.add_argument('--disable-blink-features=AutomationControlled')
    opts.add_argument(proxy_flag)

    svc = Service(r'D:\soft\msedgedriver.exe')
    return webdriver.Edge(service=svc, options=opts)
def login():
    """Sign in to the Dow Jones SSO page in Edge and collect the cookies.

    Returns:
        tuple: ``(cookie, driver)`` where ``cookie`` is the value of
        ``driver.get_cookies()`` after the password form was submitted.

    Raises:
        Whatever Selenium raises when a step fails; the browser is closed
        first so a failed login does not leave a driver process behind.
    """
    driver = create_driver()
    # NOTE(review): credentials are hard-coded; move them to configuration.
    un = 'zhk2058@163.com'
    pw = 'ZZM205899'
    try:
        driver.get(
            "https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=zh-tw-x-cwsj-27-2&nonce=beaaad3a-6919-4893-8198-c3769d6d54af&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU&resource=https%253A%252F%252Fcn.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin")
        time.sleep(5)
        # Step 1: user name, then the second button of the basic-login form.
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        driver.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]').click()
        time.sleep(3)
        # Step 2: password, then the submit button of the password form.
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
        time.sleep(3)
        cookie = driver.get_cookies()
    except Exception:
        driver.quit()
        raise
    return cookie, driver
if __name__ == '__main__':
    # Log in, flatten the cookie list into a name -> value mapping and save
    # it as JSON in wsj_cookie.txt.
    cookie, driver = login()
    try:
        cookies = {item['name']: item['value'] for item in cookie}
        with open("wsj_cookie.txt", "w") as f:
            f.write(json.dumps(cookies))
    finally:
        # The browser is only needed for the login; close it even if the
        # file cannot be written.
        driver.quit()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论