提交 f434a907 作者: XveLingKun

华尔街采集

上级 a16f8aa1
{"s_tp": "4333", "_ncg_domain_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1.1711521614929.1774593614929", "_ncg_sp_id.5378": "7c5b7036-4ad7-4687-8961-1b6f5d273984.1711521615.1.1711521616.1711521615.89221ca3-0ff7-447c-bd6b-88bc0d7b9ca1", "__eoi": "ID=74957e9c589e06c8:T=1711521614:RT=1711521614:S=AA-AfjbpSmTSwP45PV1P_Sfr-2eA", "_ncg_id_": "7c5b7036-4ad7-4687-8961-1b6f5d273984", "__gpi": "UID=00000d6a82c6423a:T=1711521614:RT=1711521614:S=ALNI_Mamy4ax3m1xyG-SWHlZCu8YbWNV7w", "__gads": "ID=4e5002beb21f7800:T=1711521614:RT=1711521614:S=ALNI_MaWml862ei0DYTIgH5jH8-qiduUyA", "dicbo_id": "%7B%22dicbo_fetch%22%3A1711521614435%7D", "s_cc": "true", "_ncg_sp_ses.5378": "*", "_dj_sp_id": "f24fbbcc-3872-4f04-b628-048e0da2d503", "_uetvid": "ddc10a40ec0411ee968a03a12159858e", "s_ppv": "CWSJ_Home_Home%2520Page%2C13%2C13%2C570", "_uetsid": "ddc0cf80ec0411eea60095acfb805f07", "_gcl_au": "1.1.1409873185.1711521614", "_pin_unauth": "dWlkPU4yUTFOVEF6TXpZdFlqYzRaaTAwWXpGa0xXRTVNell0TjJJd05qaGtPVFUzTkdNMw", "AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1", "_rdt_uuid": "1711521612889.741c0512-609f-4857-9453-d2627aab72f2", "cX_P": "lu9fryu88xq1rnhv", "_meta_facebookTag_sync": "1711521612864", "_dj_id.9183": ".1711521612.1.1711521612..42d375e3-8210-4847-b37b-e2413e02fe5a..1bafccfe-479d-43d7-ae40-2c0b27f1273b.1711521612160.1", "usr_prof_v2": "eyJwIjp7InBzIjowLjg4LCJxIjowLjg2fSwiY3AiOnsiZWMiOiJTdGFibGUiLCJwYyI6MC4wMTQzMywicHNyIjowLjMyNTEsInRkIjoxNzI1LCJhZCI6MjgsInFjIjozMCwicW8iOjI3LCJzY2VuIjp7ImNoZSI6MC4wMzA0MywiY2huIjowLjAzMDgyLCJjaGEiOjAuMDE0MzMsImNocCI6MC4wMTczfX0sIm9wIjp7ImkiOiI2MTdiY2UwMCIsImoiOnsianQiOiJlbGdtIn19LCJpYyI6M30%3D", "_scid": "22e08d14-b460-4edb-8046-8b897104f696", "ResponsiveConditional_initialBreakpoint": "md", "vcdpaApplies": "false", "_pctx": "%7Bu%7DN4IgrgzgpgThIC5QDYoFYBGBjNB2AhgEwAMAzACYbHH6kCcGuhuGAZgIzEAcWhUhebFla5y5QqRzssxNPnKs0zOqwAspKKox06yYAHcIAKwC%2BiUAAcYUVgEsAHohCGjIADQgALgE8LUJwDCABogJiYekLAAyp74npBO%2BAB2APZJ7iAQtp5QAJLkTnSExaSkaEpcaMi4yGjsdKSqoUA", "cX_G": 
"cx%3Aar1n90irbdrh1nz3umsn41upp%3A12bdufvkc9frm", "ab_uuid": "5be368fe-5b01-4451-8d6c-54c8bd163f1b", "AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1585540135%7CMCIDTS%7C19810%7CMCMID%7C48333472371426108300129167066567447308%7CMCAID%7CNONE%7CMCOPTOUT-1711528813s%7CNONE%7CMCAAMLH-1712126413%7C6%7CMCAAMB-1712126413%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CMCSYNCSOP%7C411-19817%7CvVersion%7C4.4.0", "datadome": "i1JSVSww9u2lCnQC7CmZsfQQNOATMw1rMpQl2syLPun2T1iS1iZ9PP~uUddO0Tp~ABwhQ~K~x~CIeUhxxhz18uVd1dpdc5wK8gDSrOlrovmG8ozGm4KjIXF~mas7NsGe", "gdprApplies": "false", "ccpaApplies": "false", "_fbp": "fb.1.1711521612864.2074745750", "ajs_anonymous_id": "86ceb415-2de7-4a04-bbf7-b3b952852670", "utag_main": "v_id:018e7ea33c2c004ea140fb3e2df00507d0013075007e8$_sn:1$_se:1$_ss:1$_st:1711523410801$ses_id:1711521610801%3Bexp-session$_pn:1%3Bexp-session$_prevpage:CWSJ_Home_Home%20Page%3Bexp-1711525210821$vapi_domain:wsj.com", "regulationApplies": "gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse", "ca_rt": "MI50RGh7xOF_Ha5M-pfrXQ.IWGd-OI_q1g7AlutTc36h9FX7n1uQrebrg16x7dunYcPqFafWtPhVQ0ZQqvmKc6OzBixBYWSZvxCXF_gye9vCE0vT01vzuzUI8a-JK4X4LE", "_dj_ses.9183": "*", "DJSESSION": "country%3Dsg%7C%7Ccontinent%3Das%7C%7Cregion%3D", "ca_id": 
"eJxdkMtSgzAUht8la2hzIYSwkg7o4NTWqVUXjsOEXGwstB1Ixer47qYXN-7O-f7_XL_Bm_3Qm2ojWg1S0IruAAJgRGubwz-oW2Ebn36t1hjS5ArFZCS3rVf2e6u8oDVmnFETMgNVGOEkCmvMVChNLDGKGElU4t2uE3J9Kog1rSVlAkOiaggF4TXDrDYIJhJrTFktpWFKYSIpkpAKZagfYSKio5rz2Dfrto3uQfoCrhdFsShuwnKWl09l_phNvfr8cBuW91l-Di8gm-WLeZlXy2wyLZYXeDeflNMCvAZA7N2qcvZ4OWIIUYwoTwIgOy2cVpVwntMYR5gzjAJgT-DPyP2bPndnwCk5Adv7BcHKuV2fjsfDMIyG_v34ubFsrN448PMLjPFu1g.qGey7P4In7Rq_zwb3rdDqeMgXg4ctbinCQ8wcWje7hmNwg48tJ5nKGnGeVLUEIynG34nBOAARDWFeZPNyFFPQr5JS-xjPAnfuuCYxSkS8Z6C3FVwBZ0D4rkSU11Ts67PVCwzfI3f4qxYy9M8JB4WnM6PQYU0wJ_WCZAzb2pDxEASxzKfzzm_M5FdybgMHkY-4WcbF-Zp0V8RHWG7eH9OvmIYZZNq621vKCZXg6hnJmeh6FZdVxiSXPrOG4K1zGgtb-wHrKJFVu5VmGXY_ygilEFec8v27wiASm9IiMmqZ-wQ_ej0u9OS2YMIA5Fzn7kqx1mgYOWCL3eLVYR1R01oigJx4q2GdyrQDVyp0X_8Z7aaGSj4UgYR8Q2mxfrB7AWeWqeKbec8RdJPDh9kjKvcs_KVPtfNVyzHKH-f9fv7hnh2Rmot44XSUPb4WqNUJx8N1FNXV8mjOLP37oVCZZaeVW2NoNNeNbN75WYS4kSjBhuUtBN8iumYaxR5xmCk41UYbKldPN4qyFE0J1hBt64mXbhGD0clVCFdBMWpw29ZIMdkXwpqX3Ig0FVGku6QO4pXlOWeqVPb2Tc78yGqxmg-Nqc90C2-nANOTdT1AZt27-FuB6KAiadLNFlcVcYNYi8vD6ylHGZiHB8DemoFPLoKN1WH5ggodOBRyvMFO2Asry8", "_scid_r": "22e08d14-b460-4edb-8046-8b897104f696", "_ncg_g_id_": "cbeb2daa-da53-494b-a606-90a386ee0b55.3.1711521616.1774593614929", "TR": "V2-6e5bc57a203db00a39b727bf108c2e257bccf7dd23c51c05adf5279f43e4b996", "_sctr": "1%7C1711468800000", "_pcid": "%7B%22browserId%22%3A%22lu9fryu88xq1rnhv%22%7D", "wsjregion": "asia%2Ccn"}
\ No newline at end of file
from bs4 import BeautifulSoup
import requests, time, json
import redis,random
from kafka import KafkaProducer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from apscheduler.schedulers.blocking import BlockingScheduler
import sys
sys.path.append("../../base")
from base import BaseCore
log = BaseCore.BaseCore().getLogger()
def create_driver():
    """Build an Edge WebDriver that goes through the local proxy.

    The Edge driver executable path and the proxy address are hard-coded.

    Returns:
        The ``webdriver.Edge`` instance that was started.
    """
    proxies = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }
    service = Service(r'D:\soft\msedgedriver.exe')

    options = Options()
    # Drop the "enable-automation" switch from the browser command line.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Turn off the Blink "AutomationControlled" runtime feature.
    options.add_argument('--disable-blink-features=AutomationControlled')
    # Only the plain-HTTP proxy entry is actually handed to the browser.
    options.add_argument(f"--proxy-server={proxies['http']}")
    # Disabled prefs (presumably meant to block images/stylesheets -- TODO confirm):
    # prefs = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
    # options.add_experimental_option("prefs", prefs)

    return webdriver.Edge(service=service, options=options)
def create_google():
    """Build a Chrome WebDriver with the automation hints switched off.

    The chromedriver and Chrome binary locations are hard-coded Windows paths.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    # Local import: the module-level ``Service`` is the Edge one, which is the
    # wrong service class for chromedriver.
    from selenium.webdriver.chrome.service import Service as ChromeService

    driver_path = r'D:\cmd100\chromedriver.exe'
    chrome_bin = r'D:\Google\Chrome\Application\chrome.exe'

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Drop the "enable-automation" switch from the browser command line.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Turn off the Blink "AutomationControlled" runtime feature.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.binary_location = chrome_bin

    # Selenium 4 expects ``options=``; the legacy ``chrome_options=`` keyword is
    # deprecated and rejected by newer 4.x releases when combined with ``service=``.
    driver = webdriver.Chrome(service=ChromeService(driver_path), options=chrome_options)
    return driver
def login():
    """Sign in through the Dow Jones SSO page and collect the session cookies.

    Returns:
        tuple: ``(cookies, driver)`` where ``cookies`` is the list returned by
        ``driver.get_cookies()`` and ``driver`` is the still-open Chrome
        WebDriver. The caller owns the driver and must quit it.

    Raises:
        Whatever Selenium raises while driving the form (e.g. an element that
        cannot be located). The browser is closed before the error propagates.
    """
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to configuration or environment variables.
    un = 'zhk2058@163.com'
    pw = 'ZZM205899'
    driver = create_google()
    try:
        driver.get(
            "https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=zh-tw-x-cwsj-27-2&nonce=beaaad3a-6919-4893-8198-c3769d6d54af&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU&resource=https%253A%252F%252Fcn.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin")
        time.sleep(5)  # fixed pause before touching the username form
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        # Second button of the basic-login form submits the username step.
        driver.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]').click()
        time.sleep(3)  # fixed pause before the password step
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
        time.sleep(3)  # fixed pause before reading cookies
        cookies = driver.get_cookies()
    except BaseException:
        # The caller never receives the driver on failure, so close the
        # browser here instead of leaking it, then re-raise unchanged.
        driver.quit()
        raise
    return cookies, driver
def parser_content(href, driver):
    """Open an article page and pull out its ``div.article-content`` element.

    Args:
        href: URL of the article to load.
        driver: Selenium WebDriver used to fetch the page.

    Returns:
        tuple: ``(text, element)`` with the element's text and the parsed
        element itself, or ``(None, None)`` when the page has no
        ``div.article-content``.
    """
    driver.get(href)
    time.sleep(2)
    page = BeautifulSoup(driver.page_source, 'html.parser')
    article = page.find('div', class_='article-content')

    if article is not None:
        return article.text, article

    # Article body missing: refresh once, pause, log, and report failure to
    # the caller. No further attempt is made.
    driver.refresh()
    time.sleep(3)
    log.info('封号')
    return None, None
def getData(key):
    """Forward news records stored in Redis to Kafka, skipping known URLs.

    Every Redis hash whose key starts with ``key`` is read through the
    module-level client ``r``. Each hash must contain the fields ``newsUrl``,
    ``publishDate``, ``title`` and ``summary``. A record whose URL is already
    in the dedup set (held by the module-level client ``r_2``) is skipped;
    otherwise it is sent to the ``crawlerInfo`` topic and its URL is added to
    the set.

    Args:
        key: Prefix of the Redis keys to scan.

    Returns:
        ``True`` once the scan has finished.
    """
    sid = '1775455062911447042'
    info_code = "IN-20240403-0041"  # also the name of the dedup set in r_2
    producer = None  # created on first use, shared by all records, closed below
    try:
        # Distinct loop name so the ``key`` parameter is not shadowed.
        for redis_key in r.scan_iter(f"{key}*"):
            fields = r.hgetall(redis_key)
            decode_fields = {k.decode(): v.decode() for k, v in fields.items()}
            newsUrl = decode_fields['newsUrl']

            # Skip URLs that were already collected and stored.
            try:
                if r_2.sismember(info_code, newsUrl):
                    log.info('信息已采集入库过')
                    continue
            except Exception as e:
                # Still best-effort (the record is skipped), but the failure
                # is now logged instead of being dropped silently.
                log.info(e)
                continue

            publishDate = decode_fields['publishDate']
            title = decode_fields['title']
            summary = decode_fields['summary']
            # The article body is not fetched here, so the content fields
            # are sent empty.
            dic_news = {
                'content': '',
                'contentWithTag': '',
                'id': '',
                'summary': summary,
                'origin': '华尔街日报中文网-科技',
                'publishDate': publishDate,
                'sid': sid,
                'sourceAddress': newsUrl,
                'title': title,
                'source': '16',
                'type': ''
            }

            # Ship the record through Kafka; a failure is logged and the
            # loop moves on to the next record.
            try:
                if producer is None:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("crawlerInfo",
                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
                log.info(kafka_result.get(timeout=10))
                # NOTE(review): 'ture' looks like a typo for 'true'; left as-is
                # because this dict is only written to the log.
                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                # Remember the URL only after Kafka acknowledged the send.
                r_2.sadd(info_code, newsUrl)
            except Exception as e:
                log.info(e)
                log.info(f'传输失败:{dic_news["title"]}、{dic_news["publishDate"]}')
    finally:
        # Release the producer's connections; previously one producer per
        # record was created and never closed.
        if producer is not None:
            producer.close()
    return True
if __name__ == '__main__':
    # Both clients talk to the same Redis server and differ only in the
    # database index. getData reads them as module-level globals, so the
    # names ``r`` and ``r_2`` must stay as they are.
    redis_conn = dict(host='114.116.90.53', port=6380, password='clbzzsn')
    r = redis.Redis(db=6, **redis_conn)
    r_2 = redis.Redis(db=5, **redis_conn)
    key = 'WSJ:NewsInfo'

    # NOTE: a disabled experiment used to sit here (log in, load cookies from
    # wsj_cookie.txt, add them to an Edge driver and open an article URL).
    # It was entirely commented out and is not part of the running flow.

    poll_interval = 60 * 60 * 1  # seconds between two passes over Redis
    while True:
        getData(key)
        time.sleep(poll_interval)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
import time, json
def create_driver():
    """Build an Edge WebDriver that goes through the local proxy.

    The Edge driver executable path and the proxy address are hard-coded.

    Returns:
        The ``webdriver.Edge`` instance that was started.
    """
    proxies = {
        'https': 'https://127.0.0.1:1080',
        'http': 'http://127.0.0.1:1080',
    }
    service = Service(r'D:\soft\msedgedriver.exe')

    options = Options()
    # Drop the "enable-automation" switch from the browser command line.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Turn off the Blink "AutomationControlled" runtime feature.
    options.add_argument('--disable-blink-features=AutomationControlled')
    # Only the plain-HTTP proxy entry is actually handed to the browser.
    options.add_argument(f"--proxy-server={proxies['http']}")
    # Disabled prefs (presumably meant to block images/stylesheets -- TODO confirm):
    # prefs = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
    # options.add_experimental_option("prefs", prefs)

    return webdriver.Edge(service=service, options=options)
def login():
    """Sign in through the Dow Jones SSO page and collect the session cookies.

    Returns:
        tuple: ``(cookie, driver)`` where ``cookie`` is the list returned by
        ``driver.get_cookies()`` and ``driver`` is the still-open Edge
        WebDriver. The caller owns the driver and must quit it.

    Raises:
        Whatever Selenium raises while driving the form (e.g. an element that
        cannot be located). The browser is closed before the error propagates.
    """
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to configuration or environment variables.
    un = 'zhk2058@163.com'
    pw = 'ZZM205899'
    driver = create_driver()
    try:
        driver.get(
            "https://sso.accounts.dowjones.com/login-page?client_id=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO&redirect_uri=https%3A%2F%2Fcn.wsj.com%2Fclient%2Fauth&response_type=code&scope=openid%20idp_id%20roles%20tags%20email%20given_name%20family_name%20uuid%20djid%20djUsername%20djStatus%20trackid%20prts%20updated_at%20created_at%20offline_access&ui_locales=zh-tw-x-cwsj-27-2&nonce=beaaad3a-6919-4893-8198-c3769d6d54af&state=73NKOEQds-P9ZH7w.ie3C279-7mV69dSbgfC_fu7R0sZqMkGovzhN3NJbUfU&resource=https%253A%252F%252Fcn.wsj.com%252F&protocol=oauth2&client=5hssEAdMy0mJTICnJNvC9TXEw3Va7jfO#!/signin")
        time.sleep(5)  # fixed pause before touching the username form
        driver.find_element(By.XPATH, "//div/input[@name = 'username']").send_keys(un)
        # Second button of the basic-login form submits the username step.
        driver.find_element(By.XPATH, '//*[@id="basic-login"]/div[1]/form/div[5]/div[1]/button[2]').click()
        time.sleep(3)  # fixed pause before the password step
        driver.find_element(By.ID, "password-login-password").send_keys(pw)
        driver.find_element(By.XPATH, '//*[@id="password-login"]/div/form/div[5]/button').click()
        time.sleep(3)  # fixed pause before reading cookies
        cookie = driver.get_cookies()
    except BaseException:
        # The caller never receives the driver on failure, so close the
        # browser here instead of leaking it, then re-raise unchanged.
        driver.quit()
        raise
    return cookie, driver
if __name__ == '__main__':
    # Log in once, flatten the Selenium cookie list into a name -> value
    # mapping, and save it as JSON in wsj_cookie.txt.
    cookie, driver = login()
    try:
        cookies = {item['name']: item['value'] for item in cookie}
        with open("wsj_cookie.txt", "w", encoding="utf-8") as f:
            json.dump(cookies, f)
    finally:
        # The browser is no longer needed once the cookies are captured;
        # previously it was left running after the script finished.
        driver.quit()
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论