提交 0b43a864 作者: XveLingKun

中央全面深化改革会议

上级 fb623647
......@@ -10,7 +10,7 @@ from datetime import datetime
from kafka import KafkaProducer
# Extend the import search path with hard-coded Windows directories before
# importing BaseCore (presumably the project's shared `base` package lives
# in one of them — confirm).
# NOTE(review): the two appends look like the before/after variants of a
# single line in this diff view; confirm which one the real file keeps.
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
sys.path.append(r'D:\PycharmProjects\zzsn\base')
import BaseCore
# Shared BaseCore instance and the logger obtained from it; `log` is used
# for the script's log output further down.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
......@@ -51,6 +51,26 @@ headers = {
}
if __name__ == "__main__":
# Selenium is imported here, inside the script entry point, so it is only
# needed when the file is run directly.
from selenium import webdriver

# Request headers passed to the `requests.get` calls later in the script.
# NOTE(review): this rebinds the `headers` name that also appears to be
# defined just above the entry point — confirm the override is intended.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}

# Chrome options for the Selenium-driven browser that is used as a fallback
# when the plain HTTP fetch does not succeed.
opt = webdriver.ChromeOptions()
opt.add_argument(
    'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
# Experimental options are stored one value per option name, so both
# switches must be passed in a single list. Two separate calls with the
# "excludeSwitches" key leave only the last one in effect, which silently
# dropped "enable-automation".
opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
opt.add_experimental_option('useAutomationExtension', False)
# opt.binary_location = r'F:\spider\Google\Chrome\Application\chrome.exe'
# chromedriver = r'F:\spider\cmd100\chromedriver.exe'
opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe'
# NOTE(review): the `chrome_options=` and `executable_path=` keywords are
# deprecated in Selenium 4 and removed in later 4.x releases. They are kept
# unchanged because the installed Selenium version is not visible here.
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
# Meetings of the Central Commission for Comprehensively Deepening Reform
# NOTE(review): the Redis host, port and password are hard-coded in source;
# consider loading them from configuration or the environment.
r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
# Meetings of the Central Leading Group for Comprehensively Deepening Reform
......@@ -86,14 +106,21 @@ if __name__ == "__main__":
# Summary text is taken from the first <a> element inside `ul`.
summary = ul.find('a').text
# TODO: de-duplicate links
# Skip this item when its URL is already a member of the Redis set keyed by
# `info_code`. Any exception raised by the lookup also skips the item, with
# nothing logged.
try:
# NOTE(review): the `flag` and `flag_` pairs below look like the before and
# after versions of the same check (the +/- markers are lost in this diff
# view). `flag` is also incremented as a counter further down, which would
# explain the rename to `flag_` — confirm against the real file.
flag = r.sismember(info_code, newsUrl)
if flag:
flag_ = r.sismember(info_code, newsUrl)
if flag_:
log.info('信息已采集入库过')
continue
except Exception as e:
continue
# Try a plain HTTP fetch first. Redirects are not followed, so a 3xx answer
# is handled by the browser fallback below.
news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
# Compare the HTTP status code, not the Response object itself: a Response
# never equals the integer 200, so the original condition was always False
# and every page went through the browser.
if news_request.status_code == 200:
    news_soup = BeautifulSoup(news_request.content, 'html.parser')
else:
    # Any other status: load the page in the Selenium browser and parse the
    # page source it reports.
    browser.get(newsUrl)
    page_source = browser.page_source
    news_soup = BeautifulSoup(page_source, 'html.parser')
try:
title = news_soup.find('h1', class_='big_title').text
......@@ -128,10 +155,10 @@ if __name__ == "__main__":
# Send `dic_info` to the "research_center_fourth" Kafka topic, serialized as
# JSON with non-ASCII characters kept as-is and encoded to UTF-8 bytes.
# The `try:` that the except/finally below belong to opens above this view.
kafka_result = producer.send("research_center_fourth",
json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', sourceAddress)
# NOTE(review): each print(...) / log.info(...) pair below looks like the
# before and after versions of the same statement (the +/- markers are lost
# in this diff view) — confirm which one the real file keeps.
print('发送kafka结束')
log.info('发送kafka结束')
except Exception as e:
# Failures are reported but not re-raised.
print(e)
print('发送kafka异常!')
log.info(e)
log.info('发送kafka异常!')
finally:
# The producer is closed whether or not the send succeeded.
producer.close()
# Counter increment; its nesting level cannot be determined from this view.
flag += 1
\ No newline at end of file
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请注册或者登录后发表评论