提交 0b43a864 作者: XveLingKun

中央全面深化改革会议

上级 fb623647
...@@ -10,7 +10,7 @@ from datetime import datetime ...@@ -10,7 +10,7 @@ from datetime import datetime
from kafka import KafkaProducer from kafka import KafkaProducer
sys.path.append('D:\\kkwork\\zzsn_spider\\base') sys.path.append(r'D:\PycharmProjects\zzsn\base')
import BaseCore import BaseCore
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
...@@ -51,6 +51,26 @@ headers = { ...@@ -51,6 +51,26 @@ headers = {
} }
if __name__ == "__main__": if __name__ == "__main__":
from selenium import webdriver
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
opt = webdriver.ChromeOptions()
opt.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
# opt.binary_location = r'F:\spider\Google\Chrome\Application\chrome.exe'
# chromedriver = r'F:\spider\cmd100\chromedriver.exe'
opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
# 中央全面深化改革委员会会议 # 中央全面深化改革委员会会议
r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5) r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
# 中央全面深化改革领导小组会议 # 中央全面深化改革领导小组会议
...@@ -86,14 +106,21 @@ if __name__ == "__main__": ...@@ -86,14 +106,21 @@ if __name__ == "__main__":
summary = ul.find('a').text summary = ul.find('a').text
# todo: 链接判重 # todo: 链接判重
try: try:
flag = r.sismember(info_code, newsUrl) flag_ = r.sismember(info_code, newsUrl)
if flag: if flag_:
log.info('信息已采集入库过') log.info('信息已采集入库过')
continue continue
except Exception as e: except Exception as e:
continue continue
news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False) news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
news_soup = BeautifulSoup(news_request.content, 'html.parser') if news_request == 200:
news_soup = BeautifulSoup(news_request.content, 'html.parser')
else:
browser.get(newsUrl)
page_source = browser.page_source
# print(page_source)
news_soup = BeautifulSoup(page_source, 'html.parser')
# print(news_soup) # print(news_soup)
try: try:
title = news_soup.find('h1', class_='big_title').text title = news_soup.find('h1', class_='big_title').text
...@@ -128,10 +155,10 @@ if __name__ == "__main__": ...@@ -128,10 +155,10 @@ if __name__ == "__main__":
kafka_result = producer.send("research_center_fourth", kafka_result = producer.send("research_center_fourth",
json.dumps(dic_info, ensure_ascii=False).encode('utf8')) json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', sourceAddress) # r.sadd(info_code + '-test', sourceAddress)
print('发送kafka结束') log.info('发送kafka结束')
except Exception as e: except Exception as e:
print(e) log.info(e)
print('发送kafka异常!') log.info('发送kafka异常!')
finally: finally:
producer.close() producer.close()
flag += 1 flag += 1
\ No newline at end of file
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论