提交 510f029c 作者: XveLingKun

国务院问答对处理

上级 0b43a864
# coding=utf-8
# coding=utf-8
import time
import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from typing import Tuple
# Module-level handle to the MongoDB collection that receives the scraped
# question/answer pairs (database "ZZSN", collection "国务院问答对").
# NOTE(review): the host and credentials are hard-coded in source; consider
# loading them from configuration or environment variables instead.
db_storage = pymongo.MongoClient(
    host='mongodb://114.115.221.202:27017/',
    username='admin',
    password='ZZsn@9988',
)['ZZSN']['国务院问答对']
def get_html(url: str) -> str:
    """Fetch the rendered HTML of ``url`` with a headless Chrome browser.

    Args:
        url: Address of the page to load.

    Returns:
        The browser's page source after loading, or an empty string when
        the browser could not be started or the page failed to load.
    """
    chrome_options = Options()
    chrome_options.add_argument("--mute-audio")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'

    # Bind these before the try block: if webdriver.Chrome() itself raises,
    # the cleanup in ``finally`` must not fail on an unbound name, and the
    # function must still return the documented empty string.
    browser = None
    text = ""
    try:
        browser = webdriver.Chrome(
            # Linux alternative: executable_path="./chromedriver-linux64/chromedriver"
            executable_path=r"D:\cmd100\chromedriver.exe",
            options=chrome_options,
        )
        # Maximum time allowed for page loading and script execution.
        browser.set_page_load_timeout(30)
        browser.set_script_timeout(30)
        browser.get(url)
        # Presumably gives client-side scripts time to render the content
        # before the page source is captured.
        time.sleep(2)
        text = browser.page_source
    except Exception as e:
        print(f"Error: {e}")
        text = ""
    finally:
        # Only quit a browser that was actually started.
        if browser is not None:
            browser.quit()
    return text
def parse_html(html: str) -> Tuple[str, str]:
    """Extract the question and its answer from a detail page.

    The question is the text of the first ``<h2>`` element. The answer is
    the text of every non-empty ``<p>`` inside the first
    ``<div class="detail-content">``, joined with newlines.

    Args:
        html: Page markup to parse.

    Returns:
        A ``(question, answer)`` tuple of whitespace-stripped strings.

    Raises:
        AttributeError: If the ``<h2>`` or the ``detail-content`` div is
            not present in ``html`` (``find`` returns ``None``).
    """
    document = BeautifulSoup(html, "html.parser")

    # Extract the question.
    heading = document.find("h2")
    question = heading.get_text(strip=True)

    # Extract the answer, skipping paragraphs that are empty after stripping.
    content_div = document.find("div", class_="detail-content")
    answer_lines = []
    for paragraph in content_div.find_all("p"):
        stripped = paragraph.get_text(strip=True)
        if stripped:
            answer_lines.append(stripped)

    return question, "\n".join(answer_lines)
if __name__ == "__main__":
    # Read the page links from the "Url" column of the spreadsheet.
    df = pd.read_excel("国务院政府问答对.xlsx", sheet_name="Sheet1")
    urls = df["Url"].tolist()
    for url in urls:
        try:
            # Example link: http://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id=30361
            print(f'当前处理链接:{url}')
            html = get_html(url)
            question, answer = parse_html(html)
            print(question)
            # Store the question/answer pair in MongoDB.
            dic = {
                "问题": question,
                "答案": answer,
            }
            db_storage.insert_one(dic)
        except Exception as e:
            # Catch Exception rather than using a bare ``except`` so that
            # Ctrl-C (KeyboardInterrupt) and SystemExit can still stop the
            # batch; report the cause and move on to the next link.
            print("--------------处理失败---------------")
            print(f"Error: {e}")
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论