国务院问答对处理

510f029c · XveLingKun · 0b43a864 · 510f029c
--- a/国务院问答对处理/get_html.py
+++ b/国务院问答对处理/get_html.py
+# coding=utf-8
+
+import time
+import pandas as pd
+import pymongo
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from typing import Tuple
+db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
+    '国务院问答对']
+
+def get_html(url: str) -> str:
+    chrome_options = Options()
+
+    chrome_options.add_argument("--mute-audio")
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+    """
+    opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+        chromedriver = r'D:\cmd100\chromedriver.exe'
+    """
+    try:
+        browser = webdriver.Chrome(
+            # executable_path="./chromedriver-linux64/chromedriver",
+            executable_path=r"D:\cmd100\chromedriver.exe",
+            options=chrome_options
+        )
+
+        # max loading time
+        browser.set_page_load_timeout(30)
+        browser.set_script_timeout(30)
+
+        browser.get(url)
+        time.sleep(2)
+
+        text = browser.page_source
+    except Exception as e:
+        print(f"Error: {e}")
+        text = ""
+    finally:
+        browser.quit()
+
+    return text
+
+
+def parse_html(html: str) -> Tuple[str, str]:
+    soup = BeautifulSoup(html, "html.parser")
+
+    # 提取问题
+    question = soup.find("h2").get_text(strip=True)
+
+    # 提取答案
+    answer_paragraphs = soup.find("div", class_="detail-content").find_all("p")
+    answer = "\n".join([
+        p.get_text(strip=True)
+        for p in answer_paragraphs if p.get_text(strip=True)
+    ])
+
+    return question, answer
+
+
+if __name__ == "__main__":
+    # 通过读取表格获取链接
+    df = pd.read_excel("国务院政府问答对.xlsx", sheet_name="Sheet1")
+    urls = df["Url"].tolist()
+    for url in urls:
+        try:
+            # url = "http://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id=30361"
+            print(f'当前处理链接：{url}')
+            html = get_html(url)
+            question, answer = parse_html(html)
+            print(question)
+            # print(answer)
+            # 将问答对存储到mongo中
+            dic = {
+                "问题": question,
+                "答案": answer,
+            }
+            db_storage.insert_one(dic)
+        except:
+            print("--------------处理失败---------------")
+            continue