中央集团企业名单

255f8c19 · XveLingKun · 510f029c · 255f8c19 · 255f8c19
--- a/中国外汇交易中心/data/20240903_YJZX_中央集团企业名单（国资委官网）.xlsx
+++ b/中国外汇交易中心/data/20240903_YJZX_中央集团企业名单（国资委官网）.xlsx
--- a/中国外汇交易中心/spider_main.py
+++ b/中国外汇交易中心/spider_main.py
+# Read the data from the spreadsheet and convert it into a list.
+import re
+import time
+
+import pandas as pd
+import pymongo
+import requests
+
+# MongoDB collection that receives one document per scraped announcement.
+# NOTE(review): the server address and admin credentials are hard-coded in
+# source; consider loading them from configuration or the environment.
+db_storage2 = pymongo.MongoClient(
+    'mongodb://114.115.221.202:27017/',
+    username='admin',
+    password='ZZsn@9988',
+).ZZSN['中国外汇交易中心']
+
+# Workbook and worksheet that hold the company list.
+file_path = "data/20240903_YJZX_中央集团企业名单（国资委官网）.xlsx"
+sheet_name = "中央集团企业名单_97家"
+
+# Listing endpoint; the two `{}` slots are filled via str.format with the
+# company name and the page number.
+url = (
+    "https://www.chinamoney.com.cn/ags/ms/cm-u-notice-issue/financeRepo"
+    "?year=&type=&orgName={}&pageSize=30&pageNo={}&inextp=3%2C5&limit=1"
+)
+# Site root that relative article paths are appended to.
+url_domian = "https://www.chinamoney.com.cn"
+
+# Browser-like headers sent with every request to the listing endpoint.
+# NOTE(review): the Cookie value is a captured session and may expire.
+headers = {
+    'Accept': 'application/json, text/javascript, */*; q=0.01',
+    'Accept-Encoding': 'gzip, deflate, br, zstd',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+    'Connection': 'keep-alive',
+    'Cookie': 'apache=bbfde8c184f3e1c6074ffab28a313c87; ags=b168c5dd63e5c0bebdd4fb78b2b4704a; lss=f7cb2cf4b1607aec30e411e90d47c685; _ulta_id.CM-Prod.e9dc=0d0cc17a407b843d; AlteonP10=AnbeKSw/F6wwussN2RunXw$$; _ulta_ses.CM-Prod.e9dc=f58a710b199989dd; isLogin=0',
+    'Host': 'www.chinamoney.com.cn',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'same-origin',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
+    'X-Requested-With': 'XMLHttpRequest',
+    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+}
+
+
+def getcomlist(file_path, sheet_name):
+    """Load one worksheet and return its rows as a list of lists.
+
+    Args:
+        file_path: Path of the Excel workbook to open.
+        sheet_name: Name of the worksheet to read.
+
+    Returns:
+        The worksheet's cell values, one inner list per row of the
+        DataFrame that ``pandas.read_excel`` produces.
+    """
+    return pd.read_excel(file_path, sheet_name=sheet_name).values.tolist()
+
+
+def getrequest(href, headers, timeout=30):
+    """Fetch ``href`` with a GET request and return the decoded JSON body.
+
+    Args:
+        href: Full URL to request.
+        headers: Mapping of HTTP headers sent with the request.
+        timeout: Seconds to wait for the server before ``requests`` raises
+            a timeout error. Without a timeout a stalled connection would
+            block the whole crawl indefinitely.
+
+    Returns:
+        The parsed JSON payload when the server answers with status 200,
+        otherwise ``None``.
+    """
+    req = requests.get(url=href, headers=headers, timeout=timeout)
+    if req.status_code != 200:
+        # Made explicit so callers can see they must handle a failed request.
+        return None
+    return req.json()
+
+
+def classify_report_type(title):
+    """Map an announcement title to a report-category label.
+
+    The checks run in order and the first match wins: annual, half-year,
+    first/second/third quarter, audited / unaudited, then a catch-all.
+
+    Args:
+        title: Announcement title text.
+
+    Returns:
+        One of "年度报告", "半年度报告", "一季度报告", "二季度报告",
+        "三季度报告", "未审计年报", "审计年报" or "其他相关报告".
+    """
+    # re.search, not re.match: the "<year>年度报告" phrase normally follows
+    # the company name, so anchoring at the start of the title misses it.
+    if "年年度报告" in title or re.search(r'\d{4}年度报告', title):
+        return "年度报告"
+    if "半年" in title:
+        return "半年度报告"
+    # "一季" is a substring of "一季度", so one test covers both spellings.
+    if "一季" in title:
+        return "一季度报告"
+    if "二季度" in title:
+        return "二季度报告"
+    if "三季度" in title:
+        return "三季度报告"
+    if "审计" in title:
+        # A "未" anywhere in the title marks the report as unaudited.
+        return "未审计年报" if "未" in title else "审计年报"
+    return "其他相关报告"
+
+def parase_year(title):
+    """Return the first run of four consecutive digits in ``title``.
+
+    Args:
+        title: Announcement title text.
+
+    Returns:
+        The matched four-digit string, or ``None`` when the title contains
+        no such run.
+    """
+    found = re.search(r'\d{4}', title)
+    return found.group() if found else None
+
+def parase(com_name, social_code, dataJson):
+    """Store every announcement listed on one result page in MongoDB.
+
+    Args:
+        com_name: Company name written to each stored document.
+        social_code: Company credit code written to each stored document.
+        dataJson: Decoded JSON of one result page; its ``records`` entry is
+            read as a list of announcement dicts. May be ``None`` or lack
+            ``records``, in which case nothing is stored.
+
+    Returns:
+        None. One document per record is inserted into ``db_storage2``.
+    """
+    # getrequest() returns None for a non-200 response; treat that, or a
+    # payload without records, as an empty page instead of crashing.
+    if not dataJson:
+        return
+    for record in dataJson.get('records') or []:
+        # draftPath is site-relative, e.g. /chinese/cwbg/20240830/2953916.html
+        draft_path = url_domian + record['draftPath']
+        release_date = record['releaseDate']
+        title = record['title']
+        report_type = classify_report_type(title)
+        year = parase_year(title)
+        content_id = record['contentId']
+        pdf_path = f"https://www.chinamoney.com.cn/dqs/cm-s-notice-query/fileDownLoad.do?mode=open&contentId={content_id}&priority=0"
+
+        dic_info = {
+            "企业信用代码": social_code,
+            "企业名称": com_name,
+            "标题": title,
+            "发布时间": release_date,
+            "网址链接": draft_path,
+            "报告链接": pdf_path,
+            "报告类别": report_type,
+            "报告年份": year,
+        }
+        db_storage2.insert_one(dic_info)
+        # One-second pause after every insert, kept from the original code.
+        time.sleep(1)
+
+
+if __name__ == "__main__":
+    # Crawl every company in the worksheet: fetch page 1 to learn the page
+    # count, then store each page of announcements.
+    dataList = getcomlist(file_path, sheet_name)
+    # Each row is a list of cell values; columns 1 and 2 are read as the
+    # credit code and the company name. Iterate the rows directly: wrapping
+    # them in enumerate() yields (index, row) pairs, which have no index 2.
+    for row in dataList:
+        social_code = row[1]
+        com_name = row[2]
+        print(f"正在采集:{com_name}")
+        dataJson = getrequest(url.format(com_name, 1), headers)
+        if not dataJson:
+            # getrequest() returns None on a non-200 response.
+            print(f"请求失败，跳过:{com_name}")
+            continue
+        total_page = int(dataJson['data']['pageTotalSize'])
+        for page in range(1, total_page + 1):
+            if page == 1:
+                # Page 1 was already fetched to obtain the page count.
+                dataJson_page = dataJson
+            else:
+                dataJson_page = getrequest(url.format(com_name, page), headers)
+            if dataJson_page:
+                parase(com_name, social_code, dataJson_page)
+            else:
+                print(f"请求失败，跳过:{com_name} 第{page}页")
+            # Two-second pause between pages, kept from the original code.
+            time.sleep(2)
+