提交 255f8c19 作者: XveLingKun

中央集团企业名单

上级 510f029c
# Read the rows of the Excel sheet and convert them into a list.
import re
import time
import pandas as pd
import pymongo
import requests
# Module-level configuration for the crawler.
#
# MongoDB handle created when the module is loaded: the '中国外汇交易中心'
# collection of the ZZSN database; crawled records are inserted into it.
# NOTE(review): the server address and credentials are hard-coded in source —
# consider loading them from configuration or environment variables.
db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'中国外汇交易中心']
# Excel workbook and worksheet that list the companies to crawl.
file_path = "data/20240903_YJZX_中央集团企业名单(国资委官网).xlsx"
sheet_name = "中央集团企业名单_97家"
# Query URL template; the two {} placeholders are filled, in order, with the
# company name (orgName) and the page number (pageNo).
url = "https://www.chinamoney.com.cn/ags/ms/cm-u-notice-issue/financeRepo?year=&type=&orgName={}&pageSize=30&pageNo={}&inextp=3%2C5&limit=1"
# Site root that is prepended to each record's relative 'draftPath'.
url_domian = "https://www.chinamoney.com.cn"
# HTTP headers sent with every query request.
# NOTE(review): the Cookie looks like a captured browser session and
# presumably expires — confirm whether it must be refreshed before a run.
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection': 'keep-alive',
'Cookie': 'apache=bbfde8c184f3e1c6074ffab28a313c87; ags=b168c5dd63e5c0bebdd4fb78b2b4704a; lss=f7cb2cf4b1607aec30e411e90d47c685; _ulta_id.CM-Prod.e9dc=0d0cc17a407b843d; AlteonP10=AnbeKSw/F6wwussN2RunXw$$; _ulta_ses.CM-Prod.e9dc=f58a710b199989dd; isLogin=0',
'Host': 'www.chinamoney.com.cn',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
def getcomlist(file_path, sheet_name):
    """Load one worksheet of an Excel workbook as a list of rows.

    Args:
        file_path: Path of the Excel workbook to read.
        sheet_name: Name of the worksheet to load.

    Returns:
        A list containing one inner list of cell values per data row.
    """
    frame = pd.read_excel(file_path, sheet_name=sheet_name)
    return frame.values.tolist()
def getrequest(href, headers, timeout=30):
    """Send a GET request to *href* and return the decoded JSON body.

    Args:
        href: Full URL to request.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The parsed JSON payload when the response status is 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    req = requests.get(url=href, headers=headers, timeout=timeout)
    if req.status_code != 200:
        # Make the "no data" result explicit so callers can test for it.
        return None
    return req.json()
def classify_report_type(title):
    """Map a report title to its report category.

    The checks run in priority order: annual, half-year, first/second/third
    quarter, audit-related, and finally a catch-all category.

    Args:
        title: Title text of the published report.

    Returns:
        One of "年度报告", "半年度报告", "一季度报告", "二季度报告",
        "三季度报告", "未审计年报", "审计年报" or "其他相关报告".
    """
    # re.search (not re.match) is required: titles normally begin with the
    # company name, so the "<year>年度报告" pattern appears mid-string.
    if "年年度报告" in title or re.search(r'\d{4}年度报告', title):
        return "年度报告"
    if "半年" in title:
        return "半年度报告"
    # "一季" also covers every title containing "一季度".
    if "一季" in title:
        return "一季度报告"
    if "二季度" in title:
        return "二季度报告"
    if "三季度" in title:
        return "三季度报告"
    if "审计" in title:
        # A "未" anywhere in the title marks the report as unaudited.
        return "未审计年报" if "未" in title else "审计年报"
    return "其他相关报告"
def parase_year(title):
    """Extract the first run of four consecutive digits from *title*.

    Args:
        title: Title text of the published report.

    Returns:
        The four-digit string (taken as the report year), or ``None`` when
        the title contains no such run of digits.
    """
    found = re.search(r'\d{4}', title)
    return found.group() if found else None
def parase(com_name, social_code, dataJson):
    """Store every report record of one result page in MongoDB.

    For each entry of ``dataJson['records']`` a document is built from the
    record's title, release date, page link and download link, together with
    the derived report category and year, and inserted into ``db_storage2``.

    Args:
        com_name: Company name written to the "企业名称" field.
        social_code: Company credit code written to the "企业信用代码" field.
        dataJson: Decoded JSON payload of one result page, or ``None`` when
            the request failed.

    Returns:
        None.
    """
    # A failed request yields no payload; there is nothing to store then.
    if not dataJson:
        return
    for record in dataJson.get('records') or []:
        title = record['title']
        contentId = record['contentId']
        dic_info = {
            "企业信用代码": social_code,
            "企业名称": com_name,
            "标题": title,
            "发布时间": record['releaseDate'],
            # draftPath is site-relative, e.g. /chinese/cwbg/20240830/2953916.html
            "网址链接": url_domian + record['draftPath'],
            "报告链接": f"https://www.chinamoney.com.cn/dqs/cm-s-notice-query/fileDownLoad.do?mode=open&contentId={contentId}&priority=0",
            "报告类别": classify_report_type(title),
            "报告年份": parase_year(title)
        }
        db_storage2.insert_one(dic_info)
        # Pause after each insert, as the original pacing did.
        time.sleep(1)
if __name__ == "__main__":
    # Crawl the report list of every company in the Excel sheet, page by page.
    dataList = getcomlist(file_path, sheet_name)
    # print(dataList)
    # Iterate over the rows themselves: enumerate() would yield (index, row)
    # pairs, making item[2] raise IndexError on the first iteration.
    for item in dataList:
        # Assumes column 1 holds the credit code and column 2 the company
        # name — TODO confirm against the sheet layout.
        social_code = item[1]
        com_name = item[2]
        print(f"正在采集:{com_name}")
        href = url.format(com_name, 1)
        dataJson = getrequest(href, headers)
        # print(dataJson)
        if not dataJson:
            # The first page failed to load; skip this company instead of
            # crashing on the missing payload.
            print(f"采集失败,跳过:{com_name}")
            continue
        total_page = int(dataJson['data']['pageTotalSize'])
        for page in range(1, total_page + 1):
            if page == 1:
                # The first page was already fetched to learn the page count.
                dataJson_page = dataJson
            else:
                href_page = url.format(com_name, page)
                dataJson_page = getrequest(href_page, headers)
            if dataJson_page:
                parase(com_name, social_code, dataJson_page)
            # Throttle between page requests.
            time.sleep(2)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论