Commit dd9d719d by 薛凌堃

10/8

Parent 222110f7
@@ -421,6 +421,7 @@ def NQEnterprise():
     nq_social_list = [item[0] for item in nq_result]
     for item in nq_social_list:
+        # NEEQ companies: financial data, listing info, and key personnel are already collected; company news and announcements are not yet collected; the announcement script is ready, but company news must be pushed into Redis daily
        # r.rpush('NQEnterprise:nq_Ipo', item)
        r.rpush('NQEnterprise:nq_finance',item)
        # r.rpush('NQEnterprise:nq_notice',item)
@@ -451,11 +452,26 @@ def omeng():

 #单项冠军 (single-item champion enterprises)
 def danxiangguanjun():
-    pass
+    cnx, cursor = connectSql()
+    query = "SELECT CompanyName FROM champion"
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cnx.commit()
+    com_namelist = [item[0] for item in result]
+    for item in com_namelist:
+        r.rpush('champion:baseinfo',item)

 #科改示范 (sci-tech reform demonstration enterprises)
 def kegaishifan():
-    pass
+    cnx, cursor = connectSql()
+    query = "SELECT CompanyName FROM technological"
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cnx.commit()
+    com_namelist = [item[0] for item in result]
+    for item in com_namelist:
+        r.rpush('technological:baseinfo',item)

 #双百企业 ("Double Hundred" enterprises)
 def shuangbaiqiye():
@@ -467,6 +483,8 @@ def zhuangjingtexind():

 if __name__ == "__main__":
     start = time.time()
+    # danxiangguanjun()
+    kegaishifan()
     # NoticeEnterprise()
     # AnnualEnterpriseIPO()
     # AnnualEnterprise()
@@ -477,7 +495,7 @@ if __name__ == "__main__":
     # FBS()
     # MengZhi()
     # NQEnterprise()
-    SEC_CIK()
+    # SEC_CIK()
     # omeng()
     # AnnualEnterpriseUS()
     # NoticeEnterprise_task()
"""
解析json数据 两个链接:
https://data.sec.gov/api/xbrl/companyfacts/CIK0000320193.json 数据值和gaap字段
https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/MetaLinks.json html字段和gaap字段映射
step1:拼接链接
step2:
"""
import json
import time
import requests
from kafka import KafkaProducer
from operator import itemgetter
from itertools import groupby
from base.BaseCore import BaseCore
# import urllib3
# urllib3.disable_warnings()
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
def fromcikgetinfo(cik):
    # parameterized query, so the cik value is not interpolated into the SQL
    query = "select * from mgzqyjwyh_list where cik=%s"
    cursor.execute(query, (cik,))
    data = cursor.fetchone()
    return data
def getRequest(url):
headers = {
'Host': 'data.sec.gov',
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': '_ga=GA1.2.784424676.1695174651; _4c_=%7B%22_4c_s_%22%3A%22lZFLT4QwFIX%2FyqRrILS0pbAzmBgXajQ%2BlhNpLwOZcUoKDo4T%2Fru3gMbHym5ov55zcjk9kaGGPcmpzARNuVRcxElAtnDsSH4irjH%2BcyA50awsDTUq1ElShZwZCMuKmbASSQUUKsYoIwF5w6w0ZpmIpeBKqTEgul0yTkRbA5hFs4iqKA6rDh39OxKuYty2zppX3a%2F7Y%2BtlA5SrzmzxwsCh0bAeGtPX3s8m%2BUJraDZ1jzhlE22dl0QC90OzN3b47Vvol0%2BkFGnp7NCB9xa1sy%2BwolQitlgEeZocfloHFTg3yfDUNb0ftAMdbexhAVjezMKZPTaemtV9cYf8%2Bhu5LW6uFtT6jv0YO6ufdz4UnyUgF2frh8tz%2F2%2BKc8ZlKqPPpxKUjHPfCJiksRAZldhnvyO5kjz2a5yTp%2FrpTzVXWfZXPbcQ%2Bulh%2Fx%2FrOH4A%22%7D; _ga_300V1CHKH1=GS1.1.1695174651.1.1.1695174684.0.0.0; ak_bmsc=91C6D28D093861656DB8C1FC1972DAB6~000000000000000000000000000000~YAAQlQ8kF2U6orCKAQAAgyl9uxX8kNk3C77pkMi6N6RxnsUqDbYEmIcNjtLSa8W6kfGL9cQMRHBUaYcbEA1+oXsvUwUF80G8hmH/F4S0ZOEnVCrlcBLx219N24l2qmoSKtVDH+VKe7c1bji9MHc7tO2R56R7juZJv9gceAdtKEuArkPfD8ijx/TyEgIrM+XruGtzCRmLnfq86UoJYP+j+tXcaWkc/qm1zHDReDNf/cHd6h2aRMs4lsES8+uh6YTjE7bfCp8h2DNJ2e07pm0ojcI/kdycUPHmuTqWPdTBEjUybad31E1hRNBAE8PbGjy2lvlPY/piuN3HX3Q5ifsmTqCNJzynN2kjGm6i4SHhmEAijUeIzNQXB11GrVmALJVV6pEjd/uu; bm_sv=FD8981426EA388050697DFB615BAFFE3~YAAQ1wcsF5K72ZSKAQAAsvl/uxUw0do3nknGCkllXH27UZBpM7kQUXm4crBNTAkhek5YSDKIrrm2uFWidfpBfyxbRSr+w7FH7Y0w4cXMAa7BELzcc/B9Uf8T6e2I2W29wjurKkBFtSseslHSqYD3BWx9/GidJMW+dFNrlzNUMd1dONUR9J1TDnYifPhE6A/zSLPHVrCTJl7xzg7VlW/05Ay0i+Bo7TynZdWgotfjET3vg2/ZVixVSGaWeQo4~1'
}
    response = None
    for m in range(0, 3):
        try:
            response = requests.get(url=url, headers=headers, verify=False)
            break
        except Exception as e:
            log.error(f"request exception-------{e}")
            continue
    # check the response status code
    if response is not None and response.status_code == 200:
        jsonData = response.json()
        return jsonData
    else:
        return False
if __name__ == '__main__':
    taskType = '财务数据/SEC'
    # zcfzb = balance sheet, lrb = income statement, xjllb = cash-flow statement
    zcfzb_mapping = {
        'AccountsAndOtherReceivablesNetCurrent': '指标1'
    }
    lrb_mapping = {
    }
    xjllb_mapping = {
    }
    while True:
        start_time = time.time()
        # TODO: pull the company CIK from Redis
        # cik = baseCore.redicPullData('sec_cik_US:uscik')
        cik = '320193'
        # look up the company info in the database by CIK
        data = fromcikgetinfo(cik)
        com_name = data[2]
        com_code = data[3]
        exchange = data[4]
        # the CIK in the URL must be zero-padded to 10 digits,
        # e.g. '320193' -> 'CIK0000320193'
        url_cik = cik.zfill(10)
        url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{url_cik}.json'
        jsonData = getRequest(url)
        if not jsonData:
            continue
        print(jsonData)
        try:
            us_gaap = jsonData['facts']['us-gaap']
        except:
            continue
        # iterate over the mapping keys
        Listzcfzb = []
        for key in zcfzb_mapping.keys():
            # all years and amounts for one financial metric
            usd_list = us_gaap[key]['units']['USD']
            # keep only annual-report records: form 10-K, fp FY
            for j in usd_list:
                form = j['form']
                fp = j['fp']
                if form != '10-K' or fp != 'FY':
                    continue
                date = j['end']
                # keep only quarter-end reporting dates
                if not date.endswith(('03-31', '06-30', '09-30', '12-31')):
                    continue
                val = j['val']
                zcfzb_dic = {
                    'zbname': key,
                    'riqi': date,
                    'jine': val,
                    'fp': fp,
                    'form': form
                }
                # all balance-sheet metrics across years
                Listzcfzb.append(zcfzb_dic)
        # itertools.groupby only groups adjacent items, so sort by date first
        Listzcfzb.sort(key=itemgetter('riqi'))
        groups = groupby(Listzcfzb, key=itemgetter('riqi'))
        # iterate over each group and print the grouped result
        for riqi, group in groups:
            print(f"riqi: {riqi}")
            # materialize the group iterator into a list
            listbydate = [item for item in group]
            print()
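One possible completion of "step 2" from the module docstring, sketched here because the script currently just prints each group; the rowfromgroup helper and the flat row shape are assumptions, not part of the original:

def rowfromgroup(riqi, listbydate):
    # collapse one date group into a flat row: {metric name: amount}
    row = {'riqi': riqi}
    for item in listbydate:
        row[item['zbname']] = item['jine']
    return row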
"""从html页面中抽取表格"""
import requests
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
def getRequest(url):
headers = {
'Referer': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/356037/000035603723000038/cspi-20230630x10q.htm',
'Sec-Ch-Ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31',
}
    response = None
    for m in range(0, 3):
        try:
            response = requests.get(url=url, headers=headers, verify=False)
            break
        except Exception as e:
            log.error(f"request exception-------{e}")
            continue
    # check the response status code
    if response is not None and response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup
    else:
        return False
def getzcfztable(soup):
    table_list = soup.find_all('table')
    for table in table_list:
        # locate the balance-sheet table by its 'Current assets:' label
        aa = table.find_all(text='Current assets:')
        if aa:
            # print(table)
            trlist = table.find_all('tr')
            date1 = trlist[1].find_all('td')[1].text.replace('\n', '')
            date2 = trlist[1].find_all('td')[-1].text.replace('\n', '')
            print(date1, date2)
            # TODO: drop the <td> cells with empty content
            for tr in trlist[2:]:
                # calling a Tag is shorthand for find_all: strip the '$' cells first
                filtered_tags = tr(lambda tag: tag.name == 'td' and '$' in tag.text)
                for tag in filtered_tags:
                    tag.extract()
                # filtered_tags2 = tr(lambda tag: tag.name == 'td' and tag.text == ' ')
                filtered_tags2 = tr(lambda tag: tag.name == 'td' and tag.text == '')
                for tag in filtered_tags2:
                    tag.extract()
                try:
                    zbtag = tr.find_all('td')[0].text.replace('\n', '')
                except:
                    zbtag = ''
                try:
                    cash1 = tr.find_all('td')[1].text.replace('\n', '')
                except:
                    cash1 = ''
                try:
                    cash2 = tr.find_all('td')[2].text.replace('\n', '')
                except:
                    cash2 = ''
                if zbtag != '' and cash1 != '' and cash2 != '':
                    print(f'field: {zbtag}  value 1: {cash1}  value 2: {cash2}')
if __name__ == '__main__':
    url = 'https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/aapl-20210925.htm'
    soup = getRequest(url)
    # parse the balance-sheet table from the HTML
    getzcfztable(soup)
# -*- coding: utf-8 -*-
import time
from urllib.parse import quote
import requests
import urllib3
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'x-request-device-type': 'Android',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
'Content-Type': 'application/json',
'Qcc-Version': '1.0.0',
'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
'xweb_xhr': '1',
'xcx-version': '2023.09.27',
'Qcc-Platform': 'mp-weixin',
'Qcc-CurrentPage': '/company-subpackages/business/index',
'Qcc-Timestamp': '1696661787803',
'Qcc-RefPage': '/company-subpackages/detail/index',
'Accept': '*/*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh'
}
# get the Qichacha (qcc.com) company id by company name or credit code
def find_id_by_name(start, token, name):
    urllib3.disable_warnings()
    qcc_key = name
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
    resp_dict = None
    for lll in range(1, 6):
        try:
            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
            break
        except Exception as e:
            print(f'{e}-------------retrying')
            time.sleep(5)
            continue
    if resp_dict is None:
        return False
    time.sleep(2)
    # possible failures: {'status': 40101, 'message': '无效的sessionToken!'}
    #                    {'status': 401, 'message': '您的账号访问超频,请升级小程序版本'}
    if resp_dict['status'] == 40101:
        KeyNo = False
        log.info(f'====token expired====time {baseCore.getTimeCost(start, time.time())}')
        return KeyNo
    if resp_dict['status'] == 401:
        KeyNo = False
        log.info(f'=======account rate-limited, please upgrade the mini-program version=====time {baseCore.getTimeCost(start, time.time())}')
        return KeyNo
    try:
        if resp_dict['result']['Result']:
            result_dict = resp_dict['result']['Result'][0]
            KeyNo = result_dict['KeyNo']
            Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
            if Name == '':
                KeyNo = 'null'
        else:
            KeyNo = 'null'
    except:
        KeyNo = False
        log.info(f'====token expired====time {baseCore.getTimeCost(start, time.time())}')
        return KeyNo
    log.info("{}, company KeyNo: {}".format(qcc_key, KeyNo))
    return KeyNo
\ No newline at end of file
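A minimal usage sketch (hedged: the token value and company name below are placeholders, and the start argument is simply the timestamp that getTimeCost logs against):

if __name__ == '__main__':
    token = '<mini-program session token>'  # placeholder, obtained from the QCC mini-program login
    KeyNo = find_id_by_name(time.time(), token, '小米科技有限责任公司')
    print(KeyNo)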
 import json
@@ -5,7 +5,9 @@ import requests
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 from base import BaseCore
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote

 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
@@ -16,7 +18,57 @@ cnx_ = baseCore.cnx_
 cursor_ = baseCore.cursor_

 taskType = '企业公告/证监会'
+obsClient = ObsClient(
+    access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud AK
+    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud SK
+    server='https://obs.cn-north-1.myhuaweicloud.com'  # your bucket's endpoint
+)
+
+def uptoOBS(pdf_url, pdf_name, type_id, social_code):
+    headers = {}
+    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
+               'full_path': '',
+               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+               'create_time': '', 'page_size': '', 'content': ''}
+    headers['User-Agent'] = baseCore.getRandomUserAgent()
+    resp_content = None
+    for i in range(0, 3):
+        try:
+            resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
+            break
+        except:
+            time.sleep(3)
+            continue
+    if resp_content is None:
+        # download failed after three attempts
+        return retData
+    page_size = 0
+    for i in range(0, 3):
+        try:
+            name = pdf_name + '.pdf'
+            result = obsClient.putContent('zzsn', 'ZJH/' + name, content=resp_content)
+            with fitz.open(stream=resp_content, filetype='pdf') as doc:
+                page_size = doc.page_count
+                for page in doc.pages():
+                    retData['content'] += page.get_text()
+            break
+        except:
+            time.sleep(3)
+            continue
+    if page_size < 1:
+        # PDF parsing failed
+        # print(f'======PDF parsing failed=====')
+        return retData
+    else:
+        try:
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            retData['state'] = True
+            retData['path'] = result['body']['objectUrl'].split('/ZJH')[0]
+            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['file_size'] = result['Uploaded size']
+            retData['create_time'] = time_now
+            retData['page_size'] = page_size
+        except:
+            return retData
+    return retData
 def secrchATT(item_id, name, type_id):
     sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
@@ -164,16 +216,20 @@ def getUrl(code, url_parms, Catagory2_parms):
     return dic_parms
-def InsterInto(short_name, social_code, pdf_url):
-    inster = False
+def ifInstert(short_name, social_code, pdf_url):
+    ifexist = True
     sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='证监会' and type='1' '''
     cursor.execute(sel_sql, (social_code, pdf_url))
     selects = cursor.fetchone()
+    # skip if the record already exists in the database
     if selects:
-        print(f'com_name:{short_name}, {pdf_url} already exists')
-        return inster
+        ifexist = False
+        log.info(f'com_name:{short_name}, {pdf_url} already exists')
+        return ifexist
+    else:
+        return ifexist
+
+def InsterInto(short_name, social_code, pdf_url):
     # insert the record into the database
     try:
         insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
@@ -197,8 +253,8 @@ def InsterInto(short_name, social_code, pdf_url):
 def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time, com_name, num):
-    # upload to the file server
-    retData = baseCore.upLoadToServe(pdf_url, 8, social_code)
+    # upload to Huawei Cloud OBS
+    retData = uptoOBS(pdf_url, pdf_name, 8, social_code)
     # insert the attachment into the att table
     if retData['state']:
         pass
@@ -323,10 +379,10 @@ def SpiderByZJH(url, payload, dic_info, start_time, num):  # dic_info: the database record
             year = pub_time[:4]
             report_type = td_list[4].text.strip()
-            # insert the record into the database
-            insert = InsterInto(short_name, social_code, name_pdf)
-            if insert:
+            # check whether this item already exists in the database
+            ifexist = ifInstert(short_name, social_code, pdf_url)
+            # if it does not exist, ifexist is True
+            if ifexist:
                 # # announcement list
                 # okCount = okCount + 1
                 # parse the PDF: get the link, download it, then record parse success/failure and transfer success/failure
import glob

import pandas as pd

# find all Excel research-report files under D:\机械项目研报
# (the old comment said ".txt files", but the glob pattern matches .xlsx)
excel_files = glob.glob(r"D:\机械项目研报\机械项目研报*.xlsx", recursive=True)
# create an empty DataFrame to hold the merged data
merged_data = pd.DataFrame()
# read each Excel file and append it to the merged DataFrame
for file in excel_files:
    data = pd.read_excel(file, dtype=str)
    # drop the last column
    # data = data.iloc[:, :-1]
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    merged_data = pd.concat([merged_data, data], ignore_index=True)
sorted_df = merged_data.sort_values('industry')
grouped = merged_data.groupby('industry')
# write the merged data out, one sheet per industry group
# merged_data.to_csv(r"D:\hg\tmp\11.csv", encoding='gbk', index=False, quoting=1, quotechar='"', escapechar='\\')
# merged_data.to_excel(r"D:\机械项目研报\机械项目研报汇总.xlsx", index=False, engine='openpyxl')
with pd.ExcelWriter(r'D:\机械项目研报\机械项目研报汇总2.xlsx') as writer:
    for group_name, group_df in grouped:
        group_df.to_excel(writer, sheet_name=group_name, index=False)
\ No newline at end of file