"""从html页面中抽取表格"""
import requests
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
# Project helper instance; in the code visible here it is only used to obtain the logger.
baseCore = BaseCore()
# Module-level logger shared by the functions below.
log = baseCore.getLogger()


def getRequest(url, retries=3, timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The GET request is attempted up to ``retries`` times; any exception raised
    by an attempt is logged and the next attempt is made.

    Args:
        url: Address of the page to download.
        retries: Maximum number of attempts (default 3, as before).
        timeout: Seconds to wait for the server before an attempt is abandoned.

    Returns:
        A ``BeautifulSoup`` object (``html.parser``) when a response with
        status code 200 was received; ``False`` when every attempt failed or
        the status code was anything else.
    """
    headers = {
        'Referer': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/356037/000035603723000038/cspi-20230630x10q.htm',
        'Sec-Ch-Ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31',
    }
    # Stays None when every attempt raises, so the failure can be detected
    # below instead of hitting an unbound local variable.
    response = None
    for _ in range(retries):
        try:
            # NOTE(review): verify=False disables TLS certificate verification.
            # Kept to preserve existing behaviour; confirm it is really needed.
            response = requests.get(url=url, headers=headers, verify=False, timeout=timeout)
            break
        except Exception as e:
            log.error(f"request请求异常-------{e}")

    if response is None:
        return False

    # Only a 200 response is parsed; every other status counts as a failure.
    if response.status_code == 200:
        return BeautifulSoup(response.content, 'html.parser')
    log.error(f"unexpected status code {response.status_code} for {url}")
    return False

def _cell_text(cells, index):
    """Return the text of ``cells[index]`` without newlines, or '' if absent."""
    if index < len(cells):
        return cells[index].text.replace('\n', '')
    return ''


def _is_noise_cell(tag):
    """Tell whether *tag* is a <td> whose text contains '$' or is empty."""
    return tag.name == 'td' and ('$' in tag.text or tag.text == '')


def getzcfztable(soup):
    """Print the rows of every table that contains the text 'Current assets:'.

    For each matching table the second row is treated as the header: the text
    of its second and last cells is printed as the two column labels. Every
    following row has its '$' cells and empty cells removed, and rows that
    still provide a label plus two values are printed.

    Note: cells are removed with ``extract()``, so *soup* is modified in place.

    Args:
        soup: Parsed document as returned by ``getRequest``. ``False`` or
            ``None`` (download failed) is accepted and results in no output.

    Returns:
        None. Results are written to stdout.
    """
    # getRequest signals failure with False; there is nothing to parse then.
    if soup is None or soup is False:
        return

    for table in soup.find_all('table'):
        # `string=` is the current name of the deprecated `text=` argument.
        if not table.find_all(string='Current assets:'):
            continue

        rows = table.find_all('tr')
        try:
            header_cells = rows[1].find_all('td')
            date1 = header_cells[1].text.replace('\n', '')
            date2 = header_cells[-1].text.replace('\n', '')
        except IndexError:
            # The table mentions 'Current assets:' but lacks the expected
            # header row layout; skip it rather than abort the whole run.
            log.error("table containing 'Current assets:' has no usable header row, skipped")
            continue
        print(date1, date2)

        for tr in rows[2:]:
            # Drop currency-sign cells and empty cells so that the remaining
            # cells line up as: label, first value, second value.
            for tag in tr(_is_noise_cell):
                tag.extract()

            cells = tr.find_all('td')
            zbtag = _cell_text(cells, 0)
            cash1 = _cell_text(cells, 1)
            cash2 = _cell_text(cells, 2)
            if zbtag != '' and cash1 != '' and cash2 != '':
                print(f'字段:{zbtag}  值1:{cash1}  值2:{cash2}')

if __name__ == '__main__':
    url = 'https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/aapl-20210925.htm'
    soup = getRequest(url)
    # Parse the balance sheet table out of the HTML, but only when the
    # download succeeded: getRequest returns False on failure.
    if soup:
        getzcfztable(soup)
    else:
        log.error(f"failed to fetch {url}, nothing to parse")


