import json

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from retry import retry

# MongoDB collection that receives one document per company.
# NOTE(review): the host, user name and password are hard-coded in source;
# they should be moved to environment variables / a secrets store and the
# exposed password rotated.  Left unchanged here so existing runs keep working.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '2022年福布斯企业人数']

# Wayback Machine snapshot of the Forbes Global 2000 list page.
url = 'https://web.archive.org/web/20220929184024/https://www.forbes.com/lists/global2000/'

# Browser-like request headers sent with every request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    # NOTE(review): 'br' and 'zstd' are advertised here; requests presumably
    # only decodes them when the optional brotli / zstandard packages are
    # installed - confirm, or trim this to 'gzip, deflate'.
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Cookie': 'lux_uid=166447682647510727; donation-identifier=aab33e1c4e293a8fcd5490465688bb01; bafp=79fcddb0-4e71-11ee-8a81-b762f64bf85c',
    'Priority': 'u=0, i',
    'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
    'Sec-Ch-Ua-Mobile': '?0',
    # The value is a quoted string; the original was missing its opening quote.
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
}

# Every request is routed through a proxy listening on 127.0.0.1:1080.
proxies = {
    'https': 'http://127.0.0.1:1080',
    'http': 'http://127.0.0.1:1080',
}

@retry(tries=5, delay=2)
def detail(href, timeout=30):
    """Fetch a company detail page and return its ``<script>`` tags.

    The request uses the module-level ``headers`` and ``proxies``.  Any
    exception raised while fetching or parsing propagates to ``@retry``,
    which attempts the call up to 5 times with a 2 second delay before
    letting the last exception reach the caller.

    Args:
        href: URL of the detail page.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout a stalled connection would block forever and the
            retry decorator would never get a chance to run.

    Returns:
        The result of ``find_all('script')`` on the page parsed with lxml.
    """
    # verify=False (no TLS certificate check) is kept from the original;
    # NOTE(review): presumably required by the local proxy - confirm.
    with requests.get(
        url=href,
        headers=headers,
        verify=False,
        proxies=proxies,
        timeout=timeout,
    ) as resp:
        # The context manager releases the connection even if parsing fails.
        page = BeautifulSoup(resp.text, 'lxml')
    return page.find_all('script')


@retry(tries=3, delay=2)
def spider():
    """Crawl the archived Forbes list page and store employee counts.

    Downloads the page at the module-level ``url``, walks every
    ``div.table-row-group`` and each ``a.table-row`` link inside it, fetches
    the linked detail page through ``detail`` and reads ``numberOfEmployees``
    from the first ``<script>`` whose text mentions it.  One document per
    company (rank, name, employee count) is inserted into ``db_storage``;
    the count is stored as ``'--'`` when it cannot be extracted.

    Companies whose detail page cannot be fetched are appended to
    ``./error_2022.txt``, one ``idx,rank,name`` entry per line, and skipped.

    NOTE: because of ``@retry``, any exception that escapes this function
    restarts the whole crawl, so rows inserted before the failure are
    inserted again.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(url=url, headers=headers, proxies=proxies, timeout=60)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('div', class_="table-row-group")
    print(len(tables))
    for idx, table in enumerate(tables):
        print(f'正在遍历第{idx}个table')
        for a in table.find_all('a', class_="table-row"):
            rank = a.find('div', class_="rank").text.replace('.', '')
            print(f'排名： {rank}')
            organization_name = a.find('div', class_="organizationName").text
            href = a.get('href')
            try:
                scripts = detail(href)
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that Ctrl-C
                # still stops the crawl instead of just skipping one company.
                print(f'error--:{idx},{rank},{organization_name}')
                with open('./error_2022.txt', 'a', encoding='utf-8') as f:
                    # Newline-terminated so each failure is on its own line.
                    f.write(f'{idx},{rank},{organization_name}\n')
                continue

            number_of_employees = '--'
            for script in scripts:
                script_text = script.text
                if 'numberOfEmployees' not in script_text:
                    continue
                # Only the first script mentioning the field is considered.
                try:
                    raw_count = json.loads(script_text)['numberOfEmployees']
                except (ValueError, KeyError, TypeError):
                    # Not valid JSON, or JSON without that top-level key.
                    break
                if isinstance(raw_count, (str, int)) and not isinstance(raw_count, bool):
                    # Accept both "12,345" style strings and plain numbers.
                    number_of_employees = str(raw_count).replace(',', '')
                break

            dic = {
                '排名': rank,
                '企业名称': organization_name,
                '员工人数': number_of_employees,
            }
            db_storage.insert_one(dic)
            print(f'{rank}==={organization_name}===已入库')


def spider2():
    """Backfill employee counts for companies listed in a local Excel sheet.

    Reads the sheet ``待补充`` of ``./2022年福布斯榜单.xlsx``.  For every row
    it takes column 1 as the rank and column 2 as the company name, builds
    an archived Forbes company URL from the lower-cased, hyphenated name,
    fetches it through ``detail`` and reads ``numberOfEmployees`` from the
    first ``<script>`` whose text mentions it.  One document per row is
    inserted into ``db_storage``; the count is stored as ``'--'`` when it
    cannot be extracted.

    Rows whose page cannot be fetched are appended to ``./error_2022.txt``,
    one ``idx,rank,name`` entry per line, and skipped.
    """
    df = pd.read_excel('./2022年福布斯榜单.xlsx', sheet_name='待补充')
    for idx, row in enumerate(df.values.tolist()):
        rank = row[1]
        # Lower-case the name and replace spaces with hyphens to form the
        # URL slug.
        # NOTE(review): the slug, not the original name, is what gets
        # printed, logged and stored under '企业名称' (kept as in the
        # original) - confirm this is intended.
        organization_name = row[2].lower().replace(' ', '-')
        href = f'https://web.archive.org/web/20220929184024/https://www.forbes.com/companies/{organization_name}/?list=global2000'
        try:
            scripts = detail(href)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # still stops the run instead of just skipping one row.
            print(f'error--:{idx},{rank},{organization_name}')
            with open('./error_2022.txt', 'a', encoding='utf-8') as f:
                # f-string: ``rank`` comes from Excel and may be a number,
                # which made the original ``str + rank`` concatenation raise
                # TypeError.  Newline-terminated so each failure is on its
                # own line.
                f.write(f'{idx},{rank},{organization_name}\n')
            continue

        number_of_employees = '--'
        for script in scripts:
            script_text = script.text
            if 'numberOfEmployees' not in script_text:
                continue
            # Only the first script mentioning the field is considered.
            try:
                raw_count = json.loads(script_text)['numberOfEmployees']
            except (ValueError, KeyError, TypeError):
                # Not valid JSON, or JSON without that top-level key.
                break
            if isinstance(raw_count, (str, int)) and not isinstance(raw_count, bool):
                # Accept both "12,345" style strings and plain numbers.
                number_of_employees = str(raw_count).replace(',', '')
            break

        dic = {
            '排名': rank,
            '企业名称': organization_name,
            '员工人数': number_of_employees,
        }
        db_storage.insert_one(dic)
        print(f'{rank}==={organization_name}===已入库')


# Script entry point: only the Excel-driven backfill runs by default.
if __name__ == '__main__':
    # Full crawl starting from the archived list page; currently disabled.
    # spider()
    spider2()