提交 8fb070a6 作者: 刘伟刚

上传新文件

上级 e9a7d947
# coding: utf-8
# In[1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import time
import json
import demjson
# In[ ]:
# Scrape the Fortune Global 500 rankings (2012-2022) from caifuzhongwen.com.
# For each year: read the ranking table, parse the financial data embedded in
# an inline <script> tag, then visit every company's detail page for CEO /
# country / industry info, and collect one dict-of-columns per year into
# list_every_year_info (suitable for pandas.DataFrame).
list_every_year_info = []
for year in range(2012, 2023):
    print(year)
    url_all = "https://www.caifuzhongwen.com/fortune500/paiming/global500/{}_%e4%b8%96%e7%95%8c500%e5%bc%ba.htm".format(year)

    # Per-year accumulators, one entry per company (columns of the final dict).
    list_all_info_url = []
    list_top = []
    list_name_ch = []
    list_name_en = []
    list_shouru = []       # revenue (million USD)
    list_lirun = []        # profit (million USD)
    list_country = []
    list_hangye = []       # industry
    list_zongbu = []       # headquarters address
    list_ceo = []
    list_renshu = []       # employee count
    list_shouru_add = []   # revenue change, % year-over-year
    list_lirun_add = []    # profit change, % year-over-year
    list_zichan = []       # assets (million USD)
    list_quanyi = []       # shareholder equity (million USD)
    list_jinglilv = []     # net margin, %
    list_shouyilv = []     # return on assets, %
    list_url = []

    response_all = requests.get(url_all)
    soup_all = BeautifulSoup(response_all.content, 'html.parser')
    list_all_com = soup_all.find('tbody', {'style': 'word-break:break-all'}).find_all('tr')

    # Walk the ranking table (skipping the header row): collect rank, Chinese
    # and English company names, and build each company's detail-page URL.
    top = 1
    for com in list_all_com[1:]:
        list_com_info = com.find_all('td')
        name_ch = re.findall(">(.*?)<", str(list_com_info[1]))[1]
        name_en = re.findall(">(.*?)<", str(list_com_info[1]))[2]
        url_com = "https://www.caifuzhongwen.com/fortune500/" + list_com_info[1].find('a').get('href')[5:]
        list_top.append(top)
        list_name_ch.append(name_ch)
        list_name_en.append(name_en)
        list_all_info_url.append(url_com)
        top = top + 1

    # The financial data lives in an inline (src-less) <script>; its position
    # among the page's scripts varies by year, so try candidate indices in
    # order until one yields the companyDetails blob. (The original used
    # nested bare try/excepts over indices 1, 2, 3.)
    soup_text_2 = None
    last_error = None
    for script_index in (1, 2, 3):
        try:
            soup_text = soup_all.find_all('script', {'src': ''})[script_index].text.replace("\n", "").replace("\t", '').replace(" ", '')
            soup_text_2 = re.findall("varcompanyDetails=(.*?)vartable", soup_text)[0].replace("\\", "").replace("item1", "\'item1\'").replace("item2", "\'item2\'").replace("item3", "\'item3\'")
            break
        except (IndexError, AttributeError) as exc:
            last_error = exc
    if soup_text_2 is None:
        raise RuntimeError("companyDetails script not found for year {}".format(year)) from last_error

    # The blob is a JS array of small dicts; every consecutive group of 8
    # dicts describes one company. Convert each to real JSON and regroup.
    dic_list = re.findall('{(.*?)}', soup_text_2[1:])
    list_all_com_money_info = []
    list_one_com_money_info = []
    num = 0
    for i in range(0, len(dic_list)):
        num = num + 1
        # Items 7 and 8 of each group carry no trailing comma inside the
        # regex capture; all other items need their last character stripped.
        if num == 7 or num == 8:
            one_con_money_info1 = "{" + dic_list[i] + "}"
        else:
            one_con_money_info1 = "{" + dic_list[i][:-1] + "}"
        one_con_money_info2 = one_con_money_info1.replace("\'", "\"")
        dic_one_con_money_info2 = json.loads(str(one_con_money_info2))
        list_one_com_money_info.append(dic_one_con_money_info2)
        if num == 8:
            list_all_com_money_info.append(list_one_com_money_info)
            list_one_com_money_info = []
            num = 0

    # Extract revenue / profit / assets / equity / margins from each
    # company's 8-dict group (indices fixed by the page's table layout).
    for one_con_money_info in list_all_com_money_info:
        shouru = one_con_money_info[1]['item2']
        shouru_add = one_con_money_info[1]['item3']
        lirun = one_con_money_info[2]['item2']
        lirun_add = one_con_money_info[2]['item3']
        zichan = one_con_money_info[3]['item2']
        quanyi = one_con_money_info[4]['item2']
        jinglilv = one_con_money_info[6]['item2']
        shouyilv = one_con_money_info[7]['item2']
        list_shouru.append(shouru)
        list_shouru_add.append(shouru_add)
        list_lirun.append(lirun)
        list_lirun_add.append(lirun_add)
        list_zichan.append(zichan)
        list_quanyi.append(quanyi)
        list_jinglilv.append(jinglilv)
        list_shouyilv.append(shouyilv)

    # Visit each company's detail page for CEO, country, industry, HQ,
    # headcount and official website; sleep between requests to be polite.
    for com_url in list_all_info_url:
        response_one_com_info = requests.get(com_url)
        soup_one_url_info = BeautifulSoup(response_one_com_info.content, 'html.parser')
        list_one_com_info = soup_one_url_info.find('table').find_all('tr')
        ceo = list_one_com_info[0].find_all('td')[1].text
        country = list_one_com_info[1].find_all('td')[1].text
        hangye = list_one_com_info[2].find_all('td')[1].text
        zongbu = list_one_com_info[3].find_all('td')[1].text
        renshu = list_one_com_info[4].find_all('td')[1].text
        url = list_one_com_info[5].find_all('td')[1].text
        list_ceo.append(ceo)
        list_country.append(country)
        list_hangye.append(hangye)
        list_zongbu.append(zongbu)
        list_renshu.append(renshu)
        list_url.append(url)
        print(com_url + ":爬取完成")
        time.sleep(2)

    # Assemble this year's columns. Keys are kept in Chinese because they
    # become the DataFrame column names downstream.
    dic_all_com_info = {
        '排名': list_top,
        '中文名称': list_name_ch,
        '英文名称': list_name_en,
        '营业收入(百万美元)': list_shouru,
        '利润(百万美元)': list_lirun,
        '企业所属国家': list_country,
        '行业': list_hangye,
        '企业总部地址': list_zongbu,
        '企业首席执行官(CEO)': list_ceo,
        '企业员工数': list_renshu,
        '企业官网': list_url,
        '营业收入:百万美元': list_shouru,
        '营业收入:年增减%': list_shouru_add,
        '利润:百万美元': list_lirun,
        '利润:年增减%': list_lirun_add,
        '资产:百万美元': list_zichan,
        '资产:年增减%': '--',
        '股东权益:百万美元': list_quanyi,
        '股东权益:年增减%': '--',
        '利润占比:%': 'None',
        '利润占比:年增减%': 'None',
        '净利率:%': list_jinglilv,
        '净利率:年增减%': 'None',
        '资产收益率:%': list_shouyilv,
        '资产收益率:年增减%': 'None'
    }
    # BUG FIX: the original appended to `list_all_year`, which is never
    # defined (NameError); the accumulator created above is
    # list_every_year_info.
    list_every_year_info.append(dic_all_com_info)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论