丁双波 / zzsn_spider · Commits

Commit 8fb070a6
Authored Aug 02, 2023 by 刘伟刚
Parent: e9a7d947

Showing 1 changed file with 178 additions and 0 deletions

test/500强.py  0 → 100644  (+178, -0)
# coding: utf-8

# In[1]:

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import time
import json
import demjson

# In[ ]:

# Scrape the Fortune Global 500 rankings for 2012-2022 from caifuzhongwen.com,
# collecting one dict of column lists per year.
list_all_year = []
for year in range(2012, 2023):
    print(year)
    url_all = "https://www.caifuzhongwen.com/fortune500/paiming/global500/{}_%e4%b8%96%e7%95%8c500%e5%bc%ba.htm".format(year)
    list_all_info_url = []
    list_top = []
    list_name_ch = []
    list_name_en = []
    list_shouru = []
    list_lirun = []
    list_country = []
    list_hangye = []
    list_zongbu = []
    list_ceo = []
    list_renshu = []
    list_shouru_add = []
    list_lirun_add = []
    list_zichan = []
    list_quanyi = []
    list_jinglilv = []
    list_shouyilv = []
    list_url = []
    response_all = requests.get(url_all)
    soup_all = BeautifulSoup(response_all.content, 'html.parser')
    list_all_com = soup_all.find('tbody', {'style': 'word-break:break-all'}).find_all('tr')
    top = 1
    for com in list_all_com[1:]:
        # Extract the company names and build the per-company URL
        list_com_info = com.find_all('td')
        name_ch = re.findall(">(.*?)<", str(list_com_info[1]))[1]
        name_en = re.findall(">(.*?)<", str(list_com_info[1]))[2]
        url_com = "https://www.caifuzhongwen.com/fortune500/" + list_com_info[1].find('a').get('href')[5:]
        list_top.append(top)
        list_name_ch.append(name_ch)
        list_name_en.append(name_en)
        list_all_info_url.append(url_com)
        top = top + 1
    # The companyDetails blob sits in a different inline <script> depending on the
    # year, so fall back through script indexes 1, 2 and 3.
    try:
        try:
            soup_text = soup_all.find_all('script', {'src': ''})[1].text.replace("\n", "").replace("\t", '').replace(" ", '')
            soup_text_2 = re.findall("varcompanyDetails=(.*?)vartable", soup_text)[0].replace("\\", "").replace("item1", "\'item1\'").replace("item2", "\'item2\'").replace("item3", "\'item3\'")
        except:
            soup_text = soup_all.find_all('script', {'src': ''})[2].text.replace("\n", "").replace("\t", '').replace(" ", '')
            soup_text_2 = re.findall("varcompanyDetails=(.*?)vartable", soup_text)[0].replace("\\", "").replace("item1", "\'item1\'").replace("item2", "\'item2\'").replace("item3", "\'item3\'")
    except:
        soup_text = soup_all.find_all('script', {'src': ''})[3].text.replace("\n", "").replace("\t", '').replace(" ", '')
        soup_text_2 = re.findall("varcompanyDetails=(.*?)vartable", soup_text)[0].replace("\\", "").replace("item1", "\'item1\'").replace("item2", "\'item2\'").replace("item3", "\'item3\'")
    dic_list = re.findall('{(.*?)}', soup_text_2[1:])
    # Every 8 consecutive {...} entries describe one company; rebuild them into dicts.
    list_all_com_money_info = []
    list_one_com_money_info = []
    num = 0
    num_2 = 0
    for i in range(0, len(dic_list)):
        num = num + 1
        if num == 7 or num == 8:
            one_con_money_info1 = "{" + dic_list[i] + "}"
        else:
            one_con_money_info1 = "{" + dic_list[i][:-1] + "}"
        one_con_money_info2 = one_con_money_info1.replace("\'", "\"")
        dic_one_con_money_info2 = json.loads(str(one_con_money_info2))
        list_one_com_money_info.append(dic_one_con_money_info2)
        if num == 8:
            list_all_com_money_info.append(list_one_com_money_info)
            list_one_com_money_info = []
            num = 0
    for one_con_money_info in list_all_com_money_info:
        # Pull revenue, profit, assets, equity and ratio figures for each company
        # from the dicts rebuilt above.
        shouru = one_con_money_info[1]['item2']
        shouru_add = one_con_money_info[1]['item3']
        lirun = one_con_money_info[2]['item2']
        lirun_add = one_con_money_info[2]['item3']
        zichan = one_con_money_info[3]['item2']
        quanyi = one_con_money_info[4]['item2']
        jinglilv = one_con_money_info[6]['item2']
        shouyilv = one_con_money_info[7]['item2']
        list_shouru.append(shouru)
        list_shouru_add.append(shouru_add)
        list_lirun.append(lirun)
        list_lirun_add.append(lirun_add)
        list_zichan.append(zichan)
        list_quanyi.append(quanyi)
        list_jinglilv.append(jinglilv)
        list_shouyilv.append(shouyilv)
    for com_url in list_all_info_url:
        # Visit each company page and read its profile table; a time.sleep between
        # requests is advisable here.
        response_one_com_info = requests.get(com_url)
        soup_one_url_info = BeautifulSoup(response_one_com_info.content, 'html.parser')
        list_one_com_info = soup_one_url_info.find('table').find_all('tr')
        ceo = list_one_com_info[0].find_all('td')[1].text
        country = list_one_com_info[1].find_all('td')[1].text
        hangye = list_one_com_info[2].find_all('td')[1].text
        zongbu = list_one_com_info[3].find_all('td')[1].text
        renshu = list_one_com_info[4].find_all('td')[1].text
        url = list_one_com_info[5].find_all('td')[1].text
        list_ceo.append(ceo)
        list_country.append(country)
        list_hangye.append(hangye)
        list_zongbu.append(zongbu)
        list_renshu.append(renshu)
        list_url.append(url)
        print(com_url + ":爬取完成")
        time.sleep(2)
    dic_all_com_info = {
        '排名': list_top,
        '中文名称': list_name_ch,
        '英文名称': list_name_en,
        '营业收入(百万美元)': list_shouru,
        '利润(百万美元)': list_lirun,
        '企业所属国家': list_country,
        '行业': list_hangye,
        '企业总部地址': list_zongbu,
        '企业首席执行官(CEO)': list_ceo,
        '企业员工数': list_renshu,
        '企业官网': list_url,
        '营业收入:百万美元': list_shouru,
        '营业收入:年增减%': list_shouru_add,
        '利润:百万美元': list_lirun,
        '利润:年增减%': list_lirun_add,
        '资产:百万美元': list_zichan,
        '资产:年增减%': '--',
        '股东权益:百万美元': list_quanyi,
        '股东权益:年增减%': '--',
        '利润占比:%': 'None',
        '利润占比:年增减%': 'None',
        '净利率:%': list_jinglilv,
        '净利率:年增减%': 'None',
        '资产收益率:%': list_shouyilv,
        '资产收益率:年增减%': 'None'
    }
    list_all_year.append(dic_all_com_info)
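pandas is imported in this file but never used, so the per-year dicts collected in list_all_year are presumably consumed elsewhere. Below is a minimal export sketch, not part of the commit: export_years and the fortune500.xlsx path are hypothetical names, and it assumes each dict's list values share the same length (as the scrape loop produces) and that an xlsx writer engine such as openpyxl is installed.

# Hypothetical export step (not part of this commit): write one sheet per year.
import pandas as pd

def export_years(list_all_year, years, path="fortune500.xlsx"):
    # Scalar placeholders such as '--' broadcast across the whole column
    # when the DataFrame is built from the dict of lists.
    with pd.ExcelWriter(path) as writer:
        for year, dic in zip(years, list_all_year):
            pd.DataFrame(dic).to_excel(writer, sheet_name=str(year), index=False)

# Example call after the scrape finishes:
# export_years(list_all_year, range(2012, 2023))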