丁双波 / zzsn_spider · Commits · dd34432c

Commit dd34432c, authored Aug 28, 2024 by XveLingKun
Commit message: 国家图书馆
Parent: 3f728307

Showing 1 changed file with 198 additions and 0 deletions (+198, -0)

国资委书单采集/国家图书馆.py  0 → 100644
import json
import re
import sys
import time

import pandas as pd
import requests
import xlsxwriter
from bs4 import BeautifulSoup
from retry import retry
from urllib.parse import urlencode

sys.path.append('../base')
import BaseCore

baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()

headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    # 'Content-Length': '575',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Host': 'find.nlc.cn',
    'Origin': 'http://find.nlc.cn',
    # 'Origin': 'http://find.nlc.cn/search/doSearch?',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
    'X-Requested-With': 'XMLHttpRequest'
}


@retry(tries=3, delay=3)
def getrequest(url, headers):
    req = requests.get(url=url, headers=headers)
    if req.status_code == 200:
        # print(req.text)
        return BeautifulSoup(req.text, 'html.parser')
    # non-200 response: raise so the decorator re-attempts the request
    raise RuntimeError(f'request failed with status {req.status_code}')
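# Note: @retry (from the "retry" package) re-invokes getrequest on any raised
# exception, up to tries=3 attempts with delay=3 seconds between attempts;
# only after the final failure does the exception propagate to the caller.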


def getDetailInfo(soup):
    pageBookInfo = []
    tableList = soup.find_all('div', class_='article_item')
    for book1 in tableList:
        info = book1.find('div', class_='book_name')
        # sample markup: <a href="javascript:void(0);" onclick="makeDetailUrl(this, '/search/showDocDetails?', '-49925015253155232', 'ucs01', '国资委');" target="_blank">
        # resulting URL: http://find.nlc.cn/search/showDocDetails?docId=-5060698567676905396&dataSource=ucs01&query=企业混合所有制改革
        # print(info)
        # pull the makeDetailUrl(...) argument list out of the onclick attribute
        hrefInfo = str(info).split('onclick="makeDetailUrl(this,')[1]
        hrefInfo = hrefInfo.split(');" target=')[0].replace("'", '').strip(' ')
        hrefInfo_list = hrefInfo.split(',')
        href = f'http://find.nlc.cn/search/showDocDetails?docId={hrefInfo_list[1].strip(" ")}&dataSource={hrefInfo_list[2].strip(" ")}&query={hrefInfo_list[-1].strip(" ")}'
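        # A regex over the onclick attribute would be a sturdier way to grab the
        # same three arguments; minimal sketch (illustrative, not the committed logic):
        #   m = re.search(r"makeDetailUrl\(this,\s*'[^']*',\s*'([^']*)',\s*'([^']*)',\s*'([^']*)'", str(info))
        #   doc_id, data_source, query = m.groups()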
        # fields collected: title, publisher, publication date, contributors,
        # ISBN, keywords, CLC classification, series title, physical description
        detail_soup = getrequest(href, headers)
        # dict holding the extracted fields for this book
        book_info = {}
        # all 'book_item' divs inside the summary block
        book_items = detail_soup.find('div', id='book_wr').find_all('div', class_='book_item')
        # the title carries no "label: value" pair, so handle it first
        book_info['书名'] = detail_soup.find('div', class_='book_name').get_text().strip()
        # walk each book_item and extract its label/value pair
        for item in book_items:
            try:
                key = item.find('span', class_='book_type').get_text().replace(':', '').replace("\n", "").strip(" ")
                try:
                    value = item.find('span', class_='book_val').get_text().replace("\n", "").strip(" ")
                except:
                    # some rows use 'book_t_val' instead of 'book_val'
                    value = item.find('span', class_='book_t_val').get_text().replace("\n", "").strip(" ")
                book_info[key] = value
            except Exception:
                continue
        # 'detail-info' block: note the label/value classes are swapped here
        book_items2 = detail_soup.find('div', id='detail-info').find_all('div', class_='book_item')
        for item in book_items2:
            key = item.find('span', class_='book_val').get_text().replace(':', '').replace("\n", "").strip(" ")
            value = item.find('span', class_='book_type').get_text().replace("\n", "").strip(" ")
            book_info[key] = value
        log.info(book_info)
        pageBookInfo.append(book_info)
        # brief pause between detail pages
        time.sleep(1)
    return pageBookInfo


def main(url, keyword, headers):
    payload_ = {
        'query': keyword,
        'secQuery': '',
        'actualQuery': f'{keyword} mediatype:(0 OR 1 OR 2) ',
        'pageNo': 1,
        'orderBy': 'RELATIVE',
        'queryField': '',
        'fldText': '全部检索字段',
        'isGroup': 'isGroup',
        'showcount': 0,
        'docType': '图书',
        'targetField': '',
        'targetFieldLog': '全部字段',
        'orginQuery': f'{keyword} mediatype:(0 OR 1 OR 2) ',
        'searchType': '2'
    }
    # ip = baseCore.get_proxy()
    # urlencode turns the payload dict into a URL query string
    query_string = urlencode(payload_)
    soup = getrequest(url + query_string, headers)
    if soup:
        # total number of hits reported for the keyword
        totalCount = int(soup.find('div', class_='search_result').find('b').text)
        # total pages at 10 results per page
        totalPage = int(totalCount / 10) + (1 if totalCount % 10 != 0 else 0)
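        # equivalently: totalPage = math.ceil(totalCount / 10) (would need "import math")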
        log.info(f'当前关键词:{keyword},总页数:{totalPage}')
        allBooksInfo = []
        for i in range(1, totalPage + 1):
            payload = {
                'query': keyword,
                'secQuery': '',
                'actualQuery': f'{keyword} mediatype:(0 OR 1 OR 2) ',
                'pageNo': i,
                'orderBy': 'RELATIVE',
                'queryField': '',
                'fldText': '全部检索字段',
                'isGroup': 'isGroup',
                'showcount': 0,
                'docType': '图书',
                'targetField': '',
                'targetFieldLog': '全部字段',
                'orginQuery': f'{keyword} mediatype:(0 OR 1 OR 2) ',
                'searchType': '2'
            }
            start = time.time()
            # page 1 was already fetched above; only re-request for later pages
            if i != 1:
                query_string1 = urlencode(payload)
                soup = getrequest(url + query_string1, headers)
            try:
                pageInfoBookList = getDetailInfo(soup)
            except Exception as e:
                log.info(f'{keyword} 获取第{i}页失败------{e}')
                pageInfoBookList = []
            allBooksInfo.extend(pageInfoBookList)
            time.sleep(5)
            log.info(f'采集第{i}页 耗时 {baseCore.getTimeCost(start, time.time())}')
            # break
        # dump the collected dicts to a per-keyword Excel file via pandas
        dfInfo = pd.DataFrame(allBooksInfo)
        excel_path = f'data_{keyword}.xlsx'
        dfInfo.to_excel(excel_path, sheet_name=keyword, index=False)
        return dfInfo


if __name__ == "__main__":
    keywords = ['国资委', '辅导读本', '辅导百问', '企业混合所有制改革', '企业改革']
    # keywords = ['国资委']
    excel_path = './data/国家图书馆.xlsx'  # combined Excel output path
    dfs = {}
    # url = 'http://find.nlc.cn/search/ajaxSearch'
    url = 'http://find.nlc.cn/search/doSearch?'
    for keyword in keywords:
        start1 = time.time()
        try:
            dfInfo = main(url, keyword, headers)
            dfs[keyword] = dfInfo
        except Exception as e:
            log.info(f'{keyword} 爬取失败------{e}')
            continue
        log.info(f'采集关键词{keyword} 耗时 {baseCore.getTimeCost(start1, time.time())}')
    # write every keyword's DataFrame into one workbook, one sheet per keyword
    with xlsxwriter.Workbook(excel_path, {'nan_inf_to_errors': True}) as writer:
        for sheet_name, df in dfs.items():
            worksheet = writer.add_worksheet(sheet_name)
            # column headers first, then the data rows
            # (this local "headers" shadows the request headers dict, which is
            # no longer needed at this point)
            headers = list(df.columns)
            data = [headers] + df.values.tolist()
            for row_num, row_data in enumerate(data):
                worksheet.write_row(row_num, 0, row_data)
            log.info(f"数据已写入 {excel_path} 的 {sheet_name} sheet页")
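
# A pandas-only alternative for the workbook step would be pd.ExcelWriter;
# minimal sketch (assumes the same "dfs" dict; not part of the committed code):
#   with pd.ExcelWriter(excel_path, engine='xlsxwriter') as xw:
#       for name, df in dfs.items():
#           df.to_excel(xw, sheet_name=name, index=False)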