Commit dd34432c by XveLingKun

国家图书馆 (National Library of China)

Parent 3f728307
import json
import re
import sys
import time

import pandas as pd
import requests
import xlsxwriter
from bs4 import BeautifulSoup
from retry import retry
from urllib.parse import urlencode

sys.path.append('../base')
import BaseCore

baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    # 'Content-Length': '575',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Host': 'find.nlc.cn',
    'Origin': 'http://find.nlc.cn',
    # 'Origin': 'http://find.nlc.cn/search/doSearch?',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
    'X-Requested-With': 'XMLHttpRequest'
}
@retry(tries=3, delay=3)
def getrequest(url, headers):
    """Fetch a page and return a parsed BeautifulSoup object; retried up to 3 times on failure."""
    req = requests.get(url=url, headers=headers)
    if req.status_code == 200:
        # print(req.text)
        return BeautifulSoup(req.text, 'html.parser')
    # Any non-200 response triggers a retry via the decorator
    raise requests.HTTPError(f'unexpected status code {req.status_code} for {url}')
def getDetailInfo(soup):
    """Parse one result page: follow every book's detail link and collect its metadata fields."""
    pageBookInfo = []
    tableList = soup.find_all('div', class_='article_item')
    for book1 in tableList:
        info = book1.find('div', class_='book_name')
        # The detail link is built by an onclick handler, e.g.:
        # <a href="javascript:void(0);" onclick="makeDetailUrl(this, '/search/showDocDetails?', '-49925015253155232', 'ucs01', '国资委');" target="_blank">
        # which resolves to a URL such as:
        # http://find.nlc.cn/search/showDocDetails?docId=-5060698567676905396&dataSource=ucs01&query=企业混合所有制改革
        hrefInfo = str(info).split('onclick="makeDetailUrl(this,')[1]
        hrefInfo = hrefInfo.split(');" target=')[0].replace("'", '').strip(' ')
        hrefInfo_list = hrefInfo.split(',')
        href = f'http://find.nlc.cn/search/showDocDetails?docId={hrefInfo_list[1].strip(" ")}&dataSource={hrefInfo_list[2].strip(" ")}&query={hrefInfo_list[-1].strip(" ")}'
        # Fields to collect: title, publisher, publication date, responsible parties, ISBN,
        # keywords, classification (Chinese Library Classification), series title, physical description
        detail_soup = getrequest(href, headers)
        # Dictionary holding the extracted fields for this book
        book_info = {}
        book_items = detail_soup.find('div', id='book_wr').find_all('div', class_='book_item')
        # The title is a special case: it carries no "key: value" structure
        book_info['书名'] = detail_soup.find('div', class_='book_name').get_text().strip()
        # Walk every book_item in the summary block and extract key/value pairs
        for item in book_items:
            try:
                key = item.find('span', class_='book_type').get_text().replace(':', '').replace("\n", "").strip(" ")
                try:
                    value = item.find('span', class_='book_val').get_text().replace("\n", "").strip(" ")
                except AttributeError:
                    value = item.find('span', class_='book_t_val').get_text().replace("\n", "").strip(" ")
                book_info[key] = value
            except Exception:
                continue
        # In the detail block the two span classes are swapped: 'book_val' holds the key, 'book_type' the value
        book_items2 = detail_soup.find('div', id='detail-info').find_all('div', class_='book_item')
        for item in book_items2:
            key = item.find('span', class_='book_val').get_text().replace(':', '').replace("\n", "").strip(" ")
            value = item.find('span', class_='book_type').get_text().replace("\n", "").strip(" ")
            book_info[key] = value
        log.info(book_info)
        pageBookInfo.append(book_info)
        time.sleep(1)
    return pageBookInfo
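# Illustrative alternative (not used by the script above): the makeDetailUrl(...) arguments could also
# be pulled out with the already-imported `re` module instead of chained str.split calls. A minimal
# sketch under that assumption; the helper name parse_detail_href is hypothetical and the pattern
# assumes the onclick attribute keeps exactly the format shown in the comment inside getDetailInfo.
def parse_detail_href(onclick_html):
    m = re.search(r"makeDetailUrl\(this,\s*'([^']*)',\s*'([^']*)',\s*'([^']*)',\s*'([^']*)'\)", onclick_html)
    if not m:
        return None
    path, doc_id, data_source, query = m.groups()
    return f'http://find.nlc.cn/search/showDocDetails?docId={doc_id}&dataSource={data_source}&query={query}'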
def main(url, keyword, headers):
    """Search the NLC catalogue for one keyword, walk every result page and dump the books to Excel."""
    payload = {
        'query': keyword,
        'secQuery': '',
        'actualQuery': f'{keyword} mediatype:(0 OR 1 OR 2) ',
        'pageNo': 1,
        'orderBy': 'RELATIVE',
        'queryField': '',
        'fldText': '全部检索字段',
        'isGroup': 'isGroup',
        'showcount': 0,
        'docType': '图书',
        'targetField': '',
        'targetFieldLog': '全部字段',
        'orginQuery': f'{keyword} mediatype:(0 OR 1 OR 2) ',
        'searchType': '2'
    }
    # ip = baseCore.get_proxy()
    # Encode the payload as a URL query string and request the first result page
    query_string = urlencode(payload)
    soup = getrequest(url + query_string, headers)
    if not soup:
        return pd.DataFrame()
    # Total number of hits, taken from the result header
    totalCount = int(soup.find('div', class_='search_result').find('b').text)
    # Ten results per page, so round the page count up
    totalPage = int(totalCount / 10) + (1 if totalCount % 10 != 0 else 0)
    log.info(f'当前关键词:{keyword},总页数:{totalPage}')
    allBooksInfo = []
    for i in range(1, totalPage + 1):
        start = time.time()
        if i != 1:
            # Page 1 was already fetched above; only later pages need a fresh request
            payload['pageNo'] = i
            query_string1 = urlencode(payload)
            soup = getrequest(url + query_string1, headers)
        try:
            pageInfoBookList = getDetailInfo(soup)
        except Exception as e:
            log.info(f'{keyword} 获取第{i}页失败------{e}')
            pageInfoBookList = []
        allBooksInfo.extend(pageInfoBookList)
        time.sleep(5)
        log.info(f'采集第{i}页 耗时 {baseCore.getTimeCost(start, time.time())}')
        # break
    # Write this keyword's records to their own Excel file via pandas
    dfInfo = pd.DataFrame(allBooksInfo)
    excel_path = f'data_{keyword}.xlsx'
    dfInfo.to_excel(excel_path, sheet_name=keyword, index=False)
    return dfInfo
if __name__ == "__main__":
    keywords = ['国资委', '辅导读本', '辅导百问', '企业混合所有制改革', '企业改革']
    # keywords = ['国资委']
    excel_path = './data/国家图书馆.xlsx'  # path of the combined output workbook
    dfs = {}
    # url = 'http://find.nlc.cn/search/ajaxSearch'
    url = 'http://find.nlc.cn/search/doSearch?'
    for keyword in keywords:
        start1 = time.time()
        try:
            dfInfo = main(url, keyword, headers)
            dfs[keyword] = dfInfo
        except Exception as e:
            log.info(f'{keyword} 爬取失败------{e}')
            continue
        log.info(f'采集关键词{keyword} 耗时 {baseCore.getTimeCost(start1, time.time())}')
    # Create one xlsxwriter Workbook and give every keyword's DataFrame its own sheet
    with xlsxwriter.Workbook(excel_path, {'nan_inf_to_errors': True}) as writer:
        for sheet_name, df in dfs.items():
            worksheet = writer.add_worksheet(sheet_name)
            # Column headers first, then the data rows
            columns = list(df.columns)
            data = [columns] + df.values.tolist()
            for row_num, row_data in enumerate(data):
                worksheet.write_row(row_num, 0, row_data)
            log.info(f"数据已写入 {excel_path} 的 {sheet_name} sheet页")
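# Note: the manual write_row loop above is roughly equivalent to letting pandas drive the same
# xlsxwriter engine. A minimal sketch of that alternative (not part of the original script); it
# assumes every DataFrame in `dfs` fits an ordinary sheet and only uses the public pandas.ExcelWriter API:
#
# with pd.ExcelWriter(excel_path, engine='xlsxwriter',
#                     engine_kwargs={'options': {'nan_inf_to_errors': True}}) as writer:
#     for sheet_name, df in dfs.items():
#         df.to_excel(writer, sheet_name=sheet_name, index=False)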