Commit fb623647 by XveLingKun

全国图书参考资讯联盟

Parent dd34432c
import re
import sys
import time
import pandas as pd
import requests
import xlsxwriter
from bs4 import BeautifulSoup
sys.path.append('../base')
import BaseCore
baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
from retry import retry
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection': 'keep-alive',
'Cookie': '__dxca=7e6b49ee-a68d-44c4-ad3a-a6aee9f61d15; userIPType_abo=1; msign_dsr=1724389027576; JSESSIONID=10695447F441B3A8419F617399A43B6A.tomcat511; nopubuser_abo=0; groupenctype_abo=1; partuser=431%5f7320%5f18703752600%5f2%5fC0FD03D19F12C14125544CF87BA03AD2; userId_abo=4950050; groupId=431; schoolid_abo=7320; userName_dsr=18703752600; user_enc_abo=997B3770399404A1EC79EDA3E33509F1; schoolName_abo=%u5168%u56fd%u56fe%u4e66%u9986%u53c2%u8003%u54a8%u8be2%u8054%u76df; displayname_abo=%u859b%u51cc%u5803; AID_dsr=7320; enc_abo=F9BF0B6B1D5CD539CC87E0D91901C463; DSSTASH_LOG=C%5f35%2dUN%5f7320%2dUS%5f0%2dT%5f1724389027576; duxiu=userName%5fdsr%2c%3d18703752600%2c%21userid%5fdsr%2c%3d%2d1%2c%21char%5fdsr%2c%3d%2c%21metaType%2c%3d0%2c%21logo%5fdsr%2c%3dareas%2fucdrs%2fimages%2flogo%2ejpg%2c%21logosmall%5fdsr%2c%3darea%2fucdrs%2flogosmall%2ejpg%2c%21title%5fdsr%2c%3d%u5168%u56fd%u56fe%u4e66%u9986%u53c2%u8003%u54a8%u8be2%u8054%u76df%2c%21url%5fdsr%2c%3d%2c%21compcode%5fdsr%2c%3d%2c%21province%5fdsr%2c%3d%u5176%u5b83%2c%21isdomain%2c%3d3%2c%21showcol%2c%3d0%2c%21isfirst%2c%3d0%2c%21og%2c%3d0%2c%21ogvalue%2c%3d0%2c%21cdb%2c%3d0%2c%21userIPType%2c%3d1%2c%21lt%2c%3d0%2c%21enc%5fdsr%2c%3d7C0C26A7F2C9F1F2F3C447AF3465EFEC; idxdom=www%2eucdrs%2esuperlib%2enet; conter_abo=1; searchcount=4',
'Host': 'book.ucdrs.superlib.net',
'Referer': 'http://book.ucdrs.superlib.net/search?channel=search&gtag=&sw=%E5%9B%BD%E8%B5%84%E5%A7%94&ecode=utf-8&Field=all&Sort=&adminid=&btype=&seb=0&pid=0&year=&sectyear=&showc=0&fenleiID=&searchtype=&authid=0&exp=0&expertsw=&Pages=2',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}
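# Note: the Cookie above is a session-bound value copied from a logged-in browser
# (it carries a JSESSIONID and account identifiers), so it will eventually expire
# and may need to be refreshed before the script is run again.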
@retry(tries=3, delay=3)
def getrequest(url, headers):
    req = requests.get(url=url, headers=headers)
    if req.status_code == 200:
        # print(req.text)
        soup = BeautifulSoup(req.text, 'html.parser')
        return soup
    # A non-200 response raises, so @retry re-attempts the request (3 tries, 3 s apart)
    raise requests.RequestException(f'unexpected status {req.status_code} for {url}')
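# Hedged sketch (not used by the script as committed): main() contains a commented-out
# baseCore.get_proxy() call, suggesting requests could be routed through a proxy.
# The return format of get_proxy() is assumed here to be a dict usable as requests'
# `proxies` argument, e.g. {'http': 'http://ip:port'}; adjust to whatever BaseCore
# actually returns.
@retry(tries=3, delay=3)
def getrequest_with_proxy(url, headers):
    proxies = baseCore.get_proxy()  # assumed format, see note above
    req = requests.get(url=url, headers=headers, proxies=proxies)
    if req.status_code != 200:
        raise requests.RequestException(f'unexpected status {req.status_code} for {url}')
    return BeautifulSoup(req.text, 'html.parser')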
def getDetailInfo(soup):
    pageBookInfo = []
    tableList = soup.find_all('table', class_='book1')
    for book1 in tableList:
        td = book1.find_all('td')[-1]
        href = 'http://book.ucdrs.superlib.net/' + td.find('a').get('href')
        # Detail-page fields: title, author, physical description, publisher,
        # publication date, ISBN, CLC classification, original list price, subject terms
        detail_soup = getrequest(href, headers)
        dd_tags = detail_soup.find('dl').find_all('dd')
        # Collect the extracted fields for this book
        data_dict = {}
        # The title sits in the div with class "tutilte"
        data_dict['书名'] = detail_soup.find('div', class_='tutilte').get_text()
        # Walk the <dd> tags; each is assumed to look like 【label】value
        for dd in dd_tags:
            content = dd.get_text()
            key = content[1:content.find('】')].strip()
            value = content[content.find('】') + 1:].strip()
            data_dict[key] = value
        pageBookInfo.append(data_dict)
        time.sleep(15)
    return pageBookInfo
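# Illustration of the 【label】value parsing above: a <dd> whose text is
# "【作者】王某某 著" yields key = "作者" and value = "王某某 著", so each book's
# dict ends up keyed by the Chinese field labels shown on the detail page
# (the sample text here is made up for illustration).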
def main(url, keyword):
    url_ = url.format(keyword, 1)
    # ip = baseCore.get_proxy()
    soup = getrequest(url_, headers)
    if soup:
        # Result summary text, e.g. "... 123 种"
        result_info = soup.find('div', id='searchinfo').find('b').text
        # Pull the total number of titles out of the summary with a regex
        match = re.search(r'(\d+)\s*种', result_info)
        if match:
            totalCount = int(match.group(1))
        else:
            print(f'{keyword} failed to match the result count')
            totalCount = 0
        # 10 results per page (note: this over-counts by one page when the
        # total is an exact multiple of 10)
        totalPage = totalCount // 10 + 1
        print(f'Current keyword: {keyword}, total pages: {totalPage}')
        allBooksInfo = []
        for i in range(1, totalPage + 1):
            start = time.time()
            if i != 1:
                soup = getrequest(url.format(keyword, i), headers)
            try:
                pageInfoBookList = getDetailInfo(soup)
            except Exception as e:
                print(f'{keyword} failed to fetch page {i}------{e}')
                pageInfoBookList = []
            allBooksInfo.extend(pageInfoBookList)
            time.sleep(30)
            print(f'Page {i} collected in {baseCore.getTimeCost(start, time.time())}')
        # Dump this keyword's records to its own Excel file with pandas
        dfInfo = pd.DataFrame(allBooksInfo)
        excel_path = f'data_{keyword}.xlsx'
        dfInfo.to_excel(excel_path, sheet_name=keyword, index=False)
        return dfInfo
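# Hedged sketch: per the note in __main__ below, the site asks for verification after
# roughly five pages when requests arrive too quickly. A randomized delay is one way
# to pace requests less predictably than the fixed time.sleep(15/30) calls above; the
# 20-40 second range is an assumption, not a value taken from the site.
import random

def polite_sleep(low=20, high=40):
    time.sleep(random.uniform(low, high))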
if __name__ == "__main__":
    """
    全国图书馆参考联盟: after about 5 pages the site asks for verification; at normal
    speed each page takes roughly two seconds. With the request rate reduced, one page
    now takes about 36 seconds; 25 pages take
    """
    # keywords = ['国资委', '辅导读本', '辅导百问', '企业混合所有制改革', '企业改革']
    keywords = ['企业改革']
    excel_path = './data/全国图书馆参考资讯联盟2.xlsx'  # path of the combined Excel file
    dfs = {}
    url = 'http://book.ucdrs.superlib.net/search?channel=search&gtag=&sw={}&ecode=utf-8&Field=all&Sort=3&adminid=&btype=&seb=0&pid=0&year=&sectyear=&showc=0&fenleiID=&searchtype=&authid=0&exp=0&expertsw=&Pages={}'
    for keyword in keywords:
        start1 = time.time()
        try:
            dfInfo = main(url, keyword)
            dfs[keyword] = dfInfo
        except Exception as e:
            print(f'{keyword} crawl failed------{e}')
            continue
        print(f'Keyword {keyword} collected in {baseCore.getTimeCost(start1, time.time())}')
    # Write every keyword's DataFrame into one workbook, one sheet per keyword
    with xlsxwriter.Workbook(excel_path, {'nan_inf_to_errors': True}) as workbook:
        for sheet_name, df in dfs.items():
            # One sheet per DataFrame
            worksheet = workbook.add_worksheet(sheet_name)
            # Column titles followed by the row values
            # (renamed so the list no longer shadows the module-level `headers` dict)
            column_titles = list(df.columns)
            data = [column_titles] + df.values.tolist()
            for row_num, row_data in enumerate(data):
                worksheet.write_row(row_num, 0, row_data)
            print(f'Data written to sheet {sheet_name} of {excel_path}')