提交 9cf093b3 作者: LiuLiYuan

lly 2024-07-23

上级 887d3377
......@@ -6,7 +6,6 @@
<component name="ChangeListManager">
<list default="true" id="bc1f53d8-47f4-4dbb-add3-cecaf77d3733" name="变更" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/zzsn.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/zzsn.iml" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
......@@ -75,7 +74,7 @@
<workItem from="1721120377469" duration="1993000" />
<workItem from="1721124092579" duration="948000" />
<workItem from="1721132090791" duration="2000" />
<workItem from="1721723146118" duration="951000" />
<workItem from="1721723146118" duration="2239000" />
</task>
<task id="LOCAL-00001" summary="lly 2024-07-16">
<created>1721124469886</created>
......@@ -91,7 +90,14 @@
<option name="project" value="LOCAL" />
<updated>1721723190158</updated>
</task>
<option name="localTasksCounter" value="3" />
<task id="LOCAL-00003" summary="lly 2024-07-23">
<created>1721724737883</created>
<option name="number" value="00003" />
<option name="presentableId" value="LOCAL-00003" />
<option name="project" value="LOCAL" />
<updated>1721724737883</updated>
</task>
<option name="localTasksCounter" value="4" />
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
......
# -*- coding: utf-8 -*-
import datetime
import json
import re
from base import BaseCore
import requests
from bs4 import BeautifulSoup
from download import download,sendKafka,paserUrl
# Shared helper object; sqlflg=False skips SQL-connection setup.
baseCore = BaseCore.BaseCore(sqlflg=False)
# Module-level logger used by every function below.
log = baseCore.getLogger()
# Browser-like request headers reused for every HTTP call in this module
# (mimics Edge 126 on Windows so the target site serves normal pages).
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Content-Type': 'application/json',
    # 'Origin': 'https://www.rolandberger.com',
    'Priority': 'u=1, i',
    'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'cross-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
}
def getDic(url, title, type):
    """Fetch one detail page, download its PDF attachment and build a news record.

    :param url: detail-page URL to scrape.
    :param title: article title, passed to ``download`` as the file name.
    :param type: category label from the list page (currently unused here).
    :return: dict of article fields ready for downstream delivery, or
             ``False`` when the attachment download fails.
    """
    attachmentIds = []
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    # Rewrite relative links to absolute ones against the page URL.
    soup = paserUrl(soup, url)
    publishDate = soup.find('div', class_='c-text-timestamp').find('span').text.strip()
    # Site renders dates like "July 23, 2024"; normalize to a timestamp string.
    publishDate = datetime.datetime.strptime(publishDate, '%B %d, %Y').strftime("%Y-%m-%d 00:00:00")
    # The download widget stores its config as JSON inside the ng-init attribute;
    # pick the English variant's file URL.
    pdfHref = json.loads(re.findall('downloads=(.*)', soup.find('section', class_='m-download').get('ng-init'))[0])['en']['url']
    try:
        attachmentId = download(pdfHref, title, publishDate, headers)
        attachmentIds.append(attachmentId)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    except Exception:
        log.error(f'{url}===附件下载失败')
        return False
    # NOTE(review): this selects the same timestamp <span> used for publishDate,
    # so `content` is just the date text — confirm the intended content selector.
    contentWithTag = soup.find('div', class_='c-text-timestamp').find('span')
    content = contentWithTag.text
    lang = baseCore.detect_language(content)
    dic_news = {
        'attachmentIds': attachmentIds,
        'content': content,
        'contentWithTag': str(contentWithTag),
        'createDate': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'lang': lang,
        # NOTE(review): origin names Kiel Institute but the URLs target
        # rolandberger.com — verify the intended source label.
        'origin': 'Kiel Institute for the World Economy',
        # NOTE(review): `sid` is not defined anywhere in this module; presumably
        # a global injected elsewhere — confirm, otherwise this raises NameError.
        'sid': sid,
        'publishDate': publishDate,
        'sourceAddress': url,
        'title': title,
    }
    return dic_news
def doJob():
    """Walk the first six pages of the Insights search API and build a
    record for every article tile found on each page."""
    base = 'https://rolandberger-search-api.e-spirit.cloud/v1/prepared_search/InsightsGlobal/execute/?language=en&query=*'
    for page in range(0, 6):
        # Page 0 uses the bare endpoint; later pages add paging parameters.
        if page == 0:
            url = base
        else:
            url = f'{base}&haupia_pageNumber={page}&haupia_pageSize=6'
        req = requests.get(url, headers=headers)
        datasJson = req.json()['results']
        for dataJson in datasJson:
            # Each result carries an HTML "tile" snippet; parse it for link, title, label.
            soup = BeautifulSoup(str(dataJson['snippets___tile']), 'lxml')
            href = soup.find('a').get('href')
            title = soup.select('a > div.headline > span')[0].text
            # Renamed from `type` to avoid shadowing the builtin.
            label = soup.select('a > div.label-wrapper > div')[0].text
            dic = getDic(href, title, label)
            # NOTE(review): `dic` is never forwarded (sendKafka is imported but
            # unused) and may be False on download failure — confirm whether
            # the delivery step is missing here.
if __name__ == '__main__':
    # Parse the list pages: fetch data from the search API.
    # Parse the detail pages: handling differs depending on the tags present.
    # Ad-hoc smoke test: download one known PDF to ./a.pdf (doJob() is not invoked here).
    url = 'https://www.rolandberger.com/publications/publication_pdf/Roland_Berger_Private-equity-DACH_State-of-the-Region.pdf'
    req = requests.get(url,headers=headers)
    with open(r'./a.pdf','wb') as f:
        f.write(req.content)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论