lly 2024-07-23

9cf093b3 · LiuLiYuan · 887d3377 · 9cf093b3 · 9cf093b3
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -6,7 +6,6 @@
  <component name="ChangeListManager">
    <list default="true" id="bc1f53d8-47f4-4dbb-add3-cecaf77d3733" name="变更" comment="">
      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/.idea/zzsn.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/zzsn.iml" afterDir="false" />
    </list>
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -75,7 +74,7 @@
      <workItem from="1721120377469" duration="1993000" />
      <workItem from="1721124092579" duration="948000" />
      <workItem from="1721132090791" duration="2000" />
-      <workItem from="1721723146118" duration="951000" />
+      <workItem from="1721723146118" duration="2239000" />
    </task>
    <task id="LOCAL-00001" summary="lly 2024-07-16">
      <created>1721124469886</created>
@@ -91,7 +90,14 @@
      <option name="project" value="LOCAL" />
      <updated>1721723190158</updated>
    </task>
-    <option name="localTasksCounter" value="3" />
+    <task id="LOCAL-00003" summary="lly 2024-07-23">
+      <created>1721724737883</created>
+      <option name="number" value="00003" />
+      <option name="presentableId" value="LOCAL-00003" />
+      <option name="project" value="LOCAL" />
+      <updated>1721724737883</updated>
+    </task>
+    <option name="localTasksCounter" value="4" />
    <servers />
  </component>
  <component name="TypeScriptGeneratedFilesManager">

--- a/yjzx/zk/rolandberger.py
+++ b/yjzx/zk/rolandberger.py
+# -*- coding: utf-8 -*-
+import datetime
+import json
+import re
+from base import BaseCore
+import requests
+from bs4 import BeautifulSoup
+from download import download,sendKafka,paserUrl
+# Shared project helper object; also the source of the logger and of
+# detect_language() used further down in this file.
+baseCore = BaseCore.BaseCore(sqlflg=False)
+# Module-wide logger obtained from the shared helper.
+log = baseCore.getLogger()
+# Browser-like request headers passed to every requests.get() call in this file.
+# NOTE(review): Accept-Encoding advertises "br" and "zstd". Presumably the
+# runtime has the optional brotli / zstandard decoders installed; if not, a
+# server that honours these values may return a body that cannot be decoded.
+# Confirm against the deployment environment.
+headers = {
+    'Accept': '*/*',
+    'Accept-Encoding': 'gzip, deflate, br, zstd',
+    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
+    'Cache-Control': 'max-age=0',
+    'Content-Type': 'application/json',
+    # 'Origin': 'https://www.rolandberger.com',
+    'Priority': 'u=1, i',
+    'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
+    'Sec-Ch-Ua-Mobile': '?0',
+    'Sec-Ch-Ua-Platform': '"Windows"',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'cross-site',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
+}
+def getDic(url,title,type):
+    """Fetch one publication detail page and build its record dict.
+
+    Args:
+        url: Address of the detail page to download and parse.
+        title: Title of the publication; passed to download() and stored
+            in the record.
+        type: Label text of the listing tile. Currently unused. The name
+            shadows the builtin but is kept because it is part of the
+            signature.
+
+    Returns:
+        The record dict on success, or False when the page lacks the
+        timestamp / download section, the download metadata cannot be
+        parsed, or the attachment download fails.
+
+    Raises:
+        ValueError: if the timestamp text is not in "Month DD, YYYY" form.
+        requests.RequestException: if the page request itself fails.
+    """
+    attachmentIds = []
+    # A timeout keeps one stalled connection from hanging the whole crawl.
+    req = requests.get(url, headers=headers, timeout=30)
+    soup = BeautifulSoup(req.text,'lxml')
+    # Project helper; presumably rewrites relative links against `url` - TODO confirm.
+    soup = paserUrl(soup,url)
+
+    # find() returns None when the element is absent, so check every step
+    # instead of chaining and crashing with AttributeError.
+    timestampDiv = soup.find('div',class_='c-text-timestamp')
+    dateSpan = timestampDiv.find('span') if timestampDiv is not None else None
+    downloadSection = soup.find('section',class_='m-download')
+    if dateSpan is None or downloadSection is None:
+        log.error(f'{url}===timestamp or download section not found')
+        return False
+
+    publishDate = dateSpan.text.strip()
+    publishDate = datetime.datetime.strptime(publishDate,'%B %d, %Y').strftime("%Y-%m-%d 00:00:00")
+
+    # The ng-init attribute carries "downloads=<json>"; the English entry
+    # holds the PDF address.
+    try:
+        ngInit = downloadSection.get('ng-init') or ''
+        pdfHref = json.loads(re.findall(r'downloads=(.*)', ngInit)[0])['en']['url']
+    except (IndexError, KeyError, TypeError, ValueError):
+        log.error(f'{url}===download metadata could not be parsed')
+        return False
+
+    try:
+        attachmentId = download(pdfHref,title,publishDate,headers,)
+        attachmentIds.append(attachmentId)
+    except Exception:
+        # Narrowed from a bare except so KeyboardInterrupt / SystemExit
+        # still propagate.
+        log.error(f'{url}===附件下载失败')
+        return False
+
+    # NOTE(review): the content is taken from the same timestamp <span> that
+    # supplied the publish date, as in the original code. This looks like a
+    # placeholder for the real article body - confirm the intended selector.
+    contentWithTag = dateSpan
+    content = contentWithTag.text
+    lang = baseCore.detect_language(content)
+    dic_news = {
+        'attachmentIds': attachmentIds,
+        'content': content,
+        'contentWithTag': str(contentWithTag),
+        'createDate': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        'lang': lang,
+        # NOTE(review): this origin names a different institute than the site
+        # being scraped; looks copied from another spider - confirm.
+        'origin': 'Kiel Institute for the World Economy',
+        'publishDate': publishDate,
+        # NOTE(review): `sid` is not defined or imported anywhere in this file,
+        # so this line raises NameError until the source id is supplied.
+        'sid': sid,
+        'sourceAddress': url,
+        'title': title,
+    }
+    return dic_news
+def doJob():
+    """Walk the listing API pages and call getDic() for every result tile.
+
+    Pages 0 to 5 are requested. Page 0 uses the bare query URL; later pages
+    add the haupia_pageNumber / haupia_pageSize parameters. Tiles without a
+    link, headline or label are logged and skipped so that one malformed
+    entry does not abort the whole run.
+    """
+    baseUrl = 'https://rolandberger-search-api.e-spirit.cloud/v1/prepared_search/InsightsGlobal/execute/?language=en&query=*'
+    for page in range(0, 6):
+        if page == 0:
+            url = baseUrl
+        else:
+            url = f'{baseUrl}&haupia_pageNumber={page}&haupia_pageSize=6'
+        # A timeout keeps one stalled connection from hanging the whole crawl.
+        req = requests.get(url, headers=headers, timeout=30)
+        datasJson = req.json()['results']
+        for dataJson in datasJson:
+            # Each result carries an HTML snippet describing one tile.
+            soup = BeautifulSoup(str(dataJson['snippets___tile']),'lxml')
+            link = soup.find('a')
+            headlineNodes = soup.select('a > div.headline > span')
+            labelNodes = soup.select('a > div.label-wrapper > div')
+            href = link.get('href') if link is not None else None
+            if not href or not headlineNodes or not labelNodes:
+                log.error(f'{url}===tile is missing its link, headline or label')
+                continue
+            title = headlineNodes[0].text
+            # Named `label` locally so the builtin `type` is not shadowed.
+            label = labelNodes[0].text
+            # NOTE(review): the record is built but not forwarded anywhere;
+            # sendKafka is imported by this file yet never called - confirm
+            # whether sending is still to be added.
+            dic = getDic(href,title,label)
+if __name__ == '__main__':
+    # Parse the list page: the data comes from the API.
+    # Parse the detail page: results are handled differently depending on the label.
+    # NOTE(review): this entry point only saves one sample PDF to ./a.pdf and
+    # does not call doJob() - confirm whether that is intentional.
+    pdfUrl = 'https://www.rolandberger.com/publications/publication_pdf/Roland_Berger_Private-equity-DACH_State-of-the-Region.pdf'
+    pdfBytes = requests.get(pdfUrl,headers=headers).content
+    with open(r'./a.pdf','wb') as pdfFile:
+        pdfFile.write(pdfBytes)
\ No newline at end of file