Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
a45d37fb
提交
a45d37fb
authored
2月 21, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
国外智库
上级
4f59604c
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
204 行增加
和
0 行删除
+204
-0
BaseCore.py
gwzk/BaseCore.py
+0
-0
europa.py
gwzk/europa.py
+204
-0
没有找到文件。
gwzk/BaseCore.py
浏览文件 @
a45d37fb
差异被折叠。
点击展开。
gwzk/europa.py
0 → 100644
浏览文件 @
a45d37fb
"""
国外智库-欧盟 经合组织
"""
import
json
import
time
import
pymongo
from
bs4
import
BeautifulSoup
import
requests
from
datetime
import
datetime
from
kafka
import
KafkaProducer
from
retry
import
retry
import
BaseCore
baseCore
=
BaseCore
.
BaseCore
()
log
=
baseCore
.
getLogger
()
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'国外智库'
]
@retry(tries=2, delay=5)
def sendKafka(dic):
    """Send one article record to the Kafka topic ``research_center_fourth``.

    The record is serialized as UTF-8 JSON (non-ASCII preserved via
    ``ensure_ascii=False``).

    :param dic: article dict; must contain ``sourceAddress`` (used in the log line).
    :raises Exception: any kafka/serialization error — ``@retry`` retries once
        after a 5 second delay.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                             max_request_size=1024 * 1024 * 20)
    try:
        future = producer.send("research_center_fourth",
                               json.dumps(dic, ensure_ascii=False).encode('utf8'))
        # Block until the broker acknowledges. The original never waited on the
        # future nor flushed, so the short-lived producer could be torn down
        # before the message was actually delivered.
        future.get(timeout=30)
        log.info(f'{dic["sourceAddress"]}传输成功')
    finally:
        # The original leaked one producer (sockets + background thread) per call.
        producer.close()
def secrchATT(item_id, retData, type_id, order_by):
    """Look up an existing row in ``clb_sys_attachment`` matching the given
    item, path, type and ordering.

    :param item_id: business item id the attachment belongs to.
    :param retData: upload-result dict; only ``retData['path']`` is used.
    :param type_id: attachment type id.
    :param order_by: ordering index of the attachment.
    :returns: the first matching ``(id,)`` row, or ``None`` when absent.
    """
    sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
    params = (item_id, retData['path'], type_id, order_by)
    baseCore.cursor_.execute(sel_sql, params)
    return baseCore.cursor_.fetchone()
# Insert into the attachment table and return the new attachment id.
def tableUpdate(retData, file_name, num, publishDate, origin):
    """Insert one attachment row into ``clb_sys_attachment`` and return its id.

    :param retData: dict produced by ``baseCore.uptoOBS`` (item_id, type_id,
        group_name, path, full_path, category, file_size, status, create_by,
        page_size, create_time).
    :param file_name: attachment name without extension; '.pdf' is appended.
    :param num: ordering index stored in ``order_by``.
    :param publishDate: publish time stored on the row.
    :param origin: source/origin label stored on the row.
    :returns: id of the inserted row, or ``None`` when the post-insert lookup
        finds nothing.
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    page_size = retData['page_size']  # read for parity with retData; not stored below
    create_time = retData['create_time']
    order_by = num
    # Object key is the OBS path relative to the bucket endpoint.
    object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
    Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    values = (
        file_name + '.pdf',
        type_id,
        item_id,
        group_name,
        path,
        full_path,
        category,
        file_size,
        order_by,
        status,
        create_by,
        create_time,
        object_key,
        'zzsn',  # bucket_name
        publishDate,
        origin,
    )
    baseCore.cursor_.execute(Upsql, values)  # insert
    baseCore.cnx_.commit()  # commit
    baseCore.getLogger().info("更新完成:{}".format(Upsql))
    selects = secrchATT(item_id, retData, type_id, order_by)
    # Bug fix: the original did `id = selects[0]`, shadowing the builtin `id`
    # and raising TypeError when the lookup returned None. Return None instead;
    # callers already guard with `if att_id:`.
    if selects:
        return selects[0]
    return None
def save_data(dic_news):
    """Persist a crawl summary of one article to the MongoDB collection.

    Stores attachment ids, source URL, origin, timestamps, the first 100
    characters of the tagged content, and the title (keys are Chinese labels).

    :param dic_news: full article dict as assembled in ``doJob``.
    """
    aaa_dic = {
        '附件id': dic_news['attachmentIds'],
        '网址': dic_news['sourceAddress'],
        'tid': '',
        # fix: original used an f-string with no placeholders (ruff F541);
        # the runtime value is identical.
        '来源': "经济合作与发展组织",
        '创建时间': dic_news['createDate'],
        # Only a 100-char preview of the tagged content is kept in Mongo.
        '带标签内容': dic_news['contentWithTag'][:100],
        '发布时间': dic_news['publishDate'],
        '标题': dic_news['title'],
    }
    db_storage.insert_one(aaa_dic)
@retry(tries=2, delay=5)
def translate(title, contentWithTag):
    """Translate a title and HTML fragment via the internal translation service.

    :param title: source title (may contain HTML; tags are stripped from the result).
    :param contentWithTag: source HTML content.
    :returns: tuple ``(titleRaw, contentWithTagRaw)`` — plain-text translated
        title, and the translated HTML parsed into a BeautifulSoup document.
    :raises RuntimeError: when the service reports ``status == 'failed'``
        (``@retry`` retries once after 5 seconds).
    """
    headers = {
        'Content-Type': 'application/json',
    }
    dic_info = {
        'title': title,
        # 'summary': '<div>apple</div>',
        'contentWithTag': contentWithTag
    }
    payload = json.dumps(dic_info)
    # fix: added a timeout — without one a hung translation service blocks the
    # crawler indefinitely.
    req = requests.post('http://117.78.23.14:5001/translate', data=payload,
                        headers=headers, timeout=60)
    dataJson = req.json()
    if dataJson['status'] == 'failed':
        # fix: the original used a bare `raise` with no active exception, which
        # itself raises "RuntimeError: No active exception to re-raise". Raise
        # an explicit, descriptive error so retry/log output is meaningful.
        raise RuntimeError('translate service returned status=failed')
    titleRaw = BeautifulSoup(dataJson['title'], 'html.parser').text
    contentWithTagRaw = BeautifulSoup(dataJson['contentWithTag'], 'html.parser')
    return titleRaw, contentWithTagRaw
def doJob():
    """Crawl the OECD iLibrary 'policy responses' listing: for each article not
    yet in MongoDB, translate it, upload its PDF to OBS, and push the record
    to Kafka and MongoDB.

    Reads module-level globals ``pathType``, ``taskType`` and ``create_by``,
    which are assigned in the ``__main__`` guard before this is called.
    """
    # Attachment ordering counter, passed to tableUpdate as order_by.
    num = 1
    url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
    # Browser-mimicking request headers. NOTE(review): the Cookie value is a
    # captured session (cf_clearance etc.) and will expire — presumably needs
    # periodic refreshing for the crawl to keep working; confirm.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
        'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    req = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    # The second 'body-section' div on the page holds the article list.
    div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
    div_list = div_part.find_all('div', class_='row panel')
    for div in div_list:
        start_time = time.time()
        title = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title').text
        href = 'https://www.oecd-ilibrary.org' + div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('a')['href']
        # De-duplicate against MongoDB by source URL (key '网址').
        is_href = db_storage.find_one({'网址': href})
        if is_href:
            log.info(f'{href}===已采集')  # already collected — skip
            continue
        pubtime_ = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title gray').text
        # Define the site's raw date format (e.g. "21 Feb 2024").
        time_format = "%d %b %Y"
        # Convert to standard ISO "YYYY-MM-DD".
        standard_time = datetime.strptime(pubtime_, time_format).strftime("%Y-%m-%d")
        # Cutoff: only articles published after 2023-01-30. Lexicographic
        # comparison is valid for ISO-formatted dates. `break` (not `continue`)
        # assumes the listing is sorted newest-first — TODO confirm.
        if standard_time > '2023-01-30':
            pass
        else:
            break
        year = standard_time[:4]  # NOTE(review): assigned but never used below
        pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
        pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
        req_news = requests.get(url=href, headers=headers)
        soup_news = BeautifulSoup(req_news.content, 'html.parser')
        # print(title, standard_time, pdf_url, href)
        contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
        content = contentWithTag.get_text()
        # TODO: translation — skip the article entirely if it fails.
        try:
            titleRaw, contentWithTagRaw = translate(str(title), str(contentWithTag))
            log.info(f'{href}===翻译成功')  # translation succeeded
        except Exception as e:
            log.error(f'{href}===翻译失败==={e}')  # translation failed
            continue
        # Upload the PDF to OBS. The literal 15 is presumably the attachment
        # type id — TODO confirm against BaseCore.uptoOBS.
        retData = baseCore.uptoOBS(pdf_url, title, 15, '', pathType, taskType, start_time, create_by)
        num += 1
        id_list = []
        if retData['state']:
            att_id = tableUpdate(retData, title, num, standard_time, '经济合作与发展组织')
            if att_id:
                id_list.append(att_id)
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        lang = baseCore.detect_language(content)
        # Extract plain text first, then stringify the translated soup —
        # order matters: contentWithTagRaw is rebound from soup to str here.
        contentRaw = contentWithTagRaw.text
        contentWithTagRaw = str(contentWithTagRaw)
        # Record sent downstream; 'id' = fixed subjectId + unix-seconds suffix.
        dic = {
            'id': f'1620244462491893761{int(time.time())}',
            'subjectId': '1620244462491893761',
            'checkStatus': 1,
            'deleteFlag': 0,
            'topNum': 0,
            'content': content,
            'contentRaw': contentRaw,
            'contentWithTag': str(contentWithTag),
            'contentWithTagRaw': contentWithTagRaw,
            'createDate': now,
            'labels': [{
                'labelMark': 'organization',
                'relationId': '1619903523269271554',
                'relationName': '经济合作与发展组织'
            }],
            'lang': lang,
            'origin': '经济合作与发展组织',
            'publishDate': standard_time,
            'sourceAddress': href,
            'title': title,
            'titleRaw': titleRaw,
            'updateDate': now,
            'attachmentIds': id_list
        }
        sendKafka(dic)
        try:
            save_data(dic)
        except:
            log.error(f'{href}===数据库保存失败')  # database save failed
        # break
if __name__ == "__main__":
    # Globals read by doJob()/baseCore.uptoOBS:
    # OBS storage path prefix for uploaded PDFs.
    pathType = 'PolicyDocuments/'
    # Task label ('foreign think tank - OECD') used for tracking the crawl.
    taskType = '国外智库-经合组织'
    # Operator name recorded on created attachment rows.
    create_by = 'XueLingKun'
    doJob()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论