# Commit a45d37fb, author: 薛凌堃
#
# Commit message: 国外智库 (foreign think tanks)
#
# Parent commit: 4f59604c
"""
Foreign think tanks (国外智库): European Union / OECD (经合组织).
"""
import json
import time
import pymongo
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from kafka import KafkaProducer
from retry import retry
import BaseCore
# Shared project helper; this file uses its logger, MySQL cursor/connection,
# OBS upload and language detection.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# MongoDB collection used to remember which article URLs were already collected.
# NOTE(review): the connection credentials are hard-coded in source; consider
# moving them to configuration.
db_storage = pymongo.MongoClient(
    'mongodb://114.115.221.202:27017',
    username='admin',
    password='ZZsn@9988',
)['ZZSN']['国外智库']
@retry(tries=2, delay=5)
def sendKafka(dic):
    """Publish one article record to the ``research_center_fourth`` Kafka topic.

    The record is serialised as UTF-8 JSON. The call waits for the broker's
    acknowledgement, so the success log line is only written for a message
    that was actually delivered and the ``retry`` decorator can re-run the
    send when delivery fails.

    Args:
        dic: JSON-serialisable article record; must contain ``sourceAddress``.

    Raises:
        Exception: whatever the Kafka client raises when connecting or
            delivering fails, once the retry attempts are used up.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
    try:
        kafka_result = producer.send("research_center_fourth",
                                     json.dumps(dic, ensure_ascii=False).encode('utf8'))
        # send() only queues the message; block until it is acknowledged.
        kafka_result.get(timeout=30)
    finally:
        # A new producer is created on every call, so always release it.
        producer.close()
    log.info(f'{dic["sourceAddress"]}传输成功')
def secrchATT(item_id, retData, type_id, order_by):
    """Look up an attachment row in ``clb_sys_attachment``.

    The row is matched on ``item_id``, ``retData['path']``, ``type_id`` and
    ``order_by``. Returns the cursor's ``fetchone()`` result for a query that
    selects only the ``id`` column.
    """
    query = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
    params = (item_id, retData['path'], type_id, order_by)
    baseCore.cursor_.execute(query, params)
    return baseCore.cursor_.fetchone()
# Insert a row into the attachment table and return the attachment id.
def tableUpdate(retData, file_name, num, publishDate, origin):
    """Record an uploaded file in ``clb_sys_attachment`` and return its row id.

    Args:
        retData: upload result dict. The keys read here are ``item_id``,
            ``type_id``, ``group_name``, ``path``, ``full_path``,
            ``category``, ``file_size``, ``status``, ``create_by`` and
            ``create_time``.
        file_name: attachment name without extension; ``.pdf`` is appended.
        num: value stored in the ``order_by`` column.
        publishDate: value stored in the ``publish_time`` column.
        origin: value stored in the ``source`` column.

    Returns:
        The ``id`` of the inserted row, or ``None`` when the row cannot be
        read back after the insert.
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    create_time = retData['create_time']
    order_by = num
    # The object key is the part of the URL that follows the bucket host.
    object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
    Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    values = (
        file_name + '.pdf', type_id, item_id, group_name, path, full_path, category, file_size, order_by,
        status, create_by,
        create_time, object_key, 'zzsn', publishDate, origin)
    baseCore.cursor_.execute(Upsql, values)  # insert
    baseCore.cnx_.commit()  # commit
    baseCore.getLogger().info("更新完成:{}".format(Upsql))
    selects = secrchATT(item_id, retData, type_id, order_by)
    # Return None instead of failing with TypeError on ``None[0]`` when the
    # read-back finds no row.
    if not selects:
        return None
    return selects[0]
def save_data(dic_news):
    """Insert a summary of a collected article into the ``db_storage`` collection.

    Only the first 100 characters of the tagged content are stored; the
    ``tid`` field is always written as an empty string.
    """
    record = {}
    record['附件id'] = dic_news['attachmentIds']
    record['网址'] = dic_news['sourceAddress']
    record['tid'] = ''
    record['来源'] = "经济合作与发展组织"
    record['创建时间'] = dic_news['createDate']
    record['带标签内容'] = dic_news['contentWithTag'][:100]
    record['发布时间'] = dic_news['publishDate']
    record['标题'] = dic_news['title']
    db_storage.insert_one(record)
@retry(tries=2, delay=5)
def translate(title, contentWithTag):
    """Translate a title and its HTML content through the remote translation service.

    Args:
        title: title text to translate.
        contentWithTag: HTML content to translate, as a string.

    Returns:
        A ``(titleRaw, contentWithTagRaw)`` tuple. ``titleRaw`` is the
        translated title as plain text; ``contentWithTagRaw`` is the
        translated content as a parsed ``BeautifulSoup`` object.

    Raises:
        RuntimeError: if the service answers with ``status == 'failed'``.
    """
    headers = {
        'Content-Type': 'application/json',
    }
    payload = json.dumps({
        'title': title,
        'contentWithTag': contentWithTag,
    })
    # NOTE(review): no timeout is set, so a stalled service blocks the crawl.
    req = requests.post('http://117.78.23.14:5001/translate', data=payload, headers=headers)
    dataJson = req.json()
    if dataJson['status'] == 'failed':
        # A bare ``raise`` with no active exception only fails by accident
        # ("No active exception to reraise"); raise the same exception type
        # with a message that explains the failure.
        raise RuntimeError('translation service returned status "failed"')
    # Strip any markup from the translated title; keep the content parsed.
    titleRaw = BeautifulSoup(dataJson['title'], 'html.parser').text
    contentWithTagRaw = BeautifulSoup(dataJson['contentWithTag'], 'html.parser')
    return titleRaw, contentWithTagRaw
def doJob():
    """Crawl one OECD iLibrary listing page and publish the new entries.

    For every entry of the listing whose URL is not yet stored in
    ``db_storage``, the article page is fetched, title and description are
    translated, the PDF is uploaded through ``baseCore.uptoOBS`` and
    registered as an attachment, and the assembled record is sent to Kafka
    and recorded in MongoDB. The loop stops at the first entry published on
    or before 2023-01-30.

    Relies on the module-level names ``pathType``, ``taskType`` and
    ``create_by`` being defined before the call.
    """
    # NOTE(review): num is incremented before its first use, so the first
    # attachment is stored with order_by 2 - confirm this is intended.
    num = 1
    url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
        'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    req = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
    div_list = div_part.find_all('div', class_='row panel')
    for div in div_list:
        start_time = time.time()
        # Title, link and publication date all sit in the same intro paragraph.
        intro = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item')
        title = intro.find('strong', class_='book-title').text
        href = 'https://www.oecd-ilibrary.org' + intro.find('a')['href']
        if db_storage.find_one({'网址': href}):
            log.info(f'{href}===已采集')
            continue
        pubtime_ = intro.find('strong', class_='book-title gray').text
        # Convert the listing's "%d %b %Y" date into ISO form; ISO date strings
        # compare correctly as plain strings.
        standard_time = datetime.strptime(pubtime_, "%d %b %Y").strftime("%Y-%m-%d")
        if standard_time <= '2023-01-30':
            # presumably the listing is newest-first, so everything after
            # this entry is older as well - verify against the site.
            break
        pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
        pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
        req_news = requests.get(url=href, headers=headers)
        soup_news = BeautifulSoup(req_news.content, 'html.parser')
        contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
        if contentWithTag is None:
            # Skip this entry instead of aborting the whole run with an
            # AttributeError when the description block is missing.
            log.error(f'{href}===未找到正文内容')
            continue
        content = contentWithTag.get_text()
        # Translate the title and the tagged content.
        try:
            titleRaw, contentWithTagRaw = translate(str(title), str(contentWithTag))
            log.info(f'{href}===翻译成功')
        except Exception as e:
            log.error(f'{href}===翻译失败==={e}')
            continue
        retData = baseCore.uptoOBS(pdf_url, title, 15, '', pathType, taskType, start_time, create_by)
        num += 1
        id_list = []
        if retData['state']:
            att_id = tableUpdate(retData, title, num, standard_time, '经济合作与发展组织')
            if att_id:
                id_list.append(att_id)
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        lang = baseCore.detect_language(content)
        # translate() returns the content as a parsed document; take both its
        # plain text and its markup.
        contentRaw = contentWithTagRaw.text
        contentWithTagRaw = str(contentWithTagRaw)
        dic = {
            'id': f'1620244462491893761{int(time.time())}',
            'subjectId': '1620244462491893761',
            'checkStatus': 1,
            'deleteFlag': 0,
            'topNum': 0,
            'content': content,
            'contentRaw': contentRaw,
            'contentWithTag': str(contentWithTag),
            'contentWithTagRaw': contentWithTagRaw,
            'createDate': now,
            'labels': [
                {'labelMark': 'organization', 'relationId': '1619903523269271554', 'relationName': '经济合作与发展组织'}],
            'lang': lang,
            'origin': '经济合作与发展组织',
            'publishDate': standard_time,
            'sourceAddress': href,
            'title': title,
            'titleRaw': titleRaw,
            'updateDate': now,
            'attachmentIds': id_list
        }
        sendKafka(dic)
        try:
            save_data(dic)
        except Exception:
            # Catch Exception only, so Ctrl-C / SystemExit still stop the job.
            log.error(f'{href}===数据库保存失败')
if __name__ == "__main__":
    # doJob() reads these three names as module-level globals, so they are
    # bound here before it runs.
    pathType, taskType, create_by = 'PolicyDocuments/', '国外智库-经合组织', 'XueLingKun'
    doJob()
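# --- Web-page residue captured together with the file (not part of the program) ---
# Markdown format
# 0%
# You have added 0 people to this discussion. Please proceed with caution.
# Please finish editing this comment first!
# Register or sign in to comment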