Commit a45d37fb by 薛凌堃

Foreign think tanks

Parent 4f59604c
@@ -272,34 +272,6 @@ class BaseCore:
# Connect to Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
self.pool_11 = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.116.44.11',
port=3306,
user='caiji',
password='f7s0&7qqtK',
database='clb_project',
charset='utf8mb4'
)
def check_mysql_conn(self,conn):
try:
conn.ping()
@@ -461,16 +433,6 @@ class BaseCore:
panduan = ppp
return panduan
# # Pop (and remove) one element from a Redis list
# def redicPullData(self,type,key):
# # 1 = domestic, 2 = foreign
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# Pop (and remove) one element from a Redis list
def redicPullData(self, key):
try:
@@ -496,460 +458,3 @@ class BaseCore:
os.makedirs(path) # makedirs creates intermediate directories when the path does not exist
else:
pass
# Build a simulated Google Chrome browser; path must be the chromedriver location.
# headless decides whether to run headless (the default).
# A headed browser is useful when first working out how to parse a page, or for sites
# that cannot be collected headlessly; headless mode keeps browser windows from
# popping up during later bulk collection. See the hedged usage sketch after this method.
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
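# A hedged usage sketch for buildDriver (the chromedriver path below is a
# placeholder, not a path from this repo):
#   core = BaseCore()
#   driver = core.buildDriver(r'D:\tools\chromedriver.exe', headless=True)
#   driver.get('https://www.example.com')
#   html = driver.page_source
#   driver.quit()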
# Look up company info by unified social credit code
def getInfomation(self, social_code):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# Look up company info by company name
def getBYnameInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE CompanyName = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# Look up company info by TYC (Tianyancha) id
def getBYtycidInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE TYCID = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# Update a company's collection-run counter
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# Persist a log record to the database
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql, values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======Failed to save log=====')
# Get a QCC (Qichacha) token
def GetToken(self):
# Pick a random token from the QCC_token table
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0'  # captured token; needs refreshing roughly every two hours
self.cursor.execute(query)
token_list = self.cursor.fetchall()
self.cnx.commit()
try:
token = token_list[random.randint(0, len(token_list) - 1)][0]
except:
token = ''
return token
# Delete an invalidated token
def delete_token(self, token):
deletesql = f"delete from QCC_token where token='{token}' "
self.cursor.execute(deletesql)
self.cnx.commit()
# Get a TYC (Tianyancha) token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
self.cnx.commit()
return token
# Detect the language of a text
def detect_language(self, text):
# langid.classify returns a (language, score) tuple
lang = langid.classify(text)[0]
# Fall back to 'cn' if nothing comes back
if not lang:
return 'cn'
# Map langid codes to the codes used downstream
mapping = {'ja': 'jp', 'fr': 'fra', 'es': 'spa', 'fi': 'fin', 'vi': 'vie', 'ko': 'kor', 'da': 'dan'}
return mapping.get(lang, lang)
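# Worked examples (hedged: langid's guesses, mapped through the table above):
#   detect_language('这是一个测试')       -> 'zh'
#   detect_language('Bonjour le monde')  -> 'fra'  (langid 'fr')
#   detect_language('こんにちは世界')      -> 'jp'   (langid 'ja')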
# Check whether an excel file already exists (so callers can create it first)
def check_excel_file(self,file_path):
if os.path.isfile(file_path):
self.getLogger().info("Excel文件已存在")
return True
else:
self.getLogger().info("Excel文件不存在,正在创建...")
return False
# Append rows to an existing excel file
def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx'
# Read the existing xlsx file
existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
# Build a frame from the new rows
new_data = pd.DataFrame(data=detailList)
# Append the new rows after the existing ones
# (DataFrame.append was removed in pandas 2.x; pd.concat is the supported form)
combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# Write the combined result back to the xlsx file
combined_data.to_excel(filename, index=False)
# return combined_data
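# A hedged usage sketch combining check_excel_file and writerToExcel (the file
# name is illustrative); read_excel fails on a missing file, so create it first:
#   core = BaseCore()
#   if not core.check_excel_file('采集结果.xlsx'):
#       pd.DataFrame().to_excel('采集结果.xlsx', index=False)
#   core.writerToExcel([{'title': 't1', 'url': 'https://example.com'}], '采集结果.xlsx')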
# Re-queue companies that failed or were interrupted back into Redis
def rePutIntoR(self, key, item):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r.rpush(key, item)
# Increment a counter and return the new value
def incrSet(self, key):
# INCR the key and return the incremented value
new_value = self.r.incr(key)
print("Value after increment:", new_value)
return new_value
# Get a key's remaining time-to-live
def getttl(self, key):
# Remaining TTL in seconds; redis returns -1 for no expiry and -2 for a missing key
ttl = self.r.ttl(key)
print("Remaining TTL:", ttl)
# Treat a key without a valid TTL as expired
if ttl < 0:
# Reset the key to 0 with a one-hour expiry
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
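# A hedged sketch of the counter pattern these two helpers support (the key name
# and threshold are illustrative):
#   core.getttl('request_count')        # reset the counter if its TTL lapsed
#   if core.incrSet('request_count') > 100:
#       time.sleep(60)                  # back off once the hourly budget is spent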
# # Upload to the file server and parse the pdf's content and page count
# def upLoadToServe(self, pdf_url, type_id, social_code):
# headers = {}
# retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
# 'full_path': '',
# 'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
# 'create_time': '', 'page_size': '', 'content': ''}
# headers['User-Agent'] = self.getRandomUserAgent()
# for i in range(0, 3):
# try:
# resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
# break
# except:
# time.sleep(3)
# continue
# page_size = 0
#
# for i in range(0, 3):
# try:
# result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
# with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
# for page in doc.pages():
# retData['content'] += page.get_text()
# break
# except:
# time.sleep(3)
# continue
# if page_size < 1:
# # pdf parsing failed
# print(f'======pdf parsing failed=====')
# return retData
# else:
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# retData['state'] = True
# retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
# retData['full_path'] = bytes.decode(result['Remote file_id'])
# retData['file_size'] = result['Uploaded size']
# retData['create_time'] = time_now
# retData['page_size'] = page_size
#
# return retData
# Delete an attachment row by id
def deliteATT(self, id):
delitesql = "delete from clb_sys_attachment where id = %s"
self.cursor_.execute(delitesql, (id,))
self.cnx_.commit()
def secrchATT(self, item_id, year, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
# Insert into the attachment table; returns the attachment id
def tableUpdate(self, retData, com_name, year, pdf_name, num, pub_time,origin):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id, year, type_id)
if selects:
self.getLogger().info(f'com_name:{com_name}--{year} already exists')
id = ''
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
pub_time,origin)
self.cursor_.execute(Upsql, values)  # insert
self.cnx_.commit()  # commit
self.getLogger().info("Insert finished: {}".format(Upsql))
selects = self.secrchATT(item_id, year, type_id)
id = selects[0]
return id
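# A hedged example of calling tableUpdate with the retData produced by uptoOBS
# (all values are illustrative):
#   att_id = core.tableUpdate(retData, '某公司', '2023', '年度报告.pdf', 1,
#                             '2023-04-28', '某来源')
#   if att_id:
#       pass  # attach att_id to the downstream message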
# Update a company's CIK
def updateCIK(self, social_code, cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======Failed to save company CIK=====')
# Upload to Huawei Cloud OBS and parse the pdf's content and page count (see uptoOBS below)
# Convert a byte count into a human-readable size
def convert_size(self, size_bytes):
# Units for successive 1024x conversions
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
# Check whether a file has already been uploaded to obs
def obsexist(self, file_path):
# # Example file path
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
# Check whether the object exists (note: returns True when the file is absent)
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
self.getLogger().info('=====File not found in obs=====')
return True
else:
self.getLogger().info(f'=====File already in obs========{file_path}')
# Timestamp-based uuid used as the obs upload file name
def getuuid(self):
get_timestamp_uuid = uuid.uuid1()  # uuid1 is timestamp-based, so it is globally unique
return get_timestamp_uuid
def uptoOBS(self, pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by,headers):
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
response = None
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length', 0))
break
except:
time.sleep(3)
continue
if response is None:
# All three download attempts failed
return retData
page_size = 0
name = str(self.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = self.getOBSres(pathType, now_time, name, response)
except:
log = self.getLogger()
log.error('OBS upload failed')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = self.getLogger()
log.error('Corrupted pdf file')
return retData
if page_size < 1:
# pdf parsing failed
# print(f'======pdf parsing failed=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = self.getTimeCost(start_time, time.time())
self.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(self, pathType, now_time, name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='local path of the file to upload')
return result
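# Hedged note: obsClient is assumed to be an esdk-obs-python ObsClient built
# elsewhere in this module, along the lines of:
#   from obs import ObsClient
#   obsClient = ObsClient(access_key_id='...', secret_access_key='...',
#                         server='https://obs.cn-north-1.myhuaweicloud.com')
# @retry(tries=3, delay=1) re-invokes getOBSres up to 3 times, 1s apart, when putContent raises.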
def sendEmail(self, file_name):
# Read the attachment; a context manager ensures the handle is closed
with open(file_name, 'rb') as f:
file = f.read()
# Sender address
sender = '1195236739@qq.com'
# Recipient address
receiver = 'fujunxue@ciglobal.cn'
smtpserver = 'smtp.qq.com'
# Sender login: account and password
username = '1195236739@qq.com'
password = 'gatvszshadvpgjci'
maile_title = '企业基本信息采集情况'
message = MIMEMultipart()
message['From'] = sender
message['To'] = receiver
message['Subject'] = Header(maile_title, 'utf-8')
message.attach(MIMEText('企业基本信息采集情况', 'plain', 'utf-8'))
xlsxApart = MIMEApplication(file)
xlsxApart.add_header('Content-Disposition', 'attachment', filename='企业基本信息采集情况.xlsx')
message.attach(xlsxApart)
smtpObj = smtplib.SMTP_SSL(smtpserver)  # note: if sending fails with "remote host refused the connection", SMTP_SSL (not plain SMTP) must be used here
smtpObj.connect(smtpserver, port=465)
smtpObj.login(username, password)
smtpObj.sendmail(sender, receiver, message.as_string())
print("Mail sent successfully!!!")
smtpObj.quit()
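# A hedged usage sketch for sendEmail (the workbook name is illustrative):
#   core = BaseCore()
#   core.writerToExcel(rows, '企业基本信息采集情况.xlsx')
#   core.sendEmail('企业基本信息采集情况.xlsx')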
"""
国外智库-欧盟 经合组织
"""
import json
import time
import pymongo
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from kafka import KafkaProducer
from retry import retry
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
'国外智库']
@retry(tries=2, delay=5)
def sendKafka(dic):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
kafka_result = producer.send("research_center_fourth",
json.dumps(dic, ensure_ascii=False).encode('utf8'))
kafka_result.get(timeout=10)  # wait for the broker ack so @retry can catch send failures
log.info(f'{dic["sourceAddress"]} sent to Kafka')
def secrchATT(item_id, retData, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
baseCore.cursor_.execute(sel_sql, (item_id, retData['path'], type_id, order_by))
selects = baseCore.cursor_.fetchone()
return selects
# Insert into the attachment table; returns the attachment id
def tableUpdate(retData, file_name, num, publishDate,origin):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name+'.pdf', type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, object_key, 'zzsn', publishDate,origin)
baseCore.cursor_.execute(Upsql, values)  # insert
baseCore.cnx_.commit()  # commit
baseCore.getLogger().info("Insert finished: {}".format(Upsql))
selects = secrchATT(item_id, retData, type_id, order_by)
id = selects[0]
return id
def save_data(dic_news):
aaa_dic = {
'附件id': dic_news['attachmentIds'],
'网址': dic_news['sourceAddress'],
'tid': '',
'来源': f"经济合作与发展组织",
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate'],
'标题': dic_news['title']
}
db_storage.insert_one(aaa_dic)
@retry(tries=2, delay=5)
def translate(title, contentWithTag):
headers = {
'Content-Type': 'application/json',
}
dic_info = {
'title': title,
# 'summary': '<div>apple</div>',
'contentWithTag': contentWithTag
}
dic_info = json.dumps(dic_info)
req = requests.post('http://117.78.23.14:5001/translate', data=dic_info, headers=headers)
dataJson = req.json()
if dataJson['status'] == 'failed':
# A bare `raise` is only valid inside an except block; raise a real exception so @retry re-attempts
raise RuntimeError('translate service returned failed')
titleRaw = dataJson['title']
contentWithTagRaw = dataJson['contentWithTag']
titleRaw = BeautifulSoup(titleRaw,'html.parser')
titleRaw = titleRaw.text
contentWithTagRaw = BeautifulSoup(contentWithTagRaw,'html.parser')
return titleRaw, contentWithTagRaw
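# A hedged usage sketch (assumes the internal translate service at
# 117.78.23.14:5001 is reachable; @retry gives the call two attempts, 5s apart):
#   titleRaw, contentWithTagRaw = translate('<p>Economic Outlook</p>', '<div>...</div>')
#   # titleRaw is plain text; contentWithTagRaw is a parsed BeautifulSoup document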
def doJob():
num = 1
url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
req = requests.get(url=url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
div_list = div_part.find_all('div', class_='row panel')
for div in div_list:
start_time = time.time()
title = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title').text
href = 'https://www.oecd-ilibrary.org' + div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('a')['href']
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info(f'{href}===already collected')
continue
pubtime_ = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title gray').text
# Source date format, e.g. "30 Jan 2023"
time_format = "%d %b %Y"
# Normalize to YYYY-MM-DD
standard_time = datetime.strptime(pubtime_, time_format).strftime("%Y-%m-%d")
# ISO date strings compare lexicographically, so stop once an item dated
# 2023-01-30 or earlier is reached
if standard_time <= '2023-01-30':
break
year = standard_time[:4]
pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
req_news = requests.get(url=href, headers=headers)
soup_news = BeautifulSoup(req_news.content, 'html.parser')
# print(title, standard_time, pdf_url, href)
contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
content = contentWithTag.get_text()
# todo: translate title and content
try:
titleRaw, contentWithTagRaw = translate(str(title), str(contentWithTag))
log.info(f'{href}===translated successfully')
except Exception as e:
log.error(f'{href}===translation failed==={e}')
continue
retData = baseCore.uptoOBS(pdf_url, title, 15, '', pathType, taskType, start_time, create_by, headers)
num += 1
id_list = []
if retData['state']:
att_id = tableUpdate(retData, title, num, standard_time, '经济合作与发展组织')
if att_id:
id_list.append(att_id)
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
lang = baseCore.detect_language(content)
contentRaw = contentWithTagRaw.text
contentWithTagRaw = str(contentWithTagRaw)
dic = {
'id': f'1620244462491893761{int(time.time())}',
'subjectId': '1620244462491893761',
'checkStatus': 1,
'deleteFlag': 0,
'topNum': 0,
'content': content,
'contentRaw': contentRaw,
'contentWithTag': str(contentWithTag),
'contentWithTagRaw': contentWithTagRaw,
'createDate': now,
'labels': [
{'labelMark': 'organization', 'relationId': '1619903523269271554', 'relationName': '经济合作与发展组织'}],
'lang': lang,
'origin': '经济合作与发展组织',
'publishDate': standard_time,
'sourceAddress': href,
'title': title,
'titleRaw': titleRaw,
'updateDate': now,
'attachmentIds':id_list
}
sendKafka(dic)
try:
save_data(dic)
except:
log.error(f'{href}===failed to save to database')
# break
if __name__ == "__main__":
pathType = 'PolicyDocuments/'
taskType = '国外智库-经合组织'
create_by = 'XueLingKun'
doJob()