提交 e96f6a29 作者: 薛凌堃

国外企业基本信息-高管信息-企业动态

上级 456ba4fa
# 雅虎财经企业动态获取 # 雅虎财经企业动态获取
# 雅虎财经企业动态获取 # 雅虎财经企业动态获取
import json
import time import time
import pymysql import pymysql
from kafka import KafkaProducer
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
...@@ -46,7 +48,7 @@ def getZx(xydm,url,title,cnx,path): ...@@ -46,7 +48,7 @@ def getZx(xydm,url,title,cnx,path):
'2', '2',
'zh' 'zh'
] ]
with cnx.cursor() as cursor:
try: try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
cursor.execute(insert_sql, tuple(list_info)) cursor.execute(insert_sql, tuple(list_info))
...@@ -56,13 +58,76 @@ def getZx(xydm,url,title,cnx,path): ...@@ -56,13 +58,76 @@ def getZx(xydm,url,title,cnx,path):
log.error("保存数据库失败") log.error("保存数据库失败")
e1 = str(e1) + '.........保存数据库失败' e1 = str(e1) + '.........保存数据库失败'
return e1 return e1
log.info(f"文章耗时,耗时{baseCore.getTimeCost(start_time_content, time.time())}")
log.info(f"文章耗时,耗时{baseCore.getTimeCost(start_time_content,time.time())}") try:
sel_sql = "select article_id from brpa_source_article where source_address = %s and social_credit_code = %s"
cursor.execute(sel_sql, (url, social_code))
row = cursor.fetchone()
id = row[0]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:插入一条数据,并传入kafka
dic_news = {
'attachmentIds': id,
'author': '',
'content': content,
'contentWithTag': content,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '天眼查',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': url, # 原文链接
'summary': '',
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': pub_time[:4]
}
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# 传输成功,写入日志中
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, '')
# return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
log.error(dic_result)
e = str(e) + '操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
except Exception as e:
log.info(f'传输失败:{social_code}----{url}')
e = '传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
except Exception as e: except Exception as e:
log.error("获取正文失败") log.error("获取正文失败")
e = str(e)+'.........获取正文失败' e = str(e) + '.........获取正文失败'
return e return e
return ''
# 拖拽30次获取企业新闻 # 拖拽30次获取企业新闻
def scroll(driver): def scroll(driver):
...@@ -76,7 +141,7 @@ if __name__ == "__main__": ...@@ -76,7 +141,7 @@ if __name__ == "__main__":
path = r'D:\chrome\chromedriver.exe' path = r'D:\chrome\chromedriver.exe'
driver = baseCore.buildDriver(path) driver = baseCore.buildDriver(path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor()
while True: while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code= baseCore.redicPullData(2) social_code= baseCore.redicPullData(2)
...@@ -131,7 +196,6 @@ if __name__ == "__main__": ...@@ -131,7 +196,6 @@ if __name__ == "__main__":
else: else:
continue continue
#判断url是否已经存在 #判断url是否已经存在
with cnx.cursor() as cursor:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s ''' sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
cursor.execute(sel_sql, (news_url,xydm)) cursor.execute(sel_sql, (news_url,xydm))
selects = cursor.fetchall() selects = cursor.fetchall()
...@@ -159,5 +223,7 @@ if __name__ == "__main__": ...@@ -159,5 +223,7 @@ if __name__ == "__main__":
count += 1 count += 1
baseCore.updateRun(social_code,runType,count) baseCore.updateRun(social_code,runType,count)
cursor.close()
cnx.close()
#释放资源 #释放资源
baseCore.close() baseCore.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论