Commit 0a703aad by 薛凌堃

中国证券报·中证网 (China Securities Journal · cs.com.cn)

Parent 79e1222f
@@ -2,8 +2,12 @@
中证智能财讯
"""
import json
import os
import sys
import time
import redis
from kafka import KafkaProducer
from obs import ObsClient
import fitz
import requests
@@ -11,6 +15,10 @@ from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from selenium import webdriver
from tempfile import NamedTemporaryFile
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
@@ -36,66 +44,12 @@ def create_driver():
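# The collapsed hunk above defines create_driver() and, presumably, the OBS
# client used below. A minimal sketch of the assumed setup (placeholder
# credentials; the real values are not shown in this diff):
# obsClient = ObsClient(access_key_id='<ak>', secret_access_key='<sk>',
#                       server='https://obs.cn-north-1.myhuaweicloud.com')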
@retry(tries=3, delay=1)
def getOBSres(pathType, name, file_path):
# upload a local file to the 'zzsn' bucket; retried up to 3 times on failure
# result = obsClient.putContent('zzsn', f'{pathType}/' + name, content=response.content)
result = obsClient.putFile('zzsn', pathType + name, file_path=file_path)
return result
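# The OBS SDK exposes the outcome on the result object; a hedged sketch of a
# stricter variant than the unchecked return above:
# if result.status >= 300:
#     raise RuntimeError(f'OBS upload failed: {result.errorCode} {result.errorMessage}')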
def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time, create_by):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
response = None
file_size = 0
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
# Content-Length may be absent; default to 0 rather than crashing on int(None)
file_size = int(response.headers.get('Content-Length', 0))
break
except requests.RequestException:
time.sleep(3)
continue
if response is None:
# all three attempts failed
return retData
page_size = 0
name = str(baseCore.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
# getOBSres takes three arguments: fold the month prefix into the object name.
# (This caller still hands over the HTTP response, which the old putContent
# variant of getOBSres expected; putFile needs a local file path instead.)
result = getOBSres(pathType, f'{now_time}/{name}', response)
except:
log = baseCore.getLogger()
log.error('OBS upload failed')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = baseCore.getLogger()
log.error('corrupted PDF file')
return retData
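# fitz is PyMuPDF: get_text() reads the text layer only, so a scanned
# (image-only) PDF yields an empty 'content' even when page_count is > 0.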
if page_size < 1:
# PDF parsing failed
# print(f'======PDF parsing failed=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = baseCore.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
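# Shape of the dict uptoOBS returns on success (field values illustrative):
# {'state': True, 'category': 'pdf', 'path': '/<pathType>/<YYYY-MM>/<uuid>.pdf',
#  'full_path': 'https://<obs-domain>/<pathType>/<YYYY-MM>/<uuid>.pdf',
#  'file_size': '1.2 MB', 'page_size': 12, 'content': '<extracted text>', ...}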
def zzcx():
driver = create_driver()
driver.maximize_window()
url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
headers = {
@@ -119,7 +73,7 @@ def zzcx():
result_json = requests.post(url=url, data=payload, headers=headers).json()
print(result_json)
pages = result_json['data']['pages']
# for page in range(1, int(pages + 1)):
for page in range(1, int(pages) + 1):
payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
payload_page = json.dumps(payload_page)
datas = requests.post(url=url, data=payload_page, headers=headers)
@@ -128,23 +82,130 @@
title = news['title']
news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
try:
flag = r.sismember('IN-20240129-0001', news_url)
if flag:
log.info('already collected and stored; skipping')
continue
except Exception as e:
continue
# news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=2eeeb171e36b42ada02dad77b80038b1'
# open the detail page in the automated browser
driver = create_driver()
driver.get(news_url)
# earlier single-screenshot approach, kept for reference: baseCore.uptoOBS()
# was called without its required arguments, and the page was re-fetched with
# requests instead of reading driver.page_source (done at the end of the loop)
# div_ = driver.find_element(By.ID, 'line')
# div = div_.find_element(By.XPATH, '..')
# image_data = div.screenshot_as_base64
# baseCore.uptoOBS()
# html = driver.page_source
# news_req = requests.get(url=news_url, headers=headers)
# news_soup = BeautifulSoup(news_req.content, 'html.parser')
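# The three loops below repeat one routine: scroll a chart node into view,
# wait for it to render, screenshot it to a temp file, upload the PNG to OBS,
# then swap the live DOM node for an <img> pointing at the hosted copy. A
# hypothetical helper factoring this out (not part of the original code; it
# reuses getOBSres and the zzsn.luyuen.com host used throughout this file):
#
# def snapshot_and_replace(driver, shot_node, dele_node, parent_node, wait=1):
#     driver.execute_script("arguments[0].scrollIntoView();", shot_node)
#     time.sleep(wait)  # let the chart finish rendering
#     tmp = NamedTemporaryFile(delete=False, suffix=".png")
#     tmp.close()
#     shot_node.screenshot(tmp.name)
#     try:
#         result = getOBSres(pathType, str(baseCore.getuuid()), tmp.name)
#         path = result['body']['objectUrl'].split('.com')[1]
#         driver.execute_script("arguments[0].remove()", dele_node)
#         driver.execute_script(
#             "var img = document.createElement('img');"
#             "img.src = 'http://zzsn.luyuen.com" + path + "';"
#             "arguments[0].insertBefore(img, arguments[0].firstChild);",
#             parent_node)
#     finally:
#         os.remove(tmp.name)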
div_photo = driver.find_elements(By.ID, 'line')
for png_ in div_photo:
div = png_.find_element(By.XPATH, './/div/div[1]/div')
# div = png_.find_element(By.CLASS_NAME, 'ant-col ant-col-17')
# scroll the chart into view so it renders before the screenshot
driver.execute_script("arguments[0].scrollIntoView();", div)
time.sleep(1)
# save the screenshot to a temporary file
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
div.screenshot(temp_file.name)
file_path = temp_file.name
# upload to OBS, then point the page at the hosted copy
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
# replace the tag: remove the original chart node
dele_tag = png_.find_element(By.XPATH, './/div/div[1]//div')
driver.execute_script("arguments[0].remove()", dele_tag)
# insert the image: create a new <img> tag pointing at the OBS copy
append_tag = png_.find_element(By.XPATH, './/div/div[1]')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
append_tag)
os.remove(file_path)
# div_undefined_line = driver.find_elements(By.ID, 'k-line-undefined')
div_undefined_line = driver.find_elements(By.ID, 'KLineSubscription')
for u_png in div_undefined_line:
div_u = u_png.find_element(By.XPATH, './/div')
# scroll the chart into view
driver.execute_script("arguments[0].scrollIntoView();", div_u)
time.sleep(3)
# save the screenshot to a temporary file
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
div_u.screenshot(temp_file.name)
file_path = temp_file.name
# upload to OBS, then point the page at the hosted copy
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
# replace the tag: remove the original chart node
dele_tag = u_png.find_element(By.XPATH, './/div')
driver.execute_script("arguments[0].remove()", dele_tag)
# insert the image: create a new <img> tag pointing at the OBS copy
# append_tag = u_png.find_element(By.XPATH, './/div')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
u_png)
os.remove(file_path)
div_line_bar = driver.find_elements(By.ID, 'bar-line-bar-line')
for lin_bar_tag in div_line_bar:
line_bars = lin_bar_tag.find_elements(By.XPATH, './/div[contains(@class, "ant-col-11")]')
for line_bar in line_bars:
photo_line_bar = line_bar.find_element(By.XPATH, './/div')
# scroll the chart into view
driver.execute_script("arguments[0].scrollIntoView();", photo_line_bar)
time.sleep(1)
# save the screenshot to a temporary file
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
photo_line_bar.screenshot(temp_file.name)
file_path = temp_file.name
# upload to OBS, then point the page at the hosted copy
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
# replace the tag: remove the original chart node
dele_tag_ = line_bar.find_element(By.XPATH, './/div')
driver.execute_script("arguments[0].remove()", dele_tag_)
# insert the image: a floated <img> at 50% width keeps the two-column chart layout
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; newElement.style.width = '50%'; newElement.style.position = 'relative'; newElement.style.float = 'left'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
line_bar)
# # disabled experiment: redraw the uploaded image on a <canvas> for a sharper copy
# driver.execute_script(f"""
# var img = new Image();
# img.src = "http://zzsn.luyuen.com{path}"; // replace with your image path
# img.onload = function() {{
# var canvas = document.createElement("canvas");
# canvas.width = img.width;
# canvas.height = img.height;
# var ctx = canvas.getContext("2d");
# ctx.drawImage(img, 0, 0);
# document.body.appendChild(canvas);
# }}; arguments[0].insertBefore(img, arguments[0].firstChild);
# """, line_bar)
os.remove(file_path)
html = driver.page_source
news_soup = BeautifulSoup(html, 'html.parser')
detail_info = news_soup.find('div', class_='subTitle___svblj')
div_list = detail_info.find_all('div')
origin = div_list[0].text
publishDate = div_list[1].text
contentWithTag = news_soup.find('div', class_='editable___1EtCQ editor-editable')
# print(contentWithTag)
# strip editor artifacts: spans that contain only a BOM character
for tag in contentWithTag.find_all('span'):
if tag.text == '\ufeff':
tag.decompose()
content = contentWithTag.text
info_code = 'IN-20240129-0001'
result_dict = {
@@ -152,25 +213,29 @@
'sid': '1751787750127857666',
'title': title,
'organ': origin,
# 'origin': '国务院国有资产监督管理委员会',
'origin': origin,
# '摘要': zhaiyao,
'source': 16,
'content': content,
# 'contentWithTag': contentWithTag,
'contentWithTag': str(contentWithTag),  # plain str so json.dumps() succeeds
'publishDate': publishDate,
'sourceAddress': news_url,
}
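# result_dict must stay JSON-serializable end to end: contentWithTag is cast
# to str above because json.dumps() below raises TypeError on a BeautifulSoup Tag.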
# log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
log.info(f'{page}--{title}--{news_url}')
print(result_dict)
# break
# break
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
# r.sadd(info_code + '-test', href)
r.sadd(info_code, news_url)
log.info('sent to Kafka successfully')
except Exception as e:
log.info(e)
finally:
producer.close()
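# NOTE: KafkaProducer.send() is asynchronous; calling kafka_result.get(timeout=...)
# would surface broker errors immediately. As written, news_url joins the
# dedupe set as soon as the send is queued without raising.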
if __name__ == "__main__":
pathType = 'PhotoDingzhi/'
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
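# db 5 holds the dedupe sets; the set name is the info_code
# ('IN-20240129-0001') checked via sismember at the top of the news loop.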
zzcx()
\ No newline at end of file