Commit 0a703aad (author: 薛凌堃)

中国证券报·中证网 (China Securities Journal, cs.com.cn)

Parent 79e1222f

@@ -2,8 +2,12 @@
 中证智能财讯
 """
 import json
+import os
 import sys
 import time
+import redis
+from kafka import KafkaProducer
 from obs import ObsClient
 import fitz
 import requests
@@ -11,6 +15,10 @@ from bs4 import BeautifulSoup
 from retry import retry
 from selenium.webdriver.common.by import By
 from selenium import webdriver
+from tempfile import NamedTemporaryFile
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 sys.path.append('D:\\kkwork\\zzsn_spider\\base')
 import BaseCore
 baseCore = BaseCore.BaseCore()
@@ -36,66 +44,12 @@ def create_driver():
 @retry(tries=3, delay=1)
 def getOBSres(pathType, name, response):
-    result = obsClient.putContent('zzsn', f'{pathType}/' + name, content=response.content)
-    # result = obsClient.putFile('zzsn', pathType+name, file_path=response)
+    result = obsClient.putFile('zzsn', pathType+name, file_path=response)
     return result
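
Note on the getOBSres change above: although its third parameter is still named response, the reworked helper now receives a local file path and hands it to obsClient.putFile, whereas the old version read response.content and uploaded it with putContent. A minimal usage sketch under that reading, assuming obsClient, baseCore and the module-level pathType prefix are initialised as elsewhere in this file; the helper name save_element_to_obs is illustrative, not part of the commit:

    import os
    from tempfile import NamedTemporaryFile

    def save_element_to_obs(element, pathType):
        # write the element screenshot to a throwaway PNG on disk
        temp_file = NamedTemporaryFile(delete=False, suffix=".png")
        temp_file.close()
        element.screenshot(temp_file.name)
        # getOBSres now expects a file path and forwards it to putFile
        result = getOBSres(pathType, str(baseCore.getuuid()), temp_file.name)
        os.remove(temp_file.name)
        # relative path, later used when rewriting the <img> src in the page
        return result['body']['objectUrl'].split('.com')[1]

This mirrors the screenshot-and-upload pattern repeated in the zzcx() loops added further down.
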
-def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time, create_by):
-    headers = {}
-    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
-               'full_path': '',
-               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
-               'create_time': '', 'page_size': '', 'content': ''}
-    headers['User-Agent'] = baseCore.getRandomUserAgent()
-    for i in range(0, 3):
-        try:
-            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
-            file_size = int(response.headers.get('Content-Length'))
-            break
-        except:
-            time.sleep(3)
-            continue
-    page_size = 0
-    name = str(baseCore.getuuid()) + '.pdf'
-    now_time = time.strftime("%Y-%m")
-    try:
-        result = getOBSres(pathType, now_time, name, response)
-    except:
-        log = baseCore.getLogger()
-        log.error(f'OBS发送失败')
-        return retData
-    try:
-        with fitz.open(stream=response.content, filetype='pdf') as doc:
-            page_size = doc.page_count
-            for page in doc.pages():
-                retData['content'] += page.get_text()
-    except:
-        log = baseCore.getLogger()
-        log.error(f'文件损坏')
-        return retData
-    if page_size < 1:
-        # PDF parsing failed
-        # print(f'======pdf解析失败=====')
-        return retData
-    else:
-        try:
-            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            retData['state'] = True
-            retData['path'] = result['body']['objectUrl'].split('.com')[1]
-            retData['full_path'] = result['body']['objectUrl']
-            retData['file_size'] = baseCore.convert_size(file_size)
-            retData['create_time'] = time_now
-            retData['page_size'] = page_size
-        except Exception as e:
-            state = 0
-            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
-            return retData
-    return retData
 def zzcx():
-    driver = create_driver()
-    driver.maximize_window()
     url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
     payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
     headers = {
@@ -119,7 +73,7 @@ def zzcx():
     result_json = requests.post(url=url, data=payload, headers=headers).json()
     print(result_json)
     pages = result_json['data']['pages']
-    for page in range(1, int(pages + 1)):
+    for page in range(1, int(pages) + 1):
         payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
         payload_page = json.dumps(payload_page)
         datas = requests.post(url=url, data=payload_page, headers=headers)
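
The small change in this hunk moves the + 1 outside of the int() call. That only matters if the API returns pages as a string rather than a number (an assumption, the payload type is not visible in this diff): concatenating a string with an int fails before the conversion ever runs, while converting first works in both cases. A quick illustration:

    pages = "7"                       # as it might arrive in result_json['data']['pages']
    # range(1, int(pages + 1))        # TypeError: can only concatenate str (not "int") to str
    list(range(1, int(pages) + 1))    # [1, 2, 3, 4, 5, 6, 7] -> every page gets crawled
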
@@ -128,23 +82,130 @@ def zzcx():
             title = news['title']
             news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
+            try:
+                flag = r.sismember('IN-20240129-0001', news_url)
+                if flag:
+                    log.info('信息已采集入库过')
+                    continue
+            except Exception as e:
+                continue
-            # news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=2eeeb171e36b42ada02dad77b80038b1'
             # open the page in the automated browser
+            driver = create_driver()
             driver.get(news_url)
-            div_ = driver.find_element(By.ID, 'line')
-            div = div_.find_element(By.XPATH, '..')
-            image_data = div.screenshot_as_base64
-            # TODO: upload to OBS and replace the tag with the link
-            baseCore.uptoOBS()
-            html = driver.page_source
-            news_req = requests.get(url=news_url, headers=headers)
+            div_photo = driver.find_elements(By.ID, 'line')
-            news_soup = BeautifulSoup(news_req.content, 'html.parser')
+            for png_ in div_photo:
+                div = png_.find_element(By.XPATH, './/div/div[1]/div')
+                # div = png_.find_element(By.CLASS_NAME, 'ant-col ant-col-17')
+                # TODO: scroll the element into view
+                driver.execute_script("arguments[0].scrollIntoView();", div)
+                time.sleep(1)
+                # TODO: save the screenshot to a temporary file
+                temp_file = NamedTemporaryFile(delete=False, suffix=".png")
+                temp_file.close()
+                div.screenshot(temp_file.name)
+                file_path = temp_file.name
+                # TODO: upload to OBS and replace the tag with the link
+                name = str(baseCore.getuuid())
+                result = getOBSres(pathType, name, file_path)
+                path = result['body']['objectUrl'].split('.com')[1]
+                full_path = result['body']['objectUrl']
+                # TODO: replace the tag: delete the original element
+                dele_tag = png_.find_element(By.XPATH, './/div/div[1]//div')
+                driver.execute_script("arguments[0].remove()", dele_tag)
+                # TODO: insert the image by creating a new <img> tag
+                append_tag = png_.find_element(By.XPATH, './/div/div[1]')
+                driver.execute_script(
+                    "var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
+                    append_tag)
+                os.remove(file_path)
+            # div_undefined_line = driver.find_elements(By.ID, 'k-line-undefined')
+            div_undefined_line = driver.find_elements(By.ID, 'KLineSubscription')
+            for u_png in div_undefined_line:
+                div_u = u_png.find_element(By.XPATH, './/div')
+                # TODO: scroll the element into view
+                driver.execute_script("arguments[0].scrollIntoView();", div_u)
+                time.sleep(3)
+                # TODO: save the screenshot to a temporary file
+                temp_file = NamedTemporaryFile(delete=False, suffix=".png")
+                temp_file.close()
+                div_u.screenshot(temp_file.name)
+                file_path = temp_file.name
+                # TODO: upload to OBS and replace the tag with the link
+                name = str(baseCore.getuuid())
+                result = getOBSres(pathType, name, file_path)
+                path = result['body']['objectUrl'].split('.com')[1]
+                full_path = result['body']['objectUrl']
+                # TODO: replace the tag: delete the original element
+                dele_tag = u_png.find_element(By.XPATH, './/div')
+                driver.execute_script("arguments[0].remove()", dele_tag)
+                # TODO: insert the image by creating a new <img> tag
+                # append_tag = u_png.find_element(By.XPATH, './/div')
+                driver.execute_script(
+                    "var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
+                    u_png)
+                os.remove(file_path)
+            div_line_bar = driver.find_elements(By.ID, 'bar-line-bar-line')
+            for lin_bar_tag in div_line_bar:
+                line_bars = lin_bar_tag.find_elements(By.XPATH, './/div[contains(@class, "ant-col-11")]')
+                for line_bar in line_bars:
+                    photo_line_bar = line_bar.find_element(By.XPATH, './/div')
+                    # TODO: scroll the element into view
+                    driver.execute_script("arguments[0].scrollIntoView();", photo_line_bar)
+                    time.sleep(1)
+                    # TODO: save the screenshot to a temporary file
+                    temp_file = NamedTemporaryFile(delete=False, suffix=".png")
+                    temp_file.close()
+                    photo_line_bar.screenshot(temp_file.name)
+                    file_path = temp_file.name
+                    # TODO: upload to OBS and replace the tag with the link
+                    name = str(baseCore.getuuid())
+                    result = getOBSres(pathType, name, file_path)
+                    path = result['body']['objectUrl'].split('.com')[1]
+                    full_path = result['body']['objectUrl']
+                    # TODO: replace the tag: delete the original element
+                    dele_tag_ = line_bar.find_element(By.XPATH, './/div')
+                    driver.execute_script("arguments[0].remove()", dele_tag_)
+                    # TODO: insert the image by creating a new <img> tag
+                    driver.execute_script(
+                        "var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; newElement.style.width = '50%'; newElement.style.position = 'relative'; newElement.style.float = 'left'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
+                        line_bar)
+                    # # TODO: build a sharper image via a canvas element
+                    # driver.execute_script(f"""
+                    #     var img = new Image();
+                    #     img.src = "http://zzsn.luyuen.com{path}";  // replace with your image path
+                    #     img.onload = function() {{
+                    #         var canvas = document.createElement("canvas");
+                    #         canvas.width = img.width;
+                    #         canvas.height = img.height;
+                    #         var ctx = canvas.getContext("2d");
+                    #         ctx.drawImage(img, 0, 0);
+                    #         document.body.appendChild(canvas);
+                    #     }}; arguments[0].insertBefore(img, arguments[0].firstChild);
+                    #     """, line_bar)
+                    os.remove(file_path)
+            html = driver.page_source
+            news_soup = BeautifulSoup(html, 'html.parser')
             detail_info = news_soup.find('div', class_='subTitle___svblj')
             div_list = detail_info.find_all('div')
             origin = div_list[0].text
             publishDate = div_list[1].text
             contentWithTag = news_soup.find('div', class_='editable___1EtCQ editor-editable')
-            # print(contentWithTag)
+            for tag in contentWithTag.find_all('span'):
+                if tag.text == '\ufeff':
+                    tag.decompose()
             content = contentWithTag.text
             info_code = 'IN-20240129-0001'
             result_dict = {
@@ -152,25 +213,29 @@ def zzcx():
                 'sid': '1751787750127857666',
                 'title': title,
                 'organ': origin,
-                'origin': '国务院国有资产监督管理委员会',
+                'origin': origin,
                 # '摘要': zhaiyao,
                 'source': 16,
                 'content': content,
-                'contentWithTag': contentWithTag,
+                'contentWithTag': str(contentWithTag),
                 'publishDate': publishDate,
                 'sourceAddress': news_url,
             }
-            log.info(f'{page}--{title}--{href}')
+            log.info(f'{page}--{title}--{news_url}')
-            # info_list.append(result_dict)
+            print(result_dict)
+            # break
+        # break
             producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
             try:
                 kafka_result = producer.send("crawlerInfo",
                                              json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
-                r.sadd(info_code + '-test', href)
+                r.sadd(info_code, news_url)
                 log.info('发送kafka成功!')
             except Exception as e:
                 log.info(e)
             finally:
                 producer.close()
 if __name__ == "__main__":
+    pathType = 'PhotoDingzhi/'
+    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
     zzcx()
\ No newline at end of file
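
The three loops added to zzcx() above (over the 'line', 'KLineSubscription' and 'bar-line-bar-line' containers) repeat the same five steps: scroll the chart element into view, screenshot it to a temporary PNG, upload the file through getOBSres, remove the original DOM node, and inject an <img> pointing at the uploaded copy on http://zzsn.luyuen.com. A possible consolidation is sketched below, assuming the module-level driver, obsClient, baseCore and pathType objects from this file; the helper name replace_chart_with_image and the img_style and wait parameters are illustrative, not part of the commit:

    import os
    import time
    from tempfile import NamedTemporaryFile

    def replace_chart_with_image(driver, shot_el, delete_el, insert_parent, img_style='', wait=1):
        # 1) bring the chart into the viewport so it is fully rendered before the screenshot
        driver.execute_script("arguments[0].scrollIntoView();", shot_el)
        time.sleep(wait)
        # 2) rasterise the element to a temporary PNG
        temp_file = NamedTemporaryFile(delete=False, suffix=".png")
        temp_file.close()
        shot_el.screenshot(temp_file.name)
        # 3) push the PNG to OBS and keep the relative path for the <img> src
        result = getOBSres(pathType, str(baseCore.getuuid()), temp_file.name)
        path = result['body']['objectUrl'].split('.com')[1]
        os.remove(temp_file.name)
        # 4) drop the original chart widget
        driver.execute_script("arguments[0].remove()", delete_el)
        # 5) insert a plain <img> in its place
        script = (
            "var img = document.createElement('img');"
            "img.src = 'http://zzsn.luyuen.com" + path + "';"
            "img.style.cssText = '" + img_style + "';"
            "arguments[0].insertBefore(img, arguments[0].firstChild);"
        )
        driver.execute_script(script, insert_parent)
        return path

Each loop body would then reduce to locating the relevant elements and calling the helper, for example replace_chart_with_image(driver, div, dele_tag, append_tag) for the 'line' charts, or passing img_style='width:50%;position:relative;float:left' and wait=3 where the added code uses those values.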