Commit db863d92  Author: 薛凌堃

02/05

Parent 7d517080
......@@ -582,22 +582,7 @@ def login():
url = 'https://www.qcc.com'
driver.get(url)
driver.maximize_window()
# from selenium.webdriver.support import expected_conditions as EC
# wait = WebDriverWait(driver, 10)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
# # page_source = browser.page_source
# # soup = BeautifulSoup(page_source,'html.parser')
# # print(soup)
# driver.find_element(By.CLASS_NAME, 'nav-item').click()
# time.sleep(10)
# wait = WebDriverWait(driver, 10)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "login-change")))
# driver.find_element(By.CLASS_NAME, 'login-change').click()
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[1]/div[2]/a').click()
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[1]/input').send_keys('18703752600')
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[2]/input').send_keys('angel2468')
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[4]/button').click()
# time.sleep(3)
# cookie_list = driver.get_cookies()
cookieinfo = token.getToken()
if cookieinfo:
......@@ -607,14 +592,20 @@ def login():
time.sleep(30)
return
id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1])
cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'}, {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'}, {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'}, {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'}, {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
cookie_list = json.loads(cookieinfo[1])
# cookie_list = json.dumps(cookieinfo[1])
print(cookie_list)
# cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'}, {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'}, {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'}, {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'}, {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
for cookie in cookie_list:
cookie['expiry'] = int(cookie['expiry'])
# del cookie['expiry']
driver.add_cookie(cookie)
time.sleep(5)
driver.refresh()
url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
driver.get(url_test)
# driver.get('https://www.qcc.com/')
time.sleep(60)
return driver,id_cookie
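The hunk above replaces the hard-coded cookie_list with cookies loaded from the token table. A minimal sketch of that injection step, assuming the stored value is valid JSON (the helper name inject_cookies is illustrative, not part of the commit); Selenium only accepts cookies whose domain matches the page currently loaded, so the site is opened first:

import json
import time

def inject_cookies(driver, cookie_json, url='https://www.qcc.com'):
    # Open the target site first so the cookie domains match.
    driver.get(url)
    for cookie in json.loads(cookie_json):
        if 'expiry' in cookie:
            cookie['expiry'] = int(cookie['expiry'])
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            # A stale or mismatched-domain cookie; skip it rather than abort the login.
            print(f"skipped cookie {cookie.get('name')}: {e}")
    driver.refresh()
    time.sleep(5)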
......@@ -695,7 +686,7 @@ if __name__ == '__main__':
# category = ''
# exchange = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
# count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
time.sleep(10)
# break
# baseCore.r.close()
......
......@@ -34,10 +34,12 @@ def flushAndGetToken():
cookie_list = browser.get_cookies()
cookies = {}
print(cookie_list)
for cookie in cookie_list:
cookies[cookie['name']] = cookie['value']
print(cookies)
return cookies
# for cookie in cookie_list:
# cookies[cookie['name']] = cookie['value']
# print(cookies)
# return cookies
print(type(cookie_list))
return cookie_list
if __name__ == "__main__":
urlqcc = 'https://www.qcc.com/'
......@@ -51,7 +53,10 @@ if __name__ == "__main__":
browser.find_element(By.CLASS_NAME, 'nav-item').click()
time.sleep(20)
cookies = flushAndGetToken()
cookies = json.dumps(cookies)
# print(cookies)
# cookies = json.dumps(cookies)
cookies = str(cookies)
# print(cookies)
insert = f"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
cursor_.execute(insert)
cnx_.commit()
......
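login() above parses the stored value with json.loads, so serializing the cookie list with json.dumps (rather than str()) keeps the writer and the reader consistent. A hedged sketch of the storing side, reusing the QCC_token columns from this hunk with a parameterized insert; save_cookies is an illustrative helper, not part of the commit:

import json

def save_cookies(browser, cursor, cnx):
    # Store the Selenium cookies as JSON so the consumer can json.loads() them back.
    cookie_json = json.dumps(browser.get_cookies(), ensure_ascii=False)
    sql = ("insert into QCC_token (cookies, create_time, fenghao_time, update_time) "
           "values (%s, now(), DATE_SUB(NOW(), INTERVAL 1 DAY), now())")
    cursor.execute(sql, (cookie_json,))
    cnx.commit()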
"""
Fetch all the matching records from ES
"""
import json
import threading
import time
import uuid
import redis
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
baseCore = BaseCore.BaseCore()
# Use a connection pool
# cnx_ = baseCore.pool_11.connection()
# cursor_ = cnx_.cursor()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
pathType = 'QYNotice/'
taskType = '企业研报/东方财富网'
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
class EsMethod(object):
def __init__(self):
# Create the Elasticsearch client and supply the account credentials
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum):
body = {
"query": {
"bool": {
"must": [
{
"match_phrase": {
"content": "Error Times"
}
},
{
"match": {
"type": "3"
}
}
]
}
},
"track_total_hits": True,
"size": 200,
"from": pnum
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.attachmentIds',
'hits.hits._source.sourceAddress',
'hits.hits._source.labels.relationId',
] # fields to return
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
def main(page, p, esMethod):
redis_conn = redis.Redis(connection_pool=pool)
result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# return
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
return
log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
sourceAddress = mms['_source']['sourceAddress']
att_id = mms['_source']['attachmentIds'][0]
social_code= mms['_source']['labels'][0]['relationId']
log.info(f'{id}--{att_id}---{sourceAddress}---')
item = f'{id}|{att_id}|{sourceAddress}|{social_code}'
redis_conn.lrem('Notice:id', 0, item)
redis_conn.lpush('Notice:id', item)
redis_conn.lrem('Notice:id111', 0, item)
redis_conn.lpush('Notice:id111', item)
def run_threads(num_threads,esMethod,j):
threads = []
for i in range(num_threads):
page = j + i + 1
p = j + i * 200
thread = threading.Thread(target=main, args=(page, p, esMethod))
threads.append(thread)
thread.start()
for thread in threads:
thread.join()
if __name__ == "__main__":
j = 0
for i in range(24):
esMethod = EsMethod()
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# break
start = time.time()
num_threads = 5
run_threads(num_threads, esMethod, j)
j += 1000
log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
\ No newline at end of file
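The from/size paging above walks far past 10,000 documents; if the index keeps Elasticsearch's default max_result_window, offsets beyond that will be rejected. A scroll cursor is one way to page through the full result set. A rough sketch against the same index and query (not part of the commit):

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
body = {
    "query": {"bool": {"must": [
        {"match_phrase": {"content": "Error Times"}},
        {"match": {"type": "3"}}
    ]}},
    "size": 200
}

# Open a scroll context and keep pulling batches until an empty page comes back.
resp = es.search(index='researchreportdata', body=body, scroll='2m')
while resp['hits']['hits']:
    for hit in resp['hits']['hits']:
        print(hit['_id'])
    resp = es.scroll(scroll_id=resp['_scroll_id'], scroll='2m')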
"""
Push the records found by the ES query into Redis, then pop them from Redis,
re-download each link, update the file path in the attachment table, and update the content field in ES.
"""
import json
import os
import threading
import time
import uuid
import redis
import requests
from bs4 import BeautifulSoup
from obs import ObsClient
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
baseCore = BaseCore.BaseCore()
import os
import subprocess
import uuid
import requests,time, json, sys
from retry import retry
from obs import ObsClient
import fitz
from urllib.parse import unquote
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # your Huawei Cloud secret key (SK)
server='https://obs.cn-north-1.myhuaweicloud.com' # your bucket's endpoint
)
es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
index_name = 'researchreportdata'
taskType = '公告处理错误数据'
pathType = 'QYNotice/'
def updateaunn(index_name,id,content):
body = {
'doc': {
'content': [str(content)]
}
}
result = es.update(index=index_name
,id=id
,body=body)
log.info('更新结果:%s' % result)
# Get the file size as a human-readable string
def convert_size(size_bytes):
# Units for conversion
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units)-1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # time-based uuid, globally unique
return get_timestamp_uuid
def uptoOBS(pdf_url,type_id,social_code):
headers = {}
category = os.path.splitext(pdf_url)[1]
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# TODO: check whether the content was fetched successfully
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
# TODO: check whether the content was fetched successfully
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except:
time.sleep(3)
continue
name = str(getuuid()) + category
try:
result = getOBSres(pathType, name, response)
except:
log.error(f'OBS发送失败')
return retData
if page_size < 1:
# PDF parsing failed
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='要上传的那个文件的本地路径')
return result
def tableUpdate(retData,att_id):
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
page_size = retData['page_size']
try:
# Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = """update clb_sys_attachment set path= %s, full_path= %s, category=%s,file_size=%s,page_size=%s,object_key=%s,bucket_name=%s where id=%s"""
values = (
path, full_path, category, file_size, page_size,full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1],'zzsn', att_id)
# lock.acquire()
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
# lock.release()
except Exception as e:
log.info(e)
return False
log.info(f"更新完成:{att_id}====")
return True
def GetContent(pdf_url, social_code, start_time,att_id):
# Upload to the Huawei Cloud OBS server
retData = uptoOBS(pdf_url, 8, social_code)
# Insert the attachment record into the attachment table
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
# Get the current process PID
current_pid = baseCore.getPID()
# TODO: restart a fresh process and kill the current one
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
return False
att_id = tableUpdate(retData, att_id)
if att_id:
pass
else:
return False
content = retData['content']
return content
if __name__ == '__main__':
while True:
item = '23121403366|1731946331417709349|https://data.eastmoney.com/notices/detail/CG/AN202311081609279378.html|ZZSN230824151229535'
# item = baseCore.redicPullData('Notice:id')
log.info(item)
if item:
es_id = item.split('|')[0]
att_id = item.split('|')[1]
# Original article URL
href = item.split('|')[2]
social_code = item.split('|')[3]
art_code = href.split('/')[-1].split('.')[0]
t = int(time.time() * 1000)
json_url = f'https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code={art_code}&client_source=web&page_index=1&_={t}'
for n1 in range(0, 3):
try:
ip = baseCore.get_proxy()
json_2 = requests.get(json_url, proxies=ip, verify=False).json()
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(60)
continue
try:
pdf_url = json_2['data']['attach_url']
except:
pdf_url = ''
if pdf_url:
# TODO: download the PDF file
start_time = time.time()
content = GetContent(pdf_url, social_code, start_time, att_id)
if content:
# Announcement info list
log.info(f'{att_id}==============解析传输操作成功')
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
# TODO: update ES
updateaunn(index_name, es_id, content)
time.sleep(2)
else:
baseCore.rePutIntoR('Noticeerror:id', item)
else:
log.info('######已无数据######')
break
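With the hard-coded test item and the commented-out redicPullData call above, the loop processes the same record on every pass; the production form would presumably pop items until the queue drains, along these lines (a sketch using the helpers already referenced in this file):

while True:
    item = baseCore.redicPullData('Notice:id')  # pop the next 'es_id|att_id|href|social_code' entry
    if not item:
        log.info('######已无数据######')
        break
    es_id, att_id, href, social_code = item.split('|')
    # ... same download / tableUpdate / updateaunn flow as above ...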
"""
Foreign think tanks - EU / OECD
"""
from bs4 import BeautifulSoup
import requests
from datetime import datetime
url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
req = requests.get(url=url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
div_list = div_part.find_all('div', class_='row panel')
for div in div_list[:1]:
# div = div_.find_all('div')
# print(div)
title = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title').text
href = 'https://www.oecd-ilibrary.org' + div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('a')['href']
pubtime_ = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title gray').text
# Format of the original date string
time_format = "%d %b %Y"
# Convert to a standard datetime
standard_time = datetime.strptime(pubtime_, time_format)
pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
# print(pdf_part)
pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
req_news = requests.get(url=href, headers=headers)
soup_news = BeautifulSoup(req_news.content, 'html.parser')
print(title, standard_time, pdf_url, href)
contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
content = contentWithTag.get_text()
......@@ -2,10 +2,98 @@
中证智能财讯
"""
import json
import sys
import time
from obs import ObsClient
import fitz
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from selenium import webdriver
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # your Huawei Cloud secret key (SK)
server='https://obs.cn-north-1.myhuaweicloud.com' # your bucket's endpoint
)
def create_driver():
path = r'D:\soft\msedgedriver.exe'
# options = webdriver.EdgeOptions()
options = {
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
"extensions": [], "args": ["--start-maximized"] # 添加最大化窗口运作参数
}
}
driver = webdriver.Edge(executable_path=path, capabilities=options)
return driver
@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
result = obsClient.putContent('zzsn', f'{pathType}/' + name, content=response.content)
# result = obsClient.putFile('zzsn', pathType+name, file_path=response)
return result
def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
name = str(baseCore.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = getOBSres(pathType, f'{now_time}/{name}', response) # getOBSres takes (pathType, name, response)
except:
log = baseCore.getLogger()
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = baseCore.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# PDF parsing failed
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = baseCore.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
def zzcx():
url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
......@@ -39,13 +127,50 @@ def zzcx():
for news in records:
title = news['title']
news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
# Open the page with an automated browser
driver = create_driver()
driver.get(news_url)
div_ = driver.find_element(By.ID, 'line')
div = div_.find_element(By.XPATH, '..')
image_data = div.screenshot_as_base64
# TODO: upload to OBS and replace the tag with the returned link
baseCore.uptoOBS()
html = driver.page_source
news_req = requests.get(url=news_url, headers=headers)
news_soup = BeautifulSoup(news_req.content, 'html.parser')
detail_info = news_soup.find('div', class_='subTitle___svblj')
div_list = detail_info.find_all('div')
origin = div_list[0].text
publishDate = div_list[1].text
contentWithTag = news_soup.find('div', class_='editable___1EtCQ editor-editable')
content = contentWithTag.text
info_code = 'IN-20240129-0001'
result_dict = {
'id': '',
'sid': '1751787750127857666',
'title': title,
'organ': origin,
'origin': '国务院国有资产监督管理委员会',
# '摘要': zhaiyao,
'source': 16,
'content': content,
'contentWithTag': contentWithTag,
'publishDate': publishDate,
'sourceAddress': news_url,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
if __name__ == "__main__":
zzcx()
\ No newline at end of file
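One way to complete the TODO in the loop above (where baseCore.uptoOBS() is called without arguments): decode the element screenshot, push it to the same OBS bucket, and swap an <img> tag into the rendered page. A sketch under those assumptions; the object key prefix is made up:

import base64
from bs4 import BeautifulSoup

png_bytes = base64.b64decode(image_data)  # image_data is the base64 screenshot captured above
object_key = f'zzcx/{baseCore.getuuid()}.png'
result = obsClient.putContent('zzsn', object_key, content=png_bytes)
soup = BeautifulSoup(html, 'html.parser')  # html is driver.page_source from above
chart = soup.find('div', id='line')
if chart is not None and chart.parent is not None:
    # The screenshot was taken of the parent of #line, so replace that parent with the image.
    img = soup.new_tag('img', src=result['body']['objectUrl'])
    chart.parent.replace_with(img)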
......@@ -151,7 +151,7 @@ log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
'数据源_0504']
datas = db_storage.find({'postCode':'2'}).limit(5)
datas = db_storage.find({'postCode':'2'}).limit(1)
for data in datas:
title = data['titleForeign']
contentWithTag = data['richTextForeign']
......@@ -170,5 +170,5 @@ for data in datas:
# f.write(dic_info_)
# break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5001/translate',data=dic_info_,headers=headers)
log.info(req.text)
\ No newline at end of file