提交 3ad4e1eb 作者: 薛凌堃

11/10

上级 ce4c997a
# 核心工具包
# 核心工具包
......@@ -524,7 +524,7 @@ class BaseCore:
if category in file_name:
pass
else:
file_name = file_name + '.' + category
file_name = file_name + category
result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
break
except:
......
......@@ -564,13 +564,13 @@ if __name__ == "__main__":
# kegaishifan()
# shuangbaiqiye()
# zhuangjingtexind()
NoticeEnterprise()
# NoticeEnterprise()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterprise()
# BaseInfoEnterpriseAbroad()
# NewsEnterprise_task()
# NewsEnterprise()
NewsEnterprise()
# CorPerson()
# china100()
# global100()
......@@ -585,8 +585,8 @@ if __name__ == "__main__":
# dujioashou()
# omeng()
# AnnualEnterpriseUS()
NoticeEnterprise_task()
AnnualEnterprise_task()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
# FinanceFromEast()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
"""模拟扫码登录"""
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
def createDriver():
    """Create a Chrome WebDriver bound to the locally installed driver and browser.

    The chromedriver executable and the Chrome binary are taken from fixed
    Windows paths configured below.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    chrome_driver = r'D:\cmd100\chromedriver.exe'
    path = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    # Proxy configuration (currently disabled):
    # proxy = "127.0.0.1:8080"  # proxy address and port
    # chrome_options.add_argument('--proxy-server=http://' + proxy)
    # `service=` is the Selenium 4 API; its matching keyword for the options
    # object is `options=`. The legacy `chrome_options=` keyword was deprecated
    # and later removed, so passing it alongside `service=` fails on current
    # Selenium releases.
    driver = webdriver.Chrome(service=path, options=chrome_options)
    return driver
def flushAndGetToken():
    """Refresh the browser page and collect its cookies.

    Reads the module-level ``browser`` WebDriver (created in the ``__main__``
    section), reloads the current page and flattens the cookie list into a
    ``name -> value`` mapping, which is printed.

    Returns:
        dict: cookie names mapped to cookie values. (Previously the mapping
        was only printed; returning it lets callers reuse the cookies, and
        callers that ignore the return value are unaffected.)
    """
    log.info('======刷新浏览器=====')
    browser.refresh()
    cookies = {cookie['name']: cookie['value'] for cookie in browser.get_cookies()}
    print(cookies)
    return cookies
def getrequest_soup(headers, url, timeout=30):
    """Fetch ``url`` with the given headers and parse the response body as HTML.

    Args:
        headers: HTTP request headers passed to ``requests.get``.
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole job.

    Returns:
        BeautifulSoup: the response content parsed with ``html.parser``.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    req = requests.get(headers=headers, url=url, timeout=timeout)
    result = BeautifulSoup(req.content, 'html.parser')
    return result
def dojob():
    """Request one qcc.com company-search page with browser-like headers.

    The parsed page is fetched through ``getrequest_soup`` but not used
    further; nothing is returned.
    """
    # Search URL; the `key` query parameter is a percent-encoded company name.
    search_url = 'https://www.qcc.com/web/search?key=%E5%B0%8F%E7%B1%B3%E9%80%9A%E8%AE%AF%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8'
    # NOTE(review): the Cookie header below embeds a captured session; it will
    # presumably expire and should not live in source control -- confirm.
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; QCCSESSID=1d489139eea4830a062c3a1240; acw_tc=db9062ad16994955552435350e3b43e7e5cee64c77d9f807936897ab1f',
        'Host': 'www.qcc.com',
        'Referer': 'https://www.qcc.com/',
        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    getrequest_soup(request_headers, search_url)
if __name__ == "__main__":
    # `browser` must stay a module-level name: flushAndGetToken() reads it.
    browser = createDriver()
    browser.get('https://www.qcc.com/')
    nav_item_locator = (By.CLASS_NAME, "nav-item")
    # Wait up to 10 seconds for the navigation entry to appear before clicking.
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(nav_item_locator))
    browser.find_element(*nav_item_locator).click()
    # Fixed 20-second pause; presumably leaves time to scan the login QR code
    # by hand -- confirm.
    time.sleep(20)
    flushAndGetToken()
#企业动态 从redis中获取数据
import json
import os
import random
import subprocess
import requests, time, pymysql
import jieba
......@@ -50,7 +52,7 @@ headers = {
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
taskType = '企业动态/天眼查/补采20W+'
taskType = '企业动态/天眼查/补采专精特新'
def reqDetailmsg(url,headers):
......@@ -76,7 +78,7 @@ def beinWork(tyc_code, social_code,start_time):
t = time.time()
url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
try:
for m in range(0, 3):
for m in range(0,3):
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
......@@ -85,11 +87,18 @@ def beinWork(tyc_code, social_code,start_time):
if (response.status_code == 200):
pass
except Exception as e:
#todo:重新放入redis中
baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode',social_code)
log.error(f"{tyc_code}-----获取总数接口失败")
error = '获取总数接口失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
#获取当前进程pid
current_pid = baseCore.getPID()
#todo: 重新启动新进程,杀死当前进程
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid,9)
return retData
try:
json_1 = json.loads(response.content.decode('utf-8'))
......@@ -126,7 +135,7 @@ def beinWork(tyc_code, social_code,start_time):
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response_page = requests.get(url=url_page, headers=headers, proxies=ip, verify=False)
time.sleep(1)
# time.sleep(3)
break
except:
pass
......@@ -172,43 +181,25 @@ def beinWork(tyc_code, social_code,start_time):
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
#return retData
continue
return retData
try:
time_struct = time.localtime(int(info_page['rtm'] / 1000)) # 首先把时间戳转换为结构化时间
time_format = time.strftime("%Y-%m-%d %H:%M:%S", time_struct) # 把结构化时间转换为格式化时间
except:
time_format = baseCore.getNowTime(1)
#记录时间 对比时间
#if time_format > '2023-09-25' and time_format < '2023-10-01':
#pass
#else:
#continue
try:
# 开始进行智能解析
# lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang)
# req = requests.get(url=link,headers=headers,timeout=10)
# html = BeautifulSoup(req.content,'html.parser')
raw_html = reqDetailmsg(link,headers)
if raw_html:
# soup = BeautifulSoup(raw_html, 'html.parser')
try:
article = smart.extract_by_html(raw_html)
content = article.cleaned_text
contentText = article.text
except Exception as e:
log.info(f'抽取失败!!{e}')
# #带标签正文
# contentText = smart.extract_by_url(link).text
# #不带标签正文
# content = smart.extract_by_url(link).cleaned_text
# # time.sleep(3)
#带标签正文
contentText = smart.extract_by_url(link).text
#不带标签正文
content = smart.extract_by_url(link).cleaned_text
if len(content) < 300:
continue
# time.sleep(3)
except Exception as e:
contentText = ''
if contentText == '':
log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败'
......@@ -253,7 +244,7 @@ def beinWork(tyc_code, social_code,start_time):
'lang': 'zh',
'origin': source,
'publishDate': time_format,
#'sid': '1684032033495392257',
# 'sid': '1684032033495392257',
'sid': '1714852232679067649',
'sourceAddress': link, # 原文链接
'summary': info_page['abstracts'],
......@@ -286,7 +277,7 @@ def beinWork(tyc_code, social_code,start_time):
# 传输成功,写入日志中
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, '成功')
baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
# return True
except Exception as e:
dic_result = {
......@@ -312,7 +303,7 @@ def doJob():
while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
# social_code = '913205007764477744'
# social_code = '912301001275921118'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
time.sleep(20)
......@@ -376,16 +367,12 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
#break
cursor.close()
cnx.close()
# 释放资源
baseCore.close()
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
log.info(f'当前进程id为{baseCore.getPID()}')
doJob()
"""
"""
Elasticsearch 安装
pip install elasticsearch==7.8.1 版本的
使用时参考文章
https://blog.csdn.net/yangbisheng1121/article/details/128528112
https://blog.csdn.net/qiuweifan/article/details/128610083
"""
import json
import time
import uuid
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
# One shared BaseCore instance supplies the logger and both database
# connection/cursor pairs used below. The original created BaseCore twice and
# threw the first instance away right after taking its logger.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
# Key prefix for objects uploaded to OBS.
pathType = 'QYNotice/'
# Task label written to the failure log by uptoOBS.
taskType = '企业公告/证监会'
# NOTE(security): the access key and secret key are hard-coded; they should be
# rotated and loaded from configuration or the environment instead.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # OBS endpoint of the bucket
)
class EsMethod(object):
    """Small helper around an Elasticsearch client for the report index."""

    def __init__(self):
        # Default index, plus a client authenticated with account credentials.
        self.index_name = 'researchreportdata'
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)

    def queryatt(self, index_name):
        """Search ``index_name`` for type "3" documents whose attachment id matches ``911*``.

        Returns the (filtered) search response: at most 200 hits, ordered by
        ``createDate`` ascending, plus the total hit count.
        """
        must_clauses = [
            {"match": {"type": "3"}},
            {"wildcard": {"attachmentIds.keyword": "911*"}},
        ]
        body = {
            "_source": ["attachmentIds", "createDate", "sourceAddress", "labels.relationId", "title", "year",
                        "publishDate", "createDate"],
            "query": {"bool": {"must": must_clauses}},
            "sort": [{"createDate": {"order": "asc"}}],
            "track_total_hits": True,
            "size": 200,
        }
        # Restrict the response to the id, the total and the needed source fields.
        source_names = ('attachmentIds', 'title', 'sourceAddress', 'createDate',
                        'labels.relationId', 'publishDate', 'year', 'createDate')
        filter_path = ['hits.hits._id', 'hits.total.value']
        filter_path.extend('hits.hits._source.' + field for field in source_names)
        result = self.es.search(index=index_name, doc_type='_doc', filter_path=filter_path, body=body)
        log.info(result)
        return result

    def updateaunn(self, index_name, id, u_attid):
        """Overwrite ``attachmentIds`` of document ``id`` with ``[str(u_attid)]``."""
        patch = {'doc': {'attachmentIds': [str(u_attid)]}}
        result = self.es.update(index=index_name, id=id, body=patch)
        log.info('更新结果:%s' % result)
def getuuid():
    """Return a new version-1 (timestamp-based) UUID object."""
    return uuid.uuid1()
# Human-readable file size
def convert_size(size_bytes):
    """Format a byte count as a string with two decimals and a unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, e.g. ``1536 -> "1.50 KB"``.
    """
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        if size_bytes < 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024
    # Anything still left after four divisions is reported in terabytes.
    return f"{size_bytes:.2f} TB"
def uptoOBS(pdf_url, pdf_name, type_id, social_code):
    """Download a PDF, upload it to OBS and extract its text.

    Args:
        pdf_url: Address the PDF is downloaded from.
        pdf_name: Accepted for interface compatibility; not used here (the
            object is stored under a generated UUID name).
        type_id: Copied into the result as ``type_id``.
        social_code: Copied into the result as ``item_id`` and used as the key
            of the failure log entry.

    Returns:
        dict: attachment metadata. ``state`` is True only when the download,
        the OBS upload and the PDF parsing all succeeded and every metadata
        field was filled in; otherwise ``state`` stays False.
    """
    start_time = time.time()
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': baseCore.getRandomUserAgent()}

    # Up to three download attempts, pausing 3 seconds after each failure.
    response = None
    for _ in range(3):
        try:
            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
    if response is None:
        # Previously this fell through with `response` undefined and was
        # misreported as an OBS failure.
        log.error(f'文件下载失败')
        return retData

    # A missing or malformed Content-Length header is not a download failure:
    # fall back to the size of the body that was actually received.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(response.content)

    name = str(getuuid()) + '.pdf'
    try:
        result = getOBSres(pathType, name, response)
    except Exception:
        log.error(f'OBS发送失败')
        return retData

    page_size = 0
    try:
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            page_size = doc.page_count
            for page in doc.pages():
                retData['content'] += page.get_text()
    except Exception:
        log.error(f'文件损坏')
        return retData
    if page_size < 1:
        # PDF parsing produced no pages.
        return retData

    try:
        object_url = result['body']['objectUrl']
        retData['path'] = object_url.split('.com')[1]
        retData['full_path'] = object_url
        retData['file_size'] = convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['page_size'] = page_size
        # Mark success last so that a failure above can never leave
        # state=True on a half-filled record.
        retData['state'] = True
    except Exception as e:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
    return retData
@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
    """Upload the body of ``response`` to the ``zzsn`` bucket under ``pathType + name``.

    The ``retry`` decorator re-runs the call (3 tries, 1 second apart) when
    it raises. Returns whatever ``obsClient.putContent`` returns.
    """
    object_key = pathType + name
    # Alternative for local files: obsClient.putFile('zzsn', object_key, file_path=<local path>)
    return obsClient.putContent('zzsn', object_key, content=response.content)
def secrchATT(item_id, retData, type_id, order_by):
    """Look up one attachment row by item id, path, type id and order.

    Returns the first matching row as fetched by ``cursor_.fetchone()``
    (its single column is ``id``).
    """
    query = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
    params = (item_id, retData['path'], type_id, order_by)
    cursor_.execute(query, params)
    return cursor_.fetchone()
# Insert into the attachment table and return the attachment id
def tableUpdate(retData, year, pdf_name, num, pub_time, origin):
    """Insert one attachment row and return its id.

    Args:
        retData: Metadata dict as produced by ``uptoOBS``.
        year: Value for the ``year`` column.
        pdf_name: Value for the ``name`` column.
        num: Value for the ``order_by`` column.
        pub_time: Value for the ``publish_time`` column.
        origin: Value for the ``source`` column.

    Returns:
        The ``id`` of the matching row in ``clb_sys_attachment``, or ``None``
        when no such row can be found (for example because the insert
        failed). The original raised ``TypeError`` in that case.
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    page_size = retData['page_size']
    create_time = retData['create_time']
    order_by = num
    try:
        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        # The object key is the part of the full URL after the bucket host.
        object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
        values = (
            year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
            status, create_by,
            create_time, page_size, object_key, 'zzsn',
            pub_time, origin)
        cursor_.execute(Upsql, values)  # insert
        cnx_.commit()  # commit
    except Exception as e:
        # Report the failure as an error and do not claim success below.
        log.error(e)
    else:
        log.info(f"更新完成:{item_id}===={pdf_name}")
    # Still look the row up after a failed insert, as the original did, in
    # case a matching row already exists.
    selects = secrchATT(item_id, retData, type_id, order_by)
    if not selects:
        return None
    return selects[0]
def upload(sourceAddress, num):
    """Upload the PDF at ``sourceAddress`` to OBS and register it as an attachment.

    Reads the module-level ``title``, ``social_code``, ``year`` and
    ``publishDate`` set by the main loop for the document being processed.

    Returns:
        The attachment id, or ``None`` when the upload/parsing failed or no
        id was obtained.
    """
    pdf_name = title + '.pdf'
    # Step 1: push the linked file to OBS.
    retData = uptoOBS(sourceAddress, pdf_name, 8, social_code)
    if not retData['state']:
        log.info(f'====pdf解析失败====')
        return None
    # Step 2: record the attachment in the database (order is num + 1).
    att_id = tableUpdate(retData, year, pdf_name, num + 1, publishDate, '证监会')
    return att_id or None
if __name__ == '__main__':
    esMethod = EsMethod()
    page = 1
    # Keep querying until no matching documents remain; each processed
    # document gets its attachmentIds rewritten.
    while True:
        result = esMethod.queryatt(index_name=esMethod.index_name)
        hits = result['hits']
        total = hits['total']['value']
        if total == 0:
            log.info('++++已没有数据+++++')
            break
        msglist = hits['hits']
        log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
        num = 0
        for mms in msglist:
            doc_id = mms['_id']
            source = mms['_source']
            # title / social_code / year / publishDate stay module-level names
            # because upload() reads them.
            title = source['title']
            sourceAddress = source['sourceAddress']
            social_code = source['labels'][0]['relationId']
            year = source['year']
            publishDate = source['publishDate']
            createDate = source['createDate']
            log.info(f'{doc_id}---{title}--{sourceAddress}---{social_code}')
            u_attid = upload(sourceAddress, num)
            esMethod.updateaunn(esMethod.index_name, str(doc_id), u_attid)
        page += 1
"""
"""
Elasticsearch 安装
pip install elasticsearch==7.8.1 版本的
使用时参考文章
https://blog.csdn.net/yangbisheng1121/article/details/128528112
https://blog.csdn.net/qiuweifan/article/details/128610083
"""
import json
import time
import uuid
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
# One shared BaseCore instance supplies the logger and both database
# connection/cursor pairs used below. The original created BaseCore twice and
# threw the first instance away right after taking its logger.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
# Key prefix for objects uploaded to OBS.
pathType = 'QYNotice/'
# Task label written to the failure log by uptoOBS.
taskType = '企业公告/证监会'
# NOTE(security): the access key and secret key are hard-coded; they should be
# rotated and loaded from configuration or the environment instead.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # OBS endpoint of the bucket
)
class EsMethod(object):
    """Small helper around an Elasticsearch client for the report index."""

    def __init__(self):
        # Default index, plus a client authenticated with account credentials.
        self.index_name = 'researchreportdata'
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)

    def queryatt(self, index_name):
        """Search ``index_name`` for type "3" documents whose attachment id is ``None``.

        Returns the (filtered) search response: at most 200 hits, ordered by
        ``createDate`` descending, plus the total hit count.
        """
        must_clauses = [
            {"match": {"type": "3"}},
            {"wildcard": {"attachmentIds.keyword": "None"}},
        ]
        body = {
            "_source": ["attachmentIds", "createDate", "sourceAddress", "labels.relationId", "title", "year",
                        "publishDate", "createDate"],
            "query": {"bool": {"must": must_clauses}},
            "sort": [{"createDate": {"order": "desc"}}],
            "track_total_hits": True,
            "size": 200,
        }
        # Restrict the response to the id, the total and the needed source fields.
        source_names = ('attachmentIds', 'title', 'sourceAddress', 'createDate',
                        'labels.relationId', 'publishDate', 'year', 'createDate')
        filter_path = ['hits.hits._id', 'hits.total.value']
        filter_path.extend('hits.hits._source.' + field for field in source_names)
        result = self.es.search(index=index_name, doc_type='_doc', filter_path=filter_path, body=body)
        log.info(result)
        return result

    def updateaunn(self, index_name, id, u_attid):
        """Overwrite ``attachmentIds`` of document ``id`` with ``[str(u_attid)]``."""
        patch = {'doc': {'attachmentIds': [str(u_attid)]}}
        result = self.es.update(index=index_name, id=id, body=patch)
        log.info('更新结果:%s' % result)
def getuuid():
    """Return a new version-1 (timestamp-based) UUID object."""
    return uuid.uuid1()
# Human-readable file size
def convert_size(size_bytes):
    """Format a byte count as a string with two decimals and a unit.

    The value is scaled down by 1024 per step, stopping once it is below
    1024 or the largest unit (TB) has been reached.
    """
    labels = ['bytes', 'KB', 'MB', 'GB', 'TB']
    step = 0
    top = len(labels) - 1
    while step < top and size_bytes >= 1024:
        size_bytes = size_bytes / 1024
        step += 1
    unit = labels[step]
    return f"{size_bytes:.2f} {unit}"
def uptoOBS(pdf_url, pdf_name, type_id, social_code):
    """Download a PDF, upload it to OBS and extract its text.

    Args:
        pdf_url: Address the PDF is downloaded from.
        pdf_name: Accepted for interface compatibility; not used here (the
            object is stored under a generated UUID name).
        type_id: Copied into the result as ``type_id``.
        social_code: Copied into the result as ``item_id`` and used as the key
            of the failure log entry.

    Returns:
        dict: attachment metadata. ``state`` is True only when the download,
        the OBS upload and the PDF parsing all succeeded and every metadata
        field was filled in; otherwise ``state`` stays False.
    """
    start_time = time.time()
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': baseCore.getRandomUserAgent()}

    # Up to three download attempts, pausing 3 seconds after each failure.
    response = None
    for _ in range(3):
        try:
            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
    if response is None:
        # Previously this fell through with `response` undefined and was
        # misreported as an OBS failure.
        log.error(f'文件下载失败')
        return retData

    # A missing or malformed Content-Length header is not a download failure:
    # fall back to the size of the body that was actually received.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(response.content)

    name = str(getuuid()) + '.pdf'
    try:
        result = getOBSres(pathType, name, response)
    except Exception:
        log.error(f'OBS发送失败')
        return retData

    page_size = 0
    try:
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            page_size = doc.page_count
            for page in doc.pages():
                retData['content'] += page.get_text()
    except Exception:
        log.error(f'文件损坏')
        return retData
    if page_size < 1:
        # PDF parsing produced no pages.
        return retData

    try:
        object_url = result['body']['objectUrl']
        retData['path'] = object_url.split('.com')[1]
        retData['full_path'] = object_url
        retData['file_size'] = convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['page_size'] = page_size
        # Mark success last so that a failure above can never leave
        # state=True on a half-filled record.
        retData['state'] = True
    except Exception as e:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
    return retData
@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
    """Store the body of ``response`` in the ``zzsn`` bucket as ``pathType + name``.

    Wrapped by ``retry`` (3 tries, 1 second delay), so a raising upload is
    attempted again. Returns the value of ``obsClient.putContent``.
    """
    payload = response.content
    # Alternative for local files: obsClient.putFile('zzsn', pathType + name, file_path=<local path>)
    upload_result = obsClient.putContent('zzsn', pathType + name, content=payload)
    return upload_result
def secrchATT(item_id, retData, type_id, order_by):
    """Fetch the id row of the attachment matching item id, path, type id and order.

    Returns the result of ``cursor_.fetchone()`` for that query.
    """
    lookup_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
    cursor_.execute(lookup_sql, (item_id, retData['path'], type_id, order_by))
    row = cursor_.fetchone()
    return row
# Insert into the attachment table and return the attachment id
def tableUpdate(retData, year, pdf_name, num, pub_time, origin):
    """Insert one attachment row and return its id.

    Args:
        retData: Metadata dict as produced by ``uptoOBS``.
        year: Value for the ``year`` column.
        pdf_name: Value for the ``name`` column.
        num: Value for the ``order_by`` column.
        pub_time: Value for the ``publish_time`` column.
        origin: Value for the ``source`` column.

    Returns:
        The ``id`` of the matching row in ``clb_sys_attachment``, or ``None``
        when no such row can be found (for example because the insert
        failed). The original raised ``TypeError`` in that case.
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    page_size = retData['page_size']
    create_time = retData['create_time']
    order_by = num
    try:
        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        # The object key is the part of the full URL after the bucket host.
        object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
        values = (
            year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
            status, create_by,
            create_time, page_size, object_key, 'zzsn',
            pub_time, origin)
        cursor_.execute(Upsql, values)  # insert
        cnx_.commit()  # commit
    except Exception as e:
        # Report the failure as an error and do not claim success below.
        log.error(e)
    else:
        log.info(f"更新完成:{item_id}===={pdf_name}")
    # Still look the row up after a failed insert, as the original did, in
    # case a matching row already exists.
    selects = secrchATT(item_id, retData, type_id, order_by)
    if not selects:
        return None
    return selects[0]
def upload(sourceAddress, num):
    """Send the PDF behind ``sourceAddress`` to OBS and store it as an attachment row.

    Uses the module-level ``title``, ``social_code``, ``year`` and
    ``publishDate`` that the main loop assigns for the current document.

    Returns:
        The attachment id on success, otherwise ``None``.
    """
    file_name = title + '.pdf'
    retData = uptoOBS(sourceAddress, file_name, 8, social_code)
    if retData['state']:
        # Attachment order is one past the number handed in by the caller.
        next_order = num + 1
        origin = '证监会'
        att_id = tableUpdate(retData, year, file_name, next_order, publishDate, origin)
        if att_id:
            return att_id
        return None
    log.info(f'====pdf解析失败====')
    return None
if __name__ == '__main__':
    esMethod = EsMethod()
    page = 1
    # Re-run the query until it returns no documents. Every processed
    # document has its attachmentIds rewritten, which is what moves the loop
    # forward.
    while True:
        result = esMethod.queryatt(index_name=esMethod.index_name)
        total = result['hits']['total']['value']
        if total == 0:
            log.info('++++已没有数据+++++')
            break
        msglist = result['hits']['hits']
        log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
        num = 0
        repaired = 0
        for mms in msglist:
            id = mms['_id']
            # title / social_code / year / publishDate are read by upload()
            # as module-level names.
            title = mms['_source']['title']
            sourceAddress = mms['_source']['sourceAddress']
            social_code = mms['_source']['labels'][0]['relationId']
            year = mms['_source']['year']
            publishDate = mms['_source']['publishDate']
            createDate = mms['_source']['createDate']
            log.info(f'{id}---{title}--{sourceAddress}---{social_code}')
            att_id = upload(sourceAddress, num)
            u_attid = att_id
            esMethod.updateaunn(esMethod.index_name, str(id), u_attid)
            if att_id:
                repaired += 1
        if repaired == 0:
            # A failed upload writes 'None' back, which is exactly what the
            # query selects, so a page with no success would be fetched
            # again forever. Stop instead of spinning.
            log.error('本页没有任何附件处理成功,停止以避免死循环')
            break
        page += 1
......@@ -313,7 +313,7 @@ def get_content1():
end_time = time.time()
log.info(f'共抓取国务院文件{num}条数据,共耗时{end_time-start_time}')
# 国务院部文件
# 国务院部文件
def get_content2():
pathType = 'policy/gwybmwj/'
def getTotalpage(bmfl,headers,session):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论