提交 826e44dc 作者: 薛凌堃

政策法规

上级 d4d70643
......@@ -39,6 +39,19 @@ headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}
# 将html中的相对地址转换成绝对地址
def paserUrl(html,listurl):
    """Rewrite relative link targets in a parsed document to absolute URLs.

    Args:
        html: Parsed document object exposing ``find_all``; presumably a
            BeautifulSoup tree -- confirm against the callers.
        listurl: Base URL that relative addresses are resolved against.

    Returns:
        The same ``html`` object, with its tags modified in place.
    """
    # Walk every <a> and <img> tag in the document.
    for link in html.find_all(['a', 'img']):
        # Check each attribute independently so that a tag carrying both
        # ``href`` and ``src`` gets both of them resolved.
        for attr in ('href', 'src'):
            if attr in link.attrs:
                link[attr] = urljoin(listurl, link[attr])
    return html
def replaceUrl(hostUrl,src):
if '../' in src:
src = src.strip('../')
......@@ -157,6 +170,9 @@ def redefid(idList):
id_ = ','.join(map(str, idList))
return id_
def remove_dup():
    """No-op stub: currently performs no work and returns ``None``."""
def get_content1():
start_time = time.time()
num = 0
......@@ -454,18 +470,7 @@ def get_content3():
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# 将html中的相对地址转换成绝对地址
def paserUrl(html,listurl):
    """Convert relative ``href``/``src`` addresses in a document to absolute.

    Args:
        html: Parsed document object exposing ``find_all``; presumably a
            BeautifulSoup tree -- confirm against the callers.
        listurl: Base URL used to resolve relative addresses.

    Returns:
        The same ``html`` object, with its tags modified in place.
    """
    # Fetch all <a> and <img> tags.
    tags = html.find_all(['a', 'img'])
    for tag in tags:
        # Resolve ``href`` and ``src`` separately: a tag that has both
        # attributes must have both converted, not just ``href``.
        if 'href' in tag.attrs:
            tag['href'] = urljoin(listurl, tag['href'])
        if 'src' in tag.attrs:
            tag['src'] = urljoin(listurl, tag['src'])
    return html
# 北京
......@@ -556,9 +561,12 @@ def bei_jing():
pass
else:
continue
att_id = baseCore.tableUpdate(retData,'北京市国资委',file_name,num)
att_id,full_path = baseCore.tableUpdate(retData,'北京市国资委',file_name,num)
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
id_ = redefid(id_list)
#todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -578,7 +586,7 @@ def bei_jing():
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': pub_time,
'sid': '0987654321',
'sid': '1697458829758697473',
'sourceAddress': '',
'summary': '',
'title': title
......
......@@ -469,12 +469,12 @@ class BaseCore:
pass
else:
return retData
page_size = 0
if category == '.doc' or category == '.docx':
# page_size = self.doc_page(file_href)
return retData
if category == '.pdf' or category == '.PDF':
page_size = self.pdf_page(resp_content)
# page_size = 0
# if category == '.doc' or category == '.docx':
# # page_size = self.doc_page(file_href)
# return retData
# if category == '.pdf' or category == '.PDF':
# page_size = self.pdf_page(resp_content)
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content)
......@@ -483,18 +483,18 @@ class BaseCore:
except:
time.sleep(3)
continue
if page_size>0:
pass
else:
self.getLogger().info(f'======解析失败=====')
return retData
# if page_size>0:
# pass
# else:
# self.getLogger().info(f'======解析失败=====')
# return retData
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
retData['page_size'] = page_size
# retData['page_size'] = page_size
return retData
def secrchATT(self,item_id,file_name,type_id):
......@@ -522,21 +522,21 @@ class BaseCore:
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id
return id,full_path
else:
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
create_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,file_name,type_id)
id = selects[0]
return id
return id,full_path
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论