提交 826e44dc 作者: 薛凌堃

政策法规

上级 d4d70643
...@@ -39,6 +39,19 @@ headers = { ...@@ -39,6 +39,19 @@ headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
} }
def paserUrl(html, listurl):
    """Rewrite relative link targets inside ``html`` as absolute URLs.

    Every ``<a>`` and ``<img>`` element returned by ``html.find_all`` has its
    ``href`` and/or ``src`` attribute resolved against ``listurl`` using
    ``urljoin``. The elements are modified in place.

    Args:
        html: Parsed document exposing ``find_all`` (presumably a
            BeautifulSoup object or tag -- confirm against callers).
        listurl: URL of the page the markup was fetched from; used as the
            base for resolving relative addresses.

    Returns:
        The same ``html`` object that was passed in.
    """
    for tag in html.find_all(['a', 'img']):
        # Check the two attributes independently so that an element carrying
        # both ``href`` and ``src`` gets both resolved, not only ``href``.
        for attr in ('href', 'src'):
            if attr in tag.attrs:
                tag[attr] = urljoin(listurl, tag[attr])
    return html
def replaceUrl(hostUrl,src): def replaceUrl(hostUrl,src):
if '../' in src: if '../' in src:
src = src.strip('../') src = src.strip('../')
...@@ -157,6 +170,9 @@ def redefid(idList): ...@@ -157,6 +170,9 @@ def redefid(idList):
id_ = ','.join(map(str, idList)) id_ = ','.join(map(str, idList))
return id_ return id_
def remove_dup():
    """Placeholder that currently performs no work and returns ``None``.

    NOTE(review): the name suggests duplicate removal, but no logic is
    implemented here yet -- confirm the intended behaviour before relying on it.
    """
    return None
def get_content1(): def get_content1():
start_time = time.time() start_time = time.time()
num = 0 num = 0
...@@ -454,18 +470,7 @@ def get_content3(): ...@@ -454,18 +470,7 @@ def get_content3():
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urljoin from urllib.parse import urljoin
def paserUrl(html, listurl):
    """Convert relative addresses in ``html`` to absolute ones.

    Looks up all ``<a>`` and ``<img>`` elements through ``html.find_all`` and
    resolves their ``href`` / ``src`` attributes against ``listurl`` with
    ``urljoin``. The elements are updated in place.

    Args:
        html: Parsed document exposing ``find_all`` (presumably a
            BeautifulSoup object or tag -- confirm against callers).
        listurl: Base URL used to resolve relative addresses.

    Returns:
        The same ``html`` object that was passed in.
    """
    for tag in html.find_all(['a', 'img']):
        # Handle ``href`` and ``src`` separately: an element that has both
        # must have both made absolute, not just the first one found.
        for attr in ('href', 'src'):
            if attr in tag.attrs:
                tag[attr] = urljoin(listurl, tag[attr])
    return html
# 北京 # 北京
...@@ -556,9 +561,12 @@ def bei_jing(): ...@@ -556,9 +561,12 @@ def bei_jing():
pass pass
else: else:
continue continue
att_id = baseCore.tableUpdate(retData,'北京市国资委',file_name,num) att_id,full_path = baseCore.tableUpdate(retData,'北京市国资委',file_name,num)
id_list.append(att_id) id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
id_ = redefid(id_list) id_ = redefid(id_list)
#todo:替换完成之后,将附件上传至文件服务器 #todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -578,7 +586,7 @@ def bei_jing(): ...@@ -578,7 +586,7 @@ def bei_jing():
'issuedNumber': pub_hao, 'issuedNumber': pub_hao,
'publishDate': pub_time, 'publishDate': pub_time,
'writtenDate': pub_time, 'writtenDate': pub_time,
'sid': '0987654321', 'sid': '1697458829758697473',
'sourceAddress': '', 'sourceAddress': '',
'summary': '', 'summary': '',
'title': title 'title': title
......
...@@ -469,12 +469,12 @@ class BaseCore: ...@@ -469,12 +469,12 @@ class BaseCore:
pass pass
else: else:
return retData return retData
page_size = 0 # page_size = 0
if category == '.doc' or category == '.docx': # if category == '.doc' or category == '.docx':
# page_size = self.doc_page(file_href) # # page_size = self.doc_page(file_href)
return retData # return retData
if category == '.pdf' or category == '.PDF': # if category == '.pdf' or category == '.PDF':
page_size = self.pdf_page(resp_content) # page_size = self.pdf_page(resp_content)
for i in range(0, 3): for i in range(0, 3):
try: try:
result = client.upload_by_buffer(resp_content) result = client.upload_by_buffer(resp_content)
...@@ -483,18 +483,18 @@ class BaseCore: ...@@ -483,18 +483,18 @@ class BaseCore:
except: except:
time.sleep(3) time.sleep(3)
continue continue
if page_size>0: # if page_size>0:
pass # pass
else: # else:
self.getLogger().info(f'======解析失败=====') # self.getLogger().info(f'======解析失败=====')
return retData # return retData
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '') retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id']) retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size'] retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now retData['create_time'] = time_now
retData['page_size'] = page_size # retData['page_size'] = page_size
return retData return retData
def secrchATT(self,item_id,file_name,type_id): def secrchATT(self,item_id,file_name,type_id):
...@@ -522,21 +522,21 @@ class BaseCore: ...@@ -522,21 +522,21 @@ class BaseCore:
if selects: if selects:
self.getLogger().info(f'com_name:{com_name}已存在') self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0] id = selects[0]
return id return id,full_path
else: else:
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = ( values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by, status, create_by,
create_time, page_size) create_time)
self.cursor_.execute(Upsql, values) # 插入 self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交 self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql)) self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,file_name,type_id) selects = self.secrchATT(item_id,file_name,type_id)
id = selects[0] id = selects[0]
return id return id,full_path
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论