提交 ef0082b2 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

...@@ -20,9 +20,6 @@ from DBUtils.PooledDB import PooledDB ...@@ -20,9 +20,6 @@ from DBUtils.PooledDB import PooledDB
# import sys # import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client') # sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
client = Fdfs_client(tracker_conf)
from obs import ObsClient from obs import ObsClient
import fitz import fitz
...@@ -444,72 +441,8 @@ class BaseCore: ...@@ -444,72 +441,8 @@ class BaseCore:
# def doc_page(self,file_path): # def doc_page(self,file_path):
# doc = Document(file_path) # doc = Document(file_path)
# return len(doc.sections) # return len(doc.sections)
def pdf_content(self, resp_content):
    """Extract the plain text of a PDF held in memory.

    The raw bytes are first sent to the file server through the
    module-level ``client`` and then parsed with ``fitz``.  The whole
    upload-and-parse step is attempted up to three times, sleeping three
    seconds after each failed attempt.

    :param resp_content: raw bytes of the PDF document.
    :return: concatenated text of all pages, or ``''`` when every attempt
        failed.
    """
    content = ''
    for _ in range(3):
        try:
            # NOTE(review): the upload result is never used in this method;
            # the call is kept only to preserve the existing side effect.
            # Confirm whether it is really needed here.
            client.upload_by_buffer(resp_content, file_ext_name='pdf')
            with fitz.open(stream=resp_content, filetype='pdf') as doc:
                # Collect the text per attempt so that a failure part-way
                # through a document cannot leave already-read pages behind
                # and duplicate them on the next retry.
                pages = [page.get_text() for page in doc.pages()]
            content = ''.join(pages)
            break
        except Exception:
            # Best-effort retry; ``Exception`` (not a bare ``except``) so
            # that KeyboardInterrupt / SystemExit still propagate.
            time.sleep(3)
    return content
# 替换为绝对路径之后,解析出来a.href
def uploadToserver(self, file_href, item_id):
    """Download an attachment and store it on the file server.

    The file is fetched over HTTP (up to three attempts) and then uploaded
    through the module-level ``client`` (up to three attempts).  Each
    failed attempt is followed by a three second pause.

    :param file_href: URL of the attachment; its extension becomes the
        ``category`` of the returned record.
    :param item_id: value copied into the returned record as ``item_id``.
    :return: dict describing the attachment.  ``state`` is ``True`` only
        when both the download and the upload succeeded; otherwise the
        record is returned with ``state`` left ``False`` and the path,
        size and time fields empty.
    """
    category = os.path.splitext(file_href)[1]
    retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
               'full_path': '',
               'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Step 1: download the attachment.
    resp_content = ''
    for _ in range(3):
        try:
            # NOTE(review): certificate verification is disabled here.
            resp_content = requests.get(file_href, headers=headers, verify=False, timeout=20).content
            break
        except Exception:
            time.sleep(3)
    if not resp_content:
        return retData

    # Step 2: upload to the file server.
    result = None
    for _ in range(3):
        try:
            result = client.upload_by_buffer(resp_content, file_ext_name=category.replace('.', ''))
            self.getLogger().info('-------文件上传成功------')
            break
        except Exception:
            time.sleep(3)
    if result is None:
        # Every upload attempt failed: report failure the same way as a
        # failed download instead of crashing on the unbound ``result``.
        return retData

    remote_id = bytes.decode(result['Remote file_id'])
    retData['state'] = True
    retData['path'] = remote_id.replace('group1', '')
    retData['full_path'] = remote_id
    retData['file_size'] = result['Uploaded size']
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return retData
def secrchATT(self,item_id,file_name,type_id,order_by): def secrchATT(self,item_id,file_name,type_id,order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s ''' sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
...@@ -518,7 +451,7 @@ class BaseCore: ...@@ -518,7 +451,7 @@ class BaseCore:
return selects return selects
#插入到att表 返回附件id #插入到att表 返回附件id
def tableUpdate(self,retData,com_name,file_name,num): def tableUpdate(self,retData,com_name,file_name,num,pub_time):
item_id = retData['item_id'] item_id = retData['item_id']
type_id = retData['type_id'] type_id = retData['type_id']
group_name = retData['group_name'] group_name = retData['group_name']
...@@ -533,12 +466,12 @@ class BaseCore: ...@@ -533,12 +466,12 @@ class BaseCore:
order_by = num order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = ( values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by, status, create_by,
create_time,path,'zzsn') create_time,path,'zzsn',pub_time)
self.cursor_.execute(Upsql, values) # 插入 self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交 self.cnx_.commit() # 提交
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
"""数据全量跑一遍,不做判重逻辑""" """数据全量跑一遍,不做判重逻辑"""
import datetime import datetime
import json import json
import os
import re import re
import time import time
...@@ -131,6 +132,7 @@ def redefid(idList): ...@@ -131,6 +132,7 @@ def redefid(idList):
def remove_dup(): def remove_dup():
pass pass
# 国务院文件 # 国务院文件
def get_content1(): def get_content1():
pathType = 'policy/gwywj/' pathType = 'policy/gwywj/'
...@@ -252,12 +254,15 @@ def get_content1(): ...@@ -252,12 +254,15 @@ def get_content1():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1766',file_name) retData = baseCore.uptoOBS(file_href,'1766',file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num) att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id) id_list.append(att_id)
#todo:将返回的地址更新到soup #todo:将返回的地址更新到soup
...@@ -408,12 +413,15 @@ def get_content2(): ...@@ -408,12 +413,15 @@ def get_content2():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1699',file_name) retData = baseCore.uptoOBS(file_href,'1699',file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num) att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id) id_list.append(att_id)
#todo:将返回的地址更新到soup #todo:将返回的地址更新到soup
...@@ -516,12 +524,15 @@ def get_content3(): ...@@ -516,12 +524,15 @@ def get_content3():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1642',file_name) retData = baseCore.uptoOBS(file_href,'1642',file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院国资委',file_name,num) att_id,full_path = baseCore.tableUpdate(retData,'国务院国资委',file_name,num,pub_time)
id_list.append(att_id) id_list.append(att_id)
#todo:将返回的地址更新到soup #todo:将返回的地址更新到soup
...@@ -624,7 +635,7 @@ def get_content3(): ...@@ -624,7 +635,7 @@ def get_content3():
# 北京 # 北京
def bei_jing(): def bei_jing():
num = 0
start_time = time.time() start_time = time.time()
pathType = 'policy/beijing/' pathType = 'policy/beijing/'
# 有反爬需要使用selenium # 有反爬需要使用selenium
...@@ -637,12 +648,12 @@ def bei_jing(): ...@@ -637,12 +648,12 @@ def bei_jing():
"excludeSwitches", ["enable-automation"]) "excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False) chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en') chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument( chrome_options.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36') 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe') # bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
chrome_options.binary_location = r'D:/fbs_spider/Google/Chrome/Application/chrome.exe' chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:/fbs_spider/cmd100/chromedriver.exe' chromedriver = r'D:\cmd100\chromedriver.exe'
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver) bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
with open('../../base/stealth.min.js') as f: with open('../../base/stealth.min.js') as f:
js = f.read() js = f.read()
...@@ -732,12 +743,15 @@ def bei_jing(): ...@@ -732,12 +743,15 @@ def bei_jing():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1667',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '北京市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '北京市国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
...@@ -855,14 +869,17 @@ def nei_meng_gu(): ...@@ -855,14 +869,17 @@ def nei_meng_gu():
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1] fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re fu_jian_href = fu_jian_re
category = os.path.splitext(fu_jian_href)[1]
if category not in title:
file_name = title + category
# print(fu_jian_href) # print(fu_jian_href)
# todo:附件上传至文件服务器 # todo:附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,title) retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num) att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
log.info(title) log.info(title)
...@@ -1002,13 +1019,16 @@ def ji_lin(): ...@@ -1002,13 +1019,16 @@ def ji_lin():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = fu_jian_href.text.strip() file_name = fu_jian_href.text.strip()
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# print(fu_jian_href) # print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# #
# # todo:将返回的地址更新到soup # # todo:将返回的地址更新到soup
...@@ -1044,12 +1064,15 @@ def ji_lin(): ...@@ -1044,12 +1064,15 @@ def ji_lin():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# print(fj_href) # print(fj_href)
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name) retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# #
# # todo:将返回的地址更新到soup # # todo:将返回的地址更新到soup
...@@ -1106,7 +1129,6 @@ def ji_lin(): ...@@ -1106,7 +1129,6 @@ def ji_lin():
print('共', count, '条', '...........', '共耗时', end - start, '秒') print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 上海 # 上海
def shang_hai(): def shang_hai():
start = time.time() start = time.time()
pathType = 'policy/shanghai/' pathType = 'policy/shanghai/'
...@@ -1196,12 +1218,15 @@ def shang_hai(): ...@@ -1196,12 +1218,15 @@ def shang_hai():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '上海市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '上海市国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
...@@ -1434,13 +1459,16 @@ def fu_jian(): ...@@ -1434,13 +1459,16 @@ def fu_jian():
# 解析出pdf内容 # 解析出pdf内容
content = baseCore.pdf_content(resp_content) content = baseCore.pdf_content(resp_content)
contentwithtag = '' contentwithtag = ''
category = os.path.splitext(real_href)[1]
if category not in title:
file_name = title + category
# 文件上传至服务器 # 文件上传至服务器
retData = baseCore.uptoOBS(real_href, '1673',pathType,title) retData = baseCore.uptoOBS(real_href, '1673',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', title, num) att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,'')
id_list.append(att_id) id_list.append(att_id)
pub_hao = '' pub_hao = ''
pub_time = '' pub_time = ''
...@@ -1454,6 +1482,15 @@ def fu_jian(): ...@@ -1454,6 +1482,15 @@ def fu_jian():
i_soup = BeautifulSoup(i_html, 'html.parser') i_soup = BeautifulSoup(i_html, 'html.parser')
# 相对路径转化为绝对路径 # 相对路径转化为绝对路径
i_soup = paserUrl(i_soup, real_href) i_soup = paserUrl(i_soup, real_href)
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
# print(real_href) # print(real_href)
# todo:获取附件地址 # todo:获取附件地址
try: try:
...@@ -1470,6 +1507,9 @@ def fu_jian(): ...@@ -1470,6 +1507,9 @@ def fu_jian():
if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \ if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
print(fj_href) print(fj_href)
# 找到附件后 上传至文件服务器 # 找到附件后 上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name) retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
...@@ -1477,21 +1517,11 @@ def fu_jian(): ...@@ -1477,21 +1517,11 @@ def fu_jian():
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
except: except:
pub_source = '' pub_source = ''
pub_time = '' pub_time = ''
...@@ -1714,13 +1744,16 @@ def guang_dong(): ...@@ -1714,13 +1744,16 @@ def guang_dong():
if '.doc' in fj_href or '.docx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \ if '.doc' in fj_href or '.docx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \ or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href: or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name) retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -1835,13 +1868,16 @@ def hai_nan(): ...@@ -1835,13 +1868,16 @@ def hai_nan():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.xlsx' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.xlsx' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# 将文件服务器的链接替换 # 将文件服务器的链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -1879,6 +1915,9 @@ def hai_nan(): ...@@ -1879,6 +1915,9 @@ def hai_nan():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# print(f'----附件:{fu_jian_href}-----filename:{file_name}') # print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
...@@ -1887,7 +1926,7 @@ def hai_nan(): ...@@ -1887,7 +1926,7 @@ def hai_nan():
else: else:
continue continue
# 更新到数据库 # 更新到数据库
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = full_path fu_jian['href'] = full_path
except: except:
...@@ -2103,13 +2142,16 @@ def hai_nan(): ...@@ -2103,13 +2142,16 @@ def hai_nan():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -2216,13 +2258,16 @@ def hai_nan(): ...@@ -2216,13 +2258,16 @@ def hai_nan():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.xlsx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.xlsx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 上传至文件服务器 # 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = full_path fu_jian['href'] = full_path
# print(f'----附件:{fu_jian_href}') # print(f'----附件:{fu_jian_href}')
...@@ -2492,13 +2537,16 @@ def si_chuan(): ...@@ -2492,13 +2537,16 @@ def si_chuan():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 对附件上传至文件服务器 # 对附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
if retData['stste']: if retData['stste']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -2622,6 +2670,9 @@ def guang_xi(): ...@@ -2622,6 +2670,9 @@ def guang_xi():
if '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \ if '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -2629,7 +2680,7 @@ def guang_xi(): ...@@ -2629,7 +2680,7 @@ def guang_xi():
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -2736,6 +2787,9 @@ def gui_zhou(): ...@@ -2736,6 +2787,9 @@ def gui_zhou():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \ if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -2743,7 +2797,7 @@ def gui_zhou(): ...@@ -2743,7 +2797,7 @@ def gui_zhou():
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -2830,7 +2884,7 @@ def yun_nan(): ...@@ -2830,7 +2884,7 @@ def yun_nan():
str(doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content > p')[ str(doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content > p')[
0].text).split('(')[1].split(')')[0].replace('\n', '') 0].text).split('(')[1].split(')')[0].replace('\n', '')
contentwithTag = \ contentwithTag = \
doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content')[0] doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content')[0]
content = contentwithTag.text content = contentwithTag.text
if content == '' or content == None: if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----') log.info(f'-----{href}----{title}----内容为空-----')
...@@ -2846,6 +2900,9 @@ def yun_nan(): ...@@ -2846,6 +2900,9 @@ def yun_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try: try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -2853,7 +2910,7 @@ def yun_nan(): ...@@ -2853,7 +2910,7 @@ def yun_nan():
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num,'')
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -2964,6 +3021,9 @@ def yun_nan(): ...@@ -2964,6 +3021,9 @@ def yun_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# print(fu_jian_href) # print(fu_jian_href)
try: try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -2971,7 +3031,7 @@ def yun_nan(): ...@@ -2971,7 +3031,7 @@ def yun_nan():
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -3108,6 +3168,9 @@ def chong_qing(): ...@@ -3108,6 +3168,9 @@ def chong_qing():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \ or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href: or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try: try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器 # 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name) retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -3115,7 +3178,7 @@ def chong_qing(): ...@@ -3115,7 +3178,7 @@ def chong_qing():
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num,pub_time)
id_list.append(att_id) id_list.append(att_id)
# 将附件链接替换 # 将附件链接替换
fu_jian['href'] = full_path fu_jian['href'] = full_path
...@@ -3234,12 +3297,15 @@ def tian_jin(): ...@@ -3234,12 +3297,15 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -3362,12 +3428,15 @@ def tian_jin(): ...@@ -3362,12 +3428,15 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -3494,12 +3563,15 @@ def tian_jin(): ...@@ -3494,12 +3563,15 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -3600,12 +3672,15 @@ def xin_jiang(): ...@@ -3600,12 +3672,15 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1682',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -3704,12 +3779,15 @@ def xin_jiang(): ...@@ -3704,12 +3779,15 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1682',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -3829,12 +3907,15 @@ def shan_xi(): ...@@ -3829,12 +3907,15 @@ def shan_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1684',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -3946,12 +4027,15 @@ def liao_ning(): ...@@ -3946,12 +4027,15 @@ def liao_ning():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1685',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4056,12 +4140,15 @@ def hei_long_jiang(): ...@@ -4056,12 +4140,15 @@ def hei_long_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1687',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4169,12 +4256,15 @@ def jiang_su(): ...@@ -4169,12 +4256,15 @@ def jiang_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1687',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4277,12 +4367,15 @@ def an_hui(): ...@@ -4277,12 +4367,15 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1688',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4378,12 +4471,15 @@ def an_hui(): ...@@ -4378,12 +4471,15 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1688',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4510,12 +4606,15 @@ def jiang_xi(): ...@@ -4510,12 +4606,15 @@ def jiang_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1689',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num,writtenDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4611,12 +4710,15 @@ def he_nan(): ...@@ -4611,12 +4710,15 @@ def he_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1690',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4725,12 +4827,15 @@ def hu_nan(): ...@@ -4725,12 +4827,15 @@ def hu_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1691',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -4857,23 +4962,26 @@ def gan_su(): ...@@ -4857,23 +4962,26 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name) retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None: if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----') log.info(f'-----{href}----{title}----内容为空-----')
continue continue
t = time.strptime(publishDate, "%Y年%m月%d日") # t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t) # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -5010,13 +5118,16 @@ def gan_su(): ...@@ -5010,13 +5118,16 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
log.info(f'{file_name}---{href}--') log.info(f'{file_name}---{href}--')
retData = baseCore.uptoOBS(file_href, '1696',file_name) retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5028,8 +5139,8 @@ def gan_su(): ...@@ -5028,8 +5139,8 @@ def gan_su():
continue continue
if len(content) < 2: if len(content) < 2:
continue continue
t = time.strptime(publishDate, "%Y年%m月%d日") # t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t) # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -5176,12 +5287,15 @@ def gan_su(): ...@@ -5176,12 +5287,15 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name) retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5286,12 +5400,15 @@ def ning_xia(): ...@@ -5286,12 +5400,15 @@ def ning_xia():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1697',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5393,12 +5510,15 @@ def shanxi(): ...@@ -5393,12 +5510,15 @@ def shanxi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1680',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5496,12 +5616,15 @@ def xi_zang(): ...@@ -5496,12 +5616,15 @@ def xi_zang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1695',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5598,12 +5721,15 @@ def qing_hai(): ...@@ -5598,12 +5721,15 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1681',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5636,8 +5762,8 @@ def qing_hai(): ...@@ -5636,8 +5762,8 @@ def qing_hai():
flag = sendKafka(dic_news) flag = sendKafka(dic_news)
if flag: if flag:
save_data(dic_news) save_data(dic_news)
# print(id) # print(id)
# id_list.append(id) # id_list.append(id)
num += 1 num += 1
count += 1 count += 1
except: except:
...@@ -5722,12 +5848,15 @@ def qing_hai(): ...@@ -5722,12 +5848,15 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1681',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5813,12 +5942,15 @@ def he_bei(): ...@@ -5813,12 +5942,15 @@ def he_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1668',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
...@@ -5935,12 +6067,15 @@ def hu_bei(): ...@@ -5935,12 +6067,15 @@ def hu_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1675',file_name) category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
continue continue
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num,publishDate)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = full_path file['href'] = full_path
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论