提交 6a073342 作者: LiuLiYuan

政策法规 10/17

上级 bcb04605
......@@ -20,9 +20,6 @@ from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\comData\\policylaw\\client.conf')
client = Fdfs_client(tracker_conf)
from obs import ObsClient
import fitz
......@@ -444,72 +441,8 @@ class BaseCore:
# def doc_page(self,file_path):
# doc = Document(file_path)
# return len(doc.sections)
def pdf_content(self,resp_content):
# 解析pdf文件内容
content = ''
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
for page in doc.pages():
content += page.get_text()
break
except:
time.sleep(3)
continue
return content
# 替换为绝对路径之后,解析出来a.href
def uploadToserver(self,file_href,item_id):
category = os.path.splitext(file_href)[1]
# 上传至文件服务器
headers = {}
retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': 'group1', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = self.getRandomUserAgent()
resp_content = ''
for i in range(0, 3):
try:
resp_content = requests.get(file_href, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
if resp_content:
pass
else:
return retData
# page_size = 0
# if category == '.doc' or category == '.docx':
# # page_size = self.doc_page(file_href)
# return retData
# if category == '.pdf' or category == '.PDF':
# page_size = self.pdf_page(resp_content)
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content,file_ext_name=category.replace('.',''))
self.getLogger().info('-------文件上传成功------')
break
except:
time.sleep(3)
continue
# if page_size>0:
# pass
# else:
# self.getLogger().info(f'======解析失败=====')
# return retData
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
# retData['page_size'] = page_size
return retData
def secrchATT(self,item_id,file_name,type_id,order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
......@@ -518,7 +451,7 @@ class BaseCore:
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,file_name,num):
def tableUpdate(self,retData,com_name,file_name,num,pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
......@@ -533,12 +466,12 @@ class BaseCore:
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time,path,'zzsn')
create_time,path,'zzsn',pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
......
......@@ -3,6 +3,7 @@
"""数据全量跑一遍,不做判重逻辑"""
import datetime
import json
import os
import re
import time
......@@ -131,6 +132,7 @@ def redefid(idList):
def remove_dup():
pass
# 国务院文件
def get_content1():
pathType = 'policy/gwywj/'
......@@ -252,12 +254,15 @@ def get_content1():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1766',file_name)
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
#todo:将返回的地址更新到soup
......@@ -408,12 +413,15 @@ def get_content2():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1699',file_name)
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
#todo:将返回的地址更新到soup
......@@ -516,12 +524,15 @@ def get_content3():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1642',file_name)
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院国资委',file_name,num)
att_id,full_path = baseCore.tableUpdate(retData,'国务院国资委',file_name,num,pub_time)
id_list.append(att_id)
#todo:将返回的地址更新到soup
......@@ -624,7 +635,7 @@ def get_content3():
# 北京
def bei_jing():
num = 0
start_time = time.time()
pathType = 'policy/beijing/'
# 有反爬需要使用selenium
......@@ -637,12 +648,12 @@ def bei_jing():
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
chrome_options.binary_location = r'D:/fbs_spider/Google/Chrome/Application/chrome.exe'
chromedriver = r'D:/fbs_spider/cmd100/chromedriver.exe'
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe'
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
with open('../../base/stealth.min.js') as f:
js = f.read()
......@@ -732,12 +743,15 @@ def bei_jing():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1667',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '北京市国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '北京市国资委', file_name, num,pub_time)
id_list.append(att_id)
# todo:将返回的地址更新到soup
......@@ -855,14 +869,17 @@ def nei_meng_gu():
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re
category = os.path.splitext(fu_jian_href)[1]
if category not in title:
file_name = title + category
# print(fu_jian_href)
# todo:附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,title)
retData = baseCore.uptoOBS(fu_jian_href, '1669',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', file_name, num,pub_time)
id_list.append(att_id)
log.info(title)
......@@ -1002,13 +1019,16 @@ def ji_lin():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = fu_jian_href.text.strip()
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num,pub_time)
id_list.append(att_id)
#
# # todo:将返回的地址更新到soup
......@@ -1044,12 +1064,15 @@ def ji_lin():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# print(fj_href)
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num,pub_time)
id_list.append(att_id)
#
# # todo:将返回的地址更新到soup
......@@ -1106,7 +1129,6 @@ def ji_lin():
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 上海
def shang_hai():
start = time.time()
pathType = 'policy/shanghai/'
......@@ -1196,12 +1218,15 @@ def shang_hai():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '上海市国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '上海市国资委', file_name, num,pub_time)
id_list.append(att_id)
# todo:将返回的地址更新到soup
......@@ -1434,13 +1459,16 @@ def fu_jian():
# 解析出pdf内容
content = baseCore.pdf_content(resp_content)
contentwithtag = ''
category = os.path.splitext(real_href)[1]
if category not in title:
file_name = title + category
# 文件上传至服务器
retData = baseCore.uptoOBS(real_href, '1673',pathType,title)
retData = baseCore.uptoOBS(real_href, '1673',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', title, num)
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,'')
id_list.append(att_id)
pub_hao = ''
pub_time = ''
......@@ -1454,6 +1482,15 @@ def fu_jian():
i_soup = BeautifulSoup(i_html, 'html.parser')
# 相对路径转化为绝对路径
i_soup = paserUrl(i_soup, real_href)
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
# print(real_href)
# todo:获取附件地址
try:
......@@ -1470,6 +1507,9 @@ def fu_jian():
if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
print(fj_href)
# 找到附件后 上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
......@@ -1477,21 +1517,11 @@ def fu_jian():
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,pub_time)
id_list.append(att_id)
# 将文件服务器的链接替换
fu_jian['href'] = full_path
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
except:
pub_source = ''
pub_time = ''
......@@ -1714,13 +1744,16 @@ def guang_dong():
if '.doc' in fj_href or '.docx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num,pub_time)
id_list.append(att_id)
# 将文件服务器的链接替换
fu_jian['href'] = full_path
......@@ -1835,13 +1868,16 @@ def hai_nan():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.xlsx' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id)
# 将文件服务器的链接替换
fu_jian['href'] = full_path
......@@ -1879,6 +1915,9 @@ def hai_nan():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
......@@ -1887,7 +1926,7 @@ def hai_nan():
else:
continue
# 更新到数据库
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id)
fu_jian['href'] = full_path
except:
......@@ -2103,13 +2142,16 @@ def hai_nan():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id)
# todo:将返回的地址更新到soup
fu_jian['href'] = full_path
......@@ -2216,13 +2258,16 @@ def hai_nan():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.xlsx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num,pub_time)
id_list.append(att_id)
fu_jian['href'] = full_path
# print(f'----附件:{fu_jian_href}')
......@@ -2492,13 +2537,16 @@ def si_chuan():
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 对附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
if retData['stste']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num,pub_time)
id_list.append(att_id)
fu_jian['href'] = full_path
......@@ -2622,6 +2670,9 @@ def guang_xi():
if '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
if retData['state']:
......@@ -2629,7 +2680,7 @@ def guang_xi():
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num,pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = full_path
......@@ -2736,6 +2787,9 @@ def gui_zhou():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
if retData['state']:
......@@ -2743,7 +2797,7 @@ def gui_zhou():
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num,pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = full_path
......@@ -2846,6 +2900,9 @@ def yun_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']:
......@@ -2853,7 +2910,7 @@ def yun_nan():
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num,'')
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = full_path
......@@ -2964,6 +3021,9 @@ def yun_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
# print(fu_jian_href)
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
if retData['state']:
......@@ -2971,7 +3031,7 @@ def yun_nan():
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '云南省国资委', file_name, num,pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = full_path
......@@ -3108,6 +3168,9 @@ def chong_qing():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
if retData['state']:
......@@ -3115,7 +3178,7 @@ def chong_qing():
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num,pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = full_path
......@@ -3234,12 +3297,15 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -3362,12 +3428,15 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -3494,12 +3563,15 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -3600,12 +3672,15 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1682',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -3704,12 +3779,15 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1682',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '新疆维吾尔自治区国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -3829,12 +3907,15 @@ def shan_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1684',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -3946,12 +4027,15 @@ def liao_ning():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1685',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4056,12 +4140,15 @@ def hei_long_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1687',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4169,12 +4256,15 @@ def jiang_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1687',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4277,12 +4367,15 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1688',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4378,12 +4471,15 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1688',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '安徽省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4510,12 +4606,15 @@ def jiang_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1689',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num,writtenDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4611,12 +4710,15 @@ def he_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1690',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4725,12 +4827,15 @@ def hu_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1691',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -4857,23 +4962,26 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
id_ = redefid(id_list)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
# t = time.strptime(publishDate, "%Y年%m月%d日")
# publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -5010,13 +5118,16 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
log.info(f'{file_name}---{href}--')
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5028,8 +5139,8 @@ def gan_su():
continue
if len(content) < 2:
continue
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
# t = time.strptime(publishDate, "%Y年%m月%d日")
# publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -5176,12 +5287,15 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5286,12 +5400,15 @@ def ning_xia():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1697',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5393,12 +5510,15 @@ def shanxi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1680',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5496,12 +5616,15 @@ def xi_zang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1695',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5598,12 +5721,15 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1681',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5722,12 +5848,15 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1681',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5813,12 +5942,15 @@ def he_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1668',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......@@ -5935,12 +6067,15 @@ def hu_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1675',file_name)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num)
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num,publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论