提交 89859afd 作者: 薛凌堃

政策法规脚本调整

上级 59beffab
......@@ -38,8 +38,8 @@ taskType = '政策法规'
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
'国务院_国资委_copy1']
driver_path = r'D:\fbs_spider\cmd100\chromedriver.exe'
chromr_bin = r'D:\fbs_spider\Google\Chrome\Application\chrome.exe'
driver_path = r'D:\cmd100\chromedriver.exe'
chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
......@@ -258,7 +258,7 @@ def get_content1():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href,'1766',pathType,file_name)
retData = baseCore.uptoOBS(file_href,'1766',file_name)
if retData['state']:
pass
else:
......@@ -414,7 +414,7 @@ def get_content2():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href,'1699',pathType,file_name)
retData = baseCore.uptoOBS(file_href,'1699',file_name)
if retData['state']:
pass
else:
......@@ -522,7 +522,7 @@ def get_content3():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href,'1642',pathType,file_name)
retData = baseCore.uptoOBS(file_href,'1642',file_name)
if retData['state']:
pass
else:
......@@ -738,7 +738,7 @@ def bei_jing():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1667',file_name)
if retData['state']:
pass
else:
......@@ -3240,7 +3240,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1683',file_name)
if retData['state']:
pass
else:
......@@ -3368,7 +3368,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1683',file_name)
if retData['state']:
pass
else:
......@@ -3500,7 +3500,7 @@ def tian_jin():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1683',file_name)
if retData['state']:
pass
else:
......@@ -3606,7 +3606,7 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1682',file_name)
if retData['state']:
pass
else:
......@@ -3710,7 +3710,7 @@ def xin_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1682',file_name)
if retData['state']:
pass
else:
......@@ -3835,7 +3835,7 @@ def shan_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1684',file_name)
if retData['state']:
pass
else:
......@@ -3952,7 +3952,7 @@ def liao_ning():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1685',file_name)
if retData['state']:
pass
else:
......@@ -4062,7 +4062,7 @@ def hei_long_jiang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1687',file_name)
if retData['state']:
pass
else:
......@@ -4175,7 +4175,7 @@ def jiang_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1687',file_name)
if retData['state']:
pass
else:
......@@ -4283,7 +4283,7 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1688',file_name)
if retData['state']:
pass
else:
......@@ -4384,7 +4384,7 @@ def an_hui():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1688',file_name)
if retData['state']:
pass
else:
......@@ -4516,7 +4516,7 @@ def jiang_xi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1689',file_name)
if retData['state']:
pass
else:
......@@ -4617,7 +4617,7 @@ def he_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1690',file_name)
if retData['state']:
pass
else:
......@@ -4731,7 +4731,7 @@ def hu_nan():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1691',file_name)
if retData['state']:
pass
else:
......@@ -4857,7 +4857,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
pass
else:
......@@ -4940,9 +4940,9 @@ def gan_su():
href = dd['href']
publishDate = dd['publishDate']
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
continue
# if is_href:
# num+=1
# continue
bro.get(href)
try:
alls = bro.find_element(By.CLASS_NAME, 'alls').text
......@@ -4997,15 +4997,16 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = full_path
log.info(f'{file_name}---{href}--')
# retData = baseCore.uptoOBS(file_href, '1696',file_name)
# if retData['state']:
# pass
# else:
# continue
# att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
# id_list.append(att_id)
# # todo:将返回的地址更新到soup
# file['href'] = full_path
contentWithTag = str(soup.prettify())
content = soup.text
......@@ -5018,30 +5019,30 @@ def gan_su():
# publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# dic_news = {
# 'attachmentIds': id_list,
# 'author': '',
# 'content': str(content),
# 'contentWithTag': str(contentWithTag),
# 'createDate': time_now,
# 'deleteFlag': 0,
# 'id': '',
# 'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
# 'origin': origin,
# 'organ': organ,
# 'topicClassification': topicClassification,
# 'issuedNumber': pub_hao,
# 'publishDate': publishDate,
# 'writtenDate': writtenDate,
# 'sid': '1697458829758697473',
# 'sourceAddress': href,
# 'summary': '',
# 'title': title
# }
# # print(dic_news)
# flag = sendKafka(dic_news)
# if flag:
# save_data(dic_news)
num += 1
count += 1
except Exception as e:
......@@ -5160,7 +5161,7 @@ def gan_su():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
pass
else:
......@@ -5215,9 +5216,9 @@ def gan_su():
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
gan_su1()
# gan_su1()
gan_su2()
gan_su3()
# gan_su3()
# 宁夏
def ning_xia():
......@@ -5268,7 +5269,7 @@ def ning_xia():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1697',file_name)
if retData['state']:
pass
else:
......@@ -5375,7 +5376,7 @@ def shanxi():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1680',file_name)
if retData['state']:
pass
else:
......@@ -5478,7 +5479,7 @@ def xi_zang():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1695',file_name)
if retData['state']:
pass
else:
......@@ -5580,7 +5581,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1681',file_name)
if retData['state']:
pass
else:
......@@ -5704,7 +5705,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1681',file_name)
if retData['state']:
pass
else:
......@@ -5795,7 +5796,7 @@ def he_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1668',file_name)
if retData['state']:
pass
else:
......@@ -5917,7 +5918,7 @@ def hu_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
retData = baseCore.uptoOBS(file_href, '1675',file_name)
if retData['state']:
pass
else:
......@@ -5970,41 +5971,41 @@ def hu_bei():
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__':
get_content1()
get_content2()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
si_chuan()
guang_xi()
gui_zhou()
yun_nan()
chong_qing()
tian_jin()
xin_jiang()
shan_xi()
liao_ning()
hei_long_jiang()
jiang_su()
an_hui()
jiang_xi()
he_nan()
hu_nan()
# get_content1()
# get_content2()
# get_content3()
# bei_jing()
# nei_meng_gu()
# ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
# shan_dong()
# guang_dong()
# hai_nan()
# si_chuan()
# guang_xi()
# gui_zhou()
# yun_nan()
# chong_qing()
# tian_jin()
# xin_jiang()
# shan_xi()
# liao_ning()
# hei_long_jiang()
# jiang_su()
# an_hui()
# jiang_xi()
# he_nan()
# hu_nan()
gan_su()
ning_xia()
xi_zang()
shanxi()
qing_hai()
he_bei()
qing_hai()
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
# ning_xia()
# xi_zang()
# shanxi()
# qing_hai()
# he_bei()
# qing_hai()
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论