Commit b376f641    Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

@@ -650,7 +650,7 @@ class BaseCore:
return selects
#插入到att表 返回附件id
-def tableUpdate(self,retData,com_name,year,pdf_name,num):
+def tableUpdate(self,retData,com_name,year,pdf_name,num,pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
@@ -670,12 +670,12 @@ class BaseCore:
id = ''
return id
else:
-Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
+Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s.%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
-create_time, page_size)
+create_time, page_size,pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
@@ -759,7 +759,7 @@ class BaseCore:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
-retData['path'] = result['body']['objectUrl'].split('.com')[1]
+retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
......
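Note on the hunk above: the new pub_time argument is threaded through tableUpdate into a publish_time column, but the rewritten VALUES clause joins its last two placeholders with a period (%s.%s) instead of a comma, which would make the INSERT fail once it executes. A minimal sketch of what the statement presumably intends, reusing the method's own cursor and variables (only the final separator differs from the diff):

    # Sketch: 15 columns, 15 comma-separated placeholders, 15 values, pub_time last.
    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,
               category,file_size,order_by,status,create_by,create_time,page_size,publish_time)
               values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    values = (year, pdf_name, type_id, item_id, group_name, path, full_path, category,
              file_size, order_by, status, create_by, create_time, page_size, pub_time)
    self.cursor_.execute(Upsql, values)  # 插入
    self.cnx_.commit()  # 提交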
import json
@@ -133,7 +133,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
return False
#插入数据库获取att_id
num = num + 1
-att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
+att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num,pub_time)
if att_id:
pass
else:
......
# -*- coding: utf-8 -*-
@@ -164,7 +164,7 @@ def spider_annual_report(dict_info,num):
return False
num = num + 1
try:
-att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
+att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time)
content = retData['content']
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
......
@@ -91,7 +91,8 @@ def save_data(dic_news):
'网址': dic_news['sourceAddress'],
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
-'创建时间': dic_news['createDate']
+'创建时间': dic_news['createDate'],
+'带标签内容':dic_news['contentWithTag'][:100]
}
db_storage.insert_one(aaa_dic)
@@ -138,6 +139,7 @@ def remove_dup():
# 国务院文件
def get_content1():
+pathType = 'policy/gwywj/'
def getPageConunt(a_list, url, headers, s):
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
@@ -256,7 +258,7 @@ def get_content1():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
-retData = baseCore.uploadToserver(file_href,'1766')
+retData = baseCore.uptoOBS(file_href,'1766',pathType,file_name)
if retData['state']:
pass
else:
@@ -265,7 +267,7 @@ def get_content1():
id_list.append(att_id)
#todo:将返回的地址更新到soup
-file['href'] = 'http://114.115.215.96/' + full_path
+file['href'] = full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
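Across get_content1/2/3 the attachment handling switches from uploadToserver to baseCore.uptoOBS (upload into an object-storage path built from the new per-function pathType), and the link in the page soup now points at the returned full_path directly instead of being prefixed with the fixed file-server host. A condensed sketch of that per-attachment flow with the error branches collapsed; the att_id, full_path return shape of tableUpdate and the source_name argument are assumptions based on how the call appears later in this diff:

    file_name = file.text.strip()
    retData = baseCore.uptoOBS(file_href, '1766', pathType, file_name)  # upload the file to OBS under pathType
    if not retData['state']:
        continue  # upload failed: skip this attachment
    att_id, full_path = baseCore.tableUpdate(retData, source_name, title, num)  # source_name: stand-in for the site name, e.g. '内蒙古自治区国资委'
    id_list.append(att_id)
    file['href'] = full_path  # rewrite the <a> tag to the OBS object itself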
@@ -308,6 +310,7 @@ def get_content1():
# 国务院部门文件
def get_content2():
+pathType = 'policy/gwybmwj/'
def getTotalpage(bmfl,headers,session):
ip = baseCore.get_proxy()
pageNo = 1
@@ -336,6 +339,7 @@ def get_content2():
session.keep_alive = False
start_time = time.time()
num = 0
+count = 0
result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
'人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
'国家卫生健康委员会',
@@ -396,6 +400,9 @@ def get_content2():
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
+if content == '' or content == 'None':
+log.info(f'----{href}---{title}---内容为空---')
+continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
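The empty-content guard added above is repeated, with minor formatting differences, in nearly every scraper below. A reusable helper expressing the same check (hypothetical; the committed code keeps the three lines inline in each function):

    def is_empty_content(content, url, title):
        # Return True (and log the skipped page) when the parsed body came back empty.
        if content is None or content == '' or content == 'None':
            log.info(f'----{url}----{title}----内容为空----')
            return True
        return False

    # Usage inside a scraper loop would then be:
    #     if is_empty_content(content, href, title):
    #         continue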
@@ -407,7 +414,7 @@ def get_content2():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
-retData = baseCore.uploadToserver(file_href,'1699')
+retData = baseCore.uptoOBS(file_href,'1699',pathType,file_name)
if retData['state']:
pass
else:
@@ -416,7 +423,7 @@ def get_content2():
id_list.append(att_id)
#todo:将返回的地址更新到soup
-file['href'] = 'http://114.115.215.96/' + full_path
+file['href'] = full_path
except:
log.error(f'{title}...{href}获取内容失败')
continue
@@ -446,6 +453,7 @@ def get_content2():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
+count += 1
num += 1
except:
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
@@ -454,10 +462,11 @@ def get_content2():
log.error(f'{bmfl}...获取页数失败')
continue
end_time = time.time()
-log.info(f'共抓取国务院部门文件{num}条数据,耗时{end_time - start_time}')
+log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
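From this point on each scraper keeps two counters: num still ticks for every listing row it inspects (including pages skipped as already crawled), while the new count only increments once a record has actually been sent to Kafka and saved, and it is count that the closing summary log now reports. A minimal sketch of the pattern, assuming the sendKafka/save_data/db_storage helpers defined earlier in this module (rows and build_dic are stand-ins for the per-page listing and payload construction):

    start_time = time.time()
    num = 0    # rows walked, including duplicates that get skipped
    count = 0  # records actually pushed to Kafka and saved to Mongo
    for row in rows:
        num += 1
        if db_storage.find_one({'网址': row['href']}):
            continue  # already crawled: counted by num only
        dic_news = build_dic(row)  # hypothetical payload builder
        if sendKafka(dic_news):
            save_data(dic_news)
            count += 1
    log.info(f'共抓取{count}条数据,耗时{time.time() - start_time}')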
# 国务院国有资产监督管理委员会-政策发布
def get_content3():
+pathType = 'policy/gyzc/'
def getPage():
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
req = requests.get(url, headers=headers, verify=False)
@@ -499,6 +508,9 @@ def get_content3():
if len(pub_hao) > 15:
pub_hao = ''
content = contentWithTag.text
+if content == '' or content == 'None':
+log.info(f'----{href}----{title}----内容为空----')
+return
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
@@ -510,7 +522,7 @@ def get_content3():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
-retData = baseCore.uploadToserver(file_href,'1642')
+retData = baseCore.uptoOBS(file_href,'1642',pathType,file_name)
if retData['state']:
pass
else:
@@ -519,7 +531,7 @@ def get_content3():
id_list.append(att_id)
#todo:将返回的地址更新到soup
-file['href'] = 'http://114.115.215.96/' + full_path
+file['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
@@ -542,7 +554,7 @@ def get_content3():
'summary': '', #摘要
'title': title #标题
}
-# print(title)
+# log.info(title)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
@@ -550,6 +562,7 @@ def get_content3():
def partTwo():
start_time = time.time()
num = 0
+count = 0
totalpage = getPage()
for page in range(1, totalpage):
url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
@@ -570,12 +583,14 @@ def get_content3():
continue
sendContent(href, headers,title,pub_time,num)
num += 1
+count += 1
end_time = time.time()
-log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
+log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
def partOne():
start_time = time.time()
num = 0
+count = 0
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
try:
# get请求,需要取消ssl验证
@@ -603,10 +618,11 @@ def get_content3():
continue
sendContent(href, headers,title,pub_time,num)
num += 1
+count += 1
except:
pass
end_time = time.time()
-log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
+log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
partOne()
# 增量执行需要注释掉partTwo()
@@ -614,7 +630,7 @@ def get_content3():
# 北京
def bei_jing():
-num = 0
start_time = time.time()
pathType = 'policy/beijing/'
# 有反爬需要使用selenium
@@ -662,6 +678,7 @@ def bei_jing():
time.sleep(2)
log.info(f'------{len(hrefs)}条数据-------------')
num = 0
+count = 0
for href in hrefs:
id_list = []
title = href[1]
@@ -700,12 +717,15 @@ def bei_jing():
soup = paserUrl(soup_cont, href[0])
soup.prettify()
+if soup.text == '' or soup.text == 'None':
+log.info(f'----{href[0]}----{title}----内容为空----')
+continue
# todo:去掉扫一扫
try:
soup.find('div', id='div_div').decompose()
except:
continue
-# print(title)
+# log.info(title)
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
@@ -756,11 +776,10 @@ def bei_jing():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-# print(id)
-# id_list.append(id)
num += 1
+count += 1
end_time = time.time()
-log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
bro.quit()
except Exception as e:
log.info(e)
@@ -827,6 +846,9 @@ def nei_meng_gu():
else:
i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
content = str(i_content)
+if i_content.text == '' or i_content.text == 'None':
+log.info(f'{real_href}------{title}----内容为空-----')
+continue
# todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian))
@@ -849,7 +871,7 @@ def nei_meng_gu():
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
id_list.append(att_id)
-print(title)
+log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
@@ -892,6 +914,7 @@ def ji_lin():
pathType = 'policy/jilin/'
start = time.time()
num = 0
+count = 0
url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
try:
resp_text = requests.get(url=url, headers=headers, verify=False)
@@ -964,6 +987,9 @@ def ji_lin():
i_content = soup
contentWithTag = soup.find(class_='zsy_comain')
content = contentWithTag.text.strip()
+if content == '' or content == 'None':
+log.info(f'{real_href}-----{title}----内容为空')
+continue
# 发文字号
find_hao = i_content.find_all('p')[:3]
pub_hao = ''
@@ -1010,6 +1036,9 @@ def ji_lin():
p.extract()
contentWithTag = i_content
content = contentWithTag.text.strip()
+if content == '' or content == 'None':
+log.info(f'{real_href}-----{title}----内容为空')
+continue
# 找到附件上传至文件服务器
fj_soup = i_soup.find('div', class_='wenjianfujian')
fj_list = fj_soup.find_all('a')
@@ -1040,7 +1069,7 @@ def ji_lin():
soup.find('div', id='qr_container').decompose()
else:
pass
-print(title)
+log.info(title)
# print('............................................................')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
@@ -1073,13 +1102,14 @@ def ji_lin():
if flag:
save_data(dic_news)
num = num + 1
+count += 1
except Exception as e:
log.info(e)
pass
except:
pass
end = time.time()
-print('共', num, '条', '...........', '共耗时', end - start, '秒')
+print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 上海
@@ -1087,6 +1117,7 @@ def shang_hai():
start = time.time()
pathType = 'policy/shanghai/'
num = 0
+count =0
for page in range(1, 7):
if page == 1:
@@ -1111,7 +1142,7 @@ def shang_hai():
num+=1
continue
try:
-href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
+# href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
href_text = requests.get(url=href, headers=headers, verify=False).text
doc_href = pq(href_text)
doc_href_ = BeautifulSoup(href_text, 'html.parser')
@@ -1120,6 +1151,9 @@ def shang_hai():
info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
pub_source = info_list[1].find('b').text.split('信息来源:')[1]
content = doc_href_.find('div', attrs={'class': 'detail_03'})
+if content == '' or content == 'None':
+log.info(f'{href}-----{title}----内容为空')
+continue
# 将文章中的附件字段删去
pattern = r'\d+\.'
@@ -1181,7 +1215,7 @@ def shang_hai():
else:
continue
-print(title)
+log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
@@ -1209,18 +1243,19 @@ def shang_hai():
if flag:
save_data(dic_news)
num = num + 1
+count += 1
except:
pass
except:
pass
end = time.time()
-print('共', num, '条', '...........', '共耗时', end - start, '秒')
+print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 浙江
def zhe_jiang():
start = time.time()
+pathType = 'policy/zhejiang/'
num = 0
+count = 0
url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
try:
res = requests.get(url, headers).content
@@ -1235,7 +1270,7 @@ def zhe_jiang():
href = li.find('a')['href']
pub_time = li.find('a').find('span').text
title = li.find('a').text.replace(pub_time, '').strip()
-# print(title)
+# log.info(title)
if 'http' in href:
href = href
else:
@@ -1302,9 +1337,12 @@ def zhe_jiang():
# fj_href_list.append(fujian_href)
# print(fj_href_list)
-print(title)
+log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
+if content == '' or content == 'None':
+log.info(f'{href}-----{title}----内容为空')
+continue
dic_news = {
'attachmentIds': [],
'author': '',
@@ -1329,20 +1367,21 @@ def zhe_jiang():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
+count += 1
except:
pass
except:
pass
end = time.time()
-print('共', num, '条', '...........', '共耗时', end - start, '秒')
+print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 福建
def fu_jian():
error_tag = str(404)
pathType = 'policy/fujian/'
num = 0
+count = 0
start_time = time.time()
url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
try:
@@ -1386,8 +1425,8 @@ def fu_jian():
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
-# real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
-# print(real_href)
+# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
+print(real_href)
is_href = db_storage.find_one({'网址': real_href})
if is_href:
num+=1
@@ -1437,6 +1476,7 @@ def fu_jian():
if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
+print(fj_href)
# 找到附件后 上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
if retData['state']:
@@ -1453,6 +1493,9 @@ def fu_jian():
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
pub_hao = ''
except:
@@ -1460,6 +1503,9 @@ def fu_jian():
pub_time = ''
contentwithtag = i_soup.find('tabs tab_base_01 rules_con1')
content = contentwithtag.text.strip()
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
pub_hao = contentwithtag.find_all('div', class_='rules_tit1 b-free-read-leaf').text.dtrip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -1484,18 +1530,19 @@ def fu_jian():
'summary': '',
'title': title
}
-# print(dic_news)
+# log.info(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
+log.info(title)
num += 1
+count += 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 山东
def shan_dong():
@@ -1505,6 +1552,7 @@ def shan_dong():
}
start = time.time()
num = 0
+count = 0
url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
for url in url_list:
try:
@@ -1539,6 +1587,9 @@ def shan_dong():
# print(pub_time,pub_source,pub_hao)
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
if pub_hao == '无':
p_list = content.find_all('p')
for p in p_list:
@@ -1571,6 +1622,9 @@ def shan_dong():
i = i + 1
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
@@ -1597,23 +1651,22 @@ def shan_dong():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-if content == '' or content == 'None':
-continue
-else:
-print(title)
+log.info(title)
num = num + 1
+count += 1
except:
pass
except:
pass
end = time.time()
-print('共', num, '条', '...........', '共耗时', end - start, '秒')
+print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 广东
def guang_dong():
start = time.time()
pathType = 'policy/guangdong/'
num = 0
+count = 0
url = 'http://gzw.gd.gov.cn/zcfg/index.html'
try:
resp_href = requests.get(url=url, headers=headers, verify=False)
@@ -1653,6 +1706,9 @@ def guang_dong():
i_soup = paserUrl(i_soup, href)
content = i_soup.find('div', attrs={'class', 'box_info'})
contentwithTag = str(content)
+if content == '' or content == None:
+log.info(f'{href}-----{title}----内容为空----')
+continue
fu_jian_list = content.find_all('a')
for fu_jian in fu_jian_list:
try:
@@ -1701,15 +1757,15 @@ def guang_dong():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
-# save_data(result_dict)
+log.info(title)
num = num + 1
+count += 1
except:
pass
except:
pass
end = time.time()
-print('共', num, '条', '...........', '共耗时', end - start, '秒')
+print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 海南
def hai_nan():
@@ -1717,6 +1773,7 @@ def hai_nan():
def hai_nan1():
# 部门文件
num = 0
+count = 0
start_time = time.time()
for page in range(13):
if page == 0:
@@ -1770,6 +1827,9 @@ def hai_nan():
except:
pass
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
@@ -1811,6 +1871,9 @@ def hai_nan():
topicClassification = tbody_text.split('分  类:')[1].split('发文机关:')[0].strip().lstrip()
contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = source.find_all('a')
try:
for fu_jian in fu_jian_list:
@@ -1862,6 +1925,9 @@ def hai_nan():
topicClassification = ''
contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
@@ -1888,19 +1954,20 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
+log.info(title)
+count += 1
num = num + 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def hai_nan2():
def hai_nan_sw(page_href):
num = 0
+count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
@@ -1936,6 +2003,9 @@ def hai_nan():
pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
@@ -1961,10 +2031,11 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-href_text.close()
-# save_data(result_dict)
-print(title)
+log.info(title)
num += 1
+count += 1
+href_text.close()
except:
pass
req.close()
@@ -1972,6 +2043,7 @@ def hai_nan():
def hai_nan_szf(page_href):
num = 0
+count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
@@ -2010,6 +2082,9 @@ def hai_nan():
pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
except:
# print(href)
pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
@@ -2021,6 +2096,9 @@ def hai_nan():
writtenDate = ''
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
@@ -2068,10 +2146,12 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
+log.info(title)
+num += 1
+count += 1
href_text.close()
# save_data(result_dict)
-print(title)
-num += 1
except:
pass
req.close()
@@ -2079,6 +2159,7 @@ def hai_nan():
def hai_nan_szfbgt(page_href):
num = 0
+count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
@@ -2127,6 +2208,9 @@ def hai_nan():
writtenDate = ''
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentWithTag.find_all('a')
if fu_jian_list:
for fu_jian in fu_jian_list:
@@ -2147,7 +2231,7 @@ def hai_nan():
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id)
fu_jian['href'] = full_path
-print(f'----附件:{fu_jian_href}')
+# print(f'----附件:{fu_jian_href}')
else:
pass
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -2176,10 +2260,10 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-href_text.close()
-# save_data(result_dict)
-print(title)
+log.info(title)
num += 1
+count += 1
+href_text.close()
except:
pass
req.close()
@@ -2187,6 +2271,7 @@ def hai_nan():
def hai_nan_zy(page_href):
num = 0
+count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.content, 'html.parser')
@@ -2240,6 +2325,9 @@ def hai_nan():
pub_hao = ''
contentWithTag = doc_href.find(class_='pages_content')
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{i_href}----{title}----内容为空-----')
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
@@ -2266,10 +2354,12 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
+log.info(title)
+num += 1
+count += 1
href_text.close()
# save_data(result_dict)
-print(title)
-num += 1
except:
pass
req.close()
@@ -2277,6 +2367,7 @@ def hai_nan():
def start():
num = 0
+count = 0
start_time = time.time()
url = "https://www.hainan.gov.cn/hainan/qzcwj/zywj.shtml"
try:
@@ -2306,7 +2397,7 @@ def hai_nan():
else:
page_href = str(url) + f'home_{page}.htm'
try:
-num += hai_nan_zy(page_href)
+count += hai_nan_zy(page_href)
except:
pass
time.sleep(1)
@@ -2320,7 +2411,7 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
-num += hai_nan_sw(page_href)
+count += hai_nan_sw(page_href)
except:
pass
elif url == leibie_href_list[2]:
@@ -2332,7 +2423,7 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
-num += hai_nan_szf(page_href)
+count += hai_nan_szf(page_href)
except:
pass
else:
@@ -2343,22 +2434,22 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
-num += hai_nan_szfbgt(page_href)
+count += hai_nan_szfbgt(page_href)
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
start()
hai_nan1()
hai_nan2()
# 四川
def si_chuan():
num = 0
+count = 0
pathType = 'policy/sichuan/'
start_time = time.time()
for page in range(1, 3):
@@ -2393,6 +2484,9 @@ def si_chuan():
doc_href = paserUrl(doc_href, href)
contentWithTag = doc_href.find('div', id='scrollBox')
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = doc_href.find_all('a')
for fu_jian in fu_jian_list:
@@ -2441,19 +2535,20 @@ def si_chuan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
+log.info(title)
+count += 1
num = num + 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 广西
def guang_xi():
num = 0
+count = 0
pathType = 'policy/guangxi/'
start_time = time.time()
url_all = """
@@ -2519,6 +2614,9 @@ def guang_xi():
contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
contentWithTag = paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
@@ -2568,14 +2666,14 @@ def guang_xi():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
+log.info(title)
num = num + 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 贵州
def gui_zhou():
@@ -2585,6 +2683,7 @@ def gui_zhou():
"""
pathType = 'policy/guizhou/'
num = 0
+count = 0
start_time = time.time()
for page in range(0, 11):
if page == 0:
@@ -2630,6 +2729,9 @@ def gui_zhou():
contentWithTag = paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
@@ -2678,8 +2780,8 @@ def gui_zhou():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
-# save_data(result_dict)
+log.info(title)
+count += 1
num = num + 1
except:
pass
@@ -2697,6 +2799,7 @@ def yun_nan():
http://gzw.yn.gov.cn/yngzw/c100040/zfxxgk_list.shtml 1
"""
num = 0
+count = 0
start_time = time.time()
for page in range(1, 6):
if page == 1:
@@ -2735,6 +2838,9 @@ def yun_nan():
contentwithTag = \
doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content')[0]
content = contentwithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentwithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
@@ -2793,18 +2899,20 @@ def yun_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
+log.info(title)
num = num + 1
+count += 1
except:
pass
resp.close()
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def yun_nan2():
num = 0
+count = 0
start_time = time.time()
for page in range(1, 4):
if page == 1:
@@ -2828,7 +2936,7 @@ def yun_nan():
num+=1
continue
try:
-print(href)
+# print(href)
if '.shtml' in href:
res_ = requests.get(href, headers)
page_text_ = res_.text.encode("ISO-8859-1")
@@ -2847,6 +2955,9 @@ def yun_nan():
pub_hao = ''
contentwithTag = page.find('div', attrs={'class': 'zfxxgk-right'})
content = contentwithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentwithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
@@ -2857,7 +2968,7 @@ def yun_nan():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
-print(fu_jian_href)
+# print(fu_jian_href)
try:
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
@@ -2876,9 +2987,7 @@ def yun_nan():
elif 'display' in href:
continue
else:
-content = ''
-contentwithTag = ''
-pub_hao = ''
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
@@ -2907,8 +3016,8 @@ def yun_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
+log.info(title)
+count += 1
num = num + 1
except:
pass
@@ -2916,7 +3025,7 @@ def yun_nan():
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
yun_nan1()
yun_nan2()
@@ -2928,6 +3037,7 @@ def chong_qing():
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
"""
num = 0
+count = 0
pathType = 'policy/chongqing/'
start_time = time.time()
for page in range(0, 4):
@@ -2955,7 +3065,7 @@ def chong_qing():
num+=1
continue
try:
-print(href)
+# print(href)
# href = 'https://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/202007/t20200728_7729850.html'
href_text = requests.get(url=href, headers=headers, verify=False).content
doc_href = pq(href_text)
@@ -2978,6 +3088,9 @@ def chong_qing():
pass
contentWithTag = doc_href.find('div', class_='zwxl-article')
content = contentWithTag.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
except:
origin = ''
topicClassification = ''
@@ -2986,7 +3099,9 @@ def chong_qing():
pub_hao = ''
contentWithTag = doc_href.find('div', class_='zwxl-content')
content = contentWithTag.text
if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
fu_jian_list = contentWithTag.find_all('a')
# print(fu_jian_list)
for fu_jian in fu_jian_list:
@@ -3039,21 +3154,22 @@ def chong_qing():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
-print(title)
-# save_data(result_dict)
+log.info(title)
+count += 1
num += 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 天津
def tian_jin():
pathType = 'policy/tianjin/'
def tian_jin1():
num = 0
+count = 0
start_time = time.time()
for page in range(0, 3):
if page == 0:
@@ -3139,7 +3255,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
@@ -3167,18 +3285,20 @@ def tian_jin():
if flag:
save_data(dic_news)
num += 1
+count += 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin2():
"""
http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
"""
num = 0
+count =0
start_time = time.time()
for page in range(0, 5):
if page == 0:
@@ -3263,7 +3383,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
@@ -3291,15 +3413,17 @@ def tian_jin():
if flag:
save_data(dic_news)
num += 1
+count += 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin3():
num = 0
+count = 0
start_time = time.time()
for page in range(1, 3):
if page == 1:
@@ -3391,7 +3515,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
@@ -3419,12 +3545,13 @@ def tian_jin():
if flag:
save_data(dic_news)
num += 1
+count += 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
tian_jin1()
tian_jin2()
@@ -3435,6 +3562,7 @@ def xin_jiang():
pathType = 'policy/xinjiang/'
def xin_jiang1():
num = 0
+count = 0
start_time = time.time()
for page in range(1, 10):
if page == 1:
@@ -3493,6 +3621,9 @@ def xin_jiang():
if len(fu_jian_soup) < 1:
continue
content = soup.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
@@ -3528,15 +3659,17 @@ def xin_jiang():
if flag:
save_data(dic_news)
num += 1
+count += 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def xin_jiang_jsbt():
num = 0
+count = 0
start_time = time.time()
for page in range(1, 6):
if page == 1:
@@ -3592,6 +3725,9 @@ def xin_jiang():
if len(fu_jian_soup) < 1:
continue
content = soup.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
@@ -3627,6 +3763,7 @@ def xin_jiang():
if flag:
save_data(dic_news)
num += 1
+count += 1
href_res.close()
except:
pass
@@ -3634,7 +3771,7 @@ def xin_jiang():
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
xin_jiang1()
xin_jiang_jsbt()
@@ -3643,6 +3780,7 @@ def xin_jiang():
def shan_xi():
pathType = 'policy/shanxi/'
num = 0
+count = 0
start_time = time.time()
for page in range(1, 7):
if page == 1:
@@ -3712,6 +3850,9 @@ def shan_xi():
if len(fu_jian_soup) < 1:
continue
content = soup.text
+if content == '' or content == None:
+log.info(f'-----{href}----{title}----内容为空-----')
+continue
pattern = r'(晋国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
@@ -3747,17 +3888,19 @@ def shan_xi():
if flag:
save_data(dic_news)
num += 1
+count += 1
except:
pass
except:
pass
end_time = time.time()
-print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 辽宁 # 辽宁
def liao_ning(): def liao_ning():
pathType = 'policy/liaoning/' pathType = 'policy/liaoning/'
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml' url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
...@@ -3823,6 +3966,9 @@ def liao_ning(): ...@@ -3823,6 +3966,9 @@ def liao_ning():
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
continue continue
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)' pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content) match_list = re.findall(pattern, content)
if len(match_list) > 0: if len(match_list) > 0:
...@@ -3858,6 +4004,7 @@ def liao_ning(): ...@@ -3858,6 +4004,7 @@ def liao_ning():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
...@@ -3869,6 +4016,7 @@ def liao_ning(): ...@@ -3869,6 +4016,7 @@ def liao_ning():
def hei_long_jiang(): def hei_long_jiang():
pathType = 'policy/heilongjiang/' pathType = 'policy/heilongjiang/'
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
for page in range(1, 3): for page in range(1, 3):
url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}' url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
...@@ -3926,6 +4074,9 @@ def hei_long_jiang(): ...@@ -3926,6 +4074,9 @@ def hei_long_jiang():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -3953,6 +4104,7 @@ def hei_long_jiang(): ...@@ -3953,6 +4104,7 @@ def hei_long_jiang():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
...@@ -3960,11 +4112,12 @@ def hei_long_jiang(): ...@@ -3960,11 +4112,12 @@ def hei_long_jiang():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 江苏 # 江苏
def jiang_su(): def jiang_su():
num = 0 num = 0
count = 0
pathType = 'policy/jiangsu/' pathType = 'policy/jiangsu/'
start_time = time.time() start_time = time.time()
pagestart = 1 pagestart = 1
...@@ -4034,6 +4187,9 @@ def jiang_su(): ...@@ -4034,6 +4187,9 @@ def jiang_su():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1: if len(pub_hao) < 1:
pattern = r'(苏国.{1,}?号)|(国.{1,}?号)' pattern = r'(苏国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content) match_list = re.findall(pattern, content)
...@@ -4068,18 +4224,20 @@ def jiang_su(): ...@@ -4068,18 +4224,20 @@ def jiang_su():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 安徽 # 安徽
def an_hui(): def an_hui():
pathType = 'policy/anhui/' pathType = 'policy/anhui/'
def an_hui1(): def an_hui1():
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
for page in range(1, 4): for page in range(1, 4):
url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc' url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
...@@ -4137,6 +4295,9 @@ def an_hui(): ...@@ -4137,6 +4295,9 @@ def an_hui():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -4164,15 +4325,17 @@ def an_hui(): ...@@ -4164,15 +4325,17 @@ def an_hui():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def an_hui2(): def an_hui2():
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
for page in range(1, 25): for page in range(1, 25):
url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc' url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
...@@ -4233,6 +4396,9 @@ def an_hui(): ...@@ -4233,6 +4396,9 @@ def an_hui():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -4260,6 +4426,7 @@ def an_hui(): ...@@ -4260,6 +4426,7 @@ def an_hui():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
href_res.close() href_res.close()
except: except:
pass pass
...@@ -4267,7 +4434,7 @@ def an_hui(): ...@@ -4267,7 +4434,7 @@ def an_hui():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
an_hui1() an_hui1()
an_hui2() an_hui2()
...@@ -4280,6 +4447,7 @@ def jiang_xi(): ...@@ -4280,6 +4447,7 @@ def jiang_xi():
121-164 121-164
""" """
num = 0 num = 0
count = 0
pathType = 'policy/jiangxi/' pathType = 'policy/jiangxi/'
start_time = time.time() start_time = time.time()
startrecord = 1 startrecord = 1
...@@ -4360,6 +4528,9 @@ def jiang_xi(): ...@@ -4360,6 +4528,9 @@ def jiang_xi():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1: if len(pub_hao) < 1:
pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)' pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content) match_list = re.findall(pattern, content)
...@@ -4395,16 +4566,18 @@ def jiang_xi(): ...@@ -4395,16 +4566,18 @@ def jiang_xi():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 河南 # 河南
def he_nan(): def he_nan():
num = 0 num = 0
count = 0
pathType = 'policy/henan/' pathType = 'policy/henan/'
start_time = time.time() start_time = time.time()
for page in range(0, 7): for page in range(0, 7):
...@@ -4456,6 +4629,9 @@ def he_nan(): ...@@ -4456,6 +4629,9 @@ def he_nan():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(豫国.{1,}?号)|(国.{1,}?号)' pattern = r'(豫国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content) match_list = re.findall(pattern, content)
if len(match_list) > 0: if len(match_list) > 0:
...@@ -4489,16 +4665,18 @@ def he_nan(): ...@@ -4489,16 +4665,18 @@ def he_nan():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
href_res.close() href_res.close()
resp_text.close() resp_text.close()
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖南 # 湖南
def hu_nan(): def hu_nan():
num = 0 num = 0
count = 0
pathType = 'policy/hunan/' pathType = 'policy/hunan/'
start_time = time.time() start_time = time.time()
for page in range(1, 7): for page in range(1, 7):
...@@ -4565,6 +4743,9 @@ def hu_nan(): ...@@ -4565,6 +4743,9 @@ def hu_nan():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -4592,18 +4773,20 @@ def hu_nan(): ...@@ -4592,18 +4773,20 @@ def hu_nan():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 甘肃 # 甘肃
def gan_su(): def gan_su():
pathType = 'policy/gansu/' pathType = 'policy/gansu/'
def gan_su1(): def gan_su1():
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
bro = getDriver() bro = getDriver()
urls = ['http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml', urls = ['http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml',
...@@ -4686,6 +4869,9 @@ def gan_su(): ...@@ -4686,6 +4869,9 @@ def gan_su():
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
# t = time.strptime(publishDate, "%Y年%m月%d日") # t = time.strptime(publishDate, "%Y年%m月%d日")
# publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t) # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -4715,6 +4901,7 @@ def gan_su(): ...@@ -4715,6 +4901,7 @@ def gan_su():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except Exception as e: except Exception as e:
print(e) print(e)
pass pass
...@@ -4724,6 +4911,7 @@ def gan_su(): ...@@ -4724,6 +4911,7 @@ def gan_su():
def gan_su2(): def gan_su2():
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
bro = getDriver() bro = getDriver()
url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml' url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml'
...@@ -4821,6 +5009,9 @@ def gan_su(): ...@@ -4821,6 +5009,9 @@ def gan_su():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(content) < 2: if len(content) < 2:
continue continue
# t = time.strptime(publishDate, "%Y年%m月%d日") # t = time.strptime(publishDate, "%Y年%m月%d日")
...@@ -4852,6 +5043,7 @@ def gan_su(): ...@@ -4852,6 +5043,7 @@ def gan_su():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except Exception as e: except Exception as e:
print(e) print(e)
except Exception as e: except Exception as e:
...@@ -4859,10 +5051,11 @@ def gan_su(): ...@@ -4859,10 +5051,11 @@ def gan_su():
pass pass
bro.quit() bro.quit()
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def gan_su3(): def gan_su3():
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
# # service = Service(r'D:/chrome/103/chromedriver.exe') # # service = Service(r'D:/chrome/103/chromedriver.exe')
# chrome_options = webdriver.ChromeOptions() # chrome_options = webdriver.ChromeOptions()
...@@ -4979,6 +5172,9 @@ def gan_su(): ...@@ -4979,6 +5172,9 @@ def gan_su():
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(content) < 2: if len(content) < 2:
continue continue
# t = time.strptime(publishDate, "%Y年%m月%d日") # t = time.strptime(publishDate, "%Y年%m月%d日")
...@@ -5010,13 +5206,14 @@ def gan_su(): ...@@ -5010,13 +5206,14 @@ def gan_su():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except Exception as e: except Exception as e:
print(e) print(e)
except: except:
pass pass
bro.quit() bro.quit()
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
gan_su1() gan_su1()
gan_su2() gan_su2()
...@@ -5025,6 +5222,7 @@ def gan_su(): ...@@ -5025,6 +5222,7 @@ def gan_su():
# 宁夏 # 宁夏
def ning_xia(): def ning_xia():
num = 0 num = 0
count = 0
pathType = 'policy/ningxia/' pathType = 'policy/ningxia/'
start_time = time.time() start_time = time.time()
for page in range(0, 3): for page in range(0, 3):
...@@ -5082,6 +5280,9 @@ def ning_xia(): ...@@ -5082,6 +5280,9 @@ def ning_xia():
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
t = time.strptime(publishDate, "%Y年%m月%d日") t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t) publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...@@ -5111,16 +5312,18 @@ def ning_xia(): ...@@ -5111,16 +5312,18 @@ def ning_xia():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 陕西 # 陕西
def shanxi(): def shanxi():
num = 0 num = 0
count = 0
pathType = 'policy/shan_xi/' pathType = 'policy/shan_xi/'
start_time = time.time() start_time = time.time()
url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127' url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
...@@ -5184,6 +5387,9 @@ def shanxi(): ...@@ -5184,6 +5387,9 @@ def shanxi():
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -5211,6 +5417,7 @@ def shanxi(): ...@@ -5211,6 +5417,7 @@ def shanxi():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
res_href.close() res_href.close()
except: except:
pass pass
...@@ -5218,7 +5425,7 @@ def shanxi(): ...@@ -5218,7 +5425,7 @@ def shanxi():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 西藏 # 西藏
def xi_zang(): def xi_zang():
...@@ -5228,6 +5435,7 @@ def xi_zang(): ...@@ -5228,6 +5435,7 @@ def xi_zang():
'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ] 'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
for url in url_list: for url in url_list:
num = 0 num = 0
count = 0
try: try:
res = requests.get(url=url, headers=headers) res = requests.get(url=url, headers=headers)
res.encoding = res.apparent_encoding res.encoding = res.apparent_encoding
...@@ -5256,6 +5464,9 @@ def xi_zang(): ...@@ -5256,6 +5464,9 @@ def xi_zang():
contentWithTag = str(i_soup.find(id='NewsContent')) contentWithTag = str(i_soup.find(id='NewsContent'))
soup = BeautifulSoup(contentWithTag, 'html.parser') soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a') fu_jian_soup = soup.find_all('a')
id_list = [] id_list = []
for file in fu_jian_soup: for file in fu_jian_soup:
...@@ -5306,18 +5517,20 @@ def xi_zang(): ...@@ -5306,18 +5517,20 @@ def xi_zang():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 青海 # 青海
def qing_hai(): def qing_hai():
pathType = 'policy/qinghai/' pathType = 'policy/qinghai/'
def qing_hai1(): def qing_hai1():
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604' url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604'
try: try:
...@@ -5353,6 +5566,9 @@ def qing_hai(): ...@@ -5353,6 +5566,9 @@ def qing_hai():
origin = str(page.find('div', attrs={'class': 'foot-fb'})) origin = str(page.find('div', attrs={'class': 'foot-fb'}))
soup = BeautifulSoup(contentWithTag, 'html.parser') soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{durl}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a') fu_jian_soup = soup.find_all('a')
id_list = [] id_list = []
for file in fu_jian_soup: for file in fu_jian_soup:
...@@ -5364,7 +5580,7 @@ def qing_hai(): ...@@ -5364,7 +5580,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1681') retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5405,15 +5621,17 @@ def qing_hai(): ...@@ -5405,15 +5621,17 @@ def qing_hai():
# print(id) # print(id)
# id_list.append(id) # id_list.append(id)
num += 1 num += 1
count += 1
except: except:
pass pass
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def qing_hai2(): def qing_hai2():
num = 0 num = 0
count = 0
start_time = time.time() start_time = time.time()
urls = [ urls = [
'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627', 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627',
...@@ -5446,6 +5664,7 @@ def qing_hai(): ...@@ -5446,6 +5664,7 @@ def qing_hai():
durl = tr.find('a').get('href') durl = tr.find('a').get('href')
is_href = db_storage.find_one({'网址': durl}) is_href = db_storage.find_one({'网址': durl})
if is_href: if is_href:
num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
continue continue
title = tr.find('a').text title = tr.find('a').text
...@@ -5471,6 +5690,9 @@ def qing_hai(): ...@@ -5471,6 +5690,9 @@ def qing_hai():
origin = '' origin = ''
soup = BeautifulSoup(contentWithTag, 'html.parser') soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{durl}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a') fu_jian_soup = soup.find_all('a')
id_list = [] id_list = []
for file in fu_jian_soup: for file in fu_jian_soup:
...@@ -5482,7 +5704,7 @@ def qing_hai(): ...@@ -5482,7 +5704,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1681') retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5490,7 +5712,7 @@ def qing_hai(): ...@@ -5490,7 +5712,7 @@ def qing_hai():
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器 # todo:替换完成之后,将附件上传至文件服务器
...@@ -5523,13 +5745,14 @@ def qing_hai(): ...@@ -5523,13 +5745,14 @@ def qing_hai():
# print(id) # print(id)
# id_list.append(id) # id_list.append(id)
num += 1 num += 1
count += 1
except: except:
pass pass
res.close() res.close()
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
qing_hai1() qing_hai1()
qing_hai2() qing_hai2()
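Editor's note: attachment handling changes the same way wherever it appears in this diff: the old `baseCore.uploadToserver(file_href, sid)` call is replaced by `baseCore.uptoOBS(file_href, sid, pathType, file_name)`, and the `<a>` tag's `href` is rewritten to the returned `full_path` directly rather than being prefixed with `http://114.115.215.96/`. Below is a hedged sketch of the soup-rewrite step only; `fake_upload` is an invented stand-in for the `uptoOBS`/`tableUpdate` pair, and the sample HTML and OBS URL are assumptions.

```python
from bs4 import BeautifulSoup

HTML = '''
<div id="NewsContent">
  正文内容……
  <a href="/uploads/2023/file.pdf">附件:某某文件.pdf</a>
</div>
'''

FILE_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt',
             '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')


def fake_upload(file_href, file_name):
    """Stand-in for baseCore.uptoOBS(...) plus baseCore.tableUpdate(...):
    pretend the file was pushed to OBS and return (att_id, full_path)."""
    return 1, f'https://obs.example.com/policy/qinghai/{file_name}'


soup = BeautifulSoup(HTML, 'html.parser')
id_list = []
for file in soup.find_all('a'):
    file_href = file.get('href', '')
    if not any(ext in file_href for ext in FILE_EXTS):  # same substring check as the spiders
        continue
    file_name = file.text.strip()
    att_id, full_path = fake_upload(file_href, file_name)
    id_list.append(att_id)
    # After the change, the tag points at the returned full_path directly,
    # with no hard-coded http://114.115.215.96/ prefix.
    file['href'] = full_path

contentWithTag = str(soup.prettify())
print(id_list)
print(contentWithTag)
```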
...@@ -5537,6 +5760,8 @@ def qing_hai(): ...@@ -5537,6 +5760,8 @@ def qing_hai():
# 河北 # 河北
def he_bei(): def he_bei():
num = 0 num = 0
count = 0
pathType = 'policy/hebei/'
start_time = time.time() start_time = time.time()
url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json' url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
try: try:
...@@ -5551,6 +5776,7 @@ def he_bei(): ...@@ -5551,6 +5776,7 @@ def he_bei():
href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id) href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id)
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
pub_time_ = info['updated'] pub_time_ = info['updated']
m = round(pub_time_ / 1000) # 四舍五入取10位时间戳(秒级) m = round(pub_time_ / 1000) # 四舍五入取10位时间戳(秒级)
...@@ -5569,7 +5795,7 @@ def he_bei(): ...@@ -5569,7 +5795,7 @@ def he_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1668') retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5577,13 +5803,16 @@ def he_bei(): ...@@ -5577,13 +5803,16 @@ def he_bei():
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1: if len(fu_jian_soup) < 1:
continue continue
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)' pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content) match_list = re.findall(pattern, content)
if len(match_list) > 0: if len(match_list) > 0:
...@@ -5619,14 +5848,17 @@ def he_bei(): ...@@ -5619,14 +5848,17 @@ def he_bei():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖北 # 湖北
def hu_bei(): def hu_bei():
num = 0 num = 0
count = 0
pathType = 'policy/hubei/'
start_time = time.time() start_time = time.time()
hrefs = [] hrefs = []
url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/' url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
...@@ -5649,6 +5881,7 @@ def hu_bei(): ...@@ -5649,6 +5881,7 @@ def hu_bei():
for href in hrefs: for href in hrefs:
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1
continue continue
try: try:
driver.get(href) driver.get(href)
...@@ -5684,7 +5917,7 @@ def hu_bei(): ...@@ -5684,7 +5917,7 @@ def hu_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1675') retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']: if retData['state']:
pass pass
else: else:
...@@ -5692,14 +5925,16 @@ def hu_bei(): ...@@ -5692,14 +5925,16 @@ def hu_bei():
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num) att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num)
id_list.append(att_id) id_list.append(att_id)
# todo:将返回的地址更新到soup # todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = full_path
# id_ = redefid(id_list) # id_ = redefid(id_list)
contentWithTag = str(soup.prettify()) contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1: if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1: if len(fu_jian_soup) < 1:
continue continue
content = soup.text content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
...@@ -5727,48 +5962,49 @@ def hu_bei(): ...@@ -5727,48 +5962,49 @@ def hu_bei():
if flag: if flag:
save_data(dic_news) save_data(dic_news)
num += 1 num += 1
count += 1
except Exception as e: except Exception as e:
pass pass
driver.close() driver.close()
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__': if __name__ == '__main__':
# get_content1() get_content1()
# get_content2() get_content2()
# get_content3() get_content3()
# bei_jing() bei_jing()
# nei_meng_gu() nei_meng_gu()
ji_lin() ji_lin()
# shang_hai() shang_hai()
# zhe_jiang() zhe_jiang()
# fu_jian() fu_jian()
# shan_dong() shan_dong()
# guang_dong() guang_dong()
# hai_nan() hai_nan()
# si_chuan() si_chuan()
# guang_xi() guang_xi()
# gui_zhou() gui_zhou()
# yun_nan() yun_nan()
# chong_qing() chong_qing()
# tian_jin() tian_jin()
# xin_jiang() xin_jiang()
# shan_xi() shan_xi()
# liao_ning() liao_ning()
# hei_long_jiang() hei_long_jiang()
# jiang_su() jiang_su()
# an_hui() an_hui()
# jiang_xi() jiang_xi()
# he_nan() he_nan()
# hu_nan() hu_nan()
# gan_su() gan_su()
# ning_xia() ning_xia()
# xi_zang() xi_zang()
# shanxi() shanxi()
# qing_hai() qing_hai()
# he_bei() he_bei()
# qing_hai() qing_hai()
# current_time = datetime.datetime.now() current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1) midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds() sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds) time.sleep(sleep_seconds)
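Editor's note: the `__main__` block is uncommented so that every province spider runs, and the previously commented-out tail now computes the seconds remaining until the next midnight and sleeps for that long. The computation on its own, reproduced as a runnable snippet (the `time.sleep` call is left commented here so the snippet returns immediately):

```python
import datetime
import time

current_time = datetime.datetime.now()
# Midnight of the *next* day: today at 00:00:00 plus one day.
midnight_time = current_time.replace(hour=0, minute=0, second=0,
                                     microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
print(f'距离下一个零点还有 {sleep_seconds:.0f} 秒')
# time.sleep(sleep_seconds)
```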
...@@ -40,7 +40,8 @@ def save_data(dic_news): ...@@ -40,7 +40,8 @@ def save_data(dic_news):
'网址':dic_news['sourceAddress'], '网址':dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'], 'tid':dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'], '来源':dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate'] '创建时间':dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100]
} }
db_storage.insert_one(aaa_dic) db_storage.insert_one(aaa_dic)
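Editor's note: in the second file's `save_data`, the MongoDB dedup record now also stores the creation time and the first 100 characters of the tagged content. The slice bounds the stored document size regardless of how long the original HTML is; a trivial illustration with made-up values:

```python
contentWithTag = '<div id="NewsContent"><p>' + '正文' * 200 + '</p></div>'
aaa_dic = {
    '创建时间': '2023-09-01 12:00:00',
    '带标签内容': contentWithTag[:100],  # only the first 100 characters are kept
}
print(len(aaa_dic['带标签内容']))  # -> 100
```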
......