Commit b376f641  Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

......@@ -650,7 +650,7 @@ class BaseCore:
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
def tableUpdate(self,retData,com_name,year,pdf_name,num,pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
......@@ -670,12 +670,12 @@ class BaseCore:
id = ''
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
create_time, page_size,pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
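With publish_time added, the insert carries 15 columns and therefore needs 15 placeholders matched by a 15-element values tuple. A minimal standalone sketch of that statement, assuming publish_time is the new column on clb_sys_attachment and using a plain cursor/cnx in place of self.cursor_/self.cnx_:

# Minimal sketch -- column order mirrors the Upsql string in the hunk above.
def insert_attachment(cursor, cnx, year, pdf_name, type_id, item_id, group_name, path,
                      full_path, category, file_size, order_by, status, create_by,
                      create_time, page_size, pub_time):
    Upsql = '''insert into clb_sys_attachment
               (year,name,type_id,item_id,group_name,path,full_path,category,file_size,
                order_by,status,create_by,create_time,page_size,publish_time)
               values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    values = (year, pdf_name, type_id, item_id, group_name, path, full_path, category,
              file_size, order_by, status, create_by, create_time, page_size, pub_time)
    cursor.execute(Upsql, values)   # 15 placeholders, 15 values -- one per column
    cnx.commit()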
......@@ -759,7 +759,7 @@ class BaseCore:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
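The switch to unquote here matters because OBS object URLs percent-encode non-ASCII file names; decoding keeps both path and full_path human-readable. A small illustration (the URL below is hypothetical):

from urllib.parse import unquote

object_url = 'https://example.obs.cn-north-4.myhuaweicloud.com/policy/gwywj/%E5%B9%B4%E6%8A%A5.pdf'  # hypothetical
path = unquote(object_url.split('.com')[1])   # '/policy/gwywj/年报.pdf'
full_path = unquote(object_url)               # full decoded URL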
......
import json
import json
......@@ -133,7 +133,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
return False
#插入数据库获取att_id
num = num + 1
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num,pub_time)
if att_id:
pass
else:
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -164,7 +164,7 @@ def spider_annual_report(dict_info,num):
return False
num = num + 1
try:
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num)
att_id = baseCore.tableUpdate(retData,com_name,year,name_pdf,num,pub_time)
content = retData['content']
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
......
......@@ -91,7 +91,8 @@ def save_data(dic_news):
'网址': dic_news['sourceAddress'],
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate']
'创建时间': dic_news['createDate'],
'带标签内容':dic_news['contentWithTag'][:100]
}
db_storage.insert_one(aaa_dic)
......@@ -138,6 +139,7 @@ def remove_dup():
# 国务院文件
def get_content1():
pathType = 'policy/gwywj/'
def getPageConunt(a_list, url, headers, s):
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
......@@ -256,7 +258,7 @@ def get_content1():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1766')
retData = baseCore.uptoOBS(file_href,'1766',pathType,file_name)
if retData['state']:
pass
else:
......@@ -265,7 +267,7 @@ def get_content1():
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
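Taken together, the attachment hunks replace uploadToserver with uptoOBS and drop the hard-coded 114.115.215.96 prefix on the rewritten href. A sketch of the resulting per-attachment flow; retData keys follow the BaseCore hunk above, '国务院文件' is a hypothetical source label (the real argument sits outside this hunk), and tableUpdate is the 4-argument variant this script uses:

# Sketch of the new per-attachment flow (uploadToserver -> uptoOBS).
for file in contentWithTag.find_all('a'):
    file_href = file['href']
    file_name = file.text.strip()
    retData = baseCore.uptoOBS(file_href, '1766', pathType, file_name)   # upload to OBS
    if not retData['state']:                                             # upload failed, skip this file
        continue
    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)  # hypothetical source label
    id_list.append(att_id)
    file['href'] = full_path   # point the <a> tag at the OBS path instead of 114.115.215.96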
......@@ -308,6 +310,7 @@ def get_content1():
# 国务院部门文件
def get_content2():
pathType = 'policy/gwybmwj/'
def getTotalpage(bmfl,headers,session):
ip = baseCore.get_proxy()
pageNo = 1
......@@ -336,6 +339,7 @@ def get_content2():
session.keep_alive = False
start_time = time.time()
num = 0
count = 0
result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
'人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
'国家卫生健康委员会',
......@@ -396,6 +400,9 @@ def get_content2():
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
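The empty-content guard added in this hunk is repeated verbatim in every provincial crawler below; a small helper (hypothetical name, not part of the commit) shows the shared pattern:

def is_empty_content(content, href, title):
    # Treat missing, blank, or the literal string 'None' as empty and log it once.
    if content is None or str(content).strip() in ('', 'None'):
        log.info(f'----{href}---{title}---内容为空---')
        return True
    return False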
......@@ -407,7 +414,7 @@ def get_content2():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1699')
retData = baseCore.uptoOBS(file_href,'1699',pathType,file_name)
if retData['state']:
pass
else:
......@@ -416,7 +423,7 @@ def get_content2():
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
except:
log.error(f'{title}...{href}获取内容失败')
continue
......@@ -446,6 +453,7 @@ def get_content2():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
count += 1
num += 1
except:
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
......@@ -454,10 +462,11 @@ def get_content2():
log.error(f'{bmfl}...获取页数失败')
continue
end_time = time.time()
log.info(f'共抓取国务院部门文件{num}条数据,耗时{end_time - start_time}')
log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
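The num/count split introduced here (and mirrored in the other crawlers) separates pages examined from pages actually stored: num also counts detail pages already present in MongoDB that are skipped, while count only grows after a successful Kafka send and save. A condensed sketch of the convention, inferred from the hunks (build_dic_news is a hypothetical helper standing in for the fetch-and-parse code):

num, count = 0, 0
for href in hrefs:
    if db_storage.find_one({'网址': href}):
        num += 1                          # already collected: skip, but still counted as seen
        continue
    dic_news = build_dic_news(href)       # hypothetical helper: fetch + parse the detail page
    if sendKafka(dic_news):
        save_data(dic_news)
        count += 1                        # only newly stored records
    num += 1
log.info(f'共抓取{count}条数据')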
# 国务院国有资产监督管理委员会-政策发布
def get_content3():
pathType = 'policy/gyzc/'
def getPage():
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
req = requests.get(url, headers=headers, verify=False)
......@@ -499,6 +508,9 @@ def get_content3():
if len(pub_hao) > 15:
pub_hao = ''
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}----{title}----内容为空----')
return
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
......@@ -510,7 +522,7 @@ def get_content3():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1642')
retData = baseCore.uptoOBS(file_href,'1642',pathType,file_name)
if retData['state']:
pass
else:
......@@ -519,7 +531,7 @@ def get_content3():
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
......@@ -542,7 +554,7 @@ def get_content3():
'summary': '', #摘要
'title': title #标题
}
# print(title)
# log.info(title)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
......@@ -550,6 +562,7 @@ def get_content3():
def partTwo():
start_time = time.time()
num = 0
count = 0
totalpage = getPage()
for page in range(1, totalpage):
url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
......@@ -570,12 +583,14 @@ def get_content3():
continue
sendContent(href, headers,title,pub_time,num)
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
def partOne():
start_time = time.time()
num = 0
count = 0
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
try:
# get请求,需要取消ssl验证
......@@ -603,10 +618,11 @@ def get_content3():
continue
sendContent(href, headers,title,pub_time,num)
num += 1
count += 1
except:
pass
end_time = time.time()
log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
partOne()
# 增量执行需要注释掉partTwo()
......@@ -614,7 +630,7 @@ def get_content3():
# 北京
def bei_jing():
num = 0
start_time = time.time()
pathType = 'policy/beijing/'
# 有反爬需要使用selenium
......@@ -662,6 +678,7 @@ def bei_jing():
time.sleep(2)
log.info(f'------{len(hrefs)}条数据-------------')
num = 0
count = 0
for href in hrefs:
id_list = []
title = href[1]
......@@ -700,12 +717,15 @@ def bei_jing():
soup = paserUrl(soup_cont, href[0])
soup.prettify()
if soup.text == '' or soup.text == 'None':
log.info(f'----{href[0]}----{title}----内容为空----')
continue
# todo:去掉扫一扫
try:
soup.find('div', id='div_div').decompose()
except:
continue
# print(title)
# log.info(title)
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
......@@ -756,11 +776,10 @@ def bei_jing():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
bro.quit()
except Exception as e:
log.info(e)
......@@ -827,6 +846,9 @@ def nei_meng_gu():
else:
i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
content = str(i_content)
if i_content.text == '' or i_content.text == 'None':
log.info(f'{real_href}------{title}----内容为空-----')
continue
# todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian))
......@@ -849,7 +871,7 @@ def nei_meng_gu():
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
id_list.append(att_id)
print(title)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -892,6 +914,7 @@ def ji_lin():
pathType = 'policy/jilin/'
start = time.time()
num = 0
count = 0
url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
try:
resp_text = requests.get(url=url, headers=headers, verify=False)
......@@ -964,6 +987,9 @@ def ji_lin():
i_content = soup
contentWithTag = soup.find(class_='zsy_comain')
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 发文字号
find_hao = i_content.find_all('p')[:3]
pub_hao = ''
......@@ -1010,6 +1036,9 @@ def ji_lin():
p.extract()
contentWithTag = i_content
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 找到附件上传至文件服务器
fj_soup = i_soup.find('div', class_='wenjianfujian')
fj_list = fj_soup.find_all('a')
......@@ -1040,7 +1069,7 @@ def ji_lin():
soup.find('div', id='qr_container').decompose()
else:
pass
print(title)
log.info(title)
# print('............................................................')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -1073,13 +1102,14 @@ def ji_lin():
if flag:
save_data(dic_news)
num = num + 1
count += 1
except Exception as e:
log.info(e)
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 上海
......@@ -1087,6 +1117,7 @@ def shang_hai():
start = time.time()
pathType = 'policy/shanghai/'
num = 0
count =0
for page in range(1, 7):
if page == 1:
......@@ -1111,7 +1142,7 @@ def shang_hai():
num+=1
continue
try:
href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
# href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
href_text = requests.get(url=href, headers=headers, verify=False).text
doc_href = pq(href_text)
doc_href_ = BeautifulSoup(href_text, 'html.parser')
......@@ -1120,6 +1151,9 @@ def shang_hai():
info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
pub_source = info_list[1].find('b').text.split('信息来源:')[1]
content = doc_href_.find('div', attrs={'class': 'detail_03'})
if content == '' or content == 'None':
log.info(f'{href}-----{title}----内容为空')
continue
# 将文章中的附件字段删去
pattern = r'\d+\.'
......@@ -1181,7 +1215,7 @@ def shang_hai():
else:
continue
print(title)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -1208,19 +1242,20 @@ def shang_hai():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 浙江
def zhe_jiang():
start = time.time()
pathType = 'policy/zhejiang/'
num = 0
count = 0
url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
try:
res = requests.get(url, headers).content
......@@ -1235,7 +1270,7 @@ def zhe_jiang():
href = li.find('a')['href']
pub_time = li.find('a').find('span').text
title = li.find('a').text.replace(pub_time, '').strip()
# print(title)
# log.info(title)
if 'http' in href:
href = href
else:
......@@ -1302,9 +1337,12 @@ def zhe_jiang():
# fj_href_list.append(fujian_href)
# print(fj_href_list)
print(title)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
if content == '' or content == 'None':
log.info(f'{href}-----{title}----内容为空')
continue
dic_news = {
'attachmentIds': [],
'author': '',
......@@ -1329,20 +1367,21 @@ def zhe_jiang():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 福建
def fu_jian():
error_tag = str(404)
pathType = 'policy/fujian/'
num = 0
count = 0
start_time = time.time()
url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
try:
......@@ -1386,8 +1425,8 @@ def fu_jian():
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
# real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
# print(real_href)
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
print(real_href)
is_href = db_storage.find_one({'网址': real_href})
if is_href:
num+=1
......@@ -1437,6 +1476,7 @@ def fu_jian():
if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
print(fj_href)
# 找到附件后 上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1673',pathType,file_name)
if retData['state']:
......@@ -1453,6 +1493,9 @@ def fu_jian():
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
except:
......@@ -1460,6 +1503,9 @@ def fu_jian():
pub_time = ''
contentwithtag = i_soup.find('tabs tab_base_01 rules_con1')
content = contentwithtag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf').text.strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -1484,18 +1530,19 @@ def fu_jian():
'summary': '',
'title': title
}
# print(dic_news)
# log.info(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num += 1
log.info(title)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 山东
def shan_dong():
......@@ -1505,6 +1552,7 @@ def shan_dong():
}
start = time.time()
num = 0
count = 0
url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
for url in url_list:
try:
......@@ -1539,6 +1587,9 @@ def shan_dong():
# print(pub_time,pub_source,pub_hao)
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if pub_hao == '无':
p_list = contentwithtag.find_all('p')
for p in p_list:
......@@ -1571,6 +1622,9 @@ def shan_dong():
i = i + 1
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -1597,23 +1651,22 @@ def shan_dong():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
if content == '' or content == 'None':
continue
else:
print(title)
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 广东
def guang_dong():
start = time.time()
pathType = 'policy/guangdong/'
num = 0
count = 0
url = 'http://gzw.gd.gov.cn/zcfg/index.html'
try:
resp_href = requests.get(url=url, headers=headers, verify=False)
......@@ -1653,6 +1706,9 @@ def guang_dong():
i_soup = paserUrl(i_soup, href)
content = i_soup.find('div', attrs={'class', 'box_info'})
contentwithTag = str(content)
if content == '' or content == None:
log.info(f'{href}-----{title}----内容为空----')
continue
fu_jian_list = content.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -1701,15 +1757,15 @@ def guang_dong():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
num = num + 1
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
print('共', count, '条', '...........', '共耗时', end - start, '秒')
# 海南
def hai_nan():
......@@ -1717,6 +1773,7 @@ def hai_nan():
def hai_nan1():
# 部门文件
num = 0
count = 0
start_time = time.time()
for page in range(13):
if page == 0:
......@@ -1770,6 +1827,9 @@ def hai_nan():
except:
pass
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -1811,6 +1871,9 @@ def hai_nan():
topicClassification = tbody_text.split('分  类:')[1].split('发文机关:')[0].strip().lstrip()
contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = source.find_all('a')
try:
for fu_jian in fu_jian_list:
......@@ -1862,6 +1925,9 @@ def hai_nan():
topicClassification = ''
contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -1888,19 +1954,20 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
log.info(title)
count += 1
num = num + 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def hai_nan2():
def hai_nan_sw(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
......@@ -1936,6 +2003,9 @@ def hai_nan():
pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -1961,10 +2031,11 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
log.info(title)
num += 1
count += 1
href_text.close()
# save_data(result_dict)
print(title)
num += 1
except:
pass
req.close()
......@@ -1972,6 +2043,7 @@ def hai_nan():
def hai_nan_szf(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
......@@ -2010,6 +2082,9 @@ def hai_nan():
pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
except:
# print(href)
pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
......@@ -2021,6 +2096,9 @@ def hai_nan():
writtenDate = ''
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2068,10 +2146,12 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
log.info(title)
num += 1
count += 1
href_text.close()
# save_data(result_dict)
print(title)
num += 1
except:
pass
req.close()
......@@ -2079,6 +2159,7 @@ def hai_nan():
def hai_nan_szfbgt(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.text, 'html.parser')
......@@ -2127,6 +2208,9 @@ def hai_nan():
writtenDate = ''
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
if fu_jian_list:
for fu_jian in fu_jian_list:
......@@ -2147,7 +2231,7 @@ def hai_nan():
att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
id_list.append(att_id)
fu_jian['href'] = full_path
print(f'----附件:{fu_jian_href}')
# print(f'----附件:{fu_jian_href}')
else:
pass
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -2176,10 +2260,10 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
log.info(title)
num += 1
count += 1
href_text.close()
# save_data(result_dict)
print(title)
num += 1
except:
pass
req.close()
......@@ -2187,6 +2271,7 @@ def hai_nan():
def hai_nan_zy(page_href):
num = 0
count = 0
req = requests.get(url=page_href, headers=headers, verify=False)
req.encoding = req.apparent_encoding
doc_resp = BeautifulSoup(req.content, 'html.parser')
......@@ -2240,6 +2325,9 @@ def hai_nan():
pub_hao = ''
contentWithTag = doc_href.find(class_='pages_content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{i_href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -2266,10 +2354,12 @@ def hai_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
log.info(title)
num += 1
count += 1
href_text.close()
# save_data(result_dict)
print(title)
num += 1
except:
pass
req.close()
......@@ -2277,6 +2367,7 @@ def hai_nan():
def start():
num = 0
count = 0
start_time = time.time()
url = "https://www.hainan.gov.cn/hainan/qzcwj/zywj.shtml"
try:
......@@ -2306,7 +2397,7 @@ def hai_nan():
else:
page_href = str(url) + f'home_{page}.htm'
try:
num += hai_nan_zy(page_href)
count += hai_nan_zy(page_href)
except:
pass
time.sleep(1)
......@@ -2320,7 +2411,7 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
num += hai_nan_sw(page_href)
count += hai_nan_sw(page_href)
except:
pass
elif url == leibie_href_list[2]:
......@@ -2332,7 +2423,7 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
num += hai_nan_szf(page_href)
count += hai_nan_szf(page_href)
except:
pass
else:
......@@ -2343,22 +2434,22 @@ def hai_nan():
else:
page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
try:
num += hai_nan_szfbgt(page_href)
count += hai_nan_szfbgt(page_href)
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
start()
hai_nan1()
hai_nan2()
# 四川
def si_chuan():
num = 0
count = 0
pathType = 'policy/sichuan/'
start_time = time.time()
for page in range(1, 3):
......@@ -2393,6 +2484,9 @@ def si_chuan():
doc_href = paserUrl(doc_href, href)
contentWithTag = doc_href.find('div', id='scrollBox')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = doc_href.find_all('a')
for fu_jian in fu_jian_list:
......@@ -2441,19 +2535,20 @@ def si_chuan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
log.info(title)
count += 1
num = num + 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 广西
def guang_xi():
num = 0
count = 0
pathType = 'policy/guangxi/'
start_time = time.time()
url_all = """
......@@ -2519,6 +2614,9 @@ def guang_xi():
contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
contentWithTag = paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
......@@ -2568,14 +2666,14 @@ def guang_xi():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
log.info(title)
count += 1
num = num + 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 贵州
def gui_zhou():
......@@ -2585,6 +2683,7 @@ def gui_zhou():
"""
pathType = 'policy/guizhou/'
num = 0
count = 0
start_time = time.time()
for page in range(0, 11):
if page == 0:
......@@ -2630,6 +2729,9 @@ def gui_zhou():
contentWithTag = paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2678,9 +2780,9 @@ def gui_zhou():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
num = num + 1
log.info(title)
count += 1
num = num + 1
except:
pass
except:
......@@ -2697,6 +2799,7 @@ def yun_nan():
http://gzw.yn.gov.cn/yngzw/c100040/zfxxgk_list.shtml 1
"""
num = 0
count = 0
start_time = time.time()
for page in range(1, 6):
if page == 1:
......@@ -2735,6 +2838,9 @@ def yun_nan():
contentwithTag = \
doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content')[0]
content = contentwithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentwithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2793,18 +2899,20 @@ def yun_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
log.info(title)
num = num + 1
count += 1
except:
pass
resp.close()
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def yun_nan2():
num = 0
count = 0
start_time = time.time()
for page in range(1, 4):
if page == 1:
......@@ -2828,7 +2936,7 @@ def yun_nan():
num+=1
continue
try:
print(href)
# print(href)
if '.shtml' in href:
res_ = requests.get(href, headers)
page_text_ = res_.text.encode("ISO-8859-1")
......@@ -2847,6 +2955,9 @@ def yun_nan():
pub_hao = ''
contentwithTag = page.find('div', attrs={'class': 'zfxxgk-right'})
content = contentwithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentwithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
......@@ -2857,7 +2968,7 @@ def yun_nan():
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
print(fu_jian_href)
# print(fu_jian_href)
try:
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
......@@ -2876,9 +2987,7 @@ def yun_nan():
elif 'display' in href:
continue
else:
content = ''
contentwithTag = ''
pub_hao = ''
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
......@@ -2907,16 +3016,16 @@ def yun_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
log.info(title)
count += 1
num = num + 1
except:
pass
res.close()
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
yun_nan1()
yun_nan2()
......@@ -2928,6 +3037,7 @@ def chong_qing():
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
"""
num = 0
count = 0
pathType = 'policy/chongqing/'
start_time = time.time()
for page in range(0, 4):
......@@ -2955,7 +3065,7 @@ def chong_qing():
num+=1
continue
try:
print(href)
# print(href)
# href = 'https://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/202007/t20200728_7729850.html'
href_text = requests.get(url=href, headers=headers, verify=False).content
doc_href = pq(href_text)
......@@ -2978,6 +3088,9 @@ def chong_qing():
pass
contentWithTag = doc_href.find('div', class_='zwxl-article')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
except:
origin = ''
topicClassification = ''
......@@ -2986,7 +3099,9 @@ def chong_qing():
pub_hao = ''
contentWithTag = doc_href.find('div', class_='zwxl-content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
# print(fu_jian_list)
for fu_jian in fu_jian_list:
......@@ -3039,21 +3154,22 @@ def chong_qing():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
num += 1
log.info(title)
count += 1
num += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 天津
def tian_jin():
pathType = 'policy/tianjin/'
def tian_jin1():
num = 0
count = 0
start_time = time.time()
for page in range(0, 3):
if page == 0:
......@@ -3139,7 +3255,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3166,19 +3284,21 @@ def tian_jin():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin2():
"""
http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
"""
num = 0
count =0
start_time = time.time()
for page in range(0, 5):
if page == 0:
......@@ -3263,7 +3383,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3290,16 +3412,18 @@ def tian_jin():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin3():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
if page == 1:
......@@ -3391,7 +3515,9 @@ def tian_jin():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3418,13 +3544,14 @@ def tian_jin():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
tian_jin1()
tian_jin2()
......@@ -3435,6 +3562,7 @@ def xin_jiang():
pathType = 'policy/xinjiang/'
def xin_jiang1():
num = 0
count = 0
start_time = time.time()
for page in range(1, 10):
if page == 1:
......@@ -3493,6 +3621,9 @@ def xin_jiang():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3527,16 +3658,18 @@ def xin_jiang():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def xin_jiang_jsbt():
num = 0
count = 0
start_time = time.time()
for page in range(1, 6):
if page == 1:
......@@ -3592,6 +3725,9 @@ def xin_jiang():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3626,7 +3762,8 @@ def xin_jiang():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
href_res.close()
except:
pass
......@@ -3634,7 +3771,7 @@ def xin_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
xin_jiang1()
xin_jiang_jsbt()
......@@ -3643,6 +3780,7 @@ def xin_jiang():
def shan_xi():
pathType = 'policy/shanxi/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 7):
if page == 1:
......@@ -3712,6 +3850,9 @@ def shan_xi():
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(晋国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3746,18 +3887,20 @@ def shan_xi():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 辽宁
def liao_ning():
pathType = 'policy/liaoning/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
......@@ -3823,6 +3966,9 @@ def liao_ning():
if len(contentWithTag) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -3857,7 +4003,8 @@ def liao_ning():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
......@@ -3869,6 +4016,7 @@ def liao_ning():
def hei_long_jiang():
pathType = 'policy/heilongjiang/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
......@@ -3926,6 +4074,9 @@ def hei_long_jiang():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -3952,7 +4103,8 @@ def hei_long_jiang():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
......@@ -3960,11 +4112,12 @@ def hei_long_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 江苏
def jiang_su():
num = 0
count = 0
pathType = 'policy/jiangsu/'
start_time = time.time()
pagestart = 1
......@@ -4034,6 +4187,9 @@ def jiang_su():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(苏国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
......@@ -4067,19 +4223,21 @@ def jiang_su():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 安徽
def an_hui():
pathType = 'policy/anhui/'
def an_hui1():
num = 0
count = 0
start_time = time.time()
for page in range(1, 4):
url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
......@@ -4137,6 +4295,9 @@ def an_hui():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -4163,16 +4324,18 @@ def an_hui():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def an_hui2():
num = 0
count = 0
start_time = time.time()
for page in range(1, 25):
url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
......@@ -4233,6 +4396,9 @@ def an_hui():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -4259,7 +4425,8 @@ def an_hui():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
href_res.close()
except:
pass
......@@ -4267,7 +4434,7 @@ def an_hui():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
an_hui1()
an_hui2()
......@@ -4280,6 +4447,7 @@ def jiang_xi():
121-164
"""
num = 0
count = 0
pathType = 'policy/jiangxi/'
start_time = time.time()
startrecord = 1
......@@ -4360,6 +4528,9 @@ def jiang_xi():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
......@@ -4394,17 +4565,19 @@ def jiang_xi():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 河南
def he_nan():
num = 0
count = 0
pathType = 'policy/henan/'
start_time = time.time()
for page in range(0, 7):
......@@ -4456,6 +4629,9 @@ def he_nan():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(豫国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -4488,17 +4664,19 @@ def he_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
href_res.close()
resp_text.close()
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖南
def hu_nan():
num = 0
count = 0
pathType = 'policy/hunan/'
start_time = time.time()
for page in range(1, 7):
......@@ -4565,6 +4743,9 @@ def hu_nan():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -4591,19 +4772,21 @@ def hu_nan():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 甘肃
def gan_su():
pathType = 'policy/gansu/'
def gan_su1():
num = 0
count = 0
start_time = time.time()
bro = getDriver()
urls = ['http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml',
......@@ -4686,6 +4869,9 @@ def gan_su():
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
# t = time.strptime(publishDate, "%Y年%m月%d日")
# publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -4714,7 +4900,8 @@ def gan_su():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except Exception as e:
print(e)
pass
......@@ -4724,6 +4911,7 @@ def gan_su():
def gan_su2():
num = 0
count = 0
start_time = time.time()
bro = getDriver()
url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml'
......@@ -4821,6 +5009,9 @@ def gan_su():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(content) < 2:
continue
# t = time.strptime(publishDate, "%Y年%m月%d日")
......@@ -4851,7 +5042,8 @@ def gan_su():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except Exception as e:
print(e)
except Exception as e:
......@@ -4859,10 +5051,11 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def gan_su3():
num = 0
count = 0
start_time = time.time()
# # service = Service(r'D:/chrome/103/chromedriver.exe')
# chrome_options = webdriver.ChromeOptions()
......@@ -4979,6 +5172,9 @@ def gan_su():
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(content) < 2:
continue
# t = time.strptime(publishDate, "%Y年%m月%d日")
......@@ -5009,14 +5205,15 @@ def gan_su():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except Exception as e:
print(e)
except:
pass
bro.quit()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
gan_su1()
gan_su2()
......@@ -5025,6 +5222,7 @@ def gan_su():
# 宁夏
def ning_xia():
num = 0
count = 0
pathType = 'policy/ningxia/'
start_time = time.time()
for page in range(0, 3):
......@@ -5082,6 +5280,9 @@ def ning_xia():
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
......@@ -5110,17 +5311,19 @@ def ning_xia():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 陕西
def shanxi():
num = 0
count = 0
pathType = 'policy/shan_xi/'
start_time = time.time()
url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
......@@ -5184,6 +5387,9 @@ def shanxi():
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -5210,7 +5416,8 @@ def shanxi():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
res_href.close()
except:
pass
......@@ -5218,7 +5425,7 @@ def shanxi():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 西藏
def xi_zang():
......@@ -5228,6 +5435,7 @@ def xi_zang():
'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
for url in url_list:
num = 0
count = 0
try:
res = requests.get(url=url, headers=headers)
res.encoding = res.apparent_encoding
......@@ -5256,6 +5464,9 @@ def xi_zang():
contentWithTag = str(i_soup.find(id='NewsContent'))
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
......@@ -5305,19 +5516,21 @@ def xi_zang():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 青海
def qing_hai():
pathType = 'policy/qinghai/'
def qing_hai1():
num = 0
count = 0
start_time = time.time()
url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604'
try:
......@@ -5353,6 +5566,9 @@ def qing_hai():
origin = str(page.find('div', attrs={'class': 'foot-fb'}))
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{durl}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
......@@ -5364,7 +5580,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1681')
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5404,16 +5620,18 @@ def qing_hai():
save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def qing_hai2():
num = 0
count = 0
start_time = time.time()
urls = [
'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627',
......@@ -5446,6 +5664,7 @@ def qing_hai():
durl = tr.find('a').get('href')
is_href = db_storage.find_one({'网址': durl})
if is_href:
num+=1
log.info('已采集----------跳过')
continue
title = tr.find('a').text
......@@ -5471,6 +5690,9 @@ def qing_hai():
origin = ''
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{durl}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
......@@ -5482,7 +5704,7 @@ def qing_hai():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1681')
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5490,7 +5712,7 @@ def qing_hai():
att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器
......@@ -5520,16 +5742,17 @@ def qing_hai():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
# print(id)
# id_list.append(id)
num += 1
count += 1
except:
pass
res.close()
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
qing_hai1()
qing_hai2()
......@@ -5537,6 +5760,8 @@ def qing_hai():
# 河北
def he_bei():
num = 0
count = 0
pathType = 'policy/hebei/'
start_time = time.time()
url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
try:
......@@ -5551,6 +5776,7 @@ def he_bei():
href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id)
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
continue
pub_time_ = info['updated']
m = round(pub_time_ / 1000) # 四舍五入取10位时间戳(秒级)
......@@ -5569,7 +5795,7 @@ def he_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1668')
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5577,13 +5803,16 @@ def he_bei():
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
......@@ -5618,15 +5847,18 @@ def he_bei():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖北
def hu_bei():
num = 0
count = 0
pathType = 'policy/hubei/'
start_time = time.time()
hrefs = []
url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
......@@ -5649,6 +5881,7 @@ def hu_bei():
for href in hrefs:
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
continue
try:
driver.get(href)
......@@ -5684,7 +5917,7 @@ def hu_bei():
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href, '1675')
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']:
pass
else:
......@@ -5692,14 +5925,16 @@ def hu_bei():
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path
file['href'] = full_path
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
......@@ -5726,49 +5961,50 @@ def hu_bei():
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
num += 1
count += 1
except Exception as e:
pass
driver.close()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__':
# get_content1()
# get_content2()
# get_content3()
# bei_jing()
# nei_meng_gu()
get_content1()
get_content2()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
# shan_dong()
# guang_dong()
# hai_nan()
# si_chuan()
# guang_xi()
# gui_zhou()
# yun_nan()
# chong_qing()
# tian_jin()
# xin_jiang()
# shan_xi()
# liao_ning()
# hei_long_jiang()
# jiang_su()
# an_hui()
# jiang_xi()
# he_nan()
# hu_nan()
# gan_su()
# ning_xia()
# xi_zang()
# shanxi()
# qing_hai()
# he_bei()
# qing_hai()
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
si_chuan()
guang_xi()
gui_zhou()
yun_nan()
chong_qing()
tian_jin()
xin_jiang()
shan_xi()
liao_ning()
hei_long_jiang()
jiang_su()
an_hui()
jiang_xi()
he_nan()
hu_nan()
gan_su()
ning_xia()
xi_zang()
shanxi()
qing_hai()
he_bei()
qing_hai()
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
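Un-commenting the midnight sleep only delays a single pass; presumably these calls sit inside an outer loop that lies outside this hunk. A minimal sketch of the intended daily schedule (run_all_crawlers is a hypothetical wrapper for the calls above):

import datetime
import time

while True:
    run_all_crawlers()   # hypothetical wrapper for get_content1() ... he_bei() above
    now = datetime.datetime.now()
    midnight = now.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
    time.sleep((midnight - now).total_seconds())   # resume at the next midnight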
......@@ -40,7 +40,8 @@ def save_data(dic_news):
'网址':dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate']
'创建时间':dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100]
}
db_storage.insert_one(aaa_dic)
......