Commit 4d6ca3e2  Author: LiuLiYuan

政策法规采集 (policy & regulation collection) 10/21

Parent aa593218
# _*_ coding:utf-8 _*_
"""数据全量跑一遍,不做判重逻辑"""
import datetime
import json
import os
import re
import time
import fitz
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from urllib.parse import urljoin
from BaseCore import BaseCore
baseCore = BaseCore()
urllib3.disable_warnings()
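# --- Hedged sketch (not part of the original commit): requests.adapters.HTTPAdapter is
# imported above, but no retry-mounted Session is visible in this hunk. A minimal example
# of how such a session could be built; the helper name build_session and the Retry
# settings are illustrative assumptions only.
from urllib3.util.retry import Retry

def build_session(retries=3):
    """Return a requests.Session that retries transient connection/5xx failures."""
    s = requests.Session()
    adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1,
                                            status_forcelist=(500, 502, 503, 504)))
    s.mount('http://', adapter)
    s.mount('https://', adapter)
    s.keep_alive = False  # mirrors the s.keep_alive = False usage later in this script
    return s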
......@@ -24,8 +22,8 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
from random import choice
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.adapters import HTTPAdapter
log = baseCore.getLogger()
taskType = '政策法规'
......@@ -36,11 +34,10 @@ taskType = '政策法规'
各地方国资委
"""
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
'国务院_国资委_copy1']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
driver_path = r'F:\spider\cmd100\chromedriver.exe'
chromr_bin = r'F:\spider\Google\Chrome\Application\chrome.exe'
driver_path = r'D:\cmd100\chromedriver.exe'
chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
......@@ -64,9 +61,10 @@ def paserUrl(html, listurl):
def getDriver():
service = Service(driver_path)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # 屏蔽chrome自动化受控提示
chrome_options.add_argument("--disable-blink-features=AutomationControlled") # 禁用启用Blink运行时的功能去掉webdriver痕迹
......@@ -77,6 +75,12 @@ def getDriver():
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)
# with open('stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return bro
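# Hedged usage sketch for getDriver(); the example URL is illustrative only and is not one
# of the sites this script actually crawls.
# bro = getDriver()
# try:
#     bro.get('https://example.com')   # page rendered with the anti-automation flags above
#     html = bro.page_source           # hand the HTML to BeautifulSoup / pyquery as elsewhere
# finally:
#     bro.quit()                       # always release the Chrome process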
def save_data(dic_news):
......@@ -203,109 +207,111 @@ def get_content1():
s.keep_alive = False
pcodeJiguan = a_list[0]
try:
pageCount = getPageConunt(a_list, url, headers, s)
for pageNo in range(1, pageCount + 1):
#pageCount = getPageConunt(a_list, url, headers, s)
#for pageNo in range(1, pageCount + 1):
pageNo = 1
try:
try:
page_list = getList(a_list, url, headers, pageNo, s)
except:
s.close()
page_list = getList(a_list, url, headers, pageNo, s)
for page in page_list:
id_list = []
# 获取所需信息
title = page['maintitle'] # 标题
pub_time1 = page['publish_time'] # 发布时间
pub_time2 = page['cwrq'] # 成文时间
pub_code = page['fwzh'] # 发文字号
href = page['pub_url'] # 网址
# 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
time.sleep(0.5)
continue
try:
page_list = getList(a_list, url, headers, pageNo, s)
except:
s.close()
page_list = getList(a_list, url, headers, pageNo, s)
for page in page_list:
id_list = []
# 获取所需信息
title = page['maintitle'] # 标题
pub_time1 = page['publish_time'] # 发布时间
pub_time2 = page['cwrq'] # 成文时间
pub_code = page['fwzh'] # 发文字号
href = page['pub_url'] # 网址
# 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
resp_href = requests.get(url=href, headers=headers_, verify=False)
resp_href.encoding = resp_href.apparent_encoding
i_html = resp_href.text
if '您访问的页面不存在或已删除' in i_html:
# log.error(f'{title}...{href}...页面不存在或已删除')
continue
try:
resp_href = requests.get(url=href, headers=headers_, verify=False)
resp_href.encoding = resp_href.apparent_encoding
i_html = resp_href.text
if '您访问的页面不存在或已删除' in i_html:
# log.error(f'{title}...{href}...页面不存在或已删除')
i_soup = BeautifulSoup(i_html, 'html.parser')
i_soup = paserUrl(i_soup, href)
source = str(i_soup.find_all('tbody')[0])
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[
0] # 发文机关
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0] # 主题分类
contentWithTag = i_soup.find('div',class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table',class_='border-table noneBorder pages_content')
# 去除扫一扫
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
content = contentWithTag.text # 不带标签正文
fu_jian_soup = contentWithTag.find_all('a')
time.sleep(0.5)
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
i_soup = BeautifulSoup(i_html, 'html.parser')
i_soup = paserUrl(i_soup, href)
source = str(i_soup.find_all('tbody')[0])
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[
0] # 发文机关
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0] # 主题分类
contentWithTag = i_soup.find('div',class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table',class_='border-table noneBorder pages_content')
# 去除扫一扫
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
content = contentWithTag.text # 不带标签正文
fu_jian_soup = contentWithTag.find_all('a')
time.sleep(0.5)
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1766',file_name)
if retData['state']:
pass
else:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1766',file_name)
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
#todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
continue
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
#todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
continue
except:
log.error(f'{pcodeJiguan}...获取总数失败')
continue
end_time = time.time()
log.info(f'共抓取国务院文件{num}条数据,共耗时{start_time - end_time}')
end_time = time.time()
log.info(f'共抓取国务院文件{num}条数据,共耗时{end_time-start_time}')
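# --- Hedged helper sketch (an addition, not part of the original commit): the attachment
# test above repeats a long "'.pdf' in file_href or '.doc' in ..." chain. The same idea as
# a case-insensitive suffix check; note that endswith() is slightly stricter than the
# substring test used in the original code.
ATTACHMENT_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt')

def is_attachment(href):
    """Return True when href looks like a downloadable attachment."""
    return href.lower().endswith(ATTACHMENT_EXTS)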
# 国务院部门文件
def get_content2():
......@@ -355,114 +361,117 @@ def get_content2():
'国家知识产权局', '国家档案局', '国家保密局', '国家密码管理局', '国家宗教事务局', '国务院台湾事务办公室', '国家乡村振兴局', '国家电影局']
for bmfl in result_list:
#try:
#totalpage = getTotalpage(bmfl,headers,session)
#for pageNo in range(1,totalpage+1):
#for pageNo in range(1,6):
pageNo = 1
try:
totalpage = getTotalpage(bmfl,headers,session)
for pageNo in range(1,totalpage+1):
try:
content_list = getContentList(bmfl,pageNo,headers,session)
except:
session.close()
content_list = getContentList(bmfl,pageNo,headers,session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # 详情页
title = content_dict['title'] # 标题
pub_code = content_dict['pcode'] # 发文字号
try:
try:
content_list = getContentList(bmfl,pageNo,headers,session)
except:
session.close()
content_list = getContentList(bmfl,pageNo,headers,session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # 详情页
title = content_dict['title'] # 标题
pub_code = content_dict['pcode'] # 发文字号
try:
pub_time = int(content_dict['pubtime'] / 1000) # 发布时间
pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
except:
pub_time1 = ''
try:
p_time = int(content_dict['ptime'] / 1000) # 成文时间
pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
except:
pub_time2 = ''
pub_org = content_dict['puborg'] # 发文机关
pub_time = int(content_dict['pubtime'] / 1000) # 发布时间
pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
except:
pub_time1 = None
try:
p_time = int(content_dict['ptime'] / 1000) # 成文时间
pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
except:
pub_time2 = None
pub_org = content_dict['puborg'] # 发文机关
try:
child_type = content_dict['childtype'] # 主题分类
except:
child_type = ''
# # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
time.sleep(0.5)
continue
try:
resp = requests.get(url=href, headers=headers, verify=False)
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
soup = paserUrl(soup,href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
child_type = content_dict['childtype'] # 主题分类
except:
child_type = ''
# # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
try:
resp = requests.get(url=href, headers=headers, verify=False)
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
soup = paserUrl(soup,href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1699',file_name)
if retData['state']:
pass
else:
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1699',file_name)
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
count += 1
num += 1
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
log.error(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
count += 1
num += 1
except:
log.error(f'{bmfl}...获取页数失败')
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
continue
#except:
# log.error(f'{bmfl}...获取页数失败')
# continue
end_time = time.time()
log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
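# --- Hedged helper sketch (not in the original commit): get_content2() converts the
# millisecond epoch fields pubtime/ptime with int(x / 1000) + time.strftime and falls back
# to None when the field is missing. The same conversion factored out for clarity:
def ms_epoch_to_str(ms):
    """Convert a millisecond epoch value to 'YYYY-mm-dd HH:MM:SS', or None when missing/invalid."""
    try:
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(ms) / 1000))
    except (TypeError, ValueError):
        return None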
......@@ -553,7 +562,7 @@ def get_content3():
'topicClassification': '', #政策文件分类
'issuedNumber': pub_hao, #发文字号
'publishDate': pub_time, #发布时间
'writtenDate': '', #成文时间
'writtenDate': None, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
......@@ -744,7 +753,7 @@ def bei_jing():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
if retData['state']:
......@@ -870,7 +879,7 @@ def nei_meng_gu():
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re
category = os.path.splitext(fu_jian_href)[1]
if category not in title:
if category not in title :
file_name = title + category
# print(fu_jian_href)
# todo:附件上传至文件服务器
......@@ -918,7 +927,7 @@ def nei_meng_gu():
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{num}条...........共耗时{end - start}秒')
# 吉林
def ji_lin():
......@@ -982,7 +991,7 @@ def ji_lin():
# print(pub_come)
i_content = soup.find(class_='zsy_comain')
if i_content:
print(real_href)
#print(real_href)
# 去掉扫一扫
try:
soup.find('div', id='qr_container').decompose()
......@@ -1020,7 +1029,7 @@ def ji_lin():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = fu_jian_href.text.strip()
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
......@@ -1065,7 +1074,7 @@ def ji_lin():
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# print(fj_href)
category = os.path.splitext(fj_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
if retData['state']:
......@@ -1104,7 +1113,7 @@ def ji_lin():
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
......@@ -1126,7 +1135,7 @@ def ji_lin():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 上海
def shang_hai():
......@@ -1219,7 +1228,7 @@ def shang_hai():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
if retData['state']:
......@@ -1252,7 +1261,7 @@ def shang_hai():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -1268,7 +1277,7 @@ def shang_hai():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 浙江
def zhe_jiang():
......@@ -1376,7 +1385,7 @@ def zhe_jiang():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -1393,7 +1402,7 @@ def zhe_jiang():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 福建
def fu_jian():
......@@ -1445,7 +1454,7 @@ def fu_jian():
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
print(real_href)
#print(real_href)
is_href = db_storage.find_one({'网址': real_href})
if is_href:
num+=1
......@@ -1460,7 +1469,7 @@ def fu_jian():
content = baseCore.pdf_content(resp_content)
contentwithtag = ''
category = os.path.splitext(real_href)[1]
if category not in title:
if category not in title :
file_name = title + category
# 文件上传至服务器
retData = baseCore.uptoOBS(real_href, '1673',pathType,file_name)
......@@ -1471,7 +1480,7 @@ def fu_jian():
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,'')
id_list.append(att_id)
pub_hao = ''
pub_time = ''
pub_time = None
pub_source = ''
else:
......@@ -1508,7 +1517,7 @@ def fu_jian():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
print(fj_href)
# 找到附件后 上传至文件服务器
......@@ -1524,7 +1533,7 @@ def fu_jian():
except:
pub_source = ''
pub_time = ''
pub_time = None
contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')  # class_= is required; find('tabs ...') would look for a tag named "tabs ..."
content = contentwithtag.text.strip()
if content == '' or content == None:
......@@ -1548,7 +1557,7 @@ def fu_jian():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
......@@ -1566,7 +1575,7 @@ def fu_jian():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 山东
def shan_dong():
......@@ -1633,7 +1642,7 @@ def shan_dong():
for h1 in h1_list:
title = title + str(h1.text)
title = title.strip()  # strip() returns a new string, so the result must be re-assigned
pub_time = ''
pub_time = None
span_list = source.find_all('span')
i = 0
for span in span_list:
......@@ -1683,7 +1692,7 @@ def shan_dong():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 广东
def guang_dong():
......@@ -1745,7 +1754,7 @@ def guang_dong():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
......@@ -1774,7 +1783,7 @@ def guang_dong():
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -1792,7 +1801,7 @@ def guang_dong():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 海南
def hai_nan():
......@@ -1869,7 +1878,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
......@@ -1916,7 +1925,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# 附件上传至文件服务器
......@@ -1995,7 +2004,7 @@ def hai_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def hai_nan2():
def hai_nan_sw(page_href):
......@@ -2126,7 +2135,7 @@ def hai_nan():
pub_source = ''
pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
pub_hao = ''
writtenDate = ''
writtenDate = None  # no trailing comma: ',' would turn this into a one-element tuple
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
......@@ -2143,7 +2152,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
......@@ -2241,7 +2250,7 @@ def hai_nan():
pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
pub_hao = ''
pub_source = ''
writtenDate = ''
writtenDate = None  # no trailing comma: ',' would turn this into a one-element tuple
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
......@@ -2259,7 +2268,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
......@@ -2360,7 +2369,7 @@ def hai_nan():
0].strip()
except:
pub_source = ''
pub_time = ''
pub_time = None
pub_hao = ''
contentWithTag = doc_href.find(class_='pages_content')
content = contentWithTag.text
......@@ -2383,7 +2392,7 @@ def hai_nan():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': i_href,
'summary': '',
......@@ -2479,7 +2488,7 @@ def hai_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
start()
hai_nan1()
......@@ -2538,7 +2547,7 @@ def si_chuan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 对附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
......@@ -2567,7 +2576,7 @@ def si_chuan():
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2585,7 +2594,7 @@ def si_chuan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 广西
def guang_xi():
......@@ -2671,7 +2680,7 @@ def guang_xi():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
......@@ -2701,7 +2710,7 @@ def guang_xi():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2718,7 +2727,7 @@ def guang_xi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 贵州
def gui_zhou():
......@@ -2788,7 +2797,7 @@ def gui_zhou():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
......@@ -2818,7 +2827,7 @@ def gui_zhou():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2836,7 +2845,7 @@ def gui_zhou():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# 云南
def yun_nan():
......@@ -2870,7 +2879,7 @@ def yun_nan():
continue
try:
fu_jian_href_list = []
print(href)
#print(href)
if '.shtml' in href:
href_resp = requests.get(url=href, headers=headers, verify=False)
href_resp.encoding = href_resp.apparent_encoding
......@@ -2901,7 +2910,7 @@ def yun_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
......@@ -2939,8 +2948,8 @@ def yun_nan():
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': '',
'writtenDate': '',
'publishDate': None,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2959,7 +2968,7 @@ def yun_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def yun_nan2():
num = 0
......@@ -3022,7 +3031,7 @@ def yun_nan():
# print(fu_jian_href)
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
......@@ -3060,7 +3069,7 @@ def yun_nan():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3079,7 +3088,7 @@ def yun_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
yun_nan1()
yun_nan2()
......@@ -3148,8 +3157,8 @@ def chong_qing():
except:
origin = ''
topicClassification = ''
pub_time = ''
writtenDate = ''
pub_time = None
writtenDate = None
pub_hao = ''
contentWithTag = doc_href.find('div', class_='zwxl-content')
content = contentWithTag.text
......@@ -3169,7 +3178,7 @@ def chong_qing():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
......@@ -3219,7 +3228,7 @@ def chong_qing():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 天津
def tian_jin():
......@@ -3282,7 +3291,7 @@ def tian_jin():
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = ''
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = paserUrl(str(contentWithTag), href)
......@@ -3298,7 +3307,7 @@ def tian_jin():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
......@@ -3351,7 +3360,7 @@ def tian_jin():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin2():
"""
......@@ -3413,7 +3422,7 @@ def tian_jin():
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = ''
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = paserUrl(str(contentWithTag), href)
......@@ -3429,7 +3438,7 @@ def tian_jin():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
......@@ -3482,7 +3491,7 @@ def tian_jin():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin3():
num = 0
......@@ -3507,7 +3516,7 @@ def tian_jin():
try:
publishDate = li.find('div', attrs={'class': 'other'}).text
except:
publishDate = ''
publishDate = None
if 'http' not in href:
if '../../../' in href:
href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
......@@ -3548,7 +3557,7 @@ def tian_jin():
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = ''
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = paserUrl(str(contentWithTag), href)
......@@ -3564,7 +3573,7 @@ def tian_jin():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
......@@ -3617,7 +3626,7 @@ def tian_jin():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
tian_jin1()
tian_jin2()
......@@ -3673,7 +3682,7 @@ def xin_jiang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']:
......@@ -3717,7 +3726,7 @@ def xin_jiang():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3734,7 +3743,7 @@ def xin_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def xin_jiang_jsbt():
num = 0
......@@ -3780,7 +3789,7 @@ def xin_jiang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']:
......@@ -3824,7 +3833,7 @@ def xin_jiang():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3843,7 +3852,7 @@ def xin_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
xin_jiang1()
xin_jiang_jsbt()
......@@ -3881,7 +3890,7 @@ def shan_xi():
try:
if ".pdf" in href:
content = ''
publishDate = ''
publishDate = None
origin = ''
fu_jian_soup = [href]
contentWithTag = ''
......@@ -3908,7 +3917,7 @@ def shan_xi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
if retData['state']:
......@@ -3952,7 +3961,7 @@ def shan_xi():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3969,7 +3978,7 @@ def shan_xi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 辽宁
def liao_ning():
......@@ -4028,7 +4037,7 @@ def liao_ning():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
if retData['state']:
......@@ -4071,7 +4080,7 @@ def liao_ning():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -4088,7 +4097,7 @@ def liao_ning():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# 黑龙江
def hei_long_jiang():
......@@ -4141,7 +4150,7 @@ def hei_long_jiang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']:
......@@ -4174,7 +4183,7 @@ def hei_long_jiang():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -4193,7 +4202,7 @@ def hei_long_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 江苏
def jiang_su():
......@@ -4257,7 +4266,7 @@ def jiang_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']:
......@@ -4314,7 +4323,7 @@ def jiang_su():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 安徽
def an_hui():
......@@ -4368,7 +4377,7 @@ def an_hui():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']:
......@@ -4418,7 +4427,7 @@ def an_hui():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def an_hui2():
num = 0
......@@ -4472,7 +4481,7 @@ def an_hui():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']:
......@@ -4524,7 +4533,7 @@ def an_hui():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
an_hui1()
an_hui2()
......@@ -4607,7 +4616,7 @@ def jiang_xi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
if retData['state']:
......@@ -4647,7 +4656,7 @@ def jiang_xi():
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': '',
'publishDate': None,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
......@@ -4665,7 +4674,7 @@ def jiang_xi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 河南
def he_nan():
......@@ -4711,7 +4720,7 @@ def he_nan():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
if retData['state']:
......@@ -4750,7 +4759,7 @@ def he_nan():
'topicClassification': '',
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -4767,7 +4776,7 @@ def he_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖南
def hu_nan():
......@@ -4828,7 +4837,7 @@ def hu_nan():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
if retData['state']:
......@@ -4878,7 +4887,7 @@ def hu_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 甘肃
def gan_su():
......@@ -4963,7 +4972,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
......@@ -5015,7 +5024,7 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
def gan_su2():
num = 0
......@@ -5097,7 +5106,7 @@ def gan_su():
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
contentWithTag = doc('div[id="content"]')
print(title)
#print(title)
soup = paserUrl(str(contentWithTag), href)
try:
......@@ -5119,7 +5128,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
log.info(f'{file_name}---{href}--')
retData = baseCore.uptoOBS(file_href, '1696',file_name)
......@@ -5176,7 +5185,7 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def gan_su3():
num = 0
......@@ -5260,13 +5269,13 @@ def gan_su():
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
contentWithTag = doc('div[id="content"]')
print(title)
#print(title)
if len(title) == 0 or contentWithTag.text() == '':
title = doc('div[class="main"]>h1').text().lstrip().strip()
writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
contentWithTag = doc('div[class="detailContent"]')
print(title)
#print(title)
soup = paserUrl(str(contentWithTag), href)
try:
......@@ -5288,7 +5297,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
......@@ -5304,7 +5313,7 @@ def gan_su():
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
print(bro.page_source)
#print(bro.page_source)
continue
if len(content) < 2:
continue
......@@ -5345,7 +5354,7 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
gan_su1()
gan_su2()
......@@ -5401,7 +5410,7 @@ def ning_xia():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
if retData['state']:
......@@ -5453,7 +5462,7 @@ def ning_xia():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 陕西
def shanxi():
......@@ -5511,7 +5520,7 @@ def shanxi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
if retData['state']:
......@@ -5544,7 +5553,7 @@ def shanxi():
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -5563,7 +5572,7 @@ def shanxi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 西藏
def xi_zang():
......@@ -5617,7 +5626,7 @@ def xi_zang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
if retData['state']:
......@@ -5647,7 +5656,7 @@ def xi_zang():
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -5664,7 +5673,7 @@ def xi_zang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 青海
def qing_hai():
......@@ -5722,7 +5731,7 @@ def qing_hai():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
......@@ -5771,7 +5780,7 @@ def qing_hai():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def qing_hai2():
num = 0
......@@ -5849,7 +5858,7 @@ def qing_hai():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
......@@ -5899,7 +5908,7 @@ def qing_hai():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
qing_hai1()
qing_hai2()
......@@ -5943,7 +5952,7 @@ def he_bei():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']:
......@@ -5987,7 +5996,7 @@ def he_bei():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -6002,7 +6011,7 @@ def he_bei():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖北
def hu_bei():
......@@ -6068,7 +6077,7 @@ def hu_bei():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']:
......@@ -6120,44 +6129,45 @@ def hu_bei():
pass
driver.close()
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__':
# get_content1()
# get_content2()
# get_content3()
# bei_jing()
# nei_meng_gu()
# ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
# shan_dong()
# guang_dong()
# hai_nan()
# si_chuan()
# guang_xi()
# gui_zhou()
# yun_nan()
# chong_qing()
# tian_jin()
# xin_jiang()
# shan_xi()
# liao_ning()
# hei_long_jiang()
# jiang_su()
# an_hui()
# jiang_xi()
# he_nan()
# hu_nan()
get_content1()
get_content2()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
si_chuan()
guang_xi()
gui_zhou()
yun_nan()
chong_qing()
tian_jin()
xin_jiang()
shan_xi()
liao_ning()
hei_long_jiang()
jiang_su()
an_hui()
jiang_xi()
he_nan()
hu_nan()
gan_su()
# ning_xia()
# xi_zang()
# shanxi()
# qing_hai()
# he_bei()
# qing_hai()
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
ning_xia()
xi_zang()
shanxi()
qing_hai()
he_bei()
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
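# Hedged sketch of a daily re-run loop around the calls above; run_all() is a hypothetical
# wrapper for the province functions, not something defined in this commit.
# while True:
#     run_all()
#     now = datetime.datetime.now()
#     midnight = now.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
#     time.sleep((midnight - now).total_seconds())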
import datetime
import json
import random
import time
from urllib.parse import urljoin
import pymongo
from kafka import KafkaProducer
from tqdm import tqdm
......@@ -12,15 +11,31 @@ import pymysql
import requests
from bs4 import BeautifulSoup
import urllib3
from base.BaseCore import BaseCore
from lxml import etree
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx
cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
'国务院_国资委_copy1']
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
'Host': 'www.sasac.gov.cn',
'Pragma': 'no-cache',
'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
def paserUrl(html,listurl):
def paserUrl(html, listurl):
# soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img'])
......@@ -36,18 +51,19 @@ def paserUrl(html,listurl):
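# --- Hedged sketch of the elided paserUrl body (hidden behind the fold above): resolve
# every relative <a href> / <img src> against listurl with urljoin. This reconstruction is
# an assumption for readability, not the verbatim original.
def paserUrl_sketch(html, listurl):
    for tag in html.find_all(['a', 'img']):
        for attr in ('href', 'src'):
            if tag.get(attr):
                tag[attr] = urljoin(listurl, tag[attr])
    return html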
def save_data(dic_news):
aaa_dic = {
'附件id':dic_news['attachmentIds'],
'网址':dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate'],
'附件id': dic_news['attachmentIds'],
'网址': dic_news['sourceAddress'],
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100]
}
db_storage.insert_one(aaa_dic)
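# save_data() keeps a trimmed audit record (attachment ids, URL, label id/name, create time,
# first 100 chars of the tagged content) in the 国务院_国资委_copy1 collection; this is the
# collection the "已采集----------跳过" dedup lookups query by 网址.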
def sendKafka(dic_news):
start_time = time.time()
try:#114.116.116.241
try: # 114.116.116.241
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("policy",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
......@@ -78,215 +94,233 @@ def sendKafka(dic_news):
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
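# work(href_type, ting_type, relationId): crawls one www.sasac.gov.cn column page, turns
# relative links into absolute ones, skips URLs already stored in MongoDB, strips the
# QR-code widget and <style> tags from the article body, and sends each article to the
# "policy" Kafka topic labelled with the given relationId / ting_type.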
def work(href_type,ting_type,relationId):
ip = baseCore.get_proxy()
log.info(f'\n================厅局类别==={ting_type}========================')
if 'http' in href_type:
url_type = href_type
else:
url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../', '')
# print(url_type)
i_res = requests.get(url=url_type, headers=headers, proxies=ip)
i_soup = BeautifulSoup(i_res.content, 'html.parser')
time.sleep(2)
news_list = i_soup.find('div', class_='tjywBottom').find_all('li')
# 文章列表
# print('================新闻列表==================')
for news in tqdm(news_list):
try:
news_href = news.find('a')['href']
except:
continue
if 'http' in news_href:
news_url = news_href
else:
news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../', '')
# 判断是否已经爬取过
is_href = db_storage.find_one({'网址': news_url})
if is_href:
log.info('已采集----------跳过')
continue
news_title = news.find('a').text.split('[')[0]
log.info(f'\n----正在采集: {news_title}-------')
pub_time = news.find('span').text.replace('[', '').replace(']', '')
# 文章信息
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617',
'Host': 'www.sasac.gov.cn',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Referer': 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
ii_res = requests.get(url=news_url, headers=header, proxies=ip)
ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
# todo:相对路径转化为绝对路径
ii_soup = paserUrl(ii_soup, news_url)
# 去掉扫一扫
try:
ii_soup.find('div', id='qr_container').decompose()
except:
pass
# 去掉style标签
for styleTag in ii_soup.find_all('style'):
styleTag.extract()
time.sleep(2)
try:
news_info = ii_soup.find('div', class_='zsy_cotitle')
except Exception as e:
log.error(e)
news_info = ''
if news_info:
try:
# origin
pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0].strip()
except:
pub_source = ''
try:
contentWithTag = ii_soup.find('div', 'zsy_comain')
content = contentWithTag.text.strip()
except:
content = ''
contentWithTag = ''
if len(content) > 100:
pass
else:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'sid': '1697458829758697473',
'sourceAddress': news_url,
'summary': '',
'title': news_title
}
sendKafka(dic_news)
save_data(dic_news)
log.info(f'{ting_type}-----{news_title}----发送成功')
else:
dic_error = {
'标题': news_title,
'原文链接': news_url,
'厅局类别': ting_type
}
log.error(dic_error)
# 国资委_内设机构
def gzw_nsjg():
# 获取页面数据
def get_page_nsjg(href, ting_type, relationId, page):
start_time = time.time()
num = 0
for pageNo in range(1, page + 1):
if pageNo != 1:
href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
if pageNo == page:
tag = href.split('/')[-1]
href = href.replace(tag, 'index.html')
try:
req = requests.get(url=href, headers=headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
soup = paserUrl(soup, href)
li_list = soup.find('ul', attrs={'class': 'ld-tjywList'}).find_all('li')
except:
req = requests.get(url=href, headers=headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
soup = paserUrl(soup, href)
li_list = soup.find_all('li')
for li in li_list:
try:
real_href = li.find('a').get('href')
except:
continue
is_href = db_storage.find_one({'网址': real_href})
if is_href:
log.info('已采集----------跳过')
continue
try:
try:
try:
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
div_content = soup_.find('div', attrs={'class': 'zsy_content'})
pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
except:
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
pub_result = soup_.find('div', attrs={'class': 'zsy_cotitle'})
real_href = str(pub_result.text).split('location.href="')[1].split('";')[0].lstrip().strip()
req_.close()
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
div_content = soup_.find('div', attrs={'class': 'zsy_content'})
pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
req_.close()
except:
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
yaoqiu_list = soup_.find('div', attrs={'class': 'yaoqiu_list'})
li_list_ = yaoqiu_list.find_all('li')
for li_ in li_list_:
href_ = li_.find('a').get('href')
real_href = href_.replace('../../../', 'http://www.sasac.gov.cn/')
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
div_content = soup_.find('div', attrs={'class': 'zsy_content'})
pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = ''
if 'location.href' in title:
continue
if '404 Ba' in str(div_content):
continue
contentWithTag = div_content.find('div',class_='zsy_comain')
try:
contentWithTag.find('div', id='qr_container').decompose()
except:
pass
# 去掉style标签
for styleTag in contentWithTag.find_all('style'):
styleTag.extract()
content = contentWithTag.text
if content == '':
log.error(f'{real_href}===获取正文失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
#print(content)
#print(contentWithTag)
sendKafka(dic_news)
save_data(dic_news)
log.info(f'{ting_type}-----{title}----发送成功', )
num += 1
except Exception as e:
log.error(f'{real_href}==={e}')
req.close()
end_time = time.time()
log.info(f'抓取{num}条数据,共耗时{end_time - start_time}')
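# --- A minimal sketch (assumption, not in the original commit) ---
# The blocks above recover title / 文章来源 / 发布时间 by repeatedly splitting the
# zsy_cotitle text. A single regex expresses the same parsing once; parse_cotitle is a
# hypothetical helper and assumes the fullwidth colon used on the site.
import re

def parse_cotitle(text):
    text = text.replace('\n', '').replace('\r', '').strip()
    m = re.search(r'^(?P<title>.*?)(?:文章来源:(?P<source>.*?))?发布时间:(?P<date>.*)$', text)
    if not m:
        return text, '', ''
    return m.group('title').strip(), (m.group('source') or '').strip(), m.group('date').strip()

# usage sketch: title, pub_source, publishDate = parse_cotitle(pub_result.text)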
# 获取页面列表
def get_page_nsjg_list(href, institution, tid):
href_list = {
'办公厅(党委办公厅)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/index_2642999_1.html', 9],
'综合研究局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591482/n2591484/index_2656923_1.html', 5],
'政策法规局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590860/n2590862/index_2644230_1.html', 21],
'规划发展局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590902/n2590904/index_2646556_1.html', 9],
'财务监管与运行评价局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590944/n2590946/index_2647546_1.html', 9],
'产权管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591020/n2591022/index_2648251_1.html', 7],
'企业改革局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591064/n2591066/index_2648748_1.html', 15],
'考核分配局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591106/n2591108/index_2649149_1.html', 6],
'资本运营与收益管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591192/n2591194/index_2649585_1.html', 3],
'科技创新局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591148/n2591150/index_2650085_1.html', 14],
'社会责任局': ['http://www.sasac.gov.cn/n2588020/n2588072/n23746822/n23746853/index_23747054_.html', 10],
'综合监督局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591284/n2591286/index.html', 1],
'监督追责局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591266/n2591268/index_2654822_1.html', 2],
'企业领导人员管理一局(董事会工作局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591302/n2591304/index_2657539_1.html', 4],
'企业领导人员管理二局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591344/n2591346/index_2657636_1.html', 4],
'党建工作局(党委组织部、党委统战部)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591386/n2591388/index_2656630_1.html', 14],
'宣传工作局(党委宣传部)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591426/n2591428/index_2656835_1.html',
21],
'国际合作局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591548/n2591550/index_2657011_1.html', 28],
'人事局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591586/n2591588/index_2656275_1.html', 7],
'行业协会商会党建工作局(行业协会商会工作局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591626/n2591628/index_2656076_1.html', 4],
'机关服务管理局(离退休干部管理局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591644/n2591646/index_2655780_1.html', 9],
'机关党委': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591684/n2591686/index_2655222_1.html', 33],
'党委巡视工作办公室、国资委巡视组': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591770/n2591772/index_2655029_1.html', 8],
'中央纪委国家监委驻国资委纪检监察组': ['http://www.sasac.gov.cn/n2588020/n2877928/n2878219/index_2879099_1.html', 18]}
href_ = href_list[institution][0]
page = href_list[institution][1]
get_page_nsjg(href_, institution, tid, page)
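# --- A minimal sketch (assumption) ---
# How the per-department list pages are enumerated: the start URL in href_list ends in
# '_1.html', later pages swap the trailing page number, and the last page on the site is the
# bare 'index.html'. build_page_urls is a hypothetical helper mirroring the rewriting done
# inside get_page_nsjg above.
def build_page_urls(first_page_url, page_count):
    urls = []
    href = first_page_url
    for pageNo in range(1, page_count + 1):
        if pageNo != 1:
            href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
        if pageNo == page_count:
            href = href.replace(href.split('/')[-1], 'index.html')
        urls.append(href)
    return urls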
#中央纪委国家监委驻国资委纪检监察组
def job1(a_type):
href = a_type['href']
ting_type = a_type.text
return href,ting_type
# 开始
def gzw_nsjg_start():
url = 'http://www.sasac.gov.cn/n2588020/index.html'
req = requests.get(url=url, headers=headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
all_institution = []
tree = etree.HTML(req_text)
institution = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/text()')[0].replace('\n', '').replace('\r',
'')
institution_href = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/@href')[0].replace('../',
'http://www.sasac.gov.cn/')
all_institution.append([institution, institution_href])
dd_list = tree.xpath('/html/body/div[4]/div[2]/div/dl[2]/dd')
for dd in dd_list:
institution = dd.xpath('./a/text()')[0].replace('\n', '').replace('\r', '')
institution_href = dd.xpath('./a/@href')[0].replace('../', 'http://www.sasac.gov.cn/')
all_institution.append([institution, institution_href])
tids = {'办公厅(党委办公厅)': 1643, '综合研究局': 1644, '政策法规局': 1645, '规划发展局': 1646, '财务监管与运行评价局': 1647, '产权管理局': 1648,
'企业改革局': 1649, '考核分配局': 1650, '资本运营与收益管理局': 1651, '科技创新局': 1652, '社会责任局': 2064, '综合监督局': 1653,
'监督追责局': 1654,
'企业领导人员管理一局(董事会工作局)': 1655, '企业领导人员管理二局': 1656, '党建工作局(党委组织部、党委统战部)': 1657, '宣传工作局(党委宣传部)': 1658,
'国际合作局': 1659, '人事局': 1660, '行业协会商会党建工作局(行业协会商会工作局)': 1661, '机关服务管理局(离退休干部管理局)': 1662, '机关党委': 1663,
'党委巡视工作办公室、国资委巡视组': 1664, '中央纪委国家监委驻国资委纪检监察组': 1874}
for a in all_institution:
institution = a[0]
href = a[1]
tid = tids[institution]
log.info(f'\n================厅局类别==={institution}========================')
get_page_nsjg_list(href, institution, tid)
gzw_nsjg_start()
def job():
url = 'http://www.sasac.gov.cn/n2588020/index.html'
ip = baseCore.get_proxy()
res = requests.get(url=url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
time.sleep(2)
# 厅局列表
list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')[:22]
a_soup = soup.find('div', class_='l-jgkk-right column').find_all('dt')[0]
a_type = a_soup.text.strip()
a_href = a_soup.find('a')['href']
a_id = '1874'
list_error = []
num = 0
start_time = time.time()
work(a_href, a_type, a_id)
for type in tqdm(list_type):
list_news = []
href_type = type.find('a')['href']
ting_type = type.find('a').text
try:
relationId = mapId_dic[ting_type]
except:
continue
work(href_type, ting_type, relationId)
num += 1
end_time = time.time()
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
time.sleep(1)
# writer.save()
# df_error = pd.DataFrame(list_error)
# df_error.to_excel('未采到文章.xlsx', index=False)
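# --- A minimal sketch (assumption) ---
# Several fetches above run req.text.encode("ISO-8859-1").decode("utf-8") to undo requests'
# wrong charset guess for www.sasac.gov.cn. Decoding the raw bytes directly, with
# apparent_encoding as a fallback, avoids that round trip; decode_html is a hypothetical helper.
def decode_html(resp):
    try:
        return resp.content.decode('utf-8')
    except UnicodeDecodeError:
        return resp.content.decode(resp.apparent_encoding or 'utf-8', errors='ignore')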
mapId_dic = {
'办公厅(党委办公厅)':'1643',
'综合研究局':'1644',
'政策法规局':'1645',
'规划发展局':'1646',
'财务监管与运行评价局':'1647',
'产权管理局':'1648',
'企业改革局':'1649',
'考核分配局':'1650',
'资本运营与收益管理局':'1651',
'科技创新局':'1652',
'综合监督局':'1653',
'监督追责局':'1654',
'企业领导人员管理一局(董事会工作局)':'1655',
'企业领导人员管理二局':'1656',
'党建工作局(党委组织部、党委统战部)':'1657',
'宣传工作局(党委宣传部)':'1658',
'国际合作局':'1659',
'人事局':'1660',
'机关服务管理局(离退休干部管理局)':'1662',
'机关党委':'1663',
'党委巡视工作办公室、国资委巡视组':'1664',
}
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
'Host':'www.sasac.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
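# --- A minimal sketch (assumption, not part of the original commit) ---
# The crawlers above call requests.get once, with a proxy from baseCore.get_proxy() and no
# retry. A small wrapper that retries with a fresh proxy keeps the per-page logic simpler.
# fetch_with_retry is a hypothetical helper; the proxy dict format ({'http': ..., 'https': ...})
# is assumed to be what baseCore.get_proxy() returns.
def fetch_with_retry(url, retries=3, timeout=30):
    last_err = None
    for _ in range(retries):
        try:
            ip = baseCore.get_proxy()
            resp = requests.get(url=url, headers=headers, proxies=ip, timeout=timeout, verify=False)
            resp.raise_for_status()
            return resp
        except Exception as e:
            last_err = e
            time.sleep(2)
    log.error(f'{url}===fetch failed after {retries} tries: {last_err}')
    return None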
if __name__ == '__main__':
try:
job()
gzw_nsjg()
except Exception as e:
log.error(e)
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
# 创建一个ExcelWriter对象
# writer = pd.ExcelWriter('国务院厅局.xlsx')
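# --- A minimal sketch (assumption) ---
# The commented-out lines above hint at running the crawl once per day, sleeping until the
# next midnight. A loop like this reproduces that behaviour; run_daily is a hypothetical
# name and is not called anywhere in this file.
def run_daily():
    while True:
        try:
            job()
            gzw_nsjg()
        except Exception as e:
            log.error(e)
        now = datetime.datetime.now()
        midnight = now.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
        time.sleep((midnight - now).total_seconds())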