Commit 4d6ca3e2  Author: LiuLiYuan

Policy and regulation collection, 10/21

Parent aa593218
# _*_ coding:utf-8 _*_
"""Run the full data set once, with no de-duplication logic."""
import json
import os
import re
import time
import datetime
import fitz
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from urllib.parse import urljoin
from BaseCore import BaseCore
baseCore = BaseCore()
urllib3.disable_warnings()
@@ -24,8 +22,8 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
from random import choice
from requests.adapters import HTTPAdapter
log = baseCore.getLogger()
taskType = '政策法规'
@@ -36,11 +34,10 @@ taskType = '政策法规'
各地方国资委
"""
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
driver_path = r'D:\cmd100\chromedriver.exe'
chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
@@ -64,9 +61,10 @@ def paserUrl(html, listurl):
def getDriver():
    service = Service(driver_path)
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('log-level=3')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "Chrome is being controlled by automated test software" banner
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # disable the Blink feature that leaves webdriver fingerprints
@@ -77,6 +75,12 @@ def getDriver():
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
    # bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
    bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)
    # with open('stealth.min.js') as f:
    #     js = f.read()
    #
    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    #     "source": js
    # })
    return bro
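# NOTE: paserUrl() is defined above this hunk and is not part of the diff shown here; it is
# used throughout to turn relative links into absolute ones. A minimal, hypothetical sketch
# of such a helper (the real implementation in this module may differ):
def paserUrl_sketch(html, listurl):
    # accept either a BeautifulSoup object or an HTML string
    soup = html if isinstance(html, BeautifulSoup) else BeautifulSoup(html, 'html.parser')
    # rewrite relative href/src attributes against the page URL
    for tag in soup.find_all(True):
        if tag.get('href'):
            tag['href'] = urljoin(listurl, tag['href'])
        if tag.get('src'):
            tag['src'] = urljoin(listurl, tag['src'])
    return soup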
def save_data(dic_news):
@@ -203,109 +207,111 @@ def get_content1():
        s.keep_alive = False
        pcodeJiguan = a_list[0]
        try:
            # pageCount = getPageConunt(a_list, url, headers, s)
            # for pageNo in range(1, pageCount + 1):
            pageNo = 1
            try:
                try:
                    page_list = getList(a_list, url, headers, pageNo, s)
                except:
                    s.close()
                    page_list = getList(a_list, url, headers, pageNo, s)
                for page in page_list:
                    id_list = []
                    # fields needed from the list entry
                    title = page['maintitle']  # title
                    pub_time1 = page['publish_time']  # publish date
                    pub_time2 = page['cwrq']  # date written
                    pub_code = page['fwzh']  # document number
                    href = page['pub_url']  # URL
                    # skip pages that have already been crawled
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
                        num += 1
                        log.info('已采集----------跳过')
                        time.sleep(0.5)
                        continue
                    try:
                        resp_href = requests.get(url=href, headers=headers_, verify=False)
                        resp_href.encoding = resp_href.apparent_encoding
                        i_html = resp_href.text
                        if '您访问的页面不存在或已删除' in i_html:
                            # log.error(f'{title}...{href}...页面不存在或已删除')
                            continue
                        i_soup = BeautifulSoup(i_html, 'html.parser')
                        i_soup = paserUrl(i_soup, href)
                        source = str(i_soup.find_all('tbody')[0])
                        pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]  # issuing organ
                        child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]  # topic classification
                        contentWithTag = i_soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table', class_='border-table noneBorder pages_content')
                        # remove the QR-code ("scan me") widget
                        contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
                        content = contentWithTag.text  # body text without tags
                        fu_jian_soup = contentWithTag.find_all('a')
                        time.sleep(0.5)
                        for file in fu_jian_soup:
                            try:
                                file_href = file['href']
                            except Exception as e:
                                log.info(f'---{href}--------{e}-------')
                                continue
                            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
                                category = os.path.splitext(file_href)[1]
                                if category not in file_name:
                                    file_name = file_name + category
                                retData = baseCore.uptoOBS(file_href, '1766', file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num, pub_time1)
                                id_list.append(att_id)
                                # todo: write the returned path back into the soup
                                file['href'] = full_path
                    except:
                        log.error(f'{title}...{href}...获取内容失败')
                        continue
                    # todo: once the links are replaced, the attachments have been uploaded to the file server
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo: fields sent to Kafka
                    dic_news = {
                        'attachmentIds': id_list,  # attachment ids
                        'author': '',  # author
                        'content': content,  # body text without tags
                        'contentWithTag': str(contentWithTag),  # body text with tags
                        'createDate': time_now,  # creation time
                        'deleteFlag': 0,  # delete flag (0 = default, 1 = deleted)
                        'id': '',
                        'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],  # related label id, name and mark
                        'origin': '',  # publishing organ
                        'organ': pub_org,  # issuing organ
                        'topicClassification': child_type,  # document classification
                        'issuedNumber': pub_code,  # document number
                        'publishDate': pub_time1,  # publish date
                        'writtenDate': pub_time2,  # date written
                        'sid': '1697458829758697473',  # source id
                        'sourceAddress': href,  # original link
                        'summary': '',  # summary
                        'title': title  # title
                    }
                    # print(dic_news)
                    flag = sendKafka(dic_news)
                    if flag:
                        save_data(dic_news)
                    num += 1
            except:
                log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
                continue
        except:
            log.error(f'{pcodeJiguan}...获取总数失败')
            continue
    end_time = time.time()
    log.info(f'共抓取国务院文件{num}条数据,共耗时{end_time - start_time}')
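# NOTE: the retry pattern above (close the session, then call getList() again) could also
# lean on the HTTPAdapter import this commit adds. A minimal, hypothetical sketch, not part
# of this commit:
def build_session_sketch(total_retries=3):
    s = requests.session()
    s.keep_alive = False
    adapter = HTTPAdapter(max_retries=total_retries)  # retry failed connections automatically
    s.mount('http://', adapter)
    s.mount('https://', adapter)
    return s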
# 国务院部门文件 (documents issued by State Council departments)
def get_content2():
@@ -355,114 +361,117 @@ def get_content2():
                   '国家知识产权局', '国家档案局', '国家保密局', '国家密码管理局', '国家宗教事务局', '国务院台湾事务办公室', '国家乡村振兴局', '国家电影局']
    for bmfl in result_list:
        # try:
        #     totalpage = getTotalpage(bmfl, headers, session)
        #     for pageNo in range(1, totalpage+1):
        #     for pageNo in range(1, 6):
        pageNo = 1
        try:
            try:
                content_list = getContentList(bmfl, pageNo, headers, session)
            except:
                session.close()
                content_list = getContentList(bmfl, pageNo, headers, session)
            for content_dict in content_list:
                id_list = []
                href = content_dict['url']  # detail page
                title = content_dict['title']  # title
                pub_code = content_dict['pcode']  # document number
                try:
                    pub_time = int(content_dict['pubtime'] / 1000)  # publish date
                    pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
                except:
                    pub_time1 = None
                try:
                    p_time = int(content_dict['ptime'] / 1000)  # date written
                    pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
                except:
                    pub_time2 = None
                pub_org = content_dict['puborg']  # issuing organ
                try:
                    child_type = content_dict['childtype']  # topic classification
                except:
                    child_type = ''
                # skip pages that have already been crawled
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    num += 1
                    log.info('已采集----------跳过')
                    time.sleep(0.5)
                    continue
                try:
                    resp = requests.get(url=href, headers=headers, verify=False)
                    resp.encoding = resp.apparent_encoding
                    resp_text = resp.text
                    soup = BeautifulSoup(resp_text, 'html.parser')
                    soup = paserUrl(soup, href)
                    time.sleep(0.5)
                    contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                    content = contentWithTag.text
                    if content == '' or content == 'None':
                        log.info(f'----{href}---{title}---内容为空---')
                        continue
                    fu_jian_soup = contentWithTag.find_all('a')
                    for file in fu_jian_soup:
                        try:
                            file_href = file['href']
                        except Exception as e:
                            log.info(f'---{href}--------{e}-------')
                            continue
                        if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                            file_name = file.text.strip()
                            category = os.path.splitext(file_href)[1]
                            if category not in file_name:
                                file_name = file_name + category
                            retData = baseCore.uptoOBS(file_href, '1699', file_name)
                            if retData['state']:
                                pass
                            else:
                                continue
                            att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num, pub_time1)
                            id_list.append(att_id)
                            # todo: write the returned path back into the soup
                            file['href'] = full_path
                except:
                    log.error(f'{title}...{href}获取内容失败')
                    continue
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # todo: fields sent to Kafka
                dic_news = {
                    'attachmentIds': id_list,  # attachment ids
                    'author': '',  # author
                    'content': content,  # body text without tags
                    'contentWithTag': str(contentWithTag),  # body text with tags
                    'createDate': time_now,  # creation time
                    'deleteFlag': 0,  # delete flag (0 = default, 1 = deleted)
                    'id': '',
                    'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],  # related label id, name and mark
                    'origin': '',  # publishing organ
                    'organ': pub_org,  # issuing organ
                    'topicClassification': child_type,  # document classification
                    'issuedNumber': pub_code,  # document number
                    'publishDate': pub_time1,  # publish date
                    'writtenDate': pub_time2,  # date written
                    'sid': '1697458829758697473',  # source id
                    'sourceAddress': href,  # original link
                    'summary': '',  # summary
                    'title': title  # title
                }
                # print(dic_news)
                flag = sendKafka(dic_news)
                if flag:
                    save_data(dic_news)
                    count += 1
                num += 1
        except:
            log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
            continue
        # except:
        #     log.error(f'{bmfl}...获取页数失败')
        #     continue
    end_time = time.time()
    log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
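# NOTE: the long "'.pdf' in file_href or '.docx' in file_href or ..." chains repeated by every
# collector in this file could be factored into one helper. A hypothetical sketch, not part of
# this commit; the suffix list mirrors the extensions checked above:
ATTACHMENT_SUFFIXES = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt')

def looks_like_attachment_sketch(href):
    # case-insensitive match on the link's file extension
    return os.path.splitext(href)[1].lower() in ATTACHMENT_SUFFIXES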
@@ -553,7 +562,7 @@ def get_content3():
                    'topicClassification': '',  # document classification
                    'issuedNumber': pub_hao,  # document number
                    'publishDate': pub_time,  # publish date
                    'writtenDate': None,  # date written
                    'sid': '1697458829758697473',  # source id
                    'sourceAddress': href,  # original link
                    'summary': '',  # summary
@@ -744,7 +753,7 @@ def bei_jing():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1667', pathType, file_name)
                        if retData['state']:
@@ -870,7 +879,7 @@ def nei_meng_gu():
                        fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
                        fu_jian_href = fu_jian_re
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in title:
                            file_name = title + category
                        # print(fu_jian_href)
                        # todo: upload the attachment to the file server
@@ -918,7 +927,7 @@ def nei_meng_gu():
        pass
    end = time.time()
    log.info(f'共{num}条...........共耗时{end - start}秒')
# 吉林 (Jilin)
def ji_lin():
@@ -982,7 +991,7 @@ def ji_lin():
                # print(pub_come)
                i_content = soup.find(class_='zsy_comain')
                if i_content:
                    # print(real_href)
                    # remove the QR-code widget
                    try:
                        soup.find('div', id='qr_container').decompose()
@@ -1020,7 +1029,7 @@ def ji_lin():
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            file_name = fu_jian_href.text.strip()
                            category = os.path.splitext(fu_jian_href)[1]
                            if category not in file_name:
                                file_name = file_name + category
                            # print(fu_jian_href)
                            retData = baseCore.uptoOBS(fu_jian_href, '1670', pathType, file_name)
@@ -1065,7 +1074,7 @@ def ji_lin():
                                or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                            # print(fj_href)
                            category = os.path.splitext(fj_href)[1]
                            if category not in file_name:
                                file_name = file_name + category
                            retData = baseCore.uptoOBS(fj_href, '1670', pathType, file_name)
                            if retData['state']:
@@ -1104,7 +1113,7 @@ def ji_lin():
                        'topicClassification': '',
                        'issuedNumber': '',
                        'publishDate': pub_time,
                        'writtenDate': None,
                        'sid': '1697458829758697473',
                        'sourceAddress': real_href,
                        'summary': '',
@@ -1126,7 +1135,7 @@ def ji_lin():
    except:
        pass
    end = time.time()
    log.info(f'共{count}条...........共耗时{end - start}秒')
# 上海 (Shanghai)
def shang_hai():
@@ -1219,7 +1228,7 @@ def shang_hai():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(fu_jian_href, '1671', pathType, file_name)
                        if retData['state']:
@@ -1252,7 +1261,7 @@ def shang_hai():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -1268,7 +1277,7 @@ def shang_hai():
    except:
        pass
    end = time.time()
    log.info(f'共{count}条...........共耗时{end - start}秒')
# 浙江 (Zhejiang)
def zhe_jiang():
@@ -1376,7 +1385,7 @@ def zhe_jiang():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -1393,7 +1402,7 @@ def zhe_jiang():
    except:
        pass
    end = time.time()
    log.info(f'共{count}条...........共耗时{end - start}秒')
# 福建 (Fujian)
def fu_jian():
@@ -1445,7 +1454,7 @@ def fu_jian():
                i_soup = BeautifulSoup(i_html, 'html.parser')
                real_href = href
                # real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
                # print(real_href)
                is_href = db_storage.find_one({'网址': real_href})
                if is_href:
                    num += 1
@@ -1460,7 +1469,7 @@ def fu_jian():
                    content = baseCore.pdf_content(resp_content)
                    contentwithtag = ''
                    category = os.path.splitext(real_href)[1]
                    if category not in title:
                        file_name = title + category
                    # upload the file to the server
                    retData = baseCore.uptoOBS(real_href, '1673', pathType, file_name)
@@ -1471,7 +1480,7 @@ def fu_jian():
                    att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num, '')
                    id_list.append(att_id)
                    pub_hao = ''
                    pub_time = None
                    pub_source = ''
                else:
@@ -1508,7 +1517,7 @@ def fu_jian():
                                or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                            category = os.path.splitext(fj_href)[1]
                            if category not in file_name:
                                file_name = file_name + category
                            print(fj_href)
                            # once an attachment is found, upload it to the file server
@@ -1524,7 +1533,7 @@ def fu_jian():
                    except:
                        pub_source = ''
                    pub_time = None
                    contentwithtag = i_soup.find('tabs tab_base_01 rules_con1')
                    content = contentwithtag.text.strip()
                    if content == '' or content == None:
@@ -1548,7 +1557,7 @@ def fu_jian():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': real_href,
                    'summary': '',
@@ -1566,7 +1575,7 @@ def fu_jian():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
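# NOTE: fu_jian() hands PDF responses to baseCore.pdf_content(); the fitz (PyMuPDF) import at
# the top hints at how that text extraction is done. A minimal, hypothetical sketch, not this
# project's actual implementation:
def pdf_text_sketch(pdf_bytes):
    text = ''
    with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:  # open the PDF from raw bytes
        for page in doc:
            text += page.get_text()  # plain text of each page
    return text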
# 山东 (Shandong)
def shan_dong():
@@ -1633,7 +1642,7 @@ def shan_dong():
                for h1 in h1_list:
                    title = title + str(h1.text)
                title = title.strip()
                pub_time = None
                span_list = source.find_all('span')
                i = 0
                for span in span_list:
@@ -1683,7 +1692,7 @@ def shan_dong():
    except:
        pass
    end = time.time()
    log.info(f'共{count}条...........共耗时{end - start}秒')
# 广东 (Guangdong)
def guang_dong():
@@ -1745,7 +1754,7 @@ def guang_dong():
                            or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                            or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                        category = os.path.splitext(fj_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # upload the attachment to the file server
                        retData = baseCore.uptoOBS(fj_href, '1676', pathType, file_name)
@@ -1774,7 +1783,7 @@ def guang_dong():
                    'topicClassification': '',
                    'issuedNumber': '',
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -1792,7 +1801,7 @@ def guang_dong():
    except:
        pass
    end = time.time()
    log.info(f'共{count}条...........共耗时{end - start}秒')
# 海南 (Hainan)
def hai_nan():
@@ -1869,7 +1878,7 @@ def hai_nan():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # upload to the file server
                        retData = baseCore.uptoOBS(fu_jian_href, '1677', pathType, file_name)
@@ -1916,7 +1925,7 @@ def hai_nan():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # print(f'----附件:{fu_jian_href}-----filename:{file_name}')
                        # upload the attachment to the file server
@@ -1995,7 +2004,7 @@ def hai_nan():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    def hai_nan2():
        def hai_nan_sw(page_href):
@@ -2126,7 +2135,7 @@ def hai_nan():
                    pub_source = ''
                    pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
                    pub_hao = ''
                    writtenDate = None
                    contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                    content = contentWithTag.text
                    if content == '' or content == None:
@@ -2143,7 +2152,7 @@ def hai_nan():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # upload to the file server
                        retData = baseCore.uptoOBS(fu_jian_href, '1677', pathType, file_name)
@@ -2241,7 +2250,7 @@ def hai_nan():
                    pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
                    pub_hao = ''
                    pub_source = ''
                    writtenDate = None
                    contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                    content = contentWithTag.text
                    if content == '' or content == None:
@@ -2259,7 +2268,7 @@ def hai_nan():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # upload to the file server
                        retData = baseCore.uptoOBS(fu_jian_href, '1677', pathType, file_name)
@@ -2360,7 +2369,7 @@ def hai_nan():
                        0].strip()
                except:
                    pub_source = ''
                    pub_time = None
                    pub_hao = ''
                contentWithTag = doc_href.find(class_='pages_content')
                content = contentWithTag.text
@@ -2383,7 +2392,7 @@ def hai_nan():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': i_href,
                    'summary': '',
@@ -2479,7 +2488,7 @@ def hai_nan():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    start()
    hai_nan1()
@@ -2538,7 +2547,7 @@ def si_chuan():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # upload the attachment to the file server
                        retData = baseCore.uptoOBS(fu_jian_href, '1678', pathType, file_name)
@@ -2567,7 +2576,7 @@ def si_chuan():
                    'topicClassification': '',
                    'issuedNumber': '',
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -2585,7 +2594,7 @@ def si_chuan():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 广西 (Guangxi)
def guang_xi():
@@ -2671,7 +2680,7 @@ def guang_xi():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # upload the attachment to the file server
                        retData = baseCore.uptoOBS(fu_jian_href, '1692', pathType, file_name)
@@ -2701,7 +2710,7 @@ def guang_xi():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -2718,7 +2727,7 @@ def guang_xi():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 贵州 (Guizhou)
def gui_zhou():
@@ -2788,7 +2797,7 @@ def gui_zhou():
                            or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # upload the attachment to the file server
                        retData = baseCore.uptoOBS(fu_jian_href, '1694', pathType, file_name)
@@ -2818,7 +2827,7 @@ def gui_zhou():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -2836,7 +2845,7 @@ def gui_zhou():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# 云南 (Yunnan)
def yun_nan():
@@ -2870,7 +2879,7 @@ def yun_nan():
                    continue
                try:
                    fu_jian_href_list = []
                    # print(href)
                    if '.shtml' in href:
                        href_resp = requests.get(url=href, headers=headers, verify=False)
                        href_resp.encoding = href_resp.apparent_encoding
@@ -2901,7 +2910,7 @@ def yun_nan():
                                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                            try:
                                category = os.path.splitext(fu_jian_href)[1]
                                if category not in file_name:
                                    file_name = file_name + category
                                # upload the attachment to the file server
                                retData = baseCore.uptoOBS(fu_jian_href, '1679', pathType, file_name)
@@ -2939,8 +2948,8 @@ def yun_nan():
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': None,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -2959,7 +2968,7 @@ def yun_nan():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    def yun_nan2():
        num = 0
@@ -3022,7 +3031,7 @@ def yun_nan():
                            # print(fu_jian_href)
                            try:
                                category = os.path.splitext(fu_jian_href)[1]
                                if category not in file_name:
                                    file_name = file_name + category
                                # upload the attachment to the file server
                                retData = baseCore.uptoOBS(fu_jian_href, '1679', pathType, file_name)
@@ -3060,7 +3069,7 @@ def yun_nan():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -3079,7 +3088,7 @@ def yun_nan():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    yun_nan1()
    yun_nan2()
@@ -3148,8 +3157,8 @@ def chong_qing():
                except:
                    origin = ''
                    topicClassification = ''
                    pub_time = None
                    writtenDate = None
                    pub_hao = ''
                contentWithTag = doc_href.find('div', class_='zwxl-content')
                content = contentWithTag.text
@@ -3169,7 +3178,7 @@ def chong_qing():
                            or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                        try:
                            category = os.path.splitext(fu_jian_href)[1]
                            if category not in file_name:
                                file_name = file_name + category
                            # upload the attachment to the file server
                            retData = baseCore.uptoOBS(fu_jian_href, '1693', pathType, file_name)
@@ -3219,7 +3228,7 @@ def chong_qing():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 天津 (Tianjin)
def tian_jin():
@@ -3282,7 +3291,7 @@ def tian_jin():
                    rmtag2.remove()
                contentWithTag = doc_href('div[id="zoom"]')
                if len(writtenDate) < 1:
                    writtenDate = None
                if len(publishDate) < 1:
                    publishDate = doc_href('meta[name="PubDate"]').attr('content')
                soup = paserUrl(str(contentWithTag), href)
@@ -3298,7 +3307,7 @@ def tian_jin():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1683', pathType, file_name)
                        if retData['state']:
@@ -3351,7 +3360,7 @@ def tian_jin():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    def tian_jin2():
        """
@@ -3413,7 +3422,7 @@ def tian_jin():
                    rmtag2.remove()
                contentWithTag = doc_href('div[id="zoom"]')
                if len(writtenDate) < 1:
                    writtenDate = None
                if len(publishDate) < 1:
                    publishDate = doc_href('meta[name="PubDate"]').attr('content')
                soup = paserUrl(str(contentWithTag), href)
@@ -3429,7 +3438,7 @@ def tian_jin():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1683', pathType, file_name)
                        if retData['state']:
@@ -3482,7 +3491,7 @@ def tian_jin():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    def tian_jin3():
        num = 0
@@ -3507,7 +3516,7 @@ def tian_jin():
                try:
                    publishDate = li.find('div', attrs={'class': 'other'}).text
                except:
                    publishDate = None
                if 'http' not in href:
                    if '../../../' in href:
                        href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
@@ -3548,7 +3557,7 @@ def tian_jin():
                    rmtag2.remove()
                contentWithTag = doc_href('div[id="zoom"]')
                if len(writtenDate) < 1:
                    writtenDate = None
                if len(publishDate) < 1:
                    publishDate = doc_href('meta[name="PubDate"]').attr('content')
                soup = paserUrl(str(contentWithTag), href)
@@ -3564,7 +3573,7 @@ def tian_jin():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1683', pathType, file_name)
                        if retData['state']:
@@ -3617,7 +3626,7 @@ def tian_jin():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    tian_jin1()
    tian_jin2()
@@ -3673,7 +3682,7 @@ def xin_jiang():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1682', pathType, file_name)
                        if retData['state']:
@@ -3717,7 +3726,7 @@ def xin_jiang():
                    'topicClassification': "",
                    'issuedNumber': issuedNumber,
                    'publishDate': publishDate,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -3734,7 +3743,7 @@ def xin_jiang():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    def xin_jiang_jsbt():
        num = 0
@@ -3780,7 +3789,7 @@ def xin_jiang():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1682', pathType, file_name)
                        if retData['state']:
@@ -3824,7 +3833,7 @@ def xin_jiang():
                    'topicClassification': "",
                    'issuedNumber': issuedNumber,
                    'publishDate': publishDate,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -3843,7 +3852,7 @@ def xin_jiang():
        except:
            pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    xin_jiang1()
    xin_jiang_jsbt()
@@ -3881,7 +3890,7 @@ def shan_xi():
            try:
                if ".pdf" in href:
                    content = ''
                    publishDate = None
                    origin = ''
                    fu_jian_soup = [href]
                    contentWithTag = ''
@@ -3908,7 +3917,7 @@ def shan_xi():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1684', pathType, file_name)
                        if retData['state']:
@@ -3952,7 +3961,7 @@ def shan_xi():
                    'topicClassification': "",
                    'issuedNumber': issuedNumber,
                    'publishDate': publishDate,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -3969,7 +3978,7 @@ def shan_xi():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
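# NOTE: sendKafka() is called by every collector in this file but its body is outside this
# diff. A minimal, hypothetical sketch of such a sender using the imported KafkaProducer;
# the broker address and topic name below are placeholders, not values from this project:
def sendKafka_sketch(dic_news):
    try:
        producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                                 value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'))
        producer.send('policy-news', dic_news)  # topic name is an assumption
        producer.flush()
        return True
    except Exception as e:
        log.error(f'kafka发送失败:{e}')
        return False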
# 辽宁 (Liaoning)
def liao_ning():
@@ -4028,7 +4037,7 @@ def liao_ning():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1685', pathType, file_name)
                        if retData['state']:
@@ -4071,7 +4080,7 @@ def liao_ning():
                    'topicClassification': "",
                    'issuedNumber': issuedNumber,
                    'publishDate': publishDate,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -4088,7 +4097,7 @@ def liao_ning():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# 黑龙江 (Heilongjiang)
def hei_long_jiang():
@@ -4141,7 +4150,7 @@ def hei_long_jiang():
                            or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                        file_name = file.text.strip()
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        retData = baseCore.uptoOBS(file_href, '1687', pathType, file_name)
                        if retData['state']:
@@ -4174,7 +4183,7 @@ def hei_long_jiang():
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': publishDate,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
@@ -4193,7 +4202,7 @@ def hei_long_jiang():
    except:
        pass
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 江苏 # 江苏
def jiang_su(): def jiang_su():
...@@ -4257,7 +4266,7 @@ def jiang_su(): ...@@ -4257,7 +4266,7 @@ def jiang_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -4314,7 +4323,7 @@ def jiang_su(): ...@@ -4314,7 +4323,7 @@ def jiang_su():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 安徽 # 安徽
def an_hui(): def an_hui():
...@@ -4368,7 +4377,7 @@ def an_hui(): ...@@ -4368,7 +4377,7 @@ def an_hui():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -4418,7 +4427,7 @@ def an_hui(): ...@@ -4418,7 +4427,7 @@ def an_hui():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def an_hui2(): def an_hui2():
num = 0 num = 0
...@@ -4472,7 +4481,7 @@ def an_hui(): ...@@ -4472,7 +4481,7 @@ def an_hui():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -4524,7 +4533,7 @@ def an_hui(): ...@@ -4524,7 +4533,7 @@ def an_hui():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
an_hui1() an_hui1()
an_hui2() an_hui2()
...@@ -4607,7 +4616,7 @@ def jiang_xi(): ...@@ -4607,7 +4616,7 @@ def jiang_xi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -4647,7 +4656,7 @@ def jiang_xi(): ...@@ -4647,7 +4656,7 @@ def jiang_xi():
'organ': organ, 'organ': organ,
'topicClassification': topicClassification, 'topicClassification': topicClassification,
'issuedNumber': pub_hao, 'issuedNumber': pub_hao,
'publishDate': '', 'publishDate': None,
'writtenDate': writtenDate, 'writtenDate': writtenDate,
'sid': '1697458829758697473', 'sid': '1697458829758697473',
'sourceAddress': href, 'sourceAddress': href,
...@@ -4665,7 +4674,7 @@ def jiang_xi(): ...@@ -4665,7 +4674,7 @@ def jiang_xi():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 河南 # 河南
def he_nan(): def he_nan():
...@@ -4711,7 +4720,7 @@ def he_nan(): ...@@ -4711,7 +4720,7 @@ def he_nan():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -4750,7 +4759,7 @@ def he_nan(): ...@@ -4750,7 +4759,7 @@ def he_nan():
'topicClassification': '', 'topicClassification': '',
'issuedNumber': issuedNumber, 'issuedNumber': issuedNumber,
'publishDate': publishDate, 'publishDate': publishDate,
'writtenDate': '', 'writtenDate': None,
'sid': '1697458829758697473', 'sid': '1697458829758697473',
'sourceAddress': href, 'sourceAddress': href,
'summary': '', 'summary': '',
...@@ -4767,7 +4776,7 @@ def he_nan(): ...@@ -4767,7 +4776,7 @@ def he_nan():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖南 # 湖南
def hu_nan(): def hu_nan():
...@@ -4828,7 +4837,7 @@ def hu_nan(): ...@@ -4828,7 +4837,7 @@ def hu_nan():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -4878,7 +4887,7 @@ def hu_nan(): ...@@ -4878,7 +4887,7 @@ def hu_nan():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 甘肃 # 甘肃
def gan_su(): def gan_su():
...@@ -4963,7 +4972,7 @@ def gan_su(): ...@@ -4963,7 +4972,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5015,7 +5024,7 @@ def gan_su(): ...@@ -5015,7 +5024,7 @@ def gan_su():
pass pass
bro.quit() bro.quit()
end_time = time.time() end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
def gan_su2(): def gan_su2():
num = 0 num = 0
...@@ -5097,7 +5106,7 @@ def gan_su(): ...@@ -5097,7 +5106,7 @@ def gan_su():
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text() origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text() pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
contentWithTag = doc('div[id="content"]') contentWithTag = doc('div[id="content"]')
print(title) #print(title)
soup = paserUrl(str(contentWithTag), href) soup = paserUrl(str(contentWithTag), href)
try: try:
...@@ -5119,7 +5128,7 @@ def gan_su(): ...@@ -5119,7 +5128,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
log.info(f'{file_name}---{href}--') log.info(f'{file_name}---{href}--')
retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
...@@ -5176,7 +5185,7 @@ def gan_su(): ...@@ -5176,7 +5185,7 @@ def gan_su():
pass pass
bro.quit() bro.quit()
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def gan_su3(): def gan_su3():
num = 0 num = 0
...@@ -5260,13 +5269,13 @@ def gan_su(): ...@@ -5260,13 +5269,13 @@ def gan_su():
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text() origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text() pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
contentWithTag = doc('div[id="content"]') contentWithTag = doc('div[id="content"]')
print(title) #print(title)
if len(title) == 0 or contentWithTag.text() == '': if len(title) == 0 or contentWithTag.text() == '':
title = doc('div[class="main"]>h1').text().lstrip().strip() title = doc('div[class="main"]>h1').text().lstrip().strip()
writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip() writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip() origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
contentWithTag = doc('div[class="detailContent"]') contentWithTag = doc('div[class="detailContent"]')
print(title) #print(title)
soup = paserUrl(str(contentWithTag), href) soup = paserUrl(str(contentWithTag), href)
try: try:
...@@ -5288,7 +5297,7 @@ def gan_su(): ...@@ -5288,7 +5297,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1696',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5304,7 +5313,7 @@ def gan_su(): ...@@ -5304,7 +5313,7 @@ def gan_su():
content = soup.text content = soup.text
if content == '' or content == None: if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----') log.info(f'-----{href}----{title}----内容为空-----')
print(bro.page_source) #print(bro.page_source)
continue continue
if len(content) < 2: if len(content) < 2:
continue continue
...@@ -5345,7 +5354,7 @@ def gan_su(): ...@@ -5345,7 +5354,7 @@ def gan_su():
pass pass
bro.quit() bro.quit()
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
gan_su1() gan_su1()
gan_su2() gan_su2()
...@@ -5401,7 +5410,7 @@ def ning_xia(): ...@@ -5401,7 +5410,7 @@ def ning_xia():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5453,7 +5462,7 @@ def ning_xia(): ...@@ -5453,7 +5462,7 @@ def ning_xia():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 陕西 # 陕西
def shanxi(): def shanxi():
...@@ -5511,7 +5520,7 @@ def shanxi(): ...@@ -5511,7 +5520,7 @@ def shanxi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5544,7 +5553,7 @@ def shanxi(): ...@@ -5544,7 +5553,7 @@ def shanxi():
'topicClassification': "", 'topicClassification': "",
'issuedNumber': "", 'issuedNumber': "",
'publishDate': publishDate, 'publishDate': publishDate,
'writtenDate': "", 'writtenDate': None,
'sid': '1697458829758697473', 'sid': '1697458829758697473',
'sourceAddress': href, 'sourceAddress': href,
'summary': '', 'summary': '',
...@@ -5563,7 +5572,7 @@ def shanxi(): ...@@ -5563,7 +5572,7 @@ def shanxi():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 西藏 # 西藏
def xi_zang(): def xi_zang():
...@@ -5617,7 +5626,7 @@ def xi_zang(): ...@@ -5617,7 +5626,7 @@ def xi_zang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5647,7 +5656,7 @@ def xi_zang(): ...@@ -5647,7 +5656,7 @@ def xi_zang():
'topicClassification': "", 'topicClassification': "",
'issuedNumber': "", 'issuedNumber': "",
'publishDate': publishDate, 'publishDate': publishDate,
'writtenDate': "", 'writtenDate': None,
'sid': '1697458829758697473', 'sid': '1697458829758697473',
'sourceAddress': href, 'sourceAddress': href,
'summary': '', 'summary': '',
...@@ -5664,7 +5673,7 @@ def xi_zang(): ...@@ -5664,7 +5673,7 @@ def xi_zang():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 青海 # 青海
def qing_hai(): def qing_hai():
...@@ -5722,7 +5731,7 @@ def qing_hai(): ...@@ -5722,7 +5731,7 @@ def qing_hai():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5771,7 +5780,7 @@ def qing_hai(): ...@@ -5771,7 +5780,7 @@ def qing_hai():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def qing_hai2(): def qing_hai2():
num = 0 num = 0
...@@ -5849,7 +5858,7 @@ def qing_hai(): ...@@ -5849,7 +5858,7 @@ def qing_hai():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5899,7 +5908,7 @@ def qing_hai(): ...@@ -5899,7 +5908,7 @@ def qing_hai():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
qing_hai1() qing_hai1()
qing_hai2() qing_hai2()
...@@ -5943,7 +5952,7 @@ def he_bei(): ...@@ -5943,7 +5952,7 @@ def he_bei():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -5987,7 +5996,7 @@ def he_bei(): ...@@ -5987,7 +5996,7 @@ def he_bei():
'topicClassification': "", 'topicClassification': "",
'issuedNumber': issuedNumber, 'issuedNumber': issuedNumber,
'publishDate': publishDate, 'publishDate': publishDate,
'writtenDate': "", 'writtenDate': None,
'sid': '1697458829758697473', 'sid': '1697458829758697473',
'sourceAddress': href, 'sourceAddress': href,
'summary': '', 'summary': '',
...@@ -6002,7 +6011,7 @@ def he_bei(): ...@@ -6002,7 +6011,7 @@ def he_bei():
except: except:
pass pass
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖北 # 湖北
def hu_bei(): def hu_bei():
...@@ -6068,7 +6077,7 @@ def hu_bei(): ...@@ -6068,7 +6077,7 @@ def hu_bei():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip() file_name = file.text.strip()
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category not in file_name: if category not in file_name :
file_name = file_name + category file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name) retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']: if retData['state']:
...@@ -6120,44 +6129,45 @@ def hu_bei(): ...@@ -6120,44 +6129,45 @@ def hu_bei():
pass pass
driver.close() driver.close()
end_time = time.time() end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}') log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__': if __name__ == '__main__':
# get_content1() get_content1()
# get_content2() get_content2()
# get_content3() get_content3()
# bei_jing() bei_jing()
# nei_meng_gu() nei_meng_gu()
# ji_lin() ji_lin()
# shang_hai() shang_hai()
# zhe_jiang() zhe_jiang()
# fu_jian() fu_jian()
# shan_dong() shan_dong()
# guang_dong() guang_dong()
# hai_nan() hai_nan()
# si_chuan() si_chuan()
# guang_xi() guang_xi()
# gui_zhou() gui_zhou()
# yun_nan() yun_nan()
# chong_qing() chong_qing()
# tian_jin() tian_jin()
# xin_jiang() xin_jiang()
# shan_xi() shan_xi()
# liao_ning() liao_ning()
# hei_long_jiang() hei_long_jiang()
# jiang_su() jiang_su()
# an_hui() an_hui()
# jiang_xi() jiang_xi()
# he_nan() he_nan()
# hu_nan() hu_nan()
gan_su() gan_su()
# ning_xia() ning_xia()
# xi_zang() xi_zang()
# shanxi() shanxi()
# qing_hai() qing_hai()
# he_bei() he_bei()
# qing_hai() qing_hai()
# current_time = datetime.datetime.now() current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1) midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds() sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds) time.sleep(sleep_seconds)
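After this commit the main block runs every enabled province collector once and then sleeps until the next midnight. Looping that gives the daily schedule the sleep is clearly meant for; a minimal sketch reusing only names already defined above:
while True:
    gan_su()   # ...and the other province collectors enabled in __main__
    current_time = datetime.datetime.now()
    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
    sleep_seconds = (midnight_time - current_time).total_seconds()
    time.sleep(sleep_seconds)   # wake just after 00:00 and start the next full pass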
import datetime
import json import json
import random import random
import time import time
from urllib.parse import urljoin from urllib.parse import urljoin
import datetime
import pymongo import pymongo
from kafka import KafkaProducer from kafka import KafkaProducer
from tqdm import tqdm from tqdm import tqdm
...@@ -12,15 +11,31 @@ import pymysql ...@@ -12,15 +11,31 @@ import pymysql
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import urllib3 import urllib3
from base.BaseCore import BaseCore from lxml import etree
from BaseCore import BaseCore
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx cnx = baseCore.cnx
cursor = baseCore.cursor cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1'] db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
'国务院_国资委_copy1']
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
'Host': 'www.sasac.gov.cn',
'Pragma': 'no-cache',
'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
def paserUrl(html,listurl): def paserUrl(html, listurl):
# soup = BeautifulSoup(html, 'html.parser') # soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签 # 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img']) links = html.find_all(['a', 'img'])
...@@ -36,18 +51,19 @@ def paserUrl(html,listurl): ...@@ -36,18 +51,19 @@ def paserUrl(html,listurl):
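paserUrl collects every a and img tag and rewrites relative href/src values against the page address; its body is elided by the hunk above, so here is a minimal sketch of that rewrite using the urljoin import from the top of this file:
def paser_url_sketch(html, listurl):
    # Rough equivalent of paserUrl: make every link/image URL absolute.
    for link in html.find_all(['a', 'img']):
        if link.get('href'):
            link['href'] = urljoin(listurl, link['href'])
        if link.get('src'):
            link['src'] = urljoin(listurl, link['src'])
    return html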
def save_data(dic_news): def save_data(dic_news):
aaa_dic = { aaa_dic = {
'附件id':dic_news['attachmentIds'], '附件id': dic_news['attachmentIds'],
'网址':dic_news['sourceAddress'], '网址': dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'], 'tid': dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'], '来源': dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate'], '创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100] '带标签内容': dic_news['contentWithTag'][:100]
} }
db_storage.insert_one(aaa_dic) db_storage.insert_one(aaa_dic)
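save_data stores a trimmed record keyed by the source URL ('网址'); the crawl loops below reuse that field as the dedup key via find_one. A one-line helper equivalent to those checks:
def already_collected(url):
    # '网址' written by save_data doubles as the dedup key consulted before each fetch.
    return db_storage.find_one({'网址': url}) is not None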
def sendKafka(dic_news): def sendKafka(dic_news):
start_time = time.time() start_time = time.time()
try:#114.116.116.241 try: # 114.116.116.241
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092']) producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("policy", kafka_result = producer.send("policy",
json.dumps(dic_news, ensure_ascii=False).encode('utf8')) json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
...@@ -78,215 +94,233 @@ def sendKafka(dic_news): ...@@ -78,215 +94,233 @@ def sendKafka(dic_news):
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
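sendKafka serialises the article dict to UTF-8 JSON and publishes it on the 'policy' topic, then records how long the send took. The success branch is elided above; a hedged sketch of the whole call, assuming the same broker and topic:
def send_kafka_sketch(dic_news):
    start_time = time.time()
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        future = producer.send("policy", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        future.get(timeout=10)          # block until the broker acknowledges the record
        state = 1
    except Exception as e:
        log.error(f'Kafka send failed for {dic_news["sourceAddress"]}: {e}')
        state = 0
    takeTime = baseCore.getTimeCost(start_time, time.time())
    return state, takeTime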
def work(href_type,ting_type,relationId): # 国资委_内设机构
ip = baseCore.get_proxy() def gzw_nsjg():
log.info(f'\n================厅局类别==={ting_type}========================') # 获取页面数据
if 'http' in href_type: def get_page_nsjg(href, ting_type, relationId, page):
url_type = href_type start_time = time.time()
else: num = 0
url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../', '') for pageNo in range(1, page + 1):
# print(url_type) if pageNo != 1:
i_res = requests.get(url=url_type, headers=headers, proxies=ip) href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
i_soup = BeautifulSoup(i_res.content, 'html.parser') if pageNo == page:
time.sleep(2) tag = href.split('/')[-1]
news_list = i_soup.find('div', class_='tjywBottom').find_all('li') href = href.replace(tag, 'index.html')
# 文章列表
# print('================新闻列表==================')
for news in tqdm(news_list):
try:
news_href = news.find('a')['href']
except:
continue
if 'http' in news_href:
news_url = news_href
else:
news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../', '')
# 判断是否已经爬取过
is_href = db_storage.find_one({'网址': news_url})
if is_href:
log.info('已采集----------跳过')
continue
news_title = news.find('a').text.split('[')[0]
log.info(f'\n----正在采集: {news_title}-------')
pub_time = news.find('span').text.replace('[', '').replace(']', '')
# 文章信息
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617',
'Host': 'www.sasac.gov.cn',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Referer': 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
ii_res = requests.get(url=news_url, headers=header, proxies=ip)
ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
# todo:相对路径转化为绝对路径
ii_soup = paserUrl(ii_soup, news_url)
# 去掉扫一扫
try:
ii_soup.find('div', id='qr_container').decompose()
except:
pass
# 去掉style标签
for styleTag in ii_soup.find_all('style'):
styleTag.extract()
time.sleep(2)
try:
news_info = ii_soup.find('div', class_='zsy_cotitle')
except Exception as e:
log.error(e)
news_info = ''
if news_info:
try: try:
# origin req = requests.get(url=href, headers=headers, verify=False)
pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0].strip() req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
soup = paserUrl(soup, href)
li_list = soup.find('ul', attrs={'class': 'ld-tjywList'}).find_all('li')
except: except:
pub_source = '' req = requests.get(url=href, headers=headers, verify=False)
try: req_text = req.text.encode("ISO-8859-1")
contentWithTag = ii_soup.find('div', 'zsy_comain') req_text = req_text.decode("utf-8")
content = contentWithTag.text.strip() soup = BeautifulSoup(req_text, 'html.parser')
soup = paserUrl(soup, href)
except: li_list = soup.find_all('li')
content = '' for li in li_list:
contentWithTag = '' try:
if len(content) > 100: real_href = li.find('a').get('href')
pass except:
else: continue
continue is_href = db_storage.find_one({'网址': real_href})
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) if is_href:
log.info('已采集----------跳过')
dic_news = { continue
'attachmentIds': [], try:
'author': '', try:
'content': content, try:
'contentWithTag': str(contentWithTag), req_ = requests.get(url=real_href, headers=headers, verify=False)
'createDate': time_now, req_.encoding = req_.apparent_encoding
'deleteFlag': 0, soup_ = BeautifulSoup(req_.text, 'html.parser')
'id': '', div_content = soup_.find('div', attrs={'class': 'zsy_content'})
'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}], pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
'origin': pub_source, try:
'organ': '', title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'topicClassification': '', '').lstrip().strip()
'issuedNumber': '', publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
'publishDate': pub_time, pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
'writtenDate': '', except:
'sid': '1697458829758697473', title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'sourceAddress': news_url, '').lstrip().strip()
'summary': '', publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
'title': news_title except:
} req_ = requests.get(url=real_href, headers=headers, verify=False)
sendKafka(dic_news) req_.encoding = req_.apparent_encoding
save_data(dic_news) soup_ = BeautifulSoup(req_.text, 'html.parser')
log.info(f'{ting_type}-----{news_title}----发送成功', ) pub_result = soup_.find('div', attrs={'class': 'zsy_cotitle'})
else: real_href = str(pub_result.text).split('location.href="')[1].split('";')[0].lstrip().strip()
dic_error = { req_.close()
'标题': news_title, req_ = requests.get(url=real_href, headers=headers, verify=False)
'原文链接': news_url, req_.encoding = req_.apparent_encoding
'厅局类别': ting_type soup_ = BeautifulSoup(req_.text, 'html.parser')
} div_content = soup_.find('div', attrs={'class': 'zsy_content'})
log.error(dic_error) pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
req_.close()
except:
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
yaoqiu_list = soup_.find('div', attrs={'class': 'yaoqiu_list'})
li_list_ = yaoqiu_list.find_all('li')
for li_ in li_list_:
href_ = li_.find('a').get('href')
real_href = href_.replace('../../../', 'http://www.sasac.gov.cn/')
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
div_content = soup_.find('div', attrs={'class': 'zsy_content'})
pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = ''
if 'location.href' in title:
continue
if '404 Ba' in str(div_content):
continue
contentWithTag = div_content.find('div',class_='zsy_comain')
try:
contentWithTag.find('div', id='qr_container').decompose()
except:
pass
# 去掉style标签
for styleTag in contentWithTag.find_all('style'):
styleTag.extract()
content = contentWithTag.text
if content == '':
log.error(f'{real_href}===获取正文失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
#print(content)
#print(contentWithTag)
sendKafka(dic_news)
save_data(dic_news)
log.info(f'{ting_type}-----{title}----发送成功', )
num += 1
except Exception as e:
pass
req.close()
end_time = time.time()
log.info(f'抓取{num}条数据,共耗时{end_time - start_time}')
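get_page_nsjg pulls title, source and publish date out of the zsy_cotitle header by splitting on the fixed '文章来源:' and '发布时间:' labels, falling back when the source label is missing. The same logic condensed into one helper (a sketch, not the committed implementation):
def parse_cotitle(text):
    # Header reads like "<title> 文章来源:<source> 发布时间:<date>"; some pages omit the source.
    text = text.replace('\n', '').replace('\r', '').strip()
    if '文章来源:' in text:
        title = text.split('文章来源:')[0].strip()
        pub_source = text.split('文章来源:')[1].split('发布时间:')[0].strip()
    else:
        title = text.split('发布时间:')[0].strip()
        pub_source = ''
    publishDate = text.split('发布时间:')[1].strip() if '发布时间:' in text else ''
    return title, pub_source, publishDate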
# 获取页面列表
def get_page_nsjg_list(href, institution, tid):
href_list = {
'办公厅(党委办公厅)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/index_2642999_1.html', 9],
'综合研究局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591482/n2591484/index_2656923_1.html', 5],
'政策法规局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590860/n2590862/index_2644230_1.html', 21],
'规划发展局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590902/n2590904/index_2646556_1.html', 9],
'财务监管与运行评价局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590944/n2590946/index_2647546_1.html', 9],
'产权管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591020/n2591022/index_2648251_1.html', 7],
'企业改革局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591064/n2591066/index_2648748_1.html', 15],
'考核分配局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591106/n2591108/index_2649149_1.html', 6],
'资本运营与收益管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591192/n2591194/index_2649585_1.html', 3],
'科技创新局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591148/n2591150/index_2650085_1.html', 14],
'社会责任局': ['http://www.sasac.gov.cn/n2588020/n2588072/n23746822/n23746853/index_23747054_.html', 10],
'综合监督局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591284/n2591286/index.html', 1],
'监督追责局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591266/n2591268/index_2654822_1.html', 2],
'企业领导人员管理一局(董事会工作局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591302/n2591304/index_2657539_1.html', 4],
'企业领导人员管理二局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591344/n2591346/index_2657636_1.html', 4],
'党建工作局(党委组织部、党委统战部)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591386/n2591388/index_2656630_1.html', 14],
'宣传工作局(党委宣传部)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591426/n2591428/index_2656835_1.html',
21],
'国际合作局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591548/n2591550/index_2657011_1.html', 28],
'人事局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591586/n2591588/index_2656275_1.html', 7],
'行业协会商会党建工作局(行业协会商会工作局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591626/n2591628/index_2656076_1.html', 4],
'机关服务管理局(离退休干部管理局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591644/n2591646/index_2655780_1.html', 9],
'机关党委': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591684/n2591686/index_2655222_1.html', 33],
'党委巡视工作办公室、国资委巡视组': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591770/n2591772/index_2655029_1.html', 8],
'中央纪委国家监委驻国资委纪检监察组': ['http://www.sasac.gov.cn/n2588020/n2877928/n2878219/index_2879099_1.html', 18]}
href_ = href_list[institution][0]
page = href_list[institution][1]
get_page_nsjg(href_, institution, tid, page)
#中央纪委国家监委驻国资委纪检监察组 # 开始
def job1(a_type): def gzw_nsjg_start():
href = a_type['href'] url = 'http://www.sasac.gov.cn/n2588020/index.html'
ting_type = a_type.text req = requests.get(url=url, headers=headers, verify=False)
return href,ting_type req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
all_institution = []
tree = etree.HTML(req_text)
institution = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/text()')[0].replace('\n', '').replace('\r',
'')
institution_href = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/@href')[0].replace('../',
'http://www.sasac.gov.cn/')
all_institution.append([institution, institution_href])
dd_list = tree.xpath('/html/body/div[4]/div[2]/div/dl[2]/dd')
for dd in dd_list:
institution = dd.xpath('./a/text()')[0].replace('\n', '').replace('\r', '')
institution_href = dd.xpath('./a/@href')[0].replace('../', 'http://www.sasac.gov.cn/')
all_institution.append([institution, institution_href])
def job(): tids = {'办公厅(党委办公厅)': 1643, '综合研究局': 1644, '政策法规局': 1645, '规划发展局': 1646, '财务监管与运行评价局': 1647, '产权管理局': 1648,
url = 'http://www.sasac.gov.cn/n2588020/index.html' '企业改革局': 1649, '考核分配局': 1650, '资本运营与收益管理局': 1651, '科技创新局': 1652, '社会责任局': 2064, '综合监督局': 1653,
ip = baseCore.get_proxy() '监督追责局': 1654,
res = requests.get(url=url, headers=headers, proxies=ip) '企业领导人员管理一局(董事会工作局)': 1655, '企业领导人员管理二局': 1656, '党建工作局(党委组织部、党委统战部)': 1657, '宣传工作局(党委宣传部)': 1658,
soup = BeautifulSoup(res.content, 'html.parser') '国际合作局': 1659, '人事局': 1660, '行业协会商会党建工作局(行业协会商会工作局)': 1661, '机关服务管理局(离退休干部管理局)': 1662, '机关党委': 1663,
time.sleep(2) '党委巡视工作办公室、国资委巡视组': 1664, '中央纪委国家监委驻国资委纪检监察组': 1874}
# 厅局列表 for a in all_institution:
list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')[:22] institution = a[0]
a_soup = soup.find('div', class_='l-jgkk-right column').find_all('dt')[0] href = a[1]
a_type = a_soup.text.strip() tid = tids[institution]
a_href = a_soup.find('a')['href'] log.info(f'\n================厅局类别==={institution}========================')
a_id = '1874' get_page_nsjg_list(href, institution, tid)
list_error = []
num = 0
start_time = time.time()
work(a_href,a_type, a_id)
for type in tqdm(list_type):
list_news = []
href_type = type.find('a')['href']
ting_type = type.find('a').text
try:
relationId = mapId_dic[ting_type]
except:
continue
work(href_type,ting_type,relationId)
num += 1
end_time = time.time()
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
time.sleep(1)
# writer.save()
# df_error = pd.DataFrame(list_error)
# df_error.to_excel('未采到文章.xlsx',index=False)
gzw_nsjg_start()
if __name__=='__main__':
mapId_dic = { if __name__ == '__main__':
'办公厅(党委办公厅)':'1643',
'综合研究局':'1644',
'政策法规局':'1645',
'规划发展局':'1646',
'财务监管与运行评价局':'1647',
'产权管理局':'1648',
'企业改革局':'1649',
'考核分配局':'1650',
'资本运营与收益管理局':'1651',
'科技创新局':'1652',
'综合监督局':'1653',
'监督追责局':'1654',
'企业领导人员管理一局(董事会工作局)':'1655',
'企业领导人员管理二局':'1656',
'党建工作局(党委组织部、党委统战部)':'1657',
'宣传工作局(党委宣传部)':'1658',
'国际合作局':'1659',
'人事局':'1660',
'机关服务管理局(离退休干部管理局)':'1662',
'机关党委':'1663',
'党委巡视工作办公室、国资委巡视组':'1664',
}
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
'Host':'www.sasac.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
try: try:
job() gzw_nsjg()
except Exception as e: except Exception as e:
print(e) log.error(e)
current_time = datetime.datetime.now() #current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1) #midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds() #sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds) #time.sleep(sleep_seconds)
# 创建一个ExcelWriter对象 # 创建一个ExcelWriter对象
# writer = pd.ExcelWriter('国务院厅局.xlsx') # writer = pd.ExcelWriter('国务院厅局.xlsx')