Commit af27b7ec  Author: 薛凌堃

Maintenance of the policy and regulation scraping scripts

Parent 687dbf5e
...
@@ -501,25 +501,26 @@ class BaseCore:
                    'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
                    'create_time': '', 'page_size': '', 'content': ''}
         headers['User-Agent'] = self.getRandomUserAgent()
-        for i in range(0, 3):
-            try:
-                response = requests.get(file_href, headers=headers, verify=False, timeout=20)
-                file_size = int(response.headers.get('Content-Length'))
-                break
-            except:
-                time.sleep(3)
-                continue
-        for i in range(0, 3):
-            try:
-                name = str(self.getuuid()) + category
-                result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
-                break
-            except:
-                time.sleep(3)
-                continue
         try:
+            for i in range(0, 3):
+                try:
+                    response = requests.get(file_href, headers=headers, verify=False, timeout=20)
+                    file_size = int(response.headers.get('Content-Length'))
+                    break
+                except:
+                    time.sleep(3)
+                    continue
+            for i in range(0, 3):
+                try:
+                    name = str(self.getuuid()) + category
+                    result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
+                    break
+                except:
+                    time.sleep(3)
+                    continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             retData['state'] = True
             retData['path'] = result['body']['objectUrl'].split('.com')[1]
...
#!/usr/bin/env python
# coding=utf-8
import json
import time
import pymongo
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from requests.packages import urllib3
from urllib.parse import urljoin
from BaseCore import BaseCore
baseCore = BaseCore()
urllib3.disable_warnings()
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
log = baseCore.getLogger()
class ClassTool():
def __init__(self):
self.taskType = '政策法规'
        self.db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
            '国务院_国资委_copy1']
self.driver_path = r'D:\cmd100\chromedriver.exe'
self.chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}
    # Convert relative URLs in the HTML (href/src of <a> and <img> tags) to absolute URLs
    def paserUrl(self, html, listurl):
        if isinstance(html, str):
            html = BeautifulSoup(html, 'html.parser')
        # Collect all <a> and <img> tags
        links = html.find_all(['a', 'img'])
        # Rewrite each relative address against the list page URL
        for link in links:
            if 'href' in link.attrs:
                link['href'] = urljoin(listurl, link['href'])
            elif 'src' in link.attrs:
                link['src'] = urljoin(listurl, link['src'])
        return html
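    # Usage sketch (illustrative values, not from the live sites):
    #   tool = ClassTool()
    #   soup = tool.paserUrl('<a href="./a.html">x</a>', 'http://gzw.beijing.gov.cn/xxfb/zcfg/')
    #   soup.find('a')['href']  # -> 'http://gzw.beijing.gov.cn/xxfb/zcfg/a.html'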
def getDriver(self):
service = Service(self.driver_path)
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide Chrome's "controlled by automated test software" banner
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # disable the Blink flag that exposes the webdriver fingerprint
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.binary_location = self.chromr_bin
chrome_options.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=self.driver_path)
# with open('stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return bro
def save_data(self, dic_news):
aaa_dic = {
'附件id': dic_news['attachmentIds'],
'网址': dic_news['sourceAddress'],
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate']
}
self.db_storage.insert_one(aaa_dic)
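    # sendKafka: pushes one policy record (the dic_news dict) to the 'policy' topic and returns
    # True on success, False on failure; a new KafkaProducer is created for each call.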
def sendKafka(self, dic_news):
try: # 114.116.116.241
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
kafka_result = producer.send("policy",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
                'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# 传输成功,写入日志中
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
                'e': str(e)
}
log.error(dic_result)
return False
\ No newline at end of file
import os
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 北京
def bei_jing():
num = 0
start_time = time.time()
# 有反爬需要使用selenium
# service = Service(r'D:/chrome/113/chromedriver.exe')
# 配置selenium
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe'
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
bro = webdriver.Chrome(options=chrome_options, executable_path=chromedriver)
# with open('../../base/stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
url = 'http://gzw.beijing.gov.cn/xxfb/zcfg/index.html'
hrefs = []
try:
bro.get(url)
time.sleep(2)
bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(1)
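        # Walk the paginated list, collecting [href, title] pairs; stop when the pager's
        # last link is no longer titled '下一页' (next page).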
while True:
# 获取所有要爬取页面的url
ul = bro.find_element(By.CLASS_NAME, 'public_list_team')
li_list = ul.find_elements(By.TAG_NAME, 'li')
for li in li_list:
href_ = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
title_ = li.find_element(By.TAG_NAME, 'a').get_attribute('title')
hrefs.append([href_, title_])
updown = bro.find_element(By.CLASS_NAME, 'fanye').find_elements(By.TAG_NAME, 'a')[-1]
if updown.get_attribute('title') != '下一页':
break
updown.click()
time.sleep(2)
log.info(f'------{len(hrefs)}条数据-------------')
num = 0
count = 0
for href in hrefs:
id_list = []
title = href[1]
# todo:测试需要 注释掉判重
# 判断是否已经爬取过
is_href = baseTool.db_storage.find_one({'网址': href[0]})
if is_href:
num += 1
log.info('已采集----------跳过')
continue
# 对获取信息页面发送请求
bro.get(href[0])
time.sleep(1)
# 获取所要信息
pub = bro.find_element(By.CLASS_NAME, 'doc-info')
topic = str(pub.text).split('[主题分类] ')[1].split('\n')[0].strip()
# 发文机构
organ = str(pub.text).split('[发文机构] ')[1].split('\n')[0].strip()
pub_time = str(pub.text).split('[发布日期] ')[1].split('[有效性] ')[0].strip().lstrip()
writtenDate = str(pub.text).split('[成文日期] ')[1].split('\n')[0].strip()
# pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').lstrip().strip()
            pub_source = ''
            try:
                pub_list = bro.find_elements(By.CLASS_NAME, 'article-info')
                for source in pub_list:
                    if '来源' in source.text:
                        pub_source = source.text.split('来源:')[1].split('\n')[0]
                        # print(pub_source)
            except:
                pub_source = ''
# .split('来源:')[1]
if '号' not in pub_hao:
pub_hao = ''
cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
soup_cont = BeautifulSoup(cont, 'lxml')
soup = baseTool.paserUrl(soup_cont, href[0])
soup.prettify()
if soup.text == '' or soup.text == 'None':
log.info(f'----{href[0]}----{title}----内容为空----')
continue
            # Remove the QR-code ("扫一扫") block if present
            try:
                soup.find('div', id='div_div').decompose()
            except:
                pass
# log.info(title)
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href[0]}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1667', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '北京市国资委', file_name, num, pub_time)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(soup.text),
'contentWithTag': str(soup),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': organ,
'topicClassification': topic,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href[0],
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
bro.quit()
except Exception as e:
log.info(e)
pass
if __name__ == "__main__":
bei_jing()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 重庆
def chong_qing():
"""
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/xzgfxwj/ 4
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
"""
num = 0
count = 0
pathType = 'policy/chongqing/'
start_time = time.time()
for page in range(0, 4):
if page == 0:
url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index.html'
else:
url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index_{}.html'.format(page)
# url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index_3.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('.zsj-fr-main').items()
for doc_item in doc_items:
id_list = []
titles = doc_item('a').items()
for title_item in titles:
title = title_item.text().strip()
href = title_item('a').attr('href')
if '../' in href:
href = url.split('zcwj/index')[0] + title_item('a').attr('href').replace('../', '')
else:
href = url.split('index')[0] + title_item('a').attr('href').replace('./', '')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
# href = 'https://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/202007/t20200728_7729850.html'
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).content
doc_href = pq(href_text)
try:
pub_result = doc_href('.zwxl-table').text().replace(' ', '')
pub_time = pub_result.split('[发布日期]')[1].strip() + ' 00:00:00'
pub_hao = pub_result.split('[发文字号]')[1].split('[主题分类]')[0].strip()
topicClassification = pub_result.split('[主题分类]')[1].split('[体裁分类]')[0].strip()
origin = pub_result.split('[发布机构]')[1].split('[成文日期]')[0].strip()
writtenDate = pub_result.split('[成文日期]')[1].split('[发布日期]')[0].strip()
doc_href = BeautifulSoup(str(doc_href), 'html.parser')
# 相对路径转化为绝对路径
doc_href = baseTool.paserUrl(doc_href, href)
# 去掉扫一扫
try:
doc_href.find('div', id='div_div').decompose()
# 去掉分享
doc_href.find('div', class_='bdsharebuttonbox').decompose()
except:
pass
contentWithTag = doc_href.find('div', class_='zwxl-article')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
                        except:
                            origin = ''
                            topicClassification = ''
                            pub_time = None
                            writtenDate = None
                            pub_hao = ''
                            # The failure may occur before doc_href was converted above, so convert here as well
                            doc_href = baseTool.paserUrl(BeautifulSoup(str(doc_href), 'html.parser'), href)
                            contentWithTag = doc_href.find('div', class_='zwxl-content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
# print(fu_jian_list)
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text
if '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1693', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num,
pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
except:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1693", 'relationName': "重庆市国资委",
'labelMark': "policy"}],
'origin': origin,
'organ': '',
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
count += 1
num += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
chong_qing()
\ No newline at end of file
import os
import time
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 福建
def fu_jian():
error_tag = str(404)
num = 0
count = 0
start_time = time.time()
url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
# print(soup)
result = soup.find_all(class_='borbot-line')
for li_list in result:
li = li_list.find_all('li')
for a in li:
id_list = []
# print(a)
a_text = str(a)
title = a_text.split('title="')[-1].split('">')[0].replace('\n', '')
href_ = str(a.find('a').get('href')) # 网站链接
href = href_.replace('../', './').replace('./', 'http://gzw.fujian.gov.cn/zwgk/')
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
try:
error_ = str(i_soup.find('strong').text)
except:
error_ = ''
if error_ == error_tag:
href = href_.replace('../', './').replace('./', 'http://gzw.fujian.gov.cn/zwgk/zcfg/')
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
try:
error_ = str(i_soup.find('strong').text)
except:
error_ = ''
if error_ == error_tag:
href = href_.replace('../../', 'http://gzw.fujian.gov.cn/')
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
# print(real_href)
is_href = baseTool.db_storage.find_one({'网址': real_href})
if is_href:
num += 1
continue
try:
# 文章是远程pdf
# 直接下载文件至服务器,解析出正文内容
if '.pdf' in real_href:
# pass
resp_content = requests.get(real_href, headers=baseTool.headers, verify=False, timeout=20).content
# 解析出pdf内容
content = baseCore.pdf_content(resp_content)
contentwithtag = ''
                        category = os.path.splitext(real_href)[1]
                        file_name = title
                        if category not in title:
                            file_name = title + category
# 文件上传至服务器
retData = baseCore.uptoOBS(real_href, '1673', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num, '')
id_list.append(att_id)
pub_hao = ''
pub_time = None
pub_source = ''
else:
try:
href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# 相对路径转化为绝对路径
i_soup = baseTool.paserUrl(i_soup, real_href)
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
# print(real_href)
# todo:获取附件地址
                            try:
                                fu_jian_list = i_soup.find('ul', class_='clearflx myzj_xl_list').find_all('a')
                            except:
                                fu_jian_list = []
for fu_jian in fu_jian_list:
try:
fj_href = fu_jian['href']
except:
continue
file_name = fu_jian.text
if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
print(fj_href)
# 找到附件后 上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1673', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,
pub_time)
id_list.append(att_id)
# 将文件服务器的链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
except:
pub_source = ''
pub_time = None
                            contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')
content = contentwithtag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
                            pub_hao = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf').text.strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentwithtag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1673", 'relationName': "福建省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
# log.info(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
fu_jian()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 广东
def guang_dong():
start = time.time()
num = 0
count = 0
url = 'http://gzw.gd.gov.cn/zcfg/index.html'
try:
resp_href = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_href.encoding = resp_href.apparent_encoding
doc_resp = BeautifulSoup(resp_href.text, 'html.parser')
page_items = str(doc_resp.find('div', attrs={'class': 'page'}).text)
total = page_items.split('共 ')[1].split(' 条')[0].strip().lstrip()
total = int(total)
if total % 23 != 0:
pagen = total / 23 + 1
else:
pagen = total / 23
for page in range(1, int(pagen + 1)):
if page == 1:
url = 'http://gzw.gd.gov.cn/zcfg/index.html'
else:
url = f'http://gzw.gd.gov.cn/zcfg/index_{page}.html'
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).text
doc_resp = pq(resp_text)
doc_items = doc_resp('.list li').items()
for doc_item in doc_items:
id_list = []
title = doc_item('a').text().replace('\n', '')
href = doc_item('a').attr('href')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).text
doc_href = pq(href_text)
pub_result = doc_href('.title_info_sub').text()
pub_time = pub_result.split('文章来源:')[0].replace('发布时间:', '').strip() + ' 00:00:00'
pub_source = pub_result.split('文章来源:')[1].strip()
i_soup = BeautifulSoup(href_text, 'html.parser')
i_soup = baseTool.paserUrl(i_soup, href)
                    content = i_soup.find('div', attrs={'class': 'box_info'})
                    if content is None or content.text == '':
                        log.info(f'{href}-----{title}----内容为空----')
                        continue
                    contentwithTag = str(content)
fu_jian_list = content.find_all('a')
for fu_jian in fu_jian_list:
try:
file_name = fu_jian.text
fj_href = fu_jian['href']
except:
continue
if '.doc' in fj_href or '.docx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1676', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num, pub_time)
id_list.append(att_id)
# 将文件服务器的链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content.text,
'contentWithTag': str(contentwithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1676", 'relationName': "广东省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
guang_dong()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 广西
def guang_xi():
num = 0
count = 0
start_time = time.time()
url_all = """
http://gzw.gxzf.gov.cn/wjzx/2023nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2022nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2021nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2020nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2019nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2018nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2017nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2016nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2015nwj/ 3
http://gzw.gxzf.gov.cn/wjzx/2014nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2013nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2012nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2011nwj/ 5
http://gzw.gxzf.gov.cn/wjzx/wjhbdej2008n2010n/ 1
http://gzw.gxzf.gov.cn/wjzx/wjhbdyj2004n2007n/ 1
http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2013n/ 1
http://gzw.gxzf.gov.cn/wjzx/jshgfxwj2004n2015n/ 1
http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2015n/ 1
"""
url_list = url_all.split('\n')
for url_info in url_list[1:-1]:
url_info = url_info.strip()
url_1 = url_info.split(' ')[0].strip()
for page in range(0, 1):
if page == 0:
url = f'{url_1}index.shtml'
else:
url = f'{url_1}index_{page}.shtml'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('#morelist li').items()
for doc_item in doc_items:
id_list = []
title = doc_item('a').attr('title').strip()
href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).content
doc_href = pq(href_text)
pub_result = doc_href('.article-inf-left').text()
pub_hao_result = doc_href('.article-h2').text()
if '﹝' in pub_hao_result and '﹞' in pub_hao_result:
pub_hao = pub_hao_result.replace('﹝', '〔').replace('﹞', '〕')
elif '〔' in pub_hao_result and '〕' in pub_hao_result:
pub_hao = pub_hao_result
else:
pub_hao = ''
pub_time = pub_result.split('来源:')[0].strip() + ':00'
try:
pub_source = pub_result.split('来源:')[1].split('作者:')[0].strip()
except:
pub_source = pub_result.split('来源:')[1].strip()
contentWithTag = doc_href('.article-con div:first-child')
# 相对路径转化为绝对路径
contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
contentWithTag = baseTool.paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text.strip()
if '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1692', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num,
pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1692", 'relationName': "广西壮族自治区国资委", 'labelMark': "policy"}],
'origin': '',
'organ': pub_source,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
                        num = num + 1
                        count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
guang_xi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 贵州
def gui_zhou():
"""
http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ 11
http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/ 1
"""
num = 0
count = 0
start_time = time.time()
for page in range(0, 11):
if page == 0:
url = 'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist.html'
else:
url = f'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist_{page}.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('.c').items()
for doc_item in doc_items:
id_list = []
href = doc_item('a').attr('href')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
# href = 'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/hyzcfg/202110/t20211026_71215292.html'
title = doc_item('a').text().strip()
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
if '404 Not Found' in href_text.text:
continue
doc_href = pq(href_text.content)
# 发文机构
organ = doc_href('#NewsArticleSource').text()
pub_result = doc_href('.xxgk_xl_top').text().replace('var str = ""; var str_1 = "', '').replace(
'"; if (str == "") { document.write(str_1); } else { document.write(str); }', '')
pub_time = pub_result.split('发文日期: ')[1].split('文号:')[0].strip().replace('年', '-').replace('月',
'-').replace(
'日', ' ') + ' 00:00:00'
# origin
pub_source = pub_result.split('发布机构:')[1].split('发文日期:')[0].strip()
pub_hao = pub_result.split('文号:')[1].split('是否有效:')[0].strip()
topicClassification = pub_result.split('信息分类:')[1].split('发布机构:')[0].strip()
if pub_source == '无':
pub_source = ''
if pub_hao == '无':
pub_hao = ''
contentWithTag = doc_href('#Zoom').children()
contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
contentWithTag = baseTool.paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text.strip()
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1694', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num, pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1694", 'relationName': "贵州省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
count += 1
num = num + 1
except Exception as e:
pass
except Exception as e:
pass
end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
gui_zhou()
\ No newline at end of file
import json
import os
import time
from random import choice
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 国务院部门文件
def get_content2():
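    # Both helpers below query the gov.cn policy-library search interface; the JSON response carries
    # the total page count in searchVO.totalpage and the result list in searchVO.listVO.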
def getTotalpage(bmfl, headers, session):
ip = baseCore.get_proxy()
pageNo = 1
time.sleep(2)
# 拼接url
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
totalpage = resp_json['searchVO']['totalpage']
return totalpage
def getContentList(bmfl, pageNo, headers, session):
ip = baseCore.get_proxy()
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
# 请求结果为json格式
resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
content_list = resp_json['searchVO']['listVO']
return content_list
session = requests.session()
session.mount('https://', HTTPAdapter(max_retries=3))
session.mount('http://', HTTPAdapter(max_retries=3))
session.keep_alive = False
start_time = time.time()
num = 0
count = 0
result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
'人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
'国家卫生健康委员会',
'退役军人事务部',
'应急管理部', '人民银行', '审计署', '国务院国有资产监督管理委员会', '海关总署', '国家税务总局', '国家市场监督管理总局', '国家金融监督管理总局',
'国家广播电视总局',
'国家体育总局',
'国家统计局', '国家国际发展合作署', '国家医疗保障局', '国家机关事务管理局', '国家标准化管理委员会', '国家新闻出版署', '国家版权局', '国家互联网信息办公室',
'中国科学院',
'中国社会科学院', '中国工程院', '中国气象局', '中国银行保险监督管理委员会', '中国证券监督管理委员会', '国家粮食和物资储备局', '国家能源局', '国家国防科技工业局',
'国家烟草专卖局',
'国家移民管理局', '国家林业和草原局', '国家铁路局', '中国民用航空局', '国家邮政局', '国家文物局', '国家中医药管理局', '国家矿山安全监察局', '国家外汇管理局',
'国家药品监督管理局',
'国家知识产权局', '国家档案局', '国家保密局', '国家密码管理局', '国家宗教事务局', '国务院台湾事务办公室', '国家乡村振兴局', '国家电影局']
for bmfl in result_list:
# try:
# totalpage = getTotalpage(bmfl,headers,session)
# for pageNo in range(1,totalpage+1):
# for pageNo in range(1,6):
pageNo = 1
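        # Only the first results page is fetched per department; the full-pagination loop above is commented out.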
try:
try:
content_list = getContentList(bmfl, pageNo, baseTool.headers, session)
except:
session.close()
content_list = getContentList(bmfl, pageNo, baseTool.headers, session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # 详情页
title = content_dict['title'] # 标题
pub_code = content_dict['pcode'] # 发文字号
try:
pub_time = int(content_dict['pubtime'] / 1000) # 发布时间
pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
except:
pub_time1 = None
try:
p_time = int(content_dict['ptime'] / 1000) # 成文时间
pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
except:
pub_time2 = None
pub_org = content_dict['puborg'] # 发文机关
try:
child_type = content_dict['childtype'] # 主题分类
except:
child_type = ''
# # 判断是否已经爬取过
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
log.info('已采集----------跳过')
time.sleep(1)
continue
try:
                    resp = requests.get(url=href, headers=baseTool.headers, verify=False)
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
soup = baseTool.paserUrl(soup, href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1699', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num, pub_time1)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
except:
log.error(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list, # 附件id
'author': '', # 作者
'content': content, # 正文不带标签
'contentWithTag': str(contentWithTag), # 正文带标签
'createDate': time_now, # 创建时间
'deleteFlag': 0, # 是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],
# 关联标签id 关联标签名称 关联标签标识
'origin': '', # 政策发布机关
'organ': pub_org, # 政策发文机关
'topicClassification': child_type, # 政策文件分类
'issuedNumber': pub_code, # 发文字号
'publishDate': pub_time1, # 发布时间
'writtenDate': pub_time2, # 成文时间
'sid': '1697458829758697473', # 信息源id
'sourceAddress': href, # 原文链接
'summary': '', # 摘要
'title': title # 标题
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
count += 1
num += 1
except:
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
continue
# except:
# log.error(f'{bmfl}...获取页数失败')
# continue
end_time = time.time()
log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
if __name__ == "__main__":
get_content2()
\ No newline at end of file
import os
import re
import time
from pyquery import PyQuery as pq
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 国务院国有资产监督管理委员会-政策发布
def get_content3():
pathType = 'policy/gyzc/'
def getPage():
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
req = requests.get(url, headers=baseTool.headers, verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
# totalpage = re.findall("总页数:(.*)", soup.select('#pag_2603340')[0].text)[0]
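        # The page count is currently hard-coded; the dynamic lookup above is commented out.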
totalpage = '17'
return int(totalpage)
def sendContent(href, headers, title, pub_time, num):
id_list = []
resp_href = requests.request("GET", href, headers=headers, verify=False)
resp_href.encoding = resp_href.apparent_encoding
soup = BeautifulSoup(resp_href.text, 'lxml')
soup = baseTool.paserUrl(soup, href)
doc_href = soup.find('div', class_='zsy_content')
try:
            org_content = doc_href.select('.zsy_cotitle')[0].text
            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
except:
org = ''
try:
contentWithTag = doc_href.find('div', class_='zsy_comain')
except:
return
contentWithTag.select('#qr_container')[0].decompose()
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
contentWithTag.find('div', class_='related').decompose()
contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
try:
p_list = contentWithTag.findAll('p')
pub_hao = ''
for p in p_list:
p = str(p.text)
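                # Heuristic: treat a paragraph as the issued document number if it contains '号'
                # together with one of the bracket pairs 〔〕, [] or 【】.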
                if ('号' in p and '〔' in p and '〕' in p) or ('[' in p and ']' in p and '号' in p) or ('【' in p and '】' in p and '号' in p):
try:
pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
except:
pub_hao = p.strip().lstrip()
break
except:
pub_hao = ''
if len(pub_hao) > 15:
pub_hao = ''
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}----{title}----内容为空----')
return
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1642', file_name)
if retData['state']:
pass
else:
continue
try:
att_id, full_path = baseCore.tableUpdate(retData, '国务院国资委', file_name, num, pub_time)
id_list.append(att_id)
except:
continue
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list, # 附件id
'author': '', # 作者
'content': content, # 正文不带标签
'contentWithTag': str(contentWithTag), # 正文带标签
'createDate': time_now, # 创建时间
'deleteFlag': 0, # 是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
# 关联标签id 关联标签名称 关联标签标识
'origin': '', # 政策发布机关
'organ': org, # 政策发文机关
'topicClassification': '', # 政策文件分类
'issuedNumber': pub_hao, # 发文字号
'publishDate': pub_time, # 发布时间
'writtenDate': None, # 成文时间
'sid': '1697458829758697473', # 信息源id
'sourceAddress': href, # 原文链接
'summary': '', # 摘要
'title': title # 标题
}
# log.info(title)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
def partTwo():
start_time = time.time()
num = 0
count = 0
totalpage = getPage()
for page in range(1, totalpage):
url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
href_resp = requests.request("GET", url, headers=baseTool.headers, verify=False)
resp_text = href_resp.content.decode('UTF-8')
li_list = resp_text.split('<li>')
del (li_list[0])
for li in li_list:
id_list = []
href_ = li.split('<a href="')[1].split('" target=')[0]
title = li.split('title="')[1].split('">')[0]
href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
pub_time = li.split('<span>[')[1].split(']</span>')[0]
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
log.info('已采集----------跳过')
continue
sendContent(href, baseTool.headers, title, pub_time, num)
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
def partOne():
start_time = time.time()
num = 0
count = 0
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
try:
# get请求,需要取消ssl验证
href_resp = requests.request("GET", url, headers=baseTool.headers, verify=False)
resp_text = href_resp.content.decode('UTF-8')
doc_resp = pq(resp_text)
doc_items = doc_resp('.zsy_conlist li').items()
time.sleep(1)
for doc_item in doc_items:
# 获取所需数据
try:
href_ = doc_item('a').attr('href')
if href_ is None:
continue
href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
# 判断是否已经爬取过
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
log.info('已采集----------跳过')
continue
title = doc_item('a').attr('title')
pub_time = doc_item('span').text().replace('[', '').replace(']', '')
except:
continue
sendContent(href, baseTool.headers, title, pub_time, num)
num += 1
count += 1
except:
pass
end_time = time.time()
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
# partOne()
# 增量执行需要注释掉partTwo()
partTwo()
if __name__ == "__main__":
get_content3()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 河北
def he_bei():
num = 0
count = 0
start_time = time.time()
url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
try:
        res = requests.get(url, headers=baseTool.headers)
        # print(res)
        data = res.json()
        # print(data)
        for info in data:
title = info['title']
contentWithTag = info['content']
id = info['id']
href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id)
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
pub_time_ = info['updated']
m = round(pub_time_ / 1000) # 四舍五入取10位时间戳(秒级)
n = time.localtime(m) # 将时间戳转换成时间元祖tuple
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", n)[:10] # 格式化输出时间
origin = ''
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1668', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
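            # Extract an issued document number such as '冀国...号' or '国资...号' from the body text.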
pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
if len(issuedNumber) > 20:
issuedNumber = ''
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1668", 'relationName': "河北省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
he_bei()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 河南
def he_nan():
num = 0
count = 0
pathType = 'policy/henan/'
start_time = time.time()
for page in range(0, 7):
if page == 0:
url = 'http://gzw.henan.gov.cn/xxgk/fdzdgknr/zcfg/index.html'
else:
url = f'http://gzw.henan.gov.cn/xxgk/fdzdgknr/zcfg/index_{page}.html'
try:
            resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
doc_resp = pq(resp_text.content)
doc_items = doc_resp('.mt15.list-box li').items()
for doc_item in doc_items:
title = doc_item('a').text().strip()
href = doc_item('a').attr('href')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
                href_res = requests.get(url=href, headers=baseTool.headers, verify=False)
href_res.encoding = href_res.apparent_encoding
href_text = href_res.text
soup = BeautifulSoup(href_text, 'html.parser')
origin = soup.select('#source')[0].text
publishDate = soup.select('#pubDate')[0].text
contentWithTag = str(soup.select('#content')[0])
# contentWithTag =doc('div[class="information-zt-show"]')
# soup=BeautifulSoup(str(contentWithTag), 'html.parser')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1690', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
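                # Extract an issued document number such as '豫国...号' or '国...号' from the body text.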
pattern = r'(豫国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1690", 'relationName': "河南省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': '',
'topicClassification': '',
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
href_res.close()
resp_text.close()
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
he_nan()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 黑龙江
def hei_long_jiang():
pathType = 'policy/heilongjiang/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
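        # JSON search endpoint of gzw.hlj.gov.cn: _pageSize fixes 10 results per page and the
        # page index is passed via the 'page' query parameter.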
try:
web = requests.get(url=url, headers=baseTool.headers, verify=False)
text = web.json()
rows = text['data']['rows']
try:
for row in range(int(rows)):
result = text['data']['results'][row]
title = result['title']
href = 'http://gzw.hlj.gov.cn' + result['url']
publishDate = result['publishedTimeStr']
list_all = text['data']['results'][row]['domainMetaList'][1]['resultList'][0]
if list_all['name'] == '文号':
pub_hao = list_all['value']
else:
pub_hao = ''
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
contentWithTag = text['data']['results'][row]['contentHtml']
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
href_text = href_text.text
doc_href = BeautifulSoup(href_text, 'html.parser')
origin = doc_href.find(class_='ly')
if origin:
origin = origin.find('b').text
else:
origin = ''
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1687", 'relationName': "江苏省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
hei_long_jiang()
\ No newline at end of file
import os
import time
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 湖北
def hu_bei(chromr_bin=None):
num = 0
count = 0
start_time = time.time()
hrefs = []
url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
chrome_driver = baseTool.driver_path
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.binary_location = baseTool.chromr_bin
driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
driver.get(url)
time.sleep(2)
ul = driver.find_element(By.ID, 'ulList')
li_list = ul.find_elements(By.TAG_NAME, 'li')
time.sleep(1)
for li in li_list:
href = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
hrefs.append(href)
for href in hrefs:
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
driver.get(href)
time.sleep(2)
            dhtml = driver.page_source
            if len(dhtml) < 400:
                driver.get(href)
                time.sleep(2)
                dhtml = driver.page_source
            doc = pq(dhtml)
article = doc('div[class="article"]')
adoc = pq(article)
title = adoc('h2').text()
publishDate = adoc('div[class="info"]>span:nth-child(1)').text()
origin = adoc('div[class="info"]>span:nth-child(3)').text()
organ = ''
topicClassification = adoc('td[bfdi="93"]').text()
issuedNumber = adoc('td[bfdi="101"]').text()
writtenDate = adoc('td[bfdi="98"]').text()
rmtag = adoc('p:contains("附件:")')
rmtag2 = adoc('div[class="hbgov-qrcode-content"]')
rmtag.remove()
rmtag2.remove()
contentWithTag = adoc('div[class="article-box"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1675', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1675", 'relationName': "湖北省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
driver.close()
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
hu_bei()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 湖南
def hu_nan():
num = 0
count = 0
pathType = 'policy/hunan/'
start_time = time.time()
for page in range(1, 7):
if page == 1:
# http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index.html
url = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index.html'
else:
url = f'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index_{page}.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('.table tbody tr').items()
for doc_item in doc_items:
href = 'http://gzw.hunan.gov.cn' + doc_item('a').attr('href')
publishDate = doc_item('td:nth-child(3)').text()
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
# href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html'
try:
res = requests.get(url=href, headers=baseTool.headers, verify=False)
res.encoding = res.apparent_encoding
res_text = res.text
# soup = BeautifulSoup(res_text, 'html.parser')
soup = baseTool.paserUrl(res_text, href)
# pub_result = str(soup.find('div', attrs={'class': 'information-zt-list fn-clear'}).text)
# writtenDate = pub_result.split('发文日期:')[1].split('名称:')[0].strip() + ':00'
# title = pub_result.split('名称:')[1].split('主题分类:')[0].lstrip().strip()
# organ = pub_result.split('发布机构: ')[1].split('if(')[0].lstrip().strip()
doc = pq(str(soup))
organ = doc('div[class="information-zt-list fn-clear"]>ul>li:nth-child(3)').text().replace('发布机构:',
'')
if 'document.write' in organ:
organ = ''
writtenDate = doc('div[class="information-zt-list fn-clear"]>ul>li:nth-child(4)').text().replace(
'发文日期:', '')
title = doc('div[class="information-zt-list fn-clear"]>ul>li:nth-child(5)').text().replace('名称:',
'')
topicClassification = doc(
'div[class="information-zt-list fn-clear"]>ul>li:nth-child(6)').text().replace('主题分类:', '')
contentWithTag = doc('div[class="information-zt-show"]')
soup = BeautifulSoup(str(contentWithTag), 'html.parser')
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1691', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1691", 'relationName': "湖南省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': '',
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
hu_nan()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 江苏
def jiang_su():
num = 0
count = 0
pathType = 'policy/jiangsu/'
start_time = time.time()
pagestart = 1
pageend = 45
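    # dataproxy.jsp pages by record index rather than page number: each POST returns records
    # startrecord..endrecord of column 85683 as XML, one CDATA-wrapped <record> element per
    # list item, which the regex below picks apart.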
for page in range(1, 3):
url = f"http://jsgzw.jiangsu.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={pagestart}&endrecord={pageend}&perpage=15"
pagestart = pageend + 1
pageend = pageend + 45
payload = "col=1&appid=1&webid=39&path=%2F&columnid=85683&sourceContentType=1&unitid=369983&webname=%E6%B1%9F%E8%8B%8F%E7%9C%81%E5%9B%BD%E8%B5%84%E5%A7%94&permissiontype=0"
header = {
'Connection': 'keep-alive',
'Accept': 'application/xml, text/xml, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Origin': 'http://jsgzw.jiangsu.gov.cn',
'Referer': 'http://jsgzw.jiangsu.gov.cn/col/col61490/index.html?uid=247686&pageNum=4',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': 'JSESSIONID=ADB520E83E1FC10429D961634BAD303D; __jsluid_h=02c2c950abb71f547a79da79719246aa; _gscu_210493472=24936291qq5dvl18; _gscbrs_210493472=1; yunsuo_session_verify=60cc00825d4e2dd3dee278a301f60f1e; _gscs_210493472=24936291p77pyu18|pv:3'
}
try:
resp_text = requests.request("POST", url, headers=header, data=payload).text
li_list = re.findall('CDATA\[(.*?)\]\]></record>', str(resp_text))
for li in li_list:
a = BeautifulSoup(li, 'lxml').find('a')
href = 'https://jsgzw.jiangsu.gov.cn/' + a['href']
title = a.text
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
href_text = href_text.text
doc_href = BeautifulSoup(href_text, 'html.parser')
soup = baseTool.paserUrl(href_text, href)
doc = pq(str(soup))
publishDate = doc('div[class="cf tip"]>span:contains(发布日期:)').text().replace('发布日期:', '')
writtenDate = doc('table[class="xlt_table"]>tbody>tr:nth-child(1)>td:nth-child(4)').text()
organ = doc('table[class="xlt_table"]>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('table[class="xlt_table"]>tbody>tr:nth-child(2)>td:nth-child(4)').text()
contentWithTag = doc('div[id="zoom"]')
if len(contentWithTag) < 1:
contentWithTag = doc('div[class="main-txt"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                                file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(苏国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
pub_hao = match_list[0][0]
else:
pub_hao = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1687", 'relationName': "江苏省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': organ,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
jiang_su()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 江西
def jiang_xi():
"""
1-60
61-120
121-164
"""
num = 0
count = 0
pathType = 'policy/jiangxi/'
start_time = time.time()
startrecord = 1
endrecord = 60
for page in range(1, 3):
url = f"http://gzw.jiangxi.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={startrecord}&endrecord={endrecord}&perpage=20"
startrecord = endrecord + 1
endrecord = endrecord + 60
payload = "col=1&webid=175&path=http%3A%2F%2Fgzw.jiangxi.gov.cn%2F&columnid=22977&sourceContentType=1&unitid=402016&webname=%E6%B1%9F%E8%A5%BF%E7%9C%81%E5%9B%BD%E6%9C%89%E8%B5%84%E4%BA%A7%E7%9B%91%E7%9D%A3%E7%AE%A1%E7%90%86%E5%A7%94%E5%91%98%E4%BC%9A&permissiontype=0"
header = {
'Connection': 'keep-alive',
'Accept': 'application/xml, text/xml, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Origin': 'http://gzw.jiangxi.gov.cn',
'Referer': 'http://gzw.jiangxi.gov.cn/col/col22977/index.html?uid=402016&pageNum=9',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': 'JSESSIONID=F601A052571881210819664F5BD38015; JSESSIONID=6E54DB27D82E844B825DD675AE19E399'
}
try:
resp_text = requests.request("POST", url, headers=header, data=payload).text
href_list = re.findall("href='(.*?)'", resp_text)
for href in href_list:
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_res = requests.get(url=href, headers=baseTool.headers, verify=False)
href_res.encoding = href_res.apparent_encoding
href_text = href_res.text
soup = baseTool.paserUrl(href_text, href)
doc = pq(str(soup))
try:
# origin=soup.find(text='信息来源:').text.replace('信息来源:','')
origin = doc('td:contains("信息来源:")').text().replace('信息来源:', '')
except Exception as e:
origin = ''
title = doc('tr[class="biaoti"]>td:nth-child(1)').text().replace('标题:', '')
organ = doc('div[class="xxgk-quote"]>table>tbody>tr:nth-child(1)>td:nth-child(2)').text().replace(
'发文机关:', '')
pub_hao = doc('div[class="xxgk-quote"]>table>tbody>tr:nth-child(1)>td:nth-child(3)').text().replace(
'文号:', '')
topicClassification = doc(
'div[class="xxgk-quote"]>table>tbody>tr:nth-child(2)>td:nth-child(1)').text().replace('主题分类:',
'')
writtenDate = doc(
'div[class="xxgk-quote"]>table>tbody>tr:nth-child(2)>td:nth-child(3)').text().replace('成文日期:',
'')
# pub_result = str(soup.find('div', attrs={'class': 'xxgk-quote'}).text)
# title = pub_result.split('标??????题: ')[1].split('有??效??性: ')[0].lstrip().strip()
# organ = pub_result.split('发文机关:')[1].split('文??????号:')[0].lstrip().strip()
# pub_hao = pub_result.split('文??????号:')[1].split('主题分类: ')[0].lstrip().strip()
# writtenDate = pub_result.split('成文日期:')[1].split('标??????题: ')[0].lstrip().strip()
contentWithTag = doc('div[id="zoom"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1689', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num, writtenDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                                file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
pub_hao = match_list[0][0]
else:
pub_hao = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1689", 'relationName': "江西省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': None,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
jiang_xi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 辽宁
def liao_ning():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = resp_text.apparent_encoding
resp_text = resp_text.text
doc_resp = BeautifulSoup(resp_text, 'html.parser')
doc_items = doc_resp.select(
'#aa25154996104f57858a48e0b1aecca9 > div:nth-of-type(2) > div.tablist-show > div.tab-list-page')[0]
li_list = doc_items.select('li')
for li in li_list:
# print(li)
href = str(li.select('a')[0].get('href'))
if 'http' not in href:
if 'https' not in href:
href = 'https://gzw.ln.gov.cn/' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
href_text = href_text.text
doc_href = baseTool.paserUrl(href_text, href)
doc = pq(str(doc_href))
title = doc('p[class="govxlTText"]').text().strip()
origintag = doc('p[class="govxlTText2"]').text().strip()
origin = origintag.split('文章来源:')[1].split('发布时间:')[0].strip()
publishDate = origintag.split('发布时间:')[1].strip().replace('年', '-').replace('月', '-').replace('日',
'') + ' 00:00:00'
contentWithTag = doc('div[class="TRS_Editor"]')
if len(title) < 1:
                        title = doc('h1[class="title"]').text()
                        issuedNumber = doc('p[class="wjh"]').text()
if len(contentWithTag) < 1:
contentWithTag = doc('div[class="content"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1685', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
if len(issuedNumber) > 20:
issuedNumber = ''
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1685", 'relationName': "辽宁省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
liao_ning()
\ No newline at end of file
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 内蒙古
def nei_meng_gu():
start = time.time()
num = 0
url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
result = soup.find(class_='right_two')
li_list = result.find_all(class_='font14wr')
for a in li_list:
id_list = []
a_text = str(a)
real_href = 'https://gzw.nmg.gov.cn/zfxxgk' + a_text.split('href="..')[-1].split('" target="_blank')[0]
# # 判断是否已经爬取过
# todo:测试用 注释掉判重
is_href = baseTool.db_storage.find_one({'网址': real_href})
if is_href:
num += 1
continue
try:
# 获取所需信息
title = a_text.split('target="_blank">')[-1].split('</a>')[0]
href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False)
href_text.encoding = 'utf-8'
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# todo:将html中的a标签相对路径改为绝对路径
i_soup = baseTool.paserUrl(i_soup, real_href)
i_result = i_soup.find('div', id='d_laiyuan')
time_ = i_result.find_all('span')[0]
time_ = str(time_)
pub_time = time_.split('<span>')[1].split('</span>')[0].replace('发布时间:', '')
                # 来源
                origin = i_result.find_all('span')[1]
                origin = str(origin)
                pub_source = origin.split('<span>')[1].split('</span>')[0].replace('来源:', '')
                origin = pub_source
                # 发文机关
                organ = pub_source
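                # 发文字号 sits in the 8th <td> of the info table; only values that contain 〔〕
                # and the character '内' are kept as the document number.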
fwzh = i_soup.find_all('td')[7]
pub_hao_result = re.findall('〔(.*?)〕', str(fwzh))
if len(pub_hao_result) == 0:
pub_hao = ''
else:
if '内' in str(fwzh):
pub_hao = str(fwzh).split('<td>')[1].split('</td>')[0]
else:
pub_hao = ''
# 成文时间
writtenDate = i_soup.find_all('td')[9].text
topicClassification = i_soup.find_all('td')[3].text
i_content = i_soup.find(class_='d_show')
if i_content:
content = str(i_content)
else:
i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
content = str(i_content)
if i_content.text == '' or i_content.text == 'None':
log.info(f'{real_href}------{title}----内容为空-----')
continue
# todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian))
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re
category = os.path.splitext(fu_jian_href)[1]
                            if category not in title:
                                file_name = title + category
                            else:
                                file_name = title
# print(fu_jian_href)
# todo:附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1669', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', file_name, num, pub_time)
id_list.append(att_id)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': i_content.text,
'contentWithTag': content,
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
except:
pass
except:
pass
end = time.time()
log.info(f'共抓取{num}条数据,共耗时{end - start}')
if __name__ == "__main__":
nei_meng_gu()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 宁夏
def ning_xia():
num = 0
count = 0
pathType = 'policy/ningxia/'
start_time = time.time()
for page in range(0, 3):
if page == 0:
url = 'http://gzw.nx.gov.cn/zcfg/zcwj/gzwwj/index.html'
else:
url = f'http://gzw.nx.gov.cn/zcfg/zcwj/gzwwj/index_{page}.html'
try:
res = requests.get(url=url, headers=baseTool.headers, verify=False)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
li_list = soup.find('div', attrs={'class': 'stdnewslist'}).find_all('li')
for li in li_list:
title = li.find('a').get('title').replace('</p>', '').replace('<p>', '')
href = url.split('index')[0] + li.find('a').get('href').replace('./', '')
publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_res = requests.get(url=href, headers=baseTool.headers, verify=False)
href_res.encoding = href_res.apparent_encoding
href_text = href_res.text
# soup_ = BeautifulSoup(href_text, 'html.parser')
soup_ = baseTool.paserUrl(href_text, href)
pub_result = soup_.find('table', attrs={'class': 'gk-xl-table'}).text.replace(' ', '')
writtenDate = pub_result.split('生成日期')[1].split('发文字号')[0].strip() + ' 00:00:00'
pub_hao = pub_result.split('发文字号')[1].split('公开形式')[0].strip()
organ = pub_result.split('所属机构')[1].split('有效性')[0].strip()
contentWithTag = soup_.find('div', attrs={'class': 'content'}).find('div',
attrs={'class': 'TRS_UEDITOR'})
soup = BeautifulSoup(str(contentWithTag), 'html.parser')
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1697', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1697", 'relationName': "宁夏回族自治区国资委", 'labelMark': "policy"}],
'origin': '',
'organ': organ,
'topicClassification': "",
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
ning_xia()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 山东
def shan_dong():
headers = {
'Cookie': 'COLLCK=2502513302; COLLCK=2493627587',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183'
}
start = time.time()
num = 0
count = 0
url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
for url in url_list:
try:
resp_text = requests.get(url=url, headers=headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
result = soup.find_all(class_='pagedContent')
for li in result:
href = li.find('a')['href']
                is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
try:
source = i_soup.find_all('tbody')[0]
title = str(source).split('标  题:</strong>')[1].split('</td>')[0].replace('\r', '').replace('\n',
'')
pub_time = re.findall('<strong>发布日期:</strong>(.*?)</td>', str(source))
pub_time = ''.join(pub_time)
pub_hao = re.findall('<strong>发文字号:</strong>(.*?)</td>', str(source))
pub_hao = ''.join(pub_hao)
pub_source = re.findall('<strong>发文机关:</strong>(.*?)</td>', str(source))
pub_source = ''.join(pub_source)
writtenDate = re.findall('<strong>成文日期:</strong>(.*?)</td>', str(source))
writtenDate = ''.join(writtenDate)
# print(pub_time,pub_source,pub_hao)
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
                        if pub_hao == '无':
                            p_list = contentwithtag.find_all('p')
                            for p in p_list:
                                p_text = p.text
                                if '〔' in p_text and '〕' in p_text:
pub_hao = p_text
break
else:
continue
                    except:
                        pub_source = ''
                        pub_hao = ''
                        writtenDate = None
                        try:
                            title = str(i_soup.find('div', attrs={'class': 'wz_title'}).text).strip().lstrip()
                        except:
                            title = ''
source = i_soup.find('div', attrs={'id': 'nr'})
h1_list = source.find_all('h1')
for h1 in h1_list:
title = title + str(h1.text)
title.strip().lstrip()
pub_time = None
span_list = source.find_all('span')
i = 0
for span in span_list:
span_text = span.text
                            if ('〔' in span_text and '〕' in span_text) or '鲁国' in span_text or '国办发' in span_text:
pub_hao = str(span_text)
if '号' not in pub_hao:
pub_hao = pub_hao + str(span_list[i + 1].text)
break
i = i + 1
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentwithtag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1674", 'relationName': "山东省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': pub_source,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
shan_dong()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from lxml import etree
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 山西
def shan_xi():
num = 0
count = 0
start_time = time.time()
for page in range(1, 7):
if page == 1:
url = 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/'
else:
url = f'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/index_{page - 1}.shtml'
try:
            res = requests.get(url, headers=baseTool.headers)
page_text = res.text.encode("ISO-8859-1")
page_text = page_text.decode("utf-8")
tree = etree.HTML(page_text)
tr_list = tree.xpath(
'/html/body/table[3]/tbody/tr/td[2]/table/tbody/tr[3]/td/table[2]/tbody/tr[3]/td/form/table/tbody/tr')
for tr in tr_list:
href = tr.xpath('./td[1]/a/@href')
if href == []:
continue
href = href[0].replace('../../', 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/').replace('./',
'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/')
title = tr.xpath('./td[1]/a/span//text()')[0]
publishDate_ = str(tr.xpath('./td[2]/span/text()')[0]).strip()
time_obj = datetime.datetime.strptime(publishDate_, "%Y/%m/%d")
# 将datetime对象格式化为年月日的字符串
publishDate = time_obj.strftime("%Y-%m-%d")
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
if ".pdf" in href:
content = ''
publishDate = None
origin = ''
fu_jian_soup = [href]
contentWithTag = ''
else:
                        res = requests.get(href, headers=baseTool.headers)
page_text = res.text.encode("ISO-8859-1")
page_text = page_text.decode("utf-8")
page = baseTool.paserUrl(page_text, href)
doc = pq(str(page))
title = doc('title').text()
origin = ''
contentWithTag = doc('div[id="vsb_content"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
                        fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1684', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(晋国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
if len(issuedNumber) > 20:
issuedNumber = ''
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1684", 'relationName': "山西省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
except Exception as e:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
shan_xi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 上海
def shang_hai():
start = time.time()
num = 0
count = 0
for page in range(1, 7):
if page == 1:
url = 'https://www.gzw.sh.gov.cn/shgzw_flfg_zcfg_gfxwj/index.html'
else:
url = f'https://www.gzw.sh.gov.cn/shgzw_flfg_zcfg_gfxwj/index_{page}.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).text
doc_resp = pq(resp_text)
doc_items = doc_resp('.gqzc_list_right ul li').items()
for doc_item in doc_items:
id_list = []
title = doc_item('a').attr('title').strip()
pub_time = doc_item('span').text() + ' 00:00:00'
href = doc_item('a').attr('href')
if 'https:/' in href:
pass
else:
href = 'https://www.gzw.sh.gov.cn' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).text
doc_href = pq(href_text)
doc_href_ = BeautifulSoup(href_text, 'html.parser')
# 相对路径转化为绝对路径
doc_href_ = baseTool.paserUrl(doc_href_, href)
info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
pub_source = info_list[1].find('b').text.split('信息来源:')[1]
content = doc_href_.find('div', attrs={'class': 'detail_03'})
if content == '' or content == 'None':
log.info(f'{href}-----{title}----内容为空')
continue
# 将文章中的附件字段删去
pattern = r'\d+\.'
for p in content.find_all('p')[-22:]:
p_text = p.text
if len(p_text) > 50:
continue
matches = re.findall(pattern, p_text)
for k in matches:
if k in p_text:
p.extract()
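                    # 发文字号: try the <meta> tags inside .detail_03 first, then its <ul>, then the
                    # first <p>, extracting the '沪…号' fragment where present; variant brackets are
                    # normalised and validated below.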
try:
pub_result = doc_href('.detail_03')
pub_result('meta')
pub_result = '沪' + str(pub_result('meta')).split('沪')[1].split('号')[0].strip() + '号'
except:
try:
pub_result = str(
'沪' + doc_href('.detail_03 ul').text().split('沪')[1].split('号')[0].strip() + '号')
except:
pub_result = str(doc_href('.detail_03 p').text().split('号')[0].strip() + '号')
if '﹝' in pub_result and '﹞' in pub_result:
pub_hao = pub_result.replace('﹝', '〔').replace('﹞', '〕')
elif '〔' in pub_result and '〕' in pub_result:
pub_hao = pub_result
elif '【' in pub_result and '】' in pub_result:
pub_hao = pub_result
elif '[' in pub_result and ']' in pub_result:
pub_hao = pub_result
else:
pub_hao = ''
if len(pub_hao) > 20:
pub_hao = ''
# todo:找到附件标签,正文内容带有附件
fu_jian_soup = content.find('ul')
if fu_jian_soup:
li_list = fu_jian_soup.find_all('a')
else:
li_list = []
for a in li_list:
fu_jian_href = a['href']
file_name = a.text
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(fu_jian_href, '1671', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '上海市国资委', file_name, num, pub_time)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            a['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
else:
continue
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content.text,
'contentWithTag': str(content),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1671", 'relationName': "上海市国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
shang_hai()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 陕西
def shanxi():
num = 0
count = 0
start_time = time.time()
url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
# url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
try:
res = requests.get(url=url, headers=baseTool.headers)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
# soup = paserUrl(res_text, 'https://sxgz.shaanxi.gov.cn')
# print(soup)
result = soup.find(class_='scroll_cont')
li_list = result.find_all('li')
for li in li_list:
href = li.find('a')['href']
if 'http' in str(href):
href = href
else:
href = 'https://sxgz.shaanxi.gov.cn/' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
res_href = requests.get(url=href, headers=baseTool.headers)
res_href.encoding = res_href.apparent_encoding
res_text = res_href.text
# i_soup = BeautifulSoup(res_text, 'html.parser')
i_soup = baseTool.paserUrl(res_text, href)
title = i_soup.find(class_='m-gk-title').text
i_result = i_soup.find(class_='ftitle')
span_list = i_result.find_all('span')
origin = str(span_list[0]).split('<span>')[1].split('</span>')[0]
publishDate = str(span_list[2]).split('<span>')[1].split('</span>')[0]
t = time.strptime(publishDate, "%Y/%m/%d %H:%M:%S")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
contentWithTag = i_soup.find(class_='scroll_cont')
soup = BeautifulSoup(str(contentWithTag), 'html.parser')
div_tag = soup.find(id='ztl')
div_tag.extract()
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1680', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                        file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1680", 'relationName': "陕西省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
res_href.close()
except Exception as e:
pass
res.close()
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
shanxi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 四川
def si_chuan():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
if page == 1:
url = 'http://gzw.sc.gov.cn/scsgzw/CU2304010701/cu_xxgk_xzgfxwj.shtml'
else:
url = 'http://gzw.sc.gov.cn/scsgzw/CU2304010701/cu_xxgk_xzgfxwj_2.shtml'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).text
doc_resp = pq(resp_text)
doc_items_ = doc_resp('.biaobody')
doc_items = doc_items_('li').items()
for doc_item in doc_items:
id_list = []
# print(doc_item)
pub_time = doc_item('.lie4').text().strip() + ' 00:00:00'
pub_hao = doc_item('.lie3').text().strip()
title = doc_item('a').attr('title')
href = doc_item('a').attr('href')
if 'http:' not in href:
href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href')
# href = 'http://gzw.sc.gov.cn/scsgzw/CU2304010701/2018/1/22/9c5db691e09f4efdafce41763a0d7e03.shtml'
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).text
doc_href = pq(href_text)
# title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '')
# content = str(doc_href('#scrollBox').children())
# 将doc_href转化为BeautifulSoup
doc_href = BeautifulSoup(str(doc_href), 'html.parser')
# 相对路径转化为绝对路径
doc_href = baseTool.paserUrl(doc_href, href)
contentWithTag = doc_href.find('div', id='scrollBox')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = doc_href.find_all('a')
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text.strip()
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 对附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1678', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num, pub_time)
id_list.append(att_id)
                            fu_jian['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# fu_jian_href_list.append(fu_jian_href)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1678", 'relationName': "四川省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
count += 1
num = num + 1
except Exception as e:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
si_chuan()
\ No newline at end of file
import datetime
import time
from comData.policylaw.an_hui import an_hui
from comData.policylaw.bei_jing import bei_jing
from comData.policylaw.chong_qing import chong_qing
from comData.policylaw.fu_jian import fu_jian
from comData.policylaw.guang_dong import guang_dong
from comData.policylaw.guang_xi import guang_xi
from comData.policylaw.gui_zhou import gui_zhou
from comData.policylaw.gwyfile import get_content1
from comData.policylaw.gwyparts import get_content2
from comData.policylaw.gwysasac import get_content3
from comData.policylaw.hai_nan import hai_nan
from comData.policylaw.he_nan import he_nan
from comData.policylaw.hei_long_jiang import hei_long_jiang
from comData.policylaw.ji_lin import ji_lin
from comData.policylaw.jiang_su import jiang_su
from comData.policylaw.jiang_xi import jiang_xi
from comData.policylaw.liao_ning import liao_ning
from comData.policylaw.nei_meng_gu import nei_meng_gu
from comData.policylaw.shan_dong import shan_dong
from comData.policylaw.shan_xi import shan_xi
from comData.policylaw.shang_hai import shang_hai
from comData.policylaw.si_chuan import si_chuan
from comData.policylaw.tian_jin import tian_jin
from comData.policylaw.xin_jiang import xin_jiang
from comData.policylaw.yun_nan import yun_nan
from comData.policylaw.zhe_jiang import zhe_jiang
# the modules below are assumed to follow the same comData.policylaw.<name> convention
from comData.policylaw.hu_nan import hu_nan
from comData.policylaw.gan_su import gan_su
from comData.policylaw.ning_xia import ning_xia
from comData.policylaw.xi_zang import xi_zang
from comData.policylaw.shanxi import shanxi
from comData.policylaw.qing_hai import qing_hai
from comData.policylaw.he_bei import he_bei
if __name__ == "__main__":
get_content1()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
si_chuan()
guang_xi()
gui_zhou()
yun_nan()
chong_qing()
tian_jin()
xin_jiang()
shan_xi()
liao_ning()
hei_long_jiang()
jiang_su()
an_hui()
jiang_xi()
he_nan()
hu_nan()
gan_su()
ning_xia()
xi_zang()
shanxi()
qing_hai()
he_bei()
qing_hai()
get_content2()
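    # compute the seconds remaining until the next local midnight and sleep until then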
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 西藏
def xi_zang():
    start_time = time.time()
    pathType = 'policy/xizang/'
    num = 0
    count = 0
    url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml',
                'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
    for url in url_list:
try:
res = requests.get(url=url, headers=baseTool.headers)
res.encoding = res.apparent_encoding
res_text = res.text
# soup = BeautifulSoup(res_text, 'html.parser')
soup = baseTool.paserUrl(res_text, url)
result = soup.find('ul', class_='list')
li_list = result.find_all('li')
for li in li_list:
href = li.find('a')['href']
title = li.find('a').text
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
res_href = requests.get(url=href, headers=baseTool.headers)
res_href.encoding = res_href.apparent_encoding
res_href = res_href.text
# i_soup = BeautifulSoup(res_href, 'html.parser')
i_soup = baseTool.paserUrl(res_href, href)
i_result = i_soup.find(class_='inform')
div_list = i_result.find_all('div')
publishDate = str(div_list[0]).split('<div>')[1].split('</div>')[0].replace('发布时间:', '')
origin = str(div_list[1]).split('<div>')[1].split('</div>')[0].replace('来源:', '')
contentWithTag = str(i_soup.find(id='NewsContent'))
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1695', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1695", 'relationName': "西藏自治区国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
xi_zang()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 浙江
def zhe_jiang():
start = time.time()
num = 0
count = 0
url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
try:
        res = requests.get(url, headers=baseTool.headers).content
soup = BeautifulSoup(res, 'html.parser')
# print(soup)
# recordset = soup.find('recordset')
list_li = re.findall('CDATA\[\\n(.*?)\]\]></record>', str(soup))
# print(list_li)
for li in list_li:
fj_href_list = []
li = BeautifulSoup(li, 'lxml')
href = li.find('a')['href']
pub_time = li.find('a').find('span').text
title = li.find('a').text.replace(pub_time, '').strip()
# log.info(title)
if 'http' in href:
href = href
else:
href = 'http://gzw.zj.gov.cn/' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# 将相对路径转化为绝对路径
i_soup = baseTool.paserUrl(i_soup, href)
# g_xxgk_table cf
i_info = i_soup.find_all(class_='g_xxgk_td')
if len(i_info) != 0:
try:
pub_source = str(i_info[4]).split('"g_xxgk_td">')[1].split('</div>')[0]
# pub_time = str(i_info[5]).split('"g_xxgk_td">')[1].split('</div>')[0]
pub_hao = str(i_info[2]).split('"g_xxgk_td">')[1].split('</div>')[0]
content = i_soup.find(class_='g_content').text
contentWithTag = str(i_soup.find(class_='g_content'))
except:
# pub_source = str(i_info[3])
# print(pub_source)
pub_source = str(i_info[2]).split('"g_xxgk_td">')[1].split('</div>')[0]
# pub_time = str(i_info[3]).split('"g_xxgk_td">')[1].split('</div>')[0]
pub_hao = ''
content = i_soup.find(class_='g_content').text
contentWithTag = str(i_soup.find(class_='g_content'))
else:
try:
source = i_soup.find('span', class_='rich_media_meta rich_media_meta_nickname')
pub_source = source.find('a').text
time_ = i_soup.find('em', id='publish_time')
pub_time = time_.text
pub_hao = ''
content = i_soup.find(
class_='zh_CN wx_wap_page wx_wap_desktop_fontsize_2 mm_appmsg comment_feature discuss_tab appmsg_skin_default appmsg_style_default pages_skin_pc not_in_mm').text
contentWithTag = str(i_soup.find(
class_='zh_CN wx_wap_page wx_wap_desktop_fontsize_2 mm_appmsg comment_feature discuss_tab appmsg_skin_default appmsg_style_default pages_skin_pc not_in_mm'))
except:
try:
source = i_soup.find_all(class_='ant-space-item')
# pub_time = str(source[1]).split('<span>')[1].split('</span>')[0]
pub_source = str(source[0]).split('<span>')[1].split('</span>')[0].replace('来源:', '')
pub_hao = ''
content = i_soup.find(class_='index_wrapper__L_zqV').text
contentWithTag = str(i_soup.find(class_='index_wrapper__L_zqV'))
except:
source = i_soup.find('div', class_='zsy_cotitle').find('p').text
pub_source = source.split('文章来源:')[1].split('发布时间:')[0]
pub_hao = ''
                            content = i_soup.find('div', class_='zsy_comain').text.replace('扫一扫在手机打开当前页', '').strip()
contentWithTag = str(i_soup.find('div', class_='zsy_comain')).replace('扫一扫在手机打开当前页',
'').strip()
# fujian_list = i_soup.find(class_='related').find_all('li')
# for fujian in fujian_list:
# # print(fujian)
# fujian_href = 'http://www.sasac.gov.cn/' + str(fujian.find('a')['href']).replace('../', '')
# fj_href_list.append(fujian_href)
# print(fj_href_list)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
if content == '' or content == 'None':
log.info(f'{href}-----{title}----内容为空')
continue
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1672", 'relationName': "浙江省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': pub_source,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
zhe_jiang()
\ No newline at end of file