Commit db64a87a Author: 薛凌堃

江苏省人民政府

Parent 78a94cdb
 import os
@@ -8,11 +8,18 @@ import requests
 from bs4 import BeautifulSoup
 from selenium.webdriver.common.by import By
-from base import BaseCore
+import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+policy = Policy()
+topic = 'policy'
+webname = '江苏省人民政府'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'Content-Type': 'application/x-www-form-urlencoded',
@@ -35,9 +42,8 @@ def getFjContent(url):
     return req.content
-def getContentA(url, num, publishDate, title):
-    fjhref_list = ''
-    fjtitle_list = ''
+def getContentA(url, num, publishDate, title, origin, summary):
+    id_list = []
     soup = getSoup(url)
     organ = soup.find('div', class_='sp_time').text.split('来源:')[1].split('字体')[0].lstrip().strip()
     contentWithTag = soup.find('div', attrs={'id': 'zoom'})
@@ -60,31 +66,56 @@ def getContentA(url, num, publishDate, title):
             fj_href = img.get('src')
             try:
                 fj_href = 'http://www.jiangsu.gov.cn' + fj_href
-                fjhref_list += fj_href + '\n'
                 fj_title = img.get('title').lstrip().strip()
-                fj_title = f'{num}-{publishDate}-{fj_title}'
-                fjtitle_list += fj_title + '\n'
             except:
                 if 'img/png' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
+                    fj_title = f'{title}-{num_}.png'
                 elif 'img/jpg' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
+                    fj_title = f'{title}-{num_}.jpg'
                 num_ += 1
-            fjcontent = getFjContent(fj_href)
-            file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+            if att_id:
+                id_list.append(att_id)
+                img['href'] = full_path
+            else:
+                pass
         except:
             pass
     content = contentWithTag.text
-    return organ, content, fjtitle_list, fjhref_list
+    contentWithTag_str = str(contentWithTag)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': url,
+        'writtenDate': None,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': '',
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729042894974537730',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, url)
+        log.info(f'采集成功--{title}--{url}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    return
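
Reviewer note: the new attachment flow above hinges on policy.attuributefile returning a pair (att_id, full_path). The sketch below is only the contract this diff appears to assume for that helper, not the actual reits.Policy code; every name and path shape in it is illustrative.

import hashlib
import requests

# Hypothetical sketch of the assumed Policy.attuributefile contract:
# fetch the attachment, persist it, and hand back an id plus the stored path.
def attuributefile(fj_title, fj_href, num, publishDate):
    # num is accepted to match the call sites above but unused in this sketch
    resp = requests.get(fj_href, timeout=30)
    if resp.status_code != 200:
        return '', ''  # falsy att_id tells the caller to skip this attachment
    att_id = hashlib.md5(resp.content).hexdigest()  # stand-in for a real attachment id
    full_path = f'/attachment/{publishDate}/{att_id}/{fj_title}'  # assumed path shape
    return att_id, full_path

Under that contract, the img['href'] = full_path rewrite makes contentWithTag point at the stored copy instead of the jiangsu.gov.cn original.
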
-def getContentB(url, num, publishDate, title):
-    fjhref_list = ''
-    fjtitle_list = ''
+def getContentB(url, num, publishDate, title, origin, summary):
+    id_list = []
     soup = getSoup(url)
     info = soup.find('table', class_='xxgk_table').text.replace(' ', '')
     organ = info.split('发布机构:')[1].split('发文日期')[0].lstrip().strip()
@@ -110,61 +141,88 @@ def getContentB(url, num, publishDate, title):
             fj_href = img.get('src')
             try:
                 fj_title = img.get('title').lstrip().strip()
-                fj_title = f'{num}-{publishDate}-{fj_title}'
-                fjtitle_list += fj_title + '\n'
                 fj_href = 'http://www.jiangsu.gov.cn' + fj_href
-                fjhref_list += fj_href + '\n'
-                fjcontent = getFjContent(fj_href)
-                file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
-                with open(file, 'wb') as f:
-                    f.write(fjcontent)
-                log.info(f'{fj_title}===附件下载成功')
             except:
                 if 'image/png' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
+                    fj_title = f'{title}-{num_}.png'
                 elif 'image/jpg' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
+                    fj_title = f'{title}-{num_}.jpg'
                 num_ += 1
-                fjtitle_list += fj_title + '\n'
+            try:
+                att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+            except:
+                att_id = ''
+            if att_id:
+                id_list.append(att_id)
+                img['href'] = full_path
+            else:
+                pass
     content = contentWithTag.text.lstrip().strip()
-    return organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list
+    contentWithTag_str = str(contentWithTag)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': url,
+        'writtenDate': writtenDate,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': pub_hao,
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729042894974537730',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, url)
+        log.info(f'采集成功--{title}--{url}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    return
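
getContentA and getContentB now end with an identical publish step, which could be factored into one helper. A condensed sketch using the module's own baseCore, topic, webname, and log (the error log line is an addition for illustration; everything else mirrors the diff):

def publish(dic_info, id_list, url, title):
    # Send downstream first; only mark the URL as seen once the send succeeds,
    # so a failed item is retried on the next run instead of silently skipped.
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, url)
        log.info(f'采集成功--{title}--{url}')
    except Exception as e:
        log.error(f'发送失败--{title}--{url}--{e}')  # assumed: failures are worth logging
        for att_id in id_list:
            baseCore.deliteATT(att_id)  # roll back attachments already uploaded
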
 def doJob():
-    if not os.path.exists('./相关政策/江苏省人民政府/政策文件'):
-        os.makedirs('./相关政策/江苏省人民政府/政策文件')
     pattern = r"\d{4}-\d{2}-\d{2}"
     url = 'http://www.jiangsu.gov.cn/jsearchfront/search.do?websiteid=320000000100000&searchid=12&pg=&p=1&tpl=38&serviceType=&cateid=27&q=REITs&pq=&oq=&eq=&pos=&sortType=0&begin=&end='
-    driver = baseCore.buildDriver()
+    # driver = baseCore.buildDriver()
+    driver = policy.createDriver()
     driver.get(url)
     time.sleep(5)
     div_list = driver.find_elements(By.CLASS_NAME, 'news-result')
     num = 1
-    data_list = []
     for div in div_list:
+        id_list = []
         title = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME, 'a').get_attribute('title').lstrip().strip()
         href = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME, 'a').get_attribute('href')
+        # Dedupe on the article link
+        is_member = baseCore.r.sismember('REITs::' + webname, href)
+        if is_member:
+            continue
         type = div.find_element(By.CLASS_NAME, 'biaoqian').text.lstrip().strip()
         summary = div.find_element(By.CLASS_NAME, 'jcse-news-abs-content').text.lstrip().strip()
         dateInfo = div.find_element(By.CLASS_NAME, 'jcse-news-date').text
         publishDate = re.findall(pattern, dateInfo)[0]
         origin = dateInfo.replace(publishDate, '').lstrip().strip()
         if type == '政务公开':
-            organ, content, fjtitle_list, fjhref_list = getContentA(href, num, publishDate, title)
-            writtenDate = ''
-            pub_hao = ''
+            getContentA(href, num, publishDate, title, origin, summary)
         else:
-            organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list = getContentB(href, num, publishDate, title)
-        data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
-                fjhref_list]
-        data_list.append(data)
-        log.info(f'{title}===采集成功')
+            getContentB(href, num, publishDate, title, origin, summary)
         num += 1
         time.sleep(5)
     driver.close()
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./江苏省人民政府政策文件.xlsx', index=False)
 if __name__ == '__main__':
......
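
One consequence of the new dedup scheme: the Redis set 'REITs::' + webname is read at the top of the doJob loop but written only after a successful Kafka send, so an item that fails mid-flight is picked up again on the next run. A minimal sketch of the round trip, assuming baseCore.r is a standard redis-py client:

def already_collected(r, href):
    # Read side, checked in the doJob loop before fetching the detail page.
    return r.sismember('REITs::' + webname, href)

def mark_collected(r, href):
    # Write side, called only after baseCore.sendkafka() has succeeded.
    r.sadd('REITs::' + webname, href)
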