Commit 1eda3e0e Author: 薛凌堃

Attachment is not a file

Parent 1d2e0f39
 import os
@@ -5,11 +5,17 @@ import numpy as np
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
-import BaseCore
+from base import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+
+policy = Policy()
+
+topic = 'policy'
+webname = '吉林市人民政府'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
@@ -42,14 +48,14 @@ def getFjContent(url):
     return req.content

-def getData(num, title, href, origin, publishDate, summary):
-    writtenDate = ''
+def getData(num, title, url, origin, publishDate, summary):
+    id_list = []
+    writtenDate = None
     pub_hao = ''
     organ = ''
-    fjhref_list = ''
-    fjtitle_list = ''
-    ip = baseCore.get_proxy()
-    req = requests.get(href, headers=headers, proxies=ip)
+    # ip = baseCore.get_proxy()
+    req = requests.get(url, headers=headers)#, proxies=ip)
     req.encoding = req.apparent_encoding
     soup = BeautifulSoup(req.text, 'html.parser')
     try:
@@ -108,19 +114,17 @@ def getData(num, title, href, origin, publishDate, summary):
             if '.html' in a.get('href') or '.shtml' in a.get('href') or '.htm' in a.get('href'):
                 continue
             href = a.get('href')
-            fjhref_list += href + '\n'
             category = os.path.splitext(href)[1]
-            fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
+            fj_title = a.text.lstrip().strip()
             if '<' in fj_title or '>' in fj_title:
                 fj_title = fj_title.replace('<', '').replace('>', '')
             if category not in fj_title:
                 fj_title = fj_title + category
-            fjtitle_list += fj_title + '\n'
-            fjcontent = getFjContent(href)
-            file = f'./相关政策/内蒙古自治区人民政府/政策文件/{fj_title}'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
+            if att_id:
+                id_list.append(att_id)
+                a['href'] = full_path
     except:
         pass
@@ -129,32 +133,59 @@ def getData(num, title, href, origin, publishDate, summary):
             if '.html' in a.get('href') or '.shtml' in a.get('href') or '.htm' in a.get('href'):
                 continue
             href = a.get('href')
-            fjhref_list += href + '\n'
             category = os.path.splitext(href)[1]
-            fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
+            fj_title = a.text.lstrip().strip()
             if '<' in fj_title or '>' in fj_title:
                 fj_title = fj_title.replace('<', '').replace('>', '')
             if category not in fj_title:
                 fj_title = fj_title + category
-            fjtitle_list += fj_title + '\n'
-            fjcontent = getFjContent(href)
-            file = f'./相关政策/内蒙古自治区人民政府/政策文件/{fj_title}'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
+            if att_id:
+                id_list.append(att_id)
+                a['href'] = full_path
+            else:
+                return
     except:
         pass
-    content = contentWithTag.text.lstrip().strip()
-    data_ = [num, title, writtenDate, origin, href, publishDate, organ, pub_hao, summary, content, fjtitle_list,
-             fjhref_list]
-    return data_
+    try:
+        content = contentWithTag.text.lstrip().strip()
+    except:
+        log.info(url)
+        return
+    contentWithTag_str = str(contentWithTag)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': url,
+        'writtenDate': writtenDate,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': pub_hao,
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729042375596158978',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, url)
+        log.info(f'采集成功--{title}--{url}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    return

 def doJob():
-    if not os.path.exists('./相关政策/吉林省人民政府/政策文件'):
-        os.makedirs('./相关政策/吉林省人民政府/政策文件')
-    data_list = []
     num = 1
     url = 'https://intellsearch.jl.gov.cn/api/data/list'
     total = getTotal(url)
@@ -164,17 +195,17 @@ def doJob():
             title = data_['title']
             title = BeautifulSoup(title, 'lxml').find('p').text.lstrip().strip()
             href = data_['url']
+            # 根据链接判重
+            is_member = baseCore.r.sismember('REITs::' + webname, href)
+            if is_member:
+                continue
             origin = data_['websiteName']
             publishDate = data_['pubtime'].replace('/', '-')
             summary = data_['content']
             summary = BeautifulSoup(summary, 'lxml').find('p').text.lstrip().strip()
             data = getData(num, title, href, origin, publishDate, summary)
-            data_list.append(data)
+            # data_list.append(data)
             num += 1
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./吉林省人民政府政策文件.xlsx', index=False)
 if __name__ == '__main__':
     doJob()
......
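For readers unfamiliar with the new flow in getData/doJob, the dedup-and-publish pattern can be sketched in isolation. This is a minimal sketch, assuming a local Redis instance and Kafka broker and using the redis and kafka-python clients directly; the helper name publish_article and the delete_attachment callback are illustrative stand-ins, not BaseCore's actual implementation.

import json
import redis
from kafka import KafkaProducer

# Assumed connection settings -- not taken from the repository.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
producer = KafkaProducer(
    bootstrap_servers='127.0.0.1:9092',
    value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'))

def publish_article(dedup_key, url, dic_info, topic, delete_attachment):
    """Skip already-seen URLs, send the record to Kafka, and roll back
    uploaded attachments if the send fails (mirrors the pattern above)."""
    if r.sismember(dedup_key, url):          # same check as baseCore.r.sismember(...)
        return False
    try:
        producer.send(topic, dic_info).get(timeout=10)
        r.sadd(dedup_key, url)               # mark as crawled only after a successful send
        return True
    except Exception:
        for att_id in dic_info.get('attachmentIds', []):
            delete_attachment(att_id)        # compensate, like baseCore.deliteATT(att_id)
        return False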
 import os
@@ -125,6 +125,9 @@ def getContent(num, title, pub_time, origin, organ, url, pub_hao, summary):
             if att_id:
                 id_list.append(att_id)
                 a['href'] = full_path
+            else:
+                log.info(f'附件解析失败==={url}')
+                return
     except:
         pass
     # try:
......
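The second file applies the same rule when an attachment cannot be registered: log the failure and abandon the article rather than publish it with a broken link. Below is a minimal, self-contained sketch of that all-or-nothing pattern; the names collect_attachments and register are illustrative and not part of the project.

from typing import Callable, Optional

def collect_attachments(anchors: list, register: Callable, num: int,
                        publish_date: str, log) -> Optional[list]:
    """Register every attachment link for one article; give up on the whole
    article as soon as one link cannot be stored (the early return added above)."""
    id_list = []
    for a in anchors:                        # each anchor: {'title': ..., 'href': ...}
        att_id, full_path = register(a['title'], a['href'], num, publish_date)
        if not att_id:
            log(f"attachment could not be parsed: {a['href']}")
            return None                      # caller skips the article entirely
        id_list.append(att_id)
        a['href'] = full_path                # rewrite the link to the stored copy
    return id_list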
 import os
@@ -113,7 +113,11 @@ class Policy():
             else:
                 return '', ''
             att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
-            return att_id,full_path
+            return att_id, full_path
+        else:
+            log.info('不是文件格式')
+            return '', ''

     def downloadfile(self,file_href,path):
......
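The reits.py change adds the branch that gives the commit its title: when a linked attachment is not a recognizable file, attuributefile now logs '不是文件格式' and returns empty values so callers can skip or drop the article. The guard itself is not shown in this diff; the sketch below is one plausible version based on an extension whitelist, with every name and the extension list assumed rather than taken from the repository.

import os

# Hypothetical extension whitelist -- assumed, not taken from reits.py.
FILE_CATEGORIES = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.wps',
                   '.ppt', '.pptx', '.zip', '.rar', '.txt')

def attribute_file_sketch(file_name, file_href, num, publish_date):
    """Return ('', '') for links that are not document files; otherwise return
    placeholder (att_id, full_path) values standing in for the stored file."""
    category = os.path.splitext(file_href)[1].lower()
    if category not in FILE_CATEGORIES:
        # Mirrors the new else-branch: not a file format, so the caller gets
        # empty values and can skip the attachment or abandon the article.
        return '', ''
    # Real code would download the file and register it (e.g. via
    # baseCore.tableUpdate) before returning (att_id, full_path).
    return 'placeholder-att-id', f'/attachments/{num}-{publish_date}-{file_name}'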