Commit f7a4f608 Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

import re
import requests
from bs4 import BeautifulSoup
from base import BaseCore
from retry import retry
baseCore = BaseCore.BaseCore()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
def getContentA(url):
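# TODO: detail pages ending in .html are routed here but are not parsed yet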
pass
def getContentB(url):
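# draft: fetch a non-.html detail page, locate the body (div#desContent) and its attachment links; nothing is stored yet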
req = requests.get(url,headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text,'html.parser')
contentWithTag = soup.find('div',attrs={'id':'desContent'})
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
file_name = a.text.strip()
content = contentWithTag.text.strip()
def doJob():
urls = ['http://reits.szse.cn/lawrule/laws/index.html',
'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html']
for url in urls:
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
li_list = soup.find('ul', class_='newslist').find_all('li')
for li in li_list:
info = str(li.find('script'))
href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', url.replace(url.split('/')[-1], ''))  # resolve './' against the current list page
title = re.findall('curTitle =\'(.*?)\';', info)[0]
publishDate = li.find('span', class_='time').text.strip()
if '.html' in href:
getContentA(href)
else:
getContentB(href)
if __name__ == '__main__':
doJob()
import os
import re
import time
import requests
from bs4 import BeautifulSoup
import BaseCore
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '深圳证券交易所REITs'
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
def getContent(url, publishDate, num, id_list):
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
contentWithTag = soup.find('div', attrs={'id': 'desContent'})
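# the issue number usually sits in the first non-empty <p>; discard it unless it contains '号'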
pub_hao = contentWithTag.find('p').text.strip()
if pub_hao == '':
pub_hao = contentWithTag.find_all('p')[1].text.strip()
if '号' not in pub_hao:
pub_hao = ''
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
if not fj_href:
continue
fj_title = a.text.strip()
category = os.path.splitext(fj_href)[1]
if '.' not in category or '.cn' in category:
continue
if category not in fj_title:
fj_title = fj_title + category
# upload the attachment to OBS
att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
if att_id:
id_list.append(att_id)
a['href'] = full_path
content = contentWithTag.text.strip()
return pub_hao, content, id_list, str(contentWithTag)
def doJob():
urls = ['http://reits.szse.cn/lawrule/bussrules/latest/index.html',
'http://reits.szse.cn/lawrule/bussrules/supervise/index.html']
num = 1
for url in urls:
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
li_list = soup.find('ul', class_='newslist').find_all('li')
for li in li_list:
id_list = []
info = str(li.find('script'))
href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', url.replace(url.split('/')[-1], ''))
title = re.findall('curTitle =\'(.*?)\';', info)[0]
publishDate = li.find('span', class_='time').text.strip()
# deduplicate by source URL
is_member = baseCore.r.sismember('REITs::' + webname, href)
if is_member:
log.info(f'{title}===已采集')
continue
origin = '深圳证券交易所'
writtenDate = publishDate
organ = '深圳证券交易所'
summary = ''
pub_hao, content, id_list, contentWithTag = getContent(href, publishDate, num, id_list)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': href,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': '',
'issuedNumber': pub_hao,
'summary': summary,
'createDate': time_now,
'sid': '1730508406971613186',
}
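# push the record to Kafka and mark the URL as collected; on failure, delete the attachments uploaded for this record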
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'采集成功--{title}--{href}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
num += 1
time.sleep(3)
if __name__ == '__main__':
doJob()
import os
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
...@@ -14,8 +16,116 @@ headers = {
'X-Requested-With': 'XMLHttpRequest',
}
topic = 'policy'
webname = '浙江省人民政府'
class Policy():
def getrequest_soup(self, headers, url):
req = requests.get(headers=headers, url=url)
result = BeautifulSoup(req.content, 'html.parser')
return result
def getrequest_json(self, headers, url):
req = requests.get(headers=headers, url=url)
result = req.json()
return result
def requestPost(self, headers, url, payload):
req = requests.post(headers=headers, url=url, data=payload)
data_json = req.json()
return data_json
def requestPost_html(self, headers, url, payload):
req = requests.post(headers=headers, url=url, data=payload)
result = BeautifulSoup(req.content, 'html.parser')
return result
def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
# find tags carrying the given attribute value and remove up to the first i of them
tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
for tag in tags[:i]:
tag.decompose()
def deletespan(self, td):
spans = td.find_all('span')
for span in spans:
span.extract()  # remove the <span> tags
def deletetag(self, td, tag):
tags = td.find_all(tag)
for tag_ in tags:
tag_.extract()  # remove the specified tags
def deletetext(self, soup, tag, text):  # remove tags whose text contains the given string
tags = soup.find_all(tag)[:10]
for tag_ in tags:
text_ = tag_.text
if text in text_:
tag_.extract()
def deletek(self, soup):
# remove empty tags (e.g. <p></p>, <p><br></p>); img, video and br are kept
for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] or tag.get_text() == ' '):
for j in i.descendants:
if j.name in ["img", "video", "br"]:
break
else:
i.decompose()
def paserUrl(self, html, listurl):
# collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# rewrite relative href/src attributes to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def attuributefile(self, file_name, file_href, num, publishDate):
# download the attachment and upload it to the file server (OBS)
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
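# only links with known document/archive extensions are treated as attachments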
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '', file_name)
if not retData['state']:
return '', ''
att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
return att_id, full_path
else:
return '', ''
policy = Policy()
def paserUrl(html, listurl):
# collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# rewrite relative href/src attributes to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
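# e.g. paserUrl('<a href="./art_123.html">附件</a>', 'http://www.zj.gov.cn/art/2022/4/18/') rewrites the
# href to 'http://www.zj.gov.cn/art/2022/4/18/art_123.html' (hypothetical values, for illustration only)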
@retry(tries=3, delay=10)
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
...@@ -42,9 +152,10 @@ def getPageSize():
req.close()
return pageSize
@retry(tries=3, delay=10)
def getDataJson(page):
# ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '330000000000000',
...@@ -59,18 +170,300 @@ def getDataJson(page):
'pos': 'content,filenumber',
'sortType': '1',
}
req = requests.post(url, headers=headers, data=data_post)
req.encoding = req.apparent_encoding
data_json = req.json()['result']
req.close()
return data_json
def getContent(url, publishDate, num):
id_list = []
req = requests.get(url, headers=headers)
if 'weixin' in url:
req.encoding = 'utf-8'
else:
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
soup = paserUrl(soup, url)
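# the article body lives in different containers on different gov sites: try the long cascade of known
# selectors below, stripping share bars, contact blocks and policy-interpretation links along the way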
contentWithTag = soup.find('div', class_='box_wzy_ys')
if not contentWithTag:
contentWithTag = soup.find('div', class_='oh_main_cont_flbox_show_cont')
if not contentWithTag:
contentWithTag = soup.find('div',class_='article-content')
try:
contentWithTag.find('table',class_='xxgk_table').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
try:
contentWithTag.find('div', class_='audioBox').decompose()
except:
pass
try:
contentWithTag.find('div', class_='zcbox').decompose()
div_list = soup.find_all('div', class_='yybb')
for div in div_list:
div.decompose()
except:
pass
try:
contentWithTag.find('div', class_='fz_xx').decompose()
except:
pass
try:
contentWithTag.find('a', class_='zcjdlj').decompose()
except:
pass
try:
contentWithTag.find('div', class_='fenxiang').decompose()
except:
pass
try:
contentWithTag.find('div', class_='Interpretation').decompose()
except:
pass
try:
contentWithTag.find('a', class_='bmjd').decompose()
except:
pass
try:
contentWithTag.find('a', class_='tjlj').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='g_content')
if not contentWithTag:
contentWithTag = soup.find('span', class_='zcjdlink')
if not contentWithTag:
contentWithTag = soup.find('div', class_='main_section')
try:
contentWithTag = contentWithTag.find('div', class_='main_section')
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='zoomnr')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'mainText'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='text')
if not contentWithTag:
contentWithTag = soup.find('div', class_='wz')
if not contentWithTag:
contentWithTag = soup.find('div', class_='news_content')
try:
contentWithTag.find('div', class_='ywlj').decompose()
except:
pass
try:
contentWithTag.find('div', class_='zcjd').decompose()
except:
pass
try:
contentWithTag.find('div', class_='tpjd').decompose()
except:
pass
try:
contentWithTag.find('div', class_='spjd').decompose()
except:
pass
try:
contentWithTag.find('div', class_='jgfzr').decompose()
except:
pass
try:
contentWithTag.find('div', class_='fzr').decompose()
except:
pass
try:
contentWithTag.find('div', class_='jgdz').decompose()
except:
pass
try:
contentWithTag.find('div', class_='lxfs').decompose()
except:
pass
try:
contentWithTag.find('div', class_='gkdh').decompose()
except:
pass
try:
contentWithTag.find('div', class_='zipcode').decompose()
except:
pass
try:
contentWithTag.find('div', class_='fax').decompose()
except:
pass
try:
contentWithTag.find('div', class_='mail').decompose()
except:
pass
try:
contentWithTag.find('div',class_='bgsj').decompose()
except:
pass
if not contentWithTag:
try:
contentWithTag = soup.find('div', class_='mian').find('div', class_='article_text')
except:
contentWithTag = None
if not contentWithTag:
contentWithTag = soup.find('div', class_='wenz')
if not contentWithTag:
# contentWithTag = soup.find('table', attrs={'id': 'word'})
contentWithTag = soup.find('table', attrs={'id': 'inside'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='ewb-content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='content-info-content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='main-txt')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zoom')
if not contentWithTag:
contentWithTag = soup.find('div', class_='showPage')
if not contentWithTag:
try:
contentWithTag = soup.find_all('div', class_='content')[1]
try:
contentWithTag.find('div', class_='linke').decompose()
except:
contentWithTag = None
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='article')
if not contentWithTag:
contentWithTag = soup.find('div', class_='content')
try:
contentWithTag.find('div', class_='dy').decompose()
except:
pass
try:
contentWithTag.find('div', class_='con_top').decompose()
contentWithTag.find('div', class_='flex_between').decompose()
except:
pass
try:
contentWithTag.find('div', class_='dqwz').decompose()
contentWithTag.find('div', class_='top').decompose()
except:
pass
try:
contentWithTag.find('h4', class_='fr').decompose()
except:
pass
try:
contentWithTag.find('ul', class_='Fileclass').decompose()
contentWithTag.find('h4').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='main-body')
if not contentWithTag:
contentWithTag = soup.find('div', class_='articlePage_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='Gbc_Cm')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zhengw')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zhengw'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='xy-detail')
if not contentWithTag:
contentWithTag = soup.find('td', class_='bt_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='xy-detail')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'js_content'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'cr'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'art_c'})
if not contentWithTag:
contentWithTag = soup.find('article', class_='content_main')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'ivs_content'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='con_con')
try:
div_list = contentWithTag.find('div', class_='yybb')
for div in div_list:
div.decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='pic')
if not contentWithTag:
contentWithTag = soup.find('td', class_='bt_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='rich_media_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='xl_main_con')
if not contentWithTag:
contentWithTag = soup.find('div', class_='jh_xl_m2')
try:
contentWithTag.find('span', class_='jiedu-link-box').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='nrEmit')
if not contentWithTag:
contentWithTag = soup.find('div', class_='details-content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zf-jd-nr')
if not contentWithTag:
contentWithTag = soup.find('div', class_='article-conter')
if not contentWithTag:
contentWithTag = soup.find('div', class_='rich_media_area_primary')
if not contentWithTag:
contentWithTag = soup.select_one('body > div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(3)')
if not contentWithTag:
contentWithTag = soup.find('div',class_='detail-pic')
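# if the matched block contains a video, remove it and discard the block entirely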
try:
contentWithTag.find('video').decompose()
contentWithTag = None
except:
pass
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
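# rewrite attachment links inside the body to the uploaded copies and collect their attachment ids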
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
if not href:
continue
fj_title = a.text.strip()
category = os.path.splitext(href)[1]
if category not in fj_title:
fj_title = fj_title + category
att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
if att_id:
id_list.append(att_id)
a['href'] = full_path
content = contentWithTag.text
return str(contentWithTag), content, id_list
def getDatas(page):
data_json = getDataJson(page)
num = 1
for data_ in data_json:
soup = BeautifulSoup(data_, 'lxml')
title = soup.find('div', class_='titleWrapper').find('a').text.lstrip().strip().replace(' ', '').replace('\r\n', ' ')
href = soup.find('div', class_='titleWrapper').find('a').get('href')
href = href.split('url=')[1].split('.html')[0].replace('%3A', ':').replace('%2F', '/') + '.html'
try:
info = soup.find('table', class_='fgwj_table_list').text
organ = info.split('发布机构:')[1].split('成文日期:')[0].lstrip().strip()
...@@ -78,21 +471,48 @@ def getDatas(page):
except:
organ = ''
writtenDate = None
origin = soup.find('div', class_='sourceTime').text.split('来源:')[1].split('时间:')[0].lstrip().strip().replace(' ', '').replace(' ', '').replace('\r\n', '')
publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
contentWithTag, content, id_list = getContent(href, publishDate, num)
num += 1
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': href,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': '',
'issuedNumber': '',
'summary': '',
'createDate': time_now,
'sid': '1729041791539326977',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'{title}===完成')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
log.error(f'第{page}页==={title}===失败')
time.sleep(5)
def doJob():
pageSize = getPageSize()
for page in range(1, pageSize + 1):
getDatas(page)
if __name__ == '__main__':
doJob()
# url = 'http%3A%2F%2Fwww.zj.gov.cn%2Fart%2F2022%2F4%2F18%2Fart_1229630461_2401403.html'
# req = requests.get(url,headers=headers)
# req.encoding = req.apparent_encoding
baseCore.close()