Commit f7a4f608 Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

import re
import requests
from bs4 import BeautifulSoup
from base import BaseCore
from retry import retry
baseCore = BaseCore.BaseCore()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
def getContentA(url):
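# TODO: detail pages ending in .html are routed here but are not parsed yet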
pass
def getContentB(url):
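# draft: fetch a non-.html detail page, locate the body (div#desContent) and its attachment links; nothing is stored yet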
req = requests.get(url,headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text,'html.parser')
contentWithTag = soup.find('div',attrs={'id':'desContent'})
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
file_name = a.text.strip()
content = contentWithTag.text.strip()
def doJob():
urls = ['http://reits.szse.cn/lawrule/laws/index.html',
'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html']
for url in urls:
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
li_list = soup.find('ul', class_='newslist').find_all('li')
for li in li_list:
info = str(li.find('script'))
href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', url.replace(url.split('/')[-1], ''))  # resolve './' against the current list page
title = re.findall('curTitle =\'(.*?)\';', info)[0]
publishDate = li.find('span', class_='time').text.strip()
if '.html' in href:
getContentA(href)
else:
getContentB(href)
if __name__ == '__main__':
doJob()
import os
import re
import time
import requests
from bs4 import BeautifulSoup
import BaseCore
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '深圳证券交易所REITs'
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
def getContent(url, publishDate, num, id_list):
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
contentWithTag = soup.find('div', attrs={'id': 'desContent'})
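# the issue number usually sits in the first non-empty <p>; discard it unless it contains '号'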
pub_hao = contentWithTag.find('p').text.strip()
if pub_hao == '':
pub_hao = contentWithTag.find_all('p')[1].text.strip()
if '号' not in pub_hao:
pub_hao = ''
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
if not fj_href:
continue
fj_title = a.text.strip()
category = os.path.splitext(fj_href)[1]
if '.' not in category or '.cn' in category:
continue
if category not in fj_title:
fj_title = fj_title + category
# upload the attachment to OBS
att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
if att_id:
id_list.append(att_id)
a['href'] = full_path
content = contentWithTag.text.strip()
return pub_hao, content, id_list, str(contentWithTag)
def doJob():
urls = ['http://reits.szse.cn/lawrule/bussrules/latest/index.html',
'http://reits.szse.cn/lawrule/bussrules/supervise/index.html']
num = 1
for url in urls:
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
li_list = soup.find('ul', class_='newslist').find_all('li')
for li in li_list:
id_list = []
info = str(li.find('script'))
href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', url.replace(url.split('/')[-1], ''))
title = re.findall('curTitle =\'(.*?)\';', info)[0]
publishDate = li.find('span', class_='time').text.strip()
# deduplicate by source URL
is_member = baseCore.r.sismember('REITs::' + webname, href)
if is_member:
log.info(f'{title}===已采集')
continue
origin = '深圳证券交易所'
writtenDate = publishDate
organ = '深圳证券交易所'
summary = ''
pub_hao, content, id_list, contentWithTag = getContent(href, publishDate, num, id_list)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': href,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': '',
'issuedNumber': pub_hao,
'summary': summary,
'createDate': time_now,
'sid': '1730508406971613186',
}
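# push the record to Kafka and mark the URL as collected; on failure, delete the attachments uploaded for this record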
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'采集成功--{title}--{href}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
num += 1
time.sleep(3)
if __name__ == '__main__':
doJob()
import os
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
...@@ -14,8 +16,116 @@ headers = {
'X-Requested-With': 'XMLHttpRequest',
}
topic = 'policy'
webname = '浙江省人民政府'
class Policy():
def getrequest_soup(self, headers, url):
req = requests.get(headers=headers, url=url)
result = BeautifulSoup(req.content, 'html.parser')
return result
def getrequest_json(self, headers, url):
req = requests.get(headers=headers, url=url)
result = req.json()
return result
def requestPost(self, headers, url, payload):
req = requests.post(headers=headers, url=url, data=payload)
data_json = req.json()
return data_json
def requestPost_html(self, headers, url, payload):
req = requests.post(headers=headers, url=url, data=payload)
result = BeautifulSoup(req.content, 'html.parser')
return result
def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
# find tags carrying the given attribute value and remove up to the first i of them
tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
for tag in tags[:i]:
tag.decompose()
def deletespan(self, td):
spans = td.find_all('span')
for span in spans:
span.extract()  # remove the <span> tags
def deletetag(self, td, tag):
tags = td.find_all(tag)
for tag_ in tags:
tag_.extract()  # remove the specified tags
def deletetext(self, soup, tag, text):  # remove tags whose text contains the given string
tags = soup.find_all(tag)[:10]
for tag_ in tags:
text_ = tag_.text
if text in text_:
tag_.extract()
def deletek(self, soup):
# remove empty tags (e.g. <p></p>, <p><br></p>); img, video and br are kept
for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] or tag.get_text() == ' '):
for j in i.descendants:
if j.name in ["img", "video", "br"]:
break
else:
i.decompose()
def paserUrl(self, html, listurl):
# collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# rewrite relative href/src attributes to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def attuributefile(self, file_name, file_href, num, publishDate):
# download the attachment and upload it to the file server (OBS)
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
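# only links with known document/archive extensions are treated as attachments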
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '', file_name)
if not retData['state']:
return '', ''
att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
return att_id, full_path
else:
return '', ''
policy = Policy()
def paserUrl(html, listurl):
# collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# rewrite relative href/src attributes to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
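# e.g. paserUrl('<a href="./art_123.html">附件</a>', 'http://www.zj.gov.cn/art/2022/4/18/') rewrites the
# href to 'http://www.zj.gov.cn/art/2022/4/18/art_123.html' (hypothetical values, for illustration only)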
@retry(tries=3, delay=10)
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
...@@ -42,9 +152,10 @@ def getPageSize():
req.close()
return pageSize
@retry(tries=3, delay=10)
def getDataJson(page):
# ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '330000000000000',
...@@ -59,18 +170,300 @@ def getDataJson(page):
'pos': 'content,filenumber',
'sortType': '1',
}
req = requests.post(url, headers=headers, data=data_post)
req.encoding = req.apparent_encoding
data_json = req.json()['result']
req.close()
return data_json
def getContent(url, publishDate, num):
id_list = []
req = requests.get(url, headers=headers)
if 'weixin' in url:
req.encoding = 'utf-8'
else:
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
soup = paserUrl(soup, url)
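# the article body lives in different containers on different gov sites: try the long cascade of known
# selectors below, stripping share bars, contact blocks and policy-interpretation links along the way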
contentWithTag = soup.find('div', class_='box_wzy_ys')
if not contentWithTag:
contentWithTag = soup.find('div', class_='oh_main_cont_flbox_show_cont')
if not contentWithTag:
contentWithTag = soup.find('div',class_='article-content')
try:
contentWithTag.find('table',class_='xxgk_table').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
try:
contentWithTag.find('div', class_='audioBox').decompose()
except:
pass
try:
contentWithTag.find('div', class_='zcbox').decompose()
div_list = soup.find_all('div', class_='yybb')
for div in div_list:
div.decompose()
except:
pass
try:
contentWithTag.find('div', class_='fz_xx').decompose()
except:
pass
try:
contentWithTag.find('a', class_='zcjdlj').decompose()
except:
pass
try:
contentWithTag.find('div', class_='fenxiang').decompose()
except:
pass
try:
contentWithTag.find('div', class_='Interpretation').decompose()
except:
pass
try:
contentWithTag.find('a', class_='bmjd').decompose()
except:
pass
try:
contentWithTag.find('a', class_='tjlj').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='g_content')
if not contentWithTag:
contentWithTag = soup.find('span', class_='zcjdlink')
if not contentWithTag:
contentWithTag = soup.find('div', class_='main_section')
try:
contentWithTag = contentWithTag.find('div', class_='main_section')
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='zoomnr')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'mainText'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='text')
if not contentWithTag:
contentWithTag = soup.find('div', class_='wz')
if not contentWithTag:
contentWithTag = soup.find('div', class_='news_content')
try:
contentWithTag.find('div', class_='ywlj').decompose()
except:
pass
try:
contentWithTag.find('div', class_='zcjd').decompose()
except:
pass
try:
contentWithTag.find('div', class_='tpjd').decompose()
except:
pass
try:
contentWithTag.find('div', class_='spjd').decompose()
except:
pass
try:
contentWithTag.find('div', class_='jgfzr').decompose()
except:
pass
try:
contentWithTag.find('div', class_='fzr').decompose()
except:
pass
try:
contentWithTag.find('div', class_='jgdz').decompose()
except:
pass
try:
contentWithTag.find('div', class_='lxfs').decompose()
except:
pass
try:
contentWithTag.find('div', class_='gkdh').decompose()
except:
pass
try:
contentWithTag.find('div', class_='zipcode').decompose()
except:
pass
try:
contentWithTag.find('div', class_='fax').decompose()
except:
pass
try:
contentWithTag.find('div', class_='mail').decompose()
except:
pass
try:
contentWithTag.find('div',class_='bgsj').decompose()
except:
pass
if not contentWithTag:
try:
contentWithTag = soup.find('div', class_='mian').find('div', class_='article_text')
except:
contentWithTag = None
if not contentWithTag:
contentWithTag = soup.find('div', class_='wenz')
if not contentWithTag:
# contentWithTag = soup.find('table', attrs={'id': 'word'})
contentWithTag = soup.find('table', attrs={'id': 'inside'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='ewb-content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='content-info-content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='main-txt')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zoom')
if not contentWithTag:
contentWithTag = soup.find('div', class_='showPage')
if not contentWithTag:
try:
contentWithTag = soup.find_all('div', class_='content')[1]
try:
contentWithTag.find('div', class_='linke').decompose()
except:
contentWithTag = None
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='article')
if not contentWithTag:
contentWithTag = soup.find('div', class_='content')
try:
contentWithTag.find('div', class_='dy').decompose()
except:
pass
try:
contentWithTag.find('div', class_='con_top').decompose()
contentWithTag.find('div', class_='flex_between').decompose()
except:
pass
try:
contentWithTag.find('div', class_='dqwz').decompose()
contentWithTag.find('div', class_='top').decompose()
except:
pass
try:
contentWithTag.find('h4', class_='fr').decompose()
except:
pass
try:
contentWithTag.find('ul', class_='Fileclass').decompose()
contentWithTag.find('h4').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='main-body')
if not contentWithTag:
contentWithTag = soup.find('div', class_='articlePage_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='Gbc_Cm')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zhengw')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zhengw'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='xy-detail')
if not contentWithTag:
contentWithTag = soup.find('td', class_='bt_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='xy-detail')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'js_content'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'cr'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'art_c'})
if not contentWithTag:
contentWithTag = soup.find('article', class_='content_main')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'ivs_content'})
if not contentWithTag:
contentWithTag = soup.find('div', class_='con_con')
try:
div_list = contentWithTag.find('div', class_='yybb')
for div in div_list:
div.decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='pic')
if not contentWithTag:
contentWithTag = soup.find('td', class_='bt_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='rich_media_content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='xl_main_con')
if not contentWithTag:
contentWithTag = soup.find('div', class_='jh_xl_m2')
try:
contentWithTag.find('span', class_='jiedu-link-box').decompose()
except:
pass
if not contentWithTag:
contentWithTag = soup.find('div', class_='nrEmit')
if not contentWithTag:
contentWithTag = soup.find('div', class_='details-content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zf-jd-nr')
if not contentWithTag:
contentWithTag = soup.find('div', class_='article-conter')
if not contentWithTag:
contentWithTag = soup.find('div', class_='rich_media_area_primary')
if not contentWithTag:
contentWithTag = soup.select_one('body > div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(3)')
if not contentWithTag:
contentWithTag = soup.find('div',class_='detail-pic')
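# if the matched block contains a video, remove it and discard the block entirely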
try:
contentWithTag.find('video').decompose()
contentWithTag = None
except:
pass
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
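# rewrite attachment links inside the body to the uploaded copies and collect their attachment ids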
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
if not href:
continue
fj_title = a.text.strip()
category = os.path.splitext(href)[1]
if category not in fj_title:
fj_title = fj_title + category
att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
if att_id:
id_list.append(att_id)
a['href'] = full_path
content = contentWithTag.text
return str(contentWithTag), content, id_list
def getDatas(page):
data_json = getDataJson(page)
num = 1
for data_ in data_json:
soup = BeautifulSoup(data_, 'lxml')
title = soup.find('div', class_='titleWrapper').find('a').text.lstrip().strip().replace(' ', '').replace('\r\n', ' ')
href = soup.find('div', class_='titleWrapper').find('a').get('href')
href = href.split('url=')[1].split('.html')[0].replace('%3A', ':').replace('%2F', '/') + '.html'
try:
info = soup.find('table', class_='fgwj_table_list').text
organ = info.split('发布机构:')[1].split('成文日期:')[0].lstrip().strip()
...@@ -78,21 +471,48 @@ def getDatas(page):
except:
organ = ''
writtenDate = None
origin = soup.find('div', class_='sourceTime').text.split('来源:')[1].split('时间:')[0].lstrip().strip().replace(' ', '').replace(' ', '').replace('\r\n', '')
publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
contentWithTag, content, id_list = getContent(href, publishDate, num)
num += 1
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': href,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': '',
'issuedNumber': '',
'summary': '',
'createDate': time_now,
'sid': '1729041791539326977',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'{title}===完成')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
log.error(f'第{page}页==={title}===失败')
time.sleep(5)
def doJob():
pageSize = getPageSize()
for page in range(1, pageSize + 1):
getDatas(page)
if __name__ == '__main__':
doJob()
# url = 'http%3A%2F%2Fwww.zj.gov.cn%2Fart%2F2022%2F4%2F18%2Fart_1229630461_2401403.html'
# req = requests.get(url,headers=headers)
# req.encoding = req.apparent_encoding
baseCore.close()