Commit f97acdf8    Author: 薛凌堃

11/15

Parent 0261ed3e
 import os
@@ -388,7 +388,7 @@ def zhengquanqihuo(wb,file_path):
 # Shanghai Stock Exchange  http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
-def sse():
+def sse(wb,file_path):
     url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
     headers = {
         'Accept': '*/*',
@@ -454,7 +454,7 @@ def sse():
                 '原文链接': newsUrl,
                 '发文时间': '',
                 '发文机构': '',
-                '发文字号': pubHao,
+                '发文字号': '',
                 '摘要': summary,
                 '正文': content,
                 '附件名称': fu_jian_name,
@@ -468,11 +468,14 @@ def sse():
             # print(content_)
             # # Replace relative links with absolute paths
             contentWithTag = policy.paserUrl(content_, newsUrl)
-            pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
-            if '〔' in pubHao:
-                pass
-            else:
-                pubHao = ''
+            try:
+                pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
+                if '〔' in pubHao:
+                    pass
+                else:
+                    pubHao = ''
+            except:
+                pubHao = ''
             # print(contentWithTag)
             content = contentWithTag.text
@@ -482,11 +485,18 @@ def sse():
         for fujian in fujian_list:
             file_href = fujian['href']
             file_name = fujian.text.strip(' ')
-            rename_file = f'{str(num)}_{publishDate}_{file_name}'
+            category = os.path.splitext(file_href)[1]
+            if category in file_name:
+                pass
+            else:
+                file_name = file_name + category
+            rename_file = f'{str(num)}_{publishDate}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','')
             fu_jian_name += rename_file + '\n'
             fu_jian_href += file_href + '\n'
-            policy.downloadfile(file_href, f'{path}/{rename_file}')
+            try:
+                policy.downloadfile(file_href, f'{path}/{rename_file}')
+            except:
+                log.info(f'--{page}-{num}======{newsUrl}')
             dic_info = {
                 '序号': num,
                 '标题': title,
@@ -615,8 +625,9 @@ def beijing():
 if __name__=="__main__":
-    file_path = f'data/REITs专题数据.xlsx'
+    file_path = f'data/REITs国家改革发展委员会.xlsx'
     wb = policy.createfile(file_path)
     # reform(wb,file_path)
-    zhengquanqihuo(wb,file_path)
+    # zhengquanqihuo(wb,file_path)
+    sse(wb,file_path)
     # zhengquanqihuo()
\ No newline at end of file

 # Core utility package
 import os
 import random
+import smtplib
 import socket
 import sys
 import time
+from email.header import Header
+from email.mime.application import MIMEApplication
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
 import MySQLdb
 import logbook
@@ -852,3 +857,34 @@ class BaseCore:
         result = obsClient.putContent('zzsn', pathType + name, content=response.content)
         # resp = obsClient.putFile('zzsn', pathType + name, file_path='要上传的那个文件的本地路径')
         return result
+
+    def sendEmail(self, file_name):
+        file = open(file_name, 'rb').read()
+        # Sender address
+        sender = '1195236739@qq.com'
+        # Recipient address
+        receiver = '1074481431@qq.com'
+        smtpserver = 'smtp.qq.com'
+        # Sender account and SMTP authorization password
+        username = '1195236739@qq.com'
+        password = 'gatvszshadvpgjci'
+        maile_title = '企业基本信息采集情况'
+        message = MIMEMultipart()
+        message['From'] = sender
+        message['To'] = receiver
+        message['Subject'] = Header(maile_title, 'utf-8')
+        message.attach(MIMEText('企业基本信息采集情况', 'plain', 'utf-8'))
+        xlsxApart = MIMEApplication(file)
+        xlsxApart.add_header('Content-Disposition', 'attachment', filename='企业基本信息采集情况.xlsx')
+        message.attach(xlsxApart)
+        smtpObj = smtplib.SMTP_SSL(smtpserver)  # Note: if sending fails ("connection refused by remote host"), SMTP_SSL must be used here
+        smtpObj.connect(smtpserver, port=465)
+        smtpObj.login(username, password)
+        smtpObj.sendmail(sender, receiver, message.as_string())
+        print("邮件发送成功!!!")
+        smtpObj.quit()

# -*- coding: utf-8 -*-
import json
import openpyxl
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from openpyxl import Workbook, load_workbook
# Create the output workbook
def createFile(file_name):
    wb = Workbook()
    sheet = wb.active
    # Rename the default sheet
    sheet.title = "需处理企业"
    sheet.append(["企业名称", "社会信用代码"])
    # Create a second sheet
    sheet2 = wb.create_sheet("获取基本信息成功企业")
    sheet2.append(["企业名称", "社会信用代码", "采到的信用代码"])
    wb.save(file_name)
    wb.close()
# Append a row of data to a sheet
def appenddata(file_name,sheet,data):
    # Open the existing Excel file
    wb = load_workbook(file_name)
    # Select the sheet to append to
    sheet = wb[sheet]
    sheet.append(data)
    # Save the Excel file
    wb.save(file_name)
    wb.close()
def sendkafka(post_data):
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
        kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
    except:
        exception = 'kafka传输失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
        log.info(f"{com_name}--{social_code}--kafka传输失败")
def deletep(soup,tag_,attribute_to_delete,value_to_delete):
    if attribute_to_delete and value_to_delete:
        # Find tags carrying the specified attribute/value and remove them
        tags = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
        for tag in tags:
            # print(tag)
            tag.decompose()
    else:
        tags = soup.find_all(tag_)
        for tag in tags:
            # print(tag)
            tag.decompose()
def deletek(soup):
    # Delete empty tags (e.g. <p></p>, <p><br></p>); img, video and hr are kept
    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' ' or tag.get_text() == ' '):
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                break
        else:
            i.decompose()
def deletespan(td):
    spans = td.find_all('span', class_='app-copy copy-button-item')
    for span in spans:
        if '复制' in span.text:
            span.extract()  # remove the copy-button span
    spans2 = td.find_all('span', slot='content')
    for span2 in spans2:
        if '趋势图' in span2.text:
            span2.extract()
    spans3 = td.find_all('span', class_='m-l-r-10')
    for span3 in spans3:
        if '年报' in span3.text:
            span3.extract()
def getinfo(dict1,dict2):
    # Take the key sets of the two dicts
    keys1 = set(dict1.keys())
    keys2 = set(dict2.keys())
    # Union of the key sets
    union_keys = keys1 | keys2
    # For each key in the union, take the value from whichever dict has one
    result_dict = {key: dict1.get(key, None) or dict2.get(key, None) for key in union_keys}
    return result_dict
def baseinfo(com_soup):
    baseinfo = com_soup.find('div', class_='contact-info')
    cominfo_list = baseinfo.find_all('span', class_='f')
    data = {}
    for cominfo in cominfo_list:
        # print(cominfo)
        value = cominfo.find('span', class_='val').text.replace('复制', '').strip(' ')
        pattern = r'\(\d{4}\s*年\)'
        match = re.search(pattern, value)
        if match:
            # print(match.group(0))
            value = value.split(match.group(0))[0]
        # print(value)
        deletep(cominfo, 'span', 'class', 'val')
        deletep(cominfo, 'a', '', '')
        deletek(cominfo)
        # print(cominfo)
        name = cominfo.text.replace('\n', '').replace('复制', '').strip(' ').replace(':', '')
        # print(name,value)
        data[name] = value
    return data
def checklogin(key):
    # url = f'https://www.qcc.com/web/search?key=91110108558521630L'
    url = f'https://www.qcc.com/web/search?key={key}'
    req = requests.get(headers=headers, url=url)
    soup = BeautifulSoup(req.content, 'html.parser')
    if soup.find('title').text == '会员登录 - 企查查':
        log.info('状态---未登录')
        soup = ''
        return soup
    return soup
def redaytowork(com_name,social_code):
    if social_code:
        dic_info = baseCore.getInfomation(social_code)
    elif not social_code:
        dic_info = baseCore.getBYnameInfomation(com_name)
    else:
        dic_info = ''
    if dic_info:
        pass
    log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
    count = dic_info[14]
    # 企查查 company id
    company_id = dic_info[12]
    # If there is no credit code, search by company name; if there is one, search by the credit code
    if social_code:
        soup = checklogin(social_code)
    else:
        soup = checklogin(com_name)
    if not soup:
        log.info("登录失效===重新放入redis")
        baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_socialCode', company_field)
        # baseCore.delete_token(token)
        log.info('=====已重新放入redis,失效token已删除======')
        time.sleep(20)
        return count
    else:
        searchinfo = soup.find_all('div', class_='npanel-heading')[1].find('span', class_='text-danger').text
        if searchinfo == '0':
            log.info('=====搜索不到该企业====')
            data = [com_name, social_code]
            # TODO: companies that cannot be found need to be written back to a sheet
            appenddata(file_name, '需处理企业', data)
            return count
        else:
            # Start collecting
            try:
                spiderwork(soup, com_name)
                count += 1
                log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
                return count
            except:
                log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
                baseCore.rePutIntoR('BaseInfoEnterprise:gnqy_social_code', social_code)
                # baseCore.delete_token(token)
                log.info('=====已重新放入redis,失效token已删除======')
                return count
def spiderwork(soup,receptname):
    company_url = ''
    company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
    tr_list = company_list.find_all('tr', class_='tsd0')
    # receptname = '小米通讯技术有限公司'
    for tr in tr_list:
        info_t = tr.find('span', class_='copy-title')
        getname = info_t.find('span').text
        log.info(f'接收到的企业名称--{com_name}---采到的企业名称--{getname}')
        if getname == receptname:
            company_url = info_t.find('a')['href']
            break
        else:
            continue
    if company_url:
        # company_url = 'https://www.qcc.com/firm/80af5085726bb6b9c7770f1e4d0580f4.html'
        # company_url = 'https://www.qcc.com/firm/50f75e8a8859e609ec37976f8abe827d.html'
        qccid = company_url.split('firm/')[1].split('.html')[0]
        # Write the collected 企查查 id back to the database
        updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
        cursor_.execute(updateSql)
        cnx_.commit()
        req_ = requests.get(headers=headers, url=company_url)
        com_soup = BeautifulSoup(req_.content, 'html.parser')
        try:
            businessinfo = com_soup.find('div', class_='cominfo-normal')
        except:
            businessinfo = ''
        if businessinfo:
            data_businfo = {}
            data_baseinfo = baseinfo(com_soup)
            # print(data_baseinfo)
            try:
                name = businessinfo.find('div', class_='ntag text-gray original-tag').text
                value = businessinfo.find('div', class_='original-name-list').text.replace('展开', '').replace(' ', '').replace('…', '').replace('\n', '').replace('复制', '')
            except:
                name = '曾用名'
                value = ''
            data_businfo[name] = value
            td_tags = businessinfo.find_all('td')
            # print(td_tags)
            for td in td_tags:
                if 'class' in td.attrs and 'tb' in td['class']:
                    div_tags = td.find_all('div')
                    texts = [div.text for div in div_tags]
                    if len(texts) > 0:
                        for text in texts[::-1]:
                            data_businfo[text.replace('复制', '').replace('\n', '').strip(' ')] = None
                    else:
                        data_businfo[td.text.replace('复制', '').replace('\n', '').strip(' ')] = None
                else:
                    # Tags without the class='tb' attribute
                    att_list = ['inline-block', 'ntag-v2', 'm-l-r-10', 'm-l-sm']
                    for att in att_list:
                        deletep(td, 'a', 'class', att)
                    deletek(td)
                    deletep(td, 'div', 'class', 'text-gray clearfix original-name-part')
                    deletespan(td)
                    # if len(result_dict) <= len(td_tags) // 2:
                    div_tags = td.find_all('div')
                    texts = [div.text for div in div_tags if len(div.attrs) == 0]
                    if len(texts) > 0:
                        i = 1
                        for text in texts:
                            if text == ' ':
                                continue
                            data_businfo[list(data_businfo.keys())[-i]] = text.replace('复制', '').replace('\n', '').replace(' ', '')
                            i += 1
                    else:
                        if '实缴资本' in td.text:
                            # pattern = r"\d+万美元"
                            # match = re.search(pattern, td.text.replace('复制', '').replace('\n', '').replace(' ', ''))
                            # if match:
                            #     value = match.group()
                            value = td.text.replace('复制', '').replace('\n', '').replace(' ', '').split('实缴资本')[0]
                            data_businfo[list(data_businfo.keys())[-1]] = value
                        else:
                            data_businfo[list(data_businfo.keys())[-1]] = td.text.replace('复制', '').replace('\n', '').replace(' ', '')
            result_dict = getinfo(data_businfo, data_baseinfo)
            print(result_dict)
            # Companies collected successfully
            data = [com_name, social_code, result_dict['统一社会信用代码']]
            appenddata(file_name, '获取基本信息成功企业', data)
            # sendkafka(result_dict)
        else:
            data_baseinfo = baseinfo(com_soup)
    else:
        # No company with an identical name was found
        data = [com_name, social_code]
        appenddata(file_name, '需处理企业', data)
if __name__ == '__main__':
    taskType = '基本信息/企查查'
    # Pull tasks from redis
    nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
    file_name = f'./data/企业基本信息采集情况_{nowtime}.xlsx'
    createFile(file_name)
    while True:
        # TODO: the headers need to be re-captured roughly every two hours; the token should come from the database
        # token = baseCore.GetToken()
        # if token:
        #     pass
        # else:
        #     log.info('==========已无token==========')
        #     time.sleep(30)
        #     continue
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; QCCSESSID=4e595fd804c28ae43780e55183; acw_tc=7522281e16999324472113552e97729806c88361a71c9bc96f8d5ff1c0',
            'Host': 'www.qcc.com',
            'Referer': 'https://www.qcc.com/',
            'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }
        start_time = time.time()
        # Get the company to process
        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
        company_field = '小米通讯技术有限公司|91110108558521630L'
        if company_field == 'end':
            # This round is finished: send the report email and start the next round
            baseCore.sendEmail(file_name)
            time.sleep(20)
            # Create the file for the next round
            nowtime = baseCore.getNowTime(1).replace('-', '')[:10]
            file_name = f'./企业基本信息采集情况_{nowtime}.xlsx'
            createFile(file_name)
            continue
        if company_field == '' or company_field is None:
            # No new companies to collect in this round
            time.sleep(20)
            continue
        com_name = company_field.split('|')[0]
        social_code = company_field.split('|')[1]
        count = redaytowork(com_name, social_code)
        # After collection, update this company's collection count
        runType = 'BaseInfoRunCount'
        baseCore.updateRun(social_code, runType, count)
\ No newline at end of file