Commit 4e84d611  Author: 薛凌堃

10/26

Parent c2749092
# -*- coding: utf-8 -*-
import sys
import pandas as pd
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
...@@ -135,6 +138,16 @@ class SmartExtractor:
        return self.get_extraction_result(article, link_text)

    def extract_by_html(self, html, link_text=''):
        """
        按HTML采集内容
        """
        # 采集正文:传入html
        article = self.goose.extract(raw_html=html)
        return self.get_extraction_result(article, link_text)
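A minimal usage sketch for the new extract_by_html path: fetch the page yourself, then hand the raw HTML to goose. This is hedged — it assumes the SmartExtractor constructor takes a language code, as the commented-out smart_extractor.SmartExtractor(lang) call further down in this commit suggests, and it reuses the sample URL from the test comment below.

    # Hedged sketch, not part of the committed code.
    # Assumes SmartExtractor('cn') builds a Chinese-language extractor.
    import requests

    extractor = SmartExtractor('cn')
    resp = requests.get('https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023', timeout=10)
    resp.encoding = resp.apparent_encoding
    result = extractor.extract_by_html(resp.text, link_text='')
    print(result.cleaned_text)  # 不带标签正文
    print(result.text)          # 带标签正文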
# url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
def extract_by_url_test(url_list,list_info_all):
    # 测试:按URL采集
......
# 根据信用代码获取天眼查id
import json
import random
import sys
import time
import pymysql
import requests
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'Accept': 'application/json, text/plain, */*',
......
...@@ -6,11 +6,12 @@ import requests, time, pymysql
import jieba
import sys
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
# from base.BaseCore import BaseCore
# from base.smart import smart_extractor
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
from smart import smart_extractor
import urllib3
...@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor
taskType = '企业动态/天眼查/补采20W+'
def reqDetailmsg(url,headers):
    # proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    for i in range(0,1):
        try:
            response=requests.get(url=url,headers=headers,timeout=8,verify=False)
            response.encoding = response.apparent_encoding
            htmltext=response.text
        except Exception as e:
            htmltext=''
            log.info(f'{url}---详情请求失败--{e}')
        if htmltext:
            log.info(f'{url}---详情请求成功')
            break
    return htmltext
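As committed, the loop above makes a single attempt (range(0,1)). A hedged sketch of a multi-attempt variant with a short pause between tries, not part of this commit; the retries parameter and sleep interval are illustrative:

    def reqDetailmsg_retry(url, headers, retries=3):
        # Hypothetical variant: try up to `retries` times before giving up.
        htmltext = ''
        for i in range(retries):
            try:
                response = requests.get(url=url, headers=headers, timeout=8, verify=False)
                response.encoding = response.apparent_encoding
                htmltext = response.text
            except Exception as e:
                htmltext = ''
                log.info(f'{url}---详情请求失败--{e}')
            if htmltext:
                log.info(f'{url}---详情请求成功')
                break
            time.sleep(2)  # brief pause between attempts
        return htmltext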
def beinWork(tyc_code, social_code,start_time):
    time.sleep(3)
...@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time):
    # 开始进行智能解析
    # lang = baseCore.detect_language(title)
    # smart = smart_extractor.SmartExtractor(lang)
    # req = requests.get(url=link,headers=headers,timeout=10)
    # html = BeautifulSoup(req.content,'html.parser')
    raw_html = reqDetailmsg(link,headers)
    if raw_html:
        # soup = BeautifulSoup(raw_html, 'html.parser')
        try:
            article = smart.extract_by_html(raw_html)
            content = article.cleaned_text
            contentText = article.text
        except Exception as e:
            log.info(f'抽取失败!!{e}')
    # #带标签正文
    # contentText = smart.extract_by_url(link).text
    # #不带标签正文
    # content = smart.extract_by_url(link).cleaned_text
    # # time.sleep(3)
except Exception as e:
    contentText = ''
if contentText == '':
    log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
    e = '获取正文失败'
...@@ -281,7 +312,7 @@ def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
        # social_code = '913205007764477744'
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == None:
            time.sleep(20)
......
# -*- coding: utf-8 -*-
...@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num):
    'sid': '1684032033495392257',
    'sourceAddress': year_url,  # 原文链接
    'summary': '',
    'title': name_pdf.replace('.pdf', ''),
    'type': 1,
    'socialCreditCode': social_code,
    'year': year
...@@ -260,7 +260,7 @@ if __name__ == '__main__':
    start_time = time.time()
    # 获取企业信息
    # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
    social_code = '91330000734507783B'
    if not social_code:
        time.sleep(20)
        continue
......
...@@ -33,13 +33,14 @@ def getRequest(url,headers):
    return json_data

# 严重失信
def dishonesty(headers,com_name,social_code):
    list_dishonesty = []
    param = {
        'tableName':'credit_zgf_fr_sxbzxr',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
...@@ -50,14 +51,14 @@ def dishonesty():
    if json_data['status'] == 1:
        pass
    total_size = json_data['data']['totalSize']
    for page in range(1,total_size+1):
        param_page = {
            'tableName': 'credit_zgf_fr_sxbzxr',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...@@ -67,7 +68,7 @@ def dishonesty():
            pass
        info_list = json_data['data']['list']
        for info in info_list:
            entity = info
            iname = entity['iname']  # 失信被执行人姓名/名称
            cardnumber = entity['cardnumber']  # 组织机构代码
            court_name = entity['court_name']  # 执行法院
...@@ -83,15 +84,34 @@ def dishonesty():
            performed_part = entity['performed_part']  # 已履行部分
            unperform_part = entity['unperform_part']  # 未履行部分
            dataSource = info['dataSource']  # 数据来源
            dic_dishonesty = {
                '失信被执行人姓名/名称': iname,
                '组织机构代码':cardnumber,
                '执行法院':court_name,
                '省份':area_name,
                '执行依据文号':case_code,
                '立案时间':reg_date,
                '案号':gist_cid,
                '做出执行依据单位':gist_unit,
                '生效法律文书确定的义务':duty,
                '被执行人的履行情况':performance,
                '失信被执行人行为具体情形':disreput_type_name,
                '发布时间':publish_date,
                '已履行部分':performed_part,
                '未履行部分':unperform_part,
                '数据来源':dataSource
            }
            list_dishonesty.append(dic_dishonesty)
    return list_dishonesty
# 行政处罚
def punish(headers,com_name,social_code):
    list_punish = []
    param = {
        'tableName':'credit_xyzx_fr_xzcf_new',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
...@@ -106,15 +126,16 @@ def punish():
    if total_size > 0:
        pass
    else:
        log.info(f'该企业{com_name}无行政处罚信息')
        return list_punish
    for page in range(1,total_size+1):
        param_page = {
            'tableName': 'credit_xyzx_fr_xzcf_new',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...@@ -141,6 +162,88 @@ def punish():
            cf_sjly = entity['cf_sjly']  # 数据来源
            cf_sjlydm = entity['cf_sjlydm']  # 数据来源单位统一社会信用代码
            dic_punish = {
                '行政处罚决定书文号':cf_wsh,
                '处罚类别':cf_cflb,
                '处罚决定日期':cf_jdrq,
                '处罚内容':cf_nr,
                '罚款金额(万元)':cf_nr_fk,
                '没收违法所得、没收非法财物的金额(万元)':cf_nr_wfff,
                '暂扣或吊销证照名称及编号':cf_nr_zkdx,
                '违法行为类型':cf_wfxw,
                '违法事实':cf_sy,
                '处罚依据':cf_yj,
                '处罚机关':cf_cfjg,
                '处罚机关统一社会信用代码':cf_cfjgdm,
                '数据来源':cf_sjly,
                '数据来源单位统一社会信用代码':cf_sjlydm
            }
            list_punish.append(dic_punish)
    return list_punish
# 经营异常
def abnormal(headers,com_name,social_code):
    list_abnormal = []
    param = {
        'tableName': 'credit_scjdglzj_fr_ycjyml',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
    url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page=1&pageSize=10'
    json_data = getRequest(url, headers)
    # print(json_data)
    if json_data['status'] == 1:
        pass
    # 总条数
    total_size = json_data['data']['totalSize']
    if total_size > 0:
        pass
    else:
        log.info(f'该企业{com_name}无经营异常信息')
        return list_abnormal
    for page in range(1,total_size+1):
        param_page = {
            'tableName': 'credit_scjdglzj_fr_ycjyml',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
        json_data = getRequest(url, headers)
        if json_data['status'] == 1:
            pass
        info_list = json_data['data']['list']
        for entity in info_list:
            entname = entity['entname']  # 企业名称
            uniscid = entity['uniscid']  # 社会统一信用代码
            lerep = entity['lerep']  # 法定代表人
            pripid = entity['pripid']  # 主体身份代码
            regno = entity['regno']  # 注册号
            specausename = entity['specausename']  # 列入经营异常名录原因类型名称
            abntime = entity['abntime']  # 设立日期
            decorgname = entity['decorgname']  # 列入决定机关名称
            dataSource = entity['dataSource']  # 数据来源
            dic_abnormal = {
                '企业名称':entname,
                '社会统一信用代码':uniscid,
                '法定代表人':lerep,
                '主体身份代码':pripid,
                '注册号':regno,
                '列入经营异常名录原因类型名称':specausename,
                '设立日期':abntime,
                '列入决定机关名称':decorgname,
                '数据来源':dataSource
            }
            list_abnormal.append(dic_abnormal)
    return list_abnormal
if __name__=='__main__':
...@@ -154,16 +257,18 @@ if __name__=='__main__':
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    com_name = '石家庄交投集团工程服务有限责任公司'
    social_code = '91130100MA7EK14C8L'
    # list_dishonesty = dishonesty(headers,com_name,social_code)
    # print(list_dishonesty)
    list_punish = punish(headers,com_name,social_code)
    print(list_punish)
    # abnormal(headers,com_name,social_code)

    # 报告链接
    # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
    # report_json = getRequest(url_report, headers)
    # reportNumber = report_json['data']['reportNumber']
    # pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
    # respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30)
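A hedged sketch of how the commented-out report download above could be completed. It relies only on the commented lines (reportNumber in the JSON, PDF bytes from the OBS endpoint) and assumes they behave as written; the output filename is illustrative.

    # Hedged sketch, not part of the committed code; endpoint behaviour is assumed.
    url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
    report_json = getRequest(url_report, headers)
    reportNumber = report_json['data']['reportNumber']
    pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
    respon = requests.get(url=pdf_url, headers=headers, verify=False, timeout=30)
    with open(f'{com_name}_信用报告.pdf', 'wb') as f:
        f.write(respon.content)  # assumes the response body is the PDF bytes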
......
...@@ -58,8 +58,8 @@ class Tycdt(object):
    def doJob(self):
        while True:
            # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
            social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
            # social_code = '913205002517479347'
            # 判断 如果Redis中已经没有数据,则等待
            if social_code == None:
                time.sleep(20)
......
...@@ -50,7 +50,7 @@ if __name__=="__main__":
    opt.add_experimental_option("excludeSwitches", ["enable-automation"])
    opt.add_experimental_option('excludeSwitches', ['enable-logging'])
    opt.add_experimental_option('useAutomationExtension', False)
    opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    chromedriver = r'D:\cmd100\chromedriver.exe'
    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    url = "https://mp.weixin.qq.com/"
......
import datetime
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
def sendKafka(dic_news):
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
        kafka_result = producer.send("crawlerInfo",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False
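sendKafka constructs a new KafkaProducer on every call. A hedged alternative sketch (not the committed code) that builds one producer at module load and reuses it; the names here are illustrative:

    # Hypothetical variant: reuse a single shared producer instead of one per message.
    producer_shared = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                                    max_request_size=1024 * 1024 * 20)

    def sendKafkaShared(dic_news):
        try:
            kafka_result = producer_shared.send("crawlerInfo",
                                                json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
            kafka_result.get(timeout=10)
            log.info({'success': 'true', 'message': '操作成功', 'code': '200'})
            return True
        except Exception as e:
            log.info({'success': 'false', 'message': '操作失败', 'code': '204', 'e': str(e)})
            return False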
def getRequest(url,headers):
    req = requests.get(url=url, headers=headers, timeout=30)
    if req.status_code == 200:
        pass
    soup = BeautifulSoup(req.content, 'html.parser')
    return soup

def deletep(soup,attribute_to_delete,value_to_delete):
    # 查找带有指定属性的P标签并删除
    p_tags = soup.find_all('p', {attribute_to_delete: value_to_delete})
    for p_tag in p_tags:
        p_tag.decompose()

def deletek(soup):
    # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text()==' '):
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                break
        else:
            i.decompose()

# 将html中的相对地址转换成绝对地址
def paserUrl(html, listurl):
    # 获取所有的<a>标签和<img>标签
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')
    links = html.find_all(['a', 'img'])
    # 遍历标签,将相对地址转换为绝对地址
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
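A minimal usage sketch for paserUrl with a hypothetical relative-link fragment, showing how urljoin resolves href/src against the listing URL:

    # Hypothetical input resolved against the 求是 listing page.
    html_fragment = '<p><a href="/qs/mulu.htm">目录</a><img src="images/cover.png"/></p>'
    soup_abs = paserUrl(html_fragment, 'http://www.qstheory.cn/qs/mulu.htm')
    print(soup_abs)
    # href becomes http://www.qstheory.cn/qs/mulu.htm
    # src  becomes http://www.qstheory.cn/qs/images/cover.png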
if __name__=='__main__':
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Cookie':'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
        'Host':'www.qstheory.cn',
        'Proxy-Connection':'keep-alive',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = 'http://www.qstheory.cn/qs/mulu.htm'
    soup_report = getRequest(url,headers)
    report_list = soup_report.find_all('div', class_='col-sm-3')
    for book in report_list:
        href = book.find('div', class_='booktitle').find('a')['href']
        year = book.find('div', class_='booktitle').find('a').text
        soup_href = getRequest(href,headers)
        period = soup_href.find('div', class_='highlight')
        deletep(period,'align','center')
        deletek(period)
        period_list = period.find_all('p')
        for p in period_list:
            period_href = p.find('a')['href']
            period_title = p.find('a').text
            soup_news = getRequest(period_href,headers)
            deletep(soup_news, 'align', 'center')
            deletek(soup_news)
            title_list = soup_news.select('div[class="highlight"]>p')[1:]
            for new in title_list:
                try:
                    deletek(new)
                    try:
                        author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
                    except:
                        continue
                    if len(author)>4:
                        continue
                    # if '(' in author or '本刊' in author or '国家' in author\
                    #         or '中共' in author or '记者' in author or '新闻社' in author\
                    #         or '党委' in author or '调研组' in author or '研究中心' in author\
                    #         or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
                    if '(' in author or '本刊' in author or '国家' in author \
                            or '中共' in author or '记者' in author or '新闻社' in author \
                            or '党委' in author or '”' in author \
                            or '大学' in author or '洛桑江村' in author:
                        continue
                    new_href = new.find('a')['href']
                    is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
                    if is_member:
                        continue
                    new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
                except:
                    continue
                soup_new = getRequest(new_href,headers)
                deletek(soup_new)
                deletep(soup_new, 'style', 'TEXT-ALIGN: center')
                result = soup_new.find('div', class_='inner')
                if result:
                    pass
                else:
                    continue
                span_list = result.find_all('span')
                source = span_list[0].text.replace('来源:', '').strip('\r\n')
                pub_time = span_list[2].text.strip('\r\n')
                content = soup_new.find('div', class_='highlight').text
                paserUrl(soup_new, new_href)
                contentWithTag = soup_new.find('div', class_='highlight')
                nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dic_news = {
                    'sid': '1716996740019585025',
                    'title': new_title,
                    'source': "16",
                    'origin': source,
                    'author': author,
                    'publishDate': pub_time,
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'sourceAddress': new_href,
                    "createDate": nowDate
                }
                # log.info(dic_news)
                if sendKafka(dic_news):
                    r.sadd('qiushileaderspeech::' + period_title, new_href)
                    log.info(f'采集成功----{dic_news["sourceAddress"]}')