Commit 4e84d611 Author: 薛凌堃

10/26

Parent c2749092
# -*- coding: utf-8 -*-
import sys
import pandas as pd
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from base.smart.entity import *
from base.smart.smart_extractor_utility import SmartExtractorUtility
sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
......@@ -135,6 +138,16 @@ class SmartExtractor:
return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
"""
按HTML采集内容
"""
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
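# Usage sketch (illustrative): extract_by_html is driven the same way beinWork()
# uses it further below; `lang` is assumed to come from baseCore.detect_language(title).
#   smart = SmartExtractor(lang)
#   raw_html = requests.get(url, headers=headers, timeout=8, verify=False).text
#   article = smart.extract_by_html(raw_html)
#   contentText = article.text          # 带标签正文
#   content = article.cleaned_text      # 不带标签正文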
#url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
def extract_by_url_test(url_list,list_info_all):
# 测试:按URL采集
......
# 根据信用代码获取天眼查id
import json
import random
import sys
import time
import pymysql
import requests
from base.BaseCore import BaseCore
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
......
......@@ -6,11 +6,12 @@ import requests, time, pymysql
import jieba
import sys
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
# from base.BaseCore import BaseCore
# from base.smart import smart_extractor
sys.path.append('D:\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
from smart import smart_extractor
import urllib3
......@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor
taskType = '企业动态/天眼查/补采20W+'
def reqDetailmsg(url,headers):
# proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
for i in range(0,1):
try:
response=requests.get(url=url,headers=headers,timeout=8,verify=False)
response.encoding = response.apparent_encoding
htmltext=response.text
except Exception as e:
htmltext=''
log.info(f'{url}---详情请求失败--{e}')
if htmltext:
log.info(f'{url}---详情请求成功')
break
return htmltext
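# Illustrative retry variant (a sketch, not the committed helper): the loop in
# reqDetailmsg above uses range(0, 1) and therefore makes a single attempt; the
# same break-on-success structure generalizes to several attempts with a short
# pause. `attempts=3` and the 2-second sleep are assumed values.
def reqDetailmsgWithRetry(url, headers, attempts=3):
    htmltext = ''
    for i in range(attempts):
        try:
            response = requests.get(url=url, headers=headers, timeout=8, verify=False)
            response.encoding = response.apparent_encoding
            htmltext = response.text
        except Exception as e:
            htmltext = ''
            log.info(f'{url}---详情请求失败--{e}')
        if htmltext:
            log.info(f'{url}---详情请求成功')
            break
        # 请求失败时稍作等待后重试
        time.sleep(2)
    return htmltext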
def beinWork(tyc_code, social_code,start_time):
time.sleep(3)
......@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time):
# 开始进行智能解析
# lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang)
#带标签正文
contentText = smart.extract_by_url(link).text
#不带标签正文
content = smart.extract_by_url(link).cleaned_text
# time.sleep(3)
# req = requests.get(url=link,headers=headers,timeout=10)
# html = BeautifulSoup(req.content,'html.parser')
raw_html = reqDetailmsg(link,headers)
if raw_html:
# soup = BeautifulSoup(raw_html, 'html.parser')
try:
article = smart.extract_by_html(raw_html)
content = article.cleaned_text
contentText = article.text
except Exception as e:
log.info(f'抽取失败!!{e}')
# #带标签正文
# contentText = smart.extract_by_url(link).text
# #不带标签正文
# content = smart.extract_by_url(link).cleaned_text
# # time.sleep(3)
except Exception as e:
contentText = ''
if contentText == '':
log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败'
......@@ -281,7 +312,7 @@ def doJob():
while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
#social_code = '91440300665899831W'
# social_code = '913205007764477744'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
time.sleep(20)
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num):
'sid': '1684032033495392257',
'sourceAddress': year_url, # 原文链接
'summary': '',
'title': name_pdf.replace(',pdf', ''),
'title': name_pdf.replace('.pdf', ''),
'type': 1,
'socialCreditCode': social_code,
'year': year
......@@ -260,7 +260,7 @@ if __name__ == '__main__':
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '913412007050444417'
social_code = '91330000734507783B'
if not social_code:
time.sleep(20)
continue
......
......@@ -33,13 +33,14 @@ def getRequest(url,headers):
return json_data
# 严重失信
def dishonesty():
def dishonesty(headers,com_name,social_code):
list_dishonesty = []
param = {
'tableName':'credit_zgf_fr_sxbzxr',
'searchState': '1',
'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司',
'tyshxydm': '91440882315032592M',
'keyword': com_name,
'tyshxydm': social_code,
'page': '1',
'pageSize': '10'
}
......@@ -50,14 +51,14 @@ def dishonesty():
if json_data['status'] == 1:
pass
total_size = json_data['data']['totalSize']
for page in total_size:
for page in range(1,total_size+1):
param_page = {
'tableName': 'credit_zgf_fr_sxbzxr',
'searchState': '1',
'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司',
'tyshxydm': '91440882315032592M',
'page': f'{page}',
'keyword': com_name,
'tyshxydm': social_code,
'page': page,
'pageSize': '10'
}
url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
......@@ -67,7 +68,7 @@ def dishonesty():
pass
info_list = json_data['data']['list']
for info in info_list:
entity = info['entity']
entity = info
iname = entity['iname'] # 失信被执行人姓名/名称
cardnumber = entity['cardnumber'] # 组织机构代码
court_name = entity['court_name'] # 执行法院
......@@ -83,15 +84,34 @@ def dishonesty():
performed_part = entity['performed_part'] # 已履行部分
unperform_part = entity['unperform_part'] # 未履行部分
dataSource = info['dataSource'] # 数据来源
dic_dishonesty = {
'失信被执行人姓名/名称': iname,
'组织机构代码':cardnumber,
'执行法院':court_name,
'省份':area_name,
'执行依据文号':case_code,
'立案时间':reg_date,
'案号':gist_cid,
'做出执行依据单位':gist_unit,
'生效法律文书确定的义务':duty,
'被执行人的履行情况':performance,
'失信被执行人行为具体情形':disreput_type_name,
'发布时间':publish_date,
'已履行部分':performed_part,
'未履行部分':unperform_part,
'数据来源':dataSource
}
list_dishonesty.append(dic_dishonesty)
return list_dishonesty
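# Call sketch (illustrative; the company name and credit code are the ones used
# in the __main__ block further below):
#   records = dishonesty(headers, '石家庄交投集团工程服务有限责任公司', '91130100MA7EK14C8L')
#   for rec in records:
#       print(rec['失信被执行人姓名/名称'], rec['立案时间'], rec['执行法院'])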
# 行政处罚
def punish():
def punish(headers,com_name,social_code):
list_punish = []
param = {
'tableName':'credit_xyzx_fr_xzcf_new',
'searchState': '1',
'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司',
'tyshxydm': '91440882315032592M',
'keyword': com_name,
'tyshxydm': social_code,
'page': '1',
'pageSize': '10'
}
......@@ -106,15 +126,16 @@ def punish():
if total_size > 0:
pass
else:
log.info()
for page in total_size:
log.info(f'该企业{com_name}无行政处罚信息')
return list_punish
for page in range(1,total_size+1):
param_page = {
'tableName': 'credit_xyzx_fr_xzcf_new',
'searchState': '1',
'scenes': 'defaultscenario',
'keyword': '雷州市白金银座演艺文化实业有限公司',
'tyshxydm': '91440882315032592M',
'page': f'{page}',
'keyword': com_name,
'tyshxydm': social_code,
'page': page,
'pageSize': '10'
}
url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
......@@ -141,6 +162,88 @@ def punish():
cf_sjly = entity['cf_sjly'] # 数据来源
cf_sjlydm = entity['cf_sjlydm'] # 数据来源单位统一社会信用代码
dic_punish = {
'行政处罚决定书文号':cf_wsh,
'处罚类别':cf_cflb,
'处罚决定日期':cf_jdrq,
'处罚内容':cf_nr,
'罚款金额(万元)':cf_nr_fk,
'没收违法所得、没收非法财物的金额(万元)':cf_nr_wfff,
'暂扣或吊销证照名称及编号':cf_nr_zkdx,
'违法行为类型':cf_wfxw,
'违法事实':cf_sy,
'处罚依据':cf_yj,
'处罚机关':cf_cfjg,
'处罚机关统一社会信用代码':cf_cfjgdm,
'数据来源':cf_sjly,
'数据来源单位统一社会信用代码':cf_sjlydm
}
list_punish.append(dic_punish)
return list_punish
# 经营异常
def abnormal(headers,com_name,social_code):
list_abhormal = []
param = {
'tableName': 'credit_scjdglzj_fr_ycjyml',
'searchState': '1',
'scenes': 'defaultscenario',
'keyword': com_name,
'tyshxydm': social_code,
'page': '1',
'pageSize': '10'
}
url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page=1&pageSize=10'
json_data = getRequest(url, headers)
# print(json_data)
if json_data['status'] == 1:
pass
# 总条数
total_size = json_data['data']['totalSize']
if total_size > 0:
pass
else:
log.info(f'该企业{com_name}无经营异常信息')
return list_abhormal
for page in range(1,total_size+1):
param_page = {
'tableName': 'credit_scjdglzj_fr_ycjyml',
'searchState': '1',
'scenes': 'defaultscenario',
'keyword': com_name,
'tyshxydm': social_code,
'page': page,
'pageSize': '10'
}
url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
json_data = getRequest(url, headers)
if json_data['status'] == 1:
pass
info_list = json_data['data']['list']
for entity in info_list:
entname = entity['entname'] # 企业名称
uniscid = entity['uniscid'] # 社会统一信用代码
lerep = entity['lerep'] # 法定代表人
pripid = entity['pripid'] # 主体身份代码
regno = entity['regno'] # 注册号
specausename = entity['specausename'] # 列入经营异常名录原因类型名称
abntime = entity['abntime'] # 设立日期
decorgname = entity['decorgname'] # 列入决定机关名称
dataSource = entity['dataSource'] # 数据来源
dic_abnormal = {
'企业名称':entname,
'社会统一信用代码':uniscid,
'法定代表人':lerep,
'主体身份代码':pripid,
'注册号':regno,
'列入经营异常名录原因类型名称':specausename,
'设立日期':abntime,
'列入决定机关名称':decorgname,
'数据来源':dataSource
}
list_abhormal.append(dic_abnormal)
return list_abhormal
if __name__=='__main__':
......@@ -154,16 +257,18 @@ if __name__=='__main__':
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
type_list = ['严重失信主体名单','行政管理']
com_name = ''
social_code = ''
dishonesty()
punish()
com_name = '石家庄交投集团工程服务有限责任公司'
social_code = '91130100MA7EK14C8L'
# list_dishonesty = dishonesty(headers,com_name,social_code)
# print(list_dishonesty)
list_punish = punish(headers,com_name,social_code)
print(list_punish)
# abnormal(headers,com_name,social_code)
# 报告链接
url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
report_json = getRequest(url_report, headers)
reportNumber = report_json['data']['reportNumber']
pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
# url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
# report_json = getRequest(url_report, headers)
# reportNumber = report_json['data']['reportNumber']
# pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
# respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30)
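# Sketch of the report download flow that the commented-out lines above describe;
# saving to a local PDF file is an assumption added for illustration:
#   url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
#   report_json = getRequest(url_report, headers)
#   reportNumber = report_json['data']['reportNumber']
#   pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
#   respon = requests.get(url=pdf_url, headers=headers, verify=False, timeout=30)
#   with open(f'{com_name}_信用报告.pdf', 'wb') as f:
#       f.write(respon.content)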
......
......@@ -58,8 +58,8 @@ class Tycdt(object):
def doJob(self):
while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
social_code = '913205002517479347'
social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
# social_code = '913205002517479347'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
time.sleep(20)
......
......@@ -50,7 +50,7 @@ if __name__=="__main__":
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = "https://mp.weixin.qq.com/"
......
import datetime
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
def sendKafka(dic_news):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
kafka_result = producer.send("crawlerInfo",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
log.info(dic_result)
return False
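# Usage sketch (mirrors how __main__ below pairs sendKafka with the Redis dedup set):
#   if sendKafka(dic_news):
#       r.sadd('qiushileaderspeech::' + period_title, dic_news['sourceAddress'])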
def getRequest(url,headers):
req = requests.get(url=url, headers=headers, timeout=30)
if req.status_code == 200:
pass
soup = BeautifulSoup(req.content, 'html.parser')
return soup
def deletep(soup,attribute_to_delete,value_to_delete):
# 查找带有指定属性的P标签并删除
p_tags = soup.find_all('p', {attribute_to_delete: value_to_delete})
for p_tag in p_tags:
p_tag.decompose()
def deletek(soup):
# 删除空白标签(例如<p></p>、<p><br></p>),img、video、br 除外
for i in soup.find_all(lambda tag: (len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"]) or tag.get_text() == ' '):
for j in i.descendants:
if j.name in ["img", "video", "br"]:
break
else:
i.decompose()
# 将html中的相对地址转换成绝对地址
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
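# Usage sketch (the same pattern appears in __main__ below): make every href/src
# in the detail page absolute before the tagged content is stored.
#   soup_new = getRequest(new_href, headers)
#   paserUrl(soup_new, new_href)   # e.g. src="/images/a.png" -> "http://www.qstheory.cn/images/a.png" (illustrative path)
#   contentWithTag = soup_new.find('div', class_='highlight')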
if __name__=='__main__':
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'max-age=0',
'Cookie':'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
'Host':'www.qstheory.cn',
'Proxy-Connection':'keep-alive',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}
url = 'http://www.qstheory.cn/qs/mulu.htm'
soup_report = getRequest(url,headers)
report_list = soup_report.find_all('div', class_='col-sm-3')
for book in report_list:
href = book.find('div', class_='booktitle').find('a')['href']
year = book.find('div', class_='booktitle').find('a').text
soup_href = getRequest(href,headers)
period = soup_href.find('div', class_='highlight')
deletep(period,'align','center')
deletek(period)
period_list = period.find_all('p')
for p in period_list:
period_href = p.find('a')['href']
period_title = p.find('a').text
soup_news = getRequest(period_href,headers)
deletep(soup_news, 'align', 'center')
deletek(soup_news)
title_list = soup_news.select('div[class="highlight"]>p')[1:]
for new in title_list:
try:
deletek(new)
try:
author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
except:
continue
if len(author)>4:
continue
# if '(' in author or '本刊' in author or '国家' in author\
# or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
if '(' in author or '本刊' in author or '国家' in author \
or '中共' in author or '记者' in author or '新闻社' in author \
or '党委' in author or '”' in author\
or '大学' in author or '洛桑江村' in author:
continue
new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
if is_member:
continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
except:
continue
soup_new = getRequest(new_href,headers)
deletek(soup_new)
deletep(soup_new, 'style', 'TEXT-ALIGN: center')
result = soup_new.find('div', class_='inner')
if result:
pass
else:
continue
span_list = result.find_all('span')
source = span_list[0].text.replace('来源:', '').strip('\r\n')
pub_time = span_list[2].text.strip('\r\n')
content = soup_new.find('div', class_='highlight').text
paserUrl(soup_new, new_href)
contentWithTag = soup_new.find('div', class_='highlight')
nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
dic_news = {
'sid': '1716996740019585025',
'title': new_title,
'source': "16",
'origin': source,
'author': author,
'publishDate': pub_time,
'content': content,
'contentWithTag': str(contentWithTag),
'sourceAddress': new_href,
"createDate": nowDate
}
# log.info(dic_news)
if sendKafka(dic_news):
r.sadd('qiushileaderspeech::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}')