Commit 4e84d611  Author: 薛凌堃

10/26

Parent c2749092
# -*- coding: utf-8 -*-
import sys
import pandas as pd
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
...@@ -135,6 +138,16 @@ class SmartExtractor:
        return self.get_extraction_result(article, link_text)

    def extract_by_html(self, html, link_text=''):
        """
        按HTML采集内容
        """
        # 采集正文:传入html
        article = self.goose.extract(raw_html=html)
        return self.get_extraction_result(article, link_text)
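A minimal usage sketch for the new extract_by_html path: fetch the page yourself, then hand the raw HTML to goose. This is hedged — it assumes the SmartExtractor constructor takes a language code, as the commented-out smart_extractor.SmartExtractor(lang) call further down in this commit suggests, and it reuses the sample URL from the test comment below.

    # Hedged sketch, not part of the committed code.
    # Assumes SmartExtractor('cn') builds a Chinese-language extractor.
    import requests

    extractor = SmartExtractor('cn')
    resp = requests.get('https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023', timeout=10)
    resp.encoding = resp.apparent_encoding
    result = extractor.extract_by_html(resp.text, link_text='')
    print(result.cleaned_text)  # 不带标签正文
    print(result.text)          # 带标签正文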
# url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
def extract_by_url_test(url_list,list_info_all):
    # 测试:按URL采集
......
# 根据信用代码获取天眼查id
import json
import random
import sys
import time
import pymysql
import requests
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'Accept': 'application/json, text/plain, */*',
......
...@@ -6,11 +6,12 @@ import requests, time, pymysql
import jieba
import sys
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
# from base.BaseCore import BaseCore
# from base.smart import smart_extractor
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
from smart import smart_extractor
import urllib3
...@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor
taskType = '企业动态/天眼查/补采20W+'
def reqDetailmsg(url,headers):
    # proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    for i in range(0,1):
        try:
            response=requests.get(url=url,headers=headers,timeout=8,verify=False)
            response.encoding = response.apparent_encoding
            htmltext=response.text
        except Exception as e:
            htmltext=''
            log.info(f'{url}---详情请求失败--{e}')
        if htmltext:
            log.info(f'{url}---详情请求成功')
            break
    return htmltext
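As committed, the loop above makes a single attempt (range(0,1)). A hedged sketch of a multi-attempt variant with a short pause between tries, not part of this commit; the retries parameter and sleep interval are illustrative:

    def reqDetailmsg_retry(url, headers, retries=3):
        # Hypothetical variant: try up to `retries` times before giving up.
        htmltext = ''
        for i in range(retries):
            try:
                response = requests.get(url=url, headers=headers, timeout=8, verify=False)
                response.encoding = response.apparent_encoding
                htmltext = response.text
            except Exception as e:
                htmltext = ''
                log.info(f'{url}---详情请求失败--{e}')
            if htmltext:
                log.info(f'{url}---详情请求成功')
                break
            time.sleep(2)  # brief pause between attempts
        return htmltext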
def beinWork(tyc_code, social_code,start_time):
    time.sleep(3)
...@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time):
    # 开始进行智能解析
    # lang = baseCore.detect_language(title)
    # smart = smart_extractor.SmartExtractor(lang)
    # req = requests.get(url=link,headers=headers,timeout=10)
    # html = BeautifulSoup(req.content,'html.parser')
    raw_html = reqDetailmsg(link,headers)
    if raw_html:
        # soup = BeautifulSoup(raw_html, 'html.parser')
        try:
            article = smart.extract_by_html(raw_html)
            content = article.cleaned_text
            contentText = article.text
        except Exception as e:
            log.info(f'抽取失败!!{e}')
    # #带标签正文
    # contentText = smart.extract_by_url(link).text
    # #不带标签正文
    # content = smart.extract_by_url(link).cleaned_text
    # # time.sleep(3)
except Exception as e:
    contentText = ''
if contentText == '':
    log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
    e = '获取正文失败'
...@@ -281,7 +312,7 @@ def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
        # social_code = '913205007764477744'
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == None:
            time.sleep(20)
......
# -*- coding: utf-8 -*-
...@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num):
    'sid': '1684032033495392257',
    'sourceAddress': year_url,  # 原文链接
    'summary': '',
    'title': name_pdf.replace('.pdf', ''),
    'type': 1,
    'socialCreditCode': social_code,
    'year': year
...@@ -260,7 +260,7 @@ if __name__ == '__main__':
    start_time = time.time()
    # 获取企业信息
    # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
    social_code = '91330000734507783B'
    if not social_code:
        time.sleep(20)
        continue
......
...@@ -33,13 +33,14 @@ def getRequest(url,headers):
    return json_data

# 严重失信
def dishonesty(headers,com_name,social_code):
    list_dishonesty = []
    param = {
        'tableName':'credit_zgf_fr_sxbzxr',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
...@@ -50,14 +51,14 @@ def dishonesty():
    if json_data['status'] == 1:
        pass
    total_size = json_data['data']['totalSize']
    for page in range(1,total_size+1):
        param_page = {
            'tableName': 'credit_zgf_fr_sxbzxr',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...@@ -67,7 +68,7 @@ def dishonesty():
            pass
        info_list = json_data['data']['list']
        for info in info_list:
            entity = info
            iname = entity['iname']  # 失信被执行人姓名/名称
            cardnumber = entity['cardnumber']  # 组织机构代码
            court_name = entity['court_name']  # 执行法院
...@@ -83,15 +84,34 @@ def dishonesty():
            performed_part = entity['performed_part']  # 已履行部分
            unperform_part = entity['unperform_part']  # 未履行部分
            dataSource = info['dataSource']  # 数据来源
            dic_dishonesty = {
                '失信被执行人姓名/名称': iname,
                '组织机构代码':cardnumber,
                '执行法院':court_name,
                '省份':area_name,
                '执行依据文号':case_code,
                '立案时间':reg_date,
                '案号':gist_cid,
                '做出执行依据单位':gist_unit,
                '生效法律文书确定的义务':duty,
                '被执行人的履行情况':performance,
                '失信被执行人行为具体情形':disreput_type_name,
                '发布时间':publish_date,
                '已履行部分':performed_part,
                '未履行部分':unperform_part,
                '数据来源':dataSource
            }
            list_dishonesty.append(dic_dishonesty)
    return list_dishonesty
# 行政处罚
def punish(headers,com_name,social_code):
    list_punish = []
    param = {
        'tableName':'credit_xyzx_fr_xzcf_new',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
...@@ -106,15 +126,16 @@ def punish():
    if total_size > 0:
        pass
    else:
        log.info(f'该企业{com_name}无行政处罚信息')
        return list_punish
    for page in range(1,total_size+1):
        param_page = {
            'tableName': 'credit_xyzx_fr_xzcf_new',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url_page = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...@@ -141,6 +162,88 @@ def punish():
            cf_sjly = entity['cf_sjly']  # 数据来源
            cf_sjlydm = entity['cf_sjlydm']  # 数据来源单位统一社会信用代码
            dic_punish = {
                '行政处罚决定书文号':cf_wsh,
                '处罚类别':cf_cflb,
                '处罚决定日期':cf_jdrq,
                '处罚内容':cf_nr,
                '罚款金额(万元)':cf_nr_fk,
                '没收违法所得、没收非法财物的金额(万元)':cf_nr_wfff,
                '暂扣或吊销证照名称及编号':cf_nr_zkdx,
                '违法行为类型':cf_wfxw,
                '违法事实':cf_sy,
                '处罚依据':cf_yj,
                '处罚机关':cf_cfjg,
                '处罚机关统一社会信用代码':cf_cfjgdm,
                '数据来源':cf_sjly,
                '数据来源单位统一社会信用代码':cf_sjlydm
            }
            list_punish.append(dic_punish)
    return list_punish
# 经营异常
def abnormal(headers,com_name,social_code):
    list_abnormal = []
    param = {
        'tableName': 'credit_scjdglzj_fr_ycjyml',
        'searchState': '1',
        'scenes': 'defaultscenario',
        'keyword': com_name,
        'tyshxydm': social_code,
        'page': '1',
        'pageSize': '10'
    }
    url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page=1&pageSize=10'
    json_data = getRequest(url, headers)
    # print(json_data)
    if json_data['status'] == 1:
        pass
    # 总条数
    total_size = json_data['data']['totalSize']
    if total_size > 0:
        pass
    else:
        log.info(f'该企业{com_name}无经营异常信息')
        return list_abnormal
    for page in range(1,total_size+1):
        param_page = {
            'tableName': 'credit_scjdglzj_fr_ycjyml',
            'searchState': '1',
            'scenes': 'defaultscenario',
            'keyword': com_name,
            'tyshxydm': social_code,
            'page': page,
            'pageSize': '10'
        }
        url = f'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
        json_data = getRequest(url, headers)
        if json_data['status'] == 1:
            pass
        info_list = json_data['data']['list']
        for entity in info_list:
            entname = entity['entname']  # 企业名称
            uniscid = entity['uniscid']  # 社会统一信用代码
            lerep = entity['lerep']  # 法定代表人
            pripid = entity['pripid']  # 主体身份代码
            regno = entity['regno']  # 注册号
            specausename = entity['specausename']  # 列入经营异常名录原因类型名称
            abntime = entity['abntime']  # 设立日期
            decorgname = entity['decorgname']  # 列入决定机关名称
            dataSource = entity['dataSource']  # 数据来源
            dic_abnormal = {
                '企业名称':entname,
                '社会统一信用代码':uniscid,
                '法定代表人':lerep,
                '主体身份代码':pripid,
                '注册号':regno,
                '列入经营异常名录原因类型名称':specausename,
                '设立日期':abntime,
                '列入决定机关名称':decorgname,
                '数据来源':dataSource
            }
            list_abnormal.append(dic_abnormal)
    return list_abnormal
if __name__=='__main__':
...@@ -154,16 +257,18 @@ if __name__=='__main__':
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    com_name = '石家庄交投集团工程服务有限责任公司'
    social_code = '91130100MA7EK14C8L'
    # list_dishonesty = dishonesty(headers,com_name,social_code)
    # print(list_dishonesty)
    list_punish = punish(headers,com_name,social_code)
    print(list_punish)
    # abnormal(headers,com_name,social_code)

    # 报告链接
    # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
    # report_json = getRequest(url_report, headers)
    # reportNumber = report_json['data']['reportNumber']
    # pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
    # respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30)
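A hedged sketch of how the commented-out report download above could be completed. It relies only on the commented lines (reportNumber in the JSON, PDF bytes from the OBS endpoint) and assumes they behave as written; the output filename is illustrative.

    # Hedged sketch, not part of the committed code; endpoint behaviour is assumed.
    url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
    report_json = getRequest(url_report, headers)
    reportNumber = report_json['data']['reportNumber']
    pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
    respon = requests.get(url=pdf_url, headers=headers, verify=False, timeout=30)
    with open(f'{com_name}_信用报告.pdf', 'wb') as f:
        f.write(respon.content)  # assumes the response body is the PDF bytes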
......
...@@ -58,8 +58,8 @@ class Tycdt(object):
    def doJob(self):
        while True:
            # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
            social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
            # social_code = '913205002517479347'
            # 判断 如果Redis中已经没有数据,则等待
            if social_code == None:
                time.sleep(20)
......
...@@ -50,7 +50,7 @@ if __name__=="__main__":
    opt.add_experimental_option("excludeSwitches", ["enable-automation"])
    opt.add_experimental_option('excludeSwitches', ['enable-logging'])
    opt.add_experimental_option('useAutomationExtension', False)
    opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    chromedriver = r'D:\cmd100\chromedriver.exe'
    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    url = "https://mp.weixin.qq.com/"
......
import datetime
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from kafka import KafkaProducer
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
def sendKafka(dic_news):
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
        kafka_result = producer.send("crawlerInfo",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False
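sendKafka constructs a new KafkaProducer on every call. A hedged alternative sketch (not the committed code) that builds one producer at module load and reuses it; the names here are illustrative:

    # Hypothetical variant: reuse a single shared producer instead of one per message.
    producer_shared = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                                    max_request_size=1024 * 1024 * 20)

    def sendKafkaShared(dic_news):
        try:
            kafka_result = producer_shared.send("crawlerInfo",
                                                json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
            kafka_result.get(timeout=10)
            log.info({'success': 'true', 'message': '操作成功', 'code': '200'})
            return True
        except Exception as e:
            log.info({'success': 'false', 'message': '操作失败', 'code': '204', 'e': str(e)})
            return False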
def getRequest(url,headers):
    req = requests.get(url=url, headers=headers, timeout=30)
    if req.status_code == 200:
        pass
    soup = BeautifulSoup(req.content, 'html.parser')
    return soup

def deletep(soup,attribute_to_delete,value_to_delete):
    # 查找带有指定属性的P标签并删除
    p_tags = soup.find_all('p', {attribute_to_delete: value_to_delete})
    for p_tag in p_tags:
        p_tag.decompose()

def deletek(soup):
    # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text()==' '):
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                break
        else:
            i.decompose()

# 将html中的相对地址转换成绝对地址
def paserUrl(html, listurl):
    # 获取所有的<a>标签和<img>标签
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')
    links = html.find_all(['a', 'img'])
    # 遍历标签,将相对地址转换为绝对地址
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
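A minimal usage sketch for paserUrl with a hypothetical relative-link fragment, showing how urljoin resolves href/src against the listing URL:

    # Hypothetical input resolved against the 求是 listing page.
    html_fragment = '<p><a href="/qs/mulu.htm">目录</a><img src="images/cover.png"/></p>'
    soup_abs = paserUrl(html_fragment, 'http://www.qstheory.cn/qs/mulu.htm')
    print(soup_abs)
    # href becomes http://www.qstheory.cn/qs/mulu.htm
    # src  becomes http://www.qstheory.cn/qs/images/cover.png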
if __name__=='__main__':
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Cookie':'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
        'Host':'www.qstheory.cn',
        'Proxy-Connection':'keep-alive',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = 'http://www.qstheory.cn/qs/mulu.htm'
    soup_report = getRequest(url,headers)
    report_list = soup_report.find_all('div', class_='col-sm-3')
    for book in report_list:
        href = book.find('div', class_='booktitle').find('a')['href']
        year = book.find('div', class_='booktitle').find('a').text
        soup_href = getRequest(href,headers)
        period = soup_href.find('div', class_='highlight')
        deletep(period,'align','center')
        deletek(period)
        period_list = period.find_all('p')
        for p in period_list:
            period_href = p.find('a')['href']
            period_title = p.find('a').text
            soup_news = getRequest(period_href,headers)
            deletep(soup_news, 'align', 'center')
            deletek(soup_news)
            title_list = soup_news.select('div[class="highlight"]>p')[1:]
            for new in title_list:
                try:
                    deletek(new)
                    try:
                        author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
                    except:
                        continue
                    if len(author)>4:
                        continue
                    # if '(' in author or '本刊' in author or '国家' in author\
                    #         or '中共' in author or '记者' in author or '新闻社' in author\
                    #         or '党委' in author or '调研组' in author or '研究中心' in author\
                    #         or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
                    if '(' in author or '本刊' in author or '国家' in author \
                            or '中共' in author or '记者' in author or '新闻社' in author \
                            or '党委' in author or '”' in author \
                            or '大学' in author or '洛桑江村' in author:
                        continue
                    new_href = new.find('a')['href']
                    is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
                    if is_member:
                        continue
                    new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
                except:
                    continue
                soup_new = getRequest(new_href,headers)
                deletek(soup_new)
                deletep(soup_new, 'style', 'TEXT-ALIGN: center')
                result = soup_new.find('div', class_='inner')
                if result:
                    pass
                else:
                    continue
                span_list = result.find_all('span')
                source = span_list[0].text.replace('来源:', '').strip('\r\n')
                pub_time = span_list[2].text.strip('\r\n')
                content = soup_new.find('div', class_='highlight').text
                paserUrl(soup_new, new_href)
                contentWithTag = soup_new.find('div', class_='highlight')
                nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dic_news = {
                    'sid': '1716996740019585025',
                    'title': new_title,
                    'source': "16",
                    'origin': source,
                    'author': author,
                    'publishDate': pub_time,
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'sourceAddress': new_href,
                    "createDate": nowDate
                }
                # log.info(dic_news)
                if sendKafka(dic_news):
                    r.sadd('qiushileaderspeech::' + period_title, new_href)
                    log.info(f'采集成功----{dic_news["sourceAddress"]}')