Commit 1bb5b282 by 薛凌堃

10/27

Parent fb61875d
This source diff could not be displayed because it is too large.
@@ -58,7 +58,7 @@ if __name__ == '__main__':
         'Accept-Encoding': 'gzip, deflate, br',
         'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
     }
-    query = "select * from clb_sys_attachment where id= 383007"
+    query = "SELECT * FROM clb_sys_attachment WHERE type_id=1 AND source='证监会'"
     cursor_.execute(query)
     results = cursor_.fetchall()
     for result in results:
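The rewritten query switches from one hard-coded row id to filtering on type_id and source. If the driver is DB-API compatible (e.g. PyMySQL, which the cursor_/fetchall usage suggests but the diff does not confirm), the literals could instead be bound as parameters; a sketch:

    # Parameterized variant of the query above (sketch; %s placeholder style
    # assumes a PyMySQL-like DB-API driver).
    query = "SELECT * FROM clb_sys_attachment WHERE type_id=%s AND source=%s"
    cursor_.execute(query, (1, '证监会'))  # values are escaped by the driver
    results = cursor_.fetchall()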
@@ -74,9 +74,10 @@ if __name__ == '__main__':
             pass
         else:
             com_name = selects[1]
-        full_path = 'http://114.115.215.96/' + result[6]
+        full_path = 'http://zzsn.luyuen.com/' + result[19]
         year = result[9]
         create_time = result[13]
+        publish = str(result[21])
         content = ''
         for i in range(0, 3):
             try:
@@ -102,9 +103,9 @@ if __name__ == '__main__':
             'id': '',
             'keyWords': '',
             'lang': detect_language,
-            'origin': com_name + '企业官网',
+            'origin': '证监会',
             # 'origin': '雪球网',
-            'publishDate': str(year) + '-12-31',
+            'publishDate': publish,
             'sid': '1684032033495392257',
             'sourceAddress': '',  # link to the original article
             'summary': '',
......
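With this change the publish date comes straight from column 21 of clb_sys_attachment instead of being synthesized as year-12-31, so a NULL in that column now surfaces as the string 'None'. A minimal guard (hypothetical helper, not part of this commit):

    def safe_publish_date(row, year):
        # Fall back to the old year-based date when the publish column is empty.
        raw = row[21]
        return str(raw) if raw else f'{year}-12-31'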
import json
@@ -125,8 +125,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
         except Exception as e:
             # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
-            year = int(pub_time) - 1
-            year = str(year)
+            year = int(pub_time[:4]) - 1
+            # year = str(year)
             # page_size = 0
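The old fallback crashed whenever pub_time was a full date string (int() rejects it); slicing the first four characters makes the "year before publication" guess work. A standalone illustration with a sample value:

    pub_time = '2023-06-30'       # sample value; the real one is scraped from the page
    # int(pub_time) - 1           would raise ValueError: invalid literal for int()
    year = int(pub_time[:4]) - 1  # 2022: the report year preceding publication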
@@ -322,7 +322,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # fetch the next enterprise to process (获取企业信息)
         social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
-        # social_code = '91100000100003962T'
+        # social_code = '91210800765420138L'
         if not social_code:
             time.sleep(20)
             continue
......
@@ -180,6 +180,7 @@ if __name__=='__main__':
             #retData, com_name, year, pdf_name, num, pub_time
             att_id = baseCore.tableUpdate(retData_f, cname, file_year, file_name, num, file_year + '-12-31', origin)
             if att_id:
+                detect_language = baseCore.detect_language(content)
                 dic_news = {
                     'attachmentIds': att_id,
                     'author': '',
@@ -189,7 +190,7 @@ if __name__=='__main__':
                     'deleteFlag': '0',
                     'id': '',
                     'keyWords': '',
-                    'lang': 'zh',
+                    'lang': detect_language,
                     'origin': origin,
                     'publishDate': file_year + '-12-31',
                     'sid': '1684032033495392257',
......
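baseCore.detect_language is used here but not shown in the diff; a minimal stand-in built on the langdetect package (an assumption about the approach, not BaseCore's actual code):

    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException

    def detect_language(text: str) -> str:
        # Best-effort ISO 639-1 guess; default to 'zh' for empty or undetectable text.
        if not text or not text.strip():
            return 'zh'
        try:
            return detect(text)
        except LangDetectException:
            return 'zh'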
@@ -12,11 +12,14 @@
     pageSize: 10
 }
 """
+import json
 import time
 from urllib import parse
+import redis
 import requests
 from bs4 import BeautifulSoup
+from kafka import KafkaProducer
 from retry import retry
 from base.BaseCore import BaseCore
@@ -24,6 +27,41 @@ baseCore = BaseCore()
 log = baseCore.getLogger()
 cnx = baseCore.cnx
 cursor = baseCore.cursor
+r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
+
+taskType = '企业负面新闻'
+
+
+def sendKafka(dic_news):
+    start_time = time.time()
+    try:  # 114.116.116.241
+        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
+        kafka_result = producer.send("crawlerInfo",
+                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+        print(kafka_result.get(timeout=10))
+        dic_result = {
+            'success': 'true',
+            'message': '操作成功',
+            'code': '200',
+        }
+        log.info(dic_result)
+        # delivery succeeded; write it to the task log
+        state = 1
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        return True
+    except Exception as e:
+        dic_result = {
+            'success': 'false',
+            'message': '操作失败',
+            'code': '204',
+            'e': e
+        }
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, dic_news['title'], 'Kafka操作失败')
+        log.info(dic_result)
+        return False
 @retry(tries=3,delay=1)
 def getRequest(url,headers):
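sendKafka builds a new KafkaProducer per call and never closes it, which costs a broker handshake each time and can leak sockets. A leaner variant reusing one module-level producer (a sketch with the same broker address; not part of the commit):

    import json
    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers=['114.115.159.144:9092'],
        max_request_size=20 * 1024 * 1024,
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf8'),
    )

    def send_kafka(dic_news, topic='crawlerInfo', timeout=10):
        # True once the broker acknowledges the record, False on any failure.
        try:
            producer.send(topic, dic_news).get(timeout=timeout)
            return True
        except Exception:
            return False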
@@ -51,6 +89,11 @@ def dishonesty(headers,com_name,social_code):
     if json_data['status'] == 1:
         pass
     total_size = json_data['data']['totalSize']
+    if total_size > 0:
+        pass
+    else:
+        log.info(f'该企业{com_name}无严重失信信息')
+        return url, list_dishonesty  # keep the (url, list) shape the caller unpacks
     for page in range(1, total_size+1):
         param_page = {
             'tableName': 'credit_zgf_fr_sxbzxr',
@@ -102,7 +145,9 @@ def dishonesty(headers,com_name,social_code):
                 '数据来源': dataSource
             }
             list_dishonesty.append(dic_dishonesty)
-    return list_dishonesty
+    # r.sadd('dishonesty::' + social_code, )
+    return url, list_dishonesty

 # administrative penalties (行政处罚)
 def punish(headers,com_name,social_code):
     list_punish = []
@@ -179,7 +224,7 @@ def punish(headers,com_name,social_code):
                 '数据来源单位统一社会信用代码': cf_sjlydm
             }
             list_punish.append(dic_punish)
-    return list_punish
+    return url, list_punish

 # operating anomalies (经营异常)
 def abnormal(headers,com_name,social_code):
@@ -204,8 +249,9 @@ def abnormal(headers,com_name,social_code):
     if total_size > 0:
         pass
     else:
-        log.info()
-    for page in total_size:
+        log.info(f'该企业{com_name}无经营异常信息')
+        return url, list_abhormal  # match the (url, list) shape returned below
+    for page in range(1, total_size+1):
         param_page = {
             'tableName': 'credit_xyzx_fr_xzcf_new',
             'searchState': '1',
'数据来源':dataSource '数据来源':dataSource
} }
list_abhormal.append(dic_abnormal) list_abhormal.append(dic_abnormal)
return list_abhormal return url,list_abhormal
def dic_data(com_name,listData,type,detailurl):
dic_news = {
'title':com_name + type,
'structuredData':listData,
'ynStructure':1,
'content': '',
'contentHtml': '',
'source': '信用中国',
'publishtime': '',
'detailurl': detailurl,
}
return dic_news
 if __name__=='__main__':
@@ -259,11 +317,20 @@ if __name__=='__main__':
     }
     com_name = '石家庄交投集团工程服务有限责任公司'
     social_code = '91130100MA7EK14C8L'
-    # list_dishonesty = dishonesty(headers,com_name,social_code)
-    # print(list_dishonesty)
-    list_punish = punish(headers,com_name,social_code)
-    print(list_punish)
-    # abnormal(headers,com_name,social_code)
+    url_dishonesty, list_dishonesty = dishonesty(headers, com_name, social_code)
+    dic_dishonesty = dic_data(com_name, list_dishonesty, '严重违法失信信息', url_dishonesty)
+    sendKafka(dic_dishonesty)
+
+    url_punish, list_punish = punish(headers, com_name, social_code)
+    dic_punish = dic_data(com_name, list_punish, '行政处罚信息', url_punish)
+    # print(dic_punish)
+    sendKafka(dic_punish)
+
+    url_abnormal, list_abnormal = abnormal(headers, com_name, social_code)
+    dic_abnormal = dic_data(com_name, list_abnormal, '经营异常信息', url_abnormal)
+    # print(dic_abnormal)
+    sendKafka(dic_abnormal)
     # report download link (报告链接)
     # url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
     # report_json = getRequest(url_report, headers)
@@ -273,3 +340,4 @@ if __name__=='__main__':
......
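The three new __main__ stanzas share one fetch → wrap → send shape; purely as a sketch (not in the commit), a table-driven loop over the functions defined in this file would collapse them:

    for fetch, label in [(dishonesty, '严重违法失信信息'),
                         (punish, '行政处罚信息'),
                         (abnormal, '经营异常信息')]:
        detail_url, records = fetch(headers, com_name, social_code)
        sendKafka(dic_data(com_name, records, label, detail_url))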