提交 1bb5b282 作者: 薛凌堃

10/27

上级 fb61875d
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -58,7 +58,7 @@ if __name__ == '__main__':
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}
query = "select * from clb_sys_attachment where id= 383007"
query = "SELECT * FROM clb_sys_attachment WHERE type_id=1 AND source='证监会'"
cursor_.execute(query)
results = cursor_.fetchall()
for result in results:
......@@ -74,9 +74,10 @@ if __name__ == '__main__':
pass
else:
com_name = selects[1]
full_path = 'http://114.115.215.96/' + result[6]
full_path = 'http://zzsn.luyuen.com/' + result[19]
year = result[9]
create_time = result[13]
publish = str(result[21])
content = ''
for i in range(0, 3):
try:
......@@ -102,9 +103,9 @@ if __name__ == '__main__':
'id': '',
'keyWords': '',
'lang': detect_language,
'origin': com_name + '企业官网',
'origin': '证监会',
# 'origin': '雪球网',
'publishDate': str(year) + '-12-31',
'publishDate': publish,
'sid': '1684032033495392257',
'sourceAddress': '', # 原文链接
'summary': '',
......
import json
import json
......@@ -125,8 +125,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
except Exception as e:
# pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
year = int(pub_time) - 1
year = str(year)
year = int(pub_time[:4]) - 1
# year = str(year)
# page_size = 0
......@@ -322,7 +322,7 @@ if __name__ == '__main__':
start_time = time.time()
# 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
# social_code = '91100000100003962T'
# social_code = '91210800765420138L'
if not social_code:
time.sleep(20)
continue
......
......@@ -180,6 +180,7 @@ if __name__=='__main__':
#retData, com_name, year, pdf_name, num, pub_time
att_id= baseCore.tableUpdate(retData_f, cname,file_year,file_name, num,file_year+'-12-31',origin)
if att_id:
detect_language = baseCore.detect_language(content)
dic_news = {
'attachmentIds': att_id,
'author': '',
......@@ -189,7 +190,7 @@ if __name__=='__main__':
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'lang': detect_language,
'origin': origin,
'publishDate': file_year + '-12-31',
'sid': '1684032033495392257',
......
......@@ -12,11 +12,14 @@
pageSize: 10
}
"""
import json
import time
from urllib import parse
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from base.BaseCore import BaseCore
......@@ -24,6 +27,41 @@ baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
taskType = '企业负面新闻'
def sendKafka(dic_news):
    """Publish one news dict to the 'crawlerInfo' Kafka topic.

    The payload is JSON-encoded (UTF-8, non-ASCII preserved) and the call
    blocks up to 10 s for broker acknowledgement.

    :param dic_news: dict with at least a 'title' key (used when recording
                     a failure via baseCore.recordLog).
    :return: True on successful delivery, False on any failure.
    """
    start_time = time.time()
    producer = None
    try:
        # 20 MB max request size: structured payloads can be large.
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                                 max_request_size=1024 * 1024 * 20)
        kafka_result = producer.send(
            "crawlerInfo",
            json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        # Block until the broker acknowledges (raises on timeout/failure).
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',  # fixed typo: was 'ture'
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': str(e),  # str(e): the raw exception object is not JSON/log friendly
        }
        takeTime = baseCore.getTimeCost(start_time, time.time())
        # state 0 = failure; relies on module-level social_code / taskType
        baseCore.recordLog(social_code, taskType, 0, takeTime, dic_news['title'], 'Kafka操作失败')
        log.info(dic_result)
        return False
    finally:
        # Original leaked one producer per call; always flush and release it.
        if producer is not None:
            producer.close()
@retry(tries=3,delay=1)
def getRequest(url,headers):
......@@ -51,6 +89,11 @@ def dishonesty(headers,com_name,social_code):
if json_data['status'] == 1:
pass
total_size = json_data['data']['totalSize']
if total_size > 0:
pass
else:
log.info(f'该企业{com_name}无严重失信信息')
return list_dishonesty
for page in range(1,total_size+1):
param_page = {
'tableName': 'credit_zgf_fr_sxbzxr',
......@@ -102,7 +145,9 @@ def dishonesty(headers,com_name,social_code):
'数据来源':dataSource
}
list_dishonesty.append(dic_dishonesty)
return list_dishonesty
# r.sadd('dishonesty::' +social_code , )
return url,list_dishonesty
# 行政处罚
def punish(headers,com_name,social_code):
list_punish = []
......@@ -179,7 +224,7 @@ def punish(headers,com_name,social_code):
'数据来源单位统一社会信用代码':cf_sjlydm
}
list_punish.append(dic_punish)
return list_punish
return url,list_punish
# 经营异常
def abnormal(headers,com_name,social_code):
......@@ -204,8 +249,9 @@ def abnormal(headers,com_name,social_code):
if total_size > 0:
pass
else:
log.info()
for page in total_size:
log.info(f'该企业{com_name}无经营异常信息')
return list_abhormal
for page in range(1, total_size+1):
param_page = {
'tableName': 'credit_xyzx_fr_xzcf_new',
'searchState': '1',
......@@ -242,8 +288,20 @@ def abnormal(headers,com_name,social_code):
'数据来源':dataSource
}
list_abhormal.append(dic_abnormal)
return list_abhormal
return url,list_abhormal
def dic_data(com_name, listData, type, detailurl):
    """Build the structured-news payload sent to Kafka.

    :param com_name:  company name; concatenated with *type* to form the title.
    :param listData:  list of structured record dicts for this category.
    :param type:      category label (e.g. '行政处罚信息'), appended to the title.
    :param detailurl: URL of the source listing page.
    :return: dict in the crawlerInfo message shape (source fixed to 信用中国).
    """
    return dict(
        title=com_name + type,
        structuredData=listData,
        ynStructure=1,
        content='',
        contentHtml='',
        source='信用中国',
        publishtime='',
        detailurl=detailurl,
    )
if __name__=='__main__':
......@@ -259,11 +317,20 @@ if __name__=='__main__':
}
com_name = '石家庄交投集团工程服务有限责任公司'
social_code = '91130100MA7EK14C8L'
# list_dishonesty = dishonesty(headers,com_name,social_code)
# print(list_dishonesty)
list_punish = punish(headers,com_name,social_code)
print(list_punish)
# abnormal(headers,com_name,social_code)
url_dishonesty,list_dishonesty = dishonesty(headers,com_name,social_code)
dic_dishonesty = dic_data(com_name,list_dishonesty,'严重违法失信信息',url_dishonesty)
sendKafka(dic_dishonesty)
url_punish,list_punish = punish(headers,com_name,social_code)
dic_punish = dic_data(com_name, list_punish, '行政处罚信息', url_punish)
# print(dic_punish)
sendKafka(dic_punish)
url_abnormal,list_abnormal = abnormal(headers,com_name,social_code)
dic_abnormal = dic_data(com_name, list_abnormal, '经营异常信息', url_abnormal)
# print(dic_abnormal)
sendKafka(dic_abnormal)
# 报告链接
# url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
# report_json = getRequest(url_report, headers)
......@@ -273,3 +340,4 @@ if __name__=='__main__':
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论