Commit 5122cc37  Author: 薛凌堃

2023/8/12

Parent 98ca1672
@@ -364,14 +364,14 @@ class BaseCore:
         return str

     # 繁体字转简体字
-    def hant_2_hans(hant_str: str):
+    def hant_2_hans(self,hant_str: str):
         '''
         Function: 将 hant_str 由繁体转化为简体
         '''
         return zhconv.convert(hant_str, 'zh-hans')

     # 判断字符串里是否含数字
-    def str_have_num(str_num):
+    def str_have_num(self,str_num):
         panduan = False
         for str_1 in str_num:
@@ -463,6 +463,7 @@ class BaseCore:
         # token = '67ec7402166df1da84ae83c4b95cefc0'  # 需要隔两个小时左右抓包修改
         self.cursor.execute(query)
         token = self.cursor.fetchone()[0]
+        return token

     # 检测语言
     def detect_language(self, text):
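With the change above, GetToken() now returns the token it reads from the database, and hant_2_hans / str_have_num become instance methods. A minimal usage sketch of the patched helpers (illustrative only; it assumes a BaseCore instance constructed exactly as the scripts below construct it):

from base.BaseCore import BaseCore

baseCore = BaseCore()
token = baseCore.GetToken()                     # the token is now returned to the caller
simplified = baseCore.hant_2_hans('企業資訊')    # -> '企业资讯'
if baseCore.str_have_num('兴业银行2023'):        # presumably True when the string contains a digit
    pass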
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
import json
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from getQccId import find_id_by_name
baseCore = BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
# 通过企查查id获取企业基本信息
def info_by_id(com_id,com_name):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
com_jc_name = ''
try:
result_dict = resp_dict['result']['Company']
except:
print(com_name + ":获取失败")
return aa_dict_list
company_name = result_dict['Name']
CreditCode = result_dict['CreditCode']
if CreditCode is None:
CreditCode = ''
try:
OperName = result_dict['Oper']['Name']
except:
OperName = ''
if OperName is None:
OperName = ''
if baseCore.str_have_num(OperName):
OperName = ''
try:
Status = result_dict['ShortStatus']
except:
Status = ''
if Status is None:
Status = ''
try:
StartDate = result_dict['StartDate']
except:
StartDate = ''
if StartDate is None:
StartDate = ''
try:
RegistCapi = result_dict['RegistCapi']
except:
RegistCapi = ''
if RegistCapi is None:
RegistCapi = ''
RecCap = '' # result_dict['RecCap'] #实际缴纳金额,现已没有显示
if RecCap is None:
RecCap = ''
try:
OrgNo = result_dict['CreditCode'][8:-2] + '-' + result_dict['CreditCode'][-2] # 组织机构代码,现已没有显示
except:
OrgNo = ''
if OrgNo is None:
OrgNo = ''
try:
TaxNo = result_dict['TaxNo']
except:
TaxNo = ''
if TaxNo is None:
TaxNo = ''
try:
EconKind = result_dict['EconKind']
except:
EconKind = ''
if EconKind is None:
EconKind = ''
TermStart = '' # result_dict['TermStart'] 营业期限自,现已没有显示
if TermStart is None:
TermStart = ''
TeamEnd = '' # result_dict['TeamEnd']营业期限至,现已没有显示
if TeamEnd is None:
TeamEnd = ''
try:
SubIndustry = result_dict['Industry']['SubIndustry']
except:
SubIndustry = ''
if SubIndustry is None:
SubIndustry = ''
try:
Province = result_dict['Area']['Province']
except:
Province = ''
try:
City = result_dict['Area']['City']
except:
City = ''
try:
County = result_dict['Area']['County']
except:
County = ''
try:
region = Province + City + County
except:
region = ''
BelongOrg = '' # result_dict['BelongOrg']登记机关,现已没有显示
can_bao = ''
CommonList = [] # result_dict['CommonList']参保人数,现已没有显示
for Common_dict in CommonList:
try:
KeyDesc = Common_dict['KeyDesc']
except:
continue
if KeyDesc == '参保人数':
can_bao = Common_dict['Value']
if can_bao == '0':
can_bao = ''
OriginalName = ''
try:
OriginalName_lists = result_dict['OriginalName']
for OriginalName_dict in OriginalName_lists:
OriginalName += OriginalName_dict['Name'] + ' '
except:
OriginalName = ''
try:
OriginalName = OriginalName.strip()
except:
OriginalName = ''
EnglishName = '' # result_dict['EnglishName']企业英文名,现已没有显示
if EnglishName is None:
EnglishName = ''
IxCode = '' # result_dict['IxCode']进出口企业代码,现已没有显示
if IxCode is None:
IxCode = ''
Address = result_dict['Address']
if Address is None:
Address = ''
Scope = '' # result_dict['Scope']经营范围,现已没有显示
if Scope is None:
Scope = ''
try:
PhoneNumber = result_dict['companyExtendInfo']['Tel']
except:
PhoneNumber = ''
if PhoneNumber is None:
PhoneNumber = ''
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
try:
Email = result_dict['companyExtendInfo']['Email']
except:
Email = ''
if Email is None:
Email = ''
try:
Desc = result_dict['companyExtendInfo']['Desc']
except:
Desc = ''
if Desc is None:
Desc = ''
try:
Info = result_dict['companyExtendInfo']['Info']
except:
Info = ''
if Info is None:
Info = ''
company_name = baseCore.hant_2_hans(company_name)
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
com_id)
resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(1)
try:
com2 = resp_dict2['result']['Company']
except:
com2 = ''
try:
Scope = com2['Scope']
except:
Scope = ''
try:
CheckDate = com2['CheckDate']
except:
CheckDate = ''
if CheckDate is None:
CheckDate = ''
try:
TaxpayerType = com2['TaxpayerType'] #纳税人资质
except:
TaxpayerType = ''
if TaxpayerType is None:
TaxpayerType = ''
try:
No = com2['No']
except:
No = ''
if No is None:
No = ''
try:
IxCode = com2['IxCode']
except:
IxCode = ''
try:
OrgNo = com2['OrgNo']
except:
OrgNo = ''
try:
for Common_t in com2['CommonList']:
try:
if Common_t['KeyDesc'] == '参保人数':
can_bao = Common_t['Value']
except:
pass
except:
can_bao = ''
try:
TermStart = com2['TermStart']
except:
TermStart = ''
try:
TeamEnd = com2['TeamEnd']
except:
TeamEnd = ''
try:
RecCap = com2['RecCap']
except:
RecCap = ''
try:
No = com2['No']
except:
No = ''
try:
SubIndustry = com2['IndustryArray'][-1]
except:
SubIndustry = ''
try:
BelongOrg = com2['BelongOrg']
except:
BelongOrg = ''
try:
EnglishName = com2['EnglishName']
except:
EnglishName = ''
aa_dict = {
'qccId': com_id, # 企查查企业id
'name': company_name, # 企业名称
'shortName': com_jc_name, # 企业简称
'socialCreditCode': CreditCode, # 统一社会信用代码
'legalPerson': OperName, # 法定代表人
'officialPhone': PhoneNumber, # 电话
'officialUrl': WebSite, # 官网
'officialEmail': Email, # 邮箱
'briefInfo': Desc, # 简介
'registerStatus': Status, # 登记状态
'incorporationDate': StartDate, # 成立日期
'capital': RegistCapi, # 注册资本
'paidCapital': RecCap, # 实缴资本
'approvalDate': CheckDate, # 核准日期
'organizationCode': OrgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': CreditCode, # 纳税人识别号
'type': EconKind, # 企业类型
'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': SubIndustry, # 所属行业
'region': region,
'province': Province, # 所属省
'city': City, # 所属市
'county': County, # 所属县
'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模
'insured': can_bao, # 参保人数
'beforeName': OriginalName, # 曾用名
'englishName': EnglishName, # 英文名
'importExportEnterpriseCode': IxCode, # 进出口企业代码
'address': Address, # 地址
'businessRange': Scope, # 经营范围
'status': 0, # 状态
}
aa_dict_list.append(aa_dict)
print(company_name + ":爬取完成")
return aa_dict_list
if __name__ == '__main__':
taskType = '基本信息/企查查'
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
#从redis里拿数据
while True:
start = time.time()
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token = baseCore.GetToken()
list_weicha = []
list_all_info = []
name_list = []
start_time = time.time()
# 获取企业信息
query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state1=1 limit 1 "
#兴业银行
# query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
updateBeginSql = f"update Tfbs set state1=0,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
company_id = find_id_by_name(start,token,social_code)
if company_id == False:
#表示token失效
time.sleep(10)
updateBeginSql = f"update Tfbs set state1=1,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
if company_id == "":
log.info(com_name + ":企业ID获取失败")
list_weicha.append(com_name + ":企业ID获取失败")
#400表示企业更新失败
updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
else:
post_data_list = info_by_id(company_id,social_code)
for post_data in post_data_list:
list_all_info.append(post_data)
if post_data is None:
log.info(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
# 400表示企业更新失败
updateBeginSql = f"update Tfbs set state1=400,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
name_compile = {
'yuan_name':com_name,
'get_name':get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
#200表示成功
updateBeginSql = f"update Tfbs set state1=200,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor.execute(updateBeginSql)
cnx.commit()
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
@@ -5,8 +5,10 @@ import time
 from urllib.parse import quote
 import requests
 import urllib3
+from base.BaseCore import BaseCore
+baseCore = BaseCore()
+log = baseCore.getLogger()
 headers = {
     'Host': 'xcx.qcc.com',
     'Connection': 'keep-alive',
@@ -19,7 +21,7 @@ headers = {
     'Accept-Encoding': 'gzip, deflate, br,'
 }

 # 通过企业名称或信用代码获取企查查id
-def find_id_by_name(name):
+def find_id_by_name(start,token,name):
     urllib3.disable_warnings()
     qcc_key = name
@@ -35,14 +37,19 @@ def find_id_by_name(name):
             time.sleep(5)
             continue
     time.sleep(2)
-    if resp_dict['result']['Result']:
-        result_dict = resp_dict['result']['Result'][0]
-        KeyNo = result_dict['KeyNo']
-        Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
-        if Name == '':
-            KeyNo = ''
-    else:
-        KeyNo = ''
+    try:
+        if resp_dict['result']['Result']:
+            result_dict = resp_dict['result']['Result'][0]
+            KeyNo = result_dict['KeyNo']
+            Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
+            if Name == '':
+                KeyNo = ''
+        else:
+            KeyNo = ''
+    except:
+        KeyNo = False
+        log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
+        return KeyNo
     print("{},企业代码为:{}".format(qcc_key, KeyNo))
     return KeyNo
\ No newline at end of file
import json
import requests,time,re,random,pymysql
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
def get_proxy():
sql = "select proxy from clb_proxy"
cursor.execute(sql)
proxy_lists = cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return proxy_list
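# A minimal usage sketch of get_proxy() (illustrative; it assumes clb_proxy rows
# store each proxy as a single 'host-port' string such as ('1.2.3.4-8080',)):
#   proxy_list = get_proxy()   # -> [{'HTTP': 'http://1.2.3.4:8080', 'HTTPS': 'http://1.2.3.4:8080'}, ...]
#   ip = proxy_list[random.randint(0, 3)]   # the loop below picks one of the first four entries
#   requests.get(url, headers=headers, proxies=ip, verify=False)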
headers = {
'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
list_code = []
while True:
list_weicha = []
list_all_info = []
name_list = []
start_time = time.time()
# 获取企业信息
query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 is null limit 1 "
#兴业银行
# query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor_.execute(query)
row = cursor_.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
updateBeginSql = f"update Tfbs set state1=0,date2='{time_now}' where col3='{social_code}' "
# print(updateBeginSql)
cursor_.execute(updateBeginSql)
cnx_.commit()
t = time.time()
ip = get_proxy()[random.randint(0,3)]
url_t = f'https://www.tianyancha.com/search?key={social_code}&sessionNo={t}'
res_t = requests.get(url_t,headers=headers, proxies=ip,verify=False) #, proxies=ip,verify=False
time.sleep(10)
soup_t = BeautifulSoup(res_t.content, 'html.parser')
try:
com_id = soup_t.find('div',{'class':'index_header__x2QZ3'}).find('a').get('href').split('/')[-1]
print(f"{com_name}:{com_id}")
except:
com_id = '--'
print(f'{com_name}:没有查询到该企业')
#colext1获取天眼查id
updateBeginSql = f"update Tfbs set state2=0,colext1='{com_id}',date2='{time_now}' where col3='{social_code}' "
cursor_.execute(updateBeginSql)
cnx_.commit()
log.info(f'{com_name}===天眼查id更新入库===== ')
if com_id == '--':
continue
list_one_info = []
list_all_1 = []
list_all_2 = []
# 采集天眼查企业核心人员并通过接口入库
log.info('=====开始采集企业核心人员=======')
print(f'{social_code}:{com_id}')
num = 1
for page in range(1, 2):
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={com_id}&pageSize=20&pageNum={page}'
ip = get_proxy()[random.randint(0, 3)]
res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
time.sleep(10)
list_all = res.json()['data']['dataList']
if list_all:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
try:
birthYear = 2023 - int(one_info['age'])
except:
birthYear = ''
StockKeepings = one_info['numberOfShares']
currentTerm = one_info['term']
personInfo = one_info['resume']
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
list_all_2.append(dic_json_img)
else:
t = int(time.time() * 1000)
url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={com_id}&pageSize=20&pageNum={page}'
ip = get_proxy()[random.randint(0, 3)]
res = requests.get(url, headers=headers, proxies=ip) # ,verify=False
list_all = res.json()['data']['result']
for one_info in list_all:
name = one_info['name']
sex = ''
education = ''
position = one_info['typeSore']
Salary = ''
birthYear = ''
shareRatio = one_info['percent']
try:
benefitShare = one_info['finalBenefitShares']
except:
benefitShare = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{com_id}'
person_res = requests.get(person_url, headers=headers, proxies=ip)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": shareRatio,
"benefitShare": benefitShare,
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
list_all_2.append(dic_json_img)
log.info(f'{com_name}===该企业采集完成====')
df_info = pd.DataFrame(list_one_info)
df_info.to_excel('主要人员.xlsx', index=False)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
continue
else:
pass
response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
verify=False)
print(response.text)
cnx.close()
cursor.close()
baseCore.close()
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像-23年500强新榜.xlsx',index=False)
from fdfs_client.client import get_tracker_conf, Fdfs_client
from bs4 import BeautifulSoup
import requests, re, time, pymysql, fitz
import urllib3
from base import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
# conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor_ = cnx.cursor()
cnx_ = baseCore.cnx
cursor = baseCore.cursor
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业年报/证监会'
# def get_proxy():
# cursor = cnx_ip.cursor()
# sql = "select proxy from clb_proxy"
# cursor.execute(sql)
# proxy_lists = cursor.fetchall()
# ip_list = []
# for proxy_ in proxy_lists:
# ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
# proxy_list = []
# for str_ip in ip_list:
# str_ip_list = str_ip.split('-')
# proxyMeta = "http://%(host)s:%(port)s" % {
# "host": str_ip_list[0],
# "port": str_ip_list[1],
# }
# proxy = {
# "HTTP": proxyMeta,
# "HTTPS": proxyMeta
# }
# proxy_list.append(proxy)
# return proxy_list
def RequestUrl(url, payload, item_id, start_time):
# ip = get_proxy()[random.randint(0, 3)]
response = requests.post(url=url, headers=headers, data=payload) # ,proxies=ip)
response.encoding = response.apparent_encoding
# 检查响应状态码
if response.status_code == 200:
# 请求成功,处理响应数据
# print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')
pass
else:
# 请求失败,输出错误信息
print('请求失败:', response.status_code, response.text)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, url, '请求失败')
soup = ''
return soup
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, page_size):
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s'''
cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone()
if selects:
print(f'{name_pdf},{year}已存在')
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by,
create_time, page_size)
cursor_.execute(Upsql, values) # 插入
cnx.commit() # 提交
print("更新完成:{}".format(Upsql))
# 采集信息
def SpiderByZJH(url, payload, dic_info, num, start_time):
item_id = dic_info[2]
# years = dic_info['call_year']
short_name = dic_info[4]
soup = RequestUrl(url, payload, item_id, start_time)
if soup == '':
return
# 先获取页数
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
total = re.findall(r'\d+', page)[0]
r_page = int(total) % 15
if r_page == 0:
Maxpage = int(total) // 15
else:
Maxpage = int(total) // 15 + 1
# 首页和其他页不同,遍历 如果是首页 修改一下链接
for i in range(1, Maxpage + 1):
if i == 1:
href = url
else:
# http://eid.csrc.gov.cn/101811/index_3_f.html
href = url.split('index')[0] + f'index_{i}_f.html'
soup = RequestUrl(href, payload, item_id, start_time)
if soup == '':
continue
tr_list = soup.find('div', id='txt').find_all('tr')
for tr in tr_list[1:]:
td_list = tr.find_all('td')
pdf_url_info = td_list[2]
# print(pdf_url)
pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
# pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
# print(name)
report_type = td_list[4].text.strip()
# print(report_type)
if report_type == '年报':
if '摘要' in name_pdf:
continue
# 年份还从pdf名称里抽取
try:
year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
except Exception as e:
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
year = int(pub_time) - 1
year = str(year)
page_size = 0
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s'''
cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone()
if selects:
print(f'com_name:{short_name}、{year}已存在')
continue
else:
# 类型为年报的话就解析该年报pdf,并入库
for i in range(0, 3):
try:
resp_content = requests.request("GET", pdf_url).content
# 获取pdf页数
with fitz.open(stream=resp_content, filetype='pdf') as doc:
page_size = doc.page_count
break
except Exception as e:
print(e)
time.sleep(3)
continue
if page_size < 1:
# pdf解析失败
print(f'==={short_name}、{year}===pdf解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'pdf解析失败')
continue
result = ''
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
break
except Exception as e:
print(e)
time.sleep(3)
continue
if result == '':
e = '上传服务器失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
continue
if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
type_id = '1'
item_id = item_id
group_name = 'group1'
path = bytes.decode(result['Remote file_id']).replace('group1', '')
full_path = bytes.decode(result['Remote file_id'])
category = 'pdf'
file_size = result['Uploaded size']
order_by = num
status = 1
create_by = 'XueLingKun'
create_time = time_now
page_size = page_size
try:
tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
category, file_size, order_by, status, create_by, create_time, page_size)
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
except:
e = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
num = num + 1
time.sleep(2)
else:
e = '采集失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
continue
else:
continue
def getUrl(code, url_parms, Catagory2_parms):
# 深市
if code[0] == '2' or code[0] == '0' or code[0] == '3':
url = f'http://eid.csrc.gov.cn/{url_parms[1]}/index_f.html'
Catagory2 = Catagory2_parms[1]
# 构建POST请求的参数,prodType --- 股票代码
payload2 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selBoardCode0': '',
'selBoardCode': ''
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload2
}
# 沪市
if code[0] == '9' or code[0] == '6':
url = f'http://eid.csrc.gov.cn/{url_parms[0]}/index_f.html'
Catagory2 = Catagory2_parms[0]
payload1 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selCatagory3': '',
'selBoardCode0': '',
'selBoardCode': '',
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload1
}
# 北交所
if code[0] == '8' or code[0] == '4':
try:
url = f'http://eid.csrc.gov.cn/{url_parms[2]}/index_f.html'
except:
return
Catagory2 = Catagory2_parms[2]
payload3 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}'
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload3
}
return dic_parms
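# A quick illustration of the routing above (sample stock code '600036' is assumed;
# it starts with '6', so the Shanghai branch is taken):
#   dic_parms = getUrl('600036', ['101111', '101811', '102611'], ['9604', '10058', '10162'])
#   -> {'code': '600036', 'url': 'http://eid.csrc.gov.cn/101111/index_f.html',
#       'Catagory2': '9604', 'payload': {... 'prodType': '600036', 'selCatagory2': '9604' ...}}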
#state1
if __name__ == '__main__':
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Length': '380',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': 'acw_tc=01c6049e16908026442931294e4d0b65d95e3ba93ac19993d151844ac6',
'Host': 'eid.csrc.gov.cn',
'Origin': 'http://eid.csrc.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-4c21c93a%2Ccdgd%2C5c8b%2Cc32e%2C8g0229546a17; ba17301551dcbaf9_gdp_session_id_dc777856-a24e-4008-a8a6-af88d75bae2b=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:3%2C%22VISIT%22:2%2C%22PAGE%22:2}; acw_tc=71dbb29c16908906086793104e8117f44af84d756f68927c202e9a70b1',
'Host': 'static.sse.com.cn',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# 读取数据库获取股票代码 简称 以及 社会信用代码
num = 1
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
# if social_code == '':
# time.sleep(20)
# continue
# 获取企业信息
query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state1='1' limit 1 "
# 兴业银行
# query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='通威股份'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
# tycid = row[14]
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#1表示拿到数据
updateBeginSql = f"update Tfbs_bak set state1='0' and date1='{time_now}' where col3='{social_code}' "
cursor.execute(updateBeginSql)
cnx.commit()
dic_info = baseCore.getInfomation(social_code)
# count = dic_info[15]
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
url_parms = ['101111', '101811', '102611']
Catagory2_parms = ['9604', '10058', '10162']
# 根据股票代码选链接
# 股票代码0、2、3开头的为深圳交易所,6、9开头的为上海交易所,4、8开头的为北京交易所
try:
code = dic_info[3]
except Exception as e:
print(e,social_code)
continue
dic_parms = getUrl(code, url_parms, Catagory2_parms)
SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
end_time = time.time()
print(f'{com_name} ---- 该企业耗时 ---- {end_time - start_time}')
# count += 1
runType = 'AnnualReportCount'
# baseCore.updateRun(social_code, runType, count)
cnx.close()
cursor_.close()
baseCore.close()
@@ -123,6 +123,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             report_type = td_list[4].text.strip()
             # print(report_type)
             if report_type == '年报':
+                if '摘要' in name_pdf:
+                    continue
                 # 年份还从pdf名称里抽取
                 try:
                     year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
"""
证监会公告采集,只能按照搜索企业来采,从上市库里拿企业数据,sys_enterprise_ipo_copy1
craw_state:已采集过表示为True,未采集表示为0,拿取数据表示为ing,解析失败表示为400
update_state:为1 表示需要更新,用来增量循环
如何统计出来该报告采到了没有,dt_error库统计失败的信息
"""
import json
import re
import time
import fitz
import pymysql
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from datetime import datetime
from base import BaseCore
from fdfs_client.client import get_tracker_conf, Fdfs_client
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cnx_ip = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
# cursor = cnx.cursor()
cursor_ = cnx_.cursor()
cnx = baseCore.cnx
cursor = baseCore.cursor
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业公告/证监会'
def RequestUrl(url, payload, social_code,start_time):
# ip = get_proxy()[random.randint(0, 3)]
for m in range(0, 3):
try:
response = requests.post(url=url, headers=headers, data=payload) # ,proxies=ip)
response.encoding = response.apparent_encoding
break
except Exception as e:
log.error(f"request请求异常----{m}-----{e}")
pass
# 检查响应状态码
if response.status_code == 200:
# 请求成功,处理响应数据
# print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')
pass
else:
# 请求失败,输出错误信息
log.error('请求失败:', url)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
soup = ''
return soup
def getUrl(code, url_parms, Catagory2_parms):
# 深市
if code[0] == '2' or code[0] == '0' or code[0] == '3':
url = f'http://eid.csrc.gov.cn/{url_parms[1]}/index_f.html'
Catagory2 = Catagory2_parms[1]
# 构建POST请求的参数,prodType --- 股票代码
payload2 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selBoardCode0': '',
'selBoardCode': ''
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload2
}
# 沪市
if code[0] == '9' or code[0] == '6':
url = f'http://eid.csrc.gov.cn/{url_parms[0]}/index_f.html'
Catagory2 = Catagory2_parms[0]
payload1 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}',
'selCatagory3': '',
'selBoardCode0': '',
'selBoardCode': '',
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload1
}
# 北交所
if code[0] == '8' or code[0] == '4':
try:
url = f'http://eid.csrc.gov.cn/{url_parms[2]}/index_f.html'
except:
return
Catagory2 = Catagory2_parms[2]
payload3 = {
'prodType': f'{code}',
'prodType2': '代码/简称/拼音缩写 ',
'keyWord': '',
'keyWord2': '关键字',
'startDate': '',
'startDate2': '请输入开始时间',
'endDate': '',
'endDate2': '请输入结束时间',
'selCatagory2': f'{Catagory2}'
}
dic_parms = {
'code': code,
'url': url,
'Catagory2': Catagory2,
'payload': payload3
}
return dic_parms
def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type):
insert = False
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s'''
cursor_.execute(sel_sql, (social_code, pdf_url))
selects = cursor_.fetchone()
if selects:
print(f'com_name:{short_name}、{pdf_url}已存在')
return insert
# 信息插入数据库
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
list_info = [
social_code,
name_pdf,
'', # 摘要
'', # 正文
pub_time, # 发布时间
pdf_url,
'证监会',
report_type,
'1',
'zh'
]
cursor_.execute(insert_sql, tuple(list_info))
cnx_.commit()
insert = True
return insert
except:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
return insert
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time):
sel_sql = "select article_id from brpa_source_article where source_address = %s"
cursor_.execute(sel_sql, pdf_url)
row = cursor_.fetchone()
id = row[0]
# 先获取PDF链接下载pdf,在解析内容
try:
res = requests.get(pdf_url)
content = ''
# 读取文件内容,
with fitz.open(stream=res.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
except:
# print('解析失败')
dic_result = {
'success': 'false',
'message': 'PDF解析失败',
'code': '204',
}
print(dic_result)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, dic_result['message'])
return False
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': id,
'author': '',
'content': content,
'contentWithTag': '',
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '证监会',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': pdf_url, # 原文链接
'summary': '',
'title': pdf_name,
'type': 3,
'socialCreditCode': social_code,
'year': year
}
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
print(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
print(dic_result)
return False
# 采集信息
def SpiderByZJH(url, payload, dic_info, start_time): # dic_info 数据库中获取到的基本信息
okCount = 0
errorCount = 0
social_code = dic_info[2]
short_name = dic_info[4]
soup = RequestUrl(url, payload, social_code, start_time)
if soup == '':
return False
# 先获取页数
try:
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
except:
e = f"该企业没有{dic_parms['Catagory2']}数据"
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, dic_parms['url'], e)
return False
total = re.findall(r'\d+', page)[0]
r_page = int(total) % 15
if r_page == 0:
Maxpage = int(total) // 15
else:
Maxpage = int(total) // 15 + 1
log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
# 首页和其他页不同,遍历 如果是首页 修改一下链接
for i in range(1, Maxpage + 1):
log.info(f'==========正在采集第{i}页=========')
if i == 1:
href = url
else:
# http://eid.csrc.gov.cn/101811/index_3_f.html
href = url.split('index')[0] + f'index_{i}_f.html'
soup = RequestUrl(href, payload, social_code, start_time)
if soup == '':
continue
tr_list = soup.find('div', id='txt').find_all('tr')
pageIndex = 0
for tr in tr_list[1:]:
pageIndex += 1
td_list = tr.find_all('td')
pdf_url_info = td_list[2]
# print(pdf_url)
pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
year = pub_time[:4]
report_type = td_list[4].text.strip()
# 信息插入数据库
insert = InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type)
log.info(f'======={short_name}========{code}===插入公告库成功')
if insert:
# # 公告信息列表
# okCount = okCount + 1
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time)
if result:
# 公告信息列表
okCount = okCount + 1
log.info(f'{short_name}==============解析传输操作成功')
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
pass
else:
errorCount += 1
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
log.error(f'{short_name}=============解析或传输操作失败')
# try:
# insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex,type) values('{social_code}','证监会','{pdf_url}','{name_pdf}','{pub_time}',' ',now(),1,{i},{pageIndex},'1')"
# cursor_.execute(insert_err_sql)
# cnx_.commit()
# except:
# pass
continue
return True
#state2
if __name__ == '__main__':
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Length': '380',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': 'acw_tc=01c6049e16908026442931294e4d0b65d95e3ba93ac19993d151844ac6',
'Host': 'eid.csrc.gov.cn',
'Origin': 'http://eid.csrc.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-4c21c93a%2Ccdgd%2C5c8b%2Cc32e%2C8g0229546a17; ba17301551dcbaf9_gdp_session_id_dc777856-a24e-4008-a8a6-af88d75bae2b=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:3%2C%22VISIT%22:2%2C%22PAGE%22:2}; acw_tc=71dbb29c16908906086793104e8117f44af84d756f68927c202e9a70b1',
'Host': 'static.sse.com.cn',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# dic_parms = {}
# 读取数据库获取股票代码 简称 以及 社会信用代码
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
# if social_code == None:
# time.sleep(20)
# continue
# 获取企业信息
# query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 is Null limit 1 "
# 兴业银行
query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor.execute(query)
row = cursor.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
# tycid = row[14]
com_name = row[6]
social_code = row[4]
code = row[7]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# 1表示拿到数据
updateBeginSql = f"update Tfbs_bak set state2='1',date1='{time_now}' where col3='{social_code}' "
cursor.execute(updateBeginSql)
cnx.commit()
dic_info = baseCore.getInfomation(social_code)
count = dic_info[16]
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
# 发行上市公告,北交所没有该栏目
url_parms = ['101110', '101810']
Catagory2_parms = ['9603', '10057']
# 临时报告
url_parms_ls = ['101112', '101812', '102612']
Catagory2_parms_ls = ['9605', '10059', '10163']
# 根据股票代码选链接
# 股票代码0、2、3开头的为深圳交易所,6、9开头的为上海交易所,4、8开头的为北京交易所
code = dic_info[3]
short_name = dic_info[4]
dic_parms = getUrl(code, url_parms, Catagory2_parms)
dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
if len(dic_parms) > 0:
start_time_cj = time.time()
result = SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time)
if result:
log.info(f'{code}==========={short_name},发行公告成功,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
else:
log.info(f'{code}==========={short_name},发行公告失败,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
start_time_ls = time.time()
result_ls = SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time)
if result_ls:
log.info(f'{code}==========={short_name},临时报告成功,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
else:
log.info(f'{code}==========={short_name},临时报告失败,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
# UpdateInfoSql(retData,retData_ls,social_code)
# log.info(f'{code}================更新成功')
end_time = time.time()
log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------')
count += 1
# runType = 'NoticeReportCount'
# baseCore.updateRun(code, runType, count)
cursor.close()
cnx.close()
cursor_.close()
cnx_.close()
# 释放资源
baseCore.close()
"""
增量采集:
取state为3、update_state为空的企业 表示上次采集成功的企业,
新增update_state字段,取一个企业更新为2,表示该企业正在采集。
采集完毕更新为1.
表示已经采集完成。跟据date_time 来排列 每次就不会拿到重复的数据。
okCount
errorCount
repectCount
新增三个字段分别对应更新的up_okCount up_errorCount up_repectCount ,
记录这些更新的数据 然后加到原来的数据上表示该企业已采集多少动态
8.8日改版,企业动态也传kafka
"""
import json
import requests,time,pymysql
import jieba
import sys
from kafka import KafkaProducer
from base.BaseCore import BaseCore
from base.smart import smart_extractor
# sys.path.append('D:/KK/zzsn_spider/base')
# import BaseCore
# from smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
jieba.cut("必须加载jieba")
# 初始化,设置中文分词
smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor= cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
pageSize = 10
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Cookie':'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=77e997401d5f11ee9e91d5a0fd3c0b89; ssuid=6450041974; _ga=GA1.2.858826166.1688800641; _gid=GA1.2.2142449376.1689575510; tyc-user-info-save-time=1689764135027; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22309757777%22%2C%22first_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg5MzQ1Y2IxMDI1N2QtMGNmZWUwNTMyN2Y2NzMtMjYwMzFkNTEtMTMyNzEwNC0xODkzNDVjYjEwMzc1YiIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwOTc1Nzc3NyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22309757777%22%7D%2C%22%24device_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%7D; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1689752829,1689821665,1689831487,1689845884; searchSessionId=1689845917.81838207; HWWAFSESID=146bb1d25b1515339d3; HWWAFSESTIME=1689858023324; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1689859758',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
}
taskType = '企业动态/天眼查'
def beinWork(tyc_code, social_code):
start_time = time.time()
time.sleep(3)
# retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
t = time.time()
url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
for m in range(0, 3):
try:
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
# time.sleep(random.randint(3, 5))
break
except Exception as e:
pass
if (response.status_code == 200):
pass
else:
log.error(f"{tyc_code}-----获取总数接口失败")
e = '获取总数接口失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
return retData
try:
json_1 = json.loads(response.content.decode('utf-8'))
total = json_1['data']['total']
except:
log.error(f"{tyc_code}-----获取总数失败")
e = '获取总数失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
return retData
if (total > 0):
if (total % pageSize == 0):
totalPage = total // pageSize
else:
totalPage = total // pageSize + 1
else:
log.error(f"{tyc_code}--------总数为0")
retData['state'] = True
return retData
log.info(f"{tyc_code}-------总数:{total}----总页数:{totalPage}")
retData['total'] = total
up_okCount = 0
up_errorCount = 0
up_repetCount = 0
for num in range(1, totalPage + 1):
time.sleep(3)
log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
start_page = time.time()
url_page = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={pageSize}&pn={num}&emotion=-100&event=-100'
for m in range(0, 3):
try:
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response_page = requests.get(url=url_page, headers=headers, proxies=ip, verify=False)
# time.sleep(3)
break
except:
pass
if (response_page.status_code == 200):
pass
else:
log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + pageSize
continue
try:
json_page = json.loads(response_page.content.decode('utf-8'))
info_list_page = json_page['data']['items']
except:
log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + pageSize
continue
pageIndex = 0
for info_page in info_list_page:
pageIndex = pageIndex + 1
title = info_page['title']
source = info_page['website']
link = info_page['uri']
try:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (link, social_code))
except Exception as e:
print(e)
selects = cursor.fetchone()
if selects:
log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
# todo:如果该条数据存在则说明该条数据之后的都已经采集完成,就可以跳出函数,执行下一个企业
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
return retData
try:
time_struct = time.localtime(int(info_page['rtm'] / 1000)) # 首先把时间戳转换为结构化时间
time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct) # 把结构化时间转换为格式化时间
except:
time_format = baseCore.getNowTime(1)
try:
# 开始进行智能解析
lang = baseCore.detect_language(title)
smart = smart_extractor.SmartExtractor(lang)
contentText = smart.extract_by_url(link).text
# time.sleep(3)
except Exception as e:
contentText = ''
if contentText == '':
log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
up_errorCount = up_errorCount + 1
try:
insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values('{social_code}','{source}','{link}','{title}','{time_format}','{info_page['abstracts']}',now(),1,{num},{pageIndex})"
cursor.execute(insert_err_sql)
cnx.commit()
except:
pass
continue
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
# 动态信息列表
up_okCount = up_okCount + 1
list_info = [
social_code,
title,
info_page['abstracts'], # 摘要
contentText, # 正文
time_format, # 发布时间
link,
'天眼查',
source,
'2',
'zh'
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
# 采集一条资讯记录一条,记录该企业采到了多少的资讯
log.info(f'{social_code}----{link}:新增一条')
sel_sql = "select article_id from brpa_source_article where source_address = %s and social_credit_code = %s"
cursor.execute(sel_sql, (link, social_code))
row = cursor.fetchone()
id = row[0]
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:插入一条数据,并传入kafka
dic_news = {
'attachmentIds': id,
'author': '',
'content': contentText,
'contentWithTag': contentText,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '天眼查',
'publishDate': time_format,
'sid': '1684032033495392257',
'sourceAddress': link, # 原文链接
'summary': info_page['abstracts'],
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': time_format[:4]
}
except Exception as e:
log.info(f'传输失败:{social_code}----{link}')
e = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
continue
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# 传输成功,写入日志中
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, '')
# return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
log.error(dic_result)
e = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
return retData
def doJob():
while True:
# 获取企业信息
query = "SELECT * FROM Tfbs_bak where col3 is not null and length(col3)>3 and col6 not like '%HK%' and col3 not like 'ZZSN%' and state3 is null limit 1 "
# 兴业银行
# query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
cursor_.execute(query)
row = cursor_.fetchone()
if row:
pass
else:
print('没有数据了,结束脚本')
break
tycid = row[16]
com_name = row[6]
xydm = row[4]
code = row[7]
count = 0
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#0 表示拿取数据
updateBeginSql = f"update Tfbs_bak set state3='0',date3='{time_now}' where col3='{xydm}' "
# print(updateBeginSql)
cursor_.execute(updateBeginSql)
cnx_.commit()
log.info(f"{id}---{xydm}----{tycid}----开始处理")
start_time = time.time()
# 开始采集企业动态
retData = beinWork(tycid, xydm)
# 信息采集完成后将该企业的采集次数更新
runType = 'NewsRunCount'
count += 1
# baseCore.updateRun(xydm, runType, count)
total = retData['total']
up_okCount = retData['up_okCount']
up_errorCount = retData['up_errorCount']
up_repetCount = retData['up_repetCount']
log.info(
f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
# 200 表示成功
updateBeginSql = f"update Tfbs_bak set state3='200',date3='{time_now}' where col3='{xydm}' "
# print(updateBeginSql)
cursor_.execute(updateBeginSql)
cnx_.commit()
cursor.close()
cnx.close()
# 释放资源
baseCore.close()
# Press the green button in the gutter to run the script.
#state3
if __name__ == '__main__':
doJob()
@@ -5,26 +5,19 @@ import langid
 from base.BaseCore import BaseCore

 baseCore =BaseCore()
+import pymysql
 # print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
+# cnx_ = baseCore.cnx
+# cursor_ = baseCore.cursor
+cnx_ = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',
+                       charset='utf8mb4')
+cursor_ = cnx_.cursor()
+updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
+# print(updateBeginSql)
+cursor_.execute(updateBeginSql,(200,'91350000158142711F'))
+cnx_.commit()
+#
-# def detect_language(text):
-#     # 使用langid.py判断文本的语言
-#     lang, confidence = langid.classify(text)
-#     print(lang,confidence)
-#     return lang
-# detect_language("123")
-from textblob import TextBlob
-def detect_language(text):
-    blob = TextBlob(text)
-    lang = blob.detect_language()
-    return lang
-text = "Hello, how are you?"
-language = detect_language(text)
-print(language)
 '''
-补充智库动态没有公众号信息数据的公众号 记录一天能采多少公众号
-从库中读取信息,根据域名找到属于公众号的链接,
-设置time.sleep 等待到每天执行
 '''
-import requests, time, random, json, pymysql, redis
+import requests, time, re, datetime, random, json, pymysql, redis
 import pandas as pd
 import urllib3
 from bs4 import BeautifulSoup
@@ -216,7 +213,7 @@ if __name__=="__main__":
     # browser2.get(url)
     # browser3.get(url)
     # 可改动
-    time.sleep(50)
+    time.sleep(30)
     num_b = 0
     browser_run = list_b[0]
     log.info('======刷新浏览器=====')
@@ -313,13 +310,13 @@ if __name__=="__main__":
         count = 0
         try:
             ip = get_proxy()[random.randint(0, 3)]
-            json_search = s.get(url_search, headers=baseCore.getRandomUserAgent(), proxies=ip,
+            json_search = s.get(url_search, headers=headers, proxies=ip,
                                 verify=False).json()  # , proxies=ip, verify=False
             time.sleep(2)
             break
         except:
             log.info(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}===')
-            error_text = str(json_search)
+            # error_text = str(json_search)
             json_search = ''
             aa = time.sleep(600)
             log.info(f'======等待时间{aa}=======')