Commit 08e4725c  Author: 薛凌堃

12/21

Parent 8f2915d4
import reits
import policy_beijing, policy_chongqing, policy_fujian, policy_guangdong
import policy_chongqing, policy_fujian, policy_guangdong
import policy_guangxi, policy_gwy, policy_hainan, policy_heilongjiang, policy_hubei, policy_jiangsu
import policy_jiangxi, policy_jilin, policy_liaoning, policy_neimenggu, policy_shandong, policy_hubei
import policy_shanxi, policy_sichuan, policy_tianjin, policy_yunnan, policy_zhejiang
import RuleGuide_shanghai, RuleGuide_shenzhen
import LawRules_shenzhen, LawRules_2_shenzhen
from REITs_policyData.policy_beijing import beijing
if __name__ == "__main__":
policy_beijing.beijing()
beijing()
reits.sse()
reits.reform()
reits.hebei()
......
# -*- coding: utf-8 -*-
"""
The simulated-click approach does not work here; an account login is required.
"""
import json
import re
import time
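# Since simulated clicking is ruled out, this spider reuses stored login cookies
# (token.getToken() / json.loads(cookieinfo[1]) in __main__ further below). A minimal
# sketch of applying them to a requests session, assuming the stored JSON is a simple
# name -> value mapping (the session object itself is illustrative, not from this file):
#   cookieinfo = token.getToken()
#   cookie_ = json.loads(cookieinfo[1])
#   session = requests.Session()
#   session.cookies.update(cookie_)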
......@@ -296,7 +292,7 @@ def dic_handle(result_dic):
return aa_dict
# Preparation for collection
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
# if social_code:
# dic_info = baseCore.getInfomation(social_code)
......@@ -342,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
else:
# 开始采集
try:
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie,3)
......@@ -377,7 +373,7 @@ def ifbeforename(company_url):
return ''
# Collect basic information and business-registration information
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('firm/')[1].split('.html')[0]
# 将采集到的企查查id更新
updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
......@@ -467,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# print(aa_dic)
sendkafka(aa_dic)
......@@ -486,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
sendkafka(aa_dic)
# Check whether the company names match
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
company_url = ''
try:
company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
......@@ -530,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_url = info_t.find('a')['href']
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
else:
#没有搜到相同的企业名称
data = [com_name, social_code]
......@@ -544,7 +540,7 @@ if __name__ == '__main__':
while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况_{nowtime}.xlsx'
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file.createFile(file_name)
cookieinfo = token.getToken()
......@@ -553,6 +549,7 @@ if __name__ == '__main__':
else:
log.info('==========已无cookies==========')
time.sleep(30)
continue
id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1])
......@@ -599,6 +596,11 @@ if __name__ == '__main__':
while flag:
log.info('--------已没有数据---------')
time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
if company_field:
flag = False
......@@ -608,7 +610,7 @@ if __name__ == '__main__':
continue
social_code = company_field.split('|')[0]
com_name = company_field.split('|')[2].replace(' ', '')
com_name = company_field.split('|')[1].replace(' ', '')
ynDomestic = company_field.split('|')[15]
countryName = company_field.split('|')[16]
......@@ -617,6 +619,7 @@ if __name__ == '__main__':
listingDate = company_field.split('|')[21]
category = company_field.split('|')[19]
exchange = company_field.split('|')[20]
listType = company_field.split('|')[21]
# ynDomestic = ''
# countryName = ''
# securitiesCode = ''
......@@ -625,8 +628,8 @@ if __name__ == '__main__':
# category = ''
# exchange = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name)
time.sleep(40)
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
time.sleep(2)
# break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
......
# CSRC Shanghai and Shenzhen corporate bond and enterprise bond collection
"""
CSRC company list
"""
import time
import random
import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
from obs import ObsClient
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
taskType = '企业名单/证监会'
def createDriver():
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument('user-agent='+'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
chrome_options.add_argument('--headless')
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
return driver
@retry(tries=3, delay=5)
def RequestUrl(url):
# ip = baseCore.get_proxy()
# proxy = {'https': 'http://127.0.0.1:8888', 'http': 'http://127.0.0.1:8888'}
response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'lxml')
return soup
else:
raise requests.HTTPError(f'request failed with status {response.status_code}: {url}')
def browserRequest(url):
browser = createDriver()
browser.get(url)
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "m-table2")))
page_source = browser.page_source
soup = BeautifulSoup(page_source, 'html.parser')
browser.quit()
return soup
def getUrl(url_parm):
# Shenzhen market
# Shanghai market
url = f'http://eid.csrc.gov.cn/{url_parm}/index_f.html'
# Beijing Stock Exchange
return url
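# Illustration: the column code is interpolated straight into the listing URL, and
# SpiderByZJH below derives page N by swapping the index suffix:
#   getUrl('201411')                                     -> 'http://eid.csrc.gov.cn/201411/index_f.html'
#   getUrl('201411').split('index')[0] + 'index_3.html'  -> 'http://eid.csrc.gov.cn/201411/index_3.html'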
# Field-name mapping between page labels and database columns
def getmap(dic_info):
data_dic = {
'债券代码': 'zhaiquan_code',
'名称': 'zhaiquan_name',
'上市地': 'ipo_place',
'全称': 'full_name',
'发行人': 'issure',
'发行量(亿元)': 'volume',
'发行价格(元)': 'money',
'发行方式': 'method',
'期限(年)': 'tenure',
'到期日期': 'last_date',
'票面利率(%)': 'rate',
'利率类型': 'lilvtype',
'付息方式': 'payment',
'起息日期': 'start_date',
'上市日期': 'list_date',
}
dict3 = {value: dic_info.get(key, '') for key, value in data_dic.items()}
print(dict3)
return dict3
# for key1,value1 in data_dic:
# for key2 in dic_info.keys():
# if key2 == key1:
# dic_info[data_dic[key1]] = dic_info[key2]
# del dic_info[key2]
# break
# else:
# dic_info[data_dic[key1]] = ''
# continue
# print(data_dic)
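# Minimal usage sketch of getmap (sample values invented for illustration):
#   getmap({'债券代码': '112233', '名称': '某某债', '发行人': '某某公司'})
# returns
#   {'zhaiquan_code': '112233', 'zhaiquan_name': '某某债', 'ipo_place': '', 'full_name': '',
#    'issure': '某某公司', 'volume': '', 'money': '', 'method': '', 'tenure': '', 'last_date': '',
#    'rate': '', 'lilvtype': '', 'payment': '', 'start_date': '', 'list_date': ''}
# Keys absent from dic_info fall back to '' via dic_info.get(key, '').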
# Collect information
def SpiderByZJH(url, start_time):  # dic_info: basic info fetched from the database
try:
soup = RequestUrl(url)
except:
# Request failed; log the error
log.error(f'请求失败:{url}')
# put back into redis
time.sleep(random.randint(60, 120))
soup = ''
if soup == '':
return
# 判断查找内容是否存在
# try:
# is_exist = soup.find('div',class_='con').text
# if is_exist == '没有查询到数据':
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, url, '没有查询到数据')
# return
# except:
# pass
# Get the total page count first
page = soup.find('div', class_='pages').find_all('li')[-1]
total = page.find('b').text
for i in range(1,int(total)+1):
log.info(f'==========正在采集第{i}页=========')
if i == 1:
href = url
else:
# http://eid.csrc.gov.cn/101811/index_3_f.html
href = url.split('index')[0] + f'index_{i}.html'
try:
soup = browserRequest(href)
except:
# Request failed; log the error and skip this page
log.error(f'请求失败:{url}')
# put back into redis
continue
tr_list1 = soup.find('table', class_='m-table2')
# print(tr_list1)
tr_list = tr_list1.find_all('tr')
# pageIndex = 0
for tr in tr_list[1:]:
dic_info = {}
# pageIndex += 1
td_list = tr.find_all('td')
zhaiquan_code = td_list[0].text.replace('\r', '').replace('\n', '').replace(' ','')
zhaiquan_name = td_list[1].text.replace('\r', '').replace('\n', '').replace(' ','')
ipo_place = td_list[2].text.replace('\r', '').replace('\n', '').replace(' ','')
list_date = td_list[3].text.replace('\r', '').replace('\n', '').replace(' ','')
last_date = td_list[4].text.replace('\r', '').replace('\n', '').replace(' ','')
# print(pdf_url)
selectSql = f"select count(1) from debt_secutity where zhaiquan_code='{zhaiquan_code}' and zhaiquan_name='{zhaiquan_name}'"
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{zhaiquan_code}-------{zhaiquan_name}---已经存在")
continue
else:
# dic_info = {
# '债券代码': zhaiquan_code,
# '名称': zhaiquan_name,
# '上市地': ipo_place,
# '上市日期': list_date,
# '到期日期': last_date,
# }
info_url = 'http://eid.csrc.gov.cn/' + td_list[0].find('a')['href']
soup_info = RequestUrl(info_url)
try:
info_list = soup_info.find('table',class_='m-table3').find_all('tr')
except Exception as e:
log.info(f'error---{e}---第{i}页--{info_url}')
info_list = []
dic_info = {
'债券代码': zhaiquan_code,
'名称': zhaiquan_name,
'上市地': ipo_place,
'上市日期': list_date,
'到期日期': last_date,
}
for tr_ in info_list:
td_list = tr_.find_all('td')
for td in td_list:
value = td.find('span').text.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
span_tag = td.find('span')
span_tag.decompose()
name = td.text.replace(':', '').replace(':', '').replace('\r', '').replace('\n', '').replace(' ', '')
dic_info[name] = value
# Insert into the database
final_dic = getmap(dic_info)
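# Note: final_dic keeps data_dic's insertion order (dicts preserve order in Python 3.7+),
# so tuple(final_dic.values()) lines up positionally with the column list in insertSql below.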
values_tuple = tuple(final_dic.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into debt_secutity(zhaiquan_code,zhaiquan_name,ipo_place,full_name,issure,volume,money,method,tenure,last_date,rate,lilvtype,payment,start_date,list_date) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql,values_tuple)
cnx.commit()
log.info(f"{zhaiquan_code}-------{zhaiquan_name}---新增")
log.info(f"【{i}/{total}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
if __name__ == '__main__':
num = 0
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
# 'Cookie': 'yfx_c_g_u_id_10008998=_ck23112014074614515077233960865; yfx_f_l_v_t_10008998=f_t_1700460466453__r_t_1700460466453__v_t_1700460466453__r_c_0; yfx_mr_10008998=%3A%3Amarket_type_free_search%3A%3A%3A%3Abaidu%3A%3A%3A%3A%3A%3A%3A%3Awww.baidu.com%3A%3A%3A%3Apmf_from_free_search; yfx_mr_f_10008998=%3A%3Amarket_type_free_search%3A%3A%3A%3Abaidu%3A%3A%3A%3A%3A%3A%3A%3Awww.baidu.com%3A%3A%3A%3Apmf_from_free_search; yfx_key_10008998=; _yfx_session_10008998=%7B%22_yfx_firsttime%22%3A%221701508120899%22%2C%22_yfx_lasttime%22%3A%221701508120899%22%2C%22_yfx_visittime%22%3A%221701508120899%22%2C%22_yfx_domidgroup%22%3A%221701508120899%22%2C%22_yfx_domallsize%22%3A%22100%22%2C%22_yfx_cookie%22%3A%2220231202170840906620987838830281%22%7D; acw_tc=01c604a717025467485993784e5c9f1847d885d2c82ee192efdfd627ba',
'Host': 'eid.csrc.gov.cn',
'If-Modified-Since': 'Thu, 14 Dec 2023 08:06:01 GMT',
'If-None-Match': '"657ab769-95b5"',
# 'Referer': 'http://eid.csrc.gov.cn/201010/index_3.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
dic_parms = {}
# Read the database to get stock codes, short names and unified social credit codes
while True:
start_time = time.time()
# Shanghai http://eid.csrc.gov.cn/101111/index.html  Shenzhen http://eid.csrc.gov.cn/101811/index.html  BSE http://eid.csrc.gov.cn/102611/index.html
# URL pattern for paging within a column, e.g. http://eid.csrc.gov.cn/101811/index_3_f.html
# SSE main board / SSE STAR Market
# url_parms = ['201010', '201014']
# url_parms = ['201011', '201013']
url_parms = ['201411', '201414', '202011', '202014']
# url_parms = ['202011', '202014']
for url_parm in url_parms:
url = getUrl(url_parm)
start_time_cj = time.time()
log.info(f'======开始处理======')
SpiderByZJH(url, start_time)
break
cursor.close()
cnx.close()
baseCore.close()
......@@ -94,7 +94,7 @@ def get_content2():
child_type = content_dict['childtype'] # 主题分类
except:
child_type = ''
# # 判断是否已经爬取过
# 判断是否已经爬取过
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
......@@ -102,6 +102,7 @@ def get_content2():
time.sleep(1)
continue
try:
# href = 'https://www.gov.cn/zhengce/zhengceku/202312/content_6921452.htm'
resp = requests.get(url=href, headers=baseTool.headers, verify=False)
resp.encoding = resp.apparent_encoding
resp_text = resp.text
......@@ -120,9 +121,7 @@ def get_content2():
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.odf' in file_href:
if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
......
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
start_time = time.time()
log.info(f'===正在处理第{page}页===')
# list_all_info = []
t = int(time.time() * 1000)
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
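# pageSize=100 with pageNum driving pagination; id is the Tianyancha company id (tycid)
# resolved in __main__ below; type/lprs/applyYear/pubYear stay at -100 (presumably "no filter").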
try:
ip = baseCore.get_proxy()
except:
time.sleep(2)
ip = baseCore.get_proxy()
try:
res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
except:
res_j = None
for i in range(3):
try:
res_j = requests.get(url=url, headers=header, verify=False).json()
break
except:
time.sleep(2)
if res_j is None:
# all retries failed; treat this page as having no data
return 0
# print(res_j)
list_all = res_j['data']['items']
# print(list_all)
if list_all:
for one_zhuanli in list_all:
title = one_zhuanli['title']
try:
shenqingri = one_zhuanli['applicationTime']
except:
shenqingri = ''
try:
shenqing_code = one_zhuanli['patentNum']
except:
shenqing_code = ''
try:
leixing = one_zhuanli['patentType']
except:
leixing = ''
try:
status = one_zhuanli['lprs']
except:
status = ''
try:
gongkairi = one_zhuanli['pubDate']
except:
gongkairi = ''
try:
gongkai_code = one_zhuanli['pubnumber']
except:
gongkai_code = ''
try:
famingren = one_zhuanli['inventor']
except:
famingren = ''
try:
shenqingren = one_zhuanli['applicantName']
except:
shenqingren = ''
try:
gongneng = one_zhuanli['cat']
except:
gongneng = ''
try:
uuid = one_zhuanli['uuid']
except:
uuid = ''
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'专利名称': title,
'申请日': shenqingri,
'申请号': shenqing_code,
'专利类型': leixing,
'专利状态': status,
'公开日': gongkairi,
'公开号': gongkai_code,
'发明人': famingren,
'申请人': shenqingren,
'功能': gongneng,
'天眼查详情id': uuid,
'年份': shenqingri[:4]
}
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{shenqing_code}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
return page
else:
return 0
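# The per-field try/except blocks above could equivalently use dict.get with a default,
# e.g. (sketch, same response keys):
#   shenqingri = one_zhuanli.get('applicationTime', '')
#   shenqing_code = one_zhuanli.get('patentNum', '')
#   gongkairi = one_zhuanli.get('pubDate', '')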
if __name__ == "__main__":
while True:
list_all_info = []
# Using the social credit code pulled from Redis, fetch the corresponding basic info from the database
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
# social_code = '9111010566840059XP'
# If there is no more data in Redis, wait
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
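# Assumption inferred from the branches below: retData['tycData'] carries the matched
# Tianyancha record (its 'id' becomes tycid), and retData['reput'] signals whether the
# company should be re-queued when no match is found.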
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
page = 1
while True:
page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
if page != 0:
page += 1
else:
# print(len(list_all_info))
# df_all_info = pd.DataFrame(list_all_info)
# df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
log.info(f"{id}---{xydm}----{tycid}----结束处理")
break
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)