Commit 55610b8f  Author: 薛凌堃

24/01/05

Parent 23d4dd76
......@@ -403,6 +403,7 @@ class BaseCore:
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
self.cnx.commit()
ip_list = []
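# each proxy row comes back from fetchall() as a one-element tuple; str(row) yields "('value',)", and the replaces below only strip that tuple wrapping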
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
......
import pandas as pd
# from pandas import DataFrame as df
import pymysql
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
df_all = pd.read_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', dtype=str)
list_com = []
for num_df in range(len(df_all)):
com_name = str(df_all['企业名称'][num_df])
dic_com = {
'social_code': '',
'com_name': com_name
}
with cnx.cursor() as cursor:
sel_sql = '''select social_credit_code from sys_base_enterprise where name = %s '''
cursor.execute(sel_sql, (com_name,))
selects = cursor.fetchone()
if selects:
print(f'【{num_df}/{len(df_all)}】==={com_name}找到')
social_code = selects[0]
else:
print(f'【{num_df}/{len(df_all)}】==={com_name}未找到')
social_code = ''
df_all.loc[num_df, '信用代码'] = str(social_code)  # .loc avoids chained-indexing assignment, which may not write back to df_all
df_all.to_excel('D:\\企业数据\\数据组提供\\第五批专精特新企业名单汇总_修订版_20240102.xlsx', index=False)
\ No newline at end of file
......@@ -228,7 +228,7 @@ def download(data, order_by):
'sid': sid,
'sourceAddress': sourceAddress,
'summary': summary,
'title': name_pdf,
'title': name_pdf.split('.pdf')[0],
'type': '0'
}
# 将相应字段通过kafka传输保存
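The Kafka send itself sits outside this hunk. A minimal sketch of how the assembled record might be produced to a topic with kafka-python; the broker address, topic name and the dict variable name are illustrative, not taken from this commit:
from kafka import KafkaProducer
import json
producer = KafkaProducer(
    bootstrap_servers='127.0.0.1:9092',  # illustrative broker address
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))
producer.send('researchReportTopic', dic_result)  # illustrative topic; dic_result stands in for the dict built above
producer.flush()  # ensure the message actually reaches the broker before moving on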
......@@ -396,8 +396,8 @@ def Mob():
# usecount = loginfo.split('|')[2]
usecount = 0
# 测试用
# account = '13636711746'
# password = 'Zhenghao123'
account = '13636711746'
password = 'Zhenghao123'
# account = '18703752600'
# password = 'Axlk010208!'
......@@ -407,8 +407,8 @@ def Mob():
# password = 'xlk123456!'
# account = '17103126138'
# password = '171BlackOne'
account = '17103128590'
password = '171BlackTwo'
# account = '17103128590'
# password = '171BlackTwo'
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
f_url = 'https://www.mob.com/developer/login'
browser.get(f_url)
......@@ -469,12 +469,8 @@ def Mob():
i_soup = BeautifulSoup(res_href,'html.parser')
summary_list = i_soup.find(class_='picture-content htmlContent').find_all('h3')
news_url = f'https://api.os.mob.com/api/academy_report/download/{report_id}'
# headers['token'] = '92b42171-7a33-4f3b-a25b-9ca689699e10'
# headers['token'] = '495f9714-7ea8-4987-91c0-2b0ede38238b'
# headers['token'] = '0dcbde4a-9aaa-4651-b886-856add4b8df9'
# headers['token'] = '2fcdd67b-da81-4f2f-9d6f-529fdbf6ae1f'
# headers['token'] = 'dd54bc77-50fa-4a25-aec7-95ec45bd17f8'
headers['token'] = '2fd143d3-a1ec-4d9d-9d9b-38a1d4cf8387'
headers['token'] = '05bc441a-b09b-40cb-ab65-8d9e63e5c529'
news_req = session.get(url=news_url,headers=headers)
pdf_url = news_req.json()['data']
......@@ -693,31 +689,75 @@ def juliangsuanshu():
getnews(browser)
browser.quit()
def ke36switch(browser,info_url):
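    # Load info_url and return its parsed soup; on any failure, rebuild the browser behind a fresh proxy and retry.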
try:
browser.get(info_url) # 跳到指定页面
page_source = browser.page_source # 获取页面信息
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
return soup_info
except:
browser.quit()
proxy = baseCore.get_proxy()
# proxy = {
# 'http': '222.90.4.73:40018',
# 'httpS': '222.90.4.73:40018'
# }
opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
browser.refresh()
return ke36switch(browser, info_url)  # propagate the retried page's soup back to the caller
# 36氪
def ke36():
# browser = webdriver.Chrome(chromedriver)
proxy = baseCore.get_proxy()
opt.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# opt.add_argument('--proxy-server=' + proxy['http'])
browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
url = 'https://36kr.com/academe'
browser.get(url)#跳到指定页面
time.sleep(3)
for i in range(10):
try:
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'show-more')))
js = "var q=document.documentElement.scrollTop=3000"
browser.execute_script(js)
time.sleep(2)
browser.find_element(By.CLASS_NAME, 'show-more').click()
except:
break
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
page_source = browser.page_source#获取页面信息
soup = BeautifulSoup(page_source, 'html.parser')
list_all = soup.find('div',{'class':'report-list-wrapper'}).find_all('div',{'class':'report-card type-4'})
for one_info in list_all:
for one_info in list_all[::-1]:
info_title = one_info.find('div',{'class':'title'}).text
info_zhaiyao = one_info.find('div',{'class':'desc'}).text
info_url = one_info.a.get('href')
# is_member = r.sismember('report_pdf_three_history', info_url)
# if is_member:
# continue
soup_info = ke36switch(browser,info_url)
browser.get(info_url)#跳到指定页面
page_source = browser.page_source#获取页面信息
soup_info = BeautifulSoup(page_source, 'html.parser')
info_date = soup_info.find('meta',{'property':'article:published_time'}).get('content')[:10]
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text
info_date = soup_info.find('meta', {'property': 'article:published_time'}).get('content')[:10]
if info_date < '2023-05-10':
pass
else:
time.sleep(1)
continue
try:
info_content = soup_info.find('div',{'class':'common-width margin-bottom-20'}).text
except:
    # retry behind a fresh proxy and re-extract the content from the reloaded page
    proxy = baseCore.get_proxy()
    opt.add_argument('--proxy-server=' + proxy['http'])
    browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    soup_info = ke36switch(browser, info_url)
    info_content = soup_info.find('div', {'class': 'common-width margin-bottom-20'}).text
dic_post = {
'title': info_title, # 报告名称
'url_pdf': '', # 报告链接
......@@ -734,7 +774,7 @@ def ke36():
'sid': '1662008421217378306', # 信息源id
}
order_by = 1
download(dic_post, order_by)
# download(dic_post, order_by)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
......@@ -742,6 +782,7 @@ def ke36():
# res = requests.post(url, data=json.dumps(dic_post))
# print(res.json())
time.sleep(2)
browser.quit()
......@@ -922,6 +963,28 @@ def shijiejingjiluntan():
time.sleep(2)
browser.quit()
def get_json(key_word,page,headers):
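    # Build the eastmoney report-search request for the given keyword/page and return the parsed JSON; the [1:-1] slice below strips the jsonp wrapper characters.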
param = {
"uid": "",
"keyword": key_word,
"type": ["researchReport"],
"client": "web",
"clientVersion": "curr",
"clientType": "web",
"param": {"researchReport": {"client": "web", "pageSize": 10, "pageIndex": page}}
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url, headers=headers).text[1:-1]
res_json = json.loads(res)
return res_json
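For orientation, a quick sketch of how get_json is consumed in dongfangcaifu below (the keyword is illustrative; headers is the dict defined in that function):
res_json = get_json('科达自控', 1, headers)
total = res_json['hitsTotal']                    # total number of matching research reports
reports = res_json['result']['researchReport']   # entries for this page (pageSize is 10)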
# 东方财富网
def dongfangcaifu():
headers = {
......@@ -965,101 +1028,70 @@ def dongfangcaifu():
page = 1
# for page in range(1,500):
# log.info(page)
param = {
"uid": "",
"keyword": key_word,
"type": ["researchReport"],
"client": "web",
"clientVersion": "curr",
"clientType": "web",
"param": {"researchReport": {"client": "web", "pageSize": 10, "pageIndex": page}}
}
param_url = parse.quote(str(param).replace(" ", ""))
# param_url = parse.quote(str(param))
# param_url = f'%7B"uid"%3A""%2C"keyword"%3A"{key_word}"%2C"type"%3A%5B"researchReport"%5D%2C"client"%3A"web"%2C"clientVersion"%3A"curr"%2C"clientType"%3A"web"%2C"param"%3A%7B"researchReport"%3A%7B"client"%3A"web"%2C"pageSize"%3A10%2C"pageIndex"%3A{page}%7D%7D%7D'
t = int(time.time() * 1000)
url = f'https://search-api-web.eastmoney.com/search/jsonp?cb=&param={param_url}&_={t}'
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
res = requests.get(url=url,headers=headers).text[1:-1]
res_json = json.loads(res)
list_all = res_json['result']['researchReport']
if list_all:
pass
else:
continue
for one_news in list_all:
news_title = one_news['title']
news_title = news_title.replace('<em>', '').replace('</em>', '')
news_date = one_news['date'][:10]
comparison_date = "2023-12-08"
# 比较发布日期是否小于2023-10-06
if news_date < comparison_date:
continue
else:
res_json_ = get_json(key_word, page, headers)
# 添加页数
total = res_json_['hitsTotal']
page = int(total / 10) + 1  # range() below needs an integer page count
for page_ in range(1,page+1):
res_json = get_json(key_word,page_,headers)
list_all = res_json['result']['researchReport']
if list_all:
pass
news_come = one_news['source']
news_code = one_news['code']
else:
continue
for one_news in list_all:
news_title = one_news['title']
news_title = news_title.replace('<em>', '').replace('</em>', '')
news_date = one_news['date'][:10]
comparison_date = "2023-12-08"
# 比较发布日期是否小于2023-12-08
if news_date < comparison_date:
continue
else:
pass
news_come = one_news['source']
news_code = one_news['code']
news_url = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={news_code}'
news_url = f'https://data.eastmoney.com/report/zw_stock.jshtml?infocode={news_code}'
news_res = requests.get(news_url)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
news_res = requests.get(news_url)
news_soup = BeautifulSoup(news_res.content, 'html.parser')
try:
if '抱歉,您访问的页面不存在或已删除!' in news_soup.title.text:
try:
if '抱歉,您访问的页面不存在或已删除!' in news_soup.title.text:
continue
except:
continue
except:
continue
try:
news_content = news_soup.find('div', {'class': 'newsContent'}).text.strip()
except:
news_content = news_soup.find('div', {'class': 'ctx-content'}).text.strip()
try:
news_content = news_soup.find('div', {'class': 'newsContent'}).text.strip()
except:
news_content = news_soup.find('div', {'class': 'ctx-content'}).text.strip()
try:
news_pdf = news_soup.find('div', {'class': 'detail-header'}).find_all('a')[-1].get('href')
except:
news_pdf = news_soup.find('span', {'class': 'to-link'}).a.get('href')
try:
news_pdf = news_soup.find('div', {'class': 'detail-header'}).find_all('a')[-1].get('href')
except:
news_pdf = news_soup.find('span', {'class': 'to-link'}).a.get('href')
dic_post = {
'title': news_title, # 报告名称
'url_pdf': news_pdf, # 报告链接
'year': news_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': social_code, # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': news_date, # 时间
'origin': '东方财富网-研报中心', # 来源
'sourceAddress': news_url, # 原文链接
'content': '', # 内容
'summary': news_content, # 摘要
'sid': '1662008733005160449', # 信息源id
'come': news_come,
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# log.info(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
# # report-list
# res = requests.post(url, data=json.dumps(dic_post))
# log.info(res.json())
# dic_news = {
# '关键字':key_word,
# '标题':news_title,
# '时间':news_date,
# '来源':news_come,
# '摘要':news_content,
# '原文链接':news_url,
# 'PDF链接':news_pdf,
# }
# list_all_info.append(dic_news)
# if len(list_all) != 10:
# break
dic_post = {
'title': news_title, # 报告名称
'url_pdf': news_pdf, # 报告链接
'year': news_date[:4], # 报告年份
'type_id': '4', # 报告种类,(年报:1,季报:2,月报:3,研报:4)
'item_id': social_code, # 关联记录id,如:企业信用代码
'category': 'pdf', # 文件后缀名,如:pdf
'create_by': 'TangYuHang', # 创建人,使用驼峰命名,如:TangYuHang
'publishDate': news_date, # 时间
'origin': '东方财富网-研报中心', # 来源
'sourceAddress': news_url, # 原文链接
'content': '', # 内容
'summary': news_content, # 摘要
'sid': '1662008733005160449', # 信息源id
'come': news_come,
}
order_by = 1
download(dic_post, order_by)
order_by += 1
# 东方财富网2
def dongfangcaifu2():
......@@ -1590,11 +1622,11 @@ def dongfangcaifu7():
if __name__ == '__main__':
try:
log.info('mob')
Mob()
except Exception as e:
pass
# try:
# log.info('mob')
# Mob()
# except Exception as e:
# pass
# try:
# log.info('yidong_guanxiangtai')
# yidong_guanxiangtai()
......@@ -1605,11 +1637,12 @@ if __name__ == '__main__':
# juliangsuanshu()
# except Exception as e:
# pass
# try:
# log.info('ke36')
# ke36()
# except:
# pass
try:
log.info('ke36')
ke36()
except Exception as e:
ke36()
pass
# try:
# log.info('qianyanzhishiku')
# qianyanzhishiku()
......
......@@ -121,7 +121,7 @@ def get_content2():
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
if '.wps' in file_href or '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
......
# 天眼查商标申请数量
# 接口 https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_=1703216298337
# 请求方式 POST
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查商标/中国500强'
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTcwMjcxMjg4MywiZXhwIjoxNzA1MzA0ODgzfQ.mVTR6Wz7W_IBjf4rLYhKacG9CRxGTzIGKmlqrR9jN-_t0Z4vUYVYwOTMzo7vT9IClJELruhl4d31KBHX0bZ1NQ',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
if __name__ == "__main__":
while True:
start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ShangBiao:zg500shSocial_code')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
continue
# count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
t = int(time.time()*1000)
# url = f'https://capi.tianyancha.com/cloud-intellectual-property/intellectualProperty/trademarkList?_={t}'
url = f'https://capi.tianyancha.com/cloud-intellectual-property/trademark/statistics?_={t}&cgid={tycid}'
# tycid = '209252214'
# payload = {"id": tycid, "ps": 10, "pn": 1, "int_cls": "-100", "status": "-100", "app_year": "-100",
# "regYear": "-100", "searchType": "-100", "category": "-100", "fullSearchText": "", "sortField": "",
# "sortType": "-100"}
request = requests.get(url=url, headers=header, verify=False)
# request = requests.post(url=url, headers=header, data=payload)
# print(request.text)
data_json = request.json()
# print(data_json)
try:
all_data = data_json['data']['applyYearGraph']['statisticGraphData']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}----已经存在---无商标数据")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-----新增---无商标数据")
continue
for info in all_data:
year = info['desc']
num = info['num'] # 申请商标数量
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'年份': year,
'数量': num
}
selectSql = f"select count(1) from shangbiao_sh_tyc where social_code='{xydm}' and year='{year}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{year}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into shangbiao_sh_tyc(com_name,social_code,year,num) values (%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{year}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{xydm}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ShangBiao:zg500shSocial_code', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
......
import requests,time,re,random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
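    # Fetch one page (pageSize=100) of patents for this company from the Tianyancha patent API and insert any new rows into zhuanli_sh_tyc; returns the page number on success, 0 when there is nothing (more) to store.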
start_time = time.time()
log.info(f'===正在处理第{page}页===')
# list_all_info = []
t = int(time.time() * 1000)
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
try:
ip = baseCore.get_proxy()
except:
time.sleep(2)
ip = baseCore.get_proxy()
try:
res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
except:
for i in range(3):
    try:
        res_j = requests.get(url=url, headers=header, verify=False).json()
        break  # stop retrying once a response parses successfully
    except:
        time.sleep(2)
        continue
# print(res_j)
try:
list_all = res_j['data']['items']
except:
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code
}
selectSql = f"select count(1) from zhuanli_sh_tyc where social_code='{social_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}---{social_code}---已经存在---无专利")
return 0
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code) values (%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}---{social_code}---新增---无专利")
return 0
# print(list_all)
if list_all:
for one_zhuanli in list_all:
title = one_zhuanli['title']
try:
shenqingri = one_zhuanli['applicationTime']
except:
shenqingri = ''
try:
shenqing_code = one_zhuanli['patentNum']
except:
shenqing_code = ''
try:
leixing = one_zhuanli['patentType']
except:
leixing = ''
try:
status = one_zhuanli['lprs']
except:
status = ''
try:
gongkairi = one_zhuanli['pubDate']
except:
gongkairi = ''
try:
gongkai_code = one_zhuanli['pubnumber']
except:
gongkai_code = ''
try:
famingren = one_zhuanli['inventor']
except:
famingren = ''
try:
shenqingren = one_zhuanli['applicantName']
except:
shenqingren = ''
try:
gongneng = one_zhuanli['cat']
except:
gongneng = ''
try:
uuid = one_zhuanli['uuid']
except:
uuid = ''
dic_info = {
'企业名称': com_name,
'统一信用代码': social_code,
'专利名称': title,
'申请日': shenqingri,
'申请号': shenqing_code,
'专利类型': leixing,
'专利状态': status,
'公开日': gongkairi,
'公开号': gongkai_code,
'发明人': famingren,
'申请人': shenqingren,
'功能': gongneng,
'天眼查详情id': uuid,
'年份': shenqingri[:4]
}
selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
cursor.execute(selectSql)
count = cursor.fetchone()[0]
if count > 0:
log.info(f"{com_name}-------{shenqing_code}---已经存在")
continue
else:
values_tuple = tuple(dic_info.values())
# log.info(f"{gpdm}-------{companyname}---新增")
insertSql = f"insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertSql, values_tuple)
cnx.commit()
log.info(f"{com_name}-------{shenqing_code}---新增")
time.sleep(2)
# list_all_info.append(dic_info)
log.info(f"【{page}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")
return page
else:
return 0
if __name__ == "__main__":
while True:
list_all_info = []
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code_zg500')
# social_code = '91350700856994874M'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
break
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# 数据重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor.execute(updateSql)
cnx.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
page = 1
while True:
page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
if page != 0:
page += 1
else:
# print(len(list_all_info))
# df_all_info = pd.DataFrame(list_all_info)
# df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
log.info(f"{id}---{xydm}----{tycid}----结束处理")
break
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败==={e}=')
# 重新塞入redis
baseCore.rePutIntoR('ZhuanLi:gnshSocial_code_zg500', social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)