Commit 08e4725c  Author: 薛凌堃

12/21

Parent 8f2915d4
import reits
-import policy_beijing, policy_chongqing, policy_fujian, policy_guangdong
+import policy_chongqing, policy_fujian, policy_guangdong
import policy_guangxi, policy_gwy, policy_hainan, policy_heilongjiang, policy_hubei, policy_jiangsu
import policy_jiangxi, policy_jilin, policy_liaoning, policy_neimenggu, policy_shandong, policy_hubei
import policy_shanxi, policy_sichuan, policy_tianjin, policy_yunnan, policy_zhejiang
import RuleGuide_shanghai, RuleGuide_shenzhen
import LawRules_shenzhen, LawRules_2_shenzhen
+from REITs_policyData.policy_beijing import beijing

if __name__ == "__main__":
-    policy_beijing.beijing()
+    beijing()
    reits.sse()
    reits.reform()
    reits.hebei()
...
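The entry point above simply calls each regional collector in turn, so one uncaught exception stops every collector behind it. A hedged sketch of the same dispatch with per-task isolation (assuming, as the calls above suggest, that every collector is a no-argument callable):

for task in (beijing, reits.sse, reits.reform, reits.hebei):
    try:
        task()
    except Exception as e:
        # Sketch only: log the failure and keep running the other regions.
        print(f'{task.__name__} failed: {e}')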
# -*- coding: utf-8 -*-
"""
The simulated-click approach does not work here: an account login is required.
"""
import json
import re
import time
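# Editor's sketch, not part of the commit: with simulated clicks ruled out,
# the usual workaround is to replay saved login cookies through requests.
# The cookie_ dict here stands for the one parsed from token.getToken()
# further down in this file (an assumption about its shape).
import requests

def logged_in_session(cookie_: dict) -> requests.Session:
    # Attach the stored login cookies so every request is authenticated.
    session = requests.Session()
    session.cookies.update(cookie_)
    return session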
@@ -296,7 +292,7 @@ def dic_handle(result_dic):
    return aa_dict

# Prepare for collection
-def redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    # if social_code:
    #     dic_info = baseCore.getInfomation(social_code)
@@ -342,7 +338,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
    else:
        # Start collecting
        try:
-            if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+            if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
                count += 1
                log.info(f'Collected {com_name} successfully=======took {baseCore.getTimeCost(start_time, time.time())}')
                token.updateTokeen(id_cookie, 3)
@@ -377,7 +373,7 @@ def ifbeforename(company_url):
    return ''

# Collect basic info and business registration info
-def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    qccid = company_url.split('firm/')[1].split('.html')[0]
    # Write the scraped Qichacha id back to the database
    updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
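# Editor's sketch, not the committed code: the f-string above works, but a
# parameterized query avoids quoting problems if an id or code ever contains
# a quote. Assumes the standard DB-API cursor this file already uses.
updateSql = "update EnterpriseInfo set QCCID = %s where SocialCode = %s"
cursor.execute(updateSql, (qccid, social_code))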
@@ -467,7 +463,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
    aa_dic['listingDate'] = listingDate
    aa_dic['category'] = category
    aa_dic['exchange'] = exchange
+    aa_dic['listingType'] = listType
    # print(aa_dic)
    sendkafka(aa_dic)
@@ -486,11 +482,11 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
    aa_dic['listingDate'] = listingDate
    aa_dic['category'] = category
    aa_dic['exchange'] = exchange
+    aa_dic['listingType'] = listType
    sendkafka(aa_dic)

# Check whether the company names match
-def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name):
+def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    company_url = ''
    try:
        company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
@@ -530,7 +526,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
            company_url = info_t.find('a')['href']
            beforename = ifbeforename(company_url)
            if beforename == receptname:
-                spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+                spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
    else:
        # No company with the same name was found
        data = [com_name, social_code]
@@ -544,7 +540,7 @@ if __name__ == '__main__':
    while True:
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
-        file_name = f'./data/国内企业基本信息采集情况_{nowtime}.xlsx'
+        file_name = f'./data/国内企业基本信息采集情况.xlsx'
        file.createFile(file_name)
        cookieinfo = token.getToken()
@@ -553,6 +549,7 @@ if __name__ == '__main__':
        else:
            log.info('==========no cookies left==========')
            time.sleep(30)
            continue
        id_cookie = cookieinfo[0]
        cookie_ = json.loads(cookieinfo[1])
@@ -599,6 +596,11 @@ if __name__ == '__main__':
        while flag:
            log.info('--------no data left---------')
            time.sleep(30)
+            if not baseCore.check_mysql_conn(cnx_):
+                # the 144 database
+                cnx_ = baseCore.cnx
+                cursor_ = cnx_.cursor()
+                log.info('===database 11 reconnected successfully===')
            company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
            if company_field:
                flag = False
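# check_mysql_conn is a BaseCore helper that this diff does not show; an
# editor's sketch of what such a liveness check usually looks like, assuming
# a PyMySQL-style connection object (the real implementation may differ):
def check_mysql_conn(conn):
    try:
        conn.ping(reconnect=False)  # raises if the link has gone away
        return True
    except Exception:
        return False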
@@ -608,7 +610,7 @@ if __name__ == '__main__':
            continue
        social_code = company_field.split('|')[0]
-        com_name = company_field.split('|')[2].replace(' ', '')
+        com_name = company_field.split('|')[1].replace(' ', '')
        ynDomestic = company_field.split('|')[15]
        countryName = company_field.split('|')[16]
@@ -617,6 +619,7 @@ if __name__ == '__main__':
        listingDate = company_field.split('|')[21]
        category = company_field.split('|')[19]
        exchange = company_field.split('|')[20]
+        listType = company_field.split('|')[21]
        # ynDomestic = ''
        # countryName = ''
        # securitiesCode = ''
@@ -625,8 +628,8 @@ if __name__ == '__main__':
        # category = ''
        # exchange = ''
-        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
+        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
-        time.sleep(40)
+        time.sleep(2)
        # break
    # baseCore.r.close()
    # baseCore.sendEmail(file_name)
...
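company_field packs its fields positionally into one pipe-delimited string, which is what made the [2] → [1] fix above so easy to get wrong. A sketch that names the positions once so later readers do not have to count pipes; the index map is inferred from this diff alone, and note that the committed code reads both listingDate and listType from index 21:

parts = company_field.split('|')
FIELD_INDEX = {  # inferred from this commit; may be incomplete
    'social_code': 0, 'com_name': 1, 'ynDomestic': 15, 'countryName': 16,
    'category': 19, 'exchange': 20, 'listingDate': 21, 'listType': 21,
}
com_name = parts[FIELD_INDEX['com_name']].replace(' ', '')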
@@ -94,7 +94,7 @@ def get_content2():
            child_type = content_dict['childtype']  # topic category
        except:
            child_type = ''
-        # # Check whether this page has already been crawled
+        # Check whether this page has already been crawled
        is_href = baseTool.db_storage.find_one({'网址': href})
        if is_href:
            num += 1
@@ -102,6 +102,7 @@ def get_content2():
            time.sleep(1)
            continue
        try:
+            # href = 'https://www.gov.cn/zhengce/zhengceku/202312/content_6921452.htm'
            resp = requests.get(url=href, headers=baseTool.headers, verify=False)
            resp.encoding = resp.apparent_encoding
            resp_text = resp.text
@@ -120,9 +121,7 @@ def get_content2():
        except Exception as e:
            log.info(f'---{href}--------{e}-------')
            continue
-        if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
-                or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
-                or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.odf' in file_href:
+        if '.ofd' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href or '.pdf' in file_href:
            file_name = file.text.strip()
            category = os.path.splitext(file_href)[1]
            if category not in file_name:
...
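The rewritten condition spells out both cases of every extension by hand (and swaps the '.odf' typo for '.ofd'). An equivalent sketch that lowercases once and tests a suffix list, assuming file_href is a plain URL string; the bare 'xls' is kept so '.xlsx' still matches, as in the committed check:

DOC_EXTS = ('.pdf', '.docx', '.doc', 'xls', '.zip', '.rar', '.ppt', '.ofd')

def is_attachment(file_href):
    # Case-insensitive version of the same attachment check.
    lower = file_href.lower()
    return any(ext in lower for ext in DOC_EXTS)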
import requests, time, re, random
from base import BaseCore
import pandas as pd
from bs4 import BeautifulSoup as bs
from comData.Tyc.getTycId import getTycIdByXYDM

baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
taskType = '天眼查专利/国内上市'
def spider_zhuanli(com_name, social_code, tycid, page, list_all_info):
    start_time = time.time()
    log.info(f'===processing page {page}===')
    # list_all_info = []
    t = int(time.time() * 1000)
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Host': 'capi.tianyancha.com',
        'Origin': 'https://www.tianyancha.com',
        'Referer': 'https://www.tianyancha.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzI3MzczNzEzMSIsImlhdCI6MTcwMzE1MjEzMSwiZXhwIjoxNzA1NzQ0MTMxfQ.3tF-UFhorC_mS4h2UIBOZamApfcaJEfjBbr8K11d2yHhELBM1pEvjd6yccxhLzVKRoyFdTn-1Cz6__ZpzgjnGg',
        'X-TYCID': '6f6298905d3011ee96146793e725899d',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'version': 'TYC-Web'
    }
    url = f'https://capi.tianyancha.com/cloud-intellectual-property/patent/patentListV6?_={t}&id={tycid}&pageSize=100&pageNum={page}&type=-100&lprs=-100&applyYear=-100&pubYear=-100&fullSearchText=&sortField=&sortType=-100'
    try:
        ip = baseCore.get_proxy()
    except:
        time.sleep(2)
        ip = baseCore.get_proxy()
    try:
        res_j = requests.get(url=url, headers=header, proxies=ip, verify=False).json()
    except:
        # Retry without the proxy; break as soon as one attempt succeeds,
        # otherwise res_j would never be bound before it is read below.
        for i in range(3):
            try:
                res_j = requests.get(url=url, headers=header, verify=False).json()
                break
            except:
                time.sleep(2)
                continue
    # print(res_j)
    list_all = res_j['data']['items']
    # print(list_all)
    if list_all:
        for one_zhuanli in list_all:
            # Every field except the title may be missing from the API
            # response, so fall back to '' instead of crashing.
            title = one_zhuanli['title']
            shenqingri = one_zhuanli.get('applicationTime') or ''
            shenqing_code = one_zhuanli.get('patentNum', '')
            leixing = one_zhuanli.get('patentType', '')
            status = one_zhuanli.get('lprs', '')
            gongkairi = one_zhuanli.get('pubDate', '')
            gongkai_code = one_zhuanli.get('pubnumber', '')
            famingren = one_zhuanli.get('inventor', '')
            shenqingren = one_zhuanli.get('applicantName', '')
            gongneng = one_zhuanli.get('cat', '')
            uuid = one_zhuanli.get('uuid', '')
            dic_info = {
                '企业名称': com_name,
                '统一信用代码': social_code,
                '专利名称': title,
                '申请日': shenqingri,
                '申请号': shenqing_code,
                '专利类型': leixing,
                '专利状态': status,
                '公开日': gongkairi,
                '公开号': gongkai_code,
                '发明人': famingren,
                '申请人': shenqingren,
                '功能': gongneng,
                '天眼查详情id': uuid,
                '年份': shenqingri[:4]
            }
            selectSql = f"select count(1) from zhuanli_sh_tyc where shenqing_code='{shenqing_code}' "
            cursor.execute(selectSql)
            count = cursor.fetchone()[0]
            if count > 0:
                log.info(f"{com_name}-------{shenqing_code}---already exists")
                continue
            else:
                values_tuple = tuple(dic_info.values())
                # log.info(f"{gpdm}-------{companyname}---inserted")
                insertSql = "insert into zhuanli_sh_tyc(com_name,social_code,title,shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,famingren,shenqingren,gongneng,uuid,year) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(insertSql, values_tuple)
                cnx.commit()
                log.info(f"{com_name}-------{shenqing_code}---inserted")
                time.sleep(2)
                # list_all_info.append(dic_info)
        log.info(f"【{page}】-----------end, took {baseCore.getTimeCost(start_time, time.time())}")
        return page
    else:
        return 0
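# The manual fallback above drops the proxy and re-issues the call by hand.
# An alternative sketch using requests' built-in adapter retries; the mount
# target and retry numbers are illustrative, not from the commit:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=Retry(
    total=3, backoff_factor=2, status_forcelist=[429, 500, 502, 503, 504])))
# res_j = session.get(url, headers=header, proxies=ip, verify=False).json()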

if __name__ == "__main__":
    while True:
        list_all_info = []
        # Look up the company's base info in the database, keyed by the
        # social credit code pulled from Redis
        social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
        # social_code = '9111010566840059XP'
        # If Redis has no more data, stop
        if social_code is None:
            # time.sleep(20)
            break
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) == 0:
                # Push the code back into Redis and move on
                baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
                continue
            id = data[0]
            com_name = data[1]
            xydm = data[2]
            tycid = data[11]
            if tycid is None or tycid == '':
                try:
                    retData = getTycIdByXYDM(xydm)
                    if retData['tycData'] and retData['reput']:
                        tycid = retData['tycData']['id']
                        # TODO: write the id back to the database
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor.execute(updateSql)
                        cnx.commit()
                    elif not retData['tycData'] and retData['reput']:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', 'failed to get the Tianyancha id')
                        log.info(f'======={social_code}====pushed back into redis====')
                        baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
                        continue
                    elif not retData['reput'] and not retData['tycData']:
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', 'failed to get the Tianyancha id')
                    baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----starting")
            page = 1
            while True:
                page = spider_zhuanli(com_name, xydm, tycid, page, list_all_info)
                if page != 0:
                    page += 1
                else:
                    # print(len(list_all_info))
                    # df_all_info = pd.DataFrame(list_all_info)
                    # df_all_info.to_excel('中国上市企业专利.xlsx', index=False)
                    log.info(f"{id}---{xydm}----{tycid}----finished")
                    break
        except Exception as e:
            log.info(f'==={social_code}=====failed to fetch company info==={e}=')
            # Push back into Redis for a retry
            baseCore.rePutIntoR('ZhuanLi:gnshSocial_code', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'failed to fetch company info--{e}')
            time.sleep(5)
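The SELECT-then-INSERT pair in spider_zhuanli leaves a race window between the existence check and the insert. If zhuanli_sh_tyc carries a unique index on shenqing_code (an assumption; the schema is not part of this commit), MySQL can deduplicate in a single statement:

# Sketch: assumes UNIQUE KEY (shenqing_code) on zhuanli_sh_tyc.
insertSql = ("insert ignore into zhuanli_sh_tyc(com_name,social_code,title,"
             "shenqingri,shenqing_code,leixing,status,gongkairi,gongkai_code,"
             "famingren,shenqingren,gongneng,uuid,year) "
             "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
cursor.execute(insertSql, tuple(dic_info.values()))
cnx.commit()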