Commit b9021d22 by 薛凌堃

9/7

Parent 204d228a
......@@ -327,8 +327,8 @@ def FBS():
# Provincial state-owned enterprises / league & city state-owned enterprises
def MengZhi():
cnx, cursor = connectSql()
gn_query = "select a.SocialCode from EnterpriseInfo a,EnterpriseType b where b.type=5 and a.SocialCode=b.SocialCode;"
cnx, cursor = cnn11()
gn_query = "select * from t_0906 a where not exists (select 1 from sys_base_enterprise_executive b where a.xydm =b.social_credit_code)"
# gn_query = "select a.SocialCode from EnterpriseInfo a,EnterpriseType b where b.type=4 and a.SocialCode=b.SocialCode;"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
......@@ -336,7 +336,7 @@ def MengZhi():
gn_social_list = [item[0] for item in gn_result]
for item in gn_social_list:
if not r.exists(item):
r.rpush('BaseInfoEnterpriseMz:gnqy_socialCode', item)
# r.rpush('BaseInfoEnterpriseMz:gnqy_socialCode', item)
r.rpush('CorPersonEnterprise:gnqy_socialCode', item)
closeSql(cnx, cursor)
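For context, a minimal sketch of the consumer side of the Redis list filled above (the connection parameters and helper name are placeholders; the committed code gets its client r from the project's own configuration, which is not shown in this diff):

import redis

# Stand-in connection; the real host/port/db live in the project's config.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def pull_social_code():
    # Pop one social credit code pushed by MengZhi(); returns None when the list is empty.
    raw = r.lpop('CorPersonEnterprise:gnqy_socialCode')
    return raw.decode('utf-8') if raw else None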
......
import json
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
url = 'http://www.imlooker.com/v4/company/142660.html'
req = requests.get(url)
soup = BeautifulSoup(req.content,'html.parser')
# print(soup)
info = soup.find('div',id='mydiv1')
company_name = '台玻集团'
WebSite = info.find('div').text.split('官网:')[1]
# Brief introduction
info.find('div').decompose()
briefInfo = info.text.strip()
table_info = soup.find_all('div',class_='com_340')[1]
# print(table_info)
td_list = table_info.find_all('td')
# print(td_list)
incorporationDate = td_list[1].text
businessRange = td_list[3].text
scale = td_list[11].text
address = td_list[13].text
aa_dict = {
'qccId': '', # Qichacha (企查查) company id
'name': company_name, # company name
'shortName': '', # short name
'socialCreditCode': '', # unified social credit code
'legalPerson': '', # legal representative
'officialPhone': '', # phone
'officialUrl': WebSite, # official website
'officialEmail': '', # email
'briefInfo': briefInfo, # brief introduction
'registerStatus': '', # registration status
'incorporationDate': incorporationDate, # date of incorporation
'capital': '', # registered capital
'paidCapital': '', # paid-in capital
'approvalDate': '', # approval date
'organizationCode': '', # organization code
'registerNo': '', # business registration number
'taxpayerNo': '', # taxpayer identification number
'type': '', # enterprise type
'businessStartDate': '', # business term start
'businessEndDate': '', # business term end
'taxpayerQualification': '', # taxpayer qualification
'industry': '', # industry
'region': '',
'province': '台湾省', # province
'city': '台北市', # city
'county': '松山区', # county/district
'registerDepartment': '', # registration authority
'scale': scale, # staff size
'insured': '', # number of insured employees
'beforeName': '', # former name
'englishName': 'Taiwan Glass Group', # English name
'importExportEnterpriseCode': '', # import/export enterprise code
'address': address, # address
'businessRange': businessRange, # business scope
'status': 0, # status
}
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
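As a small follow-up, a hedged sketch of flushing and closing the producer after a successful send (kafka-python's flush() and close() are standard calls; placing them here assumes the try block above succeeded and producer exists):

# Make sure buffered records are actually delivered, then release the connection.
producer.flush(timeout=10)
producer.close()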
"""
CNKI paper collection; simulated clicks; crawler IPs get banned
"""
import pymysql
import requests,re,time,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from base.BaseCore import BaseCore
baseCore = BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
def get_proxy():
sql = "select proxy from clb_proxy"
cursor.execute(sql)
proxy_lists = cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return proxy_list
# Configure browser launch capabilities: pick one proxy at random and attach it to Chrome
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
proxy_meta = random.choice(get_proxy())['HTTP'].split('://')[-1]  # Chrome expects host:port, no scheme
selenium_proxy = Proxy({'proxyType': ProxyType.MANUAL, 'httpProxy': proxy_meta, 'sslProxy': proxy_meta})
selenium_proxy.add_to_capabilities(capabilities)
info = pd.read_excel('全球创新指标数据(1).xlsx')
enterprise_name_list = []
industry_list = []
for i in range(info.shape[0]):
# print(info['contrast_name'][i])
if info['contrast_name'][i]=='发表论文数量' :
enterprise_name = info['enterprise_name'][i]
if enterprise_name == '中国石油天然气股份有限公司':
pass
else:
continue
industry = info['industry'][i]
industry_list.append(industry)
enterprise_name_list.append(enterprise_name)
df_all = pd.DataFrame({'公司名称':enterprise_name_list,
'行业':industry_list})
df_all['文章发表数'] = ''
# for year in range(2022,1989,-1):
# df_all[f'{year}'] = ''
# print(df_all)
list_one_info = []
def get_num(com_name,com_industry):
url = f'https://kns.cnki.net/kns8/DefaultResult/Index?dbcode=CFLQ&kw={com_name}&korder=AF'
browser.get(url) # navigate to the CNKI search results page
time.sleep(2)
btn = browser.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/div/a/span')
btn.click()
print('点击1成功')
time.sleep(3)
btn2 = browser.find_element(By.XPATH,'//*[@id="divGroup"]/dl[3]/dt')
btn2.click()
print("点击2成功")
time.sleep(1)
page_source = browser.page_source # grab the rendered page HTML
soup = BeautifulSoup(page_source, 'html.parser')
num_all = soup.find_all('div', {'class': 'resultlist'})[3].find('ul').find_all('li')
if num_all:
for li in num_all:
year = li.find('a').text
num = li.find('span').text.split('(')[1].split(')')[0]
dic_json = {
'enterprise_name':com_name,
'year':year,
'num':num,
'source':'国内外企业发布文章数量来源:中国知网',
'industry':com_industry
}
list_one_info.append(dic_json)
else:
dic_json = {
'enterprise_name': com_name,
'year': '',
'num': '',
'source': '国内外企业发布文章数量来源:中国知网',
'industry': com_industry
}
list_one_info.append(dic_json)
return list_one_info
chromedriver = r'D:\Chrome\chromedriver.exe'
# Pass the proxy-enabled capabilities built above (selenium 3 style API)
browser = webdriver.Chrome(chromedriver, desired_capabilities=capabilities)
for i in range(0,len(df_all)):
com_name = df_all['公司名称'][i]
com_industry=df_all['行业'][i]
try:
list_one_info = get_num(com_name,com_industry)
except:
continue
print(list_one_info)
df_info = pd.DataFrame(list_one_info)
df_info.to_excel('年份-论文发表数量.xlsx',index=False)
++ /dev/null
import requests, re, time, pymysql
from bs4 import BeautifulSoup as bs
from fdfs_client.client import get_tracker_conf, Fdfs_client
from base import BaseCore
baseCore = BaseCore.BaseCore()
requests.adapters.DEFAULT_RETRIES = 3
log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业年报/雪球网'
def tableUpdate(year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time):
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s'''
cursor.execute(sel_sql, (item_id, year))
selects = cursor.fetchone()
if selects:
print(f'{com_name},{year}已存在')
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by,
create_time)
cursor.execute(Upsql, values) # execute the insert
cnx.commit() # commit the transaction
print("更新完成:{}".format(Upsql))
def getContent(social_code, com_name, code,start_time):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}
comp = re.compile('-?[1-9]\d*')
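# Added note: this pattern matches the first integer (no leading zeros) in a report title;
# it is used below as comp.findall(year_name)[0] to pull out the year.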
num = 1
ip = baseCore.get_proxy()
url_1 = f'https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{code}/page_type/ndbg.phtml'
res_1 = requests.get(url_1, proxies=ip)
soup = bs(res_1.content, 'html.parser',from_encoding='gb2312')
# Fetch the list of annual reports
try:
list_all = soup.find('div', {'class': 'datelist'}).find_all('a')
except:
log.info(f'{social_code}.........年度报告列表为空')
exception = '年度报告列表为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
return
# Fetch details for each annual report
for href in list_all:
ip = baseCore.get_proxy()
year_url = 'https://vip.stock.finance.sina.com.cn' + href.get('href')
year_name = href.text
res_2 = requests.get(year_url, proxies=ip)
soup_2 = bs(res_2.content, 'html.parser',from_encoding='gb2312')
try:
pdf_url = soup_2.find('th', {'style': 'text-align:center'}).find('a').get('href')
except:
log.error(f'{social_code}....{year_url}....无下载链接')
exception = '无下载链接'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
continue
for i in range(0, 3):
try:
resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
try:
year = comp.findall(year_name)[0]
except:
continue
name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
result = ''
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
break
except Exception as e:
log.error(f'{social_code}...年报上传服务器出错:{e}')
time.sleep(3)
continue
if result == '':
exception = '上传服务器失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
type_id = '1'
item_id = social_code
group_name = 'group1'
path = bytes.decode(result['Remote file_id']).replace('group1', '')
full_path = bytes.decode(result['Remote file_id'])
category = 'pdf'
file_size = result['Uploaded size']
order_by = num
status = 1
create_by = 'XueLingKun'
create_time = time_now
try:
tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size,
order_by, status, create_by, create_time)
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '')
except:
exception = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
def begin():
while True:
start_time = time.time()
# Fetch enterprise info
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '91100000100003962T'
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
count = dic_info[15]
code = dic_info[3]
com_name = dic_info[4]
if code is None:
exception = '股票代码为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
continue
getContent(social_code, com_name, code,start_time)
count += 1
runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count)
break
if __name__ == '__main__':
begin()
cursor.close()
cnx.close()
baseCore.close()
# -*- coding: utf-8 -*-
......@@ -97,10 +97,10 @@ def spider_annual_report(dict_info,num):
try:
# The title contains the year
year = re.findall('\d{4}', year_name)[0]
year = re.findall('\d{4}\s*年', year_name)[0]
if com_name != 'null':
name_pdf = f"{com_name}:{year}年报.pdf".replace('*', '')
name_pdf = f"{com_name}:{year}年报.pdf".replace('*', '')
else:
name_pdf = pdf_name_a + '.pdf'
except:
......@@ -144,6 +144,10 @@ def spider_annual_report(dict_info,num):
# name_pdf = pdf_name_a + '.pdf'
with cnx.cursor() as cursor:
if '年' in year:
year = year.split('年')[0]
else:
pass
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
cursor.execute(sel_sql, (social_code, int(year)))
selects = cursor.fetchone()
......@@ -208,7 +212,7 @@ def spider_annual_report(dict_info,num):
'code': '200',
}
print(dic_result)
return True
# return True
except Exception as e:
dic_result = {
'success': 'false',
......@@ -225,6 +229,8 @@ def spider_annual_report(dict_info,num):
# num = num + 1
time.sleep(2)
# browser.quit()
return True
#state1
if __name__ == '__main__':
......@@ -233,8 +239,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# Fetch enterprise info
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '911100007109288314'
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '9133020071331910XJ'
if not social_code:
time.sleep(20)
continue
......
"""
"""
......@@ -534,7 +534,7 @@ def job(taskType):
baseCore.close()
if __name__=='__main__':
task_type = '财务数据/东方财富网/福布斯'
task_type = '财务数据/东方财富网'
job(task_type)
......
import random
import time
from tqdm import tqdm
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
import urllib3
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx
cursor = baseCore.cursor
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
'Host':'www.sasac.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# Create an ExcelWriter for the output workbook
writer = pd.ExcelWriter('国务院厅局.xlsx')
url = 'http://www.sasac.gov.cn/n2588020/index.html'
ip = baseCore.get_proxy()
res = requests.get(url, headers=headers, proxies=ip)  # headers must be passed as a keyword, not positionally
soup = BeautifulSoup(res.content,'html.parser')
time.sleep(2)
# List of SASAC bureaus/offices
list_type = soup.find('div',class_='l-jgkk-right column').find_all('dd')
list_error = []
for type in tqdm(list_type[:2]):
list_news = []
href_type = type.find('a')['href']
ting_type = type.find('a').text
print(f'\n================厅局类别==={ting_type}========================')
if 'http' in href_type:
url_type = href_type
else:
url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../','')
# print(url_type)
i_res = requests.get(url_type, headers=headers)
i_soup = BeautifulSoup(i_res.content,'html.parser')
time.sleep(2)
news_list = i_soup.find('div',class_='tjywBottom').find_all('li')
# Article list
# print('================新闻列表==================')
for news in tqdm(news_list[:2]):
try:
news_href = news.find('a')['href']
except:
continue
if 'http' in news_href:
news_url = news_href
else:
news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../','')
news_title = news.find('a').text.split('[')[0]
print(f'\n----正在采集: {news_title}-------')
pub_time = news.find('span').text.replace('[','').replace(']','')
# Article detail page
ii_res = requests.get(news_url, headers=headers)
ii_soup = BeautifulSoup(ii_res.content,'html.parser')
# TODO: convert relative paths to absolute URLs (a urljoin sketch follows this script)
time.sleep(2)
try:
news_info = ii_soup.find('div',class_='zsy_cotitle')
except Exception as e:
print(e)
news_info = ''
if news_info:
try:
pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0]
except:
pub_source = ''
try:
content = ii_soup.find('div','zsy_comain').text.replace('扫一扫在手机打开当前页','').strip()
except:
content = ''
# print(news_url)
dic_news = {
'标题':news_title,
'发布时间':pub_time,
'来源':pub_source,
'内容':content,
'原文链接':news_url
}
list_news.append(dic_news)
else:
dic_error = {
'标题': news_title,
'原文链接':news_url,
'厅局类别':ting_type
}
list_error.append(dic_error)
df = pd.DataFrame(list_news)
# Write each bureau's articles to its own sheet
df.to_excel(writer, sheet_name=ting_type,index=False)
print(f'=============当前sheet页{ting_type}---数据总数:{len(df)}================')
time.sleep(1)
writer.save()
df_error = pd.DataFrame(list_error)
df_error.to_excel('未采到文章.xlsx',index=False)
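Regarding the TODO above about turning relative paths into absolute URLs, a minimal sketch using only the standard library (the helper name and where it would be hooked in are illustrative, not part of the committed code):

from urllib.parse import urljoin

def absolutize_links(soup, page_url):
    # Rewrite relative href/src attributes so they resolve against the page they came from.
    for tag in soup.find_all(['a', 'img']):
        for attr in ('href', 'src'):
            if tag.get(attr):
                tag[attr] = urljoin(page_url, tag[attr])
    return soup

It could be applied to ii_soup with news_url before the content is extracted, e.g. absolutize_links(ii_soup, news_url).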
......@@ -13,7 +13,7 @@ baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
'Cookie':'HWWAFSESID=0e10b77869899be8365; HWWAFSESTIME=1688781923708; csrfToken=VeTF4UIZKJ0q6yWmgfC_FLqv; TYCID=e7cec7501d3311eea9dcb9fb7af79aad; ssuid=3142278034; sajssdk_2015_cross_new_user=1; bannerFlag=true; _ga=GA1.2.1006597844.1688781929; _gid=GA1.2.146077413.1688781929; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1688781929; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103123002%22}; tyc-user-info-save-time=1688781977329; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyMzAwMiIsImlhdCI6MTY4ODc4MTk3NiwiZXhwIjoxNjkxMzczOTc2fQ.Luw0DCFul8WxRNOM8X5-NCmy_z3BwJC5JBvofWqWkSQOleJ6zJU0SRbqwAobPfOfVyGFDUBqmxxWd4YKCeCWeQ; tyc-user-phone=%255B%252217103123002%2522%255D; searchSessionId=1688778331.16177575; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22302953956%22%2C%22first_id%22%3A%22189333f38cb947-0fb9b252742a6c-26031d51-921600-189333f38cdcdd%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg5MzMzZjM4Y2I5NDctMGZiOWIyNTI3NDJhNmMtMjYwMzFkNTEtOTIxNjAwLTE4OTMzM2YzOGNkY2RkIiwiJGlkZW50aXR5X2xvZ2luX2lkIjoiMzAyOTUzOTU2In0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22302953956%22%7D%2C%22%24device_id%22%3A%22189333f38cb947-0fb9b252742a6c-26031d51-921600-189333f38cdcdd%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1688781980',
'Cookie':'HWWAFSESID=b6312a4594bea18413c; HWWAFSESTIME=1686818921445; csrfToken=e7sNDKWelJwlcjnm6Rlny887; TYCID=6ff6bc600b5911ee89d35bf79a73a3b1; bannerFlag=true; ssuid=1534238432; refresh_page=0; _ga=GA1.2.1790752229.1688467828; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22307016917%22%2C%22first_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg4YmUzZTMzN2U0YmYtMGQ4NTcxNmQzNjZlNDQtMjYwMzFkNTEtMTA0OTA4OC0xODhiZTNlMzM3ZjE5ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNzAxNjkxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22307016917%22%7D%2C%22%24device_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%7D; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=7; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1693986307; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2213592481839%22%7D; tyc-user-info-save-time=1693986377592; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5Mzk4NjM3NywiZXhwIjoxNjk2NTc4Mzc3fQ.xeK54nMtB5wt7ipdOjhrzdplT1azvezrTuoD1b8i3OguqMB97ZOR1pFbRsP7vsKRdZ3Fsf5Y5ZqlmRKAVHGraA; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1693986412',
# 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
......@@ -27,7 +27,7 @@ def doJob():
# Using the social credit code pulled from Redis, fetch the matching base info from the database
social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# If Redis has no more data, wait
# social_code = '91610000220568570K'
# social_code = '91110108778635402E'
if social_code == None:
time.sleep(20)
continue
......
......@@ -37,6 +37,11 @@ def getTycIdByXYDM(xydm):
response = requests.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
time.sleep(random.randint(3, 5))
retJsonData =json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state']== 'ok':
pass
else:
log.error(f"---{xydm}-未查询到该企业---")
return retData['tycData']
matchType=retJsonData['data'][0]['matchType']
if matchType=='信用代码匹配':
retData['state'] = True
......