Commit 9a76d2aa  Author: 丁双波

Merge remote-tracking branch 'origin/master'

@@ -421,6 +421,7 @@ def NQEnterprise():
    nq_social_list = [item[0] for item in nq_result]
    for item in nq_social_list:
        # NEEQ company financials, listing info and key personnel are already collected; company news and announcements are not. The announcement script is ready; news items must be pushed into redis daily.
        # r.rpush('NQEnterprise:nq_Ipo', item)
        r.rpush('NQEnterprise:nq_finance', item)
        # r.rpush('NQEnterprise:nq_notice', item)
@@ -451,11 +452,26 @@ def omeng():

# 单项冠军 (manufacturing single-champion enterprises)
def danxiangguanjun():
    cnx, cursor = connectSql()
    query = "SELECT CompanyName FROM champion"
    cursor.execute(query)
    result = cursor.fetchall()
    cnx.commit()
    com_namelist = [item[0] for item in result]
    for item in com_namelist:
        r.rpush('champion:baseinfo', item)

# 科改示范 (science-reform demonstration enterprises)
def kegaishifan():
    cnx, cursor = connectSql()
    query = "SELECT CompanyName FROM technological"
    cursor.execute(query)
    result = cursor.fetchall()
    cnx.commit()
    com_namelist = [item[0] for item in result]
    for item in com_namelist:
        r.rpush('technological:baseinfo', item)

# 双百企业 (Double Hundred enterprises)
def shuangbaiqiye():
@@ -467,6 +483,8 @@ def zhuangjingtexind():
if __name__ == "__main__":
    start = time.time()
    # danxiangguanjun()
    kegaishifan()
    # NoticeEnterprise()
    # AnnualEnterpriseIPO()
    # AnnualEnterprise()
@@ -477,7 +495,7 @@ if __name__ == "__main__":
    # FBS()
    # MengZhi()
    # NQEnterprise()
    # SEC_CIK()
    # omeng()
    # AnnualEnterpriseUS()
    # NoticeEnterprise_task()
...
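A minimal consumer sketch for the queues the functions above fill (assumes the same redis connection `r` and key names; not part of the commit):

    while True:
        item = r.lpop('champion:baseinfo')
        if item is None:
            break  # queue drained
        com_name = item.decode('utf-8')
        # ... collect the base info for com_name here ...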
@@ -85,7 +85,22 @@ if __name__=='__main__':
        ein = jsonData['ein']  # federal tax ID (EIN)
        address = jsonData['addresses']
        city = address['business']['city']
        try:
            if city:
                business_address = address['business']['street1'] + ',' + city + ' ' + address['business'][
                    'stateOrCountryDescription']
            else:
                business_address = address['business']['stateOrCountryDescription']
        except:
            try:
                business_address = address['business']['street1'] + ',' + city
            except:
                try:
                    business_address = city + ' ' + address['business']['stateOrCountryDescription']
                except:
                    business_address = ''
        # city = address['business']['city']
        # business_address = address['business']['street1'] + ',' + city + ' ' + address['business']['stateOrCountryDescription']
        phone = jsonData['phone']  # phone number
        try:
            formerNames = jsonData['formerNames'][0]['name']  # former name
...
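The nested try/except above guards against missing address fields; a flatter equivalent (hypothetical helper, not in the commit) joins whatever parts are present:

    def build_business_address(business: dict) -> str:
        # join street1, city and state/country, skipping missing fields
        parts = [business.get('street1'), business.get('city'),
                 business.get('stateOrCountryDescription')]
        return ' '.join(p for p in parts if p)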
"""
解析json数据 两个链接:
https://data.sec.gov/api/xbrl/companyfacts/CIK0000320193.json 数据值和gaap字段
https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/MetaLinks.json html字段和gaap字段映射
step1:拼接链接
step2:
"""
import json
import time
import requests
from kafka import KafkaProducer
from operator import itemgetter
from itertools import groupby
from base.BaseCore import BaseCore
# import urllib3
# urllib3.disable_warnings()
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
def fromcikgetinfo(cik):
    query = f"select * from mgzqyjwyh_list where cik='{cik}' "
    cursor.execute(query)
    data = cursor.fetchone()
    return data
def getRequest(url):
    headers = {
        'Host': 'data.sec.gov',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_ga=GA1.2.784424676.1695174651; _4c_=%7B%22_4c_s_%22%3A%22lZFLT4QwFIX%2FyqRrILS0pbAzmBgXajQ%2BlhNpLwOZcUoKDo4T%2Fru3gMbHym5ov55zcjk9kaGGPcmpzARNuVRcxElAtnDsSH4irjH%2BcyA50awsDTUq1ElShZwZCMuKmbASSQUUKsYoIwF5w6w0ZpmIpeBKqTEgul0yTkRbA5hFs4iqKA6rDh39OxKuYty2zppX3a%2F7Y%2BtlA5SrzmzxwsCh0bAeGtPX3s8m%2BUJraDZ1jzhlE22dl0QC90OzN3b47Vvol0%2BkFGnp7NCB9xa1sy%2BwolQitlgEeZocfloHFTg3yfDUNb0ftAMdbexhAVjezMKZPTaemtV9cYf8%2Bhu5LW6uFtT6jv0YO6ufdz4UnyUgF2frh8tz%2F2%2BKc8ZlKqPPpxKUjHPfCJiksRAZldhnvyO5kjz2a5yTp%2FrpTzVXWfZXPbcQ%2Bulh%2Fx%2FrOH4A%22%7D; _ga_300V1CHKH1=GS1.1.1695174651.1.1.1695174684.0.0.0; ak_bmsc=91C6D28D093861656DB8C1FC1972DAB6~000000000000000000000000000000~YAAQlQ8kF2U6orCKAQAAgyl9uxX8kNk3C77pkMi6N6RxnsUqDbYEmIcNjtLSa8W6kfGL9cQMRHBUaYcbEA1+oXsvUwUF80G8hmH/F4S0ZOEnVCrlcBLx219N24l2qmoSKtVDH+VKe7c1bji9MHc7tO2R56R7juZJv9gceAdtKEuArkPfD8ijx/TyEgIrM+XruGtzCRmLnfq86UoJYP+j+tXcaWkc/qm1zHDReDNf/cHd6h2aRMs4lsES8+uh6YTjE7bfCp8h2DNJ2e07pm0ojcI/kdycUPHmuTqWPdTBEjUybad31E1hRNBAE8PbGjy2lvlPY/piuN3HX3Q5ifsmTqCNJzynN2kjGm6i4SHhmEAijUeIzNQXB11GrVmALJVV6pEjd/uu; bm_sv=FD8981426EA388050697DFB615BAFFE3~YAAQ1wcsF5K72ZSKAQAAsvl/uxUw0do3nknGCkllXH27UZBpM7kQUXm4crBNTAkhek5YSDKIrrm2uFWidfpBfyxbRSr+w7FH7Y0w4cXMAa7BELzcc/B9Uf8T6e2I2W29wjurKkBFtSseslHSqYD3BWx9/GidJMW+dFNrlzNUMd1dONUR9J1TDnYifPhE6A/zSLPHVrCTJl7xzg7VlW/05Ay0i+Bo7TynZdWgotfjET3vg2/ZVixVSGaWeQo4~1'
    }
    response = None
    for m in range(0, 3):
        try:
            response = requests.get(url=url, headers=headers, verify=False)
            break
        except Exception as e:
            log.error(f"request请求异常-------{e}")
            continue
    # check the status code; response stays None if every retry failed
    if response is not None and response.status_code == 200:
        jsonData = response.json()
        return jsonData
    else:
        return False
if __name__=='__main__':
    taskType = '财务数据/SEC'
    zcfzb_mapping = {
        'AccountsAndOtherReceivablesNetCurrent': '指标1'
    }
    lrb_mapping = {
    }
    xjllb_mapping = {
    }
    while True:
        start_time = time.time()
        # todo: pull the company cik from redis
        # cik = baseCore.redicPullData('sec_cik_US:uscik')
        cik = '320193'
        # look up the company info in the database by cik
        data = fromcikgetinfo(cik)
        com_name = data[2]
        com_code = data[3]
        exchange = data[4]
        # the cik in the url must be zero-padded to 10 digits (equivalent: url_cik = cik.zfill(10))
        url_cik = cik
        while True:
            if len(url_cik) < 10:
                url_cik = '0' + url_cik
            else:
                break
        url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{url_cik}.json'
        jsonData = getRequest(url)
        if not jsonData:
            continue
        print(jsonData)
        try:
            us_gaap = jsonData['facts']['us-gaap']
        except:
            continue
        # iterate over the mapping keys
        Listzcfzb = []
        for key in zcfzb_mapping.keys():
            # all years and amounts for one financial indicator
            usd_list = us_gaap[key]['units']['USD']
            # form: 10-K  fp: FY
            for j in usd_list:
                form = j['form']
                fp = j['fp']
                if form == '10-K' and fp == 'FY':
                    pass
                else:
                    continue
                date = j['end']
                if date.endswith('03-31') or date.endswith('06-30') or date.endswith('09-30') or date.endswith('12-31'):
                    pass
                else:
                    continue
                val = j['val']
                zcfzb_dic = {
                    'zbname': key,
                    'riqi': date,
                    'jine': val,
                    'fp': fp,
                    'form': form
                }
                # balance-sheet indicators across all years
                Listzcfzb.append(zcfzb_dic)
        Listzcfzb.sort(key=itemgetter('riqi'))
        groups = groupby(Listzcfzb, key=itemgetter('riqi'))
        # iterate each group and print the grouped result
        for riqi, group in groups:
            print(f"riqi: {riqi}")
            # materialize the group iterator
            listbydate = [item for item in group]
            print()
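itertools.groupby only merges adjacent items, which is why the list is sorted by 'riqi' first; a standalone illustration with the same dict shape:

    from itertools import groupby
    from operator import itemgetter

    rows = [{'riqi': '2022-12-31', 'jine': 1}, {'riqi': '2021-12-31', 'jine': 2},
            {'riqi': '2022-12-31', 'jine': 3}]
    rows.sort(key=itemgetter('riqi'))
    for riqi, group in groupby(rows, key=itemgetter('riqi')):
        print(riqi, [g['jine'] for g in group])  # 2021-12-31 [2] / 2022-12-31 [1, 3]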
"""从html页面中抽取表格"""
import requests
from bs4 import BeautifulSoup
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
def getRequest(url):
    headers = {
        'Referer': 'https://www.sec.gov/ix?doc=/Archives/edgar/data/356037/000035603723000038/cspi-20230630x10q.htm',
        'Sec-Ch-Ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31',
    }
    response = None
    for m in range(0, 3):
        try:
            response = requests.get(url=url, headers=headers, verify=False)
            break
        except Exception as e:
            log.error(f"request请求异常-------{e}")
            continue
    # check the status code; response stays None if every retry failed
    if response is not None and response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup
    else:
        return False
def getzcfztable(soup):
    table_list = soup.find_all('table')
    for table in table_list:
        aa = table.find_all(text='Current assets:')
        if aa:
            # print(table)
            trlist = table.find_all('tr')
            date1 = trlist[1].find_all('td')[1].text.replace('\n', '')
            date2 = trlist[1].find_all('td')[-1].text.replace('\n', '')
            print(date1, date2)
            # todo: drop td cells whose content is empty
            for tr in trlist[2:]:
                filtered_tags = tr(lambda tag: tag.name == 'td' and '$' in tag.text)
                for tag in filtered_tags:
                    tag.extract()
                # filtered_tags2 = tr(lambda tag:tag.name=='td' and tag.text==' ')
                filtered_tags2 = tr(lambda tag: tag.name == 'td' and tag.text == '')
                for tag in filtered_tags2:
                    tag.extract()
                try:
                    zbtag = tr.find_all('td')[0].text.replace('\n', '')
                except:
                    zbtag = ''
                try:
                    cash1 = tr.find_all('td')[1].text.replace('\n', '')
                except:
                    cash1 = ''
                try:
                    cash2 = tr.find_all('td')[2].text.replace('\n', '')
                except:
                    cash2 = ''
                if zbtag != '' and cash1 != '' and cash2 != '':
                    print(f'字段:{zbtag} 值1:{cash1} 值2:{cash2}')
if __name__=='__main__':
    url = 'https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/aapl-20210925.htm'
    soup = getRequest(url)
    # parse the balance-sheet table from the html
    getzcfztable(soup)
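The tr(lambda ...) calls above rely on BeautifulSoup treating a called tag as find_all; a standalone illustration of that idiom (made-up HTML):

    from bs4 import BeautifulSoup

    html = '<table><tr><td>$</td><td>Cash</td><td></td><td>1,234</td></tr></table>'
    tr = BeautifulSoup(html, 'html.parser').tr
    # tag(...) is shorthand for tag.find_all(...)
    for td in tr(lambda tag: tag.name == 'td' and ('$' in tag.text or tag.text == '')):
        td.extract()  # drop '$' marker cells and empty cells, as getzcfztable does
    print([td.text for td in tr('td')])  # ['Cash', '1,234']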
import json
import random
import requests, time, pymysql
import jieba
import sys
@@ -45,24 +47,21 @@ def beinWork(tyc_code, social_code,start_time):
    retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
    t = time.time()
    url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
    try:
        for m in range(0, 3):
            ip = baseCore.get_proxy()
            headers['User-Agent'] = baseCore.getRandomUserAgent()
            response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
            time.sleep(random.randint(3, 5))
            break
        if (response.status_code == 200):
            pass
    except Exception as e:
        log.error(f"{tyc_code}-----获取总数接口失败")
        error = '获取总数接口失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
        return retData
    try:
        json_1 = json.loads(response.content.decode('utf-8'))
@@ -177,7 +176,7 @@ def beinWork(tyc_code, social_code,start_time):
                pass
                continue
            try:
                insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
                # news record fields
                up_okCount = up_okCount + 1
                list_info = [
@@ -185,6 +184,7 @@ def beinWork(tyc_code, social_code,start_time):
                    link,
                    '天眼查',
                    '2',
                    time_format
                ]
                cursor_.execute(insert_sql, tuple(list_info))
                cnx_.commit()
@@ -214,10 +214,10 @@ def beinWork(tyc_code, social_code,start_time):
            }
        except Exception as e:
            log.info(f'传输失败:{social_code}----{link}')
            error = '数据库传输失败'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, link, f'{error}----{e}')
            continue
        # print(dic_news)
        # send the fields to kafka for storage
...
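beinWork's insert grew a fifth placeholder for publish_time; a small sketch (assuming the same cursor_ object) of keeping the column list and the %s placeholders aligned as the table evolves:

    columns = ['social_credit_code', 'source_address', 'origin', 'type', 'publish_time']
    insert_sql = (f"insert into brpa_source_article({','.join(columns)},create_time) "
                  f"values({','.join(['%s'] * len(columns))},now())")
    # cursor_.execute(insert_sql, (social_code, link, '天眼查', '2', time_format))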
import json
@@ -21,6 +21,7 @@ tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)

taskType = '企业年报/证监会'
pathType = 'ZJHAnnualReport/'

def RequestUrl(url, payload, item_id, start_time):
    # ip = get_proxy()[random.randint(0, 3)]
@@ -43,26 +44,26 @@ def RequestUrl(url, payload, item_id, start_time):
    return soup

# def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
#                 create_by, create_time, page_size):
#
#     sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
#     cursor_.execute(sel_sql, (item_id, year))
#     selects = cursor_.fetchone()
#     if selects:
#         print(f'{name_pdf},{year}已存在')
#
#     else:
#         Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
#
#         values = (
#             year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
#             create_by,
#             create_time, page_size)
#
#         cursor_.execute(Upsql, values)  # insert
#         cnx.commit()  # commit
#         print("更新完成:{}".format(Upsql))

# collect the info
def SpiderByZJH(url, payload, dic_info, num, start_time):
@@ -121,19 +122,24 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
            cursor_.execute(sel_sql, (item_id, year))
            selects = cursor_.fetchone()
            if selects:
                log.info(f'com_name:{short_name}、{year}已存在')
                continue
            else:
                retData = baseCore.uptoOBS(pdf_url, name_pdf, 1, social_code, pathType, taskType, start_time)
                if retData['state']:
                    pass
                else:
                    log.info(f'====pdf解析失败====')
                    return False
                # insert into the database and get att_id
                num = num + 1
                att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
                if att_id:
                    pass
                else:
                    return False
                content = retData['content']
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_news = {
                    'attachmentIds': att_id,
@@ -169,7 +175,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                return True
            except Exception as e:
                dic_result = {
@@ -181,7 +187,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
                log.info(dic_result)
                return False
        else:
            continue
@@ -311,7 +317,8 @@ if __name__ == '__main__':
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
        count = dic_info[16]
        log.info(f'====正在采集{social_code}=====')
        # SSE http://eid.csrc.gov.cn/101111/index.html  SZSE http://eid.csrc.gov.cn/101811/index.html  BSE http://eid.csrc.gov.cn/102611/index.html
        # url pattern for paging within a column: http://eid.csrc.gov.cn/101811/index_3_f.html
        url_parms = ['101111', '101811', '102611']
@@ -322,7 +329,7 @@ if __name__ == '__main__':
            dic_parms = getUrl(code, url_parms, Catagory2_parms)
            SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
            end_time = time.time()
            log.info(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
            count += 1
            runType = 'AnnualReportCount'
            baseCore.updateRun(social_code, runType, count)
...
# -*- coding: utf-8 -*-
@@ -152,24 +152,23 @@ def spider_annual_report(dict_info,num):
        cursor.execute(sel_sql, (social_code, int(year)))
        selects = cursor.fetchone()
        if selects:
            log.info(f'com_name:{com_name}、{year}已存在')
            continue
        else:
            # upload the file to the obs server
            retData = baseCore.uptoOBS(pdf_url, name_pdf, 1, social_code, pathType, taskType, start_time)
            if retData['state']:
                pass
            else:
                log.info(f'====pdf解析失败====')
                return False
            num = num + 1
            try:
                att_id = baseCore.tableUpdate(retData, com_name, year, name_pdf, num)
                content = retData['content']
                state = 1
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '成功')
            except:
                exception = '数据库传输失败'
                state = 0
@@ -236,6 +235,7 @@ def spider_annual_report(dict_info,num):

if __name__ == '__main__':
    num = 0
    taskType = '企业年报/雪球网'
    pathType = 'XQWAnnualReport/'
    while True:
        start_time = time.time()
        # fetch the company info
...
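Both the ZJH and Xueqiu spiders now route PDFs through baseCore.uptoOBS and branch on the returned dict; the expected contract, reduced to a sketch (field names inferred from the calls above):

    retData = baseCore.uptoOBS(pdf_url, name_pdf, 1, social_code, pathType, taskType, start_time)
    if not retData['state']:          # upload or PDF parsing failed
        log.info('====pdf解析失败====')
    else:
        content = retData['content']  # extracted text, later sent to kafka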
@@ -14,6 +14,12 @@ def conn11():
    cursor = conn.cursor()
    return conn,cursor

def conn144():
    conn = pymysql.Connect(host='114.115.159.144', port=3306, user='caiji', passwd='zzsn9988', db='caiji',
                           charset='utf8')
    cursor = conn.cursor()
    return conn,cursor

# company announcements
def shizhiCodeFromSql():
    conn,cursor=conn11()
@@ -31,6 +37,7 @@ def shizhiCodeFromSql():
    finally:
        cursor.close()
        conn.close()

# company announcements
def yahooCodeFromSql():
    conn,cursor=conn11()
@@ -49,6 +56,25 @@ def yahooCodeFromSql():
        cursor.close()
        conn.close()

# stock codes of NYSE-listed companies for Sina
def sinausstockCodeFromSql():
    conn,cursor=conn144()
    try:
        gn_query = "select ticker from mgzqyjwyh_list where state=2 and exchange='NYSE'; "
        cursor.execute(gn_query)
        gn_result = cursor.fetchall()
        gn_social_list = [item[0] for item in gn_result]
        print('sinausstockCodeFromSql开始将股票代码放入redis=======')
        for item in gn_social_list:
            r.rpush('sina_usstock:securities_code', item)
        print('sinausstockCodeFromSql将股票代码放入redis结束')
    except Exception as e:
        log.info("数据查询异常")
    finally:
        cursor.close()
        conn.close()

def yahooCode_task():
    # instantiate a scheduler
    scheduler = BlockingScheduler()
@@ -58,9 +84,12 @@ def yahooCode_task():
    scheduler.add_job(yahooCodeFromSql, 'cron', day='*/3', hour=0, minute=0)
    # once a day
    scheduler.add_job(shizhiCodeFromSql, 'cron', hour=10, minute=0)
    # every three days, same cadence as yahooCodeFromSql
    scheduler.add_job(sinausstockCodeFromSql, 'cron', day='*/3', hour=0, minute=0)
    try:
        # yahooCodeFromSql()  # run once before the schedule starts
        # shizhiCodeFromSql()  # run once before the schedule starts
        sinausstockCodeFromSql()  # run once before the schedule starts
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)
...
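For reference, day='*/3' in an APScheduler cron trigger fires on days 1, 4, 7, ... of each month at the given time; a standalone sketch of the same schedule (assumes apscheduler is installed):

    from apscheduler.schedulers.blocking import BlockingScheduler

    scheduler = BlockingScheduler()
    # same cadence as sinausstockCodeFromSql above
    scheduler.add_job(lambda: print('tick'), 'cron', day='*/3', hour=0, minute=0)
    # scheduler.start()  # commented out so the sketch has no side effects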
# -*- coding: utf-8 -*-
@@ -373,6 +373,28 @@ class YahooCaiwu(object):
            currency=''
        return currency

    # trigger the comparative-indicator calculation
    def calculateIndexReq(self):
        get_url = 'http://114.115.236.206:8088/sync/calculateIndex'
        try:
            params={
                'type':2
            }
            resp = requests.get(get_url,params=params)
            print(resp.text)
            text=json.loads(resp.text)
            codee=text['code']
            # if the service answers -200, wait 10 minutes and retry once, then give up
            while codee==-200:
                time.sleep(600)
                resp = requests.get(get_url)
                print(resp.text)
                text=json.loads(resp.text)
                codee=text['code']
                if codee==-200:
                    break
            print('调用接口成功!!')
        except:
            print('调用失败!')

if __name__ == '__main__':
    # parse_excel()
    # get_content1()
@@ -383,8 +405,11 @@ if __name__ == '__main__':
        securitiescode=yahoo.getCodeFromRedis()
        yahoo.get_content2(securitiescode)
    except Exception as e:
        yahoo.calculateIndexReq()
        if securitiescode:
            yahoo.r.rpush('NoticeEnterprise:securities_code',securitiescode)
        else:
            time.sleep(300)
            print('没有数据暂停5分钟')
import configparser
@@ -20,6 +20,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from operator import itemgetter
from itertools import groupby
import datetime
from decimal import Decimal

class SinaUsstock(object):
@@ -54,13 +55,19 @@ class SinaUsstock(object):
            seriesValue=tddoc.find('td').text().split(' ')
            for i in range(0,len(pdate)):
                value=seriesValue[i]
                try:
                    if '亿' in value:
                        value = value.replace("亿", "").replace(",", "")
                        value = Decimal(value) * Decimal('100000000')
                        # value = eval(value)
                    elif '万' in value:
                        value = value.replace("万", "").replace(",", "")
                        value = Decimal(value) * Decimal('10000')
                        # value = eval(value)
                except Exception as e:
                    print(e)
                    print(value)
                vvla = str(value).replace(",", "")
                serisemsg={
                    'name':seriesName,
                    'value':vvla,
@@ -71,6 +78,31 @@ class SinaUsstock(object):
        return seriesList

    # check whether the stock code already exists
    def check_code(self,com_code):
        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
        res = r.exists('com_sinacaiwushuju_code::'+com_code)
        # if the key exists, the company has been collected before (res = 1)
        if res:
            return False  # not the first collection
        else:
            return True  # the first collection

    def check_date(self,com_code,info_date):
        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
        res = r.sismember('com_sinacaiwushuju_code::'+com_code, info_date)  # note: stored as a redis set
        if res:
            return True
        else:
            return False

    # save the collected reporting periods of a stock code into redis
    def add_date(self,com_code,date_list):
        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=3)
        # iterate date_list and add each period to redis
        for date in date_list:
            res = r.sadd('com_sinacaiwushuju_code::'+com_code,date)

    def getCodeFromRedis(self):
        securitiescode=self.r.lpop('sina_usstock:securities_code')
        securitiescode = securitiescode.decode('utf-8')
@@ -209,7 +241,7 @@ class SinaUsstock(object):

            # convert the data format and send it to the interface
            annualzb=zbl1+zbl3+zbl5
            annualzb=self.groupZbData(annualzb,stock,social_credit_code,'year')
            self.sendToFinance(annualzb)
            quarterzb=zbl2+zbl4+zbl6
            quarterzb=self.groupZbData(quarterzb,stock,social_credit_code,'quarter')
@@ -228,15 +260,26 @@ class SinaUsstock(object):

    def sendToFinance(self,zbmsg):
        for zbb in zbmsg:
            com_code=zbb['securitiesCode']
            com_date=zbb['date']
            # check whether this stock code has been collected before
            if self.check_code(com_code):
                zbb['ynFirst']=True
            if len(zbb) != 0:
                # call the Kaige interface to store the data
                data = json.dumps(zbb)
                # no interface yet
                url_baocun = 'http://114.115.236.206:8088/sync/finance/sina'
                # url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
                for nnn in range(0, 3):
                    try:
                        res_baocun = requests.post(url_baocun, data=data)
                        # record the collected stock code and reporting date, marking it as collected
                        com_date_list=[]
                        com_date_list.append(com_date)
                        self.add_date(com_code,com_date_list)
                        self.logger.info(res_baocun.text)
                        break
                    except:
@@ -309,7 +352,7 @@ class SinaUsstock(object):

if __name__ == '__main__':
    sinaUsstock=SinaUsstock()
    # securitiescode= sinaUsstock.r.lpop('sina_usstock:securities_code')
    # securitiescode= sinaUsstock.getCodeFromRedis()
    securitiescode='AAPL'
    try:
        sinaUsstock.get_content2(securitiescode)
...
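The commit swaps eval-based scaling for Decimal arithmetic; a quick standalone comparison of the two approaches:

    from decimal import Decimal

    raw = '3.5亿'.replace('亿', '').replace(',', '')
    print(eval(raw + '*100000000'))             # 350000000.0 as a float (and eval executes arbitrary input)
    print(Decimal(raw) * Decimal('100000000'))  # 350000000.0 as an exact Decimal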
""" """
...@@ -176,7 +176,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType) ...@@ -176,7 +176,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType)
dic_info_zcfzb = { dic_info_zcfzb = {
"name": info_name, "name": info_name,
'enName': info_name_en, 'enName': info_name_en,
"value": info_data "value": info_data,
"unit": "元"
} }
list_zcfzb.append(dic_info_zcfzb) list_zcfzb.append(dic_info_zcfzb)
...@@ -202,7 +203,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType) ...@@ -202,7 +203,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType)
dic_info_lrb = { dic_info_lrb = {
"name": info_name, "name": info_name,
'enName': info_name_en, 'enName': info_name_en,
"value": info_data "value": info_data,
"unit": "元"
} }
list_lrb.append(dic_info_lrb) list_lrb.append(dic_info_lrb)
...@@ -228,7 +230,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType) ...@@ -228,7 +230,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType)
dic_info_xjllb = { dic_info_xjllb = {
"name": info_name, "name": info_name,
'enName': info_name_en, 'enName': info_name_en,
"value": info_data "value": info_data,
"unit": "元"
} }
list_xjllb.append(dic_info_xjllb) list_xjllb.append(dic_info_xjllb)
...@@ -356,7 +359,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType) ...@@ -356,7 +359,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType)
dic_info_zcfzb = { dic_info_zcfzb = {
"name": info_name, "name": info_name,
'enName': info_name_en, 'enName': info_name_en,
"value": info_data "value": info_data,
"unit": '元'
} }
list_zcfzb.append(dic_info_zcfzb) list_zcfzb.append(dic_info_zcfzb)
...@@ -382,7 +386,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType) ...@@ -382,7 +386,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType)
dic_info_lrb = { dic_info_lrb = {
"name": info_name, "name": info_name,
'enName': info_name_en, 'enName': info_name_en,
"value": info_data "value": info_data,
'unit': '元'
} }
list_lrb.append(dic_info_lrb) list_lrb.append(dic_info_lrb)
...@@ -408,7 +413,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType) ...@@ -408,7 +413,8 @@ def get_info(social_code, com_code,info_date,delist_all,info_date_list,taskType)
dic_info_xjllb = { dic_info_xjllb = {
"name": info_name, "name": info_name,
'enName': info_name_en, 'enName': info_name_en,
"value": info_data "value": info_data,
'unit':'元'
} }
list_xjllb.append(dic_info_xjllb) list_xjllb.append(dic_info_xjllb)
......
@@ -8,10 +8,8 @@ import pymysql
import redis
import requests
from bs4 import BeautifulSoup
from requests.packages import urllib3
from retry import retry
from base import BaseCore

urllib3.disable_warnings()
@@ -20,10 +18,7 @@ log = baseCore.getLogger()
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
                      charset='utf8mb4')
cursor = cnx.cursor()
r = baseCore.r

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}
@@ -86,7 +81,7 @@ def getUnit(gpdm):
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'lxml')
    unit = soup.find('div', class_='financials__note').text.split(' ')[1].lstrip().strip()
    unit = f'{unit}(千)'
    req.close()
    return unit
@@ -104,9 +99,11 @@ def getlist(table, tableName):
            value = re.sub(r"[^\d+-]", "", value)
        else:
            value = '-'
        date_ = years[f'value{i}']
        if date_:
            date = date_.split('/')[2] + '-' + date_.split('/')[0] + '-' + \
                   date_.split('/')[1]
            list.append({f'{tableName}': name, 'value': value, 'date': date, })
    return list
@@ -136,13 +133,12 @@ def reviseData(lists, unit, tableName):

# fetch annual financial data
def getYear(start_time, social_code, gpdm):
    ynFirst = check_code(social_code)
    date_list = []
    url = f'https://api.nasdaq.com/api/company/{gpdm}/financials?frequency=1'
    try:
        req = requests.get(url, headers=headers, verify=False)
        data = req.json()['data']
        if data:
            unit = getUnit(gpdm)
@@ -162,6 +158,7 @@ def getYear(start_time, social_code, gpdm):
                # check whether this reporting period was already collected
                panduan = check_date(social_code, date + '-year')
                if panduan:
                    log.info(f'{social_code}=={gpdm}=={date}年度数据采集过')
                    continue
                xjll_list_f = reviseData([item for item in final_list if 'xjll' in item], unit, 'xjll')
                zcfz_list_f = reviseData([item for item in final_list if 'zcfz' in item], unit, 'zcfz')
@@ -177,13 +174,15 @@ def getYear(start_time, social_code, gpdm):
                    "ynFirst": ynFirst,
                }
                sendData(start_time, social_code, gpdm, dic_info)
                log.info(f'{social_code}=={gpdm}=={date}年度财务数据采集成功')
                date_list.append(date + '-year')
        else:
            log.error(f'找不到{social_code}=={gpdm}年度财务数据')
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===无年度财务数据')
    except Exception as e:
        r.rpush('FinanceFromNasdaq:nasdaqfinance_socialCode', social_code)
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===年度财务数据访问失败')
@@ -192,13 +191,12 @@ def getYear(start_time, social_code, gpdm):

# fetch quarterly financial data; when a quarter's date coincides with an annual date, the dateFlag field must be set to year
def getQuarter(start_time, social_code, gpdm):
    ynFirst = check_code(social_code)
    date_list = []
    url = f'https://api.nasdaq.com/api/company/{gpdm}/financials?frequency=2'
    try:
        req = requests.get(url, headers=headers, verify=False, timeout=60)
        data = req.json()['data']
        if data:
            unit = getUnit(gpdm)
@@ -217,6 +215,7 @@ def getQuarter(start_time, social_code, gpdm):
                # check whether this reporting period was already collected
                panduan = check_date(social_code, date + '-quarter')
                if panduan:
                    log.info(f'{social_code}=={gpdm}=={date}季度数据采集过')
                    continue
                xjll_list_f = reviseData([item for item in final_list if 'xjll' in item], unit, 'xjll')
                zcfz_list_f = reviseData([item for item in final_list if 'zcfz' in item], unit, 'zcfz')
@@ -236,13 +235,16 @@ def getQuarter(start_time, social_code, gpdm):
                if panduan_flag:
                    dic_info['dateFlag'] = 'year'
                sendData(start_time, social_code, gpdm, dic_info)
                log.info(f'{social_code}=={gpdm}=={date}季度财务数据采集成功')
                date_list.append(date + '-quarter')
        else:
            log.error(f'{social_code}=={gpdm}无季度财务数据')
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===无季度财务数据')
    except Exception as e:
        r.rpush('FinanceFromNasdaq:nasdaqfinance_socialCode', social_code)
        log.error(f'{social_code}=={gpdm}===季度财务数据访问失败')
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===季度财务数据访问失败')
@@ -250,36 +252,55 @@ def getQuarter(start_time, social_code, gpdm):
    return date_list
# push credit codes into redis
def FinanceFromNasdaq():
    sql = "select xydm from mgzqyjwyh_list where state=2 and exchange='Nasdaq';"
    cursor.execute(sql)
    finance = cursor.fetchall()
    finance_list = [item[0] for item in finance]
    for item in finance_list:
        r.rpush('FinanceFromNasdaq:nasdaqfinance_socialCode', item)
    print('redis放入成功')

def getInfomation(social_code):
    sql = f"select * from mgzqyjwyh_list where state=2 and xydm='{social_code}';"
    cursor.execute(sql)
    data = cursor.fetchone()
    return data

def doJob():
    while True:
        social_code = baseCore.redicPullData('FinanceFromNasdaq:nasdaqfinance_socialCode')
        if not social_code or social_code == None:
            log.info('============已没有数据============等待===============')
            time.sleep(600)
            continue
        data_enterprise = getInfomation(social_code)
        start_time = time.time()
        gpdm = data_enterprise[3]
        social_code = data_enterprise[6]
        # print(gpdm, social_code)
        # collect annual data
        date_list_year = getYear(start_time, social_code, gpdm)
        # save the annual reporting periods to redis
        add_date(social_code, date_list_year)
        # collect quarterly data
        date_list_quarter = getQuarter(start_time, social_code, gpdm)
        # save the quarterly reporting periods to redis
        add_date(social_code, date_list_quarter)
        timeCost = baseCore.getTimeCost(start_time, time.time())
        state = 1
        baseCore.recordLog(social_code, taskType, state, timeCost, '', '')
        log.info(f'{social_code}=={gpdm}==耗时{timeCost}')
        time.sleep(2)

if __name__ == '__main__':
    # financial data collection
    doJob()
    # push the company stock codes into redis
    # FinanceFromNasdaq()
    cursor.close()
    cnx.close()
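The reworked doJob drains a redis queue, and getYear/getQuarter push the code back on failure; the core of that requeue-on-failure pattern (process() is a hypothetical stand-in for the real work):

    social_code = baseCore.redicPullData('FinanceFromNasdaq:nasdaqfinance_socialCode')
    try:
        process(social_code)  # hypothetical work function
    except Exception:
        r.rpush('FinanceFromNasdaq:nasdaqfinance_socialCode', social_code)  # retry later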
import datetime
import json
import time
import requests
from kafka import KafkaProducer
from retry import retry
from bs4 import BeautifulSoup
from requests.packages import urllib3
from base import BaseCore
urllib3.disable_warnings()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
}
baseCore = BaseCore.BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
log = baseCore.getLogger()
r = baseCore.r
taskType = '纳斯达克/企业动态'
# fetch basic company information
def getInfomation(social_code):
sql = f"select * from mgzqyjwyh_list where state=2 and xydm='{social_code}';"
cursor.execute(sql)
data = cursor.fetchone()
return data
# convert publication time strings to YYYY-MM-DD
def conversionTime(time):
try:
date_obj = datetime.datetime.strptime(time, "%B %d, %Y")
except:
date_obj = datetime.datetime.strptime(time, "%b%d,%Y")
pub_time = date_obj.strftime("%Y-%m-%d")
return pub_time
# get the total number of article records
@retry(tries=3, delay=1)
def getTotal(gpdm):
url = f'https://api.nasdaq.com/api/news/topic/articlebysymbol?q={gpdm}|stocks&offset=0&limit=100&fallback=false'
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
total = req.json()['data']['totalrecords']
req.close()
return total
# fetch the article list
@retry(tries=3, delay=1)
def getDataList(gpdm, offest, social_code):
data_list = []
url = f'https://api.nasdaq.com/api/news/topic/articlebysymbol?q={gpdm}|stocks&offset={offest}&limit=100&fallback=false'
# print(url)
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
datas = req.json()['data']['rows']
if datas != []:
for data in datas:
title = data['title']
author = data['publisher']
url = data['url']
if 'http' not in url:
url = 'https://www.nasdaq.com' + url
data_list.append([url, title, author, social_code])
req.close()
return data_list
@retry(tries=3, delay=1)
def getsoup(url):
req = requests.get(url, headers=headers, verify=False)
# req = session.get(url)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
return soup
# parse page layout A
def getDicA(data, soup):
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
url = data[0]
pub_time = soup.find('p', class_='jupiter22-c-author-byline__timestamp').text.split('—')[0].lstrip().strip()
pub_time = conversionTime(pub_time)
contentWithTag = soup.find('div', class_='nsdq-l-grid__item syndicated-article-body')
try:
contentWithTag.find('div', class_='jupiter22-c-tags jupiter22-c-tags-default').decompose()
except:
pass
try:
contentWithTag.find('div', class_='taboola-placeholder').decompose()
except:
pass
try:
divs_del = contentWithTag.find_all('div', class_='ads__inline')
for div_del in divs_del:
div_del.decompose()
except:
pass
try:
divs_del = contentWithTag.find_all('script')
for div_del in divs_del:
div_del.decompose()
except:
pass
content = contentWithTag.text
dic_news = {
'attachmentIds': '',
'author': data[2],
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'en',
'origin': '纳斯达克',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': url,  # link to the original article
'summary': '',
'title': data[1],
'type': 2,
'socialCreditCode': data[3],
'year': pub_time[:4]
}
return dic_news
# parse page layout B
def getDicB(data, soup):
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
url = data[0]
pub_time = soup.find('div', class_='timestamp').find('time').text
pub_time = pub_time.split(' ')[0] + pub_time.split(' ')[1] + pub_time.split(' ')[2]
pub_time = conversionTime(pub_time)
contentWithTag = soup.find('div', class_='body__content')
try:
divs_del = contentWithTag.find_all('div', class_='ads__inline')
for div_del in divs_del:
div_del.decompose()
except:
pass
try:
divs_del = contentWithTag.find_all('script')
for div_del in divs_del:
div_del.decompose()
except:
pass
content = contentWithTag.text
imgs = contentWithTag.find_all('img')
for img in imgs:
src = img.get('src')
src_ = 'https://www.nasdaq.com' + src
contentWithTag = str(contentWithTag).replace(src, src_)
dic_news = {
'attachmentIds': '',
'author': data[2],
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'en',
'origin': '纳斯达克',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': url,  # link to the original article
'summary': '',
'title': data[1],
'type': 2,
'socialCreditCode': data[3],
'year': pub_time[:4]
}
return dic_news
# send the data to Kafka
@retry(tries=3, delay=1)
def sendKafka(dic_news, start_time):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# sent successfully; record it in the log
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
# save to the database for deduplication
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
# article record fields
list_info = [
social_code,
link,
'纳斯达克',
'2',
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
# check whether an article was already collected
@retry(tries=3, delay=1)
def selectUrl(url, social_code):
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (url, social_code))
selects = cursor.fetchone()
return selects
def doJob():
while True:
social_code = ''  # todo: pull the social code from redis
data_enterprise = getInfomation(social_code)
gpdm = data_enterprise[3]
social_code = data_enterprise[6]
# gpdm = 'GOOGL'
# social_code = 'ZZSN22080900000013'
start_time = time.time()
try:
total = getTotal(gpdm)
except:
log.error(f'{social_code}==={gpdm}===获取总数失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'{social_code}==={gpdm}===获取总数失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
for offest in range(0, total + 1, 100):
try:
data_list = getDataList(gpdm, offest, social_code)
except:
log.error(f'{social_code}==={gpdm}===获取信息列表失败({offest}~{offest + 100}条)')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '',
f'{social_code}==={gpdm}===获取信息列表失败({offest}~{offest + 100}条)')
continue
# the API only returns the first 10,000 records
if data_list != []:
for data in data_list:
start_time = time.time()
url = data[0]
selects = selectUrl(url, social_code)
if selects:
log.info(f'{url}===已采集过')
# full crawl: skip the already-collected article and keep going
continue
# incremental crawl: stop at the first already-collected article instead
# break
try:
soup = getsoup(url)
try:
try:
dic_info = getDicA(data, soup)
except:
dic_info = getDicB(data, soup)
except:
log.error(f'{url}===正文解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'{url}===正文解析失败')
continue
try:
sendKafka(dic_info, start_time)
try:
insertMysql(social_code, url)
except:
log.error(f'{url}===数据入库失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'{url}===数据入库失败')
except Exception as e:
print(e)
log.error(f'{url}===发送kafka失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'{url}===发送kafka失败')
time.sleep(1)
except:
log.error(f'{url}===页面访问失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'{url}===页面访问失败')
break
else:
break
break
if __name__ == "__main__":
doJob()
baseCore.close()
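The article loop above pages through the API 100 records at a time; a reduced sketch of that offset pagination, with the 10,000 cap taken from the comment in doJob:

    total = getTotal('AAPL')  # illustrative ticker
    for offset in range(0, min(total, 10000) + 1, 100):
        data_list = getDataList('AAPL', offset, 'ZZSN22080900000013')
        # ... parse and send each article as doJob does ...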
# core utility package
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
# create a connection pool
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
client = Fdfs_client(tracker_conf)
# note: call BaseCore.close() before the program exits to release resources
class BaseCore:
# sequence number
__seq = 0
# proxy-pool database connection
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# user-agent pool
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
    # Android UA pool
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
# self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
# charset='utf8mb4')
# self.__cursor_proxy = self.__cnx_proxy.cursor()
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
        # clb_project database on 114.116.44.11
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
        # Connect to Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self):
try:
self.cursor.close()
self.cnx.close()
except :
pass
    # Format the elapsed time between start and end
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
    # Format the current time
    # 1 : 2001-01-01 12:00:00   %Y-%m-%d %H:%M:%S
    # 2 : 010101120000          %y%m%d%H%M%S
    # 3 : millisecond timestamp, e.g. 1690179526555
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
    # Get the next sequence number (timestamp + zero-padded counter)
def getNextSeq(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return self.getNowTime(2) + str(self.__seq).zfill(3)
    # Generate a placeholder credit code (ZZSN + timestamp + counter)
def getNextXydm(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
    # Log line format
    def logFormate(self, record, handler):
        formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
            date=record.time,  # log time
            level=record.level_name,  # log level
            filename=os.path.split(record.filename)[-1],  # file name
            func_name=record.func_name,  # function name
            lineno=record.lineno,  # line number
            msg=record.message  # log message
        )
        return formate
    # Build a logbook logger (file and/or stderr handlers)
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
        if fileLogFlag:  # write logs to a rotating file
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
        if stdOutFlag:  # also print logs to the screen
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
    # Pick a random User-Agent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
    # Fetch a random proxy from the clb_proxy table
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
        ip_list = []
        for proxy_ in proxy_lists:
            # each row is a 1-tuple like ('host-port',)
            ip_list.append(proxy_[0])
        proxy_list = []
        for str_ip in ip_list:
            str_ip_list = str_ip.split('-')
            proxyMeta = "http://%(host)s:%(port)s" % {
                "host": str_ip_list[0],
                "port": str_ip_list[1],
            }
            proxy = {
                "HTTP": proxyMeta,
                "HTTPS": proxyMeta
            }
            proxy_list.append(proxy)
        # random.choice avoids an IndexError when fewer than four proxies exist
        return random.choice(proxy_list)
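    # get_proxy returns {'HTTP': 'http://host:port', 'HTTPS': 'http://host:port'}.
    # Note: requests matches proxy keys against the lowercase URL scheme, so code
    # passing this dict to requests may want lowercase 'http'/'https' keys instead.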
    # Extract the substring between beginStr and endStr (searches from the right, keeps the delimiters)
def getSubStr(self,str,beginStr,endStr):
if beginStr=='':
pass
else:
begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
if endStr=='':
pass
else:
end=str.rfind(endStr)
if end==-1:
pass
else:
str = str[0:end+1]
return str
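    # Worked example (rfind searches from the right and both delimiters are kept):
    #   getSubStr('aXbXcYd', 'X', 'Y') -> 'XcY'   # starts at the LAST 'X'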
    # Convert Traditional Chinese to Simplified Chinese
    def hant_2_hans(self, hant_str: str):
        '''
        Function: convert hant_str from Traditional to Simplified Chinese
        '''
        return zhconv.convert(hant_str, 'zh-hans')
    # Check whether the string contains any digit
    def str_have_num(self, str_num):
        for str_1 in str_num:
            if str_1.isdigit():
                return True
        return False
    # # Old version: pop one element from a Redis list, keyed by region
    # def redicPullData(self,type,key):
    #     # 1 = domestic, 2 = overseas
    #     if type == 1:
    #         gn_item = self.r.lpop(key)
    #         return gn_item.decode() if gn_item else None
    #     if type == 2:
    #         gw_item = self.r.lpop(key)
    #         return gw_item.decode() if gw_item else None
    # Pop and remove one element from a Redis list (returns None when empty)
def redicPullData(self,key):
item = self.r.lpop(key)
return item.decode() if item else None
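    # Typical producer/consumer pairing around this helper (a sketch; the key name
    # is one used elsewhere in this repo):
    #   r.rpush('champion:baseinfo', com_name)                   # producer seeds the queue
    #   com_name = baseCore.redicPullData('champion:baseinfo')   # consumer pops one
    #   if com_name is None: the queue is empty -- back off and retry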
    # Get the PID of this script's process
def getPID(self):
PID = os.getpid()
return PID
    # Get the local machine's IP address
def getIP(self):
IP = socket.gethostbyname(socket.gethostname())
return IP
    def mkPath(self, path):
        # Create the directory (and any missing parents) if it does not exist
        if not os.path.exists(path):
            os.makedirs(path)
    # Build a Chrome driver; path must point to the chromedriver binary
    # headless decides whether to run without a visible window (default True)
    # A visible browser helps when first working out page parsing, or on sites
    # that refuse headless clients; headless avoids windows popping up during collection
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
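    # Minimal usage sketch (the chromedriver path is hypothetical):
    #   driver = baseCore.buildDriver(r'D:\tools\chromedriver.exe', headless=True)
    #   driver.get('https://example.com')
    #   html = driver.page_source
    #   driver.quit()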
    # Look up enterprise info from the champion table by company name
def getInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM champion WHERE CompanyName = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
    # Update an enterprise's collection-run counter
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
    # Persist a log record to the LogTable
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql,values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
    # Fetch a random QCC (企查查) token from the database
    def GetToken(self):
        query = "select token from QCC_token "
        # token = '67ec7402166df1da84ae83c4b95cefc0'  # tokens go stale roughly every two hours and must be re-captured
self.cursor.execute(query)
token_list = self.cursor.fetchall()
self.cnx.commit()
token = token_list[random.randint(0, len(token_list)-1)][0]
return token
    # Delete an invalidated token
def delete_token(self,token):
deletesql = f"delete from QCC_token where token='{token}' "
self.cursor.execute(deletesql)
self.cnx.commit()
    # Fetch a TYC (天眼查) token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
self.cnx.commit()
return token
    # Detect the language of a piece of text
    def detect_language(self, text):
        # langid.classify returns a (lang, score) tuple
        result = langid.classify(text)
        # fall back to 'cn' when nothing usable is detected
        if not result or not result[0]:
            return 'cn'
        return result[0]
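    # Example usage (langid labels are returned on success, 'cn' is only the fallback):
    #   baseCore.detect_language('hello world') -> 'en'
    #   baseCore.detect_language('你好,世界')    -> 'zh' (typically)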
    # Append rows to an existing Excel file
    def writerToExcel(self, detailList, filename):
        # filename='baidu搜索.xlsx'
        # Read the existing xlsx file
        existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
        # Build a DataFrame from the new rows
        new_data = pd.DataFrame(data=detailList)
        # Append the new rows after the existing ones
        # (DataFrame.append was removed in pandas 2.0; pd.concat is the supported spelling)
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        # Write the result back to the xlsx file
        combined_data.to_excel(filename, index=False)
        # return combined_data
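    # Usage note: the target xlsx must already exist; seed it once before appending, e.g.
    #   pd.DataFrame().to_excel('baidu搜索.xlsx', index=False)
    #   baseCore.writerToExcel(detailList, 'baidu搜索.xlsx')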
    # Re-queue enterprises that failed or were interrupted back into Redis
def rePutIntoR(self,key,item):
self.r.rpush(key, item)
    # Increment a Redis counter and return the new value
    def incrSet(self, key):
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
    # Get a key's remaining TTL and reset the counter when it has lapsed
    def getttl(self, key):
        # redis TTL is -2 for a missing key and -1 for a key with no expiry
        ttl = self.r.ttl(key)
        print("剩余过期时间:", ttl)
        # treat any negative TTL as expired
        if ttl < 0:
            # reset the counter to 0 with a fresh one-hour expiry
            self.r.set(key, 0)
            self.r.expire(key, 3600)
time.sleep(2)
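    # Together, incrSet and getttl form a simple hourly rate counter (a sketch;
    # the key name is hypothetical):
    #   count = baseCore.incrSet('qcc:req_count')   # bump the counter
    #   baseCore.getttl('qcc:req_count')            # reset to 0 with a 1h expiry if it lapsed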
    # Upload a PDF to the file server and extract its text content and page count
def upLoadToServe(self,pdf_url,type_id,social_code):
headers = {}
retData = {'state':False,'type_id':type_id,'item_id':social_code,'group_name':'group1','path':'','full_path':'',
'category':'pdf','file_size':'','status':1,'create_by':'XueLingKun',
'create_time':'','page_size':'','content':''}
headers['User-Agent'] = self.getRandomUserAgent()
for i in range(0, 3):
try:
resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
page_size = 0
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
with fitz.open(stream=resp_content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
except:
time.sleep(3)
continue
        if page_size < 1:
            # PDF download or parsing failed; retData['state'] stays False
            print('======pdf解析失败=====')
return retData
else:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
retData['page_size'] = page_size
return retData
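    # Callers are expected to check retData['state'] before persisting (a sketch;
    # type_id and pdf_name are caller-supplied):
    #   retData = baseCore.upLoadToServe(pdf_url, type_id, social_code)
    #   if retData['state']:
    #       att_id = baseCore.tableUpdate(retData, com_name, year, pdf_name, 1)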
def secrchATT(self,item_id,year,type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
    # Insert into the attachment table and return the attachment id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,year,type_id)
id = selects[0]
return id
    # Update an enterprise's CIK
def updateCIK(self,social_code,cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存企业CIK失败=====')
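
# Example lifecycle of BaseCore (a minimal sketch; the Redis key is one used in this repo):
#   baseCore = BaseCore()
#   log = baseCore.getLogger()
#   com_name = baseCore.redicPullData('champion:baseinfo')
#   ...collect data for com_name...
#   baseCore.close()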
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
import json
from kafka import KafkaProducer
from BaseCore import BaseCore
from getQccId import find_id_by_name
baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
# Fetch basic enterprise info from QCC by company id
def info_by_id(com_id,com_name):
aa_dict_list = []
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
com_jc_name = ''
try:
result_dict = resp_dict['result']['Company']
except:
log.info(com_name + ":获取失败===========重新放入redis")
baseCore.rePutIntoR('champion:baseinfo',com_name)
return aa_dict_list
company_name = result_dict['Name']
CreditCode = result_dict['CreditCode']
if CreditCode is None:
CreditCode = ''
try:
OperName = result_dict['Oper']['Name']
except:
OperName = ''
if OperName is None:
OperName = ''
if baseCore.str_have_num(OperName):
OperName = ''
try:
Status = result_dict['ShortStatus']
except:
Status = ''
if Status is None:
Status = ''
try:
StartDate = result_dict['StartDate']
except:
StartDate = ''
if StartDate is None:
StartDate = ''
try:
RegistCapi = result_dict['RegistCapi']
except:
RegistCapi = ''
if RegistCapi is None:
RegistCapi = ''
    RecCap = ''  # result_dict['RecCap']  # paid-in capital, no longer returned by the API
if RecCap is None:
RecCap = ''
try:
        OrgNo = result_dict['CreditCode'][8:-2] + '-' + result_dict['CreditCode'][-2]  # organization code, derived from the credit code since the API no longer returns it
except:
OrgNo = ''
if OrgNo is None:
OrgNo = ''
try:
TaxNo = result_dict['TaxNo']
except:
TaxNo = ''
if TaxNo is None:
TaxNo = ''
try:
EconKind = result_dict['EconKind']
except:
EconKind = ''
if EconKind is None:
EconKind = ''
    TermStart = ''  # result_dict['TermStart']  # business term start, no longer returned
if TermStart is None:
TermStart = ''
    TeamEnd = ''  # result_dict['TeamEnd']  # business term end, no longer returned
if TeamEnd is None:
TeamEnd = ''
try:
SubIndustry = result_dict['Industry']['SubIndustry']
except:
SubIndustry = ''
if SubIndustry is None:
SubIndustry = ''
try:
Province = result_dict['Area']['Province']
except:
Province = ''
try:
City = result_dict['Area']['City']
except:
City = ''
try:
County = result_dict['Area']['County']
except:
County = ''
try:
region = Province + City + County
except:
region = ''
    BelongOrg = ''  # result_dict['BelongOrg']  # registration authority, no longer returned
    can_bao = ''
    CommonList = []  # result_dict['CommonList']  # insured-headcount entries, no longer returned
for Common_dict in CommonList:
try:
KeyDesc = Common_dict['KeyDesc']
except:
continue
if KeyDesc == '参保人数':
can_bao = Common_dict['Value']
if can_bao == '0':
can_bao = ''
OriginalName = ''
try:
OriginalName_lists = result_dict['OriginalName']
for OriginalName_dict in OriginalName_lists:
OriginalName += OriginalName_dict['Name'] + ' '
except:
OriginalName = ''
    try:
        OriginalName = OriginalName.strip()  # strip() returns a new string, so the result must be assigned
    except:
        OriginalName = ''
    EnglishName = ''  # result_dict['EnglishName']  # English name, no longer returned
if EnglishName is None:
EnglishName = ''
    IxCode = ''  # result_dict['IxCode']  # import/export enterprise code, no longer returned
if IxCode is None:
IxCode = ''
Address = result_dict['Address']
if Address is None:
Address = ''
    Scope = ''  # result_dict['Scope']  # business scope, no longer returned here (fetched from the detail API below)
if Scope is None:
Scope = ''
try:
PhoneNumber = result_dict['companyExtendInfo']['Tel']
except:
PhoneNumber = ''
if PhoneNumber is None:
PhoneNumber = ''
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
try:
Email = result_dict['companyExtendInfo']['Email']
except:
Email = ''
if Email is None:
Email = ''
try:
Desc = result_dict['companyExtendInfo']['Desc']
except:
Desc = ''
if Desc is None:
Desc = ''
try:
Info = result_dict['companyExtendInfo']['Info']
except:
Info = ''
if Info is None:
Info = ''
company_name = baseCore.hant_2_hans(company_name)
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
com_id)
resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(1)
try:
com2 = resp_dict2['result']['Company']
except:
com2 = ''
try:
Scope = com2['Scope']
except:
Scope = ''
try:
CheckDate = com2['CheckDate']
except:
CheckDate = ''
if CheckDate is None:
CheckDate = ''
try:
        TaxpayerType = com2['TaxpayerType']  # taxpayer qualification
except:
TaxpayerType = ''
if TaxpayerType is None:
TaxpayerType = ''
try:
No = com2['No']
except:
No = ''
if No is None:
No = ''
try:
IxCode = com2['IxCode']
except:
IxCode = ''
try:
OrgNo = com2['OrgNo']
except:
OrgNo = ''
try:
for Common_t in com2['CommonList']:
try:
if Common_t['KeyDesc'] == '参保人数':
can_bao = Common_t['Value']
except:
pass
except:
can_bao = ''
try:
TermStart = com2['TermStart']
except:
TermStart = ''
try:
TeamEnd = com2['TeamEnd']
except:
TeamEnd = ''
try:
RecCap = com2['RecCap']
except:
RecCap = ''
try:
No = com2['No']
except:
No = ''
try:
SubIndustry = com2['IndustryArray'][-1]
except:
SubIndustry = ''
try:
BelongOrg = com2['BelongOrg']
except:
BelongOrg = ''
try:
EnglishName = com2['EnglishName']
except:
EnglishName = ''
    aa_dict = {
        'qccId': com_id,  # QCC company id
        'name': company_name,  # company name
        'shortName': com_jc_name,  # company short name
        'socialCreditCode': CreditCode,  # unified social credit code
        'legalPerson': OperName,  # legal representative
        'officialPhone': PhoneNumber,  # phone
        'officialUrl': WebSite,  # official website
        'officialEmail': Email,  # email
        'briefInfo': Desc,  # brief introduction
        'registerStatus': Status,  # registration status
        'incorporationDate': StartDate,  # date of incorporation
        'capital': RegistCapi,  # registered capital
        'paidCapital': RecCap,  # paid-in capital
        'approvalDate': CheckDate,  # approval date
        'organizationCode': OrgNo,  # organization code
        'registerNo': No,  # business registration number
        'taxpayerNo': CreditCode,  # taxpayer identification number
        'type': EconKind,  # enterprise type
        'businessStartDate': TermStart,  # business term start
        'businessEndDate': TeamEnd,  # business term end
        'taxpayerQualification': TaxpayerType,  # taxpayer qualification
        'industry': SubIndustry,  # industry
        'region': region,
        'province': Province,  # province
        'city': City,  # city
        'county': County,  # county
        'registerDepartment': BelongOrg,  # registration authority
        'scale': Info,  # staff size
        'insured': can_bao,  # number of insured employees
        'beforeName': OriginalName,  # former name
        'englishName': EnglishName,  # English name
        'importExportEnterpriseCode': IxCode,  # import/export enterprise code
        'address': Address,  # address
        'businessRange': Scope,  # business scope
        'status': 0,  # status
    }
aa_dict_list.append(aa_dict)
log.info(company_name + ":爬取完成")
return aa_dict_list
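# info_by_id returns a one-element list on success and an empty list on failure
# (the failed company is re-queued to Redis), so callers iterate over the result.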
if __name__ == '__main__':
taskType = '基本信息/企查查/单项冠军'
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
list_weicha = []
name_list = []
    # Pull company names from Redis
while True:
        # TODO: tokens go stale roughly every two hours and must be re-captured; they are read from the database
token = baseCore.GetToken()
if token:
pass
else:
log.info('==========已无token==========')
time.sleep(30)
continue
# list_all_info = []
start_time = time.time()
        # Fetch the next company name from the queue
        com_name = baseCore.redicPullData('champion:baseinfo')
        # com_name = '卓新市万达铸业有限公司'
        # redicPullData returns None (not '') when the queue is empty
        if not com_name:
            time.sleep(20)
            continue
dic_info = baseCore.getInfomation(com_name)
log.info(f'----当前企业{com_name}--开始处理---')
social_code = dic_info[5]
        # QCC company id
company_id = dic_info[6]
        # Search by credit code when available, otherwise by company name
        if company_id is None:
if social_code:
company_id = find_id_by_name(start_time,token,social_code)
else:
company_id = find_id_by_name(start_time,token,com_name)
if company_id == 'null':
log.info('=====搜索不到该企业====')
            # todo: companies that cannot be found have no credit code and cannot be forwarded; generate one
baseCore.rePutIntoR('champion:baseinfo', com_name + ':搜索不到')
continue
if not company_id:
log.info(com_name + ":企业ID获取失败===重新放入redis")
list_weicha.append(com_name + ":企业ID获取失败")
baseCore.rePutIntoR('champion:baseinfo',com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
time.sleep(20)
continue
else:
log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
            # write the QCC id back to the champion table
updateqccid = f"update champion set qccid = '{company_id}' where CompanyName = '{com_name}'"
cursor_.execute(updateqccid)
cnx_.commit()
try:
post_data_list = info_by_id(company_id, com_name)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
            baseCore.rePutIntoR('champion:baseinfo', com_name)  # keep the key lowercase to match the queue name used elsewhere
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
continue
if post_data_list:
pass
else:
# log.info(f'======{social_code}====企查查token失效====')
time.sleep(20)
continue
for post_data in post_data_list:
# list_all_info.append(post_data)
if post_data is None:
print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
            # update the returned credit code back into the champion table
updatesocialcode = f"update champion set SocialCode = '{get_socialcode}' where CompanyName = '{com_name}'"
cursor_.execute(updatesocialcode)
cnx_.commit()
name_compile = {
'yuan_name':com_name,
'get_name':get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
# break
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
# -*- coding: utf-8 -*-
import time
from urllib.parse import quote
import requests
import urllib3
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'x-request-device-type': 'Android',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
'Content-Type': 'application/json',
'Qcc-Version': '1.0.0',
'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
'xweb_xhr': '1',
'xcx-version': '2023.09.27',
'Qcc-Platform': 'mp-weixin',
'Qcc-CurrentPage': '/company-subpackages/business/index',
'Qcc-Timestamp': '1696661787803',
'Qcc-RefPage': '/company-subpackages/detail/index',
'Accept': '*/*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh'
}
# Resolve a QCC company id from a company name or credit code
def find_id_by_name(start,token,name):
urllib3.disable_warnings()
qcc_key = name
t = str(int(time.time()) * 1000)
headers['Qcc-Timestamp'] = t
url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
for lll in range(1, 6):
try:
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
break
except Exception as e:
print(f'{e}-------------重试')
time.sleep(5)
continue
time.sleep(2)
    # Known error payloads: {'status': 40101, 'message': '无效的sessionToken!'} and {'status': 401, 'message': '您的账号访问超频,请升级小程序版本'}
if resp_dict['status']==40101:
KeyNo = False
log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
return KeyNo
if resp_dict['status']==401:
KeyNo = False
log.info(f'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
return KeyNo
try:
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
KeyNo = 'null'
else:
KeyNo = 'null'
except:
KeyNo = False
log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
return KeyNo
log.info("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo
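# Usage sketch: returns the KeyNo string on success, 'null' when the search has no
# usable match, and False when the token is invalid or the account is rate-limited:
#   KeyNo = find_id_by_name(time.time(), token, '示例企业名称')   # placeholder name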
# Core utility package
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
# Create a connection pool (pymysql is already imported above)
from pymysql import connections
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
client = Fdfs_client(tracker_conf)
# Note: call BaseCore.close() before the program exits to release related resources
class BaseCore:
    # sequence counter
__seq = 0
    # proxy-pool database connection (no longer used)
    # __cnx_proxy =None
    # __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
    # User-Agent pool
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
    def __init__(self):
        # serial-number counter consumed by getNextSeq()/getNextXydm() below;
        # initialize it here so the first call doesn't raise AttributeError
        self.__seq = 0
# self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
# charset='utf8mb4')
# self.__cursor_proxy = self.__cnx_proxy.cursor()
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
#11数据库
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
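        # The PooledDB below is a pymysql connection pool: maxconnections caps the
        # total number of connections, mincached/maxcached bound the idle connections
        # kept warm, and blocking=True makes callers wait for a free connection
        # instead of raising an error when the pool is exhausted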
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self):
try:
self.cursor.close()
self.cnx.close()
except :
pass
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
    # format the current time
    # 1 : 2001-01-01 12:00:00  (%Y-%m-%d %H:%M:%S)
    # 2 : 010101120000         (%y%m%d%H%M%S)
    # 3 : epoch timestamp such as 1690179526555 -- millisecond precision
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
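    # Usage sketch (hypothetical values, not from the original code):
    #   self.getNowTime(1) -> '2023-10-08 12:00:00'
    #   self.getNowTime(2) -> '231008120000'
    #   self.getNowTime(3) -> 1696737600000   (epoch milliseconds)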
# 获取流水号
def getNextSeq(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
        # random.randint(0, 3) crashes whenever fewer than 4 proxies are configured;
        # choose uniformly among however many proxies exist
        return random.choice(proxy_list)
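    # Assumption inferred from the split('-') above: each clb_proxy.proxy row holds
    # dash-separated fields beginning with host and port, e.g. '1.2.3.4-8888'
    # (the dao/Conn class later in this dump reads further fields from the same format)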
    # extract a substring between two markers
    def getSubStr(self, text, beginStr, endStr):
        if beginStr != '':
            begin = text.rfind(beginStr)
            if begin == -1:
                begin = 0
            text = text[begin:]
        if endStr != '':
            end = text.rfind(endStr)
            if end != -1:
                text = text[0:end + 1]
        return text
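    # Usage sketch (hypothetical input): getSubStr('a[b]c', '[', ']') -> '[b]';
    # rfind means both markers anchor on their *last* occurrence in the string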
# 繁体字转简体字
def hant_2_hans(self,hant_str: str):
'''
Function: 将 hant_str 由繁体转化为简体
'''
return zhconv.convert(hant_str, 'zh-hans')
    # return True if the string contains at least one digit
    def str_have_num(self, str_num):
        return any(ch.isdigit() for ch in str_num)
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self,key):
item = self.r.lpop(key)
return item.decode() if item else None
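    # Usage sketch: item = bc.redicPullData('SomeKey:list'); callers must handle
    # the None returned when the Redis list is empty (see the main loops below)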
# 获得脚本进程PID
def getPID(self):
PID = os.getpid()
return PID
# 获取本机IP
def getIP(self):
IP = socket.gethostbyname(socket.gethostname())
return IP
def mkPath(self,path):
folder = os.path.exists(path)
if not folder: # 判断是否存在文件夹如果不存在则创建为文件夹
os.makedirs(path) # makedirs 创建文件时如果路径不存在会创建这个路径
else:
pass
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
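    # Usage sketch (hypothetical driver path, not from the original code):
    #   driver = bc.buildDriver(r'D:\chromedriver.exe', headless=True)
    #   driver.get('https://example.com')
    #   html = driver.page_source
    #   driver.quit()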
# 根据社会信用代码获取企业信息
def getInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM technological WHERE CompanyName = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql,values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
#获取企查查token
def GetToken(self):
#获取企查查token
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query)
token_list = self.cursor.fetchall()
self.cnx.commit()
token = token_list[random.randint(0, len(token_list)-1)][0]
return token
# 删除失效的token
def delete_token(self,token):
deletesql = f"delete from QCC_token where token='{token}' "
self.cursor.execute(deletesql)
self.cnx.commit()
#获取天眼查token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
self.cnx.commit()
return token
    # detect the language of a text snippet
    def detect_language(self, text):
        # langid.classify returns a (lang, score) tuple, so test the tuple's
        # first element rather than comparing the tuple itself against ''
        result = langid.classify(text)
        if not result or not result[0]:
            return 'cn'
        return result[0]
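    # Usage sketch: detect_language('hello world') -> 'en'; note langid reports
    # Chinese as 'zh', so the 'cn' value here only covers empty classifier output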
    # append rows to an existing excel file
    def writerToExcel(self, detailList, filename):
        # read the existing xlsx file (it must already exist with a header row)
        existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
        # build a DataFrame from the new rows
        new_data = pd.DataFrame(data=detailList)
        # pandas 2.x removed DataFrame.append; pd.concat is the supported way
        # to stack the new rows onto the existing ones
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        # write the combined result back to the xlsx file
        combined_data.to_excel(filename, index=False)
# return combined_data
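    # Usage sketch (hypothetical rows; assumes the xlsx already exists with a header):
    #   bc.writerToExcel([{'title': 't1', 'url': 'http://...'}], 'baidu搜索.xlsx')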
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self,key,item):
self.r.rpush(key, item)
#增加计数器的值并返回增加后的值
def incrSet(self,key):
# 增加计数器的值并返回增加后的值
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
#获取key剩余的过期时间
def getttl(self,key):
# 获取key的剩余过期时间
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# 判断key是否已过期
if ttl < 0:
# key已过期,将key的值重置为0
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
#上传至文件服务器,并解析pdf的内容和页数
def upLoadToServe(self,pdf_url,type_id,social_code):
headers = {}
retData = {'state':False,'type_id':type_id,'item_id':social_code,'group_name':'group1','path':'','full_path':'',
'category':'pdf','file_size':'','status':1,'create_by':'XueLingKun',
'create_time':'','page_size':'','content':''}
headers['User-Agent'] = self.getRandomUserAgent()
        resp_content = None
        for i in range(0, 3):
            try:
                resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
                break
            except:
                time.sleep(3)
                continue
        if resp_content is None:
            # all three download attempts failed; give up and return the default retData
            return retData
page_size = 0
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
with fitz.open(stream=resp_content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
except:
time.sleep(3)
continue
if page_size < 1:
# pdf解析失败
print(f'======pdf解析失败=====')
return retData
else:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
retData['full_path'] = bytes.decode(result['Remote file_id'])
retData['file_size'] = result['Uploaded size']
retData['create_time'] = time_now
retData['page_size'] = page_size
return retData
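    # retData contract (as implemented above): state is True only when both the
    # upload (apparently a FastDFS client, given the 'Remote file_id' key) and the
    # PDF parse succeeded; 'content' holds the concatenated text of every page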
def secrchATT(self,item_id,year,type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,year,type_id)
id = selects[0]
return id
# 更新企业的CIK
def updateCIK(self,social_code,cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存企业CIK失败=====')
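# Usage sketch for this BaseCore module (hypothetical, not part of the original repo):
#   bc = BaseCore()
#   log = bc.getLogger()
#   social_code = bc.redicPullData('NQEnterprise:nq_finance')  # pop one pending company
#   if social_code:
#       log.info(f'processing {social_code}')
#       bc.updateRun(social_code, 'FinanceCount', 1)
#   bc.close()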
# -*- coding: utf-8 -*-
import pandas as pd
import time
import requests
import json
from kafka import KafkaProducer
from BaseCore import BaseCore
from getQccId import find_id_by_name
baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
# 通过企查查id获取企业基本信息
def info_by_id(com_id,com_name):
aa_dict_list = []
    t = str(int(time.time() * 1000))  # millisecond timestamp for the Qcc-Timestamp header
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
resp_dict = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(2)
com_jc_name = ''
try:
result_dict = resp_dict['result']['Company']
except:
log.info(com_name + ":获取失败===========重新放入redis")
baseCore.rePutIntoR('technological:baseinfo',com_name)
return aa_dict_list
company_name = result_dict['Name']
CreditCode = result_dict['CreditCode']
if CreditCode is None:
CreditCode = ''
try:
OperName = result_dict['Oper']['Name']
except:
OperName = ''
if OperName is None:
OperName = ''
if baseCore.str_have_num(OperName):
OperName = ''
try:
Status = result_dict['ShortStatus']
except:
Status = ''
if Status is None:
Status = ''
try:
StartDate = result_dict['StartDate']
except:
StartDate = ''
if StartDate is None:
StartDate = ''
try:
RegistCapi = result_dict['RegistCapi']
except:
RegistCapi = ''
if RegistCapi is None:
RegistCapi = ''
RecCap = '' # result_dict['RecCap'] #实际缴纳金额,现已没有显示
if RecCap is None:
RecCap = ''
try:
OrgNo = result_dict['CreditCode'][8:-2] + '-' + result_dict['CreditCode'][-2] # 组织机构代码,现已没有显示
except:
OrgNo = ''
if OrgNo is None:
OrgNo = ''
try:
TaxNo = result_dict['TaxNo']
except:
TaxNo = ''
if TaxNo is None:
TaxNo = ''
try:
EconKind = result_dict['EconKind']
except:
EconKind = ''
if EconKind is None:
EconKind = ''
TermStart = '' # result_dict['TermStart'] 营业期限自,现已没有显示
if TermStart is None:
TermStart = ''
TeamEnd = '' # result_dict['TeamEnd']营业期限至,现已没有显示
if TeamEnd is None:
TeamEnd = ''
try:
SubIndustry = result_dict['Industry']['SubIndustry']
except:
SubIndustry = ''
if SubIndustry is None:
SubIndustry = ''
try:
Province = result_dict['Area']['Province']
except:
Province = ''
try:
City = result_dict['Area']['City']
except:
City = ''
try:
County = result_dict['Area']['County']
except:
County = ''
try:
region = Province + City + County
except:
region = ''
BelongOrg = '' # result_dict['BelongOrg']登记机关,现已没有显示
can_bao = ''
CommonList = [] # result_dict['CommonList']参保人数,现已没有显示
for Common_dict in CommonList:
try:
KeyDesc = Common_dict['KeyDesc']
except:
continue
if KeyDesc == '参保人数':
can_bao = Common_dict['Value']
if can_bao == '0':
can_bao = ''
OriginalName = ''
try:
OriginalName_lists = result_dict['OriginalName']
for OriginalName_dict in OriginalName_lists:
OriginalName += OriginalName_dict['Name'] + ' '
except:
OriginalName = ''
    try:
        # strip() returns a new string, so the result must be re-assigned
        OriginalName = OriginalName.strip()
    except:
        OriginalName = ''
EnglishName = '' # result_dict['EnglishName']企业英文名,现已没有显示
if EnglishName is None:
EnglishName = ''
IxCode = '' # result_dict['IxCode']进出口企业代码,现已没有显示
if IxCode is None:
IxCode = ''
Address = result_dict['Address']
if Address is None:
Address = ''
Scope = '' # result_dict['Scope']经营范围,现已没有显示
if Scope is None:
Scope = ''
try:
PhoneNumber = result_dict['companyExtendInfo']['Tel']
except:
PhoneNumber = ''
if PhoneNumber is None:
PhoneNumber = ''
try:
WebSite = result_dict['companyExtendInfo']['WebSite']
except:
WebSite = None
if WebSite is None:
try:
WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
except:
WebSite = ''
try:
Email = result_dict['companyExtendInfo']['Email']
except:
Email = ''
if Email is None:
Email = ''
try:
Desc = result_dict['companyExtendInfo']['Desc']
except:
Desc = ''
if Desc is None:
Desc = ''
try:
Info = result_dict['companyExtendInfo']['Info']
except:
Info = ''
if Info is None:
Info = ''
company_name = baseCore.hant_2_hans(company_name)
    t = str(int(time.time() * 1000))  # millisecond timestamp, as above
headers['Qcc-Timestamp'] = t
url = "https://xcx.qcc.com/mp-weixin/forwardApp/v6/base/getEntDetail?token={}&t={}&unique={}".format(token, t,
com_id)
resp_dict2 = requests.get(url=url, headers=headers, verify=False).json()
time.sleep(1)
try:
com2 = resp_dict2['result']['Company']
except:
com2 = ''
try:
Scope = com2['Scope']
except:
Scope = ''
try:
CheckDate = com2['CheckDate']
except:
CheckDate = ''
if CheckDate is None:
CheckDate = ''
try:
TaxpayerType = com2['TaxpayerType'] #纳税人资质
except:
TaxpayerType = ''
if TaxpayerType is None:
TaxpayerType = ''
try:
No = com2['No']
except:
No = ''
if No is None:
No = ''
try:
IxCode = com2['IxCode']
except:
IxCode = ''
try:
OrgNo = com2['OrgNo']
except:
OrgNo = ''
try:
for Common_t in com2['CommonList']:
try:
if Common_t['KeyDesc'] == '参保人数':
can_bao = Common_t['Value']
except:
pass
except:
can_bao = ''
try:
TermStart = com2['TermStart']
except:
TermStart = ''
try:
TeamEnd = com2['TeamEnd']
except:
TeamEnd = ''
try:
RecCap = com2['RecCap']
except:
RecCap = ''
try:
No = com2['No']
except:
No = ''
try:
SubIndustry = com2['IndustryArray'][-1]
except:
SubIndustry = ''
try:
BelongOrg = com2['BelongOrg']
except:
BelongOrg = ''
try:
EnglishName = com2['EnglishName']
except:
EnglishName = ''
aa_dict = {
'qccId': com_id, # 企查查企业id
'name': company_name, # 企业名称
'shortName': com_jc_name, # 企业简称
'socialCreditCode': CreditCode, # 统一社会信用代码
'legalPerson': OperName, # 法定代表人
'officialPhone': PhoneNumber, # 电话
'officialUrl': WebSite, # 官网
'officialEmail': Email, # 邮箱
'briefInfo': Desc, # 简介
'registerStatus': Status, # 登记状态
'incorporationDate': StartDate, # 成立日期
'capital': RegistCapi, # 注册资本
'paidCapital': RecCap, # 实缴资本
'approvalDate': CheckDate, # 核准日期
'organizationCode': OrgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': CreditCode, # 纳税人识别号
'type': EconKind, # 企业类型
'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': SubIndustry, # 所属行业
'region': region,
'province': Province, # 所属省
'city': City, # 所属市
'county': County, # 所属县
'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模
'insured': can_bao, # 参保人数
'beforeName': OriginalName, # 曾用名
'englishName': EnglishName, # 英文名
'importExportEnterpriseCode': IxCode, # 进出口企业代码
'address': Address, # 地址
'businessRange': Scope, # 经营范围
'status': 0, # 状态
}
aa_dict_list.append(aa_dict)
log.info(company_name + ":爬取完成")
return aa_dict_list
if __name__ == '__main__':
taskType = '基本信息/企查查/科改示范企业'
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'Qcc-Platform': 'mp-weixin',
'Qcc-Timestamp': '',
'Qcc-Version': '1.0.0',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
'content-type': 'application/json',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br,'
}
list_weicha = []
name_list = []
#从redis里拿数据
while True:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token = baseCore.GetToken()
if token:
pass
else:
log.info('==========已无token==========')
time.sleep(30)
continue
# list_all_info = []
start_time = time.time()
# 获取企业信息
# com_name = baseCore.redicPullData('technological:baseinfo')
com_name = '深圳市城市公共安全技术研究院有限公司'
        # redicPullData returns None (not '') when the queue is empty
        if not com_name:
time.sleep(20)
continue
dic_info = baseCore.getInfomation(com_name)
log.info(f'----当前企业{com_name}--开始处理---')
social_code = dic_info[5]
#企查查id
company_id = dic_info[6]
#如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
        if company_id is None:
if social_code:
company_id = find_id_by_name(start_time,token,social_code)
else:
company_id = find_id_by_name(start_time,token,com_name)
if company_id == 'null':
log.info('=====搜索不到该企业====')
#todo:搜不到的企业没有信用代码 传输不过去 生成一个信用代码
baseCore.rePutIntoR('technological:baseinfo', com_name + ':搜索不到')
continue
if not company_id:
log.info(com_name + ":企业ID获取失败===重新放入redis")
list_weicha.append(com_name + ":企业ID获取失败")
baseCore.rePutIntoR('technological:baseinfo',com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
time.sleep(20)
continue
else:
log.info(f'====={com_name}===={company_id}=====获取企业id成功=====')
# todo:写入数据库
updateqccid = f"update technological set qccid = '{company_id}' where CompanyName = '{com_name}'"
cursor_.execute(updateqccid)
cnx_.commit()
try:
post_data_list = info_by_id(company_id, com_name)
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
            # use the same queue key as everywhere else in this script
            # ('technological:baseinfo', all lowercase)
            baseCore.rePutIntoR('technological:baseinfo', com_name)
baseCore.delete_token(token)
log.info('=====已重新放入redis,失效token已删除======')
continue
if post_data_list:
pass
else:
# log.info(f'======{social_code}====企查查token失效====')
time.sleep(20)
continue
for post_data in post_data_list:
# list_all_info.append(post_data)
if post_data is None:
print(com_name + ":企业信息获取失败")
list_weicha.append(com_name + ":企业信息获取失败")
continue
get_name = post_data['name']
get_socialcode = post_data['socialCreditCode']
#todo:将信用代码更新到表中
updatesocialcode = f"update technological set SocialCode = '{get_socialcode}' where CompanyName = '{com_name}'"
cursor_.execute(updatesocialcode)
cnx_.commit()
name_compile = {
'yuan_name':com_name,
'get_name':get_name
}
name_list.append(name_compile)
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time,time.time())}')
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(get_socialcode, taskType, state, takeTime, '', exception)
log.info(f"{get_name}--{get_socialcode}--kafka传输失败")
break
nowtime = baseCore.getNowTime(1).replace('-','_')[:10]
companyName = pd.DataFrame(name_list)
companyName.to_excel(f'./data/企业名称对比_{nowtime}.xlsx',index=False)
false_com = pd.DataFrame(list_weicha)
false_com.to_excel(f'./data/采集失败企业名单_{nowtime}.xlsx',index=False)
# -*- coding: utf-8 -*-
import time
from urllib.parse import quote
import requests
import urllib3
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# headers = {
# 'Host': 'xcx.qcc.com',
# 'Connection': 'keep-alive',
# 'Qcc-Platform': 'mp-weixin',
# 'Qcc-Timestamp': '',
# 'Qcc-Version': '1.0.0',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
# 'content-type': 'application/json',
# 'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
# 'Accept-Encoding': 'gzip, deflate, br,'
# }
headers = {
'Host': 'xcx.qcc.com',
'Connection': 'keep-alive',
'x-request-device-type': 'Android',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8391',
'Content-Type': 'application/json',
'Qcc-Version': '1.0.0',
'authMini': 'Bearer f51dae1a2fcb109fa9ec58bd4a85e5c5',
'xweb_xhr': '1',
'xcx-version': '2023.09.27',
'Qcc-Platform': 'mp-weixin',
'Qcc-CurrentPage': '/company-subpackages/business/index',
'Qcc-Timestamp': '1696661787803',
'Qcc-RefPage': '/company-subpackages/detail/index',
'Accept': '*/*',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://servicewechat.com/wx395200814fcd7599/307/page-frame.html',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh'
}
# 通过企业名称或信用代码获取企查查id
def find_id_by_name(start,token,name):
urllib3.disable_warnings()
qcc_key = name
    t = str(int(time.time() * 1000))  # millisecond timestamp
headers['Qcc-Timestamp'] = t
url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="
    resp_dict = None
    for lll in range(1, 6):
        try:
            resp_dict = requests.get(url=url, headers=headers, verify=False).json()
            break
        except Exception as e:
            print(f'{e}-------------retrying')
            time.sleep(5)
            continue
    if resp_dict is None:
        # all five attempts failed; report failure the same way an invalid token is reported
        log.info(f'====请求失败====时间{baseCore.getTimeCost(start, time.time())}')
        return False
time.sleep(2)
#{'status': 40101, 'message': '无效的sessionToken!'} {'status': 401, 'message': '您的账号访问超频,请升级小程序版本'}
if resp_dict['status']==40101:
KeyNo = False
log.info(f'====token失效====时间{baseCore.getTimeCost(start, time.time())}')
return KeyNo
if resp_dict['status']==401:
KeyNo = False
log.info(f'=======您的账号访问超频,请升级小程序版本=====时间{baseCore.getTimeCost(start, time.time())}')
return KeyNo
try:
if resp_dict['result']['Result']:
result_dict = resp_dict['result']['Result'][0]
KeyNo = result_dict['KeyNo']
Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
if Name == '':
KeyNo = 'null'
else:
KeyNo = 'null'
except:
KeyNo = False
log.info(f'====token失效====时间{baseCore.getTimeCost(start,time.time())}')
return KeyNo
log.info("{},企业代码为:{}".format(qcc_key, KeyNo))
return KeyNo
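# Usage sketch (hypothetical token/name): find_id_by_name returns the qcc KeyNo
# string on success, 'null' when the search has no usable hit, and False when the
# token is invalid or the account is rate-limited:
#   key_no = find_id_by_name(time.time(), token, '深圳市某某科技有限公司')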
@@ -5,7 +5,9 @@
 import json
 import requests
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 from base import BaseCore
+from obs import ObsClient
+import fitz
+from urllib.parse import unquote
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
@@ -17,14 +19,79 @@
 cursor_ = baseCore.cursor_
 taskType = '企业公告/证监会'
+obsClient = ObsClient(
+    access_key_id='VEHN7D0TJ9316H8AHCAV',  # 你的华为云的ak码
+    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # 你的华为云的sk
+    server='https://obs.cn-north-1.myhuaweicloud.com'  # 你的桶的地址
+)
+#获取文件大小
+def convert_size(size_bytes):
+    # 定义不同单位的转换值
+    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
+    i = 0
+    while size_bytes >= 1024 and i < len(units)-1:
+        size_bytes /= 1024
+        i += 1
+    return f"{size_bytes:.2f} {units[i]}"
+def uptoOBS(pdf_url,pdf_name,type_id,social_code):
+    headers = {}
+    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
+               'full_path': '',
+               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
+               'create_time': '', 'page_size': '', 'content': ''}
+    headers['User-Agent'] = baseCore.getRandomUserAgent()
+    for i in range(0, 3):
+        try:
+            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
+            file_size = int(response.headers.get('Content-Length'))
+            break
+        except:
+            time.sleep(3)
+            continue
+    page_size = 0
+    for i in range(0, 3):
+        try:
+            name = pdf_name + '.pdf'
+            now_time = time.strftime("%Y-%m")
+            result = obsClient.putContent('zzsn', f'ZJH/{now_time}/'+name, content=response.content)
+            with fitz.open(stream=response.content, filetype='pdf') as doc:
+                page_size = doc.page_count
+                for page in doc.pages():
+                    retData['content'] += page.get_text()
+            break
+        except:
+            time.sleep(3)
+            continue
+    if page_size < 1:
+        # pdf解析失败
+        # print(f'======pdf解析失败=====')
+        return retData
+    else:
+        try:
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            retData['state'] = True
+            retData['path'] = result['body']['objectUrl'].split('.com')[1]
+            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['file_size'] = convert_size(file_size)
+            retData['create_time'] = time_now
+            retData['page_size'] = page_size
+        except Exception as e:
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
+            return retData
+    return retData
-def secrchATT(item_id, name, type_id):
-    sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s '''
-    cursor_.execute(sel_sql, (item_id, name, type_id))
+def secrchATT(item_id, name, type_id,order_by):
+    sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
+    cursor_.execute(sel_sql, (item_id, name, type_id,order_by))
     selects = cursor_.fetchone()
     return selects
 # 插入到att表 返回附件id
 def tableUpdate(retData, com_name, year, pdf_name, num):
     item_id = retData['item_id']
@@ -39,26 +106,26 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
     page_size = retData['page_size']
     create_time = retData['create_time']
     order_by = num
-    selects = secrchATT(item_id, pdf_name, type_id)
-
-    if selects:
-        log.info(f'com_name:{com_name}已存在')
-        id = selects[0]
-        return id
-    else:
+    # selects = secrchATT(item_id, pdf_name, type_id)
+    #
+    # if selects:
+    #     log.info(f'pdf_name:{pdf_name}已存在')
+    #     id = ''
+    #     return id
+    # else:
     Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
     values = (
         year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
        status, create_by,
        create_time, page_size)
     cursor_.execute(Upsql, values)  # 插入
     cnx_.commit()  # 提交
     log.info("更新完成:{}".format(Upsql))
-    selects = secrchATT(item_id, pdf_name, type_id)
+    selects = secrchATT(item_id, pdf_name, type_id,order_by)
     id = selects[0]
     return id
@@ -73,13 +140,20 @@ def RequestUrl(url, payload, social_code,start_time):
         pass
     # 检查响应状态码
-    if response.status_code == 200:
-        # 请求成功,处理响应数据
-        # print(response.text)
-        soup = BeautifulSoup(response.text, 'html.parser')
-        pass
-    else:
-        # 请求失败,输出错误信息
+    try:
+        if response.status_code == 200:
+            # 请求成功,处理响应数据
+            # print(response.text)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            pass
+        else:
+            # 请求失败,输出错误信息
+            log.error('请求失败:', url)
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
+            soup = ''
+    except:
         log.error('请求失败:', url)
         state = 0
         takeTime = baseCore.getTimeCost(start_time, time.time())
@@ -163,26 +237,32 @@ def getUrl(code, url_parms, Catagory2_parms):
     }
     return dic_parms
+def ifInstert(short_name, social_code, pdf_url):
+    ifexist = True
-def InsterInto(short_name, social_code, pdf_url):
-    inster = False
     sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='证监会' and type='1' '''
     cursor.execute(sel_sql, (social_code, pdf_url))
     selects = cursor.fetchone()
+    #如果数据库中存在 则跳过
     if selects:
-        print(f'com_name:{short_name}、{pdf_url}已存在')
-        return inster
+        ifexist = False
+        log.info(f'com_name:{short_name}、{pdf_url}已存在')
+        return ifexist
+    else:
+        return ifexist
+def InsterInto(social_code, pdf_url,pub_time):
+    insert = False
     # 信息插入数据库
     try:
-        insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
+        insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
         list_info = [
            social_code,
            pdf_url,
            '证监会',
            '1',
+            pub_time,
        ]
        #144数据库
        cursor.execute(insert_sql, tuple(list_info))
@@ -195,10 +275,20 @@ def InsterInto(short_name, social_code, pdf_url):
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
     return insert
 def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
-    #上传至文件服务器
-    retData = baseCore.upLoadToServe(pdf_url,8,social_code)
+    #判断文件是否已经存在obs服务器中
+    # file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
+    now_time = time.strftime("%Y-%m")
+    file_path = 'ZJH/'+now_time+'/'+pdf_name+'.pdf'
+    response = obsClient.getObjectMetadata('zzsn', file_path)
+    if response.status >= 300:
+        log.info('=====文件不存在obs=====')
+        pass
+    else:
+        log.info(f'=====文件存在obs========{file_path}')
+        return False
+    #上传至华为云服务器
+    retData = uptoOBS(pdf_url,pdf_name,8,social_code)
     #附件插入att数据库
     if retData['state']:
        pass
@@ -207,12 +297,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
        return False
     num = num + 1
     att_id = tableUpdate(retData,com_name,year,pdf_name,num)
-    content = retData['content']
-    if retData['state']:
+    if att_id:
        pass
     else:
-        log.info(f'====pdf解析失败====')
        return False
+    content = retData['content']
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     dic_news = {
@@ -248,7 +337,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
            'message': '操作成功',
            'code': '200',
        }
-        print(dic_result)
+        log.info(dic_result)
        return True
     except Exception as e:
        dic_result = {
@@ -260,14 +349,11 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_na
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
-        print(dic_result)
+        log.info(dic_result)
        return False
 # 采集信息
 def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info 数据库中获取到的基本信息
-    okCount = 0
-    errorCount = 0
     social_code = dic_info[2]
     short_name = dic_info[4]
     com_name = dic_info[1]
@@ -279,26 +365,26 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):
     try:
        is_exist = soup.find('div',class_='con').text
        if is_exist == '没有查询到数据':
-            state = 1
+            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, url, '')
+            baseCore.recordLog(social_code, taskType, state, takeTime, url, '没有查询到数据')
            return
     except:
        pass
-    # 先获取页数
-    page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
-    total = re.findall(r'\d+', page)[0]
-    r_page = int(total) % 15
-    if r_page == 0:
-        Maxpage = int(total) // 15
-    else:
-        Maxpage = int(total) // 15 + 1
-    log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
-    # 首页和其他页不同,遍历 如果是首页 修改一下链接
-    for i in range(1, Maxpage + 1):
+    # # 先获取页数
+    # page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
+    #
+    # total = re.findall(r'\d+', page)[0]
+    #
+    # r_page = int(total) % 15
+    # if r_page == 0:
+    #     Maxpage = int(total) // 15
+    # else:
+    #     Maxpage = int(total) // 15 + 1
+    # log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')
+    # # 首页和其他页不同,遍历 如果是首页 修改一下链接
+    for i in range(1,51):
        log.info(f'==========正在采集第{i}页=========')
        if i == 1:
            href = url
@@ -310,9 +396,9 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):
        if soup == '':
            continue
        tr_list = soup.find('div', id='txt').find_all('tr')
-        pageIndex = 0
+        # pageIndex = 0
        for tr in tr_list[1:]:
-            pageIndex += 1
+            # pageIndex += 1
            td_list = tr.find_all('td')
            pdf_url_info = td_list[2]
            # print(pdf_url)
@@ -320,37 +406,35 @@ def SpiderByZJH(url, payload, dic_info, start_time,num):
            name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[1].strip('\'')
            pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
+            #todo:判断发布日期是否是日期格式
+            pattern = r"^\d{4}-\d{2}-\d{2}$"  # 正则表达式匹配YYYY-MM-DD格式的日期
+            if re.match(pattern, pub_time):
+                pass
+            else:
+                continue
            year = pub_time[:4]
            report_type = td_list[4].text.strip()
-            # 信息插入数据库
-            insert = InsterInto(short_name, social_code, name_pdf)
-            if insert:
-                log.info(f'======={short_name}========{code}===插入公告库成功')
+            # 判断数据库中是否有该条资讯
+            ifexist = ifInstert(short_name, social_code, pdf_url)
+            #如果不存在 ifexist = True
+            if ifexist:
                # 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
                result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num)
                if result:
                    # 公告信息列表
-                    okCount = okCount + 1
                    log.info(f'{short_name}==============解析传输操作成功')
                    state = 1
                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')
+                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
+                    #发送kafka成功之后 再插入数据库
+                    insert = InsterInto(social_code,pdf_url,pub_time)
+                    if insert:
+                        log.info(f'===={social_code}========{name_pdf}=====插入库成功')
                    pass
                else:
-                    errorCount += 1
-                    log.error(f'{short_name}=============解析或传输操作失败')
-                    # try:
-                    #     insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex,type) values('{social_code}','证监会','{pdf_url}','{name_pdf}','{pub_time}',' ',now(),1,{i},{pageIndex},'1')"
-                    #     cursor_.execute(insert_err_sql)
-                    #     cnx_.commit()
-                    # except:
-                    #     pass
                    continue
            else:
                log.info(f'======={short_name}========{code}===已存在')
@@ -393,14 +477,15 @@ if __name__ == '__main__':
     while True:
        start_time = time.time()
        # 获取企业信息
-        social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
-        # social_code = '9110000071092841XX'
+        # social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
+        social_code = '91440500617540496Q'
        # 判断 如果Redis中已经没有数据,则等待
        if social_code == None:
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
-        count = dic_info[16]
+        count = dic_info[17]
        # 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
        # url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
@@ -418,11 +503,14 @@ if __name__ == '__main__':
        com_name = dic_info[1]
        dic_parms = getUrl(code, url_parms, Catagory2_parms)
        dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)
        if dic_parms:
            start_time_cj = time.time()
+            log.info(f'======开始处理{com_name}=====发行公告=======')
            SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time,num)
            log.info(f'{code}==========={short_name},{com_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
            start_time_ls = time.time()
+            log.info(f'======开始处理{com_name}=====临时报告=======')
            SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time,num)
            log.info(f'{code}==========={short_name},{com_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
            # UpdateInfoSql(retData,retData_ls,social_code)
@@ -431,11 +519,7 @@ if __name__ == '__main__':
        log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------')
        count += 1
        runType = 'NoticeReportCount'
-        baseCore.updateRun(code, runType, count)
+        baseCore.updateRun(social_code, runType, count)
     cursor.close()
     cnx.close()
-    # cursor_.close()
-    # cnx_.close()
-    # 释放资源
     baseCore.close()
"""
新浪财经美股企业动态
"""
import json
import time
import jieba
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from base.smart import smart_extractor
from base.BaseCore import BaseCore
# 初始化,设置中文分词
jieba.cut("必须加载jieba")
smart = smart_extractor.SmartExtractor('cn')
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache'
}
taskType = '新浪财经/天眼查'
# 获取企业信息
def getinfomation(social_code):
    selectSql = f"select * from mgzqjywyh_list where state = '2' and xydm='{social_code}' "
    cursor.execute(selectSql)
    data = cursor.fetchone()
    cnx.commit()
    data = list(data)
    # keep the shared module-level cursor/cnx open: insertMysql() and
    # selectUrl() below reuse them on every record
    return data
# 获取响应页面
@retry(tries=3, delay=1)
def getrequests(url):
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
# 解析内容
def getDic(social_code, li):
start_time = time.time()
title = li.find('a').text
href = li.find('a').get('href')
tag_at = li.find('span', class_='xb_list_r').text
    author = tag_at.split('|')[0].strip()
    pub_time = tag_at.split('|')[1].strip()
pub_time = pub_time.split(' ')[0].replace('年', '-').replace('月', '-').replace('日', '')
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
href_ = href.replace('https', 'http')
try:
# 带标签正文
contentText = smart.extract_by_url(href_).text
# 不带标签正文
content = smart.extract_by_url(href_).cleaned_text
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
except:
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return
dic_news = {
'attachmentIds': '',
'author': author,
'content': content,
'contentWithTag': contentText,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '新浪财经',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': href, # 原文链接
'summary': '',
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': pub_time[:4]
}
# print(dic_news)
try:
sendKafka(dic_news, start_time)
log.info(f'Kafka发送成功')
try:
insertMysql(social_code, href)
log.info(f'数据库保存成功')
except:
log.error(f'{href}===数据入库失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===数据入库失败')
except:
log.error(f'{href}===发送Kafka失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
# 数据发送至Kafka
@retry(tries=3, delay=1)
def sendKafka(dic_news, start_time):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
        'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# 传输成功,写入日志中
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
# 数据保存入库,用于判重
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
# 动态信息列表
list_info = [
social_code,
link,
'新浪财经',
'2',
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
# 判断动态是否采集过
@retry(tries=3, delay=1)
def selectUrl(url, social_code):
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (url, social_code))
selects = cursor.fetchone()
return selects
def doJob():
# while True:
# social_code = ''
# # 从redis中获取企业信用代码
# try:
# data = getinfomation(social_code)
# com_code = data[6]
com_code = 'AAPL'
social_code = 'ZZSN22080900000004'
log.info(f'{social_code}==={com_code}===开始采集')
start_time = time.time()
pageIndex = 1
while True:
# 拼接链接
# url = 'http://biz.finance.sina.com.cn/usstock/usstock_news.php?pageIndex=1&symbol=AAPL&type=1'
url = f'http://biz.finance.sina.com.cn/usstock/usstock_news.php?pageIndex={pageIndex}&symbol={com_code}&type=1'
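        # the pageIndex/symbol/type query params drive Sina's US-stock news list;
        # type=1 appears to select company news (assumption based on the hardcoded
        # sample URL in the comment above)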
soup_home = getrequests(url)
li_list = soup_home.select('body > div > div.xb_news > ul > li')
        # the first parse sometimes yields an empty li list; re-request the page
        # before re-selecting (re-running select() on the same soup cannot change the result)
        for i in range(5):
            if len(li_list) == 0:
                soup_home = getrequests(url)
                li_list = soup_home.select('body > div > div.xb_news > ul > li')
            else:
                break
for li in li_list:
title = li.find('a').text
if title == '':
continue
href = li.find('a').get('href')
selects = selectUrl(href, social_code)
if selects:
log.info(f'{url}==已采集过')
else:
getDic(social_code, li)
break
break
# # 如果采集到已采集过动态,证明最新发布动态已经全部采集过
# 增量使用
# if selects:
# break
        next = soup_home.select('body > div > div.xb_news > div.xb_pages > a')
        # same idea as above: refetch the page when the pager links are missing
        for i in range(5):
            if len(next) == 0:
                soup_home = getrequests(url)
                next = soup_home.select('body > div > div.xb_news > div.xb_pages > a')
            else:
                break
if len(next) == 2:
break
pageIndex += 1
time.sleep(2)
log.info(f'{social_code}==={com_code}===企业整体耗时{baseCore.getTimeCost(start_time,time.time())}')
# except:
# log.info(f'==={social_code}=====获取企业信息失败====')
# #重新塞入redis
# baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode',social_code)
# state = 0
# takeTime = baseCore.getTimeCost(start, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
# time.sleep(5)
if __name__ == "__main__":
doJob()
@@ -33,7 +33,7 @@ def updatewxLink(link,info_source_code,state):
 def getjsonInfo():
     #从数据库中获取信息 一条
-    select_sql = "select * from wx_link where state=100 order by id asc limit 1"
+    select_sql = "select * from wx_link where state=0 order by id asc limit 1"
     cursor_.execute(select_sql)
     row = cursor_.fetchone()
     cnx_.commit()
# created by virtualenv automatically
*
import gc
from flask import Flask, render_template, request, current_app
import configparser
from controller.Main import Main # 导入全部蓝图变量
from apscheduler.schedulers.blocking import BlockingScheduler
# import only the datetime class; a bare `import datetime` alongside this
# from-import would just be shadowed
from datetime import datetime
from dao.Conn import ConnMySql
import sys
import io
# 清除登录状态
def clearLoginStateIn24H():
conn = ConnMySql()
conn.userClearLoginStateIn24H()
print("清除登录状态-" + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
app = Flask(__name__) # 初始化Flask对象
app.register_blueprint(Main) # 将所有蓝图对象注册到app这个flask对象内
# 上传文件最大16M字节
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
# App配置信息,键=段名+键名,如:db.port=3306
cfg = configparser.ConfigParser()
cfg.optionxform = str # 保持配置文件中键的大小写
cfg.read("static/conf/sys.ini", encoding='utf-8')
sections = cfg.sections()
for section in sections:
items = cfg.items(section)
for key, val in items:
app.config[section + '.' + key] = val
# 个别取值进行特殊处理
app.config['db.port'] = int(app.config['db.port'])
if app.config['sys.useProxy'] == "0":
app.config['sys.useProxy'] = False
else:
app.config['sys.useProxy'] = True
app.config['sys.proxyid'] = 0 #当前使用的代理id
app.config['sys.userid'] = 0 #当前使用的账号id
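# Sketch of the expected static/conf/sys.ini layout, inferred from the loading
# loop above (section.key becomes the app.config key; values are placeholders):
#   [db]
#   port = 3306
#   [sys]
#   useProxy = 0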
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
if __name__ == '__main__':
# webbrowser.open("0.0.0.0:5000")
app.run(host='0.0.0.0', port=5201, debug=True) # 启动入口
# 启动定时任务,定时清除异常登录状态,每半小时一次
# sched = BlockingScheduler()
# sched.add_job(clearLoginStateIn24H, 'interval', seconds=1800, id='task-clearLoginStateIn24H')
# sched.start()
import gc
from flask import Blueprint, request, current_app, make_response, send_file # 导入蓝图
import datetime
import re
import os
import logging
import sys
import io
import tempfile
import openpyxl
import string
import json
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium import webdriver
from selenium.webdriver.common.by import By
from util import UtilDate
from service.Service02 import Service02
Main = Blueprint('Main', __name__) # 初始化一个蓝图,而不是Flask对象
# Accept the request, read the JSON parameters from the request body, and crawl accordingly
# {"from":"1900-01-01","last":<last x days>, "orgs":["org 1 full name","org 2 full name",...]}
@Main.route('/Main/getData', methods=["POST"])
def getData():
print("POST /Main/getData")
paras = request.get_json(force=True)
dateFrom = paras['from']
lastDays = paras['last']
orgs = paras['orgs']
if dateFrom == "":
if lastDays == "":
lastDays = 0
else:
# "last N days" includes today, so shift back N-1 days
lastDays = -(int(lastDays) - 1)
dateFrom = UtilDate.dateAdd("", "d", lastDays)
service02 = Service02()
return service02.getData(dateFrom, orgs) # "https://wenshu.court.gov.cn/website/wenshu/181029CR4M5A62CH/index.html"
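# A minimal sketch of how a client might call this endpoint; the host and
# payload values below are illustrative assumptions, not part of the service:
# import requests
# payload = {"from": "", "last": "7", "orgs": ["某单位全称"]}
# resp = requests.post("http://127.0.0.1:5201/Main/getData", json=payload)
# print(resp.json())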
import json
import configparser
# import mysql.connector
from flask import current_app
import pymysql
from pymssql import Cursor # used only as a type hint; actual cursors come from pymysql
from vo.ProxyInfo import ProxyInfo
from vo.LoginInfo import LoginInfo
class Conn(object):
def __init__(self, conn):
self._conn: pymysql.Connect = conn
# the doSelect*/doInsert*/doDelete*/doUpdate* helpers below rely on a
# shared cursor, which the original never created; open it here
self._cursor = conn.cursor() if conn is not None else None
def close(self) -> None:
'''
Close the cursor and connection objects
:param: None
:return: None
'''
if self._cursor is not None:
self._cursor.close()
if self._conn is not None:
self._conn.close()
def genDict(self, oCursor: Cursor) -> dict:
'''
Map each result-set column name to its positional index
:param oCursor: cursor that has executed a query
:return: {column name: column index}
'''
ret = {}
try:
i = -1
for field in oCursor.description:
i = i + 1
ret[field[0]] = i
except Exception as err:
print('error:', err)
return ret
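# transToJson is called by doSelectByColumns/doSelectWhere below but was never
# defined in the original file; a minimal sketch, assuming each row should
# become a {column name: value} dict:
def transToJson(self, oCursor) -> list:
fields = [field[0] for field in oCursor.description]
return [dict(zip(fields, row)) for row in oCursor.fetchall()]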
# fetch all proxies
def proxyGetAll(self) -> Cursor:
'''
Query every proxy record
:return: cursor holding the result set, or None on error
'''
cursor: Cursor = None
try:
sql = "SELECT id, proxy FROM caiji.clb_proxy"
cursor = self._conn.cursor()
cursor.execute(sql)
except Exception as err:
print('error:', err)
return cursor
# fetch the next proxy after the given id
def proxyGetNext(self, id: int) -> ProxyInfo:
'''
Read the first proxy whose id is greater than the given id
:param id: id of the last proxy used
:return: the next ProxyInfo, or None if there is none
'''
ret: ProxyInfo = None
sql = "SELECT id, proxy FROM caiji.clb_proxy where id>" + str(id) + " order by id asc limit 1"
try:
cursor = self._conn.cursor()
cursor.execute(sql)
results = cursor.fetchall()
if cursor.rowcount > 0:
ret = ProxyInfo()
fields = self.genDict(cursor)
for row in results:
ret.id = row[fields["id"]]
sProxy = row[fields["proxy"]]
# proxy column format: ip-port-username-password
proxyInfos = sProxy.split('-')
ret.ip = proxyInfos[0]
ret.port = proxyInfos[1]
ret.user_name = proxyInfos[2]
ret.user_passwd = proxyInfos[3]
cursor.close()
except Exception as err:
print('error:', err)
return ret
# fetch an idle account
def userGetFree(self, userGroup: str, id: int) -> LoginInfo:
"""
Read the first idle account in the group whose id is greater than the given id
:param userGroup: account group name
:param id: id of the last account used
:return: the next idle LoginInfo, or None if there is none
"""
ret: LoginInfo = None
sql = f"SELECT * FROM caiji.login_info where user_group='{userGroup}' and id > {id} and login_time is null order by id asc limit 1"
try:
cursor = self._conn.cursor()
cursor.execute(sql)
results = cursor.fetchall()
if cursor.rowcount > 0:
ret = LoginInfo()
fields = self.genDict(cursor)
for row in results:
ret.id = row[fields["id"]]
ret.user_group = row[fields["user_group"]]
ret.user_name = row[fields["user_name"]]
ret.user_passwd = row[fields["user_passwd"]]
cursor.close()
except Exception as err:
print('error:', err)
return ret
# clear the login state of accounts that did not log out within 24 hours (abnormal exits)
def userClearLoginStateIn24H(self):
'''
Reset login_time for sessions older than 24 hours
:return: None
'''
sql = "update caiji.login_info set login_time=null where TIME_TO_SEC(TIMEDIFF(now(), login_time))>86400"
try:
cursor = self._conn.cursor()
cursor.execute(sql)
self._conn.commit()
except Exception as err:
print('error:', err)
# mark the account as logged in (login_time = now())
def userSetLoginStateByID(self, id: int):
'''
Set login_time to now for the given account
:param id: account id
:return: None
'''
sql = "update caiji.login_info set login_time=now() where id=" + str(id)
try:
cursor = self._conn.cursor()
cursor.execute(sql)
self._conn.commit()
except Exception as err:
print('error:', err)
# actively log out; the account can be used again next time, possibly subject to conditions
def userClearLoginStateByID(self, id: int):
'''
Clear login_time for the given account
:param id: account id
:return: None
'''
sql = "update caiji.login_info set login_time=null where id=" + str(id)
try:
cursor = self._conn.cursor()
cursor.execute(sql)
self._conn.commit()
except Exception as err:
print('error:', err)
def doSelectByColumns(self, tbname: str, *columns: str) -> list:
'''
Select the given columns from a table
:param tbname: table name
:param columns: column names to query; all columns when empty
:return: query result as a list of dicts
'''
col = str(columns).replace("[", "").replace("]", "").replace("'", "").replace("(", "").replace(")", "")
sqlstring = f"select {col} from {tbname} "
if len(columns) == 0: sqlstring = f"select * from {tbname}"
self._cursor.execute(sqlstring)
strjson = self.transToJson(self._cursor)
return strjson
def doSelectWhere(self, tbname: str, where: str) -> list:
'''
Select with a where-clause expression
:param tbname: table name
:param where: where clause
:return: query result as a list of dicts
'''
sqlstring = f"select * from {tbname} where {where}"
self._cursor.execute(sqlstring)
strjson = self.transToJson(self._cursor)
return strjson
def doInsertRecord(self, tbname: str, *values) -> None:
'''
Insert a row by supplying every column value
:param tbname: table name
:param values: values for all columns
:return: None
'''
vls = str(values).replace("[", "").replace("]", "")
sqlstring = f"insert into {tbname} values {vls}"
print(sqlstring)
self._cursor.execute(sqlstring)
self._conn.commit()
def doInsertByKV(self, tbname: str, **keyvalues) -> None:
'''
Insert a row from column=value keyword pairs
:param tbname: table name
:param keyvalues: dict of column=value
:return: None
'''
keys = str(keyvalues.keys()).replace("dict_keys", "").replace("'", "").replace("[", "").replace("]", "")
# the original replaced "dict_keys" here, which never matches the repr of dict_values
values = str(keyvalues.values()).replace("dict_values", "").replace("[", "").replace("]", "")
sqlstring = f"insert into {tbname} {keys} values {values}"
self._cursor.execute(sqlstring)
self._conn.commit()
def doDeleteByKV(self, tbname: str, **keyvalues) -> None:
'''
Delete rows matched by column=value pairs
:param tbname: table name
:param keyvalues: column=value pairs
:return: None
'''
keys = list(keyvalues.keys())
values = list(keyvalues.values())
pairs = []
for i in range(len(keys)):
pairs.append(f"{keys[i]}={values[i]}")
pairs.append("and") # join conditions with "and"
del pairs[len(pairs) - 1] # drop the trailing "and"
pairs = str(pairs).replace("[", "").replace("]", "").replace("'", "").replace(",", "")
sqlstring = f"delete from {tbname} where {pairs}"
self._cursor.execute(sqlstring)
self._conn.commit()
def doDeleteWhere(self, tbname: str, where: str) -> None:
'''
Delete rows matched by a where expression
:param tbname: table name
:param where: where clause
:return: None
'''
sqlstring = f"delete from {tbname} where {where}"
self._cursor.execute(sqlstring)
self._conn.commit()
def doUpdateKV(self, tbname: str, expression: str, **keyvalues) -> None:
'''
Update column=value pairs on rows matched by the expression
:param tbname: table name
:param expression: where expression
:param keyvalues: column=value pairs to update
:return: None
'''
keys = list(keyvalues.keys())
values = list(keyvalues.values())
keypairs = []
for i in range(len(keys)):
temp = f"{keys[i]}=\"{(values[i])}\""
keypairs.append(temp)
keypairs = str(keypairs).replace("[", "").replace("]", "").replace("'", "")
sqlstring = f"update {tbname} set {keypairs} where {expression}"
self._cursor.execute(sqlstring)
self._conn.commit()
# run a query that returns a single value, e.g. a row count
# (execute_scalar is a pymssql Cursor method; a pymysql cursor has no such method)
def selectCount(self, sqlstring):
cnt = self._cursor.execute_scalar(sqlstring)
return cnt
# get the title dict: column name -> column type
def MSSQL_GetTitleDict(self, cursor):
titleDict = {}
for rows in cursor.get_header():
titleDict[rows[0]] = rows[1]
# remember to close the connection once you are done with it
return titleDict
def createtable(self, tbname: str, *args: list) -> None:
'''
Create a new table from lists,
e.g. createtable("TB_TestTbale", ["ID","nchar(10)"], ["Password","nchar(20)","NOT NULL"])
Each field is one list in the order [name, type, *constraints, *other]
:param tbname: table name
:param args: field definitions
:return: None
'''
data = []
for i in range(len(args)):
temp = str(args[i]).replace("[", "").replace("]", "").replace("'", "").replace(",", "")
data.append(temp)
data = str(data).replace("[", "(").replace("]", ")").replace("'", "")
sqlstring = f"create table {tbname} {data}"
self._cursor.execute(sqlstring)
self._conn.commit()
class ConnMySql(Conn):
def __init__(self):
oConn: pymysql.Connect = None
try:
oConn = pymysql.Connect(
host=current_app.config["db.host"],
user=current_app.config["db.user"],
passwd=current_app.config["db.passwd"],
db=current_app.config["db.db"],
port=int(current_app.config["db.port"]),
charset=current_app.config["db.charset"]
)
except Exception as err:
print('error:', err)
Conn.__init__(self, oConn)
class MySqlTemp(Conn):
def __init__(self):
oConn: pymysql.Connect = pymysql.Connect(
host="114.115.159.144",
user="caiji",
passwd="zzsn9988",
db="caiji",
port=3306,
charset="utf8"
)
Conn.__init__(self, oConn)
# test
# conn = MySqlTemp()
# o = conn.userGetFree("wenshu", 0) # userGetFree takes (group, last id)
# print(o.user_name)
# for row in results:
#     id = row[0]
#     proxy = row[1]
#     print(id, proxy)
#     proxyInfos = proxy.split('-')
#     for i in range(0, 4):
#         print("----", proxyInfos[i])
class ProxyDao():
def t(self):
pass
# basic case information
from util import UtilDate
from util import UtilNumber
class BaseInfo:
info_title: str # title
key_word: str # keywords
info_bianhao: str # case number
info_address: str # court of jurisdiction
info_time: str # publication date, yyyy-mm-dd
info_id: str # case ID
info_yuanyou: str # cause of action
info_content: str # full text
# whether this record's date is on or after the given date
def isAfter(self, sDate: str) -> bool:
if sDate == "":
return False
return self.info_time >= sDate
def toString(self):
return self.info_title + "\t" + self.key_word + "\t" + self.info_bianhao + "\t" + self.info_address + "\t" + self.info_time + "\t" + self.info_id
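# Sketch: the crawler treats isAfter as "keep collecting". Both sides are
# zero-padded yyyy-mm-dd strings, so plain string comparison orders correctly:
# b = BaseInfo()
# b.info_time = "2024-01-05"
# print(b.isAfter("2024-01-04")) # True - within the window, keep collecting
# print(b.isAfter("2024-01-06")) # False - older than the cutoff, stop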
home = C:\Program Files\Python
implementation = CPython
version_info = 3.8.0.final.0
virtualenv = 20.13.0
include-system-site-packages = true
base-prefix = C:\Program Files\Python
base-exec-prefix = C:\Program Files\Python
base-executable = C:\Program Files\Python\python.exe
# court judgment document (裁判文书) crawler
from datetime import datetime, timedelta
import json
import time
from flask import current_app as app
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.keys import Keys
from typing import List
import io
import sys
from dao.Conn import ConnMySql
from util import UtilBrowser
from util import UtilCaptcha
from entity.BaseInfo import BaseInfo
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import jsonpickle
from util.UtilCaptcha import getCaptchaMode1
from vo.LoginInfo import LoginInfo
class Service02:
browser: WebDriver
url = ""
dateFrom = ""
loginInfo: LoginInfo
# When a url is passed on the browser command line, Chrome keeps its default
# tab, so there are 2 tabs; when opened via driver.get there is only 1 tab.
tab1 = 1 # home/list page; 0 when opened via driver.get
tab2 = 2 # judgment document page; 1 when opened via driver.get
baseInfo = []
nRetry = 100 # retry count, currently unused
lstRet = []
# main procedure
def getData(self, sDateFrom: str, orgs: List[str]):
# keep crawling until everything back to the given date is collected; basic info is always collected
self.dateFrom = sDateFrom
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
print("getData...", flush=True)
for org in orgs:
ok1 = 0
print(org, flush=True)
# crawl one organization's data until done
while True:
# open the browser, find the organization and switch to its detail page;
# on failure, switch proxy IP and start over
while True:
if self.openBrowser(org):
break
else:
self.quitBrowser()
if ok1 == 0:
# collection unfinished, keep going
ok1 = self.getData1()
if ok1 == 1:
# collection finished, move on to the next organization
break
conn = ConnMySql()
if self.loginInfo is not None:
conn.userClearLoginStateByID(self.loginInfo.id)
conn.close()
# buffer the data
o = {"org": org, "baseInfo": self.baseInfo}
self.lstRet.append(o)
# all organizations done; return the jsonpickle-encoded list, shaped like
# [{"org": "...", "baseInfo": [...]}, ...]
retData = jsonpickle.encode(self.lstRet, unpicklable=False)
print(json.loads(retData))
return retData
# open the browser, find the organization and go to its info page
def openBrowser(self, org: str) -> bool:
ret = False
print("openBrowser...", flush=True)
conn = ConnMySql()
self.loginInfo = conn.userGetFree("wenshu", app.config['sys.userid'])
if self.loginInfo is None:
app.config['sys.userid'] = 0
self.loginInfo = conn.userGetFree("wenshu", app.config['sys.userid'])
app.config['sys.userid'] = self.loginInfo.id
# conn.userSetLoginStateByID(self.loginInfo.id)
conn.close()
self.browser = UtilBrowser.newChrome(app.config['sys.mainUrl'], False, app.config['sys.useProxy'])
# after opening the browser and the main page, close any extra tabs; currently unused
# if len(self.browser.window_handles) > 1:
#     self.browser.switch_to.window(self.browser.window_handles[0])
#     self.browser.close()
#     self.browser.switch_to.window(self.browser.window_handles[0])
loginMode = app.config['sys.loginMode']
# log in; to be moved into a dedicated procedure later
if loginMode == "0":
pass
elif loginMode == "1":
# account login is required; accounts live in the database and are rotated
objLogin = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginButton'], 30, 1)
if objLogin is not None:
objLogin.click()
# a standalone image-captcha page may appear at random,
# 4 characters of mixed-case letters and digits
nTry = 0
hasPass = False
objLoginCaptchaButton0 = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginCaptchaButton'], 10, 1)
if objLoginCaptchaButton0 is not None:
# try up to 10 times; continue once the captcha passes, otherwise give up and retry with another account
while nTry < 10:
nTry = nTry + 1
objLoginCaptchaButton = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginCaptchaButton'], 10, 1)
# get the captcha image url
# url = self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginCaptchaImage']).get_attribute('src')
strCaptcha = UtilCaptcha.getCaptchaMode1(self.browser, app.config['sys.loginCaptchaImage'])
self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginCaptchaInput']).send_keys(strCaptcha)
objLoginCaptchaButton.click()
time.sleep(1)
objLoginCaptchaButton = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginCaptchaButton'], 10, 1)
if objLoginCaptchaButton is None:
hasPass = True
break
if hasPass:
nTry = 0
else:
nTry = 4
iframe = None # login form iframe
while nTry < 3:
iframe = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, "#contentIframe", 20, 1)
# WebDriverWait(driver=driver, timeout=20, poll_frequency=1, ignored_exceptions=None).until(expected_conditions.presence_of_element_located((By.ID,'contentIframe')))
if iframe is None:
self.browser.refresh()
else:
break
nTry = nTry + 1
if iframe is not None:
self.browser.switch_to.frame(iframe)
objUser = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginUser'])
if objUser is not None:
objUser.send_keys(self.loginInfo.user_name)
objPass = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginPasswd'])
objPass.send_keys(self.loginInfo.user_passwd)
objLogin = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginOk'])
# an SMS code may be required at login; not handled yet
if app.config['sys.loginSMSCode'] != "":
objSMS = UtilBrowser.hasElement(self.browser, By.CSS_SELECTOR,
app.config['sys.loginSMSCode'])
if objSMS: # hasElement returns a bool, so test it directly
self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginSMSCode']).send_keys("")
objLogin.click()
time.sleep(5)
self.browser.refresh()
# ret = True # should only return True once a specific element appears
# if credentials are provided log in automatically, otherwise wait for manual login
# self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginButton']).click()
elif loginMode == "2":
# cookie login, not handled yet
pass
# self.browser.get(app.config['sys.mainUrl'])
# type the organization name into the search box and submit
objSearchInput = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['css.searchInput'])
# if login failed the page has no search box
if objSearchInput is None:
ret = False
else:
objSearchInput.send_keys(org)
time.sleep(2)
# click the search button
objSearchButton = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['css.searchButton'])
objSearchButton.click()
time.sleep(5)
self.browser.refresh()
# sort by date, newest first
objDateSort = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['css.listDateSort'])
if objDateSort is not None:
objDateSort.click()
time.sleep(5)
ret = True
return ret
# quit the browser
def quitBrowser(self):
try:
self.browser.quit()
self.browser = None
except:
pass
# Judgment documents. When re-scanning an organization, new documents may have
# appeared since the browser was last opened, so we always start from page 1
# rather than resuming from the page where we stopped.
def getData1(self) -> int:
ret = 0
print("getData1...", flush=True)
# on the list page
selector_title = app.config['css.listTitle'] # "#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a"
# on the detail page; documents open in a new tab
baseInfo1: BaseInfo
# number of documents
s = self.getAttr(By.CSS_SELECTOR, app.config['css.listCount'], "textContent")
n = self.toInt(s)
if n == 0:
# no data: quit and do not collect this category again
return 1
pageNo = 0
while True:
# collect page after page
pageNo = pageNo + 1
for i in range(1, 6): # 5 items per page
print(f"----文书数量:{n},每页文书个数:5,当前页号:{pageNo},当前序号:{i}", flush=True)
# on a partial page, stop at the first missing row
# the document list starts at nth-child(3)
if not UtilBrowser.hasElement(self.browser, By.CSS_SELECTOR,
selector_title.replace("?", str(i + 2), 1)):
break
baseInfo1 = BaseInfo()
baseInfo1.info_title = self.getAttr(By.CSS_SELECTOR, selector_title.replace("?", str(i + 2), 1), "textContent")
baseInfo1.info_bianhao = self.getAttr(By.CSS_SELECTOR,
app.config['css.listBianhao'].replace("?", str(i + 2), 1),
"textContent")
baseInfo1.info_address = self.getAttr(By.CSS_SELECTOR,
app.config['css.listAddress'].replace("?", str(i + 2), 1),
"textContent")
baseInfo1.info_time = self.getAttr(By.CSS_SELECTOR,
app.config['css.listTime'].replace("?", str(i + 2), 1),
"textContent")
baseInfo1.info_yuanyou = self.getAttr(By.CSS_SELECTOR,
app.config['css.listYuanyou'].replace("?", str(i + 2), 1),
"textContent")
# https://wenshu.court.gov.cn/website/wenshu/181107ANFZ0BXSK4/index.html?docId=OUD3Tm7EvEQVkiexnBa5S3nnG9zDkQyxiWoR8jr7QJJtFc9Y6vX89Z/dgBYosE2gstL9HQn+C934OzwMvqVgk+DtAz+qRVZWr9dI7ybeiFnaPaFBceYmelTK0+qydxfd
link = self.getAttr(By.CSS_SELECTOR, selector_title.replace("?", str(i + 2), 1),
"href")
pos = link.index("=") + 1
baseInfo1.info_id = link[pos:]
if baseInfo1.isAfter(self.dateFrom):
# the entry is on/after the cutoff date: append it to the buffer unless it is already there
exist = False
for e in self.baseInfo:
# the original document ID is available, so dedupe by ID directly
if e.info_id == baseInfo1.info_id: # e.toString == baseInfo1.toString()
exist = True
break
if exist == False:
# fetch the document body, but only for entries not fetched before
# click the title link
self.browser.find_element(By.CSS_SELECTOR, selector_title.replace("?", str(i + 2), 1)).click()
# the body opens automatically in a new tab, which becomes the active tab
t1 = datetime.now()
while True:
if len(self.browser.window_handles) > self.tab2:
break
t2 = datetime.now()
if (t2 - t1).seconds > 60:
break
time.sleep(1)
if len(self.browser.window_handles) > self.tab2:
self.browser.switch_to.window(self.browser.window_handles[self.tab2])
baseInfo1.info_content = self.getAttr(By.CSS_SELECTOR, app.config['css.contContent'],
"textContent")
time.sleep(5)
# close the body tab and return to the list tab
self.browser.close()
self.browser.switch_to.window(self.browser.window_handles[self.tab1])
print("--------当前文书长度:", len(baseInfo1.info_content), flush=True)
self.baseInfo.append(baseInfo1)
else:
# the entry is dated before the cutoff, so everything newer has been collected
ret = 1
break
# if there is a next page, continue; otherwise collection is complete
# ("disabled" on the button means no next page; the original used str.find(),
# whose -1 "not found" result is truthy and inverted the test)
if "disabled" in self.getAttr(By.CSS_SELECTOR, app.config['css.listNextPage'], "class"):
ret = 1
else:
self.browser.find_element(By.CSS_SELECTOR, app.config['css.listNextPage']).click()
time.sleep(5)
# on a captcha or IP-block page, bail out and switch IP before retrying
if self.hasCaptcha() or self.hasBlock():
break
if ret == 1:
break
return ret
def toInt(self, s) -> int:
ret = 0
try:
ret = int(s)
except:
pass
return ret
# return the value of the given attribute (e.g. class) of a page element
def getAttr(self, by: str, selector: str, attr: str) -> str:
ret = ""
try:
if attr == "text":
ret = self.browser.find_element(by, selector).text
else:
ret = self.browser.find_element(by, selector).get_attribute(attr)
except:
pass
return ret
# whether a captcha dialog has appeared
def hasCaptcha(self) -> bool:
ret = False
# we: WebElement
# wes = self.browser.find_elements(By.TAG_NAME, "div")
# for we in wes:
#     if we.get_attribute("class").find("geetest_box") != -1:
#         if we.get_attribute("style").find("display: block;") != -1:
#             ret = True
return ret
# whether the IP has been blocked
def hasBlock(self) -> bool:
ret = False
# p.prom reads: 您的地址(1.2.3.4)访问疑似夹带攻击行为,请稍后重试,或注册/登录
# if self.getAttr(By.CSS_SELECTOR, "body > div > p", "text").find("夹带攻击行为") != -1:
#     print("*******夹带攻击行为*******")
#     ret = True
return ret
# system configuration
[sys]
# OCR url used to recognize the wenshu.court.gov.cn captcha
ocrUrl=http://114.116.49.86:8013/wzsb_app?withCrLf=false
# login mode: 0 = no login, 1 = account login (the password/SMS/captcha selectors must be set as needed), 2 = cookie login
loginMode=1
# whether to use a proxy: 0 = no, 1 = yes; sites that require login generally should not use a proxy
useProxy=1
# captcha recognition: 0 = off, 1 = on; a fixed method for now, to be extended to other modes later
verifiCode=0
# login url ?open=login
loginUrl=https://wenshu.court.gov.cn/website/wenshu/181010CARHS5BS3C/index.html
# main url; after login the site may redirect here
mainUrl=https://wenshu.court.gov.cn
# login - username input
loginUser=#root > div > form > div > div:nth-child(1) > div > div > div > input
# login - password input
loginPasswd=#root > div > form > div > div:nth-child(2) > div > div > div > input
# the image captcha lives on a separate page and returns to the login page once solved
# login - captcha input; when non-empty the captcha must be recognized
loginCaptchaInput=body > div > div.card-body > div > form > div.captcha > input
# login - captcha image
loginCaptchaImage=#Image1
# login - captcha confirm button
loginCaptchaButton=body > div > div.card-body > div > form > div.warnbtn > input
# login - SMS code, may be required together with the image captcha; not handled yet
loginSMSCode=
# login button on the main page
loginButton=#loginLi > a
# confirm button on the login page
loginOk=#root > div > form > div > div.login-button-container > span
# database configuration
[db]
host=114.115.159.144
port=3306
user=caiji
passwd=zzsn9988
db=caiji
charset=utf8
# css selector configuration
[css]
# search - input box
searchInput=#_view_1540966814000 > div > div.search-wrapper.clearfix > div.search-middle > input
# search - button
searchButton=#_view_1540966814000 > div > div.search-wrapper.clearfix > div.search-rightBtn.search-click
# list - sort-by-date-descending button
listDateSort=#_view_1545184311000 > div.LM_tool.clearfix > div:nth-child(2) > a
# list - case count
listCount=#_view_1545184311000 > div.LM_con.clearfix > div.fr.con_right > span
# list - case title
listTitle=#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a
# list - case number
listBianhao=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.ah
# list - court
listAddress=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.slfyName
# list - closing date
listTime=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.cprq
# list - cause of action
listYuanyou=#_view_1545184311000 > div:nth-child(?) > div.list_reason > p
# next-page button
listNextPage=#_view_1545184311000 > div.left_7_3 > a:last-child
# body - link, usually the same as the title
contLink=#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a
# body - content
contContent=#_view_1541573883000 > div > div.PDF_box > div.PDF_pox
from flask import current_app
from datetime import datetime, timedelta
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.chrome.options import Options
from msedge.selenium_tools import EdgeOptions
from msedge.selenium_tools import Edge
from selenium.webdriver.chrome.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.remote.webelement import WebElement
import seleniumwire.undetected_chromedriver.v2
from dao.Conn import ConnMySql
from vo.ProxyInfo import ProxyInfo
# wait for an element to appear; timeout = max wait, frequency = polling interval, both in seconds
def waitElement(browser: WebDriver, by: str, selecter: str, timeout: int = 20, frequency: int = 1) -> WebElement:
ret = None
t1 = datetime.now()
while (datetime.now() - t1).seconds < timeout:
if hasElement(browser, by, selecter):
ret = getElement(browser, by, selecter)
break
time.sleep(frequency)
return ret
# get an element, or None if it does not exist
def getElement(browser: WebDriver, by: str, selecter: str) -> WebElement:
ret = None
try:
ret = browser.find_element(by, selecter)
except:
pass
return ret
# check whether an element exists.
def hasElement(browser: WebDriver, by: str, selecter: str) -> bool:
ret = True
try:
browser.find_element(by, selecter)
except:
ret = False
return ret
# return the value of the given attribute (e.g. class) of a page element; empty string when the element is missing
def getAttr(brow: webdriver, by: str, selector: str, attr: str) -> str:
ret = ""
try:
# the original passed str instead of by and assigned the result to an unused variable
ret = brow.find_element(by, selector).get_attribute(attr)
except:
pass
return ret
# open an Edge browser (useProxy is currently unused)
def newEdge(useProxy):
edge_options = EdgeOptions()
edge_options.use_chromium = True
edge_options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Edge(options=edge_options)
# driver.get('https://bing.com')
# element = driver.find_element(By.ID, 'sb_form_q')
# element.send_keys('WebDriver')
# element.submit()
return driver
# open a Chrome browser
# url: url to open when the browser starts
# useProxy: whether to use a proxy, True = use
# cookie: cookie to set, e.g. login state
def newChrome(url: str = "", debugMode: bool = False, useProxy: bool = False, cookie: str = "") -> WebDriver:
# keep the browser from closing automatically
option = webdriver.ChromeOptions()
if debugMode == False:
option.add_experimental_option("detach", True)
option.add_experimental_option('excludeSwitches', ['enable-automation']) # hide automation, window.navigator.webdriver=undefined
option.add_experimental_option('useAutomationExtension', False) # drop the "Chrome is being controlled" prompt
option.add_argument("--disable-blink-features=AutomationControlled")
option.add_argument('disable-infobars') # hide the automated-software infobar
option.add_argument('--disable-gpu') # works around a bug mentioned in the Chrome docs
option.add_argument('--ignore-certificate-errors')
# option.add_argument("--user-data-dir=C:/Users/Administrator/AppData/Local/Google/Chrome/User Data/Default");
# option.add_argument("--test-type=allow-running-insecure-content");
# option.add_argument('--headless') # run Chrome in the background
# rotate the proxy IP, http or socks5; when useProxy=False any previous proxy must be cleared
isHttpProxy = False
seleniumwire_options = {}
proxy = None
proxy = None
if useProxy:
# read the next proxy; wrap around to 0 after the last one
conn = ConnMySql()
proxyInfo: ProxyInfo
proxyInfo = conn.proxyGetNext(current_app.config['sys.proxyid'])
if proxyInfo is None:
current_app.config['sys.proxyid'] = 0
proxyInfo = conn.proxyGetNext(current_app.config['sys.proxyid'])
# (the original printed proxyInfo.ip before the None check, which would crash after the last proxy)
print("proxy_id:" + str(current_app.config['sys.proxyid']) + "," + proxyInfo.ip, flush=True)
current_app.config['sys.proxyid'] = proxyInfo.id
desired_capabilities = webdriver.DesiredCapabilities.CHROME.copy()
sProxy = ""
if proxyInfo.user_name == "":
# option.add_argument(f'--proxy-server=http://{proxy_ip}:{proxy_port}')
sProxy = f'http://{proxyInfo.ip}:{proxyInfo.port}'
else:
# option.add_argument(f'--proxy-server=http://{proxy_username}:{proxy_password}@{proxy_ip}:{proxy_port}')
sProxy = f'http://{proxyInfo.user_name}:{proxyInfo.user_passwd}@{proxyInfo.ip}:{proxyInfo.port}'
webdriver.DesiredCapabilities.CHROME['proxy'] = {
"httpProxy": sProxy,
"sslProxy": sProxy,
"proxyType": "manual"
}
conn.close()
# pick a random UserAgent
userAgent = getUserAgent()
# option.add_argument('user-agent=%s' % userAgent)
if url != "":
option.add_argument('--app=' + url) # open the url in the default window, e.g. https://wenshu.court.gov.cn
if debugMode:
# option.debugger_address = "127.0.0.1:9222"
option.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
# create the Chrome instance with the options above
driver = webdriver.Chrome(
options=option) # service=ChromeService(ChromeDriverManager().install()), desired_capabilities=desired_capabilities, seleniumwire_options=seleniumwire_options
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
if debugMode == False:
driver.maximize_window() # maximize the window
driver.delete_all_cookies() # clear cookies
# cookie may later become a bool: when True, read a saved cookie from the database and log in with it,
# then save the cookie again since its expiry may have been refreshed. A background task could also
# re-login recently unused cookies periodically and store the fresh ones.
if cookie != "":
cookie_dict = eval(cookie)
driver.add_cookie(cookie_dict)
driver.refresh()
return driver
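# A minimal sketch of driving newChrome from the Flask app context; the config
# keys come from sys.ini and the call pattern mirrors Service02.openBrowser:
# from flask import current_app
# browser = newChrome(current_app.config['sys.mainUrl'], False, current_app.config['sys.useProxy'])
# browser.quit()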
# return proxy ip and port, e.g. 1.2.3.4:555
def getProxyIP():
return ""
# return a random browser UserAgent
def getUserAgent():
user_agents = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# random.choice returns a random item from the list
user_agent = random.choice(user_agents)
return user_agent
# captcha recognition; only the wenshu.court.gov.cn captcha is handled for now
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.webdriver import WebDriver
import requests
from flask import current_app
from pathlib import Path
import tempfile
import uuid
import hashlib
import os
import json
# selecter: css selector of the captcha image
def getCaptchaMode1(browser: WebDriver, selecter: str):
ret = ""
out_path = "./Temp_file"
try:
Path(out_path).mkdir(parents=True, exist_ok=True)
# screenshot the captcha element into a local png
path_name = os.path.join(out_path, str(uuid.uuid4())) + ".png"
print(path_name)
img = browser.find_element(By.CSS_SELECTOR, selecter)
img.screenshot(path_name)
# # download by url instead
# r = requests.get(imgUrl)
# with open(path_name, 'wb') as f:
#     f.write(r.content)
ocrUrl = current_app.config['sys.ocrUrl']
# call the OCR service
file = open(path_name, "rb")
response = requests.post(ocrUrl, files={"multiRequest": file})
file.close()
os.remove(path_name)
# response: {"code":200,"logs":null,"message":"success","resultData":"2rVK"}
oRet = json.loads(response.text)
ret = oRet["resultData"]
# os.remove(path_name)
print(ret)
except Exception as err:
print('getCaptchaMode1 error:', err)
return ret
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
# convert a "yyyy年m月d日" style date to "yyyy-mm-dd"
def convertDate(sDate: str):
sDate = sDate.replace("年", "-")
sDate = sDate.replace("月", "-")
sDate = sDate.replace("日", "")
date_obj = datetime.strptime(sDate, '%Y-%m-%d')
sDate = date_obj.strftime('%Y-%m-%d')
return sDate
# add an offset to a date; ymd is the unit: y = years, m = months, d = days
def dateAdd(sDate: str, ymd: str = "d", diff: int = 1):
if sDate == "":
sDate = datetime.now().strftime('%Y-%m-%d')
date_obj = datetime.strptime(sDate, '%Y-%m-%d')
# relativedelta and timedelta both accept negative offsets directly,
# so no separate branches for positive and negative diff are needed
if ymd == "y":
date_obj = date_obj + relativedelta(years=diff)
elif ymd == "m":
date_obj = date_obj + relativedelta(months=diff)
elif ymd == "d":
date_obj = date_obj + timedelta(days=diff)
sDate = date_obj.strftime('%Y-%m-%d')
return sDate
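# A minimal usage sketch: the /Main/getData endpoint computes its cutoff as
# dateAdd("", "d", -(last - 1)); e.g. with last=7 and today assumed to be
# 2024-01-10, dateAdd("", "d", -6) returns "2024-01-04".
# print(dateAdd("2024-01-31", "m", 1)) # relativedelta clamps to "2024-02-29"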
# number-handling helpers
# Convert a money string that may contain units such as 万/亿 or the words
# 人民币/元 into a float. Note: the units are simply stripped, so "500万元"
# becomes 500.0; callers must rescale if they need the amount in plain yuan.
def convertMoney(sMoney: str):
sMoney = sMoney.replace("万", "")
sMoney = sMoney.replace("亿", "")
sMoney = sMoney.replace("人民币", "")
sMoney = sMoney.replace("元", "")
return float(sMoney)
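# If full-yuan values are ever needed, a scale-aware variant could look like
# this sketch (the unit list is an assumption about the inputs seen in practice):
# def convertMoneyYuan(sMoney: str) -> float:
#     scale = 10000.0 if "万" in sMoney else 100000000.0 if "亿" in sMoney else 1.0
#     for token in ("万", "亿", "人民币", "元", ","):
#         sMoney = sMoney.replace(token, "")
#     return float(sMoney) * scale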
from selenium.webdriver.chrome.webdriver import WebDriver
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
# proxy IP pool
class UtilProxy:
id: int
ip: str
port: str
name: str
password: str
# switch to a different proxy IP
def alterIP(self, browser: WebDriver):
pass
# account information
from util import UtilDate
from util import UtilNumber
class LoginInfo:
id: int # record id
user_group: str
user_name: str
user_passwd: str
# proxy IP information
from util import UtilDate
from util import UtilNumber
class ProxyInfo:
id: int # record id
ip: str
port: str
user_name: str
user_passwd: str
import pandas as pd
import glob
# find all matching .xlsx research-report files under the given directory
csv_files = glob.glob(r"D:\机械项目研报\机械项目研报*.xlsx", recursive=True)
# an empty DataFrame to hold the merged data
merged_data = pd.DataFrame()
# read each Excel file and merge it in
for file in csv_files:
data = pd.read_excel(file, dtype=str)
# drop the last column
# data = data.iloc[:, :-1]
dad = pd.DataFrame(data, dtype=str)
# DataFrame.append was removed in pandas 2.x; concat is the supported spelling
merged_data = pd.concat([merged_data, dad], ignore_index=True)
sorted_df = merged_data.sort_values('industry')
grouped = merged_data.groupby('industry')
# write each industry group to its own sheet in the output workbook
# merged_data.to_csv(r"D:\hg\tmp\11.csv", encoding='gbk', index=False, quoting=1, quotechar='"', escapechar='\\')
# merged_data.to_excel(r"D:\机械项目研报\机械项目研报汇总.xlsx", index=False, engine='openpyxl')
with pd.ExcelWriter(r'D:\机械项目研报\机械项目研报汇总2.xlsx') as writer:
for group_name, group_df in grouped:
group_df.to_excel(writer, sheet_name=group_name, index=False)