提交 cbfd2391 作者: 丁双波

裁判文书网

上级 afe226ba
function r(size){
/**
 * Build a random alphanumeric string.
 *
 * @param {number} size - Number of characters to generate.
 * @returns {string} `size` characters drawn from 0-9, a-z and A-Z.
 */
function r(size) {
    var alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    var lastIndex = alphabet.length - 1;
    var pieces = [];
    for (var n = 0; n < size; n++) {
        // Math.round (rather than Math.floor) makes the first and last
        // characters half as likely as the rest; kept unchanged on purpose.
        pieces.push(alphabet.charAt(Math.round(Math.random() * lastIndex)));
    }
    return pieces.join("");
}
/**
 * Encode a string as the binary character codes of its UTF-16 units,
 * separated by single spaces (e.g. "ab" -> "1100001 1100010").
 *
 * @param {string} str - Text to encode.
 * @returns {string} Space-separated binary codes; "" for an empty string.
 */
function strTobinary(str) {
    var codes = str.split("").map(function (unit) {
        return unit.charCodeAt(0).toString(2);
    });
    return codes.join(" ");
}
/**
 * Generate the random 24-character salt used as the DES3 key.
 *
 * Earlier revisions also computed a timestamp and a yyyymmdd IV here, but
 * never used or returned them (and the month expression produced the
 * zero-based month for October-December). The Python caller builds the date
 * string itself, so that dead code has been removed.
 *
 * @returns {string} 24 random alphanumeric characters.
 */
function cipher() {
    return r(24);
}
/**
 * Assemble the `ciphertext` request parameter.
 *
 * The three parts are concatenated in the order salt, iv, enc and the result
 * is encoded with strTobinary. The encryption itself is not performed here;
 * `enc` is supplied by the caller.
 *
 * @param {string} salt - Random key string.
 * @param {string} iv - Date string used as the IV.
 * @param {string} enc - Already-encrypted payload.
 * @returns {string} Space-separated binary character codes.
 */
function des(salt, iv, enc) {
    return strTobinary(salt + iv + enc);
}
/**
 * Generate a random 24-character alphanumeric request token.
 *
 * @returns {string} 24 characters drawn from 0-9, a-z and A-Z.
 */
function token() {
    var digits = "0123456789";
    var lower = "abcdefghijklmnopqrstuvwxyz";
    var pool = (digits + lower + lower.toUpperCase()).split("");
    var out = "";
    var remaining = 24;
    while (remaining-- > 0) {
        // Math.round is kept as in the original selection logic.
        out += pool[Math.round(Math.random() * (pool.length - 1))];
    }
    return out;
}
/**
 * Generate a random 32-character alphanumeric page identifier.
 *
 * @returns {string} 32 characters drawn from A-Z, a-z and 0-9.
 */
function pageid() {
    var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    var chars = [];
    while (chars.length < 32) {
        chars.push(alphabet.charAt(Math.floor(Math.random() * alphabet.length)));
    }
    return chars.join("");
}
// console.log(cipher());
\ No newline at end of file
import base64
import base64
import json
import random
import time
import execjs
import requests
import urllib3
from Crypto.Cipher import DES3
from base.BaseCore import BaseCore
# Silence the InsecureRequestWarning urllib3 emits for requests sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Shared project helper; the logger and the database handles below come from it.
baseCore=BaseCore()
log=baseCore.getLogger()
# Database connection and cursor used by every SQL statement in this module.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Save an error record
def insertBadSql(error):
    """Insert one failed-request record into ``cpwsw_log`` and commit.

    ``error`` is an iterable supplying, in order, the values for the columns
    code, description, success, user, keyword and msg; ``create_time`` is set
    to ``now()`` by the statement itself.
    """
    params = tuple(error)
    cursor_.execute(
        "insert into cpwsw_log (code,description,success,create_time,user,keyword,msg) "
        "values (%s,%s,%s,now(),%s,%s,%s)",
        params,
    )
    cnx_.commit()
# Cookie bookkeeping
def updateCookie(cookie,type):
    """Update the ``cpwsw_user`` row for ``cookie`` according to ``type``.

    ``type`` selects the action:
        1 -- normal use: refresh ``update_time``.
        2 -- session expired: delete the row.
        3 -- unknown failure: stamp ``fenghao_time``.
    Any other value runs no statement. The transaction is committed in
    every case.
    """
    if type == 2:
        sql = "delete from cpwsw_user where cookie=%s"
    elif type == 1:
        sql = "update cpwsw_user set update_time=now() where cookie=%s"
    elif type == 3:
        sql = "update cpwsw_user set fenghao_time=now() where cookie=%s"
    else:
        sql = None
    if sql is not None:
        cursor_.execute(sql, [cookie])
    cnx_.commit()
# DES3 (Triple DES, CBC mode) encryption/decryption wrapped in a class
class EncryptDate:
    """Triple-DES CBC helper that exchanges ciphertext as base64 text.

    Args:
        pianyi: IV as text; it is encoded to bytes with UTF-8.
        key: secret key, handed to ``DES3.new`` unchanged.

    Every ``encrypt``/``decrypt`` call builds its own cipher object. A
    PyCryptodome cipher object is stateful: it cannot be used for both
    directions, and in CBC mode consecutive calls continue the previous
    chain. Sharing one object therefore made any second call on the same
    instance fail or return different output.
    """

    def __init__(self, pianyi, key):
        self.key = key  # secret key
        self.iv = bytes(pianyi, encoding='utf8')  # IV
        self.length = DES3.block_size  # block size used by pad()
        # Still created here so DES3.new runs at construction time as before
        # and the attribute stays available; encrypt()/decrypt() do not use it.
        self.des3 = self._new_cipher()

    def _new_cipher(self):
        """Return a fresh DES3-CBC cipher for this instance's key and IV."""
        return DES3.new(self.key, DES3.MODE_CBC, self.iv)

    def unpad(self, date):
        """Strip the padding: the last character's code is the pad length."""
        return date[0:-ord(date[-1])]

    def pad(self, text):
        """Pad ``text`` so its UTF-8 byte length is a multiple of the block size.

        The pad character is ``chr(n)`` repeated ``n`` times, where ``n`` is
        the number of bytes missing; a full extra block is added when the
        text is already aligned.
        """
        count = len(text.encode('utf-8'))
        add = self.length - (count % self.length)
        return text + (chr(add) * add)

    def encrypt(self, encrData):
        """Encrypt the text ``encrData`` and return base64-encoded ciphertext."""
        res = self._new_cipher().encrypt(self.pad(encrData).encode("utf8"))
        return str(base64.b64encode(res), encoding="utf8")

    def decrypt(self, decrData):
        """Decrypt base64 text ``decrData`` and return the unpadded plaintext."""
        res = base64.decodebytes(decrData.encode("utf8"))
        msg = self._new_cipher().decrypt(res).decode("utf8")
        return self.unpad(msg)
# Read the companion JavaScript file and compile it so its functions can be
# invoked from Python via ctx.call(...).
with open('裁判文书网.js', encoding='utf-8') as f:
    jstext = f.read()
ctx = execjs.compile(jstext)
print("ok")
# Endpoint that both the list query and the document query are posted to.
url = 'https://wenshu.court.gov.cn/website/parse/rest.q4w'
# Fetch a login cookie
def getCookie():
    """Return one ``(user, cookie)`` row from ``cpwsw_user``, or ``False``.

    Only rows whose ``fenghao_time`` is more than two hours old are eligible;
    among those the row with the oldest ``update_time`` is chosen. ``False``
    is returned (and logged) when no row qualifies.
    """
    cursor_.execute(
        "select user,cookie from cpwsw_user where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
    candidates = cursor_.fetchall()
    if not candidates:
        # No usable token was found.
        log.info("没有拿到token")
        return False
    return candidates[0]
# Fetch the document body
def getDoc(info_id,userCookie):
    """Request one document and return its decrypted content.

    Args:
        info_id: document id, sent as ``docId``.
        userCookie: value for the ``Cookie`` request header.

    Returns:
        The decrypted ``result`` string, or ``""`` when the response code is
        not 1 or the result cannot be decrypted. Both failures are logged.
    """
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': userCookie,
        'Host': 'wenshu.court.gov.cn',
        'Referer': 'https://wenshu.court.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    }
    # Build the `ciphertext` parameter: random salt as key, today's date as
    # IV, and the current timestamp as the encrypted payload.
    salt = ctx.call('cipher')
    date_now = time.strftime("%Y%m%d",time.localtime())
    t = time.time()
    eg = EncryptDate(date_now,salt)
    des = eg.encrypt(str(t))
    ciphertext = ctx.call("des",salt,date_now,des)
    token = ctx.call("token")
    data_info = {
        'docId':info_id,
        'ciphertext':ciphertext,
        'cfg':'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch',
        '__RequestVerificationToken':token,
        'wh':'250',
        'ww':'1536',
        'cs':'0'
    }
    ip = baseCore.get_proxy()
    res_info = requests.post(url=url,headers=headers,data=data_info,proxies=ip, verify=False)
    # Observed failure payloads:
    #   {'code': -12, 'description': None, ...}                     wrong SESSION value
    #   {'code': 9, 'description': '没有权限请求接口,cfg=...', ...}   no permission for the interface
    #   {"code":1,"description":"权限已失效","result":null, ...}      permission expired
    # Parse the body once instead of re-parsing it for every field.
    res_json = res_info.json()
    if res_json["code"] != 1:
        # The server did not return a normal result.
        log.error(f"正文获取失败:----{res_json}")
        return ""
    try:
        eg_jie = EncryptDate(date_now,res_json['secretKey'])
        res_jie = eg_jie.decrypt(res_json['result'])
    except Exception as e:
        # Log before returning; the log call used to sit after the return
        # statement and never ran.
        log.error(f"正文获取失败:----{e}")
        return ""
    return res_jie
# Store one page of search results
def insertCpwsList(keyword,page,list_info,userCookie):
    """Save the new entries of one result page and report whether to stop.

    For each entry not yet stored for ``keyword`` the document body is
    fetched with ``getDoc`` and inserted into ``cpwsw_list``. Entries whose
    body comes back empty are skipped.

    Args:
        keyword: search keyword the page belongs to.
        page: page number (used for logging only).
        list_info: iterable of result dicts from the list query.
        userCookie: cookie passed on to ``getDoc`` and ``updateCookie``.

    Returns:
        ``True`` when crawling of this keyword should stop (empty page, or at
        least half of the entries were already stored), otherwise ``False``.
    """
    listCount = 0
    repetCount = 0
    insertCount = 0
    for one_info in list_info:
        listCount += 1
        info_title = one_info['1']
        info_time = one_info['31']
        info_address = one_info['2']
        info_yuanyou = one_info['26']
        info_bianhao = one_info['7']
        info_id = one_info['rowkey']
        cursor_.execute(
            "select count(1) from cpwsw_list where keyword=%s and rowkey=%s",
            [keyword,info_id])
        if cursor_.fetchone()[0] > 0:
            repetCount += 1
            continue
        try:
            # Fetch the document body, then pause before the next request.
            log.info("开始采集正文")
            content = getDoc(info_id,userCookie)
            log.info("结束采集正文,开始休眠")
            time.sleep(random.randint(60, 180))
            if content=='':
                log.info("采集到的正文为空")
                continue
            insertSql = "insert into cpwsw_list (keyword,title,time,address,yuanyou,bianhao,rowkey,state,create_time,content) " \
                        "values (%s,%s,%s,%s,%s,%s,%s,0,now(),%s)"
            cursor_.execute(insertSql, [keyword,info_title,info_time,info_address,info_yuanyou,info_bianhao,info_id,content])
            cnx_.commit()
            # Count only rows that were actually committed, so the logged
            # "new" figure no longer includes skipped or failed entries.
            insertCount += 1
            updateCookie(userCookie, 1)
        except Exception as e:
            log.error(f"保存数据库失败:{e}")
    log.info(f"---{keyword}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
    if listCount == 0:
        # An empty page means there is nothing further to fetch.
        return True
    if repetCount >= listCount / 2:
        # Half or more duplicates: treat the keyword as caught up.
        return True
    # Not finished yet.
    return False
def getList(keyword,page):
    """Query one result page for ``keyword`` and store its entries.

    A cookie is taken from ``getCookie`` (polling once a minute until one is
    available). If the server answers with a code other than 1, the failure
    is recorded, the cookie is flagged with ``updateCookie(..., 3)`` and the
    request is retried with the next cookie. The retry is a loop rather than
    a recursive call, so repeated failures cannot exhaust the call stack.

    Args:
        keyword: search keyword (sent as ``s21``).
        page: page number to request.

    Returns:
        The result of ``insertCpwsList`` for the decrypted page.
    """
    while True:
        userAndCookie = getCookie()
        if not userAndCookie:
            log.info("没有拿到token,开始递归")
            while True:
                log.info("没有拿到token,开始休眠")
                time.sleep(60)
                log.info("没有拿到token,结束休眠")
                userAndCookie = getCookie()
                if userAndCookie:
                    break
        user = userAndCookie[0]
        userCookie = userAndCookie[1]
        log.info(f"获取到user----{user}")
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': userCookie,
            'Host': 'wenshu.court.gov.cn',
            'Referer': 'https://wenshu.court.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        }
        # Build the `ciphertext` parameter: random salt as key, today's date
        # as IV, and the current timestamp as the encrypted payload.
        salt = ctx.call('cipher')
        date_now = time.strftime("%Y%m%d", time.localtime())
        t = time.time()
        eg = EncryptDate(date_now, salt)
        des = eg.encrypt(str(t))
        ciphertext = ctx.call("des", salt, date_now, des)
        pageId = ctx.call("pageid")
        token = ctx.call("token")
        search_key = [{"key": "s21", "value": f"{keyword}"}]
        data = {
            'pageId':pageId,
            's21': keyword,
            'sortFields': 's51:desc',  # sort by judgment date
            'ciphertext': ciphertext,
            'pageNum': page,
            'pageSize': '5',
            'queryCondition': str(search_key),
            'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@queryDoc',
            '__RequestVerificationToken': token,
            'wh': '403',
            'ww': '1531',
            'cs': '0'
        }
        res = requests.post(url=url, headers=headers, data=data)
        # Parse the body once instead of re-parsing it for every field.
        resp_json = res.json()
        if resp_json["code"] != 1:
            # Abnormal response: record it, flag the cookie, try the next one.
            error = [resp_json["code"], resp_json["description"], resp_json["success"], user, keyword,'']
            insertBadSql(tuple(error))
            updateCookie(userCookie, 3)
            continue
        eg_jie = EncryptDate(date_now, resp_json['secretKey'])
        res_jie = eg_jie.decrypt(resp_json['result'])
        res_json = json.loads(res_jie)  # decrypted payload as JSON
        list_info = res_json['queryResult']['resultList']
        return insertCpwsList(keyword, page,list_info,userCookie)
# Crawl one keyword
def doJob(keyword):
    """Crawl result pages 1 to 5 for ``keyword``.

    After every page the function sleeps for a random 60-180 seconds, then
    stops early if ``getList`` reported that the keyword is finished.
    """
    log.info(f"======{keyword}----开始采集=======")
    page = 1
    while page <= 5:
        finished = getList(keyword, page)
        time.sleep(random.randint(60,180))
        if finished:
            # Finished: leave the loop for this keyword.
            break
        page += 1
    log.info(f"======{keyword}---------结束采集=======")
def test():
    """Placeholder; does nothing and returns ``None``."""
if __name__=="__main__":
    # Pull keywords from the 'cpwsqy' redis queue until it is empty and crawl
    # each one. close() sits in `finally` so the shared resources are released
    # even when a crawl raises.
    try:
        while True:
            keyword = baseCore.redicPullData('cpwsqy')
            if keyword is None or keyword == 'None':
                log.info("redis已经没有数据了,重新放置数据")
                break
            doJob(keyword)
    finally:
        baseCore.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论