Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
cbfd2391
提交
cbfd2391
authored
10月 10, 2023
作者:
丁双波
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
裁判文书网
上级
afe226ba
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
331 行增加
和
0 行删除
+331
-0
裁判文书网.js
test/裁判文书网.js
+63
-0
裁判文书网列表正文.py
test/裁判文书网列表正文.py
+268
-0
没有找到文件。
test/裁判文书网.js
0 → 100644
浏览文件 @
cbfd2391
/**
 * Build a random alphanumeric string.
 *
 * @param {number} size  Number of characters to generate.
 * @returns {string} `size` characters drawn from 0-9, a-z and A-Z
 *                   (the empty string when size is 0 or negative).
 */
function r(size) {
    var alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    var out = "";
    for (var i = 0; i < size; i++) {
        // floor(random * length) gives every character the same probability.
        // The earlier round(random * (length - 1)) made the first and last
        // characters half as likely as the others.
        out += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
    }
    return out;
}
/**
 * Convert a string to the base-2 representation of its character codes.
 *
 * @param {string} str  Text to convert.
 * @returns {string} One binary number per character (no zero padding),
 *                   separated by single spaces; "" for an empty input.
 */
function strTobinary(str) {
    var chars = str.split("");
    var pieces = [];
    for (var idx = 0; idx < chars.length; idx++) {
        // charCodeAt(0) is the UTF-16 code unit of the one-character string
        pieces[idx] = chars[idx].charCodeAt(0).toString(2);
    }
    return pieces.join(" ");
}
/**
 * Produce the random 24-character salt for one request.
 *
 * The Python caller uses the returned value as the 3DES key and builds the
 * date-based IV (yyyymmdd) itself, so nothing else is computed here. The
 * previous body also built a timestamp and a year/month/day string that
 * were never used (and whose month lost its "+1" for October-December).
 *
 * @returns {string} 24 random alphanumeric characters from r().
 */
function cipher() {
    return r(24);
}
/**
 * Assemble the request "ciphertext" field.
 *
 * @param {string} salt  Random salt (see cipher()).
 * @param {string} iv    Date string used as the IV.
 * @param {string} enc   Already-encrypted timestamp supplied by the caller
 *                       (the encryption itself is not done in this file).
 * @returns {string} salt + iv + enc, rendered as space-separated binary
 *                   character codes by strTobinary().
 */
function des(salt, iv, enc) {
    return strTobinary(salt + iv + enc);
}
/**
 * Generate the random value sent as __RequestVerificationToken.
 *
 * @returns {string} 24 characters drawn from 0-9, a-z and A-Z.
 */
function token() {
    var size = 24;
    var alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
    var out = "";
    for (var i = 0; i < size; i++) {
        // Uniform draw: floor(random * length). The earlier
        // round(random * (length - 1)) under-weighted the first and last
        // characters of the alphabet.
        out += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
    }
    return out;
}
/**
 * Generate a random page identifier.
 *
 * @returns {string} 32 characters drawn uniformly from A-Z, a-z and 0-9.
 */
function pageid() {
    var possible = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    var wanted = 32;
    var picked = [];
    while (picked.length < wanted) {
        var pos = Math.floor(Math.random() * possible.length);
        picked.push(possible.charAt(pos));
    }
    return picked.join("");
}
// console.log(cipher());
\ No newline at end of file
test/裁判文书网列表正文.py
0 → 100644
浏览文件 @
cbfd2391
import
base64
import
base64
import
json
import
random
import
time
import
execjs
import
requests
import
urllib3
from
Crypto.Cipher
import
DES3
from
base.BaseCore
import
BaseCore
# Silence urllib3's InsecureRequestWarning (getDoc below posts with verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Shared project helper: this module takes its logger, database handles,
# proxies and the redis work queue from it.
baseCore = BaseCore()
log = baseCore.getLogger()
# Module-wide database connection and cursor used by every SQL helper below.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Save an error record to the log table
def insertBadSql(error):
    """Insert one failure record into ``cpwsw_log`` and commit.

    ``error`` is an iterable of six values in column order
    (code, description, success, user, keyword, msg); ``create_time`` is
    filled in by the database through ``now()``.
    """
    statement = (
        "insert into cpwsw_log (code,description,success,create_time,user,keyword,msg) "
        "values (%s,%s,%s,now(),%s,%s,%s)"
    )
    params = tuple(error)
    cursor_.execute(statement, params)
    cnx_.commit()
# Cookie bookkeeping
def updateCookie(cookie, type):
    """Record the outcome of using ``cookie`` in ``cpwsw_user`` and commit.

    ``type`` selects the action:
      1 -- normal use: refresh ``update_time``
      2 -- session expired: delete the row
      3 -- unknown failure: stamp ``fenghao_time`` (getCookie skips rows
           stamped within the last two hours)
    Any other value runs no statement; the commit still happens.
    """
    if type == 1:
        statement = "update cpwsw_user set update_time=now() where cookie=%s"
    elif type == 2:
        statement = "delete from cpwsw_user where cookie=%s"
    elif type == 3:
        statement = "update cpwsw_user set fenghao_time=now() where cookie=%s"
    else:
        statement = None

    if statement is not None:
        cursor_.execute(statement, [cookie])
    cnx_.commit()
# Triple-DES (DES3) encryption/decryption wrapped in a class
class EncryptDate:
    """Triple-DES helper in CBC mode with base64 text encoding.

    Args:
        pianyi: IV ("offset") as text; it is encoded to UTF-8 bytes.
        key: secret key, passed unchanged to ``DES3.new``.

    CBC cipher objects are stateful, so ``encrypt`` and ``decrypt`` each
    build a fresh cipher from the stored key and IV. Previously a single
    cipher object was shared, which made any second call on the same
    instance produce wrong output or raise.
    """

    def __init__(self, pianyi, key):
        self.key = key  # secret key
        self.iv = bytes(pianyi, encoding='utf8')  # initialisation vector
        self.length = DES3.block_size  # cipher block size in bytes
        # Kept as an attribute for backward compatibility; building it here
        # also keeps key/IV errors surfacing at construction time.
        self.des3 = self._new_cipher()

    def _new_cipher(self):
        """Return a new 3DES-CBC cipher for this instance's key and IV."""
        return DES3.new(self.key, DES3.MODE_CBC, self.iv)

    @staticmethod
    def unpad(date):
        """Strip the padding: the last character's code is the pad length."""
        return date[0:-ord(date[-1])]

    def pad(self, text):
        """Pad ``text`` so its UTF-8 length is a multiple of the block size.

        Appends ``n`` copies of ``chr(n)``, where ``n`` is the number of
        bytes missing; a full block is added when the text is already
        aligned.
        """
        count = len(text.encode('utf-8'))
        add = self.length - (count % self.length)
        return text + (chr(add) * add)

    def encrypt(self, encrData):
        """Encrypt the string ``encrData`` and return base64 text."""
        raw = self._new_cipher().encrypt(self.pad(encrData).encode("utf8"))
        return str(base64.b64encode(raw), encoding="utf8")

    def decrypt(self, decrData):
        """Decrypt base64 text ``decrData`` and return the unpadded string."""
        raw = base64.decodebytes(decrData.encode("utf8"))
        msg = self._new_cipher().decrypt(raw).decode("utf8")
        return self.unpad(msg)
# Load the helper JavaScript. The path is relative, so it is resolved
# against the current working directory.
with open('裁判文书网.js', 'r', encoding='utf-8') as f:
    jstext = f.read()
# Compile the JS so its functions can be called from Python via ctx.call(...)
ctx = execjs.compile(jstext)
print("ok")
# Endpoint that both getDoc() and getList() POST to
url = 'https://wenshu.court.gov.cn/website/parse/rest.q4w'
# Fetch a login cookie
def getCookie():
    """Pick the least recently used login that is not cooling down.

    Selects one ``(user, cookie)`` row from ``cpwsw_user`` whose
    ``fenghao_time`` is more than two hours old, ordered by ``update_time``
    ascending.

    Returns:
        The selected row, or ``False`` when no row qualifies.
    """
    cursor_.execute("select user,cookie from cpwsw_user where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
    rows = cursor_.fetchall()
    if not rows:
        # No usable token found
        log.info("没有拿到token")
        return False
    return rows[0]
# Fetch the document body
def getDoc(info_id, userCookie):
    """Request one document and return its decrypted content.

    Args:
        info_id: document id sent as ``docId``.
        userCookie: value of the ``Cookie`` request header.

    Returns:
        The decrypted ``result`` field as a string, or ``""`` when the
        server reports a code other than 1 or decryption fails.
    """
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': userCookie,
        'Host': 'wenshu.court.gov.cn',
        'Referer': 'https://wenshu.court.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    }
    salt = ctx.call('cipher')
    date_now = time.strftime("%Y%m%d", time.localtime())
    t = time.time()
    # date_now is the IV, the salt from the JS helper is the key
    eg = EncryptDate(date_now, salt)
    des = eg.encrypt(str(t))  # 3DES-encrypt the current timestamp
    ciphertext = ctx.call("des", salt, date_now, des)
    token = ctx.call("token")
    data_info = {
        'docId': info_id,
        'ciphertext': ciphertext,
        'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch',
        '__RequestVerificationToken': token,
        'wh': '250',
        'ww': '1536',
        'cs': '0'
    }
    ip = baseCore.get_proxy()
    res_info = requests.post(url=url, headers=headers, data=data_info, proxies=ip, verify=False)
    # Responses observed by the original author:
    #   code -12, success False            -> the SESSION value is wrong
    #   code 9,   success False            -> no permission to call this cfg
    #   code 1,   secretKey/result null    -> "permission expired", success true
    # Parse the body once instead of on every field access.
    payload = res_info.json()
    if payload["code"] != 1:
        log.error(f"正文获取失败:----{payload}")
        # The server did not return a normal result
        return ""
    try:
        eg_jie = EncryptDate(date_now, payload['secretKey'])
        res_jie = eg_jie.decrypt(payload['result'])  # 3DES-decrypt
    except Exception as e:
        # Log before returning; this call used to sit after the return
        # and was never reached.
        log.error(f"正文获取失败:----{e}")
        return ""
    return res_jie
# Store one page of search results
def insertCpwsList(keyword, page, list_info, userCookie):
    """Save the new entries of one result page, fetching each body first.

    For every entry not yet in ``cpwsw_list`` (matched on keyword + rowkey)
    the body is fetched with ``getDoc``, the crawler sleeps 60-180 s, and
    the row is inserted unless the body came back empty.

    Args:
        keyword: search keyword the page belongs to.
        page: page number (used for logging only).
        list_info: iterable of result dicts; keys '1', '31', '2', '26',
            '7' and 'rowkey' are read.
        userCookie: cookie forwarded to ``getDoc`` / ``updateCookie``.

    Returns:
        ``True`` when crawling of this keyword should stop (empty page, or
        at least half of the entries were already stored), else ``False``.
    """
    listCount = 0
    repetCount = 0
    insertCount = 0
    for one_info in list_info:
        listCount += 1
        info_title = one_info['1']
        info_time = one_info['31']
        info_address = one_info['2']
        info_yuanyou = one_info['26']
        info_bianhao = one_info['7']
        info_id = one_info['rowkey']

        selectCountSql = "select count(1) from cpwsw_list where keyword=%s and rowkey=%s"
        cursor_.execute(selectCountSql, [keyword, info_id])
        if cursor_.fetchone()[0] > 0:
            repetCount += 1
            continue

        try:
            # Fetch the document body
            log.info("开始采集正文")
            content = getDoc(info_id, userCookie)
            log.info("结束采集正文,开始休眠")
            time.sleep(random.randint(60, 180))
            if content == '':
                log.info("采集到的正文为空")
                continue
            insertSql = (
                "insert into cpwsw_list (keyword,title,time,address,yuanyou,bianhao,rowkey,state,create_time,content) "
                "values (%s,%s,%s,%s,%s,%s,%s,0,now(),%s)"
            )
            cursor_.execute(
                insertSql,
                [keyword, info_title, info_time, info_address,
                 info_yuanyou, info_bianhao, info_id, content],
            )
            cnx_.commit()
            # Count only rows that were really written. The counter used to
            # be bumped before the attempt, so skipped or failed records
            # were reported as added.
            insertCount += 1
            updateCookie(userCookie, 1)
        except Exception as e:
            log.error(f"保存数据库失败:{e}")

    log.info(f"---{keyword}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
    if listCount == 0:
        # An empty list means there is nothing more to crawl
        return True
    if repetCount >= listCount / 2:
        # Half or more duplicates: treat the keyword as finished
        return True
    # Not finished
    return False
def getList(keyword, page):
    """Fetch one page of search results for ``keyword`` and store it.

    Waits (polling every 60 s) until ``getCookie`` yields a login, posts the
    query, and on success hands the decrypted result list to
    ``insertCpwsList``. When the server answers with a code other than 1
    the failure is logged to the database, the cookie is stamped as failed
    and the whole attempt is repeated with the next cookie.

    The retry is a loop rather than the former self-recursion, so a long
    run of failures cannot exhaust the interpreter's recursion limit.

    Returns:
        The value returned by ``insertCpwsList`` (``True`` = stop crawling
        this keyword).
    """
    while True:
        userAndCookie = getCookie()
        if not userAndCookie:
            log.info("没有拿到token,开始递归")
            while not userAndCookie:
                log.info("没有拿到token,开始休眠")
                time.sleep(60)
                log.info("没有拿到token,结束休眠")
                userAndCookie = getCookie()
        user = userAndCookie[0]
        userCookie = userAndCookie[1]
        log.info(f"获取到user----{user}")

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': userCookie,
            'Host': 'wenshu.court.gov.cn',
            'Referer': 'https://wenshu.court.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        }
        salt = ctx.call('cipher')
        date_now = time.strftime("%Y%m%d", time.localtime())
        t = time.time()
        # date_now is the IV, the salt from the JS helper is the key
        eg = EncryptDate(date_now, salt)
        des = eg.encrypt(str(t))  # 3DES-encrypt the current timestamp
        ciphertext = ctx.call("des", salt, date_now, des)
        pageId = ctx.call("pageid")
        token = ctx.call("token")
        search_key = [{"key": "s21", "value": f"{keyword}"}]
        data = {
            'pageId': pageId,
            's21': keyword,
            'sortFields': 's51:desc',  # sort by judgment date
            'ciphertext': ciphertext,
            'pageNum': page,
            'pageSize': '5',
            'queryCondition': str(search_key),
            'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@queryDoc',
            '__RequestVerificationToken': token,
            'wh': '403',
            'ww': '1531',
            'cs': '0'
        }
        res = requests.post(url=url, headers=headers, data=data)
        payload = res.json()  # parse once, reuse below
        if payload["code"] != 1:
            # Abnormal response: record it, mark the cookie, try again
            error = [payload["code"], payload["description"], payload["success"], user, keyword, '']
            insertBadSql(tuple(error))
            updateCookie(userCookie, 3)
            continue

        eg_jie = EncryptDate(date_now, payload['secretKey'])
        res_jie = eg_jie.decrypt(payload['result'])
        res_json = json.loads(res_jie)  # decrypted payload is JSON text
        list_info = res_json['queryResult']['resultList']
        return insertCpwsList(keyword, page, list_info, userCookie)
# Crawl one keyword
def doJob(keyword):
    """Crawl up to five result pages for ``keyword``.

    After each page the crawler sleeps 60-180 s; it stops early as soon as
    ``getList`` returns a truthy value.
    """
    log.info(f"======{keyword}----开始采集=======")
    for page in range(1, 6):
        finished = getList(keyword, page)
        pause = random.randint(60, 180)
        time.sleep(pause)
        if finished:
            # Done with this keyword
            break
    log.info(f"======{keyword}---------结束采集=======")
def test():
    """Placeholder; intentionally does nothing."""
    return None
if __name__ == "__main__":
    # Pull keywords from the redis list 'cpwsqy' until it is exhausted.
    # try/finally makes sure baseCore is closed even when a job raises.
    try:
        while True:
            keyword = baseCore.redicPullData('cpwsqy')
            # Stop on a missing value or the literal string 'None'
            if keyword is None or keyword == 'None':
                log.info("redis已经没有数据了,重新放置数据")
                break
            doJob(keyword)
    finally:
        baseCore.close()
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论