王景浩 / zzsn_spider · Commits

Commit a285ef8d
Authored Aug 23, 2023 by 薛凌堃
Parent: 1d84da3a

    微信公众号 (WeChat official accounts)
Showing 1 changed file with 132 additions and 126 deletions.

comData/weixin_solo/oneWeixin.py  (+132 / -126)
@@ -212,76 +212,30 @@ def get_info(sid,json_search,origin,url_,info_source_code,page):
            continue
    return list_all_info, num_caiji

-def job(count,key):
-    # refresh the browser and grab the current token and cookie
-    token, cookies = flushAndGetToken(list_b)
-    log.info('===========获取公众号============')
-    start_ = time.time()
-    # todo: pop one record from redis
-    infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
-    if infoSourceCode == 'None' or infoSourceCode == None:
-        # after a full pass, re-insert the data and wait for the insert to finish
-        getFromSql()
-        time.sleep(20)
-        log.info(f'========本次公众号已采集完毕,共采集{count}个公众号=========总耗时:{baseCore.getTimeCost(start_,time.time())}')
-        return count
-    sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
-    # '一带一路百人论坛'
-    # sql = f"-- SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = 'IN-20220609-57436' "
-    cursor.execute(sql)
-    row = cursor.fetchone()
-    dic_url = {
-        'url_': row[0],
-        'sid': row[1],
-        'name': row[2],
-        'info_source_code': row[3],
-        'biz': ''
-    }
-    log.info('===========获取biz==========')
-    s.cookies.update(cookies)
-    s.keep_alive = False
+def RequestUrl(dic_url, token, key):
+    start_ = time.time()
    url_ = dic_url['url_']
    origin = dic_url['name']
    info_source_code = dic_url['info_source_code']
    sid = dic_url['sid']
-    try:
-        biz = url_.split('__biz=')[1].split('==&')[0].split('=')[0]
-        dic_url['biz'] = biz
-    except Exception as e:
-        log.info(f'---公众号--{origin}---biz错误')
-        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        error = [origin, url_, info_source_code, e, 'biz错误', time_now]
-        insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
-        cursor_.execute(insertSql, tuple(error))
-        cnx_.commit()
-        return count
+    biz = dic_url['biz']
    fakeid = biz + '=='
    url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
-    # get the page count
+    ret = -1
+    json_search = ''
+    # get the page count
    try:
        # ip = baseCore.get_proxy()
        json_search = s.get(url_search, headers=headers, verify=False).json()  # , proxies=ip, verify=False
        str_t = json.dumps(json_search)
        time.sleep(1)
    except Exception as e:
        log.error(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}======={e}===')
        rePutIntoR(info_source_code)
        time.sleep(20)
-        return count
+        return json_search, ret
    ret = json_search['base_resp']['ret']
    # {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
    # TODO: check the return code to tell a banned account from a bad biz
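Note: a minimal sketch of the list_ex URL that RequestUrl assembles with an f-string, built here with urllib.parse.urlencode instead. build_list_url is a hypothetical helper, not part of the repo; the begin field is the 5-article offset that the pre-commit loop advanced per page, and the example fakeid/token values come from a commented-out URL further down in this diff.

    from urllib.parse import urlencode

    def build_list_url(fakeid, token, begin=0):
        # Same query fields, in the same order, as the f-string in RequestUrl.
        params = {'action': 'list_ex', 'begin': begin, 'count': 5,
                  'fakeid': fakeid, 'type': 9, 'query': '', 'token': token,
                  'lang': 'zh_CN', 'f': 'json', 'ajax': 1}
        # safe='=' keeps the fakeid's trailing '==' literal, as the f-string does.
        return 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + urlencode(params, safe='=')

    # Example values taken from the commented-out URL in the diff below.
    print(build_list_url('MzAwNDA5Njc1Mg==', '550883192'))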
@@ -304,7 +258,7 @@ def job(count,key):
        # browser_run.refresh()
        r.set(key, 50)
        r.expire(key, 5400)
-        return count
+        return json_search, ret
    elif ret == 200002:
        # bad official-account link: record the error info and error type in the DB
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
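Note: the r.set(key, 50) / r.expire(key, 5400) pair in the context above appears to park the key passed into job() for 90 minutes before it is retried. A sketch of the same effect in a single redis-py call, assuming a default local Redis and a made-up key name:

    import redis

    r = redis.Redis()                     # assumed local default instance
    r.set('wx:browser_key', 50, ex=5400)  # value 50, expiring after 5400 s (90 min)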
@@ -320,7 +274,7 @@ def job(count,key):
        cursor_.execute(insertSql, tuple(error))
        cnx_.commit()
        log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
-        return count
+        return json_search, ret
    elif ret == 200003:
        # invalid session
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
@@ -336,7 +290,7 @@ def job(count,key):
        cursor_.execute(insertSql, tuple(error))
        cnx_.commit()
        log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
-        return count
+        return json_search, ret
    else:
        log.info(f'----其他情况-----{json_search}---公众号{origin}------')
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
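Note: taken together, the three hunks above change every error branch of what is now RequestUrl to return the (json_search, ret) pair instead of count. A minimal sketch of how a caller can branch on that pair; handle_result and its messages are illustrative only, and the ret values are the ones named in this diff (-1 request failed, 0 success, 200002 bad link, 200003 invalid session):

    def handle_result(json_search, ret):
        if ret == -1:
            return 'request never reached the API (json_search is an empty string)'
        if ret == 0:
            return f"ok: {json_search.get('app_msg_cnt', 0)} articles listed"
        if ret == 200002:
            return 'bad official-account link (biz error)'
        if ret == 200003:
            return 'invalid session: refresh the token and cookies'
        return f'other base_resp.ret: {ret}'

    print(handle_result('', -1))
    print(handle_result({'base_resp': {'ret': 0}, 'app_msg_cnt': 23}, 0))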
@@ -351,80 +305,132 @@ def job(count,key):
        insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
        cursor_.execute(insertSql, tuple(error))
        cnx_.commit()
-    try:
-        Max_data = int(json_search['app_msg_cnt'])
-        Max_page = int(int(json_search['app_msg_cnt']) / 5)
-        if int(json_search['app_msg_cnt']) % 5 != 0:
-            Max_page = Max_page + 1
-        else:
-            Max_page = Max_page
-    except:
-        Max_page = 1
-        Max_data = 5
-    log.info(f'开始采集{origin}-----共{Max_page}页---{Max_data}条数据-----')
-    for i in range(0, Max_data, 5):
-        url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin={i}&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
-        # url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
-        # https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
-        try:
-            # ip = get_proxy()[random.randint(0, 3)]
-            json_search = s.get(url_search, headers=headers, verify=False).json()  # , proxies=ip, verify=False
-            str_t = json.dumps(json_search)
-            time.sleep(2)
-        except Exception as e:
-            log.error(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}======={e}===')
-            rePutIntoR(info_source_code)
-            return count
-        list_all = json_search['app_msg_list']
-        try:
-            # start collecting this page of article info
-            page = int(i / 5 + 1)
-            log.info(f'---{origin}---------开始采集第{page}个分页-----------')
-            list_all_info, num_caiji = get_info(sid, json_search, origin, url_, info_source_code, page)
-            print(f'----第{page}页采集到文章个数-----{len(list_all_info)}------{num_caiji}-------')
-            time.sleep(2)
-            if len(list_all_info) != 0:
-                count += 1
-                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                success = [
-                    origin,
-                    url_,
-                    info_source_code,
-                    '采集成功',
-                    num_caiji,
-                    time_now,
-                ]
-                # save the success record
-                insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
-                cursor_.execute(insertSql, tuple(success))
-                cnx_.commit()
-                # all articles of this official account collected
-                log.info(f'---第{page}页采集到文章个数---{len(list_all_info)}---{num_caiji}---耗时{baseCore.getTimeCost(start_,time.time())}')
-            else:
-                log.info(f'----第{page}页采集到文章个数{num_caiji}--网址已存在!-----耗时{baseCore.getTimeCost(start_,time.time())}')
-                return count
-        except Exception as e:
-            if json_search != '':
-                # JSON parsed for this account, but collecting the data failed
-                # list_all = json_search['app_msg_list']
-                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                false = [
-                    origin,
-                    url_,
-                    info_source_code,
-                    e,
-                    '采集失败',
-                    time_now
-                ]
-                # save the failure record
-                insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
-                cursor_.execute(insertSql, tuple(false))
-                cnx_.commit()
-                log.info(f'{fakeid}、公众号:{origin}采集失败!!!!!!耗时{baseCore.getTimeCost(start_, time.time())}')
-        count += 1
-        log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_, time.time())}')
-    time.sleep(2)
-    return count
+        return json_search, ret
+
+def job(count,key):
+    # refresh the browser and grab the current token and cookie
+    token, cookies = flushAndGetToken(list_b)
+    log.info('===========获取公众号============')
+    start_ = time.time()
+    # todo: pop one record from redis
+    infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
+    if infoSourceCode == 'None' or infoSourceCode == None:
+        # after a full pass, re-insert the data and wait for the insert to finish
+        getFromSql()
+        time.sleep(20)
+        log.info(f'========本次公众号已采集完毕,共采集{count}个公众号=========总耗时:{baseCore.getTimeCost(start_,time.time())}')
+        return count
+    sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
+    # '一带一路百人论坛'
+    # sql = f"-- SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = 'IN-20220609-57436' "
+    cursor.execute(sql)
+    row = cursor.fetchone()
+    dic_url = {
+        'url_': row[0],
+        'sid': row[1],
+        'name': row[2],
+        'info_source_code': row[3],
+        'biz': ''
+    }
+    log.info('===========获取biz==========')
+    s.cookies.update(cookies)
+    s.keep_alive = False
+    url_ = dic_url['url_']
+    origin = dic_url['name']
+    info_source_code = dic_url['info_source_code']
+    sid = dic_url['sid']
+    try:
+        biz = url_.split('__biz=')[1].split('==&')[0].split('=')[0]
+        dic_url['biz'] = biz
+    except Exception as e:
+        log.info(f'---公众号--{origin}---biz错误')
+        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        error = [origin, url_, info_source_code, e, 'biz错误', time_now]
+        insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
+        cursor_.execute(insertSql, tuple(error))
+        cnx_.commit()
+        return count
+    json_search, ret = RequestUrl(dic_url, token, key)
+    if ret == 0:
+        try:
+            Max_data = int(json_search['app_msg_cnt'])
+            Max_page = int(int(json_search['app_msg_cnt']) / 5)
+            if int(json_search['app_msg_cnt']) % 5 != 0:
+                Max_page = Max_page + 1
+            else:
+                Max_page = Max_page
+        except:
+            Max_page = 1
+            Max_data = 5
+        log.info(f'开始采集{origin}-----共{Max_page}页---{Max_data}条数据-----')
+        for i in range(0, Max_data, 5):
+            json_search, ret = RequestUrl(dic_url, token, key)
+            if ret == 0:
+                pass
+            else:
+                return count
+            try:
+                # start collecting this page of article info
+                page = int(i / 5 + 1)
+                log.info(f'---{origin}---------开始采集第{page}个分页-----------')
+                list_all_info, num_caiji = get_info(sid, json_search, origin, url_, info_source_code, page)
+                print(f'----第{page}页采集到文章个数-----{len(list_all_info)}------{num_caiji}-------')
+                time.sleep(2)
+                if len(list_all_info) != 0:
+                    count += 1
+                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                    success = [
+                        origin,
+                        url_,
+                        info_source_code,
+                        '采集成功',
+                        num_caiji,
+                        time_now,
+                    ]
+                    # save the success record
+                    insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
+                    cursor_.execute(insertSql, tuple(success))
+                    cnx_.commit()
+                    # all articles of this official account collected
+                    log.info(f'---第{page}页采集到文章个数---{len(list_all_info)}---{num_caiji}---耗时{baseCore.getTimeCost(start_,time.time())}')
+                else:
+                    log.info(f'----第{page}页采集到文章个数{num_caiji}--网址已存在!-----耗时{baseCore.getTimeCost(start_,time.time())}')
+                    return count
+            except Exception as e:
+                # JSON parsed for this account, but collecting the data failed
+                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                false = [
+                    origin,
+                    url_,
+                    info_source_code,
+                    e,
+                    '采集失败',
+                    time_now
+                ]
+                # save the failure record
+                insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
+                cursor_.execute(insertSql, tuple(false))
+                cnx_.commit()
+                log.info(f'{biz}、公众号:{origin}采集失败!!!!!!耗时{baseCore.getTimeCost(start_, time.time())}')
+        count += 1
+        log.info(f'{biz}、公众号{origin}:采集成功!、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_, time.time())}')
+        return count
+    else:
+        return count
+    time.sleep(2)
+    return count
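Note: one arithmetic detail in job() above: Max_page is app_msg_cnt divided by the page size of 5 and rounded up, with a fallback of Max_page = 1, Max_data = 5 when the count cannot be read. A small check of that math, with pages as a hypothetical stand-in for the inline logic:

    import math

    def pages(app_msg_cnt, per_page=5):
        # Same result as int(cnt / 5), plus one extra page when cnt % 5 != 0.
        return math.ceil(app_msg_cnt / per_page)

    assert pages(23) == 5   # 4 full pages of 5 articles, then a partial page of 3
    assert pages(20) == 4   # exact multiple: no extra page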