Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
b8689932
提交
b8689932
authored
4月 03, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
微信公众号日志添加
上级
976e2fb4
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
44 行增加
和
36 行删除
+44
-36
oneWeixin2.py
comData/weixin_solo/oneWeixin2.py
+28
-9
test.py
comData/weixin_solo/test.py
+16
-27
wxList.py
comData/weixin_solo/wxList.py
+0
-0
没有找到文件。
comData/weixin_solo/oneWeixin2.py
浏览文件 @
b8689932
...
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
...
@@ -11,6 +11,7 @@ from bs4 import BeautifulSoup
from
obs
import
ObsClient
from
obs
import
ObsClient
from
kafka
import
KafkaProducer
from
kafka
import
KafkaProducer
from
retry
import
retry
from
base.BaseCore
import
BaseCore
from
base.BaseCore
import
BaseCore
baseCore
=
BaseCore
()
baseCore
=
BaseCore
()
...
@@ -64,6 +65,15 @@ def getjsonInfo():
...
@@ -64,6 +65,15 @@ def getjsonInfo():
cnx_
.
commit
()
cnx_
.
commit
()
return
dict_json
return
dict_json
@retry(tries=3, delay=2)
def getrequest(url_news):
    """Fetch ``url_news`` through a proxy and return the response.

    The ``retry`` decorator re-invokes this function (3 tries, 2 second
    delay) whenever it raises, so both transport errors from ``requests``
    and the non-200 error raised below lead to another attempt.

    Args:
        url_news: URL of the article page to download.

    Returns:
        The ``requests`` response object, guaranteed to have status 200.

    Raises:
        requests.exceptions.HTTPError: if the final attempt still answers
            with a status code other than 200.
    """
    # A proxy is requested on every attempt, so a retry does not reuse the
    # proxy value from the failed attempt unless get_proxy() returns it again.
    ip = baseCore.get_proxy()
    res_news = requests.get(url_news, proxies=ip, timeout=20)
    if res_news.status_code != 200:
        # A bare ``raise`` outside an ``except`` block only produces
        # "RuntimeError: No active exception to re-raise"; raise an explicit
        # error that carries the status code and URL instead.
        raise requests.exceptions.HTTPError(
            f"unexpected status {res_news.status_code} for {url_news}",
            response=res_news,
        )
    # The caller assigns the result of this call and reads ``.content`` from
    # it, so the response has to be handed back.
    return res_news
def
get_info
(
dict_json
):
def
get_info
(
dict_json
):
# list_all_info = []
# list_all_info = []
# num_caiji = 0
# num_caiji = 0
...
@@ -80,23 +90,32 @@ def get_info(dict_json):
...
@@ -80,23 +90,32 @@ def get_info(dict_json):
origin
=
dict_json
[
'site_name'
]
origin
=
dict_json
[
'site_name'
]
url_news
=
dict_json
[
'link'
]
url_news
=
dict_json
[
'link'
]
info_source_code
=
dict_json
[
'info_source_code'
]
info_source_code
=
dict_json
[
'info_source_code'
]
# url_ft = check_url(sid, url_news)
# if url_ft:
# try:
# return list_all_info,num_caiji
# ip = baseCore.get_proxy()
# res_news = requests.get(url_news, proxies=ip, timeout=20)
# except:
# #400请求失败
# updatewxLink(url_news, info_source_code, 400)
# return False
# 修改请求方法,retry 3次
try
:
try
:
ip
=
baseCore
.
get_proxy
()
res_news
=
getrequest
(
url_news
)
res_news
=
requests
.
get
(
url_news
,
proxies
=
ip
,
timeout
=
20
)
except
:
except
:
#400请求失败
try
:
updatewxLink
(
url_news
,
info_source_code
,
400
)
res_news
=
requests
.
get
(
url_news
,
timeout
=
20
)
return
False
except
:
# 400请求失败
updatewxLink
(
url_news
,
info_source_code
,
400
)
return
False
soup_news
=
BeautifulSoup
(
res_news
.
content
,
'html.parser'
)
soup_news
=
BeautifulSoup
(
res_news
.
content
,
'html.parser'
)
try
:
try
:
news_html
=
soup_news
.
find
(
'div'
,
{
'id'
:
'js_content'
})
news_html
=
soup_news
.
find
(
'div'
,
{
'id'
:
'js_content'
})
news_html
[
'style'
]
=
'width: 814px ; margin: 0 auto;'
news_html
[
'style'
]
=
'width: 814px ; margin: 0 auto;'
#del news_html['style']
#del news_html['style']
news_html
=
rm_style_attr
(
news_html
)
news_html
=
rm_style_attr
(
news_html
)
del
news_html
[
'id'
]
del
news_html
[
'id'
]
del
news_html
[
'class'
]
del
news_html
[
'class'
]
except
:
except
:
...
...
comData/weixin_solo/test.py
浏览文件 @
b8689932
...
@@ -41,32 +41,6 @@ import pandas as pd
...
@@ -41,32 +41,6 @@ import pandas as pd
# pass
# pass
import
redis
import
redis
from
base.BaseCore
import
BaseCore
baseCore
=
BaseCore
()
r
=
baseCore
.
r
key
=
'counter'
expiration_time
=
10
# 设置过期时间 60秒
# # 设置自增
# r.incr(key)
# # #自增并设置过期时间
# while True:
# # 设置自增
# r.incr(key)
# value = int(r.get(key).decode())
#
# if value > 10:
# print(value)
# # 设置过期时间
# r.expire(key, expiration_time)
# time.sleep(20)
# print('------------------')
# continue
# # print(value)
# time.sleep(5)
# print(value)
# print("==========")
def
check_url
():
def
check_url
():
r
=
redis
.
Redis
(
host
=
"114.115.236.206"
,
port
=
6379
,
password
=
'clbzzsn'
,
db
=
6
)
r
=
redis
.
Redis
(
host
=
"114.115.236.206"
,
port
=
6379
,
password
=
'clbzzsn'
,
db
=
6
)
...
@@ -76,7 +50,22 @@ def check_url():
...
@@ -76,7 +50,22 @@ def check_url():
print
(
'True'
)
print
(
'True'
)
else
:
else
:
print
(
'False'
)
print
(
'False'
)
check_url
()
# check_url()
def
test
(
dic_user_count
):
dic_user_count
[
"A"
]
+=
1
# print(dic_user.items())
for
key
,
value
in
dic_user_count
.
items
():
print
(
key
,
value
)
return
dic_user_count
# test1(): starts from the counter dict {"A": 0} and, in a loop over range(3),
# rebinds it to the value returned by test(); the dict is also printed.
# NOTE(review): this view has lost its indentation, so it cannot be told from
# here whether print(dic_user_count) runs on every pass or once after the
# loop -- confirm against the repository before restructuring.
# The trailing guard calls test1() only when the module is run as a script.
def
test1
():
dic_user_count
=
{
"A"
:
0
}
for
i
in
range
(
3
):
dic_user_count
=
test
(
dic_user_count
)
print
(
dic_user_count
)
if
__name__
==
"__main__"
:
test1
()
comData/weixin_solo/wxList.py
浏览文件 @
b8689932
差异被折叠。
点击展开。
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论