王景浩 / zzsn_spider / Commits

Commit 7bada3ac
Authored Aug 22, 2023 by LiuLiYuan
Parents: a237c960, f0246e72

    Merge remote-tracking branch 'origin/master'

Showing 6 changed files with 144 additions and 118 deletions (+144 −118)
Changed files:
- base/BaseCore.py (+5 −4)
- base/RedisPPData.py (+43 −15)
- comData/weixin_solo/oneWeixin.py (+81 −65)
- 百度采集/baidu_comm/baiduSpider.py (+9 −9)
- 百度采集/baidu_comm/baidutaskJob_loc.py (+2 −24)
- 百度采集/baidu_comm/requirements.txt (+4 −1)
base/BaseCore.py

@@ -17,14 +17,15 @@ import langid
 # Note: call BaseCore.close() before the program exits to release resources
 class BaseCore:
     # Sequence number
     __seq = 0
     # Proxy-pool database connection
     __cnx_proxy = None
     __cursor_proxy = None
     cnx = None
     cursor = None
     r = None
     # User-agent pool
     __USER_AGENT_LIST = [
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
     ...

@@ -392,7 +393,7 @@ class BaseCore:
     # Fetch and remove one element from a Redis list
     def redicPullData(self, key):
-        item = self.r.rpop(key)
+        item = self.r.lpop(key)
         return item.decode() if item else None

     # Get the PID of the script's process
     ...
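Note: the rpop-to-lpop switch changes consumption order. Since the producer side (RedisPPData.py below) enqueues codes with rpush, popping from the left turns the list into a FIFO queue rather than a LIFO stack. A minimal sketch of the two behaviors, assuming a local Redis instance:

    import redis

    r = redis.Redis(host='localhost', port=6379, db=0)  # assumed local instance
    r.delete('demo:queue')
    r.rpush('demo:queue', 'a', 'b', 'c')  # producer appends to the right

    print(r.lpop('demo:queue'))  # b'a' -- FIFO: oldest item first (new behavior)
    print(r.rpop('demo:queue'))  # b'c' -- LIFO: newest item first (old behavior)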
@@ -480,7 +481,7 @@ class BaseCore:
     def writerToExcel(self, detailList, filename):
         # filename='baidu搜索.xlsx'
         # Read the existing xlsx file
-        existing_data = pd.read_excel(filename, engine='openpyxl')
+        existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
         # Create the new data
         new_data = pd.DataFrame(data=detailList)
         # Append the new data to the end of the existing data
         ...
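Reading back with dtype=str keeps numeric-looking codes (social-credit codes, IDs) from being coerced to floats on the round trip. A sketch of the read-append-write cycle the method appears to perform; the concat and write steps are assumptions, since the rest of the hunk is elided:

    import pandas as pd

    def append_to_excel(detail_list, filename):
        # dtype=str prevents pandas from mangling numeric-looking codes
        existing = pd.read_excel(filename, engine='openpyxl', dtype=str)
        new = pd.DataFrame(data=detail_list)
        combined = pd.concat([existing, new], ignore_index=True)
        combined.to_excel(filename, index=False, engine='openpyxl')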
base/RedisPPData.py

import time
import pymysql
from base import BaseCore
from apscheduler.schedulers.blocking import BlockingScheduler

basecore = BaseCore.BaseCore()
log = basecore.getLogger()
# The 144 database
cnx = basecore.cnx
cursor = basecore.cursor
r = basecore.r
# The 11 database
cnx_ = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor_ = cnx_.cursor()
# # Connect to Redis
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
...

@@ -32,9 +39,9 @@ r = basecore.r
 # Company news
 def NewsEnterprise():
     # # Get domestic companies
-    # gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
-    # cursor.execute(gn_query)
-    # gn_result = cursor.fetchall()
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
+    cursor.execute(gn_query)
+    gn_result = cursor.fetchall()
     # Get foreign companies
     gw_query = "select SocialCode from EnterpriseInfo where Place = '2'"
     cursor.execute(gw_query)
     ...

@@ -42,20 +49,20 @@ def NewsEnterprise():
     gw_social_list = [item[0] for item in gw_result]
     # todo: print the length
     print(len(gw_social_list))
     gn_social_list = [item[0] for item in gn_result]
     print('=======')
     # Push the data into redis
     # for item in gn_social_list:
     #     r.rpush('NewsEnterprise:gnqy_socialCode', item)
     count = 0
     for item in gw_social_list:
         r.rpush('NewsEnterprise:gwqy_socialCode', item)
         count += 1
         print(item)
         print(count)

 # Scheduled task for company news
 def NewsEnterprise_task():
     # Instantiate a scheduler
     ...

@@ -140,6 +147,29 @@ def BaseInfoEnterprise_task():
         print('定时采集异常', e)
     pass

+# Eastmoney (东方财富网) financial data
+def FinanceFromEast():
+    # Read from the listed-company table
+    sql_sel = '''select social_credit_code from sys_base_enterprise_ipo where category = '1' limit 10 '''
+    cursor_.execute(sql_sel)
+    finance = cursor_.fetchall()
+    finance_list = [item[0] for item in finance]
+    print('=======')
+    for item in finance_list:
+        r.rpush('FinanceFromEast:finance_socialCode', item)
+
+def FinanceFromEase_task():
+    # Instantiate a scheduler
+    scheduler = BlockingScheduler()
+    # Run once every quarter
+    scheduler.add_job(FinanceFromEast, 'cron', month='1-12/3', day='1', hour=0, minute=0)
+    try:
+        # redisPushData  # run once before the schedule starts
+        scheduler.start()
+    except Exception as e:
+        print('定时采集异常', e)
+        pass

 # WeChat official accounts
 def WeiXingetFromSql():
     selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
     ...

@@ -207,9 +237,6 @@ def FBS():
         r.rpush('NewsEnterpriseFbs:gnqy_socialCode', item)

 if __name__ == "__main__":
     start = time.time()
     # NoticeEnterprise()
     ...

@@ -222,8 +249,9 @@ if __name__ == "__main__":
     # FBS()
     # NoticeEnterprise_task()
     # AnnualEnterprise_task()
-    NoticeEnterprise()
+    # NoticeEnterprise()
+    FinanceFromEast()
     log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
-    # cnx.close()
-    # cursor.close()
+    cnx_.close()
+    cursor_.close()
     # basecore.close()
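For reference, the 'cron' trigger above fires FinanceFromEast at 00:00 on the 1st of months 1, 4, 7, and 10 (month='1-12/3' steps through the year in threes). A self-contained sketch with a stub job standing in for the real collector:

    from apscheduler.schedulers.blocking import BlockingScheduler

    def quarterly_job():
        print('runs at 00:00 on Jan 1, Apr 1, Jul 1, Oct 1')

    scheduler = BlockingScheduler()
    scheduler.add_job(quarterly_job, 'cron', month='1-12/3', day='1', hour=0, minute=0)
    scheduler.start()  # blocks; Ctrl+C to stop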
comData/weixin_solo/oneWeixin.py

@@ -41,30 +41,6 @@ def add_url(sid, article_url):
     else:
         return False

-def get_proxy():
-    cnx = pymysql.connect(host="114.115.159.144", user="root", password="zzsn9988", db="clb_project", charset="utf8mb4")
-    with cnx.cursor() as cursor:
-        sql = "select proxy from clb_proxy"
-        cursor.execute(sql)
-        proxy_lists = cursor.fetchall()
-    ip_list = []
-    for proxy_ in proxy_lists:
-        ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
-    proxy_list = []
-    for str_ip in ip_list:
-        str_ip_list = str_ip.split('-')
-        proxyMeta = "http://%(host)s:%(port)s" % {
-            "host": str_ip_list[0],
-            "port": str_ip_list[1],
-        }
-        proxy = {
-            "HTTP": proxyMeta,
-            "HTTPS": proxyMeta
-        }
-        proxy_list.append(proxy)
-    return proxy_list
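With the module-local get_proxy() removed, the article fetches later in this diff call baseCore.get_proxy() instead. One detail worth noting if the removed pattern is ever reused: requests matches proxies by URL scheme, which is lowercase, so uppercase "HTTP"/"HTTPS" keys as in the deleted helper would never match. A minimal sketch of the same proxy-dict shape, with a hard-coded address standing in for the clb_proxy database lookup:

    import requests

    host, port = '127.0.0.1', 8888  # stand-in for the clb_proxy lookup
    proxy_meta = "http://%(host)s:%(port)s" % {"host": host, "port": port}
    proxies = {"http": proxy_meta, "https": proxy_meta}  # lowercase scheme keys

    # resp = requests.get('https://mp.weixin.qq.com', proxies=proxies, timeout=20)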
 # Scheduled
 def getFromSql():
     selectSql = "SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'"
     cursor.execute(selectSql)
     ...

@@ -93,7 +69,7 @@ def flushAndGetToken(list_b):
 def rePutIntoR(item):
     r.rpush('WeiXinGZH:infoSourceCode', item)

-def get_info(sid, json_search, origin, info_source_code):
+def get_info(sid, json_search, origin, url_, info_source_code, page):
     list_all_info = []
     num_caiji = 0
     kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     ...
@@ -103,7 +79,7 @@ def get_info(sid,json_search,origin,info_source_code):
         server='https://obs.cn-north-1.myhuaweicloud.com'  # your bucket's endpoint
     )
     list_all_news = json_search['app_msg_list']
     # Which article in the batch is being collected
     for one_news in list_all_news:
         news_title = one_news['title']
         timestamp = one_news['create_time']
         ...

@@ -114,10 +90,12 @@ def get_info(sid,json_search,origin,info_source_code):
         url_ft = check_url(sid, url_news)
         if url_ft:
-            log.info(f'已采过该篇文章----文章链接-----{url_news}')
+            log.info(f'-----{origin}--第{page}页--已采过该篇文章--文章链接--{url_news}-----')
             return list_all_info, num_caiji
         try:
-            res_news = requests.get(url_news, timeout=20)
+            ip = baseCore.get_proxy()
+            res_news = requests.get(url_news, timeout=20, proxies=ip)
+            time.sleep(2)
         except:
             continue
         soup_news = BeautifulSoup(res_news.content, 'html.parser')
         ...
@@ -132,16 +110,17 @@ def get_info(sid,json_search,origin,info_source_code):
         try:
             news_content = news_html.text
         except:
-            log.info(f'--------内容为空--------{url_news}--------')
+            log.info(f'----{origin}--第{page}页--该篇文章内容为空--{url_news}-----')
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            false = [news_title, url_news, news_html, '文章内容为空', time_now]
-            insertSql = f"insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
+            false = [news_title, url_, info_source_code, '文章内容为空', time_now, url_news]
+            insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,error_type,create_time,news_url) values (%s,%s,%s,%s,%s,%s)"
             cursor_.execute(insertSql, tuple(false))
             cnx_.commit()
             continue
@@ -182,6 +161,8 @@ def get_info(sid,json_search,origin,info_source_code):
         section.name = 'div'
+        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        # Send the record to kafka
         dic_info = {
             'sid': sid,
             'title': news_title,
             ...

@@ -196,8 +177,8 @@ def get_info(sid,json_search,origin,info_source_code):
             'createDate': time_now
         }
         for nnn in range(0, 3):
             try:
                 producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                 try:
                     kafka_result = producer.send("crawlerInfo", json.dumps(dic_info, ensure_ascii=False).encode('utf8'))
                     kafka_time_out = kafka_result.get(timeout=10)
                     add_url(sid, url_news)
                     ...

@@ -205,8 +186,11 @@ def get_info(sid,json_search,origin,info_source_code):
                 except:
                     time.sleep(5)
                     continue
+            finally:
+                producer.close()
         num_caiji = num_caiji + 1
         list_all_info.append(dic_info)
+        time.sleep(5)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     dic_info2 = {
         'infoSourceId': sid,
         ...
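The send is wrapped in a bounded retry (three attempts), and the blocking kafka_result.get(timeout=10) forces the asynchronous send to either confirm delivery or raise inside the loop. A stripped-down sketch of that pattern, with the broker address and topic as assumptions:

    import json
    import time
    from kafka import KafkaProducer  # kafka-python

    def send_with_retry(record, attempts=3):
        for _ in range(attempts):
            producer = KafkaProducer(bootstrap_servers=['localhost:9092'])  # assumed broker
            try:
                future = producer.send('crawlerInfo', json.dumps(record, ensure_ascii=False).encode('utf8'))
                future.get(timeout=10)  # block until acked, or raise
                return True
            except Exception:
                time.sleep(5)  # back off, then retry with a fresh producer
            finally:
                producer.close()
        return False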
@@ -235,18 +219,18 @@ def job(count,key):
     log.info('===========获取公众号============')
     start_ = time.time()
-    # # todo: pop one record from redis
+    # todo: pop one record from redis
     infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
-    if infoSourceCode == 'None':
+    if infoSourceCode == 'None' or infoSourceCode == None:
         # After one full pass, re-insert the data and wait for the insert to finish
         getFromSql()
         time.sleep(20)
         log.info(f'========本次公众号已采集完毕,共采集{count}个公众号=========总耗时:{baseCore.getTimeCost(start_,time.time())}')
-        return
+        return count

     sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = '{infoSourceCode}' "
     # '一带一路百人论坛'
     # sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where site_name = '一带一路百人论坛' "
     # sql = f"SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = 'IN-20220609-57436' "
     cursor.execute(sql)
     row = cursor.fetchone()
     ...
@@ -282,28 +266,28 @@ def job(count,key):
         insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
-        return
+        return count
     fakeid = biz + '=='
     url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
     # https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
     # Get the page count
     try:
-        ip = get_proxy()[random.randint(0, 3)]
-        json_search = s.get(url_search, headers=headers, proxies=ip, verify=False).json()
+        # ip = baseCore.get_proxy()
+        json_search = s.get(url_search, headers=headers, verify=False).json()  # , proxies=ip, verify=False
         str_t = json.dumps(json_search)
-        time.sleep(2)
+        time.sleep(1)
     except Exception as e:
         log.error(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}======={e}===')
         rePutIntoR(info_source_code)
-        # {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
-        return
+        time.sleep(20)
+        return count
-    ret = json_search['base_resp']['ret']
     # {"base_resp": {"ret": 200003, "err_msg": "invalid session"}}
     # TODO: inspect the return code to tell a banned account from a bad biz
     # {'base_resp': {'err_msg': 'freq control', 'ret': 200013}} ========= account rate-limited/banned
     # {'base_resp': {'err_msg': 'invalid args', 'ret': 200002}} bad official-account biz / link
     # 'base_resp': {'err_msg': 'ok', 'ret': 0} normal
+    ret = json_search['base_resp']['ret']
     if ret == 0:
         pass
     elif ret == 200013:
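The ret codes enumerated in the comments drive the branches that follow. For readability, the same dispatch could be tabulated; a sketch, with the codes taken from the comments above and the actions deliberately simplified:

    # Meaning of mp.weixin.qq.com base_resp.ret codes, per the comments above
    RET_ACTIONS = {
        0:      'ok -- continue collecting',
        200013: 'freq control -- rate limited/banned, pause this session',
        200002: 'invalid args -- bad official-account biz, record and skip',
        200003: 'invalid session -- token expired, re-login needed',
    }

    def describe_ret(ret):
        return RET_ACTIONS.get(ret, f'unexpected ret {ret} -- log the raw response')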
@@ -318,10 +302,9 @@ def job(count,key):
             # log.info('=======等待时间600秒=====刷新浏览器=====')
             # browser_run = list_b[0]
             # browser_run.refresh()
             r.set(key, 50)
-            r.expire(key, 3600)
-            return
+            r.expire(key, 5400)
+            return count
         elif ret == 200002:
             # Bad official-account link: save to the DB with the error message and type
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             ...
@@ -336,8 +319,8 @@ def job(count,key):
         insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
-        log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_,time.time())}')
-        return
+        log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
+        return count
     elif ret == 200003:
         # Invalid session
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         ...
@@ -353,7 +336,7 @@ def job(count,key):
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
         log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
-        return
+        return count
     else:
         log.info(f'----其他情况-----{json_search}---公众号{origin}------')
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         ...
@@ -368,12 +351,41 @@ def job(count,key):
         insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
         cursor_.execute(insertSql, tuple(error))
         cnx_.commit()
-        return
+        return count

+    try:
+        Max_data = int(json_search['app_msg_cnt'])
+        Max_page = int(int(json_search['app_msg_cnt']) / 5)
+        if int(json_search['app_msg_cnt']) % 5 != 0:
+            Max_page = Max_page + 1
+        else:
+            Max_page = Max_page
+    except:
+        Max_page = 1
+        Max_data = 5
+    log.info(f'开始采集{origin}-----共{Max_page}页---{Max_data}条数据-----')
+    for i in range(0, Max_data, 5):
+        url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin={i}&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
+        # url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
+        # https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAwNDA5Njc1Mg==&type=9&query=&token=550883192&lang=zh_CN&f=json&ajax=1
+        try:
+            # ip = get_proxy()[random.randint(0, 3)]
+            json_search = s.get(url_search, headers=headers, verify=False).json()  # , proxies=ip, verify=False
+            str_t = json.dumps(json_search)
+            time.sleep(2)
+        except Exception as e:
+            log.error(f'===公众号{origin}请求失败!当前时间:{baseCore.getNowTime(1)}======={e}===')
+            rePutIntoR(info_source_code)
+            return count
         list_all = json_search['app_msg_list']
         try:
-            list_all_info, num_caiji = get_info(sid, json_search, origin, info_source_code)
-            print(f'----------{len(list_all_info)}------{num_caiji}-------')
+            # Start collecting the article info for this page
+            page = int(i / 5 + 1)
+            log.info(f'---{origin}---------开始采集第{page}个分页-----------')
+            list_all_info, num_caiji = get_info(sid, json_search, origin, url_, info_source_code, page)
+            print(f'----第{page}页采集到文章个数-----{len(list_all_info)}------{num_caiji}-------')
             time.sleep(2)
             if len(list_all_info) != 0:
                 count += 1
                 ...
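The Max_page arithmetic is a hand-rolled ceiling division: pages = ceil(app_msg_cnt / 5), since the API returns five articles per request. An equivalent, more compact form (math.ceil would be a drop-in here):

    import math

    app_msg_cnt = 23                       # example count
    max_page = math.ceil(app_msg_cnt / 5)  # pages of five articles each
    assert max_page == (app_msg_cnt // 5) + (1 if app_msg_cnt % 5 else 0) == 5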
@@ -384,19 +396,19 @@ def job(count,key):
                     info_source_code,
                     '采集成功',
                     num_caiji,
-                    time_now
+                    time_now,
                 ]
                 # Save the success record
                 insertSql = f"insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
                 cursor_.execute(insertSql, tuple(success))
                 cnx_.commit()
                 # All articles of this official account have been collected
-                log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_,time.time())}')
+                log.info(f'---第{page}页采集到文章个数---{len(list_all_info)}---{num_caiji}---耗时{baseCore.getTimeCost(start_,time.time())}')
             else:
-                log.info(f'{fakeid}、公众号{origin}、网址已存在!耗时{baseCore.getTimeCost(start_,time.time())}')
+                log.info(f'----第{page}页采集到文章个数{num_caiji}--网址已存在!-----耗时{baseCore.getTimeCost(start_,time.time())}')
+            return count
         except Exception as e:
             # json parsed OK for this account, but collecting the data failed
             count += 1
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             false = [origin,
             ...
@@ -411,12 +423,12 @@ def job(count,key):
             cursor_.execute(insertSql, tuple(false))
             cnx_.commit()
-            log.info(f'{fakeid}、公众号:{origin}采集失败!!!!!!耗时{baseCore.getTimeCost(start_, time.time())}')
-            count += 1
+            log.info(f'{fakeid}、公众号{origin}:采集成功!、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_, time.time())}')
     time.sleep(2)
     return count

 if __name__ == "__main__":
     requests.DEFAULT_RETRIES = 5
     time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     ...
@@ -456,15 +468,19 @@ if __name__=="__main__":
     # Initialize the counter to 0
     key = baseCore.getNextSeq()
     r.set(key, 0)
-    # Set the key to expire after 10 seconds
-    r.expire(key, 3600)
+    # Set the key to expire after an hour and a half
+    r.expire(key, 5400)
     while True:
         time.sleep(2)
         new_value = baseCore.incrSet(key)
         baseCore.getttl(key)
         if new_value < 50:
             try:
+                aa = job(count, key)
+                count = aa
                 time.sleep(20)
             except:
                 time.sleep(10)
         else:
             # Refresh the browser
             browser_run = list_b[0]
             ...
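The key set up here acts as a crude rate limiter: a counter initialized to 0, expiring after 5400 s, incremented before each job; once it reaches 50, the loop stops collecting and refreshes the browser instead. A self-contained sketch of the same pattern in plain redis-py (incrSet and getttl are BaseCore helpers; incr and ttl are the assumed equivalents):

    import redis

    r = redis.Redis(host='localhost', port=6379)   # assumed local instance
    KEY, LIMIT, WINDOW = 'demo:counter', 50, 5400  # 50 jobs per 90 minutes

    r.set(KEY, 0)
    r.expire(KEY, WINDOW)

    def allow_job():
        value = r.incr(KEY)        # like baseCore.incrSet(key)
        print('ttl:', r.ttl(KEY))  # like baseCore.getttl(key)
        return value < LIMIT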
百度采集/baidu_comm/baiduSpider.py

#coding=utf-8

@@ -266,13 +266,13 @@ class BaiduSpider(object):
                 break
             for detail in lists:
                 publishTag = detail['publishTag']
-                if publishTag:
-                    pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
-                    needDate = '2022-01-01 00:00:00'
-                    needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
-                    if pubtime < needTime:
-                        timeFlag = True
-                        break
+                # if publishTag:
+                #     pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
+                #     needDate = '2022-01-01 00:00:00'
+                #     needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
+                #     if pubtime < needTime:
+                #         timeFlag = True
+                #         break
                 is_member = self.r.sismember('pybaidu_baidu_' + self.wordsCode, durl)
                 if is_member:
                     continue
                 ...
@@ -398,7 +398,7 @@ class BaiduSpider(object):
         processitem = self.getProcessitem(bdetail)
         try:
             self.sendkafka(processitem)
-            self.r.sadd('pybaidu_test_' + self.wordsCode, processitem['sourceAddress'])
+            self.r.sadd('pybaidu_baidu_' + self.wordsCode, processitem['sourceAddress'])
         except Exception as e:
             self.logger.info("放入kafka失败!")
         # Insert into the database
         ...
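This rename matters for dedup correctness: URLs were being added to 'pybaidu_test_…' while membership was checked against 'pybaidu_baidu_…' (see the sismember call in the previous hunk), so the already-seen check could never fire. The repaired check-then-record pattern, as a small sketch with an example wordsCode:

    import redis

    r = redis.Redis(host='localhost', port=6379)  # assumed instance
    seen_key = 'pybaidu_baidu_' + 'KW-DEMO'       # one set per wordsCode

    url = 'https://example.com/article'
    if r.sismember(seen_key, url):  # skip already-collected URLs
        print('duplicate, skipping')
    else:
        # ... send to kafka, then record it in the SAME set that is checked
        r.sadd(seen_key, url)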
百度采集/baidu_comm/baidutaskJob_loc.py

# -*- coding: utf-8 -*-

@@ -190,29 +190,7 @@ if __name__ == '__main__':
     while True:
         try:
             codeList = []
-            codeList.append('KW-20221114-0007')
-            codeList.append('KW-20221114-0006')
-            codeList.append('KW-20221114-0005')
-            codeList.append('KW-20221114-0009')
-            codeList.append('KW-20221114-0011')
-            codeList.append('KW-20221114-0012')
-            codeList.append('KW-20221114-0013')
-            codeList.append('KW-20221114-0014')
-            codeList.append('KW-20221114-0018')
-            codeList.append('KW-20221213-0006')
-            codeList.append('KW-20221114-0008')
-            codeList.append('KW-20221114-0015')
-            codeList.append('KW-20221114-0016')
-            codeList.append('KW-20221114-0017')
-            codeList.append('KW-20221114-0019')
-            codeList.append('KW-20221114-0022')
-            codeList.append('KW-20221114-0023')
-            codeList.append('KW-20221114-0024')
-            codeList.append('KW-20221114-0025')
-            codeList.append('KW-20221114-0026')
-            codeList.append('KW-20221114-0027')
-            codeList.append('KW-20221114-0020')
-            codeList.append('KW-20221114-0021')
+            codeList.append('KW-20230818-0003')
             for codeid in codeList:
                 try:
                     # keymsg=baiduTaskJob.getkafka()
                     ...
百度采集/baidu_comm/requirements.txt

@@ -12,6 +12,9 @@
 pip install tqdm -i https://pypi.douban.com/simple
 pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
 pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
 pip install langid -i https://mirrors.aliyun.com/pypi/simple/
+pip install jieba -i https://mirrors.aliyun.com/pypi/simple
+selenium==3.141.0
+selenium-wire==5.1.0
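Note that this requirements.txt mixes shell commands (pip install … -i …) with plain requirement specifiers (selenium==3.141.0); pip install -r only understands the latter. If the file is ever meant to be consumed with -r, a sketch of the equivalent specifier-only form (version pins other than the selenium lines are not given in the source, so they are left unpinned):

    # usable with: pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
    tqdm
    goose3
    beautifulsoup4
    langid
    jieba
    selenium==3.141.0
    selenium-wire==5.1.0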