Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
7bada3ac
提交
7bada3ac
authored
8月 22, 2023
作者:
LiuLiYuan
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/master'
上级
a237c960
f0246e72
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
63 行增加
和
53 行删除
+63
-53
BaseCore.py
base/BaseCore.py
+5
-4
RedisPPData.py
base/RedisPPData.py
+43
-15
oneWeixin.py
comData/weixin_solo/oneWeixin.py
+0
-0
baiduSpider.py
百度采集/baidu_comm/baiduSpider.py
+9
-9
baidutaskJob_loc.py
百度采集/baidu_comm/baidutaskJob_loc.py
+2
-24
requirements.txt
百度采集/baidu_comm/requirements.txt
+4
-1
没有找到文件。
base/BaseCore.py
浏览文件 @
7bada3ac
...
@@ -17,14 +17,15 @@ import langid
...
@@ -17,14 +17,15 @@ import langid
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class
BaseCore
:
class
BaseCore
:
# 序列号
# 序列号
__seq
=
0
__seq
=
0
# 代理池 数据库连接
# 代理池 数据库连接
__cnx_proxy
=
None
__cnx_proxy
=
None
__cursor_proxy
=
None
__cursor_proxy
=
None
cnx
=
None
cursor
=
None
r
=
None
# agent 池
# agent 池
__USER_AGENT_LIST
=
[
__USER_AGENT_LIST
=
[
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
,
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
,
...
@@ -392,7 +393,7 @@ class BaseCore:
...
@@ -392,7 +393,7 @@ class BaseCore:
# 从Redis的List中获取并移除一个元素
# 从Redis的List中获取并移除一个元素
def
redicPullData
(
self
,
key
):
def
redicPullData
(
self
,
key
):
item
=
self
.
r
.
r
pop
(
key
)
item
=
self
.
r
.
l
pop
(
key
)
return
item
.
decode
()
if
item
else
None
return
item
.
decode
()
if
item
else
None
# 获得脚本进程PID
# 获得脚本进程PID
...
@@ -480,7 +481,7 @@ class BaseCore:
...
@@ -480,7 +481,7 @@ class BaseCore:
def
writerToExcel
(
self
,
detailList
,
filename
):
def
writerToExcel
(
self
,
detailList
,
filename
):
# filename='baidu搜索.xlsx'
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
# 读取已存在的xlsx文件
existing_data
=
pd
.
read_excel
(
filename
,
engine
=
'openpyxl'
)
existing_data
=
pd
.
read_excel
(
filename
,
engine
=
'openpyxl'
,
dtype
=
str
)
# 创建新的数据
# 创建新的数据
new_data
=
pd
.
DataFrame
(
data
=
detailList
)
new_data
=
pd
.
DataFrame
(
data
=
detailList
)
# 将新数据添加到现有数据的末尾
# 将新数据添加到现有数据的末尾
...
...
base/RedisPPData.py
浏览文件 @
7bada3ac
import
time
import
time
import
pymysql
from
base
import
BaseCore
from
base
import
BaseCore
from
apscheduler.schedulers.blocking
import
BlockingScheduler
from
apscheduler.schedulers.blocking
import
BlockingScheduler
basecore
=
BaseCore
.
BaseCore
()
basecore
=
BaseCore
.
BaseCore
()
log
=
basecore
.
getLogger
()
log
=
basecore
.
getLogger
()
#144数据库
cnx
=
basecore
.
cnx
cnx
=
basecore
.
cnx
cursor
=
basecore
.
cursor
cursor
=
basecore
.
cursor
r
=
basecore
.
r
r
=
basecore
.
r
#11数据库
cnx_
=
pymysql
.
connect
(
host
=
'114.116.44.11'
,
user
=
'root'
,
password
=
'f7s0&7qqtK'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
cursor_
=
cnx_
.
cursor
()
# # 连接到Redis
# # 连接到Redis
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
#
#
...
@@ -32,9 +39,9 @@ r = basecore.r
...
@@ -32,9 +39,9 @@ r = basecore.r
#企业动态
#企业动态
def
NewsEnterprise
():
def
NewsEnterprise
():
# #获取国内企业
# #获取国内企业
#
gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
gn_query
=
"select SocialCode from EnterpriseInfo where Place = '1'"
#
cursor.execute(gn_query)
cursor
.
execute
(
gn_query
)
#
gn_result = cursor.fetchall()
gn_result
=
cursor
.
fetchall
()
#获取国外企业
#获取国外企业
gw_query
=
"select SocialCode from EnterpriseInfo where Place = '2'"
gw_query
=
"select SocialCode from EnterpriseInfo where Place = '2'"
cursor
.
execute
(
gw_query
)
cursor
.
execute
(
gw_query
)
...
@@ -42,20 +49,20 @@ def NewsEnterprise():
...
@@ -42,20 +49,20 @@ def NewsEnterprise():
gw_social_list
=
[
item
[
0
]
for
item
in
gw_result
]
gw_social_list
=
[
item
[
0
]
for
item
in
gw_result
]
#todo:打印长度
#todo:打印长度
print
(
len
(
gw_social_list
))
#
print(len(gw_social_list))
#
gn_social_list = [item[0] for item in gn_result]
gn_social_list
=
[
item
[
0
]
for
item
in
gn_result
]
print
(
'======='
)
print
(
'======='
)
#将数据插入到redis中
#将数据插入到redis中
# for item in gn_social_list:
# for item in gn_social_list:
# r.rpush('NewsEnterprise:gnqy_socialCode', item)
# r.rpush('NewsEnterprise:gnqy_socialCode', item)
count
=
0
#
count = 0
for
item
in
gw_social_list
:
for
item
in
gw_social_list
:
r
.
rpush
(
'NewsEnterprise:gwqy_socialCode'
,
item
)
r
.
rpush
(
'NewsEnterprise:gwqy_socialCode'
,
item
)
count
+=
1
#
count+=1
print
(
item
)
#
print(item)
print
(
count
)
#
print(count)
#企业动态定时任务
#企业动态定时任务
def
NewsEnterprise_task
():
def
NewsEnterprise_task
():
# 实例化一个调度器
# 实例化一个调度器
...
@@ -140,6 +147,29 @@ def BaseInfoEnterprise_task():
...
@@ -140,6 +147,29 @@ def BaseInfoEnterprise_task():
print
(
'定时采集异常'
,
e
)
print
(
'定时采集异常'
,
e
)
pass
pass
#东方财富网财务数据
def
FinanceFromEast
():
#从上市企业库中读取数据
sql_sel
=
'''select social_credit_code from sys_base_enterprise_ipo where category = '1' limit 10 '''
cursor_
.
execute
(
sql_sel
)
finance
=
cursor_
.
fetchall
()
finance_list
=
[
item
[
0
]
for
item
in
finance
]
print
(
'======='
)
for
item
in
finance_list
:
r
.
rpush
(
'FinanceFromEast:finance_socialCode'
,
item
)
def
FinanceFromEase_task
():
# 实例化一个调度器
scheduler
=
BlockingScheduler
()
# 每个季度执行一次
scheduler
.
add_job
(
FinanceFromEast
,
'cron'
,
month
=
'1-12/3'
,
day
=
'1'
,
hour
=
0
,
minute
=
0
)
try
:
# redisPushData # 定时开始前执行一次
scheduler
.
start
()
except
Exception
as
e
:
print
(
'定时采集异常'
,
e
)
pass
#微信公众号
#微信公众号
def
WeiXingetFromSql
():
def
WeiXingetFromSql
():
selectSql
=
"SELECT info_source_code from info_source where site_uri like '
%
mp.weixin.qq.com
%
'"
selectSql
=
"SELECT info_source_code from info_source where site_uri like '
%
mp.weixin.qq.com
%
'"
...
@@ -207,9 +237,6 @@ def FBS():
...
@@ -207,9 +237,6 @@ def FBS():
r
.
rpush
(
'NewsEnterpriseFbs:gnqy_socialCode'
,
item
)
r
.
rpush
(
'NewsEnterpriseFbs:gnqy_socialCode'
,
item
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
start
=
time
.
time
()
start
=
time
.
time
()
# NoticeEnterprise()
# NoticeEnterprise()
...
@@ -222,8 +249,9 @@ if __name__ == "__main__":
...
@@ -222,8 +249,9 @@ if __name__ == "__main__":
# FBS()
# FBS()
# NoticeEnterprise_task()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
# AnnualEnterprise_task()
NoticeEnterprise
()
# NoticeEnterprise()
FinanceFromEast
()
log
.
info
(
f
'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}==='
)
log
.
info
(
f
'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}==='
)
# cnx
.close()
cnx_
.
close
()
# cursor
.close()
cursor_
.
close
()
# basecore.close()
# basecore.close()
comData/weixin_solo/oneWeixin.py
浏览文件 @
7bada3ac
差异被折叠。
点击展开。
百度采集/baidu_comm/baiduSpider.py
浏览文件 @
7bada3ac
#codi
ng=utf-8
#codi
ng=utf-8
...
@@ -266,13 +266,13 @@ class BaiduSpider(object):
...
@@ -266,13 +266,13 @@ class BaiduSpider(object):
break
break
for
detail
in
lists
:
for
detail
in
lists
:
publishTag
=
detail
[
'publishTag'
]
publishTag
=
detail
[
'publishTag'
]
if
publishTag
:
#
if publishTag:
pubtime
=
datetime
.
datetime
.
strptime
(
publishTag
,
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
#
pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
needDate
=
'2022-01-01 00:00:00'
#
needDate='2022-01-01 00:00:00'
needTime
=
datetime
.
datetime
.
strptime
(
needDate
,
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
#
needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
if
pubtime
<
needTime
:
#
if pubtime < needTime:
timeFlag
=
True
#
timeFlag = True
break
#
break
is_member
=
self
.
r
.
sismember
(
'pybaidu_baidu_'
+
self
.
wordsCode
,
durl
)
is_member
=
self
.
r
.
sismember
(
'pybaidu_baidu_'
+
self
.
wordsCode
,
durl
)
if
is_member
:
if
is_member
:
continue
continue
...
@@ -398,7 +398,7 @@ class BaiduSpider(object):
...
@@ -398,7 +398,7 @@ class BaiduSpider(object):
processitem
=
self
.
getProcessitem
(
bdetail
)
processitem
=
self
.
getProcessitem
(
bdetail
)
try
:
try
:
self
.
sendkafka
(
processitem
)
self
.
sendkafka
(
processitem
)
self
.
r
.
sadd
(
'pybaidu_
test
_'
+
self
.
wordsCode
,
processitem
[
'sourceAddress'
])
self
.
r
.
sadd
(
'pybaidu_
baidu
_'
+
self
.
wordsCode
,
processitem
[
'sourceAddress'
])
except
Exception
as
e
:
except
Exception
as
e
:
self
.
logger
.
info
(
"放入kafka失败!"
)
self
.
logger
.
info
(
"放入kafka失败!"
)
#插入数据库
#插入数据库
...
...
百度采集/baidu_comm/baidutaskJob_loc.py
浏览文件 @
7bada3ac
# -*-
coding: utf-8 -*-
# -*-
coding: utf-8 -*-
...
@@ -190,29 +190,7 @@ if __name__ == '__main__':
...
@@ -190,29 +190,7 @@ if __name__ == '__main__':
while
True
:
while
True
:
try
:
try
:
codeList
=
[]
codeList
=
[]
codeList
.
append
(
'KW-20221114-0007'
)
codeList
.
append
(
'KW-20230818-0003'
)
codeList
.
append
(
'KW-20221114-0006'
)
codeList
.
append
(
'KW-20221114-0005'
)
codeList
.
append
(
'KW-20221114-0009'
)
codeList
.
append
(
'KW-20221114-0011'
)
codeList
.
append
(
'KW-20221114-0012'
)
codeList
.
append
(
'KW-20221114-0013'
)
codeList
.
append
(
'KW-20221114-0014'
)
codeList
.
append
(
'KW-20221114-0018'
)
codeList
.
append
(
'KW-20221213-0006'
)
codeList
.
append
(
'KW-20221114-0008'
)
codeList
.
append
(
'KW-20221114-0015'
)
codeList
.
append
(
'KW-20221114-0016'
)
codeList
.
append
(
'KW-20221114-0017'
)
codeList
.
append
(
'KW-20221114-0019'
)
codeList
.
append
(
'KW-20221114-0022'
)
codeList
.
append
(
'KW-20221114-0023'
)
codeList
.
append
(
'KW-20221114-0024'
)
codeList
.
append
(
'KW-20221114-0025'
)
codeList
.
append
(
'KW-20221114-0026'
)
codeList
.
append
(
'KW-20221114-0027'
)
codeList
.
append
(
'KW-20221114-0020'
)
codeList
.
append
(
'KW-20221114-0021'
)
for
codeid
in
codeList
:
for
codeid
in
codeList
:
try
:
try
:
# keymsg=baiduTaskJob.getkafka()
# keymsg=baiduTaskJob.getkafka()
...
...
百度采集/baidu_comm/requirements.txt
浏览文件 @
7bada3ac
...
@@ -12,6 +12,9 @@ pip install tqdm -i https://pypi.douban.com/simple
...
@@ -12,6 +12,9 @@ pip install tqdm -i https://pypi.douban.com/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install goose3 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install Beautifulsoup4 -i https://mirrors.aliyun.com/pypi/simple
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install langid -i https://mirrors.aliyun.com/pypi/simple/
pip install jieba -i https://mirrors.aliyun.com/pypi/simple
selenium==3.141.0
selenium==3.141.0
selenium-wire==5.1.0
selenium-wire==5.1.0
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论