zzsn_spider · Commit b6df2db3
Authored Oct 25, 2023 by 刘伟刚
Parent commit: d974abb1

Commit message: 中国政府采购网修改代码提交 (China Government Procurement Network code changes)

Showing 10 changed files, with 51 additions and 48 deletions (+51 / -48):
  comData/tradingview/tradviewNew.py            +19 / -18
  jrtt1news_comm/jrttnewspider.py                +0 / -0
  jrtt1news_comm/jrttnewstaskJob_loc.py          +1 / -1
  qqnews_comm/qqnewspider.py                     +10 / -8
  qqnews_comm/qqnewstaskJob_loc.py               +3 / -3
  sougou_comm/sougouSpider.py                    +3 / -3
  souhunews_comm/souhunewspider.py               +4 / -4
  souhunews_comm/souhunewstaskJob_loc.py         +3 / -3
  百度采集/baidu_comm/baiduSpider.py              +6 / -6
  百度采集/baidu_comm/baidutaskJob_comm.py        +2 / -2
comData/tradingview/tradviewNew.py
@@ -134,6 +134,7 @@ def paserList(searchmsg,social_code):
                'sourceAddress': sourceAddress,   # 原文链接
                'summary': '',
                'title': title,
                'source': source,
                'socialCreditCode': social_code,
                'year': published[:4]
            }
@@ -249,7 +250,7 @@ def sendToKafka(detailmsg):
            'id': '',
            'keyWords': '',
            'lang': 'zh',
-           'origin': 'Tradingview',
+           'origin': detailmsg['source'],
            'publishDate': detailmsg['publishDate'],
            'sid': '1711619846545776641',
            'sourceAddress': detailmsg['sourceAddress'],   # 原文链接
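Note: sendToKafka presumably hands this dict to a Kafka producer. As a minimal, hypothetical sketch (the broker address, topic name, and stand-in dict below are assumptions, not values from this repository), the kafka-python pattern for that looks like:

    # Hypothetical sketch only, not the repository's sendToKafka implementation.
    import json
    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='127.0.0.1:9092',   # assumed broker address
        value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'),
    )
    message_dict = {'sid': '1711619846545776641', 'title': 'demo'}   # stands in for the dict above
    producer.send('research_info', message_dict)   # topic name is hypothetical
    producer.flush()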
@@ -316,23 +317,23 @@ if __name__ == '__main__':
    # url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
    # searchmsg=reqmsg(url)
    # print(searchmsg)
-   getStockFromSql()
-   # while True:
-   #     try:
-   #         tradview_ticker=r.lpop('tradview_ticker')
-   #         if tradview_ticker:
-   #
-   #             tradviewticker = tradview_ticker.decode(errors='ignore')
-   #             log.info(f'采集资讯的企业{tradviewticker}')
-   #             ticker_param=str(tradviewticker).split('_')[0]
-   #             social_code=str(tradviewticker).split('_')[1]
-   #             url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
-   #             log.info(f'采集资讯企业列表地址{tradview_ticker}')
-   #             searchmsg=reqmsg(url)
-   #             paserList(searchmsg,social_code)
-   #     except Exception as e:
-   #         log.info(f'redis中获取企业信息为空{e}')
-   #         break
+   getStockFromSql()
+   while True:
+       try:
+           tradview_ticker=r.lpop('tradview_ticker')
+           if tradview_ticker:
+               tradviewticker = tradview_ticker.decode(errors='ignore')
+               log.info(f'采集资讯的企业{tradviewticker}')
+               ticker_param=str(tradviewticker).split('_')[0]
+               social_code=str(tradviewticker).split('_')[1]
+               url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
+               log.info(f'采集资讯企业列表地址{tradview_ticker}')
+               searchmsg=reqmsg(url)
+               paserList(searchmsg,social_code)
+       except Exception as e:
+           log.info(f'redis中获取企业信息为空{e}')
+           break
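Note: the loop above expects each 'tradview_ticker' Redis entry to have the form "<ticker>_<social_credit_code>", which the two split('_') calls rely on. A rough, hypothetical sketch of the producer side (connection settings and the example value are assumptions, not code from this commit):

    # Hypothetical sketch of the queue format assumed by the split('_') calls above.
    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # assumed connection settings
    r.rpush('tradview_ticker', 'NASDAQ:AAPL_91110000MA001234XX')   # "<ticker>_<social_code>", example only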
jrtt1news_comm/jrttnewspider.py
(diff collapsed)
jrtt1news_comm/jrttnewstaskJob_loc.py
@@ -170,7 +170,7 @@ class JrttnewsTaskJob(object):
            kwmsg = {
                'kw': kk,
                'wordsCode': 'jrtt',
-               'sid': '1020'
+               'sid': '1706193555675926530'
            }
            kwList.append(kwmsg)
        return kwList
qqnews_comm/qqnewspider.py
@@ -255,8 +255,9 @@ class QQnewsSpider(object):
            'search_count_limit': 10,
            'appver': '15.5_qqnews_7.1.80'
        }
+       proxy = self.baseCore.get_proxy()
        try:
-           res = requests.post(url, headers=headers2, data=data, verify=False, timeout=10)
+           res = requests.post(url, headers=headers2, proxies=proxy, data=data, verify=False, timeout=10)
            res.encoding = 'utf-8'
            text = res.text
        except Exception as e:
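Note on the proxies argument: requests expects a scheme-to-URL mapping. Assuming self.baseCore.get_proxy() returns the same shape as the hard-coded proxy this commit removes further down in this file (hunk @@ -412), it would look roughly like:

    # Assumed return shape of self.baseCore.get_proxy(); the address is the local
    # proxy removed elsewhere in this file, shown only as an illustration.
    proxy = {
        'http': 'http://127.0.0.1:1080',
        'https': 'http://127.0.0.1:1080',
    }
    # res = requests.post(url, headers=headers2, proxies=proxy, data=data, verify=False, timeout=10)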
@@ -283,13 +284,14 @@ class QQnewsSpider(object):
    def get_page_html(self):
        #设置采集列表页面和页数
        url = 'https://i.news.qq.com/gw/pc_search/result'
-       totalnum = 5
+       totalnum = 6
        keyword = self.searchkw
        # keyword='浙江国有资本运营公司'
        for pagenum in range(0, totalnum):
            qerhtml = self.reqPostMsg(url, pagenum, keyword)
            jsonmsg = json.loads(qerhtml)
            secList = jsonmsg['secList']
            if secList:
                for sec in secList:
                    try:
                        title = sec['newsList'][0]['title']
@@ -322,7 +324,7 @@ class QQnewsSpider(object):
                        bdetail = self.getDetailmsg(detailmsg)
                        processitem = self.getProcessitem(bdetail)
                        try:
-                           # self.sendkafka(processitem)
+                           self.sendkafka(processitem)
                            self.r.sadd('pyqqnews_' + self.wordsCode, processitem['sourceAddress'])
                        except Exception as e:
                            self.logger.info("放入kafka失败!")
@@ -412,8 +414,8 @@ class QQnewsSpider(object):
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }
-       proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
-       res = requests.get(url, headers=headers, verify=False, timeout=10)
+       proxy = self.baseCore.get_proxy()
+       res = requests.get(url, headers=headers, proxies=proxy, verify=False, timeout=10)
        res.encoding = 'utf-8'
        text = res.text
        return text
@@ -421,7 +423,7 @@ class QQnewsSpider(object):
    def extractorMsg(self, url, title):
        content = ''
        contentWithTag = ''
-       lang = ''
+       lang = 'cn'
        lang = self.detect_language(title)
        sm = SmartExtractor(lang)
        try:
@@ -521,10 +523,10 @@ class QQnewsSpider(object):
        if content != '':
            processitem = {
                "sid": self.sid,
-               "source": "5",
+               "source": "22",   #腾讯新闻
                "title": bdetail['title'],
                "content": bdetail['content'],
-               "contentWithtag": bdetail['contentHtml'],
+               "contentWithTag": bdetail['contentHtml'],
                "origin": bdetail['source'],
                "publishDate": bdetail['publishtime'],
                "sourceAddress": bdetail['detailurl'],
qqnews_comm/qqnewstaskJob_loc.py
@@ -170,7 +170,7 @@ class QQnewsTaskJob(object):
            kwmsg = {
                'kw': kk,
                'wordsCode': 'qqnews',
-               'sid': '102003'
+               'sid': '1706193555675926530'
            }
            kwList.append(kwmsg)
        return kwList
@@ -182,7 +182,7 @@ class QQnewsTaskJob(object):
        try:
            jrttnewsSpider.get_page_html()
        except Exception as e:
-           logger.info('今日头条搜索异常' + searchkw)
+           logger.info('腾讯新闻搜索异常' + searchkw)
        if jrttnewsSpider.detailList.qsize() != 0:
            try:
@@ -218,7 +218,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
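Note: the hunk ends at the "# 获取任务的执行结果" comment, so the result-collection code itself is not shown. One common pattern for draining those futures (an assumed sketch, not this repository's code) is:

    # Assumed sketch only: collect the futures created by executor.submit above;
    # `results` and `logger` refer to the names used in the surrounding code.
    import concurrent.futures

    for future in concurrent.futures.as_completed(results):
        try:
            future.result()   # re-raises any exception raised inside runSpider
        except Exception as e:
            logger.info(f'runSpider task failed: {e}')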
sougou_comm/sougouSpider.py
@@ -140,8 +140,8 @@ class SougouSpider(object):
    def xpath_paser(self, html):
        lists = []
-       itemTag = html.xpath('//div[@class="vrwrap"]')
-       for itemTag in itemTag:
+       itemTags = html.xpath('//div[@class="vrwrap"]')
+       for itemTag in itemTags:
            try:
                title = itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
            except Exception as e:
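Note: renaming the node list to itemTags stops the loop variable from shadowing it. A minimal, standalone illustration of the pattern with lxml (assumed usage, not code from this repository):

    # Standalone illustration only.
    from lxml import html as lxml_html

    page = lxml_html.fromstring('<div class="vrwrap"><h3 class="vr-title"><a>demo</a></h3></div>')
    itemTags = page.xpath('//div[@class="vrwrap"]')
    for itemTag in itemTags:
        title = itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]   # -> 'demo'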
@@ -512,7 +512,7 @@ class SougouSpider(object):
                "source": "5",
                "title": bdetail['title'],
                "content": bdetail['content'],
-               "contentWithtag": bdetail['contentHtml'],
+               "contentWithTag": bdetail['contentHtml'],
                "origin": bdetail['source'],
                "publishDate": bdetail['publishtime'],
                "sourceAddress": bdetail['detailurl'],
souhunews_comm/souhunewspider.py
@@ -286,7 +286,7 @@ class SouhunewsSpider(object):
    # 获取每一页数据, 开趴.
    def get_page_html(self):
        #设置采集列表页面和页数
-       totalnum = 5
+       totalnum = 6
        keyword = self.searchkw
        # keyword='浙江国有资本运营公司'
        for pagenum in range(0, totalnum):
@@ -333,7 +333,7 @@ class SouhunewsSpider(object):
                        bdetail = self.getDetailmsg(detailmsg)
                        processitem = self.getProcessitem(bdetail)
                        try:
-                           # self.sendkafka(processitem)
+                           self.sendkafka(processitem)
                            self.r.sadd('pysouhunews_' + self.wordsCode, processitem['sourceAddress'])
                        except Exception as e:
                            self.logger.info("放入kafka失败!")
@@ -528,10 +528,10 @@ class SouhunewsSpider(object):
        if content != '':
            processitem = {
                "sid": self.sid,
-               "source": "3",
+               "source": "23",   #搜狐新闻
                "title": bdetail['title'],
                "content": bdetail['content'],
-               "contentWithtag": bdetail['contentHtml'],
+               "contentWithTag": bdetail['contentHtml'],
                "origin": bdetail['source'],
                "publishDate": bdetail['publishtime'],
                "sourceAddress": bdetail['detailurl'],
souhunews_comm/souhunewstaskJob_loc.py
@@ -170,7 +170,7 @@ class SouhunewsTaskJob(object):
            kwmsg = {
                'kw': kk,
                'wordsCode': 'souhu',
-               'sid': '102002'
+               'sid': '1706193555675926530'
            }
            kwList.append(kwmsg)
        return kwList
@@ -182,7 +182,7 @@ class SouhunewsTaskJob(object):
        try:
            jrttnewsSpider.get_page_html()
        except Exception as e:
-           logger.info('今日头条搜索异常' + searchkw)
+           logger.info('搜狐新闻搜索异常' + searchkw)
        if jrttnewsSpider.detailList.qsize() != 0:
            try:
@@ -218,7 +218,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
百度采集/baidu_comm/baiduSpider.py
#coding=utf-8
@@ -62,7 +62,7 @@ class BaiduSpider(object):
        # proxy = "127.0.0.1:8080"  # 代理地址和端口
        # chrome_options.add_argument('--proxy-server=http://' + proxy)
        self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
-   #将列表数据插入到表中 baidu_search_result
+   #将列表数据插入到表中 meta_search_result
    def itemInsertToTable(self, items):
        try:
            itemdata = []
@@ -72,7 +72,7 @@ class BaiduSpider(object):
            data = (self.sid, self.wordsCode, item['title'], item['detailurl'], item['source'], item['publishtime'], item['content'], item['contentHtml'], '1', item['kword'], nowtime)
            itemdata.append(data)
-           sql = "INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+           sql = "INSERT into meta_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            cursorM.executemany(sql, itemdata)
            self.logger.info("数据插入数据库成功!")
            # 定义插入数据的SQL语句
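Note: the setup of cursorM is not part of this diff. If the project uses pymysql (an assumption), the connection behind the executemany call above would look roughly like:

    # Assumed sketch only; host, credentials and database name are placeholders.
    import pymysql

    connM = pymysql.connect(host='127.0.0.1', user='root', password='***',
                            database='caiji', charset='utf8mb4')
    cursorM = connM.cursor()
    # cursorM.executemany(sql, itemdata); connM.commit()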
@@ -115,8 +115,8 @@ class BaiduSpider(object):
    def xpath_paser(self, html):
        lists = []
-       itemTag = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
-       for itemTag in itemTag:
+       itemTags = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
+       for itemTag in itemTags:
            try:
                title = itemTag.xpath('.//h3[@class="news-title_1YtI1 "]/a/text()')[0]
            except Exception as e:
@@ -487,7 +487,7 @@ class BaiduSpider(object):
                "source": "3",
                "title": bdetail['title'],
                "content": bdetail['content'],
-               "contentWithtag": bdetail['contentHtml'],
+               "contentWithTag": bdetail['contentHtml'],
                "origin": bdetail['source'],
                "publishDate": bdetail['publishtime'],
                "sourceAddress": bdetail['detailurl'],
百度采集/baidu_comm/baidutaskJob_comm.py
# -*- coding: utf-8 -*-
@@ -200,7 +200,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果