王景浩 / zzsn_spider · Commits · b6df2db3

Commit b6df2db3, authored Oct 25, 2023 by 刘伟刚
Parent commit: d974abb1
Commit message: 中国政府采购网修改代码提交 ("code changes for the China Government Procurement Network collector")
Showing 10 changed files with 150 additions and 141 deletions.
comData/tradingview/tradviewNew.py            +19  -18
jrtt1news_comm/jrttnewspider.py               +82  -76
jrtt1news_comm/jrttnewstaskJob_loc.py          +1   -1
qqnews_comm/qqnewspider.py                    +27  -25
qqnews_comm/qqnewstaskJob_loc.py               +3   -3
sougou_comm/sougouSpider.py                    +3   -3
souhunews_comm/souhunewspider.py               +4   -4
souhunews_comm/souhunewstaskJob_loc.py         +3   -3
百度采集/baidu_comm/baiduSpider.py              +6   -6
百度采集/baidu_comm/baidutaskJob_comm.py        +2   -2
comData/tradingview/tradviewNew.py

@@ -134,6 +134,7 @@ def paserList(searchmsg,social_code):
             'sourceAddress': sourceAddress,  # 原文链接
             'summary': '',
             'title': title,
+            'source': source,
             'socialCreditCode': social_code,
             'year': published[:4]
         }

@@ -249,7 +250,7 @@ def sendToKafka(detailmsg):
         'id': '',
         'keyWords': '',
         'lang': 'zh',
-        'origin': 'Tradingview',
+        'origin': detailmsg['source'],
         'publishDate': detailmsg['publishDate'],
         'sid': '1711619846545776641',
         'sourceAddress': detailmsg['sourceAddress'],  # 原文链接

@@ -316,23 +317,23 @@ if __name__ == '__main__':
     # url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
     # searchmsg=reqmsg(url)
     # print(searchmsg)
-    # getStockFromSql()
-    # while True:
-    #     try:
-    #         tradview_ticker=r.lpop('tradview_ticker')
-    #         if tradview_ticker:
-    #             tradviewticker = tradview_ticker.decode(errors='ignore')
-    #             log.info(f'采集资讯的企业{tradviewticker}')
-    #             ticker_param=str(tradviewticker).split('_')[0]
-    #             social_code=str(tradviewticker).split('_')[1]
-    #             url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
-    #             log.info(f'采集资讯企业列表地址{tradview_ticker}')
-    #             searchmsg=reqmsg(url)
-    #             paserList(searchmsg,social_code)
-    #     except Exception as e:
-    #         log.info(f'redis中获取企业信息为空{e}')
-    #         break
+    getStockFromSql()
+    while True:
+        try:
+            tradview_ticker=r.lpop('tradview_ticker')
+            if tradview_ticker:
+                tradviewticker = tradview_ticker.decode(errors='ignore')
+                log.info(f'采集资讯的企业{tradviewticker}')
+                ticker_param=str(tradviewticker).split('_')[0]
+                social_code=str(tradviewticker).split('_')[1]
+                url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
+                log.info(f'采集资讯企业列表地址{tradview_ticker}')
+                searchmsg=reqmsg(url)
+                paserList(searchmsg,social_code)
+        except Exception as e:
+            log.info(f'redis中获取企业信息为空{e}')
+            break
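The main collection loop above drains company tickers from a Redis list, one `ticker_socialcode` item at a time. A minimal standalone sketch of that work-queue pattern (connection details are placeholders; the list name and item format follow the code above):

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # placeholder connection

def drain_queue():
    while True:
        raw = r.lpop('tradview_ticker')
        if not raw:
            # queue exhausted -> stop, mirroring the `break` in the except branch above
            break
        item = raw.decode(errors='ignore')             # e.g. '<ticker>_<social_code>'
        ticker_param, social_code = item.split('_', 1)
        print(ticker_param, social_code)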
jrtt1news_comm/jrttnewspider.py

@@ -37,7 +37,7 @@ class JrttnewsSpider(object):
         self.config.read('config.ini')
         baseCore = BaseCore()
         self.logger = baseCore.getLogger()
-        self.url = 'https://www.toutiao.com/'
+        self.url = f'https://so.toutiao.com/search?dvpf=pc&source=input&keyword={searchkw}#'
         self.r = redis.Redis(host=self.config.get('redis', 'host'),
                              port=self.config.get('redis', 'port'),
                              password=self.config.get('redis', 'pass'), db=0)

@@ -49,6 +49,7 @@ class JrttnewsSpider(object):
         self.searchkw = searchkw
         self.wordsCode = wordsCode
         self.sid = sid
+        self.driver = self.createDriver();
     #将列表数据插入到表中 meta_search_result
     def itemInsertToTable(self, items):

@@ -95,53 +96,36 @@ class JrttnewsSpider(object):
         html = etree.HTML(response)
         lists = self.xpath_paser(html)
         try:
-            flag = html.xpath('//a[@id="sogou_next"]')[0]
+            flag = html.xpath('//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]')[0]
         except Exception as e:
             flag = ''
             lists = []
         return flag, lists

-    def getRealUrl(self, url):
-        try:
-            header = {
-                "accept": "*/*",
-                "connection": "Keep-Alive",
-                "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
-            }
-            # url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
-            url = f"https://www.sogou.com{url}"
-            res = requests.get(url, headers=header)
-            text = res.text
-            # 定义正则表达式
-            pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
-            # 在给定的字符串中寻找匹配的URL
-            urls = re.findall(pattern, text)
-            uri = ''
-            if len(urls) > 1:
-                uri = urls[0]
-        except Exception as e:
-            self.logger.info("链接转换异常!")
-        return uri
-
     def xpath_paser(self, html):
         lists = []
-        itemTag = html.xpath('//div[@class="vrwrap"]')
-        for itemTag in itemTag:
+        itemTags = html.xpath('//div[@class="cs-view cs-view-block cs-card-content"]')
+        for itemTag in itemTags:
+            html_str = etree.tostring(itemTag)
             try:
-                title = itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
+                title = itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/text()')[0]
             except Exception as e:
                 title = ''
+            if title == '':
+                continue
             try:
-                detailUrl = itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
-                detailUrl = self.getRealUrl(detailUrl)
+                detailUrl = itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/@href')[0]
+                id = self.get_reitemid(detailUrl)
+                detailUrl = f'https://www.toutiao.com/article/{id}/?&source=m_redirect'
             except Exception as e:
                 detailUrl = ''
             try:
-                sourceTag = itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
+                sourceTag = itemTag.xpath('.//span[@class="d-flex align-items-center text-ellipsis margin-right-4"]//text()')[0]
             except Exception as e:
                 sourceTag = ''
             try:
-                publishTag = itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
+                publishTag = itemTag.xpath('.//div[@class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]/span[@class="text-ellipsis"]/text()')[0]
                 publishTag = str(publishTag)
                 publishtime = self.paserTime(publishTag)
                 publishTag = publishtime.strftime("%Y-%m-%d %H:%M:%S")

@@ -285,7 +269,8 @@ class JrttnewsSpider(object):
     def get_reitemid(self, tmpurl):
         try:
-            pattern = 'item_id=([\d]{1,})&search_id'
+            tmpurl = unquote(tmpurl)
+            pattern = 'com/a([\d]{1,}?)/'
             match = re.search(pattern, tmpurl)
             # 判断是否匹配成功
             if match:

@@ -307,53 +292,74 @@ class JrttnewsSpider(object):
     # 获取每一页数据, 开趴.
     def get_page_html(self):
         #设置采集列表页面和页数
-        totalnum = 1
-        keyword = self.searchkw
-        # keyword='浙江国有资本运营公司'
-        for pagenum in range(0, totalnum):
-            self.logger.info(f"解析关键词{keyword}第{pagenum}页")
-            offset = pagenum * 10
-            tmpurl = 'https://search5-search-hl.toutiaoapi.com/search/?source=search_subtab_switch&is_ttwebview=0&pass_through=default&action_type=input_keyword_search&is_incognito=0&api_param={"sug_session_id":"552235978851697767261639"}&inner_resolution=1080*1920&navbar_height=36&multi_container=1&gs_height=44&client_extra_params={"playparam":"codec_type:7,cdn_type:1,resolution:1080*1920,ttm_version:924000,enable_dash:0,unwatermark:1,v1_fitter_info:1,tt_net_energy:4,is_order_flow:-1,tt_device_score:7.1,tt_enable_adaptive:2"}&common_hashtags=default&_rticket=1697767236897&loadId=1&from_search_id=202310201001051EB17B3CBA66215D937D&isTTWebViewHeifSupport=0&has_gs=0&multi_container_type=1&forum=3&tt_font_size=m' \
-                     '&search_start_time=1697767265219&pd=information&cur_tab_title=search_tab&offset_height=108&openlive_plugin_status=0&fetch_by_ttnet=1&is_darkmode=0&from_pd=synthesis&plugin_enable=3&search_position=search_bar' \
-                     '&keyword=[keyword]&session_id=f1d0e9e4-cb15-4b60-b894-de729a76e6a9&switch_tab_type=click&appTheme=light&search_json={"__logExtra__":{"if_sar_recall":"0","from_category_name":"__all__","from_enter_from":"click_headline","from_channel_id":"0"}}' \
-                     '&from=search_tab&is_older=0&tt_daymode=1&search_sug=1&&runtime_tc=tt_search&browser_runtime_version=1720&format=json' \
-                     '&count=10&offset=[offset]&search_id=20231020120041F06F03C42D66A4AC5EC2&start_index=30&index_resource=&filter_vendor=&filter_period=&order_type=' \
-                     '&min_time=&max_time=&traffic_source='
-            url = tmpurl.replace('[keyword]', keyword).replace('[offset]', str(offset))
-            lhtml = self.reqHtml(url)
-            qqerhtml = json.loads(lhtml)
-            qqerhtml = qqerhtml['dom']
-            # self.logger.info(f'列表页面信息:{lhtml}')
-            soup = BeautifulSoup(qqerhtml, 'html.parser')
-            listcontent = soup.select('div[style="opacity: 1;"]')
-            for litag in listcontent:
-                try:
-                    lidoc = pq(str(litag))
-                    ahref = lidoc.find('a[class="l-view block l-text line-clamp-2 color-darker font-medium l-header h3"]').attr('href')
-                    id = self.get_reitemid(ahref)
-                    durl = f'https://www.toutiao.com/article/{id}/?&source=m_redirect'
-                    title = lidoc.find('a[class="l-view block l-text line-clamp-2 color-darker font-medium l-header h3"]').text().replace('\n', '')
-                    source = lidoc.find('div[class="l-source-text t3 l-source-min-width line-clamp-1 flex-shrink"]').text().replace('\n', '')
-                    publishdate = lidoc.find('div[class="l-view block l-text-split flex-shrink-0 ml-8 color-default line-clamp-1 t3"]>span:last-child').text().replace('\n', '')
-                    publishdate = self.paserTime(publishdate)
-                    if isinstance(publishdate, str):
-                        pubdate = publishdate
-                    else:
-                        pubdate = publishdate.strftime("%Y-%m-%d %H:%M:%S")
-                    is_member = self.r.sismember('pyjrttnews_' + self.wordsCode, durl)
-                    if is_member:
-                        self.logger.info(f"搜索列表的链接已经存在!")
-                        continue
-                    detailmsg = {
-                        'title': title,
-                        'detailUrl': durl,
-                        'sourceTag': source,
-                        'publishTag': pubdate
-                    }
-                    self.detailList.put(detailmsg)
-                except Exception as e:
-                    self.logger.info(f"搜索列表页异常{e}")
-                    continue
+        self.driver.get(self.url)
+        wait = WebDriverWait(self.driver, 20)
+        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "s-result-list")))
+        # self.driver.find_element(By.XPATH,'//div[@class="input_box_n6Efbw"]/input').send_keys(self.searchkw)
+        # self.driver.find_element(By.CLASS_NAME, 'search_33vwaQ').click()
+        # wait = WebDriverWait(self.driver, 20)
+        # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "s-result-list")))
+        # time.sleep(2)
+        self.driver.find_element('xpath', '//div[@class="cs-view pad-bottom-6 cs-view-flex align-items-center flex-row nav_7Dk46Y"]/div[1]/a[text()="资讯"]').click()
+        time.sleep(2)
+        self.logger.info("开始抓取首页...")
+        try:
+            flag, lists = self.parse_page()
+            if len(lists) < 1:
+                return
+        except Exception as e:
+            time.sleep(5)
+            return
+        if len(lists) == 0:
+            time.sleep(5)
+        for detail in lists:
+            durl = detail['detailUrl']
+            is_member = self.r.sismember('pyjrttnews_' + self.wordsCode, durl)
+            if is_member:
+                continue
+            self.detailList.put(detail)
+        response = self.driver.page_source
+        html = etree.HTML(response)
+        hasnext = html.xpath('//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]//text()')[0]
+        hasnext = hasnext.strip()
+        timeFlag = False
+        while '下一页' in hasnext:
+            try:
+                if self.page_num == 5:
+                    break
+                self.page_num = self.page_num + 1
+                self.logger.info("开始抓取第%s页..." % self.page_num)
+                try:
+                    self.driver.find_element(By.XPATH, '//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]').click()
+                except Exception as e:
+                    time.sleep(5)
+                    continue
+                time.sleep(5)
+                flag, lists = self.parse_page()
+                if len(lists) < 1:
+                    break
+                for detail in lists:
+                    publishTag = detail['publishTag']
+                    is_member = self.r.sismember('pyjrttnews_' + self.wordsCode, durl)
+                    if is_member:
+                        continue
+                    self.detailList.put(detail)
+                    if timeFlag:
+                        break
+                try:
+                    response = self.driver.page_source
+                    html = etree.HTML(response)
+                    hasnext = html.xpath('//div[@id="page"]//a[last()]//text()')[0]
+                    hasnext = hasnext.strip()
+                except Exception as e:
+                    hasnext = ''
+            except Exception as e:
+                time.sleep(5)
+                break
+        self.logger.info("抓取完毕")

     # 获取详情页
     def get_detail_html(self):

@@ -370,7 +376,7 @@ class JrttnewsSpider(object):
             self.logger.info(f"解析详情页标题{title},获取的内容长度:{len(bdetail['content'])}")
             processitem = self.getProcessitem(bdetail)
             try:
-                # self.sendkafka(processitem)
+                self.sendkafka(processitem)
                 self.r.sadd('pyjrttnews_' + self.wordsCode, processitem['sourceAddress'])
             except Exception as e:
                 self.logger.info("放入kafka失败!")

@@ -576,10 +582,10 @@ class JrttnewsSpider(object):
         if content != '':
             processitem = {
                 "sid": self.sid,
-                "source": "3",
+                "source": "21",
                 "title": bdetail['title'],
                 "content": bdetail['content'],
-                "contentWithtag": bdetail['contentHtml'],
+                "contentWithTag": bdetail['contentHtml'],
                 "origin": bdetail['source'],
                 "publishDate": bdetail['publishtime'],
                 "sourceAddress": bdetail['detailurl'],
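get_reitemid now URL-decodes the href and pulls the numeric article id with the 'com/a(\d+)/' pattern before rebuilding a canonical article link. A standalone sketch of that extraction (the sample href is illustrative only):

import re
from urllib.parse import unquote

def get_article_id(tmpurl):
    tmpurl = unquote(tmpurl)                       # hrefs may be percent-encoded
    match = re.search(r'com/a([\d]{1,}?)/', tmpurl)
    return match.group(1) if match else ''

href = 'https://www.toutiao.com/a7293000000000000000/?channel=&source=search'   # illustrative
article_id = get_article_id(href)
detail_url = f'https://www.toutiao.com/article/{article_id}/?&source=m_redirect'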
jrtt1news_comm/jrttnewstaskJob_loc.py

@@ -170,7 +170,7 @@ class JrttnewsTaskJob(object):
                 kwmsg = {
                     'kw': kk,
                     'wordsCode': 'jrtt',
-                    'sid': '1020'
+                    'sid': '1706193555675926530'
                 }
                 kwList.append(kwmsg)
         return kwList
qqnews_comm/qqnewspider.py

@@ -255,8 +255,9 @@ class QQnewsSpider(object):
             'search_count_limit': 10,
             'appver': '15.5_qqnews_7.1.80'
         }
+        proxy = self.baseCore.get_proxy()
         try:
-            res = requests.post(url, headers=headers2, data=data, verify=False, timeout=10)
+            res = requests.post(url, headers=headers2, proxies=proxy, data=data, verify=False, timeout=10)
             res.encoding = 'utf-8'
             text = res.text
         except Exception as e:

@@ -283,31 +284,32 @@ class QQnewsSpider(object):
     def get_page_html(self):
         #设置采集列表页面和页数
         url = 'https://i.news.qq.com/gw/pc_search/result'
-        totalnum = 5
+        totalnum = 6
         keyword = self.searchkw
         # keyword='浙江国有资本运营公司'
         for pagenum in range(0, totalnum):
             qerhtml = self.reqPostMsg(url, pagenum, keyword)
             jsonmsg = json.loads(qerhtml)
             secList = jsonmsg['secList']
-            for sec in secList:
-                try:
-                    title = sec['newsList'][0]['title']
-                    durl = sec['newsList'][0]['url']
-                    pubtime = sec['newsList'][0]['time']
-                    source = sec['newsList'][0]['source']
-                    is_member = self.r.sismember('pyqqnews_' + self.wordsCode, durl)
-                    if is_member:
-                        continue
-                    detailmsg = {
-                        'title': title,
-                        'detailUrl': durl,
-                        'sourceTag': source,
-                        'publishTag': pubtime
-                    }
-                    self.detailList.put(detailmsg)
-                except Exception as e:
-                    continue
+            if secList:
+                for sec in secList:
+                    try:
+                        title = sec['newsList'][0]['title']
+                        durl = sec['newsList'][0]['url']
+                        pubtime = sec['newsList'][0]['time']
+                        source = sec['newsList'][0]['source']
+                        is_member = self.r.sismember('pyqqnews_' + self.wordsCode, durl)
+                        if is_member:
+                            continue
+                        detailmsg = {
+                            'title': title,
+                            'detailUrl': durl,
+                            'sourceTag': source,
+                            'publishTag': pubtime
+                        }
+                        self.detailList.put(detailmsg)
+                    except Exception as e:
+                        continue

     # 获取详情页
     def get_detail_html(self):
         # 获取当前窗口的句柄

@@ -322,7 +324,7 @@ class QQnewsSpider(object):
             bdetail = self.getDetailmsg(detailmsg)
             processitem = self.getProcessitem(bdetail)
             try:
-                # self.sendkafka(processitem)
+                self.sendkafka(processitem)
                 self.r.sadd('pyqqnews_' + self.wordsCode, processitem['sourceAddress'])
             except Exception as e:
                 self.logger.info("放入kafka失败!")

@@ -412,8 +414,8 @@ class QQnewsSpider(object):
             'sec-ch-ua-mobile': '?0',
             'sec-ch-ua-platform': '"Windows"',
         }
-        proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
-        res = requests.get(url, headers=headers, verify=False, timeout=10)
+        proxy = self.baseCore.get_proxy()
+        res = requests.get(url, headers=headers, proxies=proxy, verify=False, timeout=10)
         res.encoding = 'utf-8'
         text = res.text
         return text

@@ -421,7 +423,7 @@ class QQnewsSpider(object):
     def extractorMsg(self, url, title):
         content = ''
         contentWithTag = ''
-        lang = ''
+        lang = 'cn'
         lang = self.detect_language(title)
         sm = SmartExtractor(lang)
         try:

@@ -521,10 +523,10 @@ class QQnewsSpider(object):
         if content != '':
             processitem = {
                 "sid": self.sid,
-                "source": "5",
+                "source": "22",  #腾讯新闻
                 "title": bdetail['title'],
                 "content": bdetail['content'],
-                "contentWithtag": bdetail['contentHtml'],
+                "contentWithTag": bdetail['contentHtml'],
                 "origin": bdetail['source'],
                 "publishDate": bdetail['publishtime'],
                 "sourceAddress": bdetail['detailurl'],
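Both request helpers now pass proxies=proxy, with the proxy coming from self.baseCore.get_proxy() instead of a hard-coded local address. requests expects a scheme-to-URL mapping there, which is the shape the removed literal had, so get_proxy() is presumably expected to return something equivalent (an assumption; its implementation is not part of this diff). A hedged sketch of the call shape:

import requests

def fetch(url, proxy):
    # proxy is expected to look like {'http': 'http://host:port', 'https': 'http://host:port'}
    res = requests.get(url, proxies=proxy, verify=False, timeout=10)
    res.encoding = 'utf-8'
    return res.text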
qqnews_comm/qqnewstaskJob_loc.py

@@ -170,7 +170,7 @@ class QQnewsTaskJob(object):
                 kwmsg = {
                     'kw': kk,
                     'wordsCode': 'qqnews',
-                    'sid': '102003'
+                    'sid': '1706193555675926530'
                 }
                 kwList.append(kwmsg)
         return kwList

@@ -182,7 +182,7 @@ class QQnewsTaskJob(object):
         try:
             jrttnewsSpider.get_page_html()
         except Exception as e:
-            logger.info('今日头条搜索异常' + searchkw)
+            logger.info('腾讯新闻搜索异常' + searchkw)
         if jrttnewsSpider.detailList.qsize() != 0:
             try:

@@ -218,7 +218,7 @@ if __name__ == '__main__':
                 continue
         if kwList:
             # 创建一个线程池,指定线程数量为4
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                 # 提交任务给线程池,每个任务处理一个数据
                 results = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
                 # 获取任务的执行结果
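The task jobs fan keywords out over a thread pool; raising max_workers from 1 to 3 lets up to three keywords be crawled concurrently. A self-contained sketch of the same pattern, with run_spider standing in for the real runSpider:

import concurrent.futures

def run_spider(kwmsg):
    # stand-in for qqnewsTaskJob.runSpider(kwmsg)
    return kwmsg['kw']

kwList = [{'kw': 'keyword-a', 'wordsCode': 'qqnews', 'sid': '1706193555675926530'},
          {'kw': 'keyword-b', 'wordsCode': 'qqnews', 'sid': '1706193555675926530'}]

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    results = [executor.submit(run_spider, data) for data in kwList]
    for future in concurrent.futures.as_completed(results):
        print(future.result())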
sougou_comm/sougouSpider.py

@@ -140,8 +140,8 @@ class SougouSpider(object):
     def xpath_paser(self, html):
         lists = []
-        itemTag = html.xpath('//div[@class="vrwrap"]')
-        for itemTag in itemTag:
+        itemTags = html.xpath('//div[@class="vrwrap"]')
+        for itemTag in itemTags:
             try:
                 title = itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
             except Exception as e:

@@ -512,7 +512,7 @@ class SougouSpider(object):
                 "source": "5",
                 "title": bdetail['title'],
                 "content": bdetail['content'],
-                "contentWithtag": bdetail['contentHtml'],
+                "contentWithTag": bdetail['contentHtml'],
                 "origin": bdetail['source'],
                 "publishDate": bdetail['publishtime'],
                 "sourceAddress": bdetail['detailurl'],
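The rename in xpath_paser is a readability fix: the old code reused one name for both the node list and the loop variable (`for itemTag in itemTag:`), which Python tolerates but which is easy to misread. A minimal illustration of the corrected form with lxml:

from lxml import etree

html = etree.HTML('<div class="vrwrap"><h3 class="vr-title"><a>A</a></h3></div>'
                  '<div class="vrwrap"><h3 class="vr-title"><a>B</a></h3></div>')
itemTags = html.xpath('//div[@class="vrwrap"]')    # the result list keeps its own name
for itemTag in itemTags:
    print(itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0])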
souhunews_comm/souhunewspider.py

@@ -286,7 +286,7 @@ class SouhunewsSpider(object):
     # 获取每一页数据, 开趴.
     def get_page_html(self):
         #设置采集列表页面和页数
-        totalnum = 5
+        totalnum = 6
         keyword = self.searchkw
         # keyword='浙江国有资本运营公司'
         for pagenum in range(0, totalnum):

@@ -333,7 +333,7 @@ class SouhunewsSpider(object):
             bdetail = self.getDetailmsg(detailmsg)
             processitem = self.getProcessitem(bdetail)
             try:
-                # self.sendkafka(processitem)
+                self.sendkafka(processitem)
                 self.r.sadd('pysouhunews_' + self.wordsCode, processitem['sourceAddress'])
             except Exception as e:
                 self.logger.info("放入kafka失败!")

@@ -528,10 +528,10 @@ class SouhunewsSpider(object):
         if content != '':
             processitem = {
                 "sid": self.sid,
-                "source": "3",
+                "source": "23",  #搜狐新闻
                 "title": bdetail['title'],
                 "content": bdetail['content'],
-                "contentWithtag": bdetail['contentHtml'],
+                "contentWithTag": bdetail['contentHtml'],
                 "origin": bdetail['source'],
                 "publishDate": bdetail['publishtime'],
                 "sourceAddress": bdetail['detailurl'],
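With self.sendkafka(processitem) active, the spider follows a check-send-remember pattern: a URL already recorded in the Redis set is skipped upstream, and a freshly sent item is added to that set. A sketch of the pattern under stated assumptions (a local Redis and Kafka broker; the topic name is a placeholder, since the real one lives inside sendkafka):

import json
import redis
from kafka import KafkaProducer

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                         value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'))

def publish_once(processitem, words_code):
    key = 'pysouhunews_' + words_code
    url = processitem['sourceAddress']
    if r.sismember(key, url):            # already collected in an earlier run
        return
    producer.send('news_topic', processitem)   # placeholder topic name
    r.sadd(key, url)                     # remember the URL after handing it to the producer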
souhunews_comm/souhunewstaskJob_loc.py

@@ -170,7 +170,7 @@ class SouhunewsTaskJob(object):
                 kwmsg = {
                     'kw': kk,
                     'wordsCode': 'souhu',
-                    'sid': '102002'
+                    'sid': '1706193555675926530'
                 }
                 kwList.append(kwmsg)
         return kwList

@@ -182,7 +182,7 @@ class SouhunewsTaskJob(object):
         try:
             jrttnewsSpider.get_page_html()
         except Exception as e:
-            logger.info('今日头条搜索异常' + searchkw)
+            logger.info('搜狐新闻搜索异常' + searchkw)
         if jrttnewsSpider.detailList.qsize() != 0:
             try:

@@ -218,7 +218,7 @@ if __name__ == '__main__':
                 continue
         if kwList:
             # 创建一个线程池,指定线程数量为4
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                 # 提交任务给线程池,每个任务处理一个数据
                 results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
                 # 获取任务的执行结果
百度采集/baidu_comm/baiduSpider.py

@@ -62,7 +62,7 @@ class BaiduSpider(object):
         # proxy = "127.0.0.1:8080" # 代理地址和端口
         # chrome_options.add_argument('--proxy-server=http://' + proxy)
         self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
-    #将列表数据插入到表中 baidu_search_result
+    #将列表数据插入到表中 meta_search_result
     def itemInsertToTable(self, items):
         try:
             itemdata = []

@@ -72,7 +72,7 @@ class BaiduSpider(object):
                 data = (self.sid, self.wordsCode, item['title'], item['detailurl'], item['source'], item['publishtime'], item['content'], item['contentHtml'], '1', item['kword'], nowtime)
                 itemdata.append(data)
-            sql = "INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
+            sql = "INSERT into meta_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
             cursorM.executemany(sql, itemdata)
             self.logger.info("数据插入数据库成功!")
             # 定义插入数据的SQL语句

@@ -115,8 +115,8 @@ class BaiduSpider(object):
     def xpath_paser(self, html):
         lists = []
-        itemTag = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
-        for itemTag in itemTag:
+        itemTags = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
+        for itemTag in itemTags:
             try:
                 title = itemTag.xpath('.//h3[@class="news-title_1YtI1 "]/a/text()')[0]
             except Exception as e:

@@ -487,7 +487,7 @@ class BaiduSpider(object):
                 "source": "3",
                 "title": bdetail['title'],
                 "content": bdetail['content'],
-                "contentWithtag": bdetail['contentHtml'],
+                "contentWithTag": bdetail['contentHtml'],
                 "origin": bdetail['source'],
                 "publishDate": bdetail['publishtime'],
                 "sourceAddress": bdetail['detailurl'],
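itemInsertToTable still batches rows with executemany, but the target table is now meta_search_result. A sketch of that batch insert (connection parameters and the sample row are placeholders; the column list matches the SQL in the diff):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='user', password='pass',
                       database='spider', charset='utf8mb4')
sql = ("INSERT into meta_search_result "
       "(sid,wordsCode,title,detailurl,origin,publishdate,content,"
       "content_with_tag,state,keyword,create_time) "
       "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
rows = [('sid-1', 'wordsCode-1', 'title', 'https://example.com/a', 'origin',
         '2023-10-25 00:00:00', 'content', '<p>content</p>', '1', 'keyword',
         '2023-10-25 00:00:00')]
with conn.cursor() as cursor:
    cursor.executemany(sql, rows)     # one round trip for the whole batch
conn.commit()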
百度采集/baidu_comm/baidutaskJob_comm.py

@@ -200,7 +200,7 @@ if __name__ == '__main__':
                 continue
         if kwList:
             # 创建一个线程池,指定线程数量为4
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                 # 提交任务给线程池,每个任务处理一个数据
                 results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                 # 获取任务的执行结果
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论