Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
d9bf9b2f
提交
d9bf9b2f
authored
1月 18, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
1/18
上级
654d0ce5
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
145 行增加
和
31 行删除
+145
-31
Edge_pyautogui.py
Translate/Edge_pyautogui.py
+92
-0
requestQCC.py
comData/BaseInfo_qcc/requestQCC.py
+1
-1
CorePerson.py
comData/Tyc/CorePerson.py
+5
-4
config.ini
sougou_comm/config.ini
+4
-2
sougouSpider.py
sougou_comm/sougouSpider.py
+37
-19
sougoutaskJob_loc.py
sougou_comm/sougoutaskJob_loc.py
+6
-5
没有找到文件。
Translate/Edge_pyautogui.py
0 → 100644
浏览文件 @
d9bf9b2f
import
pyautogui
from
retry
import
retry
from
selenium
import
webdriver
from
selenium.webdriver.common.action_chains
import
ActionChains
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.wait
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
import
time
from
bson
import
ObjectId
import
pymongo
# Get the title of the currently active (foreground) window.
def get_active_window_title():
    """Return the title of the currently active window, or None if there is none.

    The title is also echoed to stdout for debugging. The window object is
    checked *before* its ``title`` attribute is read, so a missing active
    window yields ``None`` instead of raising ``AttributeError``.
    """
    window = pyautogui.getActiveWindow()
    if not window:
        # No foreground window is available; nothing to report.
        return None
    print(f'当前活动窗口的标题是:{window.title}')
    return window.title
@retry(tries=3, delay=1)
def Translate(_id, driver):
    """Translate the local HTML page via the Edge context menu and save the result.

    Loads ``aaa.html`` from the desktop, opens the browser context menu on the
    page body, picks the translate entry with simulated key presses, scrolls
    through the page so every part gets rendered, and finally writes the
    resulting page source to ``<_id>.html`` on the desktop.

    Args:
        _id: Identifier used in log messages and as the output file name.
        driver: Selenium WebDriver instance controlling the browser.

    Raises:
        RuntimeError: If the first 500 characters of the page text are still
            contained in the original text after the translation attempt.
            The ``@retry`` decorator re-invokes the function on failure.
    """
    driver.get('file:///C:/Users/EDY/Desktop/aaa.html')
    # Remember the untranslated text so we can tell whether translation happened.
    original_text = driver.find_element(By.TAG_NAME, 'body').text
    driver.maximize_window()
    # Switch to the Edge browser window.
    driver.switch_to.window(driver.current_window_handle)
    # Wait a while to make sure the page has finished loading.
    time.sleep(5)
    # Keep the handle of the Edge browser window for the failure path below.
    edge_handle = driver.current_window_handle

    # Right-click the page body and choose "translate" from the context menu.
    # The context menu is native UI, so it is driven with pyautogui key presses.
    position_element = driver.find_element(By.TAG_NAME, 'body')
    ActionChains(driver).context_click(position_element).perform()
    time.sleep(1)
    # NOTE(review): the translate entry is assumed to be the 6th menu item.
    pyautogui.typewrite(['down'] * 6)
    pyautogui.typewrite(["enter"])

    # Scroll down in 300px steps, pausing one second at each position.
    js = "return action=document.body.scrollHeight"
    new_height = driver.execute_script(js)
    for offset in range(0, new_height, 300):
        driver.execute_script('window.scrollTo(0,%s)' % (offset))
        time.sleep(1)
    time.sleep(2)

    if driver.find_element(By.TAG_NAME, 'body').text[:500] in original_text:
        print(f'{_id}---翻译失败,重试')
        # Simulate Alt+Tab until the Edge browser is the foreground window.
        # The title may be None when no window is active, hence the `or ''`.
        # NOTE(review): this loop has no upper bound; confirm that is acceptable.
        while 'Microsoft Edge' not in (get_active_window_title() or ''):
            pyautogui.hotkey('alt', 'tab')
            print('窗口切换操作')
        # Switch back to the Edge browser window and reload before retrying.
        driver.switch_to.window(edge_handle)
        driver.refresh()
        # A bare `raise` here had no active exception and only worked by
        # accident (it produces RuntimeError); raise the same type explicitly.
        raise RuntimeError(f'{_id} translation did not take effect')

    from bs4 import BeautifulSoup
    page_source = driver.page_source
    contentWithTag = BeautifulSoup(page_source, 'html.parser')
    with open(rf'C:\Users\EDY\Desktop\{_id}.html', 'w', encoding='utf-8') as f:
        f.write(str(contentWithTag))
if __name__ == "__main__":
    # Script entry point: fetch up to 10 documents with postCode '2', write
    # each document's foreign-language rich text to aaa.html on the desktop
    # and run the browser-driven translation on it.
    driver = webdriver.Edge()
    try:
        # NOTE(review): the MongoDB host and credentials are hard-coded in
        # source; they should be moved to configuration/environment and rotated.
        db_storage = pymongo.MongoClient(
            'mongodb://114.115.221.202:27017/',
            username='admin',
            password='ZZsn@9988',
        ).中科软['数据源_0106']
        datas = db_storage.find({'postCode': '2'}).limit(10)
        for data in datas:
            now = time.time()
            _id = str(data['_id'])
            richTextForeign = data['richTextForeign']
            # Translate() always loads this fixed file, so overwrite it per document.
            with open(r'C:\Users\EDY\Desktop\aaa.html', 'w', encoding='utf-8') as f:
                f.write(str(richTextForeign))
            try:
                Translate(_id, driver)
            except Exception as exc:
                # Best-effort: keep processing the remaining documents, but do
                # not swallow KeyboardInterrupt/SystemExit and do report the cause.
                print('翻译失败')
                print(f'{_id} error: {exc!r}')
            print(f'{_id}翻译用时--{time.time() - now}')
    finally:
        # Always release the browser and its driver process.
        driver.quit()
\ No newline at end of file
comData/BaseInfo_qcc/requestQCC.py
浏览文件 @
d9bf9b2f
...
@@ -48,7 +48,7 @@ if __name__ == "__main__":
...
@@ -48,7 +48,7 @@ if __name__ == "__main__":
# soup = BeautifulSoup(page_source,'html.parser')
# soup = BeautifulSoup(page_source,'html.parser')
# print(soup)
# print(soup)
browser
.
find_element
(
By
.
CLASS_NAME
,
'nav-item'
)
.
click
()
browser
.
find_element
(
By
.
CLASS_NAME
,
'nav-item'
)
.
click
()
time
.
sleep
(
2
0
)
time
.
sleep
(
7
0
)
cookies
=
flushAndGetToken
()
cookies
=
flushAndGetToken
()
cookies
=
json
.
dumps
(
cookies
)
cookies
=
json
.
dumps
(
cookies
)
insert
=
f
"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
insert
=
f
"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
...
...
comData/Tyc/CorePerson.py
浏览文件 @
d9bf9b2f
...
@@ -41,17 +41,18 @@ def doJob():
...
@@ -41,17 +41,18 @@ def doJob():
baseCore
.
rePutIntoR
(
'CorPersonEnterprise:gnqy_socialCode'
,
social_code
)
baseCore
.
rePutIntoR
(
'CorPersonEnterprise:gnqy_socialCode'
,
social_code
)
continue
continue
id
=
data
[
0
]
id
=
data
[
0
]
com_name
=
data
[
1
]
xydm
=
data
[
2
]
xydm
=
data
[
2
]
tycid
=
data
[
11
]
tycid
=
data
[
11
]
if
tycid
==
None
or
tycid
==
''
:
if
tycid
==
None
or
tycid
==
''
:
try
:
try
:
retData
=
getTycIdByXYDM
(
xydm
)
retData
=
getTycIdByXYDM
(
com_name
)
if
retData
[
'state'
]:
if
retData
[
'state'
]:
tycid
=
retData
[
'tycData'
][
'id'
]
tycid
=
retData
[
'tycData'
][
'id'
]
# # todo:写入数据库
# # todo:写入数据库
#
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
updateSql
=
f
"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
#
cursor_.execute(updateSql)
cursor_
.
execute
(
updateSql
)
#
cnx_.commit()
cnx_
.
commit
()
else
:
else
:
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
...
...
sougou_comm/config.ini
浏览文件 @
d9bf9b2f
...
@@ -16,6 +16,8 @@ topic=keyWordsInfo
...
@@ -16,6 +16,8 @@ topic=keyWordsInfo
groupId
=
python_sougou
groupId
=
python_sougou
[selenium]
[selenium]
chrome_driver
=
C:
\U
sers
\W
IN10
\D
ataspellProjects
\c
rawlerProjectDemo
\t
mpcrawler
\c
md100
\c
hromedriver.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location
=
D:
\c
rawler
\b
aidu_crawler
\t
ool
\G
oogle
\C
hrome
\A
pplication
\c
hrome.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver
=
D:
\c
md100
\c
hromedriver.exe
binary_location
=
D:
\G
oogle
\C
hrome
\A
pplication
\c
hrome.exe
sougou_comm/sougouSpider.py
浏览文件 @
d9bf9b2f
...
@@ -7,6 +7,7 @@ import urllib3
...
@@ -7,6 +7,7 @@ import urllib3
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
gne
import
GeneralNewsExtractor
from
gne
import
GeneralNewsExtractor
from
langid
import
langid
from
langid
import
langid
from
retry
import
retry
from
selenium
import
webdriver
from
selenium
import
webdriver
from
selenium.webdriver.chrome.service
import
Service
from
selenium.webdriver.chrome.service
import
Service
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.common.by
import
By
...
@@ -144,7 +145,14 @@ class SougouSpider(object):
...
@@ -144,7 +145,14 @@ class SougouSpider(object):
itemTags
=
html
.
xpath
(
'//div[@class="vrwrap"]'
)
itemTags
=
html
.
xpath
(
'//div[@class="vrwrap"]'
)
for
itemTag
in
itemTags
:
for
itemTag
in
itemTags
:
try
:
try
:
title
=
itemTag
.
xpath
(
'.//h3[@class="vr-title"]/a/text()'
)[
0
]
elements
=
itemTag
.
xpath
(
'.//h3[@class="vr-title"]/a/text()'
)
title
=
''
.
join
(
str
(
element
.
strip
())
for
element
in
elements
if
element
.
strip
())
# title = ''
# for e in elements:
# print(e)
# title += e
print
(
title
)
except
Exception
as
e
:
except
Exception
as
e
:
title
=
''
title
=
''
try
:
try
:
...
@@ -243,9 +251,10 @@ class SougouSpider(object):
...
@@ -243,9 +251,10 @@ class SougouSpider(object):
print
(
'时间解析异常!!'
)
print
(
'时间解析异常!!'
)
return
publishtime
return
publishtime
@retry
(
tries
=
3
,
delay
=
3
)
# 获取每一页数据, 开趴.
# 获取每一页数据, 开趴.
def
get_page_html
(
self
):
def
get_page_html
(
self
):
self
.
logger
.
info
(
"
进入搜狗首页..."
)
self
.
logger
.
info
(
f
"{self.searchkw}...
进入搜狗首页..."
)
self
.
driver
.
get
(
self
.
url
)
self
.
driver
.
get
(
self
.
url
)
self
.
driver
.
find_element
(
By
.
ID
,
'query'
)
.
send_keys
(
self
.
searchkw
)
self
.
driver
.
find_element
(
By
.
ID
,
'query'
)
.
send_keys
(
self
.
searchkw
)
self
.
driver
.
find_element
(
By
.
ID
,
'stb'
)
.
click
()
self
.
driver
.
find_element
(
By
.
ID
,
'stb'
)
.
click
()
...
@@ -280,7 +289,7 @@ class SougouSpider(object):
...
@@ -280,7 +289,7 @@ class SougouSpider(object):
timeFlag
=
False
timeFlag
=
False
while
hasnext
==
'下一页'
:
while
hasnext
==
'下一页'
:
try
:
try
:
if
self
.
page_num
==
2
:
if
self
.
page_num
==
21
:
break
break
self
.
page_num
=
self
.
page_num
+
1
self
.
page_num
=
self
.
page_num
+
1
self
.
logger
.
info
(
"开始抓取第
%
s页..."
%
self
.
page_num
)
self
.
logger
.
info
(
"开始抓取第
%
s页..."
%
self
.
page_num
)
...
@@ -302,6 +311,7 @@ class SougouSpider(object):
...
@@ -302,6 +311,7 @@ class SougouSpider(object):
# if pubtime < needTime:
# if pubtime < needTime:
# timeFlag = True
# timeFlag = True
# break
# break
durl
=
detail
[
'detailUrl'
]
is_member
=
self
.
r
.
sismember
(
'pysougou_'
+
self
.
wordsCode
,
durl
)
is_member
=
self
.
r
.
sismember
(
'pysougou_'
+
self
.
wordsCode
,
durl
)
if
is_member
:
if
is_member
:
continue
continue
...
@@ -325,6 +335,8 @@ class SougouSpider(object):
...
@@ -325,6 +335,8 @@ class SougouSpider(object):
def
getDetailmsg
(
self
,
detailmsg
):
def
getDetailmsg
(
self
,
detailmsg
):
try
:
try
:
detailurl
=
detailmsg
[
'detailUrl'
]
detailurl
=
detailmsg
[
'detailUrl'
]
if
detailurl
==
''
:
return
''
title
=
detailmsg
[
'title'
]
title
=
detailmsg
[
'title'
]
content
,
contentWithTag
=
self
.
extractorMsg
(
detailurl
,
title
)
content
,
contentWithTag
=
self
.
extractorMsg
(
detailurl
,
title
)
contentWithTag
=
self
.
rmTagattr
(
contentWithTag
,
detailurl
)
contentWithTag
=
self
.
rmTagattr
(
contentWithTag
,
detailurl
)
...
@@ -350,6 +362,7 @@ class SougouSpider(object):
...
@@ -350,6 +362,7 @@ class SougouSpider(object):
}
}
return
detailmsg
return
detailmsg
@retry
(
tries
=
3
,
delay
=
2
)
def
webDriver
(
self
,
url
):
def
webDriver
(
self
,
url
):
chrome_driver
=
self
.
config
.
get
(
'selenium'
,
'chrome_driver'
)
chrome_driver
=
self
.
config
.
get
(
'selenium'
,
'chrome_driver'
)
path
=
Service
(
chrome_driver
)
path
=
Service
(
chrome_driver
)
...
@@ -360,12 +373,12 @@ class SougouSpider(object):
...
@@ -360,12 +373,12 @@ class SougouSpider(object):
try
:
try
:
driver
.
get
(
url
)
driver
.
get
(
url
)
# 等待页面加载完成
# 等待页面加载完成
#
wait = WebDriverWait(self.driver, 20)
wait
=
WebDriverWait
(
self
.
driver
,
20
)
#
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
wait
.
until
(
EC
.
presence_of_element_located
((
By
.
TAG_NAME
,
"body"
)))
time
.
sleep
(
2
)
time
.
sleep
(
2
)
html
=
driver
.
page_source
html
=
driver
.
page_source
except
Exception
as
e
:
except
Exception
as
e
:
self
.
logger
.
info
(
'请求失败
'
)
self
.
logger
.
info
(
f
'请求失败{e}
'
)
finally
:
finally
:
driver
.
quit
()
driver
.
quit
()
...
@@ -406,11 +419,12 @@ class SougouSpider(object):
...
@@ -406,11 +419,12 @@ class SougouSpider(object):
# current_window = self.driver.current_window_handle
# current_window = self.driver.current_window_handle
while
True
:
while
True
:
if
self
.
detailList
.
qsize
()
!=
0
:
if
self
.
detailList
.
qsize
()
!=
0
:
detailmsg
=
self
.
detailList
.
get
()
title
=
detailmsg
[
'title'
]
detailUrl
=
detailmsg
[
'detailUrl'
]
self
.
logger
.
info
(
"
%
s:
%
s
\n
"
%
(
title
,
detailUrl
))
try
:
try
:
detailmsg
=
self
.
detailList
.
get
()
title
=
detailmsg
[
'title'
]
detailUrl
=
detailmsg
[
'detailUrl'
]
print
(
"
%
s:
%
s
\n
"
%
(
title
,
detailUrl
))
# # js = "window.open('"+detailUrl+"')"
# # js = "window.open('"+detailUrl+"')"
# # self.driver.execute_script(js)
# # self.driver.execute_script(js)
# try:
# try:
...
@@ -423,19 +437,23 @@ class SougouSpider(object):
...
@@ -423,19 +437,23 @@ class SougouSpider(object):
# response = self.driver.page_source
# response = self.driver.page_source
# bdetail=self.getDetailmsg(response,detailmsg)
# bdetail=self.getDetailmsg(response,detailmsg)
bdetail
=
self
.
getDetailmsg
(
detailmsg
)
bdetail
=
self
.
getDetailmsg
(
detailmsg
)
if
not
bdetail
:
continue
processitem
=
self
.
getProcessitem
(
bdetail
)
processitem
=
self
.
getProcessitem
(
bdetail
)
try
:
try
:
#
self.sendkafka(processitem)
self
.
sendkafka
(
processitem
)
self
.
r
.
sadd
(
'pysougou_'
+
self
.
wordsCode
,
processitem
[
'sourceAddress'
])
self
.
r
.
sadd
(
'pysougou_'
+
self
.
wordsCode
,
processitem
[
'sourceAddress'
])
# 插入数据库
try
:
items
=
[]
items
.
append
(
bdetail
)
self
.
itemInsertToTable
(
items
)
except
Exception
as
e
:
self
.
logger
.
info
(
f
"插入数据库失败!{bdetail['kword']}===={detailUrl}"
)
self
.
logger
.
info
(
f
"放入kafka成功!{bdetail['kword']}===={detailUrl}"
)
except
Exception
as
e
:
except
Exception
as
e
:
self
.
logger
.
info
(
"放入kafka失败!"
)
self
.
logger
.
info
(
f
"放入kafka失败!{bdetail['kword']}===={detailUrl}"
)
#插入数据库
try
:
items
=
[]
items
.
append
(
bdetail
)
self
.
itemInsertToTable
(
items
)
except
Exception
as
e
:
self
.
logger
.
info
(
"插入数据库失败!"
)
# 关闭当前新窗口
# 关闭当前新窗口
# self.driver.close()
# self.driver.close()
time
.
sleep
(
1
)
time
.
sleep
(
1
)
...
...
sougou_comm/sougoutaskJob_loc.py
浏览文件 @
d9bf9b2f
...
@@ -218,12 +218,13 @@ if __name__ == '__main__':
...
@@ -218,12 +218,13 @@ if __name__ == '__main__':
while
True
:
while
True
:
try
:
try
:
codeList
=
[]
codeList
=
[]
codeList
.
append
(
'KW-20231013-0001'
)
# codeList.append('KW-20231013-0001')
codeList
.
append
(
'KW-20240116-0001'
)
for
codeid
in
codeList
:
for
codeid
in
codeList
:
try
:
try
:
#
keymsg=sougouTaskJob.getkeyFromredis(codeid)
keymsg
=
sougouTaskJob
.
getkeyFromredis
(
codeid
)
#
kwList=sougouTaskJob.paserKeyMsg(keymsg)
kwList
=
sougouTaskJob
.
paserKeyMsg
(
keymsg
)
kwList
=
sougouTaskJob
.
lockwMsg
()
#
kwList=sougouTaskJob.lockwMsg()
if
len
(
kwList
)
<
1
:
if
len
(
kwList
)
<
1
:
continue
continue
logger
.
info
(
f
"需要搜索的关键词:{kwList}"
)
logger
.
info
(
f
"需要搜索的关键词:{kwList}"
)
...
@@ -233,7 +234,7 @@ if __name__ == '__main__':
...
@@ -233,7 +234,7 @@ if __name__ == '__main__':
continue
continue
if
kwList
:
if
kwList
:
# 创建一个线程池,指定线程数量为4
# 创建一个线程池,指定线程数量为4
with
concurrent
.
futures
.
ThreadPoolExecutor
(
max_workers
=
1
)
as
executor
:
with
concurrent
.
futures
.
ThreadPoolExecutor
(
max_workers
=
3
)
as
executor
:
# 提交任务给线程池,每个任务处理一个数据
# 提交任务给线程池,每个任务处理一个数据
results
=
[
executor
.
submit
(
sougouTaskJob
.
runLocSpider
,
data
)
for
data
in
kwList
]
results
=
[
executor
.
submit
(
sougouTaskJob
.
runLocSpider
,
data
)
for
data
in
kwList
]
# 获取任务的执行结果
# 获取任务的执行结果
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论