zzsn · Commits · e8838732

Commit e8838732, authored Aug 02, 2024 by LiuLiYuan (刘力源)
Commit message: lly 2024-08-02
Parent: 63f0268b

Showing 7 changed files with 239 additions and 120 deletions
dingzhi/trandingEconomics.py    +69 -44
xmzx/aieco.py                   +1 -1
xmzx/config.ini                 +34 -23
xmzx/crbc.py                    +7 -7
xmzx/intlGcAvic.py              +2 -1
xmzx/test.py                    +108 -27
xmzx/testAll.py                 +18 -17
dingzhi/trandingEconomics.py
@@ -6,21 +6,26 @@ import re
import time
import urllib.parse
from urllib.parse import urljoin
import sys
import pymongo
import requests
import subprocess
from functools import partial
import traceback
from bs4 import BeautifulSoup
from retry import retry
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# sys.path.append('D:\\zzsn_spider\\base')
from base import BaseCore
subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')
import execjs
from base import BaseCore
-baseCore = BaseCore.BaseCore(sqlflg=False)
+baseCore = BaseCore.BaseCore(sqlFlg=False)
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter['tradingEconomics']
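Review note: the `subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')` line pre-binds a keyword onto every later Popen call, presumably so that PyExecJS's Node child process exchanges UTF-8 text instead of the platform default codec (GBK on a Chinese-locale Windows host). A minimal, self-contained sketch of the same trick, independent of this repository:

import subprocess
import sys
from functools import partial

# Rebind Popen so every child-process pipe is opened in text mode as UTF-8.
# The partial captures the original Popen object, so there is no recursion.
subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')

proc = subprocess.Popen([sys.executable, '-c', 'print("ok")'], stdout=subprocess.PIPE)
out, _ = proc.communicate()
print(type(out), out.strip())   # <class 'str'> ok  -- already decoded, not bytes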
@@ -60,7 +65,7 @@ def paserUrl(html, listurl):
@retry(tries=2, delay=5)
def getSoup(url):
-    req = requests.get(url, headers=headers)
+    req = requests.get(url, headers=headers, timeout=20)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'lxml')
    soup = paserUrl(soup, url)
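Review note: adding timeout=20 matters because the function is wrapped in @retry: a request that would otherwise hang forever now raises requests.exceptions.Timeout, and the decorator from the retry package re-runs the whole function after the configured delay. A minimal sketch of that pairing (the fetch helper below is hypothetical, not part of this repository):

import requests
from retry import retry

@retry(tries=2, delay=5)                   # rerun the whole function on any exception, 5 s apart
def fetch(url):
    resp = requests.get(url, timeout=20)   # fail fast instead of blocking on a dead host
    resp.raise_for_status()
    return resp.text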
@@ -74,8 +79,6 @@ def getCountries():
    soup = getSoup(url)
    div_list = soup.select('#ctl00_ContentPlaceHolder1_ctl01_tableCountries > div')
    for div_tag in div_list:
        if 'G20' in div_tag.text:
            continue
        li_list = div_tag.select('> ul > li')
        for li_tag in li_list:
            if 'active' in li_tag['class']:
@@ -91,18 +94,14 @@ def getIndex(country, url):
    soup = getSoup(url)
    li_list = soup.find('div', class_='pagetabs').find('ul', attrs={'id': 'pagemenutabs'}).find_all('li', class_='nav-item')
    div_list = soup.find('div', class_='tab-content').find_all('div', class_='tab-pane')
    # for i in range(len(li_list)):
    #     li_tag = li_list[i]
    #     if 'Overview' in li_tag.find('a').text.strip():
    #         del div_list[i]
    #         break
    for i in range(len(li_list)):
        li_tag = li_list[i]
        if 'Overview' in li_tag.find('a').text.strip():
            del div_list[i]
            break
    for i in range(len(div_list)):
        div_tag = div_list[i]
        try:
            tr_list = div_tag.find('table').find('tbody').find_all('tr')
        except:
            print(url, i, sep='===')
            continue
        for tr_tag in tr_list:
            option = tr_tag.find('td').find('a').text.strip()
            href = tr_tag.find('td').find('a').get('href')
@@ -124,24 +123,32 @@ def getTimeKey(timeType, calendarStr):
        timeKey = calendarStr[:4] + 'Q4'
    elif timeType == 'yearly':
        timeKey = calendarStr[:4]
    elif timeType == 'weekly':
        date_obj = datetime.datetime.strptime(calendarStr, '%Y-%m-%d')
        year, week, day = date_obj.isocalendar()
        if len(str(week)) == 1:
            week = f'0{week}'
        timeKey = f'{year}{week}'
    else:
-        timeKey = False
+        timeKey = calendarStr.replace('-', '')
    return timeKey


+@retry(tries=3, delay=5)
+def decrypt(value, js_runtime):
+    dataJson = js_runtime.call('doJob', value, key)
+    return dataJson


@retry(tries=3, delay=10)
-def getJson(url):
+def getJson(url, js_runtime):
    req = requests.get(url, headers=headers, timeout=20, verify=False)
    value = req.text.replace('"', '')
    req.close()
-    with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
-        js = f.read()
-    js_runtime = execjs.compile(js)
    try:
-        dataJson = js_runtime.call('doJob', value, key)
-    except:
-        print(value)
+        dataJson = decrypt(value, js_runtime)
+    except Exception as e:
+        log.error(e)
+        raise
    if dataJson:
        try:
            dataJson = json.loads(dataJson)[0]
        except:
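Review note: this refactor stops re-reading and re-compiling trandingEconomics.js inside every getJson call; the caller compiles the script once with execjs and passes the runtime in, and the JS call itself moves into a small decrypt() helper with its own @retry. A stripped-down sketch of that pattern — the file name, the doJob entry point and the key argument follow this project's conventions and are not a general API:

import execjs
from retry import retry

with open('./trandingEconomics.js', 'r', encoding='utf-8') as f:
    js_runtime = execjs.compile(f.read())      # compile once, reuse for every payload

@retry(tries=3, delay=5)
def decrypt(value, js_runtime, key):
    # hand the obfuscated payload to the bundled JS decoder; retried on failure
    return js_runtime.call('doJob', value, key)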
@@ -149,7 +156,8 @@ def getJson(url):
    return dataJson


-def getData(url):
+def getData(url, js_runtime):
    type_list = ['1w', '1m', '6m', '1y', '5y', '10y', '25y', '50y', 'max', 'all']
    createTime = datetime.datetime.now()
    createTimeStr = createTime.strftime('%Y-%m-%d')
    soup = getSoup(url)
@@ -161,7 +169,7 @@ def getData(url):
            break
    else:
        log.error(f'数据链接获取失败==={url}')
-        return
+        return False
    for script in scripts:
        if 'TEChartsToken' in script.text:
            TEChartsToken = re.findall('TEChartsToken = \'(.*?)\'', script.text)[0]
@@ -169,10 +177,18 @@ def getData(url):
            break
    else:
        log.error(f'数据链接获取失败==={url}')
-        return
+        return False
    TESymbol = TESymbol.lower()
-    href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span=max&v=20240102145900&key={TEChartsToken}'
-    dataJson = getJson(href)
+    for type in type_list:
+        href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span={type}&v=20240102145900&key={TEChartsToken}'
+        try:
+            dataJson = getJson(href, js_runtime)
+        except Exception as e:
+            log.error(f'{type}===数据请求失败==={e}')
+            return False
        # series = dataJson['series'][-10:]
+        if not dataJson:
+            continue
        series = dataJson['series']
        for serie_ in series:
            serie = serie_['serie']
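Review note: the rewritten loop fetches one chart per span in type_list instead of only span=max, and urllib.parse.quote keeps symbols with spaces or other unsafe characters URL-safe. A rough sketch of the URL construction — the symbol and token values below are placeholders; only the host and query layout come from the diff:

import urllib.parse

type_list = ['1w', '1m', '6m', '1y', '5y', '10y', '25y', '50y', 'max', 'all']
TESymbol = 'example symbol'        # placeholder
TEChartsToken = 'example-token'    # placeholder

for span in type_list:
    href = (f'https://d3ii0wo49og5mi.cloudfront.net/economics/'
            f'{urllib.parse.quote(TESymbol)}?&span={span}&v=20240102145900&key={TEChartsToken}')
    # each href is then passed to getJson(href, js_runtime) in the code above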
@@ -202,9 +218,10 @@ def getData(url):
                }
                if db_storage.find_one({'country': country, 'indicators': indicators, 'timeType': timeType, 'calendarStr': calendarStr}):
                    log.info(f'{country}==={indicators}==={calendarStr}===已采集')
-                    break
+                    continue
                db_storage.insert_one(dic)
                log.info(f'{country}==={indicators}==={calendarStr}===入库成功')
+    return True


def doJob():
@@ -215,10 +232,22 @@ def doJob():
            info = f'{index[0]}|{index[1]}|{index[2]}'
            baseCore.r.rpush('trandingEconomics:info', info)
    log.info('数据已全部放入redis中')
    # log.info(f'开始采集==={index[0]}==={index[1]}')
    # getData(index[2])


def doJobA():
+    try:
+        with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
+            js = f.read()
+        execjs.get('Node')
+        js_runtime = execjs.compile(js)
+    except:
+        return
    errorNum = 0
    while True:
        if errorNum > 10:
            break
        info = baseCore.r.blpop(['trandingEconomics:info'], 2)
        if not info:
            log.info('数据已全部采集完成')
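Review note: doJob() and doJobA() now form a small producer/consumer pair over Redis. The producer rpushes one 'country|indicator|url' string per task onto the trandingEconomics:info list, and the consumer blpops with a 2-second timeout so it can stop once the queue drains. A minimal sketch using redis-py directly (baseCore.r is presumably such a client; the payload below is hypothetical):

import redis

r = redis.Redis(host='localhost', port=6379)

# producer side (doJob): queue one task per indicator
r.rpush('trandingEconomics:info', 'Peru|Currency|https://example.com/peru/currency')

# consumer side (doJobA): block for at most 2 seconds waiting for the next task
item = r.blpop(['trandingEconomics:info'], 2)
if item:
    _, payload = item                     # (list name, value), both bytes by default
    country, indicator, url = payload.decode().split('|')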
@@ -228,25 +257,21 @@ def doJobA():
        country = info.split('|')[0]
        index = info.split('|')[1]
        url = info.split('|')[2]
        if url.endswith('/rating'):
            continue
        log.info(f'开始采集==={country}==={index}')
        try:
-            getData(url)
+            if getData(url, js_runtime):
+                pass
+            else:
+                errorNum += 1
+                baseCore.r.rpush('trandingEconomics:info', info)
        except Exception as e:
            errorNum += 1
            traceback.print_exc()
            log.error(f'{country}==={index}===采集失败==={e}')
            baseCore.r.rpush('trandingEconomics:info', info)


if __name__ == "__main__":
    # testA()
    # doJob()
    # Peru===Currency
    # Bulgaria===Stock Market
    # Bulgaria===Interest Rate
    # infoLens = baseCore.r.llen('trandingEconomics:info')
    # print(infoLens)
-    jiami = 'a/lpZGluZ2VjbOvgLCKnQz3mlyvrByT3TUWWTZ/Bt9RVQx5xnQYsCU4fVSSo3ZGypEPLdDwREI65v+hkHO32iRzgmdYJc3AZFO6drPcW7yzvT7ovG7g4qxA1n3kxhiEQ808R90cOX+DZdz2H+xeTxuDmi/Un7sLeUZCPe3TS0sayhyPwOhUjXx/fFk2agaDz4pU0xWL34265lqd4zZSkAwwcpX/eLI5BvDEHKP61naRAHNgUIaX1g9DoyYzV9Mi6bu7gvSDvpkcvwyQ6WiOaoSpjI4vK2Kdt2SgJu92zedyrjpmpSjFfjnEf2Y6tIjgTY480acLmcWkt'
-    with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
-        js = f.read()
-    js_runtime = execjs.compile(js)
-    dataJson = js_runtime.call('doJob', jiami, key)
-    print(dataJson)
+    doJobA()
xmzx/aieco.py
@@ -57,7 +57,7 @@ def doJob():
            origin = dataJson['source']
            if db_stroage.find_one({'原文链接': href}):
                continue
-            if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
+            if publishDate > '2022-12-31 23:59:59' or publishDate < '2022-01-01 00:00:00':
                continue
            log.info(f'开始采集==={title}')
            getDic(href, title, publishDate, origin, type[0])
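Review note: the only change here narrows the harvest window from 2023 to 2022. The comparison works because the timestamps are zero-padded 'YYYY-MM-DD HH:MM:SS' strings, for which lexicographic order equals chronological order, e.g.:

publishDate = '2022-07-15 08:30:00'
assert '2022-01-01 00:00:00' <= publishDate <= '2022-12-31 23:59:59'   # plain string comparison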
xmzx/config.ini
[doJob]
-;是否开始采集
+;验证列表页
flg = False
+;验证详情页
+insertFlg = False
+;flg为False时,验证列表页
+;flg为True,insertFlg为False时,验证详情页
+;都为True时,采集数据
;页码
[page]
;网页爬取页数的起始与末尾
;需要从链接中判断
-begin = 1
+begin = 0
end = 3
;链接
[home]
;网站名称
origin = 贸易投资网-贸易商机
;首页链接是否与后续链接有不同:是为True;不是为False
-urlFlg = False
+urlFlg = True
;如果首页链接与后续链接不同,需要填写该值
;样例 http://www.cgcjm.cecep.cn/g9222.aspx
-urlBegin = http://www.aieco.org/article/ywfc
+urlBegin = http://www.camce.com.cn/xbcn/xwzx/xmdt/index.html
;如果首页链接与后续链接不同,需要填写该值
-urlBeginNum = 1
+urlBeginNum = 0
;爬取网站链接的通用格式,页码处使用{}代替
;样例 http://www.cgcjm.cecep.cn/g9222/m17246/mp{}.aspx
-url = https://www.tradeinvest.cn/trade/list?page={}&title=&industryType=&transactionType=&targetMarket=
+url = http://www.camce.com.cn/xbcn/xwzx/xmdt/index_{}.html
;列表页
[homeSelect]
;资讯列表的select
-data_info_list = body > div.container > div > div > div
+data_info_list = body > div:nth-of-type(3) > div > div.sidebarR > ul > li
;标题所在 select
-title = h5
+title = a
;发布时间不在列表页显示 该值需要置为False
publishDateFlg = True
-publishDate = span.format-datetime
+publishDate = span
;获取到时间的格式
publishDateType = %%Y-%%m-%%d
;链接所在 select
href = a
;详情页
[detailSelect]
;正文
-contentWithTag = #articleBody
+contentWithTag = #xwxq2 > div
;是否有来源
originFlg = True
;来源
-origin = #leftList > div.content_article_source > table > tr > td.first
+origin = body > div:nth-of-type(3) > div > div.sidebarR > div.xwxq > div > form > table > tbody > tr > td:nth-of-type(2)
;发布时间 如果home中的publishDateFlg为False才需要配置
publishDate = div.second-news-item-date
;获取到时间的格式
@@ -49,15 +61,14 @@ publishDateType = %%Y-%%m-%%d %%H:%%M:%%S
[headers]
;请求头信息
-Accept = text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
-Accept-Language = zh-CN,zh-TW;q=0.9,zh;q=0.8
-Connectio = keep-alive
-Sec-Fetch-Dest = document
-Sec-Fetch-Mode = navigate
-Sec-Fetch-Site = same-origin
-Sec-Fetch-User = ?1
-Upgrade-Insecure-Requests = 1
-User-Agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0
-sec-ch-ua = "Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"
-sec-ch-ua-mobile = ?0
-sec-ch-ua-platform = "Windows"
\ No newline at end of file
+Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
+Accept-Encoding: gzip, deflate
+Accept-Language: zh-CN,zh-TW;q=0.9,zh;q=0.8
+Cache-Control: max-age=0
+Connection: keep-alive
+Cookie: Hm_lvt_e54259884352514b610814aa18f84433=1722569732; HMACCOUNT=9222512DCF10CB7B; Hm_lpvt_e54259884352514b610814aa18f84433=1722569808
+Host: www.camce.com.cn
+If-Modified-Since: Mon, 15 Jul 2024 00:40:15 GMT
+If-None-Match: "7b53-61d3e78f899c0-gzip"
+Upgrade-Insecure-Requests: 1
+User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0
\ No newline at end of file
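Review note: two details of this file are easy to trip over. configparser's default interpolation treats % specially, which is why the strftime patterns are written as %%Y-%%m-%%d, and a headers section like the one above parses whether the delimiter is '=' or ':' (both are accepted by default). A small sketch of reading it back into a requests-style headers dict (hypothetical helper, not from this repository):

import configparser

config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')

date_format = config.get('homeSelect', 'publishDateType')   # '%%Y-%%m-%%d' comes back as '%Y-%m-%d'
headers = dict(config.items('headers'))                      # keys are lower-cased by configparser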
xmzx/crbc.py
@@ -54,13 +54,13 @@ def getDic(url, title, publishDate):
def doJob():
-    # for page in range(1, 13):
-    for page in range(0, 5):
-        if page == 0:
-            url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
-        else:
-            url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
-        # url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
+    for page in range(1, 13):
+        # for page in range(0,5):
+        #     if page == 0:
+        #         url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
+        #     else:
+        #         url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
+        url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
        soup = getSoup(url)
        liList = soup.find('ul', class_='right-column-list').find_all('li')
        for liTag in liList:
xmzx/intlGcAvic.py
@@ -50,7 +50,8 @@ def getDic(url, title, publishDate):
def doJob():
    for page in range(1, 4):
-        url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
+        # url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
+        url = ''
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'html.parser')
xmzx/test.py
@@ -8,12 +8,13 @@ from bs4 import BeautifulSoup
from base import BaseCore
from elasticsearch import Elasticsearch, helpers
-# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
-# db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
-baseCore = BaseCore.BaseCore()
+# # db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
+db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
db_stroageA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制']
db_stroageB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制_']
es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)
+baseCore = BaseCore.BaseCore()
from powerchina import db_stroage
@@ -63,8 +64,8 @@ def select(sid):
                    {"range": {"publishDate": {
-                        "gte": "2023-01-01T00:00:00",
-                        "lte": "2023-12-31T23:59:59"
+                        "gte": "2022-01-01T00:00:00",
+                        "lte": "2022-12-31T23:59:59"
                        }
                    }
                }
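Review note: select(sid) itself is outside this hunk; only the bool-query date range moves from 2023 to 2022. A hedged sketch of how such a query is typically issued with the elasticsearch client imported at the top of the file — the index name and sid filter below are guesses, not taken from the diff:

from elasticsearch import Elasticsearch, helpers

es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)

query = {"query": {"bool": {"must": [
    {"term": {"sid.keyword": "1810947397855879170"}},          # guessed filter field
    {"range": {"publishDate": {"gte": "2022-01-01T00:00:00",
                               "lte": "2022-12-31T23:59:59"}}}
]}}}

# helpers.scan pages through every matching document instead of stopping at 10 000 hits
for hit in helpers.scan(es_client, query=query, index="subjectdatabase"):   # index name is a guess
    print(hit['_source'].get('title'))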
@@ -191,29 +192,109 @@ def select(sid):
    # df = pd.DataFrame(data_list)
    # df.to_excel('./项目资讯-定制.xlsx', index=False)
-    sids = ['1811965474060091394', '1811960555242528770', '1811958016631644161', '1811955892663336962', '1811950817584857089', '1811939863161716737', '1811937580770402305', '1811933720142135297', '1811566665440186370', '1810983037486170113', '1810980529153966081', '1810978470438567938', '1810976012817707009', '1810972790830858242', '1810968708888068097', '1810960658496102401', '1810954505034969089', '1810947397855879170']
-    for sid in sids:
-        num = 0
-        datas = select(sid)
-        for data in datas:
-            num += 1
-            print(f'{sid}==={num}')
+    # sids = ['1811965474060091394',
+    #         '1811960555242528770',
+    #         '1811958016631644161',
+    #         '1811955892663336962',
+    #         '1811950817584857089',
+    #         '1811939863161716737',
+    #         '1811937580770402305',
+    #         '1811933720142135297',
+    #         '1811566665440186370',
+    #         '1810983037486170113',
+    #         '1810980529153966081',
+    #         '1810978470438567938',
+    #         '1810976012817707009',
+    #         '1810972790830858242',
+    #         '1810968708888068097',
+    #         '1810960658496102401',
+    #         '1810954505034969089',
+    #         '1810947397855879170']
+    # for sid in sids:
+    #     num = 0
+    #     datas = select(sid)
+    #     for data in datas:
+    #         num += 1
+    #         print(f'{sid}==={num}')


+# es获取数据
+def esData():
+    sql = 'select source_id from info_source_group_map where group_id="1697061836360126466"'
+    baseCore.cursor_.execute(sql)
+    datas = baseCore.cursor_.fetchall()
+    dics = []
+    urlList = []
+    for data in datas:
+        sid = data[0]
+        sqlSelect = f'select web_site_name from info_source where id="{sid}"'
+        baseCore.cursor_.execute(sqlSelect)
+        web = baseCore.cursor_.fetchone()[0]
+        results = select(sid)
+        for result in results:
+            try:
+                title = result['_source']['title']
+                publishDate = result['_source']['publishDate']
+                if len(publishDate) == 10:
+                    publishDate = publishDate + ' 00:00:00'
+                else:
+                    publishDate = publishDate.replace('T', ' ')
+                origin = result['_source']['origin']
+                content = result['_source']['content']
+                contentWithTag = result['_source']['contentWithTag']
+                url = result['_source']['sourceAddress']
+            except:
+                continue
+            if url in urlList:
+                continue
+            dic = {
+                '标题': title,
+                '发布时间': publishDate,
+                '来源': origin,
+                '网站': web,
+                '正文': content,
+                '正文带标签': str(contentWithTag),
+                '原文链接': url
+            }
+            dics.append(dic)
+            urlList.append(url)
+    df = pd.DataFrame(dics)
+    df.to_excel('./项目资讯-定制.xlsx', index=False)


+# 神采数据导出
+def shencai():
+    datas_ = []
+    datas = db_stroageShencaiB.find({'日期': {"$gte": '2022-01-01 00:00:00', "$lt": '2023-01-01 00:00:00'}})
+    for data in datas:
+        title = data['标题']
+        href = data['URL']
+        origin = data['来源']
+        web = data['栏目']
+        content = data['正文不带标签']
+        contentWithTag = data['内容']
+        publishDate = data['日期']
+        # print(publishDate)
+        datas_.append([title, publishDate, origin, web, content, contentWithTag, href])
+    df = pd.DataFrame(datas_, columns=['标题', '发布时间', '来源', '网站', '正文', '正文带标签', '原文链接'])
+    df.to_excel('./项目资讯-定制.xlsx', index=False)


+# mongodb 定制
+def dingzhi():
+    data_list = []
+    datas = db_stroageA.find()
+    for data in datas:
+        del data['_id']
+        db_stroageB.insert_one(data)
+        data_list.append(data)
+    df = pd.DataFrame(data_list)
+    df.to_excel('./项目资讯-定制.xlsx', index=False)


if __name__ == '__main__':
+    # esData()
+    # shencai()
+    # dingzhi()
+    pass
    baseCore.close()
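Review note: the three new helpers share one export pattern — flatten the records into a list of dicts (deduplicating on the source URL in esData), then hand the list to pandas and write a single Excel file. A condensed sketch of that pattern with a fabricated sample hit (writing .xlsx this way needs openpyxl installed):

import pandas as pd

hits = [{'_source': {'title': 't1', 'publishDate': '2022-03-01', 'origin': 'o1',
                     'content': 'c1', 'sourceAddress': 'http://example.com/1'}}]
seen, rows = set(), []
for hit in hits:
    src = hit['_source']
    url = src['sourceAddress']
    if url in seen:                      # skip duplicate articles found under several sources
        continue
    seen.add(url)
    publishDate = src['publishDate']
    if len(publishDate) == 10:           # date-only values get padded to a full timestamp
        publishDate += ' 00:00:00'
    rows.append({'标题': src['title'], '发布时间': publishDate,
                 '来源': src['origin'], '正文': src['content'], '原文链接': url})

pd.DataFrame(rows).to_excel('./项目资讯-定制.xlsx', index=False)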
xmzx/testAll.py
@@ -46,10 +46,10 @@ class Spider():
        if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
            return
        if self.config.getboolean('detailSelect', 'originFlg'):
-            # origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
-            source = soup.find('body').find('script').text
-            source = re.findall('source = \"(.*?)\";', source)[0]
-            origin = source
+            origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
+            # source = soup.find('body').find('script').text
+            # source = re.findall('source = \"(.*?)\";', source)[0]
+            # origin = source
            try:
                try:
                    origin = origin.split('来源:')[1].strip()
@@ -88,8 +88,8 @@ class Spider():
    def doJob(self):
        for page in range(int(self.config.get('page', 'begin')), int(self.config.get('page', 'end'))):
-            if self.config.getboolean('home', 'urlFlg') and page == self.config.get('home', 'urlBeginNum'):
-                url = self.config.get('sit', 'urlBegin')
+            if self.config.getboolean('home', 'urlFlg') and page == int(self.config.get('home', 'urlBeginNum')):
+                url = self.config.get('home', 'urlBegin')
            else:
                url = self.config.get('home', 'url').format(page)
            soup = getSoup(url, self.getHeader())
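Review note: the fix here is the int() around urlBeginNum. configparser returns every option as a string, so the old `page == self.config.get('home', 'urlBeginNum')` compared an int against a str and was never true (and 'sit' was simply a typo for the 'home' section). For example:

page = 0
urlBeginNum = '0'                 # what configparser hands back
print(page == urlBeginNum)        # False -- int vs str never matches
print(page == int(urlBeginNum))   # True  -- the comparison the new code performs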
@@ -104,14 +104,14 @@ class Spider():
                try:
                    title = data_info.select(self.config.get('homeSelect', 'title'))[0].text.strip()
                except:
                    continue
-                href = data_info.get('onclick')
-                href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)', href)[0] + '/detail'
-                # href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
+                # href = data_info.get('onclick')
+                # href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
+                href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
                # href = data_info.get('href')
                if self.config.getboolean('homeSelect', 'publishDateFlg'):
-                    publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0] + ' 00:00:00'
+                    publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
                    # publishDate = href.split('net.cn')[1].split('art_')[0]
-                    # publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
+                    publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
                if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
                    continue
                else:
@@ -121,13 +121,14 @@ class Spider():
                log.info(f'开始采集==={title}==={publishDate}==={href}')
                if not self.config.getboolean('doJob', 'flg'):
                    break
-                try:
-                    pass
-                    # self.getDic(href, title, publishDate)
-                except Exception as e:
-                    log.error(f'{title}===采集失败==={e}')
+                # try:
+                self.getDic(href, title, publishDate)
+                # except Exception as e:
+                #     log.error(f'{title}===采集失败==={e}')
+                if not self.config.getboolean('doJob', 'insertFlg'):
+                    break
                time.sleep(0.5)
-            if not self.config.getboolean('doJob', 'flg'):
+            if not self.config.getboolean('doJob', 'flg') or not self.config.getboolean('doJob', 'insertFlg'):
                break
            time.sleep(0.5)
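Review note: together with the new insertFlg option in config.ini, this loop now implements the three-stage switch described in the config comments: with flg = False it stops after inspecting the first list-page entry, with flg = True and insertFlg = False it fetches one detail page and stops, and only with both True does it walk every page. A compact sketch of that gating logic, with flg/insertFlg as plain booleans instead of config lookups:

def crawl(entries, flg, insertFlg, get_detail):
    for entry in entries:
        if not flg:            # stage 1: only verify that the list page parses
            break
        get_detail(entry)      # stage 2: verify one detail page...
        if not insertFlg:
            break              # ...then stop unless full collection is enabled
    # stage 3: with flg and insertFlg both True, every entry is processed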