Commit e8838732 Author: LiuLiYuan

lly 2024-08-02

Parent 63f0268b
@@ -6,21 +6,26 @@ import re
 import time
 import urllib.parse
 from urllib.parse import urljoin
+import sys
 import pymongo
 import requests
 import subprocess
 from functools import partial
+import traceback
 from bs4 import BeautifulSoup
 from retry import retry
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+# sys.path.append('D:\\zzsn_spider\\base')
+from base import BaseCore
 subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')
 import execjs
-from base import BaseCore
-baseCore = BaseCore.BaseCore(sqlflg=False)
+baseCore = BaseCore.BaseCore(sqlFlg=False)
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 log = baseCore.getLogger()
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
     'tradingEconomics']
@@ -60,7 +65,7 @@ def paserUrl(html, listurl):
 @retry(tries=2, delay=5)
 def getSoup(url):
-    req = requests.get(url, headers=headers)
+    req = requests.get(url, headers=headers,timeout=20)
     req.encoding = req.apparent_encoding
     soup = BeautifulSoup(req.text, 'lxml')
     soup = paserUrl(soup, url)
@@ -74,8 +79,6 @@ def getCountries():
     soup = getSoup(url)
     div_list = soup.select('#ctl00_ContentPlaceHolder1_ctl01_tableCountries > div')
     for div_tag in div_list:
-        if 'G20' in div_tag.text:
-            continue
         li_list = div_tag.select('> ul > li')
         for li_tag in li_list:
             if 'active' in li_tag['class']:
@@ -91,18 +94,14 @@ def getIndex(country, url):
     soup = getSoup(url)
     li_list = soup.find('div', class_='pagetabs').find('ul', attrs={'id': 'pagemenutabs'}).find_all('li', class_='nav-item')
     div_list = soup.find('div', class_='tab-content').find_all('div', class_='tab-pane')
-    # for i in range(len(li_list)):
-    #     li_tag = li_list[i]
-    #     if 'Overview' in li_tag.find('a').text.strip():
-    #         del div_list[i]
-    #         break
+    for i in range(len(li_list)):
+        li_tag = li_list[i]
+        if 'Overview' in li_tag.find('a').text.strip():
+            del div_list[i]
+            break
     for i in range(len(div_list)):
         div_tag = div_list[i]
-        try:
-            tr_list = div_tag.find('table').find('tbody').find_all('tr')
-        except:
-            print(url, i, sep='===')
-            continue
+        tr_list = div_tag.find('table').find('tbody').find_all('tr')
         for tr_tag in tr_list:
             option = tr_tag.find('td').find('a').text.strip()
             href = tr_tag.find('td').find('a').get('href')
@@ -124,24 +123,32 @@ def getTimeKey(timeType, calendarStr):
         timeKey = calendarStr[:4] + 'Q4'
     elif timeType == 'yearly':
         timeKey = calendarStr[:4]
+    elif timeType == 'weekly':
+        date_obj = datetime.datetime.strptime(calendarStr,'%Y-%m-%d')
+        year, week, day = date_obj.isocalendar()
+        if len(str(week)) == 1:
+            week = f'0{week}'
+        timeKey = f'{year}{week}'
     else:
-        timeKey = False
+        timeKey = calendarStr.replace('-','')
     return timeKey
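
The new 'weekly' branch of getTimeKey builds a year-plus-ISO-week key with manual zero padding. A minimal standalone sketch of the same idea (the function name is illustrative, not part of the commit):

    import datetime

    def weekly_time_key(calendar_str):
        # '2024-01-02' falls in ISO week 1 of 2024, so the key becomes '202401'
        date_obj = datetime.datetime.strptime(calendar_str, '%Y-%m-%d')
        year, week, _day = date_obj.isocalendar()
        return f'{year}{week:02d}'  # {week:02d} zero-pads single-digit weeks, same effect as the f'0{week}' branch above
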
+@retry(tries=3, delay=5)
+def decrypt(value,js_runtime):
+    dataJson = js_runtime.call('doJob', value, key)
+    return dataJson
 @retry(tries=3, delay=10)
-def getJson(url):
+def getJson(url,js_runtime):
     req = requests.get(url, headers=headers, timeout=20, verify=False)
     value = req.text.replace('"', '')
     req.close()
-    with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
-        js = f.read()
-    js_runtime = execjs.compile(js)
     try:
-        dataJson = js_runtime.call('doJob', value, key)
-    except:
-        print(value)
+        dataJson = decrypt(value,js_runtime)
+    except Exception as e:
+        log.error(e)
         raise
+    if dataJson:
         try:
             dataJson = json.loads(dataJson)[0]
         except:
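
The decrypt()/getJson(url, js_runtime) refactor compiles trandingEconomics.js once (in doJobA further down) and threads the compiled context through, instead of re-reading and recompiling the script on every response as the old getJson did. A rough sketch of that pattern, assuming the script exposes the doJob(value, key) function used above:

    import execjs

    with open('./trandingEconomics.js', 'r', encoding='utf-8') as f:
        js_runtime = execjs.compile(f.read())  # build the execjs context once

    def decode(value, key):
        # every call reuses the same compiled context rather than re-reading the file
        return js_runtime.call('doJob', value, key)
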
@@ -149,7 +156,8 @@ def getJson(url):
     return dataJson
-def getData(url):
+def getData(url,js_runtime):
+    type_list = ['1w','1m','6m','1y','5y','10y','25y','50y','max','all']
     createTime = datetime.datetime.now()
     createTimeStr = createTime.strftime('%Y-%m-%d')
     soup = getSoup(url)
@@ -161,7 +169,7 @@ def getData(url):
             break
     else:
         log.error(f'数据链接获取失败==={url}')
-        return
+        return False
     for script in scripts:
         if 'TEChartsToken' in script.text:
             TEChartsToken = re.findall('TEChartsToken = \'(.*?)\'', script.text)[0]
@@ -169,10 +177,18 @@ def getData(url):
             break
     else:
         log.error(f'数据链接获取失败==={url}')
-        return
+        return False
     TESymbol = TESymbol.lower()
-    href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span=max&v=20240102145900&key={TEChartsToken}'
-    dataJson = getJson(href)
+    for type in type_list:
+        href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span={type}&v=20240102145900&key={TEChartsToken}'
+        try:
+            dataJson = getJson(href,js_runtime)
+        except Exception as e:
+            log.error(f'{type}===数据请求失败==={e}')
+            return False
+        #series = dataJson['series'][-10:]
+        if not dataJson:
+            continue
         series = dataJson['series']
         for serie_ in series:
             serie = serie_['serie']
@@ -202,9 +218,10 @@ def getData(url):
                 }
                 if db_storage.find_one({'country': country, 'indicators': indicators, 'timeType': timeType, 'calendarStr': calendarStr}):
                     log.info(f'{country}==={indicators}==={calendarStr}===已采集')
-                    break
+                    continue
                 db_storage.insert_one(dic)
                 log.info(f'{country}==={indicators}==={calendarStr}===入库成功')
+    return True
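
Since the new span loop ('1w' through 'all') re-downloads overlapping points, the per-record duplicate check is what keeps the collection clean, and switching break to continue lets the rest of a series be scanned after an already-stored record is hit. An alternative worth noting, sketched under the assumption that a unique compound index is acceptable for this collection, is to let MongoDB enforce uniqueness and upsert instead of calling find_one before every insert_one:

    # one-time setup: unique compound index on the dedup key
    db_storage.create_index(
        [('country', 1), ('indicators', 1), ('timeType', 1), ('calendarStr', 1)],
        unique=True,
    )

    # per record: upsert; documents that already exist are left untouched
    db_storage.update_one(
        {'country': country, 'indicators': indicators, 'timeType': timeType, 'calendarStr': calendarStr},
        {'$setOnInsert': dic},
        upsert=True,
    )
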
def doJob():
@@ -215,10 +232,22 @@ def doJob():
         info = f'{index[0]}|{index[1]}|{index[2]}'
         baseCore.r.rpush('trandingEconomics:info', info)
     log.info('数据已全部放入redis中')
+    # log.info(f'开始采集==={index[0]}==={index[1]}')
+    # getData(index[2])
 def doJobA():
+    try:
+        with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
+            js = f.read()
+        execjs.get('Node')
+        js_runtime = execjs.compile(js)
+    except:
+        return
+    errorNum = 0
     while True:
+        if errorNum > 10:
+            break
         info = baseCore.r.blpop(['trandingEconomics:info'], 2)
         if not info:
             log.info('数据已全部采集完成')
@@ -228,25 +257,21 @@ def doJobA():
         country = info.split('|')[0]
         index = info.split('|')[1]
         url = info.split('|')[2]
-        if url.endswith('/rating'):
-            continue
         log.info(f'开始采集==={country}==={index}')
         try:
-            getData(url)
+            if getData(url,js_runtime):
+                pass
+            else:
+                errorNum += 1
+                baseCore.r.rpush('trandingEconomics:info', info)
         except Exception as e:
+            errorNum += 1
+            traceback.print_exc()
             log.error(f'{country}==={index}===采集失败==={e}')
             baseCore.r.rpush('trandingEconomics:info', info)
 if __name__ == "__main__":
-    # testA()
-    # doJob()
-    # Peru===Currency
-    # Bulgaria===Stock Market
-    # Bulgaria===Interest Rate
-    # infoLens = baseCore.r.llen('trandingEconomics:info')
-    # print(infoLens)
-    jiami = 'a/lpZGluZ2VjbOvgLCKnQz3mlyvrByT3TUWWTZ/Bt9RVQx5xnQYsCU4fVSSo3ZGypEPLdDwREI65v+hkHO32iRzgmdYJc3AZFO6drPcW7yzvT7ovG7g4qxA1n3kxhiEQ808R90cOX+DZdz2H+xeTxuDmi/Un7sLeUZCPe3TS0sayhyPwOhUjXx/fFk2agaDz4pU0xWL34265lqd4zZSkAwwcpX/eLI5BvDEHKP61naRAHNgUIaX1g9DoyYzV9Mi6bu7gvSDvpkcvwyQ6WiOaoSpjI4vK2Kdt2SgJu92zedyrjpmpSjFfjnEf2Y6tIjgTY480acLmcWkt'
-    with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
-        js = f.read()
-    js_runtime = execjs.compile(js)
-    dataJson = js_runtime.call('doJob', jiami, key)
-    print(dataJson)
+    doJobA()
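
Taken together, doJob() and doJobA() now form a small Redis work queue: doJob() seeds trandingEconomics:info, doJobA() pops items with blpop, and failures are pushed back with rpush until the errorNum budget (10) is exhausted. Stripped to its skeleton, with helper names that are illustrative only:

    def run_worker(r, handle, queue='trandingEconomics:info', max_errors=10):
        errors = 0
        while errors <= max_errors:
            item = r.blpop([queue], 2)       # wait up to 2 seconds, then treat the queue as drained
            if not item:
                break
            payload = item[1].decode() if isinstance(item[1], bytes) else item[1]
            try:
                ok = handle(payload)         # e.g. split the payload and call getData(url, js_runtime)
            except Exception:
                ok = False
            if not ok:
                errors += 1
                r.rpush(queue, payload)      # requeue the failed item for another attempt
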
@@ -57,7 +57,7 @@ def doJob():
         origin = dataJson['source']
         if db_stroage.find_one({'原文链接': href}):
             continue
-        if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
+        if publishDate > '2022-12-31 23:59:59' or publishDate < '2022-01-01 00:00:00':
             continue
         log.info(f'开始采集==={title}')
         getDic(href, title, publishDate, origin, type[0])
......
 [doJob]
 ;是否开始采集
-;验证列表页
 flg = False
-;验证详情页
 insertFlg = False
+;flg为False时,验证列表页
+;flg为True,insertFlg为False时,验证详情页
+;都为True时,采集数据
+;页码
 [page]
 ;网页爬取页数的起始与末尾
 ;需要从链接中判断
-begin = 1
+begin = 0
 end = 3
+;链接
 [home]
+;网站名称
 origin = 贸易投资网-贸易商机
 ;首页链接是否与后续链接有不同:是为True;不是为False
-urlFlg = False
+urlFlg = True
 ;如果首页链接与后续链接不同,需要填写该值
 ;样例 http://www.cgcjm.cecep.cn/g9222.aspx
-urlBegin = http://www.aieco.org/article/ywfc
+urlBegin = http://www.camce.com.cn/xbcn/xwzx/xmdt/index.html
 ;如果首页链接与后续链接不同,需要填写该值
-urlBeginNum = 1
+urlBeginNum = 0
 ;爬取网站链接的通用格式,页码处使用{}代替
 ;样例 http://www.cgcjm.cecep.cn/g9222/m17246/mp{}.aspx
-url = https://www.tradeinvest.cn/trade/list?page={}&title=&industryType=&transactionType=&targetMarket=
+url = http://www.camce.com.cn/xbcn/xwzx/xmdt/index_{}.html
+;列表页
 [homeSelect]
 ;资讯列表的select
-data_info_list = body > div.container > div > div > div
+data_info_list = body > div:nth-of-type(3) > div > div.sidebarR > ul > li
 ;标题所在 select
-title = h5
+title = a
 ;发布时间不在列表页显示 该值需要置为False
 publishDateFlg = True
-publishDate = span.format-datetime
+publishDate = span
 ;获取到时间的格式
 publishDateType = %%Y-%%m-%%d
 ;链接所在 select
 href = a
+;详情页
 [detailSelect]
 ;正文
-contentWithTag = #articleBody
+contentWithTag = #xwxq2 > div
 ;是否有来源
 originFlg = True
 ;来源
-origin = #leftList > div.content_article_source > table > tr > td.first
+origin = body > div:nth-of-type(3) > div > div.sidebarR > div.xwxq > div > form > table > tbody > tr > td:nth-of-type(2)
 ;发布时间 如果home中的publishDateFlg为False才需要配置
 publishDate = div.second-news-item-date
 ;获取到时间的格式
@@ -49,15 +61,14 @@ publishDateType = %%Y-%%m-%%d %%H:%%M:%%S
 [headers]
 ;请求头信息
-Accept = text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
-Accept-Language = zh-CN,zh-TW;q=0.9,zh;q=0.8
-Connectio = keep-alive
-Sec-Fetch-Dest = document
-Sec-Fetch-Mode = navigate
-Sec-Fetch-Site = same-origin
-Sec-Fetch-User = ?1
-Upgrade-Insecure-Requests = 1
-User-Agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0
-sec-ch-ua = "Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"
-sec-ch-ua-mobile = ?0
-sec-ch-ua-platform = "Windows"
\ No newline at end of file
+Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
+Accept-Encoding: gzip, deflate
+Accept-Language: zh-CN,zh-TW;q=0.9,zh;q=0.8
+Cache-Control: max-age=0
+Connection: keep-alive
+Cookie: Hm_lvt_e54259884352514b610814aa18f84433=1722569732; HMACCOUNT=9222512DCF10CB7B; Hm_lpvt_e54259884352514b610814aa18f84433=1722569808
+Host: www.camce.com.cn
+If-Modified-Since: Mon, 15 Jul 2024 00:40:15 GMT
+If-None-Match: "7b53-61d3e78f899c0-gzip"
+Upgrade-Insecure-Requests: 1
+User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0
\ No newline at end of file
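
Two details of this config format are easy to miss: configparser's get() always returns strings (hence the int() fix to urlBeginNum in the Spider class below), and the doubled %% in publishDateType is the escape for a literal % under basic interpolation. A hedged sketch of loading a file like the one above (the file name is illustrative):

    import configparser

    config = configparser.ConfigParser()
    config.read('spider.ini', encoding='utf-8')

    flg = config.getboolean('doJob', 'flg')               # False: only the list page is checked
    insertFlg = config.getboolean('doJob', 'insertFlg')   # flg True + insertFlg False: detail page checked; both True: full collection

    begin = int(config.get('page', 'begin'))              # values come back as str, so coerce explicitly
    end = int(config.get('page', 'end'))

    date_fmt = config.get('homeSelect', 'publishDateType')  # '%%Y-%%m-%%d' in the file reads back as '%Y-%m-%d'

    # [headers] maps directly onto a requests headers dict; configparser lower-cases option
    # names by default, which is harmless because HTTP header names are case-insensitive
    headers = dict(config.items('headers'))
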
@@ -54,13 +54,13 @@ def getDic(url, title, publishDate):
 def doJob():
-    # for page in range(1, 13):
-    for page in range(0,5):
-        if page == 0:
-            url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
-        else:
-            url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
-        # url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
+    for page in range(1, 13):
+        # for page in range(0,5):
+        # if page == 0:
+        #     url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
+        # else:
+        #     url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
+        url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
         soup = getSoup(url)
         liList = soup.find('ul', class_='right-column-list').find_all('li')
         for liTag in liList:
......
@@ -50,7 +50,8 @@ def getDic(url, title, publishDate):
 def doJob():
     for page in range(1, 4):
-        url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
+        # url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
+        url = ''
         req = requests.get(url, headers=headers)
         req.encoding = req.apparent_encoding
         soup = BeautifulSoup(req.text, 'html.parser')
......
@@ -8,12 +8,13 @@ from bs4 import BeautifulSoup
 from base import BaseCore
 from elasticsearch import Elasticsearch, helpers
-# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
-# db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
+baseCore = BaseCore.BaseCore()
+# # db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
+db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
 db_stroageA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制']
 db_stroageB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制_']
 es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)
-baseCore = BaseCore.BaseCore()
 from powerchina import db_stroage
@@ -63,8 +64,8 @@ def select(sid):
                     {
                         "range": {
                             "publishDate": {
-                                "gte": "2023-01-01T00:00:00",
-                                "lte": "2023-12-31T23:59:59"
+                                "gte": "2022-01-01T00:00:00",
+                                "lte": "2022-12-31T23:59:59"
                             }
                         }
                     }
@@ -191,29 +192,109 @@ def select(sid):
 # df = pd.DataFrame(data_list)
 # df.to_excel('./项目资讯-定制.xlsx', index=False)
-sids = ['1811965474060091394',
-        '1811960555242528770',
-        '1811958016631644161',
-        '1811955892663336962',
-        '1811950817584857089',
-        '1811939863161716737',
-        '1811937580770402305',
-        '1811933720142135297',
-        '1811566665440186370',
-        '1810983037486170113',
-        '1810980529153966081',
-        '1810978470438567938',
-        '1810976012817707009',
-        '1810972790830858242',
-        '1810968708888068097',
-        '1810960658496102401',
-        '1810954505034969089',
-        '1810947397855879170']
-for sid in sids:
-    num = 0
-    datas = select(sid)
-    for data in datas:
-        num += 1
-        print(f'{sid}==={num}')
+# sids = ['1811965474060091394',
+#         '1811960555242528770',
+#         '1811958016631644161',
+#         '1811955892663336962',
+#         '1811950817584857089',
+#         '1811939863161716737',
+#         '1811937580770402305',
+#         '1811933720142135297',
+#         '1811566665440186370',
+#         '1810983037486170113',
+#         '1810980529153966081',
+#         '1810978470438567938',
+#         '1810976012817707009',
+#         '1810972790830858242',
+#         '1810968708888068097',
+#         '1810960658496102401',
+#         '1810954505034969089',
+#         '1810947397855879170']
+# for sid in sids:
+#     num = 0
+#     datas = select(sid)
+#     for data in datas:
+#         num += 1
+#         print(f'{sid}==={num}')
+# es获取数据
+def esData():
+    sql = 'select source_id from info_source_group_map where group_id="1697061836360126466"'
+    baseCore.cursor_.execute(sql)
+    datas = baseCore.cursor_.fetchall()
+    dics = []
+    urlList = []
+    for data in datas:
+        sid = data[0]
+        sqlSelect = f'select web_site_name from info_source where id="{sid}"'
+        baseCore.cursor_.execute(sqlSelect)
+        web = baseCore.cursor_.fetchone()[0]
+        results = select(sid)
+        num = 0
+        for result in results:
+            try:
+                title = result['_source']['title']
+                publishDate = result['_source']['publishDate']
+                if len(publishDate) == 10:
+                    publishDate = publishDate + ' 00:00:00'
+                else:
+                    publishDate = publishDate.replace('T', ' ')
+                origin = result['_source']['origin']
+                content = result['_source']['content']
+                contentWithTag = result['_source']['contentWithTag']
+                url = result['_source']['sourceAddress']
+            except:
+                continue
+            if url in urlList:
+                continue
+            dic = {
+                '标题': title,
+                '发布时间': publishDate,
+                '来源': origin,
+                '网站': web,
+                '正文': content,
+                '正文带标签': str(contentWithTag),
+                '原文链接': url
+            }
+            dics.append(dic)
+            urlList.append(url)
+    df = pd.DataFrame(dics)
+    df.to_excel('./项目资讯-定制.xlsx', index=False)
+# 神采数据导出
+def shencai():
+    datas_ = []
+    datas = db_stroageShencaiB.find({'日期': {"$gte": '2022-01-01 00:00:00', "$lt": '2023-01-01 00:00:00'}})
+    for data in datas:
+        title = data['标题']
+        href = data['URL']
+        origin = data['来源']
+        web = data['栏目']
+        content = data['正文不带标签']
+        contentWithTag = data['内容']
+        publishDate = data['日期']
+        # print(publishDate)
+        datas_.append([title, publishDate, origin, web, content, contentWithTag, href])
+    df = pd.DataFrame(datas_, columns=['标题', '发布时间', '来源', '网站', '正文', '正文带标签', '原文链接'])
+    df.to_excel('./项目资讯-定制.xlsx', index=False)
+#mongodb 定制
+def dingzhi():
+    data_list = []
+    datas = db_stroageA.find()
+    for data in datas:
+        del data['_id']
+        db_stroageB.insert_one(data)
+        data_list.append(data)
+    df = pd.DataFrame(data_list)
+    df.to_excel('./项目资讯-定制.xlsx', index=False)
+if __name__ == '__main__':
+    # esData()
+    # shencai()
+    # dingzhi()
+    pass
 baseCore.close()
@@ -46,10 +46,10 @@ class Spider():
         if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
             return
         if self.config.getboolean('detailSelect', 'originFlg'):
-            # origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
-            source = soup.find('body').find('script').text
-            source = re.findall('source = \"(.*?)\";', source)[0]
-            origin = source
+            origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
+            # source = soup.find('body').find('script').text
+            # source = re.findall('source = \"(.*?)\";', source)[0]
+            # origin = source
             try:
                 try:
                     origin = origin.split('来源:')[1].strip()
@@ -88,8 +88,8 @@ class Spider():
     def doJob(self):
         for page in range(int(self.config.get('page', 'begin')), int(self.config.get('page', 'end'))):
-            if self.config.getboolean('home', 'urlFlg') and page == self.config.get('home', 'urlBeginNum'):
-                url = self.config.get('sit', 'urlBegin')
+            if self.config.getboolean('home', 'urlFlg') and page == int(self.config.get('home', 'urlBeginNum')):
+                url = self.config.get('home', 'urlBegin')
             else:
                 url = self.config.get('home', 'url').format(page)
             soup = getSoup(url, self.getHeader())
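
The two edits above fix real defects rather than style: config.get() returns a string, so the old comparison page == self.config.get('home', 'urlBeginNum') compared an int against a string and could never be true, and the urlBegin lookup pointed at a non-existent 'sit' section. A two-line illustration of the type mismatch:

    page = 0
    assert page != '0'          # int vs str never compare equal
    assert page == int('0')     # coercing the config value, as the fix does, makes the branch reachable
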
@@ -104,14 +104,14 @@ class Spider():
                 title = data_info.select(self.config.get('homeSelect', 'title'))[0].text.strip()
             except:
                 continue
-            href = data_info.get('onclick')
-            href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
-            # href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
+            # href = data_info.get('onclick')
+            # href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
+            href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
             # href = data_info.get('href')
             if self.config.getboolean('homeSelect', 'publishDateFlg'):
-                publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0] + ' 00:00:00'
+                publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
                 # publishDate = href.split('net.cn')[1].split('art_')[0]
-                # publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
+                publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
                 if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
                     continue
             else:
@@ -121,13 +121,14 @@ class Spider():
                 log.info(f'开始采集==={title}==={publishDate}==={href}')
                 if not self.config.getboolean('doJob', 'flg'):
                     break
-                try:
-                    pass
-                    # self.getDic(href, title, publishDate)
-                except Exception as e:
-                    log.error(f'{title}===采集失败==={e}')
+                # try:
+                self.getDic(href, title, publishDate)
+                # except Exception as e:
+                #     log.error(f'{title}===采集失败==={e}')
+                if not self.config.getboolean('doJob', 'insertFlg'):
+                    break
                 time.sleep(0.5)
-            if not self.config.getboolean('doJob', 'flg'):
+            if not self.config.getboolean('doJob', 'flg') or not self.config.getboolean('doJob', 'insertFlg'):
                 break
             time.sleep(0.5)
......