Commit e8838732  Author: LiuLiYuan

lly 2024-08-02

Parent 63f0268b
......@@ -6,21 +6,26 @@ import re
import time
import urllib.parse
from urllib.parse import urljoin
import sys
import pymongo
import requests
import subprocess
from functools import partial
import traceback
from bs4 import BeautifulSoup
from retry import retry
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# sys.path.append('D:\\zzsn_spider\\base')
from base import BaseCore
subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')
import execjs
from base import BaseCore
baseCore = BaseCore.BaseCore(sqlflg=False)
baseCore = BaseCore.BaseCore(sqlFlg=False)
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'tradingEconomics']
......@@ -60,7 +65,7 @@ def paserUrl(html, listurl):
@retry(tries=2, delay=5)
def getSoup(url):
req = requests.get(url, headers=headers)
req = requests.get(url, headers=headers,timeout=20)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
soup = paserUrl(soup, url)
......@@ -74,8 +79,6 @@ def getCountries():
soup = getSoup(url)
div_list = soup.select('#ctl00_ContentPlaceHolder1_ctl01_tableCountries > div')
for div_tag in div_list:
if 'G20' in div_tag.text:
continue
li_list = div_tag.select('> ul > li')
for li_tag in li_list:
if 'active' in li_tag['class']:
......@@ -91,18 +94,14 @@ def getIndex(country, url):
soup = getSoup(url)
li_list = soup.find('div', class_='pagetabs').find('ul', attrs={'id': 'pagemenutabs'}).find_all('li', class_='nav-item')
div_list = soup.find('div', class_='tab-content').find_all('div', class_='tab-pane')
# for i in range(len(li_list)):
# li_tag = li_list[i]
# if 'Overview' in li_tag.find('a').text.strip():
# del div_list[i]
# break
for i in range(len(li_list)):
li_tag = li_list[i]
if 'Overview' in li_tag.find('a').text.strip():
del div_list[i]
break
for i in range(len(div_list)):
div_tag = div_list[i]
try:
tr_list = div_tag.find('table').find('tbody').find_all('tr')
except:
print(url, i, sep='===')
continue
for tr_tag in tr_list:
option = tr_tag.find('td').find('a').text.strip()
href = tr_tag.find('td').find('a').get('href')
......@@ -124,24 +123,32 @@ def getTimeKey(timeType, calendarStr):
timeKey = calendarStr[:4] + 'Q4'
elif timeType == 'yearly':
timeKey = calendarStr[:4]
elif timeType == 'weekly':
date_obj = datetime.datetime.strptime(calendarStr,'%Y-%m-%d')
year, week, day = date_obj.isocalendar()
if len(str(week)) == 1:
week = f'0{week}'
timeKey = f'{year}{week}'
else:
timeKey = False
timeKey = calendarStr.replace('-','')
return timeKey
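The new 'weekly' branch derives an ISO year-week key and zero-pads single-digit week numbers. A minimal sketch of the same derivation, assuming the calendar string uses the '%Y-%m-%d' format parsed above:

import datetime

def iso_week_key(calendar_str):
    # isocalendar() returns (ISO year, ISO week, ISO weekday)
    year, week, _ = datetime.datetime.strptime(calendar_str, '%Y-%m-%d').isocalendar()
    # zero-pad the week so keys stay fixed-width, e.g. week 1 of 2024 -> '202401'
    return f'{year}{week:02d}'

print(iso_week_key('2024-01-03'))  # -> 202401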
@retry(tries=3, delay=5)
def decrypt(value,js_runtime):
dataJson = js_runtime.call('doJob', value, key)
return dataJson
@retry(tries=3, delay=10)
def getJson(url):
req = requests.get(url, headers=headers, timeout=20, verify=False)
def getJson(url,js_runtime):
req = requests.get(url, headers=headers, timeout=20,verify=False)
value = req.text.replace('"', '')
req.close()
with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
js = f.read()
js_runtime = execjs.compile(js)
try:
dataJson = js_runtime.call('doJob', value, key)
except:
print(value)
dataJson = decrypt(value,js_runtime)
except Exception as e:
log.error(e)
raise
if dataJson:
try:
dataJson = json.loads(dataJson)[0]
except:
......@@ -149,7 +156,8 @@ def getJson(url):
return dataJson
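With this refactor the execjs runtime is no longer rebuilt on every request: the caller compiles trandingEconomics.js once and passes the runtime into getJson, while decrypt() retries only the JS call. A minimal usage sketch, assuming the script defines doJob(value, key) and a Node runtime is available to execjs; the URL below is illustrative, not taken from a real run:

import execjs

with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
    js_runtime = execjs.compile(f.read())        # compile once, reuse for every request

href = 'https://d3ii0wo49og5mi.cloudfront.net/economics/gdp?&span=1y&v=20240102145900&key=<TEChartsToken>'  # illustrative
dataJson = getJson(href, js_runtime)             # retried fetch + decryption, parsed to a dict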
def getData(url):
def getData(url,js_runtime):
type_list = ['1w','1m','6m','1y','5y','10y','25y','50y','max','all']
createTime = datetime.datetime.now()
createTimeStr = createTime.strftime('%Y-%m-%d')
soup = getSoup(url)
......@@ -161,7 +169,7 @@ def getData(url):
break
else:
log.error(f'数据链接获取失败==={url}')
return
return False
for script in scripts:
if 'TEChartsToken' in script.text:
TEChartsToken = re.findall('TEChartsToken = \'(.*?)\'', script.text)[0]
......@@ -169,10 +177,18 @@ def getData(url):
break
else:
log.error(f'数据链接获取失败==={url}')
return
return False
TESymbol = TESymbol.lower()
href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span=max&v=20240102145900&key={TEChartsToken}'
dataJson = getJson(href)
for type in type_list:
href = f'https://d3ii0wo49og5mi.cloudfront.net/economics/{urllib.parse.quote(TESymbol)}?&span={type}&v=20240102145900&key={TEChartsToken}'
try:
dataJson = getJson(href,js_runtime)
except Exception as e:
log.error(f'{type}===数据请求失败==={e}')
return False
#series = dataJson['series'][-10:]
if not dataJson:
continue
series = dataJson['series']
for serie_ in series:
serie = serie_['serie']
......@@ -202,9 +218,10 @@ def getData(url):
}
if db_storage.find_one({'country': country, 'indicators': indicators, 'timeType': timeType, 'calendarStr': calendarStr}):
log.info(f'{country}==={indicators}==={calendarStr}===已采集')
break
continue
db_storage.insert_one(dic)
log.info(f'{country}==={indicators}==={calendarStr}===入库成功')
return True
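Two behavioural changes stand out in getData: every span in type_list is fetched and empty responses are skipped, and a record that already exists now triggers continue instead of break, so later points in the same serie are still examined. The duplicate check treats four fields as the logical key; a hypothetical helper expressing that lookup:

def already_collected(db_storage, country, indicators, timeType, calendarStr):
    # country + indicator + time granularity + calendar key identify one data point
    return db_storage.find_one({'country': country, 'indicators': indicators,
                                'timeType': timeType, 'calendarStr': calendarStr}) is not None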
def doJob():
......@@ -215,10 +232,22 @@ def doJob():
info = f'{index[0]}|{index[1]}|{index[2]}'
baseCore.r.rpush('trandingEconomics:info', info)
log.info('数据已全部放入redis中')
# log.info(f'开始采集==={index[0]}==={index[1]}')
# getData(index[2])
def doJobA():
try:
with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
js = f.read()
execjs.get('Node')
js_runtime = execjs.compile(js)
except:
return
errorNum = 0
while True:
if errorNum > 10:
break
info = baseCore.r.blpop(['trandingEconomics:info'], 2)
if not info:
log.info('数据已全部采集完成')
......@@ -228,25 +257,21 @@ def doJobA():
country = info.split('|')[0]
index = info.split('|')[1]
url = info.split('|')[2]
if url.endswith('/rating'):
continue
log.info(f'开始采集==={country}==={index}')
try:
getData(url)
if getData(url,js_runtime):
pass
else:
errorNum += 1
baseCore.r.rpush('trandingEconomics:info', info)
except Exception as e:
errorNum += 1
traceback.print_exc()
log.error(f'{country}==={index}===采集失败==={e}')
baseCore.r.rpush('trandingEconomics:info', info)
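doJobA now compiles the JS runtime up front (returning early if Node or the script is unavailable), counts failures, stops once more than 10 have accumulated, and pushes each failed task back onto the Redis list for a later attempt. The queue pattern in isolation, a sketch assuming a plain redis client in place of baseCore.r and a placeholder handle():

import redis

r = redis.Redis()                                   # stands in for the client baseCore.r wraps

def handle(task):
    # placeholder: doJobA splits 'country|index|url' here and calls getData(url, js_runtime)
    return True

errors = 0
while errors <= 10:
    item = r.blpop(['trandingEconomics:info'], 2)   # (key, value) tuple, or None after 2 seconds
    if not item:
        break                                       # queue drained
    task = item[1].decode()
    try:
        ok = handle(task)
    except Exception:
        ok = False
    if not ok:
        errors += 1                                 # stop the worker after more than 10 failures
        r.rpush('trandingEconomics:info', task)     # requeue the task for another attempt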
if __name__ == "__main__":
# testA()
# doJob()
# Peru===Currency
# Bulgaria===Stock Market
# Bulgaria===Interest Rate
# infoLens = baseCore.r.llen('trandingEconomics:info')
# print(infoLens)
jiami = 'a/lpZGluZ2VjbOvgLCKnQz3mlyvrByT3TUWWTZ/Bt9RVQx5xnQYsCU4fVSSo3ZGypEPLdDwREI65v+hkHO32iRzgmdYJc3AZFO6drPcW7yzvT7ovG7g4qxA1n3kxhiEQ808R90cOX+DZdz2H+xeTxuDmi/Un7sLeUZCPe3TS0sayhyPwOhUjXx/fFk2agaDz4pU0xWL34265lqd4zZSkAwwcpX/eLI5BvDEHKP61naRAHNgUIaX1g9DoyYzV9Mi6bu7gvSDvpkcvwyQ6WiOaoSpjI4vK2Kdt2SgJu92zedyrjpmpSjFfjnEf2Y6tIjgTY480acLmcWkt'
with open(r'./trandingEconomics.js', 'r', encoding='utf-8') as f:
js = f.read()
js_runtime = execjs.compile(js)
dataJson = js_runtime.call('doJob', jiami, key)
print(dataJson)
doJobA()
......@@ -57,7 +57,7 @@ def doJob():
origin = dataJson['source']
if db_stroage.find_one({'原文链接': href}):
continue
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
if publishDate > '2022-12-31 23:59:59' or publishDate < '2022-01-01 00:00:00':
continue
log.info(f'开始采集==={title}')
getDic(href, title, publishDate, origin, type[0])
......
[doJob]
;是否开始采集
;验证列表页
flg = False
;验证详情页
insertFlg = False
;flg为False时,验证列表页
;flg为True,insertFlg为False时,验证详情页
;都为True时,采集数据
;页码
[page]
;网页爬取页数的起始与末尾
;需要从链接中判断
begin = 1
begin = 0
end = 3
;链接
[home]
;网站名称
origin = 贸易投资网-贸易商机
;首页链接是否与后续链接有不同:是为True;不是为False
urlFlg = False
urlFlg = True
;如果首页链接与后续链接不同,需要填写该值
;样例 http://www.cgcjm.cecep.cn/g9222.aspx
urlBegin = http://www.aieco.org/article/ywfc
urlBegin = http://www.camce.com.cn/xbcn/xwzx/xmdt/index.html
;如果首页链接与后续链接不同,需要填写该值
urlBeginNum = 1
urlBeginNum = 0
;爬取网站链接的通用格式,页码处使用{}代替
;样例 http://www.cgcjm.cecep.cn/g9222/m17246/mp{}.aspx
url = https://www.tradeinvest.cn/trade/list?page={}&title=&industryType=&transactionType=&targetMarket=
url = http://www.camce.com.cn/xbcn/xwzx/xmdt/index_{}.html
;列表页
[homeSelect]
;资讯列表的select
data_info_list = body > div.container > div > div > div
data_info_list = body > div:nth-of-type(3) > div > div.sidebarR > ul > li
;标题所在 select
title = h5
title = a
;发布时间不在列表页显示 该值需要置为False
publishDateFlg = True
publishDate = span.format-datetime
publishDate = span
;获取到时间的格式
publishDateType = %%Y-%%m-%%d
;链接所在 select
href = a
;详情页
[detailSelect]
;正文
contentWithTag = #articleBody
contentWithTag = #xwxq2 > div
;是否有来源
originFlg = True
;来源
origin = #leftList > div.content_article_source > table > tr > td.first
origin = body > div:nth-of-type(3) > div > div.sidebarR > div.xwxq > div > form > table > tbody > tr > td:nth-of-type(2)
;发布时间 如果home中的publishDateFlg为False才需要配置
publishDate = div.second-news-item-date
;获取到时间的格式
......@@ -49,15 +61,14 @@ publishDateType = %%Y-%%m-%%d %%H:%%M:%%S
[headers]
;请求头信息
Accept = text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Language = zh-CN,zh-TW;q=0.9,zh;q=0.8
Connectio = keep-alive
Sec-Fetch-Dest = document
Sec-Fetch-Mode = navigate
Sec-Fetch-Site = same-origin
Sec-Fetch-User = ?1
Upgrade-Insecure-Requests = 1
User-Agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0
sec-ch-ua = "Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"
sec-ch-ua-mobile = ?0
sec-ch-ua-platform = "Windows"
\ No newline at end of file
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh-TW;q=0.9,zh;q=0.8
Cache-Control: max-age=0
Connection: keep-alive
Cookie: Hm_lvt_e54259884352514b610814aa18f84433=1722569732; HMACCOUNT=9222512DCF10CB7B; Hm_lpvt_e54259884352514b610814aa18f84433=1722569808
Host: www.camce.com.cn
If-Modified-Since: Mon, 15 Jul 2024 00:40:15 GMT
If-None-Match: "7b53-61d3e78f899c0-gzip"
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0
\ No newline at end of file
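These INI files drive a generic, config-driven spider: [doJob] gates how far a run goes (list-page check, detail-page check, or full collection), [page] bounds the pagination loop, [home]/[homeSelect]/[detailSelect] describe the target site and its CSS selectors, and [headers] is sent with every request. A sketch of how the pagination keys are consumed, assuming configparser and an illustrative file name; the real Spider class lives elsewhere in this repo:

import configparser

config = configparser.ConfigParser()
config.read('camce.ini', encoding='utf-8')           # file name is illustrative

begin = int(config.get('page', 'begin'))
end = int(config.get('page', 'end'))
for page in range(begin, end):
    # the first page of some sites uses a different URL than the numbered pages
    if config.getboolean('home', 'urlFlg') and page == int(config.get('home', 'urlBeginNum')):
        url = config.get('home', 'urlBegin')
    else:
        url = config.get('home', 'url').format(page)
    print(url)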
......@@ -54,13 +54,13 @@ def getDic(url, title, publishDate):
def doJob():
# for page in range(1, 13):
for page in range(0,5):
if page == 0:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
else:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
# url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
for page in range(1, 13):
# for page in range(0,5):
# if page == 0:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
# else:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
soup = getSoup(url)
liList = soup.find('ul', class_='right-column-list').find_all('li')
for liTag in liList:
......
......@@ -50,7 +50,8 @@ def getDic(url, title, publishDate):
def doJob():
for page in range(1, 4):
url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
# url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
url = ''
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
......
......@@ -8,12 +8,13 @@ from bs4 import BeautifulSoup
from base import BaseCore
from elasticsearch import Elasticsearch, helpers
# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
# db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
baseCore = BaseCore.BaseCore()
# # db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
db_stroageA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制']
db_stroageB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制_']
es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)
baseCore = BaseCore.BaseCore()
from powerchina import db_stroage
......@@ -63,8 +64,8 @@ def select(sid):
{
"range": {
"publishDate": {
"gte": "2023-01-01T00:00:00",
"lte": "2023-12-31T23:59:59"
"gte": "2022-01-01T00:00:00",
"lte": "2022-12-31T23:59:59"
}
}
}
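The range filter above narrows the export window from 2023 to 2022; publishDate is stored as an ISO timestamp, so the gte/lte bounds select exactly one calendar year. A hedged sketch of a comparable query run through the helpers module imported above (the bool/must wrapper, the sid field name and the index name are assumptions, not taken from this file):

sid = '1810947397855879170'                          # one of the subject ids listed further down
query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"sid.keyword": sid}},      # field name assumed
                {"range": {"publishDate": {"gte": "2022-01-01T00:00:00",
                                           "lte": "2022-12-31T23:59:59"}}}
            ]
        }
    }
}
for hit in helpers.scan(es_client, query=query, index="subject_dataset"):   # index name assumed
    print(hit['_source']['title'])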
......@@ -191,29 +192,109 @@ def select(sid):
# df = pd.DataFrame(data_list)
# df.to_excel('./项目资讯-定制.xlsx', index=False)
sids = ['1811965474060091394',
'1811960555242528770',
'1811958016631644161',
'1811955892663336962',
'1811950817584857089',
'1811939863161716737',
'1811937580770402305',
'1811933720142135297',
'1811566665440186370',
'1810983037486170113',
'1810980529153966081',
'1810978470438567938',
'1810976012817707009',
'1810972790830858242',
'1810968708888068097',
'1810960658496102401',
'1810954505034969089',
'1810947397855879170']
for sid in sids:
# sids = ['1811965474060091394',
# '1811960555242528770',
# '1811958016631644161',
# '1811955892663336962',
# '1811950817584857089',
# '1811939863161716737',
# '1811937580770402305',
# '1811933720142135297',
# '1811566665440186370',
# '1810983037486170113',
# '1810980529153966081',
# '1810978470438567938',
# '1810976012817707009',
# '1810972790830858242',
# '1810968708888068097',
# '1810960658496102401',
# '1810954505034969089',
# '1810947397855879170']
# for sid in sids:
# num = 0
# datas = select(sid)
# for data in datas:
# num += 1
# print(f'{sid}==={num}')
# es获取数据
def esData():
sql = 'select source_id from info_source_group_map where group_id="1697061836360126466"'
baseCore.cursor_.execute(sql)
datas = baseCore.cursor_.fetchall()
dics = []
urlList = []
for data in datas:
sid = data[0]
sqlSelect = f'select web_site_name from info_source where id="{sid}"'
baseCore.cursor_.execute(sqlSelect)
web = baseCore.cursor_.fetchone()[0]
results = select(sid)
num = 0
datas = select(sid)
for result in results:
try:
title = result['_source']['title']
publishDate = result['_source']['publishDate']
if len(publishDate) == 10:
publishDate = publishDate + ' 00:00:00'
else:
publishDate = publishDate.replace('T', ' ')
origin = result['_source']['origin']
content = result['_source']['content']
contentWithTag = result['_source']['contentWithTag']
url = result['_source']['sourceAddress']
except:
continue
if url in urlList:
continue
dic = {
'标题': title,
'发布时间': publishDate,
'来源': origin,
'网站': web,
'正文': content,
'正文带标签': str(contentWithTag),
'原文链接': url
}
dics.append(dic)
urlList.append(url)
df = pd.DataFrame(dics)
df.to_excel('./项目资讯-定制.xlsx', index=False)
# 神采数据导出
def shencai():
datas_ = []
datas = db_stroageShencaiB.find({'日期': {"$gte": '2022-01-01 00:00:00', "$lt": '2023-01-01 00:00:00'}})
for data in datas:
num += 1
print(f'{sid}==={num}')
title = data['标题']
href = data['URL']
origin = data['来源']
web = data['栏目']
content = data['正文不带标签']
contentWithTag = data['内容']
publishDate = data['日期']
# print(publishDate)
datas_.append([title, publishDate, origin, web, content, contentWithTag, href])
df = pd.DataFrame(datas_, columns=['标题', '发布时间', '来源', '网站', '正文', '正文带标签', '原文链接'])
df.to_excel('./项目资讯-定制.xlsx', index=False)
#mongodb 定制
def dingzhi():
data_list = []
datas = db_stroageA.find()
for data in datas:
del data['_id']
db_stroageB.insert_one(data)
data_list.append(data)
df = pd.DataFrame(data_list)
df.to_excel('./项目资讯-定制.xlsx', index=False)
if __name__ == '__main__':
# esData()
# shencai()
# dingzhi()
pass
baseCore.close()
......@@ -46,10 +46,10 @@ class Spider():
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
return
if self.config.getboolean('detailSelect', 'originFlg'):
# origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
source = soup.find('body').find('script').text
source = re.findall('source = \"(.*?)\";', source)[0]
origin = source
origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
# source = soup.find('body').find('script').text
# source = re.findall('source = \"(.*?)\";', source)[0]
# origin = source
try:
try:
origin = origin.split('来源:')[1].strip()
......@@ -88,8 +88,8 @@ class Spider():
def doJob(self):
for page in range(int(self.config.get('page', 'begin')), int(self.config.get('page', 'end'))):
if self.config.getboolean('home', 'urlFlg') and page == self.config.get('home', 'urlBeginNum'):
url = self.config.get('sit', 'urlBegin')
if self.config.getboolean('home', 'urlFlg') and page == int(self.config.get('home', 'urlBeginNum')):
url = self.config.get('home', 'urlBegin')
else:
url = self.config.get('home', 'url').format(page)
soup = getSoup(url, self.getHeader())
......@@ -104,14 +104,14 @@ class Spider():
title = data_info.select(self.config.get('homeSelect', 'title'))[0].text.strip()
except:
continue
href = data_info.get('onclick')
href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
# href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('onclick')
# href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('href')
if self.config.getboolean('homeSelect', 'publishDateFlg'):
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0] + ' 00:00:00'
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
# publishDate = href.split('net.cn')[1].split('art_')[0]
# publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
continue
else:
......@@ -121,13 +121,14 @@ class Spider():
log.info(f'开始采集==={title}==={publishDate}==={href}')
if not self.config.getboolean('doJob', 'flg'):
break
try:
pass
# self.getDic(href, title, publishDate)
except Exception as e:
log.error(f'{title}===采集失败==={e}')
# try:
self.getDic(href, title, publishDate)
# except Exception as e:
# log.error(f'{title}===采集失败==={e}')
if not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
if not self.config.getboolean('doJob', 'flg'):
if not self.config.getboolean('doJob', 'flg') or not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
......