Commit e8838732  Author: LiuLiYuan

lly 2024-08-02

Parent 63f0268b
......@@ -57,7 +57,7 @@ def doJob():
origin = dataJson['source']
if db_stroage.find_one({'原文链接': href}):
continue
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
if publishDate > '2022-12-31 23:59:59' or publishDate < '2022-01-01 00:00:00':
continue
log.info(f'开始采集==={title}')
getDic(href, title, publishDate, origin, type[0])
......
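The hunk above simply narrows the collection window from 2023 to 2022. Because the timestamps are zero-padded 'YYYY-MM-DD HH:MM:SS' strings, plain string comparison already orders them chronologically; a minimal sketch of the same check, with a hypothetical helper name:

def in_window(publishDate, start='2022-01-01 00:00:00', end='2022-12-31 23:59:59'):
    # zero-padded timestamp strings sort lexicographically in date order
    return start <= publishDate <= end

# inside doJob(): skip anything outside the configured year
# if not in_window(publishDate):
#     continue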
[doJob]
;whether to start collecting
;verify the list page
flg = False
;verify the detail page
insertFlg = False
;when flg is False, only the list page is verified
;when flg is True and insertFlg is False, the detail page is verified
;when both are True, data is collected and stored
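flg and insertFlg act as a staged debugging switch for the spider. A minimal sketch of how they might be read with configparser and used to gate a run, assuming the file is saved as spider.ini (the file name and variable names are illustrative):

import configparser

config = configparser.ConfigParser()
config.read('spider.ini', encoding='utf-8')

flg = config.getboolean('doJob', 'flg')              # False: only verify the list page
insertFlg = config.getboolean('doJob', 'insertFlg')  # False: only verify the detail page

# inside the page/item loops:
#   flg False                  -> parse one list page, then stop
#   flg True, insertFlg False  -> fetch one detail page, then stop
#   both True                  -> collect and store every item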
;page numbers
[page]
;first and last page numbers to crawl
;determine them from the site's pagination links
begin = 1
begin = 0
end = 3
;links
[home]
;website name
origin = 贸易投资网-贸易商机
;whether the first-page URL differs from the later ones: True if it does, False otherwise
urlFlg = False
urlFlg = True
;fill in this value only when the first-page URL differs from the later ones
;example http://www.cgcjm.cecep.cn/g9222.aspx
urlBegin = http://www.aieco.org/article/ywfc
urlBegin = http://www.camce.com.cn/xbcn/xwzx/xmdt/index.html
;fill in this value only when the first-page URL differs from the later ones
urlBeginNum = 1
urlBeginNum = 0
;generic URL pattern for the crawled pages, with {} in place of the page number
;example http://www.cgcjm.cecep.cn/g9222/m17246/mp{}.aspx
url = https://www.tradeinvest.cn/trade/list?page={}&title=&industryType=&transactionType=&targetMarket=
url = http://www.camce.com.cn/xbcn/xwzx/xmdt/index_{}.html
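urlFlg, urlBegin, urlBeginNum and url together describe how page URLs are assembled: the page whose number equals urlBeginNum uses the special first-page link, every other page is formatted from the {} template. A minimal sketch reusing the config object from the sketch above; note that urlBeginNum is read back as a string and must be cast before comparing, which is exactly the fix made in Spider.doJob further down:

def build_url(config, page):
    # the first page often has its own link, e.g. index.html vs index_{n}.html
    if config.getboolean('home', 'urlFlg') and page == int(config.get('home', 'urlBeginNum')):
        return config.get('home', 'urlBegin')
    return config.get('home', 'url').format(page)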
;list page
[homeSelect]
;CSS selector for the article list items
data_info_list = body > div.container > div > div > div
data_info_list = body > div:nth-of-type(3) > div > div.sidebarR > ul > li
;selector for the title
title = h5
title = a
;set this to False when the publish date is not shown on the list page
publishDateFlg = True
publishDate = span.format-datetime
publishDate = span
;format of the extracted date
publishDateType = %%Y-%%m-%%d
;selector for the link
href = a
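The [homeSelect] values are plain CSS selectors applied to each list item. A minimal sketch of how they are used with BeautifulSoup, mirroring the Spider.doJob code later in this commit (getSoup is the project's own helper; the other names are illustrative):

# soup = getSoup(build_url(config, page), headers)   # parsed list page
for data_info in soup.select(config.get('homeSelect', 'data_info_list')):
    title = data_info.select(config.get('homeSelect', 'title'))[0].text.strip()
    href = data_info.select(config.get('homeSelect', 'href'))[0].get('href')
    if config.getboolean('homeSelect', 'publishDateFlg'):
        publishDate = data_info.select(config.get('homeSelect', 'publishDate'))[0].text.strip()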
;detail page
[detailSelect]
;article body
contentWithTag = #articleBody
contentWithTag = #xwxq2 > div
;whether a source is listed
originFlg = True
;source
origin = #leftList > div.content_article_source > table > tr > td.first
origin = body > div:nth-of-type(3) > div > div.sidebarR > div.xwxq > div > form > table > tbody > tr > td:nth-of-type(2)
;publish date; only needed when publishDateFlg in the list-page section is False
publishDate = div.second-news-item-date
;format of the extracted date
......@@ -49,15 +61,14 @@ publishDateType = %%Y-%%m-%%d %%H:%%M:%%S
[headers]
;request header fields
Accept = text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Language = zh-CN,zh-TW;q=0.9,zh;q=0.8
Connectio = keep-alive
Sec-Fetch-Dest = document
Sec-Fetch-Mode = navigate
Sec-Fetch-Site = same-origin
Sec-Fetch-User = ?1
Upgrade-Insecure-Requests = 1
User-Agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0
sec-ch-ua = "Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"
sec-ch-ua-mobile = ?0
sec-ch-ua-platform = "Windows"
\ No newline at end of file
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh-TW;q=0.9,zh;q=0.8
Cache-Control: max-age=0
Connection: keep-alive
Cookie: Hm_lvt_e54259884352514b610814aa18f84433=1722569732; HMACCOUNT=9222512DCF10CB7B; Hm_lpvt_e54259884352514b610814aa18f84433=1722569808
Host: www.camce.com.cn
If-Modified-Since: Mon, 15 Jul 2024 00:40:15 GMT
If-None-Match: "7b53-61d3e78f899c0-gzip"
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0
\ No newline at end of file
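Two configparser details matter when the [headers] section is turned into a dict for requests: option names are lower-cased unless optionxform is overridden (harmless for HTTP, which ignores header-name case, but it changes what is sent on the wire), and under the default interpolation the %% escapes in publishDateType read back as single % characters. A minimal sketch, again assuming the config is saved as spider.ini:

import configparser

config = configparser.ConfigParser()   # default interpolation: '%%Y-%%m-%%d' reads back as '%Y-%m-%d'
config.optionxform = str               # keep 'User-Agent' instead of 'user-agent'
config.read('spider.ini', encoding='utf-8')

headers = dict(config.items('headers'))
# resp = requests.get(url, headers=headers, timeout=30)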
......@@ -54,13 +54,13 @@ def getDic(url, title, publishDate):
def doJob():
# for page in range(1, 13):
for page in range(0,5):
if page == 0:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
else:
url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
# url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
for page in range(1, 13):
# for page in range(0,5):
# if page == 0:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
# else:
# url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
soup = getSoup(url)
liList = soup.find('ul', class_='right-column-list').find_all('li')
for liTag in liList:
......
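The zwjgdt column uses index.html for its first page and index_{n}.html afterwards, which is why the old branch special-cased page 0; the new gsxw loop starts at page 1 and always appends ?ordernum=1. A minimal sketch of the pattern with a hypothetical helper (whether gsxw keeps the same first-page convention is not shown in the diff):

def crbc_page_url(column, page, suffix=''):
    # first page is index.html, later pages are index_{n}.html
    if page == 0:
        return f'https://www.crbc.com/site/crbc/{column}/index.html{suffix}'
    return f'https://www.crbc.com/site/crbc/{column}/index_{page}.html{suffix}'

# crbc_page_url('zwjgdt', 0)               -> .../zwjgdt/index.html
# crbc_page_url('gsxw', 3, '?ordernum=1')  -> .../gsxw/index_3.html?ordernum=1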
......@@ -50,7 +50,8 @@ def getDic(url, title, publishDate):
def doJob():
for page in range(1, 4):
url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
# url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
url = ''
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
......
......@@ -8,12 +8,13 @@ from bs4 import BeautifulSoup
from base import BaseCore
from elasticsearch import Elasticsearch, helpers
# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
# db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
baseCore = BaseCore.BaseCore()
# # db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
db_stroageA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制']
db_stroageB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制_']
es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)
baseCore = BaseCore.BaseCore()
from powerchina import db_stroage
......@@ -63,8 +64,8 @@ def select(sid):
{
"range": {
"publishDate": {
"gte": "2023-01-01T00:00:00",
"lte": "2023-12-31T23:59:59"
"gte": "2022-01-01T00:00:00",
"lte": "2022-12-31T23:59:59"
}
}
}
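The range clause above is the part of select(sid) that pins results to the 2022 publish-date window. A hedged sketch of how the full query body might look; the index name and the term filter on sid are assumptions, not taken from this diff:

query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"sid.keyword": sid}},   # assumed: restrict to one info source
                {"range": {"publishDate": {
                    "gte": "2022-01-01T00:00:00",
                    "lte": "2022-12-31T23:59:59"
                }}}
            ]
        }
    }
}
# results = es_client.search(index="subject_dataset", body=query, size=1000)  # index name is hypothetical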
......@@ -191,29 +192,109 @@ def select(sid):
# df = pd.DataFrame(data_list)
# df.to_excel('./项目资讯-定制.xlsx', index=False)
sids = ['1811965474060091394',
'1811960555242528770',
'1811958016631644161',
'1811955892663336962',
'1811950817584857089',
'1811939863161716737',
'1811937580770402305',
'1811933720142135297',
'1811566665440186370',
'1810983037486170113',
'1810980529153966081',
'1810978470438567938',
'1810976012817707009',
'1810972790830858242',
'1810968708888068097',
'1810960658496102401',
'1810954505034969089',
'1810947397855879170']
for sid in sids:
num = 0
datas = select(sid)
# sids = ['1811965474060091394',
# '1811960555242528770',
# '1811958016631644161',
# '1811955892663336962',
# '1811950817584857089',
# '1811939863161716737',
# '1811937580770402305',
# '1811933720142135297',
# '1811566665440186370',
# '1810983037486170113',
# '1810980529153966081',
# '1810978470438567938',
# '1810976012817707009',
# '1810972790830858242',
# '1810968708888068097',
# '1810960658496102401',
# '1810954505034969089',
# '1810947397855879170']
# for sid in sids:
# num = 0
# datas = select(sid)
# for data in datas:
# num += 1
# print(f'{sid}==={num}')
# fetch data from ES
def esData():
sql = 'select source_id from info_source_group_map where group_id="1697061836360126466"'
baseCore.cursor_.execute(sql)
datas = baseCore.cursor_.fetchall()
dics = []
urlList = []
for data in datas:
num += 1
print(f'{sid}==={num}')
sid = data[0]
sqlSelect = f'select web_site_name from info_source where id="{sid}"'
baseCore.cursor_.execute(sqlSelect)
web = baseCore.cursor_.fetchone()[0]
results = select(sid)
num = 0
for result in results:
try:
title = result['_source']['title']
publishDate = result['_source']['publishDate']
if len(publishDate) == 10:
publishDate = publishDate + ' 00:00:00'
else:
publishDate = publishDate.replace('T', ' ')
origin = result['_source']['origin']
content = result['_source']['content']
contentWithTag = result['_source']['contentWithTag']
url = result['_source']['sourceAddress']
except:
continue
if url in urlList:
continue
dic = {
'标题': title,
'发布时间': publishDate,
'来源': origin,
'网站': web,
'正文': content,
'正文带标签': str(contentWithTag),
'原文链接': url
}
dics.append(dic)
urlList.append(url)
df = pd.DataFrame(dics)
df.to_excel('./项目资讯-定制.xlsx', index=False)
# export data from the Shencai collection
def shencai():
datas_ = []
datas = db_stroageShencaiB.find({'日期': {"$gte": '2022-01-01 00:00:00', "$lt": '2023-01-01 00:00:00'}})
for data in datas:
title = data['标题']
href = data['URL']
origin = data['来源']
web = data['栏目']
content = data['正文不带标签']
contentWithTag = data['内容']
publishDate = data['日期']
# print(publishDate)
datas_.append([title, publishDate, origin, web, content, contentWithTag, href])
df = pd.DataFrame(datas_, columns=['标题', '发布时间', '来源', '网站', '正文', '正文带标签', '原文链接'])
df.to_excel('./项目资讯-定制.xlsx', index=False)
# copy the customized MongoDB collection and export it
def dingzhi():
data_list = []
datas = db_stroageA.find()
for data in datas:
del data['_id']
db_stroageB.insert_one(data)
data_list.append(data)
df = pd.DataFrame(data_list)
df.to_excel('./项目资讯-定制.xlsx', index=False)
if __name__ == '__main__':
# esData()
# shencai()
# dingzhi()
pass
baseCore.close()
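Note that esData(), shencai() and dingzhi() all write to the same ./项目资讯-定制.xlsx file, so only one of them should be un-commented per run. A usage sketch of the __main__ block:

if __name__ == '__main__':
    esData()          # or shencai() / dingzhi(), one at a time: they share the output path
    baseCore.close()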
......@@ -46,10 +46,10 @@ class Spider():
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
return
if self.config.getboolean('detailSelect', 'originFlg'):
# origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
source = soup.find('body').find('script').text
source = re.findall('source = \"(.*?)\";', source)[0]
origin = source
origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
# source = soup.find('body').find('script').text
# source = re.findall('source = \"(.*?)\";', source)[0]
# origin = source
try:
try:
origin = origin.split('来源:')[1].strip()
......@@ -88,8 +88,8 @@ class Spider():
def doJob(self):
for page in range(int(self.config.get('page', 'begin')), int(self.config.get('page', 'end'))):
if self.config.getboolean('home', 'urlFlg') and page == self.config.get('home', 'urlBeginNum'):
url = self.config.get('sit', 'urlBegin')
if self.config.getboolean('home', 'urlFlg') and page == int(self.config.get('home', 'urlBeginNum')):
url = self.config.get('home', 'urlBegin')
else:
url = self.config.get('home', 'url').format(page)
soup = getSoup(url, self.getHeader())
......@@ -104,14 +104,14 @@ class Spider():
title = data_info.select(self.config.get('homeSelect', 'title'))[0].text.strip()
except:
continue
href = data_info.get('onclick')
href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
# href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('onclick')
# href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
# href = data_info.get('href')
if self.config.getboolean('homeSelect', 'publishDateFlg'):
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0] + ' 00:00:00'
publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
# publishDate = href.split('net.cn')[1].split('art_')[0]
# publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
continue
else:
......@@ -121,13 +121,14 @@ class Spider():
log.info(f'开始采集==={title}==={publishDate}==={href}')
if not self.config.getboolean('doJob', 'flg'):
break
try:
pass
# self.getDic(href, title, publishDate)
except Exception as e:
log.error(f'{title}===采集失败==={e}')
# try:
self.getDic(href, title, publishDate)
# except Exception as e:
# log.error(f'{title}===采集失败==={e}')
if not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
if not self.config.getboolean('doJob', 'flg'):
if not self.config.getboolean('doJob', 'flg') or not self.config.getboolean('doJob', 'insertFlg'):
break
time.sleep(0.5)
......
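The new list-page branch keeps only the date part of the raw string (everything before 'T'), reparses it with the configured publishDateType, and reformats it into a full timestamp before the range check. A minimal sketch of that normalization with a hypothetical helper, assuming the '%Y-%m-%d' format that publishDateType resolves to:

import datetime

def normalize_publish_date(raw, fmt='%Y-%m-%d'):
    # '2024-08-02T10:15:00+08:00' -> '2024-08-02 00:00:00'
    raw = raw.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
    return datetime.datetime.strptime(raw, fmt).strftime('%Y-%m-%d %H:%M:%S')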