Commit e8838732  Author: LiuLiYuan

lly 2024-08-02

Parent 63f0268b
@@ -57,7 +57,7 @@ def doJob():
         origin = dataJson['source']
         if db_stroage.find_one({'原文链接': href}):
             continue
-        if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
+        if publishDate > '2022-12-31 23:59:59' or publishDate < '2022-01-01 00:00:00':
             continue
         log.info(f'开始采集==={title}')
         getDic(href, title, publishDate, origin, type[0])
......
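The window check in this hunk relies on plain string comparison: zero-padded 'YYYY-MM-DD HH:MM:SS' strings sort in the same order as the datetimes they encode, so swapping the 2023 bounds for 2022 bounds is all the retargeting needs. A minimal sketch of that filter (the helper name is illustrative, not from the repo):

# Sketch: date-window filter by lexicographic string comparison.
def in_window(publishDate, start='2022-01-01 00:00:00', end='2022-12-31 23:59:59'):
    # works because all values are zero-padded and share one format
    return start <= publishDate <= end

assert in_window('2022-08-02 10:30:00')
assert not in_window('2023-01-01 00:00:00')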
 [doJob]
 ;whether to start collecting
+;verify the list page
 flg = False
+;verify the detail page
 insertFlg = False
+;when flg is False, only the list page is verified
+;when flg is True and insertFlg is False, only the detail page is verified
+;when both are True, data is collected
+;page numbers
 [page]
 ;start and end of the page range to crawl
 ;must be determined from the link
-begin = 1
+begin = 0
 end = 3
+;links
 [home]
+;site name
 origin = 贸易投资网-贸易商机
 ;whether the home-page link differs from the following links: True if it does, False if not
-urlFlg = False
+urlFlg = True
 ;only needs to be filled in when the home-page link differs from the following links
 ;example http://www.cgcjm.cecep.cn/g9222.aspx
-urlBegin = http://www.aieco.org/article/ywfc
+urlBegin = http://www.camce.com.cn/xbcn/xwzx/xmdt/index.html
 ;only needs to be filled in when the home-page link differs from the following links
-urlBeginNum = 1
+urlBeginNum = 0
 ;generic format of the crawled link, with {} in place of the page number
 ;example http://www.cgcjm.cecep.cn/g9222/m17246/mp{}.aspx
-url = https://www.tradeinvest.cn/trade/list?page={}&title=&industryType=&transactionType=&targetMarket=
+url = http://www.camce.com.cn/xbcn/xwzx/xmdt/index_{}.html
+;list page
 [homeSelect]
 ;select for the article list
-data_info_list = body > div.container > div > div > div
+data_info_list = body > div:nth-of-type(3) > div > div.sidebarR > ul > li
 ;select containing the title
-title = h5
+title = a
 ;set this to False when the publish date is not shown on the list page
 publishDateFlg = True
-publishDate = span.format-datetime
+publishDate = span
 ;format of the captured date
 publishDateType = %%Y-%%m-%%d
 ;select containing the link
 href = a
+;detail page
 [detailSelect]
 ;body content
-contentWithTag = #articleBody
+contentWithTag = #xwxq2 > div
 ;whether a source is present
 originFlg = True
 ;source
-origin = #leftList > div.content_article_source > table > tr > td.first
+origin = body > div:nth-of-type(3) > div > div.sidebarR > div.xwxq > div > form > table > tbody > tr > td:nth-of-type(2)
 ;publish date; only needs to be configured when publishDateFlg in [homeSelect] is False
 publishDate = div.second-news-item-date
 ;format of the captured date
@@ -49,15 +61,14 @@ publishDateType = %%Y-%%m-%%d %%H:%%M:%%S
 [headers]
 ;request header info
-Accept = text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
-Accept-Language = zh-CN,zh-TW;q=0.9,zh;q=0.8
-Connectio = keep-alive
-Sec-Fetch-Dest = document
-Sec-Fetch-Mode = navigate
-Sec-Fetch-Site = same-origin
-Sec-Fetch-User = ?1
-Upgrade-Insecure-Requests = 1
-User-Agent = Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0
-sec-ch-ua = "Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"
-sec-ch-ua-mobile = ?0
-sec-ch-ua-platform = "Windows"
\ No newline at end of file
+Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
+Accept-Encoding: gzip, deflate
+Accept-Language: zh-CN,zh-TW;q=0.9,zh;q=0.8
+Cache-Control: max-age=0
+Connection: keep-alive
+Cookie: Hm_lvt_e54259884352514b610814aa18f84433=1722569732; HMACCOUNT=9222512DCF10CB7B; Hm_lpvt_e54259884352514b610814aa18f84433=1722569808
+Host: www.camce.com.cn
+If-Modified-Since: Mon, 15 Jul 2024 00:40:15 GMT
+If-None-Match: "7b53-61d3e78f899c0-gzip"
+Upgrade-Insecure-Requests: 1
+User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0
\ No newline at end of file
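The .ini file above is what drives the generic Spider further down in this commit. A minimal sketch of how such a file is typically consumed with configparser (the file name config.ini is assumed; note that %%Y-%%m-%%d un-escapes to %Y-%m-%d, and that the "key = value" and "key: value" forms in [headers] parse identically):

# Sketch: reading the spider config with configparser (file name assumed).
import configparser

config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')

date_fmt = config.get('homeSelect', 'publishDateType')   # '%Y-%m-%d' after %% un-escaping
headers = dict(config.items('headers'))                   # keys are lower-cased by configparser

for page in range(config.getint('page', 'begin'), config.getint('page', 'end')):
    # the first page often has its own URL, controlled by urlFlg / urlBegin / urlBeginNum
    if config.getboolean('home', 'urlFlg') and page == config.getint('home', 'urlBeginNum'):
        url = config.get('home', 'urlBegin')
    else:
        url = config.get('home', 'url').format(page)
    print(page, url)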
@@ -54,13 +54,13 @@ def getDic(url, title, publishDate):
 def doJob():
-    # for page in range(1, 13):
-    for page in range(0,5):
-        if page == 0:
-            url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
-        else:
-            url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
-        # url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
+    for page in range(1, 13):
+        # for page in range(0,5):
+        # if page == 0:
+        #     url = f'https://www.crbc.com/site/crbc/zwjgdt/index.html'
+        # else:
+        #     url = f'https://www.crbc.com/site/crbc/zwjgdt/index_{page}.html'
+        url = f'https://www.crbc.com/site/crbc/gsxw/index_{page}.html?ordernum=1'
         soup = getSoup(url)
         liList = soup.find('ul', class_='right-column-list').find_all('li')
         for liTag in liList:
......
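getSoup is not shown in this excerpt; judging from the call sites here and the inlined requests/BeautifulSoup code in the next file, it presumably looks roughly like the following (a sketch, not the repo's actual helper):

# Assumed shape of the getSoup(url) helper used above.
import requests
from bs4 import BeautifulSoup

def getSoup(url, headers=None, timeout=30):
    req = requests.get(url, headers=headers, timeout=timeout)
    req.encoding = req.apparent_encoding   # let requests guess the real charset
    return BeautifulSoup(req.text, 'html.parser')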
@@ -50,7 +50,8 @@ def getDic(url, title, publishDate):
 def doJob():
     for page in range(1, 4):
-        url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
+        # url = 'https://www.intl-gc.avic.com/main/news/lists/id/34.html?page=2'
+        url = ''
         req = requests.get(url, headers=headers)
         req.encoding = req.apparent_encoding
         soup = BeautifulSoup(req.text, 'html.parser')
......
@@ -8,12 +8,13 @@ from bs4 import BeautifulSoup
 from base import BaseCore
 from elasticsearch import Elasticsearch, helpers
-# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
-# db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
+baseCore = BaseCore.BaseCore()
+
+# db_stroageShencaiA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采_copy1']
+db_stroageShencaiB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目咨询-神采']
 db_stroageA = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制']
 db_stroageB = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['项目资讯-定制_']
 es_client = Elasticsearch([{'host': '114.115.215.250', 'port': '9700'}], http_auth=('elastic', 'zzsn9988'), timeout=600)
-baseCore = BaseCore.BaseCore()
 from powerchina import db_stroage
@@ -63,8 +64,8 @@ def select(sid):
                     {
                         "range": {
                             "publishDate": {
-                                "gte": "2023-01-01T00:00:00",
-                                "lte": "2023-12-31T23:59:59"
+                                "gte": "2022-01-01T00:00:00",
+                                "lte": "2022-12-31T23:59:59"
                             }
                         }
                     }
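Only the range clause of select() appears in the diff; a hedged sketch of the kind of query it presumably wraps is below. The index name and the sid term filter are assumptions, not from the commit (the field names come from the result handling in esData further down):

# Sketch of a select(sid)-style query around the range clause above (index name is a placeholder).
def select_sketch(sid):
    body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"sid.keyword": sid}},
                    {"range": {"publishDate": {"gte": "2022-01-01T00:00:00",
                                               "lte": "2022-12-31T23:59:59"}}}
                ]
            }
        }
    }
    # helpers.scan streams all hits instead of the default 10-result page
    return helpers.scan(es_client, index='your-index', query=body, scroll='5m')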
@@ -191,29 +192,109 @@ def select(sid):
 # df = pd.DataFrame(data_list)
 # df.to_excel('./项目资讯-定制.xlsx', index=False)
-sids = ['1811965474060091394',
-        '1811960555242528770',
-        '1811958016631644161',
-        '1811955892663336962',
-        '1811950817584857089',
-        '1811939863161716737',
-        '1811937580770402305',
-        '1811933720142135297',
-        '1811566665440186370',
-        '1810983037486170113',
-        '1810980529153966081',
-        '1810978470438567938',
-        '1810976012817707009',
-        '1810972790830858242',
-        '1810968708888068097',
-        '1810960658496102401',
-        '1810954505034969089',
-        '1810947397855879170']
-for sid in sids:
-    num = 0
-    datas = select(sid)
-    for data in datas:
-        num += 1
-    print(f'{sid}==={num}')
+# sids = ['1811965474060091394',
+#         '1811960555242528770',
+#         '1811958016631644161',
+#         '1811955892663336962',
+#         '1811950817584857089',
+#         '1811939863161716737',
+#         '1811937580770402305',
+#         '1811933720142135297',
+#         '1811566665440186370',
+#         '1810983037486170113',
+#         '1810980529153966081',
+#         '1810978470438567938',
+#         '1810976012817707009',
+#         '1810972790830858242',
+#         '1810968708888068097',
+#         '1810960658496102401',
+#         '1810954505034969089',
+#         '1810947397855879170']
+# for sid in sids:
+#     num = 0
+#     datas = select(sid)
+#     for data in datas:
+#         num += 1
+#     print(f'{sid}==={num}')
+
+# fetch data from ES
+def esData():
+    sql = 'select source_id from info_source_group_map where group_id="1697061836360126466"'
+    baseCore.cursor_.execute(sql)
+    datas = baseCore.cursor_.fetchall()
+    dics = []
+    urlList = []
+    for data in datas:
+        sid = data[0]
+        sqlSelect = f'select web_site_name from info_source where id="{sid}"'
+        baseCore.cursor_.execute(sqlSelect)
+        web = baseCore.cursor_.fetchone()[0]
+        results = select(sid)
+        num = 0
+        for result in results:
+            try:
+                title = result['_source']['title']
+                publishDate = result['_source']['publishDate']
+                if len(publishDate) == 10:
+                    publishDate = publishDate + ' 00:00:00'
+                else:
+                    publishDate = publishDate.replace('T', ' ')
+                origin = result['_source']['origin']
+                content = result['_source']['content']
+                contentWithTag = result['_source']['contentWithTag']
+                url = result['_source']['sourceAddress']
+            except:
+                continue
+            if url in urlList:
+                continue
+            dic = {
+                '标题': title,
+                '发布时间': publishDate,
+                '来源': origin,
+                '网站': web,
+                '正文': content,
+                '正文带标签': str(contentWithTag),
+                '原文链接': url
+            }
+            dics.append(dic)
+            urlList.append(url)
+    df = pd.DataFrame(dics)
+    df.to_excel('./项目资讯-定制.xlsx', index=False)
+
+# export Shencai data
+def shencai():
+    datas_ = []
+    datas = db_stroageShencaiB.find({'日期': {"$gte": '2022-01-01 00:00:00', "$lt": '2023-01-01 00:00:00'}})
+    for data in datas:
+        title = data['标题']
+        href = data['URL']
+        origin = data['来源']
+        web = data['栏目']
+        content = data['正文不带标签']
+        contentWithTag = data['内容']
+        publishDate = data['日期']
+        # print(publishDate)
+        datas_.append([title, publishDate, origin, web, content, contentWithTag, href])
+    df = pd.DataFrame(datas_, columns=['标题', '发布时间', '来源', '网站', '正文', '正文带标签', '原文链接'])
+    df.to_excel('./项目资讯-定制.xlsx', index=False)
+
+# mongodb customized collections
+def dingzhi():
+    data_list = []
+    datas = db_stroageA.find()
+    for data in datas:
+        del data['_id']
+        db_stroageB.insert_one(data)
+        data_list.append(data)
+    df = pd.DataFrame(data_list)
+    df.to_excel('./项目资讯-定制.xlsx', index=False)
+
+if __name__ == '__main__':
+    # esData()
+    # shencai()
+    # dingzhi()
+    pass
 baseCore.close()
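shencai() above filters the 日期 field with $gte/$lt on plain strings; that works because MongoDB compares strings lexicographically and zero-padded 'YYYY-MM-DD HH:MM:SS' values sort in date order. A standalone sketch of the same pattern (URI and collection names here are placeholders, not the project's):

# Sketch: range-filtering string-typed dates in MongoDB, as shencai() does.
import pymongo

coll = pymongo.MongoClient('mongodb://localhost:27017')['ZZSN']['demo_articles']  # placeholders
cursor = coll.find({'日期': {'$gte': '2022-01-01 00:00:00', '$lt': '2023-01-01 00:00:00'}})
for doc in cursor:
    print(doc.get('标题'), doc.get('日期'))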
@@ -46,10 +46,10 @@ class Spider():
         if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
             return
         if self.config.getboolean('detailSelect', 'originFlg'):
-            # origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
-            source = soup.find('body').find('script').text
-            source = re.findall('source = \"(.*?)\";', source)[0]
-            origin = source
+            origin = soup.select(self.config.get('detailSelect', 'origin'))[0].text
+            # source = soup.find('body').find('script').text
+            # source = re.findall('source = \"(.*?)\";', source)[0]
+            # origin = source
             try:
                 try:
                     origin = origin.split('来源:')[1].strip()
@@ -88,8 +88,8 @@ class Spider():
     def doJob(self):
         for page in range(int(self.config.get('page', 'begin')), int(self.config.get('page', 'end'))):
-            if self.config.getboolean('home', 'urlFlg') and page == self.config.get('home', 'urlBeginNum'):
-                url = self.config.get('sit', 'urlBegin')
+            if self.config.getboolean('home', 'urlFlg') and page == int(self.config.get('home', 'urlBeginNum')):
+                url = self.config.get('home', 'urlBegin')
             else:
                 url = self.config.get('home', 'url').format(page)
             soup = getSoup(url, self.getHeader())
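The int() cast added here is the substantive fix: configparser.get() always returns strings, so page == self.config.get('home', 'urlBeginNum') compared an int against '0' and never matched, and the first page was requested with the paginated URL instead of urlBegin (the 'sit' section name was a second bug). A tiny repro:

# Why the int() cast matters: configparser values are strings.
import configparser

cfg = configparser.ConfigParser()
cfg.read_string('[home]\nurlFlg = True\nurlBeginNum = 0\n')

page = 0
print(page == cfg.get('home', 'urlBeginNum'))        # False: 0 != '0'
print(page == int(cfg.get('home', 'urlBeginNum')))   # True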
@@ -104,14 +104,14 @@ class Spider():
                     title = data_info.select(self.config.get('homeSelect', 'title'))[0].text.strip()
                 except:
                     continue
-                href = data_info.get('onclick')
-                href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
-                # href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
+                # href = data_info.get('onclick')
+                # href = 'https://www.tradeinvest.cn/trade/' + re.findall('\(\"(.*)\"\)',href)[0] + '/detail'
+                href = data_info.select(self.config.get('homeSelect', 'href'))[0].get('href')
                 # href = data_info.get('href')
                 if self.config.getboolean('homeSelect', 'publishDateFlg'):
-                    publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0] + ' 00:00:00'
+                    publishDate = data_info.select(self.config.get('homeSelect', 'publishDate'))[0].text.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
                     # publishDate = href.split('net.cn')[1].split('art_')[0]
-                    # publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
+                    publishDate = datetime.datetime.strptime(publishDate, self.config.get('homeSelect', 'publishDateType')).strftime('%Y-%m-%d %H:%M:%S')
                     if publishDate > '2023-12-31 23:59:59' or publishDate < '2023-01-01 00:00:00':
                         continue
                 else:
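This hunk switches the publish-date handling back to the configured format: take the date part of the list-page text, parse it with strptime using publishDateType, and re-emit the canonical 'YYYY-MM-DD HH:MM:SS' string that the window check expects. A worked example (the raw value is made up):

# Sketch of the normalization pipeline re-enabled above.
import datetime

raw = ' 2023-08-02T10:30:00 '                 # example list-page text, not from the site
bare = raw.strip().replace('\t', '').replace('\n', '').replace('\r', '').split('T')[0]
fmt = '%Y-%m-%d'                              # what publishDateType = %%Y-%%m-%%d expands to
publishDate = datetime.datetime.strptime(bare, fmt).strftime('%Y-%m-%d %H:%M:%S')
print(publishDate)                            # 2023-08-02 00:00:00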
@@ -121,13 +121,14 @@ class Spider():
                 log.info(f'开始采集==={title}==={publishDate}==={href}')
                 if not self.config.getboolean('doJob', 'flg'):
                     break
-                try:
-                    pass
-                    # self.getDic(href, title, publishDate)
-                except Exception as e:
-                    log.error(f'{title}===采集失败==={e}')
+                # try:
+                self.getDic(href, title, publishDate)
+                # except Exception as e:
+                #     log.error(f'{title}===采集失败==={e}')
+                if not self.config.getboolean('doJob', 'insertFlg'):
+                    break
                 time.sleep(0.5)
-            if not self.config.getboolean('doJob', 'flg'):
+            if not self.config.getboolean('doJob', 'flg') or not self.config.getboolean('doJob', 'insertFlg'):
                 break
             time.sleep(0.5)
......
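Together with the [doJob] comments added to the config, the loop above implements a staged rollout: with flg = False only the list page is exercised, with flg = True and insertFlg = False one detail page is fetched and the run stops, and with both True the spider collects everything. A rough sketch of that control flow (the helper names are illustrative, not the repo's API):

# Rough sketch of the flg / insertFlg staging described in the config.
def run(spider, config):
    for page_url in spider.page_urls():            # illustrative helper
        for item in spider.parse_list(page_url):
            if not config.getboolean('doJob', 'flg'):
                break                              # stop after verifying the list page
            spider.parse_detail(item)
            if not config.getboolean('doJob', 'insertFlg'):
                break                              # stop after verifying one detail page
        if not config.getboolean('doJob', 'flg') or not config.getboolean('doJob', 'insertFlg'):
            break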