Commit b6df2db3  Author: 刘伟刚

Code changes for the 中国政府采购网 (China Government Procurement Network) crawler

Parent d974abb1
......@@ -134,6 +134,7 @@ def paserList(searchmsg,social_code):
'sourceAddress': sourceAddress,  # link to the original article
'summary': '',
'title': title,
'source': source,
'socialCreditCode': social_code,
'year': published[:4]
}
......@@ -249,7 +250,7 @@ def sendToKafka(detailmsg):
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': 'Tradingview',
'origin': detailmsg['source'],
'publishDate': detailmsg['publishDate'],
'sid': '1711619846545776641',
'sourceAddress': detailmsg['sourceAddress'],  # link to the original article
......@@ -316,23 +317,23 @@ if __name__ == '__main__':
    # url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
    # searchmsg=reqmsg(url)
    # print(searchmsg)
    getStockFromSql()
    # while True:
    #     try:
    #         tradview_ticker=r.lpop('tradview_ticker')
    #         if tradview_ticker:
    #
    #             tradviewticker = tradview_ticker.decode(errors='ignore')
    #             log.info(f'采集资讯的企业{tradviewticker}')
    #             ticker_param=str(tradviewticker).split('_')[0]
    #             social_code=str(tradviewticker).split('_')[1]
    #             url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
    #             log.info(f'采集资讯企业列表地址{tradview_ticker}')
    #             searchmsg=reqmsg(url)
    #             paserList(searchmsg,social_code)
    #     except Exception as e:
    #         log.info(f'redis中获取企业信息为空{e}')
    #         break
    # getStockFromSql()
    while True:
        try:
            tradview_ticker=r.lpop('tradview_ticker')
            if tradview_ticker:
                tradviewticker = tradview_ticker.decode(errors='ignore')
                log.info(f'采集资讯的企业{tradviewticker}')
                ticker_param=str(tradviewticker).split('_')[0]
                social_code=str(tradviewticker).split('_')[1]
                url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
                log.info(f'采集资讯企业列表地址{tradview_ticker}')
                searchmsg=reqmsg(url)
                paserList(searchmsg,social_code)
        except Exception as e:
            log.info(f'redis中获取企业信息为空{e}')
            break
......
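For context on the loop above: it pops entries of the form <ticker>_<socialCreditCode> from the Redis list 'tradview_ticker' (note that r.lpop returns None when the list is empty, so the loop only exits through the except branch). A minimal sketch of the producer side that getStockFromSql presumably implements is shown below; the table and column names are assumptions, not taken from this commit.

import redis
import pymysql

# Assumed connection settings; the real project reads these from its own config.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

def push_tickers_to_redis():
    # Hypothetical table/column names, for illustration only.
    conn = pymysql.connect(host='127.0.0.1', user='root', password='***', database='caiji', charset='utf8mb4')
    with conn.cursor() as cursor:
        cursor.execute("SELECT ticker, social_credit_code FROM enterprise_ipo WHERE ticker IS NOT NULL")
        for ticker, social_code in cursor.fetchall():
            # The consumer splits on '_', e.g. 'NASDAQ:AAPL_91310000XXXXXXXXXX'
            r.rpush('tradview_ticker', f'{ticker}_{social_code}')
    conn.close()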
......@@ -170,7 +170,7 @@ class JrttnewsTaskJob(object):
kwmsg={
'kw':kk,
'wordsCode':'jrtt',
'sid':'1020'
'sid':'1706193555675926530'
}
kwList.append(kwmsg)
return kwList
......
......@@ -255,8 +255,9 @@ class QQnewsSpider(object):
'search_count_limit': 10,
'appver': '15.5_qqnews_7.1.80'
}
proxy =self.baseCore.get_proxy()
try:
res=requests.post(url,headers=headers2,data=data,verify=False,timeout=10)
res=requests.post(url,headers=headers2,proxies=proxy,data=data,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
except Exception as e:
......@@ -283,31 +284,32 @@ class QQnewsSpider(object):
    def get_page_html(self):
        # Configure the search-result list page and the number of pages to collect
        url='https://i.news.qq.com/gw/pc_search/result'
        totalnum=5
        totalnum=6
        keyword=self.searchkw
        # keyword='浙江国有资本运营公司'
        for pagenum in range(0,totalnum):
            qerhtml=self.reqPostMsg(url,pagenum,keyword)
            jsonmsg=json.loads(qerhtml)
            secList=jsonmsg['secList']
            for sec in secList:
                try:
                    title=sec['newsList'][0]['title']
                    durl=sec['newsList'][0]['url']
                    pubtime=sec['newsList'][0]['time']
                    source=sec['newsList'][0]['source']
                    is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
                    if is_member:
            if secList:
                for sec in secList:
                    try:
                        title=sec['newsList'][0]['title']
                        durl=sec['newsList'][0]['url']
                        pubtime=sec['newsList'][0]['time']
                        source=sec['newsList'][0]['source']
                        is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
                        if is_member:
                            continue
                        detailmsg={
                            'title':title,
                            'detailUrl':durl,
                            'sourceTag':source,
                            'publishTag':pubtime
                        }
                        self.detailList.put(detailmsg)
                    except Exception as e :
                        continue
                    detailmsg={
                        'title':title,
                        'detailUrl':durl,
                        'sourceTag':source,
                        'publishTag':pubtime
                    }
                    self.detailList.put(detailmsg)
                except Exception as e :
                    continue
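The added `if secList:` guard skips pages where the search API returns no results. A small sketch of the response shape this parser assumes (field names are taken from the code above; the values are made up):

import json

# Illustrative payload; only the fields read by get_page_html are shown.
sample = json.dumps({
    "secList": [
        {"newsList": [{
            "title": "example title",
            "url": "https://new.qq.com/rain/a/20231001A00001",   # made-up article URL
            "time": "2023-10-01 08:00:00",
            "source": "腾讯新闻"
        }]}
    ]
})

jsonmsg = json.loads(sample)
secList = jsonmsg.get('secList') or []
for sec in secList:
    news = (sec.get('newsList') or [{}])[0]
    print(news.get('title'), news.get('url'))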
    # Fetch the detail pages
    def get_detail_html(self):
        # Get the handle of the current window
......@@ -322,7 +324,7 @@ class QQnewsSpider(object):
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
# self.sendkafka(processitem)
self.sendkafka(processitem)
self.r.sadd('pyqqnews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
......@@ -412,8 +414,8 @@ class QQnewsSpider(object):
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
res=requests.get(url,headers=headers,verify=False,timeout=10)
proxy = self.baseCore.get_proxy()
res=requests.get(url,headers=headers,proxies=proxy,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
return text
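Both request helpers now route traffic through self.baseCore.get_proxy() instead of the hard-coded local proxy. requests expects a dict mapping scheme to proxy URL; a minimal sketch of such a helper is shown below (the static pool is an assumption; the real BaseCore presumably draws from a proxy pool or database).

import random
import requests

def get_proxy():
    # Hypothetical static pool; replace with the project's real proxy source.
    pool = [
        'http://10.0.0.1:3128',
        'http://10.0.0.2:3128',
    ]
    p = random.choice(pool)
    # One entry per scheme, as requests expects.
    return {'http': p, 'https': p}

# Usage mirrors the call in the diff:
# res = requests.get(url, headers=headers, proxies=get_proxy(), verify=False, timeout=10)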
......@@ -421,7 +423,7 @@ class QQnewsSpider(object):
def extractorMsg(self,url,title):
content=''
contentWithTag=''
lang=''
lang='cn'
lang=self.detect_language(title)
sm=SmartExtractor(lang)
try:
......@@ -521,10 +523,10 @@ class QQnewsSpider(object):
if content!='':
processitem={
"sid":self.sid,
"source":"5",
"source":"22", #腾讯新闻
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
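sendkafka(processitem) is re-enabled in get_detail_html above; a minimal sketch of such a producer using kafka-python follows. The broker address and topic name are assumptions, not values from this project.

import json
from kafka import KafkaProducer

# Assumed broker; the real address lives in the project's config.
producer = KafkaProducer(
    bootstrap_servers=['127.0.0.1:9092'],
    value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8')
)

def sendkafka(processitem):
    # 'crawlerMetaSearch' is an assumed topic name, used here for illustration only.
    producer.send('crawlerMetaSearch', value=processitem)
    producer.flush()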
......@@ -170,7 +170,7 @@ class QQnewsTaskJob(object):
kwmsg={
'kw':kk,
'wordsCode':'qqnews',
'sid':'102003'
'sid':'1706193555675926530'
}
kwList.append(kwmsg)
return kwList
......@@ -182,7 +182,7 @@ class QQnewsTaskJob(object):
try:
jrttnewsSpider.get_page_html()
except Exception as e:
logger.info('今日头条搜索异常'+searchkw)
logger.info('腾讯新闻搜索异常'+searchkw)
if jrttnewsSpider.detailList.qsize() != 0:
try:
......@@ -218,7 +218,7 @@ if __name__ == '__main__':
continue
if kwList:
# Create a thread pool with a fixed number of worker threads
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# Submit one task to the pool for each data item
results = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
# Collect the results of the tasks
......
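Raising max_workers from 1 to 3 lets three keywords crawl concurrently. The submit/collect pattern used here, reduced to a runnable sketch (runSpider below is a stand-in for the job's real method):

import concurrent.futures

def runSpider(data):
    # Stand-in for qqnewsTaskJob.runSpider; just echoes the keyword it handled.
    return data['kw']

kwList = [{'kw': 'keyword-1'}, {'kw': 'keyword-2'}, {'kw': 'keyword-3'}]

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(runSpider, data) for data in kwList]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())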
......@@ -140,8 +140,8 @@ class SougouSpider(object):
    def xpath_paser(self,html):
        lists=[]
        itemTag=html.xpath('//div[@class="vrwrap"]')
        for itemTag in itemTag:
        itemTags=html.xpath('//div[@class="vrwrap"]')
        for itemTag in itemTags:
            try:
                title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
            except Exception as e:
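The rename from itemTag to itemTags stops the loop variable from shadowing the node list it iterates over. In isolation, the lxml pattern is:

from lxml import etree

# Illustrative HTML; the real input is a Sogou search-result page.
html = etree.HTML('<div class="vrwrap"><h3 class="vr-title"><a>demo title</a></h3></div>')

itemTags = html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTags:
    # The leading './/' keeps the query scoped to the current result block.
    titles = itemTag.xpath('.//h3[@class="vr-title"]/a/text()')
    title = titles[0] if titles else ''
    print(title)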
......@@ -512,7 +512,7 @@ class SougouSpider(object):
"source":"5",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
......@@ -286,7 +286,7 @@ class SouhunewsSpider(object):
# Fetch each page of results and start crawling.
def get_page_html(self):
# Configure the search-result list page and the number of pages to collect
totalnum=5
totalnum=6
keyword=self.searchkw
# keyword='浙江国有资本运营公司'
for pagenum in range(0,totalnum):
......@@ -333,7 +333,7 @@ class SouhunewsSpider(object):
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
# self.sendkafka(processitem)
self.sendkafka(processitem)
self.r.sadd('pysouhunews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
......@@ -528,10 +528,10 @@ class SouhunewsSpider(object):
if content!='':
processitem={
"sid":self.sid,
"source":"3",
"source":"23", #搜狐新闻
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
......@@ -170,7 +170,7 @@ class SouhunewsTaskJob(object):
kwmsg={
'kw':kk,
'wordsCode':'souhu',
'sid':'102002'
'sid':'1706193555675926530'
}
kwList.append(kwmsg)
return kwList
......@@ -182,7 +182,7 @@ class SouhunewsTaskJob(object):
try:
jrttnewsSpider.get_page_html()
except Exception as e:
logger.info('今日头条搜索异常'+searchkw)
logger.info('搜狐新闻搜索异常'+searchkw)
if jrttnewsSpider.detailList.qsize() != 0:
try:
......@@ -218,7 +218,7 @@ if __name__ == '__main__':
continue
if kwList:
# Create a thread pool with a fixed number of worker threads
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# Submit one task to the pool for each data item
results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
# Collect the results of the tasks
......
#coding=utf-8
#coding=utf-8
......@@ -62,7 +62,7 @@ class BaiduSpider(object):
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# Insert the list data into table baidu_search_result
# Insert the list data into table meta_search_result
def itemInsertToTable(self,items):
try:
itemdata=[]
......@@ -72,7 +72,7 @@ class BaiduSpider(object):
data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
itemdata.append(data)
sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
sql ="INSERT into meta_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
self.logger.info("数据插入数据库成功!")
# SQL statement used to insert the data
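The insert now targets meta_search_result with the same column list as before. A self-contained sketch of the executemany call (connection parameters and the sample row are placeholders, not project values):

import pymysql
from datetime import datetime

# Placeholder credentials; the project builds cursorM from its own connection.
conn = pymysql.connect(host='127.0.0.1', user='root', password='***', database='caiji', charset='utf8mb4')
cursorM = conn.cursor()

sql = ("INSERT INTO meta_search_result "
       "(sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) "
       "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")

nowtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
itemdata = [
    ('1', 'baidu', 'sample title', 'https://example.com/article', 'sample source',
     nowtime, 'plain-text content', '<p>plain-text content</p>', '1', 'sample keyword', nowtime),
]
cursorM.executemany(sql, itemdata)
conn.commit()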
......@@ -115,8 +115,8 @@ class BaiduSpider(object):
    def xpath_paser(self,html):
        lists=[]
        itemTag=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
        for itemTag in itemTag:
        itemTags=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
        for itemTag in itemTags:
            try:
                title=itemTag.xpath('.//h3[@class="news-title_1YtI1 "]/a/text()')[0]
            except Exception as e:
......@@ -487,7 +487,7 @@ class BaiduSpider(object):
"source":"3",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -200,7 +200,7 @@ if __name__ == '__main__':
continue
if kwList:
# Create a thread pool with a fixed number of worker threads
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# Submit one task to the pool for each data item
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# Collect the results of the tasks
......