Commit b6df2db3 Author: 刘伟刚

Code-change commit for the 中国政府采购网 (China Government Procurement Network) collector

Parent d974abb1
@@ -134,6 +134,7 @@ def paserList(searchmsg,social_code):
         'sourceAddress': sourceAddress, # 原文链接
         'summary': '',
         'title': title,
+        'source': source,
         'socialCreditCode': social_code,
         'year': published[:4]
     }
@@ -249,7 +250,7 @@ def sendToKafka(detailmsg):
         'id': '',
         'keyWords': '',
         'lang': 'zh',
-        'origin': 'Tradingview',
+        'origin': detailmsg['source'],
         'publishDate': detailmsg['publishDate'],
         'sid': '1711619846545776641',
         'sourceAddress': detailmsg['sourceAddress'], # 原文链接
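With the two hunks above, each list item now carries the site it came from, and the Kafka message's origin uses that per-item value instead of the hardcoded 'Tradingview' label. A minimal sketch of the send path under that schema, assuming a kafka-python producer; the broker address and topic name are placeholders, since the repo's real producer setup is outside this diff:

    import json
    from kafka import KafkaProducer  # kafka-python

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',  # assumed broker address
        value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'),
    )

    def send_to_kafka(detailmsg, topic='news_topic'):  # topic name is a placeholder
        producer.send(topic, {
            'lang': 'zh',
            'origin': detailmsg['source'],            # was the fixed string 'Tradingview'
            'publishDate': detailmsg['publishDate'],
            'sid': '1711619846545776641',
            'sourceAddress': detailmsg['sourceAddress'],
        })
        producer.flush()                              # block until the batch is delivered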
@@ -316,23 +317,23 @@ if __name__ == '__main__':
     # url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
     # searchmsg=reqmsg(url)
     # print(searchmsg)
-    getStockFromSql()
-    # while True:
-    #     try:
-    #         tradview_ticker=r.lpop('tradview_ticker')
-    #         if tradview_ticker:
-    #
-    #             tradviewticker = tradview_ticker.decode(errors='ignore')
-    #             log.info(f'采集资讯的企业{tradviewticker}')
-    #             ticker_param=str(tradviewticker).split('_')[0]
-    #             social_code=str(tradviewticker).split('_')[1]
-    #             url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
-    #             log.info(f'采集资讯企业列表地址{tradview_ticker}')
-    #             searchmsg=reqmsg(url)
-    #             paserList(searchmsg,social_code)
-    #     except Exception as e:
-    #         log.info(f'redis中获取企业信息为空{e}')
-    #         break
+    # getStockFromSql()
+    while True:
+        try:
+            tradview_ticker=r.lpop('tradview_ticker')
+            if tradview_ticker:
+
+                tradviewticker = tradview_ticker.decode(errors='ignore')
+                log.info(f'采集资讯的企业{tradviewticker}')
+                ticker_param=str(tradviewticker).split('_')[0]
+                social_code=str(tradviewticker).split('_')[1]
+                url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
+                log.info(f'采集资讯企业列表地址{tradview_ticker}')
+                searchmsg=reqmsg(url)
+                paserList(searchmsg,social_code)
+        except Exception as e:
+            log.info(f'redis中获取企业信息为空{e}')
+            break
......
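The __main__ hunk above swaps a one-off getStockFromSql() run for draining the tradview_ticker Redis list. A standalone sketch of that queue-consumer pattern, assuming a local Redis, with a print standing in for reqmsg/paserList:

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)   # connection details assumed

    while True:
        raw = r.lpop('tradview_ticker')                  # items look like 'TICKER_SOCIALCODE'
        if raw is None:                                  # queue drained: stop the worker
            break
        ticker, social_code = raw.decode(errors='ignore').split('_', 1)
        url = ('https://news-headlines.tradingview.com/v2/headlines'
               f'?client=web&lang=zh-Hans&symbol={ticker}')
        print(url, social_code)                          # stand-in for reqmsg(url) + paserList(...)

Note the committed loop only reaches break via the except branch; when the list is empty, lpop returns None, the if-body is skipped, and the loop spins, so an explicit None check like the one above may be worth folding in.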
@@ -170,7 +170,7 @@ class JrttnewsTaskJob(object):
             kwmsg={
                 'kw':kk,
                 'wordsCode':'jrtt',
-                'sid':'1020'
+                'sid':'1706193555675926530'
             }
             kwList.append(kwmsg)
         return kwList
......
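This sid swap is repeated verbatim in the QQ and Souhu task jobs further down, so all three keyword jobs now report against the same subscription id. Since the literal is now duplicated across three files, a shared constant is one option; a sketch (module and helper names here are hypothetical, not part of the commit):

    # hypothetical shared config module
    META_SEARCH_SID = '1706193555675926530'

    def build_kw_list(keywords, words_code):
        # mirrors the kwmsg dicts the task jobs build inline
        return [{'kw': kw, 'wordsCode': words_code, 'sid': META_SEARCH_SID}
                for kw in keywords]

    print(build_kw_list(['浙江国有资本'], 'jrtt'))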
@@ -255,8 +255,9 @@ class QQnewsSpider(object):
             'search_count_limit': 10,
             'appver': '15.5_qqnews_7.1.80'
         }
+        proxy =self.baseCore.get_proxy()
         try:
-            res=requests.post(url,headers=headers2,data=data,verify=False,timeout=10)
+            res=requests.post(url,headers=headers2,proxies=proxy,data=data,verify=False,timeout=10)
             res.encoding='utf-8'
             text=res.text
         except Exception as e:
@@ -283,31 +284,32 @@ class QQnewsSpider(object):
     def get_page_html(self):
         #设置采集列表页面和页数
         url='https://i.news.qq.com/gw/pc_search/result'
-        totalnum=5
+        totalnum=6
         keyword=self.searchkw
         # keyword='浙江国有资本运营公司'
         for pagenum in range(0,totalnum):
             qerhtml=self.reqPostMsg(url,pagenum,keyword)
             jsonmsg=json.loads(qerhtml)
             secList=jsonmsg['secList']
-            for sec in secList:
-                try:
-                    title=sec['newsList'][0]['title']
-                    durl=sec['newsList'][0]['url']
-                    pubtime=sec['newsList'][0]['time']
-                    source=sec['newsList'][0]['source']
-                    is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
-                    if is_member:
-                        continue
-                    detailmsg={
-                        'title':title,
-                        'detailUrl':durl,
-                        'sourceTag':source,
-                        'publishTag':pubtime
-                    }
-                    self.detailList.put(detailmsg)
-                except Exception as e :
-                    continue
+            if secList:
+                for sec in secList:
+                    try:
+                        title=sec['newsList'][0]['title']
+                        durl=sec['newsList'][0]['url']
+                        pubtime=sec['newsList'][0]['time']
+                        source=sec['newsList'][0]['source']
+                        is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
+                        if is_member:
+                            continue
+                        detailmsg={
+                            'title':title,
+                            'detailUrl':durl,
+                            'sourceTag':source,
+                            'publishTag':pubtime
+                        }
+                        self.detailList.put(detailmsg)
+                    except Exception as e :
+                        continue
     # 获取详情页
     def get_detail_html(self):
         # 获取当前窗口的句柄
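Besides raising totalnum from 5 to 6 pages, the new if secList: guard covers responses where the search API returns an empty or null secList, which previously crashed the for loop. An equivalent defensive variant using defaulted lookups, with a canned payload standing in for the live response:

    import json

    qerhtml = '{"secList": null}'                # canned; live data comes from reqPostMsg
    jsonmsg = json.loads(qerhtml)

    for sec in (jsonmsg.get('secList') or []):   # None, missing, and [] all skip cleanly
        news = sec.get('newsList') or []
        if news:
            print(news[0].get('title'), news[0].get('url'))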
@@ -322,7 +324,7 @@ class QQnewsSpider(object):
                 bdetail=self.getDetailmsg(detailmsg)
                 processitem=self.getProcessitem(bdetail)
                 try:
-                    # self.sendkafka(processitem)
+                    self.sendkafka(processitem)
                     self.r.sadd('pyqqnews_'+self.wordsCode, processitem['sourceAddress'])
                 except Exception as e:
                     self.logger.info("放入kafka失败!")
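Re-enabling self.sendkafka(processitem) also firms up the dedup contract: sadd runs in the same try block, so a URL is recorded in the pyqqnews_* set only after the send succeeded, and a failed send leaves it eligible for retry on the next pass. That send-then-mark ordering in isolation, assuming a local Redis and a stand-in send() that may raise:

    import redis

    r = redis.Redis()                                    # connection details assumed

    def send(item):                                      # stand-in for self.sendkafka
        pass                                             # raises on broker errors in real use

    def process(words_code, item):
        key = 'pyqqnews_' + words_code
        if r.sismember(key, item['sourceAddress']):      # already delivered once
            return
        try:
            send(item)                                   # deliver first ...
            r.sadd(key, item['sourceAddress'])           # ... mark as seen only on success
        except Exception:
            pass                                         # unmarked, so it is retried next run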
@@ -412,8 +414,8 @@ class QQnewsSpider(object):
             'sec-ch-ua-mobile':'?0',
             'sec-ch-ua-platform':'"Windows"',
         }
-        proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
-        res=requests.get(url,headers=headers,verify=False,timeout=10)
+        proxy = self.baseCore.get_proxy()
+        res=requests.get(url,headers=headers,proxies=proxy,verify=False,timeout=10)
         res.encoding='utf-8'
         text=res.text
         return text
@@ -421,7 +423,7 @@ class QQnewsSpider(object):
     def extractorMsg(self,url,title):
         content=''
         contentWithTag=''
-        lang=''
+        lang='cn'
         lang=self.detect_language(title)
         sm=SmartExtractor(lang)
         try:
@@ -521,10 +523,10 @@ class QQnewsSpider(object):
         if content!='':
             processitem={
                 "sid":self.sid,
-                "source":"5",
+                "source":"22",  #腾讯新闻
                 "title":bdetail['title'],
                 "content":bdetail['content'],
-                "contentWithtag":bdetail['contentHtml'],
+                "contentWithTag":bdetail['contentHtml'],
                 "origin":bdetail['source'],
                 "publishDate":bdetail['publishtime'],
                 "sourceAddress":bdetail['detailurl'],
......
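Both network paths in this spider (the POST in reqPostMsg and the GET above) now take their proxy from self.baseCore.get_proxy() rather than a hardcoded 127.0.0.1:1080. baseCore is not part of this diff, but requests expects the mapping shape shown below; a sketch with placeholder pool entries:

    import random
    import requests

    PROXY_POOL = ['1.2.3.4:8080', '5.6.7.8:8080']        # placeholder addresses

    def get_proxy():
        # returns the requests-style mapping baseCore.get_proxy() presumably returns
        p = random.choice(PROXY_POOL)
        return {'http': f'http://{p}', 'https': f'http://{p}'}

    try:
        res = requests.get('https://i.news.qq.com/gw/pc_search/result',
                           proxies=get_proxy(), verify=False, timeout=10)
        print(res.status_code)
    except requests.RequestException as e:
        print('request via proxy failed:', e)            # expected with placeholder proxies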
@@ -170,7 +170,7 @@ class QQnewsTaskJob(object):
             kwmsg={
                 'kw':kk,
                 'wordsCode':'qqnews',
-                'sid':'102003'
+                'sid':'1706193555675926530'
             }
             kwList.append(kwmsg)
         return kwList
@@ -182,7 +182,7 @@ class QQnewsTaskJob(object):
         try:
             jrttnewsSpider.get_page_html()
         except Exception as e:
-            logger.info('今日头条搜索异常'+searchkw)
+            logger.info('腾讯新闻搜索异常'+searchkw)
         if jrttnewsSpider.detailList.qsize() != 0:
             try:
@@ -218,7 +218,7 @@ if __name__ == '__main__':
             continue
         if kwList:
             # 创建一个线程池,指定线程数量为4
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                 # 提交任务给线程池,每个任务处理一个数据
                 results = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
                 # 获取任务的执行结果
......
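Raising max_workers from 1 to 3 (here and in the Souhu and Baidu jobs below) means runSpider now runs concurrently, so anything it shares (driver instances, DB cursors, the Redis client) needs to be thread-safe or created per task. Also note the spider variable in this job is still named jrttnewsSpider, a leftover from the copied 今日头条 job. The submit/collect pattern in isolation, with a trivial stand-in worker:

    import concurrent.futures

    def run_spider(kwmsg):                   # stand-in for qqnewsTaskJob.runSpider
        return kwmsg['kw']

    kw_list = [{'kw': k} for k in ('kw1', 'kw2', 'kw3', 'kw4')]

    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(run_spider, data) for data in kw_list]
        for f in concurrent.futures.as_completed(futures):
            try:
                print(f.result())            # result() re-raises worker exceptions
            except Exception as e:
                print('task failed:', e)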
@@ -140,8 +140,8 @@ class SougouSpider(object):
     def xpath_paser(self,html):
         lists=[]
-        itemTag=html.xpath('//div[@class="vrwrap"]')
-        for itemTag in itemTag:
+        itemTags=html.xpath('//div[@class="vrwrap"]')
+        for itemTag in itemTags:
             try:
                 title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
             except Exception as e:
@@ -512,7 +512,7 @@ class SougouSpider(object):
                 "source":"5",
                 "title":bdetail['title'],
                 "content":bdetail['content'],
-                "contentWithtag":bdetail['contentHtml'],
+                "contentWithTag":bdetail['contentHtml'],
                 "origin":bdetail['source'],
                 "publishDate":bdetail['publishtime'],
                 "sourceAddress":bdetail['detailurl'],
......
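The old for itemTag in itemTag: does execute (Python evaluates the iterable once before the loop rebinds the name), but the element shadowing the list reads like a bug; renaming the list to itemTags is a pure clarity fix, applied again in BaiduSpider below. The corrected shape in isolation:

    from lxml import html

    doc = html.fromstring(
        '<div class="vrwrap"><h3 class="vr-title"><a>标题一</a></h3></div>'
        '<div class="vrwrap"><h3 class="vr-title"><a>标题二</a></h3></div>'
    )
    itemTags = doc.xpath('//div[@class="vrwrap"]')   # the result list keeps its own name
    for itemTag in itemTags:                         # each element gets another
        print(itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0])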
@@ -286,7 +286,7 @@ class SouhunewsSpider(object):
     # 获取每一页数据, 开趴.
     def get_page_html(self):
         #设置采集列表页面和页数
-        totalnum=5
+        totalnum=6
         keyword=self.searchkw
         # keyword='浙江国有资本运营公司'
         for pagenum in range(0,totalnum):
@@ -333,7 +333,7 @@ class SouhunewsSpider(object):
                 bdetail=self.getDetailmsg(detailmsg)
                 processitem=self.getProcessitem(bdetail)
                 try:
-                    # self.sendkafka(processitem)
+                    self.sendkafka(processitem)
                     self.r.sadd('pysouhunews_'+self.wordsCode, processitem['sourceAddress'])
                 except Exception as e:
                     self.logger.info("放入kafka失败!")
@@ -528,10 +528,10 @@ class SouhunewsSpider(object):
         if content!='':
             processitem={
                 "sid":self.sid,
-                "source":"3",
+                "source":"23",  #搜狐新闻
                 "title":bdetail['title'],
                 "content":bdetail['content'],
-                "contentWithtag":bdetail['contentHtml'],
+                "contentWithTag":bdetail['contentHtml'],
                 "origin":bdetail['source'],
                 "publishDate":bdetail['publishtime'],
                 "sourceAddress":bdetail['detailurl'],
......
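The same two fixes recur across the QQ, Souhu, Sougou, and Baidu spiders: the key typo contentWithtag → contentWithTag (a misspelled key silently drops the tagged HTML for any consumer reading contentWithTag), and per-site source codes ("22" for 腾讯新闻, "23" for 搜狐新闻). One way to keep the schema from drifting per spider again is a single builder; a sketch, with a hypothetical helper name:

    def build_processitem(sid, source_code, bdetail):
        # one place owns the key spelling, so casing cannot drift per spider
        return {
            'sid': sid,
            'source': source_code,               # '22' 腾讯新闻, '23' 搜狐新闻, ...
            'title': bdetail['title'],
            'content': bdetail['content'],
            'contentWithTag': bdetail['contentHtml'],
            'origin': bdetail['source'],
            'publishDate': bdetail['publishtime'],
            'sourceAddress': bdetail['detailurl'],
        }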
@@ -170,7 +170,7 @@ class SouhunewsTaskJob(object):
             kwmsg={
                 'kw':kk,
                 'wordsCode':'souhu',
-                'sid':'102002'
+                'sid':'1706193555675926530'
             }
             kwList.append(kwmsg)
         return kwList
@@ -182,7 +182,7 @@ class SouhunewsTaskJob(object):
         try:
             jrttnewsSpider.get_page_html()
         except Exception as e:
-            logger.info('今日头条搜索异常'+searchkw)
+            logger.info('搜狐新闻搜索异常'+searchkw)
         if jrttnewsSpider.detailList.qsize() != 0:
             try:
@@ -218,7 +218,7 @@ if __name__ == '__main__':
             continue
         if kwList:
             # 创建一个线程池,指定线程数量为4
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                 # 提交任务给线程池,每个任务处理一个数据
                 results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
                 # 获取任务的执行结果
......
#coding=utf-8
@@ -62,7 +62,7 @@ class BaiduSpider(object):
         # proxy = "127.0.0.1:8080"  # 代理地址和端口
         # chrome_options.add_argument('--proxy-server=http://' + proxy)
         self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
-    #将列表数据插入到表中 baidu_search_result
+    #将列表数据插入到表中 meta_search_result
     def itemInsertToTable(self,items):
         try:
             itemdata=[]
@@ -72,7 +72,7 @@ class BaiduSpider(object):
                 data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
                 itemdata.append(data)
-            sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
+            sql ="INSERT into meta_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
             cursorM.executemany(sql, itemdata)
             self.logger.info("数据插入数据库成功!")
             # 定义插入数据的SQL语句
@@ -115,8 +115,8 @@ class BaiduSpider(object):
     def xpath_paser(self,html):
         lists=[]
-        itemTag=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
-        for itemTag in itemTag:
+        itemTags=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
+        for itemTag in itemTags:
             try:
                 title=itemTag.xpath('.//h3[@class="news-title_1YtI1 "]/a/text()')[0]
             except Exception as e:
@@ -487,7 +487,7 @@ class BaiduSpider(object):
                 "source":"3",
                 "title":bdetail['title'],
                 "content":bdetail['content'],
-                "contentWithtag":bdetail['contentHtml'],
+                "contentWithTag":bdetail['contentHtml'],
                 "origin":bdetail['source'],
                 "publishDate":bdetail['publishtime'],
                 "sourceAddress":bdetail['detailurl'],
......
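The Baidu spider now writes into the shared meta_search_result table instead of its own baidu_search_result (the comment above itemInsertToTable was updated to match). A sketch of the batched insert with pymysql, with connection parameters and database name assumed:

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', user='root', password='***',   # assumed
                           database='spider', charset='utf8mb4')            # db name assumed
    sql = ("INSERT INTO meta_search_result "
           "(sid,wordsCode,title,detailurl,origin,publishdate,content,"
           "content_with_tag,state,keyword,create_time) "
           "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    itemdata = [('sid', 'code', 'title', 'url', 'origin', '2023-10-10',
                 'text', '<p>text</p>', '1', 'kw', '2023-10-10 00:00:00')]
    with conn.cursor() as cursorM:
        cursorM.executemany(sql, itemdata)   # one batched round trip per page of results
    conn.commit()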
# -*- coding: utf-8 -*-
@@ -200,7 +200,7 @@ if __name__ == '__main__':
             continue
         if kwList:
             # 创建一个线程池,指定线程数量为4
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                 # 提交任务给线程池,每个任务处理一个数据
                 results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                 # 获取任务的执行结果
......