Commit b6df2db3  Author: 刘伟刚

Code change commit for 中国政府采购网 (China Government Procurement website)

Parent d974abb1
......@@ -134,6 +134,7 @@ def paserList(searchmsg,social_code):
'sourceAddress': sourceAddress, # 原文链接
'summary': '',
'title': title,
'source': source,
'socialCreditCode': social_code,
'year': published[:4]
}
......@@ -249,7 +250,7 @@ def sendToKafka(detailmsg):
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': 'Tradingview',
'origin': detailmsg['source'],
'publishDate': detailmsg['publishDate'],
'sid': '1711619846545776641',
'sourceAddress': detailmsg['sourceAddress'], # 原文链接
......@@ -316,23 +317,23 @@ if __name__ == '__main__':
# url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
# searchmsg=reqmsg(url)
# print(searchmsg)
getStockFromSql()
# while True:
# try:
# tradview_ticker=r.lpop('tradview_ticker')
# if tradview_ticker:
#
# tradviewticker = tradview_ticker.decode(errors='ignore')
# log.info(f'采集资讯的企业{tradviewticker}')
# ticker_param=str(tradviewticker).split('_')[0]
# social_code=str(tradviewticker).split('_')[1]
# url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
# log.info(f'采集资讯企业列表地址{tradview_ticker}')
# searchmsg=reqmsg(url)
# paserList(searchmsg,social_code)
# except Exception as e:
# log.info(f'redis中获取企业信息为空{e}')
# break
# getStockFromSql()
while True:
try:
tradview_ticker=r.lpop('tradview_ticker')
if tradview_ticker:
tradviewticker = tradview_ticker.decode(errors='ignore')
log.info(f'采集资讯的企业{tradviewticker}')
ticker_param=str(tradviewticker).split('_')[0]
social_code=str(tradviewticker).split('_')[1]
url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
log.info(f'采集资讯企业列表地址{tradview_ticker}')
searchmsg=reqmsg(url)
paserList(searchmsg,social_code)
except Exception as e:
log.info(f'redis中获取企业信息为空{e}')
break
......
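The loop re-enabled above pops entries shaped like TICKER_SOCIALCODE from the tradview_ticker Redis list and splits them on "_" to build the TradingView headlines URL. A minimal standalone sketch of that consumer, with a guard for malformed entries, might look like the following; the Redis connection details and the parse_ticker_entry helper are assumptions for illustration, not part of the commit:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # assumed connection; the project reads this from config

def parse_ticker_entry(raw: bytes):
    """Hypothetical helper: decode 'TICKER_SOCIALCODE' and split once on '_'."""
    text = raw.decode(errors='ignore')
    if '_' not in text:
        return None, None
    ticker_param, social_code = text.split('_', 1)
    return ticker_param, social_code

while True:
    raw = r.lpop('tradview_ticker')
    if raw is None:                    # queue drained; stop, as the original except-branch does
        break
    ticker_param, social_code = parse_ticker_entry(raw)
    if not ticker_param:
        continue
    url = (f'https://news-headlines.tradingview.com/v2/headlines'
           f'?client=web&lang=zh-Hans&symbol={ticker_param}')
    # searchmsg = reqmsg(url); paserList(searchmsg, social_code)  # as in the commit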
......@@ -37,7 +37,7 @@ class JrttnewsSpider(object):
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.url = 'https://www.toutiao.com/'
self.url = f'https://so.toutiao.com/search?dvpf=pc&source=input&keyword={searchkw}#'
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
......@@ -49,6 +49,7 @@ class JrttnewsSpider(object):
self.searchkw = searchkw
self.wordsCode = wordsCode
self.sid = sid
self.driver=self.createDriver();
#将列表数据插入到表中 meta_search_result
def itemInsertToTable(self,items):
......@@ -95,53 +96,36 @@ class JrttnewsSpider(object):
html = etree.HTML(response)
lists=self.xpath_paser(html)
try:
flag = html.xpath('//a[@id="sogou_next"]')[0]
flag = html.xpath('//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]')[0]
except Exception as e:
flag=''
lists=[]
return flag, lists
def getRealUrl(self,url):
try:
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
# url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
url=f"https://www.sogou.com{url}"
res = requests.get(url,headers=header)
text=res.text
# 定义正则表达式
pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
# 在给定的字符串中寻找匹配的URL
urls = re.findall(pattern, text)
uri=''
if len(urls)>1:
uri=urls[0]
except Exception as e:
self.logger.info("链接转换异常!")
return uri
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTag:
itemTags=html.xpath('//div[@class="cs-view cs-view-block cs-card-content"]')
for itemTag in itemTags:
html_str = etree.tostring(itemTag)
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
title=itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/text()')[0]
except Exception as e:
title=''
if title=='':
continue
try:
detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
detailUrl=self.getRealUrl(detailUrl)
detailUrl=itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/@href')[0]
id=self.get_reitemid(detailUrl)
detailUrl=f'https://www.toutiao.com/article/{id}/?&source=m_redirect'
except Exception as e:
detailUrl=''
try:
sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
sourceTag=itemTag.xpath('.//span[@class="d-flex align-items-center text-ellipsis margin-right-4"]//text()')[0]
except Exception as e:
sourceTag=''
try:
publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
publishTag=itemTag.xpath('.//div[@class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]/span[@class="text-ellipsis"]/text()')[0]
publishTag=str(publishTag)
publishtime=self.paserTime(publishTag)
publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
......@@ -285,7 +269,8 @@ class JrttnewsSpider(object):
def get_reitemid(self,tmpurl):
try:
pattern='item_id=([\d]{1,})&search_id'
tmpurl=unquote(tmpurl)
pattern='com/a([\d]{1,}?)/'
match = re.search(pattern, tmpurl)
# 判断是否匹配成功
if match:
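The rewritten get_reitemid above first URL-decodes the search-result link and then pulls the numeric article id out of the ".com/a<digits>/" segment. A small illustration of that behaviour; the sample href below is made up for demonstration and is not taken from the commit:

import re
from urllib.parse import unquote

def get_reitemid(tmpurl):
    # decode %2F / %3A escapes so the toutiao.com path is visible to the regex
    tmpurl = unquote(tmpurl)
    match = re.search(r'com/a(\d{1,}?)/', tmpurl)
    return match.group(1) if match else ''

# hypothetical search-result href for demonstration only
sample = '/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fa7281234567890123456%2F&aid=4916'
item_id = get_reitemid(sample)
print(f'https://www.toutiao.com/article/{item_id}/?&source=m_redirect')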
......@@ -307,53 +292,74 @@ class JrttnewsSpider(object):
# 获取每一页数据, 开趴.
def get_page_html(self):
#设置采集列表页面和页数
totalnum=1
keyword=self.searchkw
# keyword='浙江国有资本运营公司'
for pagenum in range(0,totalnum):
self.logger.info(f"解析关键词{keyword}第{pagenum}页")
offset=pagenum*10
tmpurl='https://search5-search-hl.toutiaoapi.com/search/?source=search_subtab_switch&is_ttwebview=0&pass_through=default&action_type=input_keyword_search&is_incognito=0&api_param={"sug_session_id":"552235978851697767261639"}&inner_resolution=1080*1920&navbar_height=36&multi_container=1&gs_height=44&client_extra_params={"playparam":"codec_type:7,cdn_type:1,resolution:1080*1920,ttm_version:924000,enable_dash:0,unwatermark:1,v1_fitter_info:1,tt_net_energy:4,is_order_flow:-1,tt_device_score:7.1,tt_enable_adaptive:2"}&common_hashtags=default&_rticket=1697767236897&loadId=1&from_search_id=202310201001051EB17B3CBA66215D937D&isTTWebViewHeifSupport=0&has_gs=0&multi_container_type=1&forum=3&tt_font_size=m' \
'&search_start_time=1697767265219&pd=information&cur_tab_title=search_tab&offset_height=108&openlive_plugin_status=0&fetch_by_ttnet=1&is_darkmode=0&from_pd=synthesis&plugin_enable=3&search_position=search_bar' \
'&keyword=[keyword]&session_id=f1d0e9e4-cb15-4b60-b894-de729a76e6a9&switch_tab_type=click&appTheme=light&search_json={"__logExtra__":{"if_sar_recall":"0","from_category_name":"__all__","from_enter_from":"click_headline","from_channel_id":"0"}}' \
'&from=search_tab&is_older=0&tt_daymode=1&search_sug=1&&runtime_tc=tt_search&browser_runtime_version=1720&format=json' \
'&count=10&offset=[offset]&search_id=20231020120041F06F03C42D66A4AC5EC2&start_index=30&index_resource=&filter_vendor=&filter_period=&order_type=' \
'&min_time=&max_time=&traffic_source='
url=tmpurl.replace('[keyword]',keyword).replace('[offset]',str(offset))
lhtml=self.reqHtml(url)
qqerhtml=json.loads(lhtml)
qqerhtml=qqerhtml['dom']
# self.logger.info(f'列表页面信息:{lhtml}')
soup = BeautifulSoup(qqerhtml, 'html.parser')
listcontent=soup.select('div[style="opacity: 1;"]')
for litag in listcontent:
self.driver.get(self.url)
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "s-result-list")))
# try:
# self.driver.find_element(By.XPATH,'//div[@class="input_box_n6Efbw"]/input').send_keys(self.searchkw)
# except Exception as e:
# print(e)
# self.driver.find_element(By.CLASS_NAME, 'search_33vwaQ').click()
# wait = WebDriverWait(self.driver, 20)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "s-result-list")))
# time.sleep(2)
self.driver.find_element('xpath', '//div[@class="cs-view pad-bottom-6 cs-view-flex align-items-center flex-row nav_7Dk46Y"]/div[1]/a[text()="资讯"]').click()
time.sleep(2)
self.logger.info("开始抓取首页...")
try:
flag, lists = self.parse_page()
if len(lists)<1:
return
except Exception as e:
time.sleep(5)
return
if len(lists)==0:
time.sleep(5)
for detail in lists:
durl=detail['detailUrl']
is_member = self.r.sismember('pyjrttnews_'+self.wordsCode, durl)
if is_member:
continue
self.detailList.put(detail)
response = self.driver.page_source
html = etree.HTML(response)
hasnext = html.xpath('//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]//text()')[0]
hasnext = hasnext.strip()
timeFlag=False
while '下一页' in hasnext:
try:
if self.page_num==5:
break
self.page_num = self.page_num + 1
self.logger.info("开始抓取第%s页..." % self.page_num)
try:
lidoc=pq(str(litag))
ahref=lidoc.find('a[class="l-view block l-text line-clamp-2 color-darker font-medium l-header h3"]').attr('href')
id=self.get_reitemid(ahref)
durl=f'https://www.toutiao.com/article/{id}/?&source=m_redirect'
title=lidoc.find('a[class="l-view block l-text line-clamp-2 color-darker font-medium l-header h3"]').text().replace('\n','')
source=lidoc.find('div[class="l-source-text t3 l-source-min-width line-clamp-1 flex-shrink"]').text().replace('\n','')
publishdate=lidoc.find('div[class="l-view block l-text-split flex-shrink-0 ml-8 color-default line-clamp-1 t3"]>span:last-child').text().replace('\n','')
publishdate=self.paserTime(publishdate)
if isinstance(publishdate, str):
pubdate=publishdate
else:
pubdate=publishdate.strftime("%Y-%m-%d %H:%M:%S")
self.driver.find_element(By.XPATH, '//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]').click()
except Exception as e:
time.sleep(5)
continue
time.sleep(5)
flag, lists = self.parse_page()
if len(lists)<1:
break
for detail in lists:
publishTag=detail['publishTag']
is_member = self.r.sismember('pyjrttnews_'+self.wordsCode, durl)
if is_member:
self.logger.info(f"搜索列表的链接已经存在!")
continue
detailmsg={
'title':title,
'detailUrl':durl,
'sourceTag':source,
'publishTag':pubdate
}
self.detailList.put(detailmsg)
self.detailList.put(detail)
if timeFlag:
break
try:
response = self.driver.page_source
html = etree.HTML(response)
hasnext = html.xpath('//div[@id="page"]//a[last()]//text()')[0]
hasnext = hasnext.strip()
except Exception as e:
self.logger.info(f"搜索列表页异常{e}")
continue
hasnext=''
except Exception as e:
time.sleep(5)
break
self.logger.info("抓取完毕")
# 获取详情页
def get_detail_html(self):
......@@ -370,7 +376,7 @@ class JrttnewsSpider(object):
self.logger.info(f"解析详情页标题{title},获取的内容长度:{len(bdetail['content'])}")
processitem=self.getProcessitem(bdetail)
try:
# self.sendkafka(processitem)
self.sendkafka(processitem)
self.r.sadd('pyjrttnews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
......@@ -576,10 +582,10 @@ class JrttnewsSpider(object):
if content!='':
processitem={
"sid":self.sid,
"source":"3",
"source":"21",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
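With sendkafka(processitem) now enabled and the payload key corrected to contentWithTag, each detail page ends up as a JSON message on Kafka. A minimal sketch of that send using kafka-python; the broker address and topic name are placeholders (the real values live in the project's config), so treat this only as the call shape:

import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='127.0.0.1:9092',                       # assumed broker address
    value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'),
)

def sendkafka(processitem, topic='crawlerInfo'):              # topic name is an assumption
    # processitem mirrors the dict built in getProcessitem, including contentWithTag
    future = producer.send(topic, value=processitem)
    future.get(timeout=10)                                    # raises if the broker rejected the message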
......@@ -170,7 +170,7 @@ class JrttnewsTaskJob(object):
kwmsg={
'kw':kk,
'wordsCode':'jrtt',
'sid':'1020'
'sid':'1706193555675926530'
}
kwList.append(kwmsg)
return kwList
......
......@@ -255,8 +255,9 @@ class QQnewsSpider(object):
'search_count_limit': 10,
'appver': '15.5_qqnews_7.1.80'
}
proxy =self.baseCore.get_proxy()
try:
res=requests.post(url,headers=headers2,data=data,verify=False,timeout=10)
res=requests.post(url,headers=headers2,proxies=proxy,data=data,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
except Exception as e:
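The hard-coded local proxy is replaced here by baseCore.get_proxy(); whatever that helper returns is passed straight to requests as the proxies argument, so it is assumed to be a scheme-to-URL dict like the literal it replaces elsewhere in this commit. A hedged sketch of the call shape, with placeholder header and payload values:

import requests

# assumed return shape of baseCore.get_proxy(); the old code used the same structure inline
proxy = {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}

res = requests.post(
    'https://i.news.qq.com/gw/pc_search/result',
    headers={'user-agent': 'Mozilla/5.0'},    # trimmed; the spider sends a fuller header set
    data={'query': 'example keyword'},        # placeholder payload, not the real search form
    proxies=proxy,
    verify=False,
    timeout=10,
)
res.encoding = 'utf-8'
text = res.text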
......@@ -283,31 +284,32 @@ class QQnewsSpider(object):
def get_page_html(self):
#设置采集列表页面和页数
url='https://i.news.qq.com/gw/pc_search/result'
totalnum=5
totalnum=6
keyword=self.searchkw
# keyword='浙江国有资本运营公司'
for pagenum in range(0,totalnum):
qerhtml=self.reqPostMsg(url,pagenum,keyword)
jsonmsg=json.loads(qerhtml)
secList=jsonmsg['secList']
for sec in secList:
try:
title=sec['newsList'][0]['title']
durl=sec['newsList'][0]['url']
pubtime=sec['newsList'][0]['time']
source=sec['newsList'][0]['source']
is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
if is_member:
if secList:
for sec in secList:
try:
title=sec['newsList'][0]['title']
durl=sec['newsList'][0]['url']
pubtime=sec['newsList'][0]['time']
source=sec['newsList'][0]['source']
is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
if is_member:
continue
detailmsg={
'title':title,
'detailUrl':durl,
'sourceTag':source,
'publishTag':pubtime
}
self.detailList.put(detailmsg)
except Exception as e :
continue
detailmsg={
'title':title,
'detailUrl':durl,
'sourceTag':source,
'publishTag':pubtime
}
self.detailList.put(detailmsg)
except Exception as e :
continue
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
......@@ -322,7 +324,7 @@ class QQnewsSpider(object):
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
# self.sendkafka(processitem)
self.sendkafka(processitem)
self.r.sadd('pyqqnews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
......@@ -412,8 +414,8 @@ class QQnewsSpider(object):
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
res=requests.get(url,headers=headers,verify=False,timeout=10)
proxy = self.baseCore.get_proxy()
res=requests.get(url,headers=headers,proxies=proxy,verify=False,timeout=10)
res.encoding='utf-8'
text=res.text
return text
......@@ -421,7 +423,7 @@ class QQnewsSpider(object):
def extractorMsg(self,url,title):
content=''
contentWithTag=''
lang=''
lang='cn'
lang=self.detect_language(title)
sm=SmartExtractor(lang)
try:
......@@ -521,10 +523,10 @@ class QQnewsSpider(object):
if content!='':
processitem={
"sid":self.sid,
"source":"5",
"source":"22", #腾讯新闻
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
......@@ -170,7 +170,7 @@ class QQnewsTaskJob(object):
kwmsg={
'kw':kk,
'wordsCode':'qqnews',
'sid':'102003'
'sid':'1706193555675926530'
}
kwList.append(kwmsg)
return kwList
......@@ -182,7 +182,7 @@ class QQnewsTaskJob(object):
try:
jrttnewsSpider.get_page_html()
except Exception as e:
logger.info('今日头条搜索异常'+searchkw)
logger.info('腾讯新闻搜索异常'+searchkw)
if jrttnewsSpider.detailList.qsize() != 0:
try:
......@@ -218,7 +218,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
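Raising max_workers to 3 lets three keyword spiders run concurrently; the result-collection code is cut off by the diff, but the usual concurrent.futures pattern looks like the sketch below. The runSpider call, kwList, and logger come from the project; the as_completed handling is a generic assumption, not the committed code:

import concurrent.futures

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # one task per keyword message, exactly as the __main__ block submits them
    futures = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()          # re-raises any exception from the worker thread
        except Exception as e:
            logger.info(f'runSpider task failed: {e}')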
......@@ -140,8 +140,8 @@ class SougouSpider(object):
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTag:
itemTags=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTags:
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
except Exception as e:
......@@ -512,7 +512,7 @@ class SougouSpider(object):
"source":"5",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
......@@ -286,7 +286,7 @@ class SouhunewsSpider(object):
# 获取每一页数据, 开趴.
def get_page_html(self):
#设置采集列表页面和页数
totalnum=5
totalnum=6
keyword=self.searchkw
# keyword='浙江国有资本运营公司'
for pagenum in range(0,totalnum):
......@@ -333,7 +333,7 @@ class SouhunewsSpider(object):
bdetail=self.getDetailmsg(detailmsg)
processitem=self.getProcessitem(bdetail)
try:
# self.sendkafka(processitem)
self.sendkafka(processitem)
self.r.sadd('pysouhunews_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
......@@ -528,10 +528,10 @@ class SouhunewsSpider(object):
if content!='':
processitem={
"sid":self.sid,
"source":"3",
"source":"23", #搜狐新闻
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
......@@ -170,7 +170,7 @@ class SouhunewsTaskJob(object):
kwmsg={
'kw':kk,
'wordsCode':'souhu',
'sid':'102002'
'sid':'1706193555675926530'
}
kwList.append(kwmsg)
return kwList
......@@ -182,7 +182,7 @@ class SouhunewsTaskJob(object):
try:
jrttnewsSpider.get_page_html()
except Exception as e:
logger.info('今日头条搜索异常'+searchkw)
logger.info('搜狐新闻搜索异常'+searchkw)
if jrttnewsSpider.detailList.qsize() != 0:
try:
......@@ -218,7 +218,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
#coding=utf-8
......@@ -62,7 +62,7 @@ class BaiduSpider(object):
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
#将列表数据插入到表中 baidu_search_result
#将列表数据插入到表中 meta_search_result
def itemInsertToTable(self,items):
try:
itemdata=[]
......@@ -72,7 +72,7 @@ class BaiduSpider(object):
data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
itemdata.append(data)
sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
sql ="INSERT into meta_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
self.logger.info("数据插入数据库成功!")
# 定义插入数据的SQL语句
......@@ -115,8 +115,8 @@ class BaiduSpider(object):
def xpath_paser(self,html):
lists=[]
itemTag=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
for itemTag in itemTag:
itemTags=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
for itemTag in itemTags:
try:
title=itemTag.xpath('.//h3[@class="news-title_1YtI1 "]/a/text()')[0]
except Exception as e:
......@@ -487,7 +487,7 @@ class BaiduSpider(object):
"source":"3",
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"contentWithTag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
......
# -*- coding: utf-8 -*-
......@@ -200,7 +200,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......