Commit b6df2db3    Author: 刘伟刚

中国政府采购网 (China Government Procurement Network) code changes

Parent d974abb1
@@ -134,6 +134,7 @@ def paserList(searchmsg,social_code):
        'sourceAddress': sourceAddress,  # 原文链接
        'summary': '',
        'title': title,
+       'source': source,
        'socialCreditCode': social_code,
        'year': published[:4]
    }
@@ -249,7 +250,7 @@ def sendToKafka(detailmsg):
        'id': '',
        'keyWords': '',
        'lang': 'zh',
-       'origin': 'Tradingview',
+       'origin': detailmsg['source'],
        'publishDate': detailmsg['publishDate'],
        'sid': '1711619846545776641',
        'sourceAddress': detailmsg['sourceAddress'],  # 原文链接
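Read together, the two hunks above make the Kafka payload's origin come from the per-item source field that paserList now records, instead of the hardcoded 'Tradingview'. A minimal sketch of the resulting data flow; every field value below is a made-up placeholder, only the keys mirror the dicts in the diff:

# Illustrative only: placeholder values, keys as in the diff.
detailmsg = {
    'sourceAddress': 'https://example.com/news/123',   # 原文链接 (placeholder URL)
    'summary': '',
    'title': 'Example headline',
    'source': 'Example Wire',                          # newly carried per-item source
    'socialCreditCode': '91310000000000000X',          # placeholder
    'publishDate': '2023-10-20 12:00:00',
    'year': '2023',
}

kafka_msg = {
    'id': '',
    'keyWords': '',
    'lang': 'zh',
    'origin': detailmsg['source'],                     # was the hardcoded 'Tradingview'
    'publishDate': detailmsg['publishDate'],
    'sid': '1711619846545776641',
    'sourceAddress': detailmsg['sourceAddress'],
}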
@@ -316,23 +317,23 @@ if __name__ == '__main__':
    # url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
    # searchmsg=reqmsg(url)
    # print(searchmsg)
-   getStockFromSql()
-   # while True:
-   #     try:
-   #         tradview_ticker=r.lpop('tradview_ticker')
-   #         if tradview_ticker:
-   #
-   #             tradviewticker = tradview_ticker.decode(errors='ignore')
-   #             log.info(f'采集资讯的企业{tradviewticker}')
-   #             ticker_param=str(tradviewticker).split('_')[0]
-   #             social_code=str(tradviewticker).split('_')[1]
-   #             url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
-   #             log.info(f'采集资讯企业列表地址{tradview_ticker}')
-   #             searchmsg=reqmsg(url)
-   #             paserList(searchmsg,social_code)
-   #     except Exception as e:
-   #         log.info(f'redis中获取企业信息为空{e}')
-   #         break
+   # getStockFromSql()
+   while True:
+       try:
+           tradview_ticker=r.lpop('tradview_ticker')
+           if tradview_ticker:
+               tradviewticker = tradview_ticker.decode(errors='ignore')
+               log.info(f'采集资讯的企业{tradviewticker}')
+               ticker_param=str(tradviewticker).split('_')[0]
+               social_code=str(tradviewticker).split('_')[1]
+               url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
+               log.info(f'采集资讯企业列表地址{tradview_ticker}')
+               searchmsg=reqmsg(url)
+               paserList(searchmsg,social_code)
+       except Exception as e:
+           log.info(f'redis中获取企业信息为空{e}')
+           break
......
@@ -37,7 +37,7 @@ class JrttnewsSpider(object):
        self.config.read('config.ini')
        baseCore=BaseCore()
        self.logger=baseCore.getLogger()
-       self.url = 'https://www.toutiao.com/'
+       self.url = f'https://so.toutiao.com/search?dvpf=pc&source=input&keyword={searchkw}#'
        self.r = redis.Redis(host=self.config.get('redis', 'host'),
                             port=self.config.get('redis', 'port'),
                             password=self.config.get('redis', 'pass'), db=0)
@@ -49,6 +49,7 @@ class JrttnewsSpider(object):
        self.searchkw = searchkw
        self.wordsCode = wordsCode
        self.sid = sid
+       self.driver=self.createDriver();
    #将列表数据插入到表中 meta_search_result
    def itemInsertToTable(self,items):
@@ -95,53 +96,36 @@ class JrttnewsSpider(object):
        html = etree.HTML(response)
        lists=self.xpath_paser(html)
        try:
-           flag = html.xpath('//a[@id="sogou_next"]')[0]
+           flag = html.xpath('//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]')[0]
        except Exception as e:
            flag=''
            lists=[]
        return flag, lists
-   def getRealUrl(self,url):
-       try:
-           header={
-               "accept":"*/*",
-               "connection":"Keep-Alive",
-               "user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
-           }
-           # url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
-           url=f"https://www.sogou.com{url}"
-           res = requests.get(url,headers=header)
-           text=res.text
-           # 定义正则表达式
-           pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
-           # 在给定的字符串中寻找匹配的URL
-           urls = re.findall(pattern, text)
-           uri=''
-           if len(urls)>1:
-               uri=urls[0]
-       except Exception as e:
-           self.logger.info("链接转换异常!")
-       return uri
    def xpath_paser(self,html):
        lists=[]
-       itemTag=html.xpath('//div[@class="vrwrap"]')
-       for itemTag in itemTag:
+       itemTags=html.xpath('//div[@class="cs-view cs-view-block cs-card-content"]')
+       for itemTag in itemTags:
+           html_str = etree.tostring(itemTag)
            try:
-               title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
+               title=itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/text()')[0]
            except Exception as e:
                title=''
+           if title=='':
+               continue
            try:
-               detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
-               detailUrl=self.getRealUrl(detailUrl)
+               detailUrl=itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/@href')[0]
+               id=self.get_reitemid(detailUrl)
+               detailUrl=f'https://www.toutiao.com/article/{id}/?&source=m_redirect'
            except Exception as e:
                detailUrl=''
            try:
-               sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
+               sourceTag=itemTag.xpath('.//span[@class="d-flex align-items-center text-ellipsis margin-right-4"]//text()')[0]
            except Exception as e:
                sourceTag=''
            try:
-               publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
+               publishTag=itemTag.xpath('.//div[@class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]/span[@class="text-ellipsis"]/text()')[0]
                publishTag=str(publishTag)
                publishtime=self.paserTime(publishTag)
                publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
@@ -285,7 +269,8 @@ class JrttnewsSpider(object):
    def get_reitemid(self,tmpurl):
        try:
-           pattern='item_id=([\d]{1,})&search_id'
+           tmpurl=unquote(tmpurl)
+           pattern='com/a([\d]{1,}?)/'
            match = re.search(pattern, tmpurl)
            # 判断是否匹配成功
            if match:
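For reference, a standalone sketch of what the updated get_reitemid logic is expected to do. The regex and the article-URL template are taken from this diff; the helper name and the example href are hypothetical (real so.toutiao.com result links are URL-encoded redirect URLs):

import re
from urllib.parse import unquote

def extract_article_id(tmpurl):
    # decode %2F etc. before matching, as the updated get_reitemid does
    tmpurl = unquote(tmpurl)
    match = re.search(r'com/a([\d]{1,}?)/', tmpurl)
    return match.group(1) if match else ''

# hypothetical search-result href
href = '/search/jump?url=https%3A%2F%2Fwww.toutiao.com%2Fa7281234567890123456%2F'
article_id = extract_article_id(href)
detailUrl = f'https://www.toutiao.com/article/{article_id}/?&source=m_redirect'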
@@ -307,53 +292,74 @@ class JrttnewsSpider(object):
    # 获取每一页数据, 开趴.
    def get_page_html(self):
        #设置采集列表页面和页数
-       totalnum=1
-       keyword=self.searchkw
-       # keyword='浙江国有资本运营公司'
-       for pagenum in range(0,totalnum):
-           self.logger.info(f"解析关键词{keyword}第{pagenum}页")
-           offset=pagenum*10
-           tmpurl='https://search5-search-hl.toutiaoapi.com/search/?source=search_subtab_switch&is_ttwebview=0&pass_through=default&action_type=input_keyword_search&is_incognito=0&api_param={"sug_session_id":"552235978851697767261639"}&inner_resolution=1080*1920&navbar_height=36&multi_container=1&gs_height=44&client_extra_params={"playparam":"codec_type:7,cdn_type:1,resolution:1080*1920,ttm_version:924000,enable_dash:0,unwatermark:1,v1_fitter_info:1,tt_net_energy:4,is_order_flow:-1,tt_device_score:7.1,tt_enable_adaptive:2"}&common_hashtags=default&_rticket=1697767236897&loadId=1&from_search_id=202310201001051EB17B3CBA66215D937D&isTTWebViewHeifSupport=0&has_gs=0&multi_container_type=1&forum=3&tt_font_size=m' \
-               '&search_start_time=1697767265219&pd=information&cur_tab_title=search_tab&offset_height=108&openlive_plugin_status=0&fetch_by_ttnet=1&is_darkmode=0&from_pd=synthesis&plugin_enable=3&search_position=search_bar' \
-               '&keyword=[keyword]&session_id=f1d0e9e4-cb15-4b60-b894-de729a76e6a9&switch_tab_type=click&appTheme=light&search_json={"__logExtra__":{"if_sar_recall":"0","from_category_name":"__all__","from_enter_from":"click_headline","from_channel_id":"0"}}' \
-               '&from=search_tab&is_older=0&tt_daymode=1&search_sug=1&&runtime_tc=tt_search&browser_runtime_version=1720&format=json' \
-               '&count=10&offset=[offset]&search_id=20231020120041F06F03C42D66A4AC5EC2&start_index=30&index_resource=&filter_vendor=&filter_period=&order_type=' \
-               '&min_time=&max_time=&traffic_source='
-           url=tmpurl.replace('[keyword]',keyword).replace('[offset]',str(offset))
-           lhtml=self.reqHtml(url)
-           qqerhtml=json.loads(lhtml)
-           qqerhtml=qqerhtml['dom']
-           # self.logger.info(f'列表页面信息:{lhtml}')
-           soup = BeautifulSoup(qqerhtml, 'html.parser')
-           listcontent=soup.select('div[style="opacity: 1;"]')
-           for litag in listcontent:
-               try:
-                   lidoc=pq(str(litag))
-                   ahref=lidoc.find('a[class="l-view block l-text line-clamp-2 color-darker font-medium l-header h3"]').attr('href')
-                   id=self.get_reitemid(ahref)
-                   durl=f'https://www.toutiao.com/article/{id}/?&source=m_redirect'
-                   title=lidoc.find('a[class="l-view block l-text line-clamp-2 color-darker font-medium l-header h3"]').text().replace('\n','')
-                   source=lidoc.find('div[class="l-source-text t3 l-source-min-width line-clamp-1 flex-shrink"]').text().replace('\n','')
-                   publishdate=lidoc.find('div[class="l-view block l-text-split flex-shrink-0 ml-8 color-default line-clamp-1 t3"]>span:last-child').text().replace('\n','')
-                   publishdate=self.paserTime(publishdate)
-                   if isinstance(publishdate, str):
-                       pubdate=publishdate
-                   else:
-                       pubdate=publishdate.strftime("%Y-%m-%d %H:%M:%S")
-                   is_member = self.r.sismember('pyjrttnews_'+self.wordsCode, durl)
-                   if is_member:
-                       continue
-                   detailmsg={
-                       'title':title,
-                       'detailUrl':durl,
-                       'sourceTag':source,
-                       'publishTag':pubdate
-                   }
-                   self.detailList.put(detailmsg)
-               except Exception as e:
-                   self.logger.info(f"搜索列表页异常{e}")
-                   continue
+       self.driver.get(self.url)
+       wait = WebDriverWait(self.driver, 20)
+       wait.until(EC.presence_of_element_located((By.CLASS_NAME, "s-result-list")))
+       # try:
+       #     self.driver.find_element(By.XPATH,'//div[@class="input_box_n6Efbw"]/input').send_keys(self.searchkw)
+       # except Exception as e:
+       #     print(e)
+       # self.driver.find_element(By.CLASS_NAME, 'search_33vwaQ').click()
+       # wait = WebDriverWait(self.driver, 20)
+       # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "s-result-list")))
+       # time.sleep(2)
+       self.driver.find_element('xpath', '//div[@class="cs-view pad-bottom-6 cs-view-flex align-items-center flex-row nav_7Dk46Y"]/div[1]/a[text()="资讯"]').click()
+       time.sleep(2)
+       self.logger.info("开始抓取首页...")
+       try:
+           flag, lists = self.parse_page()
+           if len(lists)<1:
+               return
+       except Exception as e:
+           time.sleep(5)
+           return
+       if len(lists)==0:
+           time.sleep(5)
+       for detail in lists:
+           durl=detail['detailUrl']
+           is_member = self.r.sismember('pyjrttnews_'+self.wordsCode, durl)
+           if is_member:
+               continue
+           self.detailList.put(detail)
+       response = self.driver.page_source
+       html = etree.HTML(response)
+       hasnext = html.xpath('//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]//text()')[0]
+       hasnext = hasnext.strip()
+       timeFlag=False
+       while '下一页' in hasnext:
+           try:
+               if self.page_num==5:
+                   break
+               self.page_num = self.page_num + 1
+               self.logger.info("开始抓取第%s页..." % self.page_num)
+               try:
+                   self.driver.find_element(By.XPATH, '//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]').click()
+               except Exception as e:
+                   time.sleep(5)
+                   continue
+               time.sleep(5)
+               flag, lists = self.parse_page()
+               if len(lists)<1:
+                   break
+               for detail in lists:
+                   publishTag=detail['publishTag']
+                   is_member = self.r.sismember('pyjrttnews_'+self.wordsCode, durl)
+                   if is_member:
+                       self.logger.info(f"搜索列表的链接已经存在!")
+                       continue
+                   self.detailList.put(detail)
+               if timeFlag:
+                   break
+               try:
+                   response = self.driver.page_source
+                   html = etree.HTML(response)
+                   hasnext = html.xpath('//div[@id="page"]//a[last()]//text()')[0]
+                   hasnext = hasnext.strip()
+               except Exception as e:
+                   hasnext=''
+           except Exception as e:
+               time.sleep(5)
+               break
+       self.logger.info("抓取完毕")
    # 获取详情页
    def get_detail_html(self):
@@ -370,7 +376,7 @@ class JrttnewsSpider(object):
        self.logger.info(f"解析详情页标题{title},获取的内容长度:{len(bdetail['content'])}")
        processitem=self.getProcessitem(bdetail)
        try:
-           # self.sendkafka(processitem)
+           self.sendkafka(processitem)
            self.r.sadd('pyjrttnews_'+self.wordsCode, processitem['sourceAddress'])
        except Exception as e:
            self.logger.info("放入kafka失败!")
@@ -576,10 +582,10 @@ class JrttnewsSpider(object):
        if content!='':
            processitem={
                "sid":self.sid,
-               "source":"3",
+               "source":"21",
                "title":bdetail['title'],
                "content":bdetail['content'],
-               "contentWithtag":bdetail['contentHtml'],
+               "contentWithTag":bdetail['contentHtml'],
                "origin":bdetail['source'],
                "publishDate":bdetail['publishtime'],
                "sourceAddress":bdetail['detailurl'],
......
@@ -170,7 +170,7 @@ class JrttnewsTaskJob(object):
            kwmsg={
                'kw':kk,
                'wordsCode':'jrtt',
-               'sid':'1020'
+               'sid':'1706193555675926530'
            }
            kwList.append(kwmsg)
        return kwList
......
@@ -255,8 +255,9 @@ class QQnewsSpider(object):
            'search_count_limit': 10,
            'appver': '15.5_qqnews_7.1.80'
        }
+       proxy =self.baseCore.get_proxy()
        try:
-           res=requests.post(url,headers=headers2,data=data,verify=False,timeout=10)
+           res=requests.post(url,headers=headers2,proxies=proxy,data=data,verify=False,timeout=10)
            res.encoding='utf-8'
            text=res.text
        except Exception as e:
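The diff does not show what self.baseCore.get_proxy() returns; the sketch below assumes a requests-style proxies mapping (the 127.0.0.1:1080 endpoint mirrors the hard-coded proxy this commit removes further down), with the headers and form payload trimmed to placeholders:

import requests

# assumed shape of self.baseCore.get_proxy(); the endpoint is a placeholder
proxy = {
    'http': 'http://127.0.0.1:1080',
    'https': 'http://127.0.0.1:1080',
}

res = requests.post(
    'https://i.news.qq.com/gw/pc_search/result',
    headers={'user-agent': 'Mozilla/5.0'},   # trimmed headers for illustration
    data={'query': 'example', 'page': 0},    # hypothetical form payload
    proxies=proxy,
    verify=False,
    timeout=10,
)
print(res.status_code)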
@@ -283,31 +284,32 @@ def get_page_html(self):
    def get_page_html(self):
        #设置采集列表页面和页数
        url='https://i.news.qq.com/gw/pc_search/result'
-       totalnum=5
+       totalnum=6
        keyword=self.searchkw
        # keyword='浙江国有资本运营公司'
        for pagenum in range(0,totalnum):
            qerhtml=self.reqPostMsg(url,pagenum,keyword)
            jsonmsg=json.loads(qerhtml)
            secList=jsonmsg['secList']
-           for sec in secList:
-               try:
-                   title=sec['newsList'][0]['title']
-                   durl=sec['newsList'][0]['url']
-                   pubtime=sec['newsList'][0]['time']
-                   source=sec['newsList'][0]['source']
-                   is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
-                   if is_member:
-                       continue
-                   detailmsg={
-                       'title':title,
-                       'detailUrl':durl,
-                       'sourceTag':source,
-                       'publishTag':pubtime
-                   }
-                   self.detailList.put(detailmsg)
-               except Exception as e :
-                   continue
+           if secList:
+               for sec in secList:
+                   try:
+                       title=sec['newsList'][0]['title']
+                       durl=sec['newsList'][0]['url']
+                       pubtime=sec['newsList'][0]['time']
+                       source=sec['newsList'][0]['source']
+                       is_member = self.r.sismember('pyqqnews_'+self.wordsCode, durl)
+                       if is_member:
+                           continue
+                       detailmsg={
+                           'title':title,
+                           'detailUrl':durl,
+                           'sourceTag':source,
+                           'publishTag':pubtime
+                       }
+                       self.detailList.put(detailmsg)
+                   except Exception as e :
+                       continue
    # 获取详情页
    def get_detail_html(self):
        # 获取当前窗口的句柄
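The new `if secList:` guard in get_page_html above presumably protects against responses where secList is null or empty: iterating over None would raise a TypeError. A small illustrative check, with the JSON payload as a made-up placeholder:

import json

jsonmsg = json.loads('{"secList": null}')   # hypothetical empty search response
secList = jsonmsg['secList']                # -> None
if secList:                                 # the guard added in this commit
    for sec in secList:
        print(sec['newsList'][0]['title'])
else:
    print('no results on this page')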
@@ -322,7 +324,7 @@ class QQnewsSpider(object):
        bdetail=self.getDetailmsg(detailmsg)
        processitem=self.getProcessitem(bdetail)
        try:
-           # self.sendkafka(processitem)
+           self.sendkafka(processitem)
            self.r.sadd('pyqqnews_'+self.wordsCode, processitem['sourceAddress'])
        except Exception as e:
            self.logger.info("放入kafka失败!")
@@ -412,8 +414,8 @@ class QQnewsSpider(object):
            'sec-ch-ua-mobile':'?0',
            'sec-ch-ua-platform':'"Windows"',
        }
-       proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
-       res=requests.get(url,headers=headers,verify=False,timeout=10)
+       proxy = self.baseCore.get_proxy()
+       res=requests.get(url,headers=headers,proxies=proxy,verify=False,timeout=10)
        res.encoding='utf-8'
        text=res.text
        return text
@@ -421,7 +423,7 @@ class QQnewsSpider(object):
    def extractorMsg(self,url,title):
        content=''
        contentWithTag=''
-       lang=''
+       lang='cn'
        lang=self.detect_language(title)
        sm=SmartExtractor(lang)
        try:
@@ -521,10 +523,10 @@ class QQnewsSpider(object):
        if content!='':
            processitem={
                "sid":self.sid,
-               "source":"5",
+               "source":"22",  #腾讯新闻
                "title":bdetail['title'],
                "content":bdetail['content'],
-               "contentWithtag":bdetail['contentHtml'],
+               "contentWithTag":bdetail['contentHtml'],
                "origin":bdetail['source'],
                "publishDate":bdetail['publishtime'],
                "sourceAddress":bdetail['detailurl'],
......
@@ -170,7 +170,7 @@ class QQnewsTaskJob(object):
            kwmsg={
                'kw':kk,
                'wordsCode':'qqnews',
-               'sid':'102003'
+               'sid':'1706193555675926530'
            }
            kwList.append(kwmsg)
        return kwList
@@ -182,7 +182,7 @@ class QQnewsTaskJob(object):
        try:
            jrttnewsSpider.get_page_html()
        except Exception as e:
-           logger.info('今日头条搜索异常'+searchkw)
+           logger.info('腾讯新闻搜索异常'+searchkw)
        if jrttnewsSpider.detailList.qsize() != 0:
            try:
@@ -218,7 +218,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(qqnewsTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
......
@@ -140,8 +140,8 @@ class SougouSpider(object):
    def xpath_paser(self,html):
        lists=[]
-       itemTag=html.xpath('//div[@class="vrwrap"]')
-       for itemTag in itemTag:
+       itemTags=html.xpath('//div[@class="vrwrap"]')
+       for itemTag in itemTags:
            try:
                title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
            except Exception as e:
@@ -512,7 +512,7 @@ class SougouSpider(object):
                "source":"5",
                "title":bdetail['title'],
                "content":bdetail['content'],
-               "contentWithtag":bdetail['contentHtml'],
+               "contentWithTag":bdetail['contentHtml'],
                "origin":bdetail['source'],
                "publishDate":bdetail['publishtime'],
                "sourceAddress":bdetail['detailurl'],
......
@@ -286,7 +286,7 @@ class SouhunewsSpider(object):
    # 获取每一页数据, 开趴.
    def get_page_html(self):
        #设置采集列表页面和页数
-       totalnum=5
+       totalnum=6
        keyword=self.searchkw
        # keyword='浙江国有资本运营公司'
        for pagenum in range(0,totalnum):
@@ -333,7 +333,7 @@ class SouhunewsSpider(object):
        bdetail=self.getDetailmsg(detailmsg)
        processitem=self.getProcessitem(bdetail)
        try:
-           # self.sendkafka(processitem)
+           self.sendkafka(processitem)
            self.r.sadd('pysouhunews_'+self.wordsCode, processitem['sourceAddress'])
        except Exception as e:
            self.logger.info("放入kafka失败!")
@@ -528,10 +528,10 @@ class SouhunewsSpider(object):
        if content!='':
            processitem={
                "sid":self.sid,
-               "source":"3",
+               "source":"23",  #搜狐新闻
                "title":bdetail['title'],
                "content":bdetail['content'],
-               "contentWithtag":bdetail['contentHtml'],
+               "contentWithTag":bdetail['contentHtml'],
                "origin":bdetail['source'],
                "publishDate":bdetail['publishtime'],
                "sourceAddress":bdetail['detailurl'],
......
@@ -170,7 +170,7 @@ class SouhunewsTaskJob(object):
            kwmsg={
                'kw':kk,
                'wordsCode':'souhu',
-               'sid':'102002'
+               'sid':'1706193555675926530'
            }
            kwList.append(kwmsg)
        return kwList
@@ -182,7 +182,7 @@ class SouhunewsTaskJob(object):
        try:
            jrttnewsSpider.get_page_html()
        except Exception as e:
-           logger.info('今日头条搜索异常'+searchkw)
+           logger.info('搜狐新闻搜索异常'+searchkw)
        if jrttnewsSpider.detailList.qsize() != 0:
            try:
@@ -218,7 +218,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(souhunewsTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
......
#coding=utf-8
@@ -62,7 +62,7 @@ class BaiduSpider(object):
        # proxy = "127.0.0.1:8080"  # 代理地址和端口
        # chrome_options.add_argument('--proxy-server=http://' + proxy)
        self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
-   #将列表数据插入到表中 baidu_search_result
+   #将列表数据插入到表中 meta_search_result
    def itemInsertToTable(self,items):
        try:
            itemdata=[]
@@ -72,7 +72,7 @@ class BaiduSpider(object):
                data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
                itemdata.append(data)
-           sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
+           sql ="INSERT into meta_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cursorM.executemany(sql, itemdata)
            self.logger.info("数据插入数据库成功!")
            # 定义插入数据的SQL语句
@@ -115,8 +115,8 @@ class BaiduSpider(object):
    def xpath_paser(self,html):
        lists=[]
-       itemTag=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
-       for itemTag in itemTag:
+       itemTags=html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')
+       for itemTag in itemTags:
            try:
                title=itemTag.xpath('.//h3[@class="news-title_1YtI1 "]/a/text()')[0]
            except Exception as e:
@@ -487,7 +487,7 @@ class BaiduSpider(object):
                "source":"3",
                "title":bdetail['title'],
                "content":bdetail['content'],
-               "contentWithtag":bdetail['contentHtml'],
+               "contentWithTag":bdetail['contentHtml'],
                "origin":bdetail['source'],
                "publishDate":bdetail['publishtime'],
                "sourceAddress":bdetail['detailurl'],
......
# -*- coding: utf-8 -*-
@@ -200,7 +200,7 @@ if __name__ == '__main__':
            continue
        if kwList:
            # 创建一个线程池,指定线程数量为4
-           with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+           with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                # 提交任务给线程池,每个任务处理一个数据
                results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                # 获取任务的执行结果
......