Commit a354950d Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

@@ -38,8 +38,8 @@ taskType = '政策法规'
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
     '国务院_国资委_copy1']
-driver_path = r'D:\cmd100\chromedriver.exe'
-chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
+driver_path = r'F:\spider\cmd100\chromedriver.exe'
+chromr_bin = r'F:\spider\Google\Chrome\Application\chrome.exe'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
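A note on the block above: the chromedriver and Chrome paths are hardcoded per machine, which is exactly why this commit has to flip them between D:\ and F:\spider. A minimal sketch of making them overridable via environment variables; the names CHROMEDRIVER_PATH and CHROME_BIN are illustrative and not used anywhere in this repo:

import os

# Fall back to the checked-in defaults when the variables are unset.
driver_path = os.environ.get('CHROMEDRIVER_PATH', r'F:\spider\cmd100\chromedriver.exe')
chromr_bin = os.environ.get('CHROME_BIN', r'F:\spider\Google\Chrome\Application\chrome.exe')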
@@ -63,7 +63,7 @@ def paserUrl(html, listurl):
 def getDriver():
     service = Service(driver_path)
     chrome_options = webdriver.ChromeOptions()
-    # chrome_options.add_argument('--headless')
+    chrome_options.add_argument('--headless')
     chrome_options.add_argument('--disable-gpu')
     # chrome_options.add_argument('--no-sandbox')
     chrome_options.add_argument('--disable-dev-shm-usage')
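This commit turns --headless on unconditionally. Separately, a note on API level: webdriver.Chrome(chrome_options=..., executable_path=...) as used below is the Selenium 3 calling convention; recent Selenium 4 releases removed both keywords in favor of service= and options=. A hedged sketch of the same getDriver() flags in that style, assuming Selenium 4 and the driver_path/chromr_bin globals above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def get_driver_v4():
    # Same flags as getDriver(), passed via the Selenium 4 keyword arguments.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    options.binary_location = chromr_bin  # module-level path defined above
    return webdriver.Chrome(service=Service(driver_path), options=options)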
@@ -76,12 +76,6 @@ def getDriver():
         'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
     # bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
     bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)
-    # with open('stealth.min.js') as f:
-    #     js = f.read()
-    #
-    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-    #     "source": js
-    # })
     return bro
 def save_data(dic_news):
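The six deleted lines above were the (already commented-out) stealth.min.js injection, which hides common webdriver fingerprints by evaluating the script before any page code runs. For reference, the runnable form of that snippet, assuming a stealth.min.js file next to the script:

bro = getDriver()
# Inject stealth.min.js into every new document via the CDP bridge;
# the registration survives navigations within this browser session.
with open('stealth.min.js', encoding='utf-8') as f:
    js = f.read()
bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})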
@@ -4834,6 +4828,12 @@ def gan_su():
         bro.get(href)
         time.sleep(2)
         dhtml = bro.page_source
+        if dhtml == '<html><head></head><body></body></html>':
+            bro.close()
+            bro.quit()
+            bro = getDriver()
+            bro.get(href)
+            dhtml = bro.page_source
         if len(dhtml) < 200:
             time.sleep(5)
             continue
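The new guard restarts Chrome when page_source comes back as the bare '<html><head></head><body></body></html>' shell (a dead renderer or a blocked request). The same six lines appear three times in this commit, so here is a sketch of the pattern as one helper, using only time and the getDriver() defined above; note that quit() already closes every window, making the preceding close() redundant:

EMPTY_PAGE = '<html><head></head><body></body></html>'

def fetch_page(bro, href, wait=2):
    # Load href; on an empty document shell, restart the browser once and retry.
    bro.get(href)
    time.sleep(wait)
    if bro.page_source == EMPTY_PAGE:
        bro.quit()
        bro = getDriver()
        bro.get(href)
        time.sleep(wait)
    return bro, bro.page_source

Callers would then replace each inline block with: bro, dhtml = fetch_page(bro, href).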
@@ -4866,14 +4866,14 @@ def gan_su():
                 id_list.append(att_id)
                 # todo: write the returned address back into the soup
                 file['href'] = full_path
-        # id_ = redefid(id_list)
+        id_ = redefid(id_list)
         contentWithTag = str(soup.prettify())
         content = soup.text
         if content == '' or content == None:
             log.info(f'-----{href}----{title}----内容为空-----')
             continue
-        # t = time.strptime(publishDate, "%Y年%m月%d日")
-        # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
+        t = time.strptime(publishDate, "%Y年%m月%d日")
+        publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         # todo: fields sent to Kafka
         dic_news = {
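The now-active conversion above normalizes dates like 2023年9月14日 into the timestamp format used everywhere else in the file. strptime matches the 年/月/日 characters literally, so the round trip looks like this (standard library only):

import time

t = time.strptime('2023年9月14日', '%Y年%m月%d日')
print(time.strftime('%Y-%m-%d %H:%M:%S', t))  # 2023-09-14 00:00:00

It raises ValueError if publishDate arrives in any other shape, so a try/except around it is a cheap safeguard.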
@@ -4940,9 +4940,9 @@ def gan_su():
         href = dd['href']
         publishDate = dd['publishDate']
         is_href = db_storage.find_one({'网址': href})
-        # if is_href:
-        #     num+=1
-        #     continue
+        if is_href:
+            num+=1
+            continue
         bro.get(href)
         try:
             alls = bro.find_element(By.CLASS_NAME, 'alls').text
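Re-enabling this block restores the skip-if-already-stored check ('网址' is the URL field in the Mongo collection). The same test as a small helper, plus the stock pymongo index that keeps the lookup fast as the collection grows; create_index is the only call not already used in this file:

def already_crawled(href):
    # find_one returns the stored document or None; the truthiness test
    # mirrors the inline `if is_href:` skip.
    return db_storage.find_one({'网址': href}) is not None

# One-time setup: index the URL field so the per-item lookup stays cheap.
db_storage.create_index('网址')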
@@ -4952,6 +4952,12 @@ def gan_su():
             pass
         time.sleep(3)
         html = bro.page_source
+        if html == '<html><head></head><body></body></html>':
+            bro.close()
+            bro.quit()
+            bro = getDriver()
+            bro.get(href)
+            html = bro.page_source
         doc = pq(html)
         origin = ''
         pub_hao = ''
@@ -4977,6 +4983,13 @@ def gan_su():
         if len(origin) < 1:
             origin = doc('div[class="pages-date"]>span').text().replace("来源:", "")
         contentWithTag = doc('div[id="UCAP-CONTENT"]')
+        if len(title) == 0:
+            title = doc('div[class="links_tit"]').text()
+            writtenDate = doc('div[class="links_tab"]>table>tbody>tr:nth-child(4)>td:nth-child(2)').text()
+            origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
+            pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
+            contentWithTag = doc('div[id="content"]')
+        print(title)
         soup = paserUrl(str(contentWithTag), href)
         try:
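The added fallback covers detail pages built around a links_tab metadata table (row 2 = origin, row 4 = written date, row 5 = issue number, per the selectors above); pq is pyquery, already imported in this script. A self-contained sketch against an invented miniature page, to show how those :nth-child selectors resolve:

from pyquery import PyQuery as pq

# Invented sample markup, mirroring the links_tab layout targeted above.
html = '''<html><body>
<div class="links_tit">某政策标题</div>
<div class="links_tab"><table><tbody>
<tr><td>索引号</td><td>-</td></tr>
<tr><td>来源</td><td>甘肃省国资委</td></tr>
<tr><td>分类</td><td>-</td></tr>
<tr><td>成文日期</td><td>2023-09-14</td></tr>
<tr><td>发文字号</td><td>示例字号</td></tr>
</tbody></table></div>
<div id="content"><p>正文</p></div>
</body></html>'''

doc = pq(html)
title = doc('div[class="links_tit"]').text()
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
print(title, origin)  # -> 某政策标题 甘肃省国资委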
@@ -4998,15 +5011,15 @@ def gan_su():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 log.info(f'{file_name}---{href}--')
-                # retData = baseCore.uptoOBS(file_href, '1696',file_name)
-                # if retData['state']:
-                #     pass
-                # else:
-                #     continue
-                # att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
-                # id_list.append(att_id)
-                # # todo: write the returned address back into the soup
-                # file['href'] = full_path
+                retData = baseCore.uptoOBS(file_href, '1696',file_name)
+                if retData['state']:
+                    pass
+                else:
+                    continue
+                att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
+                id_list.append(att_id)
+                # todo: write the returned address back into the soup
+                file['href'] = full_path
         contentWithTag = str(soup.prettify())
         content = soup.text
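The re-enabled attachment branch matches extensions with a chain of substring tests ('.XLS', '.ZIP', '.RAR' visible here; the rest of the condition is elided above). For the common case where the extension ends the URL, a compact case-insensitive equivalent; the tuple below is illustrative and would need to mirror the file's full list:

ATTACHMENT_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar')

def is_attachment(file_href):
    # Case-insensitive suffix test; covers both the '.XLS' and '.xls'
    # arms of the inline condition when the URL ends with the extension.
    return file_href.lower().endswith(ATTACHMENT_EXTS)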
@@ -5015,34 +5028,34 @@ def gan_su():
             continue
         if len(content) < 2:
             continue
-        # t = time.strptime(publishDate, "%Y年%m月%d日")
-        # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
+        t = time.strptime(publishDate, "%Y年%m月%d日")
+        publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         # todo: fields sent to Kafka
-        # dic_news = {
-        #     'attachmentIds': id_list,
-        #     'author': '',
-        #     'content': str(content),
-        #     'contentWithTag': str(contentWithTag),
-        #     'createDate': time_now,
-        #     'deleteFlag': 0,
-        #     'id': '',
-        #     'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
-        #     'origin': origin,
-        #     'organ': organ,
-        #     'topicClassification': topicClassification,
-        #     'issuedNumber': pub_hao,
-        #     'publishDate': publishDate,
-        #     'writtenDate': writtenDate,
-        #     'sid': '1697458829758697473',
-        #     'sourceAddress': href,
-        #     'summary': '',
-        #     'title': title
-        # }
-        # # print(dic_news)
-        # flag = sendKafka(dic_news)
-        # if flag:
-        #     save_data(dic_news)
+        dic_news = {
+            'attachmentIds': id_list,
+            'author': '',
+            'content': str(content),
+            'contentWithTag': str(contentWithTag),
+            'createDate': time_now,
+            'deleteFlag': 0,
+            'id': '',
+            'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
+            'origin': origin,
+            'organ': organ,
+            'topicClassification': topicClassification,
+            'issuedNumber': pub_hao,
+            'publishDate': publishDate,
+            'writtenDate': writtenDate,
+            'sid': '1697458829758697473',
+            'sourceAddress': href,
+            'summary': '',
+            'title': title
+        }
+        # print(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
+            save_data(dic_news)
         num += 1
         count += 1
     except Exception as e:
@@ -5058,23 +5071,6 @@ def gan_su():
     num = 0
     count = 0
     start_time = time.time()
-    # # service = Service(r'D:/chrome/103/chromedriver.exe')
-    # chrome_options = webdriver.ChromeOptions()
-    # chrome_options.add_argument('--headless')
-    # chrome_options.add_argument('--disable-gpu')
-    # chrome_options.add_experimental_option(
-    #     "excludeSwitches", ["enable-automation"])
-    # chrome_options.add_experimental_option('useAutomationExtension', False)
-    # chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
-    # chrome_options.add_argument(
-    #     'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
-    # bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
-    # with open('./stealth.min.js') as f:
-    #     js = f.read()
-    #
-    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-    #     "source": js
-    # })
     bro = getDriver()
     url = 'http://gzw.gansu.gov.cn/gzw/c115553/xxgk_list.shtml'
     hrefs = []
@@ -5115,6 +5111,12 @@ def gan_su():
         bro.get(href)
         time.sleep(3)
         html = bro.page_source
+        if html == '<html><head></head><body></body></html>':
+            bro.close()
+            bro.quit()
+            bro = getDriver()
+            bro.get(href)
+            html = bro.page_source
         doc = pq(html)
         origin = ''
         pub_hao = ''
@@ -5141,6 +5143,19 @@ def gan_su():
         if len(origin) < 1:
             origin = doc('div[class="pages-date"]>span').text().replace("来源:", "")
         contentWithTag = doc('div[id="UCAP-CONTENT"]')
+        if len(title) == 0:
+            title = doc('div[class="links_tit"]').text()
+            writtenDate = doc('div[class="links_tab"]>table>tbody>tr:nth-child(4)>td:nth-child(2)').text()
+            origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
+            pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
+            contentWithTag = doc('div[id="content"]')
+        print(title)
+        if len(title) == 0 or contentWithTag.text() == '':
+            title = doc('div[class="main"]>h1').text().lstrip().strip()
+            writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
+            origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
+            contentWithTag = doc('div[class="detailContent"]')
+        print(title)
         soup = paserUrl(str(contentWithTag), href)
         try:
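One thing to double-check in the second fallback above: .split('日期:')[0] and .split('来源:')[0] keep the text *before* each label. If the clearbox paragraph renders as, say, 日期:2023-09-14 来源:甘肃省国资委, then index [1] (or [-1]) after the label is what is wanted; a hedged sketch under that assumption about the page text:

meta = '日期:2023-09-14 来源:甘肃省国资委'  # invented sample paragraph text
writtenDate = meta.split('日期:')[1].split(' ')[0].strip()  # '2023-09-14'
origin = meta.split('来源:')[1].strip()                     # '甘肃省国资委'
print(writtenDate, origin)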
@@ -5175,6 +5190,7 @@ def gan_su():
         content = soup.text
         if content == '' or content == None:
             log.info(f'-----{href}----{title}----内容为空-----')
+            print(bro.page_source)
             continue
         if len(content) < 2:
             continue
@@ -5209,16 +5225,17 @@ def gan_su():
             num += 1
             count += 1
         except Exception as e:
-            print(e)
+            ee = e.__traceback__.tb_lineno
+            print(ee,e)
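The improved handler recovers the failing line number via e.__traceback__.tb_lineno. The standard library's traceback module prints the whole chain in one call, which is often more useful when the failure is deep inside parsing; a drop-in alternative:

import traceback

try:
    ...  # one detail-page crawl
except Exception:
    traceback.print_exc()  # file, line and message for the full stack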
     except:
         pass
     bro.quit()
     end_time = time.time()
     print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
-    # gan_su1()
+    gan_su1()
     gan_su2()
-    # gan_su3()
+    gan_su3()
 # Ningxia
 def ning_xia():
......