Commit a354950d Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

@@ -38,8 +38,8 @@ taskType = '政策法规'
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
     '国务院_国资委_copy1']
-driver_path = r'D:\cmd100\chromedriver.exe'
-chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
+driver_path = r'F:\spider\cmd100\chromedriver.exe'
+chromr_bin = r'F:\spider\Google\Chrome\Application\chrome.exe'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
@@ -63,7 +63,7 @@ def paserUrl(html, listurl):
 def getDriver():
     service = Service(driver_path)
     chrome_options = webdriver.ChromeOptions()
-    # chrome_options.add_argument('--headless')
+    chrome_options.add_argument('--headless')
     chrome_options.add_argument('--disable-gpu')
     # chrome_options.add_argument('--no-sandbox')
     chrome_options.add_argument('--disable-dev-shm-usage')
@@ -76,12 +76,6 @@ def getDriver():
         'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
     # bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
     bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)
-    # with open('stealth.min.js') as f:
-    #     js = f.read()
-    #
-    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-    #     "source": js
-    # })
     return bro
 def save_data(dic_news):
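A note on the API used here: the hunk enables headless mode but keeps the Selenium 3 keywords (chrome_options=, executable_path=) even though the function already builds a Service object that then goes unused. A minimal sketch of the Selenium 4 equivalent, assuming selenium>=4 is installed; the flags are copied from the diff, nothing else is from the commit:

```python
# Hypothetical Selenium 4 form of the launcher this hunk modifies.
# chrome_options=/executable_path= were removed in Selenium 4; the
# Service object the function already constructs replaces them.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver_path = r'F:\spider\cmd100\chromedriver.exe'  # path set in this commit

def getDriver():
    service = Service(driver_path)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')        # enabled by this commit
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    return webdriver.Chrome(service=service, options=options)
```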
@@ -4834,6 +4828,12 @@ def gan_su():
             bro.get(href)
             time.sleep(2)
             dhtml = bro.page_source
+            if dhtml == '<html><head></head><body></body></html>':
+                bro.close()
+                bro.quit()
+                bro = getDriver()
+                bro.get(href)
+                dhtml = bro.page_source
             if len(dhtml) < 200:
                 time.sleep(5)
                 continue
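The added block recovers from ChromeDriver handing back an empty document by rebuilding the driver once and refetching. The same six lines recur in two later hunks; a hypothetical helper that would factor them out (fetch_with_restart and its names are illustrative, not part of the commit):

```python
import time

EMPTY_PAGE = '<html><head></head><body></body></html>'

def fetch_with_restart(bro, href, wait=2):
    """Load href and return (driver, html); restart the driver once if
    Chrome returns an empty shell, as the added hunk does inline."""
    bro.get(href)
    time.sleep(wait)
    html = bro.page_source
    if html == EMPTY_PAGE:
        bro.quit()              # the hunk calls close() then quit(); quit() alone suffices
        bro = getDriver()       # getDriver() is defined earlier in this file
        bro.get(href)
        time.sleep(wait)
        html = bro.page_source
    return bro, html

# usage at each call site: bro, dhtml = fetch_with_restart(bro, href)
```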
@@ -4866,14 +4866,14 @@ def gan_su():
                     id_list.append(att_id)
                     # todo:将返回的地址更新到soup
                     file['href'] = full_path
-            # id_ = redefid(id_list)
+            id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             content = soup.text
             if content == '' or content == None:
                 log.info(f'-----{href}----{title}----内容为空-----')
                 continue
-            # t = time.strptime(publishDate, "%Y年%m月%d日")
-            # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
+            t = time.strptime(publishDate, "%Y年%m月%d日")
+            publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
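The re-enabled strptime/strftime pair normalizes the Chinese date shown on list pages into the storage format. For instance (note that strptime raises ValueError when the page carries a differently shaped date, which may be why these lines were toggled off before):

```python
import time

publishDate = '2023年9月1日'                     # sample list-page date
t = time.strptime(publishDate, "%Y年%m月%d日")   # literal 年/月/日 must match
print(time.strftime("%Y-%m-%d %H:%M:%S", t))     # -> 2023-09-01 00:00:00
```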
@@ -4940,9 +4940,9 @@ def gan_su():
             href = dd['href']
             publishDate = dd['publishDate']
             is_href = db_storage.find_one({'网址': href})
-            # if is_href:
-            #     num+=1
-            #     continue
+            if is_href:
+                num+=1
+                continue
             bro.get(href)
             try:
                 alls = bro.find_element(By.CLASS_NAME, 'alls').text
@@ -4952,6 +4952,12 @@ def gan_su():
                 pass
             time.sleep(3)
             html = bro.page_source
+            if html == '<html><head></head><body></body></html>':
+                bro.close()
+                bro.quit()
+                bro = getDriver()
+                bro.get(href)
+                html = bro.page_source
             doc = pq(html)
             origin = ''
             pub_hao = ''
@@ -4977,6 +4983,13 @@ def gan_su():
                 if len(origin) < 1:
                     origin = doc('div[class="pages-date"]>span').text().replace("来源:", "")
                 contentWithTag = doc('div[id="UCAP-CONTENT"]')
+            if len(title) == 0:
+                title = doc('div[class="links_tit"]').text()
+                writtenDate = doc('div[class="links_tab"]>table>tbody>tr:nth-child(4)>td:nth-child(2)').text()
+                origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
+                pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
+                contentWithTag = doc('div[id="content"]')
+                print(title)
             soup = paserUrl(str(contentWithTag), href)
             try:
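This new fallback covers detail pages that use the links_tit/links_tab table layout instead of the UCAP-CONTENT container, pulling each field out of a fixed table row. A self-contained pyquery illustration of the nth-child selectors; the sample HTML is invented to mirror that layout:

```python
from pyquery import PyQuery as pq

# Invented sample mimicking the links_tab layout the fallback targets
html = '''<div class="links_tab"><table><tbody>
<tr><td>索引号</td><td>000/2023-001</td></tr>
<tr><td>发文机关</td><td>甘肃省人民政府</td></tr>
<tr><td>标题</td><td>示例标题</td></tr>
<tr><td>成文日期</td><td>2023-08-01</td></tr>
<tr><td>发文字号</td><td>甘政发〔2023〕1号</td></tr>
</tbody></table></div>'''
doc = pq(html)
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
writtenDate = doc('div[class="links_tab"]>table>tbody>tr:nth-child(4)>td:nth-child(2)').text()
print(origin, writtenDate)  # -> 甘肃省人民政府 2023-08-01
```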
@@ -4998,15 +5011,15 @@ def gan_su():
                         or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                     file_name = file.text.strip()
                     log.info(f'{file_name}---{href}--')
-                    # retData = baseCore.uptoOBS(file_href, '1696',file_name)
-                    # if retData['state']:
-                    #     pass
-                    # else:
-                    #     continue
-                    # att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
-                    # id_list.append(att_id)
-                    # # todo:将返回的地址更新到soup
-                    # file['href'] = full_path
+                    retData = baseCore.uptoOBS(file_href, '1696',file_name)
+                    if retData['state']:
+                        pass
+                    else:
+                        continue
+                    att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
+                    id_list.append(att_id)
+                    # todo:将返回的地址更新到soup
+                    file['href'] = full_path
             contentWithTag = str(soup.prettify())
             content = soup.text
@@ -5015,34 +5028,34 @@ def gan_su():
                 continue
             if len(content) < 2:
                 continue
-            # t = time.strptime(publishDate, "%Y年%m月%d日")
-            # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
+            t = time.strptime(publishDate, "%Y年%m月%d日")
+            publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
-            # dic_news = {
-            #     'attachmentIds': id_list,
-            #     'author': '',
-            #     'content': str(content),
-            #     'contentWithTag': str(contentWithTag),
-            #     'createDate': time_now,
-            #     'deleteFlag': 0,
-            #     'id': '',
-            #     'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
-            #     'origin': origin,
-            #     'organ': organ,
-            #     'topicClassification': topicClassification,
-            #     'issuedNumber': pub_hao,
-            #     'publishDate': publishDate,
-            #     'writtenDate': writtenDate,
-            #     'sid': '1697458829758697473',
-            #     'sourceAddress': href,
-            #     'summary': '',
-            #     'title': title
-            # }
-            # # print(dic_news)
-            # flag = sendKafka(dic_news)
-            # if flag:
-            #     save_data(dic_news)
+            dic_news = {
+                'attachmentIds': id_list,
+                'author': '',
+                'content': str(content),
+                'contentWithTag': str(contentWithTag),
+                'createDate': time_now,
+                'deleteFlag': 0,
+                'id': '',
+                'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
+                'origin': origin,
+                'organ': organ,
+                'topicClassification': topicClassification,
+                'issuedNumber': pub_hao,
+                'publishDate': publishDate,
+                'writtenDate': writtenDate,
+                'sid': '1697458829758697473',
+                'sourceAddress': href,
+                'summary': '',
+                'title': title
+            }
+            # print(dic_news)
+            flag = sendKafka(dic_news)
+            if flag:
+                save_data(dic_news)
             num += 1
             count += 1
         except Exception as e:
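The record dictionary and the sendKafka()/save_data() pair are switched back on here. sendKafka() is defined elsewhere in the file and its body is not shown in this diff; a hypothetical sketch of what such a sender could look like with kafka-python, with the broker address and topic as placeholders:

```python
import json
from kafka import KafkaProducer  # kafka-python package

producer = KafkaProducer(
    bootstrap_servers=['127.0.0.1:9092'],  # placeholder broker, not from the commit
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))

def sendKafka(dic_news):
    """Publish one record; return True on success so the caller can save_data()."""
    try:
        producer.send('policy_news', value=dic_news).get(timeout=10)  # placeholder topic
        return True
    except Exception:
        return False
```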
@@ -5058,23 +5071,6 @@ def gan_su():
         num = 0
         count = 0
         start_time = time.time()
-        # # service = Service(r'D:/chrome/103/chromedriver.exe')
-        # chrome_options = webdriver.ChromeOptions()
-        # chrome_options.add_argument('--headless')
-        # chrome_options.add_argument('--disable-gpu')
-        # chrome_options.add_experimental_option(
-        #     "excludeSwitches", ["enable-automation"])
-        # chrome_options.add_experimental_option('useAutomationExtension', False)
-        # chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
-        # chrome_options.add_argument(
-        #     'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
-        # bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
-        # with open('./stealth.min.js') as f:
-        #     js = f.read()
-        #
-        # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-        #     "source": js
-        # })
         bro = getDriver()
         url = 'http://gzw.gansu.gov.cn/gzw/c115553/xxgk_list.shtml'
         hrefs = []
@@ -5115,6 +5111,12 @@ def gan_su():
             bro.get(href)
             time.sleep(3)
             html = bro.page_source
+            if html == '<html><head></head><body></body></html>':
+                bro.close()
+                bro.quit()
+                bro = getDriver()
+                bro.get(href)
+                html = bro.page_source
             doc = pq(html)
             origin = ''
             pub_hao = ''
@@ -5141,6 +5143,19 @@ def gan_su():
                 if len(origin) < 1:
                     origin = doc('div[class="pages-date"]>span').text().replace("来源:", "")
                 contentWithTag = doc('div[id="UCAP-CONTENT"]')
+            if len(title) == 0:
+                title = doc('div[class="links_tit"]').text()
+                writtenDate = doc('div[class="links_tab"]>table>tbody>tr:nth-child(4)>td:nth-child(2)').text()
+                origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
+                pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
+                contentWithTag = doc('div[id="content"]')
+                print(title)
+            if len(title) == 0 or contentWithTag.text() == '':
+                title = doc('div[class="main"]>h1').text().lstrip().strip()
+                writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
+                origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
+                contentWithTag = doc('div[class="detailContent"]')
+                print(title)
             soup = paserUrl(str(contentWithTag), href)
             try:
@@ -5175,6 +5190,7 @@ def gan_su():
             content = soup.text
             if content == '' or content == None:
                 log.info(f'-----{href}----{title}----内容为空-----')
+                print(bro.page_source)
                 continue
             if len(content) < 2:
                 continue
@@ -5209,16 +5225,17 @@ def gan_su():
             num += 1
             count += 1
         except Exception as e:
-            print(e)
+            ee = e.__traceback__.tb_lineno
+            print(ee,e)
     except:
         pass
     bro.quit()
     end_time = time.time()
     print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
-    # gan_su1()
+    gan_su1()
     gan_su2()
-    # gan_su3()
+    gan_su3()
 # 宁夏
 def ning_xia():
...
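The reworked except handler above now prints the line number of the failure alongside the exception via e.__traceback__.tb_lineno. A minimal illustration of what that attribute yields:

```python
try:
    1 / 0
except Exception as e:
    ee = e.__traceback__.tb_lineno  # line in this frame where the exception occurred
    print(ee, e)                    # -> 2 division by zero
```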