Commit a354950d Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

@@ -38,8 +38,8 @@ taskType = '政策法规'
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
     '国务院_国资委_copy1']
-driver_path = r'D:\cmd100\chromedriver.exe'
-chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
+driver_path = r'F:\spider\cmd100\chromedriver.exe'
+chromr_bin = r'F:\spider\Google\Chrome\Application\chrome.exe'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
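A note on the block above: the chromedriver and Chrome paths are hardcoded per machine, which is exactly why this commit has to flip them between D:\ and F:\spider. A minimal sketch of making them overridable via environment variables; the names CHROMEDRIVER_PATH and CHROME_BIN are illustrative and not used anywhere in this repo:

import os

# Fall back to the checked-in defaults when the variables are unset.
driver_path = os.environ.get('CHROMEDRIVER_PATH', r'F:\spider\cmd100\chromedriver.exe')
chromr_bin = os.environ.get('CHROME_BIN', r'F:\spider\Google\Chrome\Application\chrome.exe')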
@@ -63,7 +63,7 @@ def paserUrl(html, listurl):
 def getDriver():
     service = Service(driver_path)
     chrome_options = webdriver.ChromeOptions()
-    # chrome_options.add_argument('--headless')
+    chrome_options.add_argument('--headless')
     chrome_options.add_argument('--disable-gpu')
     # chrome_options.add_argument('--no-sandbox')
     chrome_options.add_argument('--disable-dev-shm-usage')
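This commit turns --headless on unconditionally. Separately, a note on API level: webdriver.Chrome(chrome_options=..., executable_path=...) as used below is the Selenium 3 calling convention; recent Selenium 4 releases removed both keywords in favor of service= and options=. A hedged sketch of the same getDriver() flags in that style, assuming Selenium 4 and the driver_path/chromr_bin globals above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def get_driver_v4():
    # Same flags as getDriver(), passed via the Selenium 4 keyword arguments.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    options.binary_location = chromr_bin  # module-level path defined above
    return webdriver.Chrome(service=Service(driver_path), options=options)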
@@ -76,12 +76,6 @@ def getDriver():
         'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
     # bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
     bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)
-    # with open('stealth.min.js') as f:
-    #     js = f.read()
-    #
-    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-    #     "source": js
-    # })
     return bro
 def save_data(dic_news):
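The six deleted lines above were the (already commented-out) stealth.min.js injection, which hides common webdriver fingerprints by evaluating the script before any page code runs. For reference, the runnable form of that snippet, assuming a stealth.min.js file next to the script:

bro = getDriver()
# Inject stealth.min.js into every new document via the CDP bridge;
# the registration survives navigations within this browser session.
with open('stealth.min.js', encoding='utf-8') as f:
    js = f.read()
bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})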
@@ -4834,6 +4828,12 @@ def gan_su():
         bro.get(href)
         time.sleep(2)
         dhtml = bro.page_source
+        if dhtml == '<html><head></head><body></body></html>':
+            bro.close()
+            bro.quit()
+            bro = getDriver()
+            bro.get(href)
+            dhtml = bro.page_source
         if len(dhtml) < 200:
             time.sleep(5)
             continue
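The new guard restarts Chrome when page_source comes back as the bare '<html><head></head><body></body></html>' shell (a dead renderer or a blocked request). The same six lines appear three times in this commit, so here is a sketch of the pattern as one helper, using only time and the getDriver() defined above; note that quit() already closes every window, making the preceding close() redundant:

EMPTY_PAGE = '<html><head></head><body></body></html>'

def fetch_page(bro, href, wait=2):
    # Load href; on an empty document shell, restart the browser once and retry.
    bro.get(href)
    time.sleep(wait)
    if bro.page_source == EMPTY_PAGE:
        bro.quit()
        bro = getDriver()
        bro.get(href)
        time.sleep(wait)
    return bro, bro.page_source

Callers would then replace each inline block with: bro, dhtml = fetch_page(bro, href).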
@@ -4866,14 +4866,14 @@ def gan_su():
                 id_list.append(att_id)
                 # todo: write the returned address back into the soup
                 file['href'] = full_path
-        # id_ = redefid(id_list)
+        id_ = redefid(id_list)
         contentWithTag = str(soup.prettify())
         content = soup.text
         if content == '' or content == None:
             log.info(f'-----{href}----{title}----内容为空-----')
             continue
-        # t = time.strptime(publishDate, "%Y年%m月%d日")
-        # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
+        t = time.strptime(publishDate, "%Y年%m月%d日")
+        publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         # todo: fields sent to Kafka
         dic_news = {
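The now-active conversion above normalizes dates like 2023年9月14日 into the timestamp format used everywhere else in the file. strptime matches the 年/月/日 characters literally, so the round trip looks like this (standard library only):

import time

t = time.strptime('2023年9月14日', '%Y年%m月%d日')
print(time.strftime('%Y-%m-%d %H:%M:%S', t))  # 2023-09-14 00:00:00

It raises ValueError if publishDate arrives in any other shape, so a try/except around it is a cheap safeguard.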
@@ -4940,9 +4940,9 @@ def gan_su():
         href = dd['href']
         publishDate = dd['publishDate']
         is_href = db_storage.find_one({'网址': href})
-        # if is_href:
-        #     num+=1
-        #     continue
+        if is_href:
+            num+=1
+            continue
         bro.get(href)
         try:
             alls = bro.find_element(By.CLASS_NAME, 'alls').text
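Re-enabling this block restores the skip-if-already-stored check ('网址' is the URL field in the Mongo collection). The same test as a small helper, plus the stock pymongo index that keeps the lookup fast as the collection grows; create_index is the only call not already used in this file:

def already_crawled(href):
    # find_one returns the stored document or None; the truthiness test
    # mirrors the inline `if is_href:` skip.
    return db_storage.find_one({'网址': href}) is not None

# One-time setup: index the URL field so the per-item lookup stays cheap.
db_storage.create_index('网址')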
@@ -4952,6 +4952,12 @@ def gan_su():
             pass
         time.sleep(3)
         html = bro.page_source
+        if html == '<html><head></head><body></body></html>':
+            bro.close()
+            bro.quit()
+            bro = getDriver()
+            bro.get(href)
+            html = bro.page_source
         doc = pq(html)
         origin = ''
         pub_hao = ''
@@ -4977,6 +4983,13 @@ def gan_su():
         if len(origin) < 1:
             origin = doc('div[class="pages-date"]>span').text().replace("来源:", "")
         contentWithTag = doc('div[id="UCAP-CONTENT"]')
+        if len(title) == 0:
+            title = doc('div[class="links_tit"]').text()
+            writtenDate = doc('div[class="links_tab"]>table>tbody>tr:nth-child(4)>td:nth-child(2)').text()
+            origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
+            pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
+            contentWithTag = doc('div[id="content"]')
+        print(title)
         soup = paserUrl(str(contentWithTag), href)
         try:
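The added fallback covers detail pages built around a links_tab metadata table (row 2 = origin, row 4 = written date, row 5 = issue number, per the selectors above); pq is pyquery, already imported in this script. A self-contained sketch against an invented miniature page, to show how those :nth-child selectors resolve:

from pyquery import PyQuery as pq

# Invented sample markup, mirroring the links_tab layout targeted above.
html = '''<html><body>
<div class="links_tit">某政策标题</div>
<div class="links_tab"><table><tbody>
<tr><td>索引号</td><td>-</td></tr>
<tr><td>来源</td><td>甘肃省国资委</td></tr>
<tr><td>分类</td><td>-</td></tr>
<tr><td>成文日期</td><td>2023-09-14</td></tr>
<tr><td>发文字号</td><td>示例字号</td></tr>
</tbody></table></div>
<div id="content"><p>正文</p></div>
</body></html>'''

doc = pq(html)
title = doc('div[class="links_tit"]').text()
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
print(title, origin)  # -> 某政策标题 甘肃省国资委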
@@ -4998,15 +5011,15 @@ def gan_su():
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
                 log.info(f'{file_name}---{href}--')
-                # retData = baseCore.uptoOBS(file_href, '1696',file_name)
-                # if retData['state']:
-                #     pass
-                # else:
-                #     continue
-                # att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
-                # id_list.append(att_id)
-                # # todo: write the returned address back into the soup
-                # file['href'] = full_path
+                retData = baseCore.uptoOBS(file_href, '1696',file_name)
+                if retData['state']:
+                    pass
+                else:
+                    continue
+                att_id, full_path = baseCore.tableUpdate(retData, '甘肃省国资委', file_name, num)
+                id_list.append(att_id)
+                # todo: write the returned address back into the soup
+                file['href'] = full_path
         contentWithTag = str(soup.prettify())
         content = soup.text
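The re-enabled attachment branch matches extensions with a chain of substring tests ('.XLS', '.ZIP', '.RAR' visible here; the rest of the condition is elided above). For the common case where the extension ends the URL, a compact case-insensitive equivalent; the tuple below is illustrative and would need to mirror the file's full list:

ATTACHMENT_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar')

def is_attachment(file_href):
    # Case-insensitive suffix test; covers both the '.XLS' and '.xls'
    # arms of the inline condition when the URL ends with the extension.
    return file_href.lower().endswith(ATTACHMENT_EXTS)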
@@ -5015,34 +5028,34 @@ def gan_su():
             continue
         if len(content) < 2:
             continue
-        # t = time.strptime(publishDate, "%Y年%m月%d日")
-        # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
+        t = time.strptime(publishDate, "%Y年%m月%d日")
+        publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         # todo: fields sent to Kafka
-        # dic_news = {
-        #     'attachmentIds': id_list,
-        #     'author': '',
-        #     'content': str(content),
-        #     'contentWithTag': str(contentWithTag),
-        #     'createDate': time_now,
-        #     'deleteFlag': 0,
-        #     'id': '',
-        #     'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
-        #     'origin': origin,
-        #     'organ': organ,
-        #     'topicClassification': topicClassification,
-        #     'issuedNumber': pub_hao,
-        #     'publishDate': publishDate,
-        #     'writtenDate': writtenDate,
-        #     'sid': '1697458829758697473',
-        #     'sourceAddress': href,
-        #     'summary': '',
-        #     'title': title
-        # }
-        # # print(dic_news)
-        # flag = sendKafka(dic_news)
-        # if flag:
-        #     save_data(dic_news)
+        dic_news = {
+            'attachmentIds': id_list,
+            'author': '',
+            'content': str(content),
+            'contentWithTag': str(contentWithTag),
+            'createDate': time_now,
+            'deleteFlag': 0,
+            'id': '',
+            'labels': [{'relationId': "1696", 'relationName': "甘肃省国资委", 'labelMark': "policy"}],
+            'origin': origin,
+            'organ': organ,
+            'topicClassification': topicClassification,
+            'issuedNumber': pub_hao,
+            'publishDate': publishDate,
+            'writtenDate': writtenDate,
+            'sid': '1697458829758697473',
+            'sourceAddress': href,
+            'summary': '',
+            'title': title
+        }
+        # print(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
+            save_data(dic_news)
         num += 1
         count += 1
     except Exception as e:
@@ -5058,23 +5071,6 @@ def gan_su():
     num = 0
     count = 0
     start_time = time.time()
-    # # service = Service(r'D:/chrome/103/chromedriver.exe')
-    # chrome_options = webdriver.ChromeOptions()
-    # chrome_options.add_argument('--headless')
-    # chrome_options.add_argument('--disable-gpu')
-    # chrome_options.add_experimental_option(
-    #     "excludeSwitches", ["enable-automation"])
-    # chrome_options.add_experimental_option('useAutomationExtension', False)
-    # chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
-    # chrome_options.add_argument(
-    #     'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
-    # bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
-    # with open('./stealth.min.js') as f:
-    #     js = f.read()
-    #
-    # bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
-    #     "source": js
-    # })
     bro = getDriver()
     url = 'http://gzw.gansu.gov.cn/gzw/c115553/xxgk_list.shtml'
     hrefs = []
@@ -5115,6 +5111,12 @@ def gan_su():
         bro.get(href)
         time.sleep(3)
         html = bro.page_source
+        if html == '<html><head></head><body></body></html>':
+            bro.close()
+            bro.quit()
+            bro = getDriver()
+            bro.get(href)
+            html = bro.page_source
         doc = pq(html)
         origin = ''
         pub_hao = ''
@@ -5141,6 +5143,19 @@ def gan_su():
         if len(origin) < 1:
             origin = doc('div[class="pages-date"]>span').text().replace("来源:", "")
         contentWithTag = doc('div[id="UCAP-CONTENT"]')
+        if len(title) == 0:
+            title = doc('div[class="links_tit"]').text()
+            writtenDate = doc('div[class="links_tab"]>table>tbody>tr:nth-child(4)>td:nth-child(2)').text()
+            origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
+            pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
+            contentWithTag = doc('div[id="content"]')
+        print(title)
+        if len(title) == 0 or contentWithTag.text() == '':
+            title = doc('div[class="main"]>h1').text().lstrip().strip()
+            writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
+            origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
+            contentWithTag = doc('div[class="detailContent"]')
+        print(title)
         soup = paserUrl(str(contentWithTag), href)
         try:
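One thing to double-check in the second fallback above: .split('日期:')[0] and .split('来源:')[0] keep the text *before* each label. If the clearbox paragraph renders as, say, 日期:2023-09-14 来源:甘肃省国资委, then index [1] (or [-1]) after the label is what is wanted; a hedged sketch under that assumption about the page text:

meta = '日期:2023-09-14 来源:甘肃省国资委'  # invented sample paragraph text
writtenDate = meta.split('日期:')[1].split(' ')[0].strip()  # '2023-09-14'
origin = meta.split('来源:')[1].strip()                     # '甘肃省国资委'
print(writtenDate, origin)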
@@ -5175,6 +5190,7 @@ def gan_su():
         content = soup.text
         if content == '' or content == None:
             log.info(f'-----{href}----{title}----内容为空-----')
+            print(bro.page_source)
             continue
         if len(content) < 2:
             continue
@@ -5209,16 +5225,17 @@ def gan_su():
             num += 1
             count += 1
         except Exception as e:
-            print(e)
+            ee = e.__traceback__.tb_lineno
+            print(ee,e)
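The improved handler recovers the failing line number via e.__traceback__.tb_lineno. The standard library's traceback module prints the whole chain in one call, which is often more useful when the failure is deep inside parsing; a drop-in alternative:

import traceback

try:
    ...  # one detail-page crawl
except Exception:
    traceback.print_exc()  # file, line and message for the full stack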
     except:
         pass
     bro.quit()
     end_time = time.time()
     print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
-    # gan_su1()
+    gan_su1()
     gan_su2()
-    # gan_su3()
+    gan_su3()
 # Ningxia
 def ning_xia():
......