Commit cf9c7394 by 刘伟刚
@@ -33,7 +33,7 @@ taskType = '政策法规'
Local SASAC offices (各地方国资委)
"""
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
@@ -52,86 +52,17 @@ def paserUrl(html,listurl):
link['src'] = urljoin(listurl, link['src'])
return html
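# Usage sketch for paserUrl (the URL and markup here are hypothetical, not from
# this repo): relative href/src attributes get resolved against the page URL.
#   demo = BeautifulSoup('<a href="../files/notice.pdf">附件</a>', 'lxml')
#   demo = paserUrl(demo, 'http://example.gov.cn/zcfg/index.html')
#   demo.a['href']  ->  'http://example.gov.cn/files/notice.pdf'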
def replaceUrl(hostUrl, src):
    # str.strip('../') removes any leading/trailing '.' or '/' characters rather
    # than the '../' prefix, so strip the prefixes explicitly instead.
    if src.startswith('../'):
        src = src[len('../'):]
    elif src.startswith('./'):
        src = src[len('./'):]
    finnal_href = hostUrl + src
    return finnal_href
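# Quick check of the fixed replaceUrl (hypothetical URLs):
#   replaceUrl('http://example.gov.cn/', '../files/a.pdf')  ->  'http://example.gov.cn/files/a.pdf'
# For new code, urljoin (as used in paserUrl above) is the more robust option.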
def save_data(result_dict):
try:
aa = result_dict['信息来源']
a_dict = result_dict
except:
try:
tid = result_dict['tid']
except:
tid = '1666'
pass
a_dict = {
'标题': result_dict['标题'],
'来源': result_dict['来源'],
'发文机关': '',
'发文字号': result_dict['号'],
'内容-未去标签': result_dict['内容'],
'附件网址': result_dict['附件网址'],
'发布时间': result_dict['发布时间'],
'成文时间': '',
'主题分类': '',
'网址': result_dict['网址'],
'归属': result_dict['归属'],
'信息来源': '地方国资委',
'tid': tid,
}
# a_dict['内容-未去标签'] = a_dict['内容-未去标签'].split('扫一扫在手机打开')[0]
#
# Skip records without a title
if not a_dict['标题']:
return
try:
post_url = 'http://39.105.62.235:1820/ExtarctLawInfo'
headers_ = {
'Content-Type': 'application/json'
}
resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
if resp.status_code == 500:
try:
tid = result_dict['tid']
except:
tid = '1666'
a_dict = {
'标题': result_dict['标题'],
'来源': result_dict['来源'],
'发文机关': '',
'发文字号': result_dict['号'],
'内容-未去标签': '--',
'附件网址': result_dict['附件网址'],
'发布时间': result_dict['发布时间'],
'成文时间': '',
'主题分类': '',
'网址': result_dict['网址'],
'归属': result_dict['归属'],
'信息来源': '地方国资委',
'tid': tid,
}
resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
print('Push status:', resp.status_code)
if resp.status_code != 200:
print('Push failed!')
time.sleep(10)
a_dict['is_send'] = ''
db_storage.insert_one(a_dict)
return
except:
print('Push failed!')
time.sleep(10)
a_dict['is_send'] = ''
db_storage.insert_one(a_dict)
return
db_storage.insert_one(a_dict)
def save_data(dic_news):
aaa_dic = {
'附件id':dic_news['attachmentIds'],
'网址':dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate']
}
db_storage.insert_one(aaa_dic)
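# Hedged sketch, not part of this commit: a unique index on '网址' would make the
# find_one dedup checks below safe against concurrent duplicate inserts;
# insert_one would then raise pymongo.errors.DuplicateKeyError on a repeated URL.
#   db_storage.create_index('网址', unique=True)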
def sendKafka(dic_news):
start_time = time.time()
@@ -475,7 +406,7 @@ from urllib.parse import urljoin
# Beijing
def bei_jing():
id_list = []
num = 0
start_time = time.time()
# The site has anti-scraping measures, so Selenium is required
@@ -521,37 +452,56 @@ def bei_jing():
break
updown.click()
time.sleep(2)
for href in hrefs[4:6]:
log.info(f'------{len(hrefs)} records in total-------------')
num = 0
for href in hrefs:
id_list = []
title = href[1]
# TODO: comment out the dedup check below when testing
# Skip pages that have already been crawled
# is_href = db_storage.find_one({'网址': href[0]})
# if is_href:
# continue
is_href = db_storage.find_one({'网址': href[0]})
if is_href:
log.info('Already collected----------skipping')
continue
# Request the detail page
bro.get(href[0])
time.sleep(1)
# Extract the required fields
pub = bro.find_element(By.CLASS_NAME, 'doc-info')
topic = str(pub.text).split('[主题分类] ')[1].split('\n')[0].strip()
# Issuing authority
organ = str(pub.text).split('[发文机构] ')[1].split('\n')[0].strip()
pub_time = str(pub.text).split('[发布日期] ')[1].split('[有效性] ')[0].strip().lstrip()
pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
writtenDate = str(pub.text).split('[成文日期] ')[1].split('\n')[0].strip()
# pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').lstrip().strip()
try:
pub_list = bro.find_elements(By.CLASS_NAME,'article-info')
for source in pub_list:
if '来源' in source.text:
pub_source = source.text.split('来源:')[1].split('\n')[0]
# print(pub_source)
except:
pub_source = ''
#.split('来源:')[1]
if '号' not in pub_hao:
pub_hao = ''
cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
soup_cont = BeautifulSoup(cont,'lxml')
soup = paserUrl(soup_cont, href[0])
text = str(soup.prettify())
# print(text)
# TODO: remove the "scan with your phone" QR widget
soup.find('div',id='div_div').decompose()
# print(title)
num = 0
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
num+=1
file_href = file['href']
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href[0]}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
......@@ -567,45 +517,46 @@ def bei_jing():
# TODO: write the returned file-server address back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
id_ = redefid(id_list)
# id_ = redefid(id_list)
# TODO: after the links are replaced, upload the attachments to the file server
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# TODO: fields pushed to Kafka
dic_news = {
'attachmentIds': id_,
'attachmentIds': id_list,
'author': '',
'content': str(soup_cont.text),
'contentWithTag': str(soup_cont),
'content': str(soup.text),
'contentWithTag': str(soup),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': pub_hao,
'topicClassification': '',
'organ': organ,
'topicClassification': topic,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': '',
'sourceAddress': href[0],
'summary': '',
'title': title
}
print(dic_news)
# sendKafka(dic_news)
# print(dic_news)
sendKafka(dic_news)
save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
end_time = time.time()
print(f'Crawled {num} records in {end_time - start_time} seconds')
bro.quit()
except Exception as e:
print(e)
log.info(e)
pass
end_time = time.time()
print(f'Crawled {num} records in {end_time - start_time} seconds')
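# Hedged refactor sketch (helper name and extension list are assumptions, not in
# the original): the mixed-case extension checks in bei_jing()/nei_meng_gu() can
# collapse into one case-insensitive suffix test. Note the original uses substring
# checks, so this version is slightly stricter for URLs with trailing query strings.
ATTACHMENT_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt')

def is_attachment(file_href):
    # Lowercase once so '.PDF', '.DOC', '.XLS', etc. are matched as well
    return file_href.lower().endswith(ATTACHMENT_EXTS)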
# Inner Mongolia
def nei_meng_gu():
id_list = []
start = time.time()
num = 0
url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
@@ -617,6 +568,7 @@ def nei_meng_gu():
result = soup.find(class_='right_two')
li_list = result.find_all(class_='font14wr')
for a in li_list[:1]:
id_list = []
a_text = str(a)
real_href = 'https://gzw.nmg.gov.cn/zfxxgk' + a_text.split('href="..')[-1].split('" target="_blank')[0]
# # Skip pages that have already been crawled
@@ -631,13 +583,19 @@ def nei_meng_gu():
href_text.encoding = 'utf-8'
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# TODO: convert relative <a>-tag paths in the html to absolute paths
i_soup = paserUrl(i_soup,real_href)
i_result = i_soup.find('div', id='d_laiyuan')
time_ = i_result.find_all('span')[0]
time_ = str(time_)
pub_time = time_.split('<span>')[1].split('</span>')[0].replace('发布时间:', '')
source = i_result.find_all('span')[1]
source = str(source)
pub_source = source.split('<span>')[1].split('</span>')[0].replace('来源:', '')
# Publishing source
origin = i_result.find_all('span')[1]
origin = str(origin)
pub_source = origin.split('<span>')[1].split('</span>')[0].replace('来源:', '')
# Issuing authority: use the parsed source text rather than the raw <span> markup
organ = pub_source
fwzh = i_soup.find_all('td')[7]
pub_hao_result = re.findall('〔(.*?)〕', str(fwzh))
if len(pub_hao_result) == 0:
@@ -647,16 +605,19 @@ def nei_meng_gu():
pub_hao = str(fwzh).split('<td>')[1].split('</td>')[0]
else:
pub_hao = ''
# Date of writing
writtenDate = i_soup.find_all('td')[9].text
topicClassification = i_soup.find_all('td')[3].text
i_content = str(i_soup.find(class_='d_show'))
if i_content:
content = i_content
else:
i_content = str(i_soup.find(class_='view TRS_UEDITOR trs_paper_default'))
content = i_content
fujian = i_soup.find_all(class_='ql_detailbro_right_qztp')
# TODO: Inner Mongolia attachments are not in the article body; they load asynchronously, so the tags cannot be rewritten; the attachments can still be uploaded to the att table
fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian))
fu_jian_href_list = []
# fu_jian_result = fujian.find('a')['href']
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
@@ -664,58 +625,53 @@ def nei_meng_gu():
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re
fu_jian_href_list.append(fu_jian_href)
# TODO: attachments must be uploaded to the file server, type_id: 7
result_dict = {
'标题': title,
'来源': pub_source,
'号': pub_hao,
'内容': content,
'附件网址': fu_jian_href_list,
'发布时间': pub_time,
'网址': real_href,
'归属': '内蒙古自治区国资委',
}
# TODO: upload the attachment to the file server
retData = baseCore.uploadToserver(fu_jian_href,'1669')
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
id_list.append(att_id)
# # TODO: write the returned address back into the soup
# fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
print(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
id = baseCore.getNextSeq()
# TODO: fields pushed to Kafka
dic_news = {
'attachmentIds': "14,15,16",
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': content,
'createDate': time_now,
'deleteFlag': 0,
'id': id,
'id': '',
'labels':[{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': pub_hao,
'topicClassification': '',
'origin': pub_source,  # parsed source text, not the raw span HTML
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate':pub_time,
'sid':'0987654321',
'sourceAddress':'',
'writtenDate':writtenDate,
'sid':'1697458829758697473',
'sourceAddress':real_href,
'summary':'',
'title':title
}
sendKafka(dic_news)
print(id)
id_list.append(id)
# save_data(result_dict)
save_data(dic_news)
num = num + 1
break
except:
pass
except:
pass
print(id_list)
end = time.time()
print('Total', num, 'records', '...........', 'elapsed', end - start, 'seconds')
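# Hedged alternative, not in this commit: urljoin is already imported above, so the
# './'-relative attachment paths could be resolved without manual string surgery:
#   fu_jian_href = urljoin(real_href, fu_jian_re)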
# Jilin
def ji_lin():
start = time.time()
@@ -3950,9 +3906,9 @@ if __name__ == '__main__':
# get_content1()
# get_content2()
# get_content3()
bei_jing()
# bei_jing()
# nei_meng_gu()
# ji_lin()
ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
......