Commit c9546130  Author: 薛凌堃

Final version of the policies-and-regulations crawler

Parent eeb41ef7
import json
import random
import time
from urllib.parse import urljoin

import pymongo
from kafka import KafkaProducer
from tqdm import tqdm
import pandas as pd
import pymysql
# requests, BeautifulSoup (bs4), urllib3 and the project's baseCore are used below;
# their imports sit in the elided part of this diff.
@@ -12,47 +17,80 @@ log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx
cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
def paserUrl(html, listurl):
    # soup = BeautifulSoup(html, 'html.parser')
    # Collect all <a> and <img> tags
    links = html.find_all(['a', 'img'])
    # Walk the tags and rewrite relative addresses to absolute ones
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
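# A minimal usage sketch of paserUrl (hypothetical snippet, not part of this commit):
#   page = BeautifulSoup('<a href="../n2588030/c123/content.html">政策</a>', 'html.parser')
#   page = paserUrl(page, 'http://www.sasac.gov.cn/n2588020/index.html')
#   # the link now reads http://www.sasac.gov.cn/n2588030/c123/content.html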
def save_data(dic_news):
    aaa_dic = {
        '附件id': dic_news['attachmentIds'],
        '网址': dic_news['sourceAddress'],
        'tid': dic_news['labels'][0]['relationId'],
        '来源': dic_news['labels'][0]['relationName'],
        '创建时间': dic_news['createDate']
    }
    db_storage.insert_one(aaa_dic)
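# Note: the '网址' (URL) field saved above is the key that work() later queries,
# via db_storage.find_one({'网址': news_url}), to skip articles that were already
# collected. A minimal sketch with a hypothetical URL:
#   if db_storage.find_one({'网址': 'http://www.sasac.gov.cn/n2588020/c123/content.html'}):
#       pass  # already crawled -> work() logs '已采集----------跳过' and continues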
def sendKafka(dic_news):
    start_time = time.time()
    try:  # 114.116.116.241
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("policy",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        # Sent successfully; record it in the log
        state = 1
        takeTime = baseCore.getTimeCost(start_time, time.time())
        # return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.error(dic_result)
        e = 'Kafka操作失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
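# For reference, a minimal consumer-side sketch for the "policy" topic
# (hypothetical; the broker address is taken from the producer above):
#   from kafka import KafkaConsumer
#   consumer = KafkaConsumer('policy',
#                            bootstrap_servers=['114.115.159.144:9092'],
#                            value_deserializer=lambda m: json.loads(m.decode('utf8')))
#   for message in consumer:
#       print(message.value['title'], message.value['sourceAddress'])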
def work(href_type, ting_type, relationId):
    ip = baseCore.get_proxy()
    log.info(f'\n================厅局类别==={ting_type}========================')
    if 'http' in href_type:
        url_type = href_type
    else:
        url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../', '')
    # print(url_type)
    i_res = requests.get(url=url_type, headers=headers, proxies=ip)
    i_soup = BeautifulSoup(i_res.content, 'html.parser')
    time.sleep(2)
    news_list = i_soup.find('div', class_='tjywBottom').find_all('li')
    # Article list
    # print('================新闻列表==================')
    for news in tqdm(news_list):
        try:
            news_href = news.find('a')['href']
        except:
@@ -60,55 +98,185 @@ for type in tqdm(list_type[:2]):
        if 'http' in news_href:
            news_url = news_href
        else:
            news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../', '')
        # Skip anything that has already been crawled
        is_href = db_storage.find_one({'网址': news_url})
        if is_href:
            log.info('已采集----------跳过')
            continue
        news_title = news.find('a').text.split('[')[0]
        log.info(f'\n----正在采集: {news_title}-------')
        pub_time = news.find('span').text.replace('[', '').replace(']', '')
        # Article detail page
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Cookie': 'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617',
            'Host': 'www.sasac.gov.cn',
            'Pragma': 'no-cache',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
        }
        # news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
        ii_res = requests.get(url=news_url, headers=header, proxies=ip)
        ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
        # TODO: convert relative paths to absolute ones
        ii_soup = paserUrl(ii_soup, news_url)
        # Strip the "scan QR code" widget
        try:
            ii_soup.find('div', id='qr_container').decompose()
        except:
            pass
        # Strip <style> tags
        for styleTag in ii_soup.find_all('style'):
            styleTag.extract()
        time.sleep(2)
        try:
            news_info = ii_soup.find('div', class_='zsy_cotitle')
        except Exception as e:
            log.error(e)
            news_info = ''
        if news_info:
            try:
                # origin
                pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0].strip()
            except:
                pub_source = ''
            try:
                contentWithTag = ii_soup.find('div', 'zsy_comain')
                content = contentWithTag.text.strip()
            except:
                content = ''
                contentWithTag = ''
            # Skip pages whose extracted text is too short to be a real article
            if len(content) <= 100:
                continue
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_news = {
                'attachmentIds': [],
                'author': '',
                # 'content': content,
                # 'contentWithTag': str(contentWithTag),
                'createDate': time_now,
                'deleteFlag': 0,
                'id': '',
                'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
                'origin': pub_source,
                'organ': '',
                'topicClassification': '',
                'issuedNumber': '',
                'publishDate': pub_time,
                'writtenDate': '',
                'sid': '1697458829758697473',
                'sourceAddress': news_url,
                'summary': '',
                'title': news_title
            }
            sendKafka(dic_news)
            save_data(dic_news)
            log.info(f'{ting_type}-----{news_title}----发送成功')
        else:
            dic_error = {
                '标题': news_title,
                '原文链接': news_url,
                '厅局类别': ting_type
            }
            log.error(dic_error)
# The CCDI/NSC discipline inspection and supervision group stationed at SASAC
def job1(a_type):
    href = a_type['href']
    ting_type = a_type.text
    return href, ting_type
def job():
    url = 'http://www.sasac.gov.cn/n2588020/index.html'
    ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers, proxies=ip)
    soup = BeautifulSoup(res.content, 'html.parser')
    time.sleep(2)
    # List of departments and bureaus
    list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')[:22]
    a_soup = soup.find('div', class_='l-jgkk-right column').find_all('dt')[0]
    a_type = a_soup.text.strip()
    a_href = a_soup.find('a')['href']
    a_id = '1874'
    list_error = []
    num = 0
    start_time = time.time()
    work(a_href, a_type, a_id)
    for type in tqdm(list_type):
        list_news = []
        href_type = type.find('a')['href']
        ting_type = type.find('a').text
        relationId = mapId_dic[ting_type]
        work(href_type, ting_type, relationId)
        num += 1
    end_time = time.time()
    log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
    time.sleep(1)
    # writer.save()
    # df_error = pd.DataFrame(list_error)
    # df_error.to_excel('未采到文章.xlsx', index=False)
if __name__ == '__main__':
    mapId_dic = {
        '办公厅(党委办公厅)': '1643',
        '综合研究局': '1644',
        '政策法规局': '1645',
        '规划发展局': '1646',
        '财务监管与运行评价局': '1647',
        '产权管理局': '1648',
        '企业改革局': '1649',
        '考核分配局': '1650',
        '资本运营与收益管理局': '1651',
        '科技创新和社会责任局': '1652',
        '综合监督局': '1653',
        '监督追责局': '1654',
        '企业领导人员管理一局(董事会工作局)': '1655',
        '企业领导人员管理二局': '1656',
        '党建工作局(党委组织部、党委统战部)': '1657',
        '宣传工作局(党委宣传部)': '1658',
        '国际合作局': '1659',
        '人事局': '1660',
        '机关服务管理局(离退休干部管理局)': '1662',
        '机关党委': '1663',
        '党委巡视工作办公室、国资委巡视组': '1664',
    }
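    # Note: work() resolves relationId via mapId_dic[ting_type], so a bureau name
    # scraped from the page but missing from this map raises KeyError. A defensive
    # lookup (hypothetical alternative, not in this commit) would be:
    #   relationId = mapId_dic.get(ting_type)
    #   if relationId is None:
    #       log.error(f'未配置的厅局: {ting_type}')
    #       continue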
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
        'Host': 'www.sasac.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    try:
        job()
    except Exception as e:
        print(e)
    # Create an ExcelWriter object (legacy Excel export, now disabled)
    # writer = pd.ExcelWriter('国务院厅局.xlsx')
......