Commit 862e97ab  Author: 薛凌堃

1/31

Parent 1d1053c8
@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
             break
         except Exception as e:
             time.sleep(3)
+            log.info(e)
             continue
     if page_size < 1:
@@ -206,7 +207,8 @@ def download(data, order_by,header):
         come = data['come']
     except:
         come = ''
+    if publishDate < '2024-01-29':
+        return
     tf_url = add_check_url(sourceAddress)
     if tf_url:
         dic_result = {
@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
     #     qianyanzhishiku()
     # except Exception as e:
     #     pass
-    try:
-        log.info('shijiejingjiluntan')
-        shijiejingjiluntan()
-    except Exception as e:
-        log.info(e)
-        pass
+    # try:
+    #     log.info('shijiejingjiluntan')
+    #     shijiejingjiluntan()
+    # except Exception as e:
+    #     log.info(e)
+    #     pass
     # try:
     #     log.info('dongfangcaifu')
     #     dongfangcaifu()
@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
     # except Exception as e:
     #     log.info(e)
     #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu4')
-    #     dongfangcaifu4()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu5')
-    #     dongfangcaifu5()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu6')
-    #     dongfangcaifu6()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
-    #
-    # try:
-    #     log.info('dongfangcaifu7')
-    #     dongfangcaifu7()
-    # except Exception as e:
-    #     log.info(e)
-    #     pass
+
+    try:
+        log.info('dongfangcaifu4')
+        dongfangcaifu4()
+    except Exception as e:
+        log.info(e)
+        pass
+
+    try:
+        log.info('dongfangcaifu5')
+        dongfangcaifu5()
+    except Exception as e:
+        log.info(e)
+        pass
+
+    try:
+        log.info('dongfangcaifu6')
+        dongfangcaifu6()
+    except Exception as e:
+        log.info(e)
+        pass
+
+    try:
+        log.info('dongfangcaifu7')
+        dongfangcaifu7()
+    except Exception as e:
+        log.info(e)
+        pass
import json
import sys

import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}
def two_dfsm_mtgc():
    """
    地方扫描 (crawl the "local scan" column of the SASAC site)
    """
    info_list = []
    url_list = ['http://www.sasac.gov.cn/n2588025/n2588129/index.html',
                # 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
                ]
    for url in url_list:
        # read maxPageNum out of the paging <td> so the archive page URLs can be built
        res = requests.get(url=url, headers=headers)
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        pages = soup.find('td', class_='pages')
        pages_tag = pages['id'].split('pag_')[1]
        pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
        # print(pages)
        # for page in range(378, int(pages)+1):
        for page in range(1, 378):
            log.info(f'==============开始采集第{page}页===============')
            if page == 1:
                url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
            else:
                url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
            try:
                res = requests.get(url=url, headers=headers)
            except:
                continue
            res.encoding = res.apparent_encoding
            res_text = res.text
            soup = BeautifulSoup(res_text, 'html.parser')
            li_list = soup.find('span', id=f'comp_{pages_tag}')
            if li_list:
                li_list = li_list.find_all('li')
            else:
                li_list = soup.find_all('li')
            for li in li_list:
                # print(type(li))
                if len(li):
                    a = li.find('a')
                    # print(a)
                    href = a['href']
                    if 'http' not in href:
                        href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                    # print(href)
                    try:
                        # skip links already recorded in Redis
                        flag = r.sismember('IN-20240129-0019-test', href)
                        if flag:
                            log.info('信息已采集入库过')
                            continue
                        # else:
                        #     log.info(f'未采到----{page}-----{href}')
                        #     continue
                    except Exception as e:
                        continue
                    # href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
                    try:
                        title = a['title']
                    except:
                        title = ''
                    # print(title)
                    try:
                        res_href = requests.get(url=href, headers=headers, verify=False)
                    except:
                        continue
                    res_href.encoding = res_href.apparent_encoding
                    href_text = res_href.text
                    i_soup = BeautifulSoup(href_text, 'html.parser')
                    result = i_soup.find(class_='zsy_cotitle')
                    try:
                        if result:
                            result = result.find('p').text
                            pub_source = result.split('发布时间:')[0].replace('文章来源:', '').strip()
                            pub_time = result.split('发布时间:')[1]
                            # print(pub_source, pub_time)
                            try:
                                i_soup.find('div', id='div_div').decompose()
                                i_soup.find('div', id='qr_container').decompose()
                            except:
                                pass
                            contentWithTag = str(i_soup.find(class_='zsy_comain'))
                            content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                        else:
                            result = i_soup.find(class_='lyshijian').find_all('span')
                            try:
                                pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        if title == '':
                            log.info(f'title为空----{page}--{title}--{href}')
                            continue
                        info_code = 'IN-20240129-0019'
                        result_dict = {
                            'id': '',
                            'sid': '1751849444877144065',
                            'title': title,
                            'organ': pub_source,
                            'origin': '国务院国有资产监督管理委员会',
                            # '摘要': zhaiyao,
                            'source': 16,
                            'content': content,
                            'contentWithTag': contentWithTag,
                            'publishDate': pub_time,
                            'sourceAddress': href,
                        }
                        log.info(f'{page}--{title}--{href}')
                        # info_list.append(result_dict)
                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                        try:
                            kafka_result = producer.send("crawlerInfo",
                                                         json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                            r.sadd(info_code + '-test', href)
                            log.info('发送kafka成功!')
                        except Exception as e:
                            log.info(e)
                        finally:
                            producer.close()
                    except:
                        continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    two_dfsm_mtgc()
\ No newline at end of file
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


# 国资要闻 (key state-owned-assets news column)
def gzyw():
    info_list = []
    url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    res_text = res.text
    soup = BeautifulSoup(res_text, 'html.parser')
    # pages = soup.find('td', id='pag_4278129')
    pages = soup.find('td', class_='pages')
    pages_tag = pages['id'].split('pag_')[1]
    pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
    # print(pages)
    for page in range(1, int(pages)+1):
        log.info(f'==============开始采集第{page}页===============')
        if page == 1:
            url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
        else:
            # http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
            url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
        try:
            res = requests.get(url=url, headers=headers)
        except:
            continue
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        li_list = soup.find('span', id=f'comp_{pages_tag}')
        if li_list:
            li_list = li_list.find_all('li')
        else:
            li_list = soup.find_all('li')
        for li in li_list:
            # print(type(li))
            if len(li):
                a = li.find('a')
                # print(a)
                href = a['href']
                if 'http' not in href:
                    href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                # print(href)
                try:
                    flag = r.sismember('IN-20240129-0002-test', href)
                    if flag:
                        # log.info('信息已采集入库过')
                        continue
                    # else:
                    #     log.info(f'未采到----{page}-----{href}')
                except Exception as e:
                    continue
                try:
                    title = a['title']
                except:
                    title = ''
                # print(title)
                try:
                    res_href = requests.get(url=href, headers=headers, verify=False)
                except:
                    continue
                res_href.encoding = res_href.apparent_encoding
                href_text = res_href.text
                i_soup = BeautifulSoup(href_text, 'html.parser')
                result = i_soup.find(class_='zsy_cotitle')
                try:
                    if result:
                        result_ = result.find('p').text
                        pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
                        pub_time = result_.split('发布时间:')[1]
                        # print(pub_source, pub_time)
                        if title == '':
                            result.find('p').decompose()
                            title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
                        try:
                            i_soup.find('div', id='div_div').decompose()
                            i_soup.find('div', id='qr_container').decompose()
                        except:
                            pass
                        contentWithTag = str(i_soup.find(class_='zsy_comain'))
                        content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                    else:
                        result = i_soup.find(class_='lyshijian')
                        if result:
                            result_ = result.find_all('span')
                            try:
                                pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            if title == '':
                                result.find('p').decompose()
                                title = result.text.strip()
                            contentWithTag = str(i_soup.find(class_='articlecontent'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        else:
                            result = i_soup.find(class_='pages-date')
                            pub_source = result.find('span').text.replace('来源:', '').strip()
                            pub_time = result.text
                            pub_time = pub_time.split('来源')[0].strip()
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='pages_content').text)
                            # content = str(i_soup.find(class_='articlecontent').text)
                    if title == '':
                        log.info(f'title为空----{page}--{title}--{href}')
                        continue
                    # zhaiyao = HanLP.extractSummary(content, 6)
                    info_code = 'IN-20240129-0002'
                    result_dict = {
                        'id': '',
                        'sid': '1751810519211053058',
                        'title': title,
                        'organ': pub_source,
                        'origin': '国务院国有资产监督管理委员会',
                        # '摘要': zhaiyao,
                        'source': 16,
                        'content': content,
                        'contentWithTag': contentWithTag,
                        'publishDate': pub_time,
                        'sourceAddress': href,
                    }
                    log.info(f'{page}--{title}--{href}')
                    # info_list.append(result_dict)
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    try:
                        kafka_result = producer.send("crawlerInfo",
                                                     json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                        r.sadd(info_code + '-test', href)
                        log.info('发送kafka成功!')
                    except Exception as e:
                        log.info(e)
                    finally:
                        producer.close()
                except:
                    continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    gzyw()
\ No newline at end of file
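Reviewer note: two_dfsm_mtgc() and gzyw() repeat the same Redis-dedup-then-Kafka-send block verbatim. A minimal sketch of a shared helper is below; the name send_if_new and its placement are assumptions added for illustration, not part of this commit, while the broker address, topic and "-test" key suffix are the ones already used above.

# Sketch only: a hypothetical helper both crawlers could call instead of
# repeating the dedup + Kafka block. The Redis client `r`, the logger and the
# broker address are the ones already used in these scripts.
import json
from kafka import KafkaProducer


def send_if_new(r, log, info_code, href, result_dict):
    """Skip URLs already recorded in Redis; otherwise push the record to Kafka."""
    if r.sismember(info_code + '-test', href):
        log.info('信息已采集入库过')  # already collected
        return False
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        producer.send('crawlerInfo',
                      json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
        r.sadd(info_code + '-test', href)
        log.info('发送kafka成功!')
        return True
    except Exception as e:
        log.info(e)
        return False
    finally:
        producer.close()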
"""
中证智能财讯
"""
import json
import requests
from bs4 import BeautifulSoup
def zzcx():
    url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
    payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Length': '56',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Origin': 'https://zzcx.cs.com.cn',
        'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
    }
    payload = json.dumps(payload)
    result_json = requests.post(url=url, data=payload, headers=headers).json()
    print(result_json)
    pages = result_json['data']['pages']
    for page in range(1, int(pages + 1)):
        payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
        payload_page = json.dumps(payload_page)
        datas = requests.post(url=url, data=payload_page, headers=headers)
        records = datas.json()['data']['records']
        for news in records:
            title = news['title']
            news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
            news_req = requests.get(url=news_url, headers=headers)
            news_soup = BeautifulSoup(news_req.content, 'html.parser')
            detail_info = news_soup.find('div', class_='subTitle___svblj')
            div_list = detail_info.find_all('div')
            origin = div_list[0].text
            publishDate = div_list[1].text


if __name__ == "__main__":
    zzcx()
\ No newline at end of file
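Reviewer note: zzcx() parses title, origin and publishDate from each detail page but does not yet deduplicate or publish anything. If it is meant to feed the same pipeline as the SASAC crawlers, the missing tail of the loop might look roughly like the sketch below; the sid value is a placeholder rather than a real channel id from this commit, and content/contentWithTag extraction is still missing.

# Hypothetical continuation of the `for news in records:` loop in zzcx(),
# assembling the parsed fields into the result_dict shape used by the other
# crawlers in this commit. 'SID_PLACEHOLDER' is a placeholder value.
result_dict = {
    'id': '',
    'sid': 'SID_PLACEHOLDER',
    'title': title,
    'origin': origin,
    'source': 16,
    'publishDate': publishDate,
    'sourceAddress': news_url,
}
# A Redis dedup check and a KafkaProducer send, as in gzyw(), would follow here.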
@@ -85,7 +85,8 @@ class ClassTool():
             '来源': dic_news['labels'][0]['relationName'],
             '创建时间': dic_news['createDate'],
             '带标签内容': dic_news['contentWithTag'][:100],
-            '发布时间': dic_news['publishDate']
+            '发布时间': dic_news['publishDate'],
+            '标题': dic_news['title']
         }
         self.db_storage.insert_one(aaa_dic)
@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
 #
 #     code = use_ocr(out_img_path)
 #     验证码输入框元素.send_keys(code)
-import requests
-headers = {
-    # 'Accept': '*/*',
-    # 'Accept-Encoding': 'gzip, deflate, br',
-    # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
-    # 'Cache-Control': 'no-cache',
-    # 'Connection': 'keep-alive',
-    # 'Host': 'search-api-web.eastmoney.com',
-    # 'Pragma': 'no-cache',
-    # 'Sec-Fetch-Dest': 'script',
-    # 'Sec-Fetch-Mode': 'no-cors',
-    # 'Sec-Fetch-Site': 'same-site',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
-    # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
-    # 'sec-ch-ua-mobile': '?0',
-    # 'sec-ch-ua-platform': '"Windows"'
-}
-url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
-
-
-# res = requests.get(url).text[1:-1]
-res = requests.get(url=url, headers=headers)
-with open('./a.pdf','wb') as f:
-    f.write(res.content)
\ No newline at end of file
+# import requests
+# headers = {
+#     # 'Accept': '*/*',
+#     # 'Accept-Encoding': 'gzip, deflate, br',
+#     # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
+#     # 'Cache-Control': 'no-cache',
+#     # 'Connection': 'keep-alive',
+#     # 'Host': 'search-api-web.eastmoney.com',
+#     # 'Pragma': 'no-cache',
+#     # 'Sec-Fetch-Dest': 'script',
+#     # 'Sec-Fetch-Mode': 'no-cors',
+#     # 'Sec-Fetch-Site': 'same-site',
+#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
+#     # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
+#     # 'sec-ch-ua-mobile': '?0',
+#     # 'sec-ch-ua-platform': '"Windows"'
+# }
+# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
+#
+#
+# # res = requests.get(url).text[1:-1]
+# res = requests.get(url=url, headers=headers)
+# with open('./a.pdf','wb') as f:
+#     f.write(res.content)
+import datetime
+import json
+import requests
+import pymongo
+from base import BaseCore
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+
+
+db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
+    '数据源_0504']
+
+datas = db_storage.find({'postCode':'2'}).limit(5)
+for data in datas:
+    title = data['titleForeign']
+    contentWithTag = data['richTextForeign']
+    summary = data['contentForeign']
+    dic_info = {
+        'title':title,
+        'summary':summary,
+        'contentWithTag':contentWithTag
+    }
+    headers = {
+        'Content-Type': 'application/json',
+    }
+    dic_info_ = json.dumps(dic_info)
+    # print(dic_info_)
+    # with open('./data.json','w') as f:
+    #     f.write(dic_info_)
+    # break
+    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
+    req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
+    log.info(req.text)
\ No newline at end of file