Commit c52051f8 Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

import datetime
import json
import random
import re
import time
import pymysql
import requests
from bs4 import BeautifulSoup
import sys
sys.path.append('D:\\建材')
import BaseCore
from kafka import KafkaProducer
from requests.adapters import HTTPAdapter
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
}
session = requests.session()
session.mount('https://', HTTPAdapter(max_retries=3))
session.mount('http://', HTTPAdapter(max_retries=3))
# Records from August onwards
# Last three months
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=2&DEAL_TIME=05&DEAL_CLASSIFY=01&DEAL_STAGE=0100&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=1&DEAL_TIME=05&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# Current day
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=1&DEAL_TIME=01&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=2&DEAL_TIME=01&DEAL_CLASSIFY=01&DEAL_STAGE=0100&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
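
# Illustrative sketch (not part of the original script): the query strings documented
# above could be assembled from a dict with urllib.parse.urlencode instead of being
# edited by string replacement. Field names and values mirror the sample query strings
# above; build_params is a hypothetical helper and is not used elsewhere in this file.
from urllib.parse import urlencode

def build_params(begin, end, source_type, page):
    # Returns a URL-encoded query string for dealList_find.jsp under the assumed fields
    return urlencode({
        'TIMEBEGIN_SHOW': begin, 'TIMEEND_SHOW': end,
        'TIMEBEGIN': begin, 'TIMEEND': end,
        'SOURCE_TYPE': source_type, 'DEAL_TIME': '06',
        'DEAL_CLASSIFY': '00', 'DEAL_STAGE': '0000',
        'DEAL_PROVINCE': '0', 'DEAL_CITY': '0',
        'DEAL_PLATFORM': '0', 'BID_PLATFORM': '0',
        'DEAL_TRADE': '0', 'isShowAll': '1',
        'PAGENUMBER': page, 'FINDTXT': '',
    })
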
# Send a record to Kafka
def sendKafka(dic_news):
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("tenderClusterData",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        log.info(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输成功")
    except Exception as e:
        log.error(f'{e}')
        log.error(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输失败")

# Get the total number of pages
def getTotal(url, params):
    try:
        req_ = session.post(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        total = res['ttlpage']
    except:
        # On failure, re-establish the session by visiting the portal home page, then retry once
        session.close()
        url_ = 'http://www.ggzy.gov.cn/'
        session.get(url=url_, headers=headers, timeout=30)
        req_ = session.post(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        total = res['ttlpage']
    return total

# Get all records on one page
def getDatas(url, params):
    try:
        req_ = session.get(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        datas = res['data']
    except:
        # On failure, re-establish the session and retry once
        session.close()
        url_ = 'http://www.ggzy.gov.cn/'
        session.get(url=url_, headers=headers, timeout=30)
        req_ = session.get(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        datas = res['data']
    return datas

# Get the document number and title from the detail page
def getTaN(href):
    try:
        req_content = session.get(href, timeout=30)
        req_content.encoding = req_content.apparent_encoding
        soup = BeautifulSoup(req_content.text, 'html.parser')
        if '编号:' not in req_content.text:
            contentNo = ''
        else:
            contentNo = \
                soup.select('body > div.fully > p')[0].text.split('信息来源:')[0].split(
                    '编号:')[1]
        title = soup.select('body > div.fully > h4')[0].text
    except:
        # On failure, re-establish the session and retry once
        session.close()
        url_ = 'http://www.ggzy.gov.cn/'
        session.get(url=url_, headers=headers, timeout=30)
        req_content = session.get(href, timeout=30)
        req_content.encoding = req_content.apparent_encoding
        soup = BeautifulSoup(req_content.text, 'html.parser')
        if '编号:' not in req_content.text:
            contentNo = ''
        else:
            contentNo = \
                soup.select('body > div.fully > p')[0].text.split('信息来源:')[0].split(
                    '编号:')[1]
        title = soup.select('body > div.fully > h4')[0].text
    return contentNo, title

# Parse one record into the output dict
def getData(data):
    # pub_time = data['timeShow']
    province = data['districtShow']
    href = data['url']
    href_ = href.replace('/information/html/a', '/html/b')
    infoType = data['stageShow']
    businessType = data['classifyShow']
    origin = data['platformName']
    trade = data['tradeShow']
    contentNo, title_1 = getTaN(href)
    req_content = session.get(href_, timeout=30)
    req_content.encoding = req_content.apparent_encoding
    soup = BeautifulSoup(req_content.text, 'lxml')
    pub_time = soup.select('body > div > p > span')[0].text.split(':')[1].lstrip().strip()
    title_2 = soup.select('body > div > h4')[0].text.replace('\n', '').replace('\r', '')
    contentWithTag = soup.select('body > div')[0]
    content = contentWithTag.text
    data = {
        'businessType': businessType,  # Business type
        'infoType': infoType,  # Information type
        'trade': trade,  # Industry
        'province': province,  # Province
        'origin': origin,  # Source website
        'isAbroad': '1',  # 1 = domestic, 2 = overseas
        'contentNo': contentNo,  # Document number
        'content': content,  # Content (without tags)
        'contentWithTag': str(contentWithTag),  # Content (with tags)
        'sid': '1699606095238131714',  # Source id
        'publishDate': pub_time,  # Publish time
        'sourceAddress': href,  # Original link
        'title': title_1,  # Title
        'subtitle': title_2  # Subtitle
    }
    return data

def zb():
    now = str(datetime.date.today())
    past = str(datetime.date.today() - datetime.timedelta(days=50))
    num = 0
    url = 'http://www.ggzy.gov.cn/'
    req = session.get(url=url, headers=headers, timeout=30)
    if req.status_code != 200:
        log.error('网站连接失败')
        return
    url_ = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp'
    # Query window: 2023-10-01 through today
    params_s = [
        f'TIMEBEGIN_SHOW=2023-10-01&TIMEEND_SHOW={now}&TIMEBEGIN=2023-10-01&TIMEEND={now}&SOURCE_TYPE=1&DEAL_TIME=06&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=',
        f'TIMEBEGIN_SHOW=2023-10-01&TIMEEND_SHOW={now}&TIMEBEGIN=2023-10-01&TIMEEND={now}&SOURCE_TYPE=2&DEAL_TIME=06&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=']
    for params_ in params_s:
        total = getTotal(url_, params_)
        for page in range(total):
            datas = getDatas(url_, params_)
            for data_ in datas:
                try:
                    data = getData(data_)
                    # print(data['contentWithTag'])
                    sendKafka(data)
                    # print(data)
                    num += 1
                except:
                    # Re-warm the session via the home page and retry the record once
                    session.close()
                    session.get(url=url, headers=headers, timeout=30)
                    data = getData(data_)
                    # print(data['contentWithTag'])
                    sendKafka(data)
                    num += 1
                time.sleep(0.5)
            # Advance PAGENUMBER in the query string for the next page
            params_ = params_.replace(f'PAGENUMBER={page + 1}', f'PAGENUMBER={page + 2}')
    log.info(f"共采集{num}条数据")

if __name__ == '__main__':
    zb()
    # current_time = datetime.datetime.now()
    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
    # sleep_seconds = (midnight_time - current_time).total_seconds()
    # time.sleep(sleep_seconds)
    baseCore.close()