Commit c52051f8 Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

import datetime
import json
import random
import re
import time
import pymysql
import requests
from bs4 import BeautifulSoup
import sys
sys.path.append('D:\\建材')
import BaseCore
from kafka import KafkaProducer
from requests.adapters import HTTPAdapter
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
}
session = requests.session()
session.mount('https://', HTTPAdapter(max_retries=3))
session.mount('http://', HTTPAdapter(max_retries=3))
# Records from August onwards
# Last three months
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=2&DEAL_TIME=05&DEAL_CLASSIFY=01&DEAL_STAGE=0100&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=1&DEAL_TIME=05&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# Current day
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=1&DEAL_TIME=01&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=2&DEAL_TIME=01&DEAL_CLASSIFY=01&DEAL_STAGE=0100&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
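
# Illustrative sketch (not part of the original script): the query strings documented
# above could be assembled from a dict with urllib.parse.urlencode instead of being
# edited by string replacement. Field names and values mirror the sample query strings
# above; build_params is a hypothetical helper and is not used elsewhere in this file.
from urllib.parse import urlencode

def build_params(begin, end, source_type, page):
    # Returns a URL-encoded query string for dealList_find.jsp under the assumed fields
    return urlencode({
        'TIMEBEGIN_SHOW': begin, 'TIMEEND_SHOW': end,
        'TIMEBEGIN': begin, 'TIMEEND': end,
        'SOURCE_TYPE': source_type, 'DEAL_TIME': '06',
        'DEAL_CLASSIFY': '00', 'DEAL_STAGE': '0000',
        'DEAL_PROVINCE': '0', 'DEAL_CITY': '0',
        'DEAL_PLATFORM': '0', 'BID_PLATFORM': '0',
        'DEAL_TRADE': '0', 'isShowAll': '1',
        'PAGENUMBER': page, 'FINDTXT': '',
    })
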
# Send a record to Kafka
def sendKafka(dic_news):
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("tenderClusterData",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        print(kafka_result.get(timeout=10))
        log.info(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输成功")
    except Exception as e:
        log.error(f'{e}')
        log.error(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输失败")

# Get the total number of pages
def getTotal(url, params):
    try:
        req_ = session.post(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        total = res['ttlpage']
    except:
        # On failure, re-establish the session by visiting the portal home page, then retry once
        session.close()
        url_ = 'http://www.ggzy.gov.cn/'
        session.get(url=url_, headers=headers, timeout=30)
        req_ = session.post(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        total = res['ttlpage']
    return total

# Get all records on one page
def getDatas(url, params):
    try:
        req_ = session.get(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        datas = res['data']
    except:
        # On failure, re-establish the session and retry once
        session.close()
        url_ = 'http://www.ggzy.gov.cn/'
        session.get(url=url_, headers=headers, timeout=30)
        req_ = session.get(url, params=params, timeout=30)
        req_.encoding = req_.apparent_encoding
        res = json.loads(req_.text)
        datas = res['data']
    return datas

# Get the document number and title from the detail page
def getTaN(href):
    try:
        req_content = session.get(href, timeout=30)
        req_content.encoding = req_content.apparent_encoding
        soup = BeautifulSoup(req_content.text, 'html.parser')
        if '编号:' not in req_content.text:
            contentNo = ''
        else:
            contentNo = \
                soup.select('body > div.fully > p')[0].text.split('信息来源:')[0].split(
                    '编号:')[1]
        title = soup.select('body > div.fully > h4')[0].text
    except:
        # On failure, re-establish the session and retry once
        session.close()
        url_ = 'http://www.ggzy.gov.cn/'
        session.get(url=url_, headers=headers, timeout=30)
        req_content = session.get(href, timeout=30)
        req_content.encoding = req_content.apparent_encoding
        soup = BeautifulSoup(req_content.text, 'html.parser')
        if '编号:' not in req_content.text:
            contentNo = ''
        else:
            contentNo = \
                soup.select('body > div.fully > p')[0].text.split('信息来源:')[0].split(
                    '编号:')[1]
        title = soup.select('body > div.fully > h4')[0].text
    return contentNo, title

# Parse one record into the output dict
def getData(data):
    # pub_time = data['timeShow']
    province = data['districtShow']
    href = data['url']
    href_ = href.replace('/information/html/a', '/html/b')
    infoType = data['stageShow']
    businessType = data['classifyShow']
    origin = data['platformName']
    trade = data['tradeShow']
    contentNo, title_1 = getTaN(href)
    req_content = session.get(href_, timeout=30)
    req_content.encoding = req_content.apparent_encoding
    soup = BeautifulSoup(req_content.text, 'lxml')
    pub_time = soup.select('body > div > p > span')[0].text.split(':')[1].lstrip().strip()
    title_2 = soup.select('body > div > h4')[0].text.replace('\n', '').replace('\r', '')
    contentWithTag = soup.select('body > div')[0]
    content = contentWithTag.text
    data = {
        'businessType': businessType,  # Business type
        'infoType': infoType,  # Information type
        'trade': trade,  # Industry
        'province': province,  # Province
        'origin': origin,  # Source website
        'isAbroad': '1',  # 1 = domestic, 2 = overseas
        'contentNo': contentNo,  # Document number
        'content': content,  # Content (without tags)
        'contentWithTag': str(contentWithTag),  # Content (with tags)
        'sid': '1699606095238131714',  # Source id
        'publishDate': pub_time,  # Publish time
        'sourceAddress': href,  # Original link
        'title': title_1,  # Title
        'subtitle': title_2  # Subtitle
    }
    return data

def zb():
    now = str(datetime.date.today())
    past = str(datetime.date.today() - datetime.timedelta(days=50))
    num = 0
    url = 'http://www.ggzy.gov.cn/'
    req = session.get(url=url, headers=headers, timeout=30)
    if req.status_code != 200:
        log.error('网站连接失败')
        return
    url_ = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp'
    # Query window: 2023-10-01 through today
    params_s = [
        f'TIMEBEGIN_SHOW=2023-10-01&TIMEEND_SHOW={now}&TIMEBEGIN=2023-10-01&TIMEEND={now}&SOURCE_TYPE=1&DEAL_TIME=06&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=',
        f'TIMEBEGIN_SHOW=2023-10-01&TIMEEND_SHOW={now}&TIMEBEGIN=2023-10-01&TIMEEND={now}&SOURCE_TYPE=2&DEAL_TIME=06&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=']
    for params_ in params_s:
        total = getTotal(url_, params_)
        for page in range(total):
            datas = getDatas(url_, params_)
            for data_ in datas:
                try:
                    data = getData(data_)
                    # print(data['contentWithTag'])
                    sendKafka(data)
                    # print(data)
                    num += 1
                except:
                    # Re-warm the session via the home page and retry the record once
                    session.close()
                    session.get(url=url, headers=headers, timeout=30)
                    data = getData(data_)
                    # print(data['contentWithTag'])
                    sendKafka(data)
                    num += 1
                time.sleep(0.5)
            # Advance PAGENUMBER in the query string for the next page
            params_ = params_.replace(f'PAGENUMBER={page + 1}', f'PAGENUMBER={page + 2}')
    log.info(f"共采集{num}条数据")

if __name__ == '__main__':
    zb()
    # current_time = datetime.datetime.now()
    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
    # sleep_seconds = (midnight_time - current_time).total_seconds()
    # time.sleep(sleep_seconds)
    baseCore.close()