Commit aa593218  Author: LiuLiYuan

招标信息采集 (tender information collection) 10/20

Parent: ecb8098f
# 核心工具包
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import pymysql
import redis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
#创建连接池
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
# from fdfs_client.client import get_tracker_conf, Fdfs_client
# tracker_conf = get_tracker_conf('E:\\kkwork\\zzsn_spider\\base\\client.conf')
# client = Fdfs_client(tracker_conf)
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
# self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
# charset='utf8mb4')
# self.__cursor_proxy = self.__cnx_proxy.cursor()
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
#11数据库
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self):
try:
self.cursor.close()
self.cnx.close()
except :
pass
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 3 : 时间戳 1690179526555 精确到毫秒
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
# 获取流水号
def getNextSeq(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return self.getNowTime(2) + str(self.__seq).zfill(3)
# 获取信用代码
def getNextXydm(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
# pick one proxy at random; random.choice avoids an index error when fewer than 4 proxies exist
return random.choice(proxy_list)
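# Usage sketch (added, illustrative): the returned dict can be handed to requests, but
# requests looks proxies up by lowercase scheme, so a lowercased copy is safer:
#   proxy = self.get_proxy()
#   proxies = {'http': proxy['HTTP'], 'https': proxy['HTTPS']}
#   requests.get('http://www.ggzy.gov.cn/', proxies=proxies, timeout=20)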
#字符串截取
def getSubStr(self,str,beginStr,endStr):
if beginStr=='':
pass
else:
begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
if endStr=='':
pass
else:
end=str.rfind(endStr)
if end==-1:
pass
else:
str = str[0:end+1]
return str
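# Illustrative note (added): both markers are located with rfind (last occurrence) and the
# slice keeps only the first character of endStr, so single-character delimiters work best:
#   getSubStr('x[1]y[2]z', '[', ']')  ->  '[2]'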
# 繁体字转简体字
def hant_2_hans(self,hant_str: str):
'''
Function: 将 hant_str 由繁体转化为简体
'''
return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字
def str_have_num(self,str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self,key):
item = self.r.lpop(key)
return item.decode() if item else None
# 获得脚本进程PID
def getPID(self):
PID = os.getpid()
return PID
# 获取本机IP
def getIP(self):
IP = socket.gethostbyname(socket.gethostname())
return IP
def mkPath(self,path):
folder = os.path.exists(path)
if not folder: # 判断是否存在文件夹如果不存在则创建为文件夹
os.makedirs(path) # makedirs 创建文件时如果路径不存在会创建这个路径
else:
pass
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
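# Usage sketch (added, illustrative; the chromedriver path below is a placeholder assumption):
#   driver = self.buildDriver(r'D:\chromedriver\chromedriver.exe', headless=True)
#   driver.get('http://www.ccgp.gov.cn/')
#   html = driver.page_source
#   driver.quit()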
# 根据社会信用代码获取企业信息
def getInfomation(self, social_code):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql,values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
#获取企查查token
def GetToken(self):
#获取企查查token
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
#获取天眼查token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
return token
#检测语言
def detect_language(self, text):
# 使用langid.py判断文本的语言
# langid.classify returns a (language, score) tuple; guard against an empty result
result = langid.classify(text)
if not result or not result[0]:
return 'cn'
return result[0]
#追加接入excel
def writerToExcel(self,detailList,filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl',dtype=str)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent here
combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
# return combined_data
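# Usage sketch (added, illustrative): the target xlsx must already exist because
# pd.read_excel is called on it first; the field names below are placeholders:
#   self.writerToExcel([{'title': 't1', 'href': 'http://example.com'}], 'baidu搜索.xlsx')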
#对失败或者断掉的企业 重新放入redis
def rePutIntoR(self,key,item):
self.r.rpush(key, item)
#增加计数器的值并返回增加后的值
def incrSet(self,key):
# 增加计数器的值并返回增加后的值
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
#获取key剩余的过期时间
def getttl(self,key):
# 获取key的剩余过期时间
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# 判断key是否已过期
if ttl < 0:
# key已过期,将key的值重置为0
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
# #上传至文件服务器,并解析pdf的内容和页数
# def upLoadToServe(self,pdf_url,type_id,social_code):
# headers = {}
# retData = {'state':False,'type_id':type_id,'item_id':social_code,'group_name':'group1','path':'','full_path':'',
# 'category':'pdf','file_size':'','status':1,'create_by':'XueLingKun',
# 'create_time':'','page_size':'','content':''}
# headers['User-Agent'] = self.getRandomUserAgent()
# for i in range(0, 3):
# try:
# resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
# break
# except:
# time.sleep(3)
# continue
# page_size = 0
#
# for i in range(0, 3):
# try:
# result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
# with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
# for page in doc.pages():
# retData['content'] += page.get_text()
# break
# except:
# time.sleep(3)
# continue
# if page_size < 1:
# # pdf解析失败
# print(f'======pdf解析失败=====')
# return retData
# else:
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# retData['state'] = True
# retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
# retData['full_path'] = bytes.decode(result['Remote file_id'])
# retData['file_size'] = result['Uploaded size']
# retData['create_time'] = time_now
# retData['page_size'] = page_size
#
# return retData
def secrchATT(self,item_id,year,type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
#插入到att表 返回附件id
def tableUpdate(self,retData,com_name,year,pdf_name,num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id,year,type_id)
# sel_sql = '''select id,item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
# self.cursor.execute(sel_sql, (item_id, year,type_id))
# selects = self.cursor.fetchone()
if selects:
self.getLogger().info(f'com_name:{com_name}已存在')
id = selects[0]
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id,year,type_id)
id = selects[0]
return id
# 更新企业的CIK
def updateCIK(self,social_code,cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存企业CIK失败=====')
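# ---------------------------------------------------------------------------
# Minimal usage sketch of BaseCore (added for illustration, not part of the
# original commit). Instantiating it opens the MySQL/Redis connections from
# __init__, so it only works inside the original network environment; the
# Redis key below is a placeholder assumption.
#   core = BaseCore()
#   log = core.getLogger()
#   log.info(f'PID={core.getPID()} IP={core.getIP()}')
#   headers = {'User-Agent': core.getRandomUserAgent()}
#   task = core.redicPullData('tender:todo')   # pop one task from a Redis list (placeholder key)
#   seq = core.getNextSeq()                    # yyMMddHHmmss + 3-digit counter
#   core.close()
# ---------------------------------------------------------------------------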
import datetime
import json
import random
import re
import time
import pymysql
import requests
from bs4 import BeautifulSoup
import sys
sys.path.append('D:\\建材')
import BaseCore
from kafka import KafkaProducer
from requests.adapters import HTTPAdapter
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
}
session = requests.session()
session.mount('https://', HTTPAdapter(max_retries=3))
session.mount('http://', HTTPAdapter(max_retries=3))
# 8月以后的
# 三个月
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=2&DEAL_TIME=05&DEAL_CLASSIFY=01&DEAL_STAGE=0100&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=1&DEAL_TIME=05&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# 当天
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=1&DEAL_TIME=01&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
# TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24&TIMEEND=2023-09-02&SOURCE_TYPE=2&DEAL_TIME=01&DEAL_CLASSIFY=01&DEAL_STAGE=0100&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=
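# Added illustration: the raw query strings above can be parsed into a dict to make the
# individual parameters easier to inspect (per the comments above, DEAL_TIME=01 is the
# same-day window and 05 the three-month window; the field names come from the strings).
from urllib.parse import parse_qs

_demo_query = ('TIMEBEGIN_SHOW=2023-08-24&TIMEEND_SHOW=2023-09-02&TIMEBEGIN=2023-08-24'
               '&TIMEEND=2023-09-02&SOURCE_TYPE=1&DEAL_TIME=01&DEAL_CLASSIFY=00&DEAL_STAGE=0000'
               '&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0'
               '&isShowAll=1&PAGENUMBER=1&FINDTXT=')
_demo_params = parse_qs(_demo_query, keep_blank_values=True)
# e.g. _demo_params['DEAL_TIME'] == ['01'] and _demo_params['PAGENUMBER'] == ['1']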
# 发送kafka
def sendKafka(dic_news):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("tenderClusterData",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
log.info(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输成功")
except Exception as e:
log.error(f'{e}')
log.error(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输失败")
# 获取总页数
def getTotal(url, params):
try:
req_ = session.post(url, params=params, timeout=30)
req_.encoding = req_.apparent_encoding
res = json.loads(req_.text)
total = res['ttlpage']
except:
session.close()
url_ = 'http://www.ggzy.gov.cn/'
session.get(url=url_, headers=headers, timeout=30)
req_ = session.post(url, params=params, timeout=30)
req_.encoding = req_.apparent_encoding
res = json.loads(req_.text)
total = res['ttlpage']
return total
# 获取一页所有数据
def getDatas(url, params):
try:
req_ = session.get(url, params=params, timeout=30)
req_.encoding = req_.apparent_encoding
res = json.loads(req_.text)
datas = res['data']
except:
session.close()
url_ = 'http://www.ggzy.gov.cn/'
session.get(url=url_, headers=headers, timeout=30)
req_ = session.get(url, params=params, timeout=30)
req_.encoding = req_.apparent_encoding
res = json.loads(req_.text)
datas = res['data']
return datas
# 获取编号
def getTaN(href):
try:
req_content = session.get(href, timeout=30)
req_content.encoding = req_content.apparent_encoding
soup = BeautifulSoup(req_content.text, 'html.parser')
if '编号:' not in req_content.text:
contentNo = ''
else:
contentNo = \
soup.select('body > div.fully > p')[0].text.split('信息来源:')[0].split(
'编号:')[1]
title = soup.select('body > div.fully > h4')[0].text
except:
session.close()
url_ = 'http://www.ggzy.gov.cn/'
session.get(url=url_, headers=headers, timeout=30)
req_content = session.get(href, timeout=30)
req_content.encoding = req_content.apparent_encoding
soup = BeautifulSoup(req_content.text, 'html.parser')
if '编号:' not in req_content.text:
contentNo = ''
else:
contentNo = \
soup.select('body > div.fully > p')[0].text.split('信息来源:')[0].split(
'编号:')[1]
title = soup.select('body > div.fully > h4')[0].text
return contentNo, title
# 解析数据
def getData(data):
# pub_time = data['timeShow']
province = data['districtShow']
href = data['url']
href_ = href.replace('/information/html/a', '/html/b')
infoType = data['stageShow']
businessType = data['classifyShow']
origin = data['platformName']
trade = data['tradeShow']
contentNo, title_1 = getTaN(href)
req_content = session.get(href_, timeout=30)
req_content.encoding = req_content.apparent_encoding
soup = BeautifulSoup(req_content.text, 'lxml')
pub_time = soup.select('body > div > p > span')[0].text.split(':')[1].lstrip().strip()
title_2 = soup.select('body > div > h4')[0].text.replace('\n', '').replace('\r', '')
contentWithTag = soup.select('body > div')[0]
content = contentWithTag.text
data = {
'businessType': businessType, # 业务类型
'infoType': infoType, # 信息类型
'trade': trade, # 行业
'province': province, # 省份
'origin': origin, # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title_1, # 标题
'subtitle': title_2 # 二级标题
}
return data
def zb():
now = str(datetime.date.today())
past = str(datetime.date.today() - datetime.timedelta(days=50))
num = 0
url = 'http://www.ggzy.gov.cn/'
req = session.get(url=url, headers=headers, timeout=30)
if req.status_code != 200:
log.error('网站连接失败')
return
url_ = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp'
# 当天
params_s = [
f'TIMEBEGIN_SHOW=2023-10-01&TIMEEND_SHOW={now}&TIMEBEGIN=2023-10-01&TIMEEND={now}&SOURCE_TYPE=1&DEAL_TIME=06&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=',
f'TIMEBEGIN_SHOW=2023-10-01&TIMEEND_SHOW={now}&TIMEBEGIN=2023-10-01&TIMEEND={now}&SOURCE_TYPE=2&DEAL_TIME=06&DEAL_CLASSIFY=00&DEAL_STAGE=0000&DEAL_PROVINCE=0&DEAL_CITY=0&DEAL_PLATFORM=0&BID_PLATFORM=0&DEAL_TRADE=0&isShowAll=1&PAGENUMBER=1&FINDTXT=']
for params_ in params_s:
total = getTotal(url_, params_)
for page in range(total):
datas = getDatas(url_, params_)
for data_ in datas:
try:
data = getData(data_)
# print(data['contentWithTag'])
sendKafka(data)
# print(data)
num += 1
except:
session.close()
session.get(url=url, headers=headers, timeout=30)
data = getData(data_)
# print(data['contentWithTag'])
sendKafka(data)
num += 1
time.sleep(0.5)
params_ = params_.replace(f'PAGENUMBER={page + 1}', f'PAGENUMBER={page + 2}')
log.info(f"共采集{num}条数据")
if __name__ == '__main__':
zb()
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
baseCore.close()
import datetime
import json
import re
import time
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
URL = 'http://www.ccgp.gov.cn/'
def sendKafka(dic_news):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("tenderClusterData",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
log.info(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输成功")
except Exception as e:
log.error(f'{e}')
log.error(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输失败")
# 招标结构化信息
def getDataA(session, title, publishDate, province, href):
fj_list = []
city = ''
contentNo = ''
biddingProjectName = ''
biddingItems = ''
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContact = ''
biddingPhone = ''
req = session.get(href)
if req.status_code == 404:
log.error(f'{href}===404')
return
req.encoding = req.apparent_encoding
biddingContent = ''
no_flg = 1
soup = BeautifulSoup(req.text, 'lxml')
div = soup.find('div', class_='table')
tr_list = div.select('tr')
for tr in tr_list:
if '采购项目名称' in tr.text:
biddingProjectName = tr.find_all('td')[-1].text
elif '品目' in tr.text:
biddingItems = tr.find_all('td')[-1].text
elif '采购单位地址' in tr.text:
biddingUnitAddress = tr.find_all('td')[-1].text
elif '采购单位' in tr.text and '采购单位联系方式' not in tr.text:
biddingUnit = tr.find_all('td')[-1].text
elif '开标时间' in tr.text:
bidOpeningTime = tr.find_all('td')[-1].text.replace('年', '-').replace('月', '-').split('日')[0]
elif '开标地点' in tr.text:
bidOpeningAddress = tr.find_all('td')[-1].text
elif '预算金额' in tr.text:
budgetAmount = tr.find_all('td')[-1].text
elif '项目联系人' in tr.text:
biddingContact = tr.find_all('td')[-1].text
elif '采购单位联系方式' in tr.text:
biddingPhone = tr.find_all('td')[-1].text
elif '行政区域' in tr.text:
city = tr.find_all('td')[1].text
contentWithTag = soup.find('div', class_='vF_detail_content_container')
content = contentWithTag.text
if not content:
log.error(f'{href}===解析失败')
return
table = contentWithTag.find('table', class_='Content')
if table:
tr_list = table.find_all('tr')
for i in range(len(tr_list)):
if '项目编号:' in tr_list[i].text:
try:
if no_flg == 1:
contentNo = tr_list[i].text.lstrip().strip()
no_flg = 0
except:
contentNo = ''
elif '采购需求' in tr_list[i].text:
try:
try:
biddingContent = tr_list[i].text.split('采购需求:')[1].lstrip().strip()
except:
biddingContent = tr_list[i + 1].text.lstrip().strip()
except:
biddingContent = ''
else:
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '采购需求:' in p_list[i].text:
try:
biddingContent = p_list[i + 1].text
except:
biddingContent = ''
elif '项目编号:' in p_list[i].text:
try:
if no_flg == 1:
try:
contentNo = p_list[i].text.split('项目编号:')[1].split('项目名称')[0].lstrip().strip().split(' ')[0]
except:
try:
contentNo = \
p_list[i].text.split('项目编号:')[1].split('项目名称')[0].lstrip().strip().split(' ')[0]
except:
contentNo = \
p_list[i].text.split('项目编号):')[1].split('采购项目名称')[0].lstrip().strip().split(' ')[0]
no_flg = 0
except:
contentNo = ''
elif '采购项目编号' in p_list[i].text:
try:
if no_flg == 1:
contentNo = \
p_list[i].text.split('采购项目编号(建议书编号):')[1].split('采购项目名称')[0].lstrip().strip().split(' ')[0]
no_flg = 0
except:
contentNo = ''
a_list = contentWithTag.find_all('a')
contentWithTag = str(contentWithTag)
for a in a_list:
try:
# keep the return value of str.replace, and use a separate name so the outer href (source address) is not overwritten
a_href = a.get('href')
contentWithTag = contentWithTag.replace(a_href, f'http{a_href}')
except:
pass
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': province, # 省份
'city': city, # 市区
'origin': '中国政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': publishDate, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': biddingItems, # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
def getDataA_(session, title, publishDate, province, href):
req = session.get(href)
if req.status_code == 404:
log.error(f'{href}===404')
return
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
contentWithTag = soup.find('div', class_='vF_detail_content_container')
content = contentWithTag.text
if not content:
log.error(f'{href}===解析失败')
return
a_list = contentWithTag.find_all('a')
contentWithTag = str(contentWithTag)
for a in a_list:
try:
# keep the return value of str.replace, and use a separate name so the outer href (source address) is not overwritten
a_href = a.get('href')
contentWithTag = contentWithTag.replace(a_href, f'http{a_href}')
except:
pass
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': province, # 省份
'city': '', # 市区
'origin': '中国政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': '', # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': publishDate, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': '', # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': '', # 招标单位
'biddingUnitAddress': '', # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': '', # 招标单位联系人
'biddingPhone': '', # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
# 中标结构化信息
def getDataB(session, title, publishDate, province, href):
fj_list = []
city = ''
contentNo = ''
biddingProjectName = ''
biddingItems = ''
biddingUnit = ''
biddingUnitAddress = ''
biddingContact = ''
biddingPhone = ''
bidPrice = ''
req = session.get(href)
if req.status_code == 404:
log.error(f'{href}===404')
return
req.encoding = req.apparent_encoding
biddingContent = ''
no_flg = 1
soup = BeautifulSoup(req.text, 'lxml')
div = soup.find('div', class_='table')
tr_list = div.select('tr')
for tr in tr_list:
if '采购项目名称' in tr.text:
biddingProjectName = tr.find_all('td')[-1].text
elif '品目' in tr.text:
biddingItems = tr.find_all('td')[-1].text
elif '采购单位地址' in tr.text:
biddingUnitAddress = tr.find_all('td')[-1].text
elif '采购单位' in tr.text and '采购单位联系方式' not in tr.text:
biddingUnit = tr.find_all('td')[-1].text
elif '项目联系人' in tr.text:
biddingContact = tr.find_all('td')[-1].text
elif '采购单位联系方式' in tr.text:
biddingPhone = tr.find_all('td')[-1].text
elif '行政区域' in tr.text:
city = tr.find_all('td')[1].text
elif '总中标金额' in tr.text:
bidPrice = tr.find_all('td')[-1].text
contentWithTag = soup.find('div', class_='vF_detail_content_container')
content = contentWithTag.text
if not content:
log.error(f'{href}===解析失败')
return
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '采购需求:' in p_list[i].text:
try:
biddingContent = p_list[i + 1].text
except:
biddingContent = ''
try:
try:
contentNo = re.findall('项目编号:(.*)(招标文件编号', content)[0].lstrip().strip()
except:
contentNo = re.findall('项目编号:(.*)二、项目名称', content)[0].lstrip().strip()
except:
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '一、项目编号' in p_list[i].text:
contentNo = p_list[i + 1].text.lstrip().strip()
a_list = contentWithTag.find_all('a')
contentWithTag = str(contentWithTag)
for a in a_list:
try:
# keep the return value of str.replace, and use a separate name so the outer href (source address) is not overwritten
a_href = a.get('href')
contentWithTag = contentWithTag.replace(a_href, f'http{a_href}')
except:
pass
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': province, # 省份
'city': city, # 市区
'origin': '中国政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': publishDate, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': biddingItems, # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': bidPrice, # 中标价格
}
return data
def getDataB_(session, title, publishDate, province, href):
req = session.get(href)
if req.status_code == 404:
log.error(f'{href}===404')
return
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
contentWithTag = soup.find('div', class_='vF_detail_content_container')
content = contentWithTag.text
if not content:
log.error(f'{href}===解析失败')
return
a_list = contentWithTag.find_all('a')
contentWithTag = str(contentWithTag)
for a in a_list:
try:
# keep the return value of str.replace, and use a separate name so the outer href (source address) is not overwritten
a_href = a.get('href')
contentWithTag = contentWithTag.replace(a_href, f'http{a_href}')
except:
pass
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': province, # 省份
'city': '', # 市区
'origin': '中国政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': '', # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': publishDate, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': '', # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': '', # 招标单位
'biddingUnitAddress': '', # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': '', # 招标单位联系人
'biddingPhone': '', # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
req.close()
return data
# 成交结构化信息
def getDataC(session, title, publishDate, province, href):
fj_list = []
city = ''
contentNo = ''
biddingProjectName = ''
biddingItems = ''
biddingUnit = ''
biddingUnitAddress = ''
biddingContact = ''
biddingPhone = ''
winningBidder = ''
winningBidderAddress = ''
bidPrice = ''
req = session.get(href)
if req.status_code == 404:
log.error(f'{href}===404')
return
req.encoding = req.apparent_encoding
biddingContent = ''
no_flg = 1
soup = BeautifulSoup(req.text, 'lxml')
div = soup.find('div', class_='table')
tr_list = div.select('tr')
for tr in tr_list:
if '采购项目名称' in tr.text:
biddingProjectName = tr.find_all('td')[-1].text
elif '品目' in tr.text:
biddingItems = tr.find_all('td')[-1].text
elif '采购单位地址' in tr.text:
biddingUnitAddress = tr.find_all('td')[-1].text
elif '采购单位' in tr.text and '采购单位联系方式' not in tr.text:
biddingUnit = tr.find_all('td')[-1].text
elif '项目联系人' in tr.text:
biddingContact = tr.find_all('td')[-1].text
elif '采购单位联系方式' in tr.text:
biddingPhone = tr.find_all('td')[-1].text
elif '行政区域' in tr.text:
city = tr.find_all('td')[1].text
elif '总中标金额' in tr.text or '总成交金额' in tr.text:
bidPrice = tr.find_all('td')[-1].text
contentWithTag = soup.find('div', class_='vF_detail_content_container')
content = contentWithTag.text
if not content:
log.error(f'{href}===解析失败')
return
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '一、项目编号' in p_list[i].text:
try:
contentNo = p_list[i].text.split('一、项目编号:')[1].split('(招标文件编号')[0].lstrip().strip()
except:
contentNo = ''
elif '中标(成交)信息' in p_list[i].text:
try:
winningBidder = p_list[i + 1].text.lstrip().strip()
winningBidderAddress = p_list[i + 2].text.lstrip().strip()
except:
winningBidder = ''
winningBidderAddress = ''
a_list = contentWithTag.find_all('a')
contentWithTag = str(contentWithTag)
for a in a_list:
try:
# keep the return value of str.replace, and use a separate name so the outer href (source address) is not overwritten
a_href = a.get('href')
contentWithTag = contentWithTag.replace(a_href, f'http{a_href}')
except:
pass
data = {
'businessType': '政府采购', # 业务类型
'infoType': '成交公告', # 信息类型
'trade': '', # 行业
'province': province, # 省份
'city': city, # 市区
'origin': '中国政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': publishDate, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': biddingItems, # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': winningBidder, # 中标单位
'winningBidderAddress': winningBidderAddress, # 中标单位地址
'bidPrice': bidPrice, # 中标价格
}
req.close()
return data
def getDataC_(session, title, publishDate, province, href):
req = session.get(href)
if req.status_code == 404:
log.error(f'{href}===404')
return
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
contentWithTag = soup.find('div', class_='vF_detail_content_container')
content = contentWithTag.text
if not content:
log.error(f'{href}===解析失败')
return
a_list = contentWithTag.find_all('a')
contentWithTag = str(contentWithTag)
for a in a_list:
try:
# keep the return value of str.replace, and use a separate name so the outer href (source address) is not overwritten
a_href = a.get('href')
contentWithTag = contentWithTag.replace(a_href, f'http{a_href}')
except:
pass
data = {
'businessType': '政府采购', # 业务类型
'infoType': '成交公告', # 信息类型
'trade': '', # 行业
'province': province, # 省份
'city': '', # 市区
'origin': '中国政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': '', # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': publishDate, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': '', # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': '', # 招标单位
'biddingUnitAddress': '', # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': '', # 招标单位联系人
'biddingPhone': '', # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
req.close()
return data
# 获取总页数
@retry(tries=3, delay=10)
def getTotal(url):
session_total = requests.session()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57',
'Upgrade-Insecure-Requests': '1',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
}
ip = baseCore.get_proxy()
session_total.get(URL, headers=headers, proxies=ip, timeout=20)
# req = requests.get(url, headers=headers, proxies=ip)
req = session_total.get(url)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
p = soup.find('p', class_='pager').find('script').text
total = re.findall(r'size:(.*?),', p)[0]
session_total.close()
return int(total)
# 获取soup
@retry(tries=3, delay=10)
def getSoup(url):
session_soup = requests.session()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57',
'Upgrade-Insecure-Requests': '1',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
}
ip = baseCore.get_proxy()
session_soup.get(URL, headers=headers, proxies=ip, timeout=20)
# req = requests.get(url, headers=headers, proxies=ip)
req = session_soup.get(url)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
session_soup.close()
return soup
def doJob():
headers = {}
headers['user-agent'] = baseCore.getRandomUserAgent()
session = requests.session()
session.get(URL, headers=headers)
# start_time = ((datetime.date.today() - datetime.timedelta(days=2)).strftime("%Y:%m:%d"))
start_time = '2023:10:01'
end_time = datetime.datetime.now().strftime('%Y:%m:%d')
bidTypes = [[11, '成交公告'], [1, '公开招标'], [7, '中标公告']]
for bidType in bidTypes:
num = 0
url = f'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType={bidType[0]}&dbselect=bidx&kw=&start_time={start_time}&end_time={end_time}&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
try:
total = getTotal(url)
except:
log.error(f'{bidType[1]}===获取总页数失败')
continue
log.info(f'开始采集{bidType[1]}===共{total}页')
time.sleep(1.5)
for page in range(1, int(total) + 1):
log.info(f'正在采集==={bidType[1]}===第{page}页')
url = f'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index={page}&bidSort=0&buyerName=&projectId=&pinMu=0&bidType={bidType[0]}&dbselect=bidx&kw=&start_time={start_time}&end_time={end_time}&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
try:
soup = getSoup(url)
except:
log.error(f'{bidType[1]}===第{page}页===解析失败')
continue
try:
li_list = soup.find('ul', class_='vT-srch-result-list-bid').find_all('li')
except:
log.error(f'{bidType[1]}===第{page}页===获取信息列表失败')
continue
for li in li_list:
a = li.find('a')
href = a.get('href')
title = a.text.lstrip().strip()
pub_time = li.find('span').text.split(' ')[0].replace('.',
'-').lstrip().strip()
province = li.find('span').find('a').text
if bidType[0] == 1:
try:
data = getDataA(session, title, pub_time, province, href)
except:
data = getDataA_(session, title, pub_time, province, href)
elif bidType[0] == 7:
try:
data = getDataB(session, title, pub_time, province, href)
except:
data = getDataB_(session, title, pub_time, province, href)
else:
try:
data = getDataC(session, title, pub_time, province, href)
except:
data = getDataC_(session, title, pub_time, province, href)
if data:
print(data['title'])
# sendKafka(data)
num += 1
time.sleep(1.5)
log.info(f'{bidType[1]}===采集{num}条')
session.close()
if __name__ == '__main__':
# while True:
doJob()
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
baseCore.close()
import datetime
import re
import time
import requests
import json
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from base import BaseCore
URL = 'https://zycg.gov.cn/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Content-Type': 'application/json;charset=utf-8',
'X-Requested-With': 'XMLHttpRequest',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Host': 'zycg.gov.cn',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'sec-ch-ua': '"Chromium";v="118", "Microsoft Edge";v="118", "Not=A?Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
session = requests.session()
session.get(URL, headers=headers)
def sendKafka(dic_news):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("tenderClusterData",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
log.info(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输成功")
except Exception as e:
log.error(f'{e}')
log.error(f"{dic_news['title']}.....{dic_news['subtitle']}.....{dic_news['sourceAddress']}传输失败")
# 附件有接口 https://www.zycg.gov.cn/freecms/rest/v1/notice/selectNoticeDocInfo.do?currPage=1&pageSize=10&id=71caad2a-8b01-11ed-9548-fa163e9acaa1
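# Sketch of the getFj helper referenced by the commented-out attachment handling below.
# This is an assumption based on the selectNoticeDocInfo.do endpoint noted above; the field
# names fileName/filePath are guesses and must be checked against the real response.
# It returns (fj_list, id_list) where each fj is [name, original link, absolute link],
# matching the commented usage contentWithTag.replace(fj[1], fj[2]).
def getFj(session, id):
    fj_list = []
    id_list = []
    fj_url = f'https://www.zycg.gov.cn/freecms/rest/v1/notice/selectNoticeDocInfo.do?currPage=1&pageSize=10&id={id}'
    req = session.get(fj_url)
    req.encoding = req.apparent_encoding
    try:
        docs = json.loads(req.text)['data']
    except:
        return fj_list, id_list
    for doc in docs:
        name = doc.get('fileName', '')   # assumed field name
        path = doc.get('filePath', '')   # assumed field name
        full_path = path if path.startswith('http') else 'https://www.zycg.gov.cn' + path
        fj_list.append([name, path, full_path])
        id_list.append(doc.get('id', ''))
    return fj_list, id_list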
def getTotal(session,url):
req = session.get(url)
req.encoding = req.apparent_encoding
print(req.text)
total = json.loads(req.text)['total']
return total
def getJson(session,url):
req = session.get(url)
req.encoding = req.apparent_encoding
data_json = json.loads(req.text)
datas = data_json['data']
return datas
def getSoup(session, url):
req = session.get(url)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
return soup
def getConNo(content):
try:
pattern = r"项目编号([^\u4e00-\u9fa5]+)"
matches = re.findall(pattern, content)[0]
pattern = r"[a-zA-Z0-9-]+"
contentNo = re.findall(pattern, matches)[0]
except:
pattern = r"工程编号([^\u4e00-\u9fa5]+)"
matches = re.findall(pattern, content)[0]
pattern = r"[a-zA-Z0-9-]+"
contentNo = re.findall(pattern, matches)[0]
return contentNo
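# Usage sketch for getConNo (hypothetical input, for illustration only):
#   getConNo('项目编号:ZB-2023-0001号 ……')  ->  'ZB-2023-0001'
# If neither “项目编号” nor “工程编号” appears in the content, the second findall raises
# IndexError; callers below wrap getConNo in try/except and fall back to an empty contentNo.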
# noticeType 1为招标 2为中标
# operationStartTime 开始时间 2023-09-06%2000:00:00
# operationEndTime 截止时间 2023-09-08%2000:00:00
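# Sketch (an assumption, not used by the functions below, which build their URLs inline):
# how the selectInfoMore.do list URL is assembled from the parameters documented above.
# siteId/channel are the fixed values used in zhaobiao()/zhongbiao(); dates are passed as
# YYYY-MM-DD and combined with the URL-encoded " 00:00:00" suffix (%20).
def buildListUrl(noticeType, page, start_date, end_date):
    base = 'https://www.zycg.gov.cn/freecms/rest/v1/notice/selectInfoMore.do'
    return (f'{base}?&siteId=6f5243ee-d4d9-4b69-abbd-1e40576ccd7d'
            f'&channel=d0e7c5f4-b93e-4478-b7fe-61110bb47fd5'
            f'&currPage={page}&pageSize=12&noticeType={noticeType}&implementWay=1'
            f'&operationStartTime={start_date}%2000:00:00&title='
            f'&operationEndTime={end_date}%2000:00:00')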
def zhaobiao():
def getA(title, href, pub_time, soup, id):
biddingProjectName = title.split('采购公告')[0].split('招标公告')[0].split('征集公告')[0]
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
biddingContact = ''
biddingPhone = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContent = ''
p_list = contentWithTag.select('p')
for i in range(len(p_list)):
if '1.采购人信息' in p_list[i].text:
biddingPhone = re.findall('联系方式:(.*)', p_list[i + 3].text)[0].lstrip().strip()
try:
biddingUnit = re.findall('名  称:(.*)', p_list[i + 1].text)[0].lstrip().strip()
biddingUnitAddress = re.findall('地  址:(.*)', p_list[i + 2].text)[0].lstrip().strip()
except:
biddingUnit = re.findall('名    称:(.*)', p_list[i + 1].text)[0].lstrip().strip()
biddingUnitAddress = re.findall('地    址:(.*)', p_list[i + 2].text)[0].lstrip().strip()
elif '五、开标' in p_list[i].text:
try:
bidOpeningTime = re.findall('时间:(.*)', p_list[i + 2].text)[0].lstrip().strip()
bidOpeningAddress = re.findall('地点:(.*)', p_list[i + 4].text)[0].lstrip().strip()
except:
bidOpeningTime = re.findall('时间:(.*)', p_list[i + 1].text)[0].lstrip().strip()
bidOpeningAddress = re.findall('注意事项:(.*)', p_list[i + 2].text)[0].lstrip().strip()
elif '五、开启' in p_list[i].text:
bidOpeningTime = re.findall('时间:(.*)', p_list[i + 1].text)[0].lstrip().strip()
bidOpeningAddress = re.findall('地点:(.*)', p_list[i + 2].text)[0].lstrip().strip()
elif '预算金额' in p_list[i].text:
budgetAmount = re.findall('预算金额:(.*)', p_list[i].text)[0].lstrip().strip()
elif '基准预算' in p_list[i].text:
budgetAmount = re.findall('基准预算:(.*)', p_list[i].text)[0]
elif '采购需求' in p_list[i].text:
try:
biddingContent = re.findall('采购需求:(.*)', p_list[i].text.replace('\n', ' ').replace('\r', ' '))[
0].lstrip().strip()
except:
continue
elif '文件联系人及电话' in p_list[i].text or '联系人及电话' in p_list[i].text:
pattern = r'(\w+)\s+(\d{3}-\d{8}|\d{11}|\d{4}-\d{7})'
try:
contactAndPhone_ = re.findall('文件联系人及电话:(.*)', p_list[i].text)[0].replace('\xa0',
' ').lstrip().strip()
except:
contactAndPhone_ = re.findall('联系人及电话(.*)', p_list[i].text)[0].replace('\xa0', ' ').lstrip().strip()
contactAndPhones = re.findall(pattern, contactAndPhone_)
for contactAndPhone in contactAndPhones:
biddingContact = biddingContact + contactAndPhone[0] + ':'
biddingContact = biddingContact + contactAndPhone[1] + ';'
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
def getB(title, href, pub_time, soup, id):
biddingProjectName = title.split('采购公告')[0].split('招标公告')[0].split('征集公告')[0]
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
biddingContact = ''
biddingPhone = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContent = ''
content_ = content.replace(' ', '').replace(' ', '').replace('\n', ' ').replace('\r', ' ')
try:
try:
budgetAmount = re.findall(r'合同估算价(.*?)2\.2', content_)[0].lstrip().strip()
except:
budgetAmount = re.findall(r'合同估算价(.*?)2\.3', content_)[0].lstrip().strip()
except:
budgetAmount = re.findall(r'投资额(.*?)2\.3', content_)[0].lstrip().strip()
biddingContent_ = re.findall('招标范围(.*?)资格要求', content_)[0].lstrip().strip()
try:
biddingContent = re.findall('招标范围(.*?)。', biddingContent_)[0].lstrip().strip() + '。'
except:
biddingContent = re.findall(r'招标范围(.*?)2\.5', biddingContent_)[0].lstrip().strip()
try:
biddingUnit_ = re.findall('联系方式(.*?)采购执行机构信息', content_)[0].lstrip().strip()
biddingUnit = re.findall('招标人:(.*?)地址:', biddingUnit_)[0].lstrip().strip()
biddingUnitAddress = re.findall('地址:(.*?)联系人:', biddingUnit_)[0].lstrip().strip()
except:
biddingUnit_ = re.findall('发布公告的媒介(.*?)采购执行机构信息', content_)[0].lstrip().strip()
try:
biddingUnit = re.findall('招标人:(.*?)地址:', biddingUnit_)[0].lstrip().strip()
biddingUnitAddress = re.findall('地址:(.*?)联系人:', biddingUnit_)[0].lstrip().strip()
except:
biddingUnit = re.findall('招标人:(.*?)联系人', biddingUnit_)[0].lstrip().strip()
biddingUnitAddress = ''
try:
biddingContact = re.findall('联系人:(.*?)电话:', biddingUnit_)[0].lstrip().strip()
biddingPhone = re.findall('电话:(.*?)电子邮件:', biddingUnit_)[0].lstrip().strip()
except:
biddingContact_ = re.findall('联系人:(.*$)', biddingUnit_)[0].lstrip().strip()
biddingContact = re.findall(r'[\u4e00-\u9fa5]+', biddingContact_)[0].replace('电话', '')
biddingPhone = biddingContact_.split(biddingContact)[1].replace('电话:', '')
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
def getC(title, href, pub_time, soup, id):
biddingProjectName = title.split('采购公告')[0].split('招标公告')[0].split('征集公告')[0]
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
biddingContact = ''
biddingPhone = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContent = ''
biddingContent = re.findall('招标内容与范围:(.*?)招标暂估金额:', content)[0].lstrip().strip()
budgetAmount = re.findall('招标暂估金额:(.*?)项目地址:', content)[0].lstrip().strip().replace(' ', '').replace(' ', '')
bidOpeningAddress = re.findall('项目地址:(.*?)其它说明:', content)[0].lstrip().strip()
biddingUnit = re.findall('招标人名称:(.*?)地址:', content)[0].lstrip().strip()
biddingUnitAddress = re.findall('地址:(.*?)联系人:', re.findall('招标人名称:(.*?)联系电话:', content)[0])[0].lstrip().strip()
try:
biddingPhone = re.findall('联系电话:(.*?)电子邮件:', content)[0].lstrip().strip()
except:
biddingPhone = re.findall('联系电话:(.*?)采购执行机构信息', content)[0].lstrip().strip()
biddingContact = re.findall('联系人:(.*?)联系电话:', content)[0].lstrip().strip()
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
def getD(title, href, pub_time, soup, id):
biddingProjectName = title.split('采购公告')[0].split('招标公告')[0].split('征集公告')[0]
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
p_list = contentWithTag.select('p')
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
biddingContact = ''
biddingPhone = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContent = ''
for i in range(len(p_list)):
if '招标内容:' in p_list[i].text or '招标内容:' in p_list[i].text:
try:
biddingContent = p_list[i].text.split('招标内容:')[1].lstrip().strip()
except:
try:
biddingContent = p_list[i].text.split('招标内容:')[1].lstrip().strip()
except:
biddingContent = p_list[i + 1].text.lstrip().strip()
elif '项目名称' in p_list[i].text:
biddingProjectName = re.findall('项目名称:(.*)', p_list[i].text)[0]
elif '投标截止及开标时间' in p_list[i].text:
bidOpeningTime = re.findall('时间:(.*)', p_list[i].text)[0].lstrip().strip()
elif '投标地点及开标地点' in p_list[i].text:
bidOpeningAddress = p_list[i + 1].text.lstrip().strip()
elif '预算金额为' in p_list[i].text:
budgetAmount = re.findall('预算金额为:(.*)', p_list[i].text)[0].lstrip().strip().replace('\xa0', '')
elif '采购人名称:' in p_list[i].text:
biddingUnit = re.findall('采购人名称:(.*)', p_list[i].text)[0].lstrip().strip()
biddingUnitAddress = re.findall('地址:(.*)', p_list[i + 1].text)[0].lstrip().strip()
biddingPhone = re.findall('联系电话:(.*)', p_list[i + 2].text)[0].lstrip().strip()
elif '项目招标文件负责人及电话' in p_list[i].text:
pattern = r'(\w+)\s+(\d{3}-\d{8}|\d{11}|\d{4}-\d{7}|\d{8})'
contactAndPhones = re.findall(pattern, p_list[i].text)
for contactAndPhone in contactAndPhones:
biddingContact = contactAndPhone[0] + ":" + contactAndPhone[1] + ";"
elif '项目联系人' in p_list[i].text:
pattern = r'(\w+)\s+(\d{3}-\d{8}|\d{11}|\d{4}-\d{7}|\d{8})'
contactAndPhones = re.findall(pattern, p_list[i].text.replace('联系电话:', ' '))
for contactAndPhone in contactAndPhones:
biddingContact = contactAndPhone[0] + ":" + contactAndPhone[1] + ";"
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
def getE(title, href, pub_time, soup, id):
biddingProjectName = title.split('采购公告')[0].split('招标公告')[0].split('征集公告')[0]
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
biddingContact = ''
biddingPhone = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContent = ''
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '招标内容:' in p_list[i].text or '谈判内容:' in p_list[i].text:
biddingContent = p_list[i].text.split(':')[1].lstrip().strip()
elif '开标时间:' in p_list[i].text:
bidOpeningTime = p_list[i].text.split(':')[1].lstrip().strip()
elif '谈判开始时间' in p_list[i].text:
bidOpeningTime = p_list[i].text.split(':')[1].lstrip().strip()
elif '投标地点及开标地点' in p_list[i].text or '谈判地点' in p_list[i].text:
bidOpeningAddress = p_list[i + 1].text.lstrip().strip()
elif '采购人名称' in p_list[i].text:
biddingUnit = p_list[i].text.split(':')[1].lstrip().strip()
biddingUnitAddress = p_list[i + 1].text.split(':')[1].lstrip().strip()
biddingPhone = p_list[i + 2].text.split(':')[1].lstrip().strip()
elif '项目联系人' in p_list[i].text:
biddingContact = p_list[i].text.split(':')[1].lstrip().strip()
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
def getError(title, href, pub_time, soup, id):
biddingProjectName = title.split('采购公告')[0].split('招标公告')[0].split('征集公告')[0]
contentWithTag = soup.select('#printArea')[0]
try:
contentWithTag.select('.relatelink')[0].decompose()
except:
pass
try:
contentWithTag.select('.change-page')[0].decompose()
except:
pass
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
biddingContact = ''
biddingPhone = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContent = ''
id_list = []
data = {
'businessType': '政府采购', # 业务类型
'infoType': '招标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
now = datetime.datetime.now().date().strftime("%Y-%m-%d")
num = 0
url = f'https://zycg.gov.cn/freecms/rest/v1/notice/selectInfoMore.do?&siteId=6f5243ee-d4d9-4b69-abbd-1e40576ccd7d&channel=d0e7c5f4-b93e-4478-b7fe-61110bb47fd5&currPage=1&pageSize=12&noticeType=1&implementWay=1&operationStartTime=2023-10-01%2000:00:00&title=&operationEndTime={now}%2000:00:00'
total = getTotal(session,url)
if total % 12 != 0:
pages = int(total / 12) + 1
else:
pages = int(total / 12)
log.info(f'招标公告===共{pages}页')
for page in range(1, pages + 1):
log.info(f'开始采集第{page}页')
url = f'https://zycg.gov.cn/freecms/rest/v1/notice/selectInfoMore.do?&siteId=6f5243ee-d4d9-4b69-abbd-1e40576ccd7d&channel=d0e7c5f4-b93e-4478-b7fe-61110bb47fd5&currPage={page}&pageSize=12&noticeType=1&implementWay=1&operationStartTime=2023-10-01%2000:00:00&title=&operationEndTime={now}%2000:00:00'
datas_json = getJson(session,url)
for data_json in datas_json:
id = data_json['id']
title = data_json['title']
href = data_json['pageurl']
pub_time = data_json['addtimeStr']
if 'http' not in href:
href = 'https://www.zycg.gov.cn' + href
soup = getSoup(session, href)
try:
if '1、招标内容' in soup.text:
data = getD(title, href, pub_time, soup, id)
elif '一、招标条件' in soup.text:
try:
data = getC(title, href, pub_time, soup, id)
except:
data = getE(title, href, pub_time, soup, id)
elif '1. 招标条件' in soup.text or '1.招标条件' in soup.text or '1.招标条件' in soup.text:
data = getB(title, href, pub_time, soup, id)
else:
data = getA(title, href, pub_time, soup, id)
num += 1
except Exception as e:
data = getError(title, href, pub_time, soup, id)
sendKafka(data)
time.sleep(1)
log.info(f'招标公告==={num}条')
def zhongbiao():
num = 0
def getA(title, href, pub_time, soup, id):
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
biddingContact = ''
winningBidder = ''
winningBidderAddress = ''
bidPrice = ''
biddingProjectName = ''
biddingUnit = ''
biddingUnitAddress = ''
biddingPhone = ''
try:
contentNo = getConNo(content)
except:
contentNo = ''
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '项目名称:' in p_list[i].text:
biddingProjectName = p_list[i].text.split('项目名称:')[1].lstrip().strip()
elif '采购项目名称' in p_list[i].text:
biddingProjectName = p_list[i].text.split('采购项目名称:')[1].lstrip().strip()
elif '供应商名称' in p_list[i].text or '供应商名称及地址' in p_list[i].text:
try:
winningBidder = p_list[i].text.split('供应商名称:')[1].lstrip().strip()
except:
flag = 1
while True:
if '中标(成交)金额' in p_list[i + flag].text:
break
winningBidder = winningBidder + p_list[i + flag].text.split(',')[
0].lstrip().strip() + ';'
winningBidderAddress = winningBidderAddress + p_list[i + flag].text.split(',')[
1].lstrip().strip() + ';'
flag += 1
elif '供应商地址' in p_list[i].text:
winningBidderAddress = p_list[i].text.split('供应商地址:')[1].lstrip().strip()
elif '中标(成交)金额' in p_list[i].text:
pattern = r"([\d,]+\.\d+|\d+(?:\.\d+)?)\s*(万|元)"
try:
bidPrice = re.findall(pattern, p_list[i].text)[0][0] + re.findall(pattern, p_list[i].text)[0][1]
except:
flag = 1
while True:
if '元' not in p_list[i + flag].text:
break
bidPrice = bidPrice + p_list[i + flag].text.lstrip().strip() + ';'
flag += 1
elif '中标金额:' in p_list[i].text or '成交金额:' in p_list[i].text:
pattern = r"([\d,]+\.\d+|\d+(?:\.\d+)?)\s*(万|元)"
bidPrice = re.findall(pattern, p_list[i].text)[0][0] + re.findall(pattern, p_list[i].text)[0][1]
elif '1.采购人信息' in p_list[i].text:
biddingUnit = p_list[i + 1].text.replace(' ', '').replace(' ', '').split('名称:')[1].lstrip().strip()
biddingUnitAddress = p_list[i + 2].text.replace(' ', '').replace(' ', '').split('地址:')[
1].lstrip().strip()
biddingPhone = p_list[i + 3].text.split('联系方式:')[1].lstrip().strip()
elif '1、采购人信息' in p_list[i].text:
biddingUnit = p_list[i + 1].text.split('采购人名称:')[1].lstrip().strip()
biddingUnitAddress = p_list[i + 2].text.split('地址:')[1].lstrip().strip()
biddingPhone = p_list[i + 3].text.split('联系电话:')[1].lstrip().strip()
elif '采购人名称:' in p_list[i].text:
biddingUnit = p_list[i].text.split('采购人名称:')[1].lstrip().strip()
biddingUnitAddress = p_list[i + 1].text.split('地址:')[1].lstrip().strip()
biddingPhone = p_list[i + 2].text.split('联系电话:')[1].lstrip().strip()
elif '文件联系人及电话:' in p_list[i].text:
pattern = r'(\w+)\s+(\d{3}-\d{8}|\d{11}|\d{4}-\d{7})'
contactAndPhones = re.findall(pattern, p_list[i].text)
for contactAndPhone in contactAndPhones:
biddingContact = biddingContact + contactAndPhone[0] + ':'
biddingContact = biddingContact + contactAndPhone[1] + ';'
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': winningBidder, # 中标单位
'winningBidderAddress': winningBidderAddress, # 中标单位地址
'bidPrice': bidPrice, # 中标价格
}
return data
def getB(title, href, pub_time, soup, id):
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
try:
try:
tbody = contentWithTag.select('table')[1].select('tbody')[0].text.replace('\n', '').replace('\r', '')
except:
tbody = contentWithTag.select('table')[0].select('tbody')[0].text.replace('\n', '').replace('\r', '')
except:
try:
data = getC(title, href, pub_time, soup, id)
return data
except:
print(f'c,{href}.............错误')
return []
biddingProjectName = re.findall('工程名称(.*?)建设地点', tbody)[0].lstrip().strip()
winningBidder = re.findall('中标人(.*?)中标价', tbody)[0].lstrip().strip()
biddingUnit = re.findall('建设单位名称(.*?)工程名称', tbody)[0].lstrip().strip()
try:
bidPrice = tbody.split('中标价(元)')[1].split('公示开始时间')[0].lstrip().strip()
except:
bidPrice = tbody.split('中标价(元)')[1].lstrip().strip()
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': '', # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': '', # 招标单位联系人
'biddingPhone': '', # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': winningBidder, # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': bidPrice, # 中标价格
}
return data
def getC(title, href, pub_time, soup, id):
biddingProjectName = ''
biddingUnit = ''
biddingUnitAddress = ''
biddingContact = ''
biddingPhone = ''
winningBidder = ''
bidPrice = ''
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '招标项目编号' in p_list[i].text:
biddingProjectName = p_list[i].text.split('(招标项目编号')[0].lstrip().strip()
elif '中标人:' in p_list[i].text:
winningBidder = p_list[i].text.split('中标人:')[1].lstrip().strip()
elif '中标报价' in p_list[i].text:
bidPrice = p_list[i].text.split('中标报价')[1].lstrip().strip().replace('元', '')
elif '招 标 人' in p_list[i].text:
biddingUnit = p_list[i].text.split(':')[1].lstrip().strip()
biddingUnitAddress = p_list[i + 1].text.split(':')[1].lstrip().strip()
biddingContact = p_list[i + 2].text.split(':')[1].lstrip().strip()
biddingPhone = p_list[i + 3].text.split(':')[1].lstrip().strip()
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': winningBidder, # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': bidPrice, # 中标价格
}
return data
def getD(title, href, pub_time, soup, id):
biddingProjectName = ''
winningBidder = ''
winningBidderAddress = ''
bidPrice = ''
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
p_list = contentWithTag.find_all('p')
for i in range(len(p_list)):
if '2、项目名称:' in p_list[i].text:
biddingProjectName = re.findall('项目名称:(.*)', p_list[i].text)[0].lstrip().strip()
elif '供应商名称及地址' in p_list[i].text:
winningBidder = p_list[i + 1].text.split(',')[0].lstrip().strip()
winningBidderAddress = p_list[i + 1].text.split(',')[1].lstrip().strip()
elif '中标(成交)金额' in p_list[i].text:
bidPrice = p_list[i + 1].text
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': '', # 招标单位
'biddingUnitAddress': '', # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': '', # 招标单位联系人
'biddingPhone': '', # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': winningBidder, # 中标单位
'winningBidderAddress': winningBidderAddress, # 中标单位地址
'bidPrice': bidPrice, # 中标价格
}
return data
def getE(title, href, pub_time, soup, id):
contentWithTag = soup.select('#printArea')[0]
contentWithTag.select('.relatelink')[0].decompose()
contentWithTag.select('.change-page')[0].decompose()
content = contentWithTag.text
bidPrice = ''
winningBidder = ''
try:
contentNo = getConNo(content)
except:
contentNo = ''
biddingProjectName = re.findall('组织的(.*)(', content)[0]
try:
winningBidder_con = re.findall('中标人(.*?)感谢', content)[0]
except:
winningBidder_con = re.findall('中标人(.*?)若', content)[0]
winningBidders = re.findall(r"([\u4e00-\u9fa5]+(?:有限公司|公司))", winningBidder_con)
for winningBidder_ in winningBidders:
winningBidder = winningBidder + winningBidder_ + ';'
try:
bidPrices = re.findall(r"([\d,]+\.\d+|[\d,]+)元", winningBidder_con)
for bidPrice_ in bidPrices:
bidPrice = bidPrice + bidPrice_ + '元;'
except:
pass
contentWithTag = str(contentWithTag)
# fj_list, id_list = getFj(session, id)
# for fj in fj_list:
# contentWithTag = contentWithTag.replace(f"{fj[1]}", f"{fj[2]}")
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': '', # 招标单位
'biddingUnitAddress': '', # 招标单位地址
'bidOpeningTime': '', # 开标时间
'bidOpeningAddress': '', # 开标地点
'budgetAmount': '', # 预算金额
'biddingContact': '', # 招标单位联系人
'biddingPhone': '', # 招标单位联系电话
'biddingContent': '', # 招标内容
'winningBidder': winningBidder, # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': bidPrice, # 中标价格
}
return data
def getError(title, href, pub_time, soup, id):
contentWithTag = soup.select('#printArea')[0]
try:
contentWithTag.select('.relatelink')[0].decompose()
except:
pass
try:
contentWithTag.select('.change-page')[0].decompose()
except:
pass
content = contentWithTag.text
try:
contentNo = getConNo(content)
except:
contentNo = ''
biddingProjectName = ''
biddingUnit = ''
biddingUnitAddress = ''
bidOpeningTime = ''
biddingContact = ''
biddingPhone = ''
bidOpeningAddress = ''
budgetAmount = ''
biddingContent = ''
id_list = []
data = {
'businessType': '政府采购', # 业务类型
'infoType': '中标公告', # 信息类型
'trade': '', # 行业
'province': '', # 省份
'origin': '中央政府采购网', # 来源网站
'isAbroad': '1', # 1、国内 2、国外
'contentNo': contentNo, # 编号
'content': content, # 内容(不带标签)
'contentWithTag': str(contentWithTag), # 内容(带标签)
'sid': '1699606095238131714', # 信息源id
'publishDate': pub_time, # 发布时间
'sourceAddress': href, # 原文链接
'title': title, # 标题
'subtitle': '', # 二级标题
'biddingProjectName': biddingProjectName, # 招标项目名称
'biddingItems': '', # 品目
'biddingUnit': biddingUnit, # 招标单位
'biddingUnitAddress': biddingUnitAddress, # 招标单位地址
'bidOpeningTime': bidOpeningTime, # 开标时间
'bidOpeningAddress': bidOpeningAddress, # 开标地点
'budgetAmount': budgetAmount, # 预算金额
'biddingContact': biddingContact, # 招标单位联系人
'biddingPhone': biddingPhone, # 招标单位联系电话
'biddingContent': biddingContent, # 招标内容
'winningBidder': '', # 中标单位
'winningBidderAddress': '', # 中标单位地址
'bidPrice': '', # 中标价格
}
return data
now = datetime.datetime.now().date().strftime("%Y-%m-%d")
url = f'https://www.zycg.gov.cn/freecms/rest/v1/notice/selectInfoMore.do?&siteId=6f5243ee-d4d9-4b69-abbd-1e40576ccd7d&channel=d0e7c5f4-b93e-4478-b7fe-61110bb47fd5&currPage=1&pageSize=12&noticeType=2&implementWay=1&operationStartTime=2023-10-01%2000:00:00&title=&operationEndTime={now}%2000:00:00'
total = getTotal(session,url)
if total % 12 != 0:
pages = int(total / 12) + 1
else:
pages = int(total / 12)
log.info(f'中标公告===共{pages}页')
for page in range(1, pages + 1):
log.info(f'开始采集第{page}页')
url = f'https://www.zycg.gov.cn/freecms/rest/v1/notice/selectInfoMore.do?&siteId=6f5243ee-d4d9-4b69-abbd-1e40576ccd7d&channel=d0e7c5f4-b93e-4478-b7fe-61110bb47fd5&currPage={page}&pageSize=12&noticeType=2&implementWay=1&operationStartTime=2023-10-01%2000:00:00&title=&operationEndTime={now}%2000:00:00'
datas_json = getJson(session,url)
for data_json in datas_json:
id = data_json['id']
title = data_json['title']
href = data_json['pageurl']
pub_time = data_json['addtimeStr']
if pub_time < '2023-10-01':
break
if 'http' not in href:
href = 'https://www.zycg.gov.cn' + href
soup = getSoup(session, href)
try:
if '结果公示' in title:
data = getB(title, href, pub_time, soup, id)
if len(data) == 0:
data = getError(title, href, pub_time, soup, id)
elif '中标公告' in title or '结果公告' in title or '成交公告' in title:
# note: '成交公告' already matches this branch, so the dedicated getD branch below is never reached
try:
data = getA(title, href, pub_time, soup, id)
except:
data = getE(title, href, pub_time, soup, id)
elif '成交公告' in title:
data = getD(title, href, pub_time, soup, id)
else:
data = getE(title, href, pub_time, soup, id)
except:
data = getError(title, href, pub_time, soup, id)
sendKafka(data)
num += 1
time.sleep(1)
log.info(f"中标公告==={num}条")
if __name__ == '__main__':
log.info('招标公告开始采集')
# try:
zhaobiao()
# except:
# log.error('招标公告采集出错')
log.info('中标公告开始采集')
try:
zhongbiao()
except:
log.error('中标公告采集出错')
baseCore.close()