提交 650c704a 作者: 刘伟刚

REITs网站采集

上级 497088ef
# 核心工具包
import os
import random
import socket
import sys
import time
import fitz
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import redis
import langid
#创建连接池
import pymysql
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from obs import ObsClient
import fitz
from urllib.parse import unquote
# Huawei Cloud OBS client shared by the upload helper in this module.
# NOTE(review): the access key / secret key literals below are committed in
# source control. They should be rotated and supplied only through the
# environment; the literals are kept purely as fallbacks so that existing
# deployments keep working unchanged.
obsClient = ObsClient(
    access_key_id=os.environ.get('OBS_ACCESS_KEY_ID', 'VEHN7D0TJ9316H8AHCAV'),  # Huawei Cloud AK
    secret_access_key=os.environ.get('OBS_SECRET_ACCESS_KEY', 'heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY'),  # Huawei Cloud SK
    server=os.environ.get('OBS_SERVER', 'https://obs.cn-north-1.myhuaweicloud.com'),  # bucket endpoint
)
# NOTE: call BaseCore.close() before the program exits to release resources.
class BaseCore:
    """Shared crawler helper.

    Bundles the database / Redis handles, logging setup, User-Agent pool,
    proxy lookup, small text utilities and the OBS upload helper used by the
    crawler scripts.
    """
    # Sequence counter used when building serial numbers
    __seq = 0
    # Proxy-pool database connection (disabled)
    # __cnx_proxy =None
    # __cursor_proxy = None
    # First MySQL connection and its cursor (assigned in __init__)
    cnx = None
    cursor = None
    # Second MySQL connection and its cursor (assigned in __init__)
    cnx_ = None
    cursor_ = None
    # Redis client (assigned in __init__)
    r = None
    # Desktop User-Agent pool
    # NOTE(review): a few entries below look truncated (unbalanced parentheses);
    # they are kept verbatim because the intended full strings are unknown.
    __USER_AGENT_LIST = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
        'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
        # Fixed: this entry previously started with 'ozilla/' (missing leading 'M').
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
        'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
        'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
        'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
        'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
        'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
    ]
    # Android User-Agent pool
    __USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
    """Open the two MySQL connections, the Redis client and the MySQL pool.

    Every handle is stored on the instance: ``cnx``/``cursor`` (database
    ``caiji``), ``cnx_``/``cursor_`` (database ``clb_project``), ``r``
    (Redis, db 6) and ``pool_caiji`` (pooled connections to ``caiji``).
    """
    # Settings shared by the direct connection and the pool for ``caiji``.
    caiji_settings = {
        'host': '114.115.159.144',
        'user': 'caiji',
        'password': 'zzsn9988',
        'charset': 'utf8mb4',
    }
    self.cnx = pymysql.connect(db='caiji', **caiji_settings)
    self.cursor = self.cnx.cursor()

    # Database on the "11" host
    project_settings = {
        'host': '114.116.44.11',
        'user': 'caiji',
        'password': 'f7s0&7qqtK',
        'db': 'clb_project',
        'charset': 'utf8mb4',
    }
    self.cnx_ = pymysql.connect(**project_settings)
    self.cursor_ = self.cnx_.cursor()

    # Redis connection
    self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

    # Connection pool for the ``caiji`` database
    self.pool_caiji = PooledDB(
        creator=pymysql,
        maxconnections=5,
        mincached=2,
        maxcached=5,
        blocking=True,
        port=3306,
        database='caiji',
        **caiji_settings
    )
def close(self):
    """Release every resource opened by ``__init__``.

    Closes both MySQL cursors and connections, the Redis client and the
    connection pool. Each resource is closed independently, so a failure on
    one of them (or a resource that was never created) does not prevent the
    remaining ones from being released. Errors are deliberately ignored:
    this is best-effort cleanup meant to run right before the program exits.
    """
    for attr in ('cursor', 'cnx', 'cursor_', 'cnx_', 'r', 'pool_caiji'):
        resource = getattr(self, attr, None)
        # getattr on None (resource never opened) yields no closer at all.
        closer = getattr(resource, 'close', None)
        if closer is None:
            continue
        try:
            closer()
        except Exception:
            # Already closed or broken handle: nothing more can be done.
            pass
# Elapsed-time formatting
def getTimeCost(self, start, end):
    """Describe the span between ``start`` and ``end`` as readable text.

    The difference ``end - start`` is treated as seconds. The result lists
    hours/minutes/seconds when at least one hour elapsed, minutes/seconds
    when at least one minute elapsed, seconds when at least one second
    elapsed, and milliseconds otherwise.
    """
    elapsed = end - start
    whole_seconds = int(elapsed)
    total_minutes, sec = divmod(whole_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    if hours > 0:
        return "%d小时%d分钟%d秒" % (hours, minutes, sec)
    if minutes > 0:
        return "%d分钟%d秒" % (minutes, sec)
    if whole_seconds > 0:
        return "%d秒" % (sec)
    # Less than one full second: report milliseconds instead.
    return "%d毫秒" % (int(elapsed * 1000))
# Current time in one of three representations
def getNowTime(self, type):
    """Return the current local time in the representation chosen by ``type``.

    * ``1`` -> string formatted as ``%Y-%m-%d %H:%M:%S`` (e.g. 2001-01-01 12:00:00)
    * ``2`` -> string formatted as ``%y%m%d%H%M%S`` (e.g. 010101120000)
    * ``3`` -> integer Unix timestamp in milliseconds (e.g. 1690179526555)

    Any other value yields an empty string.
    """
    if type == 1:
        return time.strftime("%Y-%m-%d %H:%M:%S")
    elif type == 2:
        return time.strftime("%y%m%d%H%M%S")
    elif type == 3:
        return int(time.time() * 1000)
    return ""
# Serial number generation
def getNextSeq(self):
    """Return the next serial number.

    The value is the current time formatted as ``%y%m%d%H%M%S`` followed by
    a three-digit, zero-padded counter. The counter wraps back to 0 after
    999 so the suffix is always exactly three digits; the earlier
    ``> 1000`` check let the value 1000 through, which produced a four-digit
    suffix because ``zfill`` never truncates.
    """
    self.__seq += 1
    if self.__seq > 999:
        self.__seq = 0
    return self.getNowTime(2) + str(self.__seq).zfill(3)
# Credit-code generation
def getNextXydm(self):
    """Return the next generated credit code.

    The value is the prefix ``ZZSN``, the current time formatted as
    ``%y%m%d%H%M%S`` and a three-digit, zero-padded counter. The counter
    wraps back to 0 after 999 so the suffix is always exactly three digits;
    the earlier ``> 1000`` check let the value 1000 through, which produced
    a four-digit suffix because ``zfill`` never truncates.
    """
    self.__seq += 1
    if self.__seq > 999:
        self.__seq = 0
    return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# Log line layout
def logFormate(self, record, handler):
    """Render a log record as ``[time] [level] [file] [function] [line] message``.

    Only the base name of ``record.filename`` is shown. ``handler`` is
    accepted because the method is installed as a handler formatter, but it
    is not used.
    """
    source_file = os.path.basename(record.filename)
    bracketed = (
        record.time,        # log time
        record.level_name,  # log level
        source_file,        # file name
        record.func_name,   # function name
        record.lineno,      # line number
    )
    prefix = " ".join("[{}]".format(value) for value in bracketed)
    return "{} {}".format(prefix, record.message)
# Logger factory
def getLogger(self, fileLogFlag=True, stdOutFlag=True):
    """Create a logbook logger named after the running script.

    Log files live in a ``logs`` directory next to the script
    (``sys.argv[0]``); the directory is created when missing. The logger
    name and the log file name are the script name with ``.py`` removed and
    ``.log`` appended.

    :param fileLogFlag: attach a daily-rotating UTF-8 file handler.
    :param stdOutFlag: attach a colourised stderr handler.
    :return: the configured ``logbook.Logger``.
    """
    script_dir, script_name = os.path.split(os.path.abspath(sys.argv[0]))
    log_dir = os.path.join(script_dir, "logs")
    log_name = script_name.replace(".py", "") + ".log"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    logbook.set_datetime_format('local')
    logger = logbook.Logger(log_name)

    handlers = []
    if fileLogFlag:  # write log records to a file
        file_handler = logbook.TimedRotatingFileHandler(
            os.path.join(log_dir, log_name),
            date_format='%Y-%m-%d',
            bubble=True,
            encoding='utf-8',
        )
        file_handler.formatter = self.logFormate
        handlers.append(file_handler)
    if stdOutFlag:  # echo log records to the console
        console_handler = logbook.more.ColorizedStderrHandler(bubble=True)
        console_handler.formatter = self.logFormate
        handlers.append(console_handler)

    logger.handlers = handlers
    return logger
# Random User-Agent
def getRandomUserAgent(self):
    """Return one entry picked at random from the desktop User-Agent pool."""
    agent_pool = self.__USER_AGENT_LIST
    return random.choice(agent_pool)
# Proxy lookup
def get_proxy(self):
    """Return one randomly chosen proxy from the ``clb_proxy`` table.

    Each row's ``proxy`` column is split on ``-``; the first two parts are
    used as host and port. Rows that do not contain both parts are skipped.
    The choice is made over every usable row; the earlier fixed index range
    0..3 raised ``IndexError`` whenever fewer than four rows existed and
    never returned rows beyond the fourth.

    :return: dict with keys ``"HTTP"`` and ``"HTTPS"``, both mapping to
        ``"http://host:port"``.
    :raises IndexError: when the table yields no usable proxy.
    """
    self.cursor.execute("select proxy from clb_proxy")
    proxy_list = []
    for row in self.cursor.fetchall():
        parts = str(row[0]).split('-')
        if len(parts) < 2:
            continue  # malformed entry, no port available
        proxy_meta = "http://%(host)s:%(port)s" % {
            "host": parts[0],
            "port": parts[1],
        }
        # NOTE(review): keys are kept upper-case for backward compatibility;
        # requests appears to look proxies up by lower-case scheme
        # ("http"/"https") — confirm how callers consume this dict.
        proxy_list.append({"HTTP": proxy_meta, "HTTPS": proxy_meta})
    return random.choice(proxy_list)
# Substring extraction
def getSubStr(self, str, beginStr, endStr):
    """Trim ``str`` using the last occurrences of two markers.

    When ``beginStr`` is non-empty, everything before its last occurrence is
    dropped (the marker itself is kept); if it is absent nothing is dropped.
    When ``endStr`` is non-empty and present in the remaining text, the text
    is cut right after the first character of its last occurrence.
    """
    text = str
    if beginStr != '':
        start = text.rfind(beginStr)
        # rfind gives -1 when the marker is missing: keep the whole text then.
        text = text[max(start, 0):]
    if endStr != '':
        stop = text.rfind(endStr)
        if stop != -1:
            text = text[:stop + 1]
    return text
# Traditional -> Simplified Chinese
def hant_2_hans(self, hant_str: str):
    """Convert ``hant_str`` with ``zhconv.convert`` using the ``zh-hans`` target."""
    simplified = zhconv.convert(hant_str, 'zh-hans')
    return simplified
# Digit presence check
def str_have_num(self, str_num):
    """Return True when at least one character of ``str_num`` is a digit.

    Every character is inspected with ``isdigit``; an empty input yields
    False.
    """
    digit_flags = [ch.isdigit() for ch in str_num]
    return any(digit_flags)
# Language detection
def detect_language(self, text):
    """Return the first element of ``langid.classify(text)``.

    Falls back to ``'cn'`` when the classifier result, or its first element,
    is an empty string.
    """
    result = langid.classify(text)
    if result == '' or result[0] == '':
        return 'cn'
    return result[0]
# Append rows to an existing Excel workbook
def writerToExcel(self, detailList, filename):
    """Append ``detailList`` to the rows already stored in ``filename``.

    The existing workbook is read with every column as ``str``, the new rows
    are added after the existing ones, and the combined table is written
    back to the same file without the index column.

    :param detailList: data accepted by ``pandas.DataFrame(data=...)``.
    :param filename: path of an existing ``.xlsx`` file; it is overwritten.
    """
    # Read the workbook that already exists on disk.
    existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
    # Build a frame from the new rows.
    new_data = pd.DataFrame(data=detailList)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the new rows under the existing ones.
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    # Write the combined result back to the same workbook.
    combined_data.to_excel(filename, index=False)
#解析word文件页数
# def doc_page(self,file_path):
# doc = Document(file_path)
# return len(doc.sections)
def secrchATT(self, item_id, file_name, type_id, order_by):
    """Look up an attachment row id in ``clb_sys_attachment``.

    Runs the query on ``self.cursor_`` with the four arguments as bound
    parameters and returns whatever ``fetchone()`` gives back.
    """
    query = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
    params = (item_id, file_name, type_id, order_by)
    cursor = self.cursor_
    cursor.execute(query, params)
    return cursor.fetchone()
# Insert into the attachment table and return the attachment id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
    """Insert one row into ``clb_sys_attachment`` and return ``(id, full_path)``.

    Column values come from ``retData``; ``num`` is stored as ``order_by``,
    ``retData['path']`` is stored both as ``path`` and ``object_key``, and the
    bucket name is the literal ``'zzsn'``. After the commit the row id is
    read back through ``secrchATT``. ``com_name`` is not used.
    """
    wanted = ('item_id', 'type_id', 'group_name', 'path', 'full_path', 'category',
              'file_size', 'status', 'create_by', 'page_size', 'create_time')
    # Read every expected key up front, in the same order as before, so a
    # missing key fails before any SQL is executed.
    field = {key: retData[key] for key in wanted}
    order_by = num
    Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    row = (
        file_name,
        field['type_id'],
        field['item_id'],
        field['group_name'],
        field['path'],
        field['full_path'],
        field['category'],
        field['file_size'],
        order_by,
        field['status'],
        field['create_by'],
        field['create_time'],
        field['path'],
        'zzsn',
        pub_time,
    )
    self.cursor_.execute(Upsql, row)  # insert
    self.cnx_.commit()  # commit
    self.getLogger().info("更新完成:{}".format(Upsql))
    found = self.secrchATT(field['item_id'], file_name, field['type_id'], order_by)
    return found[0], field['full_path']
# Human-readable file size
def convert_size(self, size_bytes):
    """Format a byte count with two decimals and the largest fitting unit.

    The value is divided by 1024 while it is at least 1024, stepping through
    bytes, KB, MB, GB and TB; values beyond the TB range stay expressed in TB.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    value = size_bytes
    for unit in units[:-1]:
        if not (value >= 1024):
            return f"{value:.2f} {unit}"
        value /= 1024
    # Largest unit reached: no further scaling.
    return f"{value:.2f} {units[-1]}"
def uptoOBS(self, file_href, item_id, file_name):
    """Download ``file_href`` and store it in the OBS bucket ``zzsn``.

    The file is fetched with a random User-Agent (up to three attempts) and
    uploaded under ``PolicyDocuments/<file_name>`` (up to three attempts).
    The extension taken from ``file_href`` is appended to ``file_name`` when
    the name does not already contain it.

    Fixes over the previous version:

    * success used to be gated on a local ``page_size`` that was always 0,
      so the success branch was unreachable and every call reported failure;
      the outcome now depends on the download and upload themselves;
    * a failed download/upload no longer leaves ``response``/``result``
      unbound for the following steps;
    * a missing ``Content-Length`` header no longer counts as a download
      failure (the body length is used instead);
    * the extension is no longer appended with a doubled dot
      (``name..pdf``), since ``os.path.splitext`` already keeps the dot;
    * HTTP error responses are treated as failed attempts.

    :return: ``retData`` dict. On success ``state`` is True and ``path``,
        ``full_path``, ``file_size`` and ``create_time`` are filled in;
        otherwise the dict keeps its initial values with ``state`` False.
    """
    category = os.path.splitext(file_href)[1]  # includes the leading dot
    retData = {'state': False, 'type_id': 7, 'item_id': item_id, 'group_name': '', 'path': '',
               'full_path': '',
               'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': self.getRandomUserAgent()}

    # Step 1: download, retrying up to three times.
    response = None
    for _ in range(3):
        try:
            response = requests.get(file_href, headers=headers, verify=False, timeout=20)
            response.raise_for_status()
            break
        except Exception:
            response = None
            time.sleep(3)
    if response is None:
        return retData

    # Prefer the declared size; fall back to the size of the received body.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(response.content)

    if category and category not in file_name:
        file_name = file_name + category

    # Step 2: upload, retrying up to three times.
    result = None
    for _ in range(3):
        try:
            result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
            break
        except Exception:
            result = None
            time.sleep(3)
    if result is None:
        return retData

    # Step 3: fill in the result only once every value could be computed.
    try:
        object_url = result['body']['objectUrl']
        uploaded = {
            'path': object_url.split('.com')[1],
            'full_path': unquote(object_url),
            'file_size': self.convert_size(file_size),
            'create_time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'state': True,
        }
    except Exception as e:
        print(f'error:{e}')
        return retData
    retData.update(uploaded)
    return retData
# _*_ coding:utf-8 _*_
# https://www.cq.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
# Shared helper instance; constructing BaseCore opens its database and Redis connections.
baseCore=BaseCore.BaseCore()
# Logger obtained from BaseCore.getLogger() with its default flags.
log=baseCore.getLogger()
# Redis client exposed by BaseCore as attribute ``r``; used below for URL de-duplication.
rr=baseCore.r
# Crawl the cq.gov.cn search listing for "REITs" documents.
# For each page i in range(1, pagenum) the JSON filter below is handed to
# reqbase.reqPostHtml together with the URL and headers; the reply is parsed
# as JSON and every entry of data['data']['list'] is turned into a detail
# dict. Entries whose doc_pub_url is already in the Redis set 'reis_cqgov' are
# skipped; the others are enriched by paserdetail(), collected in dlist and
# their URL is added to the set. The collected rows are finally passed to
# reqbase.pdwriterXLS.
# NOTE(review): the source has lost its indentation, so the exact nesting of
# the trailing statements (in particular the pdwriterXLS call) cannot be
# confirmed from this view.
def getList():
# Request headers sent with the listing query.
# NOTE(review): Content-Length and the Cookie (SESSION id) are hard-coded
# values — confirm they are still valid / needed.
header={
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Content-Length':'663',
'Content-Type':'application/json',
'Cookie':'SESSION=MGFhMGQxNDItM2MyOS00NjU5LWI2MTgtZjdiM2UxNjFkMGI3; _trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false',
'Host':'www.cq.gov.cn',
'Origin':'https://www.cq.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.cq.gov.cn/zwgk/search.html?DOCTITLE=REITs&DEPT=&gte=&lte=&REFERENCENO=&nh=&number=',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
# Detail dicts collected across all pages.
dlist=[]
# range(1, pagenum) below visits pages 1 .. pagenum-1, i.e. pages 1 and 2.
pagenum=3
for i in range(1,pagenum):
# NOTE(review): the message says 'henan' although the requests target www.cq.gov.cn.
log.info(f'henan采集第{i}页列表')
lurl='https://www.cq.gov.cn/irs/front/list'
# Search filter: either of two fields equals "REITs", combined with a date
# condition on two other fields.
# NOTE(review): the date bound "2023-11-09 16:14:20" is hard-coded.
data={
"customFilter": {
"operator": "and",
"properties": [],
"filters": [
{
"operator": "or",
"properties": [
{
"property": "f_202121500898",
"operator": "eq",
"value": "REITs"
},
{
"property": "f_202142777829",
"operator": "eq",
"value": "REITs"
}
],
"filters": []
},
{
"operator": "or",
"properties": [
{
"property": "f_202146838317",
"operator": "gte",
"value": "2023-11-09 16:14:20"
},
{
"property": "f_202146235090",
"operator": "gte",
"value": "2023-11-09 16:14:20"
}
],
"filters": [
{
"operator": "and",
"properties": [
{
"property": "f_202146838317",
"operator": "eq",
"value": None
},
{
"property": "f_202146235090",
"operator": "eq",
"value": None
}
]
}
]
}
]
},
"sorts": [],
"tableName": "t_1775cd018c6",
"tenantId": "7",
"pageSize": 10,
"pageNo": i
}
lcont=reqbase.reqPostHtml(lurl,header,data)
if lcont:
try:
# From here on ``data`` holds the parsed response, not the request payload.
data=json.loads(lcont)
datas=data['data']['list']
for lmsg in datas:
try:
# Map the listing entry's fields onto the output record.
title=lmsg['f_202121500898']
subtitle=''
summary=lmsg['f_202142777829']
createDate=''
writeDate=''
pubDate=lmsg['save_time']
source=lmsg['f_202121437464']
durl=lmsg['doc_pub_url']
wenjianhao=lmsg['f_202121837479']
suoyihao=lmsg['f_202121273539']
content=''
siteweb='重庆市人民政府'
except Exception as e:
# Entries missing any expected field are skipped without logging.
continue
detailmsg={
'title':title,
'subtitle':subtitle,
'summary':summary,
'createDate':createDate,
'writeDate':writeDate,
'pubDate':pubDate,
'source':source,
'durl':durl,
'content':content,
'siteweb':siteweb,
'wenjianhao':wenjianhao,
'suoyihao':suoyihao,
}
# Skip URLs already recorded in the Redis set.
is_member = rr.sismember('reis_cqgov', durl)
if is_member:
continue
detailmsg=paserdetail(detailmsg)
dlist.append(detailmsg)
# Record the URL as processed.
rr.sadd('reis_cqgov',durl)
except Exception as e:
log.info(f'列表解析异常{e}')
# Hand the collected rows to the spreadsheet writer.
reqbase.pdwriterXLS(dlist,'重庆市人民政府')
def paserdetail(detailmsg):
    """Download one Chongqing detail page and attach its body to the record.

    The page at ``detailmsg['durl']`` is requested with browser-like headers
    and up to three CSS selectors are tried in turn until one yields text.
    The matched node is stored under ``contentWithTag`` and its text under
    ``content``. Any exception raised while parsing is printed and swallowed,
    so the record is returned either way.

    :param detailmsg: dict describing one list entry; must contain ``durl``.
    :return: the same dict object, updated in place when parsing succeeded.
    """
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '_trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false; _trs_user=',
        'Host': 'www.cq.gov.cn',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    durl = detailmsg['durl']
    # The request itself is made outside the try block, as in the original.
    page_html = reqbase.reqGetHtml(durl, request_headers)
    try:
        log.info(f'详情请求地址:{durl}')
        parsed = BeautifulSoup(page_html, 'html.parser')
        # NOTE(review): paserUrl presumably rewrites relative links against
        # durl and returns a soup-like object - confirm in reqbase.
        parsed = reqbase.paserUrl(str(parsed), durl)
        body_tag, body_text = soupPaserHtml(parsed, 'div[class="zcwjk-xlcon"]')
        if not body_text:
            # First layout missing: try the second known container.
            body_tag, body_text = soupPaserHtml(parsed, 'div[class="document mt-1 mt-12"]')
            if not body_text:
                log.info(f'详情内容为空:{durl}')
                # Last resort: the editor-generated container.
                body_tag, body_text = soupPaserHtml(parsed, 'div[class="view TRS_UEDITOR trs_paper_default trs_word"]')
        detailmsg['contentWithTag'] = body_tag
        detailmsg['content'] = body_text
    except Exception as e:
        print(f'详情解析异常{e}')
    return detailmsg
def soupPaserHtml(soup, csstag):
    """Return the first node matching a CSS selector together with its text.

    :param soup: object exposing ``select(css)``.
    :param csstag: CSS selector string.
    :return: ``(node, node.text)`` for the first match; ``('', '')`` when the
        lookup raises (for example when nothing matches), in which case the
        exception is logged.
    """
    found_tag = ''
    found_text = ''
    try:
        first_match = soup.select(csstag)[0]
        first_text = first_match.text
    except Exception as e:
        log.info(f'标签解析异常{e}')
    else:
        # Only publish the pair once both lookups succeeded.
        found_tag, found_text = first_match, first_text
    return found_tag, found_text
# Script entry point: run the list crawl when the file is executed directly.
if __name__ == "__main__":
    getList()
# _*_ coding:utf-8 _*_
# https://www.gd.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
# Shared project helper instance; the logger and the `r` handle below come from it.
baseCore=BaseCore.BaseCore()
# Module-wide logger used by every function in this script.
log=baseCore.getLogger()
# Handle used with sismember/sadd for URL de-duplication
# (presumably a Redis client - confirm in BaseCore).
rr=baseCore.r
def getList():
    """Crawl the Guangdong government file search for "REITs" and export it.

    Each result page is requested by POSTing to the search API. Every hit is
    mapped to a record dict; hits whose URL is already in the ``reis_gdgov``
    set behind ``rr`` are skipped, the rest are enriched by ``paserdetail``
    and their URL is then recorded in that set. All collected records are
    finally handed to ``reqbase.pdwriterXLS`` under the name
    ``广东省人民政府-政策文件``.

    A failure while decoding or walking one page is logged and abandons the
    remainder of that page only.
    """
    # NOTE(review): Cookie, XSRF token and Content-Length are copied from a
    # browser capture; they may expire or mismatch the real body - confirm
    # how reqbase.reqPostHtml handles them.
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '147',
        'Content-Type': 'application/json',
        'Cookie': 'cmssearch_session=ppktmj4Iyt337q8j2yiQBGvfwpRlLe44ifEPusj2; SEARCH_LIST=%5B%22REITS%22%2C%22REITs%22%5D; XSRF-TOKEN=eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'Host': 'search.gd.gov.cn',
        'Origin': 'https://search.gd.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://search.gd.gov.cn/search/file/2?page=1&position=all&keywords=REITs&filterType=localSite&filterId=undefined',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-XSRF-TOKEN': 'eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    dlist = []
    pagenum = 3
    lurl = 'https://search.gd.gov.cn/api/search/file'
    # range(1, pagenum) visits pages 1 .. pagenum-1.
    for i in range(1, pagenum):
        # The label used to read "henan", copied from another crawler.
        log.info(f'广东采集第{i}页列表')
        payload = {
            "page": str(i),
            "position": "all",
            "keywords": "REITs",
            "sort": "smart",
            "site_id": "2",
            "range": "site",
            "recommand": 1,
            "gdbsDivision": "440000",
            "service_area": 1,
        }
        lcont = reqbase.reqPostHtml(lurl, header, payload)
        if not lcont:
            continue
        try:
            hits = json.loads(lcont)['data']['list']
            for lmsg in hits:
                detailmsg = {
                    'title': lmsg['title'],
                    'subtitle': '',
                    'summary': lmsg['content'],
                    'createDate': '',
                    'writeDate': '',
                    'pubDate': lmsg['pub_time'],
                    'source': lmsg['source'],
                    'durl': lmsg['url'],
                    'content': '',
                    'siteweb': '广东省人民政府',
                    'docNumberStr': lmsg['document_number'],
                    'reNum': lmsg['identifier'],
                }
                durl = detailmsg['durl']
                # Skip URLs that an earlier run already collected.
                if rr.sismember('reis_gdgov', durl):
                    continue
                detailmsg = paserdetail(detailmsg)
                dlist.append(detailmsg)
                rr.sadd('reis_gdgov', durl)
        except Exception as e:
            log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(dlist, '广东省人民政府-政策文件')
def getList2():
    """Crawl Guangdong "政策解读" search hits for "REITs" and export them.

    Same flow as the plain file search, but the request payload carries the
    ``政策解读`` label/tag. Hits already present in the ``reis_gdgov`` set
    behind ``rr`` are skipped; new ones are enriched by ``paserdetail`` and
    recorded. The collected records are passed to ``reqbase.pdwriterXLS``
    under the name ``广东省人民政府-政策解读``.

    A failure while decoding or walking one page is logged and abandons the
    remainder of that page only.
    """
    # NOTE(review): Cookie, XSRF token and Content-Length are copied from a
    # browser capture; they may expire or mismatch the real body - confirm
    # how reqbase.reqPostHtml handles them.
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '147',
        'Content-Type': 'application/json',
        'Cookie': 'cmssearch_session=ppktmj4Iyt337q8j2yiQBGvfwpRlLe44ifEPusj2; SEARCH_LIST=%5B%22REITS%22%2C%22REITs%22%5D; XSRF-TOKEN=eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'Host': 'search.gd.gov.cn',
        'Origin': 'https://search.gd.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://search.gd.gov.cn/search/file/2?page=1&position=all&keywords=REITs&filterType=localSite&filterId=undefined',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-XSRF-TOKEN': 'eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    dlist = []
    pagenum = 3
    lurl = 'https://search.gd.gov.cn/api/search/file'
    # range(1, pagenum) visits pages 1 .. pagenum-1.
    for i in range(1, pagenum):
        # The label used to read "henan", copied from another crawler.
        log.info(f'广东采集第{i}页列表')
        payload = {
            "label": "政策解读",
            "position": "all",
            "keywords": "REITs",
            "sort": "smart",
            "site_id": "2",
            "range": "site",
            "page": i,
            "tag_name": "政策解读",
            "recommand": 1,
            "gdbsDivision": "440000",
            "service_area": 1,
        }
        lcont = reqbase.reqPostHtml(lurl, header, payload)
        if not lcont:
            continue
        try:
            hits = json.loads(lcont)['data']['list']
            for lmsg in hits:
                detailmsg = {
                    'title': lmsg['title'],
                    'subtitle': '',
                    'summary': lmsg['content'],
                    'createDate': '',
                    'writeDate': '',
                    'pubDate': lmsg['pub_time'],
                    'source': lmsg['source'],
                    'durl': lmsg['url'],
                    'content': '',
                    'siteweb': '广东省人民政府',
                    'docNumberStr': lmsg['document_number'],
                    'reNum': lmsg['identifier'],
                }
                durl = detailmsg['durl']
                # Skip URLs that an earlier run already collected.
                if rr.sismember('reis_gdgov', durl):
                    continue
                detailmsg = paserdetail(detailmsg)
                dlist.append(detailmsg)
                rr.sadd('reis_gdgov', durl)
        except Exception as e:
            log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(dlist, '广东省人民政府-政策解读')
def getList3():
    """Crawl Guangdong "计划规划" search hits for "REITs" and export them.

    Same flow as the plain file search, but the request payload carries the
    ``计划规划`` label/tag and only one page is requested. Hits already
    present in the ``reis_gdgov`` set behind ``rr`` are skipped; new ones are
    enriched by ``paserdetail`` and recorded. The collected records are
    passed to ``reqbase.pdwriterXLS`` under the name
    ``广东省人民政府-计划规划``.

    A failure while decoding or walking one page is logged and abandons the
    remainder of that page only.
    """
    # NOTE(review): Cookie, XSRF token and Content-Length are copied from a
    # browser capture; they may expire or mismatch the real body - confirm
    # how reqbase.reqPostHtml handles them.
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '147',
        'Content-Type': 'application/json',
        'Cookie': 'cmssearch_session=ppktmj4Iyt337q8j2yiQBGvfwpRlLe44ifEPusj2; SEARCH_LIST=%5B%22REITS%22%2C%22REITs%22%5D; XSRF-TOKEN=eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'Host': 'search.gd.gov.cn',
        'Origin': 'https://search.gd.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://search.gd.gov.cn/search/file/2?page=1&position=all&keywords=REITs&filterType=localSite&filterId=undefined',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-XSRF-TOKEN': 'eyJpdiI6IjN3OFQ4XC9cL21vNkV2ZWJIMzNLZ29wQT09IiwidmFsdWUiOiIyYXpXa1dtMkJmdmhGdDBvSU9jRGZ4XC9UXC9QS3F4Kzh2ZWdQMTVcL09kMnBDbTl1a0FIUHR5VjRmTlpuVW9RWW01IiwibWFjIjoiMWJmOGUxMjEzZTUxNTFlYTFhMjZkYThiNzMzODEwZmYzYzA5OTAzN2ViNGEwYTBhYjlmNmJlYjk2NzdmODkzMSJ9',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    dlist = []
    pagenum = 2
    lurl = 'https://search.gd.gov.cn/api/search/file'
    # range(1, pagenum) visits pages 1 .. pagenum-1, i.e. only page 1 here.
    for i in range(1, pagenum):
        # The label used to read "henan", copied from another crawler.
        log.info(f'广东采集第{i}页列表')
        payload = {
            "label": "计划规划",
            "position": "all",
            "keywords": "REITs",
            "sort": "smart",
            "site_id": "2",
            "range": "site",
            "page": i,
            "tag_name": "计划规划",
            "recommand": 1,
            "gdbsDivision": "440000",
            "service_area": 1,
        }
        lcont = reqbase.reqPostHtml(lurl, header, payload)
        if not lcont:
            continue
        try:
            hits = json.loads(lcont)['data']['list']
            for lmsg in hits:
                detailmsg = {
                    'title': lmsg['title'],
                    'subtitle': '',
                    'summary': lmsg['content'],
                    'createDate': '',
                    'writeDate': '',
                    'pubDate': lmsg['pub_time'],
                    'source': lmsg['source'],
                    'durl': lmsg['url'],
                    'content': '',
                    'siteweb': '广东省人民政府',
                    'docNumberStr': lmsg['document_number'],
                    'reNum': lmsg['identifier'],
                }
                durl = detailmsg['durl']
                # Skip URLs that an earlier run already collected.
                if rr.sismember('reis_gdgov', durl):
                    continue
                detailmsg = paserdetail(detailmsg)
                dlist.append(detailmsg)
                rr.sadd('reis_gdgov', durl)
        except Exception as e:
            log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(dlist, '广东省人民政府-计划规划')
def paserdetail(detailmsg):
    """Download one Guangdong detail page and attach body and metadata.

    The page at ``detailmsg['durl']`` is requested, three metadata cells are
    read from the ``classify`` table, and the article body is taken from
    ``div.article-content`` with ``div.zw`` as fallback. Results are stored
    under ``contentWithTag``, ``content``, ``class_type``, ``pub_jigou`` and
    ``write_data``. Any exception raised while parsing is printed and
    swallowed.

    :param detailmsg: dict describing one list entry; must contain ``durl``.
    :return: the same dict object, updated in place when parsing succeeded.
    """
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'gkmlfront_session=eyJpdiI6InRLSFR6VDc2bVRFcDY5ZlN1aW9yWnc9PSIsInZhbHVlIjoiaU9pdGVDWkxMRitQWFNJK2dEbVdVRDg5bXo2RDg5SEhMSTZWaWpYRGh1XC83NzJEVDFaNFQyUjhHWk5MVUFQTVIiLCJtYWMiOiJiNjdmZmQ1YTY3ZGE4OGQwZWI4OTJiZjQ2NDRjMTJjZjhlNjJiODIzZjMxY2Q2ODFhZGRlMWMyODI0YmMyZTI0In0%3D; front_uc_session=eyJpdiI6ImhaZ3E3VmxkUnFwT0hjbUw4cSs4d3c9PSIsInZhbHVlIjoiTlhhME1jWkVXTUtzK285cmYzWHlSZDV2c3p6M2ZIaEI5NjVJZnVwMDBsVmFXMDI5MWk3bGU0b0NSZHA1WGZobSIsIm1hYyI6IjZjZTMzOThhMGRjNWFjZDdmMzA2Njc0N2UxNThlOGQ0ZDU5OWJjMGIxOTY5ZGRjOWYzZDczZTk4OTFjMTBkNzYifQ%3D%3D',
        'Host': 'www.gd.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gd.gov.cn/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    }
    durl = detailmsg['durl']
    # The request itself is made outside the try block, as in the original.
    page_html = reqbase.reqGetHtml(durl, request_headers)
    try:
        parsed = BeautifulSoup(page_html, 'html.parser')
        parsed = reqbase.paserUrl(str(parsed), durl)
        # NOTE(review): these selectors address td as a direct child of tbody
        # (no tr step), unlike the disabled selectors below - confirm against
        # the live markup.
        class_type = soupPaserHtml(parsed, 'div[class="classify"]>table>tbody:nth-child(1)>td:nth-child(4)')[1]
        pub_jigou = soupPaserHtml(parsed, 'div[class="classify"]>table>tbody:nth-child(2)>td:nth-child(2)')[1]
        write_data = soupPaserHtml(parsed, 'div[class="classify"]>table>tbody:nth-child(2)>td:nth-child(4)')[1]
        # Disabled in the original:
        # file_num=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(2)')[1]
        # pub_data=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(4)')[1]
        body_tag, body_text = soupPaserHtml(parsed, 'div[class="article-content"]')
        if not body_text:
            # Alternative page layout.
            body_tag, body_text = soupPaserHtml(parsed, 'div[class="zw"]')
        detailmsg.update({
            'contentWithTag': body_tag,
            'content': body_text,
            'class_type': class_type,
            'pub_jigou': pub_jigou,
            'write_data': write_data,
        })
    except Exception as e:
        print(f'详情解析异常{e}')
    return detailmsg
def soupPaserHtml(soup, csstag):
    """Return the first node matching a CSS selector together with its text.

    :param soup: object exposing ``select(css)``.
    :param csstag: CSS selector string.
    :return: ``(node, node.text)`` for the first match; ``('', '')`` when the
        lookup raises (for example when nothing matches), in which case the
        exception is logged.
    """
    found_tag = ''
    found_text = ''
    try:
        first_match = soup.select(csstag)[0]
        first_text = first_match.text
    except Exception as e:
        log.info(f'标签解析异常{e}')
    else:
        # Only publish the pair once both lookups succeeded.
        found_tag, found_text = first_match, first_text
    return found_tag, found_text
# Script entry point. Only getList3 is enabled; the other two crawls are
# left commented out, as in the original.
if __name__ == "__main__":
    # getList()
    # getList2()
    getList3()
# _*_ coding:utf-8 _*_
# http://www.gxzf.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
# Shared project helper instance; the logger and the `r` handle below come from it.
baseCore=BaseCore.BaseCore()
# Module-wide logger used by every function in this script.
log=baseCore.getLogger()
# Handle used with sismember/sadd for URL de-duplication
# (presumably a Redis client - confirm in BaseCore).
rr=baseCore.r
def getList():
    """Crawl the Guangxi government search for "REiTs" and export the hits.

    Pages 1 .. pagenum-1 are requested by POSTing to the search endpoint.
    Each hit's ``data`` object is mapped to a record dict; hits whose URL is
    already in the ``reis_gxgov`` set behind ``rr`` are skipped, the others
    are enriched by ``paserdetail`` and recorded. All records are finally
    handed to ``reqbase.pdwriterXLS`` under ``广西壮族自治区人民政府``.

    A failure while decoding or walking one page is logged and abandons the
    remainder of that page only.
    """
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '385',
        'Content-Type': 'application/json',
        'Cookie': 'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699513657; SEARCHHISTORY=[%22REiTs%22]',
        'Host': 'www.gxzf.gov.cn',
        'Origin': 'http://www.gxzf.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    }
    collected = []
    pagenum = 5
    lurl = 'http://www.gxzf.gov.cn/irs/front/search'
    for i in range(1, pagenum):
        log.info(f'采集第{i}页列表')
        payload = {
            "code": "181aedaa542",
            "dataTypeId": "241",
            "configCode": "",
            "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7",
            "searchWord": "REiTs",
            "orderBy": "related",
            "searchBy": "all",
            "appendixType": "",
            "granularity": "ALL",
            "isSearchForced": "0",
            "filters": [],
            "pageNo": i,
            "pageSize": 10,
            "isAdvancedSearch": None,
            "isDefaultAdvanced": None,
            "advancedFilters": None,
            # Key with a trailing space is kept exactly as in the original.
            "advancedFilters ": None,
            "historySearchWords": [
                "REiTs"
            ],
        }
        lcont = reqbase.reqPostHtml(lurl, header, payload)
        if not lcont:
            continue
        try:
            wrappers = json.loads(lcont)['data']['middle']['listAndBox']
            for wrapper in wrappers:
                hit = wrapper['data']
                record = {
                    'title': hit['title'],
                    'subtitle': '',
                    'summary': hit['table-7'],
                    'createDate': '',
                    'writeDate': hit['table-4'],
                    'pubDate': hit['time'],
                    'source': hit['table-3'],
                    'durl': hit['url'],
                    'content': '',
                    'siteweb': '广西壮族自治区人民政府',
                    'docNumberStr': hit['table-5'],
                    'reNum': hit['table-1'],
                }
                durl = record['durl']
                # Skip URLs that an earlier run already collected.
                if rr.sismember('reis_gxgov', durl):
                    continue
                record = paserdetail(record)
                collected.append(record)
                rr.sadd('reis_gxgov', durl)
        except Exception as e:
            log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(collected, '广西壮族自治区人民政府')
def paserdetail(detailmsg):
    """Download one Guangxi detail page and attach body and metadata.

    The page at ``detailmsg['durl']`` is requested, two cells of the
    ``people-desc`` table are read, and the article body is taken from
    ``div.article-con`` with ``div.zw`` as fallback. Results are stored under
    ``contentWithTag``, ``content``, ``pub_jigou`` and ``write_data``. Any
    exception raised while parsing is printed and swallowed.

    :param detailmsg: dict describing one list entry; must contain ``durl``.
    :return: the same dict object, updated in place when parsing succeeded.
    """
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; SEARCHHISTORY=[%22REiTs%22]; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699514234',
        'Host': 'www.gxzf.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    }
    durl = detailmsg['durl']
    # The request itself is made outside the try block, as in the original.
    page_html = reqbase.reqGetHtml(durl, request_headers)
    try:
        parsed = BeautifulSoup(page_html, 'html.parser')
        parsed = reqbase.paserUrl(str(parsed), durl)
        # Disabled in the original:
        # class_type=soupPaserHtml(soup,'div[class="people-desc"]>table>tbody>tr:nth-child(1)>td:nth-child(4)')[1]
        pub_jigou = soupPaserHtml(parsed, 'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(1)')[1]
        write_data = soupPaserHtml(parsed, 'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(2)')[1]
        # Disabled in the original:
        # file_num=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(2)')[1]
        # pub_data=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(4)')[1]
        body_tag, body_text = soupPaserHtml(parsed, 'div[class="article-con"]')
        if not body_text:
            # Alternative page layout.
            body_tag, body_text = soupPaserHtml(parsed, 'div[class="zw"]')
        detailmsg.update({
            'contentWithTag': body_tag,
            'content': body_text,
            'pub_jigou': pub_jigou,
            'write_data': write_data,
        })
    except Exception as e:
        print(f'详情解析异常{e}')
    return detailmsg
def soupPaserHtml(soup, csstag):
    """Return the first node matching a CSS selector together with its text.

    :param soup: object exposing ``select(css)``.
    :param csstag: CSS selector string.
    :return: ``(node, node.text)`` for the first match; ``('', '')`` when the
        lookup raises (for example when nothing matches), in which case the
        exception is logged.
    """
    found_tag = ''
    found_text = ''
    try:
        first_match = soup.select(csstag)[0]
        first_text = first_match.text
    except Exception as e:
        log.info(f'标签解析异常{e}')
    else:
        # Only publish the pair once both lookups succeeded.
        found_tag, found_text = first_match, first_text
    return found_tag, found_text
# Script entry point: run the list crawl when the file is executed directly.
if __name__ == "__main__":
    getList()
# _*_ coding:utf-8 _*_
# https://www.hainan.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
# Shared project helper instance; the logger and the `r` handle below come from it.
baseCore=BaseCore.BaseCore()
# Module-wide logger used by every function in this script.
log=baseCore.getLogger()
# Handle used with sismember/sadd for URL de-duplication
# (presumably a Redis client - confirm in BaseCore).
rr=baseCore.r
def getList():
    """Crawl the Hainan government HTML search for "REITs" and export it.

    The result pages with ``pageNum`` 1 .. pagenum-1 are fetched with GET and
    every child ``div`` of ``#showPage`` is treated as one hit. Hits without
    a usable link are skipped, as are URLs already in the ``reis_hainangov``
    set behind ``rr``; the rest are enriched by ``paserdetail`` and recorded.
    All records are finally handed to ``reqbase.pdwriterXLS`` under
    ``海南省人民政府``.

    A failure while parsing one page is logged and abandons the remainder of
    that page only.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153159&searchUseTime-349',
        'Host': 'www.hainan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    collected = []
    pagenum = 5
    # NOTE(review): the Referer above uses pageNum=0, so the site may be
    # zero-indexed and this loop may skip the first page - confirm.
    for i in range(1, pagenum):
        log.info(f'采集第{i}页列表')
        lurl = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={i}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
        lcont = reqbase.reqGetHtml(lurl, header)
        if not lcont:
            continue
        try:
            listing = BeautifulSoup(lcont, 'html.parser')
            listing = reqbase.paserUrl(str(listing), lurl)
            for entry in listing.select('div[id="showPage"]>div'):
                title = soupPaserHtml(entry, 'h3>a')[1]
                pub_date = soupPaserHtml(entry, 'span[class="quily-con"]')[1]
                source = soupPaserHtml(entry, 'a[class="address-con permitU"]')[1]
                try:
                    # soupPaserHtml yields '' when there is no link, so .get
                    # raises and the entry is skipped.
                    durl = soupPaserHtml(entry, 'h3>a')[0].get('href')
                except Exception as e:
                    durl = ''
                    continue
                record = {
                    'title': title,
                    'subtitle': '',
                    'summary': '',
                    'createDate': '',
                    'writeDate': '',
                    'pubDate': pub_date,
                    'source': source,
                    'durl': durl,
                    'content': '',
                    'siteweb': '海南省人民政府',
                    'docNumberStr': '',
                    'reNum': '',
                }
                # Skip URLs that an earlier run already collected.
                if rr.sismember('reis_hainangov', durl):
                    continue
                record = paserdetail(record)
                collected.append(record)
                rr.sadd('reis_hainangov', durl)
        except Exception as e:
            log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(collected, '海南省人民政府')
def paserdetail(detailmsg):
    """Download one Hainan detail page and attach body and metadata.

    The page at ``detailmsg['durl']`` is requested, six spans of the
    ``zwgk_comr1`` list are read, and the article body is taken from
    ``div#zoom`` with ``div.zw`` as fallback. Results are stored under
    ``contentWithTag``, ``content``, ``souyihao``, ``fenlei``,
    ``fawenjiguan``, ``wenhao``, ``pub_data`` and ``write_data``. Any
    exception raised while parsing is printed and swallowed.

    :param detailmsg: dict describing one list entry; must contain ``durl``.
    :return: the same dict object, updated in place when parsing succeeded.
    """
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'HttpOnly=true; 4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153247&searchUseTime-337; HA_STICKY_apps=apps.srv34; Hm_lvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515700; yfx_c_g_u_id_10005682=_ck23110915414012919174127333485; yfx_f_l_v_t_10005682=f_t_1699515700292__r_t_1699515700292__v_t_1699515700292__r_c_0; _trs_uv=loqvrn5x_4549_5u3r; _trs_ua_s_1=loqvrn5x_4549_1lnl; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515718',
        'Host': 'www.hainan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=1&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    durl = detailmsg['durl']
    # The request itself is made outside the try block, as in the original.
    page_html = reqbase.reqGetHtml(durl, request_headers)
    try:
        log.info(f'解析详情地址:{durl}')
        parsed = BeautifulSoup(page_html, 'html.parser')
        parsed = reqbase.paserUrl(str(parsed), durl)
        # Metadata spans, read in the same order as the original.
        souyihao = soupPaserHtml(parsed, 'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(1)')[1]
        fenlei = soupPaserHtml(parsed, 'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(2)')[1]
        fawenjiguan = soupPaserHtml(parsed, 'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(1)')[1]
        write_data = soupPaserHtml(parsed, 'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(2)')[1]
        wenhao = soupPaserHtml(parsed, 'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(1)')[1]
        pub_data = soupPaserHtml(parsed, 'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(2)')[1]
        body_tag, body_text = soupPaserHtml(parsed, 'div[id="zoom"]')
        if not body_text:
            # Alternative page layout.
            body_tag, body_text = soupPaserHtml(parsed, 'div[class="zw"]')
        detailmsg.update({
            'contentWithTag': body_tag,
            'content': body_text,
            'souyihao': souyihao,
            'fenlei': fenlei,
            'fawenjiguan': fawenjiguan,
            'wenhao': wenhao,
            'pub_data': pub_data,
            'write_data': write_data,
        })
    except Exception as e:
        print(f'详情解析异常{e}')
    return detailmsg
def soupPaserHtml(soup, csstag):
    """Return the first node matching a CSS selector together with its text.

    :param soup: object exposing ``select(css)``.
    :param csstag: CSS selector string.
    :return: ``(node, node.text)`` for the first match; ``('', '')`` when the
        lookup raises (for example when nothing matches), in which case the
        exception is logged.
    """
    found_tag = ''
    found_text = ''
    try:
        first_match = soup.select(csstag)[0]
        first_text = first_match.text
    except Exception as e:
        log.info(f'标签解析异常{e}')
    else:
        # Only publish the pair once both lookups succeeded.
        found_tag, found_text = first_match, first_text
    return found_tag, found_text
# Script entry point: run the list crawl when the file is executed directly.
if __name__ == "__main__":
    getList()
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
# Shared project helper instance; the logger and the `r` handle below come from it.
baseCore=BaseCore.BaseCore()
# Module-wide logger used by every function in this script.
log=baseCore.getLogger()
# Handle used with sismember/sadd for URL de-duplication
# (presumably a Redis client - confirm in BaseCore).
rr=baseCore.r
def getList():
    """Crawl the Henan government search API and export the hits.

    Pages 1 .. pagenum-1 of the JSON search API are fetched with GET. Each
    hit is mapped to a record dict; URLs already in the ``reis_henangov`` set
    behind ``rr`` are skipped, the others are passed to ``paserdetail`` (which
    updates the record in place) and recorded. All records are finally handed
    to ``reqbase.pdwriterXLS`` under ``河南省人民政府``.

    Any exception raised while handling a page, including one raised by
    ``paserdetail``, is printed and abandons the remainder of that page.
    """
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'searchapi.henan.gov.cn',
        'Origin': 'https://www.henan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.henan.gov.cn/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    collected = []
    pagenum = 10
    for i in range(1, pagenum):
        log.info(f'henan采集第{i}页列表')
        lurl = f'https://searchapi.henan.gov.cn/open/api/external?keywords=&siteId=4500000001&allKeyword=&anyKeyword=&noKeyword=&searchRange=-1000&sortType=200&beginTime=&endTime=&pageNumber={i}&pageSize=15&fileType=3&channelMarkId=45000000010115416542055691'
        lcont = reqbase.reqGetHtml(lurl, header)
        if not lcont:
            continue
        try:
            hits = json.loads(lcont)['data']['datas']
            for hit in hits:
                # Fields are read in the original order so that a missing key
                # surfaces the same KeyError as before.
                title = hit['title']
                subtitle = hit['subtitle']
                summary = hit['summary']
                create_date = hit['createDate']
                write_date = hit['writeDate']
                pub_date = hit['pubDate']
                source = hit['source']
                durl = hit['selfUrl']
                doc_number = hit['docNumberStr']
                re_num = hit['reNum']
                list_content = hit['content']
                record = {
                    'title': title,
                    'subtitle': subtitle,
                    'summary': summary,
                    'createDate': create_date,
                    'writeDate': write_date,
                    'pubDate': pub_date,
                    'source': source,
                    'durl': durl,
                    'content': list_content,
                    'siteweb': '河南省人民政府',
                    'docNumberStr': doc_number,
                    'reNum': re_num,
                }
                # Skip URLs that an earlier run already collected.
                if rr.sismember('reis_henangov', durl):
                    continue
                # paserdetail mutates the record; its return value is unused.
                paserdetail(record)
                collected.append(record)
                rr.sadd('reis_henangov', durl)
        except Exception as e:
            print(f'请求异常{e}-异常页码{i}')
    reqbase.pdwriterXLS(collected, '河南省人民政府')
def paserdetail(detailmsg):
    """Download one Henan detail page and attach its body to the record.

    The page at ``detailmsg['durl']`` is requested and the first
    ``div#content`` node is stored under ``contentWithTag`` with its text
    under ``content``.

    Unlike the sibling crawlers this function has no error handling of its
    own: a missing node (``IndexError``) or an unparsable response propagates
    to the caller.

    :param detailmsg: dict describing one list entry; must contain ``durl``.
    :return: the same dict object, updated in place.
    """
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'zh_choose=n; yfx_c_g_u_id_10000001=_ck23110818022219777515353379336; yfx_f_l_v_t_10000001=f_t_1699437742968__r_t_1699437742968__v_t_1699437742968__r_c_0',
        'Host': 'www.henan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.henan.gov.cn/zwgk/fgwj/szfl/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    durl = detailmsg['durl']
    page_html = reqbase.reqGetHtml(durl, request_headers)
    parsed = BeautifulSoup(page_html, 'html.parser')
    parsed = reqbase.paserUrl(str(parsed), durl)
    body_tag = parsed.select('div[id="content"]')[0]
    # Both values are computed before the record is touched: tagged node
    # first, plain text second.
    detailmsg.update(contentWithTag=body_tag, content=body_tag.text)
    return detailmsg
# Script entry point: run the list crawl when the file is executed directly.
if __name__ == "__main__":
    getList()
# _*_ coding:utf-8 _*_
# http://www.hubei.gov.cn/
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium
"""
import json
import redis
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import reqbase
import BaseCore
# Shared project helper instance; the logger and the `r` handle below come from it.
baseCore=BaseCore.BaseCore()
# Module-wide logger used by every function in this script.
log=baseCore.getLogger()
# Handle used with sismember/sadd for URL de-duplication
# (presumably a Redis client - confirm in BaseCore).
rr=baseCore.r
def getList():
    """Crawl the Hubei government search for "REITS" and export the hits.

    Pages 1 .. pagenum-1 of the JSON search endpoint are fetched with GET.
    Each hit is mapped to a record dict; URLs already in the
    ``reis_hubeigov`` set behind ``rr`` are skipped, the others are passed to
    ``paserdetail`` (its return value is not used) and recorded. All records
    are finally handed to ``reqbase.pdwriterXLS`` under ``湖北省人民政府``.

    Any exception raised while handling a page is printed and abandons the
    remainder of that page only.
    """
    # NOTE(review): the Cookie and the opaque trailing URL parameter are
    # copied from a browser capture and may expire - confirm before relying
    # on them.
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'JSESSIONID=B970564BAAD37BB8E9EF19F78FF45618; Hm_lvt_5544783ae3e1427d6972d9e77268f25d=1699492684; token=43cfe913-ec04-4aa3-b037-96903ccaa188; uuid=43cfe913-ec04-4aa3-b037-96903ccaa188; Hm_lpvt_5544783ae3e1427d6972d9e77268f25d=1699492955; 924omrTVcFchT=0IZtGVa8M20F2wJXy_C6l9PPFZOO1SBDdB3qZtsaLbLGaQ5t4l6Vt8HF9dIwhxtBcLdkdRZwlK42NCaEUjZZoPsXZAZ1o.tgK50mj8FJZTM5zCxcVg3w4cOCSM4BvYApzj7YMWHycK14.NY6Y.AP6bW6g0jDIqZlbp2hKSpDfZYBhjsgwJJraXKf2S4sgG6swjXFVVUHGngt2GMQPUZQRsE0_tL9Pz3_h6JeSD9qHWLOVKJWz0z8hdC_F4kiGZj2FRjjSUZp0VLUS8pjkrJdGYrKhC5xwGy8xSFYBE_trVuCFjr8.vhLqBONYkoWvZM2qNX_WZXg_3wTLMqCMrjoCmkvHf7B9.MMVu8tMC4hwDT4wjeyoNoRjIlgKuwE.aGhn',
        'Host': 'www.hubei.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.hubei.gov.cn/site/hubei/search.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    }
    dlist = []
    pagenum = 3
    # range(1, pagenum) visits pages 1 .. pagenum-1.
    for i in range(1, pagenum):
        log.info(f'湖北采集第{i}页列表')
        lurl = f'http://www.hubei.gov.cn/igs/front/search.jhtml?position=&timeOrder=&code=872801132c71495bbe5a938f6acff5aa&orderBy=all&pageSize=10&type=%E6%96%87%E4%BB%B6&time=&chnldesc=&pageNumber={i}&aggrFieldName=chnldesc&sortByFocus=true&siteId=50&name=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitename=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitetype=%E7%9C%81%E6%94%BF%E5%BA%9C&searchWord=REITS&6LDjm9Ls=0t3_jtGlqEJQLVtYPg5o4LE8KRsDcOrdhcQJ2gpgbWwP9rQyfChv7ADuy_hXWgy2abOG9jq8_hKyrFekh7IWmLmb9VBbEQh7tULy0_6L3zqkGOSoDWEcli5Ympa58KVMviSIxe_LiYGE'
        lcont = reqbase.reqGetHtml(lurl, header)
        if not lcont:
            continue
        try:
            hits = json.loads(lcont)['page']['content']
            for lmsg in hits:
                detailmsg = {
                    'title': lmsg['DOCTITLE'],
                    'subtitle': lmsg['FileName'],
                    'summary': lmsg['DOCCONTENT'],
                    'createDate': '',
                    'writeDate': '',
                    'pubDate': lmsg['PUBDATE'],
                    'source': lmsg['publisher'],
                    'durl': lmsg['url'],
                    'content': '',
                    'siteweb': '湖北省人民政府',
                    'docNumberStr': '',
                    'reNum': lmsg['IdxID'],
                }
                durl = detailmsg['durl']
                # Skip URLs that an earlier run already collected.
                if rr.sismember('reis_hubeigov', durl):
                    continue
                paserdetail(detailmsg)
                dlist.append(detailmsg)
                rr.sadd('reis_hubeigov', durl)
        except Exception as e:
            print(f'请求异常{e}-异常页码{i}')
    # The export name used to read 河南省人民政府 (Henan), copied from the
    # Henan crawler; these records belong to Hubei.
    reqbase.pdwriterXLS(dlist, '湖北省人民政府')
def paserdetail(detailmsg):
    """Download a Hubei government detail page and merge its fields into ``detailmsg``.

    Args:
        detailmsg: Record dict built from the list API. Its ``'durl'`` entry is
            the detail-page URL to fetch.

    Returns:
        The same dict, mutated in place. When the page was downloaded it
        additionally carries ``contentWithTag``, ``content``, ``idx_num``,
        ``class_type``, ``dplay_gov``, ``pub_date`` and ``fileNum``. When the
        download returned nothing, or an exception occurred, the dict is
        returned as far as it had been filled.
    """
    headers = {
        'Host': 'www.hubei.gov.cn',
        'Proxy-Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Referer': 'http://www.hubei.gov.cn/zfwj/ezbf/202303/t20230303_4569220.shtml',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '924omrTVcFchT=0ydF5mX9FkIQDMfAAr4A60Yt6sHsXZOzTlm30NLRHm2OwX_YgXaMFBUe3WeNORSf0ZqYHjvBxVL5CXNSWoCOOThArMBpBDzXVdWxVIoA5YBGBLbPUN4CbcQQLZEty.w1MZkgI1pn30uv5STvyCsHLoYGTDHDSIbaURf4XIXzC3fNhxDX.nR5ZWV_HBo9ZAyC5I93.otc4vf7nD6v3Tympw6h2ZUuyAJ0Q7Nes3n0dIB_BIhwCkjyvJibUZt04ggU6XeXnS.qXr2CaM8BJQQ4mdLJ5apGqInkYuNv2GJP1AvL',
    }
    durl = detailmsg['durl']
    try:
        dhmsg = reqbase.reqGetHtml(durl, headers)
        if dhmsg:
            soup = BeautifulSoup(dhmsg, 'html.parser')
            # Re-parse after rewriting relative <a>/<img> links to absolute URLs.
            soup = reqbase.paserUrl(str(soup), durl)

            # NOTE(review): all five metadata fields read the same first child of
            # the meta box, exactly as before; the per-field selectors look like
            # unfinished placeholders and should be confirmed against the page.
            meta_text = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(1)')[1]

            # soupPaserHtml already returns the plain text. Reading `.text` again
            # from its '' fallback used to raise AttributeError and throw away
            # every field parsed above whenever the content block was missing.
            contentWithTag, content = soupPaserHtml(soup, 'div[class="hbgov-article-content"]')

            detailmsg['contentWithTag'] = contentWithTag
            detailmsg['content'] = content
            detailmsg['idx_num'] = meta_text
            detailmsg['class_type'] = meta_text
            detailmsg['dplay_gov'] = meta_text
            detailmsg['pub_date'] = meta_text
            detailmsg['fileNum'] = meta_text
    except Exception as e:
        print(e)
    return detailmsg
def soupPaserHtml(soup, csstag):
    """Return the first node matching ``csstag`` together with its text.

    Args:
        soup: Object exposing ``select(css)`` (a parsed document).
        csstag: CSS selector to look up.

    Returns:
        ``(node, node.text)`` for the first match. If the lookup fails for any
        reason (for example no match), the failure is logged and ``('', '')``
        is returned.
    """
    try:
        first_match = soup.select(csstag)[0]
        plain_text = first_match.text
    except Exception as e:
        first_match, plain_text = '', ''
        log.info(f'标签解析异常{e}')
    return first_match, plain_text
# Script entry point: run the list crawl only when executed directly.
if __name__ == '__main__':
    getList()
# https://www.henan.gov.cn/
import json
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import xlsxwriter
import openpyxl
import BaseCore
# Module-wide BaseCore instance; the request helpers below call its get_proxy().
baseCore=BaseCore.BaseCore()
def reqGetHtml(url, header):
    """GET ``url`` through a proxy, trying up to three times.

    Each attempt asks ``baseCore.get_proxy()`` for a proxy, sends the request
    with TLS verification disabled and a 10 second timeout, and decodes the
    body with the encoding detected by ``requests``.

    Args:
        url: Address to download.
        header: Request headers dict.

    Returns:
        The text of the first attempt that yields a non-empty body, otherwise
        ``''``.
    """
    hcont = ''
    for _attempt in range(3):
        try:
            response = requests.get(
                url=url,
                headers=header,
                proxies=baseCore.get_proxy(),
                verify=False,
                timeout=10,
            )
            response.encoding = response.apparent_encoding
            hcont = response.text
        except Exception:
            # Any failure counts as an empty result; move on to the next try.
            hcont = ''
            continue
        if hcont:
            return hcont
    return hcont
def reqPostHtml(url, header, data):
    """POST ``data`` to ``url`` through a proxy, trying up to three times.

    Args:
        url: Address to post to.
        header: Request headers dict.
        data: Request body. A ``str`` is sent verbatim; any other value is
            serialised with ``json.dumps`` first.

    Returns:
        The response text of the first attempt that yields a non-empty body,
        otherwise ``''``.
    """
    hcont = ''
    for _attempt in range(3):
        try:
            proxy = baseCore.get_proxy()
            body = data if isinstance(data, str) else json.dumps(data)
            # The proxy was previously fetched but not passed along for
            # non-str payloads; both payload kinds now go through it.
            res = requests.post(url=url, data=body, headers=header, proxies=proxy, verify=False, timeout=10)
            hcont = res.text
            if hcont:
                break
        except Exception:
            # Best effort: a failed attempt counts as an empty result.
            hcont = ''
    return hcont
def reqPostStrHtml(url, header, data):
    """POST ``data`` to ``url`` without a proxy, trying up to three times.

    Args:
        url: Address to post to.
        header: Request headers dict.
        data: Request body, handed to ``requests.post`` unchanged.

    Returns:
        The response text of the first attempt that yields a non-empty body,
        otherwise ``''``.
    """
    hcont = ''
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        try:
            hcont = requests.post(url=url, data=data, headers=header, verify=False, timeout=10).text
        except Exception:
            hcont = ''
            continue
        if hcont:
            break
    return hcont
def createDriver():
    """Start a local Chrome WebDriver session.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    # The two paths were previously swapped: the browser executable was handed
    # to Service() and the chromedriver executable to binary_location.
    chrome_binary = r'D:\Google\Chrome\Application\chrome.exe'
    chrome_driver = r'D:\chrome\chromedriver.exe'

    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = chrome_binary
    # To route the browser through a proxy, add for example:
    #   chrome_options.add_argument('--proxy-server=http://127.0.0.1:8080')

    # `options=` replaces the `chrome_options=` keyword, which Selenium 4
    # deprecated and later removed; `service=` is already Selenium 4 style.
    driver = webdriver.Chrome(service=Service(chrome_driver), options=chrome_options)
    return driver
def paserUrl(html, listurl):
    """Rewrite relative ``<a>``/``<img>`` addresses in ``html`` to absolute URLs.

    Args:
        html: Markup string (parsed with ``html.parser``) or an already parsed
            document exposing ``find_all``.
        listurl: Base URL the relative addresses are resolved against.

    Returns:
        The parsed document, modified in place. For each tag only ``href`` is
        rewritten when present; ``src`` is rewritten only when ``href`` is absent.
    """
    document = BeautifulSoup(html, 'html.parser') if isinstance(html, str) else html
    for node in document.find_all(['a', 'img']):
        # First matching attribute wins, mirroring an if/elif on href then src.
        for attr in ('href', 'src'):
            if attr in node.attrs:
                node[attr] = urljoin(listurl, node[attr])
                break
    return document
def pdwriterXLS(dlist, siteName):
    """Write the collected records to ``<siteName>.xlsx``.

    Args:
        dlist: Records used as the DataFrame data (the callers pass a list of dicts).
        siteName: File name stem; ``.xlsx`` is appended.
    """
    target_file = siteName + '.xlsx'
    pd.DataFrame(data=dlist).to_excel(target_file, engine='xlsxwriter', index=False)
# _*_ coding:utf-8 _*_
"""
信息采集的流程
1.拼接获取列表连接
2.对详情页面内容进行解析和清洗
3.对采集的信息添加链接去重
4.文件内容的输出字段
5.内容信息调用请求的方式 requests,selenium
"""
import json
import redis
from bs4 import BeautifulSoup
import reqbase
import BaseCore
# Shared BaseCore instance for this script.
baseCore=BaseCore.BaseCore()
# Logger obtained from BaseCore.
log=baseCore.getLogger()
# BaseCore's `r` attribute; used below via sismember/sadd to de-duplicate URLs
# (presumably a Redis client — confirm in BaseCore).
rr=baseCore.r
def getList():
    """Collect Yunnan government REITs policy documents and export them to Excel.

    Posts the search form to ``https://api.so-gov.cn/s`` for each page in
    ``range(3, pagenum)`` (currently only page 3). Every hit is turned into a
    record dict, skipped if its URL is already in the ``reis_yngov`` set,
    enriched through ``paserdetail`` and then remembered in that set. The
    collected records are written with ``reqbase.pdwriterXLS``.
    """
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Content-Length': '185',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin': 'https://sheng.so-gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://sheng.so-gov.cn/',
        'Sec-Ch-Ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'Suid': 'cf354a807a13d634f76bf167610f9c07',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    pagenum = 4
    lurl = 'https://api.so-gov.cn/s'
    for i in range(3, pagenum):
        # The label previously said "henan", a leftover from another site's script.
        log.info(f'云南采集第{i}页列表')
        payload = f'siteCode=5300000033&tab=zcwj&timestamp=1699525503095&wordToken=72df37fd2f1058524e0c7467610d9ab7&page={i}&pageSize=20&qt=REITs&timeOption=0&sort=dateDesc&keyPlace=0&fileType=&toolsStatus=1'
        lcont = reqbase.reqPostHtml(lurl, header, payload)
        if not lcont:
            continue
        try:
            datas = json.loads(lcont)['data']['search']['searchs']
            for myValues in datas:
                lmsg = myValues['myValues']
                try:
                    # Key order defines the column order of the exported sheet.
                    detailmsg = {
                        'title': lmsg['DRETITLEO'],
                        'subtitle': '',
                        'summary': lmsg['QUICKDESCRIPTION'],
                        'createDate': '',
                        'writeDate': '',
                        'pubDate': '',
                        'source': lmsg['WEBSITENAME'],
                        'durl': lmsg['URL'],
                        'content': '',
                        'siteweb': '云南省人民政府',
                        'wenjianhao': lmsg['C3'],
                        'suoyinhao': '',
                    }
                except Exception:
                    # Hits lacking any expected field are skipped.
                    continue
                durl = detailmsg['durl']
                if rr.sismember('reis_yngov', durl):
                    continue
                detailmsg = paserdetail(detailmsg)
                dlist.append(detailmsg)
                rr.sadd('reis_yngov', durl)
        except Exception as e:
            log.info(f'列表解析异常{e}')
    reqbase.pdwriterXLS(dlist, '云南人民政府-政策2')
def paserdetail(detailmsg):
    """Download a Yunnan government detail page and merge its fields into ``detailmsg``.

    Args:
        detailmsg: Record dict built from the list API. ``'durl'`` is the
            detail-page URL and ``'title'`` is used for logging.

    Returns:
        The same dict, mutated in place with ``contentWithTag`` and
        ``content``. ``suoyinhao``, ``wenjianhao`` and ``pubDate`` are updated
        only when the page yields a non-empty value, so values supplied by the
        list API survive a failed lookup.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '_gscu_802487706=99519044sn0ozi20; _gscbrs_802487706=1; Hm_lvt_b9099e95d08017e30f6285a8b55eb822=1699519045; TrsAccessMonitor=TrsAccessMonitor-1699519056000-2819180807; _gscs_802487706=995190442fewym20|pv:2; Hm_lpvt_b9099e95d08017e30f6285a8b55eb822=1699519714',
        'Host': 'www.yn.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://sheng.so-gov.cn/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # Content containers tried in order until one yields text.
    content_selectors = (
        'div[class="trs_editor_view TRS_UEDITOR trs_paper_default trs_web"]',
        'div[class="content"]',
        'div[class="view TRS_UEDITOR trs_paper_default trs_external trs_web trs_key4format"]',
    )
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        log.info(f'详情页标题:{detailmsg["title"]}')
        log.info(f'详情请求地址:{durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        # Re-parse after rewriting relative <a>/<img> links to absolute URLs.
        soup = reqbase.paserUrl(str(soup), durl)

        parsed_meta = (
            ('suoyinhao', soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(1)>dd')[1]),
            ('wenjianhao', soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(2)>dd')[1]),
            ('pubDate', soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(4)>dd')[1]),
        )

        for selector in content_selectors:
            contentWithTag, content = soupPaserHtml(soup, selector)
            if content:
                break
        else:
            log.info(f'详情内容为空:{durl}')

        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
        for key, value in parsed_meta:
            # An empty lookup used to overwrite the value from the list API
            # (notably 'wenjianhao') with ''; keep the existing value instead.
            if value or key not in detailmsg:
                detailmsg[key] = value
    except Exception as e:
        print(f'详情解析异常{e}')
    return detailmsg
def soupPaserHtml(soup, csstag):
    """Look up ``csstag`` in ``soup`` and return the first hit with its text.

    Args:
        soup: Parsed document exposing ``select(css)``.
        csstag: CSS selector string.

    Returns:
        ``(node, node.text)`` for the first match; ``('', '')`` after logging
        when the lookup raises (for example when nothing matches).
    """
    try:
        hits = soup.select(csstag)
        node = hits[0]
        node_text = node.text
    except Exception as e:
        node = ''
        node_text = ''
        log.info(f'标签解析异常{e}')
    return node, node_text
# Script entry point: run the list crawl only when executed directly.
if __name__ == '__main__':
    getList()
索引(Index)
索引(Index)
索引就是一类文档的集合,类似于关系型数据库中的表。索引由其名称进行标识,每个索引名称必须是小写。
文档(Document)
Index中单条记录称为文档,等同于关系型数据库表中的行。
字段(Field)
json结构的字段,等同于关系型数据库表中的列。
映射(Mapping)
Mapping用于对数据的处理方式和规则做出限定,例如某个字段的数据类型、默认值、分析器、是否被索引等,都可以在映射中设置。
分片(Shards)
一个索引可以存储超过单个节点硬件限制的大量数据,相当于分表的概念。
ES提供了将索引划分成多份的能力,每一份称之为分片。
当创建一个索引的时候,可以指定想要的分片数量。
允许水平分割/扩展内容容量;允许在分片之上进行分布式的、并行的操作,进而提高性能/吞吐量。
副本(Replicas)
在分片/节点失败的情况下,提供了高可用性。
副本分片绝不能与其对应的原/主分片放在同一节点上,这一点非常重要。
扩展搜索量/吞吐量,因为搜索可以在所有副本上并行运行。
from elasticsearch import Elasticsearch
# Connect to Elasticsearch.
es=Elasticsearch(["192.168.1.90:9200"],
    sniff_on_start=True,# probe the cluster before the first request
    sniff_on_connection_fail=True,# refresh the node list when a node stops responding
    sniffer_timeout=60) # NOTE(review): originally described as "timeout"; per the elasticsearch-py 7.x docs this is the number of seconds between automatic sniffs — confirm
# Index name used by the example functions below.
index_name='test_data'
def main():
    """Open an Elasticsearch client connection with sniffing enabled.

    The client is bound to a local name only and is not returned.
    """
    hosts = ["192.168.1.90:9200"]
    sniff_options = {
        'sniff_on_start': True,
        'sniff_on_connection_fail': True,
        'sniffer_timeout': 60,
    }
    es = Elasticsearch(hosts, **sniff_options)
# Script entry point: run main() only when executed directly.
if __name__ == '__main__':
    main()
def create_index():
    """Create the ``index_name`` index with its mapping and settings.

    The mapping declares ``name`` (keyword), ``age`` (long) and ``tags``
    (text); the index uses 3 shards and 0 replicas. HTTP 400 responses are
    ignored by the client call.
    """
    field_types = {'name': 'keyword', 'age': 'long', 'tags': 'text'}
    properties = {field: {'type': kind} for field, kind in field_types.items()}
    index_settings = {'number_of_shards': '3', 'number_of_replicas': '0'}
    definition = {
        'mappings': {'properties': properties},
        'settings': {'index': index_settings},
    }
    es.indices.create(index=index_name, body=definition, ignore=400)
def instert_data():
    """Index one sample person document into ``index_name``."""
    document = dict(
        name='张三',
        age=18,
        tags='勤奋学习十载寒窗,凿壁借光,囊萤映雪,手不释卷,有良好的表达能力。有耐心心态好,善于维系客户关系。果断热情勇敢孤僻活力,思想成熟能够独立工作。',
    )
    es.index(index=index_name, body=document)
from elasticsearch import helpers
def instert_bach():
    """Bulk-index two sample person documents into ``index_name``."""
    profiles = (
        ('李四', 20, '有极强的领导艺术,公正严明铁面无私,公私分明。关心他人无微不至,体贴入微。精力充沛,并有很强的事业心。气吞山河正气凛然,善于同各种人员打交道。'),
        ('王五', 19, '尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。'),
    )
    # Each action carries its target index next to the document fields.
    actions = [
        {'_index': index_name, 'name': name, 'age': age, 'tags': tags}
        for name, age, tags in profiles
    ]
    helpers.bulk(client=es, actions=actions)
def del_index():
    """Delete the ``index_name`` index, tolerating a missing index.

    Deleting an index that does not exist answers with HTTP 404, which the
    previous ``ignore=[400]`` did not cover; 404 is now ignored as well.
    """
    es.indices.delete(index=index_name, ignore=[400, 404])
def del_doc_byid():
    """Delete a single document from ``index_name`` by its id."""
    doc_id = 'bKTgXYUBfH4USN9RFMOh'
    es.delete(index=index_name, id=doc_id)
def del_by_condation():
    """Delete every document in ``index_name`` whose ``name`` matches '张三'.

    HTTP 400 and 404 responses are ignored by the client call.
    """
    name_match = {'match': {'name': '张三'}}
    es.delete_by_query(index=index_name, body={'query': name_match}, ignore=[400, 404])
def index_update_doc():
    """Write a full document to a fixed id with ``es.index``.

    ``index()`` inserts the document when the id does not exist and replaces
    it when it does. Because the stored document is replaced as a whole, the
    body must carry every field; fields left out are not kept.
    """
    doc_id = 'baTgXYUBfH4USN9RFMOh'
    full_document = dict(
        name='王五',
        age=19,
        tags='尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。',
    )
    es.index(index=index_name, id=doc_id, body=full_document)
def update_doc():
    """Partially update one document: set ``name`` to '王五' via ``es.update``."""
    doc_id = 'baTgXYUBfH4USN9RFMOh'
    changes = {'name': '王五'}
    es.update(index=index_name, id=doc_id, body={'doc': changes})
def select_info():
    """Query index metadata and print whether the ``es_index`` index exists.

    The index arguments are passed by keyword because elasticsearch-py 8
    rejects positional arguments. The document count is requested only when
    the index exists, since counting a missing index raises a not-found error.
    """
    # Information about every index; the response keys are the index names.
    index_info = es.indices.get(index='*')
    index_names = index_info.keys()

    # Separate local name so the module-level `index_name` is not shadowed.
    target_index = 'es_index'
    index_exists = es.indices.exists(index=target_index)
    print(index_exists)
    if index_exists:
        doc_count = es.count(index=target_index)
def query_by_id():
    """Search ``index_name`` for the document with a fixed ``_id``."""
    doc_id = 'baTgXYUBfH4USN9RFMOh'
    criteria = {'query': {'match': {'_id': doc_id}}}
    es.search(index=index_name, body=criteria)
def query_by_filed():
    """Search for documents with ``age`` 20, returning only ``name`` and ``tags``."""
    age_match = {'match': {'age': 20}}
    returned_fields = ['name', 'tags']
    request = {'query': age_match, '_source': returned_fields}
    es.search(index=index_name, body=request)
def query_by_sort():
    """Search ``index_name`` sorted by ``age`` in descending order."""
    # 'asc' sorts ascending, 'desc' descending.
    ordering = {'age': {'order': 'desc'}}
    es.search(index=index_name, body={'sort': ordering})
def query_by_range():
    """Search for documents whose ``age`` is greater than 18 and at most 20."""
    age_bounds = {'gt': 18, 'lte': 20}
    request = {'query': {'range': {'age': age_bounds}}}
    es.search(index=index_name, body=request)
def query_by_page():
    """Fetch the first page (offset 0, size 1) sorted by ``age`` descending."""
    # 'asc' sorts ascending, 'desc' descending.
    ordering = {'age': {'order': 'desc'}}
    request = {'sort': ordering, 'from': 0, 'size': 1}
    es.search(index=index_name, body=request)
def quere_by_paser():
    """Phrase-search the ``tags`` field for '耐心'."""
    phrase = {"match_phrase": {"tags": "耐心"}}
    es.search(index=index_name, body={"query": phrase})
def query_by_mult():
    """Search with two required conditions: ``name`` matches '张三' and ``tags`` contains the phrase '耐心'."""
    required = [
        {"match": {"name": "张三"}},
        {"match_phrase": {"tags": "耐心"}},
    ]
    request = {"query": {"bool": {"must": required}}}
    es.search(index=index_name, body=request)
def query_by_not():
    """Search for ``name`` matching '王五' while excluding ``tags`` with the phrase '耐心'."""
    required = [{"match": {"name": "王五"}}]
    excluded = [{"match_phrase": {"tags": "耐心"}}]
    request = {"query": {"bool": {"must": required, 'must_not': excluded}}}
    es.search(index=index_name, body=request)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论