Commit c2749092 Author: Liu Weigang

Tianyancha test code

Parent f7814e8e
# Core utilities
import os
import random
import socket
import sys
import time
import logbook
import logbook.more
import pandas as pd
import requests
import zhconv
import redis
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import langid
# Create a connection pool
import pymysql
from DBUtils.PooledDB import PooledDB
# import sys
# sys.path.append('D://zzsn_spider//base//fdfs_client')
from fdfs_client.client import get_tracker_conf, Fdfs_client
import uuid
# tracker_conf = get_tracker_conf('D:\\kkwork\\zzsn_spider\\base\\client.conf')
# client = Fdfs_client(tracker_conf)
from obs import ObsClient
import fitz
from urllib.parse import unquote
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # your Huawei Cloud access key (AK)
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # your Huawei Cloud secret key (SK)
server='https://obs.cn-north-1.myhuaweicloud.com' # your bucket's endpoint
)
# Note: call BaseCore.close() before the program exits to release resources
class BaseCore:
# Sequence number
__seq = 0
# Proxy pool database connection
# __cnx_proxy =None
# __cursor_proxy = None
cnx = None
cursor = None
cnx_ = None
cursor_ = None
r = None
# User-agent pool
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
# Android user-agent pool
__USER_PHONE_AGENT_LIST = [
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
# self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
# charset='utf8mb4')
# self.__cursor_proxy = self.__cnx_proxy.cursor()
self.cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
self.cursor = self.cnx.cursor()
# the 114.116.44.11 database
self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# Connect to Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
def close(self):
try:
self.cursor.close()
self.cnx.close()
except:
pass
# Compute elapsed time
def getTimeCost(self, start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# Format the current time
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 3 : timestamp in milliseconds, e.g. 1690179526555
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
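# Usage sketch (illustrative values; assumes a BaseCore instance named core):
# core.getNowTime(1) # e.g. '2023-10-17 12:00:00'
# core.getNowTime(2) # e.g. '231017120000'
# core.getNowTime(3) # e.g. 1697515200000 (milliseconds since the epoch)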
# Get a serial number (timestamp + 3-digit sequence)
def getNextSeq(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return self.getNowTime(2) + str(self.__seq).zfill(3)
# Generate a credit code ("ZZSN" + timestamp + sequence)
def getNextXydm(self):
self.__seq += 1
if self.__seq > 1000:
self.__seq = 0
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# Log format
def logFormate(self, record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # log time
level=record.level_name, # log level
filename=os.path.split(record.filename)[-1], # file name
func_name=record.func_name, # function name
lineno=record.lineno, # line number
msg=record.message # log message
)
return formate
# Build a logger
def getLogger(self, fileLogFlag=True, stdOutFlag=True):
pid = self.getPID()
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + f"{pid}.log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # write the log to a file
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # print the log to the screen
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# Get a random user agent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# Get a proxy
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.cursor.execute(sql)
proxy_lists = self.cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return random.choice(proxy_list) # choose from all proxies; avoids an IndexError when fewer than 4 rows exist
# Substring extraction
def getSubStr(self, str, beginStr, endStr):
if beginStr == '':
pass
else:
begin = str.rfind(beginStr)
if begin == -1:
begin = 0
str = str[begin:]
if endStr == '':
pass
else:
end = str.rfind(endStr)
if end == -1:
pass
else:
str = str[0:end + 1]
return str
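# Usage sketch (illustrative): note that rfind is used, so matching starts
# from the *last* occurrence of beginStr and endStr:
# core.getSubStr('abc[def]ghi', '[', ']') # -> '[def]'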
# Convert Traditional Chinese to Simplified Chinese
def hant_2_hans(self, hant_str: str):
'''
Function: convert hant_str from Traditional to Simplified Chinese
'''
return zhconv.convert(hant_str, 'zh-hans')
# Check whether a string contains any digit
def str_have_num(self, str_num):
return any(ch.isdigit() for ch in str_num)
# # Pop (get and remove) one element from a Redis list
# def redicPullData(self,type,key):
# # 1 = domestic, 2 = overseas
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# Pop (get and remove) one element from a Redis list
def redicPullData(self, key):
item = self.r.lpop(key)
return item.decode() if item else None
# Get the PID of this script's process
def getPID(self):
PID = os.getpid()
return PID
# Get the local IP address
def getIP(self):
IP = socket.gethostbyname(socket.gethostname())
return IP
def mkPath(self, path):
folder = os.path.exists(path)
if not folder: # create the folder if it does not already exist
os.makedirs(path) # makedirs also creates any missing parent directories in the path
else:
pass
# Build a Chrome WebDriver; path must point to the chromedriver binary
# headless decides whether to run headless; defaults to headless
# A headed browser is useful when developing page parsing, or for sites that cannot be crawled in headless mode
# A headless browser keeps windows from popping up during long collection runs
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
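# Usage sketch (illustrative; the chromedriver path is hypothetical):
# driver = core.buildDriver('D:\\tools\\chromedriver.exe', headless=True)
# driver.get('https://www.tianyancha.com/')
# driver.quit()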
# Get enterprise info by social credit code
def getInfomation(self, social_code):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# Update the enterprise's crawl count
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# Save a log record to the database
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql, values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
# Get a QCC (Qichacha) token
def GetToken(self):
# Get a QCC token
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # captured manually; needs refreshing roughly every two hours
self.cursor.execute(query)
token_list = self.cursor.fetchall()
self.cnx.commit()
token = token_list[random.randint(0, len(token_list) - 1)][0]
return token
# Get a Tianyancha token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
self.cnx.commit()
return token
# Detect language
def detect_language(self, text):
# Use langid to classify the text's language; classify returns a (lang_code, score) tuple
result = langid.classify(text)
if not result or result[0] == '':
return 'cn'
return result[0]
# Append rows to an existing Excel file
def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx'
# Read the existing xlsx file
existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
# Build a frame from the new rows
new_data = pd.DataFrame(data=detailList)
# Append the new rows to the existing data (DataFrame.append is deprecated and was removed in pandas 2.0)
combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# Write the result back to the xlsx file
combined_data.to_excel(filename, index=False)
# return combined_data
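# Usage sketch (illustrative; the target file must already exist, since it is read first):
# core.writerToExcel([{'title': 't1', 'link': 'u1'}], 'results.xlsx')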
# Re-queue failed or interrupted enterprises into Redis
def rePutIntoR(self, key, item):
self.r.rpush(key, item)
# Increment a counter and return the incremented value
def incrSet(self, key):
# Increment the counter and return the new value
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
# Get the key's remaining TTL
def getttl(self, key):
# Get the remaining time-to-live of the key
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# Check whether the key has expired
if ttl < 0:
# The key has expired: reset its value to 0 with a one-hour TTL
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
# # Upload to the file server, and parse the PDF's content and page count
# def upLoadToServe(self, pdf_url, type_id, social_code):
# headers = {}
# retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
# 'full_path': '',
# 'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
# 'create_time': '', 'page_size': '', 'content': ''}
# headers['User-Agent'] = self.getRandomUserAgent()
# for i in range(0, 3):
# try:
# resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
# break
# except:
# time.sleep(3)
# continue
# page_size = 0
#
# for i in range(0, 3):
# try:
# result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
# with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
# for page in doc.pages():
# retData['content'] += page.get_text()
# break
# except:
# time.sleep(3)
# continue
# if page_size < 1:
# # PDF parsing failed
# print(f'======pdf解析失败=====')
# return retData
# else:
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# retData['state'] = True
# retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
# retData['full_path'] = bytes.decode(result['Remote file_id'])
# retData['file_size'] = result['Uploaded size']
# retData['create_time'] = time_now
# retData['page_size'] = page_size
#
# return retData
def secrchATT(self, item_id, year, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
# Insert into the attachment (att) table; returns the attachment id
def tableUpdate(self, retData, com_name, year, pdf_name, num, pub_time,origin):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id, year, type_id)
if selects:
self.getLogger().info(f'com_name:{com_name}--{year}已存在')
id = ''
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
pub_time,origin)
self.cursor_.execute(Upsql, values) # insert
self.cnx_.commit() # commit
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, year, type_id)
id = selects[0]
return id
# Update the enterprise's CIK
def updateCIK(self, social_code, cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存企业CIK失败=====')
# Upload to Huawei Cloud OBS, and parse the PDF's content and page count
# Get a human-readable file size
def convert_size(self, size_bytes):
# Units for successive division by 1024
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
# Check whether a file has already been uploaded to OBS
def obsexist(self, file_path):
# # Example file path
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
# Check whether the object exists (note: returns True when the file is *not* on OBS)
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
self.getLogger().info('=====文件不存在obs=====')
return True
else:
self.getLogger().info(f'=====文件存在obs========{file_path}')
# Generate a timestamp-based uuid as the file name for OBS uploads
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # uuid generated from the timestamp; globally unique
return get_timestamp_uuid
def uptoOBS(self, pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = self.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
name = str(self.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = self.getOBSres(pathType, now_time, name, response)
except:
log = self.getLogger()
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = self.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# PDF parsing failed
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = self.getTimeCost(start_time, time.time())
self.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(self, pathType, now_time, name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='local path of the file to upload')
return result
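# End-to-end usage sketch (illustrative; the URL, codes and names below are hypothetical):
# core = BaseCore()
# retData = core.uptoOBS('https://example.com/report.pdf', 'report.pdf', 1,
# '91110000XXXXXXXXXX', 'XQWAnnualReport/', '年报', time.time(), 'XueLingKun')
# if retData['state']:
# att_id = core.tableUpdate(retData, '某公司', '2023', 'report.pdf', 1,
# '2023-01-01', '来源')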
# -*- coding: utf-8 -*-
# Smart extraction request
# 1. Considered: when requesting smart extraction, stop using an entity class
# a. Kept instead: pass the HTML source file directly in the raw HTTP request body, with lang-code and link-text passed as query parameters
# b. Reason: an entity class is awkward to test in Postman, where a pasted HTML source file cannot be used
# 2. Not adopted: using an entity class, even though it has clear advantages
# a. An entity class makes it easy to extend parameter fields
# b. It is convenient for API docs: calling json_parameter_utility.get_json_parameters can display the request entity class
class ExtractionRequest:
# Language code
# 1. Needed when extracting non-Chinese articles
lang_code = ""
# Link text
# 1. Used for extracting the title; without it, title accuracy drops
link_text = ""
# Article page source
# 1. Used for extracting the title, publish time, content, etc.
article_html = ""
@staticmethod
def from_dict(dictionary: dict):
extraction_request = ExtractionRequest()
# Alternative approach:
# 1. Update the dictionary into the internal __dict__ object
# extraction_request.__dict__.update(dictionary)
# Set each dictionary value on the current object
for key in dictionary:
setattr(extraction_request, key, dictionary[key])
return extraction_request
def to_dict(self):
# Convert to a dictionary:
# 1. This method is needed when serializing to JSON
# 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# Use the internal __dict__ object
# 1. Update the internal __dict__ into a new dictionary
data.update(self.__dict__)
return data
# Extraction result
class ExtractionResult:
# Title
title = ""
# Publish date
publish_date = ""
# Body text (keeps all HTML tags, e.g. br, img)
text = ""
# URL
url = ""
# Summary
meta_description = ""
# Clean body text (no HTML)
cleaned_text = ""
# Source (currently only the "source" on Chinese sites is supported)
# source = ""
# Top image (top_image: never extracted any content; no longer used)
# top_image = ""
def to_dict(self):
# Convert to a dictionary:
# 1. This method is needed when serializing to JSON
# 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# Use the internal __dict__ object
# 1. Update the internal __dict__ into a new dictionary
data.update(self.__dict__)
return data
class UrlPickingRequest:
# Response URL of the list page
# 1. Used as the Base URL to join extracted relative URLs
# 2. Base URL: the response URL must be used
# 3. Example: in Python, after requests.get(url), use resp.url as the Base URL
list_page_resp_url = ""
# List page source
# 1. Used to extract article URLs
list_page_html = ""
@staticmethod
def from_dict(dictionary: dict):
url_picking_request = UrlPickingRequest()
# Set each dictionary value on the current object
for key in dictionary:
setattr(url_picking_request, key, dictionary[key])
return url_picking_request
def to_dict(self):
# Convert to a dictionary:
# 1. This method is needed when serializing to JSON
# 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# Use the internal __dict__ object
# 1. Update the internal __dict__ into a new dictionary
data.update(self.__dict__)
return data
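# Usage sketch (illustrative values; assumes json is imported before serializing):
# req = ExtractionRequest.from_dict({'lang_code': 'cn', 'link_text': '', 'article_html': '<html>...</html>'})
# result = ExtractionResult()
# json_str = json.dumps(result, default=ExtractionResult.to_dict, ensure_ascii=False)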
# Get the Tianyancha id from a social credit code
import json
import random
import time
import pymysql
import requests
from base.BaseCore import BaseCore
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '32',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5NzE5MDMwMywiZXhwIjoxNjk5NzgyMzAzfQ.awXuS-59RzK35r0aUJq4Rj83JzyAOvsdUfL_ojp66CVQMjlLv_ZDD9g5gCoZKE21LN1JYRMLNZhuWsHhxapROw',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
# headers = {
# 'X-TYCID':'30c1289042f511ee9182cd1e1bcaa517',
# # 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5MjkzMzIxMiwiZXhwIjoxNjk1NTI1MjEyfQ.BKxDem8fpgeDHrIgm3qCoF76ueHtQSG1DggiTl4FAaoNKt4gem6NTX1XYndPXqVj9TXfl-8yp2kKE3jY66dyig',
# 'version':'TYC-Web',
# 'Content-Type':'application/json;charset=UTF-8'
# }
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cursor= cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
taskType = '天眼查企业id/天眼查'
# Get the Tianyancha id, company name and other info from a credit code
def getTycIdByXYDM(xydm):
retData={'state':False,'tycData':None,'reput':True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
ip = baseCore.get_proxy()
paramJsonData = {'keyword':xydm}
try:
# headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
response = requests.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
time.sleep(random.randint(3, 5))
retJsonData =json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state']== 'ok':
pass
else:
log.error(f"---{xydm}-未查询到该企业---")
retData['reput'] = False
return retData
matchType=retJsonData['data'][0]['matchType']
if matchType=='信用代码匹配':
retData['state'] = True
retData['tycData'] = retJsonData['data'][0]
response.close()
return retData
else:
log.error(f"{xydm}------{retJsonData}")
response.close()
return retData
except:
log.error(f"---{xydm}--天眼查token失效---")
return retData
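# Usage sketch (illustrative credit code):
# retData = getTycIdByXYDM('91110000XXXXXXXXXX')
# if retData['state']:
# tyc_id = retData['tycData']['id']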
# Update Tianyancha enterprise basic info
def updateTycInfo():
while True:
# Use the social credit code pulled from Redis to look up the matching base record in the database
social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
# social_code = '9111000066990444XF'
# If Redis has run out of data, wait
if social_code == None:
time.sleep(20)
continue
start = time.time()
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# Push the code back into Redis
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo: write to the database
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except Exception as e:
log.error(e)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
if __name__ == '__main__':
updateTycInfo()
# Enterprise news: pull data from Redis
import json
import random
import requests, time, pymysql
import jieba
import sys
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
from base.BaseCore import BaseCore
from base.smart import smart_extractor
sys.path.append('D:\\zzsn_spider\\base')
# import BaseCore
# from smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
jieba.cut("必须加载jieba")
# Initialize with Chinese word segmentation
smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore() # matches `from base.BaseCore import BaseCore` above; BaseCore.BaseCore() would raise AttributeError
log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor()
pageSize = 10
log.info(f'======================当前脚本进程为{baseCore.getPID()}==============================')
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5NzE5MDMwMywiZXhwIjoxNjk5NzgyMzAzfQ.awXuS-59RzK35r0aUJq4Rj83JzyAOvsdUfL_ojp66CVQMjlLv_ZDD9g5gCoZKE21LN1JYRMLNZhuWsHhxapROw',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
taskType = '企业动态/天眼查/补采20W+'
def beinWork(tyc_code, social_code,start_time):
time.sleep(3)
# retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
t = time.time()
url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
try:
for m in range(0, 3):
try:
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
time.sleep(random.randint(3, 5))
break
except:
continue
if (response.status_code == 200):
pass
except Exception as e:
log.error(f"{tyc_code}-----获取总数接口失败")
error = '获取总数接口失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
return retData
try:
json_1 = json.loads(response.content.decode('utf-8'))
total = json_1['data']['total']
except:
log.error(f"{tyc_code}-----获取总数失败")
e = '获取总数失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
return retData
if (total > 0):
if (total % pageSize == 0):
totalPage = total // pageSize
else:
totalPage = total // pageSize + 1
else:
log.error(f"{tyc_code}--------总数为0")
retData['state'] = True
return retData
log.info(f"{tyc_code}-------总数:{total}----总页数:{totalPage}")
retData['total'] = total
up_okCount = 0
up_errorCount = 0
up_repetCount = 0
for num in range(1, totalPage + 1):
time.sleep(3)
log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
start_page = time.time()
url_page = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={pageSize}&pn={num}&emotion=-100&event=-100'
for m in range(0, 3):
try:
ip = baseCore.get_proxy()
headers['User-Agent'] = baseCore.getRandomUserAgent()
response_page = requests.get(url=url_page, headers=headers, proxies=ip, verify=False)
time.sleep(1)
break
except:
pass
if (response_page.status_code == 200):
pass
else:
log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + pageSize
continue
try:
json_page = json.loads(response_page.content.decode('utf-8'))
info_list_page = json_page['data']['items']
except:
log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + pageSize
continue
pageIndex = 0
for info_page in info_list_page:
pageIndex = pageIndex + 1
title = info_page['title']
source = info_page['website']
link = info_page['uri']
try:
sel_sql = '''select social_credit_code from brpa_source_article_news where source_address = %s and social_credit_code=%s and type='2' '''
cursor_.execute(sel_sql, (link, social_code))
except Exception as e:
print(e)
selects = cursor_.fetchone()
if selects:
log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
# todo: if this record already exists, everything after it has been collected, so the function could return here and move on to the next enterprise
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
#return retData
continue
try:
time_struct = time.localtime(int(info_page['rtm'] / 1000)) # first convert the timestamp to a struct_time
time_format = time.strftime("%Y-%m-%d %H:%M:%S", time_struct) # then format the struct_time
except:
time_format = baseCore.getNowTime(1)
# Record time / comparison window
#if time_format > '2023-09-25' and time_format < '2023-10-01':
#pass
#else:
#continue
try:
# Run smart extraction
# lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang)
# Body text with HTML tags (note: extract_by_url fetches the page, so calling it twice downloads the page twice)
contentText = smart.extract_by_url(link).text
# Body text without tags
content = smart.extract_by_url(link).cleaned_text
# time.sleep(3)
except Exception as e:
contentText = ''
if contentText == '':
log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
up_errorCount = up_errorCount + 1
try:
insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values('{social_code}','{source}','{link}','{title}','{time_format}','{info_page['abstracts']}',now(),1,{num},{pageIndex})"
cursor.execute(insert_err_sql)
cnx.commit()
except:
pass
continue
try:
insert_sql = '''insert into brpa_source_article_news(social_credit_code,source_address,origin,type,publish_time,content,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
# News record fields
up_okCount = up_okCount + 1
list_info = [
social_code,
link,
f'天眼查-{source}',
'2',
time_format,
content[:500]
]
cursor_.execute(insert_sql, tuple(list_info))
cnx_.commit()
# Log each collected item, recording how many news items this enterprise yielded
log.info(f'{social_code}----{link}:新增一条')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo: insert a record and push it to Kafka
dic_news = {
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': contentText,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': source,
'publishDate': time_format,
#'sid': '1684032033495392257',
'sid': '1714852232679067649',
'sourceAddress': link, # original article URL
'summary': info_page['abstracts'],
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': time_format[:4]
}
except Exception as e:
log.info(f'传输失败:{social_code}----{link}')
error = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, f'{error}----{e}')
continue
# print(dic_news)
# Send the fields to Kafka for persistence
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# Send succeeded; record it in the log
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, '成功')
# return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
log.error(dic_result)
e = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
return retData
# Log records are saved to the database that has already been set up, so the previous logging database is no longer written to
def doJob():
while True:
# Use the social credit code pulled from Redis to look up the matching base record in the database
social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
#social_code = '91440300665899831W'
# If Redis has run out of data, wait
if social_code == None:
time.sleep(20)
continue
start = time.time()
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# Push the code back into Redis
baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode',social_code)
continue
id = data[0]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo: write to the database
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始处理")
start_time = time.time()
# Start collecting enterprise news
retData = beinWork(tycid, xydm,start_time)
# After collection, update this enterprise's crawl count
runType = 'NewsRunCount'
count += 1
baseCore.updateRun(social_code, runType, count)
total = retData['total']
up_okCount = retData['up_okCount']
up_errorCount = retData['up_errorCount']
up_repetCount = retData['up_repetCount']
log.info(
f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
except Exception as e:
log.info(f'==={social_code}=====获取企业信息失败====')
# Push back into Redis
baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode',social_code)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
#break
cursor.close()
cnx.close()
# Release resources
baseCore.close()
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
doJob()
# -*- coding: utf-8 -*-
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3 bundles its own lxml; etree is reported as not found, but it still works
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractor:
@staticmethod
def get_supported_lang_code_dict():
"""
支持语言:
1、需要分词,传递分词器(3种):
a. 中文、韩语、阿拉伯语
2、不需要分词,直接传递语言编码(16种)
a. 其中英语、俄语,单独测试
"""
supported_lang_code_dict = {
'cn': '中文', # Chinese
'zh-cn': '简体中文', # Simplified Chinese
'zh': '简体中文', # Simplified Chinese
'ko': '韩语', # Korean
'ar': '阿拉伯语', # Arabic
'en': '英语', # English
'ru': '俄语', # Russian
'da': '丹麦语', # Danish
'de': '德语', # German
'es': '西班牙语', # Spanish
'fi': '芬兰语', # Finnish
'fr': '法语', # French
'hu': '匈牙利语', # Hungarian
'id': '印度尼西亚语', # Indonesian
'it': '意大利语', # Italian
'nb': '挪威语(伯克梅尔)', # Norwegian (Bokmål)
'nl': '荷兰语', # Dutch
'no': '挪威文(耐诺斯克)', # Norwegian (Nynorsk)
'pl': '波兰语', # Polish
'pt': '葡萄牙语', # Portuguese
'sv': '瑞典语', # Swedish
}
return supported_lang_code_dict
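# Example: checking whether a language code is supported:
# 'ko' in SmartExtractor.get_supported_lang_code_dict() # -> True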
def __init__(self, lang_code='cn'):
"""
Constructor: defaults to cn when lang_code is not specified
"""
# Supported languages
supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
# Initialize the goose object:
# 1. Create the goose object according to the language code
if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn' or lang_code == 'zh':
# Tokenizer required: Chinese
# 1. When lang_code is omitted or passed as None, default to Chinese tokenization
# 2. Flask web interface: when the lang_code GET parameter is missing, lang_code is received as None
self.goose = Goose({'stopwords_class': StopWordsChinese})
elif lang_code == 'ko':
# Tokenizer required: Korean
# 1. Tried passing only the language, without a tokenizer:
# self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'}) # test failed: extracted body was empty
# self.goose = Goose() # test failed: extracted body was empty
# Korean tokenizer: test passed
self.goose = Goose({'stopwords_class': StopWordsKorean})
elif lang_code == 'ar':
# Tokenizer required: Arabic
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # test failed: extracted body was empty
# self.goose = Goose() # test passed
# self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # test passed: language code passed directly
self.goose = Goose({'stopwords_class': StopWordsArabic})
elif lang_code == 'en':
# Tested separately: English
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
# Test passed: a Goose object created without specifying a language defaults to English tokenization
self.goose = Goose()
elif lang_code == 'ru':
# Tested separately: Russian
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # test failed: extracted body was empty
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # test passed: language code passed directly
elif lang_code in supported_lang_code_list:
# Other language codes are handled uniformly and were not tested individually
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
else:
# Unrecognized language code
raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')
def get_extraction_result(self, article, link_text=''):
"""
Build the extraction result:
1. Collect data from the article object and wrap it in an ExtractionResult
"""
# Holds the extracted content
extraction_result = ExtractionResult()
# Title
# extraction_result.title = article.title # old approach: use the title goose extracted
extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
# Publish date
extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
# Body text (keeps all HTML tags, e.g. br, img)
extraction_result.text = SmartExtractorUtility.get_article_text(article)
# URL
extraction_result.url = article.final_url
# Summary
extraction_result.meta_description = article.meta_description
# Clean body text (no HTML)
extraction_result.cleaned_text = article.cleaned_text
# Source (currently only the "source" on Chinese sites is supported)
extraction_result.source = ''
return extraction_result
def extract_by_url(self, url, link_text=''):
"""
按URL采集内容
"""
# 采集正文:传入url
article = self.goose.extract(url=url)
# article = goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
"""
按HTML采集内容
"""
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_url_test():
# 测试:按URL采集
url_list = [
# "http://www.news.cn/politics/2022-07/31/c_1128879636.htm", # 短文本
# "https://baijiahao.baidu.com/s?id=1741311527693101670", # 带多张图片
# "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml", # 带多张图片,及一个视频(测试内容XPath失败)
# "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html", # 人民网
# 韩文:中央日报-politics
# "https://www.joongang.co.kr/article/25094974",
# "https://www.joongang.co.kr/article/25094967",
# 英文:加德满都邮报-national-security
# "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
# "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders", # 测试采集:发布时间
# 俄语:今日白俄罗斯报-word
# "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
# 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
# 阿语
# "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
# "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
# 测试提取标题
# "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
# "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
# "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html", # 标题采集为空
# 'http://www.crfeb.com.cn/1j/_124/2005409/index.html', # 内容采集失败
# 'http://www.crfeb.com.cn/1j/_124/912248/index.html', # 内容采集失败
# 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html', # 中国铁建股份有限公司-工作动态(日期采集错误)
# 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html', # 中国土木工程集团有限公司-多个栏目(日期采集错误)
# 'http://v.people.cn/n1/2022/0901/c444662-32517559.html', # 人民网视频:title必须以“元素中的标题”开始,不能判断“包含”
# 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html', # 中国港湾工程有限责任公司-公司要闻(标题采集失败)
# 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html', # 中国建筑集团有限公司-中建要闻(标题采集失败)
# 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html', # 中国路桥工程有限责任公司-多个栏目(标题采集失败)
# 'http://www.cgcoc.com.cn/news/432.html', # 中地海外集团有限公司-新闻中心(标题和内容采集失败)
# 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html' # 中国五矿(测试:正文采集失败)
# 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html', # 中国电力建设集团(测试:标题、正文采集失败)
# 中国电力建设集团(测试:标题采集失败),相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
# 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html', # 标题采集失败:看着没有问题
# 'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html', # 中国建筑股份有限公司-企业动态:日期采集错误,采集到当天日期
# 'https://3g.k.sohu.com/t/n705260979' #天眼查--企业公告'
# 'https://baijiahao.baidu.com/s?id=1769415116218226935'
# 'https://m.gelonghui.com/community/post/1678728#ocr'
'http://epaper.zqrb.cn/html/2023-05/27/content_950333.htm'
]
# 语言编码
lang_code = 'cn'
# lang_code = 'ko'
# lang_code = 'en'
# lang_code = 'ru'
# lang_code = 'ar'
for url in url_list:
print()
print("-" * 100)
print('请求URL:', url)
extraction_result = SmartExtractor(lang_code).extract_by_url(url)
# 测试转换为JSON
# 1、直接转换时,会抛异常:TypeError: Object of type ExtractionResult is not JSON serializable
# print(json.dumps(extraction_result))
# print(json.dumps(extraction_result, default=ExtractionResult.to_dict)) # 转换成功:指定序列化器
# print(type(json.dumps(extraction_result.to_dict()))) # 返回类型:<class 'str'>,内容中的中文会被转义
# print(str(extraction_result.to_dict())) # 如果直接转换为字符串,中文不会被转义
# 打印测试结果
print_extraction_result(extraction_result)
def extract_by_html_test():
# 测试:按HTML采集
html = '''
<html>
<head>
<title>标题</title>
</head>
<body>
<div>标题</div>
<div>内容</div>
</body>
</html>
'''
# 测试:通过请求URL,获取完整的html
# url = "http://www.news.cn/politics/2022-07/31/c_1128879636.htm" # 测试成功
# url = "http://views.ce.cn/view/ent/202208/15/t20220815_37961634.shtml" # 1、测试失败:lxml.etree.ParserError: Document is empty
url = 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html' # 中国铁建股份有限公司-工作动态(日期采集错误)
# url = 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html' # 中国土木工程集团有限公司-多个栏目(日期采集错误)
print()
print("-" * 100)
print('请求URL:', url)
html = requests.get(url).text
# 语言编码
lang_code = 'cn'
# 采集内容
extraction_result = SmartExtractor(lang_code).extract_by_html(html)
# 打印测试结果
print_extraction_result(extraction_result)
def print_extraction_result(extraction_result):
# 打印测试结果
print("标题:", extraction_result.title) # 标题
print("发布时间:", extraction_result.publish_date) # 发布时间
print("正文:", extraction_result.text) # 正文
print("URL:", extraction_result.url) # URL
print("摘要:", extraction_result.meta_description) # 摘要
print("干净正文:", extraction_result.cleaned_text) # 干净正文
if __name__ == '__main__':
try:
# 测试:按URL采集
extract_by_url_test()
# 测试:按HTML采集
# extract_by_html_test()
except Exception as e:
print("采集失败:", e)
# -*- coding: utf-8 -*-
import re
from goose3.article import Article
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractorUtility:
# 标题最小长度
title_min_len = 6
@staticmethod
def extract_publish_date(html):
pattern_list = [
# 2010-10-1 8:00:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010-10-1 8:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 2010年10月1日 8:00:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
# 2010年10月1日 8:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
# 2010/10/1 8:00:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010/10/1 8:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
# 2010-10-1
r"20\d{2}-\d{1,2}-\d{1,2}",
# 2010年10月1日
r"20\d{2}年\d{1,2}月\d{1,2}日",
# 2010/10/1
r"20\d{2}/\d{1,2}/\d{1,2}",
# 2022.08.28
r"20\d{2}\.\d{1,2}\.\d{1,2}"
# 12-07-02 10:10
r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 1月前
r"\d+(&nbsp;| )*月前",
# 12天前
r"\d+(&nbsp;| )*天前",
# 2小时前
r"\d+(&nbsp;| )*小时前",
# 15分钟前
r"\d+(&nbsp;| )*分钟前",
# 昨天&nbsp;17:59
r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
]
# 尝试匹配所有正则式
for pattern in pattern_list:
# 提取可见日期:
# 1、必须在标签内部,不能提取HTML标签属性中的日期
# 2、提取规则:必须在 > 和 < 之间,且中间不能再有 >
tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
# 搜索第一个匹配项
match = re.search(tag_pattern, html)
# 如果匹配成功,返回正确的发布时间
if match:
return match.group('date')
# 所有正则式匹配失败,返回空字符串
return ""
@staticmethod
def add_html_br(cleaned_text):
# 包装HTML标记:换行
# 1、优先替换双换行:使用goose提取到的cleaned_text,都是双换行
cleaned_text = cleaned_text.replace("\n\n", "<br>")
cleaned_text = cleaned_text.replace("\n", "<br>")
return cleaned_text
@staticmethod
def get_article_title(article: Article, link_text=''):
#
# 优先提取h1、div、span、td元素中的标题
# 1、测试任务:2.智能采集\1.测试任务\国资委-新闻发布
# a. 原title标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”-国务院国有资产监督管理委员会
# b. div元素中的标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”
# 2、测试任务:2.智能采集\1.测试任务\国家林业和草原局-地方动态
# a. 原title标题:上海完成森林资源年度监测遥感解译图斑市级质量检查_地方动态_国家林业和草原局政府网
# b. span元素中的标题:上海完成森林资源年度监测遥感解译图斑市级质量检查
#
# 根据xpath,查询标题元素时:
# 1、标签优先级:h1、特殊元素(id或class包含title)、h2、h3、div、span、td
#
title_element_list = [
'h1',
'h2',
'h3',
'div',
'span',
'td',
'p',
]
# 对比标题前,统一将空格剔除(2022-09-21):
# 1、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# 2、相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
link_text = link_text.replace(" ", "")
tag_title = article.title.replace(" ", "")
for title_element in title_element_list:
element_list = article.raw_doc.getroottree().xpath(f'//{title_element}')
# 查询XPath成功,遍历所有元素
for element in element_list:
# 取纯文本内容,包括子元素
text = etree.tounicode(element, method='text').strip()
text_no_space = text.replace(" ", "")
# 判断标题:
# 1、如果智能采集的原title标题,以“元素内容”开头,则取元素内容
# 2、查找成功后,返回text作为标题,否则继续下一个循环
# 判断是否以“元素中的标题”开始:
# 1、title必须以“元素中的标题”开始,不能判断“包含”
# 2、测试URL:http://v.people.cn/n1/2022/0901/c444662-32517559.html
# 3、title标签:<title>亿缕阳光丨小生意,大格局--人民视频--人民网</title>
# a. 如果判断“包含”,会采集到:人民网
# b. 因为存在元素:<a href="http://www.people.com.cn/" class="clink">人民网</a>
# c. 如果判断以“元素中的标题”开始,采集到:亿缕阳光丨小生意,大格局
# d. 标题元素:<h2>亿缕阳光丨小生意,大格局</h2>
# 新方案:
# 1、对比常用元素:仍判断是否以“元素中的标题”开始
# 2、优先对比“链接文本”,其次对比“title元素”
# 3、满足最少字数:6个字
# 新方案(2022-09-21):
# 1、对比“链接文本”、“title元素”时,除了判断开始,同时允许结尾
# 2、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# a. 列表中的链接文本:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电...
# b. title标签中的内容:<title>中国电力建设集团 公司要闻 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠</title>
# c. 元素中的标题:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠
if text_no_space is not None and text_no_space != '' and len(
text_no_space) >= SmartExtractorUtility.title_min_len:
# 优先判断6个字,以方便调试:排除短文本元素
if link_text.startswith(text_no_space) or link_text.endswith(text_no_space) or tag_title.startswith(
text_no_space) or tag_title.endswith(text_no_space):
# 返回时,仍返回未剔除空格后的标题
return text
# No candidate element matched: return an empty title.
# 1. Rationale: article.title is unreliable (it is just the <title> tag content), so there is no fallback.
return ''
@staticmethod
def get_publish_date(article: Article):
# 优先使用正则式提取日期
# 1、测试任务:加德满都邮报-national-security
# a. 使用 publish_datetime_utc 提取英文日期后,提取错误
# b. 实际日期:Friday, August 19, 2022,但提取到了:2015-02-05
# c. 原因:在下方JS中,有一段JSON文本: "datePublished": "2015-02-05T08:00:00+08:00"
# 2、注意:中文网站,都必须使用正则式
publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
if publish_date != '':
return publish_date
else:
if article.publish_datetime_utc:
# 优先使用提取成功的 datetime
return article.publish_datetime_utc.strftime('%Y-%m-%d')
elif article.publish_date:
# 其次使用提取成功的 date 字符串
return article.publish_date
else:
# 全部提取失败,返回字符串
return ''
@staticmethod
def get_article_text(article: Article):
# 第一种方法:在纯文本(cleaned_text)基础上,添加br标签
# 1、缺点:无法获取图片,同时会丢掉原有的p标签(只能用br替补)
# text = SmartExtractor.add_html_br(article.cleaned_text)
# 第二种方法:直接获取 top_node 的HTML内容
# 1、优点:可保留原有的p标签等
# 2、缺点:无法获取图片,img标签未被保留
# text = etree.tounicode(article.top_node, method='html')
# 测试抛出异常
# raise Exception("测试抛出异常")
# 第三种方法:获取到 top_node 的xpath,再通过xpath查询原始doc
# 1、可行:通过查询原始doc,可以获取“正文”的所有HTML内容
# 2、遇到问题:获取到 top_node 的xpath不准确,与原位置偏移一个元素
# a. 测试URL:https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml
# b. 获取到的xpath:/html/body/div/div[1]/div[2]/div[4]
# c. 实际xpath:/html/body/div/div[1]/div[2]/div[5]
# 3、解决办法:
# a. 优先使用id、class查询,如果没有id、class,再查询 top_node 的xpath
xpath = None
if type(article.top_node) is HtmlElement:
if 'id' in article.top_node.attrib:
xpath = "//*[@id='{}']".format(article.top_node.attrib['id'])
elif 'class' in article.top_node.attrib:
xpath = "//*[@class='{}']".format(article.top_node.attrib['class'])
else:
xpath = article.top_node.getroottree().getpath(article.top_node)
else:
# article.top_node 有时为空:
# 1、测试URL:https://baijiahao.baidu.com/s?id=1741311527693101670
# 2、输出日志:article.top_node 不是 HtmlElement 对象:None
print("SmartExtractor:article.top_node 为 {},不是 HtmlElement 对象。".format(article.top_node))
# article.top_node 为空时,直接输出 cleaned_text:
# 1、在纯文本(cleaned_text)基础上,添加br标签
text = SmartExtractorUtility.add_html_br(article.cleaned_text)
return text
# 根据xpath,查询元素
element_list = article.raw_doc.getroottree().xpath(xpath)
if element_list:
# 查询XPath成功,获取第一个元素的HTML
text = etree.tounicode(element_list[0], method='html')
else:
# 查询XPath失败,返回 top_node 原有的HTML
# 1、缺点:无法获取图片,img标签未被保留
text = etree.tounicode(article.top_node, method='html')
return text
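# A tiny self-check sketch for extract_publish_date (the sample html below is made up):
# dates are only matched between '>' and '<', so a date hidden in a tag attribute is ignored.
def extract_publish_date_demo():
    html = '<meta content="2015-02-05 08:00:00"><span>发布时间:2023-05-27 09:30</span>'
    # the attribute date above is skipped; the visible span text wins
    assert SmartExtractorUtility.extract_publish_date(html) == '2023-05-27 09:30'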
"""
天眼查动态采集流程,
1.根据信用代码获取天眼查的公司id
2.获取对应的动态数量
3.进行列表翻页采集
4.对动态详情进行请求和抽取
5.将相关信息发送到kafka和日志记录
"""
import json
import random
import time
import pymysql
import requests
from kafka import KafkaProducer
from BaseCore import BaseCore
from getTycId import getTycIdByXYDM
import smart_extractor
class Tycdt(object):
def __init__(self):
self.baseCore=BaseCore()
self.log = self.baseCore.getLogger()
self.cnx_ = self.baseCore.cnx
self.cursor_ = self.baseCore.cursor
self.taskType = '企业动态/天眼查/补采20W+'
# 初始化,设置中文分词
self.smart =smart_extractor.SmartExtractor('cn')
self.cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
self.cursor = self.cnx.cursor()
self.pageSize = 10
self.headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5NzE5MDMwMywiZXhwIjoxNjk5NzgyMzAzfQ.awXuS-59RzK35r0aUJq4Rj83JzyAOvsdUfL_ojp66CVQMjlLv_ZDD9g5gCoZKE21LN1JYRMLNZhuWsHhxapROw',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
# 日志信息保存至现已创建好数据库中,因此并没有再对此前保存日志信息数据库进行保存
def doJob(self):
while True:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
social_code = '913205002517479347'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
time.sleep(20)
continue
start = time.time()
try:
data = self.baseCore.getInfomation(social_code)
if len(data) == 0:
# no record found: push the social credit code back into redis and move on
self.baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode',social_code)
continue
id = data[0]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
self.cursor_.execute(updateSql)
self.cnx_.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = self.baseCore.getTimeCost(start, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, '', '获取天眼查id失败')
self.log.info(f'======={social_code}====重新放入redis====')
self.baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = self.baseCore.getTimeCost(start, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, '', '获取天眼查id失败')
self.baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
continue
count = data[17]
self.log.info(f"{id}---{xydm}----{tycid}----开始处理")
start_time = time.time()
# 开始采集企业动态
retData = self.beinWork(tycid, xydm,start_time)
# 信息采集完成后将该企业的采集次数更新
runType = 'NewsRunCount'
count += 1
self.baseCore.updateRun(social_code, runType, count)
total = retData['total']
up_okCount = retData['up_okCount']
up_errorCount = retData['up_errorCount']
up_repetCount = retData['up_repetCount']
self.log.info(
f"{id}---{xydm}----{tycid}----结束处理,耗时{self.baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
except Exception as e:
self.log.info(f'==={social_code}=====获取企业信息失败====')
#重新塞入redis
self.baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode',social_code)
state = 0
takeTime = self.baseCore.getTimeCost(start, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
#break
self.cursor.close()
self.cnx.close()
# 释放资源
self.baseCore.close()
def beinWork(self,tyc_code, social_code,start_time):
time.sleep(3)
# retData={'up_state':False,'total':0,'up_okCount':0,'up_errorCount':0,'up_repetCount':0}
retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
t = time.time()
url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={self.pageSize}&pn=1&emotion=-100&event=-100'
try:
# retry up to 3 times, catching per-attempt errors inside the loop (same pattern as the paging request below)
for m in range(0, 3):
try:
ip = self.baseCore.get_proxy()
self.headers['User-Agent'] = self.baseCore.getRandomUserAgent()
response = requests.get(url=url, headers=self.headers, proxies=ip, verify=False, timeout=10)
# time.sleep(random.randint(3, 5))
break
except:
pass
if (response.status_code == 200):
pass
except Exception as e:
self.log.error(f"{tyc_code}-----获取总数接口失败")
error = '获取总数接口失败'
state = 0
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, url, f'{error}----{e}')
return retData
try:
json_1 = json.loads(response.content.decode('utf-8'))
total = json_1['data']['total']
except:
self.log.error(f"{tyc_code}-----获取总数失败")
e = '获取总数失败'
state = 0
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, url, e)
return retData
if (total > 0):
if (total % self.pageSize == 0):
totalPage = total // self.pageSize
else:
totalPage = total // self.pageSize + 1
else:
self.log.error(f"{tyc_code}--------总数为0")
retData['state'] = True
return retData
self.log.info(f"{tyc_code}-------总数:{total}----总页数:{totalPage}")
retData['total'] = total
up_okCount = 0
up_errorCount = 0
up_repetCount = 0
for num in range(1, totalPage + 1):
# time.sleep(3)
self.log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
start_page = time.time()
url_page = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={self.pageSize}&pn={num}&emotion=-100&event=-100'
for m in range(0, 3):
try:
ip = self.baseCore.get_proxy()
self.headers['User-Agent'] = self.baseCore.getRandomUserAgent()
response_page = requests.get(url=url_page, headers=self.headers, proxies=ip,timeout=10, verify=False)
# time.sleep(1)
break
except:
pass
if (response_page.status_code == 200):
pass
else:
self.log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + self.pageSize
continue
try:
json_page = json.loads(response_page.content.decode('utf-8'))
info_list_page = json_page['data']['items']
except:
self.log.error(f"{tyc_code}--{num}页---获取分页数据失败")
e = '获取分页数据失败'
state = 0
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, url_page, e)
up_errorCount = up_errorCount + self.pageSize
continue
pageIndex = 0
for info_page in info_list_page:
pageIndex = pageIndex + 1
title = info_page['title']
source = info_page['website']
link = info_page['uri']
try:
sel_sql = '''select social_credit_code from brpa_source_article_news where source_address = %s and social_credit_code=%s and type='2' '''
self.cursor_.execute(sel_sql, (link, social_code))
except Exception as e:
print(e)
selects = self.cursor_.fetchone()
if selects:
self.log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
# todo:如果该条数据存在则说明该条数据之后的都已经采集完成,就可以跳出函数,执行下一个企业
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
#return retData
continue
try:
time_struct = time.localtime(int(info_page['rtm'] / 1000)) # 首先把时间戳转换为结构化时间
time_format = time.strftime("%Y-%m-%d %H:%M:%S", time_struct) # 把结构化时间转换为格式化时间
except:
time_format = self.baseCore.getNowTime(1)
#记录时间 对比时间
#if time_format > '2023-09-25' and time_format < '2023-10-01':
#pass
#else:
#continue
try:
# 开始进行智能解析
# lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang)
html_text=self.reqDetailmsg(link)
article=self.smart.extract_by_html(html_text,title)
content=article.cleaned_text
contentText=article.text
# #带标签正文
# contentText = self.smart.extract_by_url(link).text
# #不带标签正文
# content = self.smart.extract_by_url(link).cleaned_text
# time.sleep(3)
except Exception as e:
contentText = ''
if contentText == '':
self.log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
e = '获取正文失败'
state = 0
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, link, e)
up_errorCount = up_errorCount + 1
try:
insert_err_sql = f"insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values('{social_code}','{source}','{link}','{title}','{time_format}','{info_page['abstracts']}',now(),1,{num},{pageIndex})"
self.cursor.execute(insert_err_sql)
self.cnx.commit()
except:
pass
continue
try:
insert_sql = '''insert into brpa_source_article_news(social_credit_code,source_address,origin,type,publish_time,content,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
# 动态信息列表
up_okCount = up_okCount + 1
list_info = [
social_code,
link,
f'天眼查-{source}',
'2',
time_format,
content[:500]
]
self.cursor_.execute(insert_sql, tuple(list_info))
self.cnx_.commit()
# 采集一条资讯记录一条,记录该企业采到了多少的资讯
self.log.info(f'{social_code}----{link}:新增一条')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:插入一条数据,并传入kafka
dic_news = {
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': contentText,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': source,
'publishDate': time_format,
#'sid': '1684032033495392257',
'sid': '1714852232679067649',
'sourceAddress': link, # 原文链接
'summary': info_page['abstracts'],
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': time_format[:4]
}
except Exception as e:
self.log.info(f'传输失败:{social_code}----{link}')
error = '数据库传输失败'
state = 0
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, link, f'{error}----{e}')
continue
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'true',
'message': '操作成功',
'code': '200',
}
self.log.info(dic_result)
# 传输成功,写入日志中
state = 1
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, link, '成功')
# return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
self.log.error(dic_result)
e = 'Kafka操作失败'
state = 0
takeTime = self.baseCore.getTimeCost(start_time, time.time())
self.baseCore.recordLog(social_code, self.taskType, state, takeTime, link, e)
self.log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{self.baseCore.getTimeCost(start_page, time.time())}")
retData['up_okCount'] = up_okCount
retData['up_errorCount'] = up_errorCount
retData['up_repetCount'] = up_repetCount
return retData
def reqDetailmsg(self,url):
header={
'Connection':'keep-alive',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-platform':'"Windows"',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site':'none',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-User':'?1',
'Sec-Fetch-Dest':'document',
'Accept-Encoding':'gzip, deflate, br',
}
header['User-Agent'] = self.baseCore.getRandomUserAgent()  # randomize the UA on the headers actually sent with this request
# proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
proxy = self.baseCore.get_proxy()
for i in range(0,2):
try:
response=requests.get(url=url,headers=header,timeout=10,proxies=proxy,verify=False)
response.encoding = response.apparent_encoding
htmltext=response.text
except Exception as e:
htmltext=''
self.log.info(f'{url}---详情请求失败--{e}')
if htmltext:
self.log.info(f'{url}---详情请求成功')
break
return htmltext
if __name__ == '__main__':
tydt=Tycdt()
tydt.doJob()
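# Design note (a sketch, not the project's code): beinWork builds a new KafkaProducer for
# every article. Reusing one producer per batch, with the broker address used above,
# would avoid the per-message connection cost:
def send_news_batch_sketch(dic_news_list):
    import json
    from kafka import KafkaProducer
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        for dic_news in dic_news_list:
            producer.send("researchReportTopic",
                          json.dumps(dic_news, ensure_ascii=False).encode('utf8')).get(timeout=10)
    finally:
        producer.close()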
import os
import redis
from flask import Flask, request, send_file, render_template, jsonify
import json
import pymysql
from pyquery import PyQuery as pq
from flask_cors import cross_origin
'''
Parse interface data captured by hand:
fiddler forwards the page data for a given link to this backend, and the backend parses and stores it.
'''
r = redis.Redis(host='127.0.0.1', port='6379', db=0)
def connMysql():
# 创建MySQL连接
conx = pymysql.connect(host='114.115.159.144',
user='caiji',
password='zzsn9988',
database='caiji')
# 创建一个游标对象
cursorM = conx.cursor()
return conx,cursorM
def closeSql(conx,cursorM):
# 关闭游标和连接
cursorM.close()
conx.close()
# insert one parsed hold record into table qccholdmsg
def itemInsertToTable(item):
conx,cursorM=connMysql()
zKeyNo=item['zKeyNo']
yKeyNo=item['yKeyNo']
try:
select_sql=f'select * from qccholdmsg where yKeyNo="{yKeyNo}" and zKeyNo="{zKeyNo}" '
cursorM.execute(select_sql)
existing_record = cursorM.fetchone()
except Exception as e:
existing_record=''
if existing_record:
print(f'数据已存在!{zKeyNo}')
return
insert_param=(item['yKeyNo'],item['yCompanyName'],item['nameCount'],item['zKeyNo'],item['zName'],
item['registCapi'],item['province'],item['industry'],item['shortStatus'],item['percentTotal'],item['startDateStr'],
item['h5Url'],item['district'],item['industryDesc'],item['area'],item['industryItem'])
insert_sql ="INSERT into qccholdmsg (yKeyNo,yCompanyName,nameCount,zKeyNo,zName,registCapi,province," \
"industry,shortStatus,percentTotal,startDateStr,h5Url,district,industryDesc,area,industryItem) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.execute(insert_sql,insert_param)
# 定义插入数据的SQL语句
# 执行插入操作
conx.commit()
print('数据插入成功!')
closeSql(conx,cursorM)
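# Illustrative only: the dict shape itemInsertToTable expects (keys mirror the INSERT
# above; every value below is a made-up placeholder):
def item_insert_demo():
    item = {
        'yKeyNo': 'y001', 'yCompanyName': '示例控股公司', 'nameCount': 1,
        'zKeyNo': 'z001', 'zName': '示例被投公司', 'registCapi': '1000万元',
        'province': 'ZJ', 'industry': '制造业', 'shortStatus': '存续',
        'percentTotal': '51%', 'startDateStr': '2020-01-01', 'h5Url': '',
        'district': '', 'industryDesc': '', 'area': '', 'industryItem': '',
    }
    itemInsertToTable(item)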
app = Flask(__name__)
@app.route('/')
@cross_origin()
def index():
return 'Welcome to the website!'
@app.route('/get_hold', methods=['POST'])
@cross_origin()
def get_news():
# The original handler stops after reading the form data; return a body so Flask
# does not fail with a None response. Parsing + itemInsertToTable wiring is still to be done.
data = request.form
return 'success'
@app.route('/ws/setCookie', methods=['GET'])
# @cross_origin()
def setCookie():
try:
cookie = request.args.get('cookie')
r.sadd('wscookie',cookie)
except Exception as e:
print('error')
return 'success'
@app.route('/ws/getCookieSize', methods=['GET'])
@cross_origin()
def getCookieSize():
try:
size=r.scard('wscookie')
data = {
"code": 200,
"msg": "操作成功",
"data": size
}
except Exception as e:
data={
"code": 200,
"msg": "操作失败",
"data": 0
}
return jsonify(data)
@app.route('/ws/getHtml', methods=['POST'])
# @cross_origin()
def getnewMonth():
try:
html = request.form.get('html')
doc=pq(html)
wsmsg=doc('select[id="endMonth"]>option[selected="selected"]').text()
r.set('wsmsg',wsmsg)
except Exception as e:
print('error')
return 'success'
if __name__ == '__main__':
app.run(port=8033)
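# A hedged client-side sketch: whatever captures the page (e.g. a fiddler script) POSTs
# the raw html as a form field to /ws/getHtml on the service above, then the parsed
# month value can be read back from redis:
def post_html_sketch(html):
    import requests
    requests.post('http://127.0.0.1:8033/ws/getHtml', data={'html': html}, timeout=10)
    print(r.get('wsmsg'))  # the <option selected> text extracted by getnewMonth()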
function r(size){
var str = "",
arr = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
for(var i=0; i<size; i++){
str += arr[Math.round(Math.random() * (arr.length-1))];
}
return str;
}
function strTobinary(str) {
var result = [];
var list = str.split("");
for (var i = 0; i < list.length; i++) {
if (i != 0) {
result.push(" ");
}
var item = list[i];
var binaryStr = item.charCodeAt().toString(2);
result.push(binaryStr);
};
return result.join("");
}
function cipher() {
var date = new Date();
var timestamp = date.getTime().toString();
var salt = r(24);
var year = date.getFullYear().toString();
var month = (date.getMonth() + 1 < 10 ? "0" + (date.getMonth() + 1) : date
.getMonth()).toString();
var day = (date.getDate() < 10 ? "0" + date.getDate() : date.getDate())
.toString();
var iv = year + month + day;
return salt
}
function des(salt,iv,enc) {
// var enc = des3(timestamp, salt, iv).toString();
var str = salt + iv + enc;
var ciphertext = strTobinary(str);
return ciphertext;
}
function token(){
var size = 24
var str = "",
arr = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
for(var i=0; i<size; i++){
str += arr[Math.round(Math.random() * (arr.length-1))];
}
return str;
}
function pageid() {
var n = 32
var text = "";
var possible = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
for (var i = 0; i < n; i++)
text += possible.charAt(Math.floor(Math.random() * possible.length));
return text;
}
// console.log(cipher());
\ No newline at end of file
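# For reference, a Python sketch equivalent to the JS des()/strTobinary() helpers above:
# the request "ciphertext" is salt + iv + DES3(timestamp), with every character replaced
# by its code point in binary, space-separated.
def str_to_binary(s):
    return " ".join(bin(ord(ch))[2:] for ch in s)

def make_ciphertext(salt, iv, enc):
    return str_to_binary(salt + iv + enc)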
import base64
import datetime
import json
import random
import time
import execjs
import requests
import urllib3
from Crypto.Cipher import DES3
from kafka import KafkaProducer
from BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore=BaseCore()
log=baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
#保存错误日志
def insertBadSql(error):
insertSql = f"insert into cpwsw_log (code,description,success,create_time,user,keyword,msg) values (%s,%s,%s,now(),%s,%s,%s)"
cursor_.execute(insertSql, tuple(error))
cnx_.commit()
#cookie的处理
def updateCookie(cookie,type):
if type==2:
#session失效,删除token
cursor_.execute("delete from cpwsw_user where cookie=%s",[cookie])
if type ==1:
#正常使用
cursor_.execute("update cpwsw_user set update_time=now() where cookie=%s",[cookie])
if type ==3:
#未知异常
cursor_.execute("update cpwsw_user set fenghao_time=now() where cookie=%s",[cookie])
cnx_.commit()
# 将DES3加密解密设置为类
class EncryptDate:
def __init__(self, pianyi, key):
self.key = key # 初始化密钥
self.iv = bytes(pianyi,encoding='utf8') # 偏移量
self.length = DES3.block_size # 初始化数据块大小
self.des3 = DES3.new(self.key, DES3.MODE_CBC, self.iv) # create the DES3 (not AES) cipher instance in CBC mode
# 截断函数,去除填充的字符
self.unpad = lambda date: date[0:-ord(date[-1])]
def pad(self, text):
"""
#填充函数,使被加密数据的字节码长度是block_size的整数倍
"""
count = len(text.encode('utf-8'))
add = self.length - (count % self.length)
entext = text + (chr(add) * add)
return entext
def encrypt(self, encrData): # 加密函数
res = self.des3.encrypt(self.pad(encrData).encode("utf8"))
msg = str(base64.b64encode(res), encoding="utf8")
# msg = res.hex()
return msg
def decrypt(self, decrData): # 解密函数
res = base64.decodebytes(decrData.encode("utf8"))
# res = bytes.fromhex(decrData)
msg = self.des3.decrypt(res).decode("utf8")
return self.unpad(msg)
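# A round-trip sketch for EncryptDate with illustrative values: DES3 wants a 16- or
# 24-byte key and an 8-byte iv, which is why the crawler uses r(24) as the key and the
# yyyymmdd date as the iv (both passed as str, mirroring the class above). A fresh
# instance decrypts, since the CBC cipher object is stateful after encrypt().
def encrypt_date_roundtrip_demo():
    iv, key = '20231015', 'abcdefghijklmnopqrstuvwx'  # made-up iv/key
    enc = EncryptDate(iv, key).encrypt('1697190303.123')
    assert EncryptDate(iv, key).decrypt(enc) == '1697190303.123'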
with open('裁判文书网.js', 'r', encoding='utf-8') as f:
jstext = f.read()
# 在python中调用js代码
ctx = execjs.compile(jstext)
print("ok")
url = 'https://wenshu.court.gov.cn/website/parse/rest.q4w'
#获取登录Cookie
def getCookie():
cursor_.execute(
f"select user,cookie from cpwsw_user where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
row = cursor_.fetchall()
if row:
pass
else:
# 没有查到token
log.info("没有拿到token")
return False
return row[0]
#获取正文
def getDoc(info_id,userCookie):
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': userCookie,
'Host': 'wenshu.court.gov.cn',
'Referer': 'https://wenshu.court.gov.cn',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}
salt = ctx.call('cipher')
date_now = time.strftime("%Y%m%d",time.localtime())
t = time.time()
eg = EncryptDate(date_now,salt) # iv and key; DES3 expects a 16- or 24-byte key (salt is r(24) = 24 chars)
des = eg.encrypt(str(t)) #DES3加密
ciphertext = ctx.call("des",salt,date_now,des)
token = ctx.call("token")
data_info = {
'docId':info_id,
'ciphertext':ciphertext,
'cfg':'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch',
'__RequestVerificationToken':token,
'wh':'250',
'ww':'1536',
'cs':'0'
}
ip = baseCore.get_proxy()
res_info = requests.post(url=url,headers=headers,data=data_info,proxies=ip, verify=False,timeout=10)
#{'code': -12, 'description': None, 'secretKey': None, 'result': None, 'success': False} SESSION的值不对
#{'code': 9, 'description': '没有权限请求接口,cfg=com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@docInfoSearch', 'secretKey': None, 'result': None, 'success': False}
#'{"code":1,"description":"权限已失效","secretKey":null,"result":null,"success":true}'
code = res_info.json()["code"]
if code != 1:
log.error(f"正文获取失败:----{res_info.json()}")
# 没有正常返回
return ""
try:
eg_jie = EncryptDate(date_now,res_info.json()['secretKey'])
res_jie = eg_jie.decrypt(res_info.json()['result']) # DES3 decrypt
except Exception as e:
log.error(f"正文获取失败:----{e}")
return ""
return res_jie
#
def insertCpwsList(keyword,page,list_info,userCookie):
listCount = 0
repetCount = 0
insertCount = 0
for one_info in list_info:
listCount = listCount + 1
info_title = one_info['1']
info_time = one_info['31']
info_address = one_info['2']
info_yuanyou = one_info['26']
info_bianhao = one_info['7']
info_id = one_info['rowkey']
selectCountSql = f"select count(1) from cpwsw_list where keyword=%s and rowkey=%s"
cursor_.execute(selectCountSql,[keyword,info_id])
count = cursor_.fetchone()[0]
if count > 0:
repetCount = repetCount + 1
continue
else:
insertCount = insertCount + 1
try:
# 获取正文
log.info("开始采集正文")
content = getDoc(info_id,userCookie)
log.info("结束采集正文,开始休眠")
if content=='':
log.info("采集到的正文为空")
continue
mcontent=json.loads(content)
bdetail={
'title':mcontent['s1'],
'content':mcontent['s25'],
'contentHtml':mcontent['qwContent'],
'source':mcontent['s2'],
'publishtime':mcontent['s41'],
'detailurl':mcontent['s5'],
}
try:
processitem=getProcessitem(bdetail)
sendkafka(processitem)
except Exception as e:
log.error(f"{info_id}保存kafka失败:{e}")
insertSql = f"insert into cpwsw_list (keyword,title,time,address,yuanyou,bianhao,rowkey,state,create_time,content) " \
f"values (%s,%s,%s,%s,%s,%s,%s,0,now(),%s)"
cursor_.execute(insertSql, [keyword,info_title,info_time,info_address,info_yuanyou,info_bianhao,info_id,content])
cnx_.commit()
updateCookie(userCookie, 1)
time.sleep(random.randint(60, 180))
except Exception as e:
log.error(f"保存数据库失败:{e}")
time.sleep(random.randint(60, 180))
log.info(f"---{keyword}--第{page}页----总数:{listCount}---重复数:{repetCount}---新增数:{insertCount}-------------")
if listCount == 0:
# 列表为空认为结束
return True
if repetCount >= listCount / 2:
# 重复数量大于等于一半认为结束
return True
# 没有结束
return False
def getNowDate():
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate
def getProcessitem(bdetail):
nowDate=getNowDate()
content=bdetail['content']
if content!='':
processitem={
"sid":1706193555675926530,
"source":"19",#裁判文书
"title":bdetail['title'],
"content":bdetail['content'],
"contentWithtag":bdetail['contentHtml'],
"origin":bdetail['source'],
"publishDate":bdetail['publishtime'],
"sourceAddress":bdetail['detailurl'],
"createDate":nowDate
}
return processitem
def sendkafka(processitem):
producer = None
try:
producer = KafkaProducer(bootstrap_servers="114.115.159.144:9092")
content=processitem['content']
publishDate=str(processitem['publishDate'])
title=processitem['title']
if title =='':
return
if content=='':
return
if publishDate=='':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
log.info("数据发送kafka成功")
log.info(kafka_result.get(timeout=10))
except Exception as e:
log.info('发送kafka异常')
finally:
# producer stays None if KafkaProducer() itself failed; guard before closing
if producer:
producer.close()
def getList(keyword,page):
userAndCookie = getCookie()
if userAndCookie:
pass
else:
log.info("没有拿到token,开始递归")
while True:
log.info("没有拿到token,开始休眠")
time.sleep(60)
log.info("没有拿到token,结束休眠")
userAndCookie = getCookie()
if userAndCookie:
break
user = userAndCookie[0]
userCookie = userAndCookie[1]
log.info(f"获取到user----{user}")
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': userCookie,
'Host': 'wenshu.court.gov.cn',
'Referer': 'https://wenshu.court.gov.cn',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}
salt = ctx.call('cipher')
date_now = time.strftime("%Y%m%d", time.localtime())
t = time.time()
eg = EncryptDate(date_now, salt) # iv and key; DES3 expects a 16- or 24-byte key (salt is r(24) = 24 chars)
des = eg.encrypt(str(t)) # DES3加密
ciphertext = ctx.call("des", salt, date_now, des)
pageId = ctx.call("pageid")
token = ctx.call("token")
search_key = [{"key": "s21", "value": f"{keyword}"}]
data = {
'pageId':pageId,
's21': keyword,
'sortFields': 's51:desc', # 按裁判日期排序
'ciphertext': ciphertext,
'pageNum': page,
'pageSize': '5',
'queryCondition': str(search_key),
'cfg': 'com.lawyee.judge.dc.parse.dto.SearchDataDsoDTO@queryDoc',
'__RequestVerificationToken': token,
'wh': '403',
'ww': '1531',
'cs': '0'
}
proxy=baseCore.get_proxy()
res = requests.post(url=url, headers=headers, data=data,verify=False,timeout=10,proxies=proxy)
code = res.json()["code"]
if code!=1:
#没有正常返回
#记录信息 删除登录信息
error = [res.json()["code"], res.json()["description"], res.json()["success"], user, keyword,'']
insertBadSql(tuple(error))
updateCookie(userCookie, 3)
return getList(keyword, page)
eg_jie = EncryptDate(date_now, res.json()['secretKey'])
res_jie = eg_jie.decrypt(res.json()['result'])
res_json = json.loads(res_jie) # 将解密后的数据转换为json格式
list_info = res_json['queryResult']['resultList']
return insertCpwsList(keyword, page,list_info,userCookie)
#
def doJob(keyword):
log.info(f"======{keyword}----开始采集=======")
for page in range(1,6):
retFlag = getList(keyword, page)
time.sleep(random.randint(60,180))
if retFlag:
#结束 跳出该公众号
break
else:
#没有结束
pass
log.info(f"======{keyword}---------结束采集=======")
def test():
pass
if __name__=="__main__":
while True:
kwLstr='浙江银通典当有限责任公司|浙江省化工进出口有限公司|浙江省能源集团有限公司|浙江海港中奥能源有限责任公司|杭州香格里拉饭店有限公司|浙能集团新疆准东能源化工有限公司|温州衢化东南工贸有限公司|义乌产权交易所有限公司|温州机场集团有限公司|浙江浙能电力股份有限公司|浙江工程设计有限公司|浙江国信控股集团有限责任公司|浙江巨化集团进出口有限公司|上海盛东国际集装箱码头有限公司|衢州巨化房地产开发有限公司|浙江省空港融资租赁有限公司|浙江中旅商务会展有限公司|浙江海港集团财务有限公司|浙江浙能港口运营管理有限公司|浙江机场集团保安服务有限公司|新疆塔建三五九建工有限责任公司|浙江能源天然气集团有限公司|浙江华江科技股份有限公司|浙江浙能资产经营管理有限公司|浙江海港黄泽山油品储运有限公司|浙江浙旅投资有限责任公司|浙江海港海洋工程建设有限公司|浙江省能源集团财务有限责任公司|江西省赣浙能源有限公司|温州联合产权交易中心有限公司|浙江省通用航空产业发展有限公司|物产中大期货有限公司|浙江省富浙融资租赁有限公司|宁波机场集团有限公司|浙江省振兴乡村建设投资发展集团有限公司|浙江杭钢融资租赁有限公司|浙江国贸云商控股有限公司|浙江省农村实业发展有限公司|巨化控股有限公司|浙江省国际贸易集团有限公司|浙江机电职业技术学院|浙江头门港投资开发有限公司|伊犁新天煤化工有限责任公司|浙江省农村发展集团农产品有限公司|衢州巨化传媒有限公司|浙江机场投资有限责任公司|浙江中大元通融资租赁有限公司|巨化集团公司兴化实业有限公司|浙江新世纪期货有限公司|巨化集团有限公司|浙旅盛景资本投资有限公司|浙江省医疗健康集团有限公司|浙江歌瑞新材料有限公司|浙江省国贸集团资产经营有限公司|浙能资本控股有限公司|杭州萧山国际机场有限公司|浙江梅苑酒店管理有限公司|浙江国贸集团东方机电工程股份有限公司|上港集团平湖独山港码头有限公司|浙江巨化热电有限公司|浙江省粮食集团有限公司|宁波舟山港集团有限公司|浙江省纺织品进出口集团有限公司|浙江巨化物流有限公司|浙江建设技师学院|浙江杭州临空经济开发有限公司|浙江中大技术进出口集团有限公司|浙江省粮油食品进出口股份有限公司|浙江省石油股份有限公司|浙江空港培训服务咨询有限公司|浙江省机电集团有限公司|浙江省二轻集团有限责任公司|巨化集团上海融资租赁有限公司|浙江巨化股份有限公司|宁波航运交易所有限公司|浙江巨化投资有限公司|浙江省农村发展集团有限公司|浙江省国际贸易集团温州有限公司|浙江巨化化工矿业有限公司|浙江天虹物资贸易有限公司|浙江浙能兴源节能科技有限公司|浙江巨化装备工程集团有限公司|温州瑞平苍高速公路有限公司|上海巨化实业发展有限公司|浙能集团新疆准东煤业有限公司|浙江浙能煤运投资有限责任公司|浙江省新能源投资集团股份有限公司|浙江锦华新材料股份有限公司|浙旅湛景置业有限公司|浙江省交通投资集团财务有限责任公司|杭州钢铁集团有限公司|浙江巨化塑胶有限责任公司|浙江巨化信息技术有限公司|浙江新农都实业有限公司|浙江省万里教育集团|浙江长广(集团)有限责任公司|浙江海港资产管理有限公司|物产中大融资租赁集团有限公司|浙江浙能企业管理培训服务有限公司|浙江省海洋产业投资有限公司|浙江省交通投资集团有限公司|浙江轻工联非融资性担保有限公司|浙江省机场集团有限公司|浙江巨化汉正新材料有限公司|浙江海港内河港口发展有限公司|浙江外事旅游股份有限公司|浙江省浙商资产管理股份有限公司|浙江东方金融控股集团股份有限公司|浙江浙能技术研究院有限公司|浙江经济职业技术学院|浙江巨化清安检测科技有限公司|浙江省国有资本运营有限公司|浙江省土产畜产进出口集团有限公司|浙江巨化能源有限公司|浙江产权交易所有限公司|宁波海运集团有限公司|浙江省中国旅行社集团有限公司|杭州工商信托股份有限公司|浙江省衢州机场管理有限公司|浙江省旅游投资集团有限公司|巨化集团公司塑化厂|同煤浙能麻家梁煤业有限责任公司|浙江轻工联典当有限责任公司|浙江省海港投资运营集团有限公司|衢州衢化宾馆有限公司|舟山普陀山机场有限公司|深圳市巨化华南投资发展有限公司|浙江金华安邦护卫有限公司|浙江海港洋山投资开发有限公司|物产中大资本管理(浙江)有限公司|浙江南方工程咨询管理有限公司|黑龙江绿色农业发展集团有限公司|浙商财产保险股份有限公司|物产中大资产管理(浙江)有限公司|衢州氟硅技术研究院|招商局港口集团股份有限公司|浙江省台州机场管理有限公司|浙江省机电技师学院|巨化集团财务有限责任公司|浙江省电力建设有限公司|浙江省中医药健康产业集团有限公司|浙江巨化化工材料有限公司|浙江浙商金控有限公司|浙江富兴电力燃料有限公司|浙旅蝶来酒店集团有限公司|浙江英特集团股份有限公司|浙江省义乌机场管理有限公司|浙江省农都农产品有限公司|物产中大集团财务有限公司|宁波海运股份有限公司|浙江省建设投资集团股份有限公司|浙江菲达环保科技股份有限公司|浙江海正药业股份有限公司|物产中大集团股份有限公司|浙江运达风电股份有限公司|杭州钢铁股份有限公司|浙商中拓集团股份有限公司|勿忘农集团有限公司|浙江交通科技股份有限公司|巨化集团(香港)有限公司|香港泰纬国际贸易有限公司|浙江能源国际有限公司|常熟市国瑞科技股份有限公司|浙江镇洋发展股份有限公司|浙商银行股份有限公司|浙江物产环保能源股份有限公司|浙江诺和机电股份有限公司|浙江沪杭甬高速公路股份有限公司|宁波远洋运输股份有限公司|浙江大学控股集团有限公司|安邦护卫集团股份有限公司|浙江富建投资管理有限公司|浙江富浙科技有限公司|浙江富浙资产管理有限公司|浙江富浙资本管理有限公司|浙江富物资产管理有限公司|浙江省发展资产经营有限公司|浙江省环境科技有限公司|浙江省盐业集团有限公司|浙江省种业集团有限公司|浙江物产中大医药有限公司|浙江物产元通汽车集团有限公司|浙江物产实业控股(集团)有限公司|物产中大(浙江)产业投资有限公司|物产中大云商有限公司|物产中大元通实业集团有限公司|物产中大元通汽车有限公司|物产中大公用环境投资有限公司|物产中大化工集团有限公司|物产中大医疗健康投资有限公司|物产中大国际学院|物产中大数字科技有限公司|物产中大欧泰有限公司|物产中大物流投资集团有限公司|物产中大财智共享服务(浙江)有限公司|物产中大金属集团有限公司|物产中大金石集团有限公司|物产中大长乐林场有限公司|物产中大集团投资有限公司|南水北调(开化)水务有限公司|太仓中茵建设投资有限公司|安吉浙建投资有限公司|庆元县浙建项目管理有限公司|新昌县浙建投资管理有限公司|新疆阿拉尔上游水利水电工程有限责任公司|杭州财金未来社区股权投资合伙企业(有限合伙)|永嘉县浙建投资有限公司|浙建(兰溪)矿业有限公司|浙建项目管理(杭州)有限公司|浙江基建投资管理有限公司|浙江天台浙建环保科技有限公司|浙江建工建筑工程咨询有限公司|浙江建投创新科技有限公司|浙江建投发展房地产开发有限公司|浙江建投工程物资设备有限公司|浙江建投数字技术有限公司|浙江建投机械租赁有限公司|浙江建投环保工程有限公司|浙江建设商贸物流有限公司|浙江浙建云采贸易有限公司|浙江浙建实业发展有限公司|浙江浙建建筑设计有限公司|浙江浙建美丽乡村建设有限公司|浙江浙建钢结构有限公司|浙江省一建建设集团有限公司|浙江省二建建设集团有限公司|浙江省工业设备安装集团有限公司|浙江省建工集团有限责任公司|浙江省建投交通基础建设集团有限公司|浙江省建材集团有限公司|浙江省建设工程机械集团有限公司|浙江财金未来社区股权投资有限责任公司|衢州市衢江区浙建投资管理有限公司|长兴建图建设投资管理有限公司|长兴浙建投资有限公司|玉环市浙建城镇建设投资有限公司|绍兴市越城区浙建建设项目管理有限公司|绍兴市越城区浙建投资有限公司|苏州浙建地产发展有限公司|长兴浙建城镇建设有限公司|遂昌浙建投资有限公司|长兴浙永建设投资有限公司|阿拉尔浙建城市建设投资有限公司|浙江《机电工程》杂志社有限公司|浙江工匠培训有限公司|浙江康宁咨询服务有限公司|浙江新华体育器材制造有限公司|浙江机电华瑞航空投资有限公司|浙江机电集团投资有限公司|浙江新联民爆器材有限公司|浙江浙商金融服务有限公司|浙江省军工集团股份有限公司|浙江省工业矿产对外贸易有限责任公司|浙江省机电设计研究院有限公司|浙江蓝箭称重技术有限公司|浙江省机械设备进出口有限责任公司|浙江富浙投资有限公司|浙江海正集团有限公司|浙
江五金矿产控股有限公司|浙江省医药保健品进出口有限责任公司|浙江省国兴进出口有限公司|浙江省国际贸易集团供应链有限公司|浙江省工艺品进出口有限公司|浙江经贸房地产公司|浙江英特药业有限责任公司|天台国大镕丰酒店管理有限公司|杭州千岛湖温馨岛娱乐旅游有限公司|杭州花港饭店|浙江《江南游报》社有限责任公司|浙江东菱酒店有限公司|杭州花港海航度假酒店有限公司|浙江国际大酒店有限公司|浙江旅游信息中心有限公司|浙江时代国际展览服务有限公司|浙江浙勤服务控股集团有限公司|浙江浙勤集团有限公司|浙江浙旅投数字科技有限公司|浙江省人才发展集团有限公司|浙江省古村落(传统村落)保护利用股权投资基金合伙企业(有限合伙)|浙江省国际投资服务中心有限公司|浙江省国际贸易展览有限公司|浙江省旅工贸有限责任公司|浙江雷迪森物业服务有限公司|浙江雷迪森酒店集团有限公司|舟山市普陀山银海饭店有限公司|雷迪森旅业集团有限公司|上海杭钢凯暄矿业投资有限公司|幸福之江资本运营有限公司|杭州杭钢合金钢铸造有限公司|杭州钢铁厂小型轧钢股份有限公司|浙江东菱商贸有限公司|浙江富春紫光环保股份有限公司|中杭监测技术研究院有限公司|杭州紫云能源综合利用开发有限公司|杭州紫元置业有限公司|浙江杭钢人力资源开发服务有限公司|浙江杭钢健康产业投资管理有限公司|浙江杭钢公管后勤服务有限公司|浙江杭钢动力有限公司|浙江杭钢商贸集团有限公司|浙江杭钢工贸有限公司|浙江杭钢数字科技有限公司|浙江杭钢智谷科技有限公司|浙江杭钢电炉炼钢有限公司|浙江杭钢职业教育集团有限公司|浙江杭钢高速线材有限公司|浙江省冶金研究院有限公司|浙江省工业设计研究院有限公司|浙江省环保集团有限公司|浙江省遂昌金矿有限公司|浙江紫汇资产管理有限公司|浙江紫臻物业管理服务有限公司|浙江钢联控股有限公司|温州杭钢水务有限公司|上海华山康健医疗有限公司|台州台信企业管理合伙企业(有限合伙)|浙江中天东方氟硅材料股份有限公司|浙江巨化环保科技有限公司|浙江巨柯私募基金管理有限公司|浙江巨荣石油化工销售有限公司|浙江晋巨化工有限公司|浙能巨化(浙江自贸区)股权投资基金合伙企业(有限合伙)|中核浙能能源有限公司|中海油绿能港浙江宁波能源有限公司|伊犁新矿煤业有限责任公司|内蒙古同煤鄂尔多斯矿业投资有限公司|国家管网集团浙江省天然气管网有限公司|浙江浙能产业研究院有限公司|浙江浙能物业发展有限公司|浙江浙能绿城体育文化发展有限公司|浙江省白马湖实验室有限公司|浙江职业足球俱乐部有限公司|嘉兴公路建设投资有限公司|嘉兴市嘉萧高速公路投资开发有限公司|德清县杭绕高速有限公司|杭州都市高速公路有限公司|浙商食品集团有限公司|浙江临金高速公路有限公司|浙江义东高速公路有限公司|浙江乐清湾高速公路有限公司|浙江交投交通建设管理有限公司|浙江交投太平交通基础设施股权投资基金(有限合伙)|浙江交投高速公路建设管理有限公司|浙江交投高速公路运营管理有限公司|浙江交通资源投资集团有限公司|浙江台州沈海高速公路有限公司|浙江台州甬台温高速公路有限公司|浙江宁波杭甬复线三期高速公路有限公司|浙江宁波甬台温高速公路有限公司|浙江数智交院科技股份有限公司|浙江景文高速公路有限公司|浙江杭宁高速公路有限责任公司|浙江杭宣高速公路有限公司|浙江杭新景高速公路有限公司|浙江杭海城际铁路有限公司|浙江杭温铁路有限公司|浙江杭甬复线宁波一期高速公路有限公司|浙江杭绍甬高速公路有限公司|浙江沪平盐铁路有限公司|浙江温州市域铁路一号线有限公司|浙江甬舟复线一期高速公路有限公司|浙江省交投控股集团有限公司|浙江省交通投资集团高速公路管理有限公司|浙江省商业集团有限公司|浙江省海运集团股份有限公司|浙江省经济建设投资有限公司|浙江省轨道交通建设管理集团有限公司|浙江省轨道交通运营管理集团有限公司|浙江省铁路发展控股集团有限责任公司|浙江省长三角投资有限公司|浙江舟山北向大通道有限公司|浙江衢丽铁路有限公司|浙江衢松铁路有限公司|浙江诸永高速公路有限公司|浙江路产城发展集团有限公司|浙江金华甬金衢上高速公路有限公司|浙江高信技术股份有限公司|浙江高速物流有限公司|温州市文泰高速公路有限公司|温州市瑞文高速公路有限公司|绍兴柯桥杭金衢联络线高速公路有限公司|金华市东永高速投资有限公司|东港投资发展集团有限公司|杭州农发原乡人农特产有限公司|杭州千岛湖鲟龙科技股份有限公司|浙江农发产业投资有限公司|浙江省现代农业研究会|芜湖信农硬科技投资合伙企业(有限合伙)|黑龙江亚欧牧业有限公司|浙江四港联动发展有限公司|浙江电子口岸有限公司|浙江空港商业经营管理有限责任公司|浙江空港数字科技有限公司|浙江空港资本控股有限公司|上海大宗商品仓单登记有限责任公司|宁波舟山港铁矿石储运有限公司|杭州港务集团有限公司|浙江中澳现代产业园有限公司|浙江义迪通供应链服务有限公司|浙江之迪控股有限公司|浙江海港国际联运有限公司|浙江海港大宗商品交易中心有限公司|浙江海港引航服务有限公司|浙江船舶交易市场有限公司|杭州富格企业管理合伙企业(有限合伙)|杭州松下马达有限公司|汇孚集团有限公司|浙江信联钢铁有限公司|浙江广杰投资管理有限公司|浙江建设融资租赁有限公司|浙江杰尚投资管理有限公司|浙江申达塑料机械有限公司|浙江申达机器制造股份有限公司|浙江省二轻供销总公司|浙江省二轻商业经营管理有限公司|浙江省二轻房地产开发有限公司|浙江省工美控股有限公司|浙江省工艺美术研究院有限公司|浙江省皮革塑料有限公司|浙江省艺创投资发展股份有限公司|浙江省艺创文旅发展有限公司|安邦护卫(浙江)公共安全智慧科技有限公司|浙江丽水安邦护卫有限公司|浙江台州安邦护卫有限公司|浙江嘉兴安邦护卫有限公司|浙江宁波安邦护卫有限公司|浙江安邦护卫安全服务有限公司|浙江安邦护卫科技服务有限公司|浙江杭州安邦护卫有限公司|浙江温州安邦护卫有限公司|浙江湖州安邦护卫有限公司|浙江绍兴安邦护卫有限公司|浙江舟山安邦护卫有限公司|浙江衢州安邦护卫有限公司|上海浙大科技发展有限公司|杭州启真未来创新股权投资合伙企业(有限合伙)|杭州浙大动物医院有限公司|杭州浙大文化创意发展有限公司|杭州紫金港未来创新投资合伙企业(有限合伙)|杭州网新信息控股有限公司|杭州西投启真脑机智能产业运营有限公司|浙江启真人才发展有限公司|浙江大学农业科技园有限公司|浙江大学创新技术研究院有限公司|浙江大学城乡规划设计研究院有限公司|浙江大学杭州国际科创中心发展有限公司|浙江大学科技园发展有限公司|浙江大学能源工程设计研究院有限公司|浙江浙大列车智能化工程技术研究中心有限公司|浙江浙大圆正科技创新服务有限公司|浙江浙大新宇物业集团有限公司|浙江浙大科创集团有限公司|浙江浙大西投脑机智能科技有限公司|浙江钱塘机器人及智能装备研究有限公司|丽水市廉合产权交易有限公司|北京中产智合咨询服务中心(有限合伙)|北京金马甲产权网络交易有限公司|台州市产权交易所有限公司|嘉兴市产权交易有限公司|湖州市联合产权交易有限公司|绍兴市产权交易有限公司|舟山市产权交易中心有限责任公司|衢州市产权交易中心有限公司|浙江省三建建设集团有限公司|'
kwL=kwLstr.split('|')
for keyword in kwL:
log.info(f"采集的企业公司:{keyword}")
# keyword = baseCore.redicPullData('cpwsqy')
if keyword == 'None' or keyword == None:
log.info("redis已经没有数据了,重新放置数据")
break
doJob(keyword)
baseCore.close()
\ No newline at end of file
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from BaseCore import BaseCore
baseCore=BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
agent = baseCore.getRandomUserAgent()
def flushAndGetToken():
browser.refresh()
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.ID, "loginLi")))
cookie_list = browser.get_cookies()
cookies = ''
for cookie in cookie_list:
cookies=cookies+cookie['name']+"="+cookie['value']+";"
info = browser.page_source
# res_2 = requests.get(year_url, proxies=ip)
soup = BeautifulSoup(info, 'html.parser')
#欢迎您,13683816984
aEle = soup.find('li', attrs={'id':'loginLi'})
text = aEle.text
user_name = text.replace("欢迎您,","")
#user_name = soup.select('a[id="loginLi"]')[0].text.replace("欢迎您,","")
#user_name = soup.find('a', id_='loginLi').text.replace("欢迎您,","")
if len(user_name)<3:
log.info("没有登录成功,请重试")
return
cursor_.execute("insert into cpwsw_user (user,cookie,create_time,update_time,fenghao_time) values (%s,%s,now(),now(),DATE_SUB(NOW(), INTERVAL 1 DAY))",[user_name,cookies])
cnx_.commit()
log.info("保存成功")
opt = webdriver.ChromeOptions()
#opt.add_argument('user-agent=' + baseCore.getRandomUserAgent())
#opt.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
opt.add_argument("--ignore-certificate-errors")
opt.add_argument("--ignore-ssl-errors")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('excludeSwitches', ['enable-logging'])
opt.add_experimental_option('useAutomationExtension', False)
#opt.binary_location = r'C:/Program Files/Google/Chrome/Application/chrome.exe'
opt.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
chromedriver =r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
service = Service(chromedriver)
browser = webdriver.Chrome(options=opt, service=service)
url = "https://wenshu.court.gov.cn/"
browser.get(url)
# log in manually with a phone number in the opened browser window
# wait 60 seconds for the login to finish, then harvest cookies into the database
time.sleep(60)
flushAndGetToken()
cursor_.close()
cnx_.close()
baseCore.close()
\ No newline at end of file