Commit 29d5214b  Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# -*- coding: utf-8 -*-
import ast
import json
import re
import time
......@@ -13,7 +14,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
sys.path.append(r'D:\PycharmProjects\zzsn\base')
import BaseCore
baseCore = BaseCore.BaseCore()
cnx_ = baseCore.cnx
......@@ -592,12 +593,13 @@ def login():
time.sleep(30)
return
id_cookie = cookieinfo[0]
cookie_list = json.loads(cookieinfo[1])
cookie_list = cookieinfo[1]
cookie_list = ast.literal_eval(cookie_list)
# cookie_list = json.dumps(cookieinfo[1])
print(cookie_list)
# cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'}, {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'}, {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'}, {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'}, {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
for cookie in cookie_list:
cookie['expiry'] = int(cookie['expiry'])
# cookie['expiry'] = int(cookie['expiry'])
# del cookie['expiry']
driver.add_cookie(cookie)
time.sleep(5)
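# A minimal sketch of the round trip behind this change: cookies saved via str()/repr()
# are Python literals (single quotes, True/False), which json.loads rejects, hence
# ast.literal_eval. With a hypothetical saved value:
#   demo_saved = str([{'domain': '.qcc.com', 'name': 'qcc_did', 'value': 'xxx', 'expiry': 1740650660.0}])
#   demo_cookies = ast.literal_eval(demo_saved)             # safe parse of the Python literal
#   for demo_cookie in demo_cookies:
#       demo_cookie['expiry'] = int(demo_cookie['expiry'])  # Selenium expects an int expiry
#       driver.add_cookie(demo_cookie)                      # needs an open page on the cookie's domain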
......
......@@ -50,7 +50,7 @@ class Token():
# fetch a token
def getToken(self):
# cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
cursor.execute(f" select id, cookies from QCC_token where id = 63")
cursor.execute(f" select id, cookies from QCC_token where id= 82 ")
# rows = cursor.fetchall()
# cnx.commit()
# if rows:
......
"""模拟扫码登录"""
import datetime
import json
import time
......@@ -20,14 +19,10 @@ baseCore = BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
db_storageInsert = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
'企查查登录信息']
def createDriver():
# path = r'D:\soft\msedgedriver.exe'
path = r'F:\spider\117\msedgedriver.exe'
from selenium.webdriver.chrome.service import Service
def createDriver_():
path = r'D:\soft\msedgedriver.exe'
# path = r'F:\spider\117\msedgedriver.exe'
options = {
"browserName": "MicrosoftEdge",
......@@ -42,11 +37,29 @@ def createDriver():
session = webdriver.Edge(executable_path=path, capabilities=options)
return session
def createDriver():
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
# chrome_options.add_argument("--disable-javascript")
# configure a proxy
# proxy = "127.0.0.1:8080"  # proxy address and port
# chrome_options.add_argument('--proxy-server=http://' + proxy)
driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
return driver
def flushAndGetToken():
log.info('======刷新浏览器=====')
browser.refresh()
cookie_list = browser.get_cookies()
cookies = {}
print(cookie_list)
# for cookie in cookie_list:
# cookies[cookie['name']] = cookie['value']
# print(cookies)
# return cookies
print(type(cookie_list))
return cookie_list
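# A hedged sketch of how these helpers are presumably wired together: refresh the browser
# periodically and persist the raw cookie list (as a Python-literal string) to the Mongo
# collection defined above, for later restore via ast.literal_eval. The loop and interval
# are assumptions, not part of this commit.
def persistCookiesLoop(interval=300):
    while True:
        cookie_list = flushAndGetToken()
        db_storageInsert.insert_one({
            'cookies': str(cookie_list),  # stored as a literal string
            'updateTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        })
        time.sleep(interval)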
......@@ -86,3 +99,8 @@ if __name__ == "__main__":
if flg == 'N' or flg == 'n':
break
baseCore.close()
import random
import time
import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis
import requests
# https://capi.tianyancha.com/cloud-company-background/companyV2/dim/holderV2?_=1716444648296
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
with cnx.cursor() as cursor:
select = """select relationName, relationId from klb_company"""
cursor.execute(select)
results = cursor.fetchall()
for result in results:
name = result[0]
xydm = result[1]
item = f'{name}|{xydm}'
r.rpush('SousuoBaidu:companyname', item)
t = str(int(time.time()) * 1000)
url = f'https://capi.tianyancha.com/cloud-company-background/companyV2/dim/holderV2?_={t}'
# `tycid`, `headers` and `get_proxy` are assumed to come from elsewhere in the project
payload = {"gid": f"{tycid}", "pageSize": 10, "pageNum": 1, "sortField": "", "sortType": "-100", "historyType": 1}
ip = get_proxy()[random.randint(0, 3)]
res = requests.post(url, headers=headers, json=payload, proxies=ip, timeout=10)  # JSON body, matching the json.dumps usage for this API elsewhere in the repo
json_info = res.json()
holder_info = json_info['data']['result'][0]
shareHolderName = holder_info['shareHolderName']
percent = holder_info['percent']
\ No newline at end of file
......@@ -6,6 +6,7 @@ import time
import pymysql
import requests
from retry import retry
sys.path.append('D:\\PycharmProjects\\zzsn\\base')
import BaseCore
......@@ -14,16 +15,45 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# headers = {
# 'Accept': 'application/json, text/plain, */*',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Connection': 'keep-alive',
# 'Content-Length': '32',
# 'Content-Type': 'application/json',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
# 'version': 'TYC-Web'
# }
headers = {
'Accept-Language': 'zh-CN,zh;q=0.9',
'Content-Type': 'application/json',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '32',
'Content-Length': '93',
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzYzNjcxMTc0NiIsImlhdCI6MTcxNDk1Njg3MywiZXhwIjoxNzE3NTQ4ODczfQ.qMEvtETT7RS3Rhwq9idu5H2AKMxc2cjtr5bDDW6C6yOFKR-ErgDwT4SOBX9PB2LWDexAG2hNaeAvn6swr-n6VA',
'X-TYCID': 'dad485900fcc11ee8c0de34479b5b939',
'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
}
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cursor= cnx.cursor()
......@@ -31,36 +61,34 @@ cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
taskType = '天眼查企业id/天眼查'
# look up the Tianyancha id, company name, etc. by credit code
@retry(tries=5, delay=3)
def getTycIdByXYDM(com_name, s):
retData={'state':False,'tycData':None,'reput':True}
retData={'state':False, 'tycData':None, 'reput':True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
# url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3"
ip = baseCore.get_proxy()
paramJsonData = {'keyword': com_name}
try:
# headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
response = s.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
# response = s.post(url, json=paramJsonData, headers=headers)
time.sleep(random.randint(3, 5))
retJsonData =json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state'] == 'ok':
pass
else:
log.error(f"---{com_name}-未查询到该企业---")
retData['reput'] = False
return retData
matchType=retJsonData['data'][0]['matchType']
if matchType =='公司名称匹配':
retData['state'] = True
retData['tycData'] = retJsonData['data'][0]
response.close()
return retData
else:
log.error(f"{com_name}------{retJsonData}")
response.close()
return retData
except Exception as e:
log.error(f"---{com_name}--{e}---")
# response = requests.post(url=url, json=paramJsonData, headers=header, verify=False, proxies=ip)
response = s.post(url, json=paramJsonData, headers=headers)
time.sleep(random.randint(3, 5))
retJsonData =json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state'] == 'ok':
pass
else:
log.error(f"---{com_name}-未查询到该企业---")
retData['reput'] = False
return retData
matchType = retJsonData['data'][0]['matchType']
if matchType == '信用代码匹配' or matchType == '公司名称匹配':
retData['state'] = True
retData['tycData'] = retJsonData['data'][0]
response.close()
return retData
else:
log.error(f"{com_name}------{retJsonData}")
response.close()
return retData
......
# look up the Tianyancha id by credit code
import json
import random
import sys
import time
import pymysql
import requests
sys.path.append('D:\\PycharmProjects\\zzsn\\base')
import BaseCore
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '32',
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
# cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
# cursor= cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
taskType = '天眼查企业id/天眼查'
# look up the Tianyancha id, company name, etc. by credit code
def getTycIdByXYDM(com_name, s):
retData={'state':False,'tycData':None,'reput':True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
ip = baseCore.get_proxy()
paramJsonData = {'keyword': com_name}
try:
# headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['X-AUTH-TOKEN'] = baseCore.GetTYCToken()
response = s.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
# response = s.post(url, json=paramJsonData, headers=headers)
time.sleep(random.randint(3, 5))
retJsonData =json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state'] == 'ok':
pass
else:
log.error(f"---{com_name}-未查询到该企业---")
retData['reput'] = False
return retData
matchType=retJsonData['data'][0]['matchType']
if matchType =='公司名称匹配':
retData['state'] = True
retData['tycData'] = retJsonData['data'][0]
response.close()
return retData
else:
log.error(f"{com_name}------{retJsonData}")
response.close()
return retData
except Exception as e:
log.error(f"---{com_name}--{e}---")
return retData
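# A minimal usage sketch for getTycIdByXYDM: reuse one requests.Session across lookups;
# the company name below is illustrative only.
def demoGetTycId():
    s = requests.Session()
    retData = getTycIdByXYDM('华为技术有限公司', s)
    if retData['state']:
        print(retData['tycData']['id'])  # id of the first suggest hit
    elif not retData['reput']:
        print('no suggest match for this keyword')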
# update Tianyancha basic company info
def updateTycInfo():
while True:
# use the social credit code pulled from Redis to fetch the matching base record
social_code = baseCore.redicPullData('NewsEnterprise:gnqy_socialCode')
# social_code = '9111000066990444XF'
# if Redis has run dry, wait and retry
if social_code == None:
time.sleep(20)
continue
start = time.time()
data = baseCore.getInfomation(social_code)
if len(data) != 0:
pass
else:
# push the code back into Redis
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
# a shared requests.Session (here `s`, assumed created at startup) is required by getTycIdByXYDM
retData = getTycIdByXYDM(xydm, s)
if retData['tycData'] and retData['reput']:
tycid = retData['tycData']['id']
# todo: write back to the database
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except Exception as e:
log.error(e)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
if __name__ == '__main__':
updateTycInfo()
\ No newline at end of file
import json
import threading
import time
import uuid
import redis
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
baseCore = BaseCore.BaseCore()
# use a connection pool
# cnx_ = baseCore.pool_11.connection()
# cursor_ = cnx_.cursor()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
pathType = 'QYNotice/'
taskType = '企业研报/东方财富网'
pool = redis.ConnectionPool(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
class EsMethod(object):
def __init__(self):
# create the Elasticsearch client with credentials
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum):
body = {
"_source": ["attachmentIds","createDate","sourceAddress","labels.relationId","title","year","publishDate","createDate"],
"query": {
"bool": {
"must": [
{
"term": {
"type.keyword": {
"value": "1"
}
}
},
{
"term": {
"origin.keyword": {
"value": "雪球网"
}
}
},
{
"range": {
"createDate": {
"gte": "2024-05-25T00:00:00"
}
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"track_total_hits": True,
"size": 200,
"from": pnum
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.title',
'hits.hits._source.sourceAddress',
'hits.hits._source.createDate',
'hits.hits._source.origin'
] # returned fields
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
def main(page, p, esMethod):
redis_conn = redis.Redis(connection_pool=pool)
result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# return
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
return
log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
redis_conn.lrem(f'NianbaoOT:id', 0, id)
redis_conn.lpush(f'NianbaoOT:id', id)
def run_threads(num_threads,esMethod,j):
threads = []
for i in range(num_threads):
page = j + i + 1
p = j + i * 200
thread = threading.Thread(target=main, args=(page, p, esMethod))
threads.append(thread)
thread.start()
for thread in threads:
thread.join()
if __name__ == "__main__":
j = 0
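# each round dispatches 5 threads, each pulling one 200-doc page, so the from-offset j advances by 1000 per round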
for i in range(10):
esMethod = EsMethod()
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# break
start = time.time()
num_threads = 5
run_threads(num_threads, esMethod, j)
j += 1000
log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
\ No newline at end of file
import json
import threading
import time
import uuid
import redis
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
baseCore = BaseCore.BaseCore()
# use a connection pool
# cnx_ = baseCore.pool_11.connection()
# cursor_ = cnx_.cursor()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()
lock = threading.Lock()
pathType = 'QYNotice/'
taskType = '企业研报/东方财富网'
pool = redis.ConnectionPool(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
class EsMethod(object):
def __init__(self):
# create the Elasticsearch client with credentials
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'researchreportdata'
def queryatt(self,index_name,pnum):
body = {
"_source": ["attachmentIds", "createDate", "origin", "labels.relationId", "title", "year",
"publishDate"],
"query": {
"bool": {
"must": [
{
"term": {
"type.keyword": {
"value": "1"
}
}
},
{
"term": {
"year.keyword": {
"value": "2023"
}
}
}
],
"must_not": [
{
"term": {
"origin.keyword": {
"value": "SEC美国证券交易委员会"
}
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"track_total_hits": True,
"size": 200,
"from": pnum
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.labels.relationId',
'hits.hits._source.year',
'hits.hits._source.createDate',
'hits.hits._source.origin'
] # returned fields
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
def main(page, p, esMethod):
redis_conn = redis.Redis(connection_pool=pool)
result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# return
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
return
log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
year = mms['_source']['year']
socialcode = mms['_source']['labels'][0]['relationId']
origin = mms['_source']['origin']
item = socialcode + "|" + year + "|" + origin
log.info(f'{id}--{year}--{origin}--{socialcode}---')
redis_conn.lrem(f'Nianbao:id', 0, item)
redis_conn.lpush(f'Nianbao:id', item)
def run_threads(num_threads,esMethod,j):
threads = []
for i in range(num_threads):
page = j + i + 1
p = j + i * 200
thread = threading.Thread(target=main, args=(page, p, esMethod))
threads.append(thread)
thread.start()
for thread in threads:
thread.join()
if __name__ == "__main__":
j = 0
for i in range(10):
esMethod = EsMethod()
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value']
# if total == 0:
# log.info('++++已没有数据+++++')
# break
start = time.time()
num_threads = 5
run_threads(num_threads, esMethod, j)
j += 1000
log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
\ No newline at end of file
import sys
sys.path.append(r'D:\PycharmProjects\zzsn\base')
import BaseCore
baseCore = BaseCore.BaseCore()
import pandas as pd
r = baseCore.r
key = 'Nianbao:id'
df = pd.read_excel(r'D:\kkwork\企业数据\企业年报.xlsx')
def main(key):
while True:
info = baseCore.redicPullData(key)
# info = '91330281784320546U|'
if info == None:
break
else:
pass
r.lpush('Nianbao:id_2', info)
social_code = info.split('|')[0]
# if df.loc[df['信用代码'] == social_code].astype(str).iloc[0]:
try:
row = df.loc[df['信用代码'] == social_code].astype(str).iloc[0]
# append a new column for this row
new_column_name = '系统中是否有年报'
new_value = '2023'
# write the new column back into the DataFrame
df.loc[df['信用代码'] == social_code, new_column_name] = new_value
except:
continue
# break
df.to_excel(r'D:\kkwork\企业数据\企业年报.xlsx', index=False)
print('完成')
if __name__ == '__main__':
main(key)
import json
......@@ -338,8 +338,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# fetch company info
social_code = baseCore.redicPullData('AnnualEnterprise:zjh_socialCode')
# social_code = '91100000100003962T'
# social_code = baseCore.redicPullData('AnnualEnterprise:zjh_socialCode')
social_code = '91340000719975888H'
if not social_code:
time.sleep(20)
continue
......@@ -366,6 +366,7 @@ if __name__ == '__main__':
count += 1
runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count)
break
cnx.close()
cursor_.close()
......
# -*- coding: utf-8 -*-
......@@ -12,7 +12,7 @@ from datetime import datetime
from kafka import KafkaProducer
import sys
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
sys.path.append(r'D:\PycharmProjects\zzsn\base')
import BaseCore
baseCore = BaseCore.BaseCore()
import requests, re, time, pymysql, fitz
......@@ -175,7 +175,7 @@ def spider_annual_report(dict_info,num):
selects = cursor.fetchone()
if selects:
log.info(f'com_name:{com_name}、{year}已存在')
continue
return
else:
# upload the file to the OBS server
retData = baseCore.uptoOBS(pdf_url,name_pdf,1,social_code,pathType,taskType,start_time,'XueLingKun')
......@@ -264,8 +264,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# fetch company info
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '91440300192176077R'
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode22')
social_code = '91440200555570170B'
if not social_code:
time.sleep(20)
if not baseCore.check_mysql_conn(cnx):
......
......@@ -19,7 +19,7 @@ from tempfile import NamedTemporaryFile
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
sys.path.append('D:\\PycharmProjects\\zzsn\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
......@@ -122,7 +122,7 @@ def zzcx():
# todo: insert the image by creating a new img tag
append_tag = png_.find_element(By.XPATH, './/div/div[1]')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
"var newElement = document.createElement('img'); newElement.src = 'http://obs.ciglobal.cn" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
append_tag)
os.remove(file_path)
except:
......@@ -153,7 +153,7 @@ def zzcx():
# todo: insert the image by creating a new img tag
# append_tag = u_png.find_element(By.XPATH, './/div')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
"var newElement = document.createElement('img'); newElement.src = 'http://obs.ciglobal.cn" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
u_png)
os.remove(file_path)
......@@ -182,13 +182,13 @@ def zzcx():
# todo: insert the image by creating a new img tag
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; newElement.style.width = '50%'; newElement.style.position = 'relative'; newElement.style.float = 'left'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
"var newElement = document.createElement('img'); newElement.src = 'http://obs.ciglobal.cn" + path + "'; newElement.style.width = '50%'; newElement.style.position = 'relative'; newElement.style.float = 'left'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
line_bar)
# # todo: build a sharper image tag
# driver.execute_script(f"""
# var img = new Image();
# img.src = "http://obs.ciglobal.cn{path}"; // replace with your image path
# img.src = "http://obs.ciglobal.cn{path}"; // 替换为你的图片路径
# img.onload = function() {{
# var canvas = document.createElement("canvas");
# canvas.width = img.width;
......@@ -243,5 +243,5 @@ def zzcx():
if __name__ == "__main__":
pathType = 'PhotoDingzhi/'
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
r = redis.Redis(host='114.116.90.53', port=6380, password='clbzzsn', db=5)
zzcx()
\ No newline at end of file
# core utility toolkit
import os
import random
import socket
import sys
import time
import langid
import logbook
import logbook.more
import zhconv
class BaseCore:
# compute elapsed time
def getTimeCost(self, start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# format the current time
# 1 : 2001-01-01 12:00:00  (%Y-%m-%d %H:%M:%S)
# 2 : 010101120000  (%y%m%d%H%M%S)
# 3 : millisecond timestamp, e.g. 1690179526555
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
# log record formatter
def logFormate(self, record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time,  # log time
level=record.level_name,  # log level
filename=os.path.split(record.filename)[-1],  # file name
func_name=record.func_name,  # function name
lineno=record.lineno,  # line number
msg=record.message  # log message
)
return formate
# build a logger
def getLogger(self, fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # write logs to a file
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # also print logs to the terminal
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# pick a random User-Agent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# substring extraction between two markers
def getSubStr(self, str, beginStr, endStr):
if beginStr == '':
pass
else:
begin = str.rfind(beginStr)
if begin == -1:
begin = 0
str = str[begin:]
if endStr == '':
pass
else:
end = str.rfind(endStr)
if end == -1:
pass
else:
str = str[0:end + 1]
return str
# convert Traditional Chinese to Simplified Chinese
def hant_2_hans(self, hant_str: str):
'''
Convert hant_str from Traditional to Simplified Chinese
'''
return zhconv.convert(hant_str, 'zh-hans')
# check whether a string contains any digit
def str_have_num(self, str_num):
panduan = False
for str_1 in str_num:
ppp = str_1.isdigit()
if ppp:
panduan = ppp
return panduan
# get the script's process PID
def getPID(self):
PID = os.getpid()
return PID
# get the local IP address
def getIP(self):
IP = socket.gethostbyname(socket.gethostname())
return IP
# detect the language of a text
def detect_language(self, text):
# langid.classify returns a (language, confidence) tuple
result = langid.classify(text)
if not result or result[0] == '':
return 'cn'
return result[0]
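# A short usage sketch of the toolkit above; the expected outputs in the comments follow
# from the methods as defined.
if __name__ == '__main__':
    core = BaseCore()
    log = core.getLogger()                              # logs to ./logs/<script>.log and stderr
    log.info(core.hant_2_hans('繁體中文'))                # -> '繁体中文'
    log.info(core.detect_language('你好,世界'))           # -> 'zh'
    log.info(core.getSubStr('abc[core]def', '[', ']'))  # -> '[core]'
    start = time.time()
    log.info(core.getTimeCost(start, time.time() + 65)) # -> '1分钟5秒'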
[mysql]
host=114.115.159.144
username=caiji
password=zzsn9988
database=jx_enterprise
[task]
# day(s) of the month on which collection starts
# format: 1,2,3 / 1
# run on the 1st, 2nd and 3rd, or only on the 1st
day=15
# hour(s) at which to run
# format: 12,13 / 12
# run at 12:00 and 13:00, or only at 12:00
hour=12
# minute(s) at which to run
# format: 0,30 / 0
# run on the hour and at half past, or only on the hour
minute=0
[interface]
# port for the HTTP interface
port=8000
\ No newline at end of file
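# A hedged sketch of how the [task] values above could drive an APScheduler cron job,
# matching the BlockingScheduler import in the script below; the job body and config
# path are assumptions, not part of the original code.
import configparser
from apscheduler.schedulers.blocking import BlockingScheduler

config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')

def collect():
    print('collection run starts')  # placeholder for the real collection job

scheduler = BlockingScheduler()
scheduler.add_job(collect, 'cron',
                  day=config.get('task', 'day'),        # cron fields accept "1,2,3"-style lists
                  hour=config.get('task', 'hour'),
                  minute=config.get('task', 'minute'))
scheduler.start()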
import configparser
import datetime
import hashlib
import json
import time
import uuid
from urllib.parse import urlencode
import pandas as pd
import pymysql
import requests
from DBUtils.PooledDB import PooledDB
from apscheduler.schedulers.blocking import BlockingScheduler
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# API credentials
appkey = '84959d1d-6afa-4c57-9fea-b75d5599f761'
secretKey = '2aebca76-2def-4f8e-961a-436eeb0828fd'
timeStamp = int(time.time()) * 1000
import configparser
import datetime
class ToMysql():
def __init__(self):
self.now = datetime.datetime.now().strftime('%Y-%m-%d')
self.config = configparser.ConfigParser()
self.config.read('config.ini', encoding='utf-8')
# build the request headers
self.headers = {
'Auth-version': '2.0', # interface auth version
'appkey': appkey,
'timestamp': str(timeStamp),
'sign': self.md5Encode(appkey + str(timeStamp) + secretKey),
'Connection': 'keep-alive'
}
def md5Encode(self, srcStr):
'''Compute the md5 hex digest of a string'''
m = hashlib.md5()
m.update(srcStr.encode('utf-8'))
return m.hexdigest()
def mysqlConnection(self):
self.pool = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host=self.config.get('mysql', 'host'),
port=3306,
user=self.config.get('mysql', 'username'),
password=self.config.get('mysql', 'password'),
database=self.config.get('mysql', 'database'),
charset='utf8mb4'
)
self.cnx = self.pool.connection()
self.cursor = self.cnx.cursor()
# close the database connections
def mysqlClose(self):
self.cursor.close()
self.cnx.close()
self.pool.close()
# call the API and return its data payload
def dataRequest(self, url, company):
# invoke the interface
response = requests.get(url, headers=self.headers)
dataJson = response.json()
log.info(dataJson)
if dataJson['status'] == '201':
log.info(f'{company}===无结果')
result = None
elif dataJson['status'] == '207':
log.error(f'{company}===查询错误')
result = None
elif dataJson['status'] == '208':
log.error(f'{company}===参数名错误或参数为空')
result = None
elif dataJson['status'] == '216':
log.error(f'{company}===调用次数超过账户额度限制')
result = None
elif dataJson['status'] == '102':
log.error(f'{company}===账户余额不足')
result = None
elif dataJson['status'] != '200':
log.error(f'{company}===出错状态码{dataJson["status"]}')
result = None
else:
result = dataJson['data']
return result
def getOrgId(self, company):
sql = f'select ORG_ID from organization_id where ORG_NAME = "{company}"'
self.cursor.execute(sql)
try:
orgId = self.cursor.fetchone()[0]
self.cnx.commit()
return orgId
except:
orgId = str(uuid.uuid4())
sqlInsert = f'insert into organization_id(ORG_ID,ORG_NAME) values (%s,%s)'
self.cursor.execute(sqlInsert, (orgId, company))
self.cnx.commit()
return orgId
# business-registration profile (API 1.41)
def ORGANIZAION(self):
errorNameList = []
log.info('开始采集企业基本信息===接口1.41')
baseUrl = 'https://api.qixin.com/APIService/enterprise/getBasicInfo'
df = pd.read_excel('./监管企业名单_.xlsx', sheet_name='Sheet1')
companyList = df['单位名称']
for company in companyList:
company = company.strip()
sqlSelect = f"select * from organization_ where NAME='{company}' and CREATE_DATE='{self.now}'"
self.cursor.execute(sqlSelect)
self.cnx.commit()
is_insert = self.cursor.fetchone()
if is_insert:
log.info(f'{company}===已入库')
continue
# request parameters
urlParams = {
'keyword': company
}
# build the url
url = '{}?{}'.format(baseUrl, urlencode(urlParams))
companyData = self.dataRequest(url, company)
if not companyData:
errorNameList.append(company)
continue
_id = self.getOrgId(company)
url = f"https://www.qixin.com/company/{companyData['id']}"
name = companyData['name']
format_name = companyData['format_name']
econKind = companyData['econKind']
econKindCode = companyData['econKindCode']
registCapi = companyData['registCapi']
currency_unit = companyData['currency_unit']
type_new = companyData['type_new']
historyNames = companyData['historyNames']
historyNames_ = ''
for historyName in historyNames:
historyNames_ += f'{historyName},'
historyNames_ = historyNames_.rstrip(',')
address = companyData['address']
regNo = companyData['regNo']
scope = companyData['scope']
termStart = companyData['termStart']
termEnd = companyData['termEnd']
belongOrg = companyData['belongOrg']
operName = companyData['operName']
title = companyData['title']
startDate = companyData['startDate']
endDate = companyData['endDate']
checkDate = companyData['checkDate']
status = companyData['status']
new_status = companyData['new_status']
orgNo = companyData['orgNo']
creditNo = companyData['creditNo']
districtCode = companyData['districtCode']
actualCapi = companyData['actualCapi']
categoryNew = companyData['categoryNew']
domain = companyData['domain']
tags = companyData['tags']
tags_ = ''
for tag in tags:
tags_ += f'{tag},'
tags_ = tags_.rstrip(',')
revoke_reason = companyData['revoke_reason']
logout_reason = companyData['logout_reason']
revoke_date = companyData['revoke_date']
fenname = companyData['fenname']
sql = 'insert into organization_(CREDIT_NO,NAME,ECON_KIND,REGIST_CAPI,ID,TAGS,BELONG_ORG,STATUS,TERM_START,FORMAT_NAME,HISTORY_NAMES,REVOKE_DATE,END_DATE,REG_NO,ECON_KIND_CODE,DOMAIN,CATEGORY_NEW,ADDRESS,ORG_NO,DISTRICT_CODE,START_DATE,SCOPE,NEW_STATUS,OPER_NAME,TITLE,CHECK_DATE,ACTUAL_CAPI,TERM_END,CURRENCY_UNIT,REVOKE_REASON,TYEP_NEW,LOGOUT_REASON,FENNAME,URL,CREATE_DATE) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) '
try:
self.cursor.execute(sql, (
creditNo, name, econKind, registCapi, _id, tags_, belongOrg, status, termStart, format_name, historyNames_,
revoke_date, endDate, regNo, econKindCode, domain, categoryNew, address, orgNo, districtCode, startDate,
scope,
new_status, operName, title, checkDate, actualCapi, termEnd, currency_unit, revoke_reason, type_new,
logout_reason, fenname, url, self.now))
self.cnx.commit()
log.info(f'{name}===入库成功')
except:
log.info(f'{name}==={company}===有重复')
dfError = pd.DataFrame(errorNameList, columns=['单位名称'])
dfError.to_excel('./查询失败名单.xlsx', index=False)
if __name__ == '__main__':
toMysql = ToMysql()
toMysql.mysqlConnection()
toMysql.ORGANIZAION()
toMysql.mysqlClose()
......@@ -51,18 +51,29 @@ def convert_size(size_bytes):
return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'acw_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; cdn_sec_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; acw_sc__v3=666be84cf4454ec8c2436572df9f9e6dc78b409b; tfstk=fqG-Yit_Tnxu599nN49m-_AIYB8Dsb3ru0u1tkXQNmEI7DCnr3uQAkigmbqor2ZKpoisr_xrxDEIS2DuVDDuAyiif42H87mY9mn0qT0H46hKjmf3V3PSp6FriYf3q3PKRcVpjhAMs4uzaWtMjvDJxuN_WgaCxuNbhEjiRARMs4u5rzTilCYevQh4AkNCFk6Xky48OzwSAowb5Pj7OWiIlEU37k1QAzgfGzznwK58vaZN9vHWr405ar5CObNUelUOK6CLOzwRU4Zxy4hYy8ETn4oFqbimRbcz3KW0TqDtvvi6mTqSBPnYIYKOwcnzRmFo1eJz3JGK9rk2v9Etd4DZd-LWNqF82-U8eaBQwvir98kR8FubNmkab8920rhosJEaHitSoqE7Bvnk06ZoBqiYIjjcs5MZDXe_1gPEsfIB8GqT-TTvk9WUFrfOmApHgpiI6rEMumWFL-U4klYvk9WUFrzYjEyVL9yYu',
'Host': 'static.sse.com.cn',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
try:
file_size = int(response.headers.get('Content-Length'))
except:
file_size = 0
break
except:
except Exception as e:
time.sleep(3)
continue
page_size = 0
......@@ -78,7 +89,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
except Exception as e:
log.error(f'文件损坏')
return retData
......@@ -156,10 +167,12 @@ def tableUpdate(retData, com_name, year, pdf_name, num,pub_time,origin):
@retry(tries=3, delay=5)
def RequestUrl(url, payload, social_code,start_time):
ip = baseCore.get_proxy()
# ip = baseCore.get_proxy()
# proxy = {'https': 'http://127.0.0.1:8888', 'http': 'http://127.0.0.1:8888'}
response = requests.post(url=url, headers=headers, data=payload, proxies=ip)
# response = requests.post(url=url, headers=headers, data=payload, proxies=ip)
response = requests.post(url=url, headers=headers, data=payload)
# response = requests.post(url=url, data=payload)
response.encoding = response.apparent_encoding
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
......@@ -372,9 +385,9 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
com_name = dic_info[1]
try:
soup = RequestUrl(url, payload, social_code, start_time)
except:
except Exception as e:
# request failed; log the error
log.error(f'请求失败:{url}')
log.error(f'请求失败:{url}----{e}')
# put the code back into Redis
baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode_add', social_code)
time.sleep(random.randint(60, 120))
......@@ -462,6 +475,13 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
pub_time = date_object.strftime("%Y-%m-%d %H:%M:%S")
year = pub_time[:4]
report_type = td_list[4].text.strip()
# current year
current_year = datetime.now().year
# print(current_year)
if int(current_year) < int(year):
continue
if str(current_year)[:1] < year[:1]: # guard against a stock code like 6005 being sliced out as the year
continue
# check whether this notice already exists in the database
ifexist = ifInstert(short_name, social_code, pdf_url)
......@@ -488,22 +508,22 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
else:
log.info(f'======={short_name}========{code}===已存在')
# continue
break
return
if __name__ == '__main__':
num = 0
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
# 'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Length': '380',
# 'Content-Length': '380',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': 'acw_tc=01c6049e16908026442931294e4d0b65d95e3ba93ac19993d151844ac6',
'Host': 'eid.csrc.gov.cn',
'Origin': 'http://eid.csrc.gov.cn',
'Pragma': 'no-cache',
'Cookie': 'acw_tc=2760825217168606497214655ec9cb62ffa696c5367ec9f402d2086a0287ae; tgw_l7_route=125d8c38fe1eb06650b04b0cc6f51270',
# 'Host': 'eid.csrc.gov.cn',
# 'Origin': 'http://eid.csrc.gov.cn',
# 'Pragma': 'no-cache',
'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
......@@ -527,8 +547,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# fetch company info
# social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
social_code = '91370000163446410B'
social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode_add')
# social_code = '91370000163446410B'
# if Redis has run dry, wait
if social_code == None:
time.sleep(20)
......
import execjs
js = execjs.compile(open(r'D:\PycharmProjects\zzsn\douyin\static\dy.js', 'r', encoding='gb18030').read())
if __name__ == '__main__':
data = 'device_platform=webapp&aid=6383&channel=channel_pc_web&publish_video_strategy_type=2&source=channel_pc_web&sec_user_id=MS4wLjABAAAADtPlZR0GJ11ox3X04rzqaBel7L441QHPVoJA8jISv9Q&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1707&screen_height=1067&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=117.0.2045.47&browser_online=true&engine_name=Blink&engine_version=117.0.0.0&os_name=Windows&os_version=10&cpu_core_num=20&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7372020108170167871&msToken=DrQxAShB824nYAbtbpl31BDIxRN4WyGfdyjHWGxJbqozTVhJcuCU8kxxT7HUZttUFJjzft1NqmFXpe0-GW59wC9eRxS6CS24x2YTDIkSIAoqzWbzyLP46cwmh0iHQTuo&'
xs = js.call('get_dy_xb',data)
print(xs)
\ No newline at end of file
# fetch the detail page
import time
import redis
from bs4 import BeautifulSoup
r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
# NOTE: the helpers below reference `self`; they appear to be lifted from a spider class whose context is assumed here
def getDetailmsg(detailmsg):
try:
detailurl = detailmsg['detailUrl']
title = detailmsg['title']
content, contentWithTag = self.extractorMsg(detailurl, title)
contentWithTag = self.rmTagattr(contentWithTag, detailurl)
except Exception as e:
content = ''
contentWithTag = ''
currentdate = self.getNowDate()
kword = self.searchkw
publishDate = detailmsg['publishTag']
publishDate = publishDate + ''
# publishtime=self.paserTime(publishtime)
# publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
detailmsg = {
'title': detailmsg['title'],
'source': detailmsg['sourceTag'],
'detailurl': detailurl,
'content': content,
'contentHtml': contentWithTag,
'publishtime': publishDate,
'currentdate': currentdate,
'kword': kword
}
return detailmsg
def getProcessitem(bdetail):
nowDate = self.getNowDate()
content = bdetail['content']
if content != '':
processitem = {
"sid": self.sid,
"source": "4",
"title": bdetail['title'],
"content": bdetail['content'],
"contentWithtag": bdetail['contentHtml'],
"origin": bdetail['source'],
"publishDate": bdetail['publishtime'],
"sourceAddress": bdetail['detailurl'],
"createDate": nowDate
}
return processitem
# insert the collected items into the table (the SQL below targets Company_layoff_copy1)
def itemInsertToTable(items):
itemdata = []
conx, cursorM = connMysql()
# `item` and `bangdan_name` are read before the loop below; they are assumed to be module-level values from the original spider
companyinfo = item
social_code = str(companyinfo.split('|')[0])
ch_name = companyinfo.split('|')[1]
en_name = companyinfo.split('|')[2]
rank = bangdan_name + '|' + str(companyinfo.split('|')[3])
for item in items:
nowtime = getNowDate()
data = (social_code, en_name, ch_name, rank, item['title'], item['content'], item['detailurl'], item['publishtime'], item['source'], nowtime)
itemdata.append(data)
sql = "INSERT into Company_layoff_copy1 (企业信用代码,企业英文名称,企业中文名称,所在榜单排名,标题,内容,链接,发布时间,来源,创建时间) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s)"
cursorM.executemany(sql, itemdata)
logger.info("数据插入数据库成功!")
# commit the insert executed above
conx.commit()
closeSql(conx, cursorM)
def get_detail_html():
......
......@@ -451,22 +451,164 @@ def aaaaa(final_output):
finall_list.append(result)
print(finall_list)
if __name__ == '__main__':
# same_list = ['让我们从一次时光旅行', '开启植物天堂的故事', '地球的午夜', '是在火山喷发中度过的', '到了凌晨三四点', '在海洋深处有了生命的迹象', '清晨6点多', '更加壮丽的生命乐章开始了', '更加壮丽的生命乐草开始了', '更加壮丽的生命乐章开始了', '更加壮丽的生命乐草开始了', '更加壮丽的生命乐章开始了', '种蓝藻细菌', '一种蓝藻细菌', '学会利用二氧化碳水和阳光', '制造生命所需能量', '同时释放出了氧气', '这个被称为光合作用的过程', '为植物世界打开了大门', '此时', '中国的陆地', '也逐渐从海洋露出形成岛屿', '但在相当长的时间里', '陆地十分荒凉没有生机', '这些岩石坚硬', '无法储存水分', '是当时陆地环境的写照', '直到晚上九点多', '也就是四亿年前左右', '些矮小的生命', '开始征服陆地', '她们用一种近似于根的构造', '固定在岩石上', '苔藓', '是陆地最早的拓荒者之', '小', '她们死后的身体', '形成了肥沃的土壤', '让更多的植物可以在这里生存', '从此', '绿色成为植物天堂的底色']
def paserTime(publishtime):
timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
current_datetime = datetime.datetime.now()
publishtime = publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=30 * day)
publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(weeks=day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days=2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days=1)
publishtime = current_datetime - delta
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
delta = datetime.timedelta(hours=5)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime:
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime:
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime = str(current_year) + '年' + publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
# aaa = aaaaa(same_list)
if __name__ == '__main__':
#
# for i in range(len(same_list)):
# print(i, same_list[i])
# # aaa = aaaaa(same_list)
#
isHandleSuccess, handleMsg = True, "success"
for i in range(3):
if i <= 3:
HandleSuccess, handleMsg = True, "success"
else:
HandleSuccess, handleMsg = False, "error"
print(i, HandleSuccess, handleMsg)
# #
# # for i in range(len(same_list)):
# # print(i, same_list[i])
# #
# # isHandleSuccess, handleMsg = True, "success"
# # for i in range(3):
# # if i <= 3:
# # HandleSuccess, handleMsg = True, "success"
# # else:
# # HandleSuccess, handleMsg = False, "error"
# # print(i, HandleSuccess, handleMsg)
# import re
# import time
#
# import pandas as pd
# import pymongo
# import redis
#
# r = redis.StrictRedis(host='114.115.221.202', port=6379, db=1, decode_responses=True, password='clbzzsn')
# db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
# '裁员数据']
# # typeList = ['2023年福布斯','2022年福布斯','独角兽','世界500','欧盟']
# # for type in typeList:
# # par = re.compile(type)
# # datas = db_storage.find({'flg': False, '内容-翻译': ''})
# # for data in datas:
# # r.rpush('translation:downsiz', str(data['_id']))
# # print(data)
# dataList = []
# par = re.compile('独角兽')
# datas = db_storage.find({'flg':True,'所在榜单排名':par})
# for data in datas:
# del data['_id']
# del data['flg']
# dataList.append(data)
# pd.DataFrame(dataList).to_excel('./独角兽.xlsx')
# from base import BaseCore
# basecore = BaseCore.BaseCore()
# header = {
# 'Accept': 'application/json, text/plain, */*',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Connection': 'keep-alive',
# 'Content-Type': 'application/json',
# 'Sec-Fetch-Dest': 'empty',
# 'Sec-Fetch-Mode': 'cors',
# 'Sec-Fetch-Site': 'same-site',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
# 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzYzNjcxMTc0NiIsImlhdCI6MTcxNDk1Njg3MywiZXhwIjoxNzE3NTQ4ODczfQ.qMEvtETT7RS3Rhwq9idu5H2AKMxc2cjtr5bDDW6C6yOFKR-ErgDwT4SOBX9PB2LWDexAG2hNaeAvn6swr-n6VA',
# 'X-TYCID': 'dad485900fcc11ee8c0de34479b5b939',
# 'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"',
# 'version': 'TYC-Web'
# }
# header = {
# # 'Accept': 'application/json, text/plain, */*',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Connection': 'keep-alive',
# 'Content-Type': 'application/json',
# # 'Sec-Fetch-Dest': 'empty',
# # 'Sec-Fetch-Mode': 'cors',
# # 'Sec-Fetch-Site': 'same-site',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
# 'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzYzNjcxMTc0NiIsImlhdCI6MTcxNDk1Njg3MywiZXhwIjoxNzE3NTQ4ODczfQ.qMEvtETT7RS3Rhwq9idu5H2AKMxc2cjtr5bDDW6C6yOFKR-ErgDwT4SOBX9PB2LWDexAG2hNaeAvn6swr-n6VA',
# 'X-TYCID': 'dad485900fcc11ee8c0de34479b5b939',
# # 'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
# # 'sec-ch-ua-mobile': '?0',
# # 'sec-ch-ua-platform': '"Windows"',
# 'version': 'TYC-Web'
# }
# ip = basecore.get_proxy()
# url = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/hk?&date=&gid=2348871426&sortField=&sortType=-100&pageSize=10&pageNum=1&percentLevel=-100&keyword='
# # url = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/topTen?_=1716458307394&type=1&gid=4845825&sortField=&sortType=-100&pageSize=10&pageNum=1'
# # url = 'https://capi.tianyancha.com/cloud-company-background/companyV2/dim/holderV2?_=1716534254189'
# # payload = {"gid":"2350084808","pageSize":10,"pageNum":1,"sortField":"","sortType":"-100","historyType":1,"percentLevel":"-100","keyword":""}
# # req = requests.post(url=url, headers=header, data=json.dumps(payload), proxies=ip)
# req = requests.get(url=url, headers=header, proxies=ip)
# print(req.json())
# req.close()
# headers = {
# 'Accept':
# 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'Accept-Encoding': 'gzip, deflate',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Cache-Control': 'max-age=0',
# 'Connection': 'keep-alive',
# 'Content-Length': '367',
# 'Content-Type': 'application/x-www-form-urlencoded',
# 'Cookie':
# 'acw_tc=2760825217168606497214655ec9cb62ffa696c5367ec9f402d2086a0287ae; tgw_l7_route=125d8c38fe1eb06650b04b0cc6f51270',
# 'Host': 'eid.csrc.gov.cn',
# 'Origin': 'http://eid.csrc.gov.cn',
# 'Referer': 'http://eid.csrc.gov.cn/101812/index_f.html',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent':
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
# }
# payload = {'prodType': '002598', 'prodType2': '代码/简称/拼音缩写 ', 'keyWord': '', 'keyWord2': '关键字', 'startDate': '',
# 'startDate2': '请输入开始时间', 'endDate': '', 'endDate2': '请输入结束时间', 'selCatagory2': '10057', 'selBoardCode0': '',
# 'selBoardCode': ''}
# req = requests.get(url='http://eid.csrc.gov.cn/101812/index_2_f.html', headers=headers, data=payload)
# print(req.status_code)
publish_time = '2023年10月5日 '
aaa = paserTime(publish_time)
print(aaa)