提交 9dbcd59d 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

......@@ -49,8 +49,8 @@ class File():
class Token():
# 获取token
def getToken(self):
cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
# cursor.execute(f" select id, cookies from QCC_token")
# cursor.execute(f"select id,cookies from QCC_token where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
cursor.execute(f" select id, cookies from QCC_token where id = 63")
# rows = cursor.fetchall()
# cnx.commit()
# if rows:
......
import datetime
import json
import os.path
import random
import pymongo
from bson import ObjectId
from openpyxl import Workbook, load_workbook
from base.BaseCore import BaseCore
# Shared project core object; the logger and the SQL connection/cursor used by
# the classes below are taken from it.
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
# MongoDB collection '天眼查登录信息' of the ZZSN database; Token reads and updates
# its documents (presumably one per logged-in account — confirm against the
# script that inserts them).
# NOTE(review): host, user name and password are hard-coded here; consider
# loading them from configuration or the environment instead.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
class File():
    """Small helpers around the Excel workbook used to track processed companies."""

    # Create the workbook
    def createFile(self, file_name):
        """Create the workbook at ``file_name`` unless a file already exists there.

        The new workbook gets two sheets, each with a header row: the default
        sheet renamed to "需处理企业" and a second one named "获取基本信息成功企业".
        """
        if not os.path.exists(file_name):
            workbook = Workbook()
            # Rename the default sheet and write its header
            pending_sheet = workbook.active
            pending_sheet.title = "需处理企业"
            pending_sheet.append(["企业名称", "社会信用代码"])
            # Second sheet with its own header
            success_sheet = workbook.create_sheet("获取基本信息成功企业")
            success_sheet.append(["企业名称", "采到的企业名称", "社会信用代码", "采到的信用代码"])
            workbook.save(file_name)
            workbook.close()

    # Delete the workbook
    def deleteFile(self, file_name):
        """Remove ``file_name`` if it exists; do nothing otherwise."""
        if not os.path.exists(file_name):
            return
        os.remove(file_name)

    # Append one row
    def appenddata(self, file_name, sheet, data):
        """Append the row ``data`` to the sheet named ``sheet`` and save the file."""
        # Open the existing workbook and pick the target sheet by name
        workbook = load_workbook(file_name)
        target_sheet = workbook[sheet]
        target_sheet.append(data)
        # Persist the change
        workbook.save(file_name)
        workbook.close()
class Token():
    """Pick, retire and time-stamp the login tokens used by the crawler.

    Cookies are read from and updated in the module-level MongoDB collection
    ``db_storage``; the ``QCC_token`` SQL table is still touched through the
    module-level ``cursor``/``cnx``.
    """

    # Fetch a token
    def get_cookies(self):
        """Return ``(cookies, _id)`` of the least recently used usable token.

        A token is usable when its ``fenghaoTime`` (ban time) is earlier than
        its ``updateTime``. Both are written as ``'%Y-%m-%d %H:%M:%S'``
        strings, so string order equals chronological order.
        """
        # ``$expr`` is required to compare two fields of the same document.
        # The previous ``{'fenghaoTime': {'$lt': 'updateTime'}}`` compared the
        # field with the literal string 'updateTime', which every timestamp
        # string satisfies, so banned tokens were never filtered out.
        query = {
            '$expr': {'$lt': ['$fenghaoTime', '$updateTime']},
        }
        result = db_storage.find_one(query, sort=[('updateTime', 1)])
        # NOTE(review): find_one returns None when no token matches; the
        # subscripts below then raise TypeError, exactly as before.
        cookies = result['cookies']
        id_token = result['_id']
        return cookies, id_token

    # Delete an expired token
    def delete_token(self, cookie_):
        """Delete the ``QCC_token`` row whose id equals ``cookie_`` and commit."""
        # Parameterized instead of string-built SQL so the value cannot break
        # out of the statement. Assumes the cursor uses the ``%s`` paramstyle
        # of the MySQL drivers — TODO confirm against BaseCore.
        cursor.execute("delete from QCC_token where id=%s", (cookie_,))
        cnx.commit()

    # Token state handling
    def updateTokeen(self, id_token, type):
        """Apply a state change to the token ``id_token``.

        ``type`` selects the action (the name shadows the builtin but is kept
        because callers may pass it by keyword):
          1 - session expired: delete the row from ``QCC_token``;
          2 - account banned: set ``fenghaoTime`` to the current time;
          3 - token used: set ``updateTime`` to the current time.
        Any other value only commits the SQL connection.
        """
        if type == 1:
            # Session expired: delete the token (parameterized, see delete_token)
            cursor.execute("delete from QCC_token where id=%s", (id_token,))
        elif type == 2 or type == 3:
            # 2 -> ban time, 3 -> last-use time; both store "now" as a string
            field_name = 'fenghaoTime' if type == 2 else 'updateTime'
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            selector = {'_id': ObjectId(id_token)}
            db_storage.update_one(selector, {'$set': {field_name: timestamp}})
        cnx.commit()
class Tag():
    """Clean-up helpers operating on parsed HTML trees (objects offering
    ``find_all``/``decompose``/``extract``)."""

    # Remove tags, optionally only those carrying a given attribute value
    def deletep(self, soup, tag_, attribute_to_delete, value_to_delete):
        """Decompose every ``tag_`` element in ``soup``.

        When both ``attribute_to_delete`` and ``value_to_delete`` are truthy,
        only elements whose attribute has that value are removed.
        """
        if attribute_to_delete and value_to_delete:
            matches = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
        else:
            matches = soup.find_all(tag_)
        for node in matches:
            node.decompose()

    # Remove empty tags
    def deletek(self, soup):
        """Decompose blank tags (e.g. <p></p>) that contain no img/video/br."""
        media_names = ["img", "video", "br"]

        def looks_blank(tag):
            # Blank means: no text at all on a non-media tag, or text that is
            # exactly one of the two single-character literals below.
            text = tag.get_text()
            if len(text) == 0 and tag.name not in media_names and tag.name != "br":
                return True
            return text == ' ' or text == ' '

        for candidate in soup.find_all(looks_blank):
            # Keep the tag when any descendant is a media element
            holds_media = any(child.name in media_names for child in candidate.descendants)
            if not holds_media:
                candidate.decompose()

    # Remove helper span tags
    def deletespan(self, td):
        """Extract the helper <span> elements from ``td``.

        Spans in the first three groups are removed only when their text
        contains the listed marker; ``text-span`` spans are always removed.
        """
        # (find_all keyword filter, marker text that must be present or None)
        rules = (
            ({'class_': 'app-copy copy-button-item'}, '复制'),
            ({'slot': 'content'}, '趋势图'),
            ({'class_': 'm-l-r-10'}, '年报'),
            ({'class_': 'text-span'}, None),
        )
        for criteria, marker in rules:
            for span in td.find_all('span', **criteria):
                if marker is None or marker in span.text:
                    span.extract()
\ No newline at end of file
import datetime
import time
from selenium import webdriver
import pymongo
from selenium.webdriver.common.by import By
# MongoDB collection '天眼查登录信息' of the ZZSN database; the captured cookies
# are inserted into it at the end of the script.
# NOTE(review): host, user name and password are hard-coded here; consider
# loading them from configuration or the environment instead.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
# Page opened in the browser before the cookies are read.
url = 'https://www.tianyancha.com/'
def create_driver(path=r'D:\soft\msedgedriver.exe'):
    """Start a maximized Microsoft Edge WebDriver session and return it.

    Args:
        path: Location of the ``msedgedriver`` executable. Defaults to the
            path that used to be hard-coded, so existing calls are unchanged.

    Returns:
        The ``webdriver.Edge`` instance.
    """
    # NOTE(review): ``executable_path``/``capabilities`` are the Selenium 3
    # style keyword arguments — confirm the installed Selenium still accepts them.
    capabilities = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            # Start with a maximized window
            "extensions": [], "args": ["--start-maximized"]
        }
    }
    return webdriver.Edge(executable_path=path, capabilities=capabilities)
if __name__ == "__main__":
    # Capture the cookies of a manually logged-in browser session and store
    # them, together with bookkeeping timestamps, in ``db_storage``.
    name = input('所属用户:')
    driver = create_driver()
    try:
        driver.get(url)
        # Fixed pause before the cookies are read (presumably the time the
        # operator has to log in by hand — confirm).
        time.sleep(60)
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser and driver process down; previously the
        # session was never quit and leaked after the script ended.
        driver.quit()
    # Data to store
    create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The ban time starts one day in the past, i.e. earlier than updateTime
    fenghaoTime = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y-%m-%d %H:%M:%S')
    data = {
        'name': name,
        'cookies': cookies,
        'createTime': create_time,
        'fenghaoTime': fenghaoTime,
        'updateTime': create_time,
    }
    # Insert the document
    result = db_storage.insert_one(data)
    # Print the id of the inserted document
    print(result.inserted_id)
\ No newline at end of file
import os
import os
......@@ -49,9 +49,15 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
break
#todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except:
time.sleep(3)
continue
......@@ -339,7 +345,8 @@ def gonggao_info(dic_info):
info_content = json_2['data']['notice_content']
except:
info_content = ''
ifexist = ifInstert(com_name, social_code, info_url)
# ifexist = ifInstert(com_name, social_code, info_url)
ifexist = True
if ifexist:
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
result = GetContent(pdf_url, info_url,title, social_code, year, info_date, start_time, com_name, num)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论