Commit 480932b7  Author: 薛凌堃

Add enterprise automation

Parent 7d60109b
# -*- coding: utf-8 -*-
import json
import os.path
import openpyxl
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from openpyxl import Workbook, load_workbook
from base.BaseCore import BaseCore

baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
# Create the workbook file
def createFile(file_name):
    if os.path.exists(file_name):
        return
    else:
        wb = Workbook()
        sheet = wb.active
        # Rename the default sheet
        sheet.title = "需处理企业"
        sheet.append(["企业名称", "社会信用代码"])
        # Create a second sheet for companies whose basic info was fetched
        sheet2 = wb.create_sheet("获取基本信息成功企业")
        sheet2.append(["企业名称", "社会信用代码", "采到的信用代码"])
        wb.save(file_name)
        wb.close()
# Delete the file if it exists
def deleteFile(file_name):
    if os.path.exists(file_name):
        os.remove(file_name)
# Append a row of data
def appenddata(file_name, sheet, data):
    # Open the existing Excel file
    wb = load_workbook(file_name)
    # Select the sheet to append to
    sheet = wb[sheet]
    sheet.append(data)
    # Save the workbook
    wb.save(file_name)
    wb.close()
from classtool import Token, File, Tag
token = Token()
file = File()
tag = Tag()
# Send the assembled record to Kafka
def sendkafka(post_data):
......@@ -72,49 +33,6 @@ def sendkafka(post_data):
        baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
        log.info(f"{com_name}--{social_code}--kafka传输失败")
# 删除特定属性标签
def deletep(soup,tag_,attribute_to_delete,value_to_delete):
if attribute_to_delete and value_to_delete:
# 查找带有指定属性的P标签并删除
tags = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
for tag in tags:
# print(tag)
tag.decompose()
else:
tags = soup.find_all(tag_)
for tag in tags:
# print(tag)
tag.decompose()
# Remove empty tags
def deletek(soup):
    # Remove blank tags such as <p></p> or <p><br></p>; img, video and br are kept
    for i in soup.find_all(lambda tag: (len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"])
                           or tag.get_text() == ' ' or tag.get_text() == '\xa0'):  # '\xa0' assumed: a non-breaking space lost in page extraction
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                break
        else:
            i.decompose()
# Remove the copy/trend/annual-report helper spans from a table cell
def deletespan(td):
    spans = td.find_all('span', class_='app-copy copy-button-item')
    for span in spans:
        if '复制' in span.text:
            span.extract()  # drop the "copy" button span
    spans2 = td.find_all('span', slot='content')
    for span2 in spans2:
        if '趋势图' in span2.text:
            span2.extract()
    spans3 = td.find_all('span', class_='m-l-r-10')
    for span3 in spans3:
        if '年报' in span3.text:
            span3.extract()
    spans4 = td.find_all('span', class_='text-span')
    for span4 in spans4:
        span4.extract()
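
# A minimal sketch of the three cleanup helpers above on a made-up HTML fragment:
def _demo_tag_cleanup():
    html = ('<td><span class="app-copy copy-button-item">复制</span>'
            '<a class="m-l-r-10">x</a><div>统一社会信用代码</div><p></p></td>')
    td = BeautifulSoup(html, 'html.parser').td
    deletespan(td)                         # drops the 复制 copy-button span
    deletep(td, 'a', 'class', 'm-l-r-10')  # drops the tagged <a>
    deletek(td)                            # drops the now-empty <p>
    return td.text                         # -> '统一社会信用代码'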
# Merge the basic-info and registration-info fields
def getinfo(dict1, dict2):
    # Collect the key sets of the two dicts
......@@ -142,9 +60,9 @@ def baseinfo(com_soup):
        value = value.split(match.group(0))[0]
        # print(value)
-        deletep(cominfo, 'span', 'class', 'val')
-        deletep(cominfo, 'a', '', '')
-        deletek(cominfo)
+        tag.deletep(cominfo, 'span', 'class', 'val')
+        tag.deletep(cominfo, 'a', '', '')
+        tag.deletek(cominfo)
        # print(cominfo)
        name = cominfo.text.replace('\n', '').replace('复制', '').strip(' ').replace(':', '')
......@@ -392,8 +310,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
    if not soup:
        log.info("登录失效===重新放入redis")
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
-        # baseCore.delete_token(token)
-        log.info('=====已重新放入redis,失效token已删除======')
+        token.delete_token(cookie_)
+        log.info('=====已重新放入redis,失效cookies已删除======')
        time.sleep(20)
        return count
    else:
......@@ -402,7 +320,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
        log.info('=====搜索不到该企业====')
        data = [com_name, social_code]
        # TODO: companies that cannot be found are written back to the spreadsheet
-        appenddata(file_name, '需处理企业', data)
+        file.appenddata(file_name, '需处理企业', data)
        return count
    else:
        # Start collecting
......@@ -416,8 +334,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
    except Exception as e:
        log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
-        # baseCore.delete_token(token)
-        log.info('=====已重新放入redis,失效token已删除======')
+        token.delete_token(cookie_)
+        log.info('=====已重新放入redis,失效cookies已删除======')
        return count
......@@ -486,10 +404,10 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        # Tags without the class='tb' attribute
        att_list = ['inline-block', 'ntag-v2', 'm-l-r-10', 'm-l-sm']
        for att in att_list:
-            deletep(td, 'a', 'class', att)
-        deletek(td)
-        deletep(td, 'div', 'class', 'text-gray clearfix original-name-part')
-        deletespan(td)
+            tag.deletep(td, 'a', 'class', att)
+        tag.deletek(td)
+        tag.deletep(td, 'div', 'class', 'text-gray clearfix original-name-part')
+        tag.deletespan(td)
        # if len(result_dict) <= len(td_tags) // 2:
        div_tags = td.find_all('div')
        texts = [div.text for div in div_tags if len(div.attrs) == 0]
......@@ -522,7 +440,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
    # print(result_dict)
    # Companies whose info was collected successfully
    data = [com_name, social_code, result_dict['统一社会信用代码']]
-    appenddata(file_name, '获取基本信息成功企业', data)
+    file.appenddata(file_name, '获取基本信息成功企业', data)
    # Convert the Chinese field names to camel-case English
    aa_dic = dic_handle(result_dict)
    aa_dic['qccId'] = qccid
......@@ -541,7 +459,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        data_baseinfo = baseinfo(com_soup)
        # Companies whose info was collected successfully
        data = [com_name, social_code, data_baseinfo['统一社会信用代码']]
-        appenddata(file_name, '获取基本信息成功企业', data)
+        file.appenddata(file_name, '获取基本信息成功企业', data)
        # Convert the Chinese field names to camel-case English
        aa_dic = dic_handle(data_baseinfo)
        aa_dic['qccId'] = qccid
......@@ -564,8 +482,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
    except:
        log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
-        # baseCore.delete_token(token)
-        log.info('=====已重新放入redis,失效token已删除======')
+        token.delete_token(cookie_)
+        log.info('=====已重新放入redis,失效cookie已删除======')
        return False
    # receptname = '小米通讯技术有限公司'
......@@ -600,7 +518,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
        else:
            # No search result matched the company name
            data = [com_name, social_code]
-            appenddata(file_name, '需处理企业', data)
+            file.appenddata(file_name, '需处理企业', data)
            time.sleep(2)
            return False
    return True
......@@ -611,21 +529,27 @@ if __name__ == '__main__':
    while True:
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
        file_name = f'./data/国内企业基本信息采集情况_{nowtime}.xlsx'
-        createFile(file_name)
+        file.createFile(file_name)
        # TODO: the cookies need to be re-captured every two hours or so; they are read from the database
        # token = baseCore.GetToken()
        # if token:
        #     pass
        # else:
        #     log.info('==========已无token==========')
        #     time.sleep(30)
        #     continue
        cookies = token.getToken()
        print(type(cookies))
        if not cookies:
            log.info('==========已无cookies==========')
            time.sleep(30)
            continue
        cookie_ = json.loads(cookies[0])
        print(type(cookie_))
        log.info(f"获取cookie到----{cookie_}")
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
-            'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
            # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
+            'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
            'Host': 'www.qcc.com',
            'Referer': 'https://www.qcc.com/',
            'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
......@@ -640,38 +564,38 @@
        }
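
        # The Cookie header above is assembled from three stored cookies; a tiny
        # helper expressing the same assembly (names taken from the line above):
        def _demo_cookie_header(cookie_):
            return '; '.join(f'{k}={cookie_[k]}' for k in ('qcc_did', 'acw_tc', 'QCCSESSID'))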
        start_time = time.time()
        # Pull one company record from redis
-        company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
-        # company_field = '||浙江绿脉农业科技有限公司'
+        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+        company_field = '91220101606092819L||'
        if company_field == 'end':
            # This round is done: mail the workbook out, then start the next round
            baseCore.sendEmail(file_name)
            time.sleep(20)
-            deleteFile(file_name)
+            file.deleteFile(file_name)
            continue
        if company_field == '' or company_field is None:
            # No new companies to collect this round
-            deleteFile(file_name)
+            file.deleteFile(file_name)
            time.sleep(20)
            continue
        social_code = company_field.split('|')[0]
        com_name = company_field.split('|')[2]
-        ynDomestic = company_field.split('|')[15]
-        countryName = company_field.split('|')[16]
-        securitiesCode = company_field.split('|')[17]
-        securitiesShortName = company_field.split('|')[18]
-        listingDate = company_field.split('|')[21]
-        category = company_field.split('|')[19]
-        exchange = company_field.split('|')[20]
-        # ynDomestic = ''
-        # countryName = ''
-        # securitiesCode = ''
-        # securitiesShortName = ''
-        # listingDate = ''
-        # category = ''
-        # exchange = ''
+        # ynDomestic = company_field.split('|')[15]
+        # countryName = company_field.split('|')[16]
+        # securitiesCode = company_field.split('|')[17]
+        # securitiesShortName = company_field.split('|')[18]
+        # listingDate = company_field.split('|')[21]
+        # category = company_field.split('|')[19]
+        # exchange = company_field.split('|')[20]
+        ynDomestic = ''
+        countryName = ''
+        securitiesCode = ''
+        securitiesShortName = ''
+        listingDate = ''
+        category = ''
+        exchange = ''
        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
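
        # The pipe-delimited company_field record is indexed above at positions
        # 0, 2 and 15-21; a sketch of that layout (names follow the variables
        # they feed, other positions are unknown):
        def _demo_parse_company_field(company_field):
            parts = company_field.split('|')
            return {'social_code': parts[0], 'com_name': parts[2],
                    'ynDomestic': parts[15], 'countryName': parts[16],
                    'securitiesCode': parts[17], 'securitiesShortName': parts[18],
                    'category': parts[19], 'exchange': parts[20],
                    'listingDate': parts[21]}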
......
# classtool.py
import json
import os.path
from openpyxl import Workbook, load_workbook
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
class File():
    # Create the workbook file
    def createFile(self, file_name):
        if os.path.exists(file_name):
            return
        else:
            wb = Workbook()
            sheet = wb.active
            # Rename the default sheet
            sheet.title = "需处理企业"
            sheet.append(["企业名称", "社会信用代码"])
            # Create a second sheet for companies whose basic info was fetched
            sheet2 = wb.create_sheet("获取基本信息成功企业")
            sheet2.append(["企业名称", "社会信用代码", "采到的信用代码"])
            wb.save(file_name)
            wb.close()

    # Delete the file if it exists
    def deleteFile(self, file_name):
        if os.path.exists(file_name):
            os.remove(file_name)

    # Append a row of data
    def appenddata(self, file_name, sheet, data):
        # Open the existing Excel file
        wb = load_workbook(file_name)
        # Select the sheet to append to
        sheet = wb[sheet]
        sheet.append(data)
        # Save the workbook
        wb.save(file_name)
        wb.close()
class Token():
    # Fetch the least recently updated cookies row
    def getToken(self):
        cursor.execute("select cookies from QCC_token order by update_time asc limit 1")
        row = cursor.fetchall()
        cnx.commit()
        if not row:
            # No usable cookies in the table
            log.info("没有拿到token")
            return False
        return row[0]

    # Delete a stale cookies row; parameterized so the JSON string passes through intact
    def delete_token(self, cookie_):
        if isinstance(cookie_, dict):
            # Callers sometimes pass the parsed dict; re-serialize it to match the stored JSON
            cookie_ = json.dumps(cookie_)
        cursor.execute("delete from QCC_token where cookies=%s", (cookie_,))
        cnx.commit()
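
# A round-trip sketch for Token, assuming QCC_token rows hold the JSON cookie
# string written by the QR-login script below:
def _demo_token_roundtrip():
    row = Token().getToken()
    if not row:
        return None
    cookie_ = json.loads(row[0])  # parse the stored JSON cookie dict
    # ... use cookie_['QCCSESSID'] etc. in request headers ...
    Token().delete_token(row[0])  # drop the row once the session goes stale
    return cookie_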
class Tag():
    # Remove tags, optionally only those carrying a given attribute/value
    def deletep(self, soup, tag_, attribute_to_delete, value_to_delete):
        if attribute_to_delete and value_to_delete:
            # Find tags with the given attribute and remove them
            tags = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
            for tag in tags:
                # print(tag)
                tag.decompose()
        else:
            tags = soup.find_all(tag_)
            for tag in tags:
                # print(tag)
                tag.decompose()

    # Remove empty tags
    def deletek(self, soup):
        # Remove blank tags such as <p></p> or <p><br></p>; img, video and br are kept
        for i in soup.find_all(lambda tag: (len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"])
                               or tag.get_text() == ' ' or tag.get_text() == '\xa0'):  # '\xa0' assumed: a non-breaking space lost in page extraction
            for j in i.descendants:
                if j.name in ["img", "video", "br"]:
                    break
            else:
                i.decompose()

    # Remove the copy/trend/annual-report helper spans from a table cell
    def deletespan(self, td):
        spans = td.find_all('span', class_='app-copy copy-button-item')
        for span in spans:
            if '复制' in span.text:
                span.extract()  # drop the "copy" button span
        spans2 = td.find_all('span', slot='content')
        for span2 in spans2:
            if '趋势图' in span2.text:
                span2.extract()
        spans3 = td.find_all('span', class_='m-l-r-10')
        for span3 in spans3:
            if '年报' in span3.text:
                span3.extract()
        spans4 = td.find_all('span', class_='text-span')
        for span4 in spans4:
            span4.extract()
\ No newline at end of file
"""模拟扫码登录"""
import json
import time
import requests
......@@ -11,6 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from base.BaseCore import BaseCore
from pymysql.converters import escape_string
baseCore = BaseCore()
log = baseCore.getLogger()
cnx_ = baseCore.cnx
......@@ -34,41 +36,7 @@ def flushAndGetToken():
    for cookie in cookie_list:
        cookies[cookie['name']] = cookie['value']
    print(cookies)
-    insert = f"insert into QCC_token (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())"
-    cursor_.execute(insert)
-    cnx_.commit()
-    baseCore.close()
-
-def getrequest_soup(headers, url):
-    req = requests.get(headers=headers, url=url)
-    result = BeautifulSoup(req.content, 'html.parser')
-    return result
-
-def dojob():
-    headers = {
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Accept-Language': 'zh-CN,zh;q=0.9',
-        'Connection': 'keep-alive',
-        'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; QCCSESSID=1d489139eea4830a062c3a1240; acw_tc=db9062ad16994955552435350e3b43e7e5cee64c77d9f807936897ab1f',
-        'Host': 'www.qcc.com',
-        'Referer': 'https://www.qcc.com/',
-        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
-        'Sec-Ch-Ua-Mobile': '?0',
-        'Sec-Ch-Ua-Platform': '"Windows"',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'same-origin',
-        'Sec-Fetch-User': '?1',
-        'Upgrade-Insecure-Requests': '1',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
-    }
-    url = 'https://www.qcc.com/api/userCenter/getAuthInfo'
-    soup = getrequest_soup(headers, url)
-    pass
+    return cookies
if __name__ == "__main__":
urlqcc = 'https://www.qcc.com/'
......@@ -81,7 +49,13 @@ if __name__ == "__main__":
# print(soup)
browser.find_element(By.CLASS_NAME, 'nav-item').click()
time.sleep(20)
flushAndGetToken()
cookies = flushAndGetToken()
cookies = json.dumps(cookies)
insert = f"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
cursor_.execute(insert)
cnx_.commit()
baseCore.close()
......