Commit 480932b7 by 薛凌堃

新增企业自动化 (Add enterprise automation)

Parent 7d60109b
# -*- coding: utf-8 -*-
import json
import os.path
import openpyxl
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from base.BaseCore import BaseCore
baseCore = BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
log = baseCore.getLogger()
from classtool import Token, File, Tag
token = Token()
file = File()
tag = Tag()
# 创建文件
def createFile(file_name):
    if os.path.exists(file_name):
        return
    else:
        wb = Workbook()
        sheet = wb.active
        # 更改默认的sheet名称
        sheet.title = "需处理企业"
        sheet.append(["企业名称", "社会信用代码"])
        # 创建另一个sheet
        sheet2 = wb.create_sheet("获取基本信息成功企业")
        sheet2.append(["企业名称", "社会信用代码", "采到的信用代码"])
        wb.save(file_name)
        wb.close()

# 删除文件
def deleteFile(file_name):
    if os.path.exists(file_name):
        os.remove(file_name)
    else:
        pass

# 追加数据
def appenddata(file_name,sheet,data):
    # 打开现有的Excel文件
    wb = load_workbook(file_name)
    # 选择要追加数据的sheet
    sheet = wb[sheet]
    sheet.append(data)
    # 保存Excel文件
    wb.save(file_name)
    wb.close()
# 发送数据
def sendkafka(post_data):
...@@ -72,49 +33,6 @@ def sendkafka(post_data):
        baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
        log.info(f"{com_name}--{social_code}--kafka传输失败")
# 删除特定属性标签
def deletep(soup,tag_,attribute_to_delete,value_to_delete):
    if attribute_to_delete and value_to_delete:
        # 查找带有指定属性的P标签并删除
        tags = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
        for tag in tags:
            # print(tag)
            tag.decompose()
    else:
        tags = soup.find_all(tag_)
        for tag in tags:
            # print(tag)
            tag.decompose()

# 删除空标签
def deletek(soup):
    # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' ' or tag.get_text() == ' '):
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                break
        else:
            i.decompose()

# 删除span标签
def deletespan(td):
    spans = td.find_all('span', class_='app-copy copy-button-item')
    for span in spans:
        if '复制' in span.text:
            span.extract()  # 删除span标签
    spans2 = td.find_all('span', slot='content')
    for span2 in spans2:
        if '趋势图' in span2.text:
            span2.extract()
    spans3 = td.find_all('span', class_='m-l-r-10')
    for span3 in spans3:
        if '年报' in span3.text:
            span3.extract()
    spans4 = td.find_all('span', class_='text-span')
    for span4 in spans4:
        span4.extract()
# 合并基本信息和工商信息字段
def getinfo(dict1,dict2):
    # 取出两个字典的key值集合
...@@ -142,9 +60,9 @@ def baseinfo(com_soup):
        value = value.split(match.group(0))[0]
        # print(value)
        tag.deletep(cominfo, 'span', 'class', 'val')
        tag.deletep(cominfo, 'a', '', '')
        tag.deletek(cominfo)
        # print(cominfo)
        name = cominfo.text.replace('\n', '').replace('复制', '').strip(' ').replace(':', '')
...@@ -392,8 +310,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
    if not soup:
        log.info("登录失效===重新放入redis")
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
        token.delete_token(cookie_)
        log.info('=====已重新放入redis,失效cookies已删除======')
        time.sleep(20)
        return count
    else:
...@@ -402,7 +320,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
            log.info('=====搜索不到该企业====')
            data = [com_name, social_code]
            # todo:搜不到的企业需要返回到一个表格中
            file.appenddata(file_name, '需处理企业', data)
            return count
        else:
            # 开始采集
...@@ -416,8 +334,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
            except Exception as e:
                log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
                baseCore.r.lpush('BaseInfoEnterprise:gnqy_social_code', company_field)
                token.delete_token(cookie_)
                log.info('=====已重新放入redis,失效cookies已删除======')
                return count
...@@ -486,10 +404,10 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        # 没有class='tb'属性的标签
        att_list = ['inline-block', 'ntag-v2', 'm-l-r-10', 'm-l-sm']
        for att in att_list:
            tag.deletep(td, 'a', 'class', att)
        tag.deletek(td)
        tag.deletep(td, 'div', 'class', 'text-gray clearfix original-name-part')
        tag.deletespan(td)
        # if len(result_dict) <= len(td_tags) // 2:
        div_tags = td.find_all('div')
        texts = [div.text for div in div_tags if len(div.attrs) == 0]
...@@ -522,7 +440,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        # print(result_dict)
        # 采集成功的企业
        data = [com_name, social_code, result_dict['统一社会信用代码']]
        file.appenddata(file_name, '获取基本信息成功企业', data)
        # 将字段转化成英文驼峰
        aa_dic = dic_handle(result_dict)
        aa_dic['qccId'] = qccid
...@@ -541,7 +459,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
        data_baseinfo = baseinfo(com_soup)
        # 采集成功的企业
        data = [com_name, social_code, data_baseinfo['统一社会信用代码']]
        file.appenddata(file_name, '获取基本信息成功企业', data)
        # 将字段转化成英文驼峰
        aa_dic = dic_handle(data_baseinfo)
        aa_dic['qccId'] = qccid
...@@ -564,8 +482,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
    except:
        log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_social_code', company_field)
        token.delete_token(cookie_)
        log.info('=====已重新放入redis,失效cookie已删除======')
        return False
    # receptname = '小米通讯技术有限公司'
...@@ -600,7 +518,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
    else:
        # 没有搜到相同的企业名称
        data = [com_name, social_code]
        file.appenddata(file_name, '需处理企业', data)
        time.sleep(2)
        return False
    return True
...@@ -611,21 +529,27 @@ if __name__ == '__main__':
    while True:
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
        file_name = f'./data/国内企业基本信息采集情况_{nowtime}.xlsx'
        file.createFile(file_name)
        # TODO:需要隔两个小时左右抓包修改,token从数据库中获得
        cookies = token.getToken()
        print(type(cookies))
        if cookies:
            pass
        else:
            log.info('==========已无cookies==========')
            time.sleep(30)
            continue
        cookie_ = json.loads(cookies[0])
        print(type(cookie_))
        log.info(f"获取cookie到----{cookie_}")
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
            'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
            'Host': 'www.qcc.com',
            'Referer': 'https://www.qcc.com/',
            'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
...@@ -640,38 +564,38 @@ if __name__ == '__main__':
        }
        start_time = time.time()
        # 获取企业信息
        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
        company_field = '91220101606092819L||'
        if company_field == 'end':
            # 本轮处理完毕,需要发送邮件,并且进入下一轮
            baseCore.sendEmail(file_name)
            time.sleep(20)
            file.deleteFile(file_name)
            continue
        if company_field == '' or company_field is None:
            # 本轮结束后没有新增的企业要采集
            file.deleteFile(file_name)
            time.sleep(20)
            continue
        social_code = company_field.split('|')[0]
        com_name = company_field.split('|')[2]
        # ynDomestic = company_field.split('|')[15]
        # countryName = company_field.split('|')[16]
        # securitiesCode = company_field.split('|')[17]
        # securitiesShortName = company_field.split('|')[18]
        # listingDate = company_field.split('|')[21]
        # category = company_field.split('|')[19]
        # exchange = company_field.split('|')[20]
        ynDomestic = ''
        countryName = ''
        securitiesCode = ''
        securitiesShortName = ''
        listingDate = ''
        category = ''
        exchange = ''
        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,ynDomestic, countryName, file_name)
......
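For context, the `company_field` value popped from Redis is a single pipe-delimited string; below is a minimal sketch of how its positions map to the variables used above, assuming the layout implied by the commented-out index accesses (`parse_company_field` is a hypothetical helper for illustration, not a function in this repo, and the sample value is the hard-coded test string from the script):

# Illustrative sketch: unpack the pipe-delimited Redis payload.
# Field names come from the commented-out assignments above; positions
# not referenced there are unknown and omitted here.
def parse_company_field(company_field: str) -> dict:
    parts = company_field.split('|')

    def at(i):
        return parts[i] if len(parts) > i else ''

    return {
        'social_code': at(0),            # e.g. '91220101606092819L'
        'com_name': at(2),
        'ynDomestic': at(15),
        'countryName': at(16),
        'securitiesCode': at(17),
        'securitiesShortName': at(18),
        'category': at(19),
        'exchange': at(20),
        'listingDate': at(21),
    }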
import os.path
from openpyxl import Workbook, load_workbook
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor


class File():
    # 创建文件
    def createFile(self, file_name):
        if os.path.exists(file_name):
            return
        else:
            wb = Workbook()
            sheet = wb.active
            # 更改默认的sheet名称
            sheet.title = "需处理企业"
            sheet.append(["企业名称", "社会信用代码"])
            # 创建另一个sheet
            sheet2 = wb.create_sheet("获取基本信息成功企业")
            sheet2.append(["企业名称", "社会信用代码", "采到的信用代码"])
            wb.save(file_name)
            wb.close()

    # 删除文件
    def deleteFile(self, file_name):
        if os.path.exists(file_name):
            os.remove(file_name)
        else:
            pass

    # 追加数据
    def appenddata(self, file_name, sheet, data):
        # 打开现有的Excel文件
        wb = load_workbook(file_name)
        # 选择要追加数据的sheet
        sheet = wb[sheet]
        sheet.append(data)
        # 保存Excel文件
        wb.save(file_name)
        wb.close()


class Token():
    # 获取token
    def getToken(self):
        cursor.execute(f"select cookies from QCC_token order by update_time asc limit 1")
        row = cursor.fetchall()
        cnx.commit()
        if row:
            pass
        else:
            # 没有查到token
            log.info("没有拿到token")
            return False
        return row[0]

    # 删除失效的token
    def delete_token(self, cookie_):
        deletesql = f"delete from QCC_token where cookies='{cookie_}' "
        cursor.execute(deletesql)
        cnx.commit()


class Tag():
    # 删除特定属性标签
    def deletep(self, soup, tag_, attribute_to_delete, value_to_delete):
        if attribute_to_delete and value_to_delete:
            # 查找带有指定属性的P标签并删除
            tags = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
            for tag in tags:
                # print(tag)
                tag.decompose()
        else:
            tags = soup.find_all(tag_)
            for tag in tags:
                # print(tag)
                tag.decompose()

    # 删除空标签
    def deletek(self, soup):
        # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
        for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' ' or tag.get_text() == ' '):
            for j in i.descendants:
                if j.name in ["img", "video", "br"]:
                    break
            else:
                i.decompose()

    # 删除span标签
    def deletespan(self, td):
        spans = td.find_all('span', class_='app-copy copy-button-item')
        for span in spans:
            if '复制' in span.text:
                span.extract()  # 删除span标签
        spans2 = td.find_all('span', slot='content')
        for span2 in spans2:
            if '趋势图' in span2.text:
                span2.extract()
        spans3 = td.find_all('span', class_='m-l-r-10')
        for span3 in spans3:
            if '年报' in span3.text:
                span3.extract()
        spans4 = td.find_all('span', class_='text-span')
        for span4 in spans4:
            span4.extract()
\ No newline at end of file
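A minimal usage sketch of the new classtool module, mirroring how the collection script above consumes it; the file path, sample rows, and HTML fragment are illustrative placeholders, and it assumes the QCC_token table is reachable through BaseCore's database connection:

import json
from bs4 import BeautifulSoup
from classtool import Token, File, Tag

file = File()
token = Token()
tag = Tag()

# Excel side: create the workbook with its two sheets, then append rows by sheet name.
file.createFile('./data/demo.xlsx')
file.appenddata('./data/demo.xlsx', '需处理企业', ['示例企业', '91220101606092819L'])

# Cookie side: getToken() returns the oldest QCC_token row (a one-element tuple) or False.
cookies = token.getToken()
if cookies:
    cookie_ = json.loads(cookies[0])  # the cookies column stores a JSON string
    # request headers are built from cookie_['qcc_did'], cookie_['acw_tc'], cookie_['QCCSESSID'];
    # note delete_token compares against the cookies column verbatim, so the raw stored
    # string (cookies[0]), not the parsed dict, is what matches the row.

# Tag side: strip the helper spans from a scraped fragment before reading its text.
soup = BeautifulSoup('<div><span class="text-span">x</span>示例企业</div>', 'html.parser')
tag.deletespan(soup)
print(soup.text)  # -> 示例企业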
"""模拟扫码登录""" """模拟扫码登录"""
import json
import time import time
import requests import requests
...@@ -11,6 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait ...@@ -11,6 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
from pymysql.converters import escape_string
baseCore = BaseCore() baseCore = BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx_ = baseCore.cnx cnx_ = baseCore.cnx
...@@ -34,41 +36,7 @@ def flushAndGetToken(): ...@@ -34,41 +36,7 @@ def flushAndGetToken():
for cookie in cookie_list: for cookie in cookie_list:
cookies[cookie['name']] = cookie['value'] cookies[cookie['name']] = cookie['value']
print(cookies) print(cookies)
insert = f"insert into QCC_token (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())" return cookies
cursor_.execute(insert)
cnx_.commit()
baseCore.close()
def getrequest_soup(headers, url):
    req = requests.get(headers=headers, url=url)
    result = BeautifulSoup(req.content, 'html.parser')
    return result

def dojob():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; QCCSESSID=1d489139eea4830a062c3a1240; acw_tc=db9062ad16994955552435350e3b43e7e5cee64c77d9f807936897ab1f',
        'Host': 'www.qcc.com',
        'Referer': 'https://www.qcc.com/',
        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    url = 'https://www.qcc.com/api/userCenter/getAuthInfo'
    soup = getrequest_soup(headers, url)
    pass
if __name__ == "__main__":
    urlqcc = 'https://www.qcc.com/'
...@@ -81,7 +49,13 @@ if __name__ == "__main__":
    # print(soup)
    browser.find_element(By.CLASS_NAME, 'nav-item').click()
    time.sleep(20)
    cookies = flushAndGetToken()
    cookies = json.dumps(cookies)
    insert = f"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
    cursor_.execute(insert)
    cnx_.commit()
    baseCore.close()
......
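Taken together, the login script stores the Selenium cookies as one JSON string in the QCC_token.cookies column, and the collection script reads that string back to build its request headers; a minimal sketch of the round trip, with placeholder cookie values:

import json

# Login side: selenium cookie list -> dict -> JSON string -> QCC_token.cookies (via escape_string + INSERT).
cookies = {'qcc_did': '...', 'acw_tc': '...', 'QCCSESSID': '...'}  # placeholder values
stored = json.dumps(cookies)

# Collector side: Token.getToken() hands back the stored string; json.loads restores the dict
# that feeds the Cookie request header.
cookie_ = json.loads(stored)
cookie_header = f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}'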