提交 eff87695 作者: 薛凌堃

企查查基本信息采集维护

上级 9b2d7df4
......@@ -76,7 +76,13 @@ def baseinfo(com_soup):
value = cominfo.find('span', class_='val').text.replace('复制', '').strip(' ')
except:
try:
value = cominfo.find('span', class_='val next-tick-copy-value').text.replace('复制', '').strip(' ')
value_tags = cominfo.find_all('span')
for _ in value_tags:
if len(_.attrs) == 0:
value = _.text.replace('复制', '').strip(' ')
break
else:
return data
except:
return data
pattern = r'\(\d{4}\s*年\)'
......@@ -97,20 +103,20 @@ def baseinfo(com_soup):
return data
# 检查登陆状态
def checklogin(key):
    """Search qcc.com for *key* and return the parsed result page.

    Performs a GET against the qcc.com web search endpoint and parses the
    response with BeautifulSoup.

    Args:
        key: Search keyword — a company name or unified social-credit code.

    Returns:
        The BeautifulSoup of the search-result page on success, or an empty
        string ('') when the page title indicates the session is not logged
        in — callers treat a falsy result as a lost/expired session.
    """
    url = f'https://www.qcc.com/web/search?key={key}'
    # NOTE(review): `headers` is a module-level global defined elsewhere in
    # this file; no proxy is used here (proxy variant was removed).
    req = requests.get(headers=headers, url=url)
    time.sleep(1)  # small delay to avoid hammering the site between requests
    soup = BeautifulSoup(req.content, 'html.parser')
    # Guard against responses without a <title> (e.g. anti-bot or error
    # pages): the original `soup.find('title').text` would raise
    # AttributeError on None instead of reporting the login state.
    title_tag = soup.find('title')
    if title_tag is not None and title_tag.text == '会员登录 - 企查查':
        log.info('状态---未登录')
        soup = ''
        return soup
    return soup
# def checklogin(key):
#
# # url = f'https://www.qcc.com/web/search?key=91110108558521630L'
# url = f'https://www.qcc.com/web/search?key={key}'
# # ip = baseCore.get_proxy()
# # req = requests.get(headers=headers, url=url, proxies=ip)
# req = requests.get(headers=headers, url=url)
# time.sleep(1)
# soup = BeautifulSoup(req.content, 'html.parser')
# if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录')
# soup = ''
# return soup
# return soup
# 处理要发送的字段
def dic_handle(result_dic):
......@@ -333,20 +339,21 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
# company_id = dic_info[12]
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if social_code:
# soup = checklogin(social_code)
url = f'https://www.qcc.com/web/search?key={social_code}'
driver.get(url)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
else:
soup = ''
# soup = checklogin(com_name)
url = f'https://www.qcc.com/web/search?key={com_name}'
driver.get(url)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
pass
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# token.delete_token(id_cookie)
log.info('=====已重新放入redis,失效cookies已删除======')
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
return count
else:
......@@ -355,7 +362,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# token.updateTokeen(id_cookie,2)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
......@@ -371,22 +378,25 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
# token.updateTokeen(id_cookie,3)
token.updateTokeen(id_cookie,3)
return count
else:
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,cookies已封号======')
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
req_ = requests.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
# req_ = requests.get(headers=headers, url=company_url)
# com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
page_source_2 = driver.page_source
com_soup = BeautifulSoup(page_source_2, 'html.parser')
try:
businessinfo = com_soup.find('div', class_='cominfo-normal')
except:
......@@ -409,8 +419,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
cursor_.execute(updateSql)
cnx_.commit()
# ip = baseCore.get_proxy()
# req_ = requests.get(headers=headers, url=company_url, proxies=ip)
# req_ = requests.get(headers=headers, url=company_url)
# com_soup = BeautifulSoup(req_.content, 'html.parser')
......@@ -571,17 +579,17 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
def login():
driver = create_driver()
url = 'https://www.qcc.com/'
url = 'https://www.qcc.com'
driver.get(url)
driver.maximize_window()
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
# page_source = browser.page_source
# soup = BeautifulSoup(page_source,'html.parser')
# print(soup)
driver.find_element(By.CLASS_NAME, 'nav-item').click()
time.sleep(10)
# from selenium.webdriver.support import expected_conditions as EC
# wait = WebDriverWait(driver, 10)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
# # page_source = browser.page_source
# # soup = BeautifulSoup(page_source,'html.parser')
# # print(soup)
# driver.find_element(By.CLASS_NAME, 'nav-item').click()
# time.sleep(10)
# wait = WebDriverWait(driver, 10)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "login-change")))
# driver.find_element(By.CLASS_NAME, 'login-change').click()
......@@ -590,43 +598,53 @@ def login():
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[2]/input').send_keys('angel2468')
# driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[4]/button').click()
# time.sleep(3)
cookie_list = driver.get_cookies()
# cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721790462, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'secure': False, 'value': '1642640529-1706065651-%7C1706065663'}, {'domain': '.qcc.com', 'expiry': 1792465649, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'a56c994f-851b-4d6f-964f-80896160c221'}, {'domain': '.qcc.com', 'expiry': 1706670461.146448, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'secure': False, 'value': '15fbea36e490d86bda4ba24353'}, {'domain': '.qcc.com', 'expiry': 1721790450, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'secure': False, 'value': '18d396fe41533d-04b6782077b01c-313f68-e1000-18d396fe416778'}, {'domain': 'www.qcc.com', 'expiry': 1706067447.840599, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'secure': False, 'value': '3d365a3017060656472424474e1ed648e1b2a8b72216b66d27de7566e1'}]
# cookie_list = driver.get_cookies()
cookieinfo = token.getToken()
if cookieinfo:
pass
else:
log.info('==========已无cookies==========')
time.sleep(30)
return
id_cookie = cookieinfo[0]
cookie_ = json.loads(cookieinfo[1])
cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'}, {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'}, {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'}, {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'}, {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
for cookie in cookie_list:
driver.add_cookie(cookie)
return driver
time.sleep(5)
url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
driver.get(url_test)
return driver,id_cookie
if __name__ == '__main__':
taskType = '基本信息/企查查'
driver, id_cookie = login()
while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
print(file_name)
file.createFile(file_name)
driver = login()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
# 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
# 'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
'Host': 'www.qcc.com',
'Referer': 'https://www.qcc.com/',
'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
# headers = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Connection': 'keep-alive',
# # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
# # 'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
# 'Host': 'www.qcc.com',
# 'Referer': 'https://www.qcc.com/',
# 'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
# 'Sec-Ch-Ua-Mobile': '?0',
# 'Sec-Ch-Ua-Platform': '"Windows"',
# 'Sec-Fetch-Dest': 'document',
# 'Sec-Fetch-Mode': 'navigate',
# 'Sec-Fetch-Site': 'same-origin',
# 'Sec-Fetch-User': '?1',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
# }
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
......@@ -640,7 +658,7 @@ if __name__ == '__main__':
if company_field == '' or company_field is None:
# 本轮结束后没有新增的企业要采集
# file.deleteFile(file_name)
file.deleteFile(file_name)
flag = True
while flag:
log.info('--------已没有数据---------')
......@@ -676,7 +694,7 @@ if __name__ == '__main__':
# listingDate = ''
# category = ''
# exchange = ''
file_name = ''
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,listType, ynDomestic, countryName, file_name)
time.sleep(10)
# break
......
......@@ -389,9 +389,9 @@ def ifbeforename(company_url):
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('firm/')[1].split('.html')[0]
# 将采集到的企查查id更新
updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
cursor_.execute(updateSql)
cnx_.commit()
# updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
# cursor_.execute(updateSql)
# cnx_.commit()
# ip = baseCore.get_proxy()
# req_ = requests.get(headers=headers, url=company_url, proxies=ip)
req_ = requests.get(headers=headers, url=company_url)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论