zzsn_spider · Commit eff87695

Authored Jan 26, 2024 by 薛凌堃

Qichacha basic info collection maintenance (企查查基本信息采集维护)

Parent: 9b2d7df4

Showing 2 changed files with 86 additions and 68 deletions:

comData/BaseInfo_qcc/baseinfo0123.py  +83 −65
comData/BaseInfo_qcc/baseinfo1122.py  +3 −3
comData/BaseInfo_qcc/baseinfo0123.py
@@ -76,7 +76,13 @@ def baseinfo(com_soup):
            value = cominfo.find('span', class_='val').text.replace('复制', '').strip(' ')
        except:
            try:
                value = cominfo.find('span', class_='val next-tick-copy-value').text.replace('复制', '').strip(' ')
                value_tags = cominfo.find_all('span')
                for _ in value_tags:
                    if len(_.attrs) == 0:
                        value = _.text.replace('复制', '').strip(' ')
                        break
                else:
                    return data
            except:
                return data
    pattern = r'\(\d{4}\s*年\)'
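The new fallback in this hunk scans for a bare, attribute-less <span> when the usual value span is missing. A self-contained sketch of that pattern (the HTML fixture and the extract_value helper are illustrative, not part of the commit):

    from bs4 import BeautifulSoup

    html = '<div><span class="label">法定代表人</span><span>张三</span><span class="copy-btn">复制</span></div>'
    cominfo = BeautifulSoup(html, 'html.parser').find('div')

    def extract_value(cominfo):
        # Prefer the normal value span; fall back to the first <span> with no attributes.
        try:
            return cominfo.find('span', class_='val').text.replace('复制', '').strip(' ')
        except AttributeError:
            for span in cominfo.find_all('span'):
                if len(span.attrs) == 0:  # a bare <span> carries the actual value
                    return span.text.replace('复制', '').strip(' ')
        return None

    print(extract_value(cominfo))  # -> 张三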
@@ -97,20 +103,20 @@ def baseinfo(com_soup):
    return data

# 检查登陆状态
def checklogin(key):
    # url = f'https://www.qcc.com/web/search?key=91110108558521630L'
    url = f'https://www.qcc.com/web/search?key={key}'
    # ip = baseCore.get_proxy()
    # req = requests.get(headers=headers, url=url, proxies=ip)
    req = requests.get(headers=headers, url=url)
    time.sleep(1)
    soup = BeautifulSoup(req.content, 'html.parser')
    if soup.find('title').text == '会员登录 - 企查查':
        log.info('状态---未登录')
        soup = ''
        return soup
    return soup
# def checklogin(key):
#     # url = f'https://www.qcc.com/web/search?key=91110108558521630L'
#     url = f'https://www.qcc.com/web/search?key={key}'
#     # ip = baseCore.get_proxy()
#     # req = requests.get(headers=headers, url=url, proxies=ip)
#     req = requests.get(headers=headers, url=url)
#     time.sleep(1)
#     soup = BeautifulSoup(req.content, 'html.parser')
#     if soup.find('title').text == '会员登录 - 企查查':
#         log.info('状态---未登录')
#         soup = ''
#         return soup
#     return soup

# 处理要发送的字段
def dic_handle(result_dic):
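checklogin() detects the login wall by the page <title>. Since this commit routes page loads through Selenium instead of requests, a driver-based equivalent might look like this (checklogin_selenium is a hypothetical name, not in the diff; the empty-string convention follows the original):

    from bs4 import BeautifulSoup

    def checklogin_selenium(driver, key):
        # Load the search page in the logged-in browser session.
        driver.get(f'https://www.qcc.com/web/search?key={key}')
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        title = soup.find('title')
        # The member-login title means the session has expired.
        if title is not None and title.text == '会员登录 - 企查查':
            return ''
        return soup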
@@ -333,20 +339,21 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
    # company_id = dic_info[12]
    # 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
    if social_code:
        # soup = checklogin(social_code)
        url = f'https://www.qcc.com/web/search?key={social_code}'
        driver.get(url)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
    else:
        soup = ''
        # soup = checklogin(com_name)
        url = f'https://www.qcc.com/web/search?key={com_name}'
        driver.get(url)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        pass
    if not soup:
        log.info("登录失效===重新放入redis")
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
        # token.delete_token(id_cookie)
        log.info('=====已重新放入redis,失效cookies已删除======')
        # log.info('=====已重新放入redis,失效cookies已删除======')
        time.sleep(20)
        return count
    else:
@@ -355,7 +362,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
    except:
        log.info("登录失效===重新放入redis")
        baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
        # token.updateTokeen(id_cookie,2)
        token.updateTokeen(id_cookie, 2)
        log.info('=====已重新放入redis,cookies已封号======')
        time.sleep(20)
        return count
@@ -371,22 +378,25 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
            if spiderwork(soup, com_name, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
                count += 1
                log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
                # token.updateTokeen(id_cookie,3)
                token.updateTokeen(id_cookie, 3)
                return count
            else:
                return count
        except Exception as e:
            log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
            baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
            # token.updateTokeen(id_cookie,2)
            # log.info('=====已重新放入redis,cookies已封号======')
            token.updateTokeen(id_cookie, 2)
            log.info('=====已重新放入redis,cookies已封号======')
            return count

def ifbeforename(company_url):
    req_ = requests.get(headers=headers, url=company_url)
    com_soup = BeautifulSoup(req_.content, 'html.parser')
    # req_ = requests.get(headers=headers, url=company_url)
    # com_soup = BeautifulSoup(req_.content, 'html.parser')
    driver.get(company_url)
    page_source_2 = driver.page_source
    com_soup = BeautifulSoup(page_source_2, 'html.parser')
    try:
        businessinfo = com_soup.find('div', class_='cominfo-normal')
    except:
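Every failure path above does an lpush back onto the 'BaseInfoEnterprise:gnqy_socialCode' list so the task is retried later. A minimal sketch of that requeue loop with redis-py (the standalone client and worker() are illustrative; the script itself goes through baseCore.r and baseCore.redicPullData):

    import time
    import redis

    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    QUEUE = 'BaseInfoEnterprise:gnqy_socialCode'

    def worker(handle):
        while True:
            company_field = r.rpop(QUEUE)  # pull one queued company
            if not company_field:
                time.sleep(20)
                continue
            try:
                handle(company_field)
            except Exception:
                # On any failure, push the task back for a later retry.
                r.lpush(QUEUE, company_field)
                time.sleep(20)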
@@ -409,8 +419,6 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
    updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
    cursor_.execute(updateSql)
    cnx_.commit()
    # ip = baseCore.get_proxy()
    # req_ = requests.get(headers=headers, url=company_url, proxies=ip)
    # req_ = requests.get(headers=headers, url=company_url)
    # com_soup = BeautifulSoup(req_.content, 'html.parser')
@@ -571,17 +579,17 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
def login():
    driver = create_driver()
    url = 'https://www.qcc.com/'
    url = 'https://www.qcc.com'
    driver.get(url)
    driver.maximize_window()
    from selenium.webdriver.support import expected_conditions as EC
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
    # page_source = browser.page_source
    # soup = BeautifulSoup(page_source,'html.parser')
    # print(soup)
    driver.find_element(By.CLASS_NAME, 'nav-item').click()
    time.sleep(10)
    # from selenium.webdriver.support import expected_conditions as EC
    # wait = WebDriverWait(driver, 10)
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "nav-item")))
    # # page_source = browser.page_source
    # # soup = BeautifulSoup(page_source,'html.parser')
    # # print(soup)
    # driver.find_element(By.CLASS_NAME, 'nav-item').click()
    # time.sleep(10)
    # wait = WebDriverWait(driver, 10)
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "login-change")))
    # driver.find_element(By.CLASS_NAME, 'login-change').click()
@@ -590,43 +598,53 @@ def login():
    # driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[2]/input').send_keys('angel2468')
    # driver.find_element(By.XPATH, '//*[@id="loginModal"]/div/div/div/div[1]/div[3]/form/div[4]/button').click()
    # time.sleep(3)
    cookie_list = driver.get_cookies()
    # cookie_list= [{'domain': 'www.qcc.com', 'expiry': 1721790462, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'secure': False, 'value': '1642640529-1706065651-%7C1706065663'}, {'domain': '.qcc.com', 'expiry': 1792465649, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'a56c994f-851b-4d6f-964f-80896160c221'}, {'domain': '.qcc.com', 'expiry': 1706670461.146448, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'secure': False, 'value': '15fbea36e490d86bda4ba24353'}, {'domain': '.qcc.com', 'expiry': 1721790450, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'secure': False, 'value': '18d396fe41533d-04b6782077b01c-313f68-e1000-18d396fe416778'}, {'domain': 'www.qcc.com', 'expiry': 1706067447.840599, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'secure': False, 'value': '3d365a3017060656472424474e1ed648e1b2a8b72216b66d27de7566e1'}]
    # cookie_list = driver.get_cookies()
    cookieinfo = token.getToken()
    if cookieinfo:
        pass
    else:
        log.info('==========已无cookies==========')
        time.sleep(30)
        return
    id_cookie = cookieinfo[0]
    cookie_ = json.loads(cookieinfo[1])
    cookie_list = [{'domain': 'www.qcc.com', 'expiry': 1721815475, 'httpOnly': False, 'name': 'CNZZDATA1254842228', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': f'{cookie_["CNZZDATA1254842228"]}'},
                   {'domain': '.qcc.com', 'expiry': 1740650660, 'httpOnly': False, 'name': 'qcc_did', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'bb480035-2a34-4270-9a8b-db8b7d9374b3'},
                   {'domain': '.qcc.com', 'expiry': 1706695474, 'httpOnly': True, 'name': 'QCCSESSID', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'ccf17b97219476a1faa8aaff79'},
                   {'domain': '.qcc.com', 'expiry': 1721815461, 'httpOnly': False, 'name': 'UM_distinctid', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '18d3aed87f3552-01ba17134bcbe9-4c657b58-e1000-18d3aed87f4c5d'},
                   {'domain': 'www.qcc.com', 'expiry': 1706092459, 'httpOnly': True, 'name': 'acw_tc', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '3d365a1c17060906591851865e848bfd116d30ed8d2ac3e144455c8ff8'}]
    for cookie in cookie_list:
        driver.add_cookie(cookie)
    return driver
    time.sleep(5)
    url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
    driver.get(url_test)
    return driver, id_cookie


if __name__ == '__main__':
    taskType = '基本信息/企查查'
    driver, id_cookie = login()
    while True:
        nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
        file_name = f'./data/国内企业基本信息采集情况.xlsx'
        print(file_name)
        file.createFile(file_name)
        driver = login()
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
            # 'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
            'Host': 'www.qcc.com',
            'Referer': 'https://www.qcc.com/',
            'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }
        # headers = {
        #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        #     'Accept-Encoding': 'gzip, deflate, br',
        #     'Accept-Language': 'zh-CN,zh;q=0.9',
        #     'Connection': 'keep-alive',
        #     # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
        #     # 'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
        #     'Host': 'www.qcc.com',
        #     'Referer': 'https://www.qcc.com/',
        #     'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        #     'Sec-Ch-Ua-Mobile': '?0',
        #     'Sec-Ch-Ua-Platform': '"Windows"',
        #     'Sec-Fetch-Dest': 'document',
        #     'Sec-Fetch-Mode': 'navigate',
        #     'Sec-Fetch-Site': 'same-origin',
        #     'Sec-Fetch-User': '?1',
        #     'Upgrade-Insecure-Requests': '1',
        #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        # }
        start_time = time.time()
        # 获取企业信息
        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
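The reworked login() swaps driver.get_cookies() for cookies pulled from the token store and injected with add_cookie. One Selenium detail worth noting: add_cookie only accepts cookies for the domain currently loaded, so the driver must visit qcc.com first, as the hunk does. A hedged sketch of that restore step (restore_session is an illustrative name, not in the commit):

    import json

    def restore_session(driver, cookie_json):
        driver.get('https://www.qcc.com')   # must be on the target domain before add_cookie
        for cookie in json.loads(cookie_json):
            cookie.pop('expiry', None)      # stale/invalid expiry values can be rejected
            driver.add_cookie(cookie)
        driver.refresh()                    # reload so the restored session takes effect
        return driver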
@@ -640,7 +658,7 @@ if __name__ == '__main__':
        if company_field == '' or company_field is None:
            # 本轮结束后没有新增的企业要采集
            # file.deleteFile(file_name)
            file.deleteFile(file_name)
            flag = True
            while flag:
                log.info('--------已没有数据---------')
@@ -676,7 +694,7 @@ if __name__ == '__main__':
        # listingDate = ''
        # category = ''
        # exchange = ''
        file_name = ''
        count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name)
        time.sleep(10)
        # break
comData/BaseInfo_qcc/baseinfo1122.py
@@ -389,9 +389,9 @@ def ifbeforename(company_url):
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
    qccid = company_url.split('firm/')[1].split('.html')[0]
    # 将采集到的企查查id更新
    updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
    cursor_.execute(updateSql)
    cnx_.commit()
    # updateSql = f"update EnterpriseInfo set QCCID = '{qccid}' where SocialCode = '{social_code}'"
    # cursor_.execute(updateSql)
    # cnx_.commit()
    # ip = baseCore.get_proxy()
    # req_ = requests.get(headers=headers, url=company_url, proxies=ip)
    req_ = requests.get(headers=headers, url=company_url)
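The QCCID update in this hunk interpolates values into the SQL string with an f-string. A parameterized variant of the same statement, as a sketch assuming cursor_ is a PyMySQL/MySQLdb-style cursor (which uses %s placeholders), avoids manual quoting of the interpolated values:

    # Same update as above, but letting the driver escape the parameters.
    updateSql = "update EnterpriseInfo set QCCID = %s where SocialCode = %s"
    cursor_.execute(updateSql, (qccid, social_code))
    cnx_.commit()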