王景浩 / zzsn_spider · Commits · 480932b7

Commit 480932b7 · authored Nov 16, 2023 by 薛凌堃
Parent: 7d60109b

    新增企业自动化 (Add enterprise automation)
Showing 3 changed files with 179 additions and 169 deletions.
comData/BaseInfo_qcc/baseinfo1113.py    +57  -133
comData/BaseInfo_qcc/classtool.py       +112 -0
comData/BaseInfo_qcc/requestQCC.py      +10  -36
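Taken as a whole, the commit is a refactor: the Excel bookkeeping helpers (createFile, deleteFile, appenddata), the HTML cleanup helpers (deletep, deletek, deletespan), and the cookie handling move out of baseinfo1113.py into three classes (File, Tag, Token) in the new classtool.py, and the call sites switch to module-level instances (file.*, tag.*, token.*). requestQCC.py now returns the captured cookies from flushAndGetToken() and performs the QCC_token insert in its __main__ block instead. A usage sketch of the new classes follows the classtool.py listing below.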
comData/BaseInfo_qcc/baseinfo1113.py @ 480932b7

 # -*- coding: utf-8 -*-
 import json
-import os.path
-import openpyxl
 import re
 import time
-import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
-import urllib3
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 from base.BaseCore import BaseCore
 baseCore = BaseCore()
 cnx_ = baseCore.cnx
 cursor_ = baseCore.cursor
 log = baseCore.getLogger()
+import urllib3
+from classtool import Token, File, Tag
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+token = Token()
+file = File()
-from openpyxl import Workbook, load_workbook
+tag = Tag()
-# Create the file
-def createFile(file_name):
-    if os.path.exists(file_name):
-        return
-    else:
-        wb = Workbook()
-        sheet = wb.active
-        # Rename the default sheet
-        sheet.title = "需处理企业"
-        sheet.append(["企业名称", "社会信用代码"])
-        # Create a second sheet
-        sheet2 = wb.create_sheet("获取基本信息成功企业")
-        sheet2.append(["企业名称", "社会信用代码", "采到的信用代码"])
-        wb.save(file_name)
-        wb.close()
-# Delete the file
-def deleteFile(file_name):
-    if os.path.exists(file_name):
-        os.remove(file_name)
-    else:
-        pass
-# Append a row of data
-def appenddata(file_name, sheet, data):
-    # Open the existing Excel file
-    wb = load_workbook(file_name)
-    # Pick the sheet to append to
-    sheet = wb[sheet]
-    sheet.append(data)
-    # Save the Excel file
-    wb.save(file_name)
-    wb.close()
 # Send the data
 def sendkafka(post_data):
@@ -72,49 +33,6 @@ def sendkafka(post_data):
         baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
         log.info(f"{com_name}--{social_code}--kafka传输失败")
-# Remove tags with a given attribute
-def deletep(soup, tag_, attribute_to_delete, value_to_delete):
-    if attribute_to_delete and value_to_delete:
-        # Find the P tags with the given attribute and delete them
-        tags = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
-        for tag in tags:
-            # print(tag)
-            tag.decompose()
-    else:
-        tags = soup.find_all(tag_)
-        for tag in tags:
-            # print(tag)
-            tag.decompose()
-# Remove empty tags
-def deletek(soup):
-    # Remove blank tags (e.g. <p></p>, <p><br></p>); img, video and hr are excluded
-    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' ' or tag.get_text() == ' '):
-        for j in i.descendants:
-            if j.name in ["img", "video", "br"]:
-                break
-        else:
-            i.decompose()
-# Remove span tags
-def deletespan(td):
-    spans = td.find_all('span', class_='app-copy copy-button-item')
-    for span in spans:
-        if '复制' in span.text:
-            span.extract()
-    # Remove span tags
-    spans2 = td.find_all('span', slot='content')
-    for span2 in spans2:
-        if '趋势图' in span2.text:
-            span2.extract()
-    spans3 = td.find_all('span', class_='m-l-r-10')
-    for span3 in spans3:
-        if '年报' in span3.text:
-            span3.extract()
-    spans4 = td.find_all('span', class_='text-span')
-    for span4 in spans4:
-        span4.extract()
 # Merge the basic-info and business-registration fields
 def getinfo(dict1, dict2):
     # Take the key sets of the two dicts
@@ -142,9 +60,9 @@ def baseinfo(com_soup):
         value = value.split(match.group(0))[0]
         # print(value)
-        deletep(cominfo, 'span', 'class', 'val')
+        tag.deletep(cominfo, 'span', 'class', 'val')
-        deletep(cominfo, 'a', '', '')
+        tag.deletep(cominfo, 'a', '', '')
-        deletek(cominfo)
+        tag.deletek(cominfo)
         # print(cominfo)
         name = cominfo.text.replace('\n', '').replace('复制', '').strip(' ').replace(':', '')
@@ -392,8 +310,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
     if not soup:
         log.info("登录失效===重新放入redis")
         baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
-        # baseCore.delete_token(token)
+        token.delete_token(cookie_)
-        log.info('=====已重新放入redis,失效token已删除======')
+        log.info('=====已重新放入redis,失效cookies已删除======')
         time.sleep(20)
         return count
     else:
@@ -402,7 +320,7 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
             log.info('=====搜索不到该企业====')
             data = [com_name, social_code]
             # TODO: companies that cannot be found need to be written back to a spreadsheet
-            appenddata(file_name, '需处理企业', data)
+            file.appenddata(file_name, '需处理企业', data)
             return count
         else:
             # Start collecting
@@ -416,8 +334,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
     except Exception as e:
         log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
         baseCore.r.lpush('BaseInfoEnterprise:gnqy_social_code', company_field)
-        # baseCore.delete_token(token)
+        token.delete_token(cookie_)
-        log.info('=====已重新放入redis,失效token已删除======')
+        log.info('=====已重新放入redis,失效cookies已删除======')
         return count
@@ -486,10 +404,10 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
         # Tags without the class='tb' attribute
         att_list = ['inline-block', 'ntag-v2', 'm-l-r-10', 'm-l-sm']
         for att in att_list:
-            deletep(td, 'a', 'class', att)
+            tag.deletep(td, 'a', 'class', att)
-        deletek(td)
+        tag.deletek(td)
-        deletep(td, 'div', 'class', 'text-gray clearfix original-name-part')
+        tag.deletep(td, 'div', 'class', 'text-gray clearfix original-name-part')
-        deletespan(td)
+        tag.deletespan(td)
         # if len(result_dict) <= len(td_tags) // 2:
         div_tags = td.find_all('div')
         texts = [div.text for div in div_tags if len(div.attrs) == 0]
@@ -522,7 +440,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
         # print(result_dict)
         # Companies collected successfully
         data = [com_name, social_code, result_dict['统一社会信用代码']]
-        appenddata(file_name, '获取基本信息成功企业', data)
+        file.appenddata(file_name, '获取基本信息成功企业', data)
         # Convert the fields to English camelCase
         aa_dic = dic_handle(result_dict)
         aa_dic['qccId'] = qccid
@@ -541,7 +459,7 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
         data_baseinfo = baseinfo(com_soup)
         # Companies collected successfully
         data = [com_name, social_code, data_baseinfo['统一社会信用代码']]
-        appenddata(file_name, '获取基本信息成功企业', data)
+        file.appenddata(file_name, '获取基本信息成功企业', data)
         # Convert the fields to English camelCase
         aa_dic = dic_handle(data_baseinfo)
         aa_dic['qccId'] = qccid
@@ -564,8 +482,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
     except:
         log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
         baseCore.r.lpush('BaseInfoEnterprise:gnqy_social_code', company_field)
-        # baseCore.delete_token(token)
+        token.delete_token(cookie_)
-        log.info('=====已重新放入redis,失效token已删除======')
+        log.info('=====已重新放入redis,失效cookie已删除======')
         return False
     # receptname = '小米通讯技术有限公司'
@@ -600,7 +518,7 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
         else:
             # No identical company name was found
             data = [com_name, social_code]
-            appenddata(file_name, '需处理企业', data)
+            file.appenddata(file_name, '需处理企业', data)
             time.sleep(2)
             return False
     return True
@@ -611,21 +529,27 @@ if __name__ == '__main__':
     while True:
         nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
         file_name = f'./data/国内企业基本信息采集情况_{nowtime}.xlsx'
-        createFile(file_name)
+        file.createFile(file_name)
         # TODO: needs to be re-captured and updated roughly every two hours; the token comes from the database
-        # token = baseCore.GetToken()
+        cookies = token.getToken()
-        # if token:
+        print(type(cookies))
-        # pass
+        if cookies:
-        # else:
+            pass
-        # log.info('==========已无token==========')
+        else:
-        # time.sleep(30)
+            log.info('==========已无cookies==========')
-        # continue
+            time.sleep(30)
+            continue
+        cookie_ = json.loads(cookies[0])
+        print(type(cookie_))
+        log.info(f"获取cookie到----{cookie_}")
         headers = {
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
             'Accept-Encoding': 'gzip, deflate, br',
             'Accept-Language': 'zh-CN,zh;q=0.9',
             'Connection': 'keep-alive',
-            'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
+            # 'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; acw_tc=db9062a717000200596487102e63dac7bed6aad2a049361c973816fabf; QCCSESSID=3c95642bd6445b7681c8fc6411',
+            'Cookie': f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}',
             'Host': 'www.qcc.com',
             'Referer': 'https://www.qcc.com/',
             'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
@@ -640,38 +564,38 @@ if __name__ == '__main__':
         }
         start_time = time.time()
         # Fetch the company record
-        company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
+        # company_field = '||浙江绿脉农业科技有限公司'
+        company_field = '91220101606092819L||'
         if company_field == 'end':
             # This round is finished: send the report email and enter the next round
             baseCore.sendEmail(file_name)
             time.sleep(20)
-            deleteFile(file_name)
+            file.deleteFile(file_name)
             continue
         if company_field == '' or company_field is None:
             # No newly added companies to collect after this round
-            deleteFile(file_name)
+            file.deleteFile(file_name)
            time.sleep(20)
             continue
         social_code = company_field.split('|')[0]
         com_name = company_field.split('|')[2]
-        ynDomestic = company_field.split('|')[15]
+        # ynDomestic = company_field.split('|')[15]
-        countryName = company_field.split('|')[16]
+        # countryName = company_field.split('|')[16]
-        securitiesCode = company_field.split('|')[17]
+        # securitiesCode = company_field.split('|')[17]
-        securitiesShortName = company_field.split('|')[18]
+        # securitiesShortName = company_field.split('|')[18]
-        listingDate = company_field.split('|')[21]
+        # listingDate = company_field.split('|')[21]
-        category = company_field.split('|')[19]
+        # category = company_field.split('|')[19]
-        exchange = company_field.split('|')[20]
+        # exchange = company_field.split('|')[20]
-        # ynDomestic = ''
+        ynDomestic = ''
-        # countryName = ''
+        countryName = ''
-        # securitiesCode = ''
+        securitiesCode = ''
-        # securitiesShortName = ''
+        securitiesShortName = ''
-        # listingDate = ''
+        listingDate = ''
-        # category = ''
+        category = ''
-        # exchange = ''
+        exchange = ''
         count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange, ynDomestic, countryName, file_name)
comData/BaseInfo_qcc/classtool.py (new file, mode 100644) @ 480932b7

import os.path
from openpyxl import Workbook, load_workbook
from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor


class File():
    # Create the file
    def createFile(self, file_name):
        if os.path.exists(file_name):
            return
        else:
            wb = Workbook()
            sheet = wb.active
            # Rename the default sheet
            sheet.title = "需处理企业"
            sheet.append(["企业名称", "社会信用代码"])
            # Create a second sheet
            sheet2 = wb.create_sheet("获取基本信息成功企业")
            sheet2.append(["企业名称", "社会信用代码", "采到的信用代码"])
            wb.save(file_name)
            wb.close()

    # Delete the file
    def deleteFile(self, file_name):
        if os.path.exists(file_name):
            os.remove(file_name)
        else:
            pass

    # Append a row of data
    def appenddata(self, file_name, sheet, data):
        # Open the existing Excel file
        wb = load_workbook(file_name)
        # Pick the sheet to append to
        sheet = wb[sheet]
        sheet.append(data)
        # Save the Excel file
        wb.save(file_name)
        wb.close()


class Token():
    # Fetch a token
    def getToken(self):
        cursor.execute(f"select cookies from QCC_token order by update_time asc limit 1")
        row = cursor.fetchall()
        cnx.commit()
        if row:
            pass
        else:
            # No token found
            log.info("没有拿到token")
            return False
        return row[0]

    # Delete an invalidated token
    def delete_token(self, cookie_):
        deletesql = f"delete from QCC_token where cookies='{cookie_}' "
        cursor.execute(deletesql)
        cnx.commit()


class Tag():
    # Remove tags with a given attribute
    def deletep(self, soup, tag_, attribute_to_delete, value_to_delete):
        if attribute_to_delete and value_to_delete:
            # Find the P tags with the given attribute and delete them
            tags = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
            for tag in tags:
                # print(tag)
                tag.decompose()
        else:
            tags = soup.find_all(tag_)
            for tag in tags:
                # print(tag)
                tag.decompose()

    # Remove empty tags
    def deletek(self, soup):
        # Remove blank tags (e.g. <p></p>, <p><br></p>); img, video and hr are excluded
        for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text() == ' ' or tag.get_text() == ' '):
            for j in i.descendants:
                if j.name in ["img", "video", "br"]:
                    break
            else:
                i.decompose()

    # Remove span tags
    def deletespan(self, td):
        spans = td.find_all('span', class_='app-copy copy-button-item')
        for span in spans:
            if '复制' in span.text:
                span.extract()
        # Remove span tags
        spans2 = td.find_all('span', slot='content')
        for span2 in spans2:
            if '趋势图' in span2.text:
                span2.extract()
        spans3 = td.find_all('span', class_='m-l-r-10')
        for span3 in spans3:
            if '年报' in span3.text:
                span3.extract()
        spans4 = td.find_all('span', class_='text-span')
        for span4 in spans4:
            span4.extract()
\ No newline at end of file
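For orientation, a minimal sketch of how baseinfo1113.py consumes these classes. It assumes the project's base.BaseCore (with its MySQL connection behind Token and a populated QCC_token table) is importable, so it is illustrative rather than standalone; the file name, company row, and HTML snippet are placeholders invented for the example:

import json
from bs4 import BeautifulSoup
from classtool import Token, File, Tag

token = Token()
file = File()
tag = Tag()

# Excel bookkeeping: the first call creates both sheets, later calls append rows
file_name = './data/国内企业基本信息采集情况_20231116.xlsx'  # hypothetical path
file.createFile(file_name)
file.appenddata(file_name, '需处理企业', ['某公司', '91220101606092819L'])

# Cookie handling: getToken() returns the least recently updated QCC_token
# row as a 1-tuple, or False when the table is empty
cookies = token.getToken()
if cookies:
    cookie_ = json.loads(cookies[0])  # the row holds a JSON-encoded cookie dict

# HTML cleanup: strip decorative spans, then drop tags left empty
td = BeautifulSoup('<td><span slot="content">趋势图</span></td>', 'html.parser').td
tag.deletespan(td)
tag.deletek(td)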
comData/BaseInfo_qcc/requestQCC.py @ 480932b7

 """模拟扫码登录"""
+import json
 import time
 import requests
@@ -11,6 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from base.BaseCore import BaseCore
+from pymysql.converters import escape_string
 baseCore = BaseCore()
 log = baseCore.getLogger()
 cnx_ = baseCore.cnx
@@ -34,41 +36,7 @@ def flushAndGetToken():
     for cookie in cookie_list:
         cookies[cookie['name']] = cookie['value']
     print(cookies)
-    insert = f"insert into QCC_token (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())"
+    return cookies
-    cursor_.execute(insert)
-    cnx_.commit()
-    baseCore.close()
-def getrequest_soup(headers, url):
-    req = requests.get(headers=headers, url=url)
-    result = BeautifulSoup(req.content, 'html.parser')
-    return result
-def dojob():
-    headers = {
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-        'Accept-Encoding': 'gzip, deflate, br',
-        'Accept-Language': 'zh-CN,zh;q=0.9',
-        'Connection': 'keep-alive',
-        'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; QCCSESSID=1d489139eea4830a062c3a1240; acw_tc=db9062ad16994955552435350e3b43e7e5cee64c77d9f807936897ab1f',
-        'Host': 'www.qcc.com',
-        'Referer': 'https://www.qcc.com/',
-        'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
-        'Sec-Ch-Ua-Mobile': '?0',
-        'Sec-Ch-Ua-Platform': '"Windows"',
-        'Sec-Fetch-Dest': 'document',
-        'Sec-Fetch-Mode': 'navigate',
-        'Sec-Fetch-Site': 'same-origin',
-        'Sec-Fetch-User': '?1',
-        'Upgrade-Insecure-Requests': '1',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
-    }
-    url = 'https://www.qcc.com/api/userCenter/getAuthInfo'
-    soup = getrequest_soup(headers, url)
-    pass
 if __name__ == "__main__":
     urlqcc = 'https://www.qcc.com/'
@@ -81,7 +49,13 @@ if __name__ == "__main__":
     # print(soup)
     browser.find_element(By.CLASS_NAME, 'nav-item').click()
     time.sleep(20)
-    flushAndGetToken()
+    cookies = flushAndGetToken()
+    cookies = json.dumps(cookies)
+    insert = f"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
+    cursor_.execute(insert)
+    cnx_.commit()
+    baseCore.close()
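The two scripts now split the cookie lifecycle: requestQCC.py serializes the Selenium cookies with json.dumps and escapes them with pymysql's escape_string before the INSERT, and baseinfo1113.py later reverses this with json.loads on the row it pulls back via Token.getToken(). A minimal sketch of that round trip with the database stubbed out; the cookie values are placeholders, and only pymysql needs to be installed:

import json
from pymysql.converters import escape_string

# Producer side (requestQCC.py): cookie dict -> JSON string -> escaped SQL literal
cookies = {'qcc_did': 'did-placeholder', 'acw_tc': 'tc-placeholder', 'QCCSESSID': 'sid-placeholder'}
payload = json.dumps(cookies)
insert = ("insert into QCC_token (cookies,create_time,fenghao_time,update_time) "
          f"values ('{escape_string(payload)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())")

# Consumer side (baseinfo1113.py): fetchall() yields 1-tuples, so the JSON
# string sits at index 0 and json.loads restores the dict for the Cookie header
row = (payload,)
cookie_ = json.loads(row[0])
cookie_header = f'qcc_did={cookie_["qcc_did"]}; acw_tc={cookie_["acw_tc"]}; QCCSESSID={cookie_["QCCSESSID"]}'
print(insert)
print(cookie_header)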