王景浩 / zzsn_spider · Commits

Commit b52e4502, authored Feb 26, 2024 by 薛凌堃
Commit message: 2/26
Parent: ca40e9aa

Showing 6 changed files with 207 additions and 34 deletions (+207 -34)
Changed files:

  base/研究中心需更新企业.py                      +62   -0  (new file)
  comData/BaseInfo_qcc/test.py                    +15  -17
  comData/BaseInfo_qcc/test_1.py                   +0   -0  (new file)
  comData/important_meeting/zyqmshggldxzhy19.py   +61  -15
  comData/weixin_solo/get_tokenCookies.py          +1   -1
  test.py                                         +68   -1
base/研究中心需更新企业.py (new file, mode 100644)
from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import redis


def putCom():
    com_list = [
        '91210000558190456G', '914200001000115161', '911100007109310534', '9111000071093123XX',
        '91110000100017643K', '91110000100018267J', '91110000MA01P657XY', '91230100127057741M',
        '91440300190346175T', 'ZZSN22083000000003', '91110000400000720M', '911100001055722912',
        '91110000100005220B', '911100001000094165', '91310000132200821H', '911100001000128855',
        '91110000710924910P', '91110000710924929L', '911100007109225442', '9111000071092649XU',
        '91310000MA1FL70B67', '911100007109311097', '912201011239989159', '911100007178306183',
        '91310000MA7ALG04XG', '91110000100017707H', '91110000710929498G', '91110000100010249W',
        '9151000062160427XG', '91310000MA1FL4B24G', '91110000400001889L', '9144030010001694XX',
        '91110000100000825Q', '91110000100006194G', '91110000717828315T', '91110000100001043E',
        '91110000MA005UCQ5P', '91110000710935732K', '91110000710930392Y', '91110000710930296M',
        '911100007109303176', '91110000710925243K', '91110000100014071Q', '91110000100009563N',
        '9111000071093107XN', '9111000010001002XD', '91110000100001852R', '91110000100001625L',
        '911100001000080343', '91110000400008060U', '91110000101699383Q', '91110000100000489L',
        '9111000071092868XL', '91110000100001035K', '911100004000011410', '91110000710933809D',
        '91110000100010310K', '91133100MABRLCFR5Q', '91110000MA001HYK9X', '911100001000016682',
        '911100007109279199', '12100000400010275N', '91110000710935636A', '91110000100024800K',
        '9144000076384341X8', '91440000100005896P', '91110000MA01W8B394', '91110000717830650E',
        '91110000100003057A', 'ZZSN22061600000001', '91310000MA1FL0LX06', '9111000010169286X1',
        '91110000100010433L', '91110000100010660R', '91110000102016548J', '91110000100001676W',
        '9111000071092200XY', '91133100MA0G9YKT8B', '9111000010000093XR', '91110000100006485K',
        '91360702MA7FK4MR44', '91420100MA4L0GG411', '91110000101625149Q', '12100000400006022G',
        '912302001285125661', '91110000100005888C', '911100007109250324', '91110000100024915R',
        '9111000040000094XW', '91310000MA1FL1MMXL', '91110000100015058K', '91110000710929930X',
        '91133100MA0GBL5F38', '9111000010000085X6', '91110000101100414N'
    ]
    df = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx')
    # Connect to the Redis database
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
    for i in range(len(df)):
        social_code = df['social_code'][i]
        com_name = df['name'][i]
        # print(social_code)
        if social_code in com_list:
            pass
        else:
            if 'ZZSN' in social_code or 'ZD' in social_code:
                continue
            else:
                item = social_code + '|' + com_name
                r.rpush('UpdateBasdeInfo:SocialCode_CompanyName', item)


def putCom_task():
    # Instantiate a scheduler
    scheduler = BlockingScheduler()
    # Run once a month, on the 1st at 00:00
    scheduler.add_job(putCom, 'cron', day=1, hour=0, minute=0)
    try:
        # redisPushData  # run once before the schedule starts
        # putCom()
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)  # scheduled-collection error
        pass


if __name__ == '__main__':
    putCom_task()
\ No newline at end of file
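
For reference, a minimal consumer sketch (an assumption, not part of this commit) of how a downstream worker might drain the queue that putCom fills; the key name and the social_code|name item format come from the code above, everything else is illustrative:

import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

# LPOP returns the oldest entry as bytes, or None once the queue is empty
while (raw := r.lpop('UpdateBasdeInfo:SocialCode_CompanyName')) is not None:
    social_code, com_name = raw.decode('utf-8').split('|', 1)
    print(social_code, com_name)  # hand the pair to the update spider here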
comData/BaseInfo_qcc/test.py
import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis

# Connect to MySQL and Redis
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK',
                      db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

with cnx.cursor() as cursor:
    select = """select relationName, relationId from klb_company"""
    cursor.execute(select)
    results = cursor.fetchall()
    for result in results:
        name = result[0]
        xydm = result[1]
        item = f'{name}|{xydm}'
        # the commit pushes cell_value here, which is undefined;
        # the assembled item is what should be pushed
        r.rpush('SousuoBaidu:companyname', item)

# List key to deduplicate
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# Fetch every element in the list
elements = r.lrange(list_name, 0, -1)
# Walk the elements and collapse duplicates
for element in elements:
    # LREM with count=0 removes *all* occurrences and returns how many were removed;
    # the commit's second lrem call therefore never found anything to delete and
    # dropped every copy, so push exactly one copy back instead
    count = r.lrem(list_name, 0, element)
    if count:
        r.rpush(list_name, element)
# Print the deduplicated list
print(r.lrange(list_name, 0, -1))
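
The LREM pass above rebuilds the list one element at a time and moves survivors to the tail. If the original ordering matters, here is an order-preserving sketch under the same connection and key assumptions (illustrative, not part of the commit); the pipeline batches the rewrite into one round trip:

import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'

# Read the whole list once, keep the first occurrence of each element
elements = r.lrange(list_name, 0, -1)
seen = set()
unique = [e for e in elements if not (e in seen or seen.add(e))]

# Rewrite the list atomically enough for a maintenance script
pipe = r.pipeline()
pipe.delete(list_name)
if unique:
    pipe.rpush(list_name, *unique)
pipe.execute()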
comData/BaseInfo_qcc/test_1.py (new file, mode 100644)

Diff collapsed; contents not shown.
comData/important_meeting/zyqmshggldxzhy19.py
 # Meetings of the Central Commission for Comprehensively Deepening Reform (中央全面深化改革委员会会议)
 import json
+import sys
 import time
+import redis
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
 from kafka import KafkaProducer
-headers = {
+sys.path.append('D:\\kkwork\\zzsn_spider\\base')
+import BaseCore
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+header = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'Accept-Encoding': 'gzip, deflate, br',
     'Accept-Language': 'zh-CN,zh;q=0.9',
...
@@ -26,22 +32,50 @@ headers = {
     'sec-ch-ua-mobile': '?0',
     'sec-ch-ua-platform': '"Windows"'
 }
+headers = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+    'Connection': 'keep-alive',
+    'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
+    'Host': 'news.12371.cn',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'none',
+    'Sec-Fetch-User': '?1',
+    'Upgrade-Insecure-Requests': '1',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
+    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"'
+}
 if __name__ == "__main__":
     # Meetings of the Central Commission for Comprehensively Deepening Reform (中央全面深化改革委员会会议)
+    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
     # Meetings of the Central Leading Group for Comprehensively Deepening Reform (中央全面深化改革领导小组会议)
     # url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
-    url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
-    for url in url_list:
-        request = requests.get(url=url, headers=headers)
-        soup = BeautifulSoup(request.content, 'html.parser')
-        request.encoding = request.apparent_encoding
-        info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
-        ul_list = info_html.find_all('li')
-        for ul in ul_list:
-            publishDate_ = str(ul.find('span').text)
-            date_obj = datetime.strptime(publishDate_, "%Y年%m月%d日")
-            publishDate = date_obj.strftime('%Y-%m-%d')
+    url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'
+    request = requests.get(url=url, headers=header)
+    soup = BeautifulSoup(request.content, 'html.parser')
+    # print(soup)
+    request.encoding = request.apparent_encoding
+    # print(soup)
+    # info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
+    info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')
+    flag = 1
+    for info_html in info_html_list:
+        if flag == 1:
+            info_code = 'IN-20230816-0004'
+            sid = '1691633319715676162'
+        else:
+            sid = '1691633869186277378'
+            info_code = 'IN-20230816-0005'
+        ul_list = info_html.find('ul', class_='ul_list').find_all('li')
+        for ul in ul_list[::-1]:
+            publishDate_ = str(ul.find('span').text)
+            date_obj = datetime.strptime(publishDate_, "%Y年%m月%d日")
+            publishDate = date_obj.strftime('%Y-%m-%d')
...
@@ -51,18 +85,27 @@ if __name__ == "__main__":
             newsUrl = ul.find('a')['href']
             summary = ul.find('a').text
             # todo: deduplicate links (链接判重)
-            news_request = requests.get(url=newsUrl, headers=headers)
-            news_soup = BeautifulSoup(news_request.content, 'html.parser')
-            print(news_soup)
-            title = news_soup.find('h1', class_='big_title').text
-            source = news_soup.find('div', class_='title_bottom').find('i').text
-            contentwithTag = news_soup.find('div', class_='word')
-            content = contentwithTag.text
-            if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
-                sid = '1691633319715676162'
-            else:
-                sid = '1691633869186277378'
+            try:
+                flag = r.sismember(info_code, newsUrl)
+                if flag:
+                    log.info('信息已采集入库过')
+                    continue
+            except Exception as e:
+                continue
+            news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
+            news_soup = BeautifulSoup(news_request.content, 'html.parser')
+            # print(news_soup)
+            try:
+                title = news_soup.find('h1', class_='big_title').text
+                source = news_soup.find('div', class_='title_bottom').find('i').text
+                contentwithTag = news_soup.find('div', class_='word')
+                content = contentwithTag.text
+            except Exception as e:
+                log.error(f'解析网页出错{newsUrl}')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_info = {
                 'id': '1681549361661489154' + str(int(time.time() * 1000)),
                 'title': title,
...
@@ -79,6 +122,7 @@ if __name__ == "__main__":
                 'createDate': time_now,
             }
+            r.sadd(info_code, newsUrl)
             producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
             try:
                 kafka_result = producer.send("research_center_fourth",
...
@@ -90,3 +134,4 @@ if __name__ == "__main__":
                 print('发送kafka异常!')
             finally:
                 producer.close()
+        flag += 1
\ No newline at end of file
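
The main behavioral addition in this file is Redis-set dedup of article URLs: sismember is checked before fetching, and sadd records the URL only after the article has been parsed and queued to Kafka. Note that the committed code reuses the name flag for both the column counter and the sismember result, which clobbers the counter after the first article. A minimal isolated sketch of the pattern (the URL is hypothetical) with the check renamed to avoid that collision:

import redis

r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)

info_code = 'IN-20230816-0004'                  # one dedup set per meeting column
newsUrl = 'https://news.12371.cn/example.html'  # hypothetical article URL

if r.sismember(info_code, newsUrl):
    print('already collected, skip')            # plays the role of log.info('信息已采集入库过')
else:
    # ... fetch, parse, and send the article to Kafka here ...
    r.sadd(info_code, newsUrl)                  # mark as collected only after success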
comData/weixin_solo/get_tokenCookies.py
...
@@ -56,7 +56,7 @@ if __name__=="__main__":
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
     # adjustable wait (可改动)
-    time.sleep(20)
+    time.sleep(80)
     s = requests.session()
     # grab the token and cookies (获取到token和cookies)
...
test.py
...
@@ -170,5 +170,71 @@ for data in datas:
     # f.write(dic_info_)
     # break
     # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
-    req = requests.post('http://117.78.23.14:5001/translate', data=dic_info_, headers=headers)
+    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
     log.info(req.text)
+
+# import re, datetime
+#
+# def paserTime(publishtime):
+#     timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
+#     current_datetime = datetime.datetime.now()
+#     publishtime = publishtime.strip()
+#     print(publishtime)
+#
+#     try:
+#         if '年前' in publishtime:
+#             numbers = re.findall(r'\d+', publishtime)
+#             day = int(numbers[0])
+#             delta = datetime.timedelta(days=365 * day)
+#             publishtime = current_datetime - delta
+#         elif '月前' in publishtime:
+#             numbers = re.findall(r'\d+', publishtime)
+#             day = int(numbers[0])
+#             delta = datetime.timedelta(months=day)
+#             publishtime = current_datetime - delta
+#         elif '周前' in publishtime:
+#             numbers = re.findall(r'\d+', publishtime)
+#             day = int(numbers[0])
+#             delta = datetime.timedelta(weeks=day)
+#             publishtime = current_datetime - delta
+#         elif '天前' in publishtime:
+#             numbers = re.findall(r'\d+', publishtime)
+#             day = int(numbers[0])
+#             delta = datetime.timedelta(days=day)
+#             publishtime = current_datetime - delta
+#         elif '前天' in publishtime:
+#             delta = datetime.timedelta(days=2)
+#             publishtime = current_datetime - delta
+#         elif '昨天' in publishtime:
+#             current_datetime = datetime.datetime.now()
+#             delta = datetime.timedelta(days=1)
+#             publishtime = current_datetime - delta
+#         elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
+#             if '小时' in publishtime:
+#                 hour = publishtime.split("小时")[0]
+#             else:
+#                 hour = 0
+#             if hour != 0:
+#                 min = publishtime.split("小时")[1].split("分钟")[0]
+#             else:
+#                 min = publishtime.split("分钟")[0]
+#
+#             delta = datetime.timedelta(hours=int(hour), minutes=int(min))
+#             publishtime = current_datetime - delta
+#         elif '年' in publishtime and '月' in publishtime:
+#             time_format = '%Y年%m月%d日'
+#             publishtime = datetime.datetime.strptime(publishtime, time_format)
+#         elif '月' in publishtime and '日' in publishtime:
+#             current_year = current_datetime.year
+#             time_format = '%Y年%m月%d日'
+#             publishtime = str(current_year) + '年' + publishtime
+#             publishtime = datetime.datetime.strptime(publishtime, time_format)
+#     except Exception as e:
+#         print('时间解析异常!!')
+#     return publishtime
+#
+# if __name__ == "__main__":
+#     publishtime_ = '1小时17分钟前'
+#     publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
+#     print(publish_time)
\ No newline at end of file
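
One caveat on the commented-out paserTime helper added above: datetime.timedelta has no months= keyword, so the '月前' ("N months ago") branch would raise a TypeError if the block were ever uncommented. A hedged fix is to approximate a month in days, for example:

import datetime
import re

publishtime = '3月前'  # hypothetical "N months ago" input
months = int(re.findall(r'\d+', publishtime)[0])
# timedelta accepts weeks/days/hours/minutes/..., but not months;
# approximate one month as 30 days
publish_dt = datetime.datetime.now() - datetime.timedelta(days=30 * months)
print(publish_dt.strftime('%Y-%m-%d'))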