王景浩 / zzsn_spider · Commits

Commit b52e4502, authored Feb 26, 2024 by 薛凌堃
Commit message: 2/26
Parent: ca40e9aa

Showing 6 changed files with 207 additions and 34 deletions (+207 −34)
base/研究中心需更新企业.py                        +62  −0
comData/BaseInfo_qcc/test.py                      +15  −17
comData/BaseInfo_qcc/test_1.py                    +0   −0
comData/important_meeting/zyqmshggldxzhy19.py     +61  −15
comData/weixin_solo/get_tokenCookies.py           +1   −1
test.py                                           +68  −1
base/研究中心需更新企业.py (new file, mode 100644)

from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import redis


def putCom():
    com_list = [
        '91210000558190456G', '914200001000115161', '911100007109310534', '9111000071093123XX',
        '91110000100017643K', '91110000100018267J', '91110000MA01P657XY', '91230100127057741M',
        '91440300190346175T', 'ZZSN22083000000003', '91110000400000720M', '911100001055722912',
        '91110000100005220B', '911100001000094165', '91310000132200821H', '911100001000128855',
        '91110000710924910P', '91110000710924929L', '911100007109225442', '9111000071092649XU',
        '91310000MA1FL70B67', '911100007109311097', '912201011239989159', '911100007178306183',
        '91310000MA7ALG04XG', '91110000100017707H', '91110000710929498G', '91110000100010249W',
        '9151000062160427XG', '91310000MA1FL4B24G', '91110000400001889L', '9144030010001694XX',
        '91110000100000825Q', '91110000100006194G', '91110000717828315T', '91110000100001043E',
        '91110000MA005UCQ5P', '91110000710935732K', '91110000710930392Y', '91110000710930296M',
        '911100007109303176', '91110000710925243K', '91110000100014071Q', '91110000100009563N',
        '9111000071093107XN', '9111000010001002XD', '91110000100001852R', '91110000100001625L',
        '911100001000080343', '91110000400008060U', '91110000101699383Q', '91110000100000489L',
        '9111000071092868XL', '91110000100001035K', '911100004000011410', '91110000710933809D',
        '91110000100010310K', '91133100MABRLCFR5Q', '91110000MA001HYK9X', '911100001000016682',
        '911100007109279199', '12100000400010275N', '91110000710935636A', '91110000100024800K',
        '9144000076384341X8', '91440000100005896P', '91110000MA01W8B394', '91110000717830650E',
        '91110000100003057A', 'ZZSN22061600000001', '91310000MA1FL0LX06', '9111000010169286X1',
        '91110000100010433L', '91110000100010660R', '91110000102016548J', '91110000100001676W',
        '9111000071092200XY', '91133100MA0G9YKT8B', '9111000010000093XR', '91110000100006485K',
        '91360702MA7FK4MR44', '91420100MA4L0GG411', '91110000101625149Q', '12100000400006022G',
        '912302001285125661', '91110000100005888C', '911100007109250324', '91110000100024915R',
        '9111000040000094XW', '91310000MA1FL1MMXL', '91110000100015058K', '91110000710929930X',
        '91133100MA0GBL5F38', '9111000010000085X6', '91110000101100414N'
    ]
    df = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx')
    # Connect to the Redis database
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
    for i in range(len(df)):
        social_code = df['social_code'][i]
        com_name = df['name'][i]
        # print(social_code)
        if social_code in com_list:
            pass
        else:
            if 'ZZSN' in social_code or 'ZD' in social_code:
                continue
            else:
                item = social_code + '|' + com_name
                r.rpush('UpdateBasdeInfo:SocialCode_CompanyName', item)


def putCom_task():
    # Instantiate a scheduler
    scheduler = BlockingScheduler()
    # Run once a month, on the 1st at 00:00
    scheduler.add_job(putCom, 'cron', day=1, hour=0, minute=0)
    try:
        # redisPushData: run once before the schedule starts
        # putCom()
        scheduler.start()
    except Exception as e:
        print('定时采集异常', e)
        pass


if __name__ == '__main__':
    putCom_task()
\ No newline at end of file
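Each queued item packs the social credit code and the company name into one string joined by '|'. For orientation, a minimal sketch of a consumer on the other end of this queue might look like the following; the blocking pop and the unpacking are assumptions, since the downstream worker is not part of this commit:

import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

# Hypothetical consumer: block until an item arrives, then split it back
# into its two fields. The key name matches the producer above.
while True:
    _, raw = r.blpop('UpdateBasdeInfo:SocialCode_CompanyName')
    social_code, com_name = raw.decode('utf-8').split('|', 1)
    print(social_code, com_name)  # hand the pair off to the update job here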
comData/BaseInfo_qcc/test.py

import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis

# Connect to MySQL and Redis
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK',
                      db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

with cnx.cursor() as cursor:
    select = """select relationName, relationId from klb_company"""
    cursor.execute(select)
    results = cursor.fetchall()
    for result in results:
        name = result[0]
        xydm = result[1]
        item = f'{name}|{xydm}'
        r.rpush('SousuoBaidu:companyname', item)  # fixed: was cell_value, which is undefined

# List to deduplicate
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# Fetch every element in the list
elements = r.lrange(list_name, 0, -1)
# Walk the elements and collapse duplicates
for element in elements:
    # LREM with count=0 removes every occurrence and returns how many were removed
    removed = r.lrem(list_name, 0, element)
    if removed >= 1:
        # Push a single copy back so each value appears exactly once.
        # (The original `if count > 1: lrem(count - 1)` logic could never work:
        # the first LREM had already deleted all copies, emptying the list.)
        r.rpush(list_name, element)
# Print the deduplicated list
print(r.lrange(list_name, 0, -1))
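Since the goal here is uniqueness, a Redis set avoids the remove-and-re-push dance entirely. A sketch, under the assumption that element order in the list does not matter downstream; the scratch key name is made up:

import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
tmp_set = list_name + ':dedup'  # hypothetical scratch key

# SADD drops duplicates for free; rebuild the list from the set afterwards.
for element in r.lrange(list_name, 0, -1):
    r.sadd(tmp_set, element)
r.delete(list_name)
for member in r.smembers(tmp_set):
    r.rpush(list_name, member)
r.delete(tmp_set)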
comData/BaseInfo_qcc/test_1.py (new file, mode 100644)

Diff collapsed on the page; content not shown.
comData/important_meeting/zyqmshggldxzhy19.py

 # 中央全面深化改革委员会会议
 import json
 import sys
 import time

 import redis
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
 from kafka import KafkaProducer

-headers = {
+sys.path.append('D:\\kkwork\\zzsn_spider\\base')
+import BaseCore
+
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+
+header = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'Accept-Encoding': 'gzip, deflate, br',
     'Accept-Language': 'zh-CN,zh;q=0.9',
...
@@ -26,22 +32,50 @@ headers = {
     'sec-ch-ua-mobile': '?0',
     'sec-ch-ua-platform': '"Windows"'
 }
+headers = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+    'Connection': 'keep-alive',
+    'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
+    'Host': 'news.12371.cn',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'none',
+    'Sec-Fetch-User': '?1',
+    'Upgrade-Insecure-Requests': '1',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
+    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"'
+}

 if __name__ == "__main__":
     # 中央全面深化改革委员会会议
     r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
     # 中央全面深化改革领导小组会议
     # url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
-    url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
-    for url in url_list:
-        request = requests.get(url=url, headers=headers)
+    url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'
+    request = requests.get(url=url, headers=header)
     soup = BeautifulSoup(request.content, 'html.parser')
     # print(soup)
     request.encoding = request.apparent_encoding
     # print(soup)
-    info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
-    ul_list = info_html.find_all('li')
-    for ul in ul_list:
+    # info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
+    info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')
+    flag = 1
+    for info_html in info_html_list:
+        if flag == 1:
+            info_code = 'IN-20230816-0004'
+            sid = '1691633319715676162'
+        else:
+            sid = '1691633869186277378'
+            info_code = 'IN-20230816-0005'
+        ul_list = info_html.find('ul', class_='ul_list').find_all('li')
+        for ul in ul_list[::-1]:
             publishDate_ = str(ul.find('span').text)
             date_obj = datetime.strptime(publishDate_, "%Y年%m月%d日")
             publishDate = date_obj.strftime('%Y-%m-%d')
...
@@ -51,18 +85,27 @@ if __name__ == "__main__":
             newsUrl = ul.find('a')['href']
             summary = ul.find('a').text
             # todo: link deduplication
-            news_request = requests.get(url=newsUrl, headers=headers)
+            try:
+                # note: reassigning `flag` here clobbers the page counter set above
+                flag = r.sismember(info_code, newsUrl)
+                if flag:
+                    log.info('信息已采集入库过')
+                    continue
+            except Exception as e:
+                continue
+            news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
             news_soup = BeautifulSoup(news_request.content, 'html.parser')
-            print(news_soup)
+            # print(news_soup)
             try:
                 title = news_soup.find('h1', class_='big_title').text
                 source = news_soup.find('div', class_='title_bottom').find('i').text
                 contentwithTag = news_soup.find('div', class_='word')
                 content = contentwithTag.text
-                if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
-                    sid = '1691633319715676162'
-                else:
-                    sid = '1691633869186277378'
             except Exception as e:
                 log.error(f'解析网页出错{newsUrl}')
                 continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_info = {
                 'id': '1681549361661489154' + str(int(time.time() * 1000)),
                 'title': title,
...
@@ -79,6 +122,7 @@ if __name__ == "__main__":
                 'createDate': time_now,
             }
+            r.sadd(info_code, newsUrl)
             producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
             try:
                 kafka_result = producer.send("research_center_fourth",
...
@@ -90,3 +134,4 @@ if __name__ == "__main__":
                 print('发送kafka异常!')
             finally:
                 producer.close()
+        flag += 1
\ No newline at end of file
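The listing dates on 12371.cn come in the Chinese "%Y年%m月%d日" format that the loop above parses. A quick standalone check of that conversion, with a made-up sample string:

from datetime import datetime

# Hypothetical sample in the site's format; %m and %d accept single digits too
publishDate_ = '2024年2月26日'
date_obj = datetime.strptime(publishDate_, '%Y年%m月%d日')
print(date_obj.strftime('%Y-%m-%d'))  # -> 2024-02-26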
comData/weixin_solo/get_tokenCookies.py

...
@@ -56,7 +56,7 @@ if __name__=="__main__":
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
     # adjustable
-    time.sleep(20)
+    time.sleep(80)
     s = requests.session()
     # grab the token and cookies
...
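The fixed wait was widened from 20 s to 80 s, presumably to leave more time for the manual QR-code login. A polling loop is a more robust alternative; this is only a sketch, and the `token=` check used to detect a completed login is an assumption about the post-login URL:

import time

def wait_for_login(browser, timeout=120, poll=2):
    """Poll the browser until the mp.weixin.qq.com login completes, or time out."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        # assumption: after a successful scan-login the URL carries a token parameter
        if 'token=' in browser.current_url:
            return True
        time.sleep(poll)
    return False

Calling wait_for_login(browser) would then replace the fixed time.sleep(80).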
test.py

...
@@ -170,5 +170,71 @@ for data in datas:
 #     f.write(dic_info_)
 #     break
 # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
-req = requests.post('http://117.78.23.14:5001/translate', data=dic_info_, headers=headers)
+req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
 log.info(req.text)
# import re, datetime
#
#
# def paserTime(publishtime):
# timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
# current_datetime = datetime.datetime.now()
# publishtime = publishtime.strip()
# print(publishtime)
#
# try:
# if '年前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(days=365 * day)
# publishtime = current_datetime - delta
# elif '月前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(months=day)
# publishtime = current_datetime - delta
# elif '周前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(weeks=day)
# publishtime = current_datetime - delta
# elif '天前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(days=day)
# publishtime = current_datetime - delta
# elif '前天' in publishtime:
# delta = datetime.timedelta(days=2)
# publishtime = current_datetime - delta
# elif '昨天' in publishtime:
# current_datetime = datetime.datetime.now()
# delta = datetime.timedelta(days=1)
# publishtime = current_datetime - delta
# elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
# if '小时' in publishtime:
# hour = publishtime.split("小时")[0]
# else:
# hour = 0
# if hour != 0:
# min = publishtime.split("小时")[1].split("分钟")[0]
# else:
# min = publishtime.split("分钟")[0]
#
# delta = datetime.timedelta(hours=int(hour), minutes=int(min))
# publishtime = current_datetime - delta
# elif '年' in publishtime and '月' in publishtime:
# time_format = '%Y年%m月%d日'
# publishtime = datetime.datetime.strptime(publishtime, time_format)
# elif '月' in publishtime and '日' in publishtime:
# current_year = current_datetime.year
# time_format = '%Y年%m月%d日'
# publishtime = str(current_year) + '年' + publishtime
# publishtime = datetime.datetime.strptime(publishtime, time_format)
# except Exception as e:
# print('时间解析异常!!')
# return publishtime
#
# if __name__ == "__main__":
# publishtime_ = '1小时17分钟前'
# publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
# print(publish_time)
\ No newline at end of file
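One caveat in the commented-out paserTime draft above: datetime.timedelta accepts weeks, days, hours, and minutes, but has no months argument, so the '月前' branch would raise a TypeError if ever uncommented. A sketch of that branch using a 30-day-per-month approximation, keeping it within the standard library:

import datetime
import re

# '月前' (months ago) branch with a 30-day-per-month approximation,
# since datetime.timedelta(months=...) does not exist.
def months_ago(publishtime, now=None):
    now = now or datetime.datetime.now()
    months = int(re.findall(r'\d+', publishtime)[0])
    return now - datetime.timedelta(days=30 * months)

print(months_ago('3月前').strftime('%Y-%m-%d'))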