Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
7bf6c3cf
提交
7bf6c3cf
authored
1月 25, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
1/24
上级
0ce37662
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
57 行增加
和
56 行删除
+57
-56
BaseCore.py
base/BaseCore.py
+1
-1
resentYanbao.py
comData/YanBao/resentYanbao.py
+1
-1
gwyfile.py
comData/policylaw/gwyfile.py
+1
-1
gwyparts.py
comData/policylaw/gwyparts.py
+1
-1
qiushi_leaderspeech.py
qiushi_leaderspeech.py
+8
-8
test.py
百度采集/baidu_comm/test.py
+45
-44
没有找到文件。
base/BaseCore.py
浏览文件 @
7bf6c3cf
...
@@ -932,7 +932,7 @@ class BaseCore:
...
@@ -932,7 +932,7 @@ class BaseCore:
# 发送邮箱地址
# 发送邮箱地址
sender
=
'1195236739@qq.com'
sender
=
'1195236739@qq.com'
# 接收邮箱地址
# 接收邮箱地址
receiver
=
'
1007765445@qq.com
'
receiver
=
'
fujunxue@ciglobal.cn
'
smtpserver
=
'smtp.qq.com'
smtpserver
=
'smtp.qq.com'
# 发送邮箱登录 账户 密码
# 发送邮箱登录 账户 密码
username
=
'1195236739@qq.com'
username
=
'1195236739@qq.com'
...
...
comData/YanBao/resentYanbao.py
浏览文件 @
7bf6c3cf
...
@@ -902,7 +902,7 @@ def qianyanzhishiku():
...
@@ -902,7 +902,7 @@ def qianyanzhishiku():
def
shijiejingjiluntan
():
def
shijiejingjiluntan
():
allnum
=
{
'一'
:
'01'
,
'二'
:
'02'
,
'三'
:
'03'
,
'四'
:
'04'
,
'五'
:
'05'
,
'六'
:
'06'
,
'七'
:
'07'
,
'八'
:
'08'
,
'九'
:
'09'
,
'十'
:
'10'
,
'十一'
:
'11'
,
'十二'
:
'12'
}
allnum
=
{
'一'
:
'01'
,
'二'
:
'02'
,
'三'
:
'03'
,
'四'
:
'04'
,
'五'
:
'05'
,
'六'
:
'06'
,
'七'
:
'07'
,
'八'
:
'08'
,
'九'
:
'09'
,
'十'
:
'10'
,
'十一'
:
'11'
,
'十二'
:
'12'
}
for
i
in
range
(
2
,
3
):
for
i
in
range
(
1
,
3
):
# res = requests.get(url)
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
# soup = BeautifulSoup(res.content,'html.parser')
...
...
comData/policylaw/gwyfile.py
浏览文件 @
7bf6c3cf
...
@@ -169,7 +169,7 @@ def get_content1():
...
@@ -169,7 +169,7 @@ def get_content1():
'id'
:
''
,
#
'id'
:
''
,
#
'labels'
:
[{
'relationId'
:
"1766"
,
'relationName'
:
"国务院文件"
,
'labelMark'
:
"policy"
}],
'labels'
:
[{
'relationId'
:
"1766"
,
'relationName'
:
"国务院文件"
,
'labelMark'
:
"policy"
}],
# 关联标签id 关联标签名称 关联标签标识
# 关联标签id 关联标签名称 关联标签标识
'origin'
:
''
,
# 政策发布机关
'origin'
:
'
中华人民共和国中央人民政府
'
,
# 政策发布机关
'organ'
:
pub_org
,
# 政策发文机关
'organ'
:
pub_org
,
# 政策发文机关
'topicClassification'
:
child_type
,
# 政策文件分类
'topicClassification'
:
child_type
,
# 政策文件分类
'issuedNumber'
:
pub_code
,
# 发文字号
'issuedNumber'
:
pub_code
,
# 发文字号
...
...
comData/policylaw/gwyparts.py
浏览文件 @
7bf6c3cf
...
@@ -151,7 +151,7 @@ def get_content2():
...
@@ -151,7 +151,7 @@ def get_content2():
'id'
:
''
,
#
'id'
:
''
,
#
'labels'
:
[{
'relationId'
:
"1699"
,
'relationName'
:
"国务院各部委文件"
,
'labelMark'
:
"policy"
}],
'labels'
:
[{
'relationId'
:
"1699"
,
'relationName'
:
"国务院各部委文件"
,
'labelMark'
:
"policy"
}],
# 关联标签id 关联标签名称 关联标签标识
# 关联标签id 关联标签名称 关联标签标识
'origin'
:
''
,
# 政策发布机关
'origin'
:
'
中华人民共和国中央人民政府
'
,
# 政策发布机关
'organ'
:
pub_org
,
# 政策发文机关
'organ'
:
pub_org
,
# 政策发文机关
'topicClassification'
:
child_type
,
# 政策文件分类
'topicClassification'
:
child_type
,
# 政策文件分类
'issuedNumber'
:
pub_code
,
# 发文字号
'issuedNumber'
:
pub_code
,
# 发文字号
...
...
qiushi_leaderspeech.py
浏览文件 @
7bf6c3cf
...
@@ -119,15 +119,15 @@ if __name__=='__main__':
...
@@ -119,15 +119,15 @@ if __name__=='__main__':
# or '中共' in author or '记者' in author or '新闻社' in author\
# or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
if
'('
in
author
or
'本刊'
in
author
\
# if '(' in author or '本刊' in author \
or
'记者'
in
author
or
'新闻社'
in
author
\
# or '记者' in author or '新闻社' in author \
or
'”'
in
author
\
# or '”' in author\
or
'大学'
in
author
or
'洛桑江村'
in
author
:
# or '大学' in author or '洛桑江村' in author:
continue
# if '国资委党委' in author:
# pass
# else:
# continue
# continue
if
'国资委党委'
in
author
:
pass
else
:
continue
new_href
=
new
.
find
(
'a'
)[
'href'
]
new_href
=
new
.
find
(
'a'
)[
'href'
]
is_member
=
r
.
sismember
(
'qiushileaderspeech_two::'
+
period_title
,
new_href
)
is_member
=
r
.
sismember
(
'qiushileaderspeech_two::'
+
period_title
,
new_href
)
if
is_member
:
if
is_member
:
...
...
百度采集/baidu_comm/test.py
浏览文件 @
7bf6c3cf
# from baiduSpider import BaiduSpider
from
baiduSpider
import
BaiduSpider
# from baiduSpider import BaiduSpider
from
baiduSpider
import
BaiduSpider
# searchkw, wordsCode, sid = '', '', ''
searchkw
,
wordsCode
,
sid
=
''
,
''
,
''
# baidu = BaiduSpider(searchkw, wordsCode, sid)
baidu
=
BaiduSpider
(
searchkw
,
wordsCode
,
sid
)
import
requests
import
requests
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# url = 'https://baijiahao.baidu.com/s?id=1784907851792547880&wfr=spider&for=pc'
# title = '“一带一路”商学院联盟副秘书长解奕炯:临沂在国际化物流建设中一定能“先行一步”'
url
=
'https://www.163.com/dy/article/IKD3M2P20514IPKH.html'
# try:
title
=
'“一带一路”商学院联盟副秘书长解奕炯:临沂在国际化物流建设中一定能“先行一步”'
# detailurl = url
try
:
# title = title
detailurl
=
url
# content, contentWithTag = baidu.extractorMsg(detailurl, title)
title
=
title
# contentWithTag = baidu.rmTagattr(contentWithTag, detailurl)
content
,
contentWithTag
=
baidu
.
extractorMsg
(
detailurl
,
title
)
# except Exception as e:
contentWithTag
=
baidu
.
rmTagattr
(
contentWithTag
,
detailurl
)
# content = ''
except
Exception
as
e
:
# contentWithTag = ''
content
=
''
#
contentWithTag
=
''
#
# detailmsg = {
# 'title': title,
# 'detailurl': url,
# 'content': content,
# 'contentHtml': contentWithTag,
# }
# print(detailmsg)
headers
=
{
'Accept'
:
'*/*'
,
'Accept-Encoding'
:
'gzip, deflate, br'
,
'Accept-Language'
:
'zh-CN,zh-TW;q=0.9,zh;q=0.8'
,
'Cache-Control'
:
'no-cache'
,
'Connection'
:
'keep-alive'
,
'Host'
:
'search-api-web.eastmoney.com'
,
'Pragma'
:
'no-cache'
,
'Sec-Fetch-Dest'
:
'script'
,
'Sec-Fetch-Mode'
:
'no-cors'
,
'Sec-Fetch-Site'
:
'same-site'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'sec-ch-ua'
:
'"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"'
,
'sec-ch-ua-mobile'
:
'?0'
,
'sec-ch-ua-platform'
:
'"Windows"'
}
url
=
'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=
%7
B
%22
uid
%22%3
A
%22%22%2
C
%22
keyword
%22%3
A
%22%
E7
%
A7
%91%
E8
%
BE
%
BE
%
E8
%87%
AA
%
E6
%8
E
%
A7
%22%2
C
%22
type
%22%3
A
%5
B
%22
researchReport
%22%5
D
%2
C
%22
client
%22%3
A
%22
web
%22%2
C
%22
clientVersion
%22%3
A
%22
curr
%22%2
C
%22
clientType
%22%3
A
%22
web
%22%2
C
%22
param
%22%3
A
%7
B
%22
researchReport
%22%3
A
%7
B
%22
client
%22%3
A
%22
web
%22%2
C
%22
pageSize
%22%3
A10
%2
C
%22
pageIndex
%22%3
A1
%7
D
%7
D
%7
D&_=1702455623970'
# res = requests.get(url).text[1:-1]
res
=
requests
.
get
(
url
=
url
,
headers
=
headers
)
res_json
=
res
.
text
print
(
res_json
)
detailmsg
=
{
\ No newline at end of file
'title'
:
title
,
'detailurl'
:
url
,
'content'
:
content
,
'contentHtml'
:
contentWithTag
,
}
print
(
detailmsg
)
# headers = {
# 'Accept': '*/*',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'search-api-web.eastmoney.com',
# 'Pragma': 'no-cache',
# 'Sec-Fetch-Dest': 'script',
# 'Sec-Fetch-Mode': 'no-cors',
# 'Sec-Fetch-Site': 'same-site',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"'
# }
# url = 'https://search-api-web.eastmoney.com/search/jsonp?cb=jQuery35103326233792363984_1702455623969&param=%7B%22uid%22%3A%22%22%2C%22keyword%22%3A%22%E7%A7%91%E8%BE%BE%E8%87%AA%E6%8E%A7%22%2C%22type%22%3A%5B%22researchReport%22%5D%2C%22client%22%3A%22web%22%2C%22clientVersion%22%3A%22curr%22%2C%22clientType%22%3A%22web%22%2C%22param%22%3A%7B%22researchReport%22%3A%7B%22client%22%3A%22web%22%2C%22pageSize%22%3A10%2C%22pageIndex%22%3A1%7D%7D%7D&_=1702455623970'
# # res = requests.get(url).text[1:-1]
# res = requests.get(url=url, headers=headers)
#
# res_json = res.text
# print(res_json)
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论