王景浩 / zzsn_spider / Commits / b3fa91e8

Commit b3fa91e8, authored Mar 21, 2024 by LiuLiYuan
Commit message: REITs 政策法规 03/21 (REITs policies and regulations, 03/21)
Parent: 88209302

Showing 22 changed files with 276 additions and 88 deletions.
REITs_policyData/policy_beijing.py        +9   -6
REITs_policyData/policy_fujian.py         +13  -3
REITs_policyData/policy_guangdong.py      +1   -7
REITs_policyData/policy_guangxi.py        +49  -15
REITs_policyData/policy_hainan.py         +6   -3
REITs_policyData/policy_heilongjiang.py   +15  -12
REITs_policyData/policy_hubei.py          +8   -4
REITs_policyData/policy_jiangsu.py        +10  -4
REITs_policyData/policy_jiangxi.py        +1   -1
REITs_policyData/policy_jilin.py          +6   -3
REITs_policyData/policy_liaoning.py       +6   -3
REITs_policyData/policy_neimenggu.py      +6   -3
REITs_policyData/policy_shandong.py       +6   -3
REITs_policyData/policy_shanghai.py       +6   -3
REITs_policyData/policy_shanxi.py         +6   -3
REITs_policyData/policy_sichuan.py        +6   -3
REITs_policyData/policy_tianjin.py        +6   -3
REITs_policyData/policy_yunnan.py         +6   -3
REITs_policyData/policy_zhejiang.py       +6   -3
REITs_policyData/reits.py                 +4   -2
REITs专题数据/FundAnncmnt-hkex.py          +3   -1
REITs专题数据/cushman.py (new file)        +97  -0
REITs_policyData/policy_beijing.py

@@ -15,8 +15,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '北京市人民政府'
+topic = 'research_center_fourth'
+webname = '北京市人民政府_'
 class Policy1():
     @retry(tries=3, delay=10)

@@ -282,14 +282,17 @@ def beijing():
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         if content == '':
             continue
+        lang = baseCore.detect_language(content)
         dic_info = {
             'attachmentIds': id_list,
+            'subjectId': '1729315113088765953',
+            'lang': lang,
             'author': '',
             'content': content,
             'contentWithTag': contentWithTag_str,
             'deleteFlag': 0,
             'checkStatus': 1,
-            'id': '',
+            'id': '1729315113088765953' + str(int(time.time())),
             'title': title,
             'publishDate': publishDate,
             'origin': origin,

@@ -312,6 +315,6 @@ def beijing():
         time.sleep(random.randint(10, 20))
         num += 1
-# if __name__ == '__main__':
-#     beijing()
-#     baseCore.close()
+if __name__ == '__main__':
+    beijing()
+    baseCore.close()
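Across these hunks the commit tags every record with a fixed subjectId, runs language detection on the extracted text, and replaces the previously empty 'id' with the subject id concatenated with a Unix timestamp. A minimal stdlib-only sketch of that id scheme (detect_language is the project's own helper on baseCore and is not reproduced here):

    import time

    SUBJECT_ID = '1729315113088765953'  # subject the records are filed under

    def make_record_id() -> str:
        # Same construction as the diff: subject id + whole-second Unix time.
        # Two records created within the same second get identical ids, so a
        # tight insert loop may need a finer-grained suffix.
        return SUBJECT_ID + str(int(time.time()))

    print(make_record_id())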
REITs_policyData/policy_fujian.py
浏览文件 @
b3fa91e8
import
json
import
time
import
time
import
os
import
os
...
@@ -12,8 +13,8 @@ log = baseCore.getLogger()
...
@@ -12,8 +13,8 @@ log = baseCore.getLogger()
from
reits
import
Policy
from
reits
import
Policy
policy
=
Policy
()
policy
=
Policy
()
topic
=
'
policy
'
topic
=
'
research_center_fourth
'
webname
=
'福建省人民政府'
webname
=
'福建省人民政府
_
'
headers
=
{
headers
=
{
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
,
'X-Requested-With'
:
'XMLHttpRequest'
,
'X-Requested-With'
:
'XMLHttpRequest'
,
...
@@ -66,6 +67,7 @@ def getContent(num, url, publishDate):
...
@@ -66,6 +67,7 @@ def getContent(num, url, publishDate):
style
.
decompose
()
style
.
decompose
()
except
:
except
:
pass
pass
try
:
a_list
=
soup
.
find
(
'div'
,
class_
=
'xl_list1'
)
.
find_all
(
'a'
)
a_list
=
soup
.
find
(
'div'
,
class_
=
'xl_list1'
)
.
find_all
(
'a'
)
for
a
in
a_list
:
for
a
in
a_list
:
fj_href
=
a
.
get
(
'href'
)
fj_href
=
a
.
get
(
'href'
)
...
@@ -82,6 +84,8 @@ def getContent(num, url, publishDate):
...
@@ -82,6 +84,8 @@ def getContent(num, url, publishDate):
if
att_id
:
if
att_id
:
id_list
.
append
(
att_id
)
id_list
.
append
(
att_id
)
a
[
'href'
]
=
full_path
a
[
'href'
]
=
full_path
except
:
pass
content
=
contentWithTag
.
text
.
lstrip
()
.
strip
()
content
=
contentWithTag
.
text
.
lstrip
()
.
strip
()
...
@@ -116,7 +120,10 @@ def doJob():
...
@@ -116,7 +120,10 @@ def doJob():
for
data_post
in
data_posts
:
for
data_post
in
data_posts
:
data_json
=
getDataJson
(
data_post
)
data_json
=
getDataJson
(
data_post
)
for
data_
in
data_json
:
for
data_
in
data_json
:
try
:
title
=
data_
[
'_doctitle'
]
title
=
data_
[
'_doctitle'
]
except
:
title
=
data_
[
'doctitle'
]
publishDate
=
data_
[
'crtime'
]
.
replace
(
'.'
,
'-'
)
publishDate
=
data_
[
'crtime'
]
.
replace
(
'.'
,
'-'
)
origin
=
data_
[
'docsourcename'
]
origin
=
data_
[
'docsourcename'
]
href
=
data_
[
'docpuburl'
]
href
=
data_
[
'docpuburl'
]
...
@@ -142,14 +149,17 @@ def doJob():
...
@@ -142,14 +149,17 @@ def doJob():
content
,
contentWithTag
,
id_list
=
getContent
(
num
,
href
,
publishDate
[:
10
])
content
,
contentWithTag
,
id_list
=
getContent
(
num
,
href
,
publishDate
[:
10
])
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
contentWithTag_str
=
str
(
contentWithTag
)
contentWithTag_str
=
str
(
contentWithTag
)
lang
=
baseCore
.
detect_language
(
content
)
dic_info
=
{
dic_info
=
{
'attachmentIds'
:
id_list
,
'attachmentIds'
:
id_list
,
'subjectId'
:
'1729315113088765953'
,
'lang'
:
lang
,
'author'
:
''
,
'author'
:
''
,
'content'
:
content
,
'content'
:
content
,
'contentWithTag'
:
contentWithTag_str
,
'contentWithTag'
:
contentWithTag_str
,
'deleteFlag'
:
0
,
'deleteFlag'
:
0
,
'checkStatus'
:
1
,
'checkStatus'
:
1
,
'id'
:
'
'
,
'id'
:
'
1729315113088765953'
+
str
(
int
(
time
.
time
()))
,
'title'
:
title
,
'title'
:
title
,
'publishDate'
:
publishDate
,
'publishDate'
:
publishDate
,
'origin'
:
origin
,
'origin'
:
origin
,
...
...
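The fujian crawler now tolerates both '_doctitle' and 'doctitle' keys via try/except. The same fallback can be written without exception control flow; a sketch, with data_ standing in for one search-result record:

    def extract_title(data_: dict) -> str:
        # Prefer '_doctitle', fall back to 'doctitle'; raise KeyError if
        # neither exists, matching the try/except in the diff.
        title = data_.get('_doctitle')
        if title is None:
            title = data_['doctitle']
        return title

    print(extract_title({'doctitle': '关于REITs试点的通知'}))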
REITs_policyData/policy_guangdong.py

@@ -20,7 +20,7 @@ policy = Policy()
 topic = 'research_center_fourth'
-webname = '广东省人民政府'
+webname = '广东省人民政府_'
 headers = {
     'Content-Type': 'application/json',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',

@@ -225,10 +225,4 @@ def doJob():
 if __name__ == '__main__':
     doJob()
-    # doJob_1()
-    # doJob_2(2)
-    # url = 'http://www.gd.gov.cn/gkmlpt/content/4/4022/post_4022955.html#8'
-    # soup = getSoup(url)
-    #
-    # print(contentWithTag)
     baseCore.close()
REITs_policyData/policy_guangxi.py

@@ -12,11 +12,11 @@ baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '广西壮族自治区人民政府'
+topic = 'research_center_fourth'
+webname = '广西壮族自治区人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'Content-Type': 'application/json',

@@ -41,11 +41,27 @@ def getFjContent(url):
 def getTotal():
     ip = baseCore.get_proxy()
     url = 'http://www.gxzf.gov.cn/irs/front/search'
-    data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
-                 "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
-                 "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
-                 "pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
-                 "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    # data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
+    #              "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
+    #              "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
+    #              "pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
+    #              "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    data_post = {'advancedFilters': None,
+                 'appendixType': "",
+                 'code': "181aedaa542",
+                 'configCode': "",
+                 'dataTypeId': "241",
+                 'filters': [],
+                 'granularity': "ALL",
+                 'historySearchWords': [],
+                 'isAdvancedSearch': None,
+                 'isDefaultAdvanced': None,
+                 'isSearchForced': "0",
+                 'orderBy': "related",
+                 'pageNo': 1,
+                 'pageSize': 10,
+                 'searchBy': "all",
+                 'searchWord': "REITs"}
     data_post = json.dumps(data_post)
     req = requests.post(url, headers=headers, data=data_post, proxies=ip)
     req.encoding = req.apparent_encoding

@@ -55,11 +71,27 @@ def getTotal():
 def getDataJson(page):
     ip = baseCore.get_proxy()
     url = 'http://www.gxzf.gov.cn/irs/front/search'
-    data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
-                 "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
-                 "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
-                 "pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
-                 "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    # data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
+    #              "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
+    #              "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
+    #              "pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
+    #              "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    data_post = {'advancedFilters': None,
+                 'appendixType': "",
+                 'code': "181aedaa542",
+                 'configCode': "",
+                 'dataTypeId': "241",
+                 'filters': [],
+                 'granularity': "ALL",
+                 'historySearchWords': [],
+                 'isAdvancedSearch': None,
+                 'isDefaultAdvanced': None,
+                 'isSearchForced': "0",
+                 'orderBy': "related",
+                 'pageNo': page,
+                 'pageSize': 10,
+                 'searchBy': "all",
+                 'searchWord': "REITs"}
     data_post = json.dumps(data_post)
     req = requests.post(url, headers=headers, data=data_post, proxies=ip)
     req.encoding = req.apparent_encoding

@@ -117,14 +149,17 @@ def getData(data_, num):
     content, contentWithTag, id_list = getContent(href, publishDate, num)
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
         'author': '',
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,

@@ -163,7 +198,6 @@ def doJob():
         time.sleep(2)

 if __name__ == '__main__':
     doJob()
     baseCore.close()
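Both getTotal and getDataJson now build the search payload as an explicitly keyed dict and serialise it with json.dumps before POSTing. A self-contained sketch of that request, assuming the endpoint and payload keys shown in the diff; proxy handling via baseCore.get_proxy is omitted:

    import json
    import requests

    URL = 'http://www.gxzf.gov.cn/irs/front/search'
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'Content-Type': 'application/json',
    }

    def search_page(page: int) -> dict:
        payload = {
            'code': '181aedaa542', 'dataTypeId': '241', 'configCode': '',
            'searchWord': 'REITs', 'orderBy': 'related', 'searchBy': 'all',
            'appendixType': '', 'granularity': 'ALL', 'isSearchForced': '0',
            'filters': [], 'pageNo': page, 'pageSize': 10,
            'isAdvancedSearch': None, 'isDefaultAdvanced': None,
            'advancedFilters': None, 'historySearchWords': [],
        }
        # Serialising by hand and sending via data= matches the diff;
        # requests.post(URL, json=payload) would set the header and
        # encode the body in one step.
        resp = requests.post(URL, headers=HEADERS, data=json.dumps(payload))
        resp.encoding = resp.apparent_encoding
        return resp.json()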
REITs_policyData/policy_hainan.py

@@ -17,8 +17,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '海南省人民政府'
+topic = 'research_center_fourth'
+webname = '海南省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',

@@ -108,14 +108,17 @@ def getData(div, num):
         return
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': [],
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_heilongjiang.py

-#coding=utf-8
+# coding=utf-8
 import os
 import time

@@ -10,14 +10,14 @@ baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '黑龙江省人民政府'
+topic = 'research_center_fourth'
+webname = '黑龙江省人民政府_'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded',
-    'Token': '9a9ff46e-f534-43b8-bad1-063d80af7e51',
+    'Token': 'b946cd4e-77a4-42f5-bcaf-a9c4f26b5191',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
 }

@@ -26,11 +26,12 @@ def getDataJson():
     ip = baseCore.get_proxy()
     url = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'
     data_post = {
-        'sort': 'smartIndex',
-        'order': 'asc',
+        'sort': 'date',
+        'order': 'desc',
         'start': '0',
         'length': '20',
-        'filter.all': 'REITs'}
+        'filter.all': 'REITs',
+        'filter.tyoe': '0'}
     req = requests.post(url, headers=headers, data=data_post, proxies=ip)
     req.encoding = req.apparent_encoding

@@ -54,7 +55,7 @@ def getFjContent(url):
     return req.content

-def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
+def getContent(num, title, publishDate, summary, id, pub_hao, organ, type):
     id_list = []
     url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
     writtenDate = None

@@ -83,7 +84,7 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
     fj_title = fj_title.replace('<', '').replace('>', '')
     if category not in fj_title:
         fj_title = fj_title + category
     att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
     if att_id:
         id_list.append(att_id)
     a['href'] = full_path

@@ -104,14 +105,17 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
     content = soup.text.lstrip().strip()
     contentWithTag_str = str(soup)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,

@@ -135,7 +139,6 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
 def doJob():
     num = 1
     data_json = getDataJson()
     for data_ in data_json:

@@ -152,7 +155,7 @@ def doJob():
             organ = data_['unitShowName']
         except:
             organ = ''
         data = getContent(num, title, publishDate, summary, id, pub_hao, organ, type)
         # data_list.append(data)
         num += 1
         time.sleep(3)
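The heilongjiang crawler switches the search ordering from 'smartIndex'/'asc' to newest-first ('date'/'desc') and refreshes the Token header. A sketch of that form-encoded request; the token below is the one from the diff and presumably expires, so treat it as a placeholder:

    import requests

    URL = ('https://www.hlj.gov.cn/znwd/policy/policy/policy/'
           'home/public/policyWikipedia?_method=get')
    HEADERS = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Token': 'b946cd4e-77a4-42f5-bcaf-a9c4f26b5191',  # short-lived, from the diff
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }

    form = {
        'sort': 'date',      # was 'smartIndex'
        'order': 'desc',     # was 'asc'; newest results first
        'start': '0',
        'length': '20',
        'filter.all': 'REITs',
        'filter.tyoe': '0',  # key spelled this way in the source
    }
    # data= sends an application/x-www-form-urlencoded body, matching the header.
    resp = requests.post(URL, headers=HEADERS, data=form)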
REITs_policyData/policy_hubei.py

@@ -21,8 +21,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '湖北省人民政府'
+topic = 'research_center_fourth'
+webname = '湖北省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
 }

@@ -114,14 +114,17 @@ def getData(driver, data_, num):
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,

@@ -145,7 +148,8 @@ def getData(driver, data_, num):
 def doJob():
-    service = Service(r'D:/soft/geckodriver.exe')
+    # service = Service(r'D:/soft/geckodriver.exe')
+    service = Service(r'F:\spider\firefox\geckodriver_1.exe')
     options = Options()
     options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
     driver = webdriver.Firefox(options=options, service=service)
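The hubei job drives Firefox through Selenium with an overridden user-agent preference; only the geckodriver path changed in this commit. A minimal standalone sketch of that setup (the driver path is machine-specific and must point at a local geckodriver binary):

    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service

    GECKODRIVER = r'F:\spider\firefox\geckodriver_1.exe'  # machine-specific path

    options = Options()
    # Spoof the UA string for every request the browser makes.
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    )
    driver = webdriver.Firefox(options=options, service=Service(GECKODRIVER))
    try:
        driver.get('about:blank')
    finally:
        driver.quit()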
REITs_policyData/policy_jiangsu.py

@@ -17,8 +17,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '江苏省人民政府'
+topic = 'research_center_fourth'
+webname = '江苏省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',

@@ -85,14 +85,17 @@ def getContentA(url, num, publishDate, title, origin, summary):
     content = contentWithTag.text
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,

@@ -163,13 +166,16 @@ def getContentB(url, num, publishDate, title, origin, summary):
     content = contentWithTag.text.lstrip().strip()
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_jiangxi.py

@@ -16,7 +16,7 @@ policy = Policy()
 topic = 'research_center_fourth'
-webname = '江西省人民政府'
+webname = '江西省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'X-Requested-With': 'XMLHttpRequest',
REITs_policyData/policy_jilin.py

@@ -14,8 +14,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '吉林市人民政府'
+topic = 'research_center_fourth'
+webname = '吉林市人民政府_'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',

@@ -155,14 +155,17 @@ def getData(num, title, url, origin, publishDate, summary):
         return
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_liaoning.py

@@ -15,8 +15,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '辽宁省人民政府'
+topic = 'research_center_fourth'
+webname = '辽宁省人民政府_'
 headers = {
     'User_Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
 }

@@ -63,14 +63,17 @@ def doJob():
     content = contentWithTag.text.lstrip().strip()
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': [],
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': '辽宁省人民政府',
REITs_policyData/policy_neimenggu.py

@@ -15,8 +15,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '内蒙古自治区人民政府'
+topic = 'research_center_fourth'
+webname = '内蒙古自治区人民政府_'
 headers = {
     'Accept': 'application/json, text/plain, */*',
     'Accept-Encoding': 'gzip, deflate, br',

@@ -188,14 +188,17 @@ def getContent(num, data):
         return
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_shandong.py

@@ -11,8 +11,8 @@ log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '山东省人民政府'
+topic = 'research_center_fourth'
+webname = '山东省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'X-Requested-With': 'XMLHttpRequest',

@@ -131,14 +131,17 @@ def getData(soup, num):
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_shanghai.py

@@ -17,8 +17,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '上海市人民政府'
+topic = 'research_center_fourth'
+webname = '上海市人民政府_'
 headers = {
     'Accept': '*/*',
     'Accept-Encoding': 'gzip, deflate, br',

@@ -111,14 +111,17 @@ def getData(data_, driver, num):
     # fjhref_list]
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_shanxi.py

@@ -14,8 +14,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '山西省人民政府'
+topic = 'research_center_fourth'
+webname = '山西省人民政府_'
 headers = {
     'Accept': 'application/json, text/plain, */*',
     'Accept-Encoding': 'gzip, deflate',

@@ -130,14 +130,17 @@ def getContent(num, data):
         a['href'] = full_path
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_sichuan.py

@@ -14,8 +14,8 @@ log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '四川省人民政府'
+topic = 'research_center_fourth'
+webname = '四川省人民政府_'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',

@@ -106,14 +106,17 @@ def getData(data_, num):
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
REITs_policyData/policy_tianjin.py

@@ -16,8 +16,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '天津市人民政府'
+topic = 'research_center_fourth'
+webname = '天津市人民政府_'
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@@ -137,14 +137,17 @@ def getContent(num, title, pub_time, origin, organ, url, pub_hao, summary):
     content = contentWithTag.text.lstrip().strip()
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': pub_time,
         'origin': origin,
...
REITs_policyData/policy_yunnan.py
浏览文件 @
b3fa91e8
...
@@ -19,8 +19,8 @@ from reits import Policy
...
@@ -19,8 +19,8 @@ from reits import Policy
policy
=
Policy
()
policy
=
Policy
()
topic
=
'
policy
'
topic
=
'
research_center_fourth
'
webname
=
'云南省人民政府'
webname
=
'云南省人民政府
_
'
headers
=
{
headers
=
{
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
,
}
}
...
@@ -149,14 +149,17 @@ def getData(div, num):
...
@@ -149,14 +149,17 @@ def getData(div, num):
content
,
contentWithTag
,
id_list
=
getContent
(
href
,
publishDate
,
num
)
content
,
contentWithTag
,
id_list
=
getContent
(
href
,
publishDate
,
num
)
contentWithTag_str
=
str
(
contentWithTag
)
contentWithTag_str
=
str
(
contentWithTag
)
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
lang
=
baseCore
.
detect_language
(
content
)
dic_info
=
{
dic_info
=
{
'attachmentIds'
:
id_list
,
'attachmentIds'
:
id_list
,
'subjectId'
:
'1729315113088765953'
,
'lang'
:
lang
,
'author'
:
''
,
'author'
:
''
,
'content'
:
content
,
'content'
:
content
,
'contentWithTag'
:
contentWithTag_str
,
'contentWithTag'
:
contentWithTag_str
,
'deleteFlag'
:
0
,
'deleteFlag'
:
0
,
'checkStatus'
:
1
,
'checkStatus'
:
1
,
'id'
:
'
'
,
'id'
:
'
1729315113088765953'
+
str
(
int
(
time
.
time
()))
,
'title'
:
title
,
'title'
:
title
,
'publishDate'
:
publishDate
,
'publishDate'
:
publishDate
,
'origin'
:
origin
,
'origin'
:
origin
,
...
...
REITs_policyData/policy_zhejiang.py

@@ -16,8 +16,8 @@ headers = {
     'X-Requested-With': 'XMLHttpRequest',
 }
-topic = 'policy'
-webname = '浙江省人民政府'
+topic = 'research_center_fourth'
+webname = '浙江省人民政府_'
 class Policy():
     def getrequest_soup(self, headers, url):

@@ -502,14 +502,17 @@ def getDatas(page):
             continue
         num += 1
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        lang = baseCore.detect_language(content)
         dic_info = {
             'attachmentIds': id_list,
+            'subjectId': '1729315113088765953',
+            'lang': lang,
             'author': '',
             'content': content,
             'contentWithTag': contentWithTag,
             'deleteFlag': 0,
             'checkStatus': 1,
-            'id': '',
+            'id': '1729315113088765953' + str(int(time.time())),
             'title': title,
             'publishDate': publishDate,
             'origin': origin,
...
REITs_policyData/reits.py
浏览文件 @
b3fa91e8
...
@@ -42,10 +42,12 @@ class Policy():
...
@@ -42,10 +42,12 @@ class Policy():
return
result
return
result
def
createDriver
(
self
):
def
createDriver
(
self
):
chrome_driver
=
r'D:\cmd100\chromedriver.exe'
# chrome_driver = r'D:\cmd100\chromedriver.exe'
chrome_driver
=
r'F:\spider\cmd100\chromedriver.exe'
path
=
Service
(
chrome_driver
)
path
=
Service
(
chrome_driver
)
chrome_options
=
webdriver
.
ChromeOptions
()
chrome_options
=
webdriver
.
ChromeOptions
()
chrome_options
.
binary_location
=
r'D:\Google\Chrome\Application\chrome.exe'
# chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options
.
binary_location
=
r'F:\spider\85\Google\Chrome\Application\chrome.exe'
# 设置代理
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
# chrome_options.add_argument('--proxy-server=http://' + proxy)
...
...
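createDriver still hard-codes both the chromedriver path and the Chrome binary; this commit merely swaps the D: paths for F: ones. One way to avoid editing source per machine, sketched with environment variables and the diff's paths as fallbacks (this is a suggestion, not something the project itself does):

    import os
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    def create_driver() -> webdriver.Chrome:
        # Resolve machine-specific paths from the environment, falling back
        # to the values this commit hard-codes.
        chrome_driver = os.environ.get('CHROMEDRIVER', r'F:\spider\cmd100\chromedriver.exe')
        chrome_binary = os.environ.get('CHROME_BINARY', r'F:\spider\85\Google\Chrome\Application\chrome.exe')
        options = webdriver.ChromeOptions()
        options.binary_location = chrome_binary
        return webdriver.Chrome(service=Service(chrome_driver), options=options)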
REITs专题数据/FundAnncmnt-hkex.py

 import datetime

@@ -250,6 +250,7 @@ def doJob(obsOperate):
                 continue
             att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str(date)[:10])
             num += 1
+            createDate = datetime.datetime.now().strftime('%Y-%m-%d')
             dic_info = {
                 'code': code,  # 代码
                 'name': name,  # 基金名称

@@ -260,6 +261,7 @@ def doJob(obsOperate):
                 'date': date,  # 时间(datetime 类型)
                 'strDate': str(date)[:10],  # 时间(字符串类型)
                 'exchange': '香港交易所',  # 交易所
+                'createDate': createDate  # 创建时间
             }
             db_storage.insert_one(dic_info)
             log.info(f'{code}==={title}===采集成功')
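The only functional change here stamps each record with its crawl date alongside the announcement date. For reference, a sketch of the two date forms the record carries, with a fixed datetime standing in for the announcement time:

    import datetime

    date = datetime.datetime(2024, 3, 21, 9, 30)  # announcement time (datetime)
    strDate = str(date)[:10]                      # '2024-03-21' (string form)
    createDate = datetime.datetime.now().strftime('%Y-%m-%d')  # crawl date
    print(strDate, createDate)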
REITs专题数据/cushman.py (new file, mode 100644)

import re
import fitz
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from base import BaseCore
from requests.models import Response

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache',
    'Referer': 'https://www.cushmanwakefield.com.cn/research-report/p94.html?expert=0',
    'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getSoup(url):
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    return soup


def getPageSize():
    # url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=0'
    url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=1'
    soup = getSoup(url)
    total = int(re.findall('\d+', soup.find('dl', class_='sousuo_result').text.lstrip().strip())[0])
    if total % 4 == 0:
        pageSize = int(total / 4)
    else:
        pageSize = int(total / 4) + 1
    return pageSize


def getContent(url):
    content = ''
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip)
    # req.encoding = req.apparent_encoding
    with fitz.open(stream=req.content, filetype='pdf') as doc:
        page_size = doc.page_count
        for page in doc.pages():
            content += page.get_text()
    return content


def doJob():
    num = 1
    data_list = []
    pageSize = getPageSize()
    for page in range(1, pageSize + 1):
        # url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=0'
        url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=1'
        soup = getSoup(url)
        div_list = soup.find('div', class_='guwen_list_box').find_all('div', class_='zhuangyuan_guwen_box')
        for div in div_list:
            fjtitle_list = ''
            fjhref_list = ''
            name = div.find('div', class_='zhuanyuan_name').text.lstrip().strip()
            summary = div.find('div', class_='zhuanyuan_info').text.lstrip().strip()
            href = div.find('a', class_='zhuanyuan_xinxi').get('href')
            origin = '戴德梁兴'
            try:
                content = getContent(href)
                # print(content)
            except Exception as e:
                log.error(f'第{page}页==={name}===连接失败')
                continue
            title = name.replace('/', ' ').replace('|', ' ').replace('?', ' ').replace('"', '”')


if __name__ == '__main__':
    doJob()
    baseCore.close()
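cushman.py's getContent streams each report PDF straight into PyMuPDF without touching disk, and getPageSize computes a ceiling division by hand (four reports per page). A compact sketch of both ideas, assuming PyMuPDF (imported as fitz) is installed and pdf_bytes holds a downloaded report:

    import math
    import fitz  # PyMuPDF

    def pdf_to_text(pdf_bytes: bytes) -> str:
        # Open the PDF from memory (no temp file) and concatenate page text.
        with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
            return ''.join(page.get_text() for page in doc)

    def page_count(total_items: int, per_page: int = 4) -> int:
        # Equivalent to the script's if/else: ceiling division.
        return math.ceil(total_items / per_page)

    assert page_count(8) == 2 and page_count(9) == 3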