Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
71a0e996
提交
71a0e996
authored
12月 19, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
维护
上级
1ec2de55
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
298 行增加
和
81 行删除
+298
-81
RedisPPData.py
base/RedisPPData.py
+17
-3
get_doc_id.py
comData/YanBao/get_doc_id.py
+29
-27
uploadfile.py
comData/annualReport_US/uploadfile.py
+169
-0
bmfw.py
comData/dingzhi/bmfw.py
+83
-51
没有找到文件。
base/RedisPPData.py
浏览文件 @
71a0e996
...
...
@@ -224,8 +224,8 @@ def BaseInfoEnterprise_task():
#企业核心人员
def
CorPerson
():
cnx
,
cursor
=
connectSql
()
#
gn_query = "select SocialCode from EnterpriseInfo where Place = '1'"
gn_query
=
"SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=13 AND a.Place=1"
gn_query
=
"select SocialCode from EnterpriseInfo where Place = '1'"
#
gn_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=13 AND a.Place=1"
cursor
.
execute
(
gn_query
)
gn_result
=
cursor
.
fetchall
()
cnx
.
commit
()
...
...
@@ -273,6 +273,19 @@ def FinanceFromEase_task():
print
(
'定时采集异常'
,
e
)
pass
def
JingyingfenxiFromEase
():
cnx_
,
cursor_
=
cnn11
()
# 从上市企业库中读取数据
sql_sel
=
'''select social_credit_code from sys_base_enterprise_ipo where exchange = '1' or exchange = '2' or exchange = '3' and listed = '1' '''
cursor_
.
execute
(
sql_sel
)
finance
=
cursor_
.
fetchall
()
cnx_
.
commit
()
finance_list
=
[
item
[
0
]
for
item
in
finance
]
print
(
'======='
)
for
item
in
finance_list
:
r
.
rpush
(
'Jingyingfenxi:finance_socialCode'
,
item
)
close11
(
cnx_
,
cursor_
)
#微信公众号
def
WeiXingetFromSql
():
cnx_
,
cursor_
=
cnn11
()
...
...
@@ -645,7 +658,7 @@ if __name__ == "__main__":
# shuangbaiqiye()
# zhuangjingtexind()
# NoticeEnterprise()
NoticeDF
()
#
NoticeDF()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterprise()
...
...
@@ -670,5 +683,6 @@ if __name__ == "__main__":
# AnnualEnterprise_task()
# FinanceFromEast()
# ipo_code()
JingyingfenxiFromEase
()
log
.
info
(
f
'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}==='
)
comData/YanBao/get_doc_id.py
浏览文件 @
71a0e996
...
...
@@ -39,24 +39,23 @@ class EsMethod(object):
body
=
{
"query"
:
{
"bool"
:
{
"must"
:
[
{
"match"
:
{
"type"
:
"0"
}
},
{
"range"
:
{
"createDate"
:
{
"gte"
:
"2023-12-13T00:00:00"
,
"lte"
:
"2023-12-15T00:00:00"
}
"bool"
:
{
"must_not"
:
[
{
"exists"
:
{
"field"
:
"content"
}
}
],
"must"
:
[
{
"match"
:
{
"type"
:
"1"
}
}
]
}
}
]
}
},
},
"sort"
:
[
{
"createDate"
:
{
...
...
@@ -74,6 +73,7 @@ class EsMethod(object):
'hits.hits._source.title'
,
'hits.hits._source.sourceAddress'
,
'hits.hits._source.createDate'
,
'hits.hits._source.origin'
]
# 字段2
result
=
self
.
es
.
search
(
index
=
index_name
,
doc_type
=
'_doc'
...
...
@@ -86,9 +86,9 @@ def main(page, p, esMethod):
redis_conn
=
redis
.
Redis
(
connection_pool
=
pool
)
result
=
esMethod
.
queryatt
(
index_name
=
esMethod
.
index_name
,
pnum
=
p
)
total
=
result
[
'hits'
][
'total'
][
'value'
]
if
total
==
0
:
log
.
info
(
'++++已没有数据+++++'
)
return
#
if total == 0:
#
log.info('++++已没有数据+++++')
#
return
try
:
msglist
=
result
[
'hits'
][
'hits'
]
except
:
...
...
@@ -100,13 +100,15 @@ def main(page, p, esMethod):
id
=
mms
[
'_id'
]
title
=
mms
[
'_source'
][
'title'
]
sourceAddress
=
mms
[
'_source'
][
'sourceAddress'
]
log
.
info
(
f
'{id}--{title}--{sourceAddress}---'
)
if
redis_conn
.
lrem
(
'YanBao:id'
,
0
,
id
)
==
0
:
redis_conn
.
lpush
(
'
YanBao
:id'
,
id
)
origin
=
mms
[
'_source'
][
'origin'
]
log
.
info
(
f
'{id}--{title}--{
origin}--{
sourceAddress}---'
)
if
origin
==
'SEC美国证券交易委员会'
:
redis_conn
.
lrem
(
'NianbaoUS:id'
,
0
,
id
)
redis_conn
.
lpush
(
'
NianbaoUS
:id'
,
id
)
else
:
continue
redis_conn
.
lrem
(
f
'NianbaoOT_{origin}:id'
,
0
,
id
)
redis_conn
.
lpush
(
f
'NianbaoOT_{origin}:id'
,
id
)
def
run_threads
(
num_threads
,
esMethod
,
j
):
threads
=
[]
...
...
@@ -126,7 +128,7 @@ def run_threads(num_threads,esMethod,j):
if
__name__
==
"__main__"
:
j
=
0
for
i
in
range
(
2
):
for
i
in
range
(
10
):
esMethod
=
EsMethod
()
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value']
...
...
comData/annualReport_US/uploadfile.py
0 → 100644
浏览文件 @
71a0e996
# -*- coding: utf-8 -*-
import
json
import
re
import
threading
import
time
import
uuid
from
urllib.parse
import
urljoin
import
fitz
import
redis
import
requests
from
bs4
import
BeautifulSoup
from
obs
import
ObsClient
from
retry
import
retry
from
elasticsearch
import
Elasticsearch
from
base
import
BaseCore
baseCore
=
BaseCore
.
BaseCore
()
log
=
baseCore
.
getLogger
()
baseCore
=
BaseCore
.
BaseCore
()
cnx_
=
baseCore
.
cnx_
cursor_
=
cnx_
.
cursor
()
lock
=
threading
.
Lock
()
pool
=
redis
.
ConnectionPool
(
host
=
"114.115.236.206"
,
port
=
6379
,
password
=
'clbzzsn'
,
db
=
6
)
class
EsMethod
(
object
):
def
__init__
(
self
):
# 创建Elasticsearch对象,并提供账号信息
self
.
es
=
Elasticsearch
([
'http://114.116.19.92:9700'
],
http_auth
=
(
'elastic'
,
'zzsn9988'
),
timeout
=
300
)
self
.
index_name
=
'researchreportdata'
def
queryatt
(
self
,
index_name
,
id
):
body
=
{
"query"
:
{
"match"
:
{
"id"
:
id
}
}
}
filter_path
=
[
'hits.hits._id'
,
'hits.total.value'
,
'hits.hits._source.title'
,
'hits.hits._source.socialCreditCode'
,
'hits.hits._source.sourceAddress'
# 'hits.hits._source.createDate',
# 'hits.hits._source.publishDate',
]
# 字段2
result
=
self
.
es
.
search
(
index
=
index_name
,
doc_type
=
'_doc'
,
filter_path
=
filter_path
,
body
=
body
)
# log.info(result)
return
result
def
updateaunn
(
self
,
index_name
,
id
,
content
,
contentWithTag
):
body
=
{
'doc'
:
{
'content'
:
content
,
'contentWithTag'
:
contentWithTag
}
}
result
=
self
.
es
.
update
(
index
=
index_name
,
id
=
id
,
body
=
body
)
log
.
info
(
'更新结果:
%
s'
%
result
)
def
paserUrl
(
html
,
listurl
):
# soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links
=
html
.
find_all
([
'a'
,
'img'
])
# 遍历标签,将相对地址转换为绝对地址
for
link
in
links
:
if
'href'
in
link
.
attrs
:
link
[
'href'
]
=
urljoin
(
listurl
,
link
[
'href'
])
elif
'src'
in
link
.
attrs
:
link
[
'src'
]
=
urljoin
(
listurl
,
link
[
'src'
])
return
html
def
get_news
(
news_url
,
ip_dic
):
header
=
{
'Host'
:
'www.sec.gov'
,
'Connection'
:
'keep-alive'
,
'sec-ch-ua'
:
'"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"'
,
'sec-ch-ua-mobile'
:
'?0'
,
'sec-ch-ua-platform'
:
'"Windows"'
,
'Upgrade-Insecure-Requests'
:
'1'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
,
'Accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
'Sec-Fetch-Site'
:
'none'
,
'Sec-Fetch-Mode'
:
'navigate'
,
'Sec-Fetch-User'
:
'?1'
,
'Sec-Fetch-Dest'
:
'document'
,
'Accept-Encoding'
:
'gzip, deflate, br'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Cookie'
:
'_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=
%7
B
%22
_4c_s_
%22%3
A
%22
dZJbj9owEIX
%2
FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ
%2
Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4
%2
BKmJYTzK4nam2WN
%2
Flm3
%2
FmZ1Kyxyxl9KIwnS3r4
%2
B9b9S2Y
%2
FSE5JGQTie5DMiZjjdDCGH
%2
BxVIJuI19NaovXQrd
%2
ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl
%2
FXnfK
%2
BNdtI
%2
F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z
%2
FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq
%2
B7Y
%2
Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G
%2
FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP
%2
F
%2
BBf5Un26H9H7t6sfd
%2
B
%2
FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz
%2
BTc
%3
D
%22%7
D'
}
response
=
requests
.
get
(
url
=
news_url
,
headers
=
header
,
verify
=
False
,
timeout
=
30
)
# response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
if
response
.
status_code
==
200
:
# 请求成功,处理响应数据
# print(response.text)
result
=
BeautifulSoup
(
response
.
content
,
'html.parser'
)
# print(result)
pass
else
:
# 请求失败,输出错误信息
log
.
info
(
'请求失败:'
,
response
.
status_code
,
response
.
text
)
result
=
''
return
result
def
main
(
esMethod
):
redis_conn
=
redis
.
Redis
(
connection_pool
=
pool
)
id_
=
redis_conn
.
lpop
(
'NianbaoUS:id'
)
id
=
id_
.
decode
()
# id = "23101317164"
if
id
:
pass
else
:
log
.
info
(
'已无数据'
)
return
result_
=
esMethod
.
queryatt
(
index_name
=
esMethod
.
index_name
,
id
=
id
)
result
=
result_
[
'hits'
][
'hits'
][
0
]
num
=
0
title
=
result
[
'_source'
][
'title'
]
social_code
=
result
[
'_source'
][
'socialCreditCode'
]
# origin = result['_source']['origin']
log
.
info
(
f
'====={title}=={social_code}===正在更新==='
)
sourceAddress
=
result
[
'_source'
][
'sourceAddress'
]
ip_dic
=
{
'https'
:
'http://127.0.0.1:1080'
,
'http'
:
'http://127.0.0.1:1080'
}
soup
=
get_news
(
sourceAddress
,
ip_dic
)
if
soup
:
pass
else
:
return
# 相对路径转化为绝对路径
soup
=
paserUrl
(
soup
,
sourceAddress
)
content
=
soup
.
text
.
strip
()
esMethod
.
updateaunn
(
esMethod
.
index_name
,
str
(
id
),
content
,
str
(
soup
))
def
run_threads
(
num_threads
,
esMethod
):
threads
=
[]
for
i
in
range
(
num_threads
):
thread
=
threading
.
Thread
(
target
=
main
,
args
=
(
esMethod
,))
threads
.
append
(
thread
)
thread
.
start
()
for
thread
in
threads
:
thread
.
join
()
if
__name__
==
'__main__'
:
while
True
:
esMethod
=
EsMethod
()
start
=
time
.
time
()
num_threads
=
5
run_threads
(
num_threads
,
esMethod
)
log
.
info
(
f
'5线程 总耗时{time.time()-start}秒'
)
\ No newline at end of file
comData/dingzhi/bmfw.py
浏览文件 @
71a0e996
...
...
@@ -8,6 +8,13 @@ import urllib3
from
pyquery
import
PyQuery
as
pq
import
json
from
kafka
import
KafkaProducer
import
sys
sys
.
path
.
append
(
'D:
\\
kkwork
\\
zzsn_spider
\\
base'
)
import
BaseCore
baseCore
=
BaseCore
.
BaseCore
()
log
=
baseCore
.
getLogger
()
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
#务院政策问答平台最新发布信息采集
...
...
@@ -16,37 +23,58 @@ def reqHtml(url,data,header):
proxy
=
{
'https'
:
'http://127.0.0.1:1080'
,
'http'
:
'http://127.0.0.1:1080'
}
json_data
=
json
.
dumps
(
data
)
response
=
requests
.
post
(
url
,
data
=
json_data
,
headers
=
header
,
verify
=
False
,
timeout
=
10
)
print
(
response
.
status_code
)
log
.
info
(
response
.
status_code
)
html
=
response
.
text
except
Exception
as
e
:
html
=
''
return
html
def
page_list
():
# header = {
# 'Host':'xcx.www.gov.cn',
# 'Connection':'keep-alive',
# 'Content-Length':'25',
# 'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
# 'x-tif-did':'pb5XUGL1Zm',
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/8461',
# 'x-tif-sid':'de492c1fa84af6192b75ebad2f5077a22a',
# 'Content-Type':'application/json',
# 'xweb_xhr':'1',
# 'dgd-pre-release':'0',
# 'x-yss-page':'publicService/pages/policyQALibrary/index/index',
# 'x-yss-city-code':'4400',
# 'Accept':'*/*',
# 'Sec-Fetch-Site':'cross-site',
# 'Sec-Fetch-Mode':'cors',
# 'Sec-Fetch-Dest':'empty',
# 'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html',
# 'Accept-Encoding':'gzip, deflate, br',
# 'Accept-Language':'zh-CN,zh;q=0.9'
# }
header
=
{
'Host'
:
'xcx.www.gov.cn'
,
'Connection'
:
'keep-alive'
,
'Content-Length'
:
'25'
,
'x-tif-openid'
:
'ojyj-41lGcemgsREMHBh1ac7iZUw
'
,
'x-tif-did'
:
'pb5XUGL1Zm
'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/846
1'
,
'x-tif-sid'
:
'de492c1fa84af6192b75ebad2f5077a22a
'
,
'Content-Type'
:
'application/json'
,
'xweb_xhr'
:
'1'
,
'dgd-pre-release'
:
'0'
,
'x-yss-page'
:
'publicService/pages/policyQALibrary/index/index'
,
'x-yss-city-code'
:
'4400'
,
'Accept'
:
'*/*'
,
'
Sec-Fetch-Site'
:
'cross-site
'
,
'Sec-Fetch-
Mode'
:
'cors
'
,
'Sec-Fetch-
Dest'
:
'empty
'
,
'
Referer'
:
'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html
'
,
'
Accept-Encoding'
:
'gzip, deflate, br
'
,
'Accept-
Language'
:
'zh-CN,zh;q=0.9
'
'Host'
:
'xcx.www.gov.cn'
,
'Connection'
:
'keep-alive'
,
'Content-Length'
:
'25'
,
'x-tif-openid'
:
'ojyj-40u1IEK5a2CSK7_Pg31ySks
'
,
'x-tif-did'
:
'u8Ajuqdyap
'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/850
1'
,
'x-tif-sid'
:
'755e67ddc8f86552d3f8d356fe22721cc5
'
,
'Content-Type'
:
'application/json'
,
'xweb_xhr'
:
'1'
,
'dgd-pre-release'
:
'0'
,
'x-yss-page'
:
'publicService/pages/policyQALibrary/index/index'
,
'x-yss-city-code'
:
'4400'
,
'Accept'
:
'*/*'
,
'
Accept-Language'
:
'*
'
,
'Sec-Fetch-
Site'
:
'cross-site
'
,
'Sec-Fetch-
Mode'
:
'cors
'
,
'
Sec-Fetch-Dest'
:
'empty
'
,
'
Referer'
:
'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html
'
,
'Accept-
Encoding'
:
'gzip, deflate, br
'
}
url
=
'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for
i
in
range
(
1
,
4
45
):
print
(
f
'采集第{i}页数据'
)
for
i
in
range
(
1
,
4
53
):
log
.
info
(
f
'采集第{i}页数据'
)
k
=
i
da
=
'{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
data
=
da
.
replace
(
'[k]'
,
str
(
k
))
...
...
@@ -56,7 +84,7 @@ def page_list():
hjson
=
json
.
loads
(
lhtml
)
data
=
hjson
[
'data'
][
'list'
]
except
Exception
as
e
:
print
(
e
)
log
.
info
(
e
)
time
.
sleep
(
60
)
continue
for
ss
in
data
:
...
...
@@ -64,9 +92,9 @@ def page_list():
durl
=
f
'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicy'
sourceAddress
=
f
'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
try
:
flag
=
r
.
sismember
(
'IN-20230829-0146'
,
sourceAddress
)
flag
=
r
.
sismember
(
'IN-20230829-0146
-test
'
,
sourceAddress
)
if
flag
:
print
(
'信息已采集入库过'
)
log
.
info
(
'信息已采集入库过'
)
continue
except
Exception
as
e
:
continue
...
...
@@ -77,25 +105,25 @@ def page_list():
def
detailpaser
(
dmsg
):
hh
=
{
'Host'
:
'xcx.www.gov.cn'
,
'Connection'
:
'keep-alive'
,
'Content-Length'
:
'25'
,
'x-tif-openid'
:
'ojyj-41lGcemgsREMHBh1ac7iZUw
'
,
'x-tif-did'
:
'pb5XUGL1Zm
'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309071d)XWEB/846
1'
,
'x-tif-sid'
:
'de492c1fa84af6192b75ebad2f5077a22a
'
,
'Content-Type'
:
'application/json'
,
'xweb_xhr'
:
'1'
,
'dgd-pre-release'
:
'0'
,
'x-yss-page'
:
'publicService/pages/policyQALibrary/index/index'
,
'x-yss-city-code'
:
'4400'
,
'Accept'
:
'*/*'
,
'
Sec-Fetch-Site'
:
'cross-site
'
,
'Sec-Fetch-
Mode'
:
'cors
'
,
'Sec-Fetch-
Dest'
:
'empty
'
,
'
Referer'
:
'https://servicewechat.com/wxbebb3cdd9b331046/731/page-frame.html
'
,
'
Accept-Encoding'
:
'gzip, deflate, br
'
,
'Accept-
Language'
:
'zh-CN,zh;q=0.9
'
'Host'
:
'xcx.www.gov.cn'
,
'Connection'
:
'keep-alive'
,
'Content-Length'
:
'25'
,
'x-tif-openid'
:
'ojyj-40u1IEK5a2CSK7_Pg31ySks
'
,
'x-tif-did'
:
'u8Ajuqdyap
'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/850
1'
,
'x-tif-sid'
:
'755e67ddc8f86552d3f8d356fe22721cc5
'
,
'Content-Type'
:
'application/json'
,
'xweb_xhr'
:
'1'
,
'dgd-pre-release'
:
'0'
,
'x-yss-page'
:
'publicService/pages/policyQALibrary/index/index'
,
'x-yss-city-code'
:
'4400'
,
'Accept'
:
'*/*'
,
'
Accept-Language'
:
'*
'
,
'Sec-Fetch-
Site'
:
'cross-site
'
,
'Sec-Fetch-
Mode'
:
'cors
'
,
'
Sec-Fetch-Dest'
:
'empty
'
,
'
Referer'
:
'https://servicewechat.com/wxbebb3cdd9b331046/748/page-frame.html
'
,
'Accept-
Encoding'
:
'gzip, deflate, br
'
}
try
:
durl
=
dmsg
[
'url'
]
...
...
@@ -107,8 +135,8 @@ def detailpaser(dmsg):
dd
=
json
.
loads
(
dhtml
)
sendTokafka
(
dd
)
except
Exception
as
e
:
print
(
e
)
print
(
dhtml
)
log
.
info
(
e
)
# log.info
(dhtml)
def
sendTokafka
(
ddata
):
dd
=
ddata
[
'data'
]
...
...
@@ -117,8 +145,11 @@ def sendTokafka(ddata):
content
=
dd
[
'content'
]
contentWithTag
=
dd
[
'content'
]
publishTime
=
dd
[
'publishTime'
]
time_format
=
'
%
Y年
%
m月
%
d日'
publishDate
=
str
(
datetime
.
datetime
.
strptime
(
publishTime
,
time_format
))
if
publishTime
:
time_format
=
'
%
Y年
%
m月
%
d日'
publishDate
=
str
(
datetime
.
datetime
.
strptime
(
publishTime
,
time_format
))
else
:
publishDate
=
'1900-01-01'
origin
=
dd
[
'departmentName'
]
sourceAddress
=
f
'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
sid
=
'1696404919115825153'
...
...
@@ -135,13 +166,14 @@ def sendTokafka(ddata):
'source'
:
'python定制采集'
,
'type'
:
''
}
log
.
info
(
aa_dict
)
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
])
try
:
kafka_result
=
producer
.
send
(
"crawlerInfo"
,
json
.
dumps
(
aa_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
r
.
sadd
(
info_code
,
sourceAddress
)
print
(
'发送kafka成功!'
)
r
.
sadd
(
info_code
+
'-test'
,
sourceAddress
)
log
.
info
(
'发送kafka成功!'
)
except
Exception
as
e
:
print
(
e
)
log
.
info
(
e
)
finally
:
producer
.
close
()
# r.close()
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论