Commit 650c704a  Author: 刘伟刚

REITs website collection

Parent 497088ef
# _*_ coding:utf-8 _*_
# https://www.cq.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '663',
        'Content-Type': 'application/json',
        'Cookie': 'SESSION=MGFhMGQxNDItM2MyOS00NjU5LWI2MTgtZjdiM2UxNjFkMGI3; _trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false',
        'Host': 'www.cq.gov.cn',
        'Origin': 'https://www.cq.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.cq.gov.cn/zwgk/search.html?DOCTITLE=REITs&DEPT=&gte=&lte=&REFERENCENO=&nh=&number=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    dlist = []
    pagenum = 3
    for i in range(1, pagenum):
        log.info(f'Chongqing: fetching list page {i}')
        lurl = 'https://www.cq.gov.cn/irs/front/list'
        data = {
            "customFilter": {
                "operator": "and",
                "properties": [],
                "filters": [
                    {
                        "operator": "or",
                        "properties": [
                            {"property": "f_202121500898", "operator": "eq", "value": "REITs"},
                            {"property": "f_202142777829", "operator": "eq", "value": "REITs"}
                        ],
                        "filters": []
                    },
                    {
                        "operator": "or",
                        "properties": [
                            {"property": "f_202146838317", "operator": "gte", "value": "2023-11-09 16:14:20"},
                            {"property": "f_202146235090", "operator": "gte", "value": "2023-11-09 16:14:20"}
                        ],
                        "filters": [
                            {
                                "operator": "and",
                                "properties": [
                                    {"property": "f_202146838317", "operator": "eq", "value": None},
                                    {"property": "f_202146235090", "operator": "eq", "value": None}
                                ]
                            }
                        ]
                    }
                ]
            },
            "sorts": [],
            "tableName": "t_1775cd018c6",
            "tenantId": "7",
            "pageSize": 10,
            "pageNo": i
        }
        lcont = reqbase.reqPostHtml(lurl, header, data)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['list']
                for lmsg in datas:
                    try:
                        title = lmsg['f_202121500898']
                        subtitle = ''
                        summary = lmsg['f_202142777829']
                        createDate = ''
                        writeDate = ''
                        pubDate = lmsg['save_time']
                        source = lmsg['f_202121437464']
                        durl = lmsg['doc_pub_url']
                        wenjianhao = lmsg['f_202121837479']
                        suoyihao = lmsg['f_202121273539']
                        content = ''
                        siteweb = '重庆市人民政府'
                    except Exception:
                        continue
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'wenjianhao': wenjianhao,
                        'suoyihao': suoyihao,
                    }
                    is_member = rr.sismember('reis_cqgov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_cqgov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '重庆市人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '_trs_uv=loqwwzcq_3486_7pa1; _trs_ua_s_1=loqwwzcq_3486_n0; _trs_gv=g_loqwwzcq_3486_7pa1; arialoadData=true; ariawapChangeViewPort=false; _trs_user=',
        'Host': 'www.cq.gov.cn',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        log.info(f'Detail request URL: {durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        # Try the known content containers in order until one yields text
        contentWithTag, content = soupPaserHtml(soup, 'div[class="zcwjk-xlcon"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="document mt-1 mt-12"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="view TRS_UEDITOR trs_paper_default trs_word"]')
        if not content:
            log.info(f'Detail content empty: {durl}')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# http://www.gxzf.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '385',
        'Content-Type': 'application/json',
        'Cookie': 'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699513657; SEARCHHISTORY=[%22REiTs%22]',
        'Host': 'www.gxzf.gov.cn',
        'Origin': 'http://www.gxzf.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    pagenum = 5
    for i in range(1, pagenum):
        log.info(f'Guangxi: fetching list page {i}')
        lurl = 'http://www.gxzf.gov.cn/irs/front/search'
        data = {
            "code": "181aedaa542",
            "dataTypeId": "241",
            "configCode": "",
            "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7",
            "searchWord": "REiTs",
            "orderBy": "related",
            "searchBy": "all",
            "appendixType": "",
            "granularity": "ALL",
            "isSearchForced": "0",
            "filters": [],
            "pageNo": i,
            "pageSize": 10,
            "isAdvancedSearch": None,
            "isDefaultAdvanced": None,
            "advancedFilters": None,
            "advancedFilters ": None,  # key with trailing space mirrors the site's own request payload
            "historySearchWords": [
                "REiTs"
            ]
        }
        lcont = reqbase.reqPostHtml(lurl, header, data)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['middle']['listAndBox']
                for lmsgg in datas:
                    lmsg = lmsgg['data']
                    title = lmsg['title']
                    subtitle = ''
                    summary = lmsg['table-7']
                    createDate = ''
                    writeDate = lmsg['table-4']
                    pubDate = lmsg['time']
                    source = lmsg['table-3']
                    durl = lmsg['url']
                    docNumberStr = lmsg['table-5']
                    reNum = lmsg['table-1']
                    content = ''
                    siteweb = '广西壮族自治区人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_gxgov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_gxgov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '广西壮族自治区人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_a013af4793f2380a4bcf49ca1ce393eb=1699513646; _trs_uv=loqujlqp_3625_8md7; _trs_ua_s_1=loqujlqp_3625_2jhe; arialoadData=true; ariawapChangeViewPort=false; SEARCHHISTORY=[%22REiTs%22]; Hm_lpvt_a013af4793f2380a4bcf49ca1ce393eb=1699514234',
        'Host': 'www.gxzf.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.gxzf.gov.cn/irs-intelligent-search/search?code=181aedaa542&dataTypeId=241&configCode=&sign=9cc99c9d-94aa-44b4-aa79-41227a5385d7&orderBy=related&searchBy=all&appendixType=&granularity=ALL&isSearchForced=0&pageNo=1&pageSize=10&isAdvancedSearch&isDefaultAdvanced&advancedFilters%20&searchWord=REiTs&advancedFilters',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        # class_type=soupPaserHtml(soup,'div[class="people-desc"]>table>tbody>tr:nth-child(1)>td:nth-child(4)')[1]
        pub_jigou = soupPaserHtml(soup, 'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(1)')[1]
        write_data = soupPaserHtml(soup, 'div[class="people-desc"]>table>tbody>tr:nth-child(2)>td:nth-child(2)')[1]
        # file_num=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(2)')[1]
        # pub_data=soupPaserHtml(soup,'div[class="classify"]>table>tbody>tr:nth-child(4)>td:nth-child(4)')[1]
        contentWithTag, content = soupPaserHtml(soup, 'div[class="article-con"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="zw"]')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
        # detailmsg['class_type']=class_type
        detailmsg['pub_jigou'] = pub_jigou
        detailmsg['write_data'] = write_data
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# https://www.hainan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153159&searchUseTime-349',
        'Host': 'www.hainan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    dlist = []
    pagenum = 5
    for i in range(1, pagenum):
        log.info(f'Hainan: fetching list page {i}')
        lurl = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={i}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
        lcont = reqbase.reqGetHtml(lurl, header)
        if lcont:
            try:
                soup = BeautifulSoup(lcont, 'html.parser')
                soup = reqbase.paserUrl(str(soup), lurl)
                divlist = soup.select('div[id="showPage"]>div')
                for lmsg in divlist:
                    title = soupPaserHtml(lmsg, 'h3>a')[1]
                    subtitle = ''
                    summary = ''
                    createDate = ''
                    writeDate = ''
                    pubDate = soupPaserHtml(lmsg, 'span[class="quily-con"]')[1]
                    source = soupPaserHtml(lmsg, 'a[class="address-con permitU"]')[1]
                    try:
                        durl = soupPaserHtml(lmsg, 'h3>a')[0].get('href')
                    except Exception:
                        continue
                    docNumberStr = ''
                    reNum = ''
                    content = ''
                    siteweb = '海南省人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_hainangov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_hainangov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '海南省人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'HttpOnly=true; 4600000001=UkVJVHM=; HA_STICKY_web=web.srv25; firstWord=reits; JSESSIONID=D565929C82443281C9BF0565591694AB; userSearch=siteCode-4600000001&column-%E6%94%BF%E7%AD%96&uc-0&firstWord-reits&searchWord-reits&searchTime-20231109153247&searchUseTime-337; HA_STICKY_apps=apps.srv34; Hm_lvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515700; yfx_c_g_u_id_10005682=_ck23110915414012919174127333485; yfx_f_l_v_t_10005682=f_t_1699515700292__r_t_1699515700292__v_t_1699515700292__r_c_0; _trs_uv=loqvrn5x_4549_5u3r; _trs_ua_s_1=loqvrn5x_4549_1lnl; arialoadData=true; ariawapChangeViewPort=false; Hm_lpvt_b23dcf9fcb01d857002fb0a0edee33b3=1699515718',
        'Host': 'www.hainan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column=2677&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=1&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        log.info(f'Parsing detail URL: {durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        souyihao = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(1)')[1]
        fenlei = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(1)>span:nth-child(2)')[1]
        fawenjiguan = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(1)')[1]
        write_data = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(2)>span:nth-child(2)')[1]
        wenhao = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(1)')[1]
        pub_data = soupPaserHtml(soup, 'div[class="zwgk_comr1"]>ul>li:nth-child(4)>span:nth-child(2)')[1]
        contentWithTag, content = soupPaserHtml(soup, 'div[id="zoom"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="zw"]')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
        detailmsg['souyihao'] = souyihao
        detailmsg['fenlei'] = fenlei
        detailmsg['fawenjiguan'] = fawenjiguan
        detailmsg['wenhao'] = wenhao
        detailmsg['pub_data'] = pub_data
        detailmsg['write_data'] = write_data
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# https://www.henan.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'searchapi.henan.gov.cn',
        'Origin': 'https://www.henan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.henan.gov.cn/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    dlist = []
    pagenum = 10
    for i in range(1, pagenum):
        log.info(f'Henan: fetching list page {i}')
        lurl = f'https://searchapi.henan.gov.cn/open/api/external?keywords=&siteId=4500000001&allKeyword=&anyKeyword=&noKeyword=&searchRange=-1000&sortType=200&beginTime=&endTime=&pageNumber={i}&pageSize=15&fileType=3&channelMarkId=45000000010115416542055691'
        lcont = reqbase.reqGetHtml(lurl, header)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['datas']
                for lmsg in datas:
                    title = lmsg['title']
                    subtitle = lmsg['subtitle']
                    summary = lmsg['summary']
                    createDate = lmsg['createDate']
                    writeDate = lmsg['writeDate']
                    pubDate = lmsg['pubDate']
                    source = lmsg['source']
                    durl = lmsg['selfUrl']
                    docNumberStr = lmsg['docNumberStr']
                    reNum = lmsg['reNum']
                    content = lmsg['content']
                    siteweb = '河南省人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_henangov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_henangov', durl)
            except Exception as e:
                log.info(f'Request error: {e} on page {i}')
    reqbase.pdwriterXLS(dlist, '河南省人民政府')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'zh_choose=n; yfx_c_g_u_id_10000001=_ck23110818022219777515353379336; yfx_f_l_v_t_10000001=f_t_1699437742968__r_t_1699437742968__v_t_1699437742968__r_c_0',
        'Host': 'www.henan.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.henan.gov.cn/zwgk/fgwj/szfl/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        contentWithTag = soup.select('div[id="content"]')[0]
        content = contentWithTag.text  # plain-text body without tags
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


if __name__ == '__main__':
    getList()
# _*_ coding:utf-8 _*_
# http://www.hubei.gov.cn/
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'JSESSIONID=B970564BAAD37BB8E9EF19F78FF45618; Hm_lvt_5544783ae3e1427d6972d9e77268f25d=1699492684; token=43cfe913-ec04-4aa3-b037-96903ccaa188; uuid=43cfe913-ec04-4aa3-b037-96903ccaa188; Hm_lpvt_5544783ae3e1427d6972d9e77268f25d=1699492955; 924omrTVcFchT=0IZtGVa8M20F2wJXy_C6l9PPFZOO1SBDdB3qZtsaLbLGaQ5t4l6Vt8HF9dIwhxtBcLdkdRZwlK42NCaEUjZZoPsXZAZ1o.tgK50mj8FJZTM5zCxcVg3w4cOCSM4BvYApzj7YMWHycK14.NY6Y.AP6bW6g0jDIqZlbp2hKSpDfZYBhjsgwJJraXKf2S4sgG6swjXFVVUHGngt2GMQPUZQRsE0_tL9Pz3_h6JeSD9qHWLOVKJWz0z8hdC_F4kiGZj2FRjjSUZp0VLUS8pjkrJdGYrKhC5xwGy8xSFYBE_trVuCFjr8.vhLqBONYkoWvZM2qNX_WZXg_3wTLMqCMrjoCmkvHf7B9.MMVu8tMC4hwDT4wjeyoNoRjIlgKuwE.aGhn',
        'Host': 'www.hubei.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.hubei.gov.cn/site/hubei/search.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    pagenum = 3
    for i in range(1, pagenum):
        log.info(f'Hubei: fetching list page {i}')
        lurl = f'http://www.hubei.gov.cn/igs/front/search.jhtml?position=&timeOrder=&code=872801132c71495bbe5a938f6acff5aa&orderBy=all&pageSize=10&type=%E6%96%87%E4%BB%B6&time=&chnldesc=&pageNumber={i}&aggrFieldName=chnldesc&sortByFocus=true&siteId=50&name=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitename=%E6%B9%96%E5%8C%97%E7%9C%81%E4%BA%BA%E6%B0%91%E6%94%BF%E5%BA%9C&sitetype=%E7%9C%81%E6%94%BF%E5%BA%9C&searchWord=REITS&6LDjm9Ls=0t3_jtGlqEJQLVtYPg5o4LE8KRsDcOrdhcQJ2gpgbWwP9rQyfChv7ADuy_hXWgy2abOG9jq8_hKyrFekh7IWmLmb9VBbEQh7tULy0_6L3zqkGOSoDWEcli5Ympa58KVMviSIxe_LiYGE'
        lcont = reqbase.reqGetHtml(lurl, header)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['page']['content']
                for lmsg in datas:
                    title = lmsg['DOCTITLE']
                    subtitle = lmsg['FileName']
                    summary = lmsg['DOCCONTENT']
                    createDate = ''
                    writeDate = ''
                    pubDate = lmsg['PUBDATE']
                    source = lmsg['publisher']
                    durl = lmsg['url']
                    docNumberStr = ''
                    reNum = lmsg['IdxID']
                    content = ''
                    siteweb = '湖北省人民政府'
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'docNumberStr': docNumberStr,
                        'reNum': reNum,
                    }
                    is_member = rr.sismember('reis_hubeigov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_hubeigov', durl)
            except Exception as e:
                log.info(f'Request error: {e} on page {i}')
    reqbase.pdwriterXLS(dlist, '湖北省人民政府')
def paserdetail(detailmsg):
    # opt = webdriver.ChromeOptions()
    # opt.add_argument("--ignore-certificate-errors")
    # opt.add_argument("--ignore-ssl-errors")
    # opt.add_experimental_option("excludeSwitches", ["enable-automation"])
    # opt.add_experimental_option('excludeSwitches', ['enable-logging'])
    # opt.add_experimental_option('useAutomationExtension', False)
    # opt.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
    # chromedriver = r'D:\chrome62\cmdvip\chromedriver.exe'
    # driver = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
    headers = {
        'Host': 'www.hubei.gov.cn',
        'Proxy-Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Referer': 'http://www.hubei.gov.cn/zfwj/ezbf/202303/t20230303_4569220.shtml',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '924omrTVcFchT=0ydF5mX9FkIQDMfAAr4A60Yt6sHsXZOzTlm30NLRHm2OwX_YgXaMFBUe3WeNORSf0ZqYHjvBxVL5CXNSWoCOOThArMBpBDzXVdWxVIoA5YBGBLbPUN4CbcQQLZEty.w1MZkgI1pn30uv5STvyCsHLoYGTDHDSIbaURf4XIXzC3fNhxDX.nR5ZWV_HBo9ZAyC5I93.otc4vf7nD6v3Tympw6h2ZUuyAJ0Q7Nes3n0dIB_BIhwCkjyvJibUZt04ggU6XeXnS.qXr2CaM8BJQQ4mdLJ5apGqInkYuNv2GJP1AvL',
    }
    durl = detailmsg['durl']
    try:
        dhmsg = reqbase.reqGetHtml(durl, headers)
        if dhmsg:
            soup = BeautifulSoup(dhmsg, 'html.parser')
            soup = reqbase.paserUrl(str(soup), durl)
            # Note: the original read all five meta fields from div:nth-child(1);
            # the indices below assume the meta items appear in document order.
            idx_num = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(1)')[1]
            class_type = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(2)')[1]
            dplay_gov = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(3)')[1]
            pub_date = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(4)')[1]
            fileNum = soupPaserHtml(soup, 'div[class="hbgov-article-meta"]>div:nth-child(5)')[1]
            contentWithTag, content = soupPaserHtml(soup, 'div[class="hbgov-article-content"]')
            detailmsg['contentWithTag'] = contentWithTag
            detailmsg['content'] = content
            detailmsg['idx_num'] = idx_num
            detailmsg['class_type'] = class_type
            detailmsg['dplay_gov'] = dplay_gov
            detailmsg['pub_date'] = pub_date
            detailmsg['fileNum'] = fileNum
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
# Shared request and parsing helpers used by the site-specific collectors
import json
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import xlsxwriter
import openpyxl

import BaseCore

baseCore = BaseCore.BaseCore()
def reqGetHtml(url, header):
    # Retry up to three times, taking a fresh proxy on each attempt
    hcont = ''
    for i in range(0, 3):
        try:
            proxy = baseCore.get_proxy()
            response = requests.get(url=url, headers=header, proxies=proxy, verify=False, timeout=10)
            response.encoding = response.apparent_encoding
            hcont = response.text
            if hcont:
                break
        except Exception:
            hcont = ''
    return hcont


def reqPostHtml(url, header, data):
    # POST either a raw string body or a dict serialized as JSON
    hcont = ''
    for i in range(0, 3):
        try:
            proxy = baseCore.get_proxy()
            if isinstance(data, str):
                res = requests.post(url=url, data=data, headers=header, proxies=proxy, verify=False, timeout=10)
            else:
                res = requests.post(url=url, data=json.dumps(data), headers=header, verify=False, timeout=10)
            hcont = res.text
            if hcont:
                break
        except Exception:
            hcont = ''
    return hcont


def reqPostStrHtml(url, header, data):
    hcont = ''
    for i in range(0, 3):
        try:
            res = requests.post(url=url, data=data, headers=header, verify=False, timeout=10)
            hcont = res.text
            if hcont:
                break
        except Exception:
            hcont = ''
    return hcont
def createDriver():
    # Note: the original assigned these two paths the other way around;
    # Service() needs the chromedriver executable, binary_location the browser.
    chrome_driver = r'D:\chrome\chromedriver.exe'
    path = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    # Optional proxy settings
    # proxy = "127.0.0.1:8080"  # proxy host and port
    # chrome_options.add_argument('--proxy-server=http://' + proxy)
    driver = webdriver.Chrome(service=path, options=chrome_options)
    return driver
# Convert relative URLs in the HTML into absolute URLs
def paserUrl(html, listurl):
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')
    # Collect all <a> and <img> tags
    links = html.find_all(['a', 'img'])
    # Rewrite relative href/src attributes against the page URL
    for link in links:
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        elif 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return html
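# Usage sketch for paserUrl (hypothetical URLs, for illustration only):
#
#     html = '<a href="/zwgk/doc.html">doc</a><img src="img/x.png"/>'
#     fixed = paserUrl(html, 'https://www.example.gov.cn/list/')
#     # href -> https://www.example.gov.cn/zwgk/doc.html
#     # src  -> https://www.example.gov.cn/list/img/x.png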
def pdwriterXLS(dlist, siteName):
    # Dump the collected records to <siteName>.xlsx via pandas/xlsxwriter
    df_out = pd.DataFrame(data=dlist)
    df_out.to_excel(siteName + '.xlsx', engine='xlsxwriter', index=False)
# _*_ coding:utf-8 _*_
"""
Collection workflow:
1. Build the list-page request URL
2. Parse and clean the detail-page content
3. Deduplicate collected items by URL
4. Define the output fields for the export file
5. Fetch content with requests or selenium
"""
import json
import redis
from bs4 import BeautifulSoup

import reqbase
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
rr = baseCore.r
def getList():
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Content-Length': '185',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin': 'https://sheng.so-gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://sheng.so-gov.cn/',
        'Sec-Ch-Ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'Suid': 'cf354a807a13d634f76bf167610f9c07',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    dlist = []
    pagenum = 4
    for i in range(3, pagenum):
        log.info(f'Yunnan: fetching list page {i}')
        lurl = 'https://api.so-gov.cn/s'
        data = f'siteCode=5300000033&tab=zcwj&timestamp=1699525503095&wordToken=72df37fd2f1058524e0c7467610d9ab7&page={i}&pageSize=20&qt=REITs&timeOption=0&sort=dateDesc&keyPlace=0&fileType=&toolsStatus=1'
        lcont = reqbase.reqPostHtml(lurl, header, data)
        if lcont:
            try:
                data = json.loads(lcont)
                datas = data['data']['search']['searchs']
                for myValues in datas:
                    lmsg = myValues['myValues']
                    try:
                        title = lmsg['DRETITLEO']
                        subtitle = ''
                        summary = lmsg['QUICKDESCRIPTION']
                        createDate = ''
                        writeDate = ''
                        pubDate = ''
                        source = lmsg['WEBSITENAME']
                        durl = lmsg['URL']
                        wenjianhao = lmsg['C3']
                        suoyinhao = ''
                        content = ''
                        siteweb = '云南省人民政府'
                    except Exception:
                        continue
                    detailmsg = {
                        'title': title,
                        'subtitle': subtitle,
                        'summary': summary,
                        'createDate': createDate,
                        'writeDate': writeDate,
                        'pubDate': pubDate,
                        'source': source,
                        'durl': durl,
                        'content': content,
                        'siteweb': siteweb,
                        'wenjianhao': wenjianhao,
                        'suoyinhao': suoyinhao,
                    }
                    is_member = rr.sismember('reis_yngov', durl)
                    if is_member:
                        continue
                    detailmsg = paserdetail(detailmsg)
                    dlist.append(detailmsg)
                    rr.sadd('reis_yngov', durl)
            except Exception as e:
                log.info(f'List parsing error: {e}')
    reqbase.pdwriterXLS(dlist, '云南人民政府-政策2')
def paserdetail(detailmsg):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '_gscu_802487706=99519044sn0ozi20; _gscbrs_802487706=1; Hm_lvt_b9099e95d08017e30f6285a8b55eb822=1699519045; TrsAccessMonitor=TrsAccessMonitor-1699519056000-2819180807; _gscs_802487706=995190442fewym20|pv:2; Hm_lpvt_b9099e95d08017e30f6285a8b55eb822=1699519714',
        'Host': 'www.yn.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://sheng.so-gov.cn/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    durl = detailmsg['durl']
    dhmsg = reqbase.reqGetHtml(durl, headers)
    try:
        log.info(f'Detail page title: {detailmsg["title"]}')
        log.info(f'Detail request URL: {durl}')
        soup = BeautifulSoup(dhmsg, 'html.parser')
        soup = reqbase.paserUrl(str(soup), durl)
        suoyinhao = soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(1)>dd')[1]
        wenjianhao = soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(2)>dd')[1]
        pubDate = soupPaserHtml(soup, 'div[class="referencebox"]>dl:nth-child(4)>dd')[1]
        contentWithTag, content = soupPaserHtml(soup, 'div[class="trs_editor_view TRS_UEDITOR trs_paper_default trs_web"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="content"]')
        if not content:
            contentWithTag, content = soupPaserHtml(soup, 'div[class="view TRS_UEDITOR trs_paper_default trs_external trs_web trs_key4format"]')
        if not content:
            log.info(f'Detail content empty: {durl}')
        detailmsg['contentWithTag'] = contentWithTag
        detailmsg['content'] = content
        detailmsg['suoyinhao'] = suoyinhao
        detailmsg['wenjianhao'] = wenjianhao
        detailmsg['pubDate'] = pubDate
    except Exception as e:
        log.info(f'Detail parsing error: {e}')
    return detailmsg


def soupPaserHtml(soup, csstag):
    try:
        tagmsg = soup.select(csstag)[0]
        tagmsgtext = tagmsg.text
    except Exception as e:
        tagmsg = ''
        tagmsgtext = ''
        log.info(f'Tag parsing error: {e}')
    return tagmsg, tagmsgtext


if __name__ == '__main__':
    getList()
Index
An index is a collection of documents of the same kind, comparable to a table in a relational database. An index is identified by its name, which must be lowercase.
Document
A single record in an index is called a document, equivalent to a row in a relational table.
Field
A field of the JSON structure, equivalent to a column in a relational table.
Mapping
A mapping sets the rules and constraints for how data is handled: a field's data type, default value, analyzer, whether it is indexed, and so on.
Shards
An index can hold more data than a single node's hardware allows, analogous to splitting a table.
ES can divide an index into multiple pieces; each piece is called a shard.
You can specify the number of shards when creating an index.
Sharding lets the content volume be split and scaled horizontally, and lets operations run distributed and in parallel across shards, raising performance and throughput.
Replicas
Replicas provide high availability when a shard or node fails.
Crucially, a replica shard is never placed on the same node as its primary shard.
Replicas also scale search volume and throughput, since searches can run on all replicas in parallel.
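To see shards and replicas in practice, a small sketch (assuming an elasticsearch-py 7.x client against the same node used below) reads an index's settings and the cluster health:

from elasticsearch import Elasticsearch

es = Elasticsearch(["192.168.1.90:9200"])
# Primary shard / replica counts configured on the index
settings = es.indices.get_settings(index='test_data')
print(settings['test_data']['settings']['index']['number_of_shards'])
print(settings['test_data']['settings']['index']['number_of_replicas'])
# Cluster health summarises active and unassigned shards across nodes
print(es.cluster.health())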
from elasticsearch import Elasticsearch

# Connect to ES
es = Elasticsearch(["192.168.1.90:9200"],
                   sniff_on_start=True,            # sniff the cluster before first use
                   sniff_on_connection_fail=True,  # refresh nodes when one stops responding
                   sniffer_timeout=60)             # sniffer timeout in seconds
index_name = 'test_data'


def main():
    # Connect to ES
    es = Elasticsearch(["192.168.1.90:9200"],
                       sniff_on_start=True,            # sniff the cluster before first use
                       sniff_on_connection_fail=True,  # refresh nodes when one stops responding
                       sniffer_timeout=60)             # sniffer timeout in seconds


if __name__ == '__main__':
    main()
# Create an index
def create_index():
    # Define the mapping body
    body_index = {
        'mappings': {
            'properties': {
                'name': {
                    'type': 'keyword'
                },
                'age': {
                    'type': 'long'
                },
                'tags': {
                    'type': 'text'
                }
            }
        },
        'settings': {
            'index': {
                'number_of_shards': '3',
                'number_of_replicas': '0'
            }
        }
    }
    # Create the index (ignore 400 if it already exists)
    res = es.indices.create(index=index_name, body=body_index, ignore=400)
def insert_data():
    person1 = {
        'name': '张三',
        'age': 18,
        'tags': '勤奋学习十载寒窗,凿壁借光,囊萤映雪,手不释卷,有良好的表达能力。有耐心心态好,善于维系客户关系。果断热情勇敢孤僻活力,思想成熟能够独立工作。'
    }
    res = es.index(index=index_name, body=person1)


from elasticsearch import helpers


def insert_batch():
    insert_infos = []
    person2 = {
        '_index': index_name,
        'name': '李四',
        'age': 20,
        'tags': '有极强的领导艺术,公正严明铁面无私,公私分明。关心他人无微不至,体贴入微。精力充沛,并有很强的事业心。气吞山河正气凛然,善于同各种人员打交道。'
    }
    person3 = {
        '_index': index_name,
        'name': '王五',
        'age': 19,
        'tags': '尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。'
    }
    insert_infos.append(person2)
    insert_infos.append(person3)
    helpers.bulk(client=es, actions=insert_infos)
def del_index():
    # Delete the index
    res = es.indices.delete(index=index_name, ignore=[400])


def del_doc_byid():
    # Delete by id
    res = es.delete(index=index_name, id='bKTgXYUBfH4USN9RFMOh')


def del_by_condition():
    # Delete by query
    body = {
        'query': {
            'match': {
                'name': '张三'
            }
        }
    }
    res = es.delete_by_query(index=index_name, body=body, ignore=[400, 404])
# index() does two things: it inserts the document if it does not exist, otherwise it updates it.
# When updating via index(), the body must contain every field; any field left out is cleared.
def index_update_doc():
    body = {
        'name': '王五',
        'age': 19,
        'tags': '尊敬师长团结同学,乐于助人学习勤奋,用心向上,用心参加班级学校组织的各种课内外活动。用心开展批评与自我批评。'
    }
    res = es.index(index=index_name, id='baTgXYUBfH4USN9RFMOh', body=body)


def update_doc():
    # update() patches only the fields listed under 'doc'
    body = {
        'doc': {
            'name': '王五'
        }
    }
    es.update(index=index_name, id='baTgXYUBfH4USN9RFMOh', body=body)


def select_info():
    # Inspect index metadata in ES
    index_info = es.indices.get('*')
    # Index names
    index_names = index_info.keys()
    index_name = 'es_index'
    print(es.indices.exists(index_name))
    doc_count = es.count(index=index_name)
def query_by_id():
    body = {
        'query': {
            'match': {
                '_id': 'baTgXYUBfH4USN9RFMOh'
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_field():
    body = {
        'query': {
            'match': {
                'age': 20
            }
        },
        '_source': ['name', 'tags']
    }
    res = es.search(index=index_name, body=body)


def query_by_sort():
    body = {
        'sort': {
            'age': {
                'order': 'desc'  # asc: ascending, desc: descending
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_range():
    body = {
        'query': {
            'range': {
                'age': {
                    'gt': 18,
                    'lte': 20
                }
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_page():
    body = {
        'sort': {
            'age': {
                'order': 'desc'  # asc: ascending, desc: descending
            }
        },
        'from': 0,
        'size': 1
    }
    res = es.search(index=index_name, body=body)


def query_by_phrase():
    body = {
        "query": {
            "match_phrase": {
                "tags": "耐心"
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_mult():
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "name": "张三"
                        }
                    },
                    {
                        "match_phrase": {
                            "tags": "耐心"
                        }
                    }
                ]
            }
        }
    }
    res = es.search(index=index_name, body=body)


def query_by_not():
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "name": "王五"
                        }
                    }
                ],
                'must_not': [
                    {
                        "match_phrase": {
                            "tags": "耐心"
                        }
                    }
                ]
            }
        }
    }
    res = es.search(index=index_name, body=body)
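Each query helper above leaves the raw response in res without reading it; a minimal sketch of pulling the hits back out (assuming the ES 7.x response shape):

def print_hits(res):
    # Total number of matching documents
    print(res['hits']['total']['value'])
    # Each hit carries its id, score and the stored _source document
    for hit in res['hits']['hits']:
        print(hit['_id'], hit['_score'], hit['_source'])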