提交 c1cac75e 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

...@@ -397,7 +397,7 @@ def get_content2(): ...@@ -397,7 +397,7 @@ def get_content2():
if is_href: if is_href:
num+=1 num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
time.sleep(0.5) time.sleep(1)
continue continue
try: try:
resp = requests.get(url=href, headers=headers, verify=False) resp = requests.get(url=href, headers=headers, verify=False)
...@@ -663,7 +663,8 @@ def bei_jing(): ...@@ -663,7 +663,8 @@ def bei_jing():
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe') # bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe' chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe' chromedriver = r'D:\cmd100\chromedriver.exe'
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver) #bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
bro = webdriver.Chrome(options=chrome_options, executable_path=chromedriver)
with open('../../base/stealth.min.js') as f: with open('../../base/stealth.min.js') as f:
js = f.read() js = f.read()
...@@ -1830,6 +1831,9 @@ def hai_nan(): ...@@ -1830,6 +1831,9 @@ def hai_nan():
href = 'http://gzw.hainan.gov.cn/zwgk_23509/' + href.replace('../../', '') href = 'http://gzw.hainan.gov.cn/zwgk_23509/' + href.replace('../../', '')
elif './' in href: elif './' in href:
href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/') href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
try:
is_href = db_storage.find_one({'网址': href.split('?')[0]})
except:
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1 num+=1
...@@ -1906,7 +1910,7 @@ def hai_nan(): ...@@ -1906,7 +1910,7 @@ def hai_nan():
pub_time = tbody_text.split('发文日期:')[1].split('名  称:')[0].strip().lstrip().replace('年', pub_time = tbody_text.split('发文日期:')[1].split('名  称:')[0].strip().lstrip().replace('年',
'-').replace( '-').replace(
'月', '-').replace('日', '') '月', '-').replace('日', '')
writtenDate = '' writtenDate = None
topicClassification = tbody_text.split('分  类:')[1].split('发文机关:')[0].strip().lstrip() topicClassification = tbody_text.split('分  类:')[1].split('发文机关:')[0].strip().lstrip()
contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'}) contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
content = contentWithTag.text content = contentWithTag.text
...@@ -1963,7 +1967,7 @@ def hai_nan(): ...@@ -1963,7 +1967,7 @@ def hai_nan():
0].strip().lstrip() 0].strip().lstrip()
pub_source = '' pub_source = ''
pub_hao = '' pub_hao = ''
writtenDate = '' writtenDate = None
topicClassification = '' topicClassification = ''
contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'}) contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
content = contentWithTag.text content = contentWithTag.text
...@@ -2018,6 +2022,9 @@ def hai_nan(): ...@@ -2018,6 +2022,9 @@ def hai_nan():
title = str(doc_item).split('target="_blank">')[1].split('</a>')[0] title = str(doc_item).split('target="_blank">')[1].split('</a>')[0]
href = 'https://www.hainan.gov.cn' + str(doc_item).split('href="')[1].split('" target')[0] href = 'https://www.hainan.gov.cn' + str(doc_item).split('href="')[1].split('" target')[0]
# print(title,href) # print(title,href)
try:
is_href = db_storage.find_one({'网址': href.split('?')[0]})
except:
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1 num+=1
......
import json
"""
Elasticsearch 安装
pip install elasticsearch==7.8.1 版本的
使用时参考文章
https://blog.csdn.net/yangbisheng1121/article/details/128528112
https://blog.csdn.net/qiuweifan/article/details/128610083
"""
from elasticsearch import Elasticsearch
class EsMethod(object):
    """Small collection of example Elasticsearch calls against one cluster.

    Every method sends a single request through ``self.es``, prints the
    response and returns it.
    """

    def __init__(self):
        # Create the Elasticsearch client with basic-auth credentials.
        # NOTE(review): endpoint and credentials are hard-coded in source;
        # consider loading them from configuration or the environment.
        self.es = Elasticsearch(
            ['http://114.116.19.92:9700'],
            http_auth=('elastic', 'zzsn9988'),
            timeout=300,
        )
        self.index_name = 'researchreportdata'

    def _search(self, index_name, body, filter_path):
        """Run a search, print the response and return it.

        ``filter_path`` limits the response to the listed paths.
        """
        result = self.es.search(index=index_name,
                                doc_type='_doc',
                                filter_path=filter_path,
                                body=body)
        print(result)
        return result

    def match(self, index_name, pnum):
        """Analyzed (tokenized) match on title, origin and type.

        A ``match`` clause accepts exactly one field, so the three
        conditions are combined with ``bool``/``must`` instead of being put
        into a single ``match`` object (which Elasticsearch rejects).

        :param index_name: index to search.
        :param pnum: offset of the first hit (``from``); page size is 20.
        :return: the filtered search response.
        """
        body = {
            'query': {
                'bool': {
                    'must': [
                        {'match': {'title': '.pdf'}},
                        {'match': {'origin': '雪球网'}},
                        {'match': {'type': '1'}},
                    ]
                }
            },
            'from': pnum,
            'size': 20,
        }
        filter_path = ['hits.hits._source.title',
                       'hits.hits._source.id']
        return self._search(index_name, body, filter_path)

    def match_phrase(self, index_name):
        """Phrase match on ``m_ext1`` (the text must occur as one phrase).

        :param index_name: index to search.
        :return: the filtered search response.
        """
        body = {
            'query': {
                'match_phrase': {
                    'm_ext1': 'XXXXXX'  # keyword
                }
            }
        }
        filter_path = ['hits.hits._source.title',
                       'hits.hits._source.id',
                       'hits.hits._source.sourceAddress',
                       'hits.hits._source.publishDate']
        return self._search(index_name, body, filter_path)

    def term(self, index_name):
        """Exact-value query on ``m_slhm``.

        :param index_name: index to search.
        :return: the filtered search response.
        """
        body = {
            'query': {
                'term': {
                    'm_slhm': 'XXXXXX'
                }
            }
        }
        filter_path = ['hits.hits._source.m_ext1',
                       'hits.hits._source.m_ext2']
        return self._search(index_name, body, filter_path)

    def terms(self, index_name):
        """Exact-value query on ``m_slhm`` accepting any of several values.

        :param index_name: index to search.
        :return: the filtered search response.
        """
        body = {
            'query': {
                'terms': {
                    'm_slhm': ['13XXXXXX ', '13XXXXXX']
                }
            }
        }
        filter_path = ['hits.hits._source.m_ext1',
                       'hits.hits._source.m_slhm']
        return self._search(index_name, body, filter_path)

    def multi_must(self, index_name):
        """Several exact-value conditions combined with AND (``bool``/``must``).

        :param index_name: index to search.
        :return: the filtered search response.
        """
        body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'m_slhm': '13XXXXXXX'}},
                        {'terms': {'m_slhm': ['13XXXXXX']}},
                    ]
                }
            }
        }
        filter_path = ['hits.hits._source.m_ext1',
                       'hits.hits._source.m_slhm']
        return self._search(index_name, body, filter_path)

    def update(self, index_name):
        """Partially update the sample document.

        The changed fields are wrapped in ``doc``, as the Update API
        requires (same shape as :meth:`updateaunn`).

        :param index_name: index holding the document.
        :return: the update response.
        """
        result = self.es.update(index=index_name,
                                id='20220901-XXXXX',
                                body={'doc': {'serialno': 'XXXXXX'}})
        print('更新结果:%s' % result)
        return result

    def add(self, index_name):
        """Index (create or replace) the sample document.

        :param index_name: index to write to.
        :return: the index response.
        """
        result = self.es.index(index=index_name,
                               id='20220901-XXXXXX',
                               body={'serialno': 'XXXXXX'})
        print('新增结果:%s' % result)
        return result

    def delete(self, index_name):
        """Delete the sample document.

        :param index_name: index holding the document.
        :return: the delete response.
        """
        result = self.es.delete(index=index_name,
                                doc_type="_doc",
                                id='20220901-XXXXXX')
        print('删除结果 %s' % result)
        return result

    def multi_should(self, index_name, pnum):
        """Find documents whose title matches '.pdf', with OR-style extras.

        NOTE(review): because a ``must`` clause is present, the ``should``
        clauses look optional here (no ``minimum_should_match``) and would
        only influence scoring -- confirm this is intended.

        :param index_name: index to search.
        :param pnum: offset of the first hit (``from``); up to 6000 hits
            are requested.
        :return: the filtered search response (ids, titles, total count).
        """
        body = {
            'query': {
                'bool': {
                    'should': [
                        {'term': {'origin': '雪球网'}},
                        {'term': {'type': 1}},
                    ],
                    'must': [
                        {'match': {'title': '.pdf'}}
                    ]
                }
            },
            'from': pnum,
            'size': 6000,
        }
        filter_path = ['hits.hits._source.title',
                       'hits.hits._source.id',
                       'hits.total.value']
        return self._search(index_name, body, filter_path)

    def updateaunn(self, index_name, id, utitle):
        """Set the ``title`` field of one document.

        :param index_name: index holding the document.
        :param id: document id.
        :param utitle: new title value.
        :return: the update response.
        """
        body = {
            'doc': {
                'title': utitle
            }
        }
        result = self.es.update(index=index_name,
                                id=id,
                                body=body)
        print('更新结果:%s' % result)
        return result

    def getFileds(self, index_name):
        """Print and return the field names from the index mapping.

        :param index_name: index whose mapping is read.
        :return: the keys of the mapping's ``properties`` section.
        """
        mapping = self.es.indices.get_mapping(index=index_name)
        fields = mapping[index_name]['mappings']['properties'].keys()
        print(fields)
        return fields
if __name__ == '__main__':
    # Remove the '.pdf' text from the title of every document returned by
    # EsMethod.multi_should and write the cleaned title back.
    esMethod = EsMethod()
    # esMethod.getFileds(index_name=esMethod.index_name)
    num = 1
    for pnum in range(0, num):
        # NOTE(review): the offset advances by 20 per page while
        # multi_should requests up to 6000 hits -- confirm before raising num.
        p = pnum * 20
        print(f'第{pnum}页数据')
        result = esMethod.multi_should(index_name=esMethod.index_name, pnum=p)
        # The search uses filter_path, which omits empty sections, so the
        # inner 'hits' list can be absent when nothing matches.
        msglist = (result or {}).get('hits', {}).get('hits', [])
        print(msglist)
        for mms in msglist:
            doc_id = mms['_source']['id']
            title = mms['_source']['title']
            utitle = title.replace('.pdf', '')
            print(f'id:{doc_id}---title:{title}--utitle:{utitle}')
            esMethod.updateaunn(esMethod.index_name, str(doc_id), utitle)
            print('跟新成功!!')
import json
from elasticsearch import Elasticsearch
# Build the Elasticsearch client with basic-auth credentials.
# NOTE(review): endpoint and credentials are hard-coded in source.
es = Elasticsearch(
    ['http://114.116.19.92:9700'],
    http_auth=('elastic', 'zzsn9988'),  # (user, password)
)
index_name = 'basedata'

# Query: match on the sourceAddress field.
search_query = {'query': {'match': {'sourceAddress': 'www'}}}

res = es.search(index=index_name, body=search_query)

# Print the stored source of every returned hit.
matched_docs = res['hits']['hits']
for hit in matched_docs:
    print(hit['_source'])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论