提交 636a7138 作者: 薛凌堃

习近平讲话标题处理

上级 632d5a17
"""
"""
从es中拿到所有的标题
"""
import redis
from elasticsearch import Elasticsearch
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
class EsMethod(object):
    '''Small helper around an Elasticsearch client for paging through the documents of one subject.'''

    def __init__(self):
        '''Create the Elasticsearch client and store the default index name.'''
        # Create the Elasticsearch client with the account information.
        # NOTE(review): host and credentials are hard-coded in source; consider
        # moving them to configuration before this is shared further.
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'subjectdatabase'

    def queryatt(self, index_name, pnum, subject_id="1534423014825668610", size=200):
        '''Run one page of the subject query and return the raw search response.

        Args:
            index_name: name of the index to search.
            pnum: value sent as the query's "from" offset (number of hits to skip).
            subject_id: value matched against the "subjectId" field; defaults to
                the id this query was originally hard-coded with.
            size: maximum number of hits per page; defaults to the original 200.

        Returns:
            Whatever ``self.es.search`` returns. ``filter_path`` limits the
            response to the hit ids, the total hit count, and the title,
            origin and publishDate source fields.
        '''
        body = {
            "query": {
                "match": {
                    "subjectId": subject_id
                }
            },
            # Newest documents first.
            "sort": [
                {
                    "publishDate": {
                        "order": "desc"
                    }
                }
            ],
            "track_total_hits": True,
            "size": size,
            "from": pnum
        }
        # Only these parts of the response are requested back.
        filter_path = [
            'hits.hits._id',
            'hits.total.value',
            'hits.hits._source.title',
            'hits.hits._source.origin',
            'hits.hits._source.publishDate',
        ]
        # NOTE(review): doc_type is deprecated in newer Elasticsearch releases;
        # kept as-is here - confirm the cluster/client version before removing it.
        result = self.es.search(
            index=index_name,
            doc_type='_doc',
            filter_path=filter_path,
            body=body,
        )
        return result
if __name__ == '__main__':
    # Page through the subject's documents in Elasticsearch and push
    # "<document id>|<publish date>" entries onto the Redis list 'XJPdatabase:id'.
    es_method = EsMethod()
    # Connect to Redis.
    # NOTE(review): host and password are hard-coded in source; consider
    # moving them to configuration.
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
    # 56 pages of 200 hits each.
    # NOTE(review): with from/size paging, Elasticsearch by default rejects
    # requests where from + size exceeds index.max_result_window (10000) -
    # confirm the index setting, or switch to search_after/scroll.
    for i in range(56):
        result = es_method.queryatt('subjectdatabase', i * 200)
        try:
            hits_section = result['hits']
            total = hits_section['total']['value']
            msglist = hits_section['hits']
        except (KeyError, TypeError):
            # filter_path drops 'hits.hits' from the response when a page has
            # no hits, so a missing key is logged and the page is skipped.
            log.info(f'error-----{result}')
            continue
        log.info(f'---第{i}页{len(msglist)}条数据----共{total}条数据----')
        for mms in msglist:
            doc_id = mms['_id']
            source = mms['_source']
            title = source['title']
            origin = source['origin']
            pub_time = source['publishDate']
            try:
                log.info(f'{doc_id}--{title}--{origin}--')
                item = doc_id + "|" + pub_time
                # r.lrem(f'XJPdatabase:id_2', 0, item)
                r.lpush('XJPdatabase:id', item)
            except Exception as exc:
                # Best effort: keep going with the next document, but leave a
                # trace instead of dropping the failure silently. Catching
                # Exception (not a bare except) lets Ctrl-C stop the script.
                log.info(f'push failed-----{doc_id}-----{exc}')
                continue
"""
"""
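Post-process the titles:
1. Remove spaces from the titles (the only step currently active below).
2. Exact de-duplication (currently commented out).
3. Similarity-based de-duplication (currently commented out).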
"""
# Load the title data from the Excel workbook into a DataFrame
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Read the workbook of titles and print it for a quick visual check.
df = pd.read_excel('./test2.xlsx')
print(df)
# Build 'title_1': each title with every space character removed.
_titles_without_spaces = df['title'].str.replace(' ', '')
df['title_1'] = _titles_without_spaces
print(df['title_1'])
#精确去重
# df_drop = df.drop_duplicates(subset=['title'], keep='first')
# duplicates = df[df.duplicated('title_1', keep=False)]['title_1']
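# Similarity-based de-duplication (disabled). Note: the code below uses TF-IDF vectors with cosine distance, not Jaccard similarity.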
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),max_features=1000)
# tfidf_matrix = vectorizer.fit_transform(df['title'])
#
# dist = 1 - cosine_similarity(tfidf_matrix)
#
# df['similar'] = dist.mean(axis=1)
#
# df_drop = df.drop_duplicates(subset=['title'],keep='last')
# df_drop.to_csv('D:/data/titles_drop.csv',index=False)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论