提交 598533dd 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

"""
"""
......@@ -39,6 +39,7 @@ class EsMethod(object):
'hits.hits._source.title',
'hits.hits._source.origin',
'hits.hits._source.publishDate',
'hits.hits._source.sourceAddress',
] # 字段2
result = self.es.search(index=index_name
, doc_type='_doc'
......@@ -68,9 +69,10 @@ if __name__ == '__main__':
title = mms['_source']['title']
origin = mms['_source']['origin']
pub_time = mms['_source']['publishDate']
sourceAddress = mms['_source']['sourceAddress']
try:
log.info(f'{id}--{title}--{origin}--')
item = id + "|" + pub_time
log.info(f'{id}--{title}--{origin}-{sourceAddress}-')
item = id + "|" + pub_time + "|" + title + "|" + origin + "|" + sourceAddress
# r.lrem(f'XJPdatabase:id_2', 0, item)
r.lpush(f'XJPdatabase:id', item)
except:
......
import time
import time
import redis
import requests
from elasticsearch import Elasticsearch
from pyquery import PyQuery as pq
# Module-level Elasticsearch client; updateaunn() sends its update calls through it.
# NOTE(review): the endpoint and credentials are hard-coded in source — consider
# loading them from configuration or the environment instead.
es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
def requestTitle(href, title, timeout=30):
    """Fetch an article page and build its full headline from the page headings.

    The texts of the ``h2``, ``h1`` and ``h3`` elements found under
    ``.d2txt.clearfix`` are joined, in that order, with single spaces.

    Relies on module-level names set up by the script entry point:
    ``headers`` (HTTP request headers), ``r`` (Redis client) and ``item``
    (the queue record currently being processed).

    Args:
        href: URL of the article page to download.
        title: Currently stored title. Kept for interface compatibility;
            it is not used when building the result.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without it a stalled connection would block forever.

    Returns:
        The joined headline string, or ``False`` when both the ``h2`` and
        ``h3`` texts are empty, or when downloading/parsing fails. On
        failure the current ``item`` is pushed back onto the tail of the
        ``XJPdatabase:id`` list so it can be retried later.
    """
    try:
        href_text = requests.request(
            "GET", href, headers=headers, verify=False, timeout=timeout
        ).content
        # Brief pause after every request (presumably to throttle the crawl).
        time.sleep(0.2)
        doc_href = pq(href_text)
        # Locate the heading parts that get concatenated into the final title.
        title1 = doc_href('.d2txt.clearfix h2').text()
        title2 = doc_href('.d2txt.clearfix h1').text()
        title3 = doc_href('.d2txt.clearfix h3').text()
        # Only h2 and h3 are checked: with both empty there is nothing to
        # add around the h1 text, so no update is reported.
        if title1 == '' and title3 == '':
            return False
        return title1 + ' ' + title2 + ' ' + title3
    except Exception:
        # `Exception` instead of a bare `except:` so that Ctrl-C and
        # SystemExit still stop the script instead of being swallowed here.
        print('请求错误2')
        r.rpush('XJPdatabase:id', item)
        return False
def updateaunn(index_name, id, u_title):
    """Overwrite the ``title`` field of one Elasticsearch document.

    Sends a partial-document update through the module-level ``es`` client
    and prints the response returned by the client.

    Args:
        index_name: Name of the index that holds the document.
        id: Identifier of the document to update.
        u_title: New title; it is converted with ``str`` and stored as a
            single-element list.
    """
    partial_doc = {'doc': {'title': [str(u_title)]}}
    response = es.update(index=index_name, id=id, body=partial_doc)
    print('更新结果:%s' % response)
if __name__ == '__main__':
    # Drain the Redis list 'XJPdatabase:id'. Each record has the layout
    # "id|publishDate|title|origin|sourceAddress". Records published in 2020
    # have their page fetched and their title rewritten in Elasticsearch;
    # all other records are moved to 'XJPdatabase:id_other'.
    #
    # `r`, `headers` and `item` are deliberately module-level names:
    # requestTitle() reads them.
    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
    headers = {
        'Proxy-Connection': 'keep-alive',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
    }
    while True:
        # lpop returns None once the list is empty. Checking for that
        # explicitly (instead of a bare `except:` around `.decode()`) keeps
        # connection or decoding errors from being mistaken for "queue empty".
        raw_item = r.lpop('XJPdatabase:id')
        if raw_item is None:
            break
        item = raw_item.decode()
        print(item)
        # An empty record also terminates the run.
        if item == '':
            break

        # Split once and pick the fields by position.
        # NOTE(review): a record with fewer than five '|'-separated fields
        # raises IndexError here — confirm no short records are queued.
        fields = item.split('|')
        doc_id, pub_time, title, href = fields[0], fields[1], fields[2], fields[4]

        # Only 2020 records are processed; park everything else.
        if pub_time[:4] != '2020':
            r.rpush('XJPdatabase:id_other', item)
            continue

        u_title = requestTitle(href, title)
        if u_title:
            updateaunn('researchreportdata', doc_id, u_title)
import csv
import csv
import redis
# Redis connection holding the 'XJPdatabase:id' list that is exported below.
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
if __name__ == "__main__":
    # Pop every record from 'XJPdatabase:id' and write it to ./title1.csv,
    # one CSV row per record, with the '|'-separated parts as columns.
    # Popping removes the records from Redis as they are exported.
    with open('./title1.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        while True:
            # lpop returns None once the list is empty. Testing for that
            # explicitly (rather than a bare `except:` around `.decode()`)
            # keeps Redis or decoding errors from silently ending the export.
            raw_term = r.lpop('XJPdatabase:id')
            if raw_term is None:
                break
            term = raw_term.decode()
            # An empty record also terminates the export.
            if term == '':
                break
            # Write the record's fields as one row.
            writer.writerow(term.split('|'))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论