习讲话数据库标题修改

159f6105 · 薛凌堃 · 2ea9c487 · 159f6105 · 159f6105
--- a/习近平讲话/datasfromes.py
+++ b/习近平讲话/datasfromes.py
 """
@@ -39,6 +39,7 @@ class EsMethod(object):
                      'hits.hits._source.title',
                      'hits.hits._source.origin',
                      'hits.hits._source.publishDate',
+                      'hits.hits._source.sourceAddress',
                      ]  # 字段2
       result = self.es.search(index=index_name
                               , doc_type='_doc'
@@ -68,9 +69,10 @@ if __name__ == '__main__':
           title = mms['_source']['title']
           origin = mms['_source']['origin']
           pub_time = mms['_source']['publishDate']
+           sourceAddress = mms['_source']['sourceAddress']
           try:
-               log.info(f'{id}--{title}--{origin}--')
+               log.info(f'{id}--{title}--{origin}-{sourceAddress}-')
-               item = id + "|" + pub_time
+               item = id + "|" + pub_time + "|" + title + "|" + origin + "|" + sourceAddress
               # r.lrem(f'XJPdatabase:id_2', 0, item)
               r.lpush(f'XJPdatabase:id', item)
           except:

--- a/习近平讲话/update_title.py
+++ b/习近平讲话/update_title.py
+import time
+import time
+import redis
+import requests
+from elasticsearch import Elasticsearch
+from pyquery import PyQuery as pq
+# Module-level Elasticsearch client, created as soon as this file is loaded;
+# updateaunn() sends its update requests through it.
+# NOTE(review): the endpoint and the credentials are hard-coded here —
+# consider loading them from configuration or the environment instead.
+es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
+def requestTitle(href, title):
+    """Download an article page and build its full title from the page headings.
+
+    This function depends on three module-level names that are only bound
+    when the file runs as a script: ``headers`` (HTTP request headers),
+    ``r`` (the Redis client) and ``item`` (the queue entry being processed).
+
+    :param href: URL of the article page to download.
+    :param title: title currently stored for the article; not used here.
+    :return: the ``h2``, ``h1`` and ``h3`` texts found under
+        ``.d2txt.clearfix`` joined by single spaces, or ``False`` when both
+        the ``h2`` and the ``h3`` text are empty, or when downloading or
+        parsing the page failed.
+    """
+    try:
+        # An explicit timeout keeps one unresponsive server from blocking the
+        # whole queue; certificate verification stays disabled as before.
+        href_text = requests.request("GET", href, headers=headers, verify=False, timeout=30).content
+        # Short pause between consecutive page downloads.
+        time.sleep(0.2)
+        doc_href = pq(href_text)
+        # Find the heading parts of the page and join them.
+        title1 = doc_href('.d2txt.clearfix h2').text()
+        title2 = doc_href('.d2txt.clearfix h1').text()
+        title3 = doc_href('.d2txt.clearfix h3').text()
+        # Without an h2 or an h3 text there is nothing to add to the title.
+        if title1 == '' and title3 == '':
+            return False
+        return title1 + ' ' + title2 + ' ' + title3
+    except Exception as e:
+        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
+        # interpreter shutdown are no longer swallowed here.
+        print('请求错误2', repr(e))
+        # The entry goes back to the tail of the queue it was popped from,
+        # so a URL that always fails is retried on every pass of the caller.
+        r.rpush('XJPdatabase:id', item)
+        return False
+def updateaunn(index_name,id,u_title):
+    """Overwrite the ``title`` field of one Elasticsearch document.
+
+    :param index_name: name of the index that holds the document.
+    :param id: identifier of the document to update.
+    :param u_title: new title; it is converted with ``str`` and stored as a
+        one-element list.
+    """
+    # Partial-document payload: only the ``title`` field is sent.
+    partial_doc = {'doc': {'title': [str(u_title)]}}
+    response = es.update(index=index_name, id=id, body=partial_doc)
+    print('更新结果:%s' % response)
+if  __name__ == '__main__':
+    # Drain the Redis list 'XJPdatabase:id' and refresh the stored title of
+    # every queued 2020 article from its source page.
+    #
+    # ``r``, ``headers`` and ``item`` must keep these exact names:
+    # requestTitle() reads all three as module-level globals.
+    r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+    headers = {
+        'Proxy-Connection': 'keep-alive',
+        'Accept': '*/*',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
+        'X-Requested-With': 'XMLHttpRequest',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cookie': 'sfr=1; sso_c=0; __jsluid_h=5b9f09f6fdae46fadb89e1e02dca3238; wdcid=04fccdf5121158c0; wdses=72d07de4316a36a5; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg; wdlast=1646734820; ci_session=4irerhndk5mc48dq7ldebjn5m47fqptg'
+    }
+    while True:
+        try:
+            raw_item = r.lpop('XJPdatabase:id')
+            # lpop yields None once the list is empty; map that to '' so the
+            # loop ends exactly as it did before.
+            item = '' if raw_item is None else raw_item.decode()
+        except (redis.RedisError, UnicodeDecodeError) as e:
+            # Still stop the run on a failed read, but report why instead of
+            # silently treating the failure as an empty queue.
+            print('读取队列失败', repr(e))
+            item = ''
+        else:
+            if raw_item is not None:
+                print(item)
+        if item == '':
+            break
+
+        # Expected layout: id|publishDate|title|origin|sourceAddress
+        fields = item.split('|')
+        if len(fields) < 5:
+            # Entries queued in the older two-field "id|publishDate" layout
+            # carry no URL. Skip them instead of crashing with IndexError;
+            # they are not requeued because that would loop forever.
+            print('队列数据格式错误,已跳过', item)
+            continue
+        doc_id = fields[0]
+        pub_time = fields[1]
+        title = fields[2]
+        href = fields[4]
+
+        # Only articles published in 2020 are handled here; everything else
+        # is parked on a separate list.
+        if pub_time[:4] != '2020':
+            r.rpush('XJPdatabase:id_other',item)
+            continue
+
+        u_title = requestTitle(href, title)
+        if u_title:
+            updateaunn('researchreportdata', doc_id, u_title)