Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
c1cac75e
提交
c1cac75e
authored
10月 27, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/master'
上级
b923d30f
90b0dc55
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
276 行增加
和
6 行删除
+276
-6
policy.py
comData/policylaw/policy.py
+13
-6
BaseCore.py
cpws/BaseCore.py
+0
-0
裁判文书网列表正文.py
cpws/裁判文书网列表正文.py
+0
-0
Esmethod.py
estool/Esmethod.py
+237
-0
esconn.py
estool/esconn.py
+26
-0
没有找到文件。
comData/policylaw/policy.py
浏览文件 @
c1cac75e
...
@@ -397,7 +397,7 @@ def get_content2():
...
@@ -397,7 +397,7 @@ def get_content2():
if
is_href
:
if
is_href
:
num
+=
1
num
+=
1
log
.
info
(
'已采集----------跳过'
)
log
.
info
(
'已采集----------跳过'
)
time
.
sleep
(
0.5
)
time
.
sleep
(
1
)
continue
continue
try
:
try
:
resp
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
resp
=
requests
.
get
(
url
=
href
,
headers
=
headers
,
verify
=
False
)
...
@@ -663,7 +663,8 @@ def bei_jing():
...
@@ -663,7 +663,8 @@ def bei_jing():
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
chrome_options
.
binary_location
=
r'D:\Google\Chrome\Application\chrome.exe'
chrome_options
.
binary_location
=
r'D:\Google\Chrome\Application\chrome.exe'
chromedriver
=
r'D:\cmd100\chromedriver.exe'
chromedriver
=
r'D:\cmd100\chromedriver.exe'
bro
=
webdriver
.
Chrome
(
chrome_options
=
chrome_options
,
executable_path
=
chromedriver
)
#bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
bro
=
webdriver
.
Chrome
(
options
=
chrome_options
,
executable_path
=
chromedriver
)
with
open
(
'../../base/stealth.min.js'
)
as
f
:
with
open
(
'../../base/stealth.min.js'
)
as
f
:
js
=
f
.
read
()
js
=
f
.
read
()
...
@@ -1830,7 +1831,10 @@ def hai_nan():
...
@@ -1830,7 +1831,10 @@ def hai_nan():
href
=
'http://gzw.hainan.gov.cn/zwgk_23509/'
+
href
.
replace
(
'../../'
,
''
)
href
=
'http://gzw.hainan.gov.cn/zwgk_23509/'
+
href
.
replace
(
'../../'
,
''
)
elif
'./'
in
href
:
elif
'./'
in
href
:
href
=
href
.
replace
(
'./'
,
'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/'
)
href
=
href
.
replace
(
'./'
,
'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/'
)
is_href
=
db_storage
.
find_one
({
'网址'
:
href
})
try
:
is_href
=
db_storage
.
find_one
({
'网址'
:
href
.
split
(
'?'
)[
0
]})
except
:
is_href
=
db_storage
.
find_one
({
'网址'
:
href
})
if
is_href
:
if
is_href
:
num
+=
1
num
+=
1
continue
continue
...
@@ -1906,7 +1910,7 @@ def hai_nan():
...
@@ -1906,7 +1910,7 @@ def hai_nan():
pub_time
=
tbody_text
.
split
(
'发文日期:'
)[
1
]
.
split
(
'名 称:'
)[
0
]
.
strip
()
.
lstrip
()
.
replace
(
'年'
,
pub_time
=
tbody_text
.
split
(
'发文日期:'
)[
1
]
.
split
(
'名 称:'
)[
0
]
.
strip
()
.
lstrip
()
.
replace
(
'年'
,
'-'
)
.
replace
(
'-'
)
.
replace
(
'月'
,
'-'
)
.
replace
(
'日'
,
''
)
'月'
,
'-'
)
.
replace
(
'日'
,
''
)
writtenDate
=
''
writtenDate
=
None
topicClassification
=
tbody_text
.
split
(
'分 类:'
)[
1
]
.
split
(
'发文机关:'
)[
0
]
.
strip
()
.
lstrip
()
topicClassification
=
tbody_text
.
split
(
'分 类:'
)[
1
]
.
split
(
'发文机关:'
)[
0
]
.
strip
()
.
lstrip
()
contentWithTag
=
source
.
find
(
'div'
,
attrs
=
{
'class'
:
'zx-xxxqy-nr'
})
contentWithTag
=
source
.
find
(
'div'
,
attrs
=
{
'class'
:
'zx-xxxqy-nr'
})
content
=
contentWithTag
.
text
content
=
contentWithTag
.
text
...
@@ -1963,7 +1967,7 @@ def hai_nan():
...
@@ -1963,7 +1967,7 @@ def hai_nan():
0
]
.
strip
()
.
lstrip
()
0
]
.
strip
()
.
lstrip
()
pub_source
=
''
pub_source
=
''
pub_hao
=
''
pub_hao
=
''
writtenDate
=
''
writtenDate
=
None
topicClassification
=
''
topicClassification
=
''
contentWithTag
=
source
.
find
(
'div'
,
attrs
=
{
'class'
:
'TRS_UEDITOR'
})
contentWithTag
=
source
.
find
(
'div'
,
attrs
=
{
'class'
:
'TRS_UEDITOR'
})
content
=
contentWithTag
.
text
content
=
contentWithTag
.
text
...
@@ -2018,7 +2022,10 @@ def hai_nan():
...
@@ -2018,7 +2022,10 @@ def hai_nan():
title
=
str
(
doc_item
)
.
split
(
'target="_blank">'
)[
1
]
.
split
(
'</a>'
)[
0
]
title
=
str
(
doc_item
)
.
split
(
'target="_blank">'
)[
1
]
.
split
(
'</a>'
)[
0
]
href
=
'https://www.hainan.gov.cn'
+
str
(
doc_item
)
.
split
(
'href="'
)[
1
]
.
split
(
'" target'
)[
0
]
href
=
'https://www.hainan.gov.cn'
+
str
(
doc_item
)
.
split
(
'href="'
)[
1
]
.
split
(
'" target'
)[
0
]
# print(title,href)
# print(title,href)
is_href
=
db_storage
.
find_one
({
'网址'
:
href
})
try
:
is_href
=
db_storage
.
find_one
({
'网址'
:
href
.
split
(
'?'
)[
0
]})
except
:
is_href
=
db_storage
.
find_one
({
'网址'
:
href
})
if
is_href
:
if
is_href
:
num
+=
1
num
+=
1
continue
continue
...
...
cpws/BaseCore.py
0 → 100644
浏览文件 @
c1cac75e
差异被折叠。
点击展开。
cpws/裁判文书网列表正文.py
浏览文件 @
c1cac75e
差异被折叠。
点击展开。
estool/Esmethod.py
0 → 100644
浏览文件 @
c1cac75e
import
json
"""
Elasticsearch 安装
pip install elasticsearch==7.8.1 版本的
使用时参考文章
https://blog.csdn.net/yangbisheng1121/article/details/128528112
https://blog.csdn.net/qiuweifan/article/details/128610083
"""
from
elasticsearch
import
Elasticsearch
class EsMethod(object):
    """Thin helper around the Elasticsearch 7.x client for the
    ``researchreportdata`` index: assorted query helpers plus
    add / update / delete operations.

    NOTE(review): the endpoint and credentials are hard-coded in
    ``__init__``; they should come from configuration in production.
    """

    def __init__(self):
        # Create the Elasticsearch client with basic auth; generous timeout
        # because some queries below pull large pages (size=6000).
        self.es = Elasticsearch(['http://114.116.19.92:9700'],
                                http_auth=('elastic', 'zzsn9988'),
                                timeout=300)
        # Default index used by the __main__ driver.
        self.index_name = 'researchreportdata'

    def match(self, index_name, pnum):
        """Analyzed (fuzzy) search, one 20-hit page starting at offset ``pnum``.

        Bug fix: the original put three fields inside a single ``match``
        clause, which the server rejects ("[match] query doesn't support
        multiple fields"); each field now gets its own ``match`` clause
        combined with ``bool``/``must``.
        """
        body = {
            'query': {
                'bool': {
                    'must': [
                        {'match': {'title': '.pdf'}},
                        {'match': {'origin': '雪球网'}},
                        {'match': {'type': '1'}},
                    ]
                }
            },
            'from': pnum,
            'size': 20,
        }
        # Restrict the response to the fields we actually read.
        filter_path = ['hits.hits._source.title',  # field 1
                       'hits.hits._source.id']     # field 2
        result = self.es.search(index=index_name, doc_type='_doc',
                                filter_path=filter_path, body=body)
        print(result)
        return result

    def match_phrase(self, index_name):
        """Phrase search (not re-tokenized): only documents containing the
        exact phrase match."""
        body = {
            'query': {
                'match_phrase': {
                    'm_ext1': 'XXXXXX'  # keyword placeholder
                }
            }
        }
        filter_path = ['hits.hits._source.title',          # field 1
                       'hits.hits._source.id',
                       'hits.hits._source.sourceAddress',
                       'hits.hits._source.publishDate']    # field 2
        result = self.es.search(index=index_name, doc_type='_doc',
                                filter_path=filter_path, body=body)
        print(result)
        return result

    def term(self, index_name):
        """Exact (non-analyzed) single-value lookup on ``m_slhm``."""
        body = {
            'query': {
                'term': {
                    'm_slhm': 'XXXXXX'
                }
            }
        }
        filter_path = ['hits.hits._source.m_ext1',  # field 1
                       'hits.hits._source.m_ext2']  # field 2
        result = self.es.search(index=index_name, doc_type='_doc',
                                filter_path=filter_path, body=body)
        print(result)
        return result

    def terms(self, index_name):
        """Exact lookup matching any of several values (terms query)."""
        body = {
            'query': {
                'terms': {
                    # NOTE(review): the first placeholder keeps its original
                    # trailing space — confirm whether that is intentional.
                    'm_slhm': ['13XXXXXX ', '13XXXXXX']
                }
            }
        }
        filter_path = ['hits.hits._source.m_ext1',  # field 1
                       'hits.hits._source.m_slhm']  # field 2
        result = self.es.search(index=index_name, doc_type='_doc',
                                filter_path=filter_path, body=body)
        print(result)
        return result

    def multi_must(self, index_name):
        """AND of several conditions via ``bool``/``must``."""
        body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'m_slhm': '13XXXXXXX'}},
                        {'terms': {'m_slhm': ['13XXXXXX']}},
                    ]
                }
            }
        }
        filter_path = ['hits.hits._source.m_ext1',  # field 1
                       'hits.hits._source.m_slhm']  # field 2
        result = self.es.search(index=index_name, doc_type='_doc',
                                filter_path=filter_path, body=body)
        print(result)
        return result

    def update(self, index_name):
        """Partial-update one document.

        Bug fix: the Update API requires new field values to be wrapped in a
        ``doc`` object (as ``updateaunn`` below already does); the original
        sent the bare field dict, which the server rejects.
        """
        result = self.es.update(index=index_name,
                                id='20220901-XXXXX',
                                body={'doc': {'serialno': 'XXXXXX'}})
        print('更新结果:%s' % result)
        return result

    def add(self, index_name):
        """Index (create/overwrite) one document with an explicit id."""
        result = self.es.index(index=index_name,
                               id='20220901-XXXXXX',
                               body={'serialno': 'XXXXXX'})
        print('新增结果:%s' % result)
        return result

    def delete(self, index_name):
        """Delete one document by id.

        NOTE(review): ``doc_type`` is deprecated in ES 7.x clients; kept for
        compatibility with the pinned elasticsearch==7.8.1.
        """
        result = self.es.delete(index=index_name,
                                doc_type="_doc",
                                id='20220901-XXXXXX')
        print('删除结果%s' % result)
        return result

    def multi_should(self, index_name, pnum):
        """OR/AND combination: must match ``title`` ~ '.pdf'; ``should``
        clauses boost 雪球网 / type=1 hits.

        NOTE(review): with a ``must`` present, ``should`` clauses only affect
        scoring (minimum_should_match defaults to 0) — confirm this matches
        the intended filter semantics.
        """
        body = {
            'query': {
                'bool': {
                    'should': [
                        {'term': {'origin': '雪球网'}},
                        {'term': {'type': 1}},
                    ],
                    'must': [
                        {'match': {'title': '.pdf'}}
                    ]
                }
            },
            'from': pnum,
            'size': 6000,
        }
        filter_path = ['hits.hits._source.title',  # field 1
                       'hits.hits._source.id',
                       'hits.total.value',
                       ]                            # field 2
        result = self.es.search(index=index_name, doc_type='_doc',
                                filter_path=filter_path, body=body)
        print(result)
        return result

    def updateaunn(self, index_name, id, utitle):
        """Update one document's ``title`` field to ``utitle``."""
        body = {
            'doc': {
                'title': utitle
            }
        }
        result = self.es.update(index=index_name, id=id, body=body)
        print('更新结果:%s' % result)
        return result

    def getFileds(self, index_name):
        """Print (and return) the top-level mapped field names of the index.

        NOTE(review): method name keeps its original spelling ("Fileds") to
        avoid breaking callers.
        """
        mapping = self.es.indices.get_mapping(index=index_name)
        fields = mapping[index_name]['mappings']['properties'].keys()
        print(fields)
        return fields
def main():
    """Strip the '.pdf' suffix from report titles page by page."""
    esMethod = EsMethod()
    # esMethod.getFileds(index_name=esMethod.index_name)
    num = 1  # number of pages to process
    for pnum in range(num):
        p = 20 * pnum  # offset of this page
        print(f'第{pnum}页数据')
        result = esMethod.multi_should(index_name=esMethod.index_name, pnum=p)
        msglist = result['hits']['hits']
        print(msglist)
        for mms in msglist:
            source = mms['_source']
            id = source['id']
            title = source['title']
            # New title with the '.pdf' suffix removed.
            utitle = title.replace('.pdf', '')
            print(f'id:{id}---title:{title}--utitle:{utitle}')
            esMethod.updateaunn(esMethod.index_name, str(id), utitle)
    print('跟新成功!!')


if __name__ == '__main__':
    main()
estool/esconn.py
0 → 100644
浏览文件 @
c1cac75e
import
json
from
elasticsearch
import
Elasticsearch
# Build the Elasticsearch client with basic-auth credentials.
es = Elasticsearch(
    ['http://114.116.19.92:9700'],
    http_auth=('elastic', 'zzsn9988')  # account and password
)

index_name = 'basedata'

# Analyzed match query: documents whose sourceAddress contains "www".
search_query = {
    'query': {
        'match': {
            'sourceAddress': 'www'
        }
    }
}

res = es.search(index=index_name, body=search_query)

# Dump the source document of every hit.
for hit in res['hits']['hits']:
    print(hit['_source'])
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论