Commit e20df706 by bruxellse_li

2023 intelligent-writing search code commit

Parent b9ddb61f
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="root@114.115.151.101:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.49.86:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.54.108:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (2)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (3)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (4)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (5)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (6)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="21">
<item index="0" class="java.lang.String" itemvalue="protobuf" />
<item index="1" class="java.lang.String" itemvalue="Levenshtein" />
<item index="2" class="java.lang.String" itemvalue="xlrd" />
<item index="3" class="java.lang.String" itemvalue="python_Levenshtein" />
<item index="4" class="java.lang.String" itemvalue="pdfminer.six" />
<item index="5" class="java.lang.String" itemvalue="Camelot" />
<item index="6" class="java.lang.String" itemvalue="camelot-py" />
<item index="7" class="java.lang.String" itemvalue="tqdm" />
<item index="8" class="java.lang.String" itemvalue="jieba" />
<item index="9" class="java.lang.String" itemvalue="flask" />
<item index="10" class="java.lang.String" itemvalue="bert_serving" />
<item index="11" class="java.lang.String" itemvalue="setuptools" />
<item index="12" class="java.lang.String" itemvalue="pandas" />
<item index="13" class="java.lang.String" itemvalue="certifi" />
<item index="14" class="java.lang.String" itemvalue="typing_extensions" />
<item index="15" class="java.lang.String" itemvalue="charset-normalizer" />
<item index="16" class="java.lang.String" itemvalue="numpy" />
<item index="17" class="java.lang.String" itemvalue="pytz" />
<item index="18" class="java.lang.String" itemvalue="urllib3" />
<item index="19" class="java.lang.String" itemvalue="idna" />
<item index="20" class="java.lang.String" itemvalue="scikit_learn" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/AI-Report.iml" filepath="$PROJECT_DIR$/.idea/AI-Report.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : analysis_model.py
# @Time : 2022/1/12 10:32
# @Author : bruxelles_li
# @Software: PyCharm
"""
提取内容中
时间: [0-9]{1,4}年[0-9]{1,2}月[0-9]{1,2}日
标题: 第一个换行符的内容
内容: 最后一个换行符前的内容
作者: 最后三行包含的人名
来源: 最后三行包含的机构名
"""
import re
from datetime import datetime
from lac_model import lac_organize_name, lac_username

# Pattern for dates written as <year>年<month>月<day>日
time_pattern = re.compile(r"[0-9]{1,4}年[0-9]{1,2}月[0-9]{1,2}日")


# Main analysis function
def analysis_function(content):
    # Split the content on line breaks
    para_list = content.split("\n")
    title = para_list[0]
    match_content = "\n".join(para_list[1:]).strip("\n")
    temp_content = "\n".join(para_list[-3:]).strip("\n")
    # Extract the publishing origin and the author from the last three lines
    name_list = lac_username(temp_content)
    organize_list = lac_organize_name(temp_content)
    author = name_list[0] if name_list else ""
    origin = organize_list[0] if organize_list else ""
    # Extract the publication time
    time_list = time_pattern.findall(match_content)
    if time_list:
        temp_articleTime = time_list[0]
        date_list = re.split("[年月日]", temp_articleTime)[:-1]
        date_list_1 = ["0" + d if len(d) == 1 else d for d in date_list]
        date_format = "-".join(date_list_1)
        articleTime = date_format + " 00:00:00"
    else:
        articleTime = ""
    return title, author, origin, articleTime, match_content


if __name__ == "__main__":
    text = """建筑垃圾如何“变废为宝”
   建筑垃圾是众多垃圾中较难处理的一类,存在体积大、回收处理2022年12月1日号难等问题。这些建筑物、构筑物、管网以及房屋装饰产生的弃土、弃料等,如果不经过妥善的处理,不仅是对资源的浪费,还会污染水、土壤和空气。
        传统的建筑垃圾处理方法有哪些不足?
        传统的建筑垃圾处理方法主要有堆放、填埋、回填等,但这些处理方式会对环境造成极大的影响,如土壤污染、地下水污染、大气污染等。建筑垃圾堆放和填埋需要耗用大量的土地,释放的有毒有害物质会改变土壤的物理结构和化学性质,造成土壤污染;被污染的土壤由雨水冲刷会形成渗滤液进入水体中,容易引起地下水和地表水污染;露天堆放的建筑垃圾更是容易引起扬尘,渣土车运输过程中排放的大量尾气和道路遗撒引起的扬尘又加重了大气污染。
        正确处理建筑垃圾是什么样的?
        近年来,建筑垃圾再循环利用成为一个新尝试,经过加工处理让它们实现二次利用。如金属废料、废钢筋、废铁丝、废电线等经过分拣、回炉热加工可制成钢材等金属制品;废混凝土经过粉碎、筛选等过程可制成再生骨料用于公路路基、墙体的填充材料,也可生产多孔砖、空心板、空心砌块、水泥原料等产品;废木材可用于制造合成板材;沥青经过重新回收可再生成沥青。
        混凝土是目前建筑垃圾中回收价值较高的部分,废弃混凝土生成的再生骨料由于强度高、生产成本低,颇受市场青睐,再生骨料按照一定级配搅拌和碾压后具有较高地基承载力,可直接应用于软弱地基、竖井回填、路基垫层、水处理、场地抑尘等工程;还可以部分或者全部替代天然骨料,生产再生无机混合料、再生砖、再生混凝土等产品。
        要想实现建筑垃圾的环保化,也要做到源头上减少建筑垃圾产生量,大力开发和推广节能降耗的建筑新技术和新工艺,采用尽量少产生建筑垃圾的结构设计;加大对建筑垃圾综合利用的投入,限制天然骨料、石料的使用量,出台相应的优惠政策,鼓励使用再生材料、替代材料及易回收材料等,从源头上最大限度减少建筑垃圾的产生。同时,有关部门可以大力推广再生产品应用,促进循环利用,号召市政、园林、交通、水务等工程率先选用建筑废弃物再生产品,鼓励社会投资工程使用建筑废弃物再生产品。
        通过固废资源的循环利用,可以大幅提升建筑垃圾的利用率,有效缓解建筑垃圾的运输和空间存储问题,具备较高的灵活性,低碳节能,贴近近乎零污染、零排放的理想环保要求,为国家实现“双碳”目标作出贡献。
中国科协科普部
新华网
联合出品"""
    print(analysis_function(text))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : app_config
# @Author : LiuYan&bruxelles_li
# @Time : 2021/4/22 10:51
import os
import multiprocessing
from pathlib import Path
bind = '0.0.0.0:4002'  # bind IP and port
backlog = 512  # listen queue size
# chdir = '/home/zzsn/liuyan/bin'  # working directory gunicorn switches to
timeout = 600  # timeout -> kept large for the ZZSN_NLP "Belt and Road" file-extraction jobs
# worker_class = 'gevent'  # gevent mode; sync mode is also possible and is the default
# workers = multiprocessing.cpu_count()  # number of worker processes (12)
workers = 1  # low-resource 13G server: set this to 1 when the load is too high
threads = 50  # number of threads per worker process
loglevel = 'error'  # log level; this only applies to the error log, the access-log level cannot be set
access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"'  # gunicorn access-log format; the error log cannot be formatted here
"""
The meaning of each format field is as follows:
h remote address
l '-'
u currently '-', may be user name in future releases
t date of the request
r status line (e.g. ``GET / HTTP/1.1``)
s status
b response length or '-'
f referer
a user agent
T request time in seconds
D request time in microseconds
L request time in decimal seconds
p process ID
"""
_tmp_path = os.path.dirname(os.path.abspath(__file__))
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
accesslog = os.path.join(_tmp_path, 'gunicorn_access.log')  # access log file
errorlog = os.path.join(_tmp_path, 'gunicorn_error.log')    # error log file
# gunicorn -c app_config.py app_run:app -D --daemon
# -*- coding: utf-8 -*-
# @Time : 2022/9/22 11:08
# @Author : ctt
# @File : data_building
# @Project : 研究中心知识图谱
import mysql.connector
import pandas as pd
import logging
from snow_id import Snow
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
class Data:
    def __init__(self):
        pass

    def initialize(self):
        self.database = self.connect()
        self.entity_df = self.get_data_base_entity()
        print('==========数据获取完毕=============')
        self.database.close()

    def connect(self):
        """
        Connect to the MySQL database
        :return:
        """
        database = mysql.connector.connect(
            host='114.115.159.144',
            user='root',
            passwd='zzsn9988',
            database='clb_project',
            auth_plugin='mysql_native_password'
        )
        return database

    def get_article_data(self, database_conntect):
        # country_sql = '''SELECT id, title, content, origin, publish_date FROM core_base_data WHERE status != 1 and status != 2 and publish_date >= '2022-03-01';'''
        # country_sql = '''select id, title, content, origin, publish_date from core_base_data WHERE id not in (SELECT DISTINCT bid FROM core_base_data_entity)'''
        # country_sql = '''select id, article_title, content, origin, article_time from ai_report_material WHERE id ='1670667630665899' '''
        country_sql = '''select id, article_title, content, origin, article_time from ai_report_material WHERE type='par' '''
        cursor = database_conntect.cursor()
        cursor.execute(country_sql)
        data_table = cursor.fetchall()
        # columns = [_[0] for _ in cursor.description]
        article_df = pd.DataFrame(data_table, columns=['id', 'article_title', 'content', 'origin', 'article_time'], dtype=str)
        cursor.close()
        return article_df

    def get_data_base_entity(self):
        country_sql = '''SELECT id, compound_word,label_uuid FROM graph_entity;'''
        cursor = self.database.cursor()
        cursor.execute(country_sql)
        entity_base_table = cursor.fetchall()
        # columns = [_[0] for _ in cursor.description]
        entity_df = pd.DataFrame(entity_base_table, columns=['id', 'compound_word', 'label_uuid'], dtype=str)
        cursor.close()
        return entity_df

    def insert_entity(self, bid, eids):
        database_conntect = self.connect()
        insert_data = []
        for eid in eids:
            id = Snow.get_guid()
            logger.info(id)
            insert_data.append((id, bid, eid))
        sql = 'insert ignore into core_base_data_entity (id, bid, eid, status) values (%s, %s, %s, 0)'
        cursor = database_conntect.cursor()
        cursor.executemany(sql, insert_data)
        database_conntect.commit()
        cursor.close()

    def insert_relation(self, relation_id, source_id, target_id, bid):
        database_conntect = self.connect()
        id = Snow.get_guid()
        print((id, relation_id, source_id, target_id, bid, 0))
        sql = 'insert into graph_entity_entity (id, relation_id, source_id, target_id, bid, status) ' \
              'values (%s, %s, %s, %s, %s, 0)' % (id, relation_id, source_id, target_id, bid)
        logger.info(sql)
        cursor = database_conntect.cursor()
        cursor.execute(sql)
        database_conntect.commit()
        cursor.close()

data = Data()
database_conntect = data.connect()
article_df = data.get_article_data(database_conntect)
print(article_df)
print(len(article_df))
database_conntect.close()
# article_df.to_excel(r'数据2022-03.xlsx', index=False)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : encode_sentence.py
# @Time : 2022/12/12 19:23
# @Author : bruxelles_li
# @Software: PyCharm
"""
pip install bert-serving-server && pip install bert-serving-client
"""
import pandas as pd
import numpy as np
from bert_serving.client import BertClient
from tqdm import tqdm
from numpy import *
bc = BertClient('114.116.49.86', check_length=False)
# df = pd.read_excel('素材库/句子库/入库_sent.xlsx', keep_default_na=False).astype(str)
# # df = pd.read_excel("素材库/段落库/去重后_para.xlsx", keep_default_na=False).astype(str)
# length = len(df)
# print(length)
vector_path = "素材库/句子库.txt"
np_path = "素材库/句子库.npy"
def encode_sentences(df, path):
    f = df
    """
    for line in f:
        result.append(line.strip('\n'))
    """
    with open(path, 'w', encoding='utf-8') as f_vectors:
        for idx, row in tqdm(f.iterrows()):
            sentence = row['content']
            vector = bc.encode([sentence])
            # one line per record: "<id> <768 vector components>"
            f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
    return None


def save_file(length, np_path):
    # one row per record: id followed by the 768-dimensional vector -> 769 columns
    A = zeros((int(length), 769), dtype=float)
    f = open(vector_path)
    lines = f.readlines()
    A_row = 0
    for line in lines:
        values = line.strip('\n').split(' ')
        A[A_row, :] = values[:]
        A_row += 1
    print(A.shape)
    np.save(np_path, A)
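

# Illustrative sketch (not part of the original file): one way the matrix written by
# save_file could be queried, assuming the row layout above (column 0 = id,
# columns 1..768 = BERT sentence vector) and reusing the BertClient instance `bc`.
# The function name top_k_similar is hypothetical.
def top_k_similar(query, saved_np_path, k=10):
    matrix = np.load(saved_np_path)
    ids, vectors = matrix[:, 0], matrix[:, 1:]
    query_vec = bc.encode([query])[0]
    # cosine similarity between the query vector and every stored vector
    scores = vectors @ query_vec / (
        np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec) + 1e-8)
    order = np.argsort(-scores)[:k]
    return [(int(ids[i]), float(scores[i])) for i in order]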
if __name__ == "__main__":
# text = "nihao"
# print(bc.encode([text]))
# encode_sentences(df, vector_path)
length = 333350
save_file(length, np_path)
# -*- coding: utf-8 -*-
# Smart-collection request
# 1. Considered: stop using the entity class when requesting smart collection
#    a. Kept as-is: pass the HTML source directly in the raw HTTP request body, with lang-code and link-text as query parameters
#    b. Reason: this is inconvenient to test in Postman, where a pasted HTML source file cannot be used
# 2. Decided against dropping it: keeping the entity class does more good than harm
#    a. The entity class makes it easy to extend the parameter fields
#    b. It also makes the API docs easy to render: calling json_parameter_utility.get_json_parameters can display the request entity class
class ExtractionRequest:
    # Language code
    # 1. Needed when collecting non-Chinese articles
    lang_code = ""
    # Link text
    # 1. Used to extract the title; accuracy drops if it is not provided
    link_text = ""
    # Article page source
    # 1. Used to extract the title, publish time, content, etc.
    article_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        extraction_request = ExtractionRequest()
        # Alternative approach:
        # 1. update the internal __dict__ with the dictionary
        # extraction_request.__dict__.update(dictionary)
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(extraction_request, key, dictionary[key])
        return extraction_request

    def to_dict(self):
        # Convert to a dict:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. Copy the internal __dict__ into a new dict
        data.update(self.__dict__)
        return data


# Extraction result
class ExtractionResult:
    # Title
    title = ""
    # Publish date
    publish_date = ""
    # Body text (all HTML tags kept, e.g. br, img)
    text = ""
    # URL
    url = ""
    # Summary
    meta_description = ""
    # Clean body text (without HTML)
    cleaned_text = ""
    # Origin (currently only the "来源" field on Chinese sites is supported)
    # source = ""
    # Top image (top_image: never yields anything, no longer used)
    # top_image = ""

    def to_dict(self):
        # Convert to a dict:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. Copy the internal __dict__ into a new dict
        data.update(self.__dict__)
        return data


class UrlPickingRequest:
    # Response URL of the list page
    # 1. Used as the Base URL when joining extracted relative URLs
    # 2. Base URL: must be the response URL
    # 3. Example: in Python, after requests.get(url), use resp.url as the Base URL
    list_page_resp_url = ""
    # List page source
    # 1. Used to extract article URLs
    list_page_html = ""

    @staticmethod
    def from_dict(dictionary: dict):
        url_picking_request = UrlPickingRequest()
        # Set each dictionary value on the current object
        for key in dictionary:
            setattr(url_picking_request, key, dictionary[key])
        return url_picking_request

    def to_dict(self):
        # Convert to a dict:
        # 1. This method is needed when serializing to JSON
        # 2. Convert to a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
        data = {}
        # Use the internal __dict__ object
        # 1. Copy the internal __dict__ into a new dict
        data.update(self.__dict__)
        return data
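

# Illustrative usage sketch (not part of the original module): building an
# ExtractionRequest from a dict and serializing a result through to_dict, as the
# comments above describe. The sample values are made up.
if __name__ == "__main__":
    import json

    req = ExtractionRequest.from_dict({
        "lang_code": "cn",
        "link_text": "示例标题",
        "article_html": "<html>...</html>",
    })
    result = ExtractionResult()
    result.title = req.link_text
    print(json.dumps(result.to_dict(), ensure_ascii=False))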
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : es_byid.py
# @Time : 2022/12/27 13:41
# @Author : bruxelles_li
# @Software: PyCharm
import requests
import json
import pandas as pd
# Query the ES index for the target sentence records by their unique ids
def find_sent_info(_id: list):
    """
    :param _id: e.g. ["1670844082074304"]
    :return:
    """
    size = len(_id)
    url = "http://114.115.215.250:9700/ai_report_material/_search"
    # Pass the list of ids as a terms filter
    payload = json.dumps({
        "query": {
            "bool": {
                "must": [
                    {
                        "terms": {
                            "id": _id
                        }
                    }
                ]
            }
        },
        "track_total_hits": True,
        "size": size
    })
    headers = {
        'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
        'Content-Type': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    result_data = response.text.encode("utf-8")
    obj_result = json.loads(result_data)
    # Extract the target data; the result is a list of hits
    hits_obj = obj_result["hits"]["hits"]
    _source_list = [hits_obj[i]["_source"] for i in range(0, size)]
    df = pd.DataFrame(_source_list)
    df = df[["id", "content", "paragraphId", "sentParaIndex", "sentArticleIndex", "topicType", "contentTypeName",
             "articleId", "sentenceId"]]
    return df
# if hits_obj:
# # todo:当前list长度为1,取第一个元素中即为目标数据存在范围
# temp_result = hits_obj[0]["_source"]
# # todo: 此时带查询数据内容为dict对象
# # print(temp_result)
#
# else:
# temp_result = ""
# return temp_result
# return hits_obj
# return _source_list
# todo: 根据唯一标识id调用es查询接口目标信息
def find_para_info(_id: list):
size = len(_id)
"""
:param _id: "1670844082074304"
:return:
"""
url = "http://114.115.215.250:9700/ai_report_material/_search"
# todo: 传入list_id
payload = json.dumps({
"query": {
"bool": {
"must": [
{
"terms": {
"id": _id
}
}
]
}
},
"track_total_hits": True,
"size": size
})
headers = {
'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
result_data = response.text.encode("utf-8")
obj_result = json.loads(result_data)
# todo: 解析出目标数据, 数据对象为list
hits_obj = obj_result["hits"]["hits"]
_source_list = [hits_obj[i]["_source"] for i in range(0, size)]
df = pd.DataFrame(_source_list)
df = df[["id", "content", "paragraphId", "paraArticleIndex", "topicType", "contentTypeName",
"articleId"]]
return df
# if hits_obj:
# # todo:当前list长度为1,取第一个元素中即为目标数据存在范围
# temp_result = hits_obj[0]["_source"]
# # todo: 此时带查询数据内容为dict对象
# # print(temp_result)
#
# else:
# temp_result = ""
# return temp_result
# return hits_obj
# return _source_list
# todo: 根据文章id和句子id调用es查询接口目标句子前后内容
def find_sen_content(sent_article_id: str, sentence_id: str, sent_content):
"""
:param article_id:
:param sentence_id:
:return:
"""
url = "http://114.115.215.250:9700/ai_report_material/_search"
payload = json.dumps({
"query": {
"bool": {
"must": [
{
"terms": {
"articleId": [sent_article_id, sent_article_id] # ["1670829370466076"]
}
},
{
"terms": {
# "sentence_id": sentence_id # ["1670843538527672"]
"sentenceId": [str(int(sentence_id) - 1), str(int(sentence_id) + 1)]
}
}
]
}
},
"track_total_hits": True
# "size": 1
})
headers = {
'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
result_data = response.text.encode("utf-8")
obj_result = json.loads(result_data)
# todo: 解析出目标数据, 数据对象为list
hits_obj = obj_result["hits"]["hits"]
if len(hits_obj) >= 2:
# todo:当前list长度为1,取第一个元素中即为目标数据存在范围
pre_temp_result = hits_obj[0]["_source"]
pre_temp_content = pre_temp_result["content"]
suf_temp_result = hits_obj[1]["_source"]
suf_temp_content = suf_temp_result["content"]
elif 1 <= len(hits_obj) < 2:
if hits_obj[0]["_source"]["sentenceId"] == str(int(sentence_id) - 1):
pre_temp_content = hits_obj[0]["_source"]["content"]
suf_temp_content = ""
else:
pre_temp_content = ""
suf_temp_content = hits_obj[0]["_source"]["content"]
else:
pre_temp_content = ""
suf_temp_content = ""
content = pre_temp_content + "<font style='color:red;'>" + sent_content + "</font>" + suf_temp_content
return content
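# Illustrative note (not part of the original file): find_sen_content returns the
# neighbouring sentences with the matched sentence wrapped in a red <font> tag,
# roughly (with made-up text):
#   find_sen_content("1670829371705726", "2", "目标句子")
#   -> "上一句内容<font style='color:red;'>目标句子</font>下一句内容"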
# todo: 根据文章id查询文章信息
def find_art_info(article_id: str):
# size = len(article_id)
"""
:param article_id:
:return:
"""
url = "http://114.115.215.250:9700/ai_report_material/_search"
payload = json.dumps({
"query": {
"bool": {
"must": [
{
"term": {
"articleId": article_id # "1670829371705726"
}
},
{
"term": {
"type": "art"
}
}
]
}
},
"track_total_hits": True,
"size": 1
})
headers = {
'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
# print(response.text)
result_data = response.text.encode("utf-8")
obj_result = json.loads(result_data)
# todo: 解析出目标数据, 数据对象为list
hits_obj = obj_result["hits"]["hits"]
# print(len(hits_obj), len(article_id))
temp_result = hits_obj[0]["_source"]
# _source_list = [hits_obj[i]["_source"] for i in range(len(article_id))]
# df = pd.DataFrame(_source_list)
# df = df[["articleId", "articleTitle", "origin", "articleTime", "author", "content"]]
# df.rename(columns={"content": "article_content"}, inplace=True)
# return df
return temp_result
# _source_list = [hits_obj[i]["_source"] for i in range(size)]
# df = pd.DataFrame(_source_list)
# df = df[["articleId", "articleTitle", "origin", "articleTime", "author", "content"]]
# df.rename(columns={"content": "article_content"}, inplace=True)
# return df
# if hits_obj:
# # todo:当前list长度为1,取第一个元素中即为目标数据存在范围
# temp_result = hits_obj[0]["_source"]
# # temp_content = temp_result["content"]
# # todo: 此时带查询数据内容为dict对象
# # print(temp_result)
#
# else:
# temp_result = ""
#
# return temp_result
# return hits_obj
if __name__ == "__main__":
#
_id = ["1670844082008296", "1670844082007284"]
find_sent_info(_id)
# print(find_content("1670829371705726", "1670844082074304"))
# articleId = ["1670829370466076", "1670829371705726"]
# print(find_art_info(articleId))
# sent_article_id, sentence_id = "1670829371705726", "1"
# pre_sent, suf_sent = find_sen_content([sent_article_id, sent_article_id], sentence_id)
# print(pre_sent)
# print(suf_sent)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : gj_app.py
# @Time : 2022/12/7 19:49
# @Author : bruxelles_li
# @Software: PyCharm
import os, json
import logging
from flask import Flask, request, jsonify
import sys
sys.path.append('../')
from 文章内容检查 import clean_html_tag
from 素材库构建程序 import *
from 文章id生成 import create_title_id
import requests, time
import queue
import pandas as pd
import traceback
from search_by_dot_matrix import get_sent_result, get_para_result
from pytime import pytime
from datetime import datetime
from pathlib import Path
from smart_extractor import extract_by_url_test
# Cache directory for search results
cache_path = "测试文件"
Path(cache_path).mkdir(parents=True, exist_ok=True)
# Close idle connections instead of keeping them alive
s = requests.session()
s.keep_alive = False
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
HOST = '0.0.0.0'
PORT = 4000
DEBUG = False
app = Flask(__name__)
# queue.Queue is a basic FIFO (first in, first out) queue
# maxsize caps the number of items; <= 0 means unlimited, otherwise put() blocks once the queue is full until items are consumed
q = queue.Queue(maxsize=0)
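
# Illustrative sketch (not part of the original file) of the intended queue usage, as
# described above and in the commented-out search handlers further down: a request
# handler enqueues a background-search config, and /subject_consumer pops it for the
# worker in main_search.py. The keys mirror the commented-out code; values are made up.
# config_info = {
#     "type": "par",
#     "text": "建筑垃圾如何变废为宝",
#     "contentTypeFlags": ["policy_document"],
#     "topicTypeNames": [],
#     "pStartTime": "2021-00-00",
#     "pEndTime": "2023-00-00",
#     "pageSize": 10,
# }
# q.put(config_info)   # producer side
# q.get()              # consumer side, see /subject_consumer below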
# CORS support
from flask_cors import CORS
CORS(app, supports_credentials=True)
# # todo: 定义段落处理
# def para_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime, pEndTime: datetime, pageSize: int):
# pageNo = 10
# para_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# dict_para = {
# "text": text,
# "para_list": para_list
# }
# para_result = json.dumps(dict_para)
#
# with open(os.path.join(cache_path, "para.json"), 'w', encoding='utf-8') as file:
# file.write(para_result)
# time.sleep(60) # 设置一个 60 秒过期的缓存文件清除时间
# os.remove(os.path.join(cache_path, "para.json"))
# return None
#
#
# # todo: 定义句子处理
# def sent_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime, pEndTime: datetime, pageSize: int):
# pageNo = 10
# sent_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# dict_sent = {
# "text": text,
# "sent_list": sent_list
# }
# # todo: 将内容转换为JSON字符串用来存储
# sent_result = json.dumps(dict_sent)
#
# with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
# file.write(sent_result)
# time.sleep(60)
# os.remove(os.path.join(cache_path, "sent.json"))
# return None
@app.route("/", methods=["GET"])
def hello_world():
app.logger.info('Hello World!')
return "Hello World"
@app.route('/subject_consumer', methods=['GET', 'POST'])
def subject_consumer():
if not q.empty():
config_info = q.get()
return jsonify(message='当前队列数量:' + str(q.qsize()),
queue_left_number=str(q.qsize()),
data=config_info)
else:
return jsonify(message='队列为空!', queue_left_number=0)
@app.route('/queue_size', methods=['GET', 'POST'])
def queue_size():
return jsonify(queue_left_number=q.qsize())
type2iddict = {
    "speech_by_leaders": "1602095566267805697",
    "policy_document": "1602095618788880386",
    "expert_opinion": "1602095680285765633",
    "enterprise_case": "1602095727870144513",
    "other": "1602095773126684673"
}
type2namedict = {
    "speech_by_leaders": "领导讲话",
    "policy_document": "政策文件",
    "expert_opinion": "专家观点",
    "enterprise_case": "企业案例",
    "other": "其他"
}


# Main build endpoint
@app.route('/build_pro', methods=["GET", "POST"])
def get_result():
"""
-> data:
领导讲话:1602095566267805697
政策文件:1602095618788880386
专家观点:1602095680285765633
企业案例:1602095727870144513
其他:1602095773126684673
领导讲话:speech_by_leaders
政策文件:policy_document
专家观点:expert_opinion
企业案例:enterprise_case
其他:other
:return:
"""
try:
data = request.get_json()
# todo: 先判断是否提供url链接来获取来源,发布时间,正文内容
if "url" in data:
url = data["url"]
lang_code = data["lang_code"] if "lang_code" in data else "cn"
dict_parse = extract_by_url_test(url, lang_code)
title = dict_parse["title"]
content = dict_parse["content"]
publishDate = dict_parse["publishDate"]
else:
title = data['title']
ori_content = data['content']
content = clean_html_tag(ori_content)
publishDate = data['publishDate']
infoId = str(data['infoId']) if 'infoId' in data else str(create_title_id())
contentTypeFlags = data['contentTypeFlags']
topicNames = data['topicNames']
origin = data['origin']
author = data['author']
# todo: 根据typedict 获取contentType
contentNames = type2namedict[contentTypeFlags]
contentTypeIds = str(type2iddict[contentTypeFlags])
# todo: 若清洗后的文章内容长度不为空,则进行处理,否则返回日志
if len(content) >= 50:
list_para, list_sent = build_pro_new(infoId, content, contentNames, contentTypeIds, topicNames)
# todo: 利用dataframe对两个生成的列表内容进行去重
df_para = pd.DataFrame(list_para)
df_para.drop_duplicates(subset=["para_content"], keep="first", inplace=True)
dict_para = df_para.to_dict()
new_list_para = [dict(zip(dict_para, values)) for values in zip(*[dict_para[k].values() for k in dict_para])]
df_sent = pd.DataFrame(list_sent)
df_sent.drop_duplicates(subset=["sent_content"], keep="first", inplace=True)
dict_sent = df_sent.to_dict()
new_list_sent = [dict(zip(dict_sent, values)) for values in zip(*[dict_sent[k].values() for k in dict_sent])]
dict_result = {
"code": 200,
"message": "success",
"resultData": {
"article_info":
[
{
"infoId": infoId,
"content": content,
"title": title,
"contentNames": contentNames,
"contentTypeIds": contentTypeIds,
"topicNames": topicNames,
"origin": origin,
"publishDate": publishDate,
"author": author
}
],
"para_info": new_list_para,
"sent_info": new_list_sent
}
}
else:
dict_result = {
"code": 500,
"message": "failure" + "文章内容杂乱,请检查并清除杂乱格式再进行操作!",
"resultData": None
}
except Exception as e:
dict_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route('/search_content', methods=["GET", "POST"])
def get_top_content():
try:
# 定义接收参数
data = request.get_json()
text = data['queryText']
contentTypeFlags = data['contentTypeFlags']
topicTypeNames = data['topicNames']
returenType = data['returenType'] if data['returenType'] else "sen"
pageNo = int(data['pageNo'])
pageSize = int(data['pageSize'])
pStartTime = data['pStartTime'] if data['pStartTime'] else "2021-00-00"
pEndTime = data['pEndTime'] if data["pEndTime"] else "2023-00-00"
# todo: 调用搜索函数返回推荐list
if returenType == "par":
# todo: 先检查缓存是否可用,若不可用则重新查找
if os.path.isfile(os.path.join(cache_path, "para.json")):
with open(os.path.join(cache_path, "para.json"), 'r', encoding='utf-8') as f:
para_dict_result = json.load(f)
# todo: 继续判断待查询的内容是否与缓存的对象相同
if text == para_dict_result["text"]:
para_list = para_dict_result["para_list"]
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
result_list = para_list[pre_index:suf_index]
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list,
"pageNo": pageNo,
"pageSize": pageSize,
"total": len(para_list)
}
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
os.remove(os.path.join(cache_path, "para.json"))
result_list, len_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime,
pEndTime, pageSize, pageNo, returenType)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_para = {
"text": text,
"para_list": result_list
}
para_result = json.dumps(dict_para)
with open(os.path.join(cache_path, "para.json"), 'w', encoding='utf-8') as file:
file.write(para_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
result_list, len_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_para = {
"text": text,
"para_list": result_list
}
para_result = json.dumps(dict_para)
with open(os.path.join(cache_path, "para.json"), 'w', encoding='utf-8') as file:
file.write(para_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
# todo: 处理句子
else:
# todo: 先检查缓存是否可用,若不可用则重新查找
if os.path.isfile(os.path.join(cache_path, "sent.json")):
with open(os.path.join(cache_path, "sent.json"), 'r', encoding='utf-8') as f:
sent_dict_result = json.load(f)
# todo: 继续判断待查询的内容是否与缓存的对象相同
if text == sent_dict_result["text"]:
sent_list = sent_dict_result["sent_list"]
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
result_list = sent_list[pre_index:suf_index]
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list,
"pageNo": pageNo,
"pageSize": pageSize,
"total": len(sent_list)
}
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
os.remove(os.path.join(cache_path, "sent.json"))
result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime,
pEndTime, pageSize, pageNo)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_sent = {
"text": text,
"sent_list": result_list
}
# todo: 将内容转换为JSON字符串用来存储
sent_result = json.dumps(dict_sent)
with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
file.write(sent_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_sent = {
"text": text,
"sent_list": result_list
}
# todo: 将内容转换为JSON字符串用来存储
sent_result = json.dumps(dict_sent)
with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
file.write(sent_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
except Exception as e:
traceback.print_exc()
dic_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dic_result)
return json.dumps(dic_result, ensure_ascii=False)
# @app.route('/search_content', methods=["GET", "POST"])
# def get_top_content():
# try:
# # 定义接收参数
# data = request.get_json()
# text = data['queryText']
# contentTypeFlags = data['contentTypeFlags']
# topicTypeNames = data['topicNames']
# returenType = data['returenType'] if data['returenType'] else "sen"
# pageNo = int(data['pageNo'])
# pageSize = int(data['pageSize'])
# pStartTime = datetime.date(pytime.parse(data['pStartTime'])) if data['pStartTime'] else pytime.parse("2020-01-01")
# pEndTime = datetime.date(pytime.parse(data['pEndTime'])) if data["pEndTime"] else pytime.today()
# logger.info(pStartTime, pEndTime)
# # todo: 调用搜索函数返回推荐list
# if returenType == "par":
# # todo: 先检查缓存是否可用,若不可用则重新查找
# if os.path.isfile(os.path.join(cache_path, "para.json")):
# with open(os.path.join(cache_path, "para.json"), 'r', encoding='utf-8') as f:
# para_dict_result = json.load(f)
# # todo: 继续判断待查询的内容是否与缓存的对象相同
# if text == para_dict_result["text"]:
# para_list = para_dict_result["para_list"]
# pre_index = pageNo * pageSize - pageSize
# suf_index = pageNo * pageSize
# result_list = para_list[pre_index:suf_index]
# dict_result = {
# 'code': 200,
# 'message': 'success',
# 'result_data': {
# "match_info": result_list,
# "pageNo": pageNo,
# "pageSize": pageSize
# }
# }
# logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
# # todo: 否则进行即时查询处理
# else:
# pageNo = 1
# result_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime,
# pageSize, pageNo)
# # todo:先返回top10,然后将后续处理加入队列
# config_info = {
# "type": "par",
# "text": text,
# "contentTypeFlags": contentTypeFlags,
# "topicTypeNames": topicTypeNames,
# "pStartTime": pStartTime,
# "pEndTime": pEndTime,
# "pageSize": pageSize
# }
# q.put(config_info)
# dict_result = {
# 'code': 200,
# 'message': 'success',
# 'result_data': {
# "match_info": result_list,
# "pageNo": pageNo,
# "pageSize": pageSize
# }
# }
# logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
# # todo: 若无缓存,则进行即时查询处理
# else:
# pageNo = 1
# result_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# # todo:先返回top10,然后将后续处理加入队列
# config_info = {
# "type": "par",
# "text": text,
# "contentTypeFlags": contentTypeFlags,
# "topicTypeNames": topicTypeNames,
# "pStartTime": pStartTime,
# "pEndTime": pEndTime,
# "pageSize": pageSize
# }
# q.put(config_info)
# dict_result = {
# 'code': 200,
# 'message': 'success',
# 'result_data': {
# "match_info": result_list,
# "pageNo": pageNo,
# "pageSize": pageSize
# }
# }
# logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
# # todo:进入段落库查询
# else:
# # todo: 先检查缓存是否可用,若不可用则重新查找
# if os.path.isfile(os.path.join(cache_path, "sent.json")):
# with open(os.path.join(cache_path, "sent.json"), 'r', encoding='utf-8') as f:
# sent_dict_result = json.load(f)
# # todo: 继续判断待查询的内容是否与缓存的对象相同
# if text == sent_dict_result["text"]:
# sent_list = sent_dict_result["sent_list"]
# pre_index = pageNo * pageSize - pageSize
# suf_index = pageNo * pageSize
# result_list = sent_list[pre_index:suf_index]
# dict_result = {
# 'code': 200,
# 'message': 'success',
# 'result_data': {
# "match_info": result_list,
# "pageNo": pageNo,
# "pageSize": pageSize
# }
# }
# logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
# # todo: 否则进行即时查询处理
# pageNo = 1
# result_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize,
# pageNo)
# # todo:先返回top10,然后将后续处理加入队列
# config_info = {
# "type": "sent",
# "text": text,
# "contentTypeFlags": contentTypeFlags,
# "topicTypeNames": topicTypeNames,
# "pStartTime": pStartTime,
# "pEndTime": pEndTime,
# "pageSize": pageSize
# }
# q.put(config_info)
# dict_result = {
# 'code': 200,
# 'message': 'success',
# 'resultData': {
# "match_info": result_list,
# "pageNo": pageNo,
# "pageSize": pageSize
# }
# }
# logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
# else:
# pageNo = 1
# result_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# # todo:先返回top10,然后将后续处理加入队列
# config_info = {
# "type": "sent",
# "text": text,
# "contentTypeFlags": contentTypeFlags,
# "topicTypeNames": topicTypeNames,
# "pStartTime": pStartTime,
# "pEndTime": pEndTime,
# "pageSize": pageSize
# }
# q.put(config_info)
# dict_result = {
# 'code': 200,
# 'message': 'success',
# 'resultData': {
# "match_info": result_list,
# "pageNo": pageNo,
# "pageSize": pageSize
# }
# }
# logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
#
# except Exception as e:
# traceback.print_exc()
# dict_result = {
# 'code': 500,
# 'message': "failure" + str(e),
# 'resultData': None
# }
# logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
if __name__ == '__main__':
    app.run(host=HOST, port=PORT, debug=DEBUG)

if __name__ != '__main__':
    gunicorn_logger = logging.getLogger('gunicorn.error')
    app.logger.handlers = gunicorn_logger.handlers
    app.logger.setLevel(gunicorn_logger.level)
#!/user/bin/env python
# coding=utf-8
"""
@project : 500_资讯
@author : bruxelles_li
@file : lac_ner_text.py
@ide : PyCharm
@time : 2022-07-04 09:19:43
"""
from LAC import LAC
import pandas as pd
import tqdm
import re
lac = LAC(mode="lac")
# 句子提取人名
def lac_username(sentences):
# 装载LAC模型
user_name_list = []
lac = LAC(mode="lac")
lac_result = lac.run(sentences)
# print(lac_result)
for index, lac_label in enumerate(lac_result[1]):
if lac_label == "PER":
user_name_list.append(lac_result[0][index])
# print(user_name_list)
# print(user_name_list)
return user_name_list
# 句子提取机构名
def lac_organize_name(sentences):
# 装载LAC模型
user_name_list = []
lac = LAC(mode="lac")
lac_result = lac.run(sentences)
# print(lac_result)
for index, lac_label in enumerate(lac_result[1]):
if lac_label == "ORG":
user_name_list.append(lac_result[0][index])
return user_name_list
# 句子提取地名
def lac_location_name(sentences):
# 装载LAC模型
user_name_list = []
lac = LAC(mode="lac")
lac_result = lac.run(sentences)
# print(lac_result)
for index, lac_label in enumerate(lac_result[1]):
if lac_label == "LOC":
user_name_list.append(lac_result[0][index])
return user_name_list
def match_text_one(rule, text):
# rule = ";".join(new_one)
# print(rule)
# text_one = match_text_one(rule, title)
# print(text_one)
rules = '|'.join(rule.split(';')).strip('\n')
replaced_rules = rules.replace('.', '\.')\
.replace('*', '\*')\
.replace('(', '\(')\
.replace(')', '\)')\
.replace('+', '.+')
pattern = re.compile(r'' + replaced_rules)
print(pattern)
match_result = re.sub(pattern, "A", text)
print(match_result)
return match_result
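

# Illustrative usage note (not part of the original file): match_text_one joins the
# semicolon-separated rules into one alternation regex, escapes . * ( ) literally,
# maps '+' to '.+', and masks every match with "A". With made-up inputs, roughly:
#   match_text_one("国务院;中国+集团", "国务院批复同意,中国建筑集团参与建设。")
#   -> "A批复同意,A参与建设。"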
if __name__ == '__main__':
text_path = ""
data_df = pd.read_excel(text_path, nrows=1).astype(str)
result_list = []
for idx, row in tqdm.tqdm(data_df.iterrows()):
title = row['title']
a_user = lac_username(title)
a_organize = lac_organize_name(title)
a_location = lac_location_name(title)
if a_user:
user_rule = '|'.join(a_user).strip()
pattern0 = re.compile(r'' + user_rule)
result_one = re.sub(pattern0, 'A', title)
title = result_one
if a_organize:
a_organize_rule = '|'.join(a_organize).strip()
pattern1 = re.compile(r'' + a_organize_rule)
result_two = re.sub(pattern1, 'B', result_one)
title = result_two
if a_location:
a_location_rule = '|'.join(a_location).strip()
pattern2 = re.compile(r'' + a_location_rule)
print(pattern2)
result_three = re.sub(pattern2, 'C', result_two)
print(result_three)
title = result_three
row['title'] = title
result_list.append(row)
print(result_list)
# new_one = a_user + a_organize + a_location
# rule = "|".join(new_one)
# pattern = re.compile(r'' + rule)
# result_one = re.sub(pattern, "A", title)
# title = result_one
# print(title)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : main_app.py
# @Time : 2022/12/14 19:49
# @Author : bruxelles_li
# @Software: PyCharm
import os, json
import logging
import numpy as np
from flask import Flask, request, jsonify
import sys
sys.path.append('../')
from 文章内容检查 import clean_html_tag
from 素材库构建程序 import *
from 文章id生成 import create_title_id
import requests
import queue
import pandas as pd
import time
from smart_extractor import extract_by_url_test
import traceback
from pathlib import Path
from tqdm import tqdm
from analysis_model import analysis_function
from search_by_dot_matrix import get_sent_result, get_para_result, get_sen_duplicated, get_para_duplicated, put_para_list, put_sen_list
# Close idle connections instead of keeping them alive
s = requests.session()
s.keep_alive = False
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
HOST = '0.0.0.0'
PORT = 4002
DEBUG = False
app = Flask(__name__)
# Cache directory for search results
cache_path = "测试文件"
Path(cache_path).mkdir(parents=True, exist_ok=True)
# queue.Queue is a basic FIFO (first in, first out) queue
# maxsize caps the number of items; <= 0 means unlimited, otherwise put() blocks once the queue is full until items are consumed
q = queue.Queue(maxsize=0)
# Article contents (the Excel preload below is currently disabled)
# df0 = pd.read_excel('素材库/文章库/入库_article.xlsx').astype(str)
# art_list = df0["content"].tolist()
# content2_id = {row['content']: row['id'] for idx, row in df0.iterrows()}
# CORS support
from flask_cors import CORS
CORS(app, supports_credentials=True)
@app.route("/", methods=["GET"])
def hello_world():
logger.info('Hello World!')
return "Hello World"
type2iddict = {
    "speech_by_leaders": "1602095566267805697",
    "policy_document": "1602095618788880386",
    "expert_opinion": "1602095680285765633",
    "enterprise_case": "1602095727870144513",
    "other": "1602095773126684673",
    # "think_tanks": "",
    # "policies_regulations": "",
    # "enterprise_news": "",
}
type2namedict = {
    "speech_by_leaders": "领导讲话",
    "policy_document": "政策文件",
    "expert_opinion": "专家观点",
    "enterprise_case": "企业案例",
    "other": "其他",
    # "think_tanks": "智库",
    # "policies_regulations": "政策法规",
    # "enterprise_news": "企业资讯",
}
# Text-analysis endpoint
@app.route('/analysis', methods=["POST"])
def analysis_text():
    try:
        data = request.get_json()
        text = data["content"] if data["content"] else ""
        # Parse title, author, origin, time and body out of the raw content
        title, author, origin, articleTime, content = analysis_function(text)
        dict_result = {
            "code": 200,
            "message": "success",
            "resultData": {
                "title": title,
                "author": author,
                "origin": origin,
                "articleTime": articleTime,
                "content": content
            }
        }
    except Exception as e:
        dict_result = {
            'code': 500,
            'message': "failure" + str(e),
            'resultData': None
        }
    logger.info(dict_result)
    return json.dumps(dict_result, ensure_ascii=False)
@app.route('/build_pro', methods=["POST"])
def get_result():
"""
-> data:
领导讲话:1602095566267805697
政策文件:1602095618788880386
专家观点:1602095680285765633
企业案例:1602095727870144513
其他:1602095773126684673
领导讲话:speech_by_leaders
政策文件:policy_document
专家观点:expert_opinion
企业案例:enterprise_case
其他:other
:return:
"""
try:
data = request.get_json()
# todo: 先判断是否提供url链接来获取来源,发布时间,正文内容
if "url" in data:
url = data["url"]
lang_code = data["lang_code"] if "lang_code" in data else "cn"
dict_parse = extract_by_url_test(url, lang_code)
title = dict_parse["title"]
ori_content = dict_parse["content"]
content = clean_html_tag(ori_content)
publishDate = dict_parse["publishDate"]
else:
title = data['title']
ori_content = data['content']
content = clean_html_tag(ori_content)
publishDate = data['publishDate']
infoId = str(data['infoId']) if data["infoId"] else str(create_title_id())
contentTypeFlags = data['contentTypeFlags']
topicNames = data['topicNames']
origin = data['origin']
author = data['author']
# todo: 根据typedict 获取contentType
contentNames = type2namedict[contentTypeFlags]
contentTypeIds = str(type2iddict[contentTypeFlags])
# todo: 若清洗后的文章内容长度不为空,则进行处理,否则返回日志
if len(content) >= 50:
list_para, list_sent = build_pro_new(infoId, content, contentNames, contentTypeIds, topicNames)
# todo: 利用dataframe对两个生成的列表内容进行去重
df_para = pd.DataFrame(list_para)
df_para.drop_duplicates(subset=["para_content"], keep="first", inplace=True)
dict_para = df_para.to_dict()
new_list_para = [dict(zip(dict_para, values)) for values in zip(*[dict_para[k].values() for k in dict_para])]
df_sent = pd.DataFrame(list_sent)
df_sent.drop_duplicates(subset=["sent_content"], keep="first", inplace=True)
dict_sent = df_sent.to_dict()
new_list_sent = [dict(zip(dict_sent, values)) for values in zip(*[dict_sent[k].values() for k in dict_sent])]
# todo: 新增素材库去重,更新repeatedId, is_main, 唯一标识id, create_time
final_list_para = get_para_duplicated(new_list_para)
final_list_sent = get_sen_duplicated(new_list_sent)
dict_result = {
"code": 200,
"message": "success",
"resultData": {
"article_info":
[
{
"repeatedId": "",
"is_main": "",
"create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
"infoId": infoId,
"content": content,
"title": title,
"contentNames": contentNames,
"contentTypeIds": contentTypeIds,
"topicNames": topicNames,
"origin": origin,
"publishDate": publishDate,
"author": author,
"type": "art"
}
],
"para_info": final_list_para,
"sent_info": final_list_sent
}
}
else:
dict_result = {
"code": 500,
"message": "failure" + "文章内容杂乱,请检查并清除杂乱格式再进行操作!",
"resultData": None
}
except Exception as e:
dict_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route('/put_database', methods=["POST"])
def put_result():
try:
data = request.get_json()
para_list = data["resultData"]["para_info"]
sen_list = data["resultData"]["sent_info"]
# todo: 先判断是否提供url链接来获取来源,发布时间,正文内容
sen_flag = put_sen_list(sen_list)
para_flag = put_para_list(para_list)
if sen_flag == "1" and para_flag == "1":
dict_result = {
"code": 200,
"message": "success",
"resultData": "已成功处理"
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
except Exception as e:
dict_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route('/search_content', methods=["POST"])
def get_top_content():
try:
# 定义接收参数
data = request.get_json()
# logger.info(data)
text = str(data['queryText']).strip()
contentTypeFlags = data['contentTypeFlags'] if "contentTypeFlags" in data else []
topicTypeNames = data['topicNames'] if 'topicNames' in data else []
returenType = data['returenType'] if data['returenType'] else "sen"
pageNo = int(data['pageNo'])
pageSize = int(data['pageSize'])
pStartTime = data['pStartTime'] if data['pStartTime'] else "2021-00-00"
pEndTime = data['pEndTime'] if data["pEndTime"] else "2023-00-00"
# todo: 根据字典获取contentTypeName
contentTypeName_list = []
if contentTypeFlags:
for type in contentTypeFlags:
content_type_name = type2namedict[type]
contentTypeName_list.append(content_type_name)
else:
contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
# todo: 调用搜索函数返回推荐list
if returenType == "par":
# todo: 先检查缓存是否可用,若不可用则重新查找
if os.path.isfile(os.path.join(cache_path, "para.json")):
with open(os.path.join(cache_path, "para.json"), 'r', encoding='utf-8') as f:
para_dict_result = json.load(f)
# todo: 继续判断待查询的内容是否与缓存的对象相同
if text == str(para_dict_result["text"]).strip():
final_para_list = []
para_list = para_dict_result["para_list"]
for row in tqdm(para_list):
if pd.isna(row["content_type_name"]):
final_para_list.append(row)
else:
if row["content_type_name"] in contentTypeName_list:
final_para_list.append(row)
else:
continue
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
result_list = final_para_list[pre_index:suf_index]
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list,
"pageNo": pageNo,
"pageSize": pageSize,
"total": len(result_list)
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
os.remove(os.path.join(cache_path, "para.json"))
result_list, len_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime,
pEndTime, pageSize, pageNo, returenType)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_para = {
"text": text,
"para_list": result_list
}
with open(os.path.join(cache_path, "para.json"), 'w', encoding="utf-8") as file:
json.dump(dict_para, file)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
result_list, len_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo, returenType)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_para = {
"text": text,
"para_list": result_list
}
with open(os.path.join(cache_path, "para.json"), 'w', encoding="utf-8") as file:
json.dump(dict_para, file)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
# todo: 处理句子
else:
# # todo: 当前句子缓存读取有异常,暂时先不用该方式
# result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime,
# pEndTime, pageSize, pageNo, returenType)
# # logger.info(result_list)
# pre_index = pageNo * pageSize - pageSize
# suf_index = pageNo * pageSize
# # dict_sent = {
# # "text": text,
# # "sent_list": result_list
# # }
# # # todo: 将内容转换为JSON字符串用来存储
# # sent_result = json.dumps(dict_sent)
# #
# # with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
# # file.write(sent_result)
# dict_result = {
# 'code': 200,
# 'message': 'success',
# 'result_data': {
# "match_info": result_list[pre_index:suf_index],
# "pageNo": pageNo,
# "pageSize": pageSize,
# "total": len_list
# }
# }
# # logger.info(dict_result)
# return json.dumps(dict_result, ensure_ascii=False)
# todo: 先检查缓存是否可用,若不可用则重新查找
if os.path.isfile(os.path.join(cache_path, "sent.json")):
with open(os.path.join(cache_path, "sent.json"), 'r', encoding='utf-8') as f:
sent_dict_result = json.load(f)
logger.info(sent_dict_result)
# todo: 继续判断待查询的内容是否与缓存的对象相同
if text == sent_dict_result["text"]:
sent_list = sent_dict_result["sent_list"]
final_sent_list = []
for row in tqdm(sent_list):
if pd.isna(row["content_type_name"]):
final_sent_list.append(row)
else:
if row["content_type_name"] in contentTypeName_list:
final_sent_list.append(row)
else:
continue
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
result_list = final_sent_list[pre_index:suf_index]
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list,
"pageNo": pageNo,
"pageSize": pageSize,
"total": len(final_sent_list)
}
}
return json.dumps(dict_result, ensure_ascii=False)
else:
os.remove(os.path.join(cache_path, "sent.json"))
result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime,
pEndTime, pageSize, pageNo, returenType)
# logger.info(result_list)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_sent = {
"text": text,
"sent_list": result_list
}
# todo: 将内容转换为JSON字符串用来存储
with open(os.path.join(cache_path, "sent.json"), 'w', encoding="utf-8") as file:
json.dump(dict_sent, file)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo, returenType)
logger.info(result_list)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_sent = {
"text": text,
"sent_list": result_list
}
# todo: 将内容转换为JSON字符串用来存储
with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
json.dump(dict_sent, file)
# file.write(para_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
except Exception as e:
traceback.print_exc()
dic_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dic_result)
return json.dumps(dic_result, ensure_ascii=False)
if __name__ == '__main__':
    app.run(host=HOST, port=PORT, debug=DEBUG)

if __name__ != '__main__':
    gunicorn_logger = logging.getLogger('gunicorn.error')
    app.logger.handlers = gunicorn_logger.handlers
    app.logger.setLevel(gunicorn_logger.level)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : main_search.py
# @Time : 2022/12/14 09:25
# @Author : bruxelles_li
# @Software: PyCharm
import requests
# import os
from pathlib import Path
import threading
# from search_method import get_para_result, get_sent_result
from gj_app import para_process, sent_process
from 缓存处理 import MemoryCache
import json
import time
import logging
# Log output format
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a logger and set its level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Create a handler that writes log records to the console and set its level
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
# # todo: cache variable (disabled)
# memory_cache = MemoryCache()
# TODO: list that holds the worker threads
all_thread = []
# todo: 定义段落处理
# def para_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: str, pEndTime: str, pageSize: int):
# pageNo = 10
# para_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# dict_para = {
# "text": para_list
# }
# para_result = json.dumps(dict_para)
#
# with open(para_path, 'w', encoding='utf-8') as file:
# file.write(para_result)
# time.sleep(120)
# os.remove(para_path)
# return None
# # todo: 定义句子处理
# def sent_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: str, pEndTime: str, pageSize: int):
# pageNo = 10
# sent_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# dict_sent = {
# "text": sent_list
# }
# sent_result = json.dumps(dict_sent)
#
# with open(sent_path, 'w', encoding='utf-8') as file:
# file.write(sent_result)
# time.sleep(120)
# os.remove(sent_path)
# return None
def system_start():
while True:
# print("=====正在进行后台任务=====")
headers = {
'Content-Type': 'application/json'
}
r1 = requests.post(url='http://localhost:4001/queue_size', headers=headers)
r1_json = json.loads(r1.text)
# print(r1_json)
queue_left_number = r1_json['queue_left_number']
logger.info("当前队列任务总数:" + str(queue_left_number))
if queue_left_number == 0:
# logger.warning("队列为空!无可处理任务。")
time.sleep(3)
else:
for i in range(queue_left_number):
r2 = requests.post(url='http://localhost:4001/subject_consumer', headers=headers)
r2_json = json.loads(r2.text)
config_info = r2_json['data']
logger.info(config_info)
if config_info["type"] == "par":
text = config_info["text"]
contentTypeFlags = config_info["contentTypeFlags"]
topicTypeNames = config_info["topicTypeNames"]
pStartTime = config_info["pStartTime"]
pEndTime = config_info["pEndTime"]
pageSize = config_info["pageSize"]
logger.info('##########处理后台段落查询###############')
t = threading.Thread(target=para_process, args=(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize), daemon=True)
# 启动
t.start()
all_thread.append(t)
else:
text = config_info["text"]
contentTypeFlags = config_info["contentTypeFlags"]
topicTypeNames = config_info["topicTypeNames"]
pStartTime = config_info["pStartTime"]
pEndTime = config_info["pEndTime"]
pageSize = config_info["pageSize"]
logger.info('##########处理后台句子查询###############')
t = threading.Thread(target=sent_process,
args=(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize),
daemon=True)
# 启动
t.start()
all_thread.append(t)
def system_resume():
"""
恢复模型训练服务状态
:return:
"""
headers = {
'Content-Type': 'application/json'
}
# 清空当前服务中的队列,避免重复启动同一个模型训练
r1 = requests.post(url='http://localhost:4001/queue_size', headers=headers)
r1_json = r1.json()
logger.info('当前队列数量:%d' % r1_json['queue_left_number'])
if r1_json['queue_left_number'] > 0:
logger.info('正在消费队列,直到队列为空!')
while True:
r2 = requests.post(url='http://localhost:4001/subject_consumer', headers=headers)
r2_json = r2.json()
if r2_json['queue_left_number'] == 0:
logger.info('队列消费完毕!可放心进行模型训练 ...')
break
else:
logger.info('队列为空!可放心进行模型训练 ...')
def start_up_check():
"""
启动前检查
:return:
"""
while True:
try:
headers = {
'Content-Type': 'application/json'
}
r0 = requests.post(url='http://localhost:4001/queue_size', headers=headers)
server_started = True
except requests.exceptions.ConnectionError as e:
server_started = False
logger.error("Error: ConnectionError")
logger.warning('服务未启动,请先启动server! 程序已退出。')
exit(123)
# logger.info('server正在尝试自启 ...')
# time.sleep(3)
if server_started:
logger.info("server启动成功!后台服务已启动...")
break
if __name__ == "__main__":
# 开始启动模型训练服务
start_up_check()
logger.info('后台服务恢复中 ...')
system_resume()
time.sleep(30)
logger.info('后台服务恢复完成!')
logger.info('后台服务运行中 ...')
system_start()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : matrix_file.py
# @Time : 2022/12/12 19:23
# @Author : bruxelles_li
# @Software: PyCharm
"""
pip install bert-serving-server && pip install bert-serving-client
"""
import pandas as pd
import numpy as np
from bert_serving.client import BertClient
from tqdm import tqdm
from numpy import *
# bc = BertClient('114.116.49.86', check_length=False)
# df = pd.read_excel('素材库/句子库/入库_sent.xlsx', keep_default_na=False).astype(str)
# df = pd.read_excel("素材库/段落库/入库_para.xlsx", keep_default_na=False).astype(str)
# length = len(df)
# print(length)
vector_path = "database/sent_database/policy_1_para.txt"
np_path = "database/para_database/policy_1_para.npy"
# def encode_sentences(df, path):
# f = df
# """
# for line in f:
# result.append(line.strip('\n'))
# """
# with open(path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(f.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# return None
def save_file(length, np_path):
    # Pack the "id + 768-dim vector" text file into an (N, 769) float matrix and save it as .npy
    A = zeros((int(length), 769), dtype=float)
    with open(vector_path) as f:
        lines = f.readlines()
    A_row = 0
    for line in lines:
        values = line.strip('\n').split(' ')
        A[A_row, :] = values[:]
        A_row += 1
    print(A.shape)
    np.save(np_path, A)
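# Each line of the vector text file consumed above is "<id> <v1> ... <v768>": one material id
# followed by its 768-dimensional BERT vector, which is why the matrix is built with 769
# columns. An illustrative line (the vector values are made up):
#   1602095566267805697 0.0132 -0.2041 ... 0.0875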
if __name__ == "__main__":
# text = "nihao"
# print(bc.encode([text]))
# encode_sentences(df, vector_path)
length = 298092
save_file(length, np_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/12/12 09:11
# @Author : bruxelles_li
# @FileName: merge_file.py
# @Software: PyCharm
import os
import pandas as pd
result = []
path = r"素材库/段落库"
for root, dirs, files in os.walk(path, topdown=False):
for name in files:
if name.endswith(".xls") or name.endswith(".xlsx"):
df = pd.read_excel(os.path.join(root, name), sheet_name=None)
result.append(df)
data_list = []
for data in result:
data_list.extend(data.values()) # 注意这里是extend()函数而不是append()函数
df = pd.concat(data_list)
df.to_excel('素材库/段落库/去重前_para.xlsx', index=False, engine='xlsxwriter')
print("合并完成!")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : merge_numpy.py
# @Time : 2022/1/15 17:00
# @Author : bruxelles_li
# @Software: PyCharm
import numpy as np
import datetime
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
path1 = "database/para_database/policy_1_para.npy"
path2 = "database/para_database/policy_document.npy"
path3 = "database/para_database/update_policy.npy"
start0_time = datetime.datetime.now()
sen_expert_1 = np.load(path1)
print(sen_expert_1.shape)
end0_time = datetime.datetime.now()
total0_time = (end0_time - start0_time).total_seconds()
logger.info("加载矩阵1 共消耗: " + "{:.2f}".format(total0_time) + " 秒")
start1_time = datetime.datetime.now()
sen_expert = np.load(path2)
print(sen_expert.shape)
end1_time = datetime.datetime.now()
total1_time = (end1_time - start1_time).total_seconds()
logger.info("加载矩阵2 共消耗: " + "{:.2f}".format(total1_time) + " 秒")
arr = np.concatenate((sen_expert, sen_expert_1), axis=0)
print(arr.shape)
np.save(path3, arr)
start2_time = datetime.datetime.now()
np.load(path3)
end2_time = datetime.datetime.now()
total2_time = (end2_time - start2_time).total_seconds()
logger.info("加载矩阵2 共消耗: " + "{:.2f}".format(total2_time) + " 秒")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# 定义向量存储文件文件
# vector_parent_path = "database/sent_database/sent_vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# # todo: 调用bert编码服务
# bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_leaders = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/句子库/待入库_expert_1_sent.xlsx", keep_default_na=False).astype(str)
# print(len(df_expert_opinion))
# df_expert_opinion.dropna(axis=0, subset=["content"])
# length = len(df_expert_opinion)
# print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_leaders, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_leaders]
# path1 = "素材库/句子库/vector_path/speech_by_leaders.txt"
vector_path = "database/sent_database/sent_vector_path/expert_1_sent.txt"
np_path = "database/sent_database/sent_vector_path/expert_1_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_leaders.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
# def encode_sentences(vector_path, df, length, np_path):
# with open(vector_path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# # print(vector)
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# f_vectors.close()
# save_file(length, vector_path, np_path)
#
# return None
def save_file(length, vector_path, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
# todo: 定义同时编码主函数
if __name__ == "__main__":
# encode_sentences(vector_path, df_expert_opinion, length, np_path)
save_file(774435, vector_path, np_path)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# 定义向量存储文件目录
vector_parent_path = "database/sent_database/sent_vector_path"
Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# todo: 调用bert编码服务
bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_leaders = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
df_expert_opinion = pd.read_excel("素材库/句子库/待入库_leader_1_sent.xlsx", keep_default_na=False).astype(str)
print(len(df_expert_opinion))
df_expert_opinion = df_expert_opinion.dropna(axis=0, subset=["content"])
length = len(df_expert_opinion)
print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_leaders, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_leaders]
# path1 = "素材库/句子库/vector_path/speech_by_leaders.txt"
vector_path = "database/sent_database/sent_vector_path/leader_1_sent.txt"
np_path = "database/sent_database/sent_vector_path/leader_1_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_leaders.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
def encode_sentences(vector_path, df, length, np_path):
with open(vector_path, 'w', encoding='utf-8') as f_vectors:
for idx, row in tqdm(df.iterrows()):
sentence = row['content']
vector = bc.encode([sentence])
# print(vector)
f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
f_vectors.close()
save_file(length, vector_path, np_path)
return None
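# encode_sentences() above calls bc.encode() once per row. The bert-serving client accepts a
# whole list of sentences per call, so a batched variant cuts the number of round-trips to the
# encoding service. A minimal sketch, assuming the same DataFrame layout (batch size 256 is an
# arbitrary choice):
# def encode_sentences_batched(vector_path, df, length, np_path, batch_size=256):
#     with open(vector_path, 'w', encoding='utf-8') as f_vectors:
#         rows = df.to_dict('records')
#         for start in range(0, len(rows), batch_size):
#             batch = rows[start:start + batch_size]
#             vectors = bc.encode([r['content'] for r in batch])
#             for r, vec in zip(batch, vectors):
#                 f_vectors.write(str(r['id']) + ' ' + ' '.join(map(str, list(vec))) + '\n')
#     save_file(length, vector_path, np_path)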
def save_file(length, vector_path, np_path):
    # Pack the "id + 768-dim vector" text file into an (N, 769) float matrix and save it as .npy
    A = zeros((int(length), 769), dtype=float)
    with open(vector_path) as f:
        lines = f.readlines()
    A_row = 0
    for line in lines:
        values = line.strip('\n').split(' ')
        A[A_row, :] = values[:]
        A_row += 1
    print(A.shape)
    np.save(np_path, A)
# todo: 定义同时编码主函数
if __name__ == "__main__":
encode_sentences(vector_path, df_expert_opinion, length, np_path)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# 定义向量存储文件文件
# vector_parent_path = "database/sent_database/sent_vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# todo: 调用bert编码服务
# bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_policys = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/句子库/待入库_policy_1_sent.xlsx", keep_default_na=False).astype(str)
# print(len(df_expert_opinion))
# df_expert_opinion.dropna(axis=0, subset=["content"])
# length = len(df_expert_opinion)
# print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_policys, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_policys]
# path1 = "素材库/句子库/vector_path/speech_by_policys.txt"
vector_path = "database/sent_database/sent_vector_path/policy_1_sent.txt"
np_path = "database/sent_database/sent_vector_path/policy_1_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_policys.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
# def encode_sentences(vector_path, df, length, np_path):
# with open(vector_path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# # print(vector)
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# f_vectors.close()
# save_file(length, vector_path, np_path)
# return None
def save_file(length, vector_path, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
# todo: 定义同时编码主函数
if __name__ == "__main__":
# encode_sentences(vector_path, df_expert_opinion, length, np_path)
save_file(615784, vector_path, np_path)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# 定义向量存储文件文件
# vector_parent_path = "database/sent_database/sent_vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# # todo: 调用bert编码服务
# bc = BertClient("114.116.54.108", check_length=False)
# # text = "nihao"
# # print(bc.encode([text]))
# # df_speech_by_policys = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/句子库/待入库_other_sent.xlsx", keep_default_na=False).astype(str)
# print(len(df_expert_opinion))
# df_expert_opinion.dropna(axis=0, subset=["content"])
# length = len(df_expert_opinion)
# print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_policys, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_policys]
# path1 = "素材库/句子库/vector_path/speech_by_policys.txt"
vector_path = "database/sent_database/sent_vector_path/other_sent.txt"
np_path = "database/sent_database/sent_vector_path/other_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_policys.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
# def encode_sentences(vector_path, df, length, np_path):
# with open(vector_path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# # print(vector)
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# f_vectors.close()
# save_file(length, vector_path, np_path)
#
# return None
def save_file(length, vector_path, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
# todo: 定义同时编码主函数
if __name__ == "__main__":
# encode_sentences(vector_path, df_expert_opinion, length, np_path)
save_file(694187, vector_path, np_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/4/26 09:45
# @Author : bruxelles_li
# @FileName: process_data.py
# @Software: PyCharm
import pandas as pd
import xlrd
from bs4 import BeautifulSoup
import re
import xlsxwriter
def drop_duplicated_data(path):
data_df = pd.read_excel(path)
print("=====test0======", len(data_df))
# data_df.dropna(subset=['title'], inplace=True)
# print("=====test1======", len(data_df))
data_df.dropna(subset=['企业官网'], inplace=True)
print("=====test2======", len(data_df))
    data_df.drop_duplicates(subset=["企业官网"], keep="first", inplace=True)
    print("=====test3======", len(data_df))
data_df.reset_index(drop=True, inplace=True)
print("=====test4======", len(data_df))
# data_df.drop('list_srl', axis=1, inplace=True)
# data_df.drop('list_result', axis=1, inplace=True)
print(data_df.shape)
# 写入文件
df1 = pd.DataFrame(data_df)
df1.to_excel('500强企业官网域名_去重.xlsx', engine='xlsxwriter', index=False)
# for idx, row in df.iterrows():
if __name__ == "__main__":
path = r"500强企业资讯导出模型参数文件/500强官网域名.xlsx"
drop_duplicated_data(path)
beautifulsoup4==4.11.1
bert_serving==0.0.1
bert_serving_client==1.10.0
Flask==2.2.2
Flask_Cors==3.0.10
goose3==3.1.11
LAC==2.1.2
lxml==4.9.1
numpy==1.22.4
pandas==1.3.5
pytime==0.2.3
requests==2.27.1
scikit_learn==1.2.0
tqdm==4.64.0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : search_by_dot_matrix.py
# @Time : 2022/6/21 15:44
# @Author : bruxelles_li
# @Software: PyCharm
"""
pip install bert-serving-server && pip install bert-serving-client
"""
import time
from multiprocessing.pool import Pool
import datetime
import threading
import os, re
import multiprocessing as mp
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from bert_serving.client import BertClient
from es_byid import find_sent_info, find_para_info, find_sen_content, find_art_info
from tqdm import tqdm
from numpy import *
import numpy as np
import logging
from 文章id生成 import create_title_id
from 缓存处理 import memory_cache
# todo: 定义进程队列
q = mp.Queue()
# 段落
q1 = mp.Queue()
# 句子
q2 = mp.Queue()
lock = mp.Lock()
# todo: 根据某列的属性值获取数据 -> df.loc[df['columnName'] == 'the value']
# todo: 限制线程的最大数量为4个
# sem = threading.Semaphore(4) # 限制线程的最大数量为4个
# 为了控制最大线程数,达到最大线程时应在线程外阻塞,有线程结束后再创建新线程
# 多进程
# record = []
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
type2namedict = {
"speech_by_leaders": "领导讲话",
"policy_document": "政策文件",
"expert_opinion": "专家观点",
"enterprise_case": "企业案例",
"other": "其他"
}
# todo: 调用bert编码服务 "114.116.54.108"
bc = BertClient(check_length=False, port_out=5556, port=5555)
para_prob = 0.75
sent_prob = 0.75
# todo: 定义去重阈值
duplicated_prob = 0.97
# todo: 定义段落保存临时文件路径和文件路径
# para_vector_path = "database/para_database/para_vector_path/other.txt"
# temp_para_vector_path = "database/para_database/para_vector_path/temp_other.txt"
# temp_para_np_path = "database/para_database/para_vector_path/temp_other.npy"
# todo: 定义段落保存临时文件路径和文件路径
# sen_vector_path = "database/sent_database/sen_vector_path/other.txt"
# temp_sen_vector_path = "database/sent_database/sen_vector_path/temp_other.txt"
# temp_sen_np_path = "database/sent_database/sen_vector_path/temp_other.npy"
# todo: 段落的矩阵文件路径
# para_np_path1 = "database/para_database/speech_by_leaders.npy"
para_np_path1 = "database/para_database/update_leaders.npy"
para_np_arrary1 = np.load(para_np_path1)
# para_np_path2 = "database/para_database/expert_opinion.npy"
para_np_path2 = "database/para_database/update_expert.npy"
para_np_arrary2 = np.load(para_np_path2)
# para_np_path3 = "database/para_database/policy_document.npy"
para_np_path3 = "database/para_database/update_policy.npy"
para_np_arrary3 = np.load(para_np_path3)
para_np_path4 = "database/para_database/enterprise_case.npy"
para_np_arrary4 = np.load(para_np_path4)
para_np_path5 = "database/para_database/other_para.npy"
para_np_arrary5 = np.load(para_np_path5)
# para_np_path_list = [para_np_path1, para_np_path2, para_np_path3, para_np_path4, para_np_path5]
# para_np_arrary_list = [para_np_arrary1, para_np_arrary2, para_np_arrary3, para_np_arrary4, para_np_arrary5]
# todo: 定义name与para_path之间的关系字典
para_name2para_path = {
"领导讲话": para_np_path1,
"专家观点": para_np_path2,
"政策文件": para_np_path3,
"企业案例": para_np_path4,
"其他": para_np_path5
}
para_name2para_np_arrary = {
"领导讲话": para_np_arrary1,
"专家观点": para_np_arrary2,
"政策文件": para_np_arrary3,
"企业案例": para_np_arrary4,
"其他": para_np_arrary5
}
# todo: 句子的矩阵文件路径
# sen_np_path1 = "database/sent_database/speech_by_leaders.npy"
sen_np_path1 = "database/sent_database/update_leaders.npy"
sen_np_arrary1 = np.load(sen_np_path1)
# sen_np_path2 = "database/sent_database/expert_opinion.npy"
sen_np_path2 = "database/sent_database/update_expert.npy"
sen_np_arrary2 = np.load(sen_np_path2)
# sen_np_path3 = "database/sent_database/policy_document.npy"
sen_np_path3 = "database/sent_database/update_policy.npy"
sen_np_arrary3 = np.load(sen_np_path3)
sen_np_path4 = "database/sent_database/enterprise_case.npy"
sen_np_arrary4 = np.load(sen_np_path4)
sen_np_path5 = "database/sent_database/other_sent.npy"
sen_np_arrary5 = np.load(sen_np_path5)
# sen_np_path_list = [sen_np_path1, sen_np_path2, sen_np_path3, sen_np_path4, sen_np_path5]
# sen_np_arrary_list = [sen_np_arrary1, sen_np_arrary2, sen_np_arrary3, sen_np_arrary4, sen_np_arrary5]
# todo: 定义name与para_path之间的关系字典
sen_name2sen_path = {
"领导讲话": sen_np_path1,
"专家观点": sen_np_path2,
"政策文件": sen_np_path3,
"企业案例": sen_np_path4,
"其他": sen_np_path5
}
sen_name2sen_np_arrary = {
"领导讲话": sen_np_arrary1,
"专家观点": sen_np_arrary2,
"政策文件": sen_np_arrary3,
"企业案例": sen_np_arrary4,
"其他": sen_np_arrary5
}
def save_file(length, vector_path, np_path):
    # Pack the "id + 768-dim vector" text file into an (N, 769) float matrix and save it as .npy
    A = zeros((int(length), 769), dtype=float)
    with open(vector_path) as f:
        lines = f.readlines()
    A_row = 0
    for line in lines:
        values = line.strip('\n').split(' ')
        A[A_row, :] = values[:]
        A_row += 1
    np.save(np_path, A)
    return None
# todo: 定义段落素材入库
def put_para_list(para_list):
# todo: 导出“其他”素材库构建矩阵
para_arrary = np.load(para_np_path5)
# todo: 定义段落保存临时文件路径
temp_para_vector_path = "database/para_database/temp_other.txt"
temp_para_np_path = "database/para_database/temp_other.npy"
length = len(para_list)
with open(temp_para_vector_path, "w", encoding='utf-8') as f_vectors:
for row in tqdm(para_list):
content = row["para_content"]
_id = row["id"]
is_main = row["is_main"]
if is_main == "0":
continue
else:
vector = bc.encode([content])
f_vectors.write(_id + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
f_vectors.close()
save_file(length, temp_para_vector_path, temp_para_np_path)
temp_arrary = np.load(temp_para_np_path)
arr = np.concatenate((para_arrary, temp_arrary), axis=0)
np.save(para_np_path5, arr)
flag = "1"
os.remove(temp_para_vector_path)
os.remove(temp_para_np_path)
return flag
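# put_para_list() extends the "其他" (other) paragraph index incrementally: rows whose is_main
# flag is "0" are skipped, the remaining rows are encoded and written as "id + vector" lines
# to a temporary text file, packed into a temporary .npy by save_file(), concatenated onto
# other_para.npy, and the temporary files are removed. Note that `length` counts every row of
# para_list, so skipped rows leave all-zero rows in the temporary matrix that get appended too.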
# todo: 定义句子素材入库
def put_sen_list(sen_list):
# todo: 导出“其他”素材库构建矩阵
sen_arrary = np.load(sen_np_path5)
# todo: 定义段落保存临时文件路径
temp_sen_vector_path = "database/sent_database/temp_other.txt"
temp_sen_np_path = "database/sent_database/temp_other.npy"
length = len(sen_list)
with open(temp_sen_vector_path, "w", encoding='utf-8') as f_vectors:
for row in tqdm(sen_list):
content = row["sent_content"]
_id = row["id"]
is_main = row["is_main"]
if is_main == "0":
continue
else:
vector = bc.encode([content])
f_vectors.write(_id + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
f_vectors.close()
save_file(length, temp_sen_vector_path, temp_sen_np_path)
temp_arrary = np.load(temp_sen_np_path)
arr = np.concatenate((sen_arrary, temp_arrary), axis=0)
    np.save(sen_np_path5, arr)
flag = "1"
os.remove(temp_sen_vector_path)
os.remove(temp_sen_np_path)
return flag
# # todo: 定义段落并行去重判断
# def get_para_max_id(np_path, text_encode, q1):
# logger.info("子进程开始执行>>> pid={}".format(os.getpid()))
# a = text_encode
# # 导入初始矩阵
# b = np.load(np_path)
# # todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
# c = b.transpose()
# d = c[1::].transpose()
# # todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
# id_list = c[0].tolist()
# # todo: 将矩阵索引与id_list通过定义id_dict关联
# # 根据行长度初始化矩阵索引np_list
# np_list = [n for n in range(b.shape[0])]
# id_dict = dict(zip(np_list, id_list))
# s = cosine_similarity(a, d)
# temp = np.max(s)
# m = np.argmax(s)
# r, c = divmod(m, s.shape[1])
# if temp >= duplicated_prob:
# _id = str(id_dict[c]).split(".")[0]
# q1.put(_id)
# return _id
# todo: 定义段落并行去重判断
def get_para_max_id(np_arrary, text_encode_list, q1):
logger.info("子进程开始执行>>> pid={}".format(os.getpid()))
a = text_encode_list
# 导入初始矩阵
b = np_arrary
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
c = b.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# todo: 将矩阵索引与id_list通过定义id_dict关联
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(b.shape[0])]
id_dict = dict(zip(np_list, id_list))
    s = cosine_similarity(a, d)
    # For each candidate row take the best-matching column and its similarity in one pass
    max_values = np.amax(s, axis=1)
    column_list = np.argmax(s, axis=1).tolist()
    id_list = []
    for sim, column_value in zip(max_values.tolist(), column_list):
        if sim >= duplicated_prob:
            _id = str(id_dict[column_value]).split(".")[0]
        else:
            _id = ""
        id_list.append(_id)
    q1.put(id_list)
    return id_list
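# Layout of every category matrix used above: each row is [id, v1, ..., v768], so for a matrix
# b of shape (N, 769):
#   c = b.transpose()        -> shape (769, N); c[0] holds the N material ids
#   d = c[1::].transpose()   -> shape (N, 768); the pure vectors fed to cosine_similarity
#   id_dict maps the row index 0..N-1 back to the id stored in column 0.
# cosine_similarity(a, d) then gives one similarity row per candidate text in a, and the best
# match is reported as a duplicate only when it reaches duplicated_prob (0.97).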
# # todo: 定义句子并行去重判断
# def get_sen_max_id(np_path, text_encode, q2):
# logger.info("子进程开始执行>>> pid={}".format(os.getpid()))
# a = text_encode
# # 导入初始矩阵
# b = np.load(np_path)
# # todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
# c = b.transpose()
# d = c[1::].transpose()
# # todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
# id_list = c[0].tolist()
# # todo: 将矩阵索引与id_list通过定义id_dict关联
# # 根据行长度初始化矩阵索引np_list
# np_list = [n for n in range(b.shape[0])]
# id_dict = dict(zip(np_list, id_list))
# s = cosine_similarity(a, d)
# temp = np.max(s)
# m = np.argmax(s)
# r, c = divmod(m, s.shape[1])
# if temp >= duplicated_prob:
# _id = str(id_dict[c]).split(".")[0]
# q2.put(_id)
# return _id
# todo: 定义段落列表去重函数
# todo: 定义句子并行去重判断
def get_sen_max_id(np_arrary, text_encode_list, q2):
logger.info("子进程开始执行>>> pid={}".format(os.getpid()))
a = text_encode_list
# 导入初始矩阵
b = np_arrary
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
c = b.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# todo: 将矩阵索引与id_list通过定义id_dict关联
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(b.shape[0])]
id_dict = dict(zip(np_list, id_list))
    s = cosine_similarity(a, d)
    # For each candidate row take the best-matching column and its similarity in one pass
    max_values = np.amax(s, axis=1)
    column_list = np.argmax(s, axis=1).tolist()
    id_list = []
    for sim, column_value in zip(max_values.tolist(), column_list):
        if sim >= duplicated_prob:
            _id = str(id_dict[column_value]).split(".")[0]
        else:
            _id = ""
        id_list.append(_id)
    q2.put(id_list)
    return id_list
def get_para_duplicated(para_list):
dup_record = []
dup_para_list = []
# todo: 暂做调整,将原先for循环遍历去重改为批量去重
# 利用pd将list转为df
df = pd.DataFrame(para_list)
content_list = df["para_content"].tolist()
encode_content_list = bc.encode(content_list)
# todo: 根据字典获取contentTypeName
contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
# todo: 应用并发方式处理文件
logger.info("主进程开始执行>>> pid={}".format(os.getpid()))
start_t = datetime.datetime.now()
for type_name in contentTypeName_list:
        # np_path = para_name2para_path[type_name]
        np_arrary = para_name2para_np_arrary[type_name]
logger.info(np_arrary)
process = mp.Process(target=get_para_max_id, args=(
np_arrary, encode_content_list, q1))
process.start()
dup_record.append(process)
# 获取结果
results = []
for record_item in dup_record:
while record_item.is_alive():
while False == q1.empty():
temp_results = q1.get()
results.append(temp_results)
# todo: 定义并行处理结果list
res_list = []
for res in results:
res_list.append(res)
logger.info(res_list)
for process in dup_record:
process.join()
process.terminate()
logger.info("主进程终止")
end_t = datetime.datetime.now()
elapsed_sec = (end_t - start_t).total_seconds()
logger.info("多进程计算 共消耗: " + "{:.2f}".format(elapsed_sec) + " 秒")
# todo: 追加结果, 其中非 0 为主条目
temp_list = ["id1", "id2", "id3", "id4"]
dict_temp_result = dict(zip(temp_list, res_list))
temp_df = pd.DataFrame(dict_temp_result)
new_df = temp_df["id1"].str.cat(temp_df["id2"], sep=";").str.cat(temp_df["id3"], sep=";").str.cat(temp_df["id4"],
sep=";")
new_id_list = new_df.tolist()
final_id_list = []
for id in new_id_list:
final_id = re.sub(r";;", ";", id).strip(";")
final_id_list.append(final_id)
temp_df1 = pd.DataFrame({
"para_content": content_list,
"repeatedId": final_id_list
})
# todo: 将匹配信息进行整合,包括df + temp1_df
final_df = pd.merge(df, temp_df1, on="para_content")
for idx, row in tqdm(final_df.iterrows()):
if row["repeatedId"]:
row["is_main"] = "0"
else:
row["is_main"] = ""
dup_para_list.append({
"create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
"para_id": row["para_id"],
"infoId": row["infoId"],
"para_index": row["para_index"],
"para_content": row["para_content"],
"contentTypeIds": row["contentTypeIds"],
"contentNames": row["contentNames"],
"topicNames": row["topicNames"],
"type": row["type"],
"repeatedId": row["repeatedId"],
"is_main": row["is_main"]
})
# dup_para_list.append(row)
return dup_para_list
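# Batched de-duplication flow shared by get_para_duplicated() above and get_sen_duplicated()
# below:
#   1. encode all candidate texts with a single bc.encode() call;
#   2. compare them against each of the five category matrices in its own process and collect
#      the per-row best-match ids from the queue;
#   3. join the per-category hits with ";" into repeatedId and mark any row that matched
#      something as is_main = "0" (not a primary entry) before returning the enriched records.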
# for row in tqdm(para_list):
# content = row["para_content"]
# # todo: 考虑编码服务在并行计算时不友好,先对待匹配文本进行编码
# text_encode = bc.encode([content])
# # todo: 根据字典获取contentTypeName
# contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
# # todo: 应用并发方式处理文件
# logger.info("主进程开始执行>>> pid={}".format(os.getpid()))
# start_t = datetime.datetime.now()
#
# for type_name in contentTypeName_list:
# np_path = para_name2para_path[type_name]
# logger.info(np_path)
# process = mp.Process(target=get_para_max_id, args=(
# np_path, text_encode, q1))
# process.start()
# dup_record.append(process)
#
# # 获取结果
# results = []
# for record_item in dup_record:
# while record_item.is_alive():
# while False == q1.empty():
# temp_results = q1.get()
# results.append(temp_results)
# # todo: 定义并行处理结果list
# res_list = []
# for res in results:
# res_list.append(res)
# logger.info(res_list)
#
# for process in dup_record:
# process.join()
# process.terminate()
#
# logger.info("主进程终止")
# end_t = datetime.datetime.now()
# elapsed_sec = (end_t - start_t).total_seconds()
# logger.info("多进程计算 共消耗: " + "{:.2f}".format(elapsed_sec) + " 秒")
# # todo: 追加结果, 其中非 0 为主条目
# if res_list:
# repeatedId = ";".join(res_list)
# is_main = "0"
# else:
# repeatedId = ""
# is_main = ""
#
# dup_para_list.append({
# "create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
# "para_id": row["para_id"],
# "infoId": row["infoId"],
# "para_index": row["para_index"],
# "para_content": row["para_content"],
# "contentTypeIds": row["contentTypeIds"],
# "contentNames": row["contentNames"],
# "topicNames": row["topicNames"],
# "type": row["type"],
# "repeatedId": repeatedId,
# "is_main": is_main
# })
#
# return dup_para_list
# todo: 定义句子列表去重函数
def get_sen_duplicated(sen_list):
sen_dup_record = []
dup_sen_list = []
# todo: 暂做调整,将原先for循环遍历去重改为批量去重
# 利用pd将list转为df
df = pd.DataFrame(sen_list)
content_list = df["sent_content"].tolist()
encode_content_list = bc.encode(content_list)
# todo: 根据字典获取contentTypeName
contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
# todo: 应用并发方式处理文件
logger.info("主进程开始执行>>> pid={}".format(os.getpid()))
start_t = datetime.datetime.now()
for type_name in contentTypeName_list:
# np_path = sen_name2sen_path[type_name]
np_arrary = sen_name2sen_np_arrary[type_name]
process = mp.Process(target=get_sen_max_id, args=(
np_arrary, encode_content_list, q2))
process.start()
sen_dup_record.append(process)
# for row in tqdm(sen_list):
# content = row["sent_content"]
# # todo: 考虑编码服务在并行计算时不友好,先对待匹配文本进行编码
# text_encode = bc.encode([content])
# # todo: 根据字典获取contentTypeName
# contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
# # todo: 应用并发方式处理文件
# logger.info("主进程开始执行>>> pid={}".format(os.getpid()))
# start_t = datetime.datetime.now()
# for type_name in contentTypeName_list:
# np_path = sen_name2sen_path[type_name]
# logger.info(np_path)
# process = mp.Process(target=get_sen_max_id, args=(
# np_path, text_encode, q2))
# process.start()
# sen_dup_record.append(process)
# 获取结果
results = []
for record_item in sen_dup_record:
while record_item.is_alive():
while False == q2.empty():
temp_results = q2.get()
results.append(temp_results)
# todo: 定义并行处理结果list
res_list = []
for res in results:
res_list.append(res)
# logger.info(res_list)
for process in sen_dup_record:
process.join()
process.terminate()
logger.info("主进程终止")
end_t = datetime.datetime.now()
elapsed_sec = (end_t - start_t).total_seconds()
logger.info("多进程计算 共消耗: " + "{:.2f}".format(elapsed_sec) + " 秒")
# todo: 追加结果, 其中非 0 为主条目
temp_list = ["id1", "id2", "id3", "id4"]
dict_temp_result = dict(zip(temp_list, res_list))
temp_df = pd.DataFrame(dict_temp_result)
new_df = temp_df["id1"].str.cat(temp_df["id2"], sep=";").str.cat(temp_df["id3"], sep=";").str.cat(temp_df["id4"],
sep=";")
new_id_list = new_df.tolist()
final_id_list = []
for id in new_id_list:
final_id = re.sub(r";;", ";", id).strip(";")
final_id_list.append(final_id)
# logger.info(final_id_list)
temp_df1 = pd.DataFrame({
"sent_content": content_list,
"repeatedId": final_id_list
})
# todo: 将匹配信息进行整合,包括df + temp1_df
final_df = pd.merge(df, temp_df1, on="sent_content")
# logger.info(final_df)
for idx, row in tqdm(final_df.iterrows()):
if row["repeatedId"]:
row["is_main"] = "0"
else:
row["is_main"] = ""
dup_sen_list.append({
"create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
"sent_id": row["sent_id"],
"para_id": row["para_id"],
"infoId": row["infoId"],
"sent_article_index": row["sent_article_index"],
"sent_para_index": row["sent_para_index"],
"sent_content": row["sent_content"],
"contentTypeIds": row["contentTypeIds"],
"contentNames": row["contentNames"],
"topicNames": row["topicNames"],
"type": row["type"],
"repeatedId": row["repeatedId"],
"is_main": row["is_main"]
})
# logger.info(dup_sen_list)
return dup_sen_list
# if res_list:
# repeatedId = ";".join(res_list)
# is_main = "0"
# else:
# repeatedId = ""
# is_main = ""
#
# dup_sen_list.append({
# "create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
# "sent_id": row["sent_id"],
# "para_id": row["para_id"],
# "infoId": row["infoId"],
# "sent_article_index": row["sent_article_index"],
# "sent_para_index": row["sent_para_index"],
# "sent_content": row["sent_content"],
# "contentTypeIds": row["contentTypeIds"],
# "contentNames": row["contentNames"],
# "topicNames": row["topicNames"],
# "type": row["type"],
# "repeatedId": repeatedId,
# "is_main": is_main
# })
#
# return dup_sen_list
# todo: 定义段落的相似性判断函数
def get_para_top(np_arrary, text_encode, topicTypeNames, pStartTime, pEndTime, returenType, q, lock):
logger.info("子进程开始执行>>> pid={}".format(os.getpid()))
a = text_encode
# 导入初始矩阵
# b = np.load(np_path)
b = np_arrary
# todo: 考虑当数据量在4g时,矩阵计算时间超过4秒,先将矩阵进行切片后计算, 当满足条件的内容长度大于30时不进行后续计算
start0_time = datetime.datetime.now()
sub_arrarys = np.array_split(b, 500)
sim_result = []
id_result = []
for x in sub_arrarys:
if len(sim_result) < 30:
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
c = x.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(x.shape[0])]
id_dict = dict(zip(np_list, id_list))
r = cosine_similarity(a, d)
target = np.where(r >= sent_prob)
column_list = target[1].tolist()
if column_list:
id_list = [str(id_dict[i]).split(".")[0] for i in column_list]
sim_list = r[target].tolist()
sim_result.extend(sim_list)
id_result.extend(id_list)
else:
break
end0_time = datetime.datetime.now()
total0_time = (end0_time - start0_time).total_seconds()
logger.info(len(id_result))
logger.info("拆分矩阵计算 共消耗: " + "{:.2f}".format(total0_time) + " 秒")
df1 = pd.DataFrame({
"id": id_result,
"sim": sim_result
})
test1 = df1.sort_values(by=['sim'], axis=0, ascending=False)
# todo: 场景1 ->不勾选主题参数
if len(topicTypeNames) == 0:
df2 = test1[:10]
# todo: 场景2 ->勾选主题参数
else:
df2 = test1[:30]
# todo: 先取唯一标识id,并调用es查询获取匹配信息
new_id_list = df2["id"].tolist()
try:
info_df = find_para_info(new_id_list)
except IndexError:
return None
# todo: 将匹配信息进行整合,包括df2 + info_df
temp_df = pd.merge(df2, info_df, on="id")
result = []
for idx, row in tqdm(temp_df.iterrows()):
_id = row['id']
para_content = row["content"]
paragraphid = row["paragraphId"]
paraindex = row["paraArticleIndex"]
para_topic_type = row["topicType"]
para_content_type_name = row["contentTypeName"]
para_article_id = row["articleId"]
# todo: 根据段落所在的文章id获取文章信息
try:
art_temp_result = find_art_info(para_article_id)
except IndexError:
continue
title = art_temp_result["articleTitle"]
origin = art_temp_result["origin"]
time = art_temp_result["articleTime"]
author = art_temp_result["author"]
article_content = art_temp_result["content"]
# todo: 场景1 ->勾选主题参数,判断主题和时间范围
if topicTypeNames:
if para_topic_type in topicTypeNames:
result.append({
"content": "<font style='color:red;'>" + para_content + "</font>",
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": para_article_id,
"paragraphid": paragraphid,
"match_index": paraindex,
"topic_type": para_topic_type,
"content_type_name": para_content_type_name,
"article_content": article_content,
"publishDate": time,
"author": author,
"origin": origin,
"title": title,
"type": returenType
})
# todo: 不勾选主题参数,只判断时间参数
else:
result.append({
"content": "<font style='color:red;'>" + para_content + "</font>",
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": para_article_id,
"paragraphid": paragraphid,
"match_index": paraindex,
"topic_type": para_topic_type,
"content_type_name": para_content_type_name,
"article_content": article_content,
"publishDate": time,
"author": "",
"origin": origin,
"title": title,
"type": returenType
})
q.put(result)
# # todo: 方案2——根据for循环遍历矩阵来获取信息
# list_index = []
# for i in range(r.shape[0]):
# for j in range(r.shape[1]):
# sim_value = r[i][j]
# if sim_value >= para_prob:
# list_index.append({
# 'sim': sim_value,
# '_id': str(id_dict[j]).split(".")[0]
# })
# # todo: 根据相似值对内容进行排序
# name = ['_id', 'sim']
# df = pd.DataFrame(columns=name, data=list(list_index))
# test = df.sort_values(by=['sim'], axis=0, ascending=False)
# # todo: 场景1 ->不勾选主题参数
# if len(topicTypeNames) == 0:
# # df1 = test[:30]
# df1 = test[:10]
# # df1 = test[pre_index:suf_index]
# # todo: 场景2 ->勾选主题参数
# else:
# # df1 = test[:100]
# df1 = test[:30]
# # df1 = test[pre_index:4 * suf_index]
# # 定义结果文件
# result = []
# for idx, row in tqdm(df1.iterrows()):
# _id = row['_id']
# # todo: 根据唯一标识id获取段落信息, 获取方式为调用es查询接口
# para_temp_result = find_info(_id)
# para_content = para_temp_result["content"]
# paragraphid = para_temp_result["paragraphId"]
# paraindex = para_temp_result["paraArticleIndex"]
# para_topic_type = para_temp_result["topicType"]
# para_content_type_name = para_temp_result["contentTypeName"]
# para_article_id = para_temp_result["articleId"]
# # todo: 根据段落所在的文章id获取文章信息
# art_temp_result = find_art_info(para_article_id)
# title = art_temp_result["articleTitle"]
# origin = art_temp_result["origin"]
# time = art_temp_result["articleTime"]
# author = art_temp_result["author"]
# article_content = art_temp_result["content"]
#
# # todo: 场景1 ->勾选主题参数,判断主题和时间范围
# if topicTypeNames:
# # if para_topic_type in topicTypeNames and pStartTime <= time <= pEndTime:
# if para_topic_type in topicTypeNames:
# result.append({
# "content": "<font style='color:red;'>" + para_content + "</font>",
# "similarity": round(row['sim'], 4),
# "id": _id,
# "article_id": para_article_id,
# "paragraphid": paragraphid,
# "match_index": paraindex,
# "topic_type": para_topic_type,
# "content_type_name": para_content_type_name,
# "article_content": article_content,
# "publishDate": time,
# "author": author,
# "origin": origin,
# "title": title,
# "type": returenType
# })
# # todo: 不勾选主题参数,只判断时间参数
# else:
# # if pStartTime <= time <= pEndTime:
# result.append({
# "content": "<font style='color:red;'>" + para_content + "</font>",
# "similarity": round(row['sim'], 4),
# "id": _id,
# "article_id": para_article_id,
# "paragraphid": paragraphid,
# "match_index": paraindex,
# "topic_type": para_topic_type,
# "content_type_name": para_content_type_name,
# "article_content": article_content,
# "publishDate": time,
# "author": "",
# "origin": origin,
# "title": title,
# "type": returenType
# })
# # 释放
# # sem.release()
# q.put(result)
# lock.release()
return result
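# Search strategy used by get_para_top() above and get_sen_top() below: the (N, 769) category
# matrix is split into 500 row-chunks so each cosine_similarity call stays small. Chunks are
# only processed while fewer than 30 candidates have been collected, and the scan stops early
# as soon as a chunk yields no row above the similarity threshold. The surviving (id, sim)
# pairs are sorted by similarity, cut to the top 10 (no topic filter) or top 30 (with topic
# filter), enriched through the es_byid helpers, and pushed onto the shared queue q.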
# todo: 定义句子的相似性判断函数
def get_sen_top(np_arrary, text_encode, topicTypeNames, pStartTime, pEndTime, returenType, q, lock):
logger.info("子进程开始执行>>> pid={}".format(os.getpid()))
a = text_encode
# 导入初始矩阵
# b = np.load(np_path)
b = np_arrary
# todo: 考虑当数据量在4g时,矩阵计算时间超过4秒,先将矩阵进行切片后计算, 当满足条件的内容长度大于30时不进行后续计算
start0_time = datetime.datetime.now()
sub_arrarys = np.array_split(b, 500)
sim_result = []
id_result = []
for x in sub_arrarys:
if len(sim_result) < 30:
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
c = x.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(x.shape[0])]
id_dict = dict(zip(np_list, id_list))
r = cosine_similarity(a, d)
target = np.where(r >= sent_prob)
column_list = target[1].tolist()
if column_list:
id_list = [str(id_dict[i]).split(".")[0] for i in column_list]
sim_list = r[target].tolist()
sim_result.extend(sim_list)
id_result.extend(id_list)
else:
break
end0_time = datetime.datetime.now()
total0_time = (end0_time - start0_time).total_seconds()
logger.info(len(id_result))
logger.info("拆分矩阵计算 共消耗: " + "{:.2f}".format(total0_time) + " 秒")
df1 = pd.DataFrame({
"id": id_result,
"sim": sim_result
})
test1 = df1.sort_values(by=['sim'], axis=0, ascending=False)
# todo: 场景1 ->不勾选主题参数
if len(topicTypeNames) == 0:
df2 = test1[:10]
# todo: 场景2 ->勾选主题参数
else:
df2 = test1[:30]
# todo: 先取唯一标识id,并调用es查询获取匹配信息
new_id_list = df2["id"].tolist()
# todo: 记录es查询结果时间
start2_time = datetime.datetime.now()
try:
info_df = find_sent_info(new_id_list)
except IndexError:
return None
# todo: 将匹配信息进行整合,包括df2 + info_df
temp_df = pd.merge(df2, info_df, on="id")
# # todo: 根据段落所在的文章id获取文章信息
# new_art_id_list = temp_df["articleId"].tolist()
# art_info_df = find_art_info(new_art_id_list)
# # todo: 将文章信息整合到temp_df
# final_df = pd.merge(temp_df, art_info_df, on="articleId")
# print(final_df)
result = []
for idx, row in tqdm(temp_df.iterrows()):
sentence_id = row["sentenceId"]
sent_article_id = row["articleId"]
sent_content = row["content"]
# todo: 根据句子所在的文章id获取文章信息
try:
art_temp_result = find_art_info(sent_article_id)
except IndexError:
continue
title = art_temp_result["articleTitle"]
origin = art_temp_result["origin"]
time = art_temp_result["articleTime"]
author = art_temp_result["author"]
article_content = art_temp_result["content"]
# todo: 根据sentence_id 和 sent_article_id 获取前后句
final_content = find_sen_content(sent_article_id, sentence_id, sent_content)
# todo: 场景1 ->勾选主题参数,判断主题和时间范围
if topicTypeNames:
if row["topicType"] in topicTypeNames:
result.append({
"content": final_content,
"similarity": round(row['sim'], 4),
"id": row["id"],
"article_id": sent_article_id,
"paragraphid": row["paragraphId"],
"match_index": row["sentParaIndex"] + ";" + row["sentArticleIndex"],
"topic_type": row["topicType"],
"content_type_name": row["contentTypeName"],
"article_content": article_content,
"publishDate": time,
"author": author,
"origin": origin,
"title": title,
"type": returenType
})
# todo: 场景2 -> 不勾选类型参数, 仅判断事件范围
else:
result.append({
"content": final_content,
"similarity": round(row['sim'], 4),
"id": row["id"],
"article_id": sent_article_id,
"paragraphid": row["paragraphId"],
"match_index": row["sentParaIndex"] + ";" + row["sentArticleIndex"],
"topic_type": row["topicType"],
"article_content": article_content,
"publishDate": time,
"author": author,
"origin": origin,
"title": title,
"type": returenType
})
end2_time = datetime.datetime.now()
total2_time = (end2_time - start2_time).total_seconds()
logger.info("es查询内容 共消耗: " + "{:.2f}".format(total2_time) + " 秒")
q.put(result)
# # todo: 方案2——根据for循环遍历矩阵来获取信息
# list_index = []
# start2_time = datetime.datetime.now()
# for i in range(r.shape[0]):
# for j in range(r.shape[1]):
# sim_value = r[i][j]
# if sim_value >= sent_prob:
# list_index.append({
# 'sim': sim_value,
# '_id': str(id_dict[j]).split(".")[0]
# })
# # todo: 根据相似值对内容进行排序
# name = ['_id', 'sim']
# df = pd.DataFrame(columns=name, data=list(list_index))
# test = df.sort_values(by=['sim'], axis=0, ascending=False)
# end2_time = datetime.datetime.now()
# total2_time = (end2_time - start2_time).total_seconds()
# logger.info("for循环遍历 共消耗: " + "{:.2f}".format(total2_time) + " 秒")
# # todo: 场景1 ->不勾选主题参数
# if len(topicTypeNames) == 0:
# df1 = test[:10]
# # todo: 场景2 ->勾选主题参数
# else:
# df1 = test[:30]
# # 定义结果文件
# result = []
#
# for idx, row in tqdm(df1.iterrows()):
# _id = row['_id']
# # todo: 根据唯一标识id获取段落信息, 获取方式为调用es查询接口
# sen_temp_result = find_info(_id)
# sent_content = sen_temp_result["content"]
# paragraph_id = sen_temp_result["paragraphId"]
# sent_para_index = sen_temp_result["sentParaIndex"]
# sent_article_index = sen_temp_result["sentArticleIndex"]
# sent_topic_type = sen_temp_result["topicType"]
# sent_content_type_name = sen_temp_result["contentTypeName"]
# sent_article_id = sen_temp_result["articleId"]
# sentence_id = sen_temp_result["sentenceId"]
# # todo: 根据段落所在的文章id获取文章信息
# art_temp_result = find_art_info(sent_article_id)
# title = art_temp_result["articleTitle"]
# origin = art_temp_result["origin"]
# time = art_temp_result["articleTime"]
# author = art_temp_result["author"]
# article_content = art_temp_result["content"]
#
# # todo: 根据sentence_id 和 sent_article_id 获取前后句
# # pre_sent, suf_sent = find_sen_content(sent_article_id, sentence_id)
# final_content = find_sen_content(sent_article_id, sentence_id, sent_content),
# # pre_sent = find_sen_content(sent_article_id, str(int(sentence_id) - 1))
# #
# # suf_sent = find_sen_content(sent_article_id, str(int(sentence_id) + 1))
#
# # todo: 场景1 ->勾选主题参数,判断主题和时间范围
# if topicTypeNames:
# # if sent_topic_type in topicTypeNames and pStartTime <= time <= pEndTime:
# if sent_topic_type in topicTypeNames:
# result.append({
# # "content": pre_sent + "<font style='color:red;'>" + sent_content + "</font>" + suf_sent,
# "content": final_content,
# "similarity": round(row['sim'], 4),
# "id": _id,
# "article_id": sent_article_id,
# "paragraphid": paragraph_id,
# "match_index": sent_para_index + ";" + sent_article_index,
# # "sent_article_index": sent_article_index,
# "topic_type": sent_topic_type,
# "content_type_name": sent_content_type_name,
# "article_content": article_content,
# "publishDate": time,
# "author": author,
# "origin": origin,
# "title": title,
# "type": returenType
# })
#
# # todo: 场景2 -> 不勾选类型参数, 仅判断事件范围
# else:
# # if pStartTime <= time <= pEndTime:
# result.append({
# "content": final_content,
# # "content": pre_sent + "<font style='color:red;'>" + sent_content + "</font>" + suf_sent,
# "similarity": round(row['sim'], 4),
# "id": _id,
# "article_id": sent_article_id,
# "paragraphid": paragraph_id,
# "match_index": sent_para_index + ";" + sent_article_index,
# # "sent_article_index": sent_article_index,
# "topic_type": sent_topic_type,
# "content_type_name": sent_content_type_name,
# "article_content": article_content,
# "publishDate": time,
# "author": author,
# "origin": origin,
# "title": title,
# "type": returenType
# })
# q.put(result)
# lock.release()
return result
def get_para_result(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: str, pEndTime: str, pageSize: int, pageNo: int, returenType: str):
# todo: 考虑编码服务在并行计算时不友好,先对待匹配文本进行编码
record = []
text_encode = bc.encode([text])
# todo: 根据字典获取contentTypeName
contentTypeName_list = []
if contentTypeFlags:
for type in contentTypeFlags:
content_type_name = type2namedict[type]
contentTypeName_list.append(content_type_name)
else:
contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
logger.info(contentTypeName_list)
# todo: 应用并发方式处理文件
logger.info("主进程开始执行>>> pid={}".format(os.getpid()))
start_t = datetime.datetime.now()
for type_name in contentTypeName_list:
# np_path = para_name2para_path[type_name]
np_arrary = para_name2para_np_arrary[type_name]
# 创建线程对象
process = mp.Process(target=get_para_top, args=(np_arrary, text_encode, topicTypeNames, pStartTime, pEndTime, returenType, q, lock))
process.start()
record.append(process)
# 获取结果
results = []
for record_item in record:
while record_item.is_alive():
while False == q.empty():
temp_results = q.get()
results.append(temp_results)
# todo: 定义并行处理结果list
res_list = []
for res in results:
res_list.extend(res)
for process in record:
process.join()
process.terminate()
logger.info("主进程终止")
end_t = datetime.datetime.now()
elapsed_sec = (end_t - start_t).total_seconds()
logger.info("多进程计算 共消耗: " + "{:.2f}".format(elapsed_sec) + " 秒")
df2 = pd.DataFrame(res_list, columns=["content", "similarity", "id", "article_id", "paragraphid", "match_index", "topic_type", "content_type_name",
"article_content", "publishDate", "author", "origin", "title", "type"])
df2.drop_duplicates(subset=["content"], keep="first", inplace=True)
# todo: 将df 转为list
final_dict = df2.to_dict()
result_list = [dict(zip(final_dict, values)) for values in zip(*[final_dict[k].values() for k in final_dict])]
result_list = result_list[:100] if len(result_list) >= 100 else result_list
return result_list, len(result_list)
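# get_para_result() above and get_sent_result() below fan the query out: one process per
# selected content type, each running the chunked search against its pre-loaded matrix and
# putting its hits on q. The parent drains q while the workers are alive, joins them, drops
# duplicate contents and returns at most 100 records. pageSize and pageNo are accepted for the
# API signature but are not used here; slicing into pages is done from the cached result list
# (see the commented-out memory_cache example in the __main__ block below).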
def get_sent_result(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: str, pEndTime: str, pageSize: int, pageNo: int, returenType: str ):
# todo: 考虑编码服务在并行计算时不友好,先对待匹配文本进行编码
record = []
text_encode = bc.encode([text])
# todo: 根据字典获取contentTypeName
contentTypeName_list = []
if contentTypeFlags:
for type in contentTypeFlags:
content_type_name = type2namedict[type]
contentTypeName_list.append(content_type_name)
else:
contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
logger.info(contentTypeName_list)
# todo: 应用并发方式处理文件
logger.info("主进程开始执行>>> pid={}".format(os.getpid()))
start_t = datetime.datetime.now()
for type_name in contentTypeName_list:
np_arrary = sen_name2sen_np_arrary[type_name]
# np_path = sen_name2sen_path[type_name]
# 创建线程对象
process = mp.Process(target=get_sen_top,
args=(np_arrary, text_encode, topicTypeNames, pStartTime, pEndTime, returenType, q, lock))
process.start()
record.append(process)
# 获取结果
results = []
for record_item in record:
while record_item.is_alive():
while False == q.empty():
temp_results = q.get()
results.append(temp_results)
# todo: 定义并行处理结果list
res_list = []
for res in results:
res_list.extend(res)
for process in record:
process.join()
process.terminate()
logger.info("主进程终止")
end_t = datetime.datetime.now()
elapsed_sec = (end_t - start_t).total_seconds()
logger.info("多进程计算 共消耗: " + "{:.2f}".format(elapsed_sec) + " 秒")
df2 = pd.DataFrame(res_list, columns=["content", "similarity", "id", "article_id", "paragraphid", "match_index", "topic_type", "content_type_name",
"article_content", "publishDate", "author", "origin", "title", "type"])
df2.drop_duplicates(subset=["content"], keep="first", inplace=True)
# todo: 将df 转为list
final_dict = df2.to_dict()
result_list = [dict(zip(final_dict, values)) for values in zip(*[final_dict[k].values() for k in final_dict])]
result_list = result_list[:100] if len(result_list) >= 100 else result_list
return result_list, len(result_list)
if __name__ == "__main__":
sent_list = [
{
"create_time": "2023-01-03 18:02:26",
"sent_id": "1",
"para_id": "1",
"infoId": "",
"sent_article_index": "2|87",
"sent_para_index": "2|87",
"sent_content": "强化创新引领 加快“三个转变” 更好推动中国制造高质量发展——国资委党委委员、副主任 翁杰明——制造业是立国之本、强国之基,以习近平同志为核心的党中央高度重视制造强国建设。",
"contentTypeIds": "1602095566267805697",
"contentNames": "领导讲话",
"topicNames": "产业链链长",
"type": "sen",
},
{
"create_time": "2023-01-03 18:02:28",
"sent_id": "2",
"para_id": "1",
"infoId": "",
"sent_article_index": "88|155",
"sent_para_index": "88|155",
"sent_content": "2014年5月10日,习近平总书记在中铁装备视察时首次提出,推动中国制造向中国创造转变、中国速度向中国质量转变、中国产品向中国品牌转变。",
"contentTypeIds": "1602095566267805697",
"contentNames": "领导讲话",
"topicNames": "产业链链长",
"type": "sen",
},
{
"create_time": "2023-01-03 18:02:30",
"sent_id": "3",
"para_id": "1",
"infoId": "",
"sent_article_index": "156|223",
"sent_para_index": "156|223",
"sent_content": "习近平总书记关于“三个转变”的重要指示为中国制造高质量发展指明了方向、提供了根本遵循,国务院国资委和中央企业深入学习领会、坚决贯彻落实。",
"contentTypeIds": "1602095566267805697",
"contentNames": "领导讲话",
"topicNames": "产业链链长",
"type": "sen",
},
{
"create_time": "2023-01-03 18:02:32",
"sent_id": "4",
"para_id": "1",
"infoId": "",
"sent_article_index": "224|409",
"sent_para_index": "224|409",
"sent_content": "近年来,国务院国资委专门出台质量品牌工作系列文件,强化考核引导激励,建立长效机制,搭建了“中国品牌论坛”、“数字中国建设峰会”、“双创”示范基地等一系列助力企业高质量发展的专业化平台,引导支持中央企业积极打造原创技术“策源地”和现代产业链“链长”,有效推动中央企业激发创新潜力、增强发展动力,在高端装备制造领域取得一系列突破性、标志性重大成果,一大批“国之重器”横空出世。",
"contentTypeIds": "1602095566267805697",
"contentNames": "领导讲话",
"topicNames": "产业链链长",
"type": "sen",
},
{
"create_time": "2023-01-03 18:02:34",
"sent_id": "5",
"para_id": "1",
"infoId": "",
"sent_article_index": "410|537",
"sent_para_index": "410|537",
"sent_content": "无论是代表国家实力的天宫探梦、嫦娥奔月、北斗导航,还是捍卫国家主权的航空母舰、东风导弹、歼20、运20,无论是享誉“一带一路”的中国桥、中国路、中国港,还是成为中国名片的高速铁路、华龙一号、5G通信网络,中央企业都发挥了重要作用,彰显了大国重器的责任担当。",
"contentTypeIds": "1602095566267805697",
"contentNames": "领导讲话",
"topicNames": "产业链链长",
"type": "sen",
}
]
temp_result = get_sen_duplicated(sent_list)[4]["sent_content"]
print(temp_result)
# print(type(get_sen_duplicated(sent_list)))
# ["speech_by_leaders"]
# contentTypeFlags = []
# # ["共同富裕"]
# topicTypeNames = []
# pStartTime = ""
# pEndTime = ""
# returenType = "par"
# text = "共同富裕"
# pageNo = 1
# pageSize = 10
# result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo, returenType)
# # print(result_list)
# logger.info("查询已失效,请重新查询!")
# cache_list = memory_cache.get_value(text)
# if cache_list:
# result_two = cache_list[10:20]
# print(result_two)
# result_three = cache_list[20:30]
# print(result_three)
# else:
# print("查询已失效,请重新查询!")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : search_method.py
# @Time : 2022/12/15 15:44
# @Author : bruxelles_li
# @Software: PyCharm
"""
pip install bert-serving-server && pip install bert-serving-client
"""
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from bert_serving.client import BertClient
from tqdm import tqdm
from numpy import *
import numpy as np
import logging
from pytime import pytime
from datetime import datetime
from 缓存处理 import memory_cache
# todo: 根据某列的属性值获取数据 -> df.loc[df['columnName'] == 'the value']
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
type2namedict = {
"speech_by_leaders": "领导讲话",
"policy_document": "政策文件",
"expert_opinion": "专家观点",
"enterprise_case": "企业案例",
"other": "其他"
}
# todo: 调用bert编码服务
bc = BertClient("114.116.54.108", check_length=False)
para_prob = 0.85
sent_prob = 0.85
# todo: 定义句子和段落的矩阵文件路径
sent_file_path = "database/sent_database/句子库.npy"
para_file_path = "database/para_database/段落库.npy"
# todo:定义段落库内容
para_df = pd.read_excel('素材库/段落库/入库_para.xlsx', keep_default_na=False).astype(str)
_id2paracont = {row['id']: row['content'] for idx, row in para_df.iterrows()}
_id2para_articleid = {row['id']: row['article_id'] for idx1, row in para_df.iterrows()}
_id2paragraphid = {row['id']: row['paragraph_id'] for idx2, row in para_df.iterrows()}
_id2paraindex = {row['id']: row['para_article_index'] for idx3, row in para_df.iterrows()}
_id2para_topic_type = {row['id']: row['topic_type'] for idx4, row in para_df.iterrows()}
_id2para_content_type_name = {row['id']: row['content_type_name'] for idx5, row in para_df.iterrows()}
# todo: 定义句子库内容
sent_df = pd.read_csv('素材库/句子库/入库_sent.csv', keep_default_na=False, encoding="gbk").astype(str)
_id2sentcont = {row['id']: row['content'] for idx6, row in sent_df.iterrows()}
_id2sent_articleid = {row['id']: row['article_id'] for idx7, row in sent_df.iterrows()}
_id2sent_paraid = {row['id']: row['paragraph_id'] for idx8, row in sent_df.iterrows()}
_id2sent_paraindex = {row['id']: row['sent_para_index'] for idx9, row in sent_df.iterrows()}
_id2sent_articleindex = {row['id']: row['sent_article_index'] for idx10, row in sent_df.iterrows()}
_id2sent_topic_type = {row['id']: row['topic_type'] for idx11, row in sent_df.iterrows()}
_id2sent_content_type_name = {row['id']: row['content_type_name'] for idx12, row in sent_df.iterrows()}
_id2sent_sentid = {row['id']: row['sentence_id'] for idx18, row in sent_df.iterrows()}
# todo: 定义文章库内容,根据段落和句子所对应的文章id获取文章的基本信息 (标题、来源、发布时间、作者)
article_df = pd.read_excel("素材库/文章库/入库_article.xlsx", keep_default_na=False).astype(str)
artcile_id2title = {row['article_id']: row['article_title'] for idx13, row in article_df.iterrows()}
artcile_id2origin = {row['article_id']: row['origin'] for idx14, row in article_df.iterrows()}
artcile_id2time = {row['article_id']: row['article_time'] for idx15, row in article_df.iterrows()}
# artcile_id2author = {row['article_id']: row['author'] for idx16, row in article_df.iterrows()}
article_id2content = {row['article_id']: row['content'] for idx17, row in article_df.iterrows()}
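# The dictionaries above are in-memory lookup tables from a paragraph/sentence/article id to
# its fields; this older search_method.py variant resolves matches from the Excel/CSV exports
# instead of querying Elasticsearch the way search_by_dot_matrix.py does. A single indexed
# frame per table would serve the same purpose with fewer comprehensions, e.g.:
# para_lookup = para_df.set_index('id')
# para_content = para_lookup.at[some_id, 'content']   # some_id is a placeholder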
def get_para_result(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime.date, pEndTime: datetime.date, pageSize: int, pageNo: int, returenType: str):
# todo: 根据字典获取contentTypeName
contentTypeName_list = []
if contentTypeFlags:
for type in contentTypeFlags:
content_type_name = type2namedict[type]
contentTypeName_list.append(content_type_name)
# 导入初始矩阵
b = np.load(para_file_path)
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
c = b.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# todo: 将矩阵索引与id_list通过定义id_dict关联
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(b.shape[0])]
id_dict = dict(zip(np_list, id_list))
a = bc.encode([text])
r = cosine_similarity(a, d)
list_index = []
result = []
for i in range(r.shape[0]):
for j in range(r.shape[1]):
sim_value = r[i][j]
if sim_value >= para_prob:
list_index.append({
'sim': sim_value,
'_id': str(id_dict[j]).split(".")[0]
})
name = ['_id', 'sim']
df = pd.DataFrame(columns=name, data=list(list_index))
test = df.sort_values(by=['sim'], axis=0, ascending=False)
# todo: 根据pagesize 和 pageno 获取内容长度
# pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
# todo: 场景4 ->都不勾选
if len(contentTypeName_list) == 0 and len(topicTypeNames) == 0:
df1 = test[:suf_index]
# todo: 场景1 ->勾选类型参数和主题参数 场景2 ->仅勾选类型参数 场景3 ->仅勾选主题参数
else:
df1 = test[:4 * suf_index]
for idx, row in tqdm(df1.iterrows()):
_id = row['_id']
# todo: 根据唯一标识id获取段落信息
para_content = _id2paracont.get(_id)
paragraphid = _id2paragraphid.get(_id)
paraindex = _id2paraindex.get(_id)
para_topic_type = _id2para_topic_type.get(_id)
para_content_type_name = _id2para_content_type_name.get(_id)
para_article_id = _id2para_articleid.get(_id)
# todo: 根据段落所在的文章id获取文章信息
title = artcile_id2title.get(para_article_id)
origin = artcile_id2origin.get(para_article_id)
publishDate = artcile_id2time.get(para_article_id)
# todo: 将时间转换为统一的格式
time = datetime.date(pytime.parse(artcile_id2time.get(para_article_id)))
# author = artcile_id2author.get(para_article_id)
article_content = article_id2content.get(para_article_id)
# todo: 场景1 ->勾选类型参数和主题参数
if contentTypeName_list and topicTypeNames:
if para_content_type_name in contentTypeName_list and para_topic_type in topicTypeNames \
and pStartTime <= time <= pEndTime:
result.append({
"content": "<font style='color:red;'>" + para_content + "</font>",
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": para_article_id,
"paragraphid": paragraphid,
"paraindex": paraindex,
"para_topic_type": para_topic_type,
"para_content_type_name": para_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
# todo: 场景2 ->仅勾选类型参数
elif contentTypeName_list and len(topicTypeNames) == 0:
if para_content_type_name in contentTypeName_list and pStartTime <= time <= pEndTime:
result.append({
"content": "<font style='color:red;'>" + para_content + "</font>",
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": para_article_id,
"paragraphid": paragraphid,
"paraindex": paraindex,
"para_topic_type": para_topic_type,
"para_content_type_name": para_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
# todo: 场景3 ->仅勾选主题参数
elif len(contentTypeName_list) == 0 and topicTypeNames:
if para_topic_type in topicTypeNames and pStartTime <= time <= pEndTime:
result.append({
"content": "<font style='color:red;'>" + para_content + "</font>",
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": para_article_id,
"paragraphid": paragraphid,
"paraindex": paraindex,
"para_topic_type": para_topic_type,
"para_content_type_name": para_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
else:
if pStartTime <= time <= pEndTime:
result.append({
"content": "<font style='color:red;'>" + para_content + "</font>",
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": para_article_id,
"paragraphid": paragraphid,
"paraindex": paraindex,
"para_topic_type": para_topic_type,
"para_content_type_name": para_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
# print(result)
df2 = pd.DataFrame(result, columns=["content", "similarity", "id", "article_id", "paragraphid", "paraindex", "para_topic_type", "para_content_type_name",
"article_content", "publishDate", "author", "origin", "title"])
df2.drop_duplicates(subset=["content"], keep="first", inplace=True)
# todo: 将df 转为list
final_dict = df2.to_dict()
result_list = [dict(zip(final_dict, values)) for values in zip(*[final_dict[k].values() for k in final_dict])]
# memory_cache.set_value(text, result_list[:100], 60) # 设置一个 60 秒过期的键值对
# df2.to_excel('测试文件/段落库测试.xlsx', index=False, engine="xlsxwriter")
result_list = result_list[:100] if len(result_list) >= 100 else result_list
return result_list
def get_sent_result(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime.date, pEndTime: datetime.date, pageSize: int, pageNo: int):
# todo: 根据字典获取contentTypeName
contentTypeName_list = []
if contentTypeFlags:
for type in contentTypeFlags:
content_type_name = type2namedict[type]
contentTypeName_list.append(content_type_name)
# 导入初始矩阵
b = np.load(sent_file_path)
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
c = b.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# todo: 将矩阵索引与id_list通过定义id_dict关联
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(b.shape[0])]
id_dict = dict(zip(np_list, id_list))
a = bc.encode([text])
r = cosine_similarity(a, d)
list_index = []
result = []
for i in range(r.shape[0]):
for j in range(r.shape[1]):
sim_value = r[i][j]
if sim_value >= sent_prob:
list_index.append({
'sim': sim_value,
'_id': str(id_dict[j]).split(".")[0]
})
name = ['_id', 'sim']
df = pd.DataFrame(columns=name, data=list(list_index))
test = df.sort_values(by=['sim'], axis=0, ascending=False)
# todo: 根据pagesize 和 pageno 获取内容长度
# pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
# todo: 场景4 ->都不勾选
if len(contentTypeName_list) == 0 and len(topicTypeNames) == 0:
df1 = test[:suf_index]
# todo: 场景1 ->勾选类型参数和主题参数 场景2 ->仅勾选类型参数 场景3 ->仅勾选主题参数
else:
df1 = test[:4 * suf_index]
for idx, row in tqdm(df1.iterrows()):
_id = row['_id']
# todo: 根据唯一标识id获取句子信息
sent_content = _id2sentcont.get(_id)
paragraph_id = _id2sent_paraid.get(_id)
sent_para_index = _id2sent_paraindex.get(_id)
sent_article_index = _id2sent_articleindex.get(_id)
sent_topic_type = _id2sent_topic_type.get(_id)
sent_content_type_name = _id2sent_content_type_name.get(_id)
sent_article_id = _id2sent_articleid.get(_id)
sentence_id = _id2sent_sentid.get(_id)
# todo: 根据句子所在的文章id获取文章信息
title = artcile_id2title.get(sent_article_id)
origin = artcile_id2origin.get(sent_article_id)
publishDate = artcile_id2time.get(sent_article_id)
# todo: 将日期转换为统一的格式
time = datetime.date(pytime.parse(artcile_id2time.get(sent_article_id)))
# author = artcile_id2author.get(para_article_id)
article_content = article_id2content.get(sent_article_id)
# todo: 根据sentence_id 和 sent_article_id 获取前后句
a = sent_df.loc[(sent_df['article_id'] == sent_article_id) & (sent_df['sentence_id'] == str(int(sentence_id) - 1))]
if a.empty:
pre_sent = ""
else:
dict_pre = a.to_dict()
new_dict_pre = [dict(zip(dict_pre, values)) for values in zip(*[dict_pre[k].values() for k in dict_pre])]
pre_sent = new_dict_pre[0]["content"]
b = sent_df.loc[(sent_df["article_id"] == sent_article_id) & (sent_df["sentence_id"] == str(int(sentence_id) + 1))]
if b.empty:
suf_sent = ""
else:
dict_suf = b.to_dict()
new_dict_suf = [dict(zip(dict_suf, values)) for values in zip(*[dict_suf[k].values() for k in dict_suf])]
suf_sent = new_dict_suf[0]["content"]
# todo: 场景1 ->勾选类型参数和主题参数
if contentTypeName_list and topicTypeNames:
if sent_content_type_name in contentTypeName_list and sent_topic_type in topicTypeNames \
and pStartTime <= time <= pEndTime:
result.append({
"content": pre_sent + "<font style='color:red;'>" + sent_content + "</font>" + suf_sent,
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": sent_article_id,
"paragraphid": paragraph_id,
"sent_para_index": sent_para_index,
"sent_article_index": sent_article_index,
"sent_topic_type": sent_topic_type,
"sent_content_type_name": sent_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
# todo: 场景2 ->仅勾选类型参数
elif contentTypeName_list and len(topicTypeNames) == 0:
if sent_content_type_name in contentTypeName_list and pStartTime <= time <= pEndTime:
result.append({
"content": pre_sent + "<font style='color:red;'>" + sent_content + "</font>" + suf_sent,
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": sent_article_id,
"paragraphid": paragraph_id,
"sent_para_index": sent_para_index,
"sent_article_index": sent_article_index,
"sent_topic_type": sent_topic_type,
"sent_content_type_name": sent_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
# todo: 场景3 ->仅勾选主题参数
elif len(contentTypeName_list) == 0 and topicTypeNames:
if sent_topic_type in topicTypeNames and pStartTime <= time <= pEndTime:
result.append({
"content": pre_sent + "<font style='color:red;'>" + sent_content + "</font>" + suf_sent,
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": sent_article_id,
"paragraphid": paragraph_id,
"sent_para_index": sent_para_index,
"sent_article_index": sent_article_index,
"sent_topic_type": sent_topic_type,
"sent_content_type_name": sent_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
else:
if pStartTime <= time <= pEndTime:
result.append({
"content": pre_sent + "<font style='color:red;'>" + sent_content + "</font>" + suf_sent,
"similarity": round(row['sim'], 4),
"id": _id,
"article_id": sent_article_id,
"paragraphid": paragraph_id,
"sent_para_index": sent_para_index,
"sent_article_index": sent_article_index,
"sent_topic_type": sent_topic_type,
"sent_content_type_name": sent_content_type_name,
"article_content": article_content,
"publishDate": publishDate,
"author": "",
"origin": origin,
"title": title
})
# print(result)
df2 = pd.DataFrame(result, columns=["content", "similarity", "id", "article_id", "paragraphid", "sent_para_index",
"sent_article_index", "sent_topic_type", "sent_content_type_name",
"article_content", "publishDate", "author", "origin", "title"])
df2.drop_duplicates(subset=["content"], keep="first", inplace=True)
# todo: 将df 转为list
final_dict = df2.to_dict()
result_list = [dict(zip(final_dict, values)) for values in zip(*[final_dict[k].values() for k in final_dict])]
# memory_cache.set_value(text, result_list[:100], 60) # 设置一个 60 秒过期的键值对
# df2.to_excel('测试文件/段落库测试.xlsx', index=False, engine="xlsxwriter")
result_list = result_list[:100] if len(result_list) >= 100 else result_list
return result_list
if __name__ == "__main__":
contentTypeFlags = ["speech_by_leaders"]
topicTypeNames = ["共同富裕"]
pStartTime = datetime.date(pytime.parse("2021-06-08"))  # 与函数内部转换后的 time 统一为 date 类型,否则无法比较
pEndTime = datetime.date(pytime.parse("2022-12-13"))
text = "新时代共同富裕的宗旨有"
pageNo = 1
pageSize = 10
result_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
print(result_list)
cache_list = memory_cache.get_value(text)
if cache_list:
result_two = cache_list[10:20]
print(result_two)
result_three = cache_list[20:30]
print(result_three)
else:
print("查询已失效,请重新查询!")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
import logging
from bert_serving.client import BertClient
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
# 定义向量存储文件路径
# vector_parent_path = "素材库/段落库/vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# todo: 调用bert编码服务
bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_leaders = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="领导讲话", nrows=10, keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="专家观点", nrows=10, keep_default_na=False).astype(str)
# df_policy_document = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_leaders, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_leaders]
path1 = "素材库/段落库/para_vector_path/speech_by_leaders.txt"
path2 = "素材库/段落库/para_vector_path/expert_opinion.txt"
path3 = "素材库/段落库/para_vector_path/policy_document.txt"
path4 = "素材库/段落库/para_vector_path/enterprise_case.txt"
vector_path_list = [path1, path2, path3, path4]
# length_list = [22006, 205897, 56295, 49152]
length_list = [4641, 44634, 16647, 19403]
np_path1 = "素材库/段落库/para_vector_path/speech_by_leaders.npy"
np_path2 = "素材库/段落库/para_vector_path/expert_opinion.npy"
np_path3 = "素材库/段落库/para_vector_path/policy_document.npy"
np_path4 = "素材库/段落库/para_vector_path/enterprise_case.npy"
np_path_list = [np_path1, np_path2, np_path3, np_path4]
# vector_path = [
# "speech_by_leaders.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
def encode_texts(text):
logger.info("====%s====" % text)
tem_list = [{text: int(text)+1}]
# vector = bc.encode([text])
# print(vector)
return tem_list
# def encode_sentences(path, df):
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# print(sentence)
# vector = bc.encode([sentence])
# print(vector)
# print("hello%s,路径%s" % (idx, path))
# """
# for line in f:
# result.append(line.strip('\n'))
# """
# # with open(path, 'w', encoding='utf-8') as f_vectors:
# # for idx, row in tqdm(df.iterrows()):
# # sentence = row['content']
# # vector = bc.encode([sentence])
# # print(vector)
# # f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# return None
def save_file(length, np_path, vector_path):
# A = zeros((int(length), 769), dtype=float)
# f = open(vector_path)
# lines = f.readlines()
# A_row = 0
# for line in lines:
# list = line.strip('\n').split(' ')
# A[A_row, :] = list[:]
# A_row += 1
# print(A.shape)
# np.save(np_path, A)
text = "nihao"
return text
# todo: 定义同时编码主函数
if __name__ == "__main__":
print("主进程开始执行>>> pid={}".format(os.getpid()))
# print("父进程开始")
start_t = datetime.datetime.now()
# 创建进程池,池的大小表示可以同时执行的进程数量,默认等于CPU的核心数
num_cores = int(mp.cpu_count())
p = Pool(num_cores)
# text = "nihao"
# p = mp.Process(target=encode_texts, args=(text,))
text_list = ["1", "2", "3", "4"]
res_list = []
for text in text_list:
# print(i)
# np_path = np_path_list[i]
# length = length_list[i]
# vector_path = vector_path_list[i]
# print(np_path, length, vector_path)
result_list = p.apply_async(encode_texts, args=(text,))
# print(result_list.get(), type(result_list.get()))
res_list.extend(result_list.get())
print(res_list)
# i = 0
# for i in range(len(df_list)-1):
# filename = vector_path[i]
# # path = os.path.join(vector_parent_path, filename)
# # print(path, len(df_list[i]))
# # print(df_list[i])
# p.apply_async(encode_sentences, args=(vector_path[i], df_list[i]))
# # p.apply_async(encode_sentences, args=(path1, df_speech_by_leaders))
# # p.apply_async(encode_sentences, args=(path, df_list[i]))
# i += 1
# for i in range(10):
# # 创建进程,放入进程池统一管理
# p.apply_async(run, args=(i,))
# p.start()
# 如果我们用的是进程池,在调用join()之前必须要先close(),并且在close()之后不能再继续往进程池添加新的进程
p.close()
# 进程池对象调用join,会等待进程池中所有的子进程结束完毕再去结束父进程
p.join()
print("主进程终止")
end_t = datetime.datetime.now()
elapsed_sec = (end_t - start_t).total_seconds()
print("多进程计算 共消耗: " + "{:.2f}".format(elapsed_sec) + " 秒")
# encode_sentences(path1, df_speech_by_leaders)
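# 补充示意:上面的写法在提交任务后立即调用 result_list.get(),会阻塞等待当前任务完成,实际效果接近串行;
# 更常见的做法是先全部提交,close()/join() 之后再统一取结果。以下为最小草图,
# 复用上文的 encode_texts,函数名 demo_parallel_encode 为演示用的假设名称:
def demo_parallel_encode(text_list):
    pool = Pool(int(mp.cpu_count()))
    # 先把所有任务异步提交到进程池,保留 AsyncResult 句柄
    async_results = [pool.apply_async(encode_texts, args=(t,)) for t in text_list]
    pool.close()
    pool.join()
    # 所有子进程结束后再统一收集结果
    collected = []
    for ar in async_results:
        collected.extend(ar.get())
    return collected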
# -*- coding: utf-8 -*-
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractor:
@staticmethod
def get_supported_lang_code_dict():
"""
支持语言:
1、需要分词,传递分词器(3种):
a. 中文、韩语、阿拉伯语
2、不需要分词,直接传递语言编码(16种)
a. 其中英语、俄语,单独测试
"""
supported_lang_code_dict = {
'cn': '中文', # 中文
'zh-cn': '简体中文', # 简体中文
'ko': '韩语', # 韩语
'ar': '阿拉伯语', # 阿拉伯语
'en': '英语', # 英语
'ru': '俄语', # 俄语
'da': '丹麦语', # 丹麦语
'de': '德语', # 德语
'es': '西班牙语', # 西班牙语
'fi': '芬兰语', # 芬兰语
'fr': '法语', # 法语
'hu': '匈牙利语', # 匈牙利语
'id': '印度尼西亚语', # 印度尼西亚语
'it': '意大利语', # 意大利语
'nb': '挪威语(伯克梅尔)', # 挪威语(伯克梅尔)
'nl': '荷兰语', # 荷兰语
'no': '挪威文(耐诺斯克)', # 挪威文(耐诺斯克)
'pl': '波兰语', # 波兰语
'pt': '葡萄牙语', # 葡萄牙语
'sv': '瑞典语', # 瑞典语
}
return supported_lang_code_dict
def __init__(self, lang_code='cn'):
"""
构造器:未指定 lang_code 参数时,默认为 cn
"""
# 支持语言
supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
# 初始化 goose 对象:
# 1、根据语言代码,创建 goose 对象
if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn':
# 需要分词:中文
# 1、不指定lang_code参数,或不指定lang_code为 None 时,默认为中文分词
# 2、Flask Web接口:未指定get参数 lang_code 时,lang_code 会接收为 None
self.goose = Goose({'stopwords_class': StopWordsChinese})
elif lang_code == 'ko':
# 需要分词:韩语
# 1、测试:只传递语言,不传递分词器
# self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试失败:正文采集为空
# 韩语分词:测试成功
self.goose = Goose({'stopwords_class': StopWordsKorean})
elif lang_code == 'ar':
# 需要分词:阿拉伯语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试成功
# self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
self.goose = Goose({'stopwords_class': StopWordsArabic})
elif lang_code == 'en':
# 单独测试:英文
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
# 测试成功:创建Goose对象时,不指定语言默认为英文分词
self.goose = Goose()
elif lang_code == 'ru':
# 单独测试:俄语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
elif lang_code in supported_lang_code_list:
# 其它语言编码,统一处理,不再单独测试
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
else:
# 未识别的语言代码
raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')
def get_extraction_result(self, article, link_text=''):
"""
获取采集结果:
1、从 article 对象中,采集数据并封装到 ExtractionResult
"""
# 用于保存:采集后的文本
extraction_result = ExtractionResult()
# 标题
# extraction_result.title = article.title # 原办法:使用 goose 采集到的 title 中的标题
extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
# 发布日期
extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
# 正文(保留所有HTML标记,如:br、img)
extraction_result.text = SmartExtractorUtility.get_article_text(article)
# URL
extraction_result.url = article.final_url
# 摘要
extraction_result.meta_description = article.meta_description
# 干净正文(不带HTML)
extraction_result.cleaned_text = article.cleaned_text
# 来源(目前只支持采集中文网站中的“来源”)
extraction_result.source = ''
return extraction_result
def extract_by_url(self, url, link_text=''):
"""
按URL采集内容
"""
# 采集正文:传入url
article = self.goose.extract(url=url)
# article = goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
"""
按HTML采集内容
"""
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_url_test(url: str, lang_code: str):
# 测试:按URL采集
# url_list = [
# # "http://www.news.cn/politics/2022-07/31/c_1128879636.htm", # 短文本
# # "https://baijiahao.baidu.com/s?id=1741311527693101670", # 带多张图片
# # "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml", # 带多张图片,及一个视频(测试内容XPath失败)
# # "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html", # 人民网
# # 韩文:中央日报-politics
# # "https://www.joongang.co.kr/article/25094974",
# # "https://www.joongang.co.kr/article/25094967",
# # 英文:加德满都邮报-national-security
# # "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
# # "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders", # 测试采集:发布时间
# # 俄语:今日白俄罗斯报-word
# # "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
# # 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
# # 阿语
# # "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
# # "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
# # 测试提取标题
# # "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
# # "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
# # "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html", # 标题采集为空
# # 'http://www.crfeb.com.cn/1j/_124/2005409/index.html', # 内容采集失败
# # 'http://www.crfeb.com.cn/1j/_124/912248/index.html', # 内容采集失败
# # 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html', # 中国铁建股份有限公司-工作动态(日期采集错误)
# # 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html', # 中国土木工程集团有限公司-多个栏目(日期采集错误)
# # 'http://v.people.cn/n1/2022/0901/c444662-32517559.html', # 人民网视频:title必须以“元素中的标题”开始,不能判断“包含”
# # 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html', # 中国港湾工程有限责任公司-公司要闻(标题采集失败)
# # 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html', # 中国建筑集团有限公司-中建要闻(标题采集失败)
# # 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html', # 中国路桥工程有限责任公司-多个栏目(标题采集失败)
# # 'http://www.cgcoc.com.cn/news/432.html', # 中地海外集团有限公司-新闻中心(标题和内容采集失败)
# # 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html' # 中国五矿(测试:正文采集失败)
# # 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html', # 中国电力建设集团(测试:标题、正文采集失败)
# # 中国电力建设集团(测试:标题采集失败),相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
# # 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html', # 标题采集失败:看着没有问题
# 'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html', # 中国建筑股份有限公司-企业动态:日期采集错误,采集到当天日期
# ]
# 语言编码
# lang_code = 'cn'
# lang_code = 'ko'
# lang_code = 'en'
# lang_code = 'ru'
# lang_code = 'ar'
print("-" * 100)
print('请求URL:', url)
extraction_result = SmartExtractor(lang_code).extract_by_url(url)
# todo: 将内容返回
dict_parse = {
"title": extraction_result.title,
"publistDate": extraction_result.publish_date,
"content": extraction_result.cleaned_text
}
return dict_parse
# for url in url_list:
# print("-" * 100)
# print('请求URL:', url)
# extraction_result = SmartExtractor(lang_code).extract_by_url(url)
#
# # 测试转换为JSON
# # 1、直接转换时,会抛异常:TypeError: Object of type ExtractionResult is not JSON serializable
# # print(json.dumps(extraction_result))
# # print(json.dumps(extraction_result, default=ExtractionResult.to_dict)) # 转换成功:指定序列化器
# # print(type(json.dumps(extraction_result.to_dict()))) # 返回类型:<class 'str'>,内容中的中文会被转义
# # print(str(extraction_result.to_dict())) # 如果直接转换为字符串,中文不会被转义
#
# # 打印测试结果
# print_extraction_result(extraction_result)
def extract_by_html_test(url):
# 测试:按HTML采集
html = '''
<html>
<head>
<title>标题</title>
</head>
<body>
<div>标题</div>
<div>内容</div>
</body>
</html>
'''
# 测试:通过请求URL,获取完整的html
# url = "http://www.news.cn/politics/2022-07/31/c_1128879636.htm" # 测试成功
# url = "http://views.ce.cn/view/ent/202208/15/t20220815_37961634.shtml" # 1、测试失败:lxml.etree.ParserError: Document is empty
# url = 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html' # 中国铁建股份有限公司-工作动态(日期采集错误)
# url = 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html' # 中国土木工程集团有限公司-多个栏目(日期采集错误)
print()
print("-" * 100)
print('请求URL:', url)
html = requests.get(url).text
# 语言编码
lang_code = 'cn'
# 采集内容
extraction_result = SmartExtractor(lang_code).extract_by_html(html)
# todo: 将内容返回
dict_parse = {
"title": extraction_result.title,
"publistDate": extraction_result.publish_date,
"content": extraction_result.cleaned_text
}
# 打印测试结果
# print_extraction_result(extraction_result)
return dict_parse
def print_extraction_result(extraction_result):
# 打印测试结果
print("标题:", extraction_result.title) # 标题
print("发布时间:", extraction_result.publish_date) # 发布时间
print("正文:", extraction_result.text) # 正文
print("URL:", extraction_result.url) # URL
print("摘要:", extraction_result.meta_description) # 摘要
print("干净正文:", extraction_result.cleaned_text) # 干净正文
if __name__ == '__main__':
try:
# 测试:按URL采集
print(extract_by_url_test("http://www.gov.cn/zhengce/zhengceku/2008-03/28/content_6253.htm"))
# # 测试:按HTML采集
# dict_parse = extract_by_html_test("http://www.gov.cn/zhengce/zhengceku/2008-03/28/content_6253.htm")
# print(dict_parse)
except Exception as e:
print("采集失败:", e)
import re
from goose3.article import Article
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractorUtility:
# 标题最小长度
title_min_len = 6
@staticmethod
def extract_publish_date(html):
pattern_list = [
# 2010-10-1 8:00:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010-10-1 8:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 2010年10月1日 8:00:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
# 2010年10月1日 8:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
# 2010/10/1 8:00:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010/10/1 8:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
# 2010-10-1
r"20\d{2}-\d{1,2}-\d{1,2}",
# 2010年10月1日
r"20\d{2}年\d{1,2}月\d{1,2}日",
# 2010/10/1
r"20\d{2}/\d{1,2}/\d{1,2}",
# 2022.08.28
r"20\d{2}\.\d{1,2}\.\d{1,2}"
# 12-07-02 10:10
r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 1月前
r"\d+(&nbsp;| )*月前",
# 12天前
r"\d+(&nbsp;| )*天前",
# 2小时前
r"\d+(&nbsp;| )*小时前",
# 15分钟前
r"\d+(&nbsp;| )*分钟前",
# 昨天&nbsp;17:59
r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
]
# 尝试匹配所有正则式
for pattern in pattern_list:
# 提取可见日期:
# 1、必须在标签内部,不能提取HTML标签属性中的日期
# 2、提取规则:必须在 > 和 < 之间,且中间不能再有 >
tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
# 搜索第一个匹配项
match = re.search(tag_pattern, html)
# 如果匹配成功,返回正确的发布时间
if match:
return match.group('date')
# 所有正则式匹配失败,返回空字符串
return ""
@staticmethod
def add_html_br(cleaned_text):
# 包装HTML标记:换行
# 1、优先替换双换行:使用goose提取到的cleaned_text,都是双换行
cleaned_text = cleaned_text.replace("\n\n", "<br>")
cleaned_text = cleaned_text.replace("\n", "<br>")
return cleaned_text
@staticmethod
def get_article_title(article: Article, link_text=''):
#
# 优先提取h1、div、span、td元素中的标题
# 1、测试任务:2.智能采集\1.测试任务\国资委-新闻发布
# a. 原title标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”-国务院国有资产监督管理委员会
# b. div元素中的标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”
# 2、测试任务:2.智能采集\1.测试任务\国家林业和草原局-地方动态
# a. 原title标题:上海完成森林资源年度监测遥感解译图斑市级质量检查_地方动态_国家林业和草原局政府网
# b. span元素中的标题:上海完成森林资源年度监测遥感解译图斑市级质量检查
#
# 根据xpath,查询标题元素时:
# 1、标签优先级:h1、特殊元素(id或class包含title)、h2、h3、div、span、td
#
title_element_list = [
'h1',
'h2',
'h3',
'div',
'span',
'td',
'p',
]
# 对比标题前,统一将空格剔除(2022-09-21):
# 1、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# 2、相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
link_text = link_text.replace(" ", "")
tag_title = article.title.replace(" ", "")
title = None
for title_element in title_element_list:
element_list = article.raw_doc.getroottree().xpath(f'//{title_element}')
# 查询XPath成功,遍历所有元素
for element in element_list:
# 取纯文本内容,包括子元素
text = etree.tounicode(element, method='text').strip()
text_no_space = text.replace(" ", "")
# 判断标题:
# 1、如果智能采集的原title标题,以“元素内容”开头,则取元素内容
# 2、查找成功后,返回text作为标题,否则继续下一个循环
# 判断是否以“元素中的标题”开始:
# 1、title必须以“元素中的标题”开始,不能判断“包含”
# 2、测试URL:http://v.people.cn/n1/2022/0901/c444662-32517559.html
# 3、title标签:<title>亿缕阳光丨小生意,大格局--人民视频--人民网</title>
# a. 如果判断“包含”,会采集到:人民网
# b. 因为存在元素:<a href="http://www.people.com.cn/" class="clink">人民网</a>
# c. 如果判断以“元素中的标题”开始,采集到:亿缕阳光丨小生意,大格局
# d. 标题元素:<h2>亿缕阳光丨小生意,大格局</h2>
# 新方案:
# 1、对比常用元素:仍判断是否以“元素中的标题”开始
# 2、优先对比“链接文本”,其次对比“title元素”
# 3、满足最少字数:6个字
# 新方案(2022-09-21):
# 1、对比“链接文本”、“title元素”时,除了判断开始,同时允许结尾
# 2、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# a. 列表中的链接文本:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电...
# b. title标签中的内容:<title>中国电力建设集团 公司要闻 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠</title>
# c. 元素中的标题:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠
if text_no_space is not None and text_no_space != '' and len(
text_no_space) >= SmartExtractorUtility.title_min_len:
# 优先判断6个字,以方便调试:排除短文本元素
if link_text.startswith(text_no_space) or link_text.endswith(text_no_space) or tag_title.startswith(
text_no_space) or tag_title.endswith(text_no_space):
# 返回时,仍返回未剔除空格后的标题
return text
if title:
# 查找成功,返回元素中的标题
return title
else:
# 查找失败,返回提取到的title属性
# return article.title
# 新考虑:标题采集失败后,返回空值
# 1、原因:article.title 不可靠,只是提取了 title 标签中的内容
return ''
@staticmethod
def get_publish_date(article: Article):
# 优先使用正则式提取日期
# 1、测试任务:加德满都邮报-national-security
# a. 使用 publish_datetime_utc 提取英文日期后,提取错误
# b. 实际日期:Friday, August 19, 2022,但提取到了:2015-02-05
# c. 原因:在下方JS中,有一段JSON文本: "datePublished": "2015-02-05T08:00:00+08:00"
# 2、注意:中文网站,都必须使用正则式
publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
if publish_date != '':
return publish_date
else:
if article.publish_datetime_utc:
# 优先使用提取成功的 datetime
return article.publish_datetime_utc.strftime('%Y-%m-%d')
elif article.publish_date:
# 其次使用提取成功的 date 字符串
return article.publish_date
else:
# 全部提取失败,返回字符串
return ''
@staticmethod
def get_article_text(article: Article):
# 第一种方法:在纯文本(cleaned_text)基础上,添加br标签
# 1、缺点:无法获取图片,同时会丢掉原有的p标签(只能用br替补)
# text = SmartExtractor.add_html_br(article.cleaned_text)
# 第二种方法:直接获取 top_node 的HTML内容
# 1、优点:可保留原有的p标签等
# 2、缺点:无法获取图片,img标签未被保留
# text = etree.tounicode(article.top_node, method='html')
# 测试抛出异常
# raise Exception("测试抛出异常")
# 第三种方法:获取到 top_node 的xpath,再通过xpath查询原始doc
# 1、可行:通过查询原始doc,可以获取“正文”的所有HTML内容
# 2、遇到问题:获取到 top_node 的xpath不准确,与原位置偏移一个元素
# a. 测试URL:https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml
# b. 获取到的xpath:/html/body/div/div[1]/div[2]/div[4]
# c. 实际xpath:/html/body/div/div[1]/div[2]/div[5]
# 3、解决办法:
# a. 优先使用id、class查询,如果没有id、class,再查询 top_node 的xpath
xpath = None
if type(article.top_node) is HtmlElement:
if 'id' in article.top_node.attrib:
xpath = "//*[@id='{}']".format(article.top_node.attrib['id'])
elif 'class' in article.top_node.attrib:
xpath = "//*[@class='{}']".format(article.top_node.attrib['class'])
else:
xpath = article.top_node.getroottree().getpath(article.top_node)
else:
# article.top_node 有时为空:
# 1、测试URL:https://baijiahao.baidu.com/s?id=1741311527693101670
# 2、输出日志:article.top_node 不是 HtmlElement 对象:None
print("SmartExtractor:article.top_node 为 {},不是 HtmlElement 对象。".format(article.top_node))
# article.top_node 为空时,直接输出 cleaned_text:
# 1、在纯文本(cleaned_text)基础上,添加br标签
text = SmartExtractorUtility.add_html_br(article.cleaned_text)
return text
# 根据xpath,查询元素
element_list = article.raw_doc.getroottree().xpath(xpath)
if element_list:
# 查询XPath成功,获取第一个元素的HTML
text = etree.tounicode(element_list[0], method='html')
else:
# 查询XPath失败,返回 top_node 原有的HTML
# 1、缺点:无法获取图片,img标签未被保留
text = etree.tounicode(article.top_node, method='html')
return text
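# 使用示意(假设场景):extract_publish_date 依次尝试上面的正则列表,只匹配标签文本中的可见日期;
# demo_html 为演示用的假设片段,并非项目中的真实页面。
if __name__ == "__main__":
    demo_html = '<div class="info">发布时间:<span>2022-09-21 10:30</span> 来源:示例网站</div>'
    print(SmartExtractorUtility.extract_publish_date(demo_html))   # 预期输出:2022-09-21 10:30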
# -*- coding: utf-8 -*-
# @Time : 2022/9/22 20:00
# @Author : ctt
# @File : test
# @Project : 研究中心知识图谱
# 生成唯一id
import time
class Snow:
"""雪花算法生成全局自增唯一id"""
# 154420004524033 154469287596033
init_date = time.strptime('2022-12-12 18:32:25', "%Y-%m-%d %H:%M:%S")
start = int(time.mktime(init_date) * 1000)
last = int(time.time() * 1000)
pc_room = 1
pc = 1
seq = 0
@classmethod
def get_guid(cls):
"""获取雪花算法生成的id"""
now = int(time.time() * 1000)
if now != cls.last:
cls.last = now
cls.seq = 1
else:
while cls.seq >= 4096:
time.sleep(0.1)
return cls.get_guid()
cls.seq += 1
time_diff = now - cls.start
pk = (time_diff << 22) ^ (cls.pc_room << 18) ^ (cls.pc << 12) ^ cls.seq
return str(pk)
snow = Snow.get_guid()
print(snow)
print(type(snow))
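# 补充示意:按 get_guid 的位布局(时间差 <<22、机房号 <<18、机器号 <<12、序列号占低 12 位,
# 序列号未溢出时各字段互不重叠),可以把生成的 id 反解回各字段以核对算法;以下仅为演示,不属于线上逻辑。
demo_pk = int(snow)
demo_time_diff = demo_pk >> 22
demo_pc_room = (demo_pk >> 18) & 0xF
demo_pc = (demo_pk >> 12) & 0x3F
demo_seq = demo_pk & 0xFFF
print(demo_time_diff, demo_pc_room, demo_pc, demo_seq)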
#!/bin/sh
cd /zzsn/lzc/智能报告搜索推荐
exec gunicorn -c app_config.py main_app:app --daemon --timeout 1200
#nohup python3 -u smi_app.py runserver -h 0.0.0.0 -p 8015 --threaded >>run.log 2>&1 &
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : es_byid.py
# @Time : 2022/12/27 13:41
# @Author : bruxelles_li
# @Software: PyCharm
# xlsx文件转csv文件
import pandas as pd
data = pd.read_excel('素材库/句子库/入库_sent.xlsx', index_col=0).astype(str) # 设置index_col=0,写入文件时第一列不会存在序列号
data.to_csv('素材库/句子库/入库_sent.csv', encoding='utf-8') # 将数据写入csv文件
print("写入完成......")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 占位符测试.py
# @Time : 2022/12/7 10:02
# @Author : bruxelles_li
# @Software: PyCharm
"""
说明:4个字符是为了避免开头字符多次出现在文章中的不同位置
段落索引: 以段落开头4个字符所在文章位置和段落结尾四个字符所在文章位置作为索引
句子绝对位置索引:以句子开头4个字符所在文章位置和句子结尾4个字符所在文章位置作为索引
句子相对位置索引: 以句子开头4个字符所在段落位置和句子结尾4个字符所在段落位置作为索引
注:高亮方法
a = '\033[1;31;40m%s\033[0m' % para[:4]
b = '\033[1;31;40m%s\033[0m' % para[-4:]
print(para_index, a, b)
注: 搜索推荐时转义
段落库:
prefix_index = int(para_index.split("|")[0])
suffix_index = int(para_index.split("|")[1])
print(prefix_index, suffix_index)
# 返回内容,即搜索推荐时高亮区域
print(text[prefix_index:suffix_index+1])
句子库:
# 文章索引
article_prefix_index = int(sent_article_index.split("|")[0])
article_suffix_index = int(sent_article_index.split("|")[1])
print(article_prefix_index, article_suffix_index)
# 返回内容,即搜索推荐时在文章中的高亮区域
print(text[article_prefix_index:article_suffix_index+2])
# 段落索引
para_prefix_index = int(sent_para_index.split("|")[0])
para_suffix_index = int(sent_para_index.split("|")[1])
print(para_prefix_index, para_suffix_index)
# 返回内容,即搜索推荐时在文章中的高亮区域
print(para[para_prefix_index:para_suffix_index+2])
注: 参数定义
输入:
文章id: infoId -> str
文章标题: title -> str
正文: content -> str
文章类型: contentTypeIds -> str
文章主题: topicTypeIds -> str
来源网站: origin -> str
发布时间: publishDate -> str
作者: author -> str
输出:
状态码: code -> str
处理消息: message -> str
返回内容体: resultData -> object
文章信息列表: article_info -> array
文章id: infoId -> str
文章标题: title -> str
正文: content -> str
文章类型: contentTypeIds -> str
文章主题: topicTypeIds -> str
来源网站: origin -> str
发布时间: publishDate -> str
作者: author -> str
段落信息列表: para_info -> array
段落库id: para_id -> str
段落所在文章id: infoId -> str
段落索引: para_index -> str
段落内容: para_content -> str
段落类型: contentTypeIds -> str
段落主题: topicTypeIds -> str
句子信息列表: sent_info -> array
句子库id: sent_id -> str
句子所在段落id: para_id -> str
句子所在文章id: infoId -> str
绝对位置索引: sent_article_index -> str
相对位置索引: sent_para_index -> str
句子内容: sent_content -> str
句子类型: contentTypeIds -> str
句子主题: topicTypeIds -> str
"""
import re
from tqdm import tqdm
import pandas as pd
from 文章id生成 import create_title_id
data_df = pd.read_excel("领导讲话结果.xlsx", nrows=1).astype(str)
# 初始化段落库id,句子库id
para_id, sent_id = 1, 1
# todo: 定义返回结果列表
list_article = []
for idx, row in tqdm(data_df.iterrows()):
# 若文章id存在则用该文章id,若不存在,则用时间戳来生成文章id
row["infoId"] = row["infoId"] if row["infoId"] else str(create_title_id())
# 处理正文内容
text = row["正文"]
# list_text = list(text)
# 基础段落拆分符号”\n“
para_list = text.split("\n")
# print(len(para_list))
# todo: 定义单篇文章的段落信息列表,句子信息列表
list_para, list_sent = [], []
# 获取段落索引
for para in para_list:
# 处理正常内容段落
if len(para) >= 50:
# 根据前四个字符内容获取索引
a0 = para[:4]
b0 = para[-4:]
# 获取首字符的索引
a0_index = text.find(a0) + (4 - len(a0.strip()))
# print(a0, a0_index, text[a0_index])
# 获取末字尾符的索引
b0_index = text.find(b0) + (4 - len(b0.strip())) + 3
# print(b0, b0_index, text[b0_index])
# 保存索引
para_index = str(a0_index) + "|" + str(b0_index)
# print(para_index)
# 获取句子索引, 句子索引有两部分:绝对位置索引,即原文中的位置索引; 相对位置索引,即所在段落中的索引
print("====句子索引====")
# 基础句子拆分符号”!。?“
sent_list = re.split(r'\s*[。!?]\s*', para)
for sent in sent_list:
# todo:当句子长度大于10才进行入库
if len(sent) >= 10:
# 根据前四个字符内容获取索引
c0 = sent[:4]
d0 = sent[-4:]
# 获取首字符的索引
c0_index = text.find(c0) + (4 - len(c0.strip()))
c1_index = para.find(c0) + (4 - len(c0.strip()))
# print(c0, c0_index, text[c0_index], c1_index, para[c1_index])
# 获取末字尾符的索引
d0_index = text.find(d0) + (4 - len(d0.strip())) + 3
d1_index = para.find(d0) + (4 - len(d0.strip())) + 3
# print(d0, d0_index, text[d0_index], d1_index, para[d1_index])
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
# 打印句子内容
print(text[c0_index:d0_index+2], para[c1_index:d1_index+2])
# 将句子信息加入句子信息列表
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": row["infoId"],
"sent_article_index": sent_article_index,
"sent_para_index": sent_para_index,
"sent_content": text[c0_index:d0_index+2],
"contentTypeIds": "",
"topicTypeIds": ""
})
# todo: 更新句子库id
sent_id += 1
# 处理小标题段落
elif 10 <= len(para) < 50:
# todo: 根据包含的字词内容来简单过滤脏段落信息
words_list = {"微信", "如需转载", "免责声明", "公告", "jpeg", "jpg", "png", "【", "责任编辑"}
if any([i in para for i in words_list]):
continue
else:
# 根据前四个字符内容获取索引
a0 = para[:4]
b0 = para[-4:]
# 获取首字符的索引
a0_index = text.find(a0) + (4 - len(a0.strip()))
print(a0, a0_index, text[a0_index])
# 获取末字尾符的索引
b0_index = text.find(b0) + (4 - len(b0.strip())) + 3
print(b0, b0_index, text[b0_index])
# 保存索引
para_index = str(a0_index) + "|" + str(b0_index)
else:
continue
# 打印段落内容
print(text[a0_index:b0_index+1])
# 将段落信息加入段落信息列表
list_para.append({
"para_id": str(para_id),
"infoId": row["infoId"],
"para_index": para_index,
"sent_content": text[a0_index:b0_index+2],
"contentTypeIds": "",
"topicTypeIds": ""
})
# todo: 更新段落库id
para_id += 1
# 将文章信息加入文章信息列表
list_article.append({
"infoId": row["infoId"],
"content": text,
"title": row["标题"],
"contentTypeIds": row["专题库类型"],
"topicTypeIds": row["专题名称"],
"origin": row["来源"],
"publishDate": row["发布时间"],
"author": row["作者"]
})
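# 最小示例(演示用,假设数据):按上方说明用"段落开头/结尾 4 个字符在全文中的位置"构造 para_index,
# 再按 "前缀|后缀" 切片得到搜索推荐时的高亮区域;demo_text、demo_para 等名称均为演示用的假设变量。
demo_text = "第一段内容,用于演示索引。\n这是第二段的完整内容,长度足够用来演示索引切片。"
demo_para = demo_text.split("\n")[1]
demo_a0, demo_b0 = demo_para[:4], demo_para[-4:]
demo_a0_index = demo_text.find(demo_a0)
demo_b0_index = demo_text.find(demo_b0) + 3          # 指向结尾 4 个字符中的最后一个字符
demo_para_index = str(demo_a0_index) + "|" + str(demo_b0_index)
demo_prefix, demo_suffix = map(int, demo_para_index.split("|"))
print(demo_para_index, demo_text[demo_prefix:demo_suffix + 1])   # 输出的高亮区域即第二段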
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 句子分割完整测试.py
# @Time : 2022/12/8 15:21
# @Author : bruxelles_li
# @Software: PyCharm
"""
引用内容:“”。
"""
import re
para = "恩格斯早就指出:“我们不要过分陶醉于我们人类对自然界的胜利。对于每一次这样的胜利,自然界都对我们进行报复。”第一次工业革命以来,人类利用自然的能力不断提高,但过度开发也导致生物多样性减少,迫使野生动物迁徙,增加野生动物体内病原的扩散传播。"
def get_index(text):
sent_list = re.split(r'\s*[。!?]\s*', text)
for sent in sent_list:
# 根据句子内容获取该句首字符索引
c0 = sent.find(sent.strip())
d0 = text.find("”")
# 获取首字符的索引
c0_index = c0
# 获取末字尾符的索引
d0_index = d0 + 1
return c0_index, d0_index
if __name__ == "__main__":
# todo: 当段落中出现讲话内容时,根据特殊符号“:”和““”来处理该句子,保证该部分内容的完整,并以该部分内容为中心,分别对中心点前后部分进行处理
if ":" in para:
pre_index, suf_index = get_index(para)
print(pre_index, suf_index)
pre_text = para[:pre_index-1] if pre_index != 0 else ""
# todo: 处理temp_text
temp_text = para[pre_index:suf_index]
print(temp_text)
# todo: 处理suffix_text
suffix_text = para[suf_index:]
print(suffix_text)
else:
# 基础句子拆分符号”!。?“
sent_list = re.split(r'\s*[。!?]\s*', para)
for sent in sent_list:
# todo:当句子长度大于10才进行入库
if len(sent.strip()) >= 13:
# 获取首字符的索引
c0_index = para.find(sent.strip())  # 该测试脚本中没有全文变量,以段落 para 代替全文
c1_index = para.find(sent.strip())
# 获取末字尾符的索引
d0_index = c0_index + len(sent.strip())
d1_index = c1_index + len(sent.strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
elif 10 <= len(sent.strip()) < 13:
# 获取首字符的索引
c0_index = para.find(sent.strip())  # 同上,以段落 para 代替全文
c1_index = para.find(sent.strip())
# 获取末字尾符的索引
d0_index = c0_index + len(sent.strip())
d1_index = c1_index + len(sent.strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
else:
continue
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/01/09 14:12
# @Author : bruxelles_li
# @FileName: 合并.py
# @Software: PyCharm
import os
import pandas as pd
path1 = "素材库/待入库数据/专家观点数据.xls"
path2 = "素材库/待入库数据/专家观点数据(权威观察筛选).xls"
df = pd.read_excel(path1, keep_default_na=False).astype(str)
df1 = pd.read_excel(path2, keep_default_na=False).astype(str)
final_df = pd.concat([df, df1])
# final_df["contentTypeIds"] = "1612323231851601921"
# result = []
# path = r"素材库/段落库"
# for root, dirs, files in os.walk(path, topdown=False):
# for name in files:
# if name.endswith(".xls") or name.endswith(".xlsx"):
# df = pd.read_excel(os.path.join(root, name), sheet_name=None)
# result.append(df)
# data_list = []
# for data in result:
# data_list.extend(data.values()) # 注意这里是extend()函数而不是append()函数
# df = pd.concat(data_list)
final_df.to_excel('素材库/待入库数据/ori_policy_document_article.xlsx', index=False, engine='xlsxwriter')
print("合并完成!")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : main_app.py
# @Time : 2022/12/14 19:49
# @Author : bruxelles_li
# @Software: PyCharm
import os, json
import logging
from flask import Flask, request, jsonify
import sys
sys.path.append('../')
from 文章内容检查 import clean_html_tag
from 素材库构建程序 import *
from 文章id生成 import create_title_id
import requests
import queue
import pandas as pd
import time
from smart_extractor import extract_by_url_test
import traceback
from pathlib import Path
from tqdm import tqdm
from search_by_dot_matrix import get_sent_result, get_para_result, get_sen_duplicated, get_para_duplicated, put_para_list, put_sen_list
# 关闭多余连接
s = requests.session()
s.keep_alive = False
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
HOST = '0.0.0.0'
PORT = 4002
DEBUG = False
app = Flask(__name__)
# todo: 定义缓存路径
cache_path = "测试文件"
Path(cache_path).mkdir(parents=True, exist_ok=True)
# Queue基本FIFO队列 先进先出 FIFO即First in First Out,先进先出
# maxsize设置队列中,数据上限,小于或等于0则不限制,容器中大于这个数则阻塞,直到队列中的数据被消掉
q = queue.Queue(maxsize=0)
# todo: 定义文章内容
# df0 = pd.read_excel('素材库/文章库/入库_article.xlsx').astype(str)
# art_list = df0["content"].tolist()
# content2_id = {row['content']: row['id'] for idx, row in df0.iterrows()}
# 跨域支持1
from flask_cors import CORS
CORS(app, supports_credentials=True)
@app.route("/", methods=["GET"])
def hello_world():
logger.info('Hello World!')
return "Hello World"
type2iddict = {
"speech_by_leaders": "1602095566267805697",
"policy_document": "1602095618788880386",
"expert_opinion": "1602095680285765633",
"enterprise_case": "1602095727870144513",
"other": "1602095773126684673",
# "think_tanks": "",
# "policies_regulations": "",
# "enterprise_news": "",
}
type2namedict = {
"speech_by_leaders": "领导讲话",
"policy_document": "政策文件",
"expert_opinion": "专家观点",
"enterprise_case": "企业案例",
"other": "其他",
# "think_tanks": "智库",
# "policies_regulations": "政策法规",
# "enterprise_news": "企业资讯",
}
# 运行程序接口
@app.route('/build_pro', methods=["POST"])
def get_result():
"""
-> data:
领导讲话:1602095566267805697
政策文件:1602095618788880386
专家观点:1602095680285765633
企业案例:1602095727870144513
其他:1602095773126684673
领导讲话:speech_by_leaders
政策文件:policy_document
专家观点:expert_opinion
企业案例:enterprise_case
其他:other
:return:
"""
try:
data = request.get_json()
# todo: 先判断是否提供url链接来获取来源,发布时间,正文内容
if "url" in data:
url = data["url"]
lang_code = data["lang_code"] if "lang_code" in data else "cn"
dict_parse = extract_by_url_test(url, lang_code)
title = dict_parse["title"]
ori_content = dict_parse["content"]
content = clean_html_tag(ori_content)
publishDate = dict_parse["publishDate"]
else:
title = data['title']
ori_content = data['content']
content = clean_html_tag(ori_content)
publishDate = data['publishDate']
infoId = str(data['infoId']) if data["infoId"] else str(create_title_id())
contentTypeFlags = data['contentTypeFlags']
topicNames = data['topicNames']
origin = data['origin']
author = data['author']
# todo: 根据typedict 获取contentType
contentNames = type2namedict[contentTypeFlags]
contentTypeIds = str(type2iddict[contentTypeFlags])
# todo: 若清洗后的文章内容长度不为空,则进行处理,否则返回日志
if len(content) >= 50:
list_para, list_sent = build_pro_new(infoId, content, contentNames, contentTypeIds, topicNames)
# todo: 利用dataframe对两个生成的列表内容进行去重
df_para = pd.DataFrame(list_para)
df_para.drop_duplicates(subset=["para_content"], keep="first", inplace=True)
dict_para = df_para.to_dict()
new_list_para = [dict(zip(dict_para, values)) for values in zip(*[dict_para[k].values() for k in dict_para])]
df_sent = pd.DataFrame(list_sent)
df_sent.drop_duplicates(subset=["sent_content"], keep="first", inplace=True)
dict_sent = df_sent.to_dict()
new_list_sent = [dict(zip(dict_sent, values)) for values in zip(*[dict_sent[k].values() for k in dict_sent])]
# todo: 新增素材库去重,更新repeatedId, is_main, 唯一标识id, create_time
# final_list_para = get_para_duplicated(new_list_para)
# final_list_sent = get_sen_duplicated(new_list_sent)
# # todo: 根据文章内容判断文章是否重复
# if content in art_list:
# repeatedId = content2_id.get(content)
# is_main = "0"
# else:
# repeatedId = ""
# is_main = ""
dict_result = {
"code": 200,
"message": "success",
"resultData": {
"article_info":
[
{
"repeatedId": "",
"is_main": "",
"create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
"infoId": infoId,
"content": content,
"title": title,
"contentNames": contentNames,
"contentTypeIds": contentTypeIds,
"topicNames": topicNames,
"origin": origin,
"publishDate": publishDate,
"author": author,
"type": "art"
}
],
"para_info": new_list_para,
"sent_info": new_list_sent
}
}
else:
dict_result = {
"code": 500,
"message": "failure" + "文章内容杂乱,请检查并清除杂乱格式再进行操作!",
"resultData": None
}
except Exception as e:
dict_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route('/put_database', methods=["POST"])
def put_result():
try:
data = request.get_json()
para_list = data["resultData"]["para_info"]
sen_list = data["resultData"]["sent_info"]
# todo: 先判断是否提供url链接来获取来源,发布时间,正文内容
sen_flag = put_sen_list(sen_list)
para_flag = put_para_list(para_list)
if sen_flag == "1" and para_flag == "1":
dict_result = {
"code": 200,
"message": "success",
"resultData": "已成功处理"
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
except Exception as e:
dict_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
@app.route('/search_content', methods=["POST"])
def get_top_content():
try:
# 定义接收参数
data = request.get_json()
# logger.info(data)
text = data['queryText']
contentTypeFlags = data['contentTypeFlags'] if "contentTypeFlags" in data else []
topicTypeNames = data['topicNames'] if 'topicNames' in data else []
returenType = data['returenType'] if 'returenType' in data and data['returenType'] else "sen"
pageNo = int(data['pageNo'])
pageSize = int(data['pageSize'])
pStartTime = data['pStartTime'] if data['pStartTime'] else "2021-00-00"
pEndTime = data['pEndTime'] if data["pEndTime"] else "2023-00-00"
# todo: 根据字典获取contentTypeName
contentTypeName_list = []
if contentTypeFlags:
for type in contentTypeFlags:
content_type_name = type2namedict[type]
contentTypeName_list.append(content_type_name)
else:
contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]
# todo: 调用搜索函数返回推荐list
if returenType == "par":
# todo: 先检查缓存是否可用,若不可用则重新查找
if os.path.isfile(os.path.join(cache_path, "para.json")):
with open(os.path.join(cache_path, "para.json"), 'r', encoding='utf-8') as f:
para_dict_result = json.load(f)
# todo: 继续判断待查询的内容是否与缓存的对象相同
if text == para_dict_result["text"]:
final_para_list = []
para_list = para_dict_result["para_list"]
for row in tqdm(para_list):
if row["content_type_name"] in contentTypeName_list:
final_para_list.append(row)
else:
continue
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
result_list = final_para_list[pre_index:suf_index]
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list,
"pageNo": pageNo,
"pageSize": pageSize,
"total": len(result_list)
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
os.remove(os.path.join(cache_path, "para.json"))
result_list, len_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime,
pEndTime, pageSize, pageNo, returenType)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_para = {
"text": text,
"para_list": result_list
}
para_result = json.dumps(dict_para)
with open(os.path.join(cache_path, "para.json"), 'w', encoding='utf-8') as file:
file.write(para_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
result_list, len_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo, returenType)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_para = {
"text": text,
"para_list": result_list
}
para_result = json.dumps(dict_para)
with open(os.path.join(cache_path, "para.json"), 'w', encoding='utf-8') as file:
file.write(para_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
# todo: 处理句子
else:
# todo: 先检查缓存是否可用,若不可用则重新查找
if os.path.isfile(os.path.join(cache_path, "sent.json")):
with open(os.path.join(cache_path, "sent.json"), 'r', encoding='utf-8') as f:
sent_dict_result = json.load(f)
# todo: 继续判断待查询的内容是否与缓存的对象相同
if text == sent_dict_result["text"]:
sent_list = sent_dict_result["sent_list"]
final_sent_list = []
for row in tqdm(sent_list):
if row["content_type_name"] in contentTypeName_list:
final_sent_list.append(row)
else:
continue
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
result_list = final_sent_list[pre_index:suf_index]
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list,
"pageNo": pageNo,
"pageSize": pageSize,
"total": len(final_sent_list)
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
os.remove(os.path.join(cache_path, "sent.json"))
result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime,
pEndTime, pageSize, pageNo, returenType)
logger.info(result_list)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_sent = {
"text": text,
"sent_list": result_list
}
# todo: 将内容转换为JSON字符串用来存储
sent_result = json.dumps(dict_sent)
with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
file.write(sent_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
else:
result_list, len_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo, returenType)
logger.info(result_list)
pre_index = pageNo * pageSize - pageSize
suf_index = pageNo * pageSize
dict_sent = {
"text": text,
"sent_list": result_list
}
# todo: 将内容转换为JSON字符串用来存储
sent_result = json.dumps(dict_sent)
with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
file.write(sent_result)
dict_result = {
'code': 200,
'message': 'success',
'result_data': {
"match_info": result_list[pre_index:suf_index],
"pageNo": pageNo,
"pageSize": pageSize,
"total": len_list
}
}
# logger.info(dict_result)
return json.dumps(dict_result, ensure_ascii=False)
except Exception as e:
traceback.print_exc()
dic_result = {
'code': 500,
'message': "failure" + str(e),
'resultData': None
}
logger.info(dic_result)
return json.dumps(dic_result, ensure_ascii=False)
if __name__ == '__main__':
app.run(host=HOST, port=PORT, debug=DEBUG)
if __name__ != '__main__':
gunicorn_logger = logging.getLogger('gunicorn.error')
app.logger.handlers = gunicorn_logger.handlers
app.logger.setLevel(gunicorn_logger.level)
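# 调用示意(独立示例脚本,不属于 main_app.py 本身;假设服务已按上方 HOST/PORT 在本机 4002 端口启动,
# 参数名与 /search_content 接口一致,具体取值仅为演示):
import requests, json

demo_payload = {
    "queryText": "新时代共同富裕的宗旨",
    "contentTypeFlags": ["speech_by_leaders"],
    "topicNames": ["共同富裕"],
    "returenType": "sen",          # "sen" 返回句子推荐,"par" 返回段落推荐(键名沿用接口中的拼写)
    "pageNo": 1,
    "pageSize": 10,
    "pStartTime": "2021-01-01",
    "pEndTime": "2023-01-01"
}
demo_resp = requests.post("http://127.0.0.1:4002/search_content", json=demo_payload)
print(json.loads(demo_resp.text).get("code"))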
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/12/12 17:45
# @Author : bruxelles_li
# @FileName: 处理内容_去重.py
# @Software: PyCharm
import pandas as pd
from tqdm import tqdm
import time
# from snow_id import Snow
from 文章id生成 import create_title_id
# snow = Snow.get_guid()
# para_df = pd.read_excel("素材库/段落库/去重后_para.xlsx", keep_default_na=False).astype(str)
# para_df = pd.read_excel("素材库/段落库/去重后_para.xlsx", keep_default_na=False).astype(str)
para_df = pd.read_excel("素材库/段落库/leader_1_para.xlsx", keep_default_na=False).astype(str)
# article_list = para_df["infoId"].tolist()
para_df["id"] = ""
para_df["deleted"] = "0"
para_df["if_public"] = "0"
# para_df["type"] = "sen"
print(len(para_df))
para_list, sent_list = [], []
i = 1
for idx, row in tqdm(para_df.iterrows()):
row["id"] = str(create_title_id() + i)
row["create_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
para_list.append(row)
i += 1
print(len(para_list))
df = pd.DataFrame(para_list)
df.to_excel('素材库/段落库/待入库_leader_1_para.xlsx', index=False, engine='xlsxwriter')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 文章id生成.py
# @Time : 2022/12/7 12:07
# @Author : bruxelles_li
# @Software: PyCharm
"""
使用time,hashlib 来自动生成文章id
"""
import time, hashlib
def create_id():
m = hashlib.md5(str(time.perf_counter()).encode("utf-8"))
return m.hexdigest()
# print(type(create_id()))
# print(create_id())
# 2.使用time生成时间戳
def create_title_id():
time_stamp = int(round(time.time()*1000000))
return time_stamp
if __name__ == "__main__":
print(create_title_id(), type(create_title_id()))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 文章内容检查.py
# @Time : 2022/12/9 16:05
# @Author : bruxelles_li
# @Software: PyCharm
from bs4 import BeautifulSoup
import re
# 程序返回处理函数
def clean_html_tag(content):
# todo: 考虑正式场景中是以</p>进行段落划分的
ori_text = re.sub("(<\/p >|<\/p>)", "\t", content)
tag_content_list = ori_text.split("\t") if "<p>" in ori_text else ori_text
temp_content_list = []
if type(tag_content_list) is list:
for text in tag_content_list:
bs = BeautifulSoup(text, 'lxml')
ori_match_content = bs.text.strip()
temp_content_list.append(ori_match_content)
match_content = "\n".join(temp_content_list)
else:
if "参考文献" not in tag_content_list:
match_content = tag_content_list
else:
match_content = tag_content_list.split("参考文献")[0]
temp = []
# 初步清洗文中的空白符,杂乱符号
pattern = re.compile(
'[▎#\xa0$*★$<=>@●▍[\]△▲^_`■▋{|}~⦅⦆ф「」\u3000〈〉《》「」『』【】※〔〕〖〗〘〙〚〛〜〰〾〿\*]')
match_content0 = pattern.sub('', match_content)
match_content_1 = re.sub(r".*(来源:|DOI:|微博|哔哩|头条号|订阅号|读完|监制:|校对:|编辑:|排版:|往期精选|关注我们|点击|阅读|作者:).*(?=\n|)", "", match_content0)
match_content_1_1 = re.sub(r"\s图.*(?=\n)", "", match_content_1)
match_content1 = re.sub(r"(&emsp;|阅读提示|点击 上方文字 关注我们 |点击 上方文字 关注我们|点击蓝字丨关注我们|点击蓝字 关注我们|- THE END - |◀——|-)", "", match_content_1_1)
match_content2 = re.sub(r"(?=\(图片[::]).+(?<=\))", "", match_content1)
match_content3 = re.sub(r"&mdash&mdash", "&mdash", match_content2)
match_content4 = re.sub(r"&mdash", "&", match_content3)
match_content5 = re.sub(r"       ", "", match_content4)
match_content6 = re.sub(r"(?=\().*(?<=图\))", "", match_content5)
match_content7 = re.sub(r'。"', "。”", match_content6)
match_content8 = re.sub(r"(。;|。,)", "。", match_content7)
match_content9 = re.sub(r"(\\t|\\)", "", match_content8)
match_content10 = re.sub(r"&;&;", "——", match_content9)
match_content11 = re.sub(r"(.*(记者).*)", "", match_content10)
match_content12 = re.sub(r"(&#13;|&zwj)", "", match_content11)
# match_content13 = re.sub(r"(?<=。).*如图.*所示[:。]", "", match_content12)
list_content = match_content12.split('\n')
temp_content = []
for text in list_content:
if len(text.strip()) <= 2:
continue
else:
text = text.strip()
if text.endswith("。") or text.endswith("“") or text.endswith(".") or text.endswith('”'):
text = text
else:
text = text + "\t" + "\t"
text = re.sub(r".*(?<=记者).*(?<=摄)", "", text)
temp_content.append(text)
# print(temp_content)
str_content = "\t".join(temp_content)
a = re.sub('\t\t\t', '——', str_content)
a0 = re.sub('\t\t', '', a)
a1 = re.sub(r":——", ":", a0)
a2 = re.sub(r"。)", ")", a1)
a3 = re.sub(r"(。”|!”)", "”", a2)
b = re.sub("\t", "\n", a3).strip()
c = b.split('\n')
# print(len(c))
for d in c:
e = d.strip('&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;')
# 去除中间包含指定字符所在的句子
f = re.sub("(微信|如需转载|免责声明|公告|jpeg|jpg|png|声明:|附件:|责任单位:|编辑:).*?(?<=。)", '', str(e))
# 删除邮箱所在行
g = re.sub(".*(?=\.com|www\.).*", "", f)
# print(g)
if len(g) <= 20:
continue
else:
temp.append(g)
h = "\t".join(temp)
j = re.sub("\t(?=而|但|对于|此外|因此|与此同时|这种|基于此|但是|然而)", "", h)
new_content = re.sub("\t", "\n", j)
new_content_list = new_content.split("\n")
final_content_list = []
for k in new_content_list:
k = " " + k
# 先去除中间包含javascript、html所在的段落内容
l = re.sub(".*(function。|html|background|javascript|image).*", '', k)
if l:
final_content_list.append(l.strip("——"))
final_content = "\n\n".join(final_content_list) if len(final_content_list) >= 2 else "".join(final_content_list)
return final_content
if __name__ == "__main__":
# text = " 从产品数字化、运营数字化向战略数字化进行转变,数字化已成为平安集团打造核心竞争力、积蓄发展动能的一大源泉。 2020年新冠疫情爆发初期,众多深圳市民都下载了一款“i深圳”APP。有数据显示,“i深圳”自2019年1月11日上线,已接入7600余项服务、55类高频电子证照和电子证明,累计下载数超1800万,累计注册"
text = """ <p>股平台及中电信创分别增资不超过8亿元、外部战略投资人增资不超过4亿元,公司作为原股东,放弃参本次增资。外部战略投资人的引入通过北京产权交易所公开挂牌进行,中电信创及员工持股平台将根据外部占略投资人摘牌价格增</p >\n<p>&nbsp;</p >\n<p>资。近日,中国电子云增资项目已在北京产权交易所正式挂牌 ,员工持股平台云启未来(武汉)管理咨询中心(有限合伙)已完成设立。2022年12月28日,北京产权交易所向国开制造业转型升基金(有限合伙) (以下简称&ldquo;国开基金&rdquo;)出具了《意向投资方资格确认通知书》,确认国开基金符合投资资格条件,成为合格竟向投咨方。根据国开基金摘牌结果,本次按中国电子云增资前估值30亿元,认购全部亿元外部战略投资人增资</p >\n<p>&nbsp;</p >\n<p>额度。 2022年12月29日,深圳市桑达实业股份有限公司及中国电子云与国开基金员丁持股平台,中电信创签署了《增资协议》,约定国开基金,云启未来(武汉)管理咨询中心(有限合中电信创以相同价格向中国电子云增资,其中,</p >\n<p>&nbsp;</p >\n<p>员工持股平台投资8亿元,中电信创投资8亿元。火)本次增资完成后股权结构如下: 公告个绍,上述《一致行动协议》的签署 ,有利于更好的规范中国电子云经营管理,增强上市公司对中国电子天的控制,不影响上市公司实际控制权,不存在对对上市公司日堂经营管理产生/利影响或损害中小投资者利</p >\n<p>&nbsp;</p >\n<p>举的情形,据悉,深圳市桑达实业股份有限公司是中国电子旗下的重要二级企业秉承集团&ldquo;加快打造网信产业核心力量和组织平台&rdquo;的战略使命,面向党政和 关键行业客户,提供数字与信服务和产业服务。其中在云计算方面,依托中国</p >\n<p>电子PKS自主安全计算体系及丰富的网信产业资源,公司基于自研可信计算技术架构和分布式云原生云操作系统,推出了&ldquo;中国电子云&rdquo;这一数字底座目前,\"中国电一天&rdquo;服务行业客户已超400家,已在天津市西青区,云南省大理州,江西省南昌市新建区,四川省遂宁市和德阳市、山东省德州市等地方政府落地政务云,承建了中国人保等金融机构及管网集团、星网集团、中国华电、中国电子等 央企集团的云平台,以及为国家互联网应急中心等部委提供云</p >
"""
# print(len(text))
# text = """"<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /><meta http-equiv=\"Content-Style-Type\" content=\"text/css\" /><meta name=\"generator\" content=\"Aspose.Words for Java 15.12.0.0\" /><title></title><style type=\"text/css\">@page Section1 { margin:72pt 90pt; size:595.3pt 841.9pt }div.Section1 { page:Section1 }</style></head><body><div class=\"Section1\"><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:宋体; font-size:10.5pt\">希望全国爱国卫生战线的同志们始终坚守初心使命,传承发扬优良传统,丰富工作内涵,创新工作方式方法,为加快推进健康中国建设作出新的贡献。”在爱国卫生运动开展</span><span style=\"font-family:Calibri; font-size:10.5pt\">70</span><span style=\"font-family:宋体; font-size:10.5pt\">周年之际,习近平总书记作出重要指示,强调“用千千万万个文明健康小环境筑牢疫情防控社会大防线,切实保障人民群众生命安全和身体健康”。</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:Calibri; font-size:10.5pt\">&#xa0;</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:宋体; font-size:10.5pt\">习近平总书记的重要指示充分肯定了</span><span style=\"font-family:Calibri; font-size:10.5pt\">70</span><span style=\"font-family:宋体; font-size:10.5pt\">年来爱国卫生运动取得的重要成果,深刻阐明了爱国卫生运动的战略地位和作用,对于应对当前新冠疫情防控新形势新任务,推动卫生健康事业高质量发展,有效防范卫生健康领域重大风险,加快推进健康中国建设,具有重大而深远的意义。</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:Calibri; font-size:10.5pt\">&#xa0;</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:Calibri; font-size:10.5pt\">&#xa0;</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:宋体; font-size:10.5pt\">爱国卫生运动是我们党把群众路线运用于卫生防病工作的成功实践,也是一项“以人民健康为中心”的重大惠民工程。</span><span style=\"font-family:Calibri; font-size:10.5pt\">70</span><span style=\"font-family:宋体; font-size:10.5pt\">年来,从除“四害”到“三讲一树”,从全国卫生城镇创建活动到新时代的城乡环境卫生整洁行动、农村“厕所革命”、垃圾分类,我们开展了一系列卓有成效的群众性卫生活动,培养全民养成良好卫生习惯,为保障人民健康发挥了重要作用。特别是新冠疫情发生以来,习近平总书记多次强调坚持以人民为中心的发展理念,广泛开展爱国卫生运动,多次强调预防疫病的重要性,为有效应对重大传染病疫情、提升社会健康治理水平提供了科学指引。</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:Calibri; font-size:10.5pt\">&#xa0;</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:Calibri; font-size:10.5pt\">&#xa0;</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:宋体; font-size:10.5pt\">近日,国家卫健委发布公告,将新冠肺炎更名为新型冠状病毒感染并依法实施“乙类乙管”。这标志着我国疫情防控进入新阶段,在做好相关医疗资源保障、加强对老年人等重点群体保护的基础上,全社会要更加有针对性地开展爱国卫生运动。一方面,要以“大卫生、大健康”理念不断丰富爱国卫生运动的内涵,突出源头治理;另一方面,要充分发挥爱国卫生运动的组织优势和群众动员优势,引导广大人民群众主动学习健康知识,掌握健康生活技能,养成良好的个人卫生习惯,践行文明健康的生活方式,用千千万万个文明健康小环境筑牢疫情防控社会大防线,以全社会整体联动“大处方”开辟爱国卫生工作全民共建共享新局面。</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:Calibri; font-size:10.5pt\">&#xa0;</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:Calibri; font-size:10.5pt\">&#xa0;</span></p><p style=\"margin:0pt; orphans:0; text-align:justify; widows:0\"><span style=\"font-family:宋体; font-size:10.5pt\">人民健康是民族昌盛和国家富强的重要标志,预防是最经济最有效的健康策略。当前,新冠疫情防控优化调整措施正在有序推进落实,各级党委政府要结合新的防控形势深刻领会习近平总书记重要指示精神,毫不松劲地抓好各项防控措施,扎实贯彻以人民为中心的发展思想。要以爱国卫生运动为重要抓手,动员全社会落实预防为主的方针和制度体系,加快形成有利于健康的生活方式、生态环境和社会环境,更加有效应对重大传染病疫情,切实保障人民群众生命安全和身体健康。</span></p></div></body></html>"
# """
print(clean_html_tag(content=text))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 矩阵测试.py
# @Time : 2022/12/12 19:23
# @Author : bruxelles_li
# @Software: PyCharm
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from bert_serving.client import BertClient
from tqdm import tqdm
import numpy as np
from numpy import *
import datetime
import logging
from es_byid import find_sent_info, find_para_info, find_art_info, find_sen_content
# '114.115.130.239',
bc = BertClient(check_length=False)
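# 注: BertClient 连接单独部署的 bert-serving 服务,encode() 返回句向量;
# 由下文 769 列矩阵(1列id + 768维向量)推断,这里假定服务端加载的是输出768维向量的BERT模型。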
prob = 0.85
# file_path = "素材库/句子库/句子库.npy"
# vector_path = "测试文件/sent.txt"
np_path = "database/sent_database/other_sent.npy"
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
# b = np.load(file_path)
# df = pd.read_excel("测试文件/句子库测试样例.xlsx", keep_default_na=False).astype(str)
# length = len(df)
# b = np.array([[1,2,3,4],[5,6,7,8],[11,12,13,14],[2,3,4,5]])
# print(b.shape[0], b.shape[1])
# print(b.shape)
# print(b)
# c = b.transpose()
# print(c)
# print(c[1::])
# d = c[1::].transpose()
# print(d)
#
# print(c[0], type(c[0]))
# d = c[0].tolist()
# print(d, type(d))
def encode_sentences(vector_path, df, length, np_path):
with open(vector_path, 'w', encoding='utf-8') as f_vectors:
for idx, row in tqdm(df.iterrows()):
sentence = row['content']
vector = bc.encode([sentence])
# print(vector)
f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
f_vectors.close()
save_file(length, vector_path, np_path)
return None
def save_file(length, vector_path, np_path):
# 每行格式: 句子id + 768维句向量,共769列
A = np.zeros((int(length), 769), dtype=float)
with open(vector_path, encoding='utf-8') as f:
lines = f.readlines()
A_row = 0
for line in lines:
values = line.strip('\n').split(' ')
A[A_row, :] = values[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
def get_result(text, np_arrary):
# 导入初始矩阵
b = np_arrary
a = bc.encode([text])
# todo: 考虑当数据量在4g时,矩阵计算时间超过4秒,先将矩阵进行切片后计算, 当满足条件的内容长度大于30时不进行后续计算
start0_time = datetime.datetime.now()
sub_arrarys = np.array_split(b, 800)
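# 将 N×769 的全量矩阵按行近似均分为800个子矩阵,逐块计算相似度;
# 一旦累计命中的结果达到30条即跳出循环,不再计算剩余子矩阵。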
sim_result = []
id_result = []
for x in sub_arrarys:
if len(sim_result) < 30:
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
c = x.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(x.shape[0])]
id_dict = dict(zip(np_list, id_list))
r = cosine_similarity(a, d)
target = np.where(r >= 0.85)
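# target[1] 是相似度达到阈值的列下标,对应子矩阵中的句子,后续通过 id_dict 映射回句子库id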
column_list = target[1].tolist()
if column_list:
id_list = [str(id_dict[i]).split(".")[0] for i in column_list]
sim_list = r[target].tolist()
sim_result.extend(sim_list)
id_result.extend(id_list)
else:
break
end0_time = datetime.datetime.now()
total0_time = (end0_time - start0_time).total_seconds()
logger.info(len(id_result))
logger.info("拆分矩阵计算 共消耗: " + "{:.2f}".format(total0_time) + " 秒")
# todo: 将初始矩阵转换为目标矩阵,通过先转置,后按行切片获得目标子矩阵,然后对子矩阵再次转置得到
start1_time = datetime.datetime.now()
c = b.transpose()
d = c[1::].transpose()
# todo: 此时,id_list(对应从0-N的矩阵索引)可根据第一次转置后的第一行获得
id_list = c[0].tolist()
# 根据行长度初始化矩阵索引np_list
np_list = [n for n in range(b.shape[0])]
id_dict = dict(zip(np_list, id_list))
a = bc.encode([text])
r = cosine_similarity(a, d)
target = np.where(r >= 0.85)
column_list = target[1].tolist()
id_list = [str(id_dict[i]).split(".")[0] for i in column_list]
sim_list = r[target].tolist()
logger.info(len(id_list))
end1_time = datetime.datetime.now()
total1_time = (end1_time - start1_time).total_seconds()
logger.info("全矩阵计算 共消耗: " + "{:.2f}".format(total1_time) + " 秒")
# # print(sim_list)
# df1 = pd.DataFrame({
# "id": id_list,
# "sim": sim_list
# })
# # print(df1)
# test = df1.sort_values(by=['sim'], axis=0, ascending=False)
# # todo: 场景1 ->不勾选主题参数
# topicTypeNames = []
# if len(topicTypeNames) == 0:
# df2 = test[:10]
# # todo: 场景2 ->勾选主题参数
# else:
# df2 = test[:30]
#
# # todo: 先取唯一标识id,并调用es查询获取匹配信息
# new_id_list = df2["id"].tolist()
# info_df = find_sent_info(new_id_list)
#
# # todo: 将匹配信息进行整合,包括df2 + info_df
# temp_df = pd.merge(df2, info_df, on="id")
# result = []
# for idx, row in tqdm(temp_df.iterrows()):
# sentence_id = row["sentenceId"]
# sent_article_id = row["articleId"]
# sent_content = row["content"]
# # todo: 根据段落所在的文章id获取文章信息
# art_temp_result = find_art_info(sent_article_id)
# title = art_temp_result["articleTitle"]
# origin = art_temp_result["origin"]
# time = art_temp_result["articleTime"]
# author = art_temp_result["author"]
# article_content = art_temp_result["content"]
#
# # todo: 根据sentence_id 和 sent_article_id 获取前后句
# final_content = find_sen_content(sent_article_id, sentence_id, sent_content)
# # todo: 场景1 ->勾选主题参数,判断主题和时间范围
# if topicTypeNames:
# if row["topicType"] in topicTypeNames:
# result.append({
# "content": final_content,
# "similarity": round(row['sim'], 4),
# "id": row["id"],
# "article_id": sent_article_id,
# "paragraphid": row["paragraphId"],
# "match_index": row["sentParaIndex"] + ";" + row["sentArticleIndex"],
# "topic_type": row["topicType"],
# "content_type_name": row["contentTypeName"],
# "article_content": article_content,
# "publishDate": time,
# "author": author,
# "origin": origin,
# "title": title,
# # "type": returenType
# })
#
# # todo: 场景2 -> 不勾选类型参数, 仅判断事件范围
# else:
# result.append({
# "content": final_content,
# "similarity": round(row['sim'], 4),
# "id": row["id"],
# "article_id": sent_article_id,
# "paragraphid": row["paragraphId"],
# "match_index": row["sentParaIndex"] + ";" + row["sentArticleIndex"],
# "topic_type": row["topicType"],
# "content_type_name": row["contentTypeName"],
# "article_content": article_content,
# "publishDate": time,
# "author": author,
# "origin": origin,
# "title": title,
# # "type": returenType
# })
#
# final_df = pd.DataFrame(result)
# final_df.to_excel("测试文件/test.xlsx", engine="xlsxwriter", index=False)
return None
if __name__ == "__main__":
test_list = [
{
"create_time": "2023-01-03 18:02:24",
"para_id": "1",
"infoId": "123",
"para_index": "2|538",
"para_content": "强化创新引领 加快“三个转变” 更好推动中国制造高质量发展——国资委党委委员、副主任 翁杰明。",
"contentTypeIds": "1602095566267805697",
"contentNames": "领导讲话",
"topicNames": "产业链链长",
"type": "par",
"repeatedId": "1670843543885716",
"is_main": "0"
},
{
"create_time": "2023-01-03 18:02:24",
"para_id": "2",
"infoId": "1234",
"para_index": "",
"para_content": "强化创新引领 加快“三个转变” 更好推动中国制造高质量发展——国资委党委委员、副主任 翁杰明。",
"contentTypeIds": "1602095566267805698",
"contentNames": "领导讲话",
"topicNames": "产业链链长",
"type": "par",
"repeatedId": "1670843543885715",
"is_main": "0"
}
]
start0_time = datetime.datetime.now()
np_arrary = np.load(np_path)
end0_time = datetime.datetime.now()
total0_time = (end0_time - start0_time).total_seconds()
logger.info("加载矩阵 共消耗: " + "{:.2f}".format(total0_time) + " 秒")
# get_result("张文魁:应进一步分行业设立国企负债率警戒线和监管线", np_arrary)
# text = "张文魁:应进一步分行业设立国企负债率警戒线和监管线"
# print(get_result(text, file_path))
# # todo: 初始化list
# list1 = [n for n in range(10)]
# list2 = [n for n in range(11, 21)]
# dict0 = dict(zip(list1, list2))
# print(list1, list2)
# print(dict0)
# encode_sentences(vector_path, df, length, np_path)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 素材库构建批量处理.py
# @Time : 2022/12/10 11:56
# @Author : bruxelles_li
# @Software: PyCharm
import pandas as pd
from tqdm import tqdm
import requests
import json
from 文章内容检查 import *
list_article, list_para, list_sent = [], [], []
data_df = pd.read_excel("素材库/待入库数据/ori_speech_by_leaders_article.xlsx", keep_default_na=False).astype(str)
data_df["infoId"] = ""
data_df["author"] = ""
# todo: 调用素材构建程序接口进行批量素材处理
def pro_sucai(data_df):
url = "http://114.116.49.86:4001/build_pro"
headers = {
'Content-Type': 'application/json'
}
for idx, row in tqdm(data_df.iterrows()):
infoId = row["infoId"]
title = row["标题"]
content = row["正文"]
contentTypeFlags = row["内容类型"]
topicNames = row["主题类型"]
origin = row["origin"]
publishDate = row["publishDate"]
author = row["author"]
payload = json.dumps({
"infoId": infoId,
"title": title,
"contentTypeFlags": contentTypeFlags,
"topicNames": topicNames,
"origin": origin,
"publishDate": publishDate,
"author": author,
"content": content
})
response_filter = requests.request("POST", url, headers=headers, data=payload)
text_filter = response_filter.text.encode("utf-8")
obj_filter = json.loads(text_filter)
data_filter = obj_filter["resultData"]
if data_filter:
list_article.extend(data_filter["article_info"])
list_para.extend(data_filter["para_info"])
list_sent.extend(data_filter["sent_info"])
else:
continue
article_df = pd.DataFrame(list_article)
article_df.to_excel("素材库/文章库/leader_1_article.xlsx", engine="xlsxwriter", index=False)
para_df = pd.DataFrame(list_para)
para_df.to_excel("素材库/段落库/leader_1_para.xlsx", engine="xlsxwriter", index=False)
sent_df = pd.DataFrame(list_sent)
sent_df.to_excel("素材库/句子库/leader_1_sent.xlsx", engine="xlsxwriter", index=False)
return None
if __name__ == "__main__":
pro_sucai(data_df)
# 去除中间包含指定字符所在的段落
# text = """ /uploadimg/2021/02/02/1612229563745911./播放器前贴图片/。isLeftBottom:。"false",。/播放按钮是否在播放器左下角,为true表示是,false表示播放按钮在播放器中间/。isAudio:。"false",。/是否是音频播放器,为true表示是音频,false表示是视频/。isVod4k:。"false",。/是否为4k播放器,true是4k,false不是/。isHttps:。"true",。/是否https视频,true是,false不是/。wmode:。"opaque",。/flash播放器的窗口模式,默认为opaque/。wideMode:。"normal",。/flash播放器的窗口模式,默认为opaque/。listMode:。"false",。/点播播放器初始化参数:是否列表模式,默认false,false时不显示下一集按钮,不发送新增的下一集事件,设置中没有“自动播放下一集”选项;字符串类型/。nextTitle:。"",。/下一集标题,与listMode。配对使用/。nextThumbnail:。"",。/下一集预览图URL,与listMode。配对使用/。setupOn:。"true",。/是否显示设置按钮,默认为false/。hasBarrage:。"false",。/是否有弹幕功能,默认false,false时不显示弹幕、不显示弹幕设置按钮、不显示弹幕开关、不访问弹幕接口和表情包配置接口e/。barrageApp:。"",。/弹幕数据获取接口,固定或者初始化时传入/。playerType。"vod_h5",。/播放器类型,vod表示普通播放器/。drm:"true",。webFullScreenOn:。"false",。/是否显示网页全屏按钮,默认true表示显示/。language:。"",。/语言,默认中文,en表示英语/。other:。""/其它参数/。createVodPlayer(playerParas);。var。v_span1。"";。var。v_video="";。var。v_div。"";。var。aa="";。$(document).ready(function(){。function。getvideo(){。if($("video").html()==null){。clearInterval(aa);。setInterval(getvideo,100);。}else{。clearInterval(aa);。setVideo();。if(agent){。v_span1=$("#myFlash");。if(v_span1.length>0){。getvideo();。});。function。setVideo(){。v_video。$("video");。var。document.body.clientWidth||。window.innerWidth;。var。300。var。setTimeout(function(){。v_video.css({"display":"block","width":_w+"px","height":_h+"px","opacity":"1","background-color":"#000000"});。v_span1.css({"display":"block","width":_w+"px","height":_h+"px","opacity":"1","margin-bottom":"10px"});。},200)。function。changeVideo(){。if(agent){。if(v_span1.length>0){。setVideo();。window.onresize=function(){。changeVideo();。function。videoChange(){。$('#playbtn_img').css({"left":((document.documentElement.clientWidth-20)-70)/2+"px","top":(300-70)/2+"px"});。$('#video_content_is_loading').css({"left":((document.documentElement.clientWidth-20)-120)/2+"px","top":(300-120)/2+"px"});。$(window).resize(function(){。videoChange();。function。isAppendSpace(i){。console.log(i);。$('#playbtn_img').length>0?。videoChange()。"";。i--。i>=0。$('#playbtn_img').length。setTimeout(function(){isAppendSpace(i)},500)。"";。isAppendSpace(5);。中国中化控股有限责任公司成立大会5月8日在京举行。中共中央政治局常委、国务院总理李克强作出重要批示。批示指出:中国中化控股有限责任公司的重组组建,对优化国有经济结构和布局、助力我国农业现代化、增强化工行业市场竞争力具有重要意义。要坚持以习近平新时代中国特色社会主义思想为指导,认真贯彻党中央、国务院决策部署,扎实做好重组整合、深化改革等工作,加强国际合作,充分发挥行业龙头企业作用,加大种源、化工等领域关键技术攻关力度,创新管理和运营模式,不断提升经营质量效益和综合竞争力,为保持产业链供应链稳定、促进经济社会持续健康发展作出新贡献!。国务委员王勇出席成立大会并讲话。他强调,要深入贯彻习近平总书记重要指示精神,落实李克强总理批示要求,按照党中央、国务院决策部署,扎实做好中国中化控股有限责任公司重组组建工作,加快促进我国农业和化工产业高质量发展,在立足新发展阶段、贯彻新发展理念、构建新发展格局中发挥更大作用。
#
# 王勇指出,重组组建中国中化控股有限责任公司,是推进国有经济布局优化和结构调整、做强做优做大国有资本和国有企业的重大举措。要牢记使命责任,聚焦主责主业。加强资源要素整合融合,加快发展种业、现代农业、综合性化工等产业,强化关键核心技术攻关,着力打造原创技术“策源地”和现代产业链“链长”,保障产业链供应链安全稳定。坚持深化改革开放,持续健全市场化运营体制机制,守好安全生产底线红线,推进绿色低碳科技研发应用,为实现碳达峰碳中和目标贡献力量。把坚持党的领导加强党的建设融入公司治理,凝聚各方面工作合力,努力开创公司改革发展新局面。
#
# """
# print(clean_html_tag(text))
# import re
#
# list_content = text.split('\n')
# for content in list_content:
# new_content = re.sub(r".*(function。|html|background).*", '', content)
# print(new_content)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 素材库构建程序.py
# @Time : 2022/12/7 19:02
# @Author : bruxelles_li
# @Software: PyCharm
"""
说明:4个字符是为了避免开头字符多次出现在文章中的不同位置
段落索引: 以段落开头4个字符所在文章位置和段落结尾四个字符所在文章位置作为索引
句子绝对位置索引:以句子开头4个字符所在文章位置和句子结尾4个字符所在文章位置作为索引
句子相对位置索引: 以句子开头4个字符所在段落位置和句子结尾4个字符所在段落位置作为索引
注:高亮方法
a = '\033[1;31;40m%s\033[0m' % para[:4]
b = '\033[1;31;40m%s\033[0m' % para[-4:]
print(para_index, a, b)
注: 搜索推荐时转义
段落库:
prefix_index = int(para_index.split("|")[0])
suffix_index = int(para_index.split("|")[1])
print(prefix_index, suffix_index)
# 返回内容,即搜索推荐时高亮区域
print(text[prefix_index:suffix_index+1])
句子库:
# 文章索引
article_prefix_index = int(sent_article_index.split("|")[0])
article_suffix_index = int(sent_article_index.split("|")[1])
print(article_prefix_index, article_suffix_index)
# 返回内容,即搜索推荐时在文章中的高亮区域
print(text[article_prefix_index:article_suffix_index+2])
# 段落索引
para_prefix_index = int(sent_para_index.split("|")[0])
para_suffix_index = int(sent_para_index.split("|")[1])
print(para_prefix_index, para_suffix_index)
# 返回内容,即搜索推荐时在段落中的高亮区域
print(para[para_prefix_index:para_suffix_index+2])
注: 参数定义
输入:
文章id: infoId -> str
文章标题: title -> str
正文: content -> str
文章类型: contentTypeIds -> str
文章主题: topicTypeIds -> str
来源网站: origin -> str
发布时间: publishDate -> str
作者: author -> str
输出:
状态码: code -> str
处理消息: message -> str
返回内容体: resultData -> object
文章信息列表: article_info -> array
文章id: infoId -> str
文章标题: title -> str
正文: content -> str
文章类型: contentTypeIds -> str
文章主题: topicTypeIds -> str
来源网站: origin -> str
发布时间: publishDate -> str
作者: author -> str
段落信息列表: para_info -> array
段落库id: para_id -> str
段落所在文章id: infoId -> str
段落索引: para_index -> str
段落内容: para_content -> str
段落类型: contentTypeIds -> str
段落主题: topicTypeIds -> str
句子信息列表: sent_info -> array
句子库id: sent_id -> str
句子所在段落id: para_id -> str
句子所在文章id: infoId -> str
绝对位置索引: sent_article_index -> str
相对位置索引: sent_para_index -> str
句子内容: sent_content -> str
句子类型: contentTypeIds -> str
句子主题: topicTypeIds -> str
"""
import re
from 句子分割完整测试 import get_index
# todo: 输入为一篇文章,输出文章列表,段落列表,句子列表
# todo:文章信息包括:infoId, title, content, contentTypeIds, topicTypeIds, origin, publishDate, author
global para_id, sent_id
# 文章信息列表
"""
# 将文章信息加入文章信息列表
list_article.append({
"infoId": infoId,
"content": content,
"title": title,
"contentTypeIds": contentTypeIds,
"topicTypeIds": topicTypeIds,
"origin": origin,
"publishDate": publishDate,
"author": author
})"""
def build_pro(infoId, content, contentTypeIds, topicTypeIds):
print("=====自动构建程序开始====")
# 初始化段落库id,句子库id
para_id, sent_id = 1, 1
# 若文章id存在则用该文章id,若不存在,则用时间戳来生成文章id
infoId = infoId
# 处理正文
text = content
# 基础段落拆分符号”\n“
para_list = text.split("\n")
# 基础句子拆分符号”!。?“
sent_list_0 = re.split(r'\s*[。!?]\s*', text)
print("句子总长度为:{}".format(len(sent_list_0)))
print("段落总长度为:{}".format(len(para_list)))
# todo: 定义单篇文章的段落信息列表,句子信息列表
list_para, list_sent = [], []
# 获取段落索引
for para in para_list:
# 处理正常内容段落
if len(para) >= 50:
# 根据前十五个字符内容获取索引
a0 = para[:15]
b0 = para[-15:]
# 获取首字符的索引
a0_index = text.find(a0) + (15 - len(a0.strip()))
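# 注: (15 - len(a0.strip())) 用于跳过15字符窗口内的前导空白,使索引落在首个非空白字符上(这里假定空白只出现在窗口开头)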
# print(a0, a0_index, text[a0_index])
# 获取末字尾符的索引
b0_index = text.find(b0) + (15 - len(b0.strip())) + 14
# print(b0, b0_index, text[b0_index])
# 保存索引
para_index = str(a0_index) + "|" + str(b0_index)
# print(para_index)
# 获取句子索引, 句子索引有两部分:绝对位置索引,即原文中的位置索引; 相对位置索引,即所在段落中的索引
print("====句子索引====")
# 基础句子拆分符号”!。?“
sent_list = re.split(r'\s*[。!?]\s*', para)
for sent in sent_list:
# todo: 句子长度不低于10才进行入库(长度≥13用前后13字符定位索引,10~12用前后10字符定位)
if len(sent.strip()) >= 13:
# 根据前十三个字符内容获取索引
c0 = sent[:13]
d0 = sent[-13:]
print(d0)
# 获取首字符的索引
c0_index = text.find(c0) + (13 - len(c0.strip()))
c1_index = para.find(c0) + (13 - len(c0.strip()))
# print(c0, c0_index, text[c0_index], c1_index, para[c1_index])
# 获取末字尾符的索引
d0_index = text.find(d0) + (13 - len(d0.strip())) + 12
d1_index = para.find(d0) + (13 - len(d0.strip())) + 12
print(d0, d0_index, text[d0_index], d1_index, para[d1_index])
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
# 打印句子内容
# print(text[c0_index:d0_index + 2], para[c1_index:d1_index + 2])
elif 10 <= len(sent.strip()) < 13:
# 根据前十个字符内容获取索引
c0 = sent[:10]
d0 = sent[-10:]
print(d0)
# 获取首字符的索引
c0_index = text.find(c0) + (10 - len(c0.strip()))
c1_index = para.find(c0) + (10 - len(c0.strip()))
# print(c0, c0_index, text[c0_index], c1_index, para[c1_index])
# 获取末字尾符的索引
d0_index = text.find(d0) + (10 - len(d0.strip())) + 9
d1_index = para.find(d0) + (10 - len(d0.strip())) + 9
print(d0, d0_index, text[d0_index], d1_index, para[d1_index])
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
else:
continue
# 将句子信息加入句子信息列表
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": infoId,
"sent_article_index": sent_article_index,
"sent_para_index": sent_para_index,
"sent_content": para[c1_index:d1_index + 2] if para[d1_index] != "。" else para[
c1_index:d1_index + 1],
"contentTypeIds": contentTypeIds,
"topicTypeIds": topicTypeIds
})
# todo: 内部更新句子库id
sent_id += 1
# 处理小标题段落
elif 10 <= len(para) < 50:
# todo: 根据包含的字词内容来简单过滤脏段落信息
words_list = {"微信", "如需转载", "免责声明", "公告", "jpeg", "jpg", "png", "【", "责任编辑"}
if any([i in para for i in words_list]):
para_id += 1
continue
else:
# 根据前四个字符内容获取索引
a0 = para[:4]
b0 = para[-4:]
# 获取首字符的索引
a0_index = text.find(a0) + (4 - len(a0.strip()))
# print(a0, a0_index, text[a0_index])
# 获取末字尾符的索引
b0_index = text.find(b0) + (4 - len(b0.strip())) + 3
# print(b0, b0_index, text[b0_index])
# 保存索引
para_index = str(a0_index) + "|" + str(b0_index)
else:
para_id += 1
continue
# 打印段落内容
# print(text[a0_index:b0_index + 1])
# 将段落信息加入段落信息列表
list_para.append({
"para_id": str(para_id),
"infoId": infoId,
"para_index": para_index,
"para_content": text[a0_index:b0_index+1],
"contentTypeIds": contentTypeIds,
"topicTypeIds": topicTypeIds
})
# todo: 内部更新段落库id
para_id += 1
return list_para, list_sent
def build_pro_new(infoId, content, contentNames, contentTypeIds, topicNames):
print("=====自动构建程序开始====")
# 初始化段落库id,句子库id
para_id, sent_id = 1, 1
# 若文章id存在则用该文章id,若不存在,则用时间戳来生成文章id
infoId = infoId
# 处理正文
text = content
# 基础段落拆分符号”\n“
para_list = text.split("\n")
# 基础句子拆分符号”!。?“
# sent_list_0 = re.split(r'\s*[。!?]\s*', text)
# print("句子总长度为:{}".format(len(sent_list_0)))
# print("段落总长度为:{}".format(len(para_list)))
# todo: 定义单篇文章的段落信息列表,句子信息列表
list_para, list_sent = [], []
# 获取段落索引
for para in para_list:
para = para.strip("——")
# todo: 先选定范围
count_0 = len(re.findall(r":", para))
count_1 = len(re.findall(r"”", para))
# todo: 先根据特殊符号“:”判断段落中是否出现讲话内容
if "指出:" in para and "”" in para and count_0 == count_1:
pre_index, suf_index = get_index(para)
# print(pre_index, suf_index)
pre_text = para[:pre_index - 1] if pre_index != 0 else ""
# todo: 先处理pre_text
if pre_text:
# 基础句子拆分符号”!。?“
new_sent_list = re.split(r'\s*[。!??]\s*', pre_text)
for new_sent in new_sent_list:
# todo: 句子长度不低于10才进行入库
if len(new_sent.strip()) >= 13:
# 获取首字符的索引
c0_index = text.find(new_sent.strip("——").strip())
c1_index = pre_text.find(new_sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(new_sent.strip("——").strip())
d1_index = c1_index + len(new_sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
elif 10 <= len(new_sent.strip()) < 13:
# 获取首字符的索引
c0_index = text.find(new_sent.strip("——").strip())
c1_index = pre_text.find(new_sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(new_sent.strip("——").strip())
d1_index = c1_index + len(new_sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
else:
continue
# todo: 打印句子内容
# print(text[c0_index:d0_index + 1], pre_text[c1_index:d1_index + 1])
if "。" in pre_text[c1_index:d1_index + 1]:
# 将句子信息加入句子信息列表
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": infoId,
"sent_article_index": sent_article_index,
"sent_para_index": sent_para_index,
"sent_content": pre_text[c1_index:d1_index + 1],
"contentTypeIds": contentTypeIds,
"contentNames": contentNames,
"topicNames": topicNames,
"type": "sen"
})
# todo: 内部更新句子库id
sent_id += 1
# todo: 然后再处理temp_text
temp_text = para[pre_index:suf_index]
# print(temp_text)
# 获取首字符的索引
c00_index = text.find(temp_text.strip())
c11_index = temp_text.find(temp_text.strip())
# 获取末字尾符的索引
d00_index = c00_index + len(temp_text.strip())
d11_index = c11_index + len(temp_text.strip())
# 保存索引
sent0_article_index = str(c00_index) + "|" + str(d00_index)
sent1_para_index = str(c11_index) + "|" + str(d11_index)
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": infoId,
"sent_article_index": sent0_article_index,
"sent_para_index": sent1_para_index,
"sent_content": temp_text,
"contentTypeIds": contentTypeIds,
"contentNames": contentNames,
"topicNames": topicNames,
"type": "sen"
})
# todo: 内部更新句子库id
sent_id += 1
# todo: 最后再处理suffix_text
suffix_text = para[suf_index:] if suf_index != len(para) - 1 else ""
if suffix_text:
# 基础句子拆分符号”!。?“
final_sent_list = re.split(r'\s*[。!??]\s*', suffix_text)
for final_sent in final_sent_list:
# todo: 句子长度不低于10才进行入库
if len(final_sent.strip()) >= 13:
# 获取首字符的索引
c0_index = text.find(final_sent.strip("——").strip())
c1_index = suffix_text.find(final_sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(final_sent.strip("——").strip())
d1_index = c1_index + len(final_sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
elif 10 <= len(final_sent.strip()) < 13:
# 获取首字符的索引
c0_index = text.find(final_sent.strip("——").strip())
c1_index = suffix_text.find(final_sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(final_sent.strip("——").strip())
d1_index = c1_index + len(final_sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
else:
continue
# todo: 打印句子内容
# print(text[c0_index:d0_index + 1], suffix_text[c1_index:d1_index + 1])
if "。" in suffix_text[c1_index:d1_index + 1]:
# 将句子信息加入句子信息列表
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": infoId,
"sent_article_index": sent_article_index,
"sent_para_index": sent_para_index,
"sent_content": suffix_text[c1_index:d1_index + 1],
"contentTypeIds": contentTypeIds,
"contentNames": contentNames,
"topicNames": topicNames,
"type": "sen"
})
# todo: 内部更新句子库id
sent_id += 1
# 最后将段落信息加入段落信息列表
# 获取首字符的索引
a0_index = text.find(para.strip())
# 获取末字尾符的索引
b0_index = a0_index + len(para.strip())
# 保存索引
para_index = str(a0_index) + "|" + str(b0_index)
list_para.append({
"para_id": str(para_id),
"infoId": infoId,
"para_index": para_index,
"para_content": text[a0_index:b0_index],
"contentTypeIds": contentTypeIds,
"contentNames": contentNames,
"topicNames": topicNames,
"type": "par"
})
# todo: 内部更新段落库id
para_id += 1
else:
# 处理正常内容段落
if len(para) >= 50:
# 获取首字符的索引
a0_index = text.find(para.strip())
# 获取末字尾符的索引
b0_index = a0_index + len(para.strip())
# 保存索引
para_index = str(a0_index) + "|" + str(b0_index)
# 获取句子索引, 句子索引有两部分:绝对位置索引,即原文中的位置索引; 相对位置索引,即所在段落中的索引
# print("====句子索引====")
# 基础句子拆分符号”!。?“
sent_list = re.split(r'\s*[。!??]\s*', para)
for sent in sent_list:
# todo: 句子长度不低于10才进行入库
if len(sent.strip()) >= 13:
# 获取首字符的索引
c0_index = text.find(sent.strip("——").strip())
c1_index = para.find(sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(sent.strip("——").strip())
d1_index = c1_index + len(sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
elif 10 <= len(sent.strip()) < 13:
# 获取首字符的索引
c0_index = text.find(sent.strip("——").strip())
c1_index = para.find(sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(sent.strip("——").strip())
d1_index = c1_index + len(sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
else:
continue
# todo: 打印句子内容
# print(text[c0_index:d0_index+1], para[c1_index:d1_index+1])
if "。" in para[c1_index:d1_index+1]:
# 将句子信息加入句子信息列表
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": infoId,
"sent_article_index": sent_article_index,
"sent_para_index": sent_para_index,
"sent_content": para[c1_index:d1_index+1],
"contentTypeIds": contentTypeIds,
"contentNames": contentNames,
"topicNames": topicNames,
"type": "sen"
})
# todo: 内部更新句子库id
sent_id += 1
else:
continue
# 处理小标题段落
elif 10 <= len(para) < 50:
# todo: 根据包含的字词内容来简单过滤脏段落信息
words_list = {"微信", "如需转载", "免责声明", "公告", "jpeg", "jpg", "png", "【", "责任编辑"}
if any([i in para for i in words_list]):
para_id += 1
continue
else:
# 获取首字符的索引
a0_index = text.find(para.strip())
# 获取末字尾符的索引
b0_index = a0_index + len(para)
# 保存索引
para_index = str(a0_index) + "|" + str(b0_index)
# 基础句子拆分符号”!。?“
sent_list = re.split(r'\s*[。!??]\s*', para)
for sent in sent_list:
# todo: 句子长度不低于10才进行入库
if len(sent.strip()) >= 13:
# 获取首字符的索引
c0_index = text.find(sent.strip("——").strip())
c1_index = para.find(sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(sent.strip("——").strip())
d1_index = c1_index + len(sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
elif 10 <= len(sent.strip()) < 13:
# 获取首字符的索引
c0_index = text.find(sent.strip("——").strip())
c1_index = para.find(sent.strip("——").strip())
# 获取末字尾符的索引
d0_index = c0_index + len(sent.strip("——").strip())
d1_index = c1_index + len(sent.strip("——").strip())
# 保存索引
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
else:
continue
# todo: 打印句子内容
# print(text[c0_index:d0_index+1], para[c1_index:d1_index+1])
if "。" in para[c1_index:d1_index + 1]:
# 将句子信息加入句子信息列表
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": infoId,
"sent_article_index": sent_article_index,
"sent_para_index": sent_para_index,
"sent_content": para[c1_index:d1_index + 1],
"contentTypeIds": contentTypeIds,
"contentNames": contentNames,
"topicNames": topicNames,
"type": "sen"
})
# todo: 内部更新句子库id
sent_id += 1
else:
continue
else:
para_id += 1
continue
# 将段落信息加入段落信息列表
list_para.append({
"para_id": str(para_id),
"infoId": infoId,
"para_index": para_index,
"para_content": text[a0_index:b0_index],
"contentTypeIds": contentTypeIds,
"contentNames": contentNames,
"topicNames": topicNames,
"type": "par"
})
# todo: 内部更新段落库id
para_id += 1
return list_para, list_sent
if __name__ == "__main__":
para = """
国家中长期经济社会发展战略若干重大问题——新冠肺炎疫情发生以来,我多次主持召开会议、作出大量指示批示,推动做好疫情防控和经济社会发展工作。同时,我也结合疫情防控对涉及国家中长期经济社会发展的重大问题进行了思考。
当今世界正经历百年未有之大变局,这次疫情也是百年不遇,既是一次危机,也是一次大考。当前,我国疫情防控形势已经越过拐点,但疫情全球大流行仍处在上升期,外部形势非常严峻,我们要切实做好外防输入、内防反弹工作,决不能让疫情卷土重来。同时,我们要举一反三,进行更有长远性的思考,完善战略布局,做到化危为机,实现高质量发展。下面,我着重从发展战略角度讲几个问题。
第一,坚定实施扩大内需战略。构建完整的内需体系,关系我国长远发展和长治久安。改革开放特别是加入世贸组织后,我国加入国际大循环,形成了市场和资源(如矿产资源)“两头在外”、形成“世界工厂”的发展模式,对我国抓住经济全球化机遇、快速提升经济实力、改善人民生活发挥了重要作用。近几年,经济全球化遭遇逆风,这次疫情可能加剧逆全球化趋势,各国内顾倾向明显上升,我国发展面临的外部环境可能出现重大变化。实施扩大内需战略,是当前应对疫情冲击的需要,是保持我国经济长期持续健康发展的需要,也是满足人民日益增长的美好生活的需要。——
大国经济的优势就是内部可循环。我国有14亿人口,人均国内生产总值已经突破1万美元,是全球最大最有潜力的消费市场。居民消费优化升级,同现代科技和生产方式相结合,蕴含着巨大增长空间。我们要牢牢把握扩大内需这一战略基点,使生产、分配、流通、消费各环节更多依托国内市场实现良性循环,明确供给侧结构性改革的战略方向,促进总供给和总需求在更高水平上实现动态平衡。扩大内需和扩大开放并不矛盾。国内循环越顺畅,越能形成对全球资源要素的引力场,越有利于构建以国内大循环为主体、国内国际双循环相互促进的新发展格局,越有利于形成参与国际竞争和合作新优势。
消费是我国经济增长的重要引擎,中等收入群体是消费的重要基础。目前,我国约有4亿中等收入人口,绝对规模世界最大。要把扩大中等收入群体规模作为重要政策目标,优化收入分配结构,健全知识、技术、管理、数据等生产要素由市场评价贡献、按贡献决定报酬的机制。要扩大人力资本投入,使更多普通劳动者通过自身努力进入中等收入群体。
第二,优化和稳定产业链、供应链。产业链、供应链在关键时刻不能掉链子,这是大国经济必须具备的重要特征。这次疫情是一次实战状态下的压力测试。我国完备的产业体系、强大的动员组织和产业转换能力,为疫情防控提供了重要物质保障。我国口罩日产能从1月底1000万只提高到目前的5亿只。同时,疫情冲击也暴露出我国产业链、供应链存在的风险隐患。为保障我国产业安全和国家安全,要着力打造自主可控、安全可靠的产业链、供应链,力争重要产品和供应渠道都至少有一个替代来源,形成必要的产业备份系统。
现在,全国都在复工复产,我们不应该也不可能再简单重复过去的模式,而应该努力重塑新的产业链,全面加大科技创新和进口替代力度,这是深化供给侧结构性改革的重点,也是实现高质量发展的关键。一是要拉长长板,巩固提升优势产业的国际领先地位,锻造一些“杀手锏”技术,持续增强高铁、电力装备、新能源、通信设备等领域的全产业链优势,提升产业质量,拉紧国际产业链对我国的依存关系,形成对外方人为断供的强有力反制和威慑能力。二是要补齐短板,就是要在关系国家安全的领域和节点构建自主可控、安全可靠的国内生产供应体系,在关键时刻可以做到自我循环,确保在极端情况下经济正常运转。
我国线上经济全球领先,在这次疫情防控中发挥了积极作用,线上办公、线上购物、线上教育、线上医疗蓬勃发展并同线下经济深度交融。我们要乘势而上,加快数字经济、数字社会、数字政府建设,推动各领域数字化优化升级,积极参与数字货币、数字税等国际规则制定,塑造新的竞争优势。同时,必须看到,实体经济是基础,各种制造业不能丢,作为14亿人口的大国,粮食和实体产业要以自己为主,这一条绝对不能丢。
国民经济要正常运转,必须增强防灾备灾意识。天有不测风云,人有旦夕祸福。要大力加强防灾备灾体系和能力建设,舍得花钱,舍得下功夫,宁肯十防九空,有些领域要做好应对百年一遇灾害的准备。要坚持两条腿走路,实行中央储备和地方储备相结合,实物储备和产能储备相结合,国家储备和企业商业储备相结合,搞好军民融合储备。要优化应急物资品种和储备布局,要合理确定储备规模,全面加大投资建设力度。——
在这次抗击疫情过程中,国有企业冲在前面,发挥了重要作用,在促进产业循环中也起到了关键作用。国有企业是中国特色社会主义的重要物质基础和政治基础,是党执政兴国的重要支柱和依靠力量,必须做强做优做大。当然,国有企业也要改革优化,但绝对不能否定、绝对不能削弱。要坚持和完善新型举国体制,不断增强领导力、组织力、执行力。
这次疫情防控使我们认识到,必须维护产业链、供应链的全球公共产品属性,坚决反对把产业链、供应链政治化、武器化。在国际经贸谈判中,要推动形成维护全球产业链供应链安全、消除非经济因素干扰的国际共识和准则,力争通过国际合作阻止打击全球产业链、供应链的恶劣行为。
第三,完善城市化战略。我国城市化道路怎么走?这是个重大问题,关键是要把人民生命安全和身体健康作为城市发展的基础目标。目前,我国常住人口城镇化率已经达到60.6%,今后一个时期还会上升。要更好推进以人为核心的城镇化,使城市更健康、更安全、更宜居,成为人民群众高品质生活的空间。
增强中心城市和城市群等经济发展优势区域的经济和人口承载能力,这是符合客观规律的。同时,城市发展不能只考虑规模经济效益,必须把生态和安全放在更加突出的位置,统筹城市布局的经济需要、生活需要、生态需要、安全需要。要坚持以人民为中心的发展思想,坚持从社会全面进步和人的全面发展出发,在生态文明思想和总体国家安全观指导下制定城市发展规划,打造宜居城市、韧性城市、智能城市,建立高质量的城市生态系统和安全系统。
产业和人口向优势区域集中是客观经济规律,但城市单体规模不能无限扩张。目前,我国超大城市(城区常住人口1000万人以上)和特大城市(城区常住人口500万人以上) 人口密度总体偏高,北京、上海主城区密度都在每平方公里2万人以上,东京和纽约只有1.3万人左右。长期来看,全国城市都要根据实际合理控制人口密度,大城市人口平均密度要有控制标准。要建设一批产城融合、职住平衡、生态宜居、交通便利的郊区新城,推动多中心、郊区化发展,有序推动数字城市建设,提高智能管理能力,逐步解决中心城区人口和功能过密问题。
我国各地情况千差万别,要因地制宜推进城市空间布局形态多元化。东部等人口密集地区,要优化城市群内部空间结构,合理控制大城市规模,不能盲目“摊大饼”。要推动城市组团式发展,形成多中心、多层级、多节点的网络型城市群结构。城市之间既要加强互联互通,也要有必要的生态和安全屏障。中西部有条件的省区,要有意识地培育多个中心城市,避免“一市独大”的弊端。我国现有1881个县市,农民到县城买房子、向县城集聚的现象很普遍,要选择一批条件好的县城重点发展,加强政策引导,使之成为扩大内需的重要支撑点。在城市旧城和老旧小区改造,地下管网、停车场建设,托幼、养老、家政、教育、医疗服务等方面都有巨大需求和发展空间。
第四,调整优化科技投入和产出结构。这次疫情防控对我国科技界是一次真刀真枪的检验。科技战线既显了身手,也露了短板。要优化科技资源布局,提升科技创新能力,走出一条符合我国国情的科技研发道路。
科技发展要坚持问题导向、目标导向。保障人民生命安全和身体健康是党和国家的重要任务,科学研究要从中凝练重大科学前沿和重大攻关课题。要更加重视遗传学、基因学、病毒学、流行病学、免疫学等生命科学的基础研究,加快相关药物疫苗的研发和技术创新,高度重视信息和大数据技术在这些领域的应用。要重视顶层设计,优化基础研究布局,做强优势领域,完善高校专业设置,加强基础学科教育和人才培养,补上冷门短板,把我国基础研究体系逐步壮大起来,努力多出“从0到1”的原创性成果。——
在这次疫情防控中,形成了不少产学研相结合的典范,值得认真总结。要创新科技成果转化机制,发挥企业主体作用和政府统筹作用,促进资金、技术、应用、市场等要素对接,努力解决基础研究“最先一公里”和成果转化、市场应用“最后一公里”有机衔接问题,打通产学研创新链、价值链。
第五,实现人与自然和谐共生。我多次强调,人与自然是生命共同体,人类必须尊重自然、顺应自然、保护自然。这次疫情防控使我们更加深切地认识到,生态文明建设是关系中华民族永续发展的千年大计,必须站在人与自然和谐共生的高度来谋划经济社会发展。
恩格斯早就指出:“我们不要过分陶醉于我们人类对自然界的胜利。对于每一次这样的胜利,自然界都对我们进行报复。”第一次工业革命以来,人类利用自然的能力不断提高,但过度开发也导致生物多样性减少,迫使野生动物迁徙,增加野生动物体内病原的扩散传播。新世纪以来,从非典到禽流感、中东呼吸综合征、埃博拉病毒,再到这次新冠肺炎疫情,全球新发传染病频率明显升高。只有更好平衡人与自然的关系,维护生态系统平衡,才能守护人类健康。要深化对人与自然生命共同体的规律性认识,全面加快生态文明建设。生态文明这个旗帜必须高扬。
越来越多的人类活动不断触及自然生态的边界和底线。要为自然守住安全边界和底线,形成人与自然和谐共生的格局。这里既包括有形的边界,也包括无形的边界。要完善国土空间规划,落实好主体功能区战略,明确生态红线,加快形成自然保护地体系,完善生物多样性保护网络,在空间上对经济社会活动进行合理限定。
要增强全民族生态环保意识,鼓励绿色生产和消费,推动形成健康文明生产生活方式。要深入开展爱国卫生运动,倡导健康饮食文化和良好生活习惯,严厉打击非法捕杀和交易野生动物的行为。
第六,加强公共卫生体系建设。我国公共卫生事业取得了举世公认的成就,但这次疫情防控也反映出我国公共卫生领域还存在明显短板,需要加快补上。
要从顶层设计上提高公共卫生体系在国家治理体系中的地位,充实中央、省、市、县四级公共卫生机构,加强专业人才培养和队伍建设,提高履职尽责能力。要改善城乡公共卫生环境,加强农村、社区等基层防控和公共卫生服务。要加强公共卫生机构、医院感染病科、生物实验室等的规划建设,做好敏感医疗和实验数据管理。要加强卫生健康教育和科学知识普及,提高群众公共卫生素养。在这次疫情防控中,中医发挥了重要作用,要及时总结经验,加强科学论证,大力发展中医药事业,加强中西医结合,不断提高能力和水平。
历次抗击重大传染病疫情的实践表明,必须加快形成从下到上早发现、早预警、早应对的体系,努力把疫情控制在萌芽状态。要把增强早期监测预警能力作为健全公共卫生体系的重中之重,完善公共卫生应急管理体系。要加强疾控、医院、科研单位间的信息共享,增强各类已知和新发传染病预警能力。
这是习近平总书记2020年4月10日在中央财经委员会第七次会议上的讲话。
"""
# todo: 先选定范围
# count_0 = len(re.findall(r":", para))
# count_1 = len(re.findall(r"“", para))
# if count_0 == count_1:
# print("kaishi")
# else:
# print("stop")
infoId = "123456"
contentTypeIds = "444444"
topicNames = "数字化转型"
contentNames = "领导讲话"
# content = """强化创新引领 加快“三个转变” 更好推动中国制造高质量发展
# 国资委党委委员、副主任 翁杰明
#
#
# 制造业是立国之本、强国之基,以习近平同志为核心的党中央高度重视制造强国建设。2014年5月10日,习近平总书记在中铁装备视察时首次提出,推动中国制造向中国创造转变、中国速度向中国质量转变、中国产品向中国品牌转变。习近平总书记关于“三个转变”的重要指示为中国制造高质量发展指明了方向、提供了根本遵循,国务院国资委和中央企业深入学习领会、坚决贯彻落实。近年来,国务院国资委专门出台质量品牌工作系列文件,强化考核引导激励,建立长效机制,搭建了“中国品牌论坛”、“数字中国建设峰会”、“双创”示范基地等一系列助力企业高质量发展的专业化平台,引导支持中央企业积极打造原创技术“策源地”和现代产业链“链长”,有效推动中央企业激发创新潜力、增强发展动力,在高端装备制造领域取得一系列突破性、标志性重大成果,一大批“国之重器”横空出世。无论是代表国家实力的天宫探梦、嫦娥奔月、北斗导航,还是捍卫国家主权的航空母舰、东风导弹、歼20、运20,无论是享誉“一带一路”的中国桥、国路、中国港,还是成为中国名片的高速铁路、华龙一号、5G通信网络,中央企业都发挥了重要作用,彰显了大国重器的责任担当。
#
# 当今世界正经历百年未有之大变局,新一轮科技革命和产业变革深入发展。党的十九届五中全会立足新发展阶段,作出建设制造强国、质量强国、网络强国、数字中国的战略部署。国务院国资委和中央企业将胸怀两个大局、心系“国之大者”,把握新发展阶段,贯彻新发展理念,构建新发展格局,加快实现从中国制造向中国创造、中国速度向中国质量、中国产品向中国品牌的转变,在推动中国制造高质量发展、建设制造强国中发挥主力军和排头兵作用。
#
# 着力做强主业实业,筑牢“三个转变”产业根基
#
#
# 坚持突出主业、做强实业,将更多资源和更大精力投入实体经济特别是制造业,坚决做实业报国的“耐心资本”。大力发展实体经济,推进国有经济布局优化和结构调整,紧紧围绕战略安全、产业引领、国计民生、公共服务等功能,更加突出主业、实业和核心竞争力标准,推动各类要素向主业实业集中,着力打造一批自主可控安全可靠的产业链,培育一批具有产业发展主导力的产业链“链长”企业,形成一批各具特色的产业集团,不断提升整体竞争力和系统稳定性。促进国民经济循环,在提升供给体系对国内需求的适配性上发挥主导作用,优化供给结构、改善供给质量,加强现代流通体系建设,促进国内供给、需求在更高层次、更高水平上实现动态平衡。高质量共建“一带一路”,通过对外合作提升关键技术、优化全球布局、打造国际品牌,增强全球价值链掌控力。提升发展质量效益,加强国有资本运作和价值管理,促进国有资本在有序流动中提升价值、提高收益。坚持以出资人为主导,推动中央企业深化战略性重组、专业化整合,确保中央企业真正成为具有核心竞争力的、对实体经济能够发挥引领带动作用的、名副其实的国家队。
#
# 深化国有企业改革,激发“三个转变”强大动力
#
#
# 实现“三个转变”需要好的制度机制作保障,深入推进国企改革三年行动要朝着实现“三个转变”聚焦发力。加快完善中国特色现代企业制度,全面落实“两个一以贯之”要求,把加强党的领导和完善公司治理统一起来,形成适应市场化、国际化、法治化新形势的现代治理模式,强化实现“三个转变”制度优势。深化混合所有制改革,积极引入战略投资者,支持引导战略投资者参与公司治理,促进各类所有制企业在“三个转变”中取长补短、共同发展。健全市场化经营机制,推动建立职业经理人制度,完善市场化薪酬分配机制,深化实施“科改示范行动”等专项工程,让广大经营管理人才、科技人才和工匠标兵在推进“三个转变”中大展身手、建功立业,成为实现“三个转变”最富创造力的主力军。
#
# 推进科技自立自强,夯实“三个转变”战略支撑
#
#
# 围绕推动高质量发展、构建新发展格局,坚持把科技创新作为“头号任务”,大力实施创新驱动发展战略,持续加大研发投入力度,优化研发支出结构,不断强化国家战略科技力量。努力打造科技攻关重地,积极与国家攻关计划对接,针对工业母机、高端芯片、基础软件、新材料、大飞机、发动机等产业薄弱环节,联合行业上下游、产学研力量开展协同攻关,发挥创新联合体优势作用,进一步把最优秀的人才、最急需的资源、最先进的设备集中配置到攻关任务上来,在解决“卡脖子”问题上实现更多更大突破。努力打造原创技术“策源地”,积极融入国家基础研究创新体系,主动承担重大项目,进一步加大原创技术研发投入,在信息、生物、能源、材料等方向,加快布局一批基础应用技术;在人工智能、空天技术、装备制造等方面,加快突破一批前沿技术;在电力装备、通信设备、高铁、核电、新能源等领域,加快锻造一批长板技术,不断增强行业共性技术供给。努力打造科技创新“特区”,深入落实激励机制,坚持特殊人才特殊激励,对重点科研团队一律实行工资总额单列,对科技人才实施股权和分红激励等中长期激励政策,建设更多高水平研发平台和新型研发机构,赋予科研人员更大自主权、给予更大容错空间,在中央企业形成创新创业浓厚氛围。
#
# 大力发展智能制造,探索“三个转变”重要途径
#
#
# 牢牢把握制造业转型升级的内在需求和创新发展的重大机遇,在新一轮科技革命和产业变革浪潮中发挥龙头作用。推动传统产业转型升级,促进传统产业高端化、智能化、绿色化,推广先进适用技术,聚焦智慧能源、智能交通和智能建造等领域,扩大制造业设备更新和技术改造投资,打造一批数字化转型标杆企业。推进新兴产业发展壮大,适应数字产业化、产业数字化要求,持续推进5G网络、数据中心、物联网、卫星互联网等新型基础设施建设,加快培育一批云计算、大数据、集成电路、人工智能等领军企业,深入推进数字经济、智能制造、生命健康、新材料等战略性新兴产业发展,打造未来发展新优势。促进产业链供应链有效协同,坚持“两个毫不动摇”,充分发挥中央企业示范带动作用和产业引领功能,牵头组织产业联盟,加强与产业链相关企业的协调合作,提高产业链协作效率,促进深度交流、资源共享、优势互补,打造发展融合、利益共享的良好生态,形成产业链有序竞合新格局。
#
# 培育世界一流企业,做强“三个转变”引领力量
#
#
# 大企业在引领科技创新、支撑产业发展上具有无可替代的重要作用,必须加快建设世界一流企业。夯实一流的管理基础,加快推进对标世界一流管理提升行动,对标世界一流补短板、强弱项,不断提升核心竞争力,广泛运用先进管理方法和管理手段,在战略管理、组织管理等重点领域全面发力。加强企业管理体系和管理能力建设,努力提高产品和服务质量,加快形成有利于高质量发展的指标体系、标准体系、绩效评价,不断夯实制造业高质量发展的基础。建设一流的人才队伍,大力弘扬企业家精神,培养更多具有全球战略眼光、市场开拓精神、管理创新能力和社会责任感的企业家,打造高素质专业化的科技、管理、技能人才队伍,特别是加快培养一批急需紧缺的科技领军人才和高水平创新团队,为中国制造高质量发展提供坚实人才保障。打造一流的企业品牌,鼓励企业加强品牌战略研究和全面质量管理,将品牌意识融入企业生产经营全过程,以质量创品牌,以服务优品牌,讲好品牌故事,塑造品牌形象,提升品牌价值,打造更多具有国际影响力的知名品牌。
#
# 深入贯彻落实习近平总书记关于“三个转变”重要指示精神,大力发展实体经济特别是加快振兴我国装备制造业,是各类所有制企业的共同使命和重大责任。我们将更加紧密地团结在以习近平同志为核心的党中央周围,大力推进制造业创新发展、转型升级、提质增效,推进制造强国建设、塑造我国国际合作和竞争新优势,为全面建设社会主义现代化国家作出更大贡献。
#
# 【责任编辑:王占朝】
# """
list_para, list_sent = build_pro_new(infoId, para, contentNames, contentTypeIds, topicNames)
print(list_para)
print(list_sent)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : search_by_dot_matrix.py
# @Time : 2022/6/21 15:44
# @Author : bruxelles_li
# @Software: PyCharm
import pandas as pd
# text = """  巧用 5 日线把握强龙买卖机会——一个完整的交易,是买入持有卖出,所以今天我们重点来看持有和卖出节奏——今天分享一个大家都在用,简单有效,可以很好把握翻倍强龙机会的买卖技巧——5 日均线交易法。\n\n  该方法核心是 28 字真言:5 日均线分强弱,——不破 5 日继续做;——跌破 5 日减半仓,——三日不回全抛光。\n\n  我们一个个来看,首先是5 日均线分强弱——简单的说,判断一只个股是否强势,核心看 5 日均线。这条线代表主力短线意图,——如果主力进攻意愿强,5 日线通常是快速上涨;而主力如果有撤退迹象,5 日线——往往也首先拐头向下,正因为这种技术特性,5 日线是我们判断个股买卖时机的——好方法。\n\n  持有:不破 5 日继续做——强势股波动巨大,这意味着大机会,同时也包含风险。那通过弱转强信号进场后,——可以持有多久呢?——这就要看 5 日线,如果行情回踩 5 日线时,没有效跌破,那就可以继续持有。\n\n  有效跌破是指在收盘时,股价依然在 5 日线下方。如果盘中跌破 5 日线,临近——收盘时,股价重回 5 日线上方,则不算有效跌破。\n\n  还有一点,一般考虑买入时,5 日线应当是上涨状态,如果是下跌状态,则不适——用。\n\n  基建强龙浙江建投完美的诠释不破 5 日继续做的方针,在股价处在上涨通道——内,踩着 5 日线主升,但从不破 5 日线,可以长期持有。\n\n  减仓时机:跌破 5 日减半仓——前面说了不破 5 日线是买入时机,那要是有效跌破 5 日线呢?这意味风险来临——我们要适当减半仓,锁定利润了。\n\n  很多朋友会有赚钱了,舍不得卖的心理。这里我们要理性,不能光看机会,不看——风险。\n\n  如果行情跌破 5 日线,第二天很快反抽回来,我们可以灵活把减掉的仓位加回来。\n\n  下面是近期的新冠概念强势股雅本化学,可以很直观的感受到以 5 日线为基——准,灵活加减仓是多么重要。\n\n  清仓时机:三日不回全抛光——这里就很直白了,如果行情连续三个交易日在 5 日线下方,那就应该严格清仓。\n\n  不要有侥幸心理,就算偶尔行情会回来,可一旦遭遇大跌,就会非常受伤。我们——追求的应该是用科学的方法实现小亏大赚,而不是赌运气。\n\n  比如下面这只亚太药业,曾经风靡一时的幽门螺旋杆菌概念龙头,不少朋友可能——会追涨杀入,但是最后行情连续整理,连续三天都在 5 日线下方,这个时候就应——该清仓;哪怕轻套,也该认错,否者马上就差点跌停,那轻套就变成深套了。\n\n  以上就是5 日均线交易法的全部内容,这个方法就是特别简单,而且实用(买——卖都能用),我们花点时间掌握,对于强龙机会的把握有很大助益。
# """
# str = "757|806"
# prefix_index = int(str.split("|")[0])
# suffix_index = int(str.split("|")[1])
# print(text[prefix_index:suffix_index+1])
# todo: 定义句子库内容
sent_df = pd.read_excel('测试文件/句子库测试样例.xlsx', keep_default_na=False).astype(str)
_id2sentcont = {row['id']: row['content'] for idx6, row in sent_df.iterrows()}
_id2sent_articleid = {row['id']: row['article_id'] for idx7, row in sent_df.iterrows()}
_id2sent_paraid = {row['id']: row['paragraph_id'] for idx8, row in sent_df.iterrows()}
_id2sent_paraindex = {row['id']: row['sent_para_index'] for idx9, row in sent_df.iterrows()}
_id2sent_articleindex = {row['id']: row['sent_article_index'] for idx10, row in sent_df.iterrows()}
_id2sent_topic_type = {row['id']: row['topic_type'] for idx11, row in sent_df.iterrows()}
_id2sent_content_type_name = {row['id']: row['content_type_name'] for idx12, row in sent_df.iterrows()}
article_id = '1670829370466076'
sentence_id = "18"
a = sent_df.loc[(sent_df['article_id'] == article_id) & (sent_df['sentence_id'] == str(int(sentence_id)-1))]
if a.empty:
pre_sent = ""
else:
dict_pre = a.to_dict()
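# 把单行 DataFrame 转成按行组织的字典列表,取第一行的 content 作为前一句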
new_dict_pre = [dict(zip(dict_pre, values)) for values in zip(*[dict_pre[k].values() for k in dict_pre])]
pre_sent = new_dict_pre[0]["content"]
b = sent_df.loc[(sent_df["article_id"] == article_id) & (sent_df["sentence_id"] == str(int(sentence_id)+1))]
if b.empty:
suf_sent = ""
else:
dict_suf = b.to_dict()
new_dict_suf = [dict(zip(dict_suf, values)) for values in zip(*[dict_suf[k].values() for k in dict_suf])]
suf_sent = new_dict_suf[0]["content"]
print(pre_sent, suf_sent)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 缓存处理.py
# @Time : 2022/12/13 14:26
# @Author : bruxelles_li
# @Software: PyCharm
"""
基于内存缓存
使用 memory_cache 实例即可
"""
import shutil
import time
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
class Value:
def __init__(self, value, put_time, expired):
"""
缓存值对象
:param value: 具体的值
:param put_time: 放入缓存的时间
:param expired: 缓存失效时间
"""
self.value = value
self.put_time = put_time
self.expired = expired
def __str__(self):
return f"value: {self.value} put_time: {self.put_time} expired: {self.expired}"
class MemoryCache:
def __init__(self):
self.__cache = {}
def set_value(self, k, v, expired):
"""
将值放入缓存中
:param k: 缓存的 key
:param v: 缓存值
:param expired: 缓存失效时间,单位秒(s)
"""
current_timestamp = int(time.time()) # 获取当前时间戳 10 位 秒级
value = Value(v, current_timestamp, expired)
self.__cache[k] = value
logger.info("已放入缓存, {}: {}".format(k, value))
def check_key(self, k):
"""
检查缓存是否可用
:param k: 缓存 key
:return: True or False
"""
current_timestamp = int(time.time())
value = self.__cache.get(k, None)
# 考虑k不存在的情况
if value is None:
return False
differ = current_timestamp - value.put_time
if differ > value.expired:
del self.__cache[k] # 证明缓存失效了,删除键值对
logger.info("缓存已失效, key: {}".format(k))
return False
return True
def get_value(self, k):
"""
通过缓存key获取值
:param k: key
:return: value
"""
if self.check_key(k):
return self.__cache[k].value
return None
memory_cache = MemoryCache()
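# 使用示意(假定的最小示例,key 仅作演示):
# memory_cache.set_value('search_result:123', {'status': 'ok'}, 60)  # 缓存60秒
# if memory_cache.check_key('search_result:123'):
#     print(memory_cache.get_value('search_result:123'))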
if __name__ == "__main__":
import json, os
# memory_cache.set_value('my_blog', 'sunnyc.icu', 3) # 设置一个 3 秒过期的键值对
# memory_cache.set_value('my_github', 'hczs', 20) # 设置一个 6 秒过期的键值对
dic_result = {
"my_blog": 'sunnyc.icu',
"my_github": "hczs"
}
json_result = json.dumps(dic_result)
path = "测试文件/test.json"
with open(path, 'w', encoding='utf-8') as file:
file.write(json_result)
# os.remove(path)
if os.path.isfile(path):
with open("测试文件/test.json", 'r', encoding='utf-8') as f:
dict_result = json.load(f)
print(dict_result)
else:
print("no")
# time.sleep(5)
# a = "my_blog"
# if a == "my_blog":
# print(memory_cache.get_value(a))
# else:
# print('my_github: ', memory_cache.get_value('my_github'))