Commit e20df706  Author: bruxellse_li

2023 intelligent writing search code commit

Parent b9ddb61f
# Default ignored files
/shelf/
/workspace.xml
# Data source local storage ignored files
/../../../:\AI-Report\.idea/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="root@114.115.151.101:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.49.86:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.54.108:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (2)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (3)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (4)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (5)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="zzsn@192.168.1.149:22 password (6)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="21">
<item index="0" class="java.lang.String" itemvalue="protobuf" />
<item index="1" class="java.lang.String" itemvalue="Levenshtein" />
<item index="2" class="java.lang.String" itemvalue="xlrd" />
<item index="3" class="java.lang.String" itemvalue="python_Levenshtein" />
<item index="4" class="java.lang.String" itemvalue="pdfminer.six" />
<item index="5" class="java.lang.String" itemvalue="Camelot" />
<item index="6" class="java.lang.String" itemvalue="camelot-py" />
<item index="7" class="java.lang.String" itemvalue="tqdm" />
<item index="8" class="java.lang.String" itemvalue="jieba" />
<item index="9" class="java.lang.String" itemvalue="flask" />
<item index="10" class="java.lang.String" itemvalue="bert_serving" />
<item index="11" class="java.lang.String" itemvalue="setuptools" />
<item index="12" class="java.lang.String" itemvalue="pandas" />
<item index="13" class="java.lang.String" itemvalue="certifi" />
<item index="14" class="java.lang.String" itemvalue="typing_extensions" />
<item index="15" class="java.lang.String" itemvalue="charset-normalizer" />
<item index="16" class="java.lang.String" itemvalue="numpy" />
<item index="17" class="java.lang.String" itemvalue="pytz" />
<item index="18" class="java.lang.String" itemvalue="urllib3" />
<item index="19" class="java.lang.String" itemvalue="idna" />
<item index="20" class="java.lang.String" itemvalue="scikit_learn" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/AI-Report.iml" filepath="$PROJECT_DIR$/.idea/AI-Report.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : analysis_model.py
# @Time : 2022/1/12 10:32
# @Author : bruxelles_li
# @Software: PyCharm
"""
提取内容中
时间: [0-9]{1,4}年[0-9]{1,2}月[0-9]{1,2}日
标题: 第一个换行符的内容
内容: 最后一个换行符前的内容
作者: 最后三行包含的人名
来源: 最后三行包含的机构名
"""
import re
from datetime import datetime
from lac_model import lac_organize_name, lac_username
# Pattern for extracting dates
time_pattern = re.compile(r"[0-9]{1,4}年[0-9]{1,2}月[0-9]{1,2}日")
# Main extraction function
def analysis_function(content):
# Split the content on line breaks first
para_list = content.split("\n")
title = para_list[0]
match_content = "\n".join(para_list[1:]).strip("\n")
temp_content = "\n".join(para_list[-3:]).strip("\n")
# todo: extract the publishing source and the author from the last three lines
name_list = lac_username(temp_content)
organize_list = lac_organize_name(temp_content)
author = name_list[0] if name_list else ""
origin = organize_list[0] if organize_list else ""
# todo: extract the publish time
time_list = time_pattern.findall(match_content)
if time_list:
temp_articleTime = time_list[0]
date_list = re.split("[年月日]", temp_articleTime)[:-1]
date_list_1 = ["0"+d if len(d) == 1 else d for d in date_list]
date_format = "-".join(date_list_1)
articleTime = date_format + " 00:00:00"
else:
articleTime = ""
return title, author, origin, articleTime, match_content
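# Worked example of the normalization above (matches the code, shown for clarity):
# "2022年12月1日" is matched by time_pattern and becomes "2022-12-01 00:00:00"
# (single-digit month/day values are zero-padded).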
if __name__ == "__main__":
text = """建筑垃圾如何“变废为宝”
   建筑垃圾是众多垃圾中较难处理的一类,存在体积大、回收处理2022年12月1日号难等问题。这些建筑物、构筑物、管网以及房屋装饰产生的弃土、弃料等,如果不经过妥善的处理,不仅是对资源的浪费,还会污染水、土壤和空气。
        传统的建筑垃圾处理方法有哪些不足?
        传统的建筑垃圾处理方法主要有堆放、填埋、回填等,但这些处理方式会对环境造成极大的影响,如土壤污染、地下水污染、大气污染等。建筑垃圾堆放和填埋需要耗用大量的土地,释放的有毒有害物质会改变土壤的物理结构和化学性质,造成土壤污染;被污染的土壤由雨水冲刷会形成渗滤液进入水体中,容易引起地下水和地表水污染;露天堆放的建筑垃圾更是容易引起扬尘,渣土车运输过程中排放的大量尾气和道路遗撒引起的扬尘又加重了大气污染。
        正确处理建筑垃圾是什么样的?
        近年来,建筑垃圾再循环利用成为一个新尝试,经过加工处理让它们实现二次利用。如金属废料、废钢筋、废铁丝、废电线等经过分拣、回炉热加工可制成钢材等金属制品;废混凝土经过粉碎、筛选等过程可制成再生骨料用于公路路基、墙体的填充材料,也可生产多孔砖、空心板、空心砌块、水泥原料等产品;废木材可用于制造合成板材;沥青经过重新回收可再生成沥青。
        混凝土是目前建筑垃圾中回收价值较高的部分,废弃混凝土生成的再生骨料由于强度高、生产成本低,颇受市场青睐,再生骨料按照一定级配搅拌和碾压后具有较高地基承载力,可直接应用于软弱地基、竖井回填、路基垫层、水处理、场地抑尘等工程;还可以部分或者全部替代天然骨料,生产再生无机混合料、再生砖、再生混凝土等产品。
        要想实现建筑垃圾的环保化,也要做到源头上减少建筑垃圾产生量,大力开发和推广节能降耗的建筑新技术和新工艺,采用尽量少产生建筑垃圾的结构设计;加大对建筑垃圾综合利用的投入,限制天然骨料、石料的使用量,出台相应的优惠政策,鼓励使用再生材料、替代材料及易回收材料等,从源头上最大限度减少建筑垃圾的产生。同时,有关部门可以大力推广再生产品应用,促进循环利用,号召市政、园林、交通、水务等工程率先选用建筑废弃物再生产品,鼓励社会投资工程使用建筑废弃物再生产品。
        通过固废资源的循环利用,可以大幅提升建筑垃圾的利用率,有效缓解建筑垃圾的运输和空间存储问题,具备较高的灵活性,低碳节能,贴近近乎零污染、零排放的理想环保要求,为国家实现“双碳”目标作出贡献。
中国科协科普部
新华网
联合出品"""
print(analysis_function(text))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : app_config
# @Author : LiuYan&bruxelles_li
# @Time : 2021/4/22 10:51
import os
import multiprocessing
from pathlib import Path
bind = '0.0.0.0:4002'  # IP and port to bind
backlog = 512  # listen queue size
# chdir = '/home/zzsn/liuyan/bin'  # working directory gunicorn should switch to
timeout = 600  # timeout -> currently set to accommodate the ZZSN_NLP platform's Belt-and-Road element extraction (file) requirement; timeout temporarily relaxed
# worker_class = 'gevent'  # use gevent mode; sync mode is also available and is the default
# workers = multiprocessing.cpu_count()  # number of worker processes (12)
workers = 1  # low-resource 13 GB server; set this to 1 if the load is too high
threads = 50  # number of threads per worker process
loglevel = 'error'  # log level; this applies only to the error log, the access log level cannot be set
access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"' # gunicorn access log format; the error log format cannot be set
"""
The meaning of each placeholder:
h remote address
l '-'
u currently '-', may be user name in future releases
t date of the request
r status line (e.g. ``GET / HTTP/1.1``)
s status
b response length or '-'
f referer
a user agent
T request time in seconds
D request time in microseconds
L request time in decimal seconds
p process ID
"""
_tmp_path = os.path.dirname(os.path.abspath(__file__))
_tmp_path = os.path.join(_tmp_path, 'log')
Path(_tmp_path).mkdir(parents=True, exist_ok=True)
accesslog = os.path.join(_tmp_path, 'gunicorn_access.log')  # access log file
errorlog = os.path.join(_tmp_path, 'gunicorn_error.log')  # error log file
# gunicorn -c app_config.py app_run:app -D --daemon
# -*- coding: utf-8 -*-
# @Time : 2022/9/22 11:08
# @Author : ctt
# @File : data_building
# @Project : 研究中心知识图谱
import mysql.connector
import pandas as pd
import logging
from snow_id import Snow
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
class Data:
def __init__(self):
pass
def initialize(self):
self.database = self.connect()
self.entity_df = self.get_data_base_entity()
print('==========数据获取完毕=============')
self.database.close()
def connect(self):
'''
Connect to the database
:return:
'''
database = mysql.connector.connect(
host='114.115.159.144',
user='root',
passwd='zzsn9988',
database='clb_project',
auth_plugin='mysql_native_password'
)
return database
def get_article_data(self, database_conntect):
# country_sql = '''SELECT id, title, content, origin, publish_date FROM core_base_data WHERE status != 1 and status != 2 and publish_date >= '2022-03-01';'''
# country_sql = '''select id, title, content, origin, publish_date from core_base_data WHERE id not in (SELECT DISTINCT bid FROM core_base_data_entity)'''
# country_sql = '''select id, article_title, content, origin, article_time from ai_report_material WHERE id ='1670667630665899' '''
country_sql = '''select id, article_title, content, origin, article_time from ai_report_material WHERE type='par' '''
cursor = database_conntect.cursor()
cursor.execute(country_sql)
data_table = cursor.fetchall()
# columns = [_[0] for _ in cursor.description]
article_df = pd.DataFrame(data_table, columns=['id', 'article_title', 'content', 'origin', 'article_time'], dtype=str)
cursor.close()
return article_df
def get_data_base_entity(self):
country_sql = '''SELECT id, compound_word,label_uuid FROM graph_entity;'''
cursor = self.database.cursor()
cursor.execute(country_sql)
entity_base_table = cursor.fetchall()
# columns = [_[0] for _ in cursor.description]
entity_df = pd.DataFrame(entity_base_table, columns=['id', 'compound_word', 'label_uuid'], dtype=str)
cursor.close()
return entity_df
def insert_entity(self, bid, eids):
database_conntect = self.connect()
insert_data = []
for eid in eids:
id = Snow.get_guid()
logger.info(id)
insert_data.append((id, bid, eid))
sql = 'insert ignore into core_base_data_entity (id, bid, eid, status) values (%s, %s, %s, 0)'
cursor = database_conntect.cursor()
cursor.executemany(sql, insert_data)
database_conntect.commit()
cursor.close()
def insert_relation(self, relation_id, source_id, target_id, bid):
database_conntect = self.connect()
id = Snow.get_guid()
print((id, relation_id, source_id, target_id, bid, 0))
# Use parameterized execution instead of string formatting so string ids are quoted correctly
sql = 'insert into graph_entity_entity (id, relation_id, source_id, target_id, bid, status) ' \
'values (%s, %s, %s, %s, %s, 0)'
logger.info(sql)
cursor = database_conntect.cursor()
cursor.execute(sql, (id, relation_id, source_id, target_id, bid))
database_conntect.commit()
cursor.close()
data = Data()
database_conntect = data.connect()
article_df = data.get_article_data(database_conntect)
print(article_df)
print(len(article_df))
database_conntect.close()
# article_df.to_excel(r'数据2022-03.xlsx', index=False)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : encode_sentence.py
# @Time : 2022/12/12 19:23
# @Author : bruxelles_li
# @Software: PyCharm
"""
pip install bert-serving-server && pip install bert-serving-client
"""
import pandas as pd
import numpy as np
from bert_serving.client import BertClient
from tqdm import tqdm
from numpy import *
bc = BertClient('114.116.49.86', check_length=False)
# df = pd.read_excel('素材库/句子库/入库_sent.xlsx', keep_default_na=False).astype(str)
# # df = pd.read_excel("素材库/段落库/去重后_para.xlsx", keep_default_na=False).astype(str)
# length = len(df)
# print(length)
vector_path = "素材库/句子库.txt"
np_path = "素材库/句子库.npy"
def encode_sentences(df, path):
f = df
"""
for line in f:
result.append(line.strip('\n'))
"""
with open(path, 'w', encoding='utf-8') as f_vectors:
for idx, row in tqdm(f.iterrows()):
sentence = row['content']
vector = bc.encode([sentence])
f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
return None
def save_file(length, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
if __name__ == "__main__":
# text = "nihao"
# print(bc.encode([text]))
# encode_sentences(df, vector_path)
length = 333350
save_file(length, np_path)
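# Minimal retrieval sketch (an assumption about downstream use, not part of this script):
# each row of the saved matrix is [id, 768-dim BERT vector], which is why it has 769 columns.
# A query vector can then be scored against every row with cosine similarity, e.g.:
# matrix = np.load(np_path)
# ids, vectors = matrix[:, 0], matrix[:, 1:]
# query = bc.encode(["建筑垃圾如何处理"])[0]  # 768-dim query vector
# scores = vectors @ query / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query))
# print(ids[np.argsort(-scores)[:10]])  # ids of the ten most similar sentences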
# -*- coding: utf-8 -*-
# Intelligent collection request
# 1. Considered: when requesting intelligent collection, stop using the entity class
#    a. Still used: pass the HTML source file directly in the raw HTTP request body, and pass lang-code / link-text as query parameters
#    b. Reason: this is inconvenient to test in Postman, where a pasted HTML source file cannot be used
# 2. Not in question: using the entity class, whose benefits outweigh its drawbacks
#    a. An entity class makes it easy to extend the parameter fields
#    b. It also makes the API docs easy to present: calling json_parameter_utility.get_json_parameters can display the request entity class
class ExtractionRequest:
# Language code
# 1. Needed when collecting non-Chinese articles
lang_code = ""
# Link text
# 1. Used to extract the title; without it, title accuracy drops
link_text = ""
# Article page source
# 1. Used to extract the title, publish time, content, etc.
article_html = ""
@staticmethod
def from_dict(dictionary: dict):
extraction_request = ExtractionRequest()
# Alternative approach:
# 1. Update the internal __dict__ object from the dictionary
# extraction_request.__dict__.update(dictionary)
# Set the dictionary values onto the current object
for key in dictionary:
setattr(extraction_request, key, dictionary[key])
return extraction_request
def to_dict(self):
# Convert to a dictionary:
# 1. This method is needed when serializing to JSON
# 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# Use the internal __dict__ object
# 1. Copy the internal __dict__ into the new dictionary
data.update(self.__dict__)
return data
# Collection result
class ExtractionResult:
# Title
title = ""
# Publish date
publish_date = ""
# Body text (keeps all HTML tags, e.g. br, img)
text = ""
# URL
url = ""
# Summary
meta_description = ""
# Clean body text (no HTML)
cleaned_text = ""
# Source (currently only supported when collecting the "source" field of Chinese websites)
# source = ""
# Top image (top_image: never returns any content, no longer used)
# top_image = ""
def to_dict(self):
# Convert to a dictionary:
# 1. This method is needed when serializing to JSON
# 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# Use the internal __dict__ object
# 1. Copy the internal __dict__ into the new dictionary
data.update(self.__dict__)
return data
class UrlPickingRequest:
# Response URL of the list page
# 1. Used as the Base URL for joining extracted relative URLs
# 2. The Base URL must be the response URL
# 3. Example: in Python, after requesting the URL with requests.get(url), use resp.url as the Base URL
list_page_resp_url = ""
# List page source
# 1. Used to extract article URLs
list_page_html = ""
@staticmethod
def from_dict(dictionary: dict):
url_picking_request = UrlPickingRequest()
# Set the dictionary values onto the current object
for key in dictionary:
setattr(url_picking_request, key, dictionary[key])
return url_picking_request
def to_dict(self):
# Convert to a dictionary:
# 1. This method is needed when serializing to JSON
# 2. To a JSON string: json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# Use the internal __dict__ object
# 1. Copy the internal __dict__ into the new dictionary
data.update(self.__dict__)
return data
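# Minimal usage sketch (hypothetical values, for illustration only): build a request from a
# dict and serialize a result with json.dumps(..., default=ExtractionResult.to_dict), as the
# to_dict comments above describe.
if __name__ == "__main__":
    import json
    _req = ExtractionRequest.from_dict({"lang_code": "zh", "link_text": "example title", "article_html": "<html>...</html>"})
    _result = ExtractionResult()
    _result.title = _req.link_text
    print(json.dumps(_result, default=ExtractionResult.to_dict, ensure_ascii=False))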
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : es_byid.py
# @Time : 2022/12/27 13:41
# @Author : bruxelles_li
# @Software: PyCharm
import requests
import json
import pandas as pd
# todo: query the ES endpoint for the target sentence records by unique id
def find_sent_info(_id: list):
size = len(_id)
"""
:param _id: "1670844082074304"
:return:
"""
url = "http://114.115.215.250:9700/ai_report_material/_search"
# todo: 传入list_id
payload = json.dumps({
"query": {
"bool": {
"must": [
{
"terms": {
"id": _id
}
}
]
}
},
"track_total_hits": True,
"size": size
})
headers = {
'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
result_data = response.text.encode("utf-8")
obj_result = json.loads(result_data)
# todo: parse out the target data; the result is a list
hits_obj = obj_result["hits"]["hits"]
_source_list = [hits_obj[i]["_source"] for i in range(0, size)]
df = pd.DataFrame(_source_list)
df = df[["id", "content", "paragraphId", "sentParaIndex", "sentArticleIndex", "topicType", "contentTypeName",
"articleId", "sentenceId"]]
return df
# if hits_obj:
# # todo:当前list长度为1,取第一个元素中即为目标数据存在范围
# temp_result = hits_obj[0]["_source"]
# # todo: 此时带查询数据内容为dict对象
# # print(temp_result)
#
# else:
# temp_result = ""
# return temp_result
# return hits_obj
# return _source_list
# todo: query the ES endpoint for the target paragraph records by unique id
def find_para_info(_id: list):
size = len(_id)
"""
:param _id: "1670844082074304"
:return:
"""
url = "http://114.115.215.250:9700/ai_report_material/_search"
# todo: 传入list_id
payload = json.dumps({
"query": {
"bool": {
"must": [
{
"terms": {
"id": _id
}
}
]
}
},
"track_total_hits": True,
"size": size
})
headers = {
'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
result_data = response.text.encode("utf-8")
obj_result = json.loads(result_data)
# todo: parse out the target data; the result is a list
hits_obj = obj_result["hits"]["hits"]
_source_list = [hits_obj[i]["_source"] for i in range(0, size)]
df = pd.DataFrame(_source_list)
df = df[["id", "content", "paragraphId", "paraArticleIndex", "topicType", "contentTypeName",
"articleId"]]
return df
# if hits_obj:
# # todo:当前list长度为1,取第一个元素中即为目标数据存在范围
# temp_result = hits_obj[0]["_source"]
# # todo: 此时带查询数据内容为dict对象
# # print(temp_result)
#
# else:
# temp_result = ""
# return temp_result
# return hits_obj
# return _source_list
# todo: query ES by article id and sentence id for the content surrounding the target sentence
def find_sen_content(sent_article_id: str, sentence_id: str, sent_content):
"""
:param article_id:
:param sentence_id:
:return:
"""
url = "http://114.115.215.250:9700/ai_report_material/_search"
payload = json.dumps({
"query": {
"bool": {
"must": [
{
"terms": {
"articleId": [sent_article_id, sent_article_id] # ["1670829370466076"]
}
},
{
"terms": {
# "sentence_id": sentence_id # ["1670843538527672"]
"sentenceId": [str(int(sentence_id) - 1), str(int(sentence_id) + 1)]
}
}
]
}
},
"track_total_hits": True
# "size": 1
})
headers = {
'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
result_data = response.text.encode("utf-8")
obj_result = json.loads(result_data)
# todo: parse out the target data; the result is a list
hits_obj = obj_result["hits"]["hits"]
if len(hits_obj) >= 2:
# todo:当前list长度为1,取第一个元素中即为目标数据存在范围
pre_temp_result = hits_obj[0]["_source"]
pre_temp_content = pre_temp_result["content"]
suf_temp_result = hits_obj[1]["_source"]
suf_temp_content = suf_temp_result["content"]
elif 1 <= len(hits_obj) < 2:
if hits_obj[0]["_source"]["sentenceId"] == str(int(sentence_id) - 1):
pre_temp_content = hits_obj[0]["_source"]["content"]
suf_temp_content = ""
else:
pre_temp_content = ""
suf_temp_content = hits_obj[0]["_source"]["content"]
else:
pre_temp_content = ""
suf_temp_content = ""
content = pre_temp_content + "<font style='color:red;'>" + sent_content + "</font>" + suf_temp_content
return content
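# Minimal usage sketch (hypothetical ids and text, for illustration only): the function above
# returns the neighbouring sentences with the target sentence wrapped in a red <font> tag.
# context = find_sen_content("1670829371705726", "5", "目标句子内容")
# print(context)  # e.g. 前一句内容<font style='color:red;'>目标句子内容</font>后一句内容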
# todo: query the article info by article id
def find_art_info(article_id: str):
# size = len(article_id)
"""
:param article_id:
:return:
"""
url = "http://114.115.215.250:9700/ai_report_material/_search"
payload = json.dumps({
"query": {
"bool": {
"must": [
{
"term": {
"articleId": article_id # "1670829371705726"
}
},
{
"term": {
"type": "art"
}
}
]
}
},
"track_total_hits": True,
"size": 1
})
headers = {
'Authorization': 'Basic ZWxhc3RpYzp6enNuOTk4OA==',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
# print(response.text)
result_data = response.text.encode("utf-8")
obj_result = json.loads(result_data)
# todo: parse out the target data; the result is a list
hits_obj = obj_result["hits"]["hits"]
# print(len(hits_obj), len(article_id))
temp_result = hits_obj[0]["_source"]
# _source_list = [hits_obj[i]["_source"] for i in range(len(article_id))]
# df = pd.DataFrame(_source_list)
# df = df[["articleId", "articleTitle", "origin", "articleTime", "author", "content"]]
# df.rename(columns={"content": "article_content"}, inplace=True)
# return df
return temp_result
# _source_list = [hits_obj[i]["_source"] for i in range(size)]
# df = pd.DataFrame(_source_list)
# df = df[["articleId", "articleTitle", "origin", "articleTime", "author", "content"]]
# df.rename(columns={"content": "article_content"}, inplace=True)
# return df
# if hits_obj:
# # todo:当前list长度为1,取第一个元素中即为目标数据存在范围
# temp_result = hits_obj[0]["_source"]
# # temp_content = temp_result["content"]
# # todo: 此时带查询数据内容为dict对象
# # print(temp_result)
#
# else:
# temp_result = ""
#
# return temp_result
# return hits_obj
if __name__ == "__main__":
#
_id = ["1670844082008296", "1670844082007284"]
find_sent_info(_id)
# print(find_content("1670829371705726", "1670844082074304"))
# articleId = ["1670829370466076", "1670829371705726"]
# print(find_art_info(articleId))
# sent_article_id, sentence_id = "1670829371705726", "1"
# pre_sent, suf_sent = find_sen_content([sent_article_id, sent_article_id], sentence_id)
# print(pre_sent)
# print(suf_sent)
#!/user/bin/env python
# coding=utf-8
"""
@project : 500_资讯
@author : bruxelles_li
@file : lac_ner_text.py
@ide : PyCharm
@time : 2022-07-04 09:19:43
"""
from LAC import LAC
import pandas as pd
import tqdm
import re
lac = LAC(mode="lac")
# Extract person names from a sentence
def lac_username(sentences):
# Load the LAC model
user_name_list = []
lac = LAC(mode="lac")
lac_result = lac.run(sentences)
# print(lac_result)
for index, lac_label in enumerate(lac_result[1]):
if lac_label == "PER":
user_name_list.append(lac_result[0][index])
# print(user_name_list)
# print(user_name_list)
return user_name_list
# Extract organization names from a sentence
def lac_organize_name(sentences):
# Load the LAC model
user_name_list = []
lac = LAC(mode="lac")
lac_result = lac.run(sentences)
# print(lac_result)
for index, lac_label in enumerate(lac_result[1]):
if lac_label == "ORG":
user_name_list.append(lac_result[0][index])
return user_name_list
# Extract place names from a sentence
def lac_location_name(sentences):
# Load the LAC model
user_name_list = []
lac = LAC(mode="lac")
lac_result = lac.run(sentences)
# print(lac_result)
for index, lac_label in enumerate(lac_result[1]):
if lac_label == "LOC":
user_name_list.append(lac_result[0][index])
return user_name_list
def match_text_one(rule, text):
# rule = ";".join(new_one)
# print(rule)
# text_one = match_text_one(rule, title)
# print(text_one)
rules = '|'.join(rule.split(';')).strip('\n')
replaced_rules = rules.replace('.', '\.')\
.replace('*', '\*')\
.replace('(', '\(')\
.replace(')', '\)')\
.replace('+', '.+')
pattern = re.compile(r'' + replaced_rules)
print(pattern)
match_result = re.sub(pattern, "A", text)
print(match_result)
return match_result
if __name__ == '__main__':
text_path = ""
data_df = pd.read_excel(text_path, nrows=1).astype(str)
result_list = []
for idx, row in tqdm.tqdm(data_df.iterrows()):
title = row['title']
a_user = lac_username(title)
a_organize = lac_organize_name(title)
a_location = lac_location_name(title)
if a_user:
user_rule = '|'.join(a_user).strip()
pattern0 = re.compile(r'' + user_rule)
result_one = re.sub(pattern0, 'A', title)
title = result_one
if a_organize:
a_organize_rule = '|'.join(a_organize).strip()
pattern1 = re.compile(r'' + a_organize_rule)
result_two = re.sub(pattern1, 'B', result_one)
title = result_two
if a_location:
a_location_rule = '|'.join(a_location).strip()
pattern2 = re.compile(r'' + a_location_rule)
print(pattern2)
result_three = re.sub(pattern2, 'C', result_two)
print(result_three)
title = result_three
row['title'] = title
result_list.append(row)
print(result_list)
# new_one = a_user + a_organize + a_location
# rule = "|".join(new_one)
# pattern = re.compile(r'' + rule)
# result_one = re.sub(pattern, "A", title)
# title = result_one
# print(title)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : main_search.py
# @Time : 2022/12/14 09:25
# @Author : bruxelles_li
# @Software: PyCharm
import requests
# import os
from pathlib import Path
import threading
# from search_method import get_para_result, get_sent_result
from gj_app import para_process, sent_process
from 缓存处理 import MemoryCache
import json
import time
import logging
# Log output format
formatter = logging.Formatter("%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s")
# Create a logger and set its level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Create a handler that writes log records to the console, set its level, and attach it
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
# # todo: cache variable
# memory_cache = MemoryCache()
# TODO: list that keeps the worker threads
all_thread = []
# todo: paragraph processing
# def para_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: str, pEndTime: str, pageSize: int):
# pageNo = 10
# para_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# dict_para = {
# "text": para_list
# }
# para_result = json.dumps(dict_para)
#
# with open(para_path, 'w', encoding='utf-8') as file:
# file.write(para_result)
# time.sleep(120)
# os.remove(para_path)
# return None
# # todo: 定义句子处理
# def sent_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: str, pEndTime: str, pageSize: int):
# pageNo = 10
# sent_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
# dict_sent = {
# "text": sent_list
# }
# sent_result = json.dumps(dict_sent)
#
# with open(sent_path, 'w', encoding='utf-8') as file:
# file.write(sent_result)
# time.sleep(120)
# os.remove(sent_path)
# return None
def system_start():
while True:
# print("=====正在进行后台任务=====")
headers = {
'Content-Type': 'application/json'
}
r1 = requests.post(url='http://localhost:4001/queue_size', headers=headers)
r1_json = json.loads(r1.text)
# print(r1_json)
queue_left_number = r1_json['queue_left_number']
logger.info("当前队列任务总数:" + str(queue_left_number))
if queue_left_number == 0:
# logger.warning("队列为空!无可处理任务。")
time.sleep(3)
else:
for i in range(queue_left_number):
r2 = requests.post(url='http://localhost:4001/subject_consumer', headers=headers)
r2_json = json.loads(r2.text)
config_info = r2_json['data']
logger.info(config_info)
if config_info["type"] == "par":
text = config_info["text"]
contentTypeFlags = config_info["contentTypeFlags"]
topicTypeNames = config_info["topicTypeNames"]
pStartTime = config_info["pStartTime"]
pEndTime = config_info["pEndTime"]
pageSize = config_info["pageSize"]
logger.info('##########处理后台段落查询###############')
t = threading.Thread(target=para_process, args=(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize), daemon=True)
# 启动
t.start()
all_thread.append(t)
else:
text = config_info["text"]
contentTypeFlags = config_info["contentTypeFlags"]
topicTypeNames = config_info["topicTypeNames"]
pStartTime = config_info["pStartTime"]
pEndTime = config_info["pEndTime"]
pageSize = config_info["pageSize"]
logger.info('##########处理后台句子查询###############')
t = threading.Thread(target=sent_process,
args=(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize),
daemon=True)
# 启动
t.start()
all_thread.append(t)
def system_resume():
"""
Restore the model training service state
:return:
"""
headers = {
'Content-Type': 'application/json'
}
# Drain the current service queue to avoid starting the same model training twice
r1 = requests.post(url='http://localhost:4001/queue_size', headers=headers)
r1_json = r1.json()
logger.info('当前队列数量:%d' % r1_json['queue_left_number'])
if r1_json['queue_left_number'] > 0:
logger.info('正在消费队列,直到队列为空!')
while True:
r2 = requests.post(url='http://localhost:4001/subject_consumer', headers=headers)
r2_json = r2.json()
if r2_json['queue_left_number'] == 0:
logger.info('队列消费完毕!可放心进行模型训练 ...')
break
else:
logger.info('队列为空!可放心进行模型训练 ...')
def start_up_check():
"""
Pre-start check
:return:
"""
while True:
try:
headers = {
'Content-Type': 'application/json'
}
r0 = requests.post(url='http://localhost:4001/queue_size', headers=headers)
server_started = True
except requests.exceptions.ConnectionError as e:
server_started = False
logger.error("Error: ConnectionError")
logger.warning('服务未启动,请先启动server! 程序已退出。')
exit(123)
# logger.info('server正在尝试自启 ...')
# time.sleep(3)
if server_started:
logger.info("server启动成功!后台服务已启动...")
break
if __name__ == "__main__":
# Start the model training service
start_up_check()
logger.info('后台服务恢复中 ...')
system_resume()
time.sleep(30)
logger.info('后台服务恢复完成!')
logger.info('后台服务运行中 ...')
system_start()
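# Example of the task payload this consumer expects from /subject_consumer (hypothetical
# values; the field names are exactly the ones read in system_start above):
# {
#     "data": {
#         "type": "par",
#         "text": "建筑垃圾资源化利用",
#         "contentTypeFlags": ["..."],
#         "topicTypeNames": ["..."],
#         "pStartTime": "2022-01-01",
#         "pEndTime": "2022-12-31",
#         "pageSize": 10
#     }
# }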
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : matrix_file.py
# @Time : 2022/12/12 19:23
# @Author : bruxelles_li
# @Software: PyCharm
"""
pip install bert-serving-server && pip install bert-serving-client
"""
import pandas as pd
import numpy as np
from bert_serving.client import BertClient
from tqdm import tqdm
from numpy import *
# bc = BertClient('114.116.49.86', check_length=False)
# df = pd.read_excel('素材库/句子库/入库_sent.xlsx', keep_default_na=False).astype(str)
# df = pd.read_excel("素材库/段落库/入库_para.xlsx", keep_default_na=False).astype(str)
# length = len(df)
# print(length)
vector_path = "database/sent_database/policy_1_para.txt"
np_path = "database/para_database/policy_1_para.npy"
# def encode_sentences(df, path):
# f = df
# """
# for line in f:
# result.append(line.strip('\n'))
# """
# with open(path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(f.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# return None
def save_file(length, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
if __name__ == "__main__":
# text = "nihao"
# print(bc.encode([text]))
# encode_sentences(df, vector_path)
length = 298092
save_file(length, np_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/12/12 09:11
# @Author : bruxelles_li
# @FileName: merge_file.py
# @Software: PyCharm
import os
import pandas as pd
result = []
path = r"素材库/段落库"
for root, dirs, files in os.walk(path, topdown=False):
for name in files:
if name.endswith(".xls") or name.endswith(".xlsx"):
df = pd.read_excel(os.path.join(root, name), sheet_name=None)
result.append(df)
data_list = []
for data in result:
data_list.extend(data.values())  # note: extend() here, not append()
df = pd.concat(data_list)
df.to_excel('素材库/段落库/去重前_para.xlsx', index=False, engine='xlsxwriter')
print("合并完成!")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : merge_numpy.py
# @Time : 2022/1/15 17:00
# @Author : bruxelles_li
# @Software: PyCharm
import numpy as np
import datetime
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
path1 = "database/para_database/policy_1_para.npy"
path2 = "database/para_database/policy_document.npy"
path3 = "database/para_database/update_policy.npy"
start0_time = datetime.datetime.now()
sen_expert_1 = np.load(path1)
print(sen_expert_1.shape)
end0_time = datetime.datetime.now()
total0_time = (end0_time - start0_time).total_seconds()
logger.info("加载矩阵1 共消耗: " + "{:.2f}".format(total0_time) + " 秒")
start1_time = datetime.datetime.now()
sen_expert = np.load(path2)
print(sen_expert.shape)
end1_time = datetime.datetime.now()
total1_time = (end1_time - start1_time).total_seconds()
logger.info("加载矩阵2 共消耗: " + "{:.2f}".format(total1_time) + " 秒")
arr = np.concatenate((sen_expert, sen_expert_1), axis=0)
print(arr.shape)
np.save(path3, arr)
start2_time = datetime.datetime.now()
np.load(path3)
end2_time = datetime.datetime.now()
total2_time = (end2_time - start2_time).total_seconds()
logger.info("加载矩阵2 共消耗: " + "{:.2f}".format(total2_time) + " 秒")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# 定义向量存储文件文件
# vector_parent_path = "database/sent_database/sent_vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# # todo: 调用bert编码服务
# bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_leaders = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/句子库/待入库_expert_1_sent.xlsx", keep_default_na=False).astype(str)
# print(len(df_expert_opinion))
# df_expert_opinion.dropna(axis=0, subset=["content"])
# length = len(df_expert_opinion)
# print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_leaders, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_leaders]
# path1 = "素材库/句子库/vector_path/speech_by_leaders.txt"
vector_path = "database/sent_database/sent_vector_path/expert_1_sent.txt"
np_path = "database/sent_database/sent_vector_path/expert_1_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_leaders.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
# def encode_sentences(vector_path, df, length, np_path):
# with open(vector_path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# # print(vector)
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# f_vectors.close()
# save_file(length, vector_path, np_path)
#
# return None
def save_file(length, vector_path, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
# todo: 定义同时编码主函数
if __name__ == "__main__":
# encode_sentences(vector_path, df_expert_opinion, length, np_path)
save_file(774435, vector_path, np_path)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# Directory for the vector storage files
vector_parent_path = "database/sent_database/sent_vector_path"
Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# todo: connect to the BERT encoding service
bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_leaders = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
df_expert_opinion = pd.read_excel("素材库/句子库/待入库_leader_1_sent.xlsx", keep_default_na=False).astype(str)
print(len(df_expert_opinion))
df_expert_opinion.dropna(axis=0, subset=["content"])
length = len(df_expert_opinion)
print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_leaders, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_leaders]
# path1 = "素材库/句子库/vector_path/speech_by_leaders.txt"
vector_path = "database/sent_database/sent_vector_path/leader_1_sent.txt"
np_path = "database/sent_database/sent_vector_path/leader_1_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_leaders.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
def encode_sentences(vector_path, df, length, np_path):
with open(vector_path, 'w', encoding='utf-8') as f_vectors:
for idx, row in tqdm(df.iterrows()):
sentence = row['content']
vector = bc.encode([sentence])
# print(vector)
f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
f_vectors.close()
save_file(length, vector_path, np_path)
return None
def save_file(length, vector_path, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
# todo: main entry point for encoding
if __name__ == "__main__":
encode_sentences(vector_path, df_expert_opinion, length, np_path)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# 定义向量存储文件文件
# vector_parent_path = "database/sent_database/sent_vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# todo: 调用bert编码服务
# bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_policys = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/句子库/待入库_policy_1_sent.xlsx", keep_default_na=False).astype(str)
# print(len(df_expert_opinion))
# df_expert_opinion.dropna(axis=0, subset=["content"])
# length = len(df_expert_opinion)
# print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_policys, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_policys]
# path1 = "素材库/句子库/vector_path/speech_by_policys.txt"
vector_path = "database/sent_database/sent_vector_path/policy_1_sent.txt"
np_path = "database/sent_database/sent_vector_path/policy_1_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_policys.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
# def encode_sentences(vector_path, df, length, np_path):
# with open(vector_path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# # print(vector)
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# f_vectors.close()
# save_file(length, vector_path, np_path)
# return None
def save_file(length, vector_path, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
# todo: 定义同时编码主函数
if __name__ == "__main__":
# encode_sentences(vector_path, df_expert_opinion, length, np_path)
save_file(615784, vector_path, np_path)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
from bert_serving.client import BertClient
# 定义向量存储文件文件
# vector_parent_path = "database/sent_database/sent_vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# # todo: 调用bert编码服务
# bc = BertClient("114.116.54.108", check_length=False)
# # text = "nihao"
# # print(bc.encode([text]))
# # df_speech_by_policys = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="领导讲话", keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/句子库/待入库_other_sent.xlsx", keep_default_na=False).astype(str)
# print(len(df_expert_opinion))
# df_expert_opinion.dropna(axis=0, subset=["content"])
# length = len(df_expert_opinion)
# print(length)
# df_policy_document = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/句子库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_policys, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_policys]
# path1 = "素材库/句子库/vector_path/speech_by_policys.txt"
vector_path = "database/sent_database/sent_vector_path/other_sent.txt"
np_path = "database/sent_database/sent_vector_path/other_sent.npy"
# path3 = "素材库/句子库/vector_path/policy_document.txt"
# path4 = "素材库/句子库/vector_path/enterprise_case.txt"
# vector_path = [path1, path2, path3, path4]
# vector_path = [
# "speech_by_policys.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
# def encode_sentences(vector_path, df, length, np_path):
# with open(vector_path, 'w', encoding='utf-8') as f_vectors:
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# vector = bc.encode([sentence])
# # print(vector)
# f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# f_vectors.close()
# save_file(length, vector_path, np_path)
#
# return None
def save_file(length, vector_path, np_path):
A = zeros((int(length), 769), dtype=float)
f = open(vector_path)
lines = f.readlines()
A_row = 0
for line in lines:
list = line.strip('\n').split(' ')
A[A_row, :] = list[:]
A_row += 1
print(A.shape)
np.save(np_path, A)
# todo: 定义同时编码主函数
if __name__ == "__main__":
# encode_sentences(vector_path, df_expert_opinion, length, np_path)
save_file(694187, vector_path, np_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/4/26 09:45
# @Author : bruxelles_li
# @FileName: process_data.py
# @Software: PyCharm
import pandas as pd
import xlrd
from bs4 import BeautifulSoup
import re
import xlsxwriter
def drop_duplicated_data(path):
data_df = pd.read_excel(path)
print("=====test0======", len(data_df))
# data_df.dropna(subset=['title'], inplace=True)
# print("=====test1======", len(data_df))
data_df.dropna(subset=['企业官网'], inplace=True)
print("=====test2======", len(data_df))
data_df.drop_duplicates(subset=["企业官网"], keep="first", inplace=True)
print("=====test3======", len(data_df))
data_df.drop_duplicates(subset=['企业官网'], keep='first', inplace=True)
print("=====test3======", len(data_df))
data_df.reset_index(drop=True, inplace=True)
print("=====test4======", len(data_df))
# data_df.drop('list_srl', axis=1, inplace=True)
# data_df.drop('list_result', axis=1, inplace=True)
print(data_df.shape)
# 写入文件
df1 = pd.DataFrame(data_df)
df1.to_excel('500强企业官网域名_去重.xlsx', engine='xlsxwriter', index=False)
# for idx, row in df.iterrows():
if __name__ == "__main__":
path = r"500强企业资讯导出模型参数文件/500强官网域名.xlsx"
drop_duplicated_data(path)
beautifulsoup4==4.11.1
bert_serving==0.0.1
bert_serving_client==1.10.0
Flask==2.2.2
Flask_Cors==3.0.10
goose3==3.1.11
LAC==2.1.2
lxml==4.9.1
numpy==1.22.4
pandas==1.3.5
pytime==0.2.3
requests==2.27.1
scikit_learn==1.2.0
tqdm==4.64.0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : multiprocess_test.py
# @Time : 2022/12/28 09:23
# @Author : bruxelles_li
# @Software: PyCharm
import math
import datetime
import multiprocessing as mp
import random
import pandas as pd
from tqdm import tqdm
from multiprocessing.pool import Pool
from time import sleep, time
from numpy import *
import numpy as np
from pathlib import Path
import os
import logging
from bert_serving.client import BertClient
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
'message)s')
logger = logging.getLogger(__name__)
# Directory for the vector storage files
# vector_parent_path = "素材库/段落库/vector_path"
# Path(vector_parent_path).mkdir(parents=True, exist_ok=True)
# todo: connect to the BERT encoding service
bc = BertClient("114.116.54.108", check_length=False)
# text = "nihao"
# print(bc.encode([text]))
# df_speech_by_leaders = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="领导讲话", nrows=10, keep_default_na=False).astype(str)
# df_expert_opinion = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="专家观点", nrows=10, keep_default_na=False).astype(str)
# df_policy_document = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="政策文件", nrows=10, keep_default_na=False).astype(str)
# df_enterprise_case = pd.read_excel("素材库/段落库/入库_sent.xlsx", sheet_name="企业案例", nrows=10, keep_default_na=False).astype(str)
# df_list = [df_speech_by_leaders, df_expert_opinion, df_policy_document, df_enterprise_case]
# df_list = [df_speech_by_leaders]
path1 = "素材库/段落库/para_vector_path/speech_by_leaders.txt"
path2 = "素材库/段落库/para_vector_path/expert_opinion.txt"
path3 = "素材库/段落库/para_vector_path/policy_document.txt"
path4 = "素材库/段落库/para_vector_path/enterprise_case.txt"
vector_path_list = [path1, path2, path3, path4]
# length_list = [22006, 205897, 56295, 49152]
length_list = [4641, 44634, 16647, 19403]
np_path1 = "素材库/段落库/para_vector_path/speech_by_leaders.npy"
np_path2 = "素材库/段落库/para_vector_path/expert_opinion.npy"
np_path3 = "素材库/段落库/para_vector_path/policy_document.npy"
np_path4 = "素材库/段落库/para_vector_path/enterprise_case.npy"
np_path_list = [np_path1, np_path2, np_path3, np_path4]
# vector_path = [
# "speech_by_leaders.txt",
# "expert_opinion.txt",
# "policy_document.txt",
# "enterprise_case.txt"
# ]
def encode_texts(text):
logger.info("====%s====" % text)
tem_list = [{text: int(text)+1}]
# vector = bc.encode([text])
# print(vector)
return tem_list
# def encode_sentences(path, df):
# for idx, row in tqdm(df.iterrows()):
# sentence = row['content']
# print(sentence)
# vector = bc.encode([sentence])
# print(vector)
# print("hello%s,路径%s" % (idx, path))
# """
# for line in f:
# result.append(line.strip('\n'))
# """
# # with open(path, 'w', encoding='utf-8') as f_vectors:
# # for idx, row in tqdm(df.iterrows()):
# # sentence = row['content']
# # vector = bc.encode([sentence])
# # print(vector)
# # f_vectors.write(str(row['id']) + ' ' + ' '.join(map(str, list(vector[0]))) + '\n')
# return None
def save_file(length, np_path, vector_path):
# A = zeros((int(length), 769), dtype=float)
# f = open(vector_path)
# lines = f.readlines()
# A_row = 0
# for line in lines:
# list = line.strip('\n').split(' ')
# A[A_row, :] = list[:]
# A_row += 1
# print(A.shape)
# np.save(np_path, A)
text = "nihao"
return text
# todo: main entry point for concurrent encoding
if __name__ == "__main__":
print("主进程开始执行>>> pid={}".format(os.getpid()))
# print("父进程开始")
start_t = datetime.datetime.now()
# Create a pool of worker processes; the size is the number of processes that may run at once (defaults to the CPU core count)
num_cores = int(mp.cpu_count())
p = Pool(num_cores)
# text = "nihao"
# p = mp.Process(target=encode_texts, args=(text,))
text_list = ["1", "2", "3", "4"]
res_list = []
for text in text_list:
# print(i)
# np_path = np_path_list[i]
# length = length_list[i]
# vector_path = vector_path_list[i]
# print(np_path, length, vector_path)
result_list = p.apply_async(encode_texts, args=(text,))  # args must be a tuple
# print(result_list.get(), type(result_list.get()))
res_list.extend(result_list.get())
print(res_list)
# i = 0
# for i in range(len(df_list)-1):
# filename = vector_path[i]
# # path = os.path.join(vector_parent_path, filename)
# # print(path, len(df_list[i]))
# # print(df_list[i])
# p.apply_async(encode_sentences, args=(vector_path[i], df_list[i]))
# # p.apply_async(encode_sentences, args=(path1, df_speech_by_leaders))
# # p.apply_async(encode_sentences, args=(path, df_list[i]))
# i += 1
# for i in range(10):
# # 创建进程,放入进程池统一管理
# p.apply_async(run, args=(i,))
# p.start()
# With a process pool, close() must be called before join(), and no new tasks may be submitted after close()
p.close()
# join() on the pool waits for all child processes in the pool to finish before the parent process continues
p.join()
print("主进程终止")
end_t = datetime.datetime.now()
elapsed_sec = (end_t - start_t).total_seconds()
print("多进程计算 共消耗: " + "{:.2f}".format(elapsed_sec) + " 秒")
# encode_sentences(path1, df_speech_by_leaders)
# -*- coding: utf-8 -*-
# @Time : 2022/9/22 20:00
# @Author : ctt
# @File : test
# @Project : 研究中心知识图谱
# Generate unique ids
import time
class Snow:
"""雪花算法生成全局自增唯一id"""
# 154420004524033 154469287596033
init_date = time.strptime('2022-12-12 18:32:25', "%Y-%m-%d %H:%M:%S")
start = int(time.mktime(init_date) * 1000)
last = int(time.time() * 1000)
pc_room = 1
pc = 1
seq = 0
@classmethod
def get_guid(cls):
"""获取雪花算法生成的id"""
now = int(time.time() * 1000)
if now != cls.last:
cls.last = now
cls.seq = 1
else:
while cls.seq >= 4096:
time.sleep(0.1)
return cls.get_guid()
cls.seq += 1
time_diff = now - cls.start
pk = (time_diff << 22) ^ (cls.pc_room << 18) ^ (cls.pc << 12) ^ cls.seq
return str(pk)
snow = Snow.get_guid()
print(snow)
print(type(snow))
#!/bin/sh
cd /zzsn/lzc/智能报告搜索推荐
exec gunicorn -c app_config.py main_app:app --daemon --timeout 1200
#nohup python3 -u smi_app.py runserver -h 0.0.0.0 -p 8015 --threaded >>run.log 2>&1 &
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : es_byid.py
# @Time : 2022/12/27 13:41
# @Author : bruxelles_li
# @Software: PyCharm
# Convert an xlsx file to a csv file
import pandas as pd
data = pd.read_excel('素材库/句子库/入库_sent.xlsx', index_col=0).astype(str)  # index_col=0 so no extra index column is written
data.to_csv('素材库/句子库/入库_sent.csv', encoding='utf-8')  # write the data to the csv file
print("写入完成......")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 占位符测试.py
# @Time : 2022/12/7 10:02
# @Author : bruxelles_li
# @Software: PyCharm
"""
说明:4个字符是为了避免开头字符多次出现在文章中的不同位置
段落索引: 以段落开头4个字符所在文章位置和段落结尾四个字符所在文章位置作为索引
句子绝对位置索引:以句子开头4个字符所在文章位置和句子结尾4个字符所在文章位置作为索引
句子相对位置索引: 以句子开头4个字符所在段落位置和句子开头4个字符所在段落位置作为索引
注:高亮方法
a = '\033[1;31;40m%s\033[0m' % para[:4]
b = '\033[1;31;40m%s\033[0m' % para[-4:]
print(para_index, a, b)
注: 搜索推荐时转义
段落库:
prefix_index = int(para_index.split("|")[0])
suffix_index = int(para_index.split("|")[1])
print(prefix_index, suffix_index)
# 返回内容,即搜索推荐时高亮区域
print(text[prefix_index:suffix_index+1])
句子库:
# 文章索引
article_prefix_index = int(sent_article_index.split("|")[0])
article_suffix_index = int(sent_article_index.split("|")[1])
print(article_prefix_index, article_suffix_index)
# 返回内容,即搜索推荐时在文章中的高亮区域
print(text[article_prefix_index:article_suffix_index+2])
# 段落索引
para_prefix_index = int(sent_para_index.split("|")[0])
para_suffix_index = int(sent_para_index.split("|")[1])
print(para_prefix_index, para_suffix_index)
# 返回内容,即搜索推荐时在文章中的高亮区域
print(para[para_prefix_index:para_suffix_index+2])
注: 参数定义
输入:
文章id: infoId -> str
文章标题: title -> str
正文: content -> str
文章类型: contentTypeIds -> str
文章主题: topicTypeIds -> str
来源网站: origin -> str
发布时间: publishDate -> str
作者: author -> str
输出:
状态码: code -> str
处理消息: message -> str
返回内容体: resultData -> object
文章信息列表: article_info -> array
文章id: infoId -> str
文章标题: title -> str
正文: content -> str
文章类型: contentTypeIds -> str
文章主题: topicTypeIds -> str
来源网站: origin -> str
发布时间: publishDate -> str
作者: author -> str
段落信息列表: para_info -> array
段落库id: para_id -> str
段落所在文章id: infoId -> str
段落索引: para_index -> str
段落内容: para_content -> str
段落类型: contentTypeIds -> str
段落主题: topicTypeIds -> str
句子信息列表: sent_info -> array
句子库id: sent_id -> str
句子所在段落id: para_id -> str
句子所在文章id: infoId -> str
绝对位置索引: sent_article_index -> str
相对位置索引: sent_para_index -> str
句子内容: sent_content -> str
句子类型: contentTypeIds -> str
句子主题: topicTypeIds -> str
"""
import re
from tqdm import tqdm
import pandas as pd
from 文章id生成 import create_title_id
data_df = pd.read_excel("领导讲话结果.xlsx", nrows=1).astype(str)
# Initialize the paragraph-library id and the sentence-library id
para_id, sent_id = 1, 1
# todo: result list
list_article = []
for idx, row in tqdm(data_df.iterrows()):
# Use the existing article id if present; otherwise generate one from a timestamp
row["infoId"] = row["infoId"] if row["infoId"] else str(create_title_id())
# Process the body text
text = row["正文"]
# list_text = list(text)
# Basic paragraph split character: "\n"
para_list = text.split("\n")
# print(len(para_list))
# todo: per-article paragraph-info and sentence-info lists
list_para, list_sent = [], []
# Compute paragraph indexes
for para in para_list:
# Handle normal content paragraphs
if len(para) >= 50:
# Locate the paragraph by its first and last four characters
a0 = para[:4]
b0 = para[-4:]
# Index of the first character
a0_index = text.find(a0) + (4 - len(a0.strip()))
# print(a0, a0_index, text[a0_index])
# Index of the last character
b0_index = text.find(b0) + (4 - len(b0.strip())) + 3
# print(b0, b0_index, text[b0_index])
# Save the index
para_index = str(a0_index) + "|" + str(b0_index)
# print(para_index)
# Compute sentence indexes; there are two: the absolute index (position in the article) and the relative index (position within the paragraph)
print("====句子索引====")
# Basic sentence split characters: 。!?
sent_list = re.split(r'\s*[。!?]\s*', para)
for sent in sent_list:
# todo: only store sentences of length >= 10
if len(sent) >= 10:
# Locate the sentence by its first and last four characters
c0 = sent[:4]
d0 = sent[-4:]
# Index of the first character (in the article and in the paragraph)
c0_index = text.find(c0) + (4 - len(c0.strip()))
c1_index = para.find(c0) + (4 - len(c0.strip()))
# print(c0, c0_index, text[c0_index], c1_index, para[c1_index])
# Index of the last character
d0_index = text.find(d0) + (4 - len(d0.strip())) + 3
d1_index = para.find(d0) + (4 - len(d0.strip())) + 3
# print(d0, d0_index, text[d0_index], d1_index, para[d1_index])
# Save the indexes
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
# Print the sentence content
print(text[c0_index:d0_index+2], para[c1_index:d1_index+2])
# Append the sentence info to the sentence list
list_sent.append({
"sent_id": str(sent_id),
"para_id": str(para_id),
"infoId": row["infoId"],
"sent_article_index": sent_article_index,
"sent_para_index": sent_para_index,
"sent_content": text[c0_index:d0_index+2],
"contentTypeIds": "",
"topicTypeIds": ""
})
# todo: advance the sentence-library id
sent_id += 1
# Handle short sub-heading paragraphs
elif 10 <= len(para) < 50:
# todo: crude filter for noisy paragraphs based on the words they contain
words_list = {"微信", "如需转载", "免责声明", "公告", "jpeg", "jpg", "png", "【", "责任编辑"}
if any([i in para for i in words_list]):
continue
else:
# Locate the paragraph by its first and last four characters
a0 = para[:4]
b0 = para[-4:]
# Index of the first character
a0_index = text.find(a0) + (4 - len(a0.strip()))
print(a0, a0_index, text[a0_index])
# Index of the last character
b0_index = text.find(b0) + (4 - len(b0.strip())) + 3
print(b0, b0_index, text[b0_index])
# Save the index
para_index = str(a0_index) + "|" + str(b0_index)
else:
continue
# Print the paragraph content
print(text[a0_index:b0_index+1])
# Append the paragraph info to the paragraph list
list_para.append({
"para_id": str(para_id),
"infoId": row["infoId"],
"para_index": para_index,
"para_content": text[a0_index:b0_index+2],
"contentTypeIds": "",
"topicTypeIds": ""
})
# todo: advance the paragraph-library id
para_id += 1
# Append the article info to the article list
list_article.append({
"infoId": row["infoId"],
"content": text,
"title": row["标题"],
"contentTypeIds": row["专题库类型"],
"topicTypeIds": row["专题名称"],
"origin": row["来源"],
"publishDate": row["发布时间"],
"author": row["作者"]
})
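# Minimal decoding sketch (hypothetical values): every index built above is stored as
# "start|end"; splitting on "|" recovers the span that is highlighted during search.
def decode_index(full_text: str, index_str: str) -> str:
    prefix_index, suffix_index = (int(x) for x in index_str.split("|"))
    return full_text[prefix_index:suffix_index + 1]

# Example (hypothetical): decode_index(text, "0|7") returns the first eight characters of the article.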
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 句子分割完整测试.py
# @Time : 2022/12/8 15:21
# @Author : bruxelles_li
# @Software: PyCharm
"""
引用内容:“”。
"""
import re
para = "恩格斯早就指出:“我们不要过分陶醉于我们人类对自然界的胜利。对于每一次这样的胜利,自然界都对我们进行报复。”第一次工业革命以来,人类利用自然的能力不断提高,但过度开发也导致生物多样性减少,迫使野生动物迁徙,增加野生动物体内病原的扩散传播。"
def get_index(text):
sent_list = re.split(r'\s*[。!?]\s*', text)
for sent in sent_list:
# Index of the sentence's first character
c0 = sent.find(sent.strip())
d0 = text.find("”")
# Index of the first character
c0_index = c0
# Index of the closing quotation mark
d0_index = d0 + 1
return c0_index, d0_index
if __name__ == "__main__":
# todo: when a paragraph contains quoted speech, split around the special characters ":" and "“" so that the quoted span stays intact, then process the parts before and after that span separately
if ":" in para:
pre_index, suf_index = get_index(para)
print(pre_index, suf_index)
pre_text = para[:pre_index-1] if pre_index != 0 else ""
# todo: handle temp_text (the quoted span)
temp_text = para[pre_index:suf_index]
print(temp_text)
# todo: handle suffix_text (the text after the quoted span)
suffix_text = para[suf_index:]
print(suffix_text)
else:
# Basic sentence split characters: 。!?
sent_list = re.split(r'\s*[。!?]\s*', para)
for sent in sent_list:
# todo: only store sentences above the length threshold
if len(sent.strip()) >= 13:
# Index of the first character (this standalone test only has the paragraph, so it stands in for the full text)
c0_index = para.find(sent.strip())
c1_index = para.find(sent.strip())
# Index of the last character
d0_index = c0_index + len(sent.strip())
d1_index = c1_index + len(sent.strip())
# Save the indexes
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
elif 10 <= len(sent.strip()) < 13:
# Index of the first character (the paragraph again stands in for the full text)
c0_index = para.find(sent.strip())
c1_index = para.find(sent.strip())
# Index of the last character
d0_index = c0_index + len(sent.strip())
d1_index = c1_index + len(sent.strip())
# Save the indexes
sent_article_index = str(c0_index) + "|" + str(d0_index)
sent_para_index = str(c1_index) + "|" + str(d1_index)
else:
continue
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/01/09 14:12
# @Author : bruxelles_li
# @FileName: 合并.py
# @Software: PyCharm
import os
import pandas as pd
path1 = "素材库/待入库数据/专家观点数据.xls"
path2 = "素材库/待入库数据/专家观点数据(权威观察筛选).xls"
df = pd.read_excel(path1, keep_default_na=False).astype(str)
df1 = pd.read_excel(path2, keep_default_na=False).astype(str)
final_df = pd.concat([df, df1])
# final_df["contentTypeIds"] = "1612323231851601921"
# result = []
# path = r"素材库/段落库"
# for root, dirs, files in os.walk(path, topdown=False):
# for name in files:
# if name.endswith(".xls") or name.endswith(".xlsx"):
# df = pd.read_excel(os.path.join(root, name), sheet_name=None)
# result.append(df)
# data_list = []
# for data in result:
# data_list.extend(data.values()) # 注意这里是extend()函数而不是append()函数
# df = pd.concat(data_list)
final_df.to_excel('素材库/待入库数据/ori_policy_document_article.xlsx', index=False, engine='xlsxwriter')
print("合并完成!")
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/12/12 17:45
# @Author : bruxelles_li
# @FileName: 处理内容_去重.py
# @Software: PyCharm
import pandas as pd
from tqdm import tqdm
import time
# from snow_id import Snow
from 文章id生成 import create_title_id
# snow = Snow.get_guid()
# para_df = pd.read_excel("素材库/段落库/去重后_para.xlsx", keep_default_na=False).astype(str)
para_df = pd.read_excel("素材库/段落库/leader_1_para.xlsx", keep_default_na=False).astype(str)
# article_list = para_df["infoId"].tolist()
para_df["id"] = ""
para_df["deleted"] = "0"
para_df["if_public"] = "0"
# para_df["type"] = "sen"
print(len(para_df))
para_list, sent_list = [], []
i = 1
for idx, row in tqdm(para_df.iterrows()):
row["id"] = str(create_title_id() + i)
row["create_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
para_list.append(row)
i += 1
print(len(para_list))
df = pd.DataFrame(para_list)
df.to_excel('素材库/段落库/待入库_leader_1_para.xlsx', index=False, engine='xlsxwriter')
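# The file name mentions 去重 (dedup), but no deduplication happens above. A minimal
# sketch of that missing step, assuming rows count as duplicates when their paragraph
# text matches; dedup_paragraphs is a hypothetical helper and the column name
# "sent_content" is taken from the build step in this commit and may differ in the
# real Excel file:
def dedup_paragraphs(df, text_col="sent_content"):
    # Keep the first occurrence of each paragraph text
    return df.drop_duplicates(subset=[text_col], keep="first").reset_index(drop=True)
# e.g. para_df = dedup_paragraphs(para_df) before the id-assignment loop above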
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 文章id生成.py
# @Time : 2022/12/7 12:07
# @Author : bruxelles_li
# @Software: PyCharm
"""
使用time,hashlib 来自动生成文章id
"""
import time, hashlib
def create_id():
m = hashlib.md5(str(time.perf_counter()).encode("utf-8"))
return m.hexdigest()
# print(type(create_id()))
# print(create_id())
# 2.使用time生成时间戳
def create_title_id():
time_stamp = int(round(time.time()*1000000))
return time_stamp
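# Both generators above are time based, so two calls within the same microsecond can
# collide. The batch scripts in this commit work around that by adding a loop counter
# (create_title_id() + i). A minimal sketch of the same idea as a helper;
# make_unique_ids is hypothetical, for illustration only:
def make_unique_ids(n):
    base = create_title_id()
    # Consecutive offsets guarantee uniqueness within one batch
    return [str(base + offset) for offset in range(n)]
# e.g. make_unique_ids(3) -> three consecutive, unique string ids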
if __name__ == "__main__":
print(create_title_id(), type(create_title_id()))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 素材库构建批量处理.py
# @Time : 2022/12/10 11:56
# @Author : bruxelles_li
# @Software: PyCharm
import pandas as pd
from tqdm import tqdm
import requests
import json
from 文章内容检查 import *
list_article, list_para, list_sent = [], [], []
data_df = pd.read_excel("素材库/待入库数据/ori_speech_by_leaders_article.xlsx", keep_default_na=False).astype(str)
data_df["infoId"] = ""
data_df["author"] = ""
# todo: 调用素材构建程序接口进行批量素材处理
def pro_sucai(data_df):
url = "http://114.116.49.86:4001/build_pro"
headers = {
'Content-Type': 'application/json'
}
for idx, row in tqdm(data_df.iterrows()):
infoId = row["infoId"]
title = row["标题"]
content = row["正文"]
contentTypeFlags = row["内容类型"]
topicNames = row["主题类型"]
origin = row["origin"]
publishDate = row["publishDate"]
author = row["author"]
payload = json.dumps({
"infoId": infoId,
"title": title,
"contentTypeFlags": contentTypeFlags,
"topicNames": topicNames,
"origin": origin,
"publishDate": publishDate,
"author": author,
"content": content
})
response_filter = requests.request("POST", url, headers=headers, data=payload)
text_filter = response_filter.text.encode("utf-8")
obj_filter = json.loads(text_filter)
data_filter = obj_filter["resultData"]
if data_filter:
list_article.extend(data_filter["article_info"])
list_para.extend(data_filter["para_info"])
list_sent.extend(data_filter["sent_info"])
else:
continue
article_df = pd.DataFrame(list_article)
article_df.to_excel("素材库/文章库/leader_1_article.xlsx", engine="xlsxwriter", index=False)
para_df = pd.DataFrame(list_para)
para_df.to_excel("素材库/段落库/leader_1_para.xlsx", engine="xlsxwriter", index=False)
sent_df = pd.DataFrame(list_sent)
sent_df.to_excel("素材库/句子库/leader_1_sent.xlsx", engine="xlsxwriter", index=False)
return None
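# pro_sucai assumes every request succeeds and returns valid JSON. A minimal defensive
# variant for a single record; post_one is a hypothetical helper, and the URL and
# payload shape are the ones already used above:
def post_one(payload):
    try:
        resp = requests.post("http://114.116.49.86:4001/build_pro",
                             headers={'Content-Type': 'application/json'},
                             data=payload, timeout=30)
        resp.raise_for_status()
        return resp.json().get("resultData")
    except (requests.RequestException, ValueError):
        # Skip records the service cannot process instead of crashing the whole batch
        return None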
if __name__ == "__main__":
pro_sucai(data_df)
# 去除中间包含指定字符所在的段落
# text = """ /uploadimg/2021/02/02/1612229563745911./播放器前贴图片/。isLeftBottom:。"false",。/播放按钮是否在播放器左下角,为true表示是,false表示播放按钮在播放器中间/。isAudio:。"false",。/是否是音频播放器,为true表示是音频,false表示是视频/。isVod4k:。"false",。/是否为4k播放器,true是4k,false不是/。isHttps:。"true",。/是否https视频,true是,false不是/。wmode:。"opaque",。/flash播放器的窗口模式,默认为opaque/。wideMode:。"normal",。/flash播放器的窗口模式,默认为opaque/。listMode:。"false",。/点播播放器初始化参数:是否列表模式,默认false,false时不显示下一集按钮,不发送新增的下一集事件,设置中没有“自动播放下一集”选项;字符串类型/。nextTitle:。"",。/下一集标题,与listMode。配对使用/。nextThumbnail:。"",。/下一集预览图URL,与listMode。配对使用/。setupOn:。"true",。/是否显示设置按钮,默认为false/。hasBarrage:。"false",。/是否有弹幕功能,默认false,false时不显示弹幕、不显示弹幕设置按钮、不显示弹幕开关、不访问弹幕接口和表情包配置接口e/。barrageApp:。"",。/弹幕数据获取接口,固定或者初始化时传入/。playerType。"vod_h5",。/播放器类型,vod表示普通播放器/。drm:"true",。webFullScreenOn:。"false",。/是否显示网页全屏按钮,默认true表示显示/。language:。"",。/语言,默认中文,en表示英语/。other:。""/其它参数/。createVodPlayer(playerParas);。var。v_span1。"";。var。v_video="";。var。v_div。"";。var。aa="";。$(document).ready(function(){。function。getvideo(){。if($("video").html()==null){。clearInterval(aa);。setInterval(getvideo,100);。}else{。clearInterval(aa);。setVideo();。if(agent){。v_span1=$("#myFlash");。if(v_span1.length>0){。getvideo();。});。function。setVideo(){。v_video。$("video");。var。document.body.clientWidth||。window.innerWidth;。var。300。var。setTimeout(function(){。v_video.css({"display":"block","width":_w+"px","height":_h+"px","opacity":"1","background-color":"#000000"});。v_span1.css({"display":"block","width":_w+"px","height":_h+"px","opacity":"1","margin-bottom":"10px"});。},200)。function。changeVideo(){。if(agent){。if(v_span1.length>0){。setVideo();。window.onresize=function(){。changeVideo();。function。videoChange(){。$('#playbtn_img').css({"left":((document.documentElement.clientWidth-20)-70)/2+"px","top":(300-70)/2+"px"});。$('#video_content_is_loading').css({"left":((document.documentElement.clientWidth-20)-120)/2+"px","top":(300-120)/2+"px"});。$(window).resize(function(){。videoChange();。function。isAppendSpace(i){。console.log(i);。$('#playbtn_img').length>0?。videoChange()。"";。i--。i>=0。$('#playbtn_img').length。setTimeout(function(){isAppendSpace(i)},500)。"";。isAppendSpace(5);。中国中化控股有限责任公司成立大会5月8日在京举行。中共中央政治局常委、国务院总理李克强作出重要批示。批示指出:中国中化控股有限责任公司的重组组建,对优化国有经济结构和布局、助力我国农业现代化、增强化工行业市场竞争力具有重要意义。要坚持以习近平新时代中国特色社会主义思想为指导,认真贯彻党中央、国务院决策部署,扎实做好重组整合、深化改革等工作,加强国际合作,充分发挥行业龙头企业作用,加大种源、化工等领域关键技术攻关力度,创新管理和运营模式,不断提升经营质量效益和综合竞争力,为保持产业链供应链稳定、促进经济社会持续健康发展作出新贡献!。国务委员王勇出席成立大会并讲话。他强调,要深入贯彻习近平总书记重要指示精神,落实李克强总理批示要求,按照党中央、国务院决策部署,扎实做好中国中化控股有限责任公司重组组建工作,加快促进我国农业和化工产业高质量发展,在立足新发展阶段、贯彻新发展理念、构建新发展格局中发挥更大作用。
#
# 王勇指出,重组组建中国中化控股有限责任公司,是推进国有经济布局优化和结构调整、做强做优做大国有资本和国有企业的重大举措。要牢记使命责任,聚焦主责主业。加强资源要素整合融合,加快发展种业、现代农业、综合性化工等产业,强化关键核心技术攻关,着力打造原创技术“策源地”和现代产业链“链长”,保障产业链供应链安全稳定。坚持深化改革开放,持续健全市场化运营体制机制,守好安全生产底线红线,推进绿色低碳科技研发应用,为实现碳达峰碳中和目标贡献力量。把坚持党的领导加强党的建设融入公司治理,凝聚各方面工作合力,努力开创公司改革发展新局面。
#
# """
# print(clean_html_tag(text))
# import re
#
# list_content = text.split('\n')
# for content in list_content:
# new_content = re.sub(r".*(function。|html|background).*", '', content)
# print(new_content)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : search_by_dot_matrix.py
# @Time : 2022/6/21 15:44
# @Author : bruxelles_li
# @Software: PyCharm
import pandas as pd
# text = """  巧用 5 日线把握强龙买卖机会——一个完整的交易,是买入持有卖出,所以今天我们重点来看持有和卖出节奏——今天分享一个大家都在用,简单有效,可以很好把握翻倍强龙机会的买卖技巧——5 日均线交易法。\n\n  该方法核心是 28 字真言:5 日均线分强弱,——不破 5 日继续做;——跌破 5 日减半仓,——三日不回全抛光。\n\n  我们一个个来看,首先是5 日均线分强弱——简单的说,判断一只个股是否强势,核心看 5 日均线。这条线代表主力短线意图,——如果主力进攻意愿强,5 日线通常是快速上涨;而主力如果有撤退迹象,5 日线——往往也首先拐头向下,正因为这种技术特性,5 日线是我们判断个股买卖时机的——好方法。\n\n  持有:不破 5 日继续做——强势股波动巨大,这意味着大机会,同时也包含风险。那通过弱转强信号进场后,——可以持有多久呢?——这就要看 5 日线,如果行情回踩 5 日线时,没有效跌破,那就可以继续持有。\n\n  有效跌破是指在收盘时,股价依然在 5 日线下方。如果盘中跌破 5 日线,临近——收盘时,股价重回 5 日线上方,则不算有效跌破。\n\n  还有一点,一般考虑买入时,5 日线应当是上涨状态,如果是下跌状态,则不适——用。\n\n  基建强龙浙江建投完美的诠释不破 5 日继续做的方针,在股价处在上涨通道——内,踩着 5 日线主升,但从不破 5 日线,可以长期持有。\n\n  减仓时机:跌破 5 日减半仓——前面说了不破 5 日线是买入时机,那要是有效跌破 5 日线呢?这意味风险来临——我们要适当减半仓,锁定利润了。\n\n  很多朋友会有赚钱了,舍不得卖的心理。这里我们要理性,不能光看机会,不看——风险。\n\n  如果行情跌破 5 日线,第二天很快反抽回来,我们可以灵活把减掉的仓位加回来。\n\n  下面是近期的新冠概念强势股雅本化学,可以很直观的感受到以 5 日线为基——准,灵活加减仓是多么重要。\n\n  清仓时机:三日不回全抛光——这里就很直白了,如果行情连续三个交易日在 5 日线下方,那就应该严格清仓。\n\n  不要有侥幸心理,就算偶尔行情会回来,可一旦遭遇大跌,就会非常受伤。我们——追求的应该是用科学的方法实现小亏大赚,而不是赌运气。\n\n  比如下面这只亚太药业,曾经风靡一时的幽门螺旋杆菌概念龙头,不少朋友可能——会追涨杀入,但是最后行情连续整理,连续三天都在 5 日线下方,这个时候就应——该清仓;哪怕轻套,也该认错,否者马上就差点跌停,那轻套就变成深套了。\n\n  以上就是5 日均线交易法的全部内容,这个方法就是特别简单,而且实用(买——卖都能用),我们花点时间掌握,对于强龙机会的把握有很大助益。
# """
# str = "757|806"
# prefix_index = int(str.split("|")[0])
# suffix_index = int(str.split("|")[1])
# print(text[prefix_index:suffix_index+1])
# todo: 定义句子库内容
sent_df = pd.read_excel('测试文件/句子库测试样例.xlsx', keep_default_na=False).astype(str)
_id2sentcont = {row['id']: row['content'] for idx6, row in sent_df.iterrows()}
_id2sent_articleid = {row['id']: row['article_id'] for idx7, row in sent_df.iterrows()}
_id2sent_paraid = {row['id']: row['paragraph_id'] for idx8, row in sent_df.iterrows()}
_id2sent_paraindex = {row['id']: row['sent_para_index'] for idx9, row in sent_df.iterrows()}
_id2sent_articleindex = {row['id']: row['sent_article_index'] for idx10, row in sent_df.iterrows()}
_id2sent_topic_type = {row['id']: row['topic_type'] for idx11, row in sent_df.iterrows()}
_id2sent_content_type_name = {row['id']: row['content_type_name'] for idx12, row in sent_df.iterrows()}
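# The seven comprehensions above each iterate the whole DataFrame. A minimal sketch
# that builds all id -> field lookups in a single structure (field names assumed from
# the code above; lookup_by_id is hypothetical, for illustration only):
fields = ["content", "article_id", "paragraph_id", "sent_para_index",
          "sent_article_index", "topic_type", "content_type_name"]
lookup_by_id = {f: dict(zip(sent_df["id"], sent_df[f])) for f in fields}
# e.g. lookup_by_id["content"][some_id] plays the role of _id2sentcont[some_id]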
article_id = '1670829370466076'
sentence_id = "18"
a = sent_df.loc[(sent_df['article_id'] == article_id) & (sent_df['sentence_id'] == str(int(sentence_id)-1))]
if a.empty:
pre_sent = ""
else:
    # Convert the single matching row to a record dict and take its content
    pre_sent = a.to_dict(orient="records")[0]["content"]
b = sent_df.loc[(sent_df["article_id"] == article_id) & (sent_df["sentence_id"] == str(int(sentence_id)+1))]
if b.empty:
suf_sent = ""
else:
    # Same conversion for the following sentence
    suf_sent = b.to_dict(orient="records")[0]["content"]
print(pre_sent, suf_sent)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 缓存处理.py
# @Time : 2022/12/13 14:26
# @Author : bruxelles_li
# @Software: PyCharm
"""
基于内存缓存
使用 memory_cache 实例即可
"""
import time
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s')
logger = logging.getLogger(__name__)
class Value:
def __init__(self, value, put_time, expired):
"""
缓存值对象
:param value: 具体的值
:param put_time: 放入缓存的时间
:param expired: 缓存失效时间
"""
self.value = value
self.put_time = put_time
self.expired = expired
def __str__(self):
return f"value: {self.value} put_time: {self.put_time} expired: {self.expired}"
class MemoryCache:
def __init__(self):
self.__cache = {}
def set_value(self, k, v, expired):
"""
将值放入缓存中
:param k: 缓存的 key
:param v: 缓存值
:param expired: 缓存失效时间,单位秒(s)
"""
current_timestamp = int(time.time()) # 获取当前时间戳 10 位 秒级
value = Value(v, current_timestamp, expired)
self.__cache[k] = value
logger.info("已放入缓存, {}: {}".format(k, value))
def check_key(self, k):
"""
检查缓存是否可用
:param k: 缓存 key
:return: True or False
"""
current_timestamp = int(time.time())
value = self.__cache.get(k, None)
# 考虑k不存在的情况
if value is None:
return False
differ = current_timestamp - value.put_time
if differ > value.expired:
del self.__cache[k] # 证明缓存失效了,删除键值对
logger.info("缓存已失效, key: {}".format(k))
return False
return True
def get_value(self, k):
"""
通过缓存key获取值
:param k: key
:return: value
"""
if self.check_key(k):
return self.__cache[k].value
return None
memory_cache = MemoryCache()
if __name__ == "__main__":
import json, os
# memory_cache.set_value('my_blog', 'sunnyc.icu', 3) # 设置一个 3 秒过期的键值对
# memory_cache.set_value('my_github', 'hczs', 20) # 设置一个 6 秒过期的键值对
dic_result = {
"my_blog": 'sunnyc.icu',
"my_github": "hczs"
}
json_result = json.dumps(dic_result)
path = "测试文件/test.json"
with open(path, 'w', encoding='utf-8') as file:
file.write(json_result)
# os.remove(path)
if os.path.isfile(path):
with open("测试文件/test.json", 'r', encoding='utf-8') as f:
dict_result = json.load(f)
print(dict_result)
else:
print("no")
# time.sleep(5)
# a = "my_blog"
# if a == "my_blog":
# print(memory_cache.get_value(a))
# else:
# print('my_github: ', memory_cache.get_value('my_github'))
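# A minimal usage sketch of MemoryCache, mirroring the commented-out example above:
# values expire `expired` seconds after set_value, after which get_value returns None.
if __name__ == "__main__":
    memory_cache.set_value("api_result", {"status": "ok"}, expired=3)
    print(memory_cache.get_value("api_result"))   # within 3 s -> {'status': 'ok'}
    time.sleep(4)
    print(memory_cache.get_value("api_result"))   # expired -> None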