2023-09-28 10:58:58 +08:00
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
from typing import List
|
|
|
|
import os
|
|
|
|
|
|
|
|
from langchain.embeddings.base import Embeddings
|
|
|
|
from langchain.docstore.document import Document
|
|
|
|
|
2024-01-26 14:03:25 +08:00
|
|
|
# from configs.model_config import (
|
|
|
|
# kbs_config, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
|
|
|
|
# EMBEDDING_MODEL, EMBEDDING_DEVICE, KB_ROOT_PATH
|
|
|
|
# )
|
|
|
|
# from configs.model_config import embedding_model_dict
|
|
|
|
from coagent.base_configs.env_config import (
|
|
|
|
VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, kbs_config
|
2023-09-28 10:58:58 +08:00
|
|
|
)
|
2024-01-26 14:03:25 +08:00
|
|
|
from coagent.orm.commands import *
|
|
|
|
from coagent.utils.path_utils import *
|
|
|
|
from coagent.orm.utils import DocumentFile
|
|
|
|
from coagent.embeddings.utils import load_embeddings, load_embeddings_from_path
|
2024-03-12 15:31:06 +08:00
|
|
|
from coagent.retrieval.text_splitter import LCTextSplitter
|
2024-01-26 14:03:25 +08:00
|
|
|
from coagent.llm_models.llm_config import EmbedConfig
|
2023-09-28 10:58:58 +08:00
|
|
|
|
|
|
|
|
|
|
|
class SupportedVSType:
|
|
|
|
FAISS = 'faiss'
|
|
|
|
# MILVUS = 'milvus'
|
|
|
|
# DEFAULT = 'default'
|
|
|
|
# PG = 'pg'
|
|
|
|
|
|
|
|
|
|
|
|
class KBService(ABC):
|
|
|
|
|
|
|
|
def __init__(self,
|
|
|
|
knowledge_base_name: str,
|
2024-01-26 14:03:25 +08:00
|
|
|
embed_config: EmbedConfig,
|
|
|
|
# embed_model: str = "text2vec-base-chinese",
|
|
|
|
kb_root_path: str,
|
2023-09-28 10:58:58 +08:00
|
|
|
):
|
|
|
|
self.kb_name = knowledge_base_name
|
2024-01-26 14:03:25 +08:00
|
|
|
# self.embed_model = embed_model
|
|
|
|
self.embed_config = embed_config
|
|
|
|
self.kb_root_path = kb_root_path
|
|
|
|
self.kb_path = get_kb_path(self.kb_name, kb_root_path)
|
|
|
|
self.doc_path = get_doc_path(self.kb_name, kb_root_path)
|
2023-09-28 10:58:58 +08:00
|
|
|
self.do_init()
|
|
|
|
|
2024-01-26 14:03:25 +08:00
|
|
|
def _load_embeddings(self) -> Embeddings:
|
|
|
|
# return load_embeddings(self.embed_model, embed_device, embedding_model_dict)
|
2024-03-12 15:31:06 +08:00
|
|
|
return load_embeddings_from_path(self.embed_config.embed_model_path, self.embed_config.model_device, self.embed_config.langchain_embeddings)
|
2023-09-28 10:58:58 +08:00
|
|
|
|
|
|
|
def create_kb(self):
|
|
|
|
"""
|
|
|
|
创建知识库
|
|
|
|
"""
|
|
|
|
if not os.path.exists(self.doc_path):
|
|
|
|
os.makedirs(self.doc_path)
|
|
|
|
self.do_create_kb()
|
2024-01-26 14:03:25 +08:00
|
|
|
status = add_kb_to_db(self.kb_name, self.vs_type(), self.embed_config.embed_model)
|
2023-09-28 10:58:58 +08:00
|
|
|
return status
|
|
|
|
|
|
|
|
def clear_vs(self):
|
|
|
|
"""
|
|
|
|
删除向量库中所有内容
|
|
|
|
"""
|
|
|
|
self.do_clear_vs()
|
|
|
|
status = delete_files_from_db(self.kb_name)
|
|
|
|
return status
|
|
|
|
|
|
|
|
def drop_kb(self):
|
|
|
|
"""
|
|
|
|
删除知识库
|
|
|
|
"""
|
|
|
|
self.do_drop_kb()
|
|
|
|
status = delete_kb_from_db(self.kb_name)
|
|
|
|
return status
|
|
|
|
|
|
|
|
def add_doc(self, kb_file: DocumentFile, **kwargs):
|
|
|
|
"""
|
|
|
|
向知识库添加文件
|
|
|
|
"""
|
|
|
|
lctTextSplitter = LCTextSplitter(kb_file.filepath)
|
|
|
|
docs = lctTextSplitter.file2text()
|
|
|
|
if docs:
|
|
|
|
self.delete_doc(kb_file)
|
|
|
|
embeddings = self._load_embeddings()
|
|
|
|
self.do_add_doc(docs, embeddings, **kwargs)
|
|
|
|
status = add_doc_to_db(kb_file)
|
|
|
|
else:
|
|
|
|
status = False
|
|
|
|
return status
|
|
|
|
|
|
|
|
def delete_doc(self, kb_file: DocumentFile, delete_content: bool = False, **kwargs):
|
|
|
|
"""
|
|
|
|
从知识库删除文件
|
|
|
|
"""
|
|
|
|
self.do_delete_doc(kb_file, **kwargs)
|
|
|
|
status = delete_file_from_db(kb_file)
|
|
|
|
if delete_content and os.path.exists(kb_file.filepath):
|
|
|
|
os.remove(kb_file.filepath)
|
|
|
|
return status
|
|
|
|
|
|
|
|
def update_doc(self, kb_file: DocumentFile, **kwargs):
|
|
|
|
"""
|
|
|
|
使用content中的文件更新向量库
|
|
|
|
"""
|
|
|
|
if os.path.exists(kb_file.filepath):
|
|
|
|
self.delete_doc(kb_file, **kwargs)
|
|
|
|
return self.add_doc(kb_file, **kwargs)
|
|
|
|
|
|
|
|
def exist_doc(self, file_name: str):
|
|
|
|
return doc_exists(DocumentFile(knowledge_base_name=self.kb_name,
|
2024-01-26 14:03:25 +08:00
|
|
|
filename=file_name, kb_root_path=self.kb_root_path))
|
2023-09-28 10:58:58 +08:00
|
|
|
|
|
|
|
def list_docs(self):
|
|
|
|
return list_docs_from_db(self.kb_name)
|
|
|
|
|
|
|
|
def search_docs(self,
|
|
|
|
query: str,
|
|
|
|
top_k: int = VECTOR_SEARCH_TOP_K,
|
|
|
|
score_threshold: float = SCORE_THRESHOLD,
|
|
|
|
):
|
|
|
|
embeddings = self._load_embeddings()
|
|
|
|
docs = self.do_search(query, top_k, score_threshold, embeddings)
|
|
|
|
return docs
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
def do_create_kb(self):
|
|
|
|
"""
|
|
|
|
创建知识库子类实自己逻辑
|
|
|
|
"""
|
|
|
|
pass
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
def list_kbs_type():
|
|
|
|
return list(kbs_config.keys())
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def list_kbs(cls):
|
|
|
|
return list_kbs_from_db()
|
|
|
|
|
|
|
|
def exists(self, kb_name: str = None):
|
|
|
|
kb_name = kb_name or self.kb_name
|
|
|
|
return kb_exists(kb_name)
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
def vs_type(self) -> str:
|
|
|
|
pass
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
def do_init(self):
|
|
|
|
pass
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
def do_drop_kb(self):
|
|
|
|
"""
|
|
|
|
删除知识库子类实自己逻辑
|
|
|
|
"""
|
|
|
|
pass
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
def do_search(self,
|
|
|
|
query: str,
|
|
|
|
top_k: int,
|
|
|
|
embeddings: Embeddings,
|
|
|
|
) -> List[Document]:
|
|
|
|
"""
|
|
|
|
搜索知识库子类实自己逻辑
|
|
|
|
"""
|
|
|
|
pass
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
def do_add_doc(self,
|
|
|
|
docs: List[Document],
|
|
|
|
embeddings: Embeddings,
|
|
|
|
):
|
|
|
|
"""
|
|
|
|
向知识库添加文档子类实自己逻辑
|
|
|
|
"""
|
|
|
|
pass
|
|
|
|
|
2024-01-26 14:03:25 +08:00
|
|
|
@abstractmethod
|
|
|
|
def get_all_documents(self,
|
|
|
|
embeddings: Embeddings,
|
|
|
|
):
|
|
|
|
"""
|
|
|
|
获取知识库所有文档
|
|
|
|
"""
|
|
|
|
pass
|
|
|
|
|
2023-09-28 10:58:58 +08:00
|
|
|
@abstractmethod
|
|
|
|
def do_delete_doc(self,
|
|
|
|
kb_file: DocumentFile):
|
|
|
|
"""
|
|
|
|
从知识库删除文档子类实自己逻辑
|
|
|
|
"""
|
|
|
|
pass
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
def do_clear_vs(self):
|
|
|
|
"""
|
|
|
|
从知识库删除全部向量子类实自己逻辑
|
|
|
|
"""
|
|
|
|
pass
|