2023-09-28 10:58:58 +08:00
|
|
|
import os
|
|
|
|
import importlib
|
|
|
|
from loguru import logger
|
|
|
|
|
|
|
|
from langchain.document_loaders.base import BaseLoader
|
|
|
|
from langchain.text_splitter import (
|
|
|
|
SpacyTextSplitter, RecursiveCharacterTextSplitter
|
|
|
|
)
|
|
|
|
|
2024-01-26 14:03:25 +08:00
|
|
|
# from configs.model_config import (
|
|
|
|
# CHUNK_SIZE,
|
|
|
|
# OVERLAP_SIZE,
|
|
|
|
# ZH_TITLE_ENHANCE
|
|
|
|
# )
|
|
|
|
from coagent.utils.path_utils import *
|
2023-09-28 10:58:58 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LCTextSplitter:
    """Turn a file into text chunks with a langchain loader and text splitter.

    The document loader is selected from the file extension and the text
    splitter from ``text_splitter_name``. ``SUPPORTED_EXTS``,
    ``get_LoaderClass`` and ``EXT2LOADER_DICT`` are not defined in this
    module; presumably they come from the star import of
    ``coagent.utils.path_utils`` -- TODO confirm.
    """

    def __init__(
        self, filepath: str, text_splitter_name: str = None,
        chunk_size: int = 500,
        overlap_size: int = 50
    ):
        """Record the split settings and resolve the loader for ``filepath``.

        Args:
            filepath: Path of the file to load. Its lower-cased extension
                selects the document loader.
            text_splitter_name: Name of a class in ``langchain.text_splitter``.
                ``None`` selects ``SpacyTextSplitter`` with the
                ``zh_core_web_sm`` pipeline.
            chunk_size: Passed to the splitter as ``chunk_size``.
            overlap_size: Passed to the splitter as ``chunk_overlap``.

        Raises:
            ValueError: If the extension is not in ``SUPPORTED_EXTS``.
        """
        self.filepath = filepath
        # Lower-case the extension so "A.TXT" and "a.txt" are treated alike.
        self.ext = os.path.splitext(filepath)[-1].lower()
        self.text_splitter_name = text_splitter_name
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size
        if self.ext not in SUPPORTED_EXTS:
            raise ValueError(f"暂未支持的文件格式 {self.ext}")
        self.document_loader_name = get_LoaderClass(self.ext)

    def file2text(self, ):
        """Load the file and split it into chunks.

        Returns:
            Whatever ``loader.load_and_split(text_splitter)`` returns
            (a list of langchain ``Document`` objects per the langchain
            ``BaseLoader`` API).
        """
        loader = self._load_document()
        text_splitter = self._load_text_splitter()
        # JSON/JSONL loaders used to have their own branch here, but it made
        # exactly the same call, so every loader shares this single path.
        return loader.load_and_split(text_splitter)

    def _load_document(self, ) -> BaseLoader:
        """Instantiate the document loader registered for ``self.ext``.

        Returns:
            A loader built from ``EXT2LOADER_DICT[self.ext]`` for
            ``self.filepath``.
        """
        DocumentLoader = EXT2LOADER_DICT[self.ext]
        if self.document_loader_name == "UnstructuredFileLoader":
            # Only this loader is given the encoding-autodetect flag.
            return DocumentLoader(self.filepath, autodetect_encoding=True)
        return DocumentLoader(self.filepath)

    def _load_text_splitter(self, ):
        """Build the configured text splitter, with a logged fallback.

        When ``self.text_splitter_name`` is ``None`` a ``SpacyTextSplitter``
        is created and, on success, the name is recorded on the instance.
        Otherwise the named class is looked up in ``langchain.text_splitter``.
        If either path raises, the failure is logged and a
        ``RecursiveCharacterTextSplitter`` with the same chunk settings is
        returned instead.

        Returns:
            The text splitter instance to pass to ``load_and_split``.
        """
        try:
            if self.text_splitter_name is None:
                text_splitter = SpacyTextSplitter(
                    pipeline="zh_core_web_sm",
                    chunk_size=self.chunk_size,
                    chunk_overlap=self.overlap_size,
                )
                self.text_splitter_name = "SpacyTextSplitter"
            else:
                text_splitter_module = importlib.import_module('langchain.text_splitter')
                TextSplitter = getattr(text_splitter_module, self.text_splitter_name)
                text_splitter = TextSplitter(
                    chunk_size=self.chunk_size,
                    chunk_overlap=self.overlap_size,
                )
        except Exception as e:
            # Deliberate best-effort: keep going with a generic splitter, but
            # report why the requested one could not be built instead of
            # swallowing the error silently.
            logger.warning(
                f"failed to build text splitter {self.text_splitter_name!r} "
                f"({type(e).__name__}: {e}); "
                "falling back to RecursiveCharacterTextSplitter"
            )
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.overlap_size,
            )
        return text_splitter
|