from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

from dev_opsgpt.utils.common_utils import read_jsonl_file

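# Expected input: one JSON object per line, read into dicts by
# `read_jsonl_file` (inferred from its usage below, not from its source).
# The text to load is taken from `schema_key`; any other fields shown here
# are illustrative:
#
#   {"all_text": "first document text"}
#   {"all_text": "second document text", "extra": "ignored"}
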
class JSONLLoader(BaseLoader):
    """Load each record of a JSONL file as a LangChain ``Document``."""

    def __init__(
        self,
        file_path: Union[str, Path],
        schema_key: str = "all_text",
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
    ):
        self.file_path = Path(file_path).resolve()
        self.schema_key = schema_key
        # The remaining arguments are stored but not used by this
        # implementation; they appear to mirror LangChain's ``JSONLoader``
        # interface.
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content

    def load(self) -> List[Document]:
        """Load and return documents from the JSONL file."""
        docs: List[Document] = []
        datas = read_jsonl_file(self.file_path)
        self._parse(datas, docs)
        return docs

    def _parse(self, datas: List, docs: List[Document]) -> None:
        """Turn raw records into ``Document``s, tagging each with its
        source path and position in the file."""
        for idx, sample in enumerate(datas):
            metadata = dict(
                source=str(self.file_path),
                seq_num=idx,
            )
            text = sample.get(self.schema_key, "")
            docs.append(Document(page_content=text, metadata=metadata))

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
        """Load Documents and split into chunks. Chunks are returned as Documents.

        Args:
            text_splitter: TextSplitter instance to use for splitting documents.
                Defaults to RecursiveCharacterTextSplitter.

        Returns:
            List of Documents.
        """
        if text_splitter is None:
            _text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
        else:
            _text_splitter = text_splitter

        docs = self.load()
        return _text_splitter.split_documents(docs)
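
# Usage sketch (hypothetical path, assuming each record carries the default
# "all_text" field):
#
#   loader = JSONLLoader("data/sample.jsonl")
#   docs = loader.load()              # one Document per JSONL record
#   chunks = loader.load_and_split()  # defaults to RecursiveCharacterTextSplitter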