import json
from pathlib import Path
from typing import AnyStr, Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

from dev_opsgpt.utils.common_utils import read_jsonl_file


class JSONLLoader(BaseLoader):
    """Load documents from a JSONL file, producing one Document per record."""

    def __init__(
        self,
        file_path: Union[str, Path],
        schema_key: str = "all_text",
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
    ):
        self.file_path = Path(file_path).resolve()
        self.schema_key = schema_key
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content

    def load(self) -> List[Document]:
        """Load and return documents from the JSONL file."""
        docs: List[Document] = []
        datas = read_jsonl_file(self.file_path)
        self._parse(datas, docs)
        return docs

    def _parse(self, datas: List, docs: List[Document]) -> None:
        for idx, sample in enumerate(datas):
            # Record the source file and the record's position so each
            # Document can be traced back to its originating line.
            metadata = dict(
                source=str(self.file_path),
                seq_num=idx,
            )
            text = sample.get(self.schema_key, "")
            docs.append(Document(page_content=text, metadata=metadata))
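
# Illustrative usage sketch (not part of the original module): shows how
# JSONLLoader might be invoked on a JSONL file whose records carry an
# "all_text" field, the default schema_key. The path "data/sample.jsonl"
# is a hypothetical placeholder.
if __name__ == "__main__":
    sample_path = Path("data/sample.jsonl")
    if sample_path.exists():
        loader = JSONLLoader(sample_path)
        documents = loader.load()
        for doc in documents:
            # Each Document keeps the source path and record index in its metadata.
            print(doc.metadata["seq_num"], doc.page_content[:80])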