42 lines
1.4 KiB
Python
42 lines
1.4 KiB
Python
|
import json
|
||
|
from pathlib import Path
|
||
|
from typing import AnyStr, Callable, Dict, List, Optional, Union
|
||
|
|
||
|
from langchain.docstore.document import Document
|
||
|
from langchain.document_loaders.base import BaseLoader
|
||
|
|
||
|
from dev_opsgpt.utils.common_utils import read_jsonl_file
|
||
|
|
||
|
|
||
|
class JSONLLoader(BaseLoader):
    """Load a JSON Lines file into one ``Document`` per record.

    Records are obtained through ``read_jsonl_file``; for each record the
    value stored under ``schema_key`` becomes the document's ``page_content``.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        schema_key: str = "all_text",
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
    ):
        """Store the loader configuration.

        Args:
            file_path: Location of the JSONL file; resolved to an absolute path.
            schema_key: Key looked up in every record to obtain the text.
            content_key: Stored on the instance; not read by this class.
            metadata_func: Optional hook called as
                ``metadata_func(record, default_metadata)``. Its return value
                is used as the document metadata in place of the default.
            text_content: Stored on the instance; not read by this class.
        """
        self.file_path = Path(file_path).resolve()
        self.schema_key = schema_key
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content

    def load(self) -> List[Document]:
        """Read the JSONL file and return one ``Document`` per record."""
        docs: List[Document] = []
        # NOTE(review): read_jsonl_file presumably returns a list of dicts,
        # one per line of the file - confirm against its implementation.
        datas = read_jsonl_file(self.file_path)
        self._parse(datas, docs)
        return docs

    def _parse(self, datas: List, docs: List[Document]) -> None:
        """Append a ``Document`` to ``docs`` for every record in ``datas``.

        The default metadata holds the file path (``source``) and the
        zero-based position of the record (``seq_num``). When a
        ``metadata_func`` was supplied, it receives the record and that
        default metadata, and whatever it returns is attached instead.
        A record without ``schema_key`` yields an empty ``page_content``.
        """
        # The path string is identical for every record, so build it once.
        source = str(self.file_path)
        for idx, sample in enumerate(datas):
            metadata = dict(
                source=source,
                seq_num=idx,
            )
            # Previously the hook was stored but never invoked, so callers'
            # custom metadata was silently dropped.
            if self._metadata_func is not None:
                metadata = self._metadata_func(sample, metadata)
            text = sample.get(self.schema_key, "")
            docs.append(Document(page_content=text, metadata=metadata))
|