# codefuse-chatbot/dev_opsgpt/document_loaders/json_loader.py

import json
from pathlib import Path
from typing import AnyStr, Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

from dev_opsgpt.utils.common_utils import read_json_file


class JSONLoader(BaseLoader):
    """Turn the records of a JSON file into ``Document`` objects.

    The data returned by ``read_json_file`` is expected to be a list of JSON
    objects; a single top-level object is treated as a one-record list.  For
    every record, the value stored under ``schema_key`` becomes the document's
    ``page_content`` and the metadata holds the resolved file path
    (``source``) plus the record's position in the list (``seq_num``).
    """

    def __init__(
            self,
            file_path: Union[str, Path],
            schema_key: str = "all_text",
            content_key: Optional[str] = None,
            metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
            text_content: bool = True,
    ):
        """Store the loader configuration.

        Args:
            file_path: Location of the JSON file; resolved to an absolute
                path.
            schema_key: Key looked up in each record to obtain its text.
            content_key: Stored as ``_content_key``; not read anywhere in
                this class.
            metadata_func: Stored as ``_metadata_func``; not read anywhere in
                this class.
            text_content: Stored as ``_text_content``; not read anywhere in
                this class.

        NOTE(review): the last three arguments look like they were carried
        over for signature compatibility with LangChain's own JSON loader --
        confirm before relying on them.
        """
        self.file_path = Path(file_path).resolve()
        self.schema_key = schema_key
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file."""
        docs: List[Document] = []
        datas = read_json_file(self.file_path)
        self._parse(datas, docs)
        return docs

    def _parse(self, datas: Union[List, Dict], docs: List[Document]) -> None:
        """Append one ``Document`` per record in ``datas`` to ``docs``.

        Args:
            datas: The records to convert.  Normally a list of dicts; a single
                dict is handled as a list containing just that dict.
            docs: Output list, extended in place.

        A record that lacks ``schema_key``, or that maps it to ``None``
        (JSON ``null``), yields a document with empty ``page_content``.
        """
        # Iterating a dict yields its keys (strings), on which ``.get`` would
        # raise AttributeError; treat a lone top-level object as one record.
        if isinstance(datas, dict):
            datas = [datas]

        source = str(self.file_path)
        for idx, sample in enumerate(datas):
            text = sample.get(self.schema_key, "")
            if text is None:
                # An explicit ``null`` is handled like a missing key rather
                # than being handed to ``Document`` as a non-string value.
                text = ""
            metadata = {"source": source, "seq_num": idx}
            docs.append(Document(page_content=text, metadata=metadata))