import json
from pathlib import Path
from typing import AnyStr, Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

from dev_opsgpt.utils.common_utils import read_json_file


class JSONLoader(BaseLoader):
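    """Load a JSON file into ``Document`` objects.

    Each top-level record in the file becomes one ``Document`` whose text is
    taken from ``schema_key`` (``"all_text"`` by default).
    """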

    def __init__(
        self,
        file_path: Union[str, Path],
        schema_key: str = "all_text",
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
    ):
        """
        Args:
            file_path: Path to the JSON file to load.
            schema_key: Key holding the text of each record. Defaults to "all_text".
            content_key: Stored but not used by the current parsing logic.
            metadata_func: Stored but not used by the current parsing logic.
            text_content: Stored but not used by the current parsing logic.
        """
        self.file_path = Path(file_path).resolve()
        self.schema_key = schema_key
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file."""
        docs: List[Document] = []
        datas = read_json_file(self.file_path)
        self._parse(datas, docs)
        return docs

    def _parse(self, datas: List, docs: List[Document]) -> None:
        # Each top-level record becomes one Document; its text is read from
        # self.schema_key and its position is kept as seq_num metadata.
        for idx, sample in enumerate(datas):
            metadata = dict(
                source=str(self.file_path),
                seq_num=idx,
            )
            text = sample.get(self.schema_key, "")
            docs.append(Document(page_content=text, metadata=metadata))

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
        """Load Documents and split into chunks. Chunks are returned as Documents.

        Args:
            text_splitter: TextSplitter instance to use for splitting documents.
                Defaults to RecursiveCharacterTextSplitter.

        Returns:
            List of Documents.
        """
        if text_splitter is None:
            _text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
        else:
            _text_splitter = text_splitter

        docs = self.load()
        return _text_splitter.split_documents(docs)
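

# A minimal usage sketch, kept out of the loader itself. The file name below is
# an illustrative assumption; read_json_file is expected to return a list of
# dicts, each holding its text under the "all_text" key.
if __name__ == "__main__":
    loader = JSONLoader("example_records.json")  # hypothetical sample file
    # load_and_split falls back to RecursiveCharacterTextSplitter by default.
    for doc in loader.load_and_split()[:3]:
        print(doc.metadata["seq_num"], doc.page_content[:80])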