codefuse-chatbot/dev_opsgpt/document_loaders/jsonl_loader.py

import json
from pathlib import Path
from typing import AnyStr, Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

from dev_opsgpt.utils.common_utils import read_jsonl_file


class JSONLLoader(BaseLoader):
    """Load a JSON Lines file into one ``Document`` per record.

    Records are read with ``read_jsonl_file``. For every record, the value
    stored under ``schema_key`` becomes the document's ``page_content``, and
    the metadata carries the resolved file path plus the record's position.
    """

    def __init__(
            self,
            file_path: Union[str, Path],
            schema_key: str = "all_text",
            content_key: Optional[str] = None,
            metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
            text_content: bool = True,
    ):
        """Configure the loader.

        Args:
            file_path: Path of the JSONL file; stored as a resolved absolute
                path.
            schema_key: Key looked up in each record to obtain the document
                text.
            content_key: Stored on the instance; not consulted by this class.
            metadata_func: Optional hook called as
                ``metadata_func(sample, metadata)`` for every record. Its
                return value replaces the default metadata of that document.
            text_content: Stored on the instance; not consulted by this class.
        """
        self.file_path = Path(file_path).resolve()
        self.schema_key = schema_key
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content

    def load(self) -> List[Document]:
        """Load and return one document per record of the JSONL file."""
        docs: List[Document] = []
        datas = read_jsonl_file(self.file_path)
        self._parse(datas, docs)
        return docs

    def _parse(self, datas: List, docs: List[Document]) -> None:
        """Append one ``Document`` per record in ``datas`` to ``docs``.

        Args:
            datas: Parsed records; each one is expected to offer ``.get``
                (presumably a dict per JSONL line -- confirm against
                ``read_jsonl_file``).
            docs: Output list, extended in place.
        """
        # The source path is identical for every record, so convert it once.
        source = str(self.file_path)
        for idx, sample in enumerate(datas):
            # seq_num is the zero-based position of the record in ``datas``.
            metadata = dict(
                source=source,
                seq_num=idx,
            )
            # Honour the caller-supplied hook, which was previously accepted
            # by the constructor but never invoked.
            if self._metadata_func is not None:
                metadata = self._metadata_func(sample, metadata)
            # A record without ``schema_key`` yields an empty document.
            text = sample.get(self.schema_key, "")
            docs.append(Document(page_content=text, metadata=metadata))
upload 2023-09-28 10:58:58 +08:00			`import json`
			`from pathlib import Path`
			`from typing import AnyStr, Callable, Dict, List, Optional, Union`

			`from langchain.docstore.document import Document`
			`from langchain.document_loaders.base import BaseLoader`

			`from dev_opsgpt.utils.common_utils import read_jsonl_file`


			`class JSONLLoader(BaseLoader):`

			`def __init__(`
			`self,`
			`file_path: Union[str, Path],`
			`schema_key: str = "all_text",`
			`content_key: Optional[str] = None,`
			`metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,`
			`text_content: bool = True,`
			`):`
			`self.file_path = Path(file_path).resolve()`
			`self.schema_key = schema_key`
			`self._content_key = content_key`
			`self._metadata_func = metadata_func`
			`self._text_content = text_content`

			`def load(self, ) -> List[Document]:`
			`"""Load and return documents from the JSON file."""`
			`docs: List[Document] = []`
			`datas = read_jsonl_file(self.file_path)`
			`self._parse(datas, docs)`
			`return docs`

			`def _parse(self, datas: List, docs: List[Document]) -> None:`
			`for idx, sample in enumerate(datas):`
			`metadata = dict(`
			`source=str(self.file_path),`
			`seq_num=idx,`
			`)`
			`text = sample.get(self.schema_key, "")`
			`docs.append(Document(page_content=text, metadata=metadata))`