Source code for orichain.knowledge_base

from typing import Any, List, Union, Optional, Dict
import warnings

from orichain.knowledge_base import pinecone_knowledgbase, chromadb_knowledgebase
from orichain import error_explainer

# Name of the vector database backend that the knowledge base classes below
# expose as `default_knowledge_base` and fall back to when no type is given.
DEFAULT_KNOWLEDGE_BASE = "pinecone"



[docs]
class KnowledgeBase:
    """
    Blocking (synchronous) facade over the supported vector databases.

    A single calling convention is offered regardless of the backend in use.
    The backends wired in by this class are Pinecone and ChromaDB.
    """

    # Backend name used when the caller does not specify `vector_db_type`.
    default_knowledge_base = DEFAULT_KNOWLEDGE_BASE


[docs]
    def __init__(self, vector_db_type: Optional[str] = None, **kwds: Any) -> None:
        """Initializes the knowledge base.

        Resolves the backend name, instantiates the matching handler class with
        ``**kwds`` and stores it on ``self.retriver``. The resolved backend name
        is stored on ``self.vector_db_type``.

        Args:
            - vector_db_type (str, optional): Type of knowledge base (``"pinecone"`` or ``"chromadb"``).
              A falsy value (``None`` or ``""``) falls back to ``default_knowledge_base``. Default: pinecone
            - **kwds (Any): Forwarded unchanged to the constructor of the selected handler.

        **Authentication parameters by provider:**

            **Pinecone:**
                - api_key (str): Pinecone API key
                - index_name (str): Pinecone index name
                - namespace (str): Pinecone namespace

            **ChromaDB:**
                - collection_name (str): ChromaDB collection name
                - path (str, optional): Path to the ChromaDB database Default: `/home/ubuntu/projects/chromadb`

        Raises:
            - ValueError: If the knowledge base type is not supported
            - KeyError: If the required params is not found (raised by the backend handler,
              which is defined outside this file)

        Note:
            Every exception raised inside the constructor, including the ones listed
            above, is caught and handed to ``error_explainer``. Whether it propagates
            any further depends on that helper, which is defined outside this file.

        Warns:
            - UserWarning: If the knowledge base type is not defined Default: pinecone
        """
        try:
            # Dictionary mapping vector database types to their respective handler classes
            knowledge_base_handler = {
                "pinecone": pinecone_knowledgbase.DataBase,
                "chromadb": chromadb_knowledgebase.DataBase,
            }

            # If no vector_db_type is provided, default to pinecone
            if not vector_db_type:
                # Adjacent f-strings instead of a backslash continuation, which
                # would embed the source indentation in the message.
                warnings.warn(
                    f"\nKnowledge base type not defined hence defaulting to "
                    f"{self.default_knowledge_base}",
                    UserWarning,
                )
                self.vector_db_type = self.default_knowledge_base
            # If vector_db_type is not supported, raise a ValueError
            elif vector_db_type not in knowledge_base_handler:
                # Report the rejected value itself; this class has no `model_name`.
                raise ValueError(
                    f"\nUnsupported knowledge base: {vector_db_type}\nSupported knowledge bases are:"
                    "\n- " + "\n- ".join(knowledge_base_handler)
                )
            else:
                self.vector_db_type = vector_db_type

            # Initialize the knowledge base handler.
            # Index with the *resolved* name: `vector_db_type` may be None here,
            # and the default is a backend name, not a handler class.
            self.retriver = knowledge_base_handler[self.vector_db_type](**kwds)

        except Exception as e:
            error_explainer(e)



[docs]
    def __call__(
        self,
        num_of_chunks: int,
        user_message_vector: Optional[List[Union[int, float]]] = None,
        **kwds: Any,
    ) -> Dict:
        """Query the knowledge base and return the matching chunks.

        The call is delegated to the backend handler stored on ``self.retriver``,
        with ``user_message_vector``, ``num_of_chunks`` and ``**kwds`` passed as
        keyword arguments.

        Args:
            - num_of_chunks (int): Number of chunks to retrieve
            - user_message_vector (Optional[List[Union[int, float]]]): Embedding of the text. Defaults to None.
              May only be omitted (or empty) when the backend is pinecone.

            **Retrieval Arguments by VectorDB:**

                **Pinecone:**
                    - vector (List[float]): The query vector. This should be the same length as the dimension of the index
                      being queried. Each `query()` request can contain only one of the parameters `id` or `vector`. [optional]
                    - id (str): The unique ID of the vector to be used as a query vector.
                      Each `query()` request can contain only one of the parameters `vector` or `id`. [optional]
                    - top_k (int): The number of results to return for each query. Must be an integer greater than 1.
                    - namespace (str): The namespace to fetch vectors from.
                      If not specified, the default namespace is used. [optional]
                    - filter (Dict[str, Union[str, float, int, bool, List, dict]]):
                      The filter to apply. You can use vector metadata to limit your search.
                      See https://www.pinecone.io/docs/metadata-filtering/ [optional]
                    - include_values (bool): Indicates whether vector values are included in the response.
                      If omitted the server will use the default value of False [optional]
                    - include_metadata (bool): Indicates whether metadata is included in the response as well as the ids.
                      If omitted the server will use the default value of False [optional]
                    - sparse_vector: (Union[SparseValues, Dict[str, Union[List[float], List[int]]]]): sparse values of the query vector.
                      Expected to be either a SparseValues object or a dict of the form:
                      ``{'indices': List[int], 'values': List[float]}``, where the lists each have the same length.

                **ChromaDB:**
                    - collection_name (str, optional): The name of the collection to get documents from. Defaults to the collection_name set during class instantiation.
                    - where (Dict, optional): A Where type dict used to filter results by. E.g. ``{$and: [{"color" : "red"}, {"price": 4.20}]}``. Default: None.
                    - where_document (Dict, optional): A WhereDocument type dict used to filter by the documents. E.g. ``{"$contains" : "hello"}``. Default: None.
                    - include (List, optional): A list of what to include in the results.
                      Can contain ``"embeddings"``, ``"metadatas"``, ``"documents"``, ``"distances"``.
                      Ids are always included. Defaults to ``["metadatas", "documents", "distances"]``.
                      Default: ``["metadatas", "documents"]``

        Returns:
            Dict: The handler's result on success. If any exception occurs it is passed to
            ``error_explainer`` and ``{"error": 500, "reason": <exception text>}`` is returned instead.

        Raises:
            The conditions below are raised inside this method and then caught, so they
            reach the caller through the error dict described above:

            - ValueError: If `user_message_vector` is missing and the backend is not pinecone.
              (Per the handler's documentation, pinecone also needs an id when no vector
              is given; that check is not visible in this file.)
            - KeyError: If required `namespace` is not found for pinecone (raised by the handler)
        """
        try:
            # Only pinecone may be queried without an embedding.
            if not (user_message_vector or self.vector_db_type == "pinecone"):
                raise ValueError("`user_message_vector` is needed except for pinecone")

            return self.retriver(
                user_message_vector=user_message_vector,
                num_of_chunks=num_of_chunks,
                **kwds,
            )
        except Exception as exc:
            error_explainer(exc)
            return {"error": 500, "reason": str(exc)}



[docs]
    def fetch(
        self,
        ids: List[str],
        **kwds: Any,
    ) -> Dict:
        """Look up chunks in the knowledge base by their ids.

        The call is delegated to ``self.retriver.fetch`` with ``ids`` and
        ``**kwds`` passed as keyword arguments.

        Args:
            - ids (List[str]): List of ids to fetch

            **Retrieval Arguments by VectorDB:**

                **Pinecone:**
                    - namespace (str, optional): The namespace to fetch vectors from. If not specified, the default namespace is used.

                **ChromaDB:**
                    - collection_name (str, optional): The name of the collection to fetch documents from. Defaults to the collection_name set during class instantiation.
                    - limit (int, optional): The number of documents to return. Default: None.
                    - offset (int, optional): The offset to start returning results from. Useful for paging results with limit. Default: None.
                    - where (Dict, optional): A Where type dict used to filter results by. E.g. ``{$and: [{"color" : "red"}, {"price": 4.20}]}``. Default: None.
                    - where_document (Dict, optional): A WhereDocument type dict used to filter by the documents. E.g. ``{"$contains" : "hello"}``. Default: None.
                    - include (List, optional): A list of what to include in the results.
                      Can contain ``"embeddings"``, ``"metadatas"``, ``"documents"``, ``"distances"``.
                      Ids are always included. Defaults to ``["metadatas", "documents", "distances"]``.
                      Default: ``["metadatas", "documents"]``

        Returns:
            Dict: The handler's result on success. If any exception occurs it is passed to
            ``error_explainer`` and ``{"error": 500, "reason": <exception text>}`` is returned instead.
        """
        try:
            # Delegate the id lookup to the backend handler
            fetched = self.retriver.fetch(ids=ids, **kwds)
        except Exception as exc:
            error_explainer(exc)
            return {"error": 500, "reason": str(exc)}
        else:
            return fetched





[docs]
class AsyncKnowledgeBase:
    """
    Non-blocking (asynchronous) facade over the supported vector databases.

    A single calling convention is offered regardless of the backend in use.
    The backends wired in by this class are Pinecone and ChromaDB.
    """

    # Backend name used when the caller does not specify `vector_db_type`.
    default_knowledge_base = DEFAULT_KNOWLEDGE_BASE


[docs]
    def __init__(self, vector_db_type: Optional[str] = None, **kwds: Any) -> None:
        """Initializes the knowledge base.

        Resolves the backend name, instantiates the matching asynchronous handler
        class with ``**kwds`` and stores it on ``self.retriver``. The resolved
        backend name is stored on ``self.vector_db_type``.

        Args:
            - vector_db_type (str, optional): Type of knowledge base (``"pinecone"`` or ``"chromadb"``).
              A falsy value (``None`` or ``""``) falls back to ``default_knowledge_base``. Default: pinecone
            - **kwds (Any): Forwarded unchanged to the constructor of the selected handler.

        **Authentication parameters by provider:**

            **Pinecone:**
                - api_key (str): Pinecone API key
                - index_name (str): Pinecone index name
                - namespace (str): Pinecone namespace

            **ChromaDB:**
                - collection_name (str): ChromaDB collection name
                - path (str, optional): Path to the ChromaDB database Default: `/home/ubuntu/projects/chromadb`

        Raises:
            - ValueError: If the knowledge base type is not supported
            - KeyError: If the required params is not found (raised by the backend handler,
              which is defined outside this file)

        Note:
            Every exception raised inside the constructor, including the ones listed
            above, is caught and handed to ``error_explainer``. Whether it propagates
            any further depends on that helper, which is defined outside this file.

        Warns:
            - UserWarning: If the knowledge base type is not defined Default: pinecone
        """
        try:
            # Dictionary mapping vector database types to their respective handler classes
            knowledge_base_handler = {
                "pinecone": pinecone_knowledgbase.AsyncDataBase,
                "chromadb": chromadb_knowledgebase.AsyncDataBase,
            }

            # If no vector_db_type is provided, default to pinecone
            if not vector_db_type:
                # Adjacent f-strings instead of a backslash continuation, which
                # would embed the source indentation in the message.
                warnings.warn(
                    f"\nKnowledge base type not defined hence defaulting to "
                    f"{self.default_knowledge_base}",
                    UserWarning,
                )
                self.vector_db_type = self.default_knowledge_base
            # If vector_db_type is not supported, raise a ValueError
            elif vector_db_type not in knowledge_base_handler:
                # Report the rejected value itself; this class has no `model_name`.
                raise ValueError(
                    f"\nUnsupported knowledge base: {vector_db_type}\nSupported knowledge bases are:"
                    "\n- " + "\n- ".join(knowledge_base_handler)
                )
            else:
                self.vector_db_type = vector_db_type

            # Initialize the knowledge base handler.
            # Index with the *resolved* name: `vector_db_type` may be None here,
            # and the default is a backend name, not a handler class.
            self.retriver = knowledge_base_handler[self.vector_db_type](**kwds)

        except Exception as e:
            error_explainer(e)



[docs]
    async def __call__(
        self,
        num_of_chunks: int,
        user_message_vector: Optional[List[Union[int, float]]] = None,
        **kwds: Any,
    ) -> Dict:
        """Query the knowledge base and return the matching chunks.

        The call is delegated to the backend handler stored on ``self.retriver``
        and awaited, with ``user_message_vector``, ``num_of_chunks`` and ``**kwds``
        passed as keyword arguments.

        Args:
            - num_of_chunks (int): Number of chunks to retrieve
            - user_message_vector (Optional[List[Union[int, float]]]): Embedding of text. Defaults to None.
              May only be omitted (or empty) when the backend is pinecone.

            **Retrieval Arguments by VectorDB:**

                **Pinecone:**
                    - vector (List[float]): The query vector. This should be the same length as the dimension of the index
                      being queried. Each `query()` request can contain only one of the parameters `id` or `vector`. [optional]
                    - id (str): The unique ID of the vector to be used as a query vector.
                      Each `query()` request can contain only one of the parameters `vector` or `id`. [optional]
                    - top_k (int): The number of results to return for each query. Must be an integer greater than 1.
                    - namespace (str): The namespace to fetch vectors from.
                      If not specified, the default namespace is used. [optional]
                    - filter (Dict[str, Union[str, float, int, bool, List, dict]]):
                      The filter to apply. You can use vector metadata to limit your search.
                      See https://www.pinecone.io/docs/metadata-filtering/ [optional]
                    - include_values (bool): Indicates whether vector values are included in the response.
                      If omitted the server will use the default value of False [optional]
                    - include_metadata (bool): Indicates whether metadata is included in the response as well as the ids.
                      If omitted the server will use the default value of False [optional]
                    - sparse_vector: (Union[SparseValues, Dict[str, Union[List[float], List[int]]]]): sparse values of the query vector.
                      Expected to be either a SparseValues object or a dict of the form:
                      ``{'indices': List[int], 'values': List[float]}``, where the lists each have the same length.

                **ChromaDB:**
                    - collection_name (str, optional): The name of the collection to get documents from. Defaults to the collection_name set during class instantiation.
                    - where (Dict, optional): A Where type dict used to filter results by. E.g. ``{$and: [{"color" : "red"}, {"price": 4.20}]}``. Default: None.
                    - where_document (Dict, optional): A WhereDocument type dict used to filter by the documents. E.g. ``{"$contains" : "hello"}``. Default: None.
                    - include (List, optional): A list of what to include in the results.
                      Can contain ``"embeddings"``, ``"metadatas"``, ``"documents"``, ``"distances"``.
                      Ids are always included. Defaults to ``["metadatas", "documents", "distances"]``.
                      Default: ``["metadatas", "documents"]``

        Returns:
            Dict: The handler's result on success. If any exception occurs it is passed to
            ``error_explainer`` and ``{"error": 500, "reason": <exception text>}`` is returned instead.

        Raises:
            The conditions below are raised inside this method and then caught, so they
            reach the caller through the error dict described above:

            - ValueError: If `user_message_vector` is missing and the backend is not pinecone.
              (Per the handler's documentation, pinecone also needs an id when no vector
              is given; that check is not visible in this file.)
            - KeyError: If required `namespace` is not found for pinecone (raised by the handler)
        """
        try:
            # Only pinecone may be queried without an embedding.
            if not (user_message_vector or self.vector_db_type == "pinecone"):
                raise ValueError("`user_message_vector` is needed except for pinecone")

            return await self.retriver(
                user_message_vector=user_message_vector,
                num_of_chunks=num_of_chunks,
                **kwds,
            )
        except Exception as exc:
            error_explainer(exc)
            return {"error": 500, "reason": str(exc)}



[docs]
    async def fetch(
        self,
        ids: List[str],
        **kwds: Any,
    ) -> Dict:
        """Look up chunks in the knowledge base by their ids.

        The call is delegated to ``self.retriver.fetch`` and awaited, with
        ``ids`` and ``**kwds`` passed as keyword arguments.

        Args:
            - ids (List[str]): List of ids to fetch

            **Retrieval Arguments by VectorDB:**

                **Pinecone:**
                    - namespace (str, optional): The namespace to fetch vectors from. If not specified, the default namespace is used.

                **ChromaDB:**
                    - collection_name (str, optional): The name of the collection to fetch documents from. Defaults to the collection_name set during class instantiation.
                    - limit (int, optional): The number of documents to return. Default: None.
                    - offset (int, optional): The offset to start returning results from. Useful for paging results with limit. Default: None.
                    - where (Dict, optional): A Where type dict used to filter results by. E.g. ``{$and: [{"color" : "red"}, {"price": 4.20}]}``. Default: None.
                    - where_document (Dict, optional): A WhereDocument type dict used to filter by the documents. E.g. ``{"$contains" : "hello"}``. Default: None.
                    - include (List, optional): A list of what to include in the results.
                      Can contain ``"embeddings"``, ``"metadatas"``, ``"documents"``, ``"distances"``.
                      Ids are always included. Defaults to ``["metadatas", "documents", "distances"]``.
                      Default: ``["metadatas", "documents"]``

        Returns:
            Dict: The handler's result on success. If any exception occurs it is passed to
            ``error_explainer`` and ``{"error": 500, "reason": <exception text>}`` is returned instead.
        """
        try:
            # Delegate the id lookup to the backend handler
            fetched = await self.retriver.fetch(ids=ids, **kwds)
        except Exception as exc:
            error_explainer(exc)
            return {"error": 500, "reason": str(exc)}
        else:
            return fetched