
    +#h                     v    d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlZd dl	m
Z
  G d dej                        Zy)    N)defaultdict)ListUnion)
Predictionc            	       b     e Zd ZdZd fd	Zd	deeee   f   dedede	j                  fdZ xZS )
DatabricksRMa	  
    A retrieval module that uses Databricks Vector Search Endpoint to return the top-k embeddings for a given query.

    Args:
        databricks_index_name (str): Databricks vector search index to query
        databricks_endpoint (str): Databricks index endpoint url
        databricks_token (str): Databricks authentication token
        columns (list[str]): Column names to include in response
        filters_json (str, optional): JSON string for query filters
        k (int, optional): Number of top embeddings to retrieve. Defaults to 3.
        docs_id_column_name (str, optional): Column name for retrieved doc_ids to return.
        text_column_name (str, optional): Column name for retrieved text to return.

    Examples:
        Below is a code snippet that shows how to configure Databricks Vector Search endpoints:

        (example adapted from "Databricks: How to create and query a Vector Search Index: 
        https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#create-a-vector-search-index)

        ```python
        from databricks.vector_search.client import VectorSearchClient

        #Creating Vector Search Client

        client = VectorSearchClient()

        client.create_endpoint(
            name="your_vector_search_endpoint_name",
            endpoint_type="STANDARD"
        )

        #Creating Vector Search Index using Python SDK 
        #Example for Direct Vector Access Index

        index = client.create_direct_access_index(
            endpoint_name="your_databricks_host_url",
            index_name="your_index_name",
            primary_key="id",
            embedding_dimension=1024,
            embedding_vector_column="text_vector",
            schema={
            "id": "int",
            "field2": "str",
            "field3": "float",
            "text_vector": "array<float>"}
        )
        
        llm = dspy.OpenAI(model="gpt-3.5-turbo")
        retriever_model = DatabricksRM(databricks_index_name = "your_index_name", 
        databricks_endpoint = "your_databricks_host_url", databricks_token = "your_databricks_token", columns= ["id", "field2", "field3", "text_vector"], k=3)
        dspy.settings.configure(lm=llm, rm=retriever_model)
        ```

        Below is a code snippet that shows how to query the Databricks Direct Vector Access Index using the forward() function.
        ```python
        self.retrieve = DatabricksRM(query=[1, 2, 3], query_type = 'vector')
        ```
    c	                    t         	|   |       |s*t        j                  j	                  d      st        d      |s*t        j                  j	                  d      st        d      |st        d      |st        d      |r|nt        j                  d   | _        |r|nt        j                  d   | _        || _        || _	        || _
        || _        || _        || _        y )N)kDATABRICKS_TOKENzMYou must supply databricks_token or set environment variable DATABRICKS_TOKENDATABRICKS_HOSTzOYou must supply databricks_endpoint or set environment variable DATABRICKS_HOSTz!You must supply vector index namezFYou must specify a list of column names to be included in the response)super__init__osenvironget
ValueErrordatabricks_tokendatabricks_endpointdatabricks_index_namecolumnsfilters_jsonr
   docs_id_column_nametext_column_name)
selfr   r   r   r   r   r
   r   r   	__class__s
            X/var/www/html/sandstorm/venv/lib/python3.12/site-packages/dspy/retrieve/databricks_rm.pyr   zDatabricksRM.__init__G   s    1

7I(Jlmm"2::>>:K+Lnoo$@AAeff4D 0"**UgJh:M#6SUS]S]^oSp %:"(#6  0    query
query_typer   returnc                    d| j                    dd}| j                  | j                  d}|dk(  r!t        |t              st        d      ||d<   n1|dk(  r!t        |t              st        d	      ||d
<   nt        d      |r|n| j                  |d<   t        j                  | j                   d| j                   d||      }|j                         }t        t              }g }	d\  }
}|d   d   D ]  }t        |d   d   |      D ]  \  }}|d   | j                   k(  r^| j                   dk(  r3t        j"                  |      }|	j%                  t        |d                n|	j%                  t        |             |}
|d   | j&                  k(  r|}
|d   dk(  s|} ||
xx   |z  cc<    t)        |j+                         d d      d| j                   }t-        |D cg c]  \  }}|	 c}}|	      S c c}}w )a  Search with Databricks Vector Search Client for self.k top results for query

        Args:
            query (Union[str, List[float]]): query to search for.
            query_type (str): 'vector' for Direct Vector Access Index and Delta Sync Index using self-managed vectors or 'text' for Delta Sync Index using model endpoint.

        Returns:
            dspy.Prediction: An object containing the retrieved results.
        zBearer zapplication/json)AuthorizationzContent-Type)r   num_resultsvectorz/Query must be a list of floats for query_vectorquery_vectortextz%Query must be a string for query_text
query_textz5Invalid query type specified. Use 'vector' or 'text'.r   z/api/2.0/vector-search/indexes/z/query)jsonheaders)NNresult
data_arraymanifestr   namemetadatadocument_idscorec                     | d   S )N    )xs    r   <lambda>z&DatabricksRM.forward.<locals>.<lambda>   s
    1 r   T)keyreverseN)docsdoc_ids)r   r   r
   
isinstancelistr   strr   requestspostr   r   r(   r   floatzipr   loadsappendr   sorteditemsr   )r   r   r   r   r)   payloadresponseresultsr8   r9   r&   r0   data_rowcolval	docs_dictsorted_docsdoc_s                      r   forwardzDatabricksRM.forwardZ   s     't'<'<&=>.

 ||66
 !eT* !RSS&+GN#6!eS) !HII$)GL!TUU2>,DDUDU==''((GHbHbGccij

 --/5! e),7 	 H
 3I >I  Sv;$":":://:=$(JJsO	s9]+C'DEs3x0Dv;$"7"77Dv;')E  J%J	  TZZ\~tLWdffU+>Q>'RR>s   %G;
)NNNNN   idr&   )r&   N)__name__
__module____qualname____doc__r   r   r<   r   r?   dspyr   rO   __classcell__)r   s   @r   r   r      sJ    9t1&9SU3U#34 9S# 9S^a 9Smqm|m| 9Sr   r   )r(   r   collectionsr   typingr   r   r=   rV   dspy.primitives.predictionr   Retriever   r3   r   r   <module>r\      s.     	 #    1GS4== GSr   