
    +#hk                        d dl Z d dlmZmZ d dlZd dlZ G d de j                        Z G d de      Z	 G d de      Z
 G d	 d
e      Z	  eej                  j                  d          d k(  Z G d de      Zy# e$ r dZY w xY w)    N)ListOptionalc                   z    e Zd ZdZdZd
dZej                  ded   de	j                  fd       Zdedee   fd	Zy)BaseSentenceVectorizera[  
    Base Class for Vectorizers. The main purpose is to vectorize text (doc/query)
    for ANN/KNN indexes. `__call__` method takes `List[Example]` as a single input, then extracts
    `field_to_vectorize` from every Example and convert them into embeddings.
    You can customize extraction logic in the `_extract_text_from_examples` method.
    text_to_vectorizereturnNc                      y N )selfs    \/var/www/html/sandstorm/venv/lib/python3.12/site-packages/dsp/modules/sentence_vectorizer.py__init__zBaseSentenceVectorizer.__init__   s        inp_examplesExamplec                      y r
   r   )r   r   s     r   __call__zBaseSentenceVectorizer.__call__   s    r   c                     t        |d   t              r|S |D cg c].  }dj                  |j                  D cg c]  }||   	 c}      0 c}}S c c}w c c}}w )Nr    )
isinstancestrjoin_input_keys)r   r   examplekeys       r   _extract_text_from_examplesz2BaseSentenceVectorizer._extract_text_from_examples   sK    l1os+WcdG'2E2EF3'#,FGddFds   AA	AA)r   N)__name__
__module____qualname____doc__field_to_vectorizer   abcabstractmethodr   npndarrayr   r   r   r   r   r   r   r      s^     - 	T)_   e ec er   r   c            	       V    e Zd ZdZ	 	 	 	 ddedededefdZdede	j                  fd	Zy
)SentenceTransformersVectorizerz
    Vectorizer based on `SentenceTransformers` models. You can pick any model from this link:
    https://huggingface.co/models?library=sentence-transformers
    More details about models:
    https://www.sbert.net/docs/pretrained_models.html
    model_name_or_pathvectorize_bsmax_gpu_devicesnormalize_embeddingsc                     	 ddl m} ddlm}  ||      \  | _        | _        | j                  rdnd| _         ||| j                        | _        || _	        || _
        || _        y # t        $ r t        d      w xY w)Nr   )SentenceTransformerzYou need to install sentence_transformers library to use pretrained embedders. Please check the official doc https://www.sbert.net/ or simply run `pip install sentence-transformers)determine_devicescudacpu)device)sentence_transformersr-   ImportErrordsp.utils.ann_utilsr.   num_devicesis_gpuproxy_devicemodelr(   r)   r+   )r   r(   r)   r*   r+   r-   r.   s          r   r   z'SentenceTransformersVectorizer.__init__&   s    	A 	:(9/(J%$+&*kkFu();DDUDUV
"4($8!  	C 	s   A' 'A<r   r   c                    | j                  |      }| j                  r| j                  dkD  rt        t	        | j                              }| j
                  j                  |      }| j
                  j                  ||| j                        }| j
                  j                  |       | j                  r"|t        j                  j                  |      z  }|S | j
                  j                  || j                  | j                        }|S )N   )target_devices)	sentencespool
batch_size)r<   r>   r+   )r   r6   r5   listranger8   start_multi_process_poolencode_multi_processr)   stop_multi_process_poolr+   r$   linalgnormencode)r   r   r   r;   r=   embs         r   r   z'SentenceTransformersVectorizer.__call__D   s     <<\J;;4++a/!%(8(8"9:N::66n6UD**11+,, 2 C
 JJ..t4((BIINN3//J**##+,,%)%>%> $ C
 Jr   N)zall-MiniLM-L6-v2   r:   F)r   r   r   r    r   intboolr   r   r$   r%   r   r   r   r   r'   r'      sW     #5 %*99 9 	9
 #9<T bjj r   r'   c                   H    e Zd ZdZd	defdZded   dej                  fdZ	y)
NaiveGetFieldVectorizerz
    If embeddings were precomputed, then we could just extract them from the proper field 
    (set by `field_with_embedding`) from each `Example`.
    field_with_embeddingc                     || _         y r
   )rM   )r   rM   s     r   r   z NaiveGetFieldVectorizer.__init__d   s
    $8!r   r   r   r   c                     |D cg c](  }t        || j                        j                  dd      * }}t        j                  |d      j                  t        j                        }|S c c}w )Nr:   r   )axis)getattrrM   reshaper$   concatenateastypefloat32)r   r   cur_example
embeddingss       r   r   z NaiveGetFieldVectorizer.__call__g   sg      ,
 K!:!:;CCArJ

 
 ^^JQ7>>rzzJ

s   -A*N)
vectorized)
r   r   r   r    r   r   r   r$   r%   r   r   r   r   rL   rL   _   s.    9S 9T)_  r   rL   c            	       Z    e Zd ZdZ	 	 	 ddedededefdZded   d	ej                  fd
Z
y)CohereVectorizera  
    This vectorizer uses the Cohere API to convert texts to embeddings.
    More about the available models: https://docs.cohere.com/reference/embed
    `api_key` should be passed as an argument and can be retrieved
    from https://dashboard.cohere.com/api-keys
    api_keyr8   embed_batch_sizeembedding_typec                 b    || _         || _        || _        dd l}|j	                  |      | _        y )Nr   )r8   r]   r^   cohereClientclient)r   r\   r8   r]   r^   r`   s         r   r   zCohereVectorizer.__init__w   s.     
 0,mmG,r   r   r   r   c                    | j                  |      }g }t        |      dz
  | j                  z  dz   }t        |      D ]u  }|| j                  z  }|dz   | j                  z  }||| }| j                  j                  || j                  | j                        }	|j                  |	j                         w t        j                  |t        j                        }
|
S )Nr:   )textsr8   
input_typedtype)r   lenr]   r@   rb   embedr8   r^   extendrX   r$   arrayrV   )r   r   r   embeddings_list	n_batchescur_batch_idx	start_idxend_idx	cur_batchresponserX   s              r   r   zCohereVectorizer.__call__   s     <<\J*+a/D4I4IIAM	"9- 	8M%(=(==I$q(D,A,AAG))W=I{{((jj.. ) H ""8#6#67	8 XXoRZZ@
r   N)zembed-english-v3.0`   search_document)r   r   r   r    r   rI   r   r   r$   r%   r   r   r   r   r[   r[   p   sX     * "/-- - 	-
 -T)_  r   r[   Tc                   \    e Zd ZdZ	 	 	 ddededee   fdZded   d	e	j                  fd
Zy)OpenAIVectorizera  
    This vectorizer uses OpenAI API to convert texts to embeddings. Changing `model` is not
    recommended. More about the model: https://openai.com/blog/new-and-improved-embedding-model/
    `api_key` should be passed as an argument or as env variable (`OPENAI_API_KEY`).
    Nr8   r]   r\   c                     || _         || _        t        rt        j                  | _        nt        j
                  | _        |r|t        _        y y r
   )r8   r]   OPENAI_LEGACYopenai	EmbeddingrX   r\   )r   r8   r]   r\   s       r   r   zOpenAIVectorizer.__init__   s?     
 0#--DN#..DN$FN r   r   r   r   c                    | j                  |      }g }t        |      dz
  | j                  z  dz   }t        |      D ]u  }|| j                  z  }|dz   | j                  z  }||| }| j                  j                  | j                  |      }	|	d   D 
cg c]  }
|
d   	 }}
|j                  |       w t        j                  |t        j                        }|S c c}
w )Nr:   )r8   inputdata	embeddingrf   )r   rh   r]   r@   rz   creater8   rj   r$   rk   rV   )r   r   r   rl   rm   rn   ro   rp   rq   rr   cur_objcur_batch_embeddingsrX   s                r   r   zOpenAIVectorizer.__call__   s     <<\J*+a/D4I4IIAM	"9- 	9M%(=(==I$q(D,A,AAG))W=I~~,,jj - H
 IQQWHX#YWGK$8#Y #Y""#78	9 XXoRZZ@
	 $Zs   C)ztext-embedding-ada-002i   N)r   r   r   r    r   rI   r   r   r   r$   r%   r   r   r   r   rv   rv      sR     . $!%	%% % #	%"T)_  r   rv   )r"   typingr   r   numpyr$   ry   ABCr   r'   rL   r[   rI   version__version__rx   	Exceptionrv   r   r   r   <module>r      s    
 !  eSWW e.=%; =@4 ")- )X22156!;M
+- +	  Ms   "A: :BB