
    Ig3              
          d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d
ddddddiddgdd	gZd
diZ  G d deee         Z! G d de      Z"y)    N)Path)	AnyDictIterableListOptionalSequenceTupleTypeUnion)SnowballStemmer)OnnxProvider)OnnxOutputContext)define_cache_dir)SparseEmbeddingSparseTextEmbeddingBase)OnnxTextModelTextEmbeddingWorkerz'Qdrant/bm42-all-minilm-l6-v2-attentionsi:w  zYLight sparse embedding model, which assigns an importance score to each token in the textz
apache-2.0g
ףp=
?hfz'Qdrant/all_miniLM_L6_v2_with_attentionsz
model.onnxstopwords.txtT)	model
vocab_sizedescriptionlicense
size_in_GBsources
model_fileadditional_filesrequires_idfenglishc                       e Zd ZdZdgZ	 	 	 	 	 	 	 	 d%dedee   dee   deee	      de
ded	eee      d
edee   f fdZd&dZdeeeef      deeeef      fdZdeeeef      deeeef      fdZedeeeee   f      dee
   deeee
f      fd       Zdeeeef      deeeee   f      fdZdeee
f   deee
f   fdZdedee   fdZedeeeef      fd       Zededee   fd       Z	 	 d'deeee   f   dedee   dee   fd Z edee   deee
f   fd!       Z!d"eeee   f   dee   fd#Z"ede#e$   fd$       Z% xZ&S )(Bm42a  
    Bm42 is an extension of BM25, which tries to better evaluate importance of tokens in the documents,
    by extracting attention weights from the transformer model.

    Traditional BM25 uses a count of tokens in the document to evaluate the importance of the token,
    but this approach doesn't work well with short documents or chunks of text, as almost all tokens
    there are unique.

    BM42 addresses this issue by replacing the token count with the attention weights from the transformer model.
    This allows sparse embeddings to work well with short documents, handle rare tokens and leverage traditional NLP
    techniques like stemming and stopwords.

    WARNING: This model is expected to be used with `modifier="idf"` in the sparse vector index of Qdrant.
    attention_6
model_name	cache_dirthreads	providersalphacuda
device_ids	lazy_load	device_idc
                    t        |   |||fi |
 || _        || _        || _        || _        |	|	| _        n(| j                  | j                  d   | _        nd| _        | j                  |      | _        t        |      | _
        | j                  | j                  | j                  | j                        | _        i | _        t               | _        t               | _        t        t$        j&                        | _        t        | j)                  | j                              | _        t-        t.        |         | _        || _        | j                  s| j5                          yy)a  
        Args:
            model_name (str): The name of the model to use.
            cache_dir (str, optional): The path to the cache directory.
                                       Can be set using the `FASTEMBED_CACHE_PATH` env variable.
                                       Defaults to `fastembed_cache` in the system's temp directory.
            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
            providers (Optional[Sequence[OnnxProvider]], optional): The providers to use for onnxruntime.
            alpha (float, optional): Parameter, that defines the importance of the token weight in the document
                versus the importance of the token frequency in the corpus. Defaults to 0.5, based on empirical testing.
                It is recommended to only change this parameter based on training data for a specific dataset.
            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
                Defaults to False.
            device_ids (Optional[List[int]], optional): The list of device ids to use for data parallel processing in
                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.

        Raises:
            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
        Nr   )local_files_only)super__init__r'   r+   r*   r)   r,   _get_model_descriptionmodel_descriptionr   r%   download_model_local_files_only
_model_dirinvert_vocabsetspecial_tokensspecial_tokens_idsstringpunctuation_load_stopwords	stopwordsr   MODEL_TO_LANGUAGEstemmerr(   load_onnx_model)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   kwargs	__class__s              R/var/www/html/answerous/venv/lib/python3.12/site-packages/fastembed/sparse/bm42.pyr0   zBm42.__init__:   s4   H 	YB6B"" %	  &DN__(!__Q/DN!DN!%!<!<Z!H))4--""DNNTE[E[ . 
 !e"%%v112T11$//BC&'8'DE
~~  "     returnc                     | j                  | j                  | j                  d   | j                  | j                  | j
                  | j                         | j                  j                         j                         D ]  \  }}|| j                  |<    t        | j                  j                               | _        t        | j                  j                               | _        t        | j#                  | j                              | _        y )Nr   )	model_dirr   r&   r'   r)   r,   )_load_onnx_modelr5   r2   r&   r'   r)   r,   	tokenizer	get_vocabitemsr6   r7   special_token_to_idkeysr8   valuesr9   r<   r=   )rA   tokenidxs      rD   r@   zBm42.load_onnx_model   s    oo--l;LLnnnn 	 	
 ..224::< 	+JE3%*Dc"	+!$":":"?"?"AB"%d&>&>&E&E&G"HT11$//BCrE   tokensc                 ~    g }|D ]5  \  }}|| j                   v s|| j                  v r#|j                  ||f       7 |S N)r=   r;   append)rA   rR   resultrP   values        rD   _filter_pair_tokenszBm42._filter_pair_tokens   sM    " 	*LE5&%43C3C*CMM5%.)	* rE   c                 z    g }|D ]3  \  }}| j                   j                  |      }|j                  ||f       5 |S rT   )r?   	stem_wordrU   )rA   rR   rV   rP   rW   processed_tokens         rD   _stem_pair_tokenszBm42._stem_pair_tokens   sF    " 	4LE5"ll44U;OMM?E23	4 rE   weightsc                 n    g }|D ],  \  }}t        fd|D              }|j                  ||f       . |S )Nc              3   (   K   | ]	  }|     y wrT    ).0rQ   r]   s     rD   	<genexpr>z*Bm42._aggregate_weights.<locals>.<genexpr>   s     :cWS\:s   )sumrU   )clsrR   r]   rV   rP   idxs
sum_weights     `    rD   _aggregate_weightszBm42._aggregate_weights   sE     ! 	/KE4:T::JMM5*-.	/ rE   
bpe_tokensc                 n   g }d}g }| j                   j                  j                  }t        |      }|D ]i  \  }}|| j                  v r|j                  |      r|||d  z  }|j                  |       @|r|j                  ||f       g }|}|j                  |       k |r|j                  ||f       |S )N )rJ   r   continuing_subword_prefixlenr8   
startswithrU   )	rA   rh   rV   accacc_idxrk   continuing_subword_prefix_lenrQ   rP   s	            rD   _reconstruct_bpezBm42._reconstruct_bpe   s     $(NN$8$8$R$R!(+,E(F%$ 	$JC+++ 9:u:;<<s#MM3.1 Gs#	$ MM3.)rE   vectorc                     i }|j                         D ]K  \  }}t        t        j                  |            }t	        j
                  d|z         | j                  z  ||<   M |S )z
        Orders all tokens in the vector by their importance and generates a new score based on the importance order.
        So that the scoring doesn't depend on absolute values assigned by the model, but on the relative importance.
              ?)rL   absmmh3hashmathlogr(   )rA   rr   
new_vectorrP   rW   token_ids         rD   _rescore_vectorzBm42._rescore_vector   sa     
"LLN 	GLE5499U+,H
 $(88C%K#8DJJ#FJx 	G rE   outputc              #   D   K   |j                   t        d      |j                   }t        j                  |j                  d d d d df   d      |j
                  z  }t        ||      D ]  \  }} fdt        |      D        } j                  |      } j                  |      } j                  |      }	 j                  |	|      }
i }|
D ]$  \  }}t        |j                  |d      |      ||<   &  j                  |      }t        j                   |        y w)Nz7input_ids must be provided for document post-processingr      )axisc              3   F   K   | ]  \  }}|j                   |   f  y wrT   )r6   )ra   rQ   r{   rA   s      rD   rb   z1Bm42._post_process_onnx_output.<locals>.<genexpr>   s-      (!C d''12(s   !)	input_ids
ValueErrornpmeanmodel_outputattention_maskzip	enumeraterq   rX   r\   rg   maxgetr|   r   	from_dict)rA   r}   token_ids_batchpooled_attentiondocument_token_idsattention_valuedocument_tokens_with_idsreconstructedfilteredstemmedweightedmax_token_weightrP   weightrescoreds   `              rD   _post_process_onnx_outputzBm42._post_process_onnx_output   s0    #VWW ** 776#6#6q!Qw#?aH6K`K``36HX3Y 	6/(%./A%B($
 !112JKM//>H,,X6G..wHH!!) Vv*-.>.B.B5!.Lf*U 'V ++,<=H!++H55+	6s   DD c                     t         S )zLists the supported models.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing the model information.
        )supported_bm42_modelsrd   s    rD   list_supported_modelszBm42.list_supported_models   s
     %$rE   rH   c                     |dz  }|j                         sg S t        |d      5 }|j                         j                         cd d d        S # 1 sw Y   y xY w)Nr   r)existsopenread
splitlines)rd   rH   stopwords_pathfs       rD   r<   zBm42._load_stopwords  sP    "_4$$&I.#& 	)!668&&(	) 	) 	)s   AA	documents
batch_sizeparallelc              +      K   | j                  | j                  t        | j                        |||| j                  | j
                  | j                  | j                  	      E d{    y7 w)a  
        Encode a list of documents into list of embeddings.
        We use mean pooling with attention so that the model can handle variable-length inputs.

        Args:
            documents: Iterator of documents or single document to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per document
        )	r$   r%   r   r   r   r'   r)   r*   r(   N)_embed_documentsr$   strr%   r'   r)   r*   r(   )rA   r   r   r   rB   s        rD   embedz
Bm42.embed
  s]     , (($..)!nn** ) 

 
	
 
	
s   A#A-%A+&A-c                 ^    i }|D ]%  }t        t        j                  |            }d||<   ' |S )Nrt   )ru   rv   rw   )rd   rR   rV   rP   r{   s        rD   _query_rehashzBm42._query_rehash,  s:     	#E499U+,H"F8	# rE   queryc              +     K   t        |t              r|g}t        | d      r| j                  | j	                          |D ]  }| j
                  j                  |      }t        |j                        }| j                  |      }| j                  |      }| j                  |      }t        j                  | j                  d |D                      yw)z
        To emulate BM25 behaviour, we don't need to use smart weights in the query, and
        it's enough to just hash the tokens and assign a weight of 1.0 to them.
        It is also faster, as we don't need to run the model for the query.
        r   Nc              3   &   K   | ]	  \  }}|  y wrT   r`   )ra   rP   _s      rD   rb   z#Bm42.query_embed.<locals>.<genexpr>G  s     >]PQu>]s   )
isinstancer   hasattrr   r@   rJ   encoder   rR   rq   rX   r\   r   r   r   )	rA   r   rB   textencodedr   r   r   r   s	            rD   query_embedzBm42.query_embed4  s      eS!GEtW%);  " 	_Dnn++D1G'0'@$ 112JKM//>H,,X6G!++D,>,>>]U\>],]^^	_s   CCc                     t         S rT   )Bm42TextEmbeddingWorkerr   s    rD   _get_worker_classzBm42._get_worker_classI  s    &&rE   )NNNg      ?FNFN)rF   N)   N)'__name__
__module____qualname____doc__ONNX_OUTPUT_NAMESr   r   intr	   r   floatboolr   r0   r@   r
   r   rX   r\   classmethodrg   r   rq   r   r|   r   r   r   r   r   r<   r   r   r   r   r   r   r   __classcell__)rC   s   @rD   r"   r"   (   s    '
 $(!%6:*.#'E#E# C=E# #	E#
 H\23E# E# E# T#Y'E# E# C=E#ND$uS#X*? DsTWxDY U38_(= $uSRUXBW  %T#Y/0;?;	eCJ	  "5c?3	eCcN#	$:d3:&6 4U
;K $60A 6hF_ 6@ %d4S>&: % % ) )c ) ) "&	 
hsm+, 
  
 3-	 
 
/	" 
D 8C= T#u*5E  _sHSM'9!: _RaIb _* '$':"; ' 'rE   r"   c                        e Zd ZdededefdZy)r   r$   r%   rF   c                     t        d||d|S )N)r$   r%   r`   )r"   )rA   r$   r%   rB   s       rD   init_embeddingz&Bm42TextEmbeddingWorker.init_embeddingO  s$     
!
 
 	
rE   N)r   r   r   r   r"   r   r`   rE   rD   r   r   N  s    
 
 
4 
rE   r   )#rx   r:   pathlibr   typingr   r   r   r   r   r	   r
   r   r   rv   numpyr   py_rust_stemmersr   fastembed.commonr   fastembed.common.onnx_modelr   fastembed.common.utilsr   &fastembed.sparse.sparse_embedding_baser   r   fastembed.text.onnx_text_modelr   r   r   r>   r"   r   r`   rE   rD   <module>r      s       T T T   , ) 9 3 N ;r;
 #,- " .y 
c'"M/$B c'L	
1 
rE   