
    Ig +                     4   d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZmZ d dlmZmZ d d	lmZmZ d d
l m!Z! g dZ"ddddddide"D  cg c]  } |  d	 c} ddgZ# G d de      Z$ G d de      Z%yc c} w )    N)defaultdict)get_all_start_methods)Path)AnyDictIterableListOptionalTupleTypeUnion)SnowballStemmer)define_cache_dir
iter_batchget_all_punctuationremove_non_alphanumeric)ParallelWorkerPoolWorker)SparseEmbeddingSparseTextEmbeddingBase)SimpleTokenizer)arabicazerbaijanibasquebengalicatalanchinesedanishdutchenglishfinnishfrenchgermangreekhebrewhinglish	hungarian
indonesianitaliankazakhnepali	norwegian
portugueseromanianrussianslovenespanishswedishtajikturkishzQdrant/bm25z6BM25 as sparse embeddings meant to be used with Qdrantz
apache-2.0g{Gz?hfz	mock.file.txtT)modeldescriptionlicense
size_in_GBsources
model_fileadditional_filesrequires_idfc                       e Zd ZdZ	 	 	 	 	 	 ddedee   dededededef fd	Ze	d
e
eeef      fd       Ze	deded
e
e   fd       Z	 	 ddededeeee   f   dedee   d
ee   fdZ	 	 ddeeee   f   dedee   d
ee   fdZde
e   d
e
e   fdZde
e   d
e
e   fdZde
e   d
eeef   fdZe	ded
efd       Zdeeee   f   d
ee   fdZe	d
ed   fd       Z xZS )Bm25a  Implements traditional BM25 in a form of sparse embeddings.
    Uses a count of tokens in the document to evaluate the importance of the token.

    WARNING: This model is expected to be used with `modifier="idf"` in the sparse vector index of Qdrant.

    BM25 formula:

    score(q, d) = SUM[ IDF(q_i) * (f(q_i, d) * (k + 1)) / (f(q_i, d) + k * (1 - b + b * (|d| / avg_len))) ],

    where IDF is the inverse document frequency, computed on Qdrant's side
    f(q_i, d) is the term frequency of the token q_i in the document d
    k, b, avg_len are hyperparameters, described below.

    Args:
        model_name (str): The name of the model to use.
        cache_dir (str, optional): The path to the cache directory.
            Can be set using the `FASTEMBED_CACHE_PATH` env variable.
            Defaults to `fastembed_cache` in the system's temp directory.
        k (float, optional): The k parameter in the BM25 formula. Defines the saturation of the term frequency.
            I.e. defines how fast the moment when additional terms stop to increase the score. Defaults to 1.2.
        b (float, optional): The b parameter in the BM25 formula. Defines the importance of the document length.
            Defaults to 0.75.
        avg_len (float, optional): The average length of the documents in the corpus. Defaults to 256.0.
    Raises:
        ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
    
model_name	cache_dirkbavg_lenlanguagetoken_max_lengthc                    t        
|   ||fi | |t        vrt        | d      || _        || _        || _        || _        | j                  |      }	t        |      | _
        | j                  |	| j                  | j                        | _        || _        t        t!                     | _        t        | j%                  | j                  | j                              | _        t)        |      | _        t,        | _        y )Nz language is not supported)local_files_only)super__init__supported_languages
ValueErrorrF   rC   rD   rE   _get_model_descriptionr   rB   download_model_local_files_only
_model_dirrG   setr   punctuation_load_stopwords	stopwordsr   stemmerr   	tokenizer)selfrA   rB   rC   rD   rE   rF   rG   kwargsmodel_description	__class__s             R/var/www/html/answerous/venv/lib/python3.12/site-packages/fastembed/sparse/bm25.pyrK   zBm25.__init__c   s     	Y9&9..z)CDEE$DM 77
C))4--t~~@V@V . 
 !1245T11$//4==QR&x0(    returnc                     t         S )zLists the supported models.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing the model information.
        )supported_bm25_modelsclss    r\   list_supported_modelszBm25.list_supported_models   s
     %$r]   	model_dirc                     || dz  }|j                         sg S t        |d      5 }|j                         j                         cd d d        S # 1 sw Y   y xY w)Nr6   r)existsopenread
splitlines)rb   rd   rF   stopwords_pathfs        r\   rT   zBm25._load_stopwords   sW    "z%66$$&I.#& 	)!668&&(	) 	) 	)s   AA	documents
batch_sizeparallelc              #     K   d}t        |t              r|g}d}t        |t              rt        |      |k  rd}||r+t	        ||      D ]  }| j                  |      E d {     y |dk(  rt        j                         }dt               v rdnd}||| j                  | j                  | j                  d}	t        |xs d| j                         |      }
 |
j                  t	        ||      fi |	D ]  }|D ]  }|   y 7 w)	NFTr   
forkserverspawn)rA   rB   rC   rD   rE      )num_workersworkerstart_method)
isinstancestrlistlenr   	raw_embedos	cpu_countr   rC   rD   rE   r   _get_worker_classordered_map)rX   rA   rB   rm   rn   ro   is_smallbatchrv   paramspoolrecords               r\   _embed_documentszBm25._embed_documents   s     i%"IHi&9~
*x#Iz: 1>>%0001 1}<<>+7;P;R+R<X_L(&VVVV<<F &$M--/)D
 *))*Y
*KVvV !# !F L!!% 1s   AD!D"B%Dc              +      K   | j                  | j                  t        | j                        |||      E d{    y7 w)a  
        Encode a list of documents into list of embeddings.
        We use mean pooling with attention so that the model can handle variable-length inputs.

        Args:
            documents: Iterator of documents or single document to embed
            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
            parallel:
                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
                If 0, use all available cores.
                If None, don't use data-parallel processing, use default onnxruntime threading instead.

        Returns:
            List of embeddings, one per document
        )rA   rB   rm   rn   ro   N)r   rA   rx   rB   )rX   rm   rn   ro   rY   s        r\   embedz
Bm25.embed   sA     , (($..)! ) 
 	
 	
s   7A?Atokensc                    g }|D ]  }|| j                   v r|j                         | j                  v r/t        |      | j                  kD  rH| j
                  j                  |j                               }|st|j                  |        |S N)rS   lowerrU   rz   rG   rV   	stem_wordappend)rX   r   stemmed_tokenstokenstemmed_tokens        r\   _stemz
Bm25._stem   s     	5E((({{}.5zD111 LL225;;=AM%%m4	5 r]   c                     g }|D ]n  }t        |      }| j                  j                  |      }| j                  |      }| j	                  |      }|j                  t        j                  |             p |S r   )r   rW   tokenizer   _term_frequencyr   r   	from_dict)rX   rm   
embeddingsdocumentr   r   token_id2values          r\   r{   zBm25.raw_embed   sw     
! 	IH.x8H^^,,X6F!ZZ/N!11.ANo77GH	I r]   c           	      X   i }t        t              }|D ]  }||xx   dz  cc<    t        |      }|D ]t  }| j                  |      }||   }|| j                  dz   z  ||<   ||xx   || j                  d| j
                  z
  | j
                  |z  | j                  z  z   z  z   z  cc<   v |S )ag  Calculate the term frequency part of the BM25 formula.

        (
            f(q_i, d) * (k + 1)
        ) / (
            f(q_i, d) + k * (1 - b + b * (|d| / avg_len))
        )

        Args:
            tokens (List[str]): The list of tokens in the document.

        Returns:
            Dict[int, float]: The token_id to term frequency mapping.
        rs   )r   intrz   compute_token_idrC   rD   rE   )rX   r   tf_mapcounterr   doc_lentoken_idnum_occurrencess           r\   r   zBm25._term_frequency   s     c"# 	(MM"a'"	( f+$ 	M,,];H%m4O.$&&1*=F88$&&DFF
TVVg-<<3 ! 		 r]   r   c                 >    t        t        j                  |            S r   )absmmh3hash)rb   r   s     r\   r   zBm25.compute_token_id  s    499U#$$r]   queryc           	   +   x   K   t        |t              r|g}|D ]  }t        |      } j                  j	                  |      } j                  |      }t        j                  t        t         fd|D                    t        j                        }t        j                  |      }t        ||        yw)zTo emulate BM25 behaviour, we don't need to use weights in the query, and
        it's enough to just hash the tokens and assign a weight of 1.0 to them.
        c              3   @   K   | ]  }j                  |        y wr   )r   ).0r   rX   s     r\   	<genexpr>z#Bm25.query_embed.<locals>.<genexpr>.  s     R%..u5Rs   )dtype)indicesvaluesN)rw   rx   r   rW   r   r   nparrayry   rR   int32	ones_liker   )rX   r   rY   textr   r   	token_idsr   s   `       r\   query_embedzBm25.query_embed"  s      eS!GE 		DD*40D^^,,T2F!ZZ/NSR>RRShhI \\),F!)FCC		Ds   B7B:
Bm25Workerc                     t         S r   )r   ra   s    r\   r~   zBm25._get_worker_class4  s    r]   )Ng333333?g      ?g      p@r    (   )   N)__name__
__module____qualname____doc__rx   r
   floatr   rK   classmethodr	   r   r   rc   r   rT   r   r   r   r   r   r   r{   r   r   r   r   r~   __classcell__)r[   s   @r\   r@   r@   G   s/   < $(! "")") C=") 	")
 ") ") ") ")H %d4S>&: % % ) ) )S	 ) ) "&(!(! (! hsm+,	(!
 (! 3-(! 
/	"(!Z "&	
hsm+,
 
 3-	
 
/	"
<DI $s) $9 
o	d3i De4D < %S %S % %DsHSM'9!: DRaIb D$ $|"4  r]   r@   c            	           e Zd ZdedefdZededededd fd       Zdee	e
ef      dee	e
ef      fdZedededefd	       Zy
)r   rA   rB   c                 6     | j                   ||fi || _        y r   )init_embeddingr7   )rX   rA   rB   rY   s       r\   rK   zBm25Worker.__init__:  s     )T((YI&I
r]   rY   r^   c                      | d||d|S N)rA   rB    r   )rb   rA   rB   rY   s       r\   startzBm25Worker.startB  s    HjIHHHr]   itemsc              #   b   K   |D ]&  \  }}| j                   j                  |      }||f ( y wr   )r7   r{   )rX   r   idxr   onnx_outputs        r\   processzBm25Worker.processF  s8      	#JC**..u5K{""	#s   -/c                     t        d| |d|S r   )r@   )rA   rB   rY   s      r\   r   zBm25Worker.init_embeddingK  s    IzYI&IIr]   N)r   r   r   rx   rK   r   r   r   r   r   r   r   staticmethodr@   r   r   r]   r\   r   r   9  s    JJ J Is Is Ic Il I I#XeCHo6 #8E#s(O;T #
 J3 J3 JT J Jr]   r   )&r|   collectionsr   multiprocessingr   pathlibr   typingr   r   r   r	   r
   r   r   r   r   numpyr   py_rust_stemmersr   fastembed.common.utilsr   r   r   r   fastembed.parallel_processorr   r   &fastembed.sparse.sparse_embedding_baser   r    fastembed.sparse.utils.tokenizerr   rL   r`   r@   r   )langs   0r\   <module>r      s    	 # 1  J J J   ,  D = D O-
 "7JKtvT]K  o" odJ Jq Ls   +B