
    +#hP                     D    d dl Z d dlmZ d dlmZ d dlmZ  G d d      Zy)    N)Union)Dataset)dotdictc                   v    e Zd ZdZdddddgfdeded	ed
edee   ddfdZ	 ddededede	ee   ee
   f   fdZy)PyseriniRetrieverzsWrapper for retrieval with Pyserini. Supports using either pyserini prebuilt faiss indexes or your own faiss index.zcastorini/dkrr-dpr-nq-retrieverzwikipedia-dpr-dkrr-nqN_idtextquery_encoderindexdatasetid_fieldtext_fieldsreturnc                    ddl m}m}m} ddlm}	 |	j                  |      | _        || _        || _	        || _
        ||v s||v s||v r"|	j                  || j                        | _        y |	|| j                        | _        | j                  J i | _        t        | j                  | j                           D ]  \  }
}|
| j                  |<    y)az  
        Args:
        
            query_encoder (`str`):
                Huggingface model to encode queries
            index (`str`):
                Either a prebuilt index from pyserini or a local path to a faiss index
            dataset (`Dataset`):
                Only required when using a local faiss index. The dataset should be the one that has been put into the faiss index.
            id_field (`str`):
                The name of the id field of the dataset used for retrieval.
            text_fields (`list[str]`):
                A list of the names of the text fields for the dataset used for retrieval.
        r   )FAISS_INDEX_INFOIMPACT_INDEX_INFOTF_INDEX_INFO)FaissSearcher)	index_dirr
   N)pyserini.prebuilt_index_infor   r   r   pyserini.searchr   _init_encoder_from_strencoderr   r   r   from_prebuilt_indexsearcherdataset_id_to_index	enumerate)selfr
   r   r   r   r   r   r   r   r   idocids               Q/var/www/html/sandstorm/venv/lib/python3.12/site-packages/dsp/modules/pyserini.py__init__zPyseriniRetriever.__init__   s    , 	dc1$;;MJ &M!U.>%>%K\B\)==eT\\RDM)EVDM<<+++')D$%dll4==&AB 4523((/4    querykthreadsc           	         
  j                   j                  |||      }g }t        |d      D ]  \  }} j                  [ j                  |j
                     dj                   fd j                  D              } j                   j                        }	nyt        j                   j                   j                  |j
                        j                               
dj                  
fd j                  D              }
 j                     }	|j                  |||	|j                  |d       
 |D 
cg c]  }
t        |
       c}
S c c}
w )N)r%   r&      )start c              3   B   K   | ]  }j                   |        y wN)r   ).0fieldrowr   s     r!   	<genexpr>z-PyseriniRetriever.__call__.<locals>.<genexpr>=   s     WUU 3C 8Ws   c              3   (   K   | ]	  }|     y wr,    )r-   r.   psgs     r!   r0   z-PyseriniRetriever.__call__.<locals>.<genexpr>B   s     IuE
Is   )r	   	long_textpidscorerank)r   searchr   r   r   r    joinr   r   jsonloadsdocrawappendr6   r   )r   r$   r%   r&   hitstopkr7   hitr	   r5   r3   r/   s   `         `@r!   __call__zPyseriniRetriever.__call__4   s     }}##EQ#@"4q1 	ID#||'..syy9xxWdFVFVWWll4==1#6 jj!2!2399!=!A!A!CDxxI8H8HII$--(KK! 	& )-----s    E)
      )__name__
__module____qualname____doc__strr   listr"   intr   r   rB   r2   r#   r!   r   r   	   s    } 'H5$(!&+1(%4 #%4%4 "%4 	%4
 #3i%4
 8<%4R 79.. .03.	tCy$w-'	(.r#   r   )r:   typingr   datasetsr   	dsp.utilsr   r   r2   r#   r!   <module>rO      s       D. D.r#   