
    +#h                         d Z ddlZddlZ G d d      Z G d d      Z G d de      Zd	 Zd
 Z e       Zd Z	d Z
d Zy)z
    Source: DPR Implementation from Facebook Research
    https://github.com/facebookresearch/DPR/tree/master/dpr
    Original license: https://github.com/facebookresearch/DPR/blob/main/LICENSE
    Nc                   r    e Zd ZdZdZdZdZdZdZdZ	dd	Z
d
 ZddZd ZddZd Zd Zd Zd ZddZd Zy)Tokensz.A class to represent a list of tokenized text.r                  Nc                 6    || _         || _        |xs i | _        y N)data
annotatorsopts)selfr   r   r   s       J/var/www/html/sandstorm/venv/lib/python3.12/site-packages/dsp/utils/dpr.py__init__zTokens.__init__   s    	$JB	    c                 ,    t        | j                        S )zThe number of tokens.)lenr   r   s    r   __len__zTokens.__len__   s    499~r   c                 X    t         j                  |       }| j                  || |_        |S )z0Return a view of the list of tokens from [i, j).)copyr   )r   ij
new_tokenss       r   slicezTokens.slice   s&    YYt_
))Aq/
r   c                     dj                  | j                  D cg c]  }|| j                      c}      j                         S c c}w )z7Returns the original text (with whitespace reinserted). )joinr   TEXT_WSstripr   ts     r   
untokenizezTokens.untokenize$   s4    ww;A$,,;<BBDD;s   Ac                     |r5| j                   D cg c]  }|| j                     j                         ! c}S | j                   D cg c]  }|| j                      c}S c c}w c c}w )zfReturns a list of the text of each token

        Args:
            uncased: lower cases text
        )r   TEXTlower)r   uncasedr#   s      r   wordszTokens.words(   sR     26))<QAdiiL&&(<<*.))4QAdiiL44 =4s   $AA$c                 Z    | j                   D cg c]  }|| j                      c}S c c}w )z?Returns a list of [start, end) character offsets of each token.)r   SPANr"   s     r   offsetszTokens.offsets3   s"    &*ii0$))000s   (c                 x    d| j                   vry| j                  D cg c]  }|| j                      c}S c c}w )zwReturns a list of part-of-speech tags of each token.
        Returns None if this annotation was not included.
        posN)r   r   POSr"   s     r   r.   z
Tokens.pos7   3     '%)YY/$((///   7c                 x    d| j                   vry| j                  D cg c]  }|| j                      c}S c c}w )zwReturns a list of the lemmatized text of each token.
        Returns None if this annotation was not included.
        lemmaN)r   r   LEMMAr"   s     r   lemmaszTokens.lemmas?   s3     $//)'+yy1!$**111r1   c                 x    d| j                   vry| j                  D cg c]  }|| j                      c}S c c}w )zReturns a list of named-entity-recognition tags of each token.
        Returns None if this annotation was not included.
        nerN)r   r   NERr"   s     r   entitieszTokens.entitiesG   r0   r1   c                 t   fd}| j                  |      }t        t        |            D cg c]>  }t        |t        ||z   t        |                  D ]  } ||||dz          s||dz   f @ }	}}|r5|	D cg c](  \  }}dj	                  dj                  |||             * }	}}|	S c c}}w c c}}w )a\  Returns a list of all ngrams from length 1 to n.

        Args:
            n: upper limit of ngram length
            uncased: lower cases text
            filter_fn: user function that takes in an ngram list and returns
              True or False to keep or not keep the ngram
            as_string: return the ngram as a string vs list
        c                     sy |       S )NF )gram	filter_fns    r   _skipzTokens.ngrams.<locals>._skipZ   s    T?"r   r   z{} )r)   ranger   minformatr   )
r   nr(   r>   
as_stringsr?   r)   sengramss
      `      r   rH   zTokens.ngramsO   s    	#
 

7# U,0 CAs5z$:;0uQq1u~. a!e* 0* 0 0 FLMFQdkk#((51:"67MFM0 Ns   AB.<-B4c                    | j                         }|sy| j                  j                  dd      }g }d}|t        |      k  r||   }||k7  rf|}|t        |      k  r$||   |k(  r|dz  }|t        |      k  r	||   |k(  r|j	                  | j                  ||      j                         |f       n|dz  }|t        |      k  r|S )z6Group consecutive entity tokens with the same NER tag.Nnon_entOr   r   )r9   r   getr   appendr   r$   )r   r9   rJ   groupsidxner_tagstarts          r   entity_groupszTokens.entity_groupsk   s    ==?))--	3/CM!smG'!S]*x}/G1HC S]*x}/Gtzz%5@@BGLMq CM! r   r   )NN)F)r   FNT)__name__
__module____qualname____doc__r&   r    r+   r/   r4   r8   r   r   r   r$   r)   r,   r.   r5   r9   rH   rR   r<   r   r   r   r      s[    8DGD
CE
C
E	510208r   r   c                   "    e Zd ZdZd Zd Zd Zy)	TokenizerzaBase tokenizer class.
    Tokenizers implement tokenize, which should return a Tokens class.
    c                     t         r   )NotImplementedError)r   texts     r   tokenizezTokenizer.tokenize   s    !!r   c                      y r   r<   r   s    r   shutdownzTokenizer.shutdown   s    r   c                 $    | j                          y r   )r^   r   s    r   __del__zTokenizer.__del__   s    r   N)rS   rT   rU   rV   r\   r^   r`   r<   r   r   rX   rX      s    "r   rX   c                        e Zd ZdZdZd Zd Zy)SimpleTokenizerz[\p{L}\p{N}\p{M}]+z[^\p{Z}\p{C}]c                    t        j                  d| j                  d| j                  dt         j                  t         j
                  z   t         j                  z         | _        t        |j                  di             dkD  r<t        j                  t        |       j                  d|j                  d             t               | _        y)	zS
        Args:
            annotators: None or empty set (only tokenizes).
        (z)|())flagsr   r   z& only tokenizes! Skipping annotators: N)regexcompile	ALPHA_NUMNON_WS
IGNORECASEUNICODE	MULTILINE_regexpr   rL   loggerwarningtyperS   setr   )r   kwargss     r   r   zSimpleTokenizer.__init__   s    
 }}>>4;;7""U]]2U__D
 vzz,+,q0NN J//L1IK L%r   c                    g }| j                   j                  |      D cg c]  }| }}t        t        |            D ]t  }||   j	                         }||   j                         }|d   }|dz   t        |      k  r||dz      j                         d   }	n|d   }	|j                  ||||	 |f       v t        || j                        S c c}w )Nr   r   )	rn   finditerrA   r   groupspanrM   r   r   )
r   r[   r   mmatchesr   tokenrw   start_wsend_wss
             r   r\   zSimpleTokenizer.tokenize   s    "ll33D9:1::s7|$ 	AAJ$$&E 1:??$DAwH1us7|# Q,,.q1a KKXv& 	$ dDOO,,' ;s   	CN)rS   rT   rU   ri   rj   r   r\   r<   r   r   rb   rb      s    %IF -r   rb   c           	          t        |      }| D ]B  }t        dt        |      t        |      z
  dz         D ]  }||||t        |      z    k(  s  y D y)Nr   r   TF)DPR_normalizerA   r   )tokenized_answersr[   single_answerr   s       r   
has_answerr      se    D* q#d)c-&881<= 	AQC,>(> ??	
 r   c           	         t        |      }g }|j                  d      |j                         }}| D cg c]  }|j                  d       }}|D ]s  }t        dt	        |      t	        |      z
  dz         D ]J  }	|||	|	t	        |      z    k(  s||	   ||	t	        |      z   dz
     c\  }
}\  }}|j                  |
|f       L u |S c c}w )z\
    Returns each occurrence of an answer as (offset, endpos) in terms of *characters*.
    Tr(   r   r   )DPR_tokenizer)   r,   rA   r   rM   )r   r[   tokenized_textoccurrences
text_wordstext_word_positionsansanswers_wordsr   r   offset_endposs                r   locate_answersr      s     "$'NK&4&:&:4&:&H.J`J`Jb#J8IJSYYtY,JMJ& 5q#j/C,>>BC 	5A
1a#m2D.D EE+>q+ACVWXY\]jYkWklmWmCn([a""FF#34	55  Ks   C	c                 T    t         j                  t        j                  d|             S )NNFD)
STokenizerr\   unicodedata	normalizer[   s    r   r   r      s     {44UDABBr   c                 8    t        |       j                  d      S )NTr   )r   r)   r   s    r   r~   r~      s    ##D#11r   c                     t        j                  d|       } g }| D ].  }t        j                  |      }|dk(  r|j                  |       0 dj	                  |      S )z$Strips accents from a piece of text.r   Mnr   )r   r   categoryrM   r   )r[   outputcharcats       r   strip_accentsr      s^      -DF   &c	mmD	
 776?r   )rV   r   rg   r   rX   rb   r   r   r   r   r~   r   r<   r   r   <module>r      sZ     r rj '-i '-T& 
C2
	r   