
    ##h.                        d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ  ej.                  e      Zdd	Zd
edefdZd Z G d d      Z G d de      Z dedefdZ! G d d      Z" G d de"      Z# G d de"      Z$ G d de"      Z% G d de"      Z& G d de"      Z' G d  d!e"      Z( G d" d#e"      Z) G d$ d%e"      Z* G d& d'e"      Z+ G d( d)e"      Z, G d* d+e"      Z- G d, d-e"      Z. G d. d/e.      Z/ G d0 d1e.      Z0 G d2 d3e.      Z1 G d4 d5e.      Z2 G d6 d7e.      Z3 G d8 d9e.      Z4 G d: d;e.      Z5 G d< d=e.      Z6 G d> d?e.      Z7 G d@ dAe.      Z8 G dB dCe.      Z9 G dD dEe.      Z: G dF dGe.      Z; G dH dIe.      Z< G dJ dKe.      Z= G dL dMe.      Z> G dN dOe"      Z? G dP dQe.      Z@ G dR dSe"      ZA G dT dUe"      ZB G dV dWe"      ZC G dX dYe.      ZD G dZ d[e.      ZE G d\ d]e.      ZF G d^ d_e"      ZG G d` dae.      ZH G db dce.      ZIdd ZJ G de df      ZKi dge/dhe+die0dje#dke@dleCdme1dneAdoe(dpe#dqe-dre2dse#dte#due#dve#dwe#i dxe/dye%dze(d{e)d|e#d}e#d~e+de7de+de+de#deGde3de4de&de#de+i de5de'de<de*de#de9de:de#de+de,de6de#de=de>de?de7de8e$eDeFeFeEeFdZLddefdZMy)z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERRORc                    t               rddlm} |S t               rSdd l}t        j                  |j                  j                        t        j                  d      k  rddl	m} |S ddl	m
} |S t        t        j                  |             )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles      `/var/www/html/sandstorm/venv/lib/python3.12/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr"   "   sl    !#9&&==445g8NNB '& b&&/66}EFF    add_prefix_spacereturnc                 4    | rd}t        |dd      sd}|S d}|S )NalwayslegacyTfirstnever)getattr)r$   original_tokenizerprepend_schemes      r!   _get_prepend_schemer.   3   s1    !)8T:$N  !r#   c                     |d u}|rt        |      n }g }|j                         D ]j  \  }}g }t        dt        |            D ]*  }|d | ||d  }	}| v s|	 v s|j	                  ||	|f       , t        | fd      }|j                  |       l t        |d |      }|D 
cg c]  }
|
d   |
d   f }}
|S c c}
w )Nr   c                 $    | d      | d      fS Nr   r    )xvocabs    r!   <lambda>z!generate_merges.<locals>.<lambda>H   s    U1Q4[%!+,F r#   keyc                 B    | d   t        | d         t        | d         fS )N   r   r   )lenvals    r!   r5   z!generate_merges.<locals>.<lambda>K   s!    SVSQ[#c!f+,N r#   r7   reverser   )dictitemsranger:   appendsortedextend)r4   vocab_scoresr>   mergesmergepiece_scorelocalindexpiece_lpiece_rr<   s   `          r!   generate_mergesrM   =   s    $&G)04%eLF*002 {1c%j) 	>E$Ve}eEFmWG%Gu$4gw<=	> u"FGe F NX_`F*013s1vs1v1F1M 2s   'B<c                   D    e Zd ZdZdefdZddeeeef   e	e   f   fdZ
y)SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 v    t        | d       ddlm}  |       | _        | j                  j	                  |       y )Nr   r   )SentencePieceProcessor)r   r   rR   spLoad)selfrP   rR   s      r!   __init__zSentencePieceExtractor.__init__U   s)    $08(*Ur#   Nr%   c                     | j                   }t        |j                               D ci c]  }|j                  |      | }}t	        ||      }||fS c c}w )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        )rS   rA   GetPieceSizeid_to_piecerM   rU   rE   rS   rJ   r4   rF   s         r!   extractzSentencePieceExtractor.extract\   sV    
 WW;@AR;ST%&-TT 5f}	 Us   AN)__name__
__module____qualname____doc__strrV   tupler?   intlistr\   r2   r#   r!   rO   rO   P   s5    c 
E$sCx.$u+2M,N 
r#   rO   c                   4    e Zd Zddeeeef   ee   f   fdZy)GemmaSentencePieceExtractorNr%   c                     | j                   }t        |j                               D ci c]  }|j                  |      | }}d|vr|j	                  d      |d<   t        ||      }||fS c c}w )rX   	<0x09>)rS   rA   rY   rZ   getrM   r[   s         r!   r\   z#GemmaSentencePieceExtractor.extractj   sr    
 WW;@AR;ST%&-TT u))H-E$K 5f} Us   A+r]   )	r^   r_   r`   rc   r?   rb   rd   re   r\   r2   r#   r!   rg   rg   i   s$    E$sCx.$u+2M,N r#   rg   piecec                 ^    t        |       dk  xs | d   dk7  xs | d   j                          S )Nr9   ,)r:   isdigit)rl   s    r!   check_number_commarr   z   s3    u:>HU2Y#-HU2Y5F5F5H1HHr#   c                       e Zd Zd ZdefdZy)	Converterc                     || _         y r]   )r,   )rU   r,   s     r!   rV   zConverter.__init__   s
    "4r#   r%   c                     t               r]   )NotImplementedErrorrU   s    r!   	convertedzConverter.converted   s    !##r#   N)r^   r_   r`   rV   r   ry   r2   r#   r!   rt   rt   ~   s    5$9 $r#   rt   c                       e Zd ZdefdZy)BertConverterr%   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr,   r4   r   r   rb   r~   hasattrr   tokenize_chinese_charsr   do_lower_caser   BertNormalizer
normalizerr	   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr
   TemplateProcessingpost_processorr   decoder
rU   r4   	tokenizerr   r   r   clssepr   r   s
             r!   ry   zBertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r#   Nr^   r_   r`   r   ry   r2   r#   r!   r{   r{          #9 #r#   r{   c                       e Zd ZdefdZy)SplinterConverterr%   c           
         | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }t	        | j                   j&                        }d}	| j                   j(                  }
| j                   j*                  }| j                   j,                  }| j                   j/                  d      }| j                   j0                  dk(  r| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}t3        j4                  | d| d|||
f||f||f|	|fg      |_        t9        j                  d      |_        |S )Nr}   Fr   Tr   .rightr    r   r   r   r   r   r   )r,   r4   r   r   rb   r~   r   r   r   r   r   r   r   r   r	   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider
   r   r   r   r   )rU   r4   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s                  r!   ry   zSplinterConverter.converted   s"   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*l#l#,-l#		$
	  %..d;	r#   Nr   r2   r#   r!   r   r      s    .9 .r#   r   c                       e Zd ZdefdZy)FunnelConverterr%   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr}   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             r!   ry   zFunnelConverter.converted   r   r#   Nr   r2   r#   r!   r   r      r   r#   r   c                       e Zd ZdefdZy)MPNetConverterr%   c                 r   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	| d
||f||	fg      |_        t1        j                  d      |_        |S )Nr}   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             r!   ry   zMPNetConverter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=l#l#$
	  %..d;	r#   Nr   r2   r#   r!   r   r     r   r#   r   c                       e Zd ZdefdZy)OpenAIGPTConverterr%   c           
         | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d t        |      dd            }|j                  t        |            |j                  t        |      g       t        j                  d      |_        t        j                         |_        t#        j$                  d      |_        |S )N</w>F)r4   rF   dropoutr~   end_of_word_suffixfuse_unkT)r   suffix)r,   encoderre   	bpe_rankskeysr~   r   r   rb   token_to_idadd_special_tokensr   r   r   r	   r   r   r   
BPEDecoderr   rU   r4   rF   r~   r   s        r!   ry   zOpenAIGPTConverter.converted.  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	r#   Nr   r2   r#   r!   r   r   -  s    9 r#   r   c                   <    e Zd Zddeeef   deeeef      defdZ	y)GPT2ConverterNr4   rF   r%   c           
      N   |s| j                   j                  }|st        | j                   j                        }t	        t        ||d ddd            }t        | j                   dd      }t        j                  |      |_	        t        j                         |_        t        | j                   dd      rT| j                   j                  }| j                   j                  }t        j                  | d| d||fg	      |_        |S t        j                  d
      |_        |S )N Fr4   rF   r   continuing_subword_prefixr   r   r$   r$   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r,   r   re   r   r   r   r+   r	   	ByteLevelr   r   r   	bos_tokenbos_token_idr
   r   r   )rU   r4   rF   r   r$   bosr   s          r!   ry   zGPT2Converter.convertedI  s   ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUC))33C22??L'1'D'DguL),' (I$  (2';';'OI$r#   NN
r^   r_   r`   r?   rb   rd   re   rc   r   ry   r2   r#   r!   r   r   H  s2    "tCH~ "d5c?>S "_h "r#   r   c                       e Zd ZdefdZy)HerbertConverterr%   c           	         d}d}| j                   j                  }t        | j                   j                  j	                               }||d   d   v r|dd  }t        t        ||d | j                   j                  |            }t        j                  dd      |_
        t        j                         |_        t        j                  |      |_        t#        j$                  | j                   j&                  | j                   j(                  f| j                   j*                  | j                   j,                  f	      |_        |S )
Nz	#version:r   r   r   )r   r~   r   F)r   r   r   )r   r   )r,   r   re   r   r   r   r   r~   r   r   r   r	   r   r   r   r   r   r
   BertProcessingr   r   r   r   r   )rU   tokenizer_info_strtoken_suffixr4   rF   r   s         r!   ry   zHerbertConverter.convertedo  s   (''//d--77<<>?1-ABZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 r#   Nr   r2   r#   r!   r   r   n      9 r#   r   c                   <    e Zd Zddeeef   deeeef      defdZ	y)Qwen2ConverterNr4   rF   r%   c                 0   |s| j                   j                  }|s-t        | j                   j                  j	                               }t        t        ||d d dddd            }t        j                         |_	        t        j                  t        j                  t        d      dd      t        j                  t        | j                   dd      d      g      |_        t#        j                         |_        t'        j                  d	      |_        |S )
Nr   F)r4   rF   r   r~   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr$   r$   	use_regexr   )r,   r   re   r   r   r   r   r   NFCr   r	   SequenceSplitr   r   r+   r   r   r   r
   r   )rU   r4   rF   r   s       r!   ry   zQwen2Converter.converted  s   ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 r#   r   r   r2   r#   r!   r   r     s2    (tCH~ (d5c?>S (_h (r#   r   c                       e Zd ZdefdZy)RobertaConverterr%   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  |j                  |j                   f|j"                  |j$                  f|j                  d      |_        |S )Nr   Fr   r   Tr   r   r$   r   )r,   r   re   r   r   r   r   r	   r   r$   r   r   r   r
   RobertaProcessingr   r   r   r   r   rU   otr4   rF   r   s        r!   ry   zRobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  r#   Nr   r2   r#   r!   r   r         9 r#   r   c                       e Zd ZdefdZy)RoFormerConverterr%   c           	      V   ddl m} | j                  j                  }t	        t        |t        | j                  j                                    }d}d}t        | j                  d      r@| j                  j                  j                  }| j                  j                  j                  }t        j                  dd||      |_        t        j                   j#                   ||            |_        t        | j                  j&                        }t        | j                  j(                        }| j                  j*                  }| j                  j,                  }	t/        j0                  | d| d	| d| d
| d||f||	fg      |_        t5        j
                  d      |_        |S )Nr   )JiebaPreTokenizerr}   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr   r,   r4   r   r   rb   r~   r   r   r   r   r   r   r   r	   PreTokenizercustomr   r   r   r   r   r
   r   r   r   r   )
rU   r   r4   r   r   r   r   r   r   r   s
             r!   ry   zRoFormerConverter.converted  sy   I''--iT=T=T=^=^9_`a	4**,=> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r#   Nr   r2   r#   r!   r   r     r   r#   r   c                       e Zd ZdefdZy)DebertaConverterr%   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  ddd| j                   j                  d      fd| j                   j                  d      fg	      |_        |S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r,   r   re   r   r   r   r   r	   r   r$   r   r   r   r
   r   r   r   r   s        r!   ry   zDebertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  r#   Nr   r2   r#   r!   r   r     r   r#   r   c                   `     e Zd ZdZeZi Z fdZd Zd Z	d Z
d Zd Zd Zd	 Zd
efdZ xZS )SpmConverterFc                    t        | d       t        |   |  t               }|j	                         }t        | j                  j                  d      5 }|j                  |j                                d d d        || _
        | j                  j                  j                  r#| j                  st        j                  d       y y y # 1 sw Y   TxY w)Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrV   r"   
ModelProtoopenr,   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rU   args	model_pb2mf	__class__s        r!   rV   zSpmConverter.__init__!  s    $
+$ $%	  "$))44d; 	(qaffh'	(
::""009R9RMMe :S0		( 	(s    CCc                 l    |j                   D cg c]  }|j                  |j                  f c}S c c}w r]   piecesrl   scorerU   r  rl   s      r!   r4   zSpmConverter.vocab6  s'    8=Euekk*EEEs   1c                 .    |j                   j                  S r]   )r  unk_idrU   r  s     r!   r  zSpmConverter.unk_id9  s    !!(((r#   c                 ~   |j                   j                  }| j                  |      }|dk(  r1t        t	        || j                  |      | j                              }n|dk(  r| j                  | j                  j                        j                  |      \  }}t        |      D 	ci c]  \  }\  }}	|| }
}}}	t        t        |
||j                   j                  d| j                  d             }nt        d      t        |j                        D cg c]I  \  }}|j                   dv r6||j"                  |j                   dk(  xs |j"                  | j$                  v fK }}}|j'                  t)        |d	 
      D cg c]  \  }}}t+        |d|       c}}}       |S c c}	}}w c c}}w c c}}}w )Nr   r  r   r9   T)r~   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithm      r"  c                     | d   S Nr   r2   r3   s    r!   r5   z(SpmConverter.tokenizer.<locals>.<lambda>h      QRSTQU r#   r6   F
normalizedspecial)r  
model_typer4   r   r   r  r  SpmExtractorr,   r
  r\   	enumerater   	unk_piece	Exceptionr  typerl   r   
add_tokensrC   r   )rU   r  r+  rE   r   _rF   iwordr  	bpe_vocabidpspm_added_tokenstokenr*  s                   r!   r   zSpmConverter.tokenizer<  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIAv9B<9PQQ%5QuqQIQ!#00::!"&";"; 	I o  #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGD	
 C R*
s   )F+AF2F8c                     |j                   j                  }t        j                  dd      t        j                  t        d      d      g}|st        j                  |      S t        j                  t        j                  |      g|z         S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   PrecompiledrU   r  r?  _normalizerss       r!   r   zSpmConverter.normalizern  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggr#   c                 \    t        || j                        }t        j                  ||      S Nreplacementr-   )r.   r,   r	   	MetaspacerU   rH  r$   r-   s       r!   r   zSpmConverter.pre_tokenizery  s)    ,-=t?V?VW''KP^__r#   c                      y r]   r2   rx   s    r!   r   zSpmConverter.post_processor}  s    r#   c                 \    t        || j                        }t        j                  ||      S rF  )r.   r,   r   rI  rJ  s       r!   r   zSpmConverter.decoder  s(    ,-=t?V?VW!!k.YYr#   r%   c                 z   | j                  | j                        }| j                  | j                        }|||_        d}d}t        | j                  d      r| j                  j
                  }| j                  ||      }|||_        | j                  ||      |_        | j                         }|r||_        |S )Nr=  Tr$   )	r   r  r   r   r,   r$   r   r   r   )rU   r   r   rH  r$   r   r   s          r!   ry   zSpmConverter.converted  s    NN4::.	 __TZZ0
!#-I 4**,>?#66GG**;8HI$&3I# LL6FG	,,.'5I$r#   )r^   r_   r`   r  rO   r,  r   rV   r4   r  r   r   r   r   r   r   ry   __classcell__r  s   @r!   r  r    sL     )LN*F)0d	h`Z9 r#   r  c                       e Zd Zd Zd Zd Zy)AlbertConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w Nd   r  rr   rl   r  r  s      r!   r4   zAlbertConverter.vocab  ^     
 +=U[[*IU[[%++&PUP[P[]b]h]hkn]nOoo
 	
 
   AA!c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S Nz``"z''r<  r   r   rA  r,   keep_accentsrB   NFKDStripAccentsr   	Lowercaser>  r?  rB  r   r   rU   r  list_normalizersr?  s       r!   r   zAlbertConverter.normalizer      c*c*
 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S Nr   r   r  r  r   r
   r   r,   r   rx   s    r!   r   zAlbertConverter.post_processor  R    ,,)4$11GGPQ$11GGPQ
 	
r#   Nr^   r_   r`   r4   r   r   r2   r#   r!   rQ  rQ        
6&
r#   rQ  c                       e Zd Zd Zd Zy)BarthezConverterc                 
    d}|S Nr"  r2   rU   r  r  s      r!   r  zBarthezConverter.unk_id      r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   re  rx   s    r!   r   zBarthezConverter.post_processor  R    ,, +//EEeLM00FFvNO
 	
r#   N)r^   r_   r`   r  r   r2   r#   r!   rj  rj    s    
r#   rj  c                       e Zd Zd Zd Zd Zy)CamembertConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )N))z
<s>NOTUSED        <pad>rw  )z</s>NOTUSEDrw  z<unk>rw  )z<unk>NOTUSEDir   z<mask>rw  r  rU   r  r4   rl   s       r!   r4   zCamembertConverter.vocab  sP    
 	%,,qr:JK5;;,KK/"" L   Ac                      yrl  r2   r  s     r!   r  zCamembertConverter.unk_id  s    r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rp  re  rx   s    r!   r   z!CamembertConverter.post_processor  rs  r#   Nr^   r_   r`   r4   r  r   r2   r#   r!   ru  ru    s    
r#   ru  c                       e Zd Zd Zd Zd Zy)DebertaV2Converterc                    g }| j                   j                  r%|j                  t        j                  d             t        || j                         }|j                  t        j                  ||             t        j                  |      S )Nr   )r   rG  )r,   split_by_punctrB   r	   Punctuationr.   rI  r   )rU   rH  r$   list_pretokenizersr-   s        r!   r   z DebertaV2Converter.pre_tokenizer  sq    ""11%%n&@&@*&UV,-=t?V?VW!!.":":{cq"rs&&'9::r#   c                    g }| j                   j                  r#|j                  t        j                                |j                  t        j
                                |j                  j                  }|r$|j                  t        j                  |             |j                  t        j                  t        d      d             t        j                  |      S )Nr<  r   )r,   r   rB   r   r_  r@  r>  r?  rB  rA  r   r   r`  s       r!   r   zDebertaV2Converter.normalizer  s    ""00##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rd  re  rx   s    r!   r   z!DebertaV2Converter.post_processor  rf  r#   N)r^   r_   r`   r   r   r   r2   r#   r!   r  r    s    ;6
r#   r  c                       e Zd Zd Zd Zd Zy)MBartConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nrq  rw  rx  rr  rw  rz  r"  )ar_ARrw  cs_CZrw  de_DErw  en_XXrw  es_XXrw  et_EErw  fi_FIrw  fr_XXrw  gu_INrw  hi_INrw  it_ITrw  ja_XXrw  kk_KZrw  ko_KRrw  lt_LTrw  lv_LVrw  my_MMrw  ne_NPrw  nl_XXrw  ro_ROrw  ru_RUrw  si_LKrw  tr_TRrw  vi_VNrw  zh_CNrw  r{  r  r|  s       r!   r4   zMBartConverter.vocab  sa    
 	%,,qr:JK5;;,KK 
 	
6 	/""; L   A
c                      yrl  r2   r  s     r!   r  zMBartConverter.unk_id7      r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A </s> en_XXz$A $B </s> en_XXr  rr  r   re  rx   s    r!   r   zMBartConverter.post_processor:  R    ,,"#$11GGPQ00FFvNO
 	
r#   Nr  r2   r#   r!   r  r    s    $L
r#   r  c                       e Zd Zd Zd Zd Zy)MBart50Converterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nr  r"  )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZArw  )az_AZrw  )bn_INrw  )fa_IRrw  )he_ILrw  )hr_HRrw  )id_IDrw  )ka_GErw  )km_KHrw  )mk_MKrw  )ml_INrw  )mn_MNrw  )mr_INrw  )pl_PLrw  )ps_AFrw  )pt_XXrw  )sv_SErw  )sw_KErw  )ta_INrw  )te_INrw  )th_THrw  )tl_XXrw  )uk_UArw  )ur_PKrw  )xh_ZArw  )gl_ESrw  )sl_SIrw  r{  r  r|  s       r!   r4   zMBart50Converter.vocabF  sa    
 	%,,qr:JK5;;,KK  R  	R/"" Lr  c                      yrl  r2   r  s     r!   r  zMBart50Converter.unk_idR  r  r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzen_XX $A </s>zen_XX $A $B </s>r  rr  r   re  rx   s    r!   r   zMBart50Converter.post_processorU  r  r#   Nr  r2   r#   r!   r  r  E  s    

r#   r  c                       e Zd Zd Zd Zd Zy)NllbConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr  r"  r  r|  s       r!   r4   zNllbConverter.vocaba  C    
 	%,,qr:JK5;;,KK L   =c                      yrl  r2   r  s     r!   r  zNllbConverter.unk_idk  r  r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnrr  r   re  rx   s    r!   r   zNllbConverter.post_processorn  sR    ,,%&T44JJ:VW00FFvNO
 	
r#   Nr  r2   r#   r!   r  r  `  s    
r#   r  c                       e Zd Zd Zd Zd Zy)SeamlessM4TConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )N)rx  rz  r  r  r"  r  r|  s       r!   r4   zSeamlessM4TConverter.vocabz  r  r  c                 .    | j                   j                  S r]   )r,   unk_token_idr  s     r!   r  zSeamlessM4TConverter.unk_id  s    &&333r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__rr  r   re  rx   s    r!   r   z#SeamlessM4TConverter.post_processor  sR    ,,$%D33II)TU00FFvNO
 	
r#   Nr  r2   r#   r!   r  r  y  s    4
r#   r  c                       e Zd Zd Zd Zd Zy)XLMRobertaConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )Nr  r"  r{  r  r|  s       r!   r4   zXLMRobertaConverter.vocab  sP    
 	%,,qr:JK5;;,KK/"" Lr}  c                 
    d}|S rl  r2   rm  s      r!   r  zXLMRobertaConverter.unk_id  rn  r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rp  re  rx   s    r!   r   z"XLMRobertaConverter.post_processor  rs  r#   Nr  r2   r#   r!   r  r        	
r#   r  c                       e Zd Zd Zd Zd Zy)XLNetConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w rS  rU  r  s      r!   r4   zXLNetConverter.vocab  rV  rW  c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S rY  r[  r`  s       r!   r   zXLNetConverter.normalizer  rb  r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   re  rx   s    r!   r   zXLNetConverter.post_processor  rf  r#   Nrg  r2   r#   r!   r  r    rh  r#   r  c                       e Zd Zy)ReformerConverterNr^   r_   r`   r2   r#   r!   r  r        r#   r  c                       e Zd Zd Zd Zy)RemBertConverterc                 b   t        j                  dd      t        j                  dd      t        j                  t        d      d      g}| j                  j                  sF|j                  t        j                                |j                  t        j                                | j                  j                  r#|j                  t        j                                |j                  j                  }|r$|j                  t        j                  |             t        j                  |      S rY  )r   rA  r   r,   r\  rB   r]  r^  r   r_  r>  r?  rB  r   r`  s       r!   r   zRemBertConverter.normalizer  s    c*c*g4

 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR##$455r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rd  re  rx   s    r!   r   zRemBertConverter.post_processor  rf  r#   N)r^   r_   r`   r   r   r2   r#   r!   r  r    s    6&
r#   r  c                       e Zd Zy)BertGenerationConverterNr  r2   r#   r!   r	  r	    r  r#   r	  c                   $    e Zd Zd Zd Zd Zd Zy)PegasusConverterc                 v   | j                   j                  df| j                   j                  dfg}| j                   j                  || j                   j                  dfgz  }| j                   j                  I| j                   j
                  | j                   j                  k  r|| j                   j                  dfgz  }|t        d| j                   j                        D cg c]
  }d| ddf c}z  }||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w c c}w )Nrw  r9   z<unk_>g      Y)r,   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrA   r  rl   r  )rU   r  r4   r3  rl   s        r!   r4   zPegasusConverter.vocab  s%   $$..4$$..4

 ""22>t..>>DEEE ##..:''558O8O8V8VVt..993?@@E%4;R;R;Y;Y2Z[QU1#Q<([[%,,qr:JK5;;,KK \Ks   %D1D6c                 \    |j                   j                  | j                  j                  z   S r]   )r  r  r,   r  r  s     r!   r  zPegasusConverter.unk_id  s%    !!((4+B+B+I+IIIr#   c                     t        || j                        }t        j                  t        j                         t        j
                  ||      g      S rF  )r.   r,   r	   r   WhitespaceSplitrI  rJ  s       r!   r   zPegasusConverter.pre_tokenizer  sJ    ,-=t?V?VW&&..0(([Q_`
 	
r#   c                     | j                   j                  }|| j                   j                  fg}t        j                  d|gdd|g|      S )N$A$Br   )r,   r  eos_token_idr
   r   )rU   eosr   s      r!   r   zPegasusConverter.post_processor  sR    %%//$))667
 ,,T3KtTSVFWhvwwr#   N)r^   r_   r`   r4   r  r   r   r2   r#   r!   r  r    s    &J
xr#   r  c                       e Zd Zd Zd Zy)T5Converterc                     | j                   j                  }|j                  D cg c]  }|j                  |j                  f }}|t        |dz
  dd      D cg c]
  }d| ddf c}z  }|S c c}w c c}w )Nr   rn   z
<extra_id_r  rw  )r,   
_extra_idsr  rl   r  rA   )rU   r  num_extra_idsrl   r4   r3  s         r!   r4   zT5Converter.vocab#  sw    //::9>F%++u{{+FFE-!:KRQS4TUqZs!$c*UU GUs   A/A4c                 r    t        j                  ddgg dd| j                  j                  d      fg      S Nr  rr  )r  rr  r  rr  r   re  rx   s    r!   r   zT5Converter.post_processor)  =    ,,&>-00FFvNO
 	
r#   N)r^   r_   r`   r4   r   r2   r#   r!   r  r  "  s    
r#   r  c                       e Zd Zd Zy)UdopConverterc                 r    t        j                  ddgg dd| j                  j                  d      fg      S r"  re  rx   s    r!   r   zUdopConverter.post_processor4  r#  r#   Nr^   r_   r`   r   r2   r#   r!   r%  r%  3  s    
r#   r%  c                       e Zd ZdefdZy)WhisperConverterr%   c           
         | j                   j                  }t        | j                   j                  j	                               }t        t        ||d ddd            }t        j                  | j                   j                        |_
        t        j                         |_        | j                   j                  }| j                   j                  |      }| j                   j                  }| j                   j                   }dj#                  |D cg c]  }| d	 c}      }	t%        j&                  |	 d| d|	 d| d	||fgt)        ||      
      |_        |S c c}w )Nr   Fr   r   r   r   z $A:0 z $A:0 $B:1 r   r   )r,   r   re   r   r   r   r   r	   r   r$   r   r   r   prefix_tokensconvert_ids_to_tokensr  r  joinr
   r   zipr   )
rU   r4   rF   r   prefix_token_idsprefixesr  r  r9  prefix_templates
             r!   ry   zWhisperConverter.converted?  sR   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GUugRL#GH#-#@#@%&fSE4#$KuB7l#X/0$
	   $Hs   ENr   r2   r#   r!   r)  r)  >  s     9  r#   r)  c                       e Zd Zd Zy)BigBirdConverterc           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rd  re  rx   s    r!   r   zBigBirdConverter.post_processorc  rf  r#   Nr'  r2   r#   r!   r3  r3  b  s    
r#   r3  c                       e Zd ZdefdZy)CLIPConverterr%   c                 p   | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d dddt        |                  }t        j                  t        j                         t        j                  t        d      d      t        j                         g      |_        t!        j                  t!        j"                  t        d      dd	
      t!        j$                  d      g      |_        t)        j$                         |_        t-        j.                  | j                   j0                  | j                   j2                  f| j                   j4                  | j                   j6                  fdd      |_        |S )Nr   r   Fr4   rF   r   r   r   r   r~   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r,   r   re   r   r   r~   r   r   rb   r   r   r   rA  r   r_  r   r	   r   r   r   r   r   r
   r   r  r  r   r   r   r   s        r!   ry   zCLIPConverter.convertedo  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  r#   Nr   r2   r#   r!   r6  r6  n  s    '9 'r#   r6  c                       e Zd ZdefdZy)LayoutLMv2Converterr%   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr}   FTr   r   r   r   r   r   r   r   r   r   r   s
             r!   ry   zLayoutLMv2Converter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r#   Nr   r2   r#   r!   r;  r;    r   r#   r;  c                       e Zd ZdefdZy)BlenderbotConverterr%   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  d|j                   d|j                  |j                   fg      |_        |S )Nr   Fr   r   z$A:0 r   )r   r   )r,   r   re   r   r   r   r   r	   r   r$   r   r   r   r
   r   r  r  r   r   s        r!   ry   zBlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  r#   Nr   r2   r#   r!   r>  r>    r   r#   r>  c                       e Zd Zd Zd Zd Zy)XGLMConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|S c c}w )Nr  r"  ))z<madeupword0>rw  )z<madeupword1>rw  )z<madeupword2>rw  )z<madeupword3>rw  )z<madeupword4>rw  )z<madeupword5>rw  )z<madeupword6>rw  r  r|  s       r!   r4   zXGLMConverter.vocab  sT    
 	%,,qr:JK5;;,KK  z  	z Ls   Ac                 
    d}|S rl  r2   rm  s      r!   r  zXGLMConverter.unk_id  rn  r#   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz</s> $Az</s> $A </s> </s> $Brq  rr  r   re  rx   s    r!   r   zXGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
r#   Nr  r2   r#   r!   rA  rA    r  r#   rA  c                   <    e Zd ZdZeZddhZ	 d Zd Zd Z	d Z
d Zy	)
GemmaConverterTz<start_of_turn>z<end_of_turn>c                 .    t        j                  dd      S Nr   r=  )r   rA  r  s     r!   r   zGemmaConverter.normalizer  s    ""3..r#   c                 t   | j                   j                  df| j                   j                  df| j                   j                  dfg}||j                  dd  D cg c]  }|j
                  |j                  f c}z  }t        d |D              s#t        d t        |      D        d       }|d||<   |S c c}w )Nrw  r"  c              3   ,   K   | ]  }|d    dk(    yw)r   ri   Nr2   ).0r3   s     r!   	<genexpr>z'GemmaConverter.vocab.<locals>.<genexpr>  s     /A1Q44</s   c              3   8   K   | ]  \  }}|d    dk(  s|  yw)r   rj   Nr2   )rK  r3  r3   s      r!   rL  z'GemmaConverter.vocab.<locals>.<genexpr>  s     "VAQqTXEU1"Vs   )ri   rw  )
r,   r  r  r   r  rl   r  anynextr-  )rU   r  r4   rl   override_indexs        r!   r4   zGemmaConverter.vocab  s    $$..4$$..4$$..4

 	%,,qr:JK5;;,KK ///!"V51A"VX\]N)(3n% Ls   B5c                 .    t        j                  dd      S )Nr   merged_with_previous)r	   r   rU   rH  r$   s      r!   r   zGemmaConverter.pre_tokenizer  s    ##C)?@@r#   c                 
    d}|S rl  r2   rm  s      r!   r  zGemmaConverter.unk_id  rn  r#   c                     t        j                  t        j                  dd      t        j                         t        j                         g      S )Nr=  r   )r   r   rA  ByteFallbackFuserS  s      r!   r   zGemmaConverter.decoder"  s?        ,%%'
 	
r#   N)r^   r_   r`   r  rg   r,  r   r   r4   r   r  r   r2   r#   r!   rF  rF    s6    .L'9N/ A
r#   rF  c                   4    e Zd ZdZd Zd Zd Zd Zd Zd Z	y)	LlamaConverterTc                 (   | j                   j                  d      df| j                   j                  d      df| j                   j                  d      dfg}||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr   rw  r   r9   r"  )r,   r,  r  rl   r  r|  s       r!   r4   zLlamaConverter.vocab/  s    $$::1=sC$$::1=sC$$::1=sC

 	%,,qr:JK5;;,KK Ls   )Bc                 
    d}|S r%  r2   rm  s      r!   r  zLlamaConverter.unk_id8  rn  r#   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S Nr=  r   r   )contentr;  r   rA  rV  rW  r@  r   rU   rH  r$   sequences       r!   r   zLlamaConverter.decoder<  \    UC(!!#MMO

 !<==H  **r#   c                     t        | j                  dd      rcg }t        | j                  dd      r|t        j                  d      gz  }|t        j                  dd      gz  }t        j
                  |      S y )Nr(   Tr$   r=  )prependr   )patternr^  )r+   r,   r   PrependrA  r   )rU   r  ra  s      r!   r   zLlamaConverter.normalizerF  sr    4**Hd;Ht..0BDI[00?@@,,S%HIIH''11r#   c                     t        | j                  dd      s.t        || j                        }t        j                  ||d      S y )Nr(   TFrH  r-   split)r+   r,   r.   r	   rI  rJ  s       r!   r   zLlamaConverter.pre_tokenizerO  sA    t..$?01A4CZCZ[N!++Tbjoppr#   c                      y r]   r2   rx   s    r!   r   zLlamaConverter.post_processorU  s    r#   N)
r^   r_   r`   r  r4   r  r   r   r   r   r2   r#   r!   rY  rY  ,  s&    +r#   rY  c                       e Zd ZdefdZy)MarkupLMConverterr%   c                    | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd| j                   j                              }t        j                  |j                        |_        t        j                         |_        t        | j                   j                        }t        | j                   j                         }| j                   j"                  }| j                   j$                  }t'        j(                  | d| | d| d| ||f||fg      |_        |S )Nr   Fr8  r   z $A z $B r   )r,   r   re   r   r   r   r   r~   r	   r   r$   r   r   r   rb   r   r   r   r   r
   r   r   )	rU   r   r4   rF   r   r   r   r   r   s	            r!   ry   zMarkupLMConverter.converted[  s,   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+l#l#$
	  r#   Nr   r2   r#   r!   rl  rl  Z  s    "9 "r#   rl  c                   *    e Zd ZdZddZd Zd Zd Zy)MoshiConverterTNc                    t        | d       t        j                  | |       t               }|j	                         }t        |d      5 }|j                  |j                                d d d        || _        y # 1 sw Y   || _        y xY wNr   r  	r   rt   rV   r"   r  r	  r  r  r  )rU   r
  model_max_lengthkwargsr  r  r  s          r!   rV   zMoshiConverter.__init__  sr    $
+4, $%	  "*d# 	(qaffh'	(
	(
   	 A99B	c                     |j                   j                  }t        j                  dd      g}|st        j                  |      S t        j                  t        j
                  |      g|z         S rH  )r>  r?  r   rA  r   rB  rC  s       r!   r   zMoshiConverter.normalizer  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggr#   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S r]  r_  r`  s       r!   r   zMoshiConverter.decoder  rb  r#   c                 6    d}t        j                  ||d      S )Nr)   Frh  )r	   rI  rJ  s       r!   r   zMoshiConverter.pre_tokenizer  s     ''KP^fkllr#   r]   )r^   r_   r`   r  rV   r   r   r   r2   r#   r!   ro  ro    s    h+mr#   ro  c                   B    e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
d
 Zy)HeliumConverterTNc                    t        | d       t        j                  | |       t               }|j	                         }t        |d      5 }|j                  |j                                d d d        || _        y # 1 sw Y   || _        y xY wrq  rr  )rU   r
  r  r  r  r  s         r!   rV   zHeliumConverter.__init__  sp    $
+4,#%	  "*d# 	(qaffh'	(
	(
ru  c                 V   | j                  |      }t        t        || j                  |      | j                              }t        |j                        D cg c]I  \  }}|j                  dv r6||j                  |j                  dk(  xs |j                  | j                  v fK }}}|j                  t        |d       D cg c]  \  }}}t        |d|d       c}}}       |j                  t        d	dd
      g       |j                  dd       |S c c}}w c c}}}w )Nr   r!  r"  c                     | d   S r%  r2   r&  s    r!   r5   z+HeliumConverter.tokenizer.<locals>.<lambda>  r'  r#   r6   FT)r)  r*  single_word
r(  ry  )r  pad_id)r4   r   r   r  r  r-  r  r0  rl   r   r1  rC   r   enable_padding)	rU   r  rE   r   r6  r7  r8  r9  r*  s	            r!   r   zHeliumConverter.tokenizer  s    zz%({{5)"77
	 #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGQUV	
 	j%OPQ  71 =
s   ADD$c                     g }|j                   D ]@  }|j                  dk(  r|d|j                  fgz  }%||j                  |j                  fgz  }B |S )Nz<0x0A>r  r  r|  s       r!   r4   zHeliumConverter.vocab  s]    \\ 	6E{{h&4-..5;;455		6
 r#   c                 
    d}|S r%  r2   rm  s      r!   r  zHeliumConverter.unk_id  rn  r#   c                     t        j                  dd      t        j                         t        j                         g}|t        j                  dd      gz  }t        j
                  |      S r]  r_  r`  s       r!   r   zHeliumConverter.decoder  sY    UC(!!#MMO

 	X^^Ca899  **r#   c                 ~    t        j                  t        j                  d      t        j                  dd      g      S rH  )r   r   rf  rA  r  s     r!   r   zHeliumConverter.normalizer  s2    ##[%8%8%={?R?RSWY^?_$`aar#   c                 V    t        j                  t        j                  dd      g      S )Nr  
contiguous)r	   r   r   rS  s      r!   r   zHeliumConverter.pre_tokenizer  s#    &&(<(<T<(P'QRRr#   c                 <    t        j                  ddgg ddg      S )Nrq  r  )rq  r  rq  r  )rq  r   r   )r
   r   rx   s    r!   r   zHeliumConverter.post_processor  s/    ,, 
 	
r#   r]   )r^   r_   r`   r  rV   r   r4   r  r   r   r   r   r2   r#   r!   rz  rz    s2    
8+bS
r#   rz  c            	         t        t        t        d      t        d      dz               t        t        t        d      t        d      dz               z   t        t        t        d      t        d      dz               z   } | dd }d	}t        d
      D ]1  }|| vs| j                  |       |j                  d
|z          |dz  }3 |D cg c]  }t	        |       }}t        t        | |            S c c}w )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      )re   rA   ordrB   chrr?   r.  )bscsnbs       r!   bytes_to_unicoder    s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[ B;IIaLIIdQhFA	
 	Q#a&	B	B 
s   C4c                   J     e Zd ZdZ	 	 	 	 d fd	ZdefdZd ZdefdZ	 xZ
S )	TikTokenConverterz'
    A general tiktoken converter.
    c                     t        |   |  || _        || _        || _        t        |      t        u r|j                         | _        y || _        y r]   )	r  rV   r
  re  r$   r0  r?   r   additional_special_tokens)rU   r
  re  r$   r  r  rt  r  s          r!   rV   zTikTokenConverter.__init__   sV     	$$ 0045N0OSW0W%**, 	&]v 	&r#   tiktoken_urlc                 0   	 ddl m}  ||      t	               fd}g }i }j                         D ]  \  }}|| ||      <   t        |      dk(  r g }t        dt        |            D ]2  }	|d |	 ||	d  }}
|
v s|v s|
|z   v s|j                  |
||f       4 t        |fdd      }|j                  |        t        |d	 d      }|D cg c]  } ||d          ||d         f }}||fS # t        $ r t        d      w xY wc c}w )
Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	          dj                  | j                  d      D cg c]  }t        |          c}      S c c}w )Nr   zlatin-1)r-  decoder  )r  charbyte_encoders     r!   token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string<  s2    77@STLT3TUUTs   <r   c                 $    | d      | d      fS r1   r2   )r3   r   s    r!   r5   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>J  s    1Q4)AaD/0R r#   Fr=   c                     | d   S )Nr9   r2   r;   s    r!   r5   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>L  s
    A r#   )tiktoken.loadr  r/  
ValueErrorr  r@   r:   rA   rB   rC   rD   )rU   r  r  r  rF   r4   r9  rankrI   rJ   rK   rL   r<   r   r  s                @@r!   extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_model1  sY   	7 &l3	')	V $??, 
	!KE426E'./5zQEq#e*- ;#(%=%-i'Gy,@gPWFW\eEeLL'7D!9:; 5&R\abEMM% 
	! $6F\bcUX(Q02GA2OPccf}5  	k 	2 ds   C; D;Dc                     | j                  | j                        \  }}t        t        ||d            }t	        |j
                  d      rd|j
                  _        |S )NF)r   ignore_mergesT)r  r
  r   r   r   rP   r  )rU   rE   rF   r   s       r!   r   zTikTokenConverter.tokenizerP  sN    #CCDOOTfc,GH	9??O4,0IOO)r#   r%   c           
         | j                         }t        j                  t        j                  t	        | j
                        dd      t        j                  | j                  d      g      |_        t        j                         |_
        |j                  | j                  D cg c]  }t        |dd       c}       t        j                  d      |_        |S c c}w )Nr   Fr   r   Tr(  r   )r   r	   r   r   r   re  r   r$   r   r   r   r   r  r   r
   r   )rU   r   r9  s      r!   ry   zTikTokenConverter.convertedW  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$LPLjLjk5Z%>k	
 $.#7#7U#K	  ls   'C )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)r^   r_   r`   ra   rV   rb   r  r   r   ry   rN  rO  s   @r!   r  r    s;      K"&
"C >9 r#   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                 v   | j                   j                  }|t        v r!|st        |   } ||       j                         S 	 t        j                  d       t        | j                  | j                        j                         S # t        $ r* t        dt        t        j                                      w xY w)a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r
  r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r  r^   SLOW_TO_FAST_CONVERTERSry   loggerinfor  r
  r  r/  r  re   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       r!   convert_slow_tokenizerr    s      1::CC66}12FG45??AA	KK23$0;;*?*Y*Y ik  	>>BCZC_C_Ca>b=ce 	s   AB 3B8)r   )F)Nra   r  	packagingr   
tokenizersr   r   r   r   r   r	   r
   tokenizers.modelsr   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerr^   r  r"   boolrb   r.   rM   rO   rg   rr   rt   r{   r   r   r   r   r   r   r   r   r   r   r  rQ  rj  ru  r  r  r  r  r  r  r  r  r  r	  r  r  r%  r)  r3  r6  r;  r>  rA  rF  rY  rl  ro  rz  r  r  r  r  r2   r#   r!   <module>r     sg     f f f 5 5 ` ` 5 
		H	%G"$ s & 2"8 "Ic Id I$ $$I $N/	 /d$i $N$Y $N 6#I #Ly >)Y )Xy :$	 $Ny >~9 ~B"
l "
J
| 
 
 
:
 
B2
\ 2
j
| 
6
L 
2
< 
2
, 
6"
\ "
J	 	
| 
@	l 	%x| %xP
, 
"
L 
!y !H	
| 	
(I (V$) $N) :
L 
61
\ 1
h+\ +\#	 #L&m\ &mRV
l V
t0L L^::%: (: ]	:
 (: .: ,: ]: : : (: ,: =: -: "=:  !-!:" #:$ _%:& ':( ]):* (+:, -:. =/:0 +1:2 -3:4 +5:6 $7:8 }9:: *;:< n=:> (?:@ nA:B =C:D $E:F ]G:H ,I:J (K:L nM:N mO:P *Q:R (S:T -U:V (W:X *Y:Z 0[:\ M]:^ ;_:` ]a:b (c:d .e:f ng:h +"$($#s: z!) !r#   