
    :Qg              	           d dl Z d dlmZmZ d dlZd dlmZmZ d dlm	Z	 d dl
mZ dedefdZd	efd
Zddedee   dedefdZd Zy)    N)ListOptional)MarianMTModelMarianTokenizer)sent_tokenize)chunk_by_attention_windowsource_langtarget_langc                     d|  d| S )zjConstructs the name of the MarianMT machine translation model based on the
    source and target language.zHelsinki-NLP/opus-mt-- )r	   r
   s     \/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/cleaners/translate.py_get_opus_mt_model_namer      s     #;-q>>    language_codec                 ^    t        | t              rt        |       dk7  rt        d|  d      y )N   zInvalid language code: z,. Language codes must be two letter strings.)
isinstancestrlen
ValueError)r   s    r   _validate_language_coder      s8    mS)S-?1-D%m_4`a
 	
 .Er   textreturnc                    | j                         dk(  r| S ||nt        j                  |       }|j                  d      rd}t	        |       t	        |       ||k(  r| S t        ||      }	 t        j                  |      }t        j                  |      }t        | |t              }g }|D ]  }	|j                  t        | ||               dj                  |      S # t        $ r t        d| d      w xY w)a  Translates the foreign language text. If the source language is not specified, the
    function will attempt to detect it using langdetect.

    Parameters
    ----------
    text: str
        The text to translate
    target_lang: str
        The two letter language code for the target langague. Defaults to "en".
    source_lang: Optional[str]
        The two letter language code for the language of the input text. If source_lang is
        not provided, the function will try to detect it.
     zhz2Transformers could not find the translation model z>. The requested source/target language combo is not supported.)split_function )strip
langdetectdetect
startswithr   r   r   from_pretrainedr   OSErrorr   r   r   append_translate_textjoin)
r   r	   r
   _source_lang
model_name	tokenizermodelchunkstranslated_chunkschunks
             r   translate_textr0      s    zz|r'2'>JDUDUVZD[L t$K(L)l"({CJ
#33J?	--j9 2$	R_`F#% J  ui!HIJ 88%&&  
@ MK K
 	

s   ,*C C7c                    t        j                         5  t        j                  d        |j                  d
i  || gddd      }ddd       D cg c]  }|j	                  |dd       c}d	   S # 1 sw Y   .xY wc c}w )z8Translates text using the specified model and tokenizer.ignorept
max_lengthi   )return_tensorspaddingr4   NT)max_new_tokensskip_special_tokensr   r   )warningscatch_warningssimplefiltergeneratedecode)r   r,   r+   
translatedts        r   r'   r'   J   s    
 
	 	 	" 
h'#U^^ 
t\VYZ



 XbbRSIQsMb	 
 

 cs   3A5B5A>)Nen)r9   typingr   r   r!   transformersr   r   unstructured.nlp.tokenizer    unstructured.staging.huggingfacer   r   r   r   r0   r'   r   r   r   <module>rE      sb     !  7 3 F? ?3 ?
3 
/' /'8C= /'c /']` /'dr   