
    :Qg#                       d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZmZ  ej&                  e      j*                  j-                         Zej0                  j3                  eddd      Zd	 Zd
 Zd Zd Zd Zd Z d>dZ!d>dZ"d Z#d>dZ$ejJ                  jM                  dg d      d?d       Z'd>dZ(ejJ                  jM                  dg d      d?d       Z)d Z*d Z+d Z,d Z-d>dZ.d Z/d>dZ0ejJ                  jM                  d d!d"g      d@d#       Z1ejJ                  jM                  d$d%gd%fd&d'd(d)gd*fd+gd*fd,g      dAd-       Z2d. Z3ejJ                  jM                  d/d0gd*d0gfd1gd2d0gfd0gd)d0gfd0gd+d0gfd0gd*gd0gfd0gd)gd0gfd3gd4d0d*gfg      	 	 	 	 	 	 	 	 dBd5       Z4ejJ                  jM                  d/g d*d*gfd6gd+d*gfd6gd*d*gfd6gd)d*gfg      	 	 	 	 	 	 	 	 dCd7       Z5ejJ                  jM                  d8g dfd6gdfg      	 	 	 	 dDd9       Z6d: Z7ejJ                  jM                  d8g d;gfd6gd<fg      	 	 	 	 dEd=       Z8y)Fz=Unit-test suite for the `unstructured.partition.lang` module.    )annotationsN)LogCaptureFixture)NarrativeText	PageBreak)_clean_ocr_languages_arg/_convert_language_code_to_pytesseract_lang_codeapply_lang_metadatacheck_language_argsdetect_languagesprepare_languages_for_tesseracttesseract_to_paddle_languagez..zexample-docsc                 *    dg} t        |       dk(  sJ y Nenengr   	languagess    i/var/www/html/answerous/venv/lib/python3.12/site-packages/test_unstructured/partition/common/test_lang.py6test_prepare_languages_for_tesseract_with_one_languager      s    I*95>>>    c                 ,    ddg} t        |       dk(  sJ y r   r   r   s    r   >test_prepare_languages_for_tesseract_with_duplicated_languagesr   $   s    uI*95>>>r   c                 P    dg} t        |       dk(  sJ dg} t        |       dk(  sJ y )Nosdequr   r   s    r   1test_prepare_languages_for_tesseract_special_caser   )   s6    I*95>>>I*95>>>r   c                 ,    ddg} t        |       dk(  sJ y )Nkbdeszspa+spa_oldr   r   s    r   9test_prepare_languages_for_tesseract_removes_empty_inputsr!   1   s    I*95FFFr   c                 *    dg} t        |       dk(  sJ y )Nchi)chi_sim+chi_sim_vert+chi_tra+chi_tra_vertr   r   s    r   6test_prepare_languages_for_tesseract_includes_variantsr%   6   s    I*959ddddr   c                 ,    g d} t        |       dk(  sJ y )N)jaafrr   r   zjpn+jpn_vert+afr+eng+equr   r   s    r   <test_prepare_languages_for_tesseract_with_multiple_languagesr)   ;   s    *I*959SSSSr   c                L    ddg}t        |      dk(  sJ d| j                  v sJ y )Nzzzr#   r$   z"not a valid standard language coder   textcaplogr   s     r   ?test_prepare_languages_for_tesseract_warns_nonstandard_languager0   @   s1    I*959dddd/6;;>>>r   c                L    ddg}t        |      dk(  sJ d| j                  v sJ y )Nr   r   z%not a language supported by Tesseractr,   r.   s     r   Atest_prepare_languages_for_tesseract_warns_non_tesseract_languager2   F   s0    I*95>>>2fkkAAAr   c                 ~    t        j                  t        d      5  d } t        |        d d d        y # 1 sw Y   y xY w)Nz`languages` can not be `None`)match)pytestraises
ValueErrorr   r   s    r   3test_prepare_languages_for_tesseract_None_languagesr8   L   s3    	z)H	I 3	'	23 3 3s   3<c                J    dg}t        |      dk(  sJ d| j                  v sJ y )N r   z>Failed to find any valid standard language code from languagesr,   r.   s     r   7test_prepare_languages_for_tesseract_no_valid_languagesr;   R   s.    I*95>>>Kv{{ZZZr   tesseract_langexpected_lang))r   r   )chi_simch)chi_trachinese_cht)deugerman)jpnjapan)korkoreanc                $    |t        |       k(  sJ y Nr   r<   s     r   -test_tesseract_to_paddle_language_valid_codesrL   X   s     8HHHHr   c                H    d}t        |      dk(  sJ d| j                  v sJ y )Nunsupported_langr   z?unsupported_lang is not a language code supported by PaddleOCR,)r   r-   )r/   r=   s     r   /test_tesseract_to_paddle_language_invalid_codesrO   g   s/    'N'74???LPVP[P[[[[r   ))ENGr   )Frafr)DEUrD   c                $    |t        |       k(  sJ y rJ   rK   r<   s     r   2test_tesseract_to_paddle_language_case_sensitivityrU   m   s     8HHHHr   c                 *    d} t        |       dgk(  sJ y )NThis is a short sentence.r   r   r-   s    r   "test_detect_languages_english_autorZ   y   s    &DD!eW,,,r   c                 2    d} dg}t        | |      dgk(  sJ y )NThis is another short sentence.r   r   rX   r-   r   s     r   &test_detect_languages_english_providedr^   ~   s%    ,DID),777r   c                 *    d} t        |       dgk(  sJ y )Nu   안녕하세요rG   rX   rY   s    r   !test_detect_languages_korean_autor`      s    DD!eW,,,r   c                 ,    d} t        |       g dk(  sJ y )NzMy lubimy mleko i chleb.)cespolslkrX   rY   s    r   -test_detect_languages_gets_multiple_languagesre      s    %DD!%::::r   c                T    d}g d}t        ||      dgk(  sJ d| j                  v sJ y )Nr\   )r   autorusr   z.rest of the inputted languages will be ignored)r   r-   )r/   r-   r   s      r   4test_detect_languages_warns_for_auto_and_other_inputri      s5    ,D%ID),777;v{{JJJr   c                     t        j                  t              5  d} t        | d      dgk(   d d d        y # 1 sw Y   y xY w)NrW   r   r   )r5   r6   	TypeErrorr   rY   s    r   <test_detect_languages_raises_TypeError_for_invalid_languagesrl      s9    	y	! ;*/E7:; ; ;s	   7A c                    t        d      t        d      g}t        t        |dgd            }d| j                  D cg c]  }|j
                   c}vsJ y c c}w )NzSample text.r:   rg   T)elementsr   detect_language_per_elementzNo features in text.)r   r   listr	   recordsmessage)r/   rn   recs      r   5test_apply_lang_metadata_has_no_warning_for_PageBreakrt      sX    n-y}=Hh(,	
H ")P##++)PPPP)Ps   Alang_inr>   )r   r   )rR   frac                $    |t        |       k(  sJ y rJ   )r   ru   s     r   3test_convert_language_code_to_pytesseract_lang_codery      s     KGTTTTr   input_ocr_langsexpectedr   )"deu"rC   )[deu]rC   )z['deu']rC   r~   rC   r}   )deu+spar   c                $    t        |       |k(  sJ y rJ   )r   rz   s     r   test_clean_ocr_languages_argr      s     $O4@@@r   c                 0    t        ddg      } | dgk(  sJ y )Nz Sample text longer than 5 words.Spanishr]   sparX   r   s    r   3test_detect_languages_handles_spelled_out_languagesr      s#     &HU^T_`Ir   )r   ocr_languagesexpected_langsr   spanishenglishzspa+deuzeng+deuc                X    t        | |      }|D ]  }||v sJ d|j                  v rJ  y Nr   r   r   r
   r-   r   r   r   r/   returned_langslangs         r   Ytest_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_definedr      sE    $ )#N  .~%%%&++---.r   r:   c                X    t        | |      }|D ]  }||v sJ d|j                  v rJ  y r   r   r   s         r   Ktest_check_language_args_uses_ocr_languages_when_languages_is_empty_or_Noner      s?      )9MZN .~%%%&++---.r   r   c                &    t        | |      }|J y Nr   r
   )r   r   r   s      r   %test_check_language_args_returns_Noner      s     )9MZN!!!r   c                 .    t        g dd       dgk(  sJ y )N)r   r   rg   r   rg   r    r   r   %test_check_language_args_returns_autor     s    )?tTY_X````r   rg   zeng+autoc                z    t        j                  t              5  t        | |       d d d        y # 1 sw Y   y xY wr   )r5   r6   r7   r
   r   s     r   Ftest_check_language_args_raises_error_when_ocr_languages_contains_autor     s3     
z	" 
'	

 
 
s   1:)r/   r   )r=   strr>   r   )rv   r   r>   r   )r{   r   r|   r   )r   	list[str]r   zlist[str] | strr   r   r/   r   )r   r   r   r   r   r   r/   r   )r   r   r   None)r   r   r   zstr | list[str])9__doc__
__future__r   ospathlibr5   test_unstructured.unit_utilsr   unstructured.documents.elementsr   r   "unstructured.partition.common.langr   r   r	   r
   r   r   r   Path__file__parentresolve	DIRECTORYpathjoinEXAMPLE_DOCS_DIRECTORYr   r   r   r!   r%   r)   r0   r2   r8   r;   markparametrizerL   rO   rU   rZ   r^   r`   re   ri   rl   rt   ry   r   r   r   r   r   r   r   r   r   r   <module>r      s   D " 	   :   GLL"))113	it^L ?
?
?G
e
T
?B3[ '
I
I\ 'II-
8-
;
K;	Q  UU #
%
E
EAA 
 4
%%!
i%)
'E7#
'E7#
5'E7#
7)eW%
i%0..". . 	.. 4 
UUG
w 
uug
w 		.	.	. 	. 		.		. "	T

t""""a "	fX
z

"

r   