
    :Qg9                       d dl mZ d dlZd dlZd dlmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZ d dlmZ  ed       ed       ed       ed       ed       ed      gZdj;                  dd      ZdZej@                  jC                  dg d      dGd       Z"d Z#ej@                  jC                  dg d      dHd       Z$ej@                  jC                  ddd e%fd!d e&fg      	 	 	 	 	 	 dId"       Z'd# Z(d$ Z)ej@                  jC                  dg d      dHd%       Z*d& Z+ej@                  jC                  dg d      dHd'       Z,d( Z-d) Z.d* Z/d+ Z0d, Z1d- Z2d. Z3d/ Z4d0 Z5d1 Z6d2 Z7d3 Z8d4 Z9d5 Z:d6 Z;dJd7Z<d8 Z=d9 Z>dJd:Z?d; Z@d< ZAd= ZBd> ZCej@                  jC                  d?g d      dKd@       ZDdA ZEdB ZFdC ZGdD ZHdE ZIdF ZJy)L    )annotationsN)OptionalType)MockerFixture)assert_round_trips_through_JSONexample_doc_path)chunk_by_title)group_broken_paragraphs)AddressListItemNarrativeTextTitle)FileTypepartition_text)#UNSTRUCTURED_INCLUDE_DEBUG_METADATAz.This is a test document to use for unit tests.textzDoylestown, PA 18901zImportant points:zHamburgers are deliciouszDogs are the bestzI love fuzzy blanketszThis is a story. This is a story that doesn't matter
 because it is just being used as an example. Hi. Hello. Howdy. Hola.
 The example is simple and repetitive and long and somewhat boring,
 but it serves a purpose. End.
 zThis is a story.

This is a story that doesn't matter because it is just being used as an example.

Hi.

Hello.

Howdy.

Hola.

The example is simple and repetitive and long and somewhat boring, but it serves a purpose.

End.
filenameencoding))fake-text.txtutf-8r   N)fake-text-utf-16-be.txtz	utf-16-ber   c                   t        t        |       |      }t        |      dkD  sJ |t        k(  sJ |D ]  }|j                  j
                  | k(  rJ  t        r+|D ch c]  }|j                  j                   c}dhk(  sJ y y c c}w )Nr   r   r   )r   r   lenEXPECTED_OUTPUTmetadatar   r   detection_origin)r   r   elementselements       b/var/www/html/answerous/venv/lib/python3.12/site-packages/test_unstructured/partition/test_text.py!test_partition_text_from_filenamer'   7   s     .x88LHx=1&&& 5((H4445*AIJg  11JvhVVV +Js   Bc                     t        t        d      dd      } | t        k(  sJ | D ]  }|j                  j                  dk(  rJ  y )Nr   r   test)r   metadata_filename)r   r   r!   r"   r   r$   r%   s     r&   8test_partition_text_from_filename_with_metadata_filenamer,   J   sQ    )GvH &&& 3((F2223    )zfake-text-utf-16.txtzfake-text-utf-16-le.txtzfake-text-utf-32.txtc                    t        t        |             }t        |      dkD  sJ |t        k(  sJ |D ]  }|j                  j
                  | k(  rJ  y )Nr   )r   r   r    r!   r"   r   )r   r$   r%   s      r&   2test_partition_text_from_filename_default_encodingr/   T   s]    
 .x89Hx=1&&& 5((H4445r-   r   r   errorr   zutf-16r   c                    t        j                  |      5  t        |       } t        | |       d d d        y # 1 sw Y   y xY w)Nr   )pytestraisesr   r   r0   s      r&   7test_partition_text_from_filename_raises_econding_errorr5   a   s9     
u	 =#H-8<= = =s	   8Ac                     t        t        d      d      5 } t        |       }d d d        t              dkD  sJ |t        k(  sJ |D ]  }|j
                  j                  J  y # 1 sw Y   DxY wNr   rbfiler   openr   r   r    r!   r"   r   fr$   r%   s      r&   test_partition_text_from_filer?   r   |    	/	6 *!!q)* x=1&&& 1((0001* *   A&&A/c                     t        d      } t        | d      5 }t        |d      }d d d        t              dkD  sJ |t        k(  sJ |D ]  }|j
                  j                  dk(  rJ  y # 1 sw Y   GxY w)Nr   r8   r)   r:   r*   r   )r   r<   r   r    r!   r"   r   r   r>   r$   r%   s       r&   4test_partition_text_from_file_with_metadata_filenamerE   |   s    0H	h	 D!qFCDx=1&&& 3((F2223	D Ds   A,,A5c                    t        t        |       d      5 }t        |      }d d d        t              dkD  sJ |t        k(  sJ |D ]  }|j
                  j                  J  y # 1 sw Y   DxY wNr8   r9   r   r;   rD   s       r&   .test_partition_text_from_file_default_encodingrH      s|    
 
x($	/ *1!q)*x=1&&& 1((0001	* *rA   c                     t        t        d      d      5 } t        |       }d d d        t              dkD  sJ |t        k(  sJ |D ]  }|j
                  j                  J  y # 1 sw Y   DxY wr7   r;   r=   s      r&   #test_partition_text_from_bytes_filerJ      r@   rA   c                    t        t        |       d      5 }t        |      }d d d        t              dkD  sJ |t        k(  sJ |D ]  }|j
                  j                  J  y # 1 sw Y   DxY wrG   r;   rD   s       r&   4test_partition_text_from_bytes_file_default_encodingrL      s~    
 
x($	/ *1!q)* x=1&&& 1((0001* *rA   c                 r    t        t        d      ddg      } | d   j                  j                  dgk(  sJ y )Nbook-war-and-peace-1p.txtfasten)strategy	languagesr   engr   r   r"   rR   r$   s    r&   <test_text_partition_element_metadata_user_provided_languagesrV      s?    45SWRXH A;))eW444r-   c                    t        t        d            5 } | j                         }d d d        t              }t	        |      dkD  sJ |t
        k(  sJ |D ]  }|j                  j                  J  y # 1 sw Y   PxY w)Nr   r   r   )r<   r   readr   r    r!   r"   r   )r>   r   r$   r%   s       r&   test_partition_text_from_textrY      s    	/	0 Avvx 4(Hx=1&&& 1((0001 s   A55A>c                 &    t        d      g k(  sJ y )Nr   r   r    r-   r&   5test_partition_text_from_text_works_with_empty_stringr\      s    r"b(((r-   c                 t    t        j                  t              5  t                d d d        y # 1 sw Y   y xY wN)r3   r4   
ValueErrorr   r[   r-   r&   .test_partition_text_raises_with_none_specifiedr`      s)    	z	"   s   .7c                     t        d      } t        |       5 }|j                         }d d d        t        j                  t
              5  t        |        d d d        y # 1 sw Y   9xY w# 1 sw Y   y xY w)Nr   )r   r   )r   r<   rX   r3   r4   r_   r   )r   r>   r   s      r&   2test_partition_text_raises_with_too_many_specifiedrb      sg    0H	h 1vvx 
z	" 5t45 5 5 5s   A 	A, A),A5c                     d} t        |       }|t        d      t        d      gk(  sJ |D ]  }|j                  j                  J  y )Nz6
    VERY IMPORTANT MEMO
    DOYLESTOWN, PA 18901
    r   zVERY IMPORTANT MEMOzDOYLESTOWN, PA 18901)r   r   r   r"   r   r   r$   r%   s      r&   <test_partition_text_captures_everything_even_with_linebreaksre      sg    D 4(H()+,     1((0001r-   c                     d} t        | t              }|t        d      t        d      gk(  sJ |D ]  }|j                  j                  J  y )NzYThe big brown fox
was walking down the lane.

At the end of the lane,
the fox met a bear.)r   paragraph_grouperz,The big brown fox was walking down the lane.r   z+At the end of the lane, the fox met a bear.)r   r
   r   r"   r   rd   s      r&   ,test_partition_text_groups_broken_paragraphsrh      sm    	 	 4;RSHIJHI     1((0001r-   c                     t        t        d            } t        |       dkD  sJ | d   j                  j	                  d      sJ | d   j                  j                  d      sJ y )Nnorwich-city.txtr   zIwan RobertszExternal links)r   r   r    r   
startswithendswithrU   s    r&   $test_partition_text_splits_long_textrn      sb    ./ABCHx=1A;&&~666B<%%&6777r-   c                     d} t        |       }t        |      dk(  sJ |d   j                  | k(  sJ t        |d   t              rJ y )Nz--------------------r      r   )r   r    r   
isinstancer   )r   r$   s     r&   *test_partition_text_doesnt_get_page_breaksrr      sR    !D4(Hx=AA;t###(1+x0000r-   c                 |    t        t        d            } t        d | D              sJ t        d | D              sJ y )Nr   c              3  N   K   | ]  }|j                   j                  d k(    ywr   r"   r   .0es     r&   	<genexpr>zZtest_partition_text_from_filename_gets_filename_metadata_from_file_path.<locals>.<genexpr>       H!qzz""o5H   #%c              3  `   K   | ]&  }|j                   j                  t        d       k(   ( yw)r   N)r"   file_directoryr   rv   s     r&   ry   zZtest_partition_text_from_filename_gets_filename_metadata_from_file_path.<locals>.<genexpr>  s%     SQqzz((,<R,@@Ss   ,.r   r   allrU   s    r&   Gtest_partition_text_from_filename_gets_filename_metadata_from_file_pathr     s:    .?@HHxHHHHS(SSSSr-   c                     t        t        d      d      5 } t        |       }d d d        t        d D              sJ t        d |D              sJ y # 1 sw Y   2xY w)Nr   r8   r9   c              3  L   K   | ]  }|j                   j                  d u   y wr^   ru   rv   s     r&   ry   zLtest_partition_text_from_file_gets_filename_metadata_None.<locals>.<genexpr>  s     =qqzz""d*=   "$c              3  L   K   | ]  }|j                   j                  d u   y wr^   r"   r}   rv   s     r&   ry   zLtest_partition_text_from_file_gets_filename_metadata_None.<locals>.<genexpr>  s     CQqzz((D0Cr   r<   r   r   r   r>   r$   s     r&   9test_partition_text_from_file_gets_filename_metadata_Noner   
  s]    	/	6 *!!q)* =H====C(CCCC	* *s   AAc                     t        t        d      d      } t        d | D              sJ t        d | D              sJ y )Nr   z	a/b/c.txt)r*   c              3  N   K   | ]  }|j                   j                  d k(    yw)zc.txtNru   rv   s     r&   ry   zNtest_partition_text_from_filename_prefers_metadata_filename.<locals>.<genexpr>       @!qzz""g-@r{   c              3  N   K   | ]  }|j                   j                  d k(    yw)za/bNr   rv   s     r&   ry   zNtest_partition_text_from_filename_prefers_metadata_filename.<locals>.<genexpr>       Daqzz((E1Dr{   r~   rU   s    r&   ;test_partition_text_from_filename_prefers_metadata_filenamer     s=    .?S^_H@x@@@@D8DDDDr-   c                     t        t        d      d      5 } t        | d      }d d d        t        d D              sJ t        d |D              sJ y # 1 sw Y   2xY w)Nr   r8   z	d/e/f.txtrC   c              3  N   K   | ]  }|j                   j                  d k(    yw)zf.txtNru   rv   s     r&   ry   zJtest_partition_text_from_file_prefers_metadata_filename.<locals>.<genexpr>  r   r{   c              3  N   K   | ]  }|j                   j                  d k(    yw)zd/eNr   rv   s     r&   ry   zJtest_partition_text_from_file_prefers_metadata_filename.<locals>.<genexpr>  r   r{   r   r   s     r&   7test_partition_text_from_file_prefers_metadata_filenamer     sc    	/	6 I!!qKHI @x@@@@D8DDDD	I Is   AAc                     dt        t        d            } t        fd| D              s-J d dt        | d   j                  j
                                y )Nz
text/plainr   c              3  P   K   | ]  }|j                   j                  k(    y wr^   r"   filetype)rw   rx   TXT_MIME_TYPEs     r&   ry   zRtest_partition_text_gets_the_TXT_MIME_type_in_metadata_filetype.<locals>.<genexpr>'  s      Fqzz""m3F   #&zExpected all elements to have 'z' as their filetype, but got: r   )r   r   r   reprr"   r   )r$   r   s    @r&   ?test_partition_text_gets_the_TXT_MIME_type_in_metadata_filetyper   $  s_     M.?@HFXFF 
)- 9!%%../0	2Fr-   c                     t        t        d      t        j                        } t	        d | D              s*J dt        | d   j                  j                                y )Nz	README.md)metadata_file_typec              3  N   K   | ]  }|j                   j                  d k(    yw)ztext/markdownNr   rv   s     r&   ry   zAtest_partition_text_prefers_metadata_file_type.<locals>.<genexpr>/  rz   r{   zJExpected all elements to have 'text/markdown' as their filetype, but got: r   )r   r   r   MDr   r   r"   r   rU   s    r&   .test_partition_text_prefers_metadata_file_typer   -  s[    .{;PXP[P[\HHxHH !%%../0	2Hr-   c                    d| j                  d       t        t        d            }t        fd|D              sJ y )N2029-07-05T09:24:282unstructured.partition.text.get_last_modified_datereturn_valuer   c              3  P   K   | ]  }|j                   j                  k(    y wr^   r"   last_modified)rw   rx   filesystem_last_modifieds     r&   ry   zXtest_partition_text_from_file_path_gets_last_modified_from_filesystem.<locals>.<genexpr>@  s!     Vqzz''+CCVr   patchr   r   r   )mockerr$   r   s     @r&   Etest_partition_text_from_file_path_gets_last_modified_from_filesystemr   8  sF    4
LL<Kc   .?@HVXVVVVr-   c                     t        t        d      d      5 } t        |       }d d d        t        d D              sJ y # 1 sw Y   xY w)Nr   r8   r9   c              3  L   K   | ]  }|j                   j                  d u   y wr^   r   rv   s     r&   ry   zHtest_partition_text_from_file_gets_last_modified_None.<locals>.<genexpr>G       BAqzz''4/Br   r   r   s     r&   5test_partition_text_from_file_gets_last_modified_Noner   C  sI    	/	6 *!!q)* BBBBB* *s   A  A	c                     t        t        d            5 } | j                         }d d d        t              }t	        d |D              sJ y # 1 sw Y   *xY w)Nr   r   c              3  L   K   | ]  }|j                   j                  d u   y wr^   r   rv   s     r&   ry   zHtest_partition_text_from_text_gets_last_modified_None.<locals>.<genexpr>P  r   r   r<   r   rX   r   r   )r>   r   r$   s      r&   5test_partition_text_from_text_gets_last_modified_Noner   J  sR    	/	0 Avvx 4(HBBBBB s   AAc                    d}d| j                  d|       t        t        d            }t        fd|D              sJ y )Nr   2020-07-05T09:24:28r   r   r   )metadata_last_modifiedc              3  P   K   | ]  }|j                   j                  k(    y wr^   r   rw   rx   r   s     r&   ry   zTtest_partition_text_from_file_path_prefers_metadata_last_modified.<locals>.<genexpr>^  !     Taqzz''+AATr   r   )r   r   r$   r   s      @r&   Atest_partition_text_from_file_path_prefers_metadata_last_modifiedr   S  sS    42
LL<Kc   )BXH T8TTTTr-   c                     dt        t        d      d      5 } t        |       }d d d        t        fdD              sJ y # 1 sw Y    xY w)Nr   r   r8   )r:   r   c              3  P   K   | ]  }|j                   j                  k(    y wr^   r   r   s     r&   ry   zOtest_partition_text_from_file_prefers_metadata_last_modified.<locals>.<genexpr>f  r   r   r   )r>   r$   r   s     @r&   <test_partition_text_from_file_prefers_metadata_last_modifiedr   a  sW    2	/	6 Y!!qAWXY T8TTTTY Ys   AAc                     dt        t        d            5 } | j                         }d d d        t              }t	        fd|D              sJ y # 1 sw Y   -xY w)Nr   r   )r   r   c              3  P   K   | ]  }|j                   j                  k(    y wr^   r   r   s     r&   ry   zOtest_partition_text_from_text_prefers_metadata_last_modified.<locals>.<genexpr>p  r   r   r   )r>   r   r$   r   s      @r&   <test_partition_text_from_text_prefers_metadata_last_modifiedr   i  s\    2	/	0 Avvx 4@VWHT8TTTT s   AAc                 f    t        d      D  cg c]  } | j                   }} |g dk(  sJ y c c} w )Nhello
hello
hellor   ) 8657c0ec31a4cfc822f6cd4a5684cafd 72aefb4a12be063ad160931fdb380163 ba8c1a216ca585aecdd365a72e6124f1)r   id)r%   idss     r&   Etest_Text_element_assigns_id_hashes_that_are_unique_and_deterministicr   v  s<    %39N%O
P'7::
PC
P     Qs   .c                     t        dd      } | D ]H  }t        j                  |j                  d      sJ t	        j
                  |j                                J y )Nr   T)r   unique_element_ids   )version)r   uuidUUIDr   jsondumpsto_dictr+   s     r&   >test_Text_element_assings_UUID_when_unique_element_ids_is_Truer     sK    #8TRH &yyQ/// 	

7??$%	&r-   )	file_namer   c                F    t        t        |       |      }t        |       y )Nr   )r   r   r   )r   r   r$   s      r&   test_partition_text_with_jsonr     s     .y9HMH#H-r-   c                 ~    t        d      } t        |       }t        | d      }t        |      }||k7  sJ ||k(  sJ y )NrN   )r   by_title)chunking_strategy)r   r   r	   )r   r$   chunk_elementschunkss       r&   ,test_add_chunking_strategy_on_partition_textr     sI     ;<Hx0H#H
KNH%FX%%%V###r-   c                 j    t        t        d            } | d   j                  j                  dgk(  sJ y )Nrj   r   rS   rT   rU   s    r&   2test_partition_text_element_metadata_has_languagesr     s3    ./ABCHA;))eW444r-   c                     t        t        d      d      } | D cg c]  }|j                  j                   }}|dgddgdgdgdggk(  sJ y c c}w )Nzlanguage-docs/eng_spa_mult.txtTdetect_language_per_elementrS   sparT   )r$   r%   langss      r&   8test_partition_text_respects_detect_language_per_elementr     se    9:X\H 8@@GW''@E@eWuenugwHHHH As   Ac                 p    t        t        d      dg      } | d   j                  j                  dgk(  sJ y )Nrj   deurR   r   rT   rU   s    r&   *test_partition_text_respects_languages_argr     s7    ./ABugVHA;))eW444r-   c                     t        j                  t              5  t        t	        d      d       d d d        y # 1 sw Y   y xY w)Nrj   rS   r   )r3   r4   	TypeErrorr   r   r[   r-   r&   5test_partition_text_element_metadata_raises_TypeErrorr     s8    	y	! N'(:;uMN N Ns	   :Ac                     t        t        d      d      } | D cg c]2  }|j                  j                  s|j                  j                  d   4 }}t	        |      dkD  sJ y c c}w )Nz(language-docs/UDHR_first_article_all.txtTr   r   
   )r   r   r"   rR   r    )r$   rx   r   s      r&   1test_partition_text_detects_more_than_3_languagesr     s^    CD$(H /7O!**:N:NQZZ!!!$OEOu:?? Ps
   A%A%)r   strr   Optional[str])r   r   )r   r   r   r   r1   zType[BaseException])r   r   )r   r   r   z
str | None)K
__future__r   r   r   typingr   r   r3   pytest_mockr   test_unstructured.unit_utilsr   r   unstructured.chunking.titler	   unstructured.cleaners.corer
   unstructured.documents.elementsr   r   r   r   unstructured.file_utils.modelr   unstructured.partition.textr   &unstructured.partition.utils.constantsr   r!   replaceMIN_MAX_TEXTSHORT_PARAGRAPHSmarkparametrizer'   r,   r/   UnicodeDecodeErrorUnicodeErrorr5   r?   rE   rH   rJ   rL   rV   rY   r\   r`   rb   re   rh   rn   rr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r[   r-   r&   <module>r     sr   #   !  % Z 6 > S S 2 6 V GH'(	"#,-%&)*" #*'#  $ WW3 O5	5 %	($67	"Hl;=== ==13 O1	11 O1	15	1)
511&81TDEEWCCUUU& ..
$5
I5
N
r-   