
    :Qg&                       d Z ddlmZ ddlZddlmZmZ ddlZddlm	Z	 ddl
mZmZmZmZmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZ dd	lmZ dd
lm Z  d%dZ!d&dZ"	 	 	 	 d&dZ#	 	 	 	 d&dZ$d Z%d Z&d'dZ'd Z(d Z)d Z*d Z+d Z,d(dZ-	 	 d(dZ.d Z/d Z0ejb                  je                  di dfddidfddidfg      	 	 	 	 	 	 d)d       Z3d  Z4d! Z5d" Z6d# Z7 ejp                         d*d$       Z9y)+z3Test suite for `unstructured.partition.doc` module.    )annotationsN)AnyIterator)MockFixture)ANYCaptureFixtureFixtureRequestassert_round_trips_through_JSONexample_doc_pathmethod_mock)chunk_elements)	AddressCompositeElementElementListItemNarrativeTextTable
TableChunkTextTitle)partition_doc)partition_docxc                b    t        d      }t        d      }t        |      t        |      k(  sJ y )N
simple.doczsimple.docx)r   r   r   )requestdoc_file_pathdocx_file_paths      a/var/www/html/answerous/venv/lib/python3.12/site-packages/test_unstructured/partition/test_doc.py)test_partition_doc_matches_partition_docxr   %   s.    $\2M%m4N'>.+IIII    c                    t        t        d            }|| k(  sJ t        d |D              sJ |j                         j                  dk(  sJ |j                         j
                  dk(  sJ y )Nr   c              3  `   K   | ]&  }|j                   j                  t        d       k(   ( yw) N)metadatafile_directoryr   .0es     r   	<genexpr>z3test_partition_doc_from_filename.<locals>.<genexpr>3   s%     SQqzz((,<R,@@Ss   ,.r#   )r   r   all
readouterrouterr)expected_elementscapsyselementss      r    test_partition_doc_from_filenamer1   /   so    -l;<H((((S(SSSS""b(((""b(((r    c                    t        t        d      d      5 }t        |d      }d d d        | k(  sJ |j                         j                  dk(  sJ |j                         j
                  dk(  sJ y # 1 sw Y   OxY w)Nr   rbzMS Word 2007 XMLfilelibre_office_filterr#   )openr   r   r+   r,   r-   r.   r/   fr0   s       r   5test_partition_doc_from_file_with_libre_office_filterr:   8   s     
|,d	3 Qq a=OPQ ((((""b(((""b(((Q Qs   A22A;c                $   t        t        d      d      5 }t        |d       }d d d        | k(  sJ |j                         j                  dk(  sJ |j                         j
                  dk(  sJ t        d |D              sJ y # 1 sw Y   cxY w)Nr   r3   r4   r#   c              3  L   K   | ]  }|j                   j                  d u   y wNr$   filenamer&   s     r   r)   zKtest_partition_doc_from_file_with_no_libre_office_filter.<locals>.<genexpr>L        =qqzz""d*=   "$)r7   r   r   r+   r,   r-   r*   r8   s       r   8test_partition_doc_from_file_with_no_libre_office_filterrB   C   s     
|,d	3 Cq aTBC ((((""b(((""b(((=H====C Cs   BBc                     t        d      } t        | d      5 }t        j                  t        d      5  t        | |       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr   r3   2Exactly one of filename and file must be specifiedmatch)r?   r5   )r   r7   pytestraises
ValueErrorr   )r   r9   s     r   Etest_partition_doc_raises_when_both_a_filename_and_file_are_specifiedrJ   O   s`    $\2M	mT	" :a]]:-ab 	:=q9	:: :	: 	:: :s"   AAAA	AA(c                 x    t        j                  t        d      5  t                d d d        y # 1 sw Y   y xY w)NrD   rE   )rG   rH   rI   r    r    r   Vtest_partition_doc_raises_when_neither_a_file_path_nor_a_file_like_object_are_providedrM   W   s+    	z)]	^   s   09c                    t        | dz        }t        j                  t        d      5  t	        |       d d d        y # 1 sw Y   y xY w)Nzasdf.doczasdf.doc does not existrE   )r?   )strrG   rH   rI   r   )tmp_pathdoc_filenames     r   &test_partition_raises_with_missing_docrR   \   s=    x*,-L	z)B	C -|,- - -s   A  A	c                 t    t        t        d            } t        |       dkD  sJ t        d | D              sJ y )Nr   r   c              3  N   K   | ]  }|j                   j                  d k(    yw)r   Nr>   r&   s     r   r)   zStest_partition_doc_from_filename_gets_filename_from_filename_arg.<locals>.<genexpr>j   s     Eqqzz""l2E   #%r   r   lenr*   r0   s    r   @test_partition_doc_from_filename_gets_filename_from_filename_argrY   f   s8    -l;<Hx=1EHEEEEr    c                     t        t        d      d      5 } t        |       }d d d        t              dkD  sJ t	        d |D              sJ y # 1 sw Y   .xY w)Nr   r3   )r5   r   c              3  L   K   | ]  }|j                   j                  d u   y wr=   r>   r&   s     r   r)   zBtest_partition_doc_from_file_gets_filename_None.<locals>.<genexpr>r   r@   rA   )r7   r   r   rW   r*   r9   r0   s     r   /test_partition_doc_from_file_gets_filename_Noner]   m   s[    	|,d	3 )q a() x=1=H====	) )s   AAc                 x    t        t        d      d      } t        |       dkD  sJ t        d | D              sJ y )Nr   test)metadata_filenamer   c              3  N   K   | ]  }|j                   j                  d k(    ywr_   Nr>   )r'   elements     r   r)   zMtest_partition_doc_from_filename_prefers_metadata_filename.<locals>.<genexpr>y   s!     Kww((F2KrU   rV   rX   s    r   :test_partition_doc_from_filename_prefers_metadata_filenamerd   u   s:    -l;vVHx=1K(KKKKr    c                     t        t        d      d      5 } t        | d      }d d d        t        d D              sJ y # 1 sw Y   xY w)Nr   r3   r_   )r5   r`   c              3  N   K   | ]  }|j                   j                  d k(    ywrb   r>   r&   s     r   r)   zItest_partition_doc_from_file_prefers_metadata_filename.<locals>.<genexpr>   s     ?qzz""f,?rU   )r7   r   r   r*   r\   s     r   6test_partition_doc_from_file_prefers_metadata_filenamerg   |   sO    	|,d	3 Cq a6BC ?h????C Cs   AA
c                     dt        t        d            } t        fd| D              s-J d dt        | d   j                  j
                                y )Nzapplication/mswordr   c              3  P   K   | ]  }|j                   j                  k(    y wr=   )r$   filetype)r'   r(   DOC_MIME_TYPEs     r   r)   zQtest_partition_doc_gets_the_DOC_MIME_type_in_metadata_filetype.<locals>.<genexpr>   s      Fqzz""m3F   #&zExpected all elements to have 'z' as their filetype, but got: r   )r   r   r*   reprr$   rj   )r0   rk   s    @r   >test_partition_doc_gets_the_DOC_MIME_type_in_metadata_filetypern      s_    (M-l;<HFXFF 
)- 9!%%../0	2Fr    c                    d| j                  d       t        t        d            }t        fd|D              sJ y )N2029-07-05T09:24:281unstructured.partition.doc.get_last_modified_datereturn_valuezfake.docc              3  P   K   | ]  }|j                   j                  k(    y wr=   r$   last_modified)r'   r(   filesystem_last_modifieds     r   r)   zItest_partition_doc_pulls_last_modified_from_filesystem.<locals>.<genexpr>   s!     Vqzz''+CCVrl   patchr   r   r*   )mockerr0   rw   s     @r   6test_partition_doc_pulls_last_modified_from_filesystemr{      sF    4
LL;Jb   -j9:HVXVVVVr    c                    d}d| j                  d|       t        t        d            }t        fd|D              sJ y )Nrp   z2020-07-05T09:24:28rq   rr   r   )metadata_last_modifiedc              3  P   K   | ]  }|j                   j                  k(    y wr=   ru   )r'   r(   r}   s     r   r)   zRtest_partition_doc_prefers_metadata_last_modified_when_provided.<locals>.<genexpr>   s!     Taqzz''+AATrl   rx   )rz   rw   r0   r}   s      @r   ?test_partition_doc_prefers_metadata_last_modified_when_providedr      sU      52
LL;Jb   &?UH T8TTTTr    c                 T    t        t        d            } t        d | D              sJ y )Nr   c              3  P   K   | ]  }|j                   j                  d gk(     yw)engN)r$   	languagesr&   s     r   r)   z=test_partition_doc_adds_languages_metadata.<locals>.<genexpr>   s!     A1qzz##w.As   $&)r   r   r*   rX   s    r   *test_partition_doc_adds_languages_metadatar      s&    -l;<HAAAAAr    c                     t        t        d      d      } | D cg c]  }|j                  j                   c}dgddgdgdgdggk(  sJ y c c}w )Nzlanguage-docs/eng_spa_mult.docT)detect_language_per_elementr   spa)r   r   r$   r   )r0   r(   s     r   ;test_partition_doc_respects_detect_language_per_element_argr      sd    9:X\H +33QAJJ  3					8   3s   A)kwargsexpected_valuehi_resstrategyautoc                    ddl m} dd}t        | |d|      }t        t	        d      fi |\  }|j                  t               |j                  d| k(  sJ y )	Nr   )_DocxPartitionerc              3  T   K   t        d| j                  j                          y w)Nstrategy == )r   _optsr   )selfs    r   fake_iter_document_elementsz_test_partition_odt_forwards_strategy_arg_to_partition_docx.<locals>.fake_iter_document_elements   s#     \$**"5"5!6788s   &(_iter_document_elements)side_effectr   r   )r   r   returnzIterator[Element])unstructured.partition.docxr   r   r   r   assert_called_once_withr   text)r   r   r   r   r   _iter_elements_rc   s          r   :test_partition_odt_forwards_strategy_arg_to_partition_docxr      se     =9 "!/	O /=HHJW++C0<<\.)9::::r    c                    g d} g d}t        t        d            }t        |d   t              sJ |d   j                  j
                  | k(  sJ |d   j                  j                  |k(  sJ |d   t        d      k(  sJ |d   j                  j
                  | k(  sJ |d   j                  j                  |k(  sJ |d   t        d      k(  sJ |d   j                  j
                  J |d   j                  j                  J y )	N)bolditalicbold-italicr   )bir   r   zfake-doc-emphasized-text.docr      z$I am a bold italic bold-italic text.   zI am a normal text.)r   r   
isinstancer   r$   emphasized_text_contentsemphasized_text_tagsr   )!expected_emphasized_text_contentsexpected_emphasized_text_tagsr0   s      r   )test_partition_doc_grabs_emphasized_textsr      s   (X%$8!-.LMNHhqk5)))A;88<]]]]A;448UUUUA;-(NOOOOA;88<]]]]A;448UUUUA;-(=>>>>A;88@@@A;44<<<r    c                 >    t        t        t        d                   y)zCElements produced can be serialized then deserialized without loss.r   N)r
   r   r   rL   r    r   +test_partition_doc_round_trips_through_jsonr      s    #M2B<2P$QRr    c                     t        d      } t        |       }t        | d      }t        d |D              sJ |t        |      k(  sJ y )Nr   basic)chunking_strategyc              3  R   K   | ]  }t        |t        t        t        f       ! y wr=   )r   r   r   r   )r'   cs     r   r)   zYtest_partition_doc_chunks_elements_when_chunking_strategy_is_specified.<locals>.<genexpr>   s     Tz!.zBCTs   %')r   r   r*   r   )document_pathr0   chunkss      r   Ftest_partition_doc_chunks_elements_when_chunking_strategy_is_specifiedr      sK    $\2M]+H=GDF TVTTTT^H----r    c                    t        d      } t        |       D cg c]  }|j                   }}t        |       D cg c]  }|j                   }}||k(  sJ t        |      t        t	        |            k(  sJ y c c}w c c}w )Nzduplicate-paragraphs.doc)r   r   idrW   set)r   rc   idsids_2s       r   ?test_partition_doc_assigns_deterministic_and_unique_element_idsr     sv    $%?@M%2=%A
B'7::
BC
B'4]'CDGWZZDED %<<s8s3s8}$$$ CDs
   A:A?c            
         t        d      t        d      t        d      t        d      t        d      t        d      t        d      t	        d      gS )	Nz&These are a few of my favorite things:ParrotsHockeyAnalysisz4This is my first thought. This is my second thought.zThis is my third thought.2023zDOYLESTOWN, PA 18901)r   r   r   r   r   rL   r    r   r.   r.     sQ     	67jLM12V&'	 	r    )r   r	   )r.   list[Element]r/   zCaptureFixture[str])rP   zpathlib.Path)rz   r   )r   r	   r   zdict[str, Any]r   z
str | None)r   r   ):__doc__
__future__r   pathlibtypingr   r   rG   pytest_mockr   test_unstructured.unit_utilsr   r   r	   r
   r   r   unstructured.chunking.basicr   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   unstructured.partition.docr   r   r   r   r1   r:   rB   rJ   rM   rR   rY   r]   rd   rg   rn   r{   r   r   r   markparametrizer   r   r   r   r   fixturer.   rL   r    r   <module>r      sQ   : "     #  7
 
 
 5 6J))$).A)	>$	>.A	>:
-F>L@WUU&B

   (^z4((3z66JF5ST;;%3;EO;	;*=&S
.	% 
 
r    