
    :Qg                       d Z ddlmZ ddlZddlZddlZddlZddlZddlm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 	 	 	 	 dAdZ3	 	 	 	 dBdZ4dBdZ5	 	 	 	 dBdZ6d Z7d Z8ejr                  ju                  dddg      dCd       Z;d Z<d Z=d Z>d Z?d Z@d ZAd ZBd ZCdDdZDd  ZEdDd!ZFd" ZG	 	 	 	 dEd#ZH	 	 	 	 dEd$ZI	 	 	 	 	 	 dFd%ZJ	 	 	 	 dGd&ZKdHd'ZLdId(ZMdId)ZNdId*ZOd+ ZPd, ZQd- ZRd. ZSd/ ZTd0 ZUd1 ZVd2 ZWd3 ZXd4 ZYd5 ZZ ej                         dJd6       Z\ ej                         dKd7       Z] ej                         dKd8       Z^ ej                         d9        Z_ ej                         d:        Z` ej                         dLd;       Za ej                         dMd<       Zb G d= d>      Zc G d? d@      Zdy)Nz4Test suite for `unstructured.partition.docx` module.    )annotationsN)AnyIterator)Document)	Paragraph)MockFixture)FixtureRequestMockassert_round_trips_through_JSONexample_doc_pathfunction_mockinstance_mockproperty_mock)chunk_by_title)AddressCompositeElementElementFooterHeaderImageListItemNarrativeText	PageBreakTable
TableChunkTextTitle)DocxPartitionerOptions_DocxPartitionerpartition_docxregister_picture_partitioner)#UNSTRUCTURED_INCLUDE_DEBUG_METADATAPartitionStrategyc                   t        |       }||k(  sJ |d   j                  j                  J |D ]  }|j                  j                  dk(  rJ  t        r+|D ch c]  }|j                  j
                   c}dhk(  sJ y y c c}w )Nr   mock_document.docxdocx)r    metadatapage_numberfilenamer"   detection_origin)mock_document_file_pathexpected_elementselementselements       b/var/www/html/answerous/venv/lib/python3.12/site-packages/test_unstructured/partition/test_docx.py!test_partition_docx_from_filenamer0   ;   s     56H((((A;++333 A((,@@@@A*AIJg  11JvhVVV +Js   Bc                4   t        | d      5 }t        j                         }|j                  |j	                                |j                  d       t        |      }||k(  sJ |D ]  }|j                  j                  J  	 ddd       y# 1 sw Y   yxY w)z`partition_docx()` accepts a SpooledTemporaryFile as its `file` argument.

    `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
    to ensure the source file is appropriately converted in this case.
    rbr   fileN)	opentempfileSpooledTemporaryFilewritereadseekr    r'   r)   )r+   r,   	test_filespooled_temp_filer-   r.   s         r/   %test_partition_docx_with_spooled_filer=   H   s     
%t	, 5	$99;	 01q!!'89,,,, 	5G##,,444	55 5 5s   A3BBBc                    t        | d      5 }t        |      }d d d        |k(  sJ |D ]  }|j                  j                  J  y # 1 sw Y   0xY w)Nr2   r3   )r5   r    r'   r)   )r+   r,   fr-   r.   s        r/   test_partition_docx_from_filer@   Z   sd    	%t	, *!q)*(((( 1((0001* *s   A		Ac                V    t        j                  d      }t        | |      }||k(  sJ y )Ns   abcde)r)   r4   )ioBytesIOr    )r+   r,   r?   r-   s       r/   :test_partition_docx_uses_file_path_when_both_are_specifiedrD   b   s.     	

8A'>QGH((((    c                 x    t        j                  t        d      5  t                d d d        y # 1 sw Y   y xY w)Nz5either `filename` or `file` argument must be providedmatch)pytestraises
ValueErrorr     rE   r/   'test_partition_docx_raises_with_neitherrM   j   s,    	z)`	a   s   09c                     t        t        d            } | D cg c]  }|j                   c}g dk(  sJ | D cg c]  }t        |       c}t        t        t
        gk(  sJ yc c}w c c}w )zHDocx with no sections partitions recognizing both paragraphs and tables.teams_chat.docx)z)0:0:0.0 --> 0:0:1.510
Some Body
OK. Yeah.z'0:0:3.270 --> 0:0:4.250
James Bond
Umm.zsaved-by Dennis ForsytheN)r    r   texttyper   r   r-   es     r/   !test_parition_docx_from_team_chatrT   r   si    ./@ABH$%qAFF% *   
 &&DG&4u*==== &
 's
   A$A)infer_table_structureTFc                    t        t        d      |       }t        |d   j                  d      xr |d   j                  j                  d u}|| k(  sJ y )Nfake_table.docx)rU   r   text_as_html)r    r   hasattrr'   rX   )rU   r-   $table_element_has_text_as_html_fields      r/   )test_partition_docx_infer_table_structurer[   }   sa    *+CXH 	$$n5 	:QK  --T9 ) 03HHHHrE   c                     t        t        d            } t        | d   t              sJ | d   j                  dk(  sJ | d   j
                  j                  dk(  sJ | d   j
                  j                  dk(  sJ y )NrW   r   z4Header Col 1 Header Col 2 Lorem ipsum A Link examplezv<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr><tr><td>Lorem ipsum</td><td>A Link example</td></tr></table>)r    r   
isinstancer   rP   r'   rX   r)   r-   s    r/   #test_partition_docx_processes_tabler_      s    ./@ABHhqk5)))A; VWWWA;,,	   A;((,====rE   c                     t        t        d            } | d   t        d      k(  sJ | d   t        d      k(  sJ | D ]  }|j                  j
                  dk(  rJ  y )Nhandbook-1p.docxr   zUS Trustee Handbook	Copyright)r    r   r   r   r'   r)   )r-   r.   s     r/   +test_partition_docx_grabs_header_and_footerrd      sl    ./ABCHA;&!67777B<6+.... ?((,>>>>?rE   c                     t        t        d      d      } d| D cg c]  }t        |      j                   c}vsJ t	        d | D              sJ yc c}w )a  Hard page-breaks by themselves are not enough to locate page-breaks in a document.

    In particular, they are redundant when rendered page-breaks are present, which they usually are
    in a native Word document, so lead to double-counting those page-breaks. When rendered page
    breaks are *not* present, only a small fraction will be represented by hard page-breaks so hard
    breaks are a false-positive and will generally produce incorrect page numbers.
    z(handbook-1p-no-rendered-page-breaks.docxTinclude_page_breaksr   c              3  L   K   | ]  }|j                   j                  d u   y wN)r'   r(   .0rS   s     r/   	<genexpr>zptest_partition_docx_includes_neither_page_breaks_nor_numbers_when_rendered_breaks_not_present.<locals>.<genexpr>   s     @!qzz%%-@s   "$N)r    r   rQ   __name__allrR   s     r/   ]test_partition_docx_includes_neither_page_breaks_nor_numbers_when_rendered_breaks_not_presentro      sY     CDZ^H BAtAw//BBBB@x@@@@ Cs   Ac                     t        t        d      d      } d| D cg c]  }t        |      j                   c}vsJ | d   j                  j
                  dk(  sJ | d   j                  j
                  dk(  sJ yc c}w )	zPage-number metadata is not supressed when `include_page_breaks` arga is False.

    Only inclusion of PageBreak elements is affected by that option.
    ra   Frf   r         Nr    r   rQ   rm   r'   r(   rR   s     r/   Qtest_partition_docx_includes_page_numbers_when_page_break_elements_are_suppressedru      s}    
 ./ABX]^HBAtAw//BBBBA;++q000B<  ,,111 Cs   A;c                    t        t        d      dd      } d| D cg c]  }t        |      j                   c}v sJ | d   j                  j
                  dk(  sJ | d   j                  j
                  dk(  sJ y c c}w )	Nra   T   )rg   starting_page_numberr   rq   rr      rt   rR   s     r/   Ctest_partition_docx_includes_page_break_elements_when_so_instructedrz      s    +,$]^H X>47++>>>>A;++q000B<  ,,111 ?s   A<c                     t        t        d            } | d   t        d      k(  sJ t        d | D              dk(  sJ y )Nz example-list-items-multiple.docxrb   zCThis is simply dummy text of the printing and typesetting industry.c              3  B   K   | ]  }t        |t              sd   yw)rq   N)r]   r   rj   s     r/   rl   z4test_partition_docx_detects_lists.<locals>.<genexpr>   s     >QjH&=q>s   
   )r    r   r   sumr^   s    r/   !test_partition_docx_detects_listsr      sN    ./QRSHB<8M    >(>>"DDDrE   c                 X    t        t        d      d      } t        d | D              sJ y )Nsimple.docxtest)metadata_filenamec              3  N   K   | ]  }|j                   j                  d k(    ywr   Nr'   r)   rk   r.   s     r/   rl   z\test_partition_docx_from_filename_prefers_metadata_filename_when_provided.<locals>.<genexpr>   !     Kww((F2K   #%)r    r   rn   r^   s    r/   Itest_partition_docx_from_filename_prefers_metadata_filename_when_providedr      s)    .}=QWXHK(KKKKrE   c                     t        t        d      d      5 } t        | d      }d d d        t        d D              sJ y # 1 sw Y   xY w)Nr   r2   r   )r4   r   c              3  N   K   | ]  }|j                   j                  d k(    ywr   r   r   s     r/   rl   zXtest_partition_docx_from_file_prefers_metadata_filename_when_provided.<locals>.<genexpr>   r   r   )r5   r   r    rn   r?   r-   s     r/   Etest_partition_docx_from_file_prefers_metadata_filename_when_providedr      sM    	}-t	4 D!qFCDK(KKKKD Ds   AA
c                    d}| j                  d|       t        t        d            }|d   j                  j                  |k(  sJ y )Nz2029-07-05T09:24:282unstructured.partition.docx.get_last_modified_datereturn_value	fake.docxr   patchr    r   r'   last_modified)mockerfilesystem_last_modifiedr-   s      r/   Etest_partition_docx_from_file_path_gets_last_modified_from_filesystemr      sP    4
LL<Kc   .{;<HA;--1IIIIrE   c                     t        t        d      d      5 } t        |       }d d d        d   j                  j                  J y # 1 sw Y   %xY w)Nr   r2   r3   r   r5   r   r    r'   r   r   s     r/   5test_partition_docx_from_file_gets_last_modified_Noner      sQ    	}-t	4 *!q)* A;--555* *s   AAc                    d}d}| j                  d|       t        t        d      |      }|d   j                  j                  |k(  sJ y )Nz2023-11-01T14:13:072020-07-05T09:24:28r   r   r   )metadata_last_modifiedr   r   )r   r   r   r-   s       r/   Atest_partition_docx_from_file_path_prefers_metadata_last_modifiedr      s]    42
LL<Kc   %>TH A;--1GGGGrE   c                     d} t        t        d      d      5 }t        ||       }d d d        d   j                  j                  | k(  sJ y # 1 sw Y   (xY w)Nr   r   r2   )r4   r   r   r   )r   r?   r-   s      r/   <test_partition_docx_from_file_prefers_metadata_last_modifiedr     sa    2	}-t	4 Y!qAWXY A;--1GGGGY Ys   AAc                   t        d      | d<   t        d	i | }t        |      }|j                  j                  d   }t        |j                  |            }|j                  dk(  sJ ||k(  sJ |j                  j                  d   }t        |j                  |            }|j                  dk(  sJ |g k(  sJ |j                  j                  d   }t        |j                  |            }|j                  dk(  sJ |g k(  sJ y )
Nfake-doc-emphasized-text.docx	file_pathrq   $I am a bold italic bold-italic text.rs    rw   I am a normal text.rL   )r   r   r   	_document
paragraphslist_iter_paragraph_emphasisrP   )	opts_argsexpected_emphasized_textsoptspartitioner	paragraphemphasized_textss         r/   (test_get_emphasized_texts_from_paragraphr     s    ..MNIk!.I.D"4(K%%003IK@@KL>>CCCC8888%%003IK@@KL>>Rr!!!%%003IK@@KL>>2222r!!!rE   c                    t        d      | d<   t        di | }t        |      }|j                  j                  d   }t        |j                  |            }||k(  sJ y Nr   r   r   rL   )r   r   r   r   tablesr   _iter_table_emphasis)r   r   r   r   tabler   s         r/   test_iter_table_emphasisr   (  sf     ..MNIk!.I.D"4(K!!((+EK<<UCD8888rE   c                    t        d      | d<   t        di | }t        |      }|j                  j                  d   }|j                  |      \  }}||k(  sJ ||k(  sJ y r   )r   r   r   r   r   _table_emphasis)r   !expected_emphasized_text_contentsexpected_emphasized_text_tagsr   r   r   emphasized_text_contentsemphasized_text_tagss           r/   test_table_emphasisr   5  sw    
 ..MNIk!.I.D"4(K!!((+E5@5P5PQV5W22#'HHHH#@@@@rE   c                   t        t        d            }t        |d   t              sJ |d   j                  j
                  | k(  sJ |d   j                  j                  |k(  sJ |d   t        d      k(  sJ |d   j                  j
                  | k(  sJ |d   j                  j                  |k(  sJ |d   t        d      k(  sJ |d   j                  j
                  J |d   j                  j                  J y )Nr   r   rq   r   rs   r   )r    r   r]   r   r'   r   r   r   )r   r   r-   s      r/   *test_partition_docx_grabs_emphasized_textsr   E  s    ./NOPHhqk5)))A;88<]]]]A;448UUUUA;-(NOOOOA;88<]]]]A;448UUUUA;-(=>>>>A;88@@@A;44<<<rE   c                0    t        |       }t        |       y ri   )r    r   )r+   r-   s     r/   test_partition_docx_with_jsonr   X  s    56H#H-rE   c                :   t        d      | d<   t        d	i | }t        |      }g d}|j                  j                  }t        |      D ]O  \  }\  }}||   }|j                  |      }	||j                  v sJ d|g d|        |	|k(  rAJ d| d| d|	         y )
Nzcategory-level.docxr   ))r   zCall me Ishmael.)r   zA Heading 1)r   z#Whenever I find myself growing grim)r   zA top level list item)rq   z
Next level)rq   Same)r   zSecond top-level list item)r   z$whenever I find myself involuntarily)r   r   )rq   zA Heading 2)r   z)This is my substitute for pistol and ball)r   zAnother Heading 1)r   zThere now is your insular cityz
paragraph[z].text does not contain zexpected paragraph[z] to have depth==z, got rL   )r   r   r   r   r   	enumerate_parse_category_depth_by_stylerP   )
r   r   r   
test_casesr   idxdepthrP   r   actual_depths
             r/   "test_parse_category_depth_by_styler   ]  s    -.CDIk!.I.D"4(KJ  &&11J'
3 S]eTsO	"AA)Ly~~%YSE7:RSWRX'YY%E!	S %6ugVL>R	S!SrE   c                    t        di | }t        |      }g d}t        |      D ]*  \  }\  }}|j                  |      |k(  rJ d||    d        y )N))r   	Heading 1)rq   z	Heading 2)rs   z	Heading 3)rq   Subtitle)r   List)rq   zList 2)rs   zList 3)r   List Bullet)rq   zList Bullet 2)rs   zList Bullet 3)r   zList Number)rq   zList Number 2)rs   zList Number 3z
test case z failedrL   )r   r   r   #_parse_category_depth_by_style_name)r   r   r   r   r   r   rP   s          r/   'test_parse_category_depth_by_style_namer   ~  sp    !.I.D"4(KJ  (
3 1]eT;;DAUJ	1
3(0	1J1rE   c                Z    t        di | }t        |      }|j                         dk(  sJ y )Nr   rL   )r   r   #_parse_category_depth_by_style_ilvl)r   r   r   s      r/   'test_parse_category_depth_by_style_ilvlr     s0    !.I.D"4(K::<AAArE   c                     t        t        d      d      } t        t        d            }t        |      }| |k7  sJ | |k(  sJ y )Nra   by_title)chunking_strategy)r    r   r   )chunk_elementsr-   chunkss      r/   9test_add_chunking_strategy_on_partition_docx_default_argsr     sP    #+,
N ./ABCHH%FX%%%V###rE   c                     t        d      } t        | ddd      }t        |       }t        |dd      }||k(  sJ ||k7  sJ |D ]4  }t        |t        t
        f      sJ t        |j                        dk  r4J  y )Nr   r   	      )r   max_characterscombine_text_under_n_chars)r   r   )r   r    r   r]   r   r   lenrP   )	docx_pathr   r-   r   chunks        r/   ,test_add_chunking_strategy_on_partition_docxr     s     !@AI#Z^_N i(HHQSTUFV###~%%% $%"2J!?@@@5::!###$rE   c                 p    t        d      } t        |       }|d   j                  j                  dgk(  sJ y )Nra   r)   r   engr   r    r'   	languagesr)   r-   s     r/   2test_partition_docx_element_metadata_has_languagesr     s7     23Hx0HA;))eW444rE   c                     t        d      } t        | d      }|D cg c]  }|j                  j                   }}|dgddgdgdgdggk(  sJ y c c}w )Nzlanguage-docs/eng_spa_mult.docxT)r)   detect_language_per_elementr   spar   )r)   r-   r.   langss       r/   8test_partition_docx_respects_detect_language_per_elementr     sc     ABHxTRH7?@GW''@E@eWuenugwHHHH As   Ac                 t    t        d      } t        | dg      }|d   j                  j                  dgk(  sJ y )Nra   deur)   r   r   r   r   s     r/   *test_partition_docx_respects_languages_argr     s;     23HxE7CHA;))eW444rE   c                     t        j                  t              5  t        d      } t	        | d       d d d        y # 1 sw Y   y xY w)Nra   r   r   )rI   rJ   	TypeErrorr   r    r   s    r/   :test_partition_docx_raises_TypeError_for_invalid_languagesr     s8    	y	! ;#$67E:; ; ;s	   <Ac                    t        t        d            } | d   }|j                  dk(  sJ |j                  }|j                  J |j
                  J |j                  J | d   }|j                  dk(  sJ |j                  }|j                  J |j
                  J |j                  J | d   }|j                  dk(  sJ |j                  }|j                  dd	d
dgk(  sJ |j
                  d	gk(  sJ |j                  d
gk(  sJ | d   }|j                  dk(  sJ |j                  }|j                  ddddgk(  sJ |j
                  dgk(  sJ |j                  dgk(  sJ | d   }|j                  dk(  sJ |j                  }|j                  ddddgk(  sJ |j
                  dgk(  sJ |j                  dgk(  sJ | d   }|j                  dk(  sJ |j                  }|j                  ddddgk(  sJ |j
                  dgk(  sJ |j                  dgk(  sJ | d   }|j                  dk(  sJ |j                  }|j                  J |j
                  J |j                  J y )Nzhlink-meta.docxr   Onerq   zTwo with link to bookmark.rs   zThree with link to foo.com.   zlink to foo.comzhttps://foo.com)start_indexrP   urlrw   z,Four with link to foo.com searching for bar.r}   z!link to foo.com searching for barzhttps://foo.com?q=barry   z/Five with link to foo.com introduction section.z$link to foo.com introduction sectionzhttp://foo.com/#intro   zEight with link to file.zlink to filezcourt-exif.jpg   zNine.)r    r   rP   r'   links
link_texts	link_urls)r-   r.   r'   s      r/   /test_partition_docx_includes_hyperlink_metadatar    s7   ./@ABH qkG<<5   H>>!!!&&&%%% qkG<<7777H>>!!!&&&%%% qkG<<8888H>>%$	
    #4"5555"3!4444 qkG<<IIIIH>>7*	
    #F"GGGG"9!:::: qkG<<LLLLH>>:*	
    #I"JJJJ"9!:::: qkG<<5555H>>"#	
    >"2222"2!3333 qkG<<7"""H>>!!!&&&%%%rE   c                    t        d      } t        |       D cg c]  }|j                   }}t        |       D cg c]  }|j                   }}||k(  sJ t        |      t        t	        |            k(  sJ y c c}w c c}w )Nzduplicate-paragraphs.docx)r   r    idr   set)document_pathr.   idsids_2s       r/   @test_partition_docx_assigns_deterministic_and_unique_element_idsr  +  sv    $%@AM%3M%B
C'7::
CC
C'5m'DEGWZZEEE %<<s8s3s8}$$$ DEs
   A:A?c                 x    t        t        d            } | D cg c]  }|j                   c}g dk(  sJ y c c}w )Nzdocx-shapes.docx)z,Paragraph with single <inline-image> within.z:Paragraph with <inline-image1> and <inline-image2> within.z'Paragraph with floating shape attached.)r    r   rP   )partitioned_docr.   s     r/   $test_it_considers_text_inside_shapesr  :  s<    $%56H%IJO(78WGLL8 =   8s   7c                 P    t        d t        t        d            D              rJ y )Nc              3  <   K   | ]  }t        |t                y wri   )r]   r   rj   s     r/   rl   zMtest_partition_docx_generates_no_Image_elements_by_default.<locals>.<genexpr>I  s      !"
1es   contains-pictures.docx)anyr    r   rL   rE   r/   :test_partition_docx_generates_no_Image_elements_by_defaultr  H  s2     &45EF^5_&`    rE   c                 B    G d d      } t        |        t        t        d            }d t        _        t        |      dk(  sJ |D cg c]  }t        |t              s| }}t        |      dk(  sJ |D cg c]  }|j                   c}g dk(  sJ y c c}w c c}w )Nc                  *    e Zd Ze	 	 	 	 	 	 dd       Zy)`test_partition_docx_uses_registered_picture_partitioner.<locals>.FakeParagraphPicturePartitionerc              3     K   t        j                  |j                   |j                   j	                               j                         }t        d| d|j                          y w)NzImage with hash z, strategy: )hashlibsha1rP   strategyencode	hexdigestr   )clsr   r   	call_hashs       r/   iter_elementszntest_partition_docx_uses_registered_picture_partitioner.<locals>.FakeParagraphPicturePartitioner.iter_elementsP  sW       	'7%G%N%N%PQ[[]I*9+\$--QRRs   A(A*N)r   r   r   r   returnzIterator[Image])rm   
__module____qualname__classmethodr  rL   rE   r/   FakeParagraphPicturePartitionerr  O  s1    		S%	S-C	S	S 
	SrE   r$  r  r      )JImage with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_reszJImage with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_resr&  zJImage with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_reszJImage with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_reszJImage with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res)	r!   r    r   r   _PicturePartitionerClsr   r]   r   rP   )r$  r-   rS   image_elementss       r/   7test_partition_docx_uses_registered_picture_partitionerr)  N  s    S S !!@A./GHIH 591x=B!)BAZ5-AaBNB~!###*+qAFF+ 0    C+s   	BB9Bc            
         t        d      t        d      t        d      t        d      t        d      t        d      t        d      t	        d      gS )	N&These are a few of my favorite things:ParrotsHockeyAnalysis4This is my first thought. This is my second thought.This is my third thought.2023DOYLESTOWN, PA 18901)r   r   r   r   r   rL   rE   r/   r,   r,   o  sQ     	67jLM12V&'	 	rE   c                 
    g dS )N)bolditalicbold-italicr6  rL   rL   rE   r/   r   r   }  s    ;;rE   c                 
    g dS )N)bir8  r9  rL   rL   rE   r/   r   r     s    rE   c                 &    ddddddddddddgS )Nr4  r8  )rP   tagr5  r9  r6  rL   rL   rE   r/   r   r     s.     $#&s+s+	 rE   c                    t        j                         } | j                  dd       | j                  dd       | j                  dd       | j                  dd       | j                  d	d       | j                  d
d       | j                  d	d       | j                  dd       | j                  dd       | j                  d       | j                  d       | S )Nr+  r   )styleu   • ParrotsNormalu   • r-  r   r   r.  r/  r0  z	Body Textr1  r2  )r&   r   add_paragraph)documents    r/   mock_documentrA    s    }}HC;W=9628=92]3:X62X.QYab6kJ6"12OrE   c                D    t        |dz        }| j                  |       |S )Nr%   )strsave)rA  tmp_pathr)   s      r/   r+   r+     s%    8223Hx OrE   c                     ddddddS )zAll default arguments for `DocxPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.
    NT)r4   r   rg   rU   r  rL   rL   rE   r/   r   r     s     #!% rE   c                  &   e Zd ZdZd)dZd)dZ	 	 	 	 d*dZej                  j                  dddg      	 	 	 	 d+d       Z
ej                  j                  dddg      	 	 	 	 d+d	       Z	 	 d)d
Z	 	 d)dZ	 	 	 	 d,dZ	 	 d)dZej                  j                  dddg      	 	 	 	 d-d       Zej                  j                  dddg      	 	 	 	 	 	 	 	 	 	 d.d       Zd)dZ	 	 d)dZej                  j                  ddej*                  dfej,                  dfg      	 	 	 	 	 	 d/d       Zej                  j                  dddg      	 	 	 	 	 	 d0d        Z	 	 d)d!Z	 	 d)d"Z	 	 d)d#Zd)d$Z	 	 d)d%Z	 	 d)d&Zd)d'Z ej@                         d1d(       Z!y)2DescribeDocxPartitionerOptionszQUnit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects.c                n    t        d      |d<   t        j                  di |}t        |t              sJ y )Nr   r   rL   )r   r   loadr]   selfr   r   s      r/   $it_provides_a_validating_constructorzCDescribeDocxPartitionerOptions.it_provides_a_validating_constructor  s5    !1-!@	+%**7Y7$ 6777rE   c                    t        j                  t        d      5  t        j                  di | d d d        y # 1 sw Y   y xY w)Nzno DOCX document specified, rG   rL   rI   rJ   rK   r   rJ  rL  r   s     r/   (and_it_raises_when_options_are_not_validzGDescribeDocxPartitionerOptions.and_it_raises_when_options_are_not_valid  s7    ]]:-KL 	5"''4)4	5 	5 	5	   ;Ac                    t        |t              }t        |d|      }t        |t        dd      }t	        di |}|j
                  }|j                          |j                  d       ||u sJ y )Nz)unstructured.partition.docx.docx.Documentr   
_docx_filez
abcde.docxrL   )r   r   r   r   r   r@  assert_called_once_with)rL  requestr   	document_docx_Document__docx_file_prop_r   r@  s           r/   it_loads_the_docx_documentz9DescribeDocxPartitionerOptions.it_loads_the_docx_document  sy    
 "'84	&@y
 )+\
 &2	2==002..|<9$$$rE   	arg_valueTFc                D    ||d<   t        di |}|j                  |u sJ y )Nrg   rL   )r   rg   rL  r[  r   r   s       r/   Pit_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_streamzoDescribeDocxPartitionerOptions.it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream  s0     ,5	'(%2	2''9444rE   c                D    ||d<   t        di |}|j                  |u sJ y )NrU   rL   )r   rU   r]  s       r/   :it_knows_whether_to_include_text_as_html_in_Table_metadatazYDescribeDocxPartitionerOptions.it_knows_whether_to_include_text_as_html_in_Table_metadata  s0     .7	)*%2	2))Y666rE   c                   t        di |}|j                         }t        t        |d       t              sJ |j
                  dk(  sJ t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY w)Nrs   rL   )	r   increment_page_numberr]   nextr   r(   rI   rJ   StopIterationrL  r   r   page_break_iters       r/   Dit_generates_a_PageBreak_element_when_the_page_number_is_incrementedzcDescribeDocxPartitionerOptions.it_generates_a_PageBreak_element_when_the_page_number_is_incremented	  ss     &2	2446$5yAAA1$$$]]=) 	"!	" 	" 	"s   "A77B c                    d|d<   t        di |}|j                         }t        j                  t              5  t        |       d d d        |j                  dk(  sJ y # 1 sw Y   xY w)NFrg   rs   rL   )r   rb  rI   rJ   rd  rc  r(   re  s       r/   Sbut_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_offzrDescribeDocxPartitionerOptions.but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off  sg     ,1	'(%2	2446]]=) 	"!	"1$$$	" 	"s   A  A)c                z    d|d<   d|_         t        di |}|j                  }|j                  d       |dk(  sJ y )Nza/b/document.docxr   z2024-04-02T20:32:35rL   )r   r   r   rU  )rL  r   get_last_modified_date_r   r   s        r/   Dit_gets_last_modified_from_the_filesystem_when_file_path_is_providedzcDescribeDocxPartitionerOptions.it_gets_last_modified_from_the_filesystem_when_file_path_is_provided#  sM     "5	+/D,%2	2**778KL 5555rE   c                j    t        j                  d      }||d<   t        di |}|j                  J y Ns   abcdefgr4   rL   )rB   rC   r   r   )rL  r   r4   r   s       r/   Rbut_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_providedzqDescribeDocxPartitionerOptions.but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided/  s;     zz*% 	&%2	2!!)))rE   r   z
u/v/w.docxNc                F    ||d<   t        di |}|j                  |k(  sJ y Nr   rL   )r   metadata_file_path)rL  r   r   r   s       r/   ,it_uses_the_file_path_argument_when_providedzKDescribeDocxPartitionerOptions.it_uses_the_file_path_argument_when_provided:  s/     "+	+%2	2&&)333rE   )
page_countdocument_contains_pagebreaksexpected_value)r   Tr   )rq   FNc                    t        |t        d|      }t        di |}||_        |j                  }|j	                          ||u sJ y )N_document_contains_pagebreaksr   rL   )r   r   _page_countermetadata_page_numberrU  )	rL  rV  r   rt  ru  rv  #_document_contains_pagebreaks_prop_r   rz  s	            r/   Bit_reports_None_when_no_rendered_page_breaks_are_found_in_documentzaDescribeDocxPartitionerOptions.it_reports_None_when_no_rendered_page_breaks_are_found_in_documentE  sX     /<"+5	/
+ &2	2'#88+CCE#~555rE   c                    t        di |}|j                  dk(  sJ t        |j                                |j                  dk(  sJ t        |j                                |j                  dk(  sJ y)z)In DOCX, page-number is the slide number.rq   rs   rw   NrL   r   r(   r   rb  rK  s      r/   !it_keeps_track_of_the_page_numberz@DescribeDocxPartitionerOptions.it_keeps_track_of_the_page_numbera  sm    %2	21$$$T'')*1$$$T'')*1$$$rE   c                    t        di |ddi}|j                  dk(  sJ t        |j                                |j                  dk(  sJ y )Nrx   rw   ry   rL   r~  rK  s      r/   Eit_assigns_the_correct_page_number_when_starting_page_number_is_givenzdDescribeDocxPartitionerOptions.it_assigns_the_correct_page_number_when_starting_page_number_is_givenk  sO     &J	JJ1$$$T'')*1$$$rE   )r[  rv  )Nhi_resfastr  c                F    ||d<   t        di |}|j                  |k(  sJ y )Nr  rL   )r   r  )rL  r   r[  rv  r   s        r/   +it_knows_which_partitioning_strategy_to_usezJDescribeDocxPartitionerOptions.it_knows_which_partitioning_strategy_to_usev  s-     !*	*%2	2}}...rE   )	file_namerv  )page-breaks.docxT)rO   Fc                V    t        |      |d<   t        di |}|j                  |u sJ y rq  )r   r   rx  )rL  r   r  rv  r   s        r/   2it_knows_whether_the_document_contains_page_breakszQDescribeDocxPartitionerOptions.it_knows_whether_the_document_contains_page_breaks  s4     "2)!<	+%2	211^CCCrE   c                F    d|d<   t        di |}|j                  dk(  sJ y )N
l/m/n.docxr   rL   )r   rT  rK  s      r/   Dit_uses_the_path_to_open_the_presentation_when_file_path_is_providedzcDescribeDocxPartitionerOptions.it_uses_the_path_to_open_the_presentation_when_file_path_is_provided  s-     ".	+%2	2,...rE   c                    t        j                         }|j                  d       ||d<   t        di |}|j                  }||usJ t        |t        j                        sJ |j                         dk(  sJ y rn  )	r6   r7   r8   r   rT  r]   rB   rC   getvalue)rL  r   r<   r   	docx_files        r/   Fand_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_providedzeDescribeDocxPartitionerOptions.and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided  s{     %99;
+-	&%2	2OO	 1111)RZZ000!!#z111rE   c                    t        j                  d      }||d<   t        di |}|j                  }||u sJ t	        |t         j                        sJ |j                         dk(  sJ y rn  )rB   rC   r   rT  r]   r  )rL  r   r4   r   r  s        r/   Fand_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFilezeDescribeDocxPartitionerOptions.and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile  sk     zz*% 	&%2	2OO	D   )RZZ000!!#z111rE   c                    d|d<   t        j                  t        d      5  t        j                  di | d d d        y # 1 sw Y   y xY w)Nr  r   z'no such file or directory: 'l/m/n.docx'rG   rL   )rI   rJ   FileNotFoundErrorr   rJ  rP  s     r/   *it_raises_when_no_file_exists_at_file_pathzIDescribeDocxPartitionerOptions.it_raises_when_no_file_exists_at_file_path  sB    !-	+]],4]^ 	5"''4)4	5 	5 	5s   A  A	c                    t        d      |d<   t        j                  t        d      5  t	        j
                  di | d d d        y # 1 sw Y   y xY w)N
simple.docr   *not a ZIP archive \(so not a DOCX file\): rG   rL   )r   rI   rJ   rK   r   rJ  rP  s     r/   =and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archivez\DescribeDocxPartitionerOptions.and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive  sH     "2,!?	+]]:-Z[ 	5"''4)4	5 	5 	5s   A		Ac                    t        t        d      d      5 }||d<   t        j                  t        d      5  t        j                  di | d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr  r2   r4   r  rG   rL   )r5   r   rI   rJ   rK   r   rJ  )rL  r   r?   s      r/   <and_it_raises_when_the_file_like_object_is_not_a_ZIP_archivez[DescribeDocxPartitionerOptions.and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive  so     "<0$7 	91 !Ifz1^_ 9&++8i89	9 	99 9	9 	9s"   !A*AA*A'	#A**A3c                    t        j                  t        d      5  t        j                  di | d d d        y # 1 sw Y   y xY w)Nz1no DOCX document specified, either `filename` or rG   rL   rO  rP  s     r/   :and_it_raises_when_neither_a_file_path_or_file_is_providedzYDescribeDocxPartitionerOptions.and_it_raises_when_neither_a_file_path_or_file_is_provided  s7    ]]:-`a 	5"''4)4	5 	5 	5rR  c                    t        |d      S )Nr   )r   )rL  rV  s     r/   rk  z6DescribeDocxPartitionerOptions.get_last_modified_date_  s    W&Z[[rE   r   dict[str, Any])rV  r	   r   r  )r[  boolr   r  )r   r  rk  r
   )r   z
str | Noner   r  )
rV  r	   r   r  rt  intru  r  rv  z
int | None)r   r  r[  rC  rv  rC  )r   r  r  rC  rv  r  )rV  r	   r   r
   )"rm   r!  r"  __doc__rM  rQ  rZ  rI   markparametrizer^  r`  rg  ri  rl  ro  rs  r|  r  r  r#   FASTHI_RESr  r  r  r  r  r  r  r  r  fixturerk  rL   rE   r/   rH  rH    s   [85%% "%, [[[4-855*85 95 [[[4-877*87 97
"'
"
%'
%
6'
6BF
6*'* [[[<*>?4#40>4 @4 [[H	'(66 "6 	6
 '+6 #6	60%%'% [['	-22F;>O>V>VX`=ab/'/47/IL/	/ [['*DF`)aD'D47DIMDD/'/2'22'25
5'59'95 V^^\ \rE   rH  c                  V    e Zd ZdZddZddZddZddZddZd Z	ddZ
dd	Zdd
Zy)Describe_DocxPartitionerzCUnit-test suite for `unstructured.partition.docx._DocxPartitioner`.c                    t        di |}t        j                  t        d            j                  d   }t        |      j                  |      dk(  sJ y )Ndocx-tables.docxr   zv<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr><tr><td>Lorem ipsum</td><td>A link example</td></tr></table>rL   )r   r&   r   r   r   r   _convert_table_to_htmlrL  r   r   r   s       r/   it_can_convert_a_table_to_htmlz7Describe_DocxPartitioner.it_can_convert_a_table_to_html  sV    %2	2./ABCJJ1M%<<UC
 	
 
rE   c                    t        di |}t        j                  t        d            j                  d   }t        j                  ddt        |      j                  |            }|dk(  sJ y)  
        Fixture table is:

            +---+-------------+---+
            | a |     >b<     | c |
            +---+-------------+---+
            |   | +-----+---+ |   |
            |   | |  e  | f | |   |
            | d | +-----+---+ | i |
            |   | | g&t | h | |   |
            |   | +-----+---+ |   |
            +---+-------------+---+
            | j |      k      | l |
            +---+-------------+---+
        r  rq   z +<<z<table><tr><td>a</td><td>&gt;b&lt;</td><td>c</td></tr><tr><td>d</td><td>e f g&amp;t h</td><td>i</td></tr><tr><td>j</td><td>k</td><td>l</td></tr></table>NrL   )	r   r&   r   r   r   resubr   r  )rL  r   r   r   htmls        r/   )and_it_can_convert_a_nested_table_to_htmlzBDescribe_DocxPartitioner.and_it_can_convert_a_nested_table_to_html  sn      &2	2./ABCJJ1M vvfc#3D#9#P#PQV#WX
 	
 
rE   c                    t        di |}t        j                  t        d            j                  d   }dj                  t        |      j                  |            dk(  sJ y )Nr  r    z4Header Col 1 Header Col 2 Lorem ipsum A link examplerL   r   r&   r   r   r   joinr   _iter_table_textsr  s       r/   $it_can_convert_a_table_to_plain_textz=Describe_DocxPartitioner.it_can_convert_a_table_to_plain_text  s^    %2	2./ABCJJ1Mxx(.@@GHB
 	
 
rE   c                    t        di |}t        j                  t        d            j                  d   }dj                  t        |      j                  |            dk(  sJ y)r  r  rq   r  za >b< c d e f g&t h i j k lNrL   r  r  s       r/   /and_it_can_convert_a_nested_table_to_plain_textzHDescribe_DocxPartitioner.and_it_can_convert_a_nested_table_to_plain_text  s`      &2	2./ABCJJ1Mxx(.@@GH)
 	
 
rE   c                    t        di |}t        j                  t        d            j                  d   }dj                  t        |      j                  |            dk(  sJ y)z
        Fixture table is:

            +---+-------+
            | a | b     |
            |   +---+---+
            |   | c | d |
            +---+---+   |
            | e     |   |
            +-------+---+
        r  rs   r  z	a b c d eNrL   r  r  s       r/   /but_the_text_of_a_merged_cell_appears_only_oncezHDescribe_DocxPartitioner.but_the_text_of_a_merged_cell_appears_only_once"  sY     &2	2./ABCJJ1Mxx(.@@GHKWWWrE   c                &   t        t        t        d                  }t        |      }|j                  j                  d      sJ t        |      }t        |      j                  dk(  sJ |j                  dk(  sJ |j                  j                  dk(  sJ t        |      }t        |      j                  dk(  sJ |j                  dk(  sJ d|j                         |j                  j                  dk(  sJ d	|j                  j                         t        |      }t        |      j                  dk(  sJ |j                  dk(  sJ d|j                         |j                  j                  d
k(  sJ d	|j                  j                         t        |      }t        |      j                  dk(  sJ |j                  dk(  sJ d|j                         |j                  j                  dk(  sJ d	|j                  j                         t        |      }t        |      j                  dk(  sJ |j                  dk(  sJ d|j                         |j                  j                  dk(  sJ d	|j                  j                         t        |      }t        |      j                  dk(  sJ |j                  dk(  sJ d|j                         |j                  j                  dk(  sJ d	|j                  j                         y)ak  DOCX permits table rows to start late and end early.

        It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that
        they allow rows to start late, like in column 3, and end early, like the last cell is in
        column 5 of a 7 column table.

        A practical example might look like this:

                       +------+------+
                       | East | West |
            +----------+------+------+
            | Started  |  25  |  32  |
            +----------+------+------+
            | Finished |  17  |  21  |
            +----------+------+------+
        z tables-with-incomplete-rows.docxzExample of DOCX table r   za b c dzI<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>za b czactual e.text=zD<table><tr><td>a</td><td/></tr><tr><td>b</td><td>c</td></tr></table>zactual e.metadata.text_as_html=zX<table><tr><td>a</td><td>a</td><td/></tr><tr><td>b</td><td>c</td><td>d</td></tr></table>zX<table><tr><td>a</td><td>b</td><td/></tr><tr><td>a</td><td>c</td><td>d</td></tr></table>za b c d e fz<table><tr><td>a</td><td>a</td><td>b</td><td>c</td></tr><tr><td/><td>d</td><td>d</td><td/></tr><tr><td>e</td><td>d</td><td>d</td><td>f</td></tr><tr><td/><td>d</td><td>d</td><td/></tr></table>z"Data More Dato WTF? Strange FormataD  <table><tr><td>Data</td><td>Data</td><td/></tr><tr><td>Data</td><td>Data</td><td/></tr><tr><td>Data</td><td>Data</td><td/></tr><tr><td/><td>More</td><td/></tr><tr><td>Dato</td><td/></tr><tr><td>WTF?</td><td>WTF?</td><td/></tr><tr><td>Strange</td><td>Strange</td><td/></tr><tr><td/><td>Format</td><td>Format</td></tr></table>N)
iterr    r   rc  rP   
startswithrQ   rm   r'   rX   )rL  r-   rS   s      r/   ,it_can_partition_tables_with_incomplete_rowszEDescribe_DocxPartitioner.it_can_partition_tables_with_incomplete_rows2  s   " '78Z'[\]Nvv  !9::: NAw7***vv"""zz&&W
 	
 
 NAw7***vv 5OAFF9"55 zz&&R
 	0-QZZ,,./	0 
 NAw7***vv"7oaffY$77"zz&&
 	0
 .QZZ,,./	0 
 NAw7***vv"7oaffY$77"zz&&
 	0
 .QZZ,,./	0 
  NAw7***vv&;/!&&(;;&zz&&
 	0 .QZZ,,./	0 
 NAw7***vv==R	?RR=zz&&	
 	0 .QZZ,,./	0 
rE   c                   dd}t        d      |d<   t        di |}t        d      t        d      t        d      t        d      t        d      t        d      t        d	      t        d      t        d      t        d
      t        d      t        d      t	        d      g}t        j                  |      }t        |      D ]*  \  }}|||   k(  rJ d |||          d ||       d        y)a  Page-break behavior has some subtleties.

        * A hard page-break does not generate a PageBreak element (because that would double-count
          it). Word inserts a rendered page-break for the hard break at the effective location.
        * A (rendered) page-break mid-paragraph produces two elements, like `Text, PageBreak, Text`,
          so each Text (subclass) element gets the right page-number.
        * A rendered page-break mid-hyperlink produces two text elements, but the hyperlink itself
          is not split; the entire hyperlink goes on the page where the hyperlink starts, even
          though some of its text appears on the following page. The rest of the paragraph, after
          the hyperlink, appears on the following page.
        * Odd and even-page section starts can lead to two page-breaks, like an odd-page section
          start could go from page 3 to page 5 because 5 is the next odd page.
        c                :    | j                   j                   d|  dS )z?A more detailed `repr()` to aid debugging when assertion fails.z('z'))	__class__rm   )rS   s    r/   str_reprz[Describe_DocxPartitioner.it_places_page_breaks_precisely_where_they_occur.<locals>.str_repr  s     kk**+2aS33rE   r  r   zsFirst page, tab here:	followed by line-break here:
here:
and here:
no-break hyphen here:-and hard page-break here>>r   z<<Text on second page. The font is big so it breaks onto third page--------------------here-->> <<but break falls inside link so text stays together.zContinuous section break here>>z<<followed by text on same pagezOdd-page section break here>>z9<<producing two page-breaks to get from page-3 to page-5.zQThen text gets big again so a "natural" rendered page break happens again here>> z<<and then more text proceeds.z

Expected: z

Got:      
N)rS   r   r   rC  rL   )r   r   r   r   r   r   iter_document_elementsr   )rL  r   r  r   expectedr-   r   rS   s           r/   0it_places_page_breaks_precisely_where_they_occurzIDescribe_DocxPartitioner.it_places_page_breaks_precisely_where_they_occur  s   	4 "22D!E	+%2	2 - bM
 bM;<;<9:bMbMUVc bM23A!
F $::4@) 	FC%  (3-!8 9 !R1%	rE   c                    t        d      |d<   t        di |}t        |      }|j                  j                  d   }|j                  |      }t        |      }|j                  dk(  sJ y )Ndocx-hdrftr.docxr   r   z:First header para
Table cell1 Table cell2
Last header pararL   )r   r   r   r   sections_iter_section_headersrc  rP   )rL  r   r   r   sectionheader_iterr.   s          r/   *it_includes_table_cell_text_in_Header_textzCDescribe_DocxPartitioner.it_includes_table_cell_text_in_Header_text  sl    !12D!E	+%2	2&t,''003!77@{#||]]]]rE   c                    t        d      |d<   t        di |}t        |      }|j                  j                  d   }|j                  |      }t        |      }|j                  dk(  sJ y)z?This case also verifies nested-table and merged-cell behaviors.r  r   r   zpara1
cell1 a b c d e f
para2NrL   )r   r   r   r   r  _iter_section_footersrc  rP   )rL  r   r   r   r  footer_iterr.   s          r/   *it_includes_table_cell_text_in_Footer_textzCDescribe_DocxPartitioner.it_includes_table_cell_text_in_Footer_text  sl    !12D!E	+%2	2&t,''003!77@{#||@@@@rE   Nr  )rm   r!  r"  r  r  r  r  r  r  r  r  r  r  rL   rE   r/   r  r    s8    M	

<

.X p0h?F	^
ArE   r  )r+   rC  r,   zlist[Element])r+   rC  r,   
list[Text])rU   r  )r   r   )r   r  r   zlist[dict[str, str]])r   r  r   	list[str]r   r  )r   r  r   r  )r+   rC  r  )r   r  )r   r  )rA  r   rE  zpathlib.Pathr   rC  )r   r  )er  
__future__r   r  rB   pathlibr  r6   typingr   r   r&   rI   docx.documentr   docx.text.paragraphr   pytest_mockr   test_unstructured.unit_utilsr	   r
   r   r   r   r   r   unstructured.chunking.titler   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   r   r   r   r   unstructured.partition.docxr   r   r    r!   &unstructured.partition.utils.constantsr"   r#   r0   r=   r@   rD   rM   rT   r  r  r[   r_   rd   ro   ru   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r)  r  r,   r   r   r   rA  r+   r   rH  r  rL   rE   r/   <module>r     s   ; "  	  	      " ) #   7    
W 
W5B
W5 55?5$1) )5?)> 04-@I AI>?A 	22EL
LJ6HH"":N".
9
9:N
9AA'0A $-A ='0=#,=&.
SB12B$$&5I5;Q&h	%B 
 
 < <        4    .G\ G\TiA iArE   