
    :QgEF                    
   d Z ddlmZ ddlmZmZ ddlZddlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZ dd
lmZ d Zd Zd Z d Z!d Z"d Z#d Z$d Z%d Z&d Z'd Z( G d d      Z) G d d      Z*y)z8Test suite for the `unstructured.chunking.title` module.    )annotations)AnyOptionalN)FixtureRequestMockfunction_mock)CHUNK_MULTI_PAGE_DEFAULT)_ByTitleChunkingOptionschunk_by_title)CoordinateSystem)	CheckBoxCompositeElementCoordinatesMetadataElementElementMetadataListItemTableTextTitle)partition_htmlc                     t        d      t        d      g} t        | d      }|t        d      t        d      t        d      gk(  sJ y )NIntroductionzcLorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.2   max_charactersz1Lorem ipsum dolor sit amet consectetur adipiscingz1elit. In rhoncus ipsum sed lectus porta volutpat.)r   r   r   r   elementschunkss     b/var/www/html/answerous/venv/lib/python3.12/site-packages/test_unstructured/chunking/test_title.py3test_it_splits_a_large_element_into_multiple_chunksr    $   s]    n	
H HR8F(LMLM       c                    t        d      t        d      t        d      t        d      t        d      t        d      t        d      t        d      t        d	      t        d
      t               g} t	        | dd      }t        |      dk(  sJ |d   }t        |t              sJ |j                  j                  t        d      t        d      t        d      gk(  sJ |d   }t        |t              sJ |j                  j                  t        d      gk(  sJ |d   }t        |t              sJ |j                  j                  t        d      t        d      t        d      gk(  sJ |d   }t        |t              sJ |j                  j                  t        d      t        d	      t        d
      t               gk(  sJ y )NA Great DayToday is a great day.It is sunny outside.Heading
Cell textAn Okay DayToday is an okay day.It is rainy outside.	A Bad DayToday is a bad day.It is storming outside.r   Tcombine_text_under_n_charsinclude_orig_elements            )
r   r   r   r   r   len
isinstancer   metadataorig_elements)r   r   chunks      r   *test_it_splits_elements_by_title_and_tabler9   6   s   m$%#$"#m$%#$k"#&'
H HZ^_Fv;!1IEe-...>>''m$%#$,    1IEeU###>>''E2F,G+HHHH1IEe-...>>''m$%#$,    1IEe-...>>''k"#&'
	,   r!   c                    t        dt        dg            t        dt        dg            t        d      t        d      t        d	      t        d
      t        d      t        d      t        d      t        d      t	               g} t        | dd      }|t        d      t        d      t        d      t        d      gk(  sJ |d   j                  t        ddg      k(  sJ y )Nr#   Day)emphasized_text_contentsr6   r$   dayr%   r&   r'   r(   r)   r*   r+   r,   r   Fr-   8A Great Day

Today is a great day.

It is sunny outside.8An Okay Day

Today is an okay day.

It is rainy outside.7A Bad Day

Today is a bad day.

It is storming outside.)r   r   r   r   r   r   r   r6   r   s     r   test_chunk_by_titlerB   g   s    moPUw&WX$Y^X_/`a#$"#m$%#$k"#&'
H HZ_`FJ	
 	"#WXI	
	 	 	 	 !95RW.!YYYYr!   c                    t        dt        d            t        dt        d            t        dt        d            t        d      t        d	      t        d
      t        d      t        d      t        d      t        d      t	               g} t        | dd      }|t        d      t        d      t        d      t        d      t        d      gk(  sJ y )Nr#   r1   page_numberr=   r$   r2   r%   r&   r'   r(   r)   r*   r+   r,   Fr   multipage_sectionsr.   z+Today is a great day.

It is sunny outside.r@   rA   r   r   r   r   r   r   r   r   s     r   ,test_chunk_by_title_separates_by_page_numberrI      s    mo!&DE$1/MN#o!.LM"#m$%#$k"#&'
H H[\]F	
 	;	
 	"#WXI	
   r!   c                    t        dt        d            t        dt        d            t        dt        d            t        d      t        d	      t        d
      t        d      t        d      t        d      t        d      t	               g} t        | dd      }|t        d      t        d      t        d      t        d      gk(  sJ y Nr#   r1   rD   r=   r$   r2   r%   r&   r'   r(   r)   r*   r+   r,   Tr   rF   r?   r@   rA   rH   r   s     r   &test_chuck_by_title_respects_multipagerL      s    mo!&DE$1/MN#o!.LM"#m$%#$k"#&'
H HZ[\FJ	
 	"#WXI	
	 	 	 	r!   c                    t        dt        d            t        dt        d            t        dt        d            t        d      t        d	      t        d
      t        d      t        d      t        d      t        d      t	               g} t        | dd      }|t        d      t        d      t        d      t        d      gk(  sJ y rK   rH   r   s     r   'test_chunk_by_title_groups_across_pagesrN      s    mo!&DE$1/MN#o!.LM"#m$%#$k"#&'
H HZ[\FJ	
 	"#WXI	
	 	 	 	r!   c                 j    d} t        | d      }t        |       }t        |      }||k7  sJ ||k(  sJ y )N example-docs/example-10k-1p.htmlby_title)chunking_strategy)r   r   )filenamechunk_elementsr   r   s       r   ,test_add_chunking_strategy_on_partition_htmlrU      sC    1H#H
KNh'HH%FX%%%V###r!   c                 D   d} t        | dddd      }t        |       }t        |ddd      }|D ].  }t        |t              sJ t	        |j
                        dk  r.J  |D ].  }t        |t              sJ t	        |j
                        dk  r.J  ||k7  sJ ||k(  sJ y )NrP   rQ   r   r   d   )rR   r.   new_after_n_charsr   )r.   rX   r   )r   r   r5   r   r4   text)rS   rT   r   r   r8   chunk_elements         r   2test_add_chunking_strategy_respects_max_charactersr[      s    1H#$#$N h'H#$	F  &%&&&5::#%%%& ( .-...=%%&#---. X%%%V###r!   c            
        t        dt        d            t        dt        d            t        dt        d            t        d	t        d
            t        dt        d            g} t        | d      }t	        |d         t	        t        d            k(  sJ t	        |d         t	        t        d            k(  sJ y )Nr#         ?)detection_class_probr=   r$   gףp=
?r%   g\(\?r'   gzG?r(   gffffff?r   r.   r?   r1   "An Okay Day

Today is an okay day.)r   r   r   r   strr   r   s     r   .test_chunk_by_title_drops_detection_class_probrb      s    $%(	
 	#$%)	
 	"$%)	
 	$%)	
 	#$%)	
3H@ HCFvay>SWX    vay>S!12X!YZZZZr!   c                 n   t        dt        t        dt        dd                        t	        dt        t        d	t        d
d
                        t	        dt        t        dt        dd                        t        dt        t        dt        dd                        t	        dt        t        dt        dd                        g} t        | d      }t        |d         t        t        d            k(  sJ t        |d         t        t        d            k(  sJ y )Nr#   ))皙?rd   )皙?rd   )rd   re   re   re   rd   )widthheight)pointssystem)coordinatesr=   r$   )rf   )333333?re   )re   rl   rl   rl   re   r%   )rm   )皙?rl   )rl   rn   rn   rn   rl   r'   r(   )ro   )r]   rn   )rn   r]   )r]   r]   rn   r   r_   r?   r1   r`   )r   r   r   r   r   r   ra   r   r   s     r   (test_chunk_by_title_drops_extra_metadatarp   '  sL   $/ ,#cB
	
 	#$/ ,#cB
	
 	"$/ ,#cB
	
 	$/ ,#cB
	
 	#$/ ,#cB
	
sGHR HCFvay>SWX    vay>S!12X!YZZZZr!   c                     t        d      t        d      t        d      t        d      g} t        | d      }|t        d      t        d      gk(  sJ y)	zHPreChunker includes length of separators when computing remaining space.zChunking Prioritiesz"Divide text into manageable chunkszPreserve semantic boundariesz!Minimize mid-text chunk-splittings   r   zUChunking Priorities

Divide text into manageable chunks

Preserve semantic boundariesN)r   r   r   r   r   s     r   4test_it_considers_separator_length_when_pre_chunkingrs   z  sl     	#$56/045	H HS9F/	

 	<=   r!   c                      e Zd ZdZej
                  j                  dddidfddidfddidfi dfg      	 	 	 	 	 	 d	d       Z ej                         d
d       Z	y)Describe_chunk_by_titlezLUnit-test suite for `unstructured.chunking.title.chunk_by_title()` function.)kwargsexpected_valuer/   TFNc                n    t        g fi | |j                  j                  \  }}|j                  |u sJ y )N)r   	call_argsargsr/   )selfrv   rw   _chunk_by_title__optss         r   ,it_supports_the_include_orig_elements_optionzDDescribe_chunk_by_title.it_supports_the_include_orig_elements_option  s:     	r$V$",,114))^;;;r!   c                    t        |d      S )Nz+unstructured.chunking.title._chunk_by_title)r   )r{   requests     r   r|   z(Describe_chunk_by_title._chunk_by_title_  s    W&STTr!   )rv   zdict[str, Any]rw   boolr|   r   )r   r   )
__name__
__module____qualname____doc__pytestmarkparametrizer   fixturer|    r!   r   ru   ru     s    V[[$%t,d3%u-u5%t,d3J		
<$<6:<NR<< V^^U Ur!   ru   c                     e Zd ZdZej
                  j                  dddg      dd       Zd Zd Z	ej
                  j                  dd	d
g      	 	 	 	 	 	 dd       Z
d Zej
                  j                  ddddefg      	 	 	 	 dd       Zy)Describe_ByTitleChunkingOptionszRUnit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects.n_charsic                    t        j                  t        d|       5  t        j                  |       d d d        y # 1 sw Y   y xY w)Nz8'combine_text_under_n_chars' argument must be >= 0, got matchr_   r   raises
ValueErrorr
   new)r{   r   s     r   :it_rejects_combine_text_under_n_chars_for_n_less_than_zerozZDescribe_ByTitleChunkingOptions.it_rejects_combine_text_under_n_chars_for_n_less_than_zero  sD    ]]LWIV
 	L $''7K		L 	L 	Ls	   ?Ac                >    t        d      }|j                  dk(  sJ y)zSSpecifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining.r   r_   N)r
   r.   r{   r~   s     r   Fit_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combiningzfDescribe_ByTitleChunkingOptions.it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining  s     &!D..!333r!   c                    	 t        d      }j                  dk(  sJ y# t        $ r t        j                  d       Y 2w xY w)zUCaller can specify `combine_text_under_n_chars` arg without specifying other options.r   r_   z?did not accept `combine_text_under_n_chars` as option by itselfN)r
   r   r   failr.   r   s     r   Iit_does_not_complain_when_specifying_combine_text_under_n_chars_by_itselfziDescribe_ByTitleChunkingOptions.it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself  sH    	[*bID .."444  	[KKYZ	[s     A A)r.   r   expected_hard_max)X  Ni  )r     r   c                    t        j                  t        d| d|       5  t        j                  ||       ddd       y# 1 sw Y   yxY w)aw  `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.

        The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
        `max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
        larger number like 1500 then it can look like chunk-combining isn't working.
        zR'combine_text_under_n_chars' argument must not exceed `max_characters` value, got z > r   )r   r.   Nr   )r{   r.   r   r   s       r   ;it_rejects_combine_text_under_n_chars_greater_than_maxcharsz[Describe_ByTitleChunkingOptions.it_rejects_combine_text_under_n_chars_greater_than_maxchars  sW     ]]2337H6IK
 		 $''-Jd		 		 		s   AAc                    	 t        j                  d      }j
                  dk(  sJ y# t        $ r t        j                  d       Y 2w xY w)zPCaller can specify `new_after_n_chars` arg without specifying any other options.   )rX   z6did not accept `new_after_n_chars` as option by itselfN)r
   r   r   r   r   soft_maxr   s     r   @it_does_not_complain_when_specifying_new_after_n_chars_by_itselfz`Describe_ByTitleChunkingOptions.it_does_not_complain_when_specifying_new_after_n_chars_by_itself  sL    	R*..ED }}###  	RKKPQ	Rs   * A
A)rG   rw   )TT)FFNc                <    t        |      }|j                  |u sJ y )N)rG   )r
   rG   )r{   rG   rw   r~   s       r   3it_knows_whether_to_break_chunks_on_page_boundarieszSDescribe_ByTitleChunkingOptions.it_knows_whether_to_break_chunks_on_page_boundaries  s#     ':LM&&.888r!   )r   int)r.   r   r   zOptional[int]r   r   )rG   r   rw   r   )r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r!   r   r   r     s    \[[YS	2L 3L4
5 [[M	?+*-?Lad	($ [[0	~.F'GH9"&98<9	9r!   r   )+r   
__future__r   typingr   r   r   test_unstructured.unit_utilsr   r   r   unstructured.chunking.baser	   unstructured.chunking.titler
   r   "unstructured.documents.coordinatesr   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   unstructured.partition.htmlr   r    r9   rB   rI   rL   rN   rU   r[   rb   rp   rs   ru   r   r   r!   r   <module>r      s    ? "    L L ? O ?
 
 
 7$.bZ<>68$$6%[PP[f:U U:B9 B9r!   