
    :Qg\                       d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ d Zej:                  j=                  d	g d
      d&d       Z G d d      Z  G d d      Z! G d d      Z" G d d      Z# G d d      Z$ G d d      Z% G d d      Z& G d d      Z' G d d      Z( G d d      Z) G d  d!      Z* G d" d#      Z+ G d$ d%      Z,y)'z;Test suite for `unstructured.partition.html.parser` module.    annotations)dequeN)etree)AddressElementListItemNarrativeTextTextTitle)
AnnotationDefaultElementFlowPhrasingRemovedPhrasingTextSegment_consolidate_annotations_ElementAccumulator_normalize_text_PhraseAccumulator_PreElementAccumulatorhtml_parserc                     ddddddddg} t        |       } | ddgddgdgdgdk(  sJ t        j                  t        d	
      5  d| d<   d d d        | d   j	                  d       | d   g dk(  sJ y # 1 sw Y   *xY w)NzFord Prefectzhttps://wikipedia/Ford_Prefectb)
link_textslink_urlemphasized_text_contentsemphasized_text_tagszalien encounterbir   r   )r   r   r   r   z'object does not support item assignment)matchfoobarnew_keyr   xyz)r   r   r$   )r   pytestraises	TypeErrorappendr   s    i/var/www/html/answerous/venv/lib/python3.12/site-packages/test_unstructured/partition/html/test_parser.py7it_consolidates_annotations_from_multiple_text_segmentsr*   "   s     )8(6$'		
 ):$(	
K +;7K%35F$G!$d%&56    
y(Q	R *!)I* &'..u5-.2DDDD	* *s   A44A=textexpected_value))iterators allowr.   )zalgorithm
to   bezalgorithm to be)z  separated
  from  zseparated from)z
 container
 details
 zcontainer details)zM
  iterators  allow 
 algorithm to be   
expressed  without container  
noisezAiterators allow algorithm to be expressed without container noisec                $    t        |       |k(  sJ y N)r   r+   s     r)   ,test_normalize_text_produces_normalized_textr1   E   s      4 N222    c                  "    e Zd ZdZd Zd Zd Zy)Describe_PhraseAccumulatorzUIsolated unit-test suite for `unstructured.partition.html.parser._PhraseAccumulator`.c                    t               }|j                         }t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY wr0   r   flushr%   r&   StopIterationnextselfaccumphrase_iters      r)   it_is_empty_on_constructionz6Describe_PhraseAccumulator.it_is_empty_on_construction^   >    "$kkm]]=) 		 	 	   A		Ac                f   t               }|j                  t        di              |j                  t        di              |j                         }t	        |      }|t        di       t        di       fk(  sJ t        j                  t              5  t	        |       d d d        y # 1 sw Y   y xY w)NFord... you're turning into a penguin.)r   addr   r7   r9   r%   r&   r8   )r;   r<   r=   phrases       r)   it_accumulates_text_segmentsz7Describe_PhraseAccumulator.it_accumulates_text_segmentsh   s    "$		+7<=		+/45kkmk"126)2.
 
 	
 

 ]]=) 		 	 	s   B''B0c                    t               }|j                         }t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY wr0   r6   r:   s      r)   -it_generates_zero_phrases_on_flush_when_emptyzHDescribe_PhraseAccumulator.it_generates_zero_phrases_on_flush_when_emptyz   r?   r@   N)__name__
__module____qualname____doc__r>   rF   rH    r2   r)   r4   r4   [   s    _$r2   r4   c                     e Zd ZdZd,dZd,dZd,dZ	 	 d,dZ	 	 d,dZd,dZ	d,dZ
	 	 d,d	Z	 	 d,d
Zd Zd,dZej                   j#                  dddedfddedfddedfddedfddedfddedfddedfddedfddedfd d!ed"fd#d$ed%fd&d'ed(fg      	 	 	 	 	 	 	 	 d-d)       Z	 	 d,d*Z ej.                         d.d+       Zy)/Describe_ElementAccumulatorzVIsolated unit-test suite for `unstructured.partition.html.parser._ElementAccumulator`.c                    t        |      }|j                  d       }t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY wr0   r   r7   r%   r&   r8   r9   r;   html_elementr<   element_iters       r)   r>   z7Describe_ElementAccumulator.it_is_empty_on_construction   C    #L1{{4(]]=) 		 	 	   AAc                P   t        |      }|j                  t        di              |j                  t        di              |j                  d       }t	        |      }|t        d      k(  sJ t        j                  t              5  t	        |       d d d        y # 1 sw Y   y xY w)NrB   rC   &Ford... you're turning into a penguin.)	r   rD   r   r7   r9   r
   r%   r&   r8   )r;   rS   r<   rT   elements        r)   rF   z8Describe_ElementAccumulator.it_accumulates_text_segments   s    #L1		+7<=		+/45{{4(|$-(PQQQQ]]=) 		 	 	s   BB%c                    t        |      }|j                  d       }t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY wr0   rQ   rR   s       r)   %it_generates_zero_elements_when_emptyzADescribe_ElementAccumulator.it_generates_zero_elements_when_empty   rU   rV   c                   t        |      }|j                  t        di              |j                  t        di              t        j                  t
              5  t        |j                  d              d d d        y # 1 sw Y   y xY w)N 
   	 
z   
r   rD   r   r%   r&   r8   r9   r7   r;   rS   r<   s      r)   Mand_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_onlyziDescribe_ElementAccumulator.and_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_only   f     $L1		+mR01		+gr*+]]=) 	$T"#	$ 	$ 	$   A??Bc                   t        |      }|j                  t        di              |j                  t        di              t        j                  t
              5  t        |j                  d              d d d        y # 1 sw Y   y xY w)Nr]   z X 
r^   r_   s      r)   Nand_it_generates_zero_elements_when_there_is_only_one_non_whitespace_characterzjDescribe_ElementAccumulator.and_it_generates_zero_elements_when_there_is_only_one_non_whitespace_character   ra   rb   c                    t        |      }|j                  t        di              |j                  t        di              |j                  d       \  }|j                  dk(  sJ y Nz 
  Ford...   you're 	 turning
zinto a   penguin.
rX   )r   rD   r   r7   r,   r;   rS   r<   rY   s       r)   4it_normalizes_the_text_of_its_text_segments_on_flushzPDescribe_ElementAccumulator.it_normalizes_the_text_of_its_text_segments_on_flush   sW    #L1		+BBGH		+3R89[[&
||GGGGr2   c                    t        |      }|j                  t        di              |j                  t              \  }|t	        d      k(  sJ y NrX   r   rD   r   r7   r	   rg   s       r)   3it_creates_a_document_element_of_the_specified_typezODescribe_ElementAccumulator.it_creates_a_document_element_of_the_specified_type   sD    #L1		+FKL[[*
(#KLLLLr2   c                    t        |      }|j                  t        di              |j                  d       \  }|t	        d      k(  sJ y rj   )r   rD   r   r7   r
   rg   s       r)   Dbut_it_derives_the_element_type_from_the_text_when_none_is_specifiedz`Describe_ElementAccumulator.but_it_derives_the_element_type_from_the_text_when_none_is_specified   sF     $L1		+FKL[[&
-(PQQQQr2   c                    t        |      }|j                  t        di              |j                  d       \  }|t	        d      k(  sJ y )Nz* turning into a penguinzturning into a penguinrk   rg   s       r)   @it_removes_an_explicit_leading_bullet_character_from_a_list_itemz\Describe_ElementAccumulator.it_removes_an_explicit_leading_bullet_character_from_a_list_item   sF     $L1		+8"=>[[&
(#;<<<<r2   c                2   t        j                  dt              j                  d      d   }t	        |      }|j                  t        di              |j                  t              \  }|j                         }|j                  d       |ddiddd	k(  sJ y )
Nz<h3>About fish</h3>z.//h3r   zThanks for all those!
element_idcategory_depth   r   metadatar,   type)r   
fromstringr   xpathr   rD   r   r7   r   to_dictpopr;   rS   r<   rY   es        r)   "it_applies_category_depth_metadataz>Describe_ElementAccumulator.it_applies_category_depth_metadata   s    ''(={KQQRYZ[\]#L1		+5r:;[['
OO	l)1-+
 
 	
 
r2   c                   t        |      }|j                  t        dddd             |j                  t        di              |j                  t        dddd             |j                  t        di              |j                  t              \  }|j                         }|j                  d	       |ddgddgdd
ddk(  sJ y )Nz
    Ford...Fordr   r    z you're turning into a penguiniz.
rr   rX   r
   ru   )r   rD   r   r7   r
   rz   r{   r|   s        r)   -and_it_consolidates_annotations_into_metadatazIDescribe_ElementAccumulator.and_it_consolidates_annotations_into_metadata   s    #L1		06,/	
 			+7<=		09,/	
 			+eR()[[/
OO	l -
 )	 =#
 
 	
 
r2   )	html_texttag
ElementClsr-   z5<p>Ford... you're turning into a penguin. Stop it.<p>pNz!<p>* thanks for all the fish.</p>r   z!<li>thanks for all the fish.</li>liz><ul><li>So long</li><li>and thanks for all the fish.</li></ul>   z><dl><dd>So long<ol><li>and thanks for the fish.</li></ol></ul>rt   z<p>Examples</p>z<h1>Examples</h1>h1z<h2>Examples</h2>h2z<h3>Examples</h3>h3z<h4>Examples</h4>h4   z<h5>Examples</h5>h5   z<h6>Examples</h6>h6   c                    t        j                  |t              j                  d|       d   }t	        |      }|j                  |      |k(  sJ y )Nz.//r   )r   rx   r   ry   r   _category_depth)r;   r   r   r   r-   r}   r<   s          r)   &it_computes_the_category_depth_to_helpzBDescribe_ElementAccumulator.it_computes_the_category_depth_to_help  sO    ( Y4::S;GJ#A&$$Z0NBBBr2   c                    t        |      }|j                  t        di              |j                  t        di              |j                  dk(  sJ y rf   )r   rD   r   _normalized_textr_   s      r)   <it_computes_the_normalized_text_of_its_text_segments_to_helpzXDescribe_ElementAccumulator.it_computes_the_normalized_text_of_its_text_segments_to_help5  sK     $L1		+BBGH		+3R89%%)QQQQr2   c                Z    t        j                  dt              j                  d      d   S )N<p/>.//pr   )r   rx   r   ry   )r;   s    r)   rS   z(Describe_ElementAccumulator.html_element@  s%    4::6B1EEr2   )rS   etree.ElementBase)r   strr   r   r   ztype[Element]r-   z
int | None)returnr   )rI   rJ   rK   rL   r>   rF   r[   r`   rd   rh   rl   rn   rp   r~   r   r%   markparametrizer   r	   r   r   r   fixturerS   rM   r2   r)   rO   rO      su   `$-$$-$HMR-R=-=
(
X [[<Dc4QUV0#xC0$!DMtU]_`aMtU]_`aUA. $q1 $q1 $q1 $q1 $q1 $q1	
"CC#&C4ACS]C#"CR-R V^^F Fr2   rO   c                      e Zd ZdZd Zy)Describe_PreElementAccumulatorzYIsolated unit-test suite for `unstructured.partition.html.parser._PreElementAccumulator`.c                l   t        j                  dt              j                  d      d   }t	        |      }|j                  t        di              |j                  t        di              |j                  t        di              |j                  t        di              |j                  dk(  sJ y )	Nr   r   r   z

z    The panel lit up
z(    with the words 'Please do not press
z    this button again'

zU
    The panel lit up
    with the words 'Please do not press
    this button again'
)r   rx   r   ry   r   rD   r   r   r_   s      r)   r   z[Describe_PreElementAccumulator.it_computes_the_normalized_text_of_its_text_segments_to_helpH  s    ''<BB6J1M&|4		+fb)*		+6;<		+I2NO		+:B?@ %%'
 	
 
r2   N)rI   rJ   rK   rL   r   rM   r2   r)   r   r   E  s
    c
r2   r   c                      e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
ej                  j                  d	d
di fgfddi fddddfdi fddddfdi fgfddi fddddfddddfdi fgfg      	 	 	 	 dd       Zy)DescribeFlowzIsolated unit-test suite for `unstructured.partition.html.parser.Flow`.

    The `Flow` class provides most behaviors for flow (block-level) elements.
    c                    t        j                  dt              j                  d      d   }t	        |t
              sJ |j                  du sJ y )Nz<p>Hello</p>r   r   F)r   rx   r   ry   
isinstancer   is_phrasing)r;   r   s     r)   %it_knows_it_is_NOT_a_phrasing_elementz2DescribeFlow.it_knows_it_is_NOT_a_phrasing_elementd  sF    ^[9??GJ!T"""}}%%%r2   c                t   d}t        j                  |t              j                  d      d   }|j	                         }t        |      }|t        d      k(  sJ |j                  j                         dg dg ddk(  sJ t        |      }|t        d      k(  sJ |j                  j                         d	gd
gdk(  sJ t        |      }|t        d      k(  sJ |j                  j                         dg dg ddk(  sJ t        |      }|t        d      k(  sJ |j                  j                         ddik(  sJ t        |      }|t        d      k(  sJ |j                  j                         dddgddgdk(  sJ t        j                  t              5  t        |      }ddd       y# 1 sw Y   yxY w)a?  Phrasing siblings of child block elements are processed with text or tail.

        In the general case, a Flow element can contain text, phrasing content, and child flow
        elements.

        Each of these five lines in this example is a "paragraph" and gives rise to a distinct
        document-element.
        a  
          <div>
            Text of div <b>with <i>hierarchical</i>
phrasing</b> content before first block item
            <p>Click <a href="http://blurb.io">here</a> to see the blurb for this block item. </p>
            tail of block item <b>with <i>hierarchical</i> phrasing </b> content
            <p>second block item</p>
            tail of block item <b>with <i>  hierarchical  </i></b> phrasing content
          </div>
        .//divr   zFText of div with hierarchical phrasing content before first block item)withhierarchicalphrasingr   r   r   )rs   r   r   z0Click here to see the blurb for this block item.herezhttp://blurb.ior   	link_urlsz5tail of block item with hierarchical phrasing contentzsecond block itemrs   r   r   r   r   N)r   rx   r   ry   iter_elementsr9   r   rv   rz   r
   r%   r&   r8   r;   r   divelementsr}   s        r)   8it_generates_the_document_elements_from_the_Flow_elementzEDescribeFlow.it_generates_the_document_elements_from_the_Flow_elementl  s   	 y+6<<XFqI$$&NEbcccczz!!#(L$4(
 
 	
 

 NM"TUUUUzz!!#vhN_M`'aaaaNEQRRRRzz!!#(L$4(
 
 	
 

 NE-....zz!!#(8!'<<<<NEQRRRRzz!!#)/(@%($K(
 
 	
 

 ]]=) 	XA	 	 	s   F..F7c                6   d}t        j                  |t              j                  d      d   }|j	                  |j
                  t        |      t              }t        |      }|t        d      k(  sJ |j                  j                         g dg ddk(  sJ y)	zJText and tails and their phrasing content are both processed the same way.zK<div>The 
 Roman <b>poet <i>   Virgil</i> gave</b> his <q>pet</q> fly</div>r   r   z&The Roman poet Virgil gave his pet fly)poetVirgilgaver   r    N)r   rx   r   ry   _element_from_text_or_tailr,   r   r   r9   rv   rz   r   s        r)   4it_assembles_text_and_tail_document_elements_to_helpzADescribeFlow.it_assembles_text_and_tail_document_elements_to_help  s    b	y+6<<XFqI11#((E#JMNDABBBBzz!!#(B$4(
 
 	
 
r2   c                &   d}t        j                  |t              j                  d      d   }|j	                  |j
                  t        |      t              }t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY w)Nz6<div>   <b> 
 <i>  
 </i>  </b>   <q> 
 </q> 
  </div>r   r   )r   rx   r   ry   r   r,   r   r   r%   r&   r8   r9   r;   r   r   r   s       r)   Mbut_it_does_not_generate_a_document_element_when_only_whitespace_is_containedzZDescribeFlow.but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained  sm    P	y+6<<XFqI11#((E#JM]]=) 	N	 	 	s   2BBc                   d}t        j                  |t              j                  d      d   }|j	                  |j
                  t        |      t              }t        |      }|t        d      k(  sJ |j                  j                         i k(  sJ t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY w)Nz;<div>
  The line-storm clouds fly tattered and swift
</div>r   r   z,The line-storm clouds fly tattered and swift)r   rx   r   ry   r   r,   r   r   r9   rv   rz   r%   r&   r8   r   s        r)   @it_uses_the_specified_element_class_to_form_the_document_elementzMDescribeFlow.it_uses_the_specified_element_class_to_form_the_document_element  s    S	y+6<<XFqI11#((E#JPNGJKKKKzz!!#r)))]]=) 	N	 	 	s   ,CC
c                    d}t        j                  |t              j                  d      d   }|j	                  |j
                  t        |            }t        |      t        d      k(  sJ y )Nz<<div>
  The line-storm clouds fly tattered and swift,
</div>r   r   z-The line-storm clouds fly tattered and swift,)	r   rx   r   ry   r   r,   r   r9   r
   r   s       r)   Rand_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specifiedz_DescribeFlow.and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified  s[    T	y+6<<XFqI11#((E#JGH~/^!____r2   c                   d}t        j                  |t              j                  d      d   }|j	                  |j
                  t        |            }t        j                  t              5  t        |       d d d        y # 1 sw Y   y xY w)Nz<div> * </div>r   r   )r   rx   r   ry   r   r,   r   r%   r&   r8   r9   r   s       r)   Ubut_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_containedzbDescribeFlow.but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained  sk    $	y+6<<XFqI11#((E#JG]]=) 	N	 	 	s   -BBr   r-   z,<p>Ford... you're turning into a penguin.<p>rX   z:<p>Ford... <b>you're turning</b> into
a <i>penguin</i>.<p>zFord... zyou're turningr   r    z into
a r   r   .z:<p>Ford... <b>you're <i>turning</i></b> into a penguin.<p>zyou're zyou'returningr   z into a penguin.c                    t        j                  |t              j                  d      d   }t	        |j                  |j                  t        |                  }||k(  sJ y )Nr   r   )r   rx   r   ry   list_iter_text_segmentsr,   r   )r;   r   r-   r   text_segmentss        r)   Eit_recursively_generates_text_segments_from_text_and_phrasing_to_helpzRDescribeFlow.it_recursively_generates_text_segments_from_text_and_phrasing_to_help  sT    V Y4::6B1EQ2216658DE...r2   N)r   r   r-   zlist[Annotation])rI   rJ   rK   rL   r   r   r   r   r   r   r   r%   r   r   r   rM   r2   r)   r   r   \  s   &2l
 
` [[' ?:B?@
 N$(5E_bc !"%!5>X[\ "I" M$!5=WZ[
 "5>X\] (,+$	
'P//.>/Q'P/r2   r   c                  n    e Zd ZdZd Zej                  j                  dg d      	 	 	 	 dd       Zd Z	y)	DescribePrezIsolated unit-test suite for `unstructured.partition.html.parser.Pre`.

    The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
    c                (   d}t        j                  |t              j                  d      d   }|j	                         }t        |      }|t        d      k(  sJ t        j                  t              5  t        |       ddd       y# 1 sw Y   yxY w)z4A `<pre>` element can contain only phrasing content.z<pre>
  The Answer to the Great Question...   Of Life, the Universe and Everything...
  Is... Forty-two, said Deep Thought, with infinite majesty and calm.
</pre>
.//prer   z  The Answer to the Great Question...   Of Life, the Universe and Everything...
  Is... Forty-two, said Deep Thought, with infinite majesty and calm.N)
r   rx   r   ry   r   r9   r   r%   r&   r8   )r;   r   prer   r}   s        r)   9it_preserves_the_whitespace_of_its_phrasing_only_contentszEDescribePre.it_preserves_the_whitespace_of_its_phrasing_only_contents  s     	 y+6<<XFqI$$&NDT
 
 	
 
 ]]=) 	N	 	 	s   3BBr   ))z<pre>
  foo  </pre>  foo  )z<pre> 
  foo  </pre>z	 
  foo  )z<pre>

  foo  </pre>z
  foo  )z<pre>  foo  
</pre>r   )z<pre>  foo  
 </pre>z	  foo  
 )z<pre>  foo  

</pre>z  foo  
)z<pre>
  foo  
</pre>r   )z<pre> 
  foo  
 </pre>z 
  foo  
 c                    t        j                  |t              j                  d      d   }t	        |j                               }|j                  |k(  sJ y)zSContent starts on next line when opening `<pre>` tag is immediately followed by `
`r   r   N)r   rx   r   ry   r9   r   r,   )r;   r   r-   r   r}   s        r)   2but_it_strips_a_single_leading_or_trailing_newlinez>DescribePre.but_it_strips_a_single_leading_or_trailing_newline%  sK    2 y+6<<XFqI""$%vv'''r2   c                   d}t        j                  |t              j                  d      d   }t	        |j                               }|j                  dk(  sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ y )	NzL<pre>You're <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>r   r   zYou're turning into a penguin.r   r   r   http://eie.io)r   rx   r   ry   r9   r   r,   rv   r   r   r   r   )r;   r   r   r}   s       r)   Pit_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elementsz\DescribePre.it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elementsC  s    c	y+6<<XFqI""$%vv9999zz22ykAAAzz..3%777zz$$333zz##'8888r2   N)r   r   r-   r   )
rI   rJ   rK   rL   r   r%   r   r   r   r   rM   r2   r)   r   r     sL    
( [['	
*((.1(+*(
9r2   r   c                      e Zd ZdZd Zy)DescribeRemovedBlockzIsolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.

    This class is used for block level items we want to skip like `<hr/>` and `<figure>`.
    c                    d}t        j                  |t              j                  d      d   }t	        |j                               t        d      gk(  sJ y )Na&  
          <div>
            <hr/>
            <figure>
              <img src="/media/cc0-images/elephant-660-480.jpg" alt="Elephant at sunset" />
              <figcaption>An elephant at sunset</figcaption>
            </figure>
            <p>Content we want.</p>
          </div>
          r   r   zContent we want.)r   rx   r   ry   r   r   r
   )r;   r   r   s      r)   it_is_skipped_during_parsingz1DescribeRemovedBlock.it_is_skipped_during_parsingV  sR    		 y+6<<XFqIC%%'(];M-N,OOOOr2   N)rI   rJ   rK   rL   r   rM   r2   r)   r   r   P  s    
Pr2   r   c                  h   e Zd ZdZd Zej                  j                  ddg fddi fgfddi fgfd	d
i fdi fgfddi fd
i fdi fdi fdi fgfg      	 	 	 	 d=d       Zej                  j                  dd e	d      gfd e
dddd       e	d      gfd e	d       e
dddd      gfd e
dddd       e	d       e
dddd      gfg      	 	 	 	 d>d       Zd Zej                  j                  dddg      d?d       Zd Zej                  j                  d g d!      	 	 d@d"       Zej                  j                  dd#g fd$ e
di       gfd% e
di        e
di       gfd& e
di        e
di        e
di        e
d'i        e
d(i        e
d)i       gfg      	 	 	 	 d=d*       Zej                  j                  d+d,dg fd-d e	d      gfd.d e	d       e
di       gfd/d e	d       e
di        e
di        e
d'i       gfd0d e
dddd       e	d       e
dddd       e
d'd'dd       e
d(d(dd      gfg      	 	 	 	 	 	 dAd1       Zej                  j                  d2d3dg fd4d e
di       gfd5d e
di       gfd6d e
di        e
di       gfd7d e
di        e
di        e
di        e
d'i        e
d(i       gfd8d e
dddd       e
dddd       e
ddd9d       e
d'd'dd       e
d(d(dd      gfd:d e
di        e
di        e	d       e
d'i        e
d(i       gfg      	 	 	 	 	 	 dBd;       Zy<)CDescribePhrasingzIsolated unit-test suite for `unstructured.partition.html.parser.Phrasing`.

    The `Phrasing` class provides most behaviors for phrasing (inline) elements.
    c                    t        j                  dt              j                  d      d   }t	        |t
              sJ |j                  du sJ y )Nz<b>Hello</b>.//br   T)r   rx   r   ry   r   r   r   )r;   r   s     r)   !it_knows_it_is_a_phrasing_elementz2DescribePhrasing.it_knows_it_is_a_phrasing_elementq  sF    ^[9??GJ!X&&&}}$$$r2   r   z<code></code>z<data> foo </data>z foo z<dfn/> bar z bar z.<kbd><mark>foo <meter>bar</meter></mark></kbd>zfoo barz4<kbd> <mark>foo <meter>bar</meter> baz</mark> </kbd> z bazc                    t        j                  |t              j                  d      d   d   }t	        |j                               |k(  sJ y N.//bodyr   r   rx   r   ry   r   iter_text_segmentsr;   r   r-   r}   s       r)   =it_generates_text_segments_for_its_text_and_children_and_tailzNDescribePhrasing.it_generates_text_segments_for_its_text_and_children_and_taily  sG    6 Y4::9EaHKA((*+~===r2   z<strong><p>aaa</p></strong>aaaz<strong>aaa<p>bbb</p></strong>r   r    bbbz<strong><p>aaa</p>bbb</strong>z!<strong>aaa<p>bbb</p>ccc</strong>cccc                    t        j                  |t              j                  d      d   d   }t	        |j                               |k(  sJ y r   r   r   s       r)   Fbut_it_can_also_generate_an_element_when_it_has_a_nested_block_elementzWDescribePhrasing.but_it_can_also_generate_an_element_when_it_has_a_nested_block_element  sH    V Y4::9EaHKA((*+~===r2   c                    t        j                  dt              j                  d      d   }|j	                  dd      dddk(  sJ y )N<cite/>.//citer   z
  foobar
  r   r"   r    r   rx   r   ry   _annotationr;   cites     r)   &it_forms_its_annotations_from_emphasisz7DescribePhrasing.it_forms_its_annotations_from_emphasis  sO    	;7==iHK 0$7(0$(<
 
 	
 
r2   r,    z
  	  c                    t        j                  dt              j                  d      d   }|j	                  |d      i k(  sJ y )Nr   r   r   r   r   )r;   r,   r   s      r)   (but_not_when_text_is_empty_or_whitespacez9DescribePhrasing.but_not_when_text_is_empty_or_whitespace  s?    	;7==iHKd+r111r2   c                    t        j                  dt              j                  d      d   }|j	                  dd      i k(  sJ y )Nr   r   r   r"   r   r   r   s     r)   !and_not_when_there_is_no_emphasisz2DescribePhrasing.and_not_when_there_is_no_emphasis  s?    	;7==iHK"-333r2   enclosing_emphasis)r   r   r   c                    t        j                  dt              j                  d      d   }|j	                  |      |k(  sJ y)zInside emphasis is applied to text inside the phrasing element (but not its tail).

        The `._inside_emphasis()` method is overridden by Bold and Italic classes which add their
        specific emphasis characters.
        z<abbr/>z.//abbrr   N)r   rx   r   ry   _inside_emphasis)r;   r   abbrs      r)   =it_uses_the_enclosing_emphasis_as_the_default_inside_emphasiszNDescribePhrasing.it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis  sA     	;7==iHK$$%78<NNNNr2   z<abbr>aaa</abbr>z<bdi>x<bdo>bbb</bdo></bdi>z<bdi>x<bdo>bbb</bdo>ccc</bdi>zG<big>xxx<cite>aaa<code>bbb<data>ccc</data>ddd</code>eee</cite>fff</big>dddeeefffc                    t        j                  |t              j                  d      d   d   }t	        |j                  d            |k(  sJ y )Nr   r   r   r   rx   r   ry   r   _iter_child_text_segmentsr   s       r)   ;it_generates_text_segments_for_its_children_and_their_tailszLDescribePhrasing.it_generates_text_segments_for_its_children_and_their_tails  sI    6 Y4::9EaHKA//34FFFr2   )r   inside_emphasisr-   z<dfn></dfn>z<kbd><p>aaa</p></kbd>z<kbd><p>aaa</p>bbb</kbd>z+<kbd><p>aaa</p>bbb<mark>ccc</mark>ddd</kbd>z5<strong><q>aaa</q><p>bbb</p>ccc<s>ddd</s>eee</strong>c                    t        j                  |t              j                  d      d   d   }t	        |j                  |            |k(  sJ y r   r	  )r;   r   r  r-   r}   s        r)   0and_it_generates_elements_for_its_block_childrenzADescribePhrasing.and_it_generates_elements_for_its_block_children  sJ    Z Y4::9EaHKA//@A^SSSr2   r   emphasisr-   z<cite><p/></cite>z<cite><p/>aaa</cite>z<cite><p/><s>aaa</s></cite>z<bdi><p/><s>aaa</s>bbb</bdi>z,<sub><p/>aaa<s>bbb<q>ccc</q>ddd</s>eee</sub>z2<strong><p/>aaa<s>bbb<i>ccc</i>ddd</s>eee</strong>r   z4<cite><p/>aaa<abbr>bbb<p>ccc</p>ddd</abbr>eee</cite>c                   t        j                  |t              j                  d      d   d   }|j                  d      d   }|j                  xs d}t        |dd        }t        |j                  |||            |k(  sJ y )Nr   r   z./pr   r   )r   rx   r   ry   tailr   r   0_iter_text_segments_from_block_tail_and_phrasing)r;   r   r  r-   r}   r   r  qs           r)   @it_generates_text_segments_from_the_tail_and_contiguous_phrasingzQDescribePhrasing.it_generates_text_segments_from_the_tail_and_contiguous_phrasing7  s    | Y4::9EaHKGGEN1vv|!AB%L CCD!XVW	
r2   N)r   r   r-   list[TextSegment])r   r   r-   list[TextSegment | Element]r,   r   )r   r   )r   r   r  r   r-   r  )r   r   r  r   r-   r  )rI   rJ   rK   rL   r   r%   r   r   r   r   r   r   r   r   r   r  r  r  r  rM   r2   r)   r   r   i  s~   % [[' b!!WbM?3gr]O,=uVXk?Z[ G"IRLBKRL"I		
0>>.?>10> [[' +U5\N; 1E[^_ %L	 1%LE[^_ 4E[^_ %LE[^_1$	
'P>>.I>Q'P>
 [[Vb*%562 724 [[1?C	O"%	O D	O [['  $)Kr,B+CD,{5"/E{SXZ\G].^_ Zr*r*r*r*r*r*
	
0GG.?G10G [[: B#$b5<.9'eElKr<R-ST >%Lr*r*r*		 HE[^_ %LE[^_  E[^_  E[^_'&	
)TTT/2TD_TU)TT [[3 !"b)#R+eR*@)AB*BUB1G0HI+R+eR2H+V[]_J`1ab ?r*r*r*r*r*
 EE[^_  E[^_  E[_`  E[^_  E[^_. Gr*r*%Lr*r*
Y7	
:v

(+
=X
w:v
r2   r   c                     e Zd ZdZej
                  j                  dddg fdd edi       gfdd edi       gfd	d ed
dgdgd      gfdd ed
dgdgd       edi       gfdd eddgdgdgdgd       edi       gfdd eddgdgdgdgd       edddd      gfg      	 	 	 	 	 	 d1d       Zd Z	d Z
d  Zd! Zd" Zd# Zd$ Zd% Zd& Zd' Zd( Zd) Zd* Zd+ Zej
                  j                  d,dd-g      d2d.       Zd/ Zy0)3DescribeAnchorzIsolated unit-test suite for `unstructured.partition.html.parser.Anchor`.

    The `Anchor` class is used for `<a>` tags and provides link metadata.
    r  z<a href="http://abc.com"></a>r   z(<a href="http://abc.com"></a> long tail z long tail z<a href="http://abc.com">  </a>z  z)<a href="http://abc.com"> click here </a>z click here z
click herehttp://abc.comr   z3<a href="http://abc.com"> click here </a> long tailz
 long tailzI<p>I am <a href="http://eie.io">one <u>with<i> the</i></u> Force</a>.</p>zone with the Forcether   r   r   r   r   r   r   zL<p>I am <strong><a href="http://eie.io">one with</a> the Force.</strong></p>r   one with the Force.
the Force.r    c                    t        j                  |t              j                  d      d   }t	        |j                  |            |k(  sJ y )N.//ar   r   )r;   r   r  r-   as        r)   Nit_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segmentz]DescribeAnchor.it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment  sE    \ Y4::6B1EA((23~EEEr2   c           	         d}t        j                  |t              j                  d      d   }t	        |j                  d            t        ddgdgdgdgd      t        d	      t        d
ddd      gk(  sJ y )Nz;<a href="http://eie.io">I am <p>one with</p> the Force.</a>r"  r   r   zI am zI amr   r  r  r  r   r    )r   rx   r   ry   r   r   r   r   r;   r   r#  s      r)   6it_generates_enclosed_block_items_as_separate_elementszEDescribeAnchor.it_generates_enclosed_block_items_as_separate_elements  s    U	Y4::6B1EA((-.17-0E#)("1!2	 *0<,/3
 
 	
 
r2   c           	     b   d}t        j                  |t              j                  d      d   }t	        |j                  d            }|t        di       t        d      t        dddd	      gk(  sJ |d
   }|j                  j                  dgk(  sJ |j                  j                  dgk(  sJ y )Nz><a href="http://eie.io"> 
 <p>I am one with</p> the Force.</a>r"  r   r   z 
 zI am one withr  r   r    r   r   )r   rx   r   ry   r   r   r   r
   rv   r   r   )r;   r   r#  actualrY   s        r)   Yand_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_firstzhDescribeAnchor.and_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_first  s     Z	Y4::6B1Ea**3/0#/*0<,/

 

 
	
 

 )**.????))o->>>>r2   c                    d}t        j                  |t              j                  d      d   }t	        |j                  d            t        di       ft        d      t        di       fgk(  sJ y )	Nzi
          <a href="http://eie.io">But always <p>see first.</p> Otherwise you </a> will only see
        r"  r   r   )r  But always 
see first.z Otherwise you )r   rx   r   ry   r   _iter_phrases_and_elementsr   r
   r&  s      r)   Eit_divides_the_anchor_contents_but_not_tail_into_phrases_and_elementszTDescribeAnchor.it_divides_the_anchor_contents_but_not_tail_into_phrases_and_elements
  s{    	 Y4::6B1EA00"0=>+-,'*B/1C
 
 	
 
r2   c           	        d}t        j                  |t              j                  d      d   }t	        j
                  t              5  t        |j                  dt        g       d             d d d        y # 1 sw Y   y xY w)N<a href="http://eie.io"></a>r"  r   r   r,   r  r  )
r   rx   r   ry   r%   r&   r8   r9   _iter_phrasingr   r&  s      r)   6it_generates_zero_items_when_both_text_and_q_are_emptyzEDescribeAnchor.it_generates_zero_items_when_both_text_and_q_are_empty  sl    6	Y4::6B1E]]=) 	F!!rU2Y!DE	F 	F 	Fs   'A88Bc                    d}t        j                  |t              j                  d      d   }t	        |j                  |j                  t        |      d            t        di       fgk(  sJ y )Nz5<a href="http://eie.io">
  But always see first.
</a>r"  r   r   r2  z
  But always see first.
	r   rx   r   ry   r   r3  r,   r   r   r&  s      r)   /it_generates_a_phrase_when_only_text_is_presentz>DescribeAnchor.it_generates_a_phrase_when_only_text_is_present  sn    Q	Y4::6B1EA$$!&&E!Hr$JK6;=P
 
 	
 
r2   c           	     0   d}t        j                  |t              j                  d      d   }t	        |j                  |j                  t        |      d            t        di       t        ddd	d
      t        dddd
      t        di       fgk(  sJ y )NzI<a href="http://eie.io">But always <b>see <i>first</i></b>. Otherwise</a>r"  r   r   r2  r,  zsee seer   r    firstr   z. Otherwiser6  r&  s      r)   Jand_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_elementzYDescribeAnchor.and_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_element'  s    c	Y4::6B1EA$$!&&E!Hr$JKM2.4903 4;04 M2.!P
 
 	
 
r2   c                    d}t        j                  |t              j                  d      d   }t	        |j                  |j                  t        |      d            t        di       fgk(  sJ y )NzE<a href="http://eie.io">But always see first.</a> Otherwise you will r"  r   r   r2  zBut always see first.r6  r&  s      r)   ,it_ends_the_phrase_at_the_end_of_the_elementz;DescribeAnchor.it_ends_the_phrase_at_the_end_of_the_element@  sn    _	Y4::6B1EA$$!&&E!Hr$JK0"57P
 
 	
 
r2   c                    d}t        j                  |t              j                  d      d   }t	        |j                  |j                  t        |      d            t        di       fgk(  sJ y )NzH<a href="http://eie.io">But always see first. <p>Otherwise you </p> </a>r"  r   r   r2  zBut always see first. r6  r&  s      r)   2but_it_ends_at_a_block_element_if_one_occurs_firstzADescribeAnchor.but_it_ends_at_a_block_element_if_one_occurs_firstH  sn    b	Y4::6B1EA$$!&&E!Hr$JK1268P
 
 	
 
r2   c           	     F   d}t        j                  |t              j                  d      d   }t	        |j                  |j                  t        |      d            t        di       t        ddd	d
      ft        d      t        ddd	d
      t        di       fgk(  sJ y )Nzk
          <a href="http://eie.io">But <strong>always <p>see first.</p>Otherwise</strong> you </a>
        r"  r   r   r2  zBut zalways alwaysr   r    r-  	Otherwisez you )
r   rx   r   ry   r   r3  r,   r   r   r
   r&  s      r)   ?it_generates_an_element_for_a_block_item_nested_inside_phrasingzNDescribeAnchor.it_generates_an_element_for_a_block_item_nested_inside_phrasingP  s    	 Y4::6B1EA$$!&&E!Hr$JKFB'4<03	 ,'4?03 GR(	P
 
 	
 
r2   c                   d}t        j                  |t              j                  d      d   }t	        d      }|j                  |      }||u sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ y )Nr1  r"  r   r   r   	r   rx   r   ry   r   _link_annotate_elementrv   r   r   r;   r   r#  rY   r}   s        r)   +it_adds_link_metadata_to_an_element_to_helpz:DescribeAnchor.it_adds_link_metadata_to_an_element_to_helpp  s    6	Y4::6B1Eu+$$W-G||zz$$///zz##'8888r2   c                `   d}t        j                  |t              j                  d      d   }t	        d      }dg|j
                  _        dg|j
                  _        |j                  |      }||u sJ |j
                  j                  ddgk(  sJ |j
                  j                  ddgk(  sJ y )Nr1  r"  r   r   abcr  r   )	r   rx   r   ry   r   rv   r   r   rF  rG  s        r)   :and_it_preserves_any_existing_link_metadata_on_the_elementzIDescribeAnchor.and_it_preserves_any_existing_link_metadata_on_the_element{  s    6	Y4::6B1Eu+',g#&6%7"$$W-G||zz$$666zz##(8/'JJJJr2   c                   d}t        j                  |t              j                  d      d   }t	        d      }|j                  |      }||u sJ |j                  j                  J |j                  j                  J y )Nz<a href="http://eie.io"/>r"  r   r   rE  rG  s        r)   but_not_when_the_text_is_emptyz-DescribeAnchor.but_not_when_the_text_is_empty  sz    3	Y4::6B1Er($$W-G||zz$$,,,zz##+++r2   c                   d}t        j                  |t              j                  d      d   }t	        d      }|j                  |      }||u sJ |j                  j                  J |j                  j                  J y )Nz<a/>r"  r   zzzrE  rG  s        r)   and_not_when_there_is_no_urlz+DescribeAnchor.and_not_when_there_is_no_url  sz    	Y4::6B1Eu+$$W-G||zz$$,,,zz##+++r2   c           	         d}t        j                  |t              j                  d      d   }t	        ddgdgd      t	        dd	d
d      f}|j                  |      }|t	        ddd	gdd
gdgdgd      k(  sJ y )Nr1  r"  r   zOtherwise you will only rB  r   r    zsee what you were expecting.
	expectingr   z5Otherwise you will only see what you were expecting.
z4Otherwise you will only see what you were expecting.r   r  r   rx   r   ry   r   _link_text_segment)r;   r   r#  rE   link_text_segments        r)   Iit_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_helpzXDescribeAnchor.it_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_help  s    6	Y4::6B1E*1<-0E 00;,/
" 008 KD-8+,F),c
UV-.	%
 
 	
 
r2   r,   z 
 	 c                    d}t        j                  |t              j                  d      d   }t	        |i       t	        |i       t	        |i       f}|j                  |      J y )Nr1  r"  r   rS  )r;   r,   r   r#  rE   s        r)   1but_not_when_the_text_is_empty_or_whitespace_onlyz@DescribeAnchor.but_not_when_the_text_is_empty_or_whitespace_only  sb    6	Y4::6B1EdB'T2)>DRT@UV##F+333r2   c                    d}t        j                  |t              j                  d      d   }t	        di       t	        di       f}|j                  |      J y )Nz<a>foobar</a>r"  r   rB  z	 you willrS  )r;   r   r#  rE   s       r)   'and_not_when_the_anchor_has_no_href_urlz6DescribeAnchor.and_not_when_the_anchor_has_no_href_url  sX    '	Y4::6B1Ek2.K0LM##F+333r2   N)r   r   r  r   r-   r  r  )rI   rJ   rK   rL   r%   r   r   r   r$  r'  r*  r/  r4  r7  r;  r=  r?  rC  rH  rK  rM  rP  rV  rX  rZ  rM   r2   r)   r  r    s    [[3 -b"57k-Y[>\=]^.[r5J4KL <&(4~EUDVW	 F&(4~EUDVW  b1
 \,9>58E+?*@*9):	  R($ _"9C58E+5,*9):	  %8D47cG	
JVFF(+F=NFWJVF
0?2

F

2


@	9K	,	,
@ [[Vb)_54 644r2   r  c                  "    e Zd ZdZd Zd Zd Zy)DescribeBoldzIsolated unit-test suite for `unstructured.partition.html.parser.Bold`.

    The `Bold` class is used for `<b>` and `<strong>` tags and adds emphasis metadata.
    c                    t        j                  dt              j                  d      d   }|j	                         }t        |      \  }}|dk(  sJ |dddk(  sJ y )Nz<b>rhombus</b>r   r   rhombusr   r    r   rx   r   ry   r   r9   r;   r   r   r,   
annotations        r)   0it_annotates_its_text_segment_with_bold_emphasisz=DescribeBold.it_annotates_its_text_segment_with_bold_emphasis  m    -{;AA&I!L,,..jy   (1$'
 
 	
 
r2   c                    t        j                  dt              j                  d      d   }|j	                         }t        |      \  }}|dk(  sJ |dddk(  sJ t        |      \  }}|dk(  sJ |dd	dk(  sJ y )
Nz<b>rhombus <i>pentagon</i></b>r   r   rhombus r^  r   r    pentagonr   r_  r`  s        r)   6and_its_children_are_also_annotated_with_bold_emphasiszCDescribeBold.and_its_children_are_also_annotated_with_bold_emphasis  s    ={KQQRXYZ[\,,..jz!!!(1$'
 
 	
 
  .jz!!!(2$(
 
 	
 
r2   c                    t        j                  dt              j                  d      d   }|j	                         }t        |      \  }}|dk(  sJ |dddk(  sJ t        |      \  }}|dk(  sJ |i k(  sJ y )Nz<b>rhombus</b> pentagonr   r   r^  r   r    	 pentagonr_  r`  s        r)   but_not_its_tailzDescribeBold.but_not_its_tail      6DJJ6RSTU,,..jy   (1$'
 
 	
 
  .j{"""Rr2   N)rI   rJ   rK   rL   rb  rg  rj  rM   r2   r)   r\  r\        



$ r2   r\  c                  "    e Zd ZdZd Zd Zd Zy)DescribeItaliczIsolated unit-test suite for `unstructured.partition.html.parser.Italic`.

    The `Italic` class is used for `<i>` and `<em>` tags and adds emphasis metadata.
    c                    t        j                  dt              j                  d      d   }|j	                         }t        |      \  }}|dk(  sJ |dddk(  sJ y )Nz<i>rhombus</i>.//ir   r^  r   r    r_  r;   r   r   r,   ra  s        r)   2it_annotates_its_text_segment_with_italic_emphasiszADescribeItalic.it_annotates_its_text_segment_with_italic_emphasis  rc  r2   c                    t        j                  dt              j                  d      d   }|j	                         }t        |      \  }}|dk(  sJ |dddk(  sJ t        |      \  }}|dk(  sJ |dd	dk(  sJ y )
Nz <em>rhombus <b>pentagon</b></em>z.//emr   re  r^  r   r    rf  r   r_  )r;   emr   r,   ra  s        r)   8and_its_children_are_also_annotated_with_italic_emphasiszGDescribeItalic.and_its_children_are_also_annotated_with_italic_emphasis  s    @+NTTU\]^_`--/.jz!!!(1$'
 
 	
 
  .jz!!!(2$(
 
 	
 
r2   c                    t        j                  dt              j                  d      d   }|j	                         }t        |      \  }}|dk(  sJ |dddk(  sJ t        |      \  }}|dk(  sJ |i k(  sJ y )Nz<i>rhombus</i> pentagonrp  r   r^  r   r    ri  r_  rq  s        r)   rj  zDescribeItalic.but_not_its_tail)  rk  r2   N)rI   rJ   rK   rL   rr  ru  rj  rM   r2   r)   rn  rn    rl  r2   rn  c                      e Zd ZdZd Zy)DescribeLineBreaka:  Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`.

    Used for `<br/>` elements, it's only special behavior is to add whitespace such that phrasing
    butted up tight on both sides of the `<br/>` element is not joined, like `abc<br/>def` should
    become "abc def", not "abcdef".
    c                   t        j                  dt              j                  d      d   }|j	                         }|D cg c]  }|j
                   }}|g dk(  sJ t        dj                  |            dk(  sJ y c c}w )Nz:<cite>spaceships of the<br/>Vogon Constructor Fleet</cite>r   r   )zspaceships of the
zVogon Constructor Fleetr   z)spaceships of the Vogon Constructor Fleet)r   rx   r   ry   r   r,   r   join)r;   r   r   tstextss        r)   it_adds_a_newline_in_its_placez0DescribeLineBreak.it_adds_a_newline_in_its_placeA  s    H+

%	
1 //1#01R11NNNNrwwu~.2]]]] 2s   A?N)rI   rJ   rK   rL   r~  rM   r2   r)   rx  rx  9  s    	^r2   rx  c                      e Zd ZdZd Zy)DescribeRemovedPhrasingzIsolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`.

    Used for phrasing elements like `<label>` that we want to skip, including any content they
    enclose. The tail of such an element is not skipped though.
    c                    t        j                  dt              j                  d      d   }t	        |j                               \  }t        |t              sJ |j                  du sJ |j                  dk(  sJ y )Nzh<div>
  <label>Space<p>is big</p>, <b>mind-bogglingly</b> big.</label>
  Like vastly, hugely big.
</div>z.//labelr   Tz
  Like vastly, hugely big.
)
r   rx   r   ry   r   r   r   r   r   r,   )r;   labeltext_segments      r)    it_behaves_like_an_empty_elementz8DescribeRemovedPhrasing.it_behaves_like_an_empty_elementT  s       
 %

A u779:%111  D(((  $DDDDr2   N)rI   rJ   rK   rL   r  rM   r2   r)   r  r  M  s    Er2   r  c                  (    e Zd ZdZd Zd Zd Zd Zy)DescribeDefaultElementa  Isolated unit-test suite for `unstructured.partition.html.parser.DefaultElement`.

    Used for any element we haven't assigned a custom element-class too. This prominently includes
    any non-HTML elements that can be embedded in the HTML.

    It identifies as a block item but it can behave as either a block-item or phrasing. Its behavior
    is a combination of RemovedBlock and RemovedPhrasing. Namely, it iterates zero elements and only
    iterates a text-segment for its tail.
    c                    t        j                  dt              j                  d      d   }t	        |t
              sJ |j                  du sJ y )Nz<foobar>Vogon</foobar>	.//foobarr   T)r   rx   r   ry   r   r   r   )r;   r"   s     r)   #it_identifies_as_a_phrasing_elementz:DescribeDefaultElement.it_identifies_as_a_phrasing_elementt  sJ    !!":KHNN{[\]^&.111!!T)))r2   c                    t        j                  dt              j                  d      d   }|j	                         }t        j                  t              5  t        |       ddd       y# 1 sw Y   yxY w)z0Should never be called but belts and suspenders.z@<foobar>Space<p>is big</p>, <b>mind-bogglingly</b> big.</foobar>r  r   N)	r   rx   r   ry   r   r%   r&   r8   r9   )r;   r"   r   s      r)   *it_generates_zero_elements_as_a_block_itemzADescribeDefaultElement.it_generates_zero_elements_as_a_block_item|  sc    !!N
 %
Q 
 '')]]=) 	N	 	 	s   A++A4c                    t        j                  dt              j                  d      d   }|j	                         D cg c]  }|j
                   }}|dgk(  sJ y c c}w )N<div>
  O Deep Thought computer, he said,
  <foobar>Vogon Constructor Fleet</foobar>
  The task we have designed you to perform is this.
  <p>We want you to tell us.... he paused,</p>
</div>r  r   z7
  The task we have designed you to perform is this.
  )r   rx   r   ry   r   r,   )r;   r"   r|  r}  s       r)   Jit_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasingzaDescribeDefaultElement.it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing  si    !! 
 %
Q  $*#<#<#>?R??TUUUU @s   Ac                    t        j                  dt              j                  d      d   }|j	                         D cg c]  }|j
                   }}|ddgk(  sJ y c c}w )Nr  r   r   zSO Deep Thought computer, he said, The task we have designed you to perform is this.z%We want you to tell us.... he paused,)r   rx   r   ry   r   r,   )r;   r   r}   r}  s       r)   Dand_it_behaves_like_an_empty_phrasing_element_inside_a_block_elementz[DescribeDefaultElement.and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element  sr     
 %/! "%!2!2!45A55a3
 
 	
 
 6s   AN)rI   rJ   rK   rL   r  r  r  r  rM   r2   r)   r  r  g  s    *
V
r2   r  )r,   r   r-   r   )-rL   
__future__r   collectionsr   r%   lxmlr   unstructured.documents.elementsr   r   r	   r
   r   r   "unstructured.partition.html.parserr   r   r   r   r   r   r   r   r   r   r   r   r*   r   r   r1   r4   rO   r   r   r   r   r   r  r\  rn  rx  r  r  rM   r2   r)   <module>r     s   B "    b b   (EF 33% %PF FD
 
.l/ l/^B9 B9JP P2T
 T
nN4 N4b
1  1 h1  1 h^ ^(E E4B
 B
r2   