
    :Qg                    X   d Z ddlmZ ddlZddlZddlmZ ddlZddlm	Z	 ddl
mZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$ dadZ%dadZ&d Z'd Z(dbdZ)d Z*ejV                  jY                  dddg      dcd       Z-ejV                  jY                  dg d      dcd       Z.ejV                  jY                  dddg      dcd       Z/ejV                  jY                  dg d      dcd       Z0ejV                  jY                  dddg      dcd       Z1ejV                  jY                  dg d      dcd       Z2d Z3d Z4dbdZ5dbd Z6dbd!Z7d" Z8d# Z9d$ Z:d% Z;d& Z<d' Z=d( Z>ejV                  jY                  d)d*d+g      ddd-       Z?d. Z@d/ ZAd0 ZBd1 ZCd2 ZDd3 ZEd4 ZFd5 ZGd6 ZHd7 ZId8 ZJd9 ZKd: ZLd; ZMd< ZNd= ZOd> ZPd? ZQejV                  jY                  d,g d@      dedA       ZRdB ZSdC ZTdD ZUdE ZVdF ZWdG ZXdH ZYdI ZZdJ Z[dK Z\dL Z]dfdMZ^dN Z_dO Z`dP ZaejV                  jY                  dQdRdSg      	 	 	 	 dgdT       ZbdbdUZcdV Zd edWX      gZe G dY dZ      Zfej                  dhd[       Zhej                  did\       Zi G d] d^      Zj G d_ d`      Zky)jz>Test suite for `unstructured.partition.html.partition` module.    )annotationsN)Any)etree)FixtureRequestMockassert_round_trips_through_JSONexample_doc_pathexample_doc_textfunction_mock)chunk_by_title)clean_extra_whitespace)AddressCompositeElementListItemNarrativeTextTable
TableChunkTextTitle)read_txt_filepartition_html)HtmlPartitionerOptions_HtmlPartitionerc           
     |   t        | dz        }t        |d      5 }|j                  d       d d d        t        |      }t	        |      dk(  sJ |t        d      t        d      t        d      t        d      t        d	      t        d
      t        d      gk(  sJ t        d |D              sJ y # 1 sw Y   xY w)Nsample-doc.htmlwa  <html>
  <body>
    <h1>A Great and Glorious Section</h1>
    <p>Dear Leader is the best. He is such a wonderful engineer!</p>
    <p></p>
    <p>Another Magnificent paragraph</p>
    <p><b>The prior element is a title based on its capitalization patterns!</b></p>
    <table>
      <tbody>
        <tr>
          <td><p>I'm in a table</p></td>
        </tr>
      </tbody>
    </table>
    <h2>A New Beginning</h2>
    <div>Here is the start of a new page.</div>
  </body>
</html>
   zA Great and Glorious Sectionz9Dear Leader is the best. He is such a wonderful engineer!zAnother Magnificent paragraphzBThe prior element is a title based on its capitalization patterns!zI'm in a tablezA New Beginningz Here is the start of a new page.c              3  N   K   | ]  }|j                   j                  d k(    yw)r   Nmetadatafilename.0es     l/var/www/html/answerous/venv/lib/python3.12/site-packages/test_unstructured/partition/html/test_partition.py	<genexpr>z:test_partition_html_accepts_a_file_path.<locals>.<genexpr>Q   s      JAqzz""&77J   #%)	stropenwriter   lenr   r   r   all)tmp_path	file_pathfelementss       r&   'test_partition_html_accepts_a_file_pathr2   -   s    H001I	i	 
		

, i(Hx=A,-QR-.Z[ 89    JJJJJE
 
s   B22B;c                    | dz  }|j                  t        d             |j                  d       t        t	        |j                                     }t        |      dkD  sJ y )Nzexample-10k-readonly.htmlexample-10k-1p.htmli$  )r"   r   )
write_textr
   chmodr   r)   resolver,   )r.   read_only_file_pathr1   s      r&   :test_user_without_file_write_permission_can_partition_htmlr9   T   s\    "%@@""#34I#JKe$s+>+F+F+H'IJHx=1    c                     t        t        d      d      5 } t        |       }d d d        t              dkD  sJ t	        d |D              sJ y # 1 sw Y   .xY w)Nr4   rbfiler   c              3  L   K   | ]  }|j                   j                  d u   y wNr    r#   s     r&   r'   zAtest_partition_html_accepts_a_file_like_object.<locals>.<genexpr>c   s     =qqzz""d*=   "$r*   r	   r   r,   r-   r0   r1   s     r&   .test_partition_html_accepts_a_file_like_objectrD   ^   s\    	45t	< *!q)* x=1=H====	* *s   AAc                 N    t        t        d            } t        |       dkD  sJ y )Nr4   textr   )r   r
   r,   r1   s    r&   'test_partition_html_accepts_an_html_strrI   f   s%    #34I#JKHx=1r:   c                    t        t        d      dddi      | _        t        d      }| j	                  di d	       t        |      d
kD  sJ y )Nr4      Content-Type	text/htmlrG   status_codeheadershttps://fake.urlurlTrP   verifyr   )FakeResponser
   return_valuer   assert_called_once_withr,   requests_get_r1   s     r&   5test_partition_html_accepts_a_url_to_an_HTML_documentr[   k   sY    !-34-"M "45H))*<bQU)Vx=1r:   c                 x    t        j                  t        d      5  t                d d d        y # 1 sw Y   y xY w)Nz6Exactly one of filename, file, text, or url must be spmatch)pytestraises
ValueErrorr    r:   r&   Ktest_partition_html_raises_when_no_path_or_file_or_text_or_url_is_specifiedrc   x   s,    	z)a	b   s   09r"   example-10k-utf-16.html&example-steelJIS-datasheet-utf-16.htmlc                    t        j                  t              5  t        t	        |       d      5 }t        |d       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr<   utf-8r>   encodingr_   r`   UnicodeDecodeErrorr*   r	   r   r"   r0   s     r&   Htest_partition_html_from_filename_raises_when_explicit_encoding_is_wrongrm      s\     
)	* 5"8,d3 	5qG4	55 5	5 	55 5s!   AAAA	AA$)rd   re   fake-html-lang-de.htmlc                     t        t                     }t        |      dkD  sJ t         fd|D              sJ  dk(  r|t        k(  sJ y y )Nr   c              3  P   K   | ]  }|j                   j                  k(    y wr@   r    )r$   r%   r"   s     r&   r'   zEtest_partition_html_from_filename_default_encoding.<locals>.<genexpr>   s      A1qzz""h.As   #&rn   )r   r	   r,   r-   EXPECTED_OUTPUT_LANGUAGE_DE)r"   r1   s   ` r&   2test_partition_html_from_filename_default_encodingrr      sX    
 .x89Hx=1AAAAA++6666 ,r:   c                   t        t        |       d      5 }t        j                  |j	                               }d d d        t        j                  t        d      5  t        d       d d d        y # 1 sw Y   ;xY w# 1 sw Y   y xY w)Nr<   ,'utf-8' codec can't decode byte 0xff in posir]   rg   rh   )	r*   r	   ioBytesIOreadr_   r`   rk   r   )r"   r0   r>   s      r&   3test_partition_html_from_file_raises_encoding_errorrx      su     
x($	/ $1zz!&&(#$ 
)1_	` 4D734 4$ $4 4s   $A4B 4A= B	c                    t        t        |       d      5 }t        |      }d d d        t              dkD  sJ | dk(  r|t        k(  sJ y y # 1 sw Y   +xY wNr<   r=   r   rn   r*   r	   r   r,   rq   r"   r0   r1   s      r&   .test_partition_html_from_file_default_encodingr}      f    
 
x($	/ *1!q)* x=1++6666 ,	* *   AAc                    t        j                  t        d      5  t        t	        |       d      5 }t        |d       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nrt   r]   r<   rg   rh   rj   rl   s     r&   6test_partition_html_from_file_rb_raises_encoding_errorr      s_     
)1_	` 5"8,d3 	5qG4	55 5	5 	55 5s"   AA AA	AA&c                    t        t        |       d      5 }t        |      }d d d        t              dkD  sJ | dk(  r|t        k(  sJ y y # 1 sw Y   +xY wrz   r{   r|   s      r&   1test_partition_html_from_file_rb_default_encodingr      r~   r   c                 H    d} t        |       }|d   j                  dk(  sJ y )Nu+   <html><div><p>每日新闻</p></div></html>rF   r   u   每日新闻)r   rG   	html_textr1   s     r&   /test_partition_html_processes_chinese_chractersr      s)    =I9-HA;~---r:   c                 :    t        d      t        d      gk(  sJ y )Nz3<html charset="utf-8"><p>Hello &#128512;</p></html>rF   u
   Hello 😀r   r   rb   r:   r&   'test_emoji_appears_with_emoji_utf8_coder      s(    TUlZ   r:   c                    t        t        d      dddi      | _        t        j                  t
        d      5  t        d	       d d d        y # 1 sw Y   y xY w)
Nr4   i  rL   rM   rN   z-Error status code on GET of provided URL: 500r]   rQ   rR   rV   r
   rW   r_   r`   ra   r   rZ   s    r&   Ctest_partition_html_from_url_raises_on_failure_response_status_coder      sR    !-34-"M 
z)X	Y /-./ / /   AAc                    t        t        d      dddi      | _        t        j                  t
        d      5  t        d	       d d d        y # 1 sw Y   y xY w)
Nr4   rK   rL   zapplication/jsonrN   z6Expected content type text/html. Got application/json.r]   rQ   rR   r   r   s    r&   Etest_partition_html_from_url_raises_on_response_of_wrong_content_typer      sS    !-34!34"M 
z)a	b /-./ / /r   c                x    t        ddddi      | _        t        dddi	       | j                  dddid
       y )NzS<html><head></head><body><p>What do I know? Who needs to know it?</p></body></html>rK   rL   rM   rN   zhttps://example.comz
User-Agenttest)rS   rP   TrT   )rV   rW   r   rX   r   s    r&   <test_partition_from_url_includes_provided_headers_in_requestr      sN    !-b-"M ,|V6LM))f'=d * r:   c                    t        t        d            } t        |       dk(  sJ | d   }|t        d      k(  sJ |j                  j
                  J |j                  j                  J |j                  j                  J y )Nzideas-page.html   r   a  January 2023 ( Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from.  The
answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge. Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.)r   r	   r,   r   r!   emphasized_text_contents	link_urlstext_as_htmlr1   r%   s     r&   !test_partition_html_on_ideas_pager      s    ./@ABHx=AA	$    ::..666::'''::""...r:   c                 T    t        t        d            } t        d | D              sJ y )Nzexample-with-scripts.htmlc              3  8   K   | ]  }d |j                   v  yw)z
function (NrF   r#   s     r&   r'   z?test_it_does_not_extract_text_in_script_tags.<locals>.<genexpr>  s     <a|166)<s   )r   r	   r-   rH   s    r&   ,test_it_does_not_extract_text_in_script_tagsr     s'    ./JKLH<8<<<<r:   c                 j    d} t        |       \  }t        |t              sJ |j                  dk(  sJ y )Nzc<html>
<body>
  <p><style> p { margin:0; padding:0; } </style>Lorem ipsum dolor</p>
</body>
</html>rF   zLorem ipsum dolor)r   
isinstancer   rG   r   elements     r&   +test_it_does_not_extract_text_in_style_tagsr     s=    	   Y/JWgt$$$<<....r:   c                     d} t        |       \  }t        |t              sJ |j                  dk(  sJ |j                  j
                  dk(  sJ y)z@Bare-bones means no `<thead>`, `<tbody>`, or `<tfoot>` elements.z<html>
<body>
  <table>
    <tr><td>Lorem</td><td>Ipsum</td></tr>
    <tr><td>Ut enim non</td><td>ad minim
veniam quis</td></tr>
  </table>
</body>
</html>rF   z,Lorem Ipsum Ut enim non ad minim
veniam quiszr<table><tr><td>Lorem</td><td>Ipsum</td></tr><tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr></table>N)r   r   r   rG   r!   r   r   s     r&   7test_it_can_parse_a_bare_bones_table_to_a_Table_elementr   .  s`    	   Y/JWgu%%%<<JJJJ((	  r:   c                 ~    d} t        |       \  }t        |t              sJ |j                  j                  dk(  sJ y)a	  Cells within a `table/thead` element are included in the text and html.

    The presence of a `<thead>` element in the original also determines whether a `<thead>` element
    appears in `.text_as_html` or whether the first row of cells is simply in the body.
    W  <html>
<body>
  <table>
    <thead>
      <tr><th>Lorem</th><th>Ipsum</th></tr>
    </thead>
    <tbody>
      <tr><th>Lorem ipsum</th><td>dolor sit amet nulla</td></tr>
      <tr><th>Ut enim non</th><td>ad minim
veniam quis</td></tr>
    </tbody>
    <tfoot>
      <tr><th>Dolor</th><td>Equis</td></tr>
    </tfoot>
  </table>
</body>
</html>rF   z<table><tr><td>Lorem</td><td>Ipsum</td></tr><tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr><tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr><tr><td>Dolor</td><td>Equis</td></tr></table>N)r   r   r   r!   r   r   s     r&   Ttest_it_accommodates_column_heading_cells_enclosed_in_thead_tbody_and_tfoot_elementsr   I  sM    	 &  Y/JWgu%%%((	  r:   c                 *    d} t        |       g k(  sJ y )Nzv<html>
<body>
  <table>
    <tr><td> </td><td> </td></tr>
    <tr><td> </td><td> </td></tr>
  </table>
</body>
</html>rF   r   )r   s    r&   >test_it_does_not_emit_a_Table_element_for_a_table_with_no_textr   o  s!    	  y)R///r:   c                     d} t        |       \  }|j                  j                  }|J t        j                  |t        j
                               }|J t        j                  |t              dk(  sJ y )Nr   rF   )ri   z<html><body><table><tr><td>Lorem</td><td>Ipsum</td></tr><tr><td>Lorem ipsum</td><td>dolor sit amet nulla</td></tr><tr><td>Ut enim non</td><td>ad minim<br/>veniam quis</td></tr><tr><td>Dolor</td><td>Equis</td></tr></table></body></html>)r   r!   r   r   
fromstring
HTMLParsertostringr)   )r   r   r   htmls       r&   /test_it_provides_parseable_HTML_in_text_as_htmlr   ~  s    	 &  Y/JW##00L###L%*:*:*<=D>>$-		 	 	r:   )tagexpected_text_as_html)thead:<table><tr><td>Header 1</td><td>Header 2</td></tr></table>)tfootr   r   c                f    t        d|  d|  d      }|d   j                  j                  |k(  sJ y )Nz<table>
  <z6>
    <tr><th>Header 1</th><th>Header 2</th></tr>
  </z
>
</table>rF   r   )r   r!   r   )r   r   r1   s      r&   .test_partition_html_parses_table_without_tbodyr     sM      % H A;,,0EEEEr:   c                 z    d} t        |       \  }|t        d      k(  sJ |j                  j                  dk(  sJ y )Nz<table>
 <tr>
  <td>
   <table>
     <tr><td>foo</td><td>bar</td></tr>
     <tr><td>baz</td><td>bng</td></tr>
   </table>
  </td>
  <td>
   <table>
     <tr><td>fizz</td><td>bang</td></tr>
   </table>
  </td>
 </tr>
</table>rF   zfoo bar baz bng fizz bangzB<table><tr><td>foo bar baz bng</td><td>fizz bang</td></tr></table>r   r   r!   r   r   s     r&   Ztest_partition_html_reduces_a_nested_table_to_its_text_placed_in_the_cell_that_contains_itr     sO    	 $  Y/JWe78888((L  r:   c                 z    d} t        |       \  }|t        d      k(  sJ |j                  j                  dk(  sJ y)z)Like this example from an SEC 10k filing.a  <table>
 <tr>
  <td></td>
  <td></td>
 </tr>
 <tr>
  <td>
   <p>
    <span>
     <ix:nonNumeric id="F_be4cc145-372a-4689-be60-d8a70b0c8b9a" contextRef="C_1de69f73-df01-4830-8af0-0f11b469bc4a" name="dei:DocumentAnnualReport" format="ixt-sec:boolballotbox">
     <span>&#9746;</span>
     </ix:nonNumeric>
    </span>
   </p>
  </td>
  <td>
   <p>
    <span>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</span>
   </p>
  </td>
 </tr>
</table>
rF   uX   ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934u   <table><tr><td/><td/></tr><tr><td>☒</td><td>ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</td></tr></table>Nr   r   s     r&   8test_partition_html_accommodates_tds_with_child_elementsr     s[    	 8  Y/JWeb    ((	  r:   c                 :   d} t        |       }t        |      dk(  sJ |d   }|t        d      k(  sJ |j                  j                  J |d   }|t        d      k(  sJ |j                  j                  dk(  sJ |d   }|t        d	      k(  sJ |j                  j                  dk(  sJ |d
   }|t        d      k(  sJ |j                  j                  dk(  sJ |d   }|t        d      k(  sJ |j                  j                  d
k(  sJ |d   }|t        d      k(  sJ |j                  j                  dk(  sJ |d   }|t        d      k(  sJ |j                  j                  dk(  sJ |d   }|t        d      k(  sJ |j                  j                  dk(  sJ y )Naa  <p>This is narrative text, it's long, flows and has meaning</p>
<h1>This heading is a title, even though it's long, flows and has meaning</h1>
<h2>A heading that is at the second level</h2>
<h3>Finally, the third heading</h3>
<h4>December 1-17, 2017</h4>
<h5>email@example.com</h5>
<h6>* bullet point</h6>
<h3><li>- invalidly nested list item</li></h3>
rF      r   z8This is narrative text, it's long, flows and has meaningr   zEThis heading is a title, even though it's long, flows and has meaning   z%A heading that is at the second level   zFinally, the third heading   zDecember 1-17, 2017   zemail@example.com   z* bullet pointr   z- invalidly nested list item)r   r,   r   r!   category_depthr   r   r   r1   r%   s      r&   Dtest_partition_html_recognizes_h1_to_h6_as_Title_with_category_depthr     s   	;  9-Hx=AAXYYYY::$$,,,A]^^^^::$$)))A=>>>>::$$)))A23333::$$)))A+,,,,::$$)))A)****::$$)))A&''''::$$)))A78888::$$)))r:   c                 n   t        t        d            } t        dt        |              t        |       dkD  sJ t	        | d   j
                        j                  d      sJ t        | d   t              sJ | d   j                  j                  dk(  sJ | d   j                  j                  dk(  sJ y )Nzfake-html-pre.htmzlen(elements)=r   z[107th Congress Public Law 56]rM   )r   r	   printr,   r   rG   
startswithr   r   r!   filetyper"   rH   s    r&   4test_partition_html_with_widely_encompassing_pre_tagr   .  s    ./BCDH	^S]
x=1!(1+"2"23>>?_```hqk=111A;((K777A;((,????r:   c                 v    t        d      t        d      t        d      t        d      t        d      gk(  sJ y )Nz<pre>The Big Brown Bear</pre>
<div>The big brown bear is growling.</div>
<pre>The big brown bear is sleeping.</pre>
<div>The Big Blue Bear</div>
rF   zThe Big Brown BearzThe big brown bear is growling.zThe big brown bear is sleeping.zThe Big Blue Bearr   r   r   rb   r:   r&   #test_pre_tag_parsing_respects_orderr   9  sI    - 	"#7878!"	
  r:   c                     d} t        |       }|t        d      t        d      t        d      t        d      gk(  sJ |d   }|j                  j                  g dk(  sJ |j                  j
                  g d	k(  sJ y )
Na*  <!DOCTYPE html>
<html>
<body>
<div>
  <h1>Header 1</h1>
  <p>Text</p>
  <h2>Header 2</h2>
  <pre>
    <b>Param1</b> = Y<br><b>Param2</b> = 1<br><b>Param3</b> = 2<br><b>Param4</b> = A
    <br><b>Param5</b> = A,B,C,D,E<br><b>Param6</b> = 7<br><b>Param7</b> = Five<br>
  </pre>
</div>
</body>
</html>
rF   zHeader 1r   zHeader 2zd    Param1 = Y
Param2 = 1
Param3 = 2
Param4 = A
    
Param5 = A,B,C,D,E
Param6 = 7
Param7 = Five

  r   )Param1Param2Param3Param4Param5Param6Param7)br   r   r   r   r   r   )r   r   r   r!   r   emphasized_text_tagsr   s      r&   "test_partition_html_br_tag_parsingr   I  s    	 " 9-Hjfj.	
	    	A::.. 3    ::**.QQQQr:   c                 j    d} t        |       }|t        d      t        d      t        d      gk(  sJ y )Nz`<html>
<body>
<div>
    Head
    <div><span>Nested</span></div>
    Tail
</div>
</body>
</html>
rF   HeadNestedTailr   r   s     r&   $test_partition_html_tag_tail_parsingr   t  s9    	  9-HfuXfFFFFr:   c                 &    t        d      g k(  sJ y )N rF   r   rb   r:   r&   5test_partition_html_from_text_works_with_empty_stringr     s    r"b(((r:   c                     d} t        |       }|t        d      t        d      t        d      gk(  sJ |d   j                  j                  dgk(  sJ |d   j                  j                  dgk(  sJ y )	Na  
    <div>
      We start out normally
      <cite>
        and then add a citation
        <p>But whoa, this is a paragraph inside a phrasing element.</p>
        so we close the first element at the start of the block element and emit it, then we
        <b>emit</b> the block element,
      </cite>
      and then start a new element for the tail and whatever phrasing follows it.
    </div>
    rF   z-We start out normally and then add a citationz8But whoa, this is a paragraph inside a phrasing element.zso we close the first element at the start of the block element and emit it, then we emit the block element, and then start a new element for the tail and whatever phrasing follows it.r   emitr   )r   r   r!   r   r   r   s     r&   Jtest_partition_html_accommodates_block_item_nested_inside_phrasing_elementr     s    I 9-HEFPQ[	
    A;88VHDDDA;44===r:   c                 B   d} t        |       }|D cg c]  }|j                   c}g dk(  sJ |d   }|j                  j                  dgk(  sJ |j                  j                  dgk(  sJ t        d |dd  D              sJ t        d	 |dd  D              sJ y c c}w )
Nz
    <div>
      O Deep Thought
      <a href="http://eie.io">
        computer, he said,
        <p>The task we have designed you to perform is this.</p>
        We want you to tell us....
      </a>
      he paused,
    </div>
    rF   )z!O Deep Thought computer, he said,z1The task we have designed you to perform is this.z%We want you to tell us.... he paused,r   zcomputer, he said,zhttp://eie.ioc              3  L   K   | ]  }|j                   j                  d u   y wr@   )r!   
link_textsr#   s     r&   r'   zLtest_partition_html_handles_anchor_with_nested_block_item.<locals>.<genexpr>  s     Cqzz$$,CrA   r   c              3  L   K   | ]  }|j                   j                  d u   y wr@   )r!   r   r#   s     r&   r'   zLtest_partition_html_handles_anchor_with_nested_block_item.<locals>.<genexpr>  s     Bqzz##t+BrA   )r   rG   r!   r   r   r-   )r   r1   r%   link_annotated_elements       r&   9test_partition_html_handles_anchor_with_nested_block_itemr     s    
I 9-H$%qAFF% *   
 &a[!**55:N9OOOO!**448IIIIChqrlCCCCBXab\BBBB &s   Bc                     d} t        |       }|t        d      t        d      t        d      t        d      t        d      t        d      t        d	      t	        d
      t        d      g	k(  sJ y )Na  <div dir=3D"ltr">Hi All,
  <div><br></div>
  <div>Get excited for our first annual family day!</div>
  <div>Best.<br clear="all">
    <div><br></div>
    -- <br>
    <div dir=3D"ltr">
      <div dir=3D"ltr">Dino the Datasaur<div>
      Unstructured Technologies<br>
      <div>Data Scientist</div>
      <div>Doylestown, PA 18901</div>
      <div><br></div>
    </div>
  </div>
  See you there!
</div>
rF   zHi All,z,Get excited for our first annual family day!zBest.z--zDino the DatasaurzUnstructured TechnologieszData ScientistzDoylestown, PA 18901zSee you there!)r   r   r   r   r   r   s     r&   'test_containers_with_text_are_processedr     sz    	 & 9-HYDEgT
!")*&'&'
 
 
 
r:   c                 V    d} t        |       }|t        d      t        d      gk(  sJ y )Nz<html>
  <body>
    <ol>
      <li>Happy Groundhog's day!</li>
      <li>Looks like six more weeks of winter ...</li>
    </ol>
  </body>
</html>
rF   Happy Groundhog's day!'Looks like six more weeks of winter ...r   r   r   s     r&   %test_html_grabs_bulleted_text_in_tagsr     sA    	  9-H)*:;   r:   c                 V    d} t        |       }|t        d      t        d      gk(  sJ y )Nz<html>
  <body>
    <p>
      <span>&#8226; Happy Groundhog's day!</span>
    </p>
    <p>
      <span>&#8226; Looks like six more weeks of winter ...</span>
    </p>
  </body>
</html>
rF   r   r   r   r   s     r&   &test_html_grabs_bulleted_text_in_parasr     sC    		  9-H )*:;   r:   c                 >    t        d      } | t        d      gk(  sJ y )Nz&<p>Hello again peet mag<i>ic</i>al</p>rF   zHello again peet magicalr   rH   s    r&   test_joins_tag_text_correctlyr     s$    #KLH89::::r:   c                 >    t        d      } | t        d      gk(  sJ y )Nu8   <html charset="unicode">
<p>Hello again 😀</p>
</html>rF   u   Hello again 😀)r   r   rH   s    r&   test_sample_doc_with_emojir     s$    #_`H&89::::r:   c                 >    t        d      } | t        d      gk(  sJ y )Nz<body>Hello</body>rF   Hellor   rH   s    r&   &test_only_text_and_no_elements_in_bodyr     s"    #78Hg''''r:   c                 R    t        d      } | t        d      t        d      gk(  sJ y )Nz<body>Hello<p>World</p></body>rF   r   Worldr   rH   s    r&   !test_text_before_elements_in_bodyr   "  s(    #CDHgg7777r:   c                 >    t        d      } | t        d      gk(  sJ y )Nz<div>Hello<br/>World</div>rF   zHello Worldr   rH   s    r&   test_line_break_in_containerr   '  s#    #?@Hm,----r:   )delformnoscriptc                <    d|  d|  d}t        |      }|g k(  sJ y )Nz
<body>
  <z#>
    There is some text here.
  </z
>
</body>
rF   r   )r   r   r1   s      r&   test_exclude_tag_typesr  ,  s-    cU"GuLYI9-Hr>>r:   c                     t        d      } t        | d      }t        t        |             }t        d |D              sJ ||k(  sJ y )Nr4   by_title)chunking_strategyc              3  R   K   | ]  }t        |t        t        t        f       ! y wr@   )r   r   r   r   )r$   cs     r&   r'   zCtest_partition_html_can_chunk_while_partitioning.<locals>.<genexpr>@  s     Tz!.zBCTs   %')r	   r   r   r-   )r/   chunkschunks_2s      r&   0test_partition_html_can_chunk_while_partitioningr  <  sJ     !67IIDFnY78HTVTTTTXr:   c                 P    t        dd      t        d      t        d      gk(  sJ y )Nz<html>
  <header>
    <p>Header</p>
  </header>
  <body>
    <h1>My First Heading</h1>
    <p>It was a dark and stormy night. No one was around.</p>
  </body>
  <footer>
    <p>Footer</p>
  </footer>
</html>
T)rG   skip_headers_and_footerszMy First Headingz2It was a dark and stormy night. No one was around.r   rb   r:   r&   0test_partition_html_can_skip_headers_and_footersr  G  s<     "&" 	 !JK
!  r:   c                     t        t        d            D  cg c]  } | j                   }} t        |      t        t	        |            k(  sJ y c c} w )Nz&fake-html-with-duplicate-elements.html)r   r	   idr,   set)r%   idss     r&   test_all_element_ids_are_uniquer  a  sE    '(89a(bc
dA144
dC
ds8s3s8}$$$ es   Ac                     t        d      D  cg c]  } | j                   }} t        d      D  cg c]  } | j                   }} ||k(  sJ y c c} w c c} w )Nz3example-docs/fake-html-with-duplicate-elements.html)r   r  )r%   r  ids_2s      r&   "test_element_ids_are_deterministicr  f  sQ    '(]^
_A144
_C
_)*_`aaQTTaEa%<< `as
   AAc                    t        d      } t        |       dk(  sJ | d   }t        |t              sJ |j                  dk(  sJ |j
                  j                  J |j
                  j                  J | d   }t        |t              sJ |j                  dk(  sJ |j
                  j                  dk(  sJ |j
                  j                  J | d   }t        |t              sJ |j                  d	k(  sJ |j
                  j                  J |j
                  j                  | d   j                  k(  sJ | d
   }t        |t              sJ |j                  dk(  sJ |j
                  j                  dk(  sJ |j
                  j                  | d   j                  k(  sJ | d   }t        |t              sJ |j                  dk(  sJ |j
                  j                  dk(  sJ |j
                  j                  | d   j                  k(  sJ | d   }t        |t              sJ |j                  dk(  sJ |j
                  j                  J |j
                  j                  | d   j                  k(  sJ y )Na  <html>
  <p>Preamble gets no category_depth or parent_id</p>
  <h1>Heading gets category_depth but no parent_id</h1>
  <p>Body paragraph gets parent_id but no category_depth</p>
  <ul>
    <li>List item gets category_depth and parent_id</li>
    <li>Second list item gets category_depth and parent_id</li>
  </ul>
  <p>Body paragraph after list gets parent_id but no category_depth</p>
</html>
rF   r   r   z,Preamble gets no category_depth or parent_idr   z,Heading gets category_depth but no parent_idr   z3Body paragraph gets parent_id but no category_depthr   z+List item gets category_depth and parent_idr   z2Second list item gets category_depth and parent_idr   z>Body paragraph after list gets parent_id but no category_depth)r   r,   r   r   rG   r!   r   	parent_idr   r  r   r   s     r&   .test_partition_html_records_hierarchy_metadatar  s  sc   	H x=AAa'''66CCCC::$$,,,::'''Aa66CCCC::$$)))::'''Aa'''66JJJJ::$$,,,::8A;>>111Aa"""66BBBB::$$)))::8A;>>111Aa"""66IIII::$$)))::8A;>>111Aa'''66UUUU::$$,,,::8A;>>111r:   c                    t        d      } | d   }|t        d      k(  sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ | d   }|t        d      k(  sJ |j                  j                  g d	k(  sJ |j                  j                  g d
k(  sJ | d   }|t        d      k(  sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ | d   }|t        d      k(  sJ |j                  j                  J |j                  j                  J | d   }|t        d      k(  sJ |j                  j                  J |j                  j                  J y )Nz<html>
  <p>Hello there I am a very <strong>important</strong> text!</p>
  <p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>
  <ul>
    <li><em>Parrots</em></li>
    <li>Dogs</li>
  </ul>
  <span>A lone span text!</span>
</html>
rF   r   z'Hello there I am a very important text!	importantr   r   $Here is a list of my favorite things)myfavoritethings)r   bir   r   Parrotsir   Dogsr   zA lone span text!)r   r   r!   r   r   r   r   r   s     r&   *test_partition_html_grabs_emphasized_textsr%    s   H 	AGHHHH::..;-???::**se333ADEEEE::..2NNNN::**.>>>>A####::..9+===::**se333A    ::..666::**222A)****::..666::**222r:   c                     t        t        d            } t        |       dkD  sJ t        d | D              sJ t        d | D              sJ y )Nr4   r   c              3  N   K   | ]  }|j                   j                  d k(    yw)r4   Nr    r#   s     r&   r'   zatest_partition_html_from_filename_uses_source_filename_for_metadata_by_default.<locals>.<genexpr>  s      Nqzz""&;;Nr(   c              3  `   K   | ]&  }|j                   j                  t        d       k(   ( yw)r   N)r!   file_directoryr	   r#   s     r&   r'   zatest_partition_html_from_filename_uses_source_filename_for_metadata_by_default.<locals>.<genexpr>  s%     SQqzz((,<R,@@Ss   ,.r   r	   r,   r-   rH   s    r&   Ntest_partition_html_from_filename_uses_source_filename_for_metadata_by_defaultr+    sM    ./DEFHx=1NXNNNNS(SSSSr:   c                 x    t        t        d      d      } t        |       dkD  sJ t        d | D              sJ y )Nr4   r   )metadata_filenamer   c              3  N   K   | ]  }|j                   j                  d k(    ywr   Nr    )r$   r   s     r&   r'   zNtest_partition_html_from_filename_prefers_metadata_filename.<locals>.<genexpr>  s!     Kww((F2Kr(   r*  rH   s    r&   ;test_partition_html_from_filename_prefers_metadata_filenamer0    s<    ./DEY_`Hx=1K(KKKKr:   c                     t        t        d      d      5 } t        | d      }d d d        t              dkD  sJ t	        d |D              sJ y # 1 sw Y   .xY w)Nr4   r<   r   )r>   r-  r   c              3  N   K   | ]  }|j                   j                  d k(    ywr/  r    r#   s     r&   r'   zJtest_partition_html_from_file_prefers_metadata_filename.<locals>.<genexpr>  s     ?qzz""f,?r(   rB   rC   s     r&   7test_partition_html_from_file_prefers_metadata_filenamer3    sb    	45t	< D!qFCD x=1?h????	D Ds   AAc                 j    t        t        d            } | d   j                  j                  dgk(  sJ y )Nr4   r   engr   r	   r!   	languagesrH   s    r&   2test_partition_html_element_metadata_has_languagesr8    s3    ./DEFHA;))eW444r:   c                     t        t        d      d      } | D cg c]  }|j                  j                   c}dgddgdgdgdggk(  sJ y c c}w )Nzlanguage-docs/eng_spa_mult.htmlT)detect_language_per_elementr5  spar6  r   s     r&   8test_partition_html_respects_detect_language_per_elementr<    sd    :;Y]H +33QAJJ  3					8   3s   Ac                    t        | dd      }t        d      }t        |      }|j                  |       |sJ t	        d |D              sJ y )N<unstructured.partition.html.partition.get_last_modified_date2024-06-17T22:22:20)rW   fake-html.htmlc              3  N   K   | ]  }|j                   j                  d k(    yw)r?  Nr!   last_modifiedr#   s     r&   r'   zXtest_partition_html_from_filename_pulls_last_modified_from_filesystem.<locals>.<genexpr>
        SQqzz''+@@Sr(   )r   r	   r   rX   r-   )requestget_last_modified_date_r/   r1   s       r&   Etest_partition_html_from_filename_pulls_last_modified_from_filesystemrG    sV    +F*
 !!12Ii(H33I>O8S(SSSSr:   c                     t        t        d      d      } t        | d   t              sJ t	        d | D              sJ y )Nr@  2023-07-05T09:24:28)metadata_last_modifiedr   c              3  N   K   | ]  }|j                   j                  d k(    yw)rI  NrB  r#   s     r&   r'   zStest_partition_html_from_filename_prefers_metadata_last_modified.<locals>.<genexpr>  rD  r(   )r   r	   r   r   r-   rH   s    r&   @test_partition_html_from_filename_prefers_metadata_last_modifiedrL    sB    )*CXH hqk5)))S(SSSSr:   c                    d} t        |       }|d   }|t        d      k(  sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ |d   }|t        d      k(  sJ |j                  j                  J |j                  j                  J |d	   }|t        d
      k(  sJ |j                  j                  dgk(  sJ |j                  j                  d
gk(  sJ |d   }|t        d      k(  sJ |j                  j                  J |j                  j                  J |d   }|t        d      k(  sJ |j                  j                  dgk(  sJ |j                  j                  dgk(  sJ y )Na  <html>
  <p>Hello there I am a <a href="/link">very important link!</a></p>
  <p>Here is a list of my favorite things</p>
  <ul>
    <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>
    <li>Dogs</li>
  </ul>
  <a href="/loner">A lone link!</a>
</html>
rF   r   z'Hello there I am a very important link!/linkzvery important link!r   r  r   r"  z$https://en.wikipedia.org/wiki/Parrotr   r$  r   A lone link!/loner)r   r   r!   r   r   r   r   r   s      r&   test_partition_html_grabs_linksrQ    s   	  9-HAGHHHH::G9,,,::  %;$<<<<ADEEEE::'''::  (((A####::$J#KKKK::  YK///A    ::'''::  (((An%%%%::H:---::  ^$4444r:   c                    d} t        |       }|d   }|j                  j                  dgk(  sJ |j                  j                  dgk(  sJ |d   }|j                  j                  dgk(  sJ |j                  j                  dgk(  sJ |d	   }|j                  j                  dgk(  sJ |j                  j                  dgk(  sJ |d
   }|j                  j                  ddgk(  sJ |j                  j                  ddgk(  sJ y )Nz<html>
  <a href="/loner">A lone link!</a>
  <p>Hello <a href="/link">link!</a></p>
  <p>
   Hello <a href="/link">link!</a></p>
  <p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>
</html>
rF   r   rO  rP  r   zlink!rN  r   r   r"  r$  z/wiki/parrotsz
/wiki/dogs)r   r!   r   r   r   s      r&   test_partition_html_linksrS  >  s   	  9-HA::  ^$4444::H:---A::  WI---::G9,,,A::  WI---::G9,,,A::  Y$7777::O\#BBBBr:   )r   expected_value)z:<table><tr><th>Header 1</th><th>Header 2</th></tr></table>r   )zd<table><tr><td>Dimensions</td><td>Weight</td></tr><tr><td>4'-6" x 1'</td><td>18 kg</td></tr></table>zs<table><tr><td>Dimensions</td><td>Weight</td></tr><tr><td>4&#x27;-6&quot; x 1&#x27;</td><td>18 kg</td></tr></table>c                x    t        |       }t        |      dk(  sJ |d   j                  j                  |k(  sJ y )NrF   r   r   )r   r,   r!   r   )r   rT  r1   s      r&   <test_partition_html_applies_text_as_html_metadata_for_tablesrV  [  s>    . 9-Hx=AA;,,>>>r:   c                    t        t        d      dddi      | _        t        d      }| j	                  di d	       t        |      d
kD  sJ t        d |D              sJ y )Nr4   rK   rL   rM   rN   https://trusttheforceluke.comrR   TrT   r   c              3  N   K   | ]  }|j                   j                  d k(    yw)rX  N)r!   rS   r#   s     r&   r'   zDtest_partition_html_from_url_adds_url_to_metadata.<locals>.<genexpr>  s     SQqzz~~!@@Sr(   )rV   r
   rW   r   rX   r,   r-   rY   s     r&   1test_partition_html_from_url_adds_url_to_metadatarZ  {  sn    !-34-"M "ABH))*ISU^b)cx=1S(SSSSr:   c                 B    t        t        d            } t        |        y )Nr4   )r   r	   r   rH   s    r&   ,test_partition_html_round_trips_through_jsonr\    s    ./DEFH#H-r:   uD   Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020rF   c                      e Zd Zi fddZy)rV   c                B    || _         || _        |dk  | _        || _        y )Ni,  )rG   rO   okrP   )selfrG   rO   rP   s       r&   __init__zFakeResponse.__init__  s$    	&#r:   N)rG   r)   rO   intrP   zdict[str, str])__name__
__module____qualname__ra  rb   r:   r&   rV   rV     s
    NP r:   rV   c            
         dddddi dddd	S )zAll default arguments for `HtmlPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.
    NTF)	r>   r/   rG   ri   rS   rP   
ssl_verifyr  detection_originrb   rb   r:   r&   	opts_argsri    s)     $) 
 
r:   c                    t        | d      S )Nz2unstructured.partition.html.partition.requests.getr   )rE  s    r&   rZ   rZ     s    "VWWr:   c                     e Zd ZdZej
                  j                  dddg      	 	 	 	 dd       Zej
                  j                  dddg      	 	 	 	 dd       Zdd	Z		 	 dd
Z
	 	 ddZ	 	 	 	 ddZ	 	 ddZ	 	 	 	 ddZ	 	 ddZej
                  j                  dddg      	 	 	 	 dd       Z ej"                         dd       Zy)DescribeHtmlPartitionerOptionszSUnit-test suite for `unstructured.partition.html.partition.HtmlPartitionerOptions`.rh  r   Nc                F    ||d<   t        di |}|j                  |k(  sJ y )Nrh  rb   )r   rh  )r`  rh  ri  optss       r&   -it_knows_the_caller_provided_detection_originzLDescribeHtmlPartitionerOptions.it_knows_the_caller_provided_detection_origin  s1     )9	$%%2	2$$(8888r:   ri   rg   c                F    ||d<   t        di |}|j                  |k(  sJ y )Nri   rb   )r   ri   )r`  ri   ri  ro  s       r&   %it_knows_the_caller_provided_encodingzDDescribeHtmlPartitionerOptions.it_knows_the_caller_provided_encoding  s-     !)	*%2	2}}(((r:   c                    t        d      }||d<   t        di |}|j                  }t        |t              sJ |t        |      d   k(  sJ y )Nr4   r/   r   rb   )r	   r   r   r   r)   r   )r`  ri  r/   ro  r   s        r&   8it_gets_the_HTML_from_the_file_path_when_one_is_providedzWDescribeHtmlPartitionerOptions.it_gets_the_HTML_from_the_file_path_when_one_is_provided  sU    $%:;	!*	+%2	2NN	)S)))M)4Q7777r:   c                $   t        d      }t        |d      5 }t        j                  |j	                               }d d d        |d<   t        di |}|j                  }t        |t              sJ |t        |      d   k(  sJ y # 1 sw Y   KxY w)Nr4   r<   r>   r   rb   )
r	   r*   ru   rv   rw   r   r   r   r)   r   )r`  ri  r/   r0   r>   ro  r   s          r&   Cand_it_gets_the_HTML_from_the_file_like_object_when_one_is_providedzbDescribeHtmlPartitionerOptions.and_it_gets_the_HTML_from_the_file_like_object_when_one_is_provided  s     %%:;	)T" 	(a::affh'D	( 	&%2	2NN	)S)))M)4Q7777	( 	(s   $BBc                F    d|d<   t        di |}|j                  dk(  sJ y )Nz-<html><body><p>Hello World!</p></body></html>rG   rb   )r   r   r`  ri  ro  s      r&   ?and_it_uses_the_HTML_in_the_text_argument_when_that_is_providedz^DescribeHtmlPartitionerOptions.and_it_uses_the_HTML_in_the_text_argument_when_that_is_provided  s/     L	&%2	2~~!PPPPr:   c                p    t        ddddi      |_        d|d<   t        di |}|j                  dk(  sJ y )	Nz?<html><body><p>I just flew over the internet!</p></body></html>rK   rL   rM   rN   zhttps://insta.tweet.face.orgrS   rb   )rV   rW   r   r   )r`  rZ   ri  ro  s       r&   6and_it_gets_the_HTML_from_the_url_when_one_is_providedzUDescribeHtmlPartitionerOptions.and_it_gets_the_HTML_from_the_url_when_one_is_provided   sK     &2R#[1&
"
 :	%%2	2~~!bbbbr:   c                    t        di |}t        j                  t        d      5  |j                   d d d        y # 1 sw Y   y xY w)Nz3Exactly one of filename, file, text, or url must ber]   rb   )r   r_   r`   ra   r   rx  s      r&   >but_it_raises_when_no_path_or_file_or_text_or_url_was_providedz]DescribeHtmlPartitionerOptions.but_it_raises_when_no_path_or_file_or_text_or_url_was_provided  s<     &2	2]]:-bc 	NN	 	 	s	   =Ac                z    d|d<   d|_         t        di |}|j                  }|j                  d       |dk(  sJ y )Nza/b/document.htmlr/   z2024-04-02T20:32:35rb   )rW   r   rC  rX   )r`  ri  rF  ro  rC  s        r&   Dit_gets_last_modified_from_the_filesystem_when_file_path_is_providedzcDescribeHtmlPartitionerOptions.it_gets_last_modified_from_the_filesystem_when_file_path_is_provided  sM     "5	+/D,%2	2**778KL 5555r:   c                n    t        j                  d      }||d<   t        di |}|j                  }|J y )Ns   abcdefgr>   rb   )ru   rv   r   rC  )r`  ri  r>   ro  rC  s        r&   Rbut_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_providedzqDescribeHtmlPartitionerOptions.but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided#  s@     zz*% 	&%2	2**$$$r:   r  TFc                D    ||d<   t        di |}|j                  |u sJ y )Nr  rb   )r   r  )r`  r  ri  ro  s       r&   =it_knows_the_caller_provided_skip_headers_and_footers_settingz\DescribeHtmlPartitionerOptions.it_knows_the_caller_provided_skip_headers_and_footers_setting0  s2     1I	,-%2	2,,0HHHHr:   c                    t        |d      S )Nr>  rk  )r`  rE  s     r&   rF  z6DescribeHtmlPartitionerOptions.get_last_modified_date_;  s    S
 	
r:   )rh  
str | Noneri  dict[str, Any])ri   r  ri  r  ri  r  )rZ   r   ri  r  )ri  r  rF  r   )r  boolri  r  )rE  r   returnr   )rc  rd  re  __doc__r_   markparametrizerp  rr  rt  rv  ry  r{  r}  r  r  r  fixturerF  rb   r:   r&   rm  rm    s=   ] [[/&$@9 *97E9 A9 [[Z'49)")/=) :)88'8Q'Qc!c.<c'
6'
6BF
6	%'	% [[7$GI(,I9GI HI V^^
 
r:   rm  c                  X    e Zd ZdZd
dZd
dZ	 	 d
dZ	 	 d
dZ	 	 d
dZd
dZ		 	 d
dZ
y	)Describe_HtmlPartitionerzMUnit-test suite for `unstructured.partition.html.partition._HtmlPartitioner`.c                p    d|d<   t        di |}t        |      }|j                  j                  dk(  sJ y )Nz<body>
  <header></header>
  <p>Lots preamble stuff yada yada yada</p>
  <main>
    <h2>A Wonderful Section!</h2>
    <p>Look at this amazing section!</p>
  </main>
</body>
rG   mainrb   r   r   _mainr   r`  ri  ro  partitioners       r&   ,it_can_find_the_main_element_in_the_documentzEDescribe_HtmlPartitioner.it_can_find_the_main_element_in_the_documentG  sD     	& &2	2&t,  $$...r:   c                p    d|d<   t        di |}t        |      }|j                  j                  dk(  sJ y)zQAnd there is always a <body>, the parser adds one if there's not one in the HTML.z<body>
  <header></header>
  <p>Lots preamble stuff yada yada yada</p>
  <h2>A Wonderful Section!</h2>
  <p>Look at this amazing section!</p>
</body>
rG   bodyNrb   r  r  s       r&   3and_it_falls_back_to_the_body_when_there_is_no_mainzLDescribe_HtmlPartitioner.and_it_falls_back_to_the_body_when_there_is_no_mainX  sD     	& &2	2&t,  $$...r:   c                    d|d<   t        di |}t        t        j                  |            \  }|t	        d      k(  sJ y )Nz<p>NO PARTICULAR TYPE.</p>rG   zNO PARTICULAR TYPE.rb   )r   listr   iter_elementsr   r`  ri  ro  r   s       r&   Mit_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_titlezfDescribe_HtmlPartitioner.it_produces_a_Text_element_when_the_tag_contents_are_not_narrative_or_a_titlej  sG     9	&%2	2*88>?
$45555r:   c                    d|d<   t        di |}t        t        j                  |            \  }|t	        d      k(  sJ y )Nu   <p>● An excellent point!</p>rG   zAn excellent point!rb   )r   r  r   r  r   r  s       r&   Wit_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_characterzpDescribe_HtmlPartitioner.it_produces_a_ListItem_element_when_the_tag_contains_are_preceded_by_a_bullet_charactert  sG     =	&%2	2*88>?
(#89999r:   c                j    d|d<   t        di |}t        t        j                  |            g k(  sJ y )Nu
   <p>●</p>rG   rb   r   r  r   r  rx  s      r&   Abut_not_when_the_tag_contains_only_a_bullet_character_and_no_textzZDescribe_HtmlPartitioner.but_not_when_the_tag_contains_only_a_bullet_character_and_no_text~  s:     )	&%2	2$22489R???r:   c                j    d|d<   t        di |}t        t        j                  |            g k(  sJ y )Nz<p></p>rG   rb   r  rx  s      r&   2it_produces_no_element_when_the_tag_has_no_contentzKDescribe_HtmlPartitioner.it_produces_no_element_when_the_tag_has_no_content  s8    %	&%2	2$22489R???r:   c                j    d|d<   t        di |}t        t        j                  |            g k(  sJ y )Nz<p>$</p>rG   rb   r  rx  s      r&   <and_it_produces_no_element_when_the_tag_contains_only_a_stubzUDescribe_HtmlPartitioner.and_it_produces_no_element_when_the_tag_contains_only_a_stub  s:     '	&%2	2$22489R???r:   Nr  )rc  rd  re  r  r  r  r  r  r  r  r  rb   r:   r&   r  r  B  sM    W/"/$6'6:':@'@@@'@r:   r  )r.   zpathlib.Path)rZ   r   )r"   r)   )r   r)   r   r)   )r   r)   )rE  r   )r   r)   rT  r)   )r  r  )rE  zpytest.FixtureRequest)lr  
__future__r   ru   pathlibtypingr   r_   lxmlr   test_unstructured.unit_utilsr   r   r   r	   r
   r   unstructured.chunking.titler   unstructured.cleaners.corer   unstructured.documents.elementsr   r   r   r   r   r   r   r    unstructured.file_utils.encodingr   unstructured.partition.htmlr   %unstructured.partition.html.partitionr   r   r2   r9   rD   rI   r[   rc   r  r  rm   rr   rx   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r%  r+  r0  r3  r8  r<  rG  rL  rQ  rS  rV  rZ  r\  rq   rV   r  ri  rZ   rm  r  rb   r:   r&   <module>r     sx   E " 	      7 =	 	 	 ; 6 Z$KN>

 *,TU55 c7	7 *,TU44 c7	7 *,TU55 c7	7.//&/6=
/$6#L0&R $OO
F
F6)^&*R@ (RVG*)>:C6 F(.;
;
(
8
.
  ;< =4%
.2h!3NTL@5
"TT"5JC: #	

	
(??$'?)(?T&. 
UV 
   & X Xz
 z
zP@ P@r:   