
    :Qg                        d Z ddlmZ ddlZddlmZmZmZmZ ddl	m
Z
 ddlmZ ddlmZ erddlmZ dd	Z G d
 d      Z G d d      Z G d d      Zy)zProvides operations related to the HTML table stored in `.metadata.text_as_html`.

Used during partitioning as well as chunking.
    )annotationsN)TYPE_CHECKINGIteratorSequencecast)etree)fragment_fromstring)lazyproperty)HtmlElementc                V    dfd}dd| rddj                   ||              dS dS )a  Form an HTML table from "rows" and "columns" of `matrix`.

    Character overhead is minimized:
    - No whitespace padding is added for human readability
    - No newlines ("
") are added
    - No `<thead>`, `<tbody>`, or `<tfoot>` elements are used; we can't tell where those might be
      semantically appropriate anyway so at best they would consume unnecessary space and at worst
      would be misleading.
    c              3  \   K   | D ]"  }|sddj                   |             d $ y w)Nz<tr> z</tr>join)rows_of_cell_strsrow_cell_strsiter_tdss     [/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/common/html_table.pyiter_trsz.htmlify_matrix_of_cell_texts.<locals>.iter_trs   s<     . 	AM -!89:%@@		As   ),c              3     K   | D ]b  }t        j                  |      }dj                  |j                  d            }dj                  |j                               }|rd| dnd d y w)Nz<br/>
 z<td>z</td><td/>)htmlescaper   split)r   s	cell_texts      r   r   z.htmlify_matrix_of_cell_texts.<locals>.iter_tds&   sa      	DAAAQWWT]+A+I-6D5)GC	Ds   A)A+z<table>r   z</table>)r   Sequence[Sequence[str]]returnIterator[str])r   zSequence[str]r    r!   r   )matrixr   r   s     @r   htmlify_matrix_of_cell_textsr#      s7    A	D =CWRWWXf-./x8JJ    c                  V    e Zd ZdZddZed	d       Zed
d       ZddZ	ed
d       Z
y)	HtmlTablezA `<table>` element.c                    || _         y N)_table)selftables     r   __init__zHtmlTable.__init__7   s	    r$   c                   t        |      }|j                  d      }|st        d      |d   }|j                  d      }|D ]  }|j                           |j	                         D ]  }|j
                  j                          |j                  dk(  rd|_        |j                  r.dj                  |j                  j                               |_        |j                  szd |_          | |      S )Nz//tablez)`html_text` contains no `<table>` elementr   z.//thead | .//tbody | .//tfootthtdr   )r	   xpath
ValueErrordrop_tagiterattribcleartagtextr   r   tail)cls	html_textroottablesr+   noise_elementses          r   from_html_textzHtmlTable.from_html_text:   s     #9-I&HIIq	 %EF 	AJJL	  	A HHNN uu} vv!&&,,.1 vv!	$ 5zr$   c                L    t        j                  | j                  t              S )a  The HTML-fragment for this `<table>` element, all on one line.

        Like: `<table><tr><td>foo</td></tr><tr><td>bar</td></tr></table>`

        The HTML contains no human-readability whitespace, attributes, or `<thead>`, `<tbody>`, or
        `<tfoot>` tags. It is made as compact as possible to maximize the semantic content in a
        given space. This is particularly important for chunking.
        encoding)r   tostringr)   strr*   s    r   r   zHtmlTable.html]   s     ~~dkkC88r$   c              #  x   K   d t        d| j                  j                  d            D        E d {    y 7 w)Nc              3  2   K   | ]  }t        |        y wr(   )HtmlRow).0trs     r   	<genexpr>z&HtmlTable.iter_rows.<locals>.<genexpr>j   s     _BGBK_s   zlist[HtmlElement]z./tr)r   r)   r0   rE   s    r   	iter_rowszHtmlTable.iter_rowsi   s,     _$/BDKKDUDUV\D]*^___s   0:8:c                    dj                  | j                  j                               }dj                  |j                               S )z-The clean, concatenated, text for this table.r   )r   r)   itertextr   )r*   
table_texts     r   r7   zHtmlTable.textl   s7     XXdkk2245
xx
((*++r$   N)r+   r   )r:   rD   r    r&   r    rD   )r    zIterator[HtmlRow])__name__
__module____qualname____doc__r,   classmethodr?   r
   r   rL   r7    r$   r   r&   r&   4   sM        D 	9 	9` , ,r$   r&   c                  :    e Zd ZdZddZedd       Zd	dZd
dZy)rH   zA `<tr>` element.c                    || _         y r(   )_tr)r*   rJ   s     r   r,   zHtmlRow.__init__w   	    r$   c                L    t        j                  | j                  t              S )z*Like  "<tr><td>foo</td><td>bar</td></tr>".rA   )r   rC   rY   rD   rE   s    r   r   zHtmlRow.htmlz   s     ~~dhh55r$   c              #  H   K   | j                   D ]  }t        |        y wr(   )rY   HtmlCellr*   r/   s     r   
iter_cellszHtmlRow.iter_cells   s#     (( 	B2,	s    "c              #  z   K   | j                   D ](  }|j                  x}|j                         }|s%| * yw)zGenerate contents of each cell of this row as a separate string.

        A cell that is empty or contains only whitespace does not generate a string.
        N)rY   r7   strip)r*   r/   r7   s      r   iter_cell_textszHtmlRow.iter_cell_texts   sB     
 (( 	B(::<DJ	s   9;N)rJ   r   rP   )r    zIterator[HtmlCell])r    r!   )	rQ   rR   rS   rT   r,   r
   r   r_   rb   rV   r$   r   rH   rH   t   s(     6 6r$   rH   c                  <    e Zd ZdZddZedd       Zedd       Zy)r]   zA `<td>` element.c                    || _         y r(   )_tdr^   s     r   r,   zHtmlCell.__init__   rZ   r$   c                h    | j                   r%t        j                  | j                  t              S dS )zLike  "<td>foo bar baz</td>".rA   r   )r7   r   rC   re   rD   rE   s    r   r   zHtmlCell.html   s$     :>u~~dhh5OOr$   c                T    | j                   j                  x}y|j                         S )z6Text inside `<td>` element, empty string when no text.r   )re   r7   ra   )r*   r7   s     r   r7   zHtmlCell.text   s&     HHMM!D*zz|r$   N)r/   r   rP   )rQ   rR   rS   rT   r,   r
   r   r7   rV   r$   r   r]   r]      s4     P P  r$   r]   )r"   r   r    rD   )rT   
__future__r   r   typingr   r   r   r   lxmlr   	lxml.htmlr	   unstructured.utilsr
   r   r#   r&   rH   r]   rV   r$   r   <module>rm      sM   
 #  : :  ) +%K@=, =,@ : r$   