
    :Qg%                        d dl mZ d dlmZmZmZ d dlmZ d dlm	Z	 ddddZ
ddZddZdd	Zdd
ZddZddZ	 	 	 	 ddZ	 d	 	 	 	 	 ddZddZddZy)    )annotations)AnyDictList)BeautifulSoup)cells_to_html 	row_index	col_indexcontentc                    t        | d       }t               }|D ]  }|d   dkD  s|d   dkD  r]t        |d   |d   |d   z         D ]B  }t        |d   |d   |d   z         D ]%  }||f|d   |d   fk7  s|j                  ||f       ' D |d   |d   f|v s}|d   |d   }}|D cg c]  }|d   |k(  s|d   |k\  s| }	}|	D ]  }
|
dxx   dz  cc<    |j	                  ||f       |d   |d   f|v r_ |S c c}w )a  Move cells to the right if spanned cells have an influence on the rendering.

    Args:
        cells: List of cells in the table in Deckerd format.

    Returns:
        List of cells in the table in Deckerd format with cells moved to the right if spanned.
    c                    | d   | d   fS )Nyx )r   s    h/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/metrics/table/table_extraction.py<lambda>z/_move_cells_for_spanned_cells.<locals>.<lambda>   s    ##/?     keyw   hr   r   )sortedsetrangeaddremove)cellssorted_cellscells_occupied_by_spannedcellijcell_ycell_xccells_to_the_rightcell_to_moves              r   _move_cells_for_spanned_cellsr+      se    %%?@L # ?9q=DIM49d3i$s)&;< >tCy$s)d3i*?@ >A1v$s)T#Y!77155q!f=>> Cy$s)$(AA!#YS	FF-9!cQsVv=MRSTWRX\bRb!!c!c 2 'S!Q&!'%,,ff-=> Cy$s)$(AA? 	 "ds   &C;4C;=C;c                   t        | d      }|j                  d      }|j                  dg      }g }t        |      D ]  \  }}|j                  ddg      }t        |      D ]n  \  }}	||t	        |	j
                  j                  dd            t	        |	j
                  j                  dd            |	j                  d	}
|j                  |
       p  t        |      S )
zConvert html format to Deckerd table structure.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    zhtml.parsertabletrthtdcolspanr   rowspan)r   r   r   r   r   )
r   findfindAll	enumerateintattrsgettextappendr+   )r   soupr-   rows
table_datar$   rowr    r%   	cell_datar#   s              r   html_table_to_deckerdr@   )   s     -0DIIgE==$ DJD/ 
$3T4L)%e, 	$LAy,,Y:;,,Y:;$>>D d#	$
$ )44r   c           
        g }| D cg c]  }|d   dk(  s| }}t        d |D              }t        t        |            }| D ]h  }t        t        |d   |d   |d   z               t        t        |d   |d   |d   z               |d   |d   |d   |d   |v d}|j	                  |       j t        |      }|S c c}w )	zConvert Deckerd table structure to html format.

    Args:
        cells: List of dictionaries where each dictionary represents a cell in the table.

    Returns:
        A string with the html content of the table.
    r   r   c              3  &   K   | ]	  }|d      yw)r   Nr   ).0r#   s     r   	<genexpr>z(deckerd_table_to_html.<locals>.<genexpr>S   s     >dS	>s   r   r   r   r   )row_numscolumn_numsr   r   z	cell textzcolumn header)maxr   r   listr:   r   )r    transformer_cellsr#   first_row_cellsheader_lengthheader_rowsr?   r-   s           r   deckerd_table_to_htmlrM   F   s      ).@cat@O@>o>>MeM*+K 	,U49d3i$s).CDEd3icT#Y1F GHcci!#Y+5
	 	  +	, +,EL As
   B;B;c                .    t        |       }t        |      S )a7  Convert html format to table structure. As a middle step it converts
    html to the Deckerd format as it's more convenient to work with.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    )r@   _convert_table_from_deckerd)r   deckerd_cellss     r   _convert_table_from_htmlrQ   d   s     *'2M&}55r   c                    g }| D ]#  }	 |d   |d   |d   d}|j                  |       % |S # t         $ r	 t        }Y &t        $ r	 t        }Y 6w xY w)zConvert deckerd format to table structure.

    Args:
      content: The deckerd formatted content with a table to extract.

    Returns:
      A list of dictionaries where each dictionary represents a cell in the table.
    r   r   r   r
   )KeyError
EMPTY_CELL	TypeErrorr:   )r   r=   r-   r?   s       r   rO   rO   r   su     J %		#"3Z"3Z +I 	)$%   	#"I 	#"I	#s   -AAAc                    t        | d       S )Nc                    | d   | d   fS )Nr   r   r   )r#   s    r   r   z#_sort_table_cells.<locals>.<lambda>   s    [0A4CT/U r   r   )r   )r=   s    r   _sort_table_cellsrX      s    *"UVVr   c                    g }| D ]>  }d|v s|d   dk(  sd|v s	 t        |d         }|j                  t        |             @ |S # t        $ r)}t	        d|        |j                  i        Y d}~pd}~ww xY w)a  Extracts and converts tables data to a structured format based on the specified table type.

    Args:
        file_elements: List of elements from the ground truth file.

    Returns:
        A list of tables with each table represented as a list of cell data dictionaries.

    typeTabler9   z$Error converting ground truth data: N)rO   r:   rX   	Exceptionprint)file_elementsground_truth_table_dataelementconverted_dataes        r   ,extract_and_convert_tables_from_ground_truthrc      s     !  	3WG!;'@Q3!<FO" (../@/PQ	3 #"	  3<QC@A'..r223s   (A	A:A55A:c                   t         t        d}||vrt        d| d      ||   }|dk(  rt        nt         }g }| D ]H  }|j                  d      dk(  s ||      }|s ||      }|s-t	        |      }|j                  |       J |S )ap  Extracts and converts table data to a structured format

    Args:
      file_elements: List of elements from the file.
      source_type: 'cells' or 'html'. 'cells' refers to reading 'table_as_cells' field while
        'html' is extracted from 'text_as_html'

    Returns:
      A list of tables with each table represented as a list of cell data dictionaries.

    )htmlr    zsource_type z: is not valid. Allowed source_types are "html" and "cells"r    rZ   r[   )extract_cells_from_text_as_html!extract_cells_from_table_as_cells
ValueErrorr8   rX   r:   )	r^   source_type$source_type_to_extraction_strategiesextract_cells_fnfallback_extract_cells_fnpredicted_table_datar`   extracted_cellsr!   s	            r   *extract_and_convert_tables_from_predictionro      s     02,( >>;-'ab
 	
 <KH '! 	*,    :;;v').w7O"";G"D0A$++L9:  r   c                    | d   j                  d      }|rd|vryd}	 t        |      }|S # t        $ r}t        d|        Y d}~|S d}~ww xY w)a  Extracts and parse cells from "text_as_html" field in Element structure

    Args:
        element: Example element:
        {
            "type": "Table",
            "metadata": {
                "text_as_html": "<table>
                                    <thead>
                                        <tr>
                                            <th>Month A.</th>
                                        </tr>
                                    </thead>
                                    </tbody>
                                        <tr>
                                            <td>22</td><
                                        </tr>
                                    </tbody>
                                </table>"
            }
        }

    Returns:
        List of extracted cells in a format:
        [
            {
                "row_index": 0,
                "col_index": 0,
                "content": "Month A.",
            },
            ...,
        ]
    metadatatext_as_htmlz<table>Nz*Error converting Unstructured table data: )r8   rQ   r\   r]   )r`   valpredicted_cellsrb   s       r   rf   rf      sq    D *

!
!.
1C)3&O@237   @:1#>??@s   , 	AA		Ac                L    | d   j                  d      }d}|rt        |      }|S )aj  Extracts and parse cells from "table_as_cells" field in Element structure

    Args:
        element: Example element:
        {
            "type": "Table",
            "metadata": {
                "table_as_cells": [{"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                                   {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"}]
            }
        }

    Returns:
        List of extracted cells in a format:
        [
            {
                "row_index": 0,
                "col_index": 0,
                "content": "Month A.",
            },
            ...,
        ]
    rq   table_as_cellsN)r8   rO   )r`   rt   converted_cellss      r   rg   rg     s2    0 j)--.>?OO5oFr   N)r    List[Dict[str, Any]])r   strreturnrx   )r    rx   rz   ry   )r   rx   rz   rx   )r=   List[List[Dict[str, Any]]]rz   r{   )r^   rx   rz   r{   )re   )r^   rx   ri   ry   rz   r{   )r`   Dict[str, Any]rz   zList[Dict[str, Any]] | None)r`   r|   rz   rx   )
__future__r   typingr   r   r   bs4r   $unstructured_inference.models.tablesr   rT   r+   r@   rM   rQ   rO   rX   rc   ro   rf   rg   r   r   r   <module>r      s    " " "  > 
45:<62W#'##8 =C( '( 69( ( V,^r   