
    :Qg)B                         d dl Z d dlZd dlZd dlZd dlZddlmZ ddl	m
Z
mZ ddlmZmZmZmZ  ej"                  d      Z G d d	e      Zy)
    N   )
BaseParser   )	TextEdgesTable)text_in_bboxget_table_indexcompute_accuracycompute_whitespacecamelotc                       e Zd ZdZ	 	 	 	 	 	 	 	 	 ddZed        Zedd       Zedd       Zed        Z	ed	        Z
ed
        Zd Zd Zd Zd Zd Zdi fdZy)Streama8  Stream method of parsing looks for spaces between text
    to parse the table.

    If you want to specify columns when specifying multiple table
    areas, make sure that the length of both lists are equal.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    columns : list, optional (default: None)
        List of column x-coordinates strings where the coordinates
        are comma-separated.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    edge_tol : int, optional (default: 50)
        Tolerance parameter for extending textedges vertically.
    row_tol : int, optional (default: 2)
        Tolerance parameter used to combine text vertically,
        to generate rows.
    column_tol : int, optional (default: 0)
        Tolerance parameter used to combine text horizontally,
        to generate columns.

    NFc
                     || _         || _        || _        | j                          || _        || _        || _        || _        || _        |	| _	        y N)
table_regionstable_areascolumns_validate_columns
split_text	flag_size
strip_textedge_tolrow_tol
column_tol)selfr   r   r   r   r   r   r   r   r   kwargss              S/var/www/html/answerous/venv/lib/python3.12/site-packages/camelot/parsers/stream.py__init__zStream.__init__9   sQ     +& $"$ $    c                    t        | D cg c]  }| |   D ]  }|j                    c}}      }t        | D cg c]  }| |   D ]  }|j                    c}}      }t        | D cg c]  }| |   D ]  }|j                    c}}      }t        | D cg c]  }| |   D ]  }|j
                    c}}      }||||f}|S c c}}w c c}}w c c}}w c c}}w )a  Returns bounding box for the text present on a page.

        Parameters
        ----------
        t_bbox : dict
            Dict with two keys 'horizontal' and 'vertical' with lists of
            LTTextLineHorizontals and LTTextLineVerticals respectively.

        Returns
        -------
        text_bbox : tuple
            Tuple (x0, y0, x1, y1) in pdf coordinate space.

        )minx0y0maxx1y1)t_bbox	directiontxminyminxmaxymax	text_bboxs           r   
_text_bboxzStream._text_bboxQ   s      &LY&:KLQADDLDLM&LY&:KLQADDLDLM&LY&:KLQADDLDLM&LY&:KLQADDLDLM4t,	 MLLLs   C
C
)C
C
c                    d}g }g }| D ]  }|j                         j                         s"t        j                  ||j                  |      s+|j                  t        |d              g }|j                  }|j                  |        |j                  t        |d              t        |      dkD  r|j                  d      }|S )ac  Groups PDFMiner text objects into rows vertically
        within a tolerance.

        Parameters
        ----------
        text : list
            List of PDFMiner text objects.
        row_tol : int, optional (default: 2)

        Returns
        -------
        rows : list
            Two-dimensional list of text objects grouped into rows.

        r   atolc                     | j                   S r   r"   r)   s    r   <lambda>z$Stream._group_rows.<locals>.<lambda>   s
    144 r   keyc                     | j                   S r   r4   r5   s    r   r6   z$Stream._group_rows.<locals>.<lambda>   s
    qtt r   r   )	get_textstripnpiscloser#   appendsortedlenpop)textr   row_yrowstempr)   __s          r   _group_rowszStream._group_rowsh   s    "  		A zz|!!#zz%G<KKt @ADDDEA		 	F4^45t9q=!Br   c                 4   g }| D ]  }|s|j                  |       |d   }|dk\  rg|d   |d   k  st        j                  |d   |d   |      r,t        |d   |d         }t	        |d   |d         }||f|d<   w|j                  |       |dk  s|d   |d   k  ret        j                  |d   |d   t        |            r|j                  |       t        |d   |d         }t	        |d   |d         }||f|d<   |j                  |        |S )ah  Merges column boundaries horizontally if they overlap
        or lie within a tolerance.

        Parameters
        ----------
        l : list
            List of column x-coordinate tuples.
        column_tol : int, optional (default: 0)

        Returns
        -------
        merged : list
            List of merged column x-coordinate tuples.

        r   r   r1   )r>   r<   r=   r$   r!   abs)lr   mergedhigherlowerupper_boundlower_bounds          r   _merge_columnszStream._merge_columns   s-   "  	.Ff%r
?ayE!H,

q	58*1 '*%(F1I&>&)%(F1I&>&1;%?r
f-!^ayE!H,::fQiqJP"MM&1*-eAhq	*BK*-eAhq	*BK*5{)CF2Jf-/	.0 r   c                    | D cg c]Q  }t        |      dkD  r?t        |D cg c]  }|j                  |j                  z   dz    c}      t        |      z  ndS }}}t	        dt        |            D cg c]  }||   ||dz
     z   dz   }}|j                  d|       |j                  |       t	        dt        |      dz
        D cg c]  }||   ||dz      f }}|S c c}w c c}}w c c}w c c}w )aQ  Makes row coordinates continuous.

        Parameters
        ----------
        rows_grouped : list
            Two-dimensional list of text objects grouped into rows.
        text_y_max : int
        text_y_min : int

        Returns
        -------
        rows : list
            List of continuous row y-coordinate tuples.

        r   r   r   )r@   sumr#   r&   rangeinsertr>   )rows_grouped
text_y_max
text_y_minrr)   row_midsirD   s           r   
_join_rowszStream._join_rows   s    & "
 =@FQJCA.q!$$+"./#a&8AM
 
 >C1c(m=TU!xA.!3UUAz"J05aTQ0GH1a$q1u+&HH /
 V Is"   C*#C%C*6C0C5%C*c                 d   |rt         j                  ||      }|D cg c]  }t        |       }}|D cg c]9  }t        |      t        |      k(  s|D ]  }|j                  |j
                  f ; }}}| j                  t         j                  t        |                   | S c c}w c c}}w )a  Adds columns to existing list by taking into account
        the text that lies outside the current column x-coordinates.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text : list
            List of PDFMiner text objects.
        ytol : int

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        r   )	r   rG   r@   r$   r"   r%   extendrQ   r?   )colsrB   r   rY   elementsr)   new_colss          r   _add_columnszStream._add_columns   s    & %%dG%<D(,-1A-H-&*!"c!fH.EqJKqttH  KK--fX.>?@ .s   B'B,!B,c                 D   t        |       } t        dt        |             D cg c]  }| |   d   | |dz
     d   z   dz   } }| j                  d|       | j	                  |       t        dt        |       dz
        D cg c]  }| |   | |dz      f } }| S c c}w c c}w )a8  Makes column coordinates continuous.

        Parameters
        ----------
        cols : list
            List of column x-coordinate tuples.
        text_x_min : int
        text_y_max : int

        Returns
        -------
        cols : list
            Updated list of column x-coordinate tuples.

        r   r   r   )r?   rT   r@   rU   r>   )r`   
text_x_min
text_x_maxr[   s       r   _join_columnszStream._join_columns   s    " d|;@CI;NOaad1q5k!n,1OOAz"J05aTQ0GH1a$q1u+&HH	 P Is   B Bc                     | j                   D| j                  7t        | j                         t        | j                        k7  rt        d      y y y )Nz1Length of table_areas and columns should be equal)r   r   r@   
ValueError)r   s    r   r   zStream._validate_columns  sM    'DLL,D4##$DLL(99 !WXX : -E'r   c                 B   |j                  d        t        | j                        }|j                  |       |j	                         }| j
                  j                  |       |j                  ||      }t        |      sdd| j                  | j                  fdi}|S )a1  A general implementation of the table detection algorithm
        described by Anssi Nurminen's master's thesis.
        Link: https://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

        Assumes that tables are situated relatively far apart
        vertically.
        c                 4    | j                    | j                  fS r   r#   r"   xs    r   r6   z2Stream._nurminen_table_detection.<locals>.<lambda>  s    qtteQTT] r   r7   )r   r   N)sortr   r   generateget_relevant	textedgesr_   get_table_areasr@   	pdf_width
pdf_height)r   	textlinesrr   relevant_textedges
table_bboxs        r   _nurminen_table_detectionz Stream._nurminen_table_detection  s     	23t}}5	9%&33501..y:LM
:aA4HJr   c                 V   g | _         | j                  | j                  }| j                  g }| j                  D ]o  }|j	                  d      \  }}}}t        |      }t        |      }t        |      }t        |      }t        ||||f| j                        }|j                  |       q | j                  |      }|| _	        y i }| j                  D ]M  }	|	j	                  d      \  }}}}t        |      }t        |      }t        |      }t        |      }d |||||f<   O || _	        y )N,)
rr   r   horizontal_textr   splitfloatr   r_   ry   rx   )
r   hor_textregionr%   r&   x2y2region_textrx   areas
             r   _generate_table_bboxzStream._generate_table_bbox&  s3   #++H!!-"00 1F%+\\#%6NBBrBrBrBrB".BB/?AUAU"VKOOK01 77AJ % J(( 4!%CBB2Y2Y2Y2Y/3
BB+,4 %r   c           
         i }t        || j                        |d<   t        || j                        |d<   |d   j                  d        |d   j                  d        || _        | j                  | j                        \  }}}}| j                  | j                  d   | j                        }| j                  |||      }	|D 
cg c]  }
t        |
       }}
| j                  | j                  |   dk7  r| j                  |   j                  d      }|D cg c]  }t        |       }}|j                  d	|       |j                  |       t        d	t        |      d
z
        D cg c]  }||   ||d
z      f }}||	fS t        |      s	||fg}||	fS t!        t#        |      |j$                        }|d
k(  r]t'        t)        d |            }t        |      r!t!        t#        |      |j$                        }nt+        j,                  d|d
z           |D 
cg c]0  }
t        |
      |k(  s|
D ]  }|j.                  |j0                  f 2 }}
}| j3                  t5        |      | j6                        }g }t        d
t        |            D ]r  }||d
z
     d
   }||   d	   }|j9                  | j                  D cg c]6  }| j                  |   D ]"  }|j.                  |kD  r|j0                  |k  r|$ 8 c}}       t | j                  D cg c]B  }| j                  |   D ].  }|j.                  |d   d
   kD  s|j0                  |d	   d	   k  r|0 D }}}|j9                  |       | j;                  ||| j                        }| j=                  |||      }||	fS c c}
w c c}w c c}w c c}}
w c c}}w c c}}w )N
horizontalverticalc                 4    | j                    | j                  fS r   rl   rm   s    r   r6   z3Stream._generate_columns_and_rows.<locals>.<lambda>H  s    !$$ r   r7   c                 4    | j                   | j                   fS r   )r"   r#   rm   s    r   r6   z3Stream._generate_columns_and_rows.<locals>.<lambda>I  s    qttaddUm r   r^    r{   r   r   c                     | dk7  S Nr    rm   s    r   r6   z3Stream._generate_columns_and_rows.<locals>.<lambda>i  s
    Q!V r   zNo tables found in table area )r   rI   )r   r|   vertical_textro   r'   r/   rG   r   r\   r@   r   r}   r~   rU   r>   rT   r$   setcountlistfilterwarningswarnr"   r%   rQ   r?   r   r_   rc   rg   )r   	table_idxtkr'   re   rX   rf   rW   rV   rD   rY   ra   r`   cr[   ncolsr)   
inner_textleftrightr(   
outer_texts                         r   _generate_columns_and_rowsz!Stream._generate_columns_and_rowsB  s   +B0D0DE|)"d.@.@Az|!!&=!>z$;<9=9U6
J
J''L(A4<<'X|ZD$01qCF11<<#Y(?2(E
 <<	*005D&*+E!H+D+KK:&KK
#49!SY]4KLqT!Wd1q5k*LDLZ TzS x=#Z01P TzM CMx~~>A:  $F+;X$FGH8} #CMx~~ F (FyST}o&VW*6%&#a&E/STNOQTT144L   **6$<DOO*T
q#d), 
AA;q>D GAJE%% .2[[ )%)[[%; !" ttd{qtte| 
 &*[[!![[3 ttd2hqk)QTTDGAJ-> 
  !!*-((z4<<H))$
JGTzq 2 , M&s+   	N+N0.N5#N:7!N:;O AOc           
         t        ||      }|j                         }g }dD ]  }| j                  |   D ]p  }t        |||| j                  | j
                  | j                        \  }	}
|	d d dk7  s>|j                  |
       |	D ]  \  }}}||j                  |   |   _	         r  t        d|gg      }|j                  }t        j                  |      |_        |j                  j                  |_        t!        |      }d|_        ||_        ||_        |dz   |_        t+        t,        j.                  j1                  | j2                        j5                  dd	            |_        g }|j9                  | j:                  D cg c]0  }|j<                  |j>                  |j@                  |jB                  f2 c}       |j9                  | jD                  D cg c]0  }|j<                  |j>                  |j@                  |jB                  f2 c}       ||_#        d |_$        d |_%        | jL                  |_'        |S c c}w c c}w )
N)r   r   )r   r   r   r   )rI   rI   d   streamr   zpage-r   )(r   set_all_edgesr'   r	   r   r   r   r>   cellsrB   r
   datapd	DataFramedfshaper   flavoraccuracy
whitespaceorderintospathbasenamerootnamereplacepager_   r|   r"   r#   r%   r&   r   _text_image	_segmentsrr   
_textedges)r   r   r`   rD   r   table
pos_errorsr(   r)   indiceserrorr_idxc_idxrB   r   r   r   r   s                     r   _generate_tablezStream._generate_table  s   dD!##%
 4 	>I[[+ >!0#"nn#" 2A;(*%%e,.5 >*ud9=E*516>>	> $c:%6$78zz<<%hhnn'-
!%!m))$--8@@"MN
 8L8LM1qttQTT144.MN8J8JK1qttQTT144.KL>> NKs   5H<5Ic                 \   | j                  ||       t        j                  j                  | j                        }|st
        j                  d|        | j                  s@| j                  rt        j                  | d       g S t        j                  d|        g S | j                          g }t        t        | j                  j                         d d            D ]E  \  }}| j!                  ||      \  }}	| j#                  |||	      }
||
_        |j'                  |
       G |S )NzProcessing z8 is image-based, camelot only works on text-based pages.zNo tables found on c                     | d   S r   r   rm   s    r   r6   z'Stream.extract_tables.<locals>.<lambda>  s
    1 r   T)r8   reverse)_generate_layoutr   r   r   r   loggerinfor|   imagesr   r   r   	enumerater?   rx   keysr   r   _bboxr>   )r   filenamesuppress_stdoutlayout_kwargsbase_filename_tablesr   r   r`   rD   r   s              r   extract_tableszStream.extract_tables  s   h6((7KK+m_56##{{$o &) ) I  3M?CDI!!#&4??'')~tL
 	"MIr 88BGJD$((D$?EEKNN5!	" r   )	NNNFFr   2   r   r   )r   )r   )__name__
__module____qualname____doc__r   staticmethodr/   rG   rQ   r\   rc   rg   r   ry   r   r   r   r   r   r   r   r   r      s    $P %0  , " "H ) )V  4  6  .Y
2%8FP+Z 8=B r   r   )r   loggingr   numpyr<   pandasr   baser   corer   r   utilsr   r	   r
   r   	getLoggerr   r   r   r   r   <module>r      sD    
      # W W 
		9	%BZ Br   