
    :Qg<                         d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z
ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZ ddlmZmZmZmZ  ej:                  d      Z G d	 d
e      Zy)    N   )
BaseParser   )Table)scale_image	scale_pdfsegments_in_bboxtext_in_bboxmerge_close_linesget_table_indexcompute_accuracycompute_whitespace)adaptive_threshold
find_linesfind_contoursfind_jointscamelotc                       e Zd ZdZdddddddgddddddd	d
dfdZed        Zedd       Zd Zd Z	d Z
d Zdi fdZy)Latticea-  Lattice method of parsing looks for lines between text
    to parse the table.

    Parameters
    ----------
    table_regions : list, optional (default: None)
        List of page regions that may contain tables of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
        in PDF coordinate space.
    process_background : bool, optional (default: False)
        Process background lines.
    line_scale : int, optional (default: 15)
        Line size scaling factor. The larger the value the smaller
        the detected lines. Making it very large will lead to text
        being detected as lines.
    copy_text : list, optional (default: None)
        {'h', 'v'}
        Direction in which text in a spanning cell will be copied
        over.
    shift_text : list, optional (default: ['l', 't'])
        {'l', 'r', 't', 'b'}
        Direction in which text in a spanning cell will flow.
    split_text : bool, optional (default: False)
        Split text that spans across multiple cells.
    flag_size : bool, optional (default: False)
        Flag text based on font size. Useful to detect
        super/subscripts. Adds <s></s> around flagged text.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.
    line_tol : int, optional (default: 2)
        Tolerance parameter used to merge close vertical and horizontal
        lines.
    joint_tol : int, optional (default: 2)
        Tolerance parameter used to decide whether the detected lines
        and points lie close to each other.
    threshold_blocksize : int, optional (default: 15)
        Size of a pixel neighborhood that is used to calculate a
        threshold value for the pixel: 3, 5, 7, and so on.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    threshold_constant : int, optional (default: -2)
        Constant subtracted from the mean or weighted mean.
        Normally, it is positive but may be zero or negative as well.

        For more information, refer `OpenCV's adaptiveThreshold <https://docs.opencv.org/2.4/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold>`_.
    iterations : int, optional (default: 0)
        Number of times for erosion/dilation is applied.

        For more information, refer `OpenCV's dilate <https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#dilate>`_.
    resolution : int, optional (default: 300)
        Resolution used for PDF to PNG conversion.

    NF   lt r   r   i,  c                     || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        y N)table_regionstable_areasprocess_background
line_scale	copy_text
shift_text
split_text	flag_size
strip_textline_tol	joint_tolthreshold_blocksizethreshold_constant
iterations
resolution)selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   kwargss                    T/var/www/html/answerous/venv/lib/python3.12/site-packages/camelot/parsers/lattice.py__init__zLattice.__init__a   sr    & +&"4$"$$"$ "#6 "4$$    c                 b   g }|D ]  \  }}}|D ]  }|dk(  rZ| j                   |   |   j                  r>| j                   |   |   j                  s"|dz  }| j                   |   |   j                  s"|dk(  rZ| j                   |   |   j                  r>| j                   |   |   j                  s"|dz  }| j                   |   |   j                  s"|dk(  rZ| j                   |   |   j                  r>| j                   |   |   j
                  s"|dz  }| j                   |   |   j
                  s"|dk(  s(| j                   |   |   j                  sF| j                   |   |   j                  rd|dz  }| j                   |   |   j                  s" |j                  |||f        |S )a  Reduces index of a text object if it lies within a spanning
        cell.

        Parameters
        ----------
        table : camelot.core.Table
        idx : list
            List of tuples of the form (r_idx, c_idx, text).
        shift_text : list
            {'l', 'r', 't', 'b'}
            Select one or more strings from above and pass them as a
            list to specify where the text in a spanning cell should
            flow.

        Returns
        -------
        indices : list
            List of tuples of the form (r_idx, c_idx, text) where
            r_idx and c_idx are new row and column indices for text.

        r   r   rr   b)cellshspanleftrightvspantopbottomappend)r   idxr"   indicesr_idxc_idxtextds           r.   _reduce_indexzLattice._reduce_index   s   . "% 	1E5$ '8wwu~e,22"#''%."7"<"<!QJE #$''%."7"<"<8wwu~e,22"#''%."7"="=!QJE #$''%."7"="=8wwu~e,22"#''%."7";";!QJE #$''%."7";";8wwu~e,22"#''%."7">">!QJE #$''%."7">">'" NNE5$/0%	1& r0   c                    |D ]  }|dk(  rt        t        | j                              D ]  }t        t        | j                  |               D ]  }| j                  |   |   j                  j	                         dk(  s1| j                  |   |   j
                  sN| j                  |   |   j                  rk| j                  |   |dz
     j                  | j                  |   |   _          |dk(  st        t        | j                              D ]  }t        t        | j                  |               D ]  }| j                  |   |   j                  j	                         dk(  s1| j                  |   |   j                  sN| j                  |   |   j                  rk| j                  |dz
     |   j                  | j                  |   |   _           | S )a  Copies over text in empty spanning cells.

        Parameters
        ----------
        t : camelot.core.Table
        copy_text : list, optional (default: None)
            {'h', 'v'}
            Select one or more strings from above and pass them as a list
            to specify the direction in which text should be copied over
            when a cell spans multiple rows or columns.

        Returns
        -------
        t : camelot.core.Table

        hr   r   v)	rangelenr4   r@   stripr5   r6   r8   r9   )r   r!   fijs        r.   _copy_spanning_textzLattice._copy_spanning_text   s   $  	LACxs177|, LA"3qwwqz?3 L771:a=--335; wwqz!}221771:a=;M;M56WWQZA5F5K5K
1 2LL
 cs177|, LA"3qwwqz?3 L771:a=--335; wwqz!}221771:a=;L;L56WWQU^A5F5K5K
1 2LL	L r0   c                    ddl m} dj                  | j                  dg      | _        dj                  | j                  | j                  | j                        }|j                         j                         }t        t        j                  d      } ||d|i5 }	 d d d        |j                          y # 1 sw Y   xY w)Nr   )Ghostscriptr   z.pngz -q -sDEVICE=png16m -o {} -r{} {}wbstdout)ext.ghostscriptrN   joinrootname	imagenameformatr+   filenameencodesplitopenosdevnullclose)r,   rN   gs_callnullgss        r.   _generate_imagezLattice._generate_image   s    1$--!894;;NNDOOT]]
 .."((*BJJ%'/$/ 	2	

	 	s   B55B>c                    fd}t        | j                  | j                  | j                  | j                        \  | _        | _        | j
                  j                  d   }| j
                  j                  d   }|t        | j                        z  }|t        | j                        z  }| j                  t        |      z  }| j                  t        |      z  }||| j                  f|||f}| j                  d }	| j                   || j                        }	t        | j                  |	d| j                  | j                        \  }
}t        | j                  |	d| j                  | j                        \  }}t!        |
|      }t#        ||
|      }nt        | j                  d| j                  | j                        \  }
}t        | j                  d| j                  | j                        \  }} || j                        }t#        ||
|      }t%        j&                  |      | _        t+        ||||      \  | _        | _        | _        y )	Nc           
      "   g }| D ]  }|j                  d      \  }}}}t        |      }t        |      }t        |      }t        |      }t        ||||f      \  }}}}|j                  ||t	        ||z
        t	        ||z
        f        |S )N,)rX   floatr   r;   abs)areasscaled_areasareax1y1x2y2image_scalerss          r.   scale_areasz1Lattice._generate_table_bbox.<locals>.scale_areas   s    L J!%CBB2Y2Y2Y2Y!*BB+;]!KBB##RSb\3rBw<$HIJ  r0   )r   	blocksizecr   r   vertical)regions	directionr    r*   
horizontal)rs   r    r*   )r   rT   r   r(   r)   image	thresholdshaperd   	pdf_width
pdf_heightr   r   r   r    r*   r   r   copydeepcopytable_bbox_unscaledr   
table_bboxvertical_segmentshorizontal_segments)r,   rn   image_widthimage_heightimage_width_scalerimage_height_scalerpdf_width_scalerpdf_height_scalerpdf_scalersrr   vertical_maskr~   horizontal_maskr   contoursr}   rf   rm   s                    @r.   _generate_table_bboxzLattice._generate_table_bbox   s    
	  &8NN#66..%%	&
"
DN jj&&q)zz''*(5+@@*U4??-CC>>E+,>> OOeL.AA+-@$//R'):LI#G!!-%d&8&89/9$????0,M, 4>&????40O0 %]ODH$X}oNJ/9$????	0,M, 4>&????	40O0   0 01E$UM?KJ#'==#< LW)+>M
I/1Ir0   c                 <   i }t        || j                  | j                        \  }}t        || j                        |d<   t        || j
                        |d<   |d   j                  d        |d   j                  d        || _        t        | j                  |    \  }}t        |      t        |      }}|j                  |d   |d   g       |j                  |d   |d	   g       t        t        |      | j                  
      }t        t        |d      | j                  
      }t        dt!        |      dz
        D cg c]  }||   ||dz      f }}t        dt!        |      dz
        D cg c]  }||   ||dz      f }}||||fS c c}w c c}w )Nrt   rq   c                 4    | j                    | j                  fS r   )y0x0xs    r.   <lambda>z4Lattice._generate_columns_and_rows.<locals>.<lambda>2  s    !$$ r0   )keyc                 4    | j                   | j                   fS r   )r   r   r   s    r.   r   z4Lattice._generate_columns_and_rows.<locals>.<lambda>3  s    qttaddUm r0   r   r   r      )r&   T)reverse)r	   r~   r   r
   horizontal_textvertical_textsortt_bboxzipr}   listextendr   sortedr&   rF   rG   )	r,   	table_idxtkr   v_sh_scolsrowsrJ   s	            r.   _generate_columns_and_rowsz"Lattice._generate_columns_and_rows)  s   #&&(@(@
S  ,B0D0DE|)"d.@.@Az|!!&=!>z$;<$//"-.
d$ZddRUBqEN#RUBqEN# F d!;dmmT05aTQ0GH1a$q1u+&HH05aTQ0GH1a$q1u+&HHT3## IHs   F8Fc           
      ^   |j                  d      }|j                  d      }||$t        dj                  | j                              t	        ||      }|j                  ||| j                        }|j                         }|j                         }g }dD ]  }	| j                  |	   D ]  }
t        ||
|	| j                  | j                  | j                        \  }}|d d dk7  s>|j                  |       t        j!                  ||| j"                  	      }|D ]  \  }}}||j$                  |   |   _           t)        d
|gg      }| j*                  !t        j-                  || j*                        }|j.                  }t1        j2                  |      |_        |j4                  j6                  |_        t9        |      }d|_        ||_        ||_        |dz   |_         tC        tD        jF                  jI                  | j                        jK                  dd            |_&        g }|jO                  | jP                  D 
cg c]0  }
|
jR                  |
jT                  |
jV                  |
jX                  f2 c}
       |jO                  | jZ                  D 
cg c]0  }
|
jR                  |
jT                  |
jV                  |
jX                  f2 c}
       ||_.        | j^                  | j`                  f|_1        | jd                  | jf                  f|_4        d |_5        |S c c}
w c c}
w )Nr   r   zNo segments found on {})r'   )rq   rt   )r#   r$   r%   r   )r   )r"   d   )r!   latticer   zpage-r   )6get
ValueErrorrU   rS   r   	set_edgesr'   
set_borderset_spanr   r   r#   r$   r%   r;   r   rB   r"   r4   r@   r   r!   rL   datapd	DataFramedfrw   r   flavoraccuracy
whitespaceorderintrZ   pathbasenamereplacepager   r   r   r   ri   rj   r   _textru   r|   _imager~   r   	_segments
_textedges)r,   r   r   r   r-   r   r   table
pos_errorsrs   r   r=   errorr>   r?   r@   r   r   r   r   s                       r.   _generate_tablezLattice._generate_tableD  s   jjjj;#+6==dmmLMMdD!SDNNC  " 
 4 	>I[[+ >!0#"nn#" 2A;(*%%e,%33w4?? 4 G /6 >*ud9=E*516>>	>" $c:%6$78>>%///PEzz<<%hhnn'-
 !%!m))$--8@@"MN
 8L8LM1qttQTT144.MN8J8JK1qttQTT144.KL

D$<$<=1143K3KL NKs   
5L%5L*c                 f   | j                  ||       |sKt        j                  dj                  t        j
                  j                  | j                                     | j                  s| j                  rMt        j                  dj                  t        j
                  j                  | j                                     g S t        j                  dj                  t        j
                  j                  | j                                     g S | j                          | j                          g }t        t        | j                   j#                         d d            D ]J  \  }}| j%                  ||      \  }}}	}
| j'                  ||||	|
      }||_        |j+                  |       L |S )NzProcessing {}z:{} is image-based, camelot only works on text-based pages.zNo tables found on {}c                     | d   S )Nr    r   s    r.   r   z(Lattice.extract_tables.<locals>.<lambda>  s
    1 r0   T)r   r   )r   r   )_generate_layoutloggerinforU   rZ   r   r   rS   r   imageswarningswarnr`   r   	enumerater   r}   keysr   r   _bboxr;   )r,   rV   suppress_stdoutlayout_kwargs_tablesr   r   r   r   r   r   r   s               r.   extract_tableszLattice.extract_tables  sZ   h6KK..rww/?/?/NOP##{{))/0@0@0O)P I +222773C3CDMM3RS I!!#&4??'')~tL
 	"MIr $(#B#B9b#Q D$S((D$CS(QEEKNN5!	" r0   r   )__name__
__module____qualname____doc__r/   staticmethodrB   rL   r`   r   r   r   r   r   r0   r.   r   r   %   s    9z  :!!%F * *X  @H
T$6;z 8=B r0   r   ) rZ   sysrz   localeloggingr   
subprocessnumpynppandasr   baser   corer   utilsr   r   r	   r
   r   r   r   r   image_processingr   r   r   r   	getLoggerr   r   r   r0   r.   <module>r      sc    
 
         	 	 	  
		9	%zj zr0   