
    :Qgi                     4   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZmZmZ d dl m!Z!m"Z" d dl#m$Z% d dl#m&Z&m'Z'm(Z(  e)e&e'z   e(z         Z*e*jW                  d       d Z,d Z-d Z.g dZ/g dZ0d.dZ1d.dZ2 G d de3      Z4d Z5d Z6d Z7d Z8d Z9d Z:d Z;d e<fd!Z=d e<fd"Z>d e?fd#Z@d e?fd$ZAd/d%ZBd0d&ZCd0d'ZDd1d(ZE	 d2d)ZFd* ZGd+ ZH	 	 	 	 	 	 	 d3d,ZId4d-ZJy)5    N)groupby)
itemgetter)	PDFParser)PDFDocument)PDFPage)PDFTextExtractionNotAllowed)PDFResourceManager)PDFPageInterpreter)PDFPageAggregator)LAParamsLTAnnoLTCharLTTextLineHorizontalLTTextLineVerticalLTImage)Requesturlopen)urlparse)uses_relativeuses_netlocuses_params c                 X    	 t        |       j                  t        v S # t        $ r Y yw xY w)zCheck to see if a URL has a valid protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        If url has a valid protocol return True otherwise False.

    F)	parse_urlscheme_VALID_URLS	Exception)urls    J/var/www/html/answerous/venv/lib/python3.12/site-packages/camelot/utils.pyis_urlr    (   s.    ~$$33 s    	))c                     d}| rP|t        j                  t        j                  t        j                  z   t        j
                  z         z  }| dz  } | rP|S )Nr      )randomchoicestringdigitsascii_lowercaseascii_uppercase)lengthrets     r   random_stringr+   ;   sS    
C
v}}MMF222V5K5KK
 	
 	!	 
 J    c                    t        d       d}t        j                  dd      5 }ddi}t        | d|      }t	        |      }|j                         j                         }|d	k7  rt        d
      |j                  |j                                ddd       t        j                  j                  t        j                  j                  j                        |      }t        j                   |j                  |       |S # 1 sw Y   rxY w)zDownload file from specified URL.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    filepath : str or unicode
        Temporary filepath.

       z.pdfwbF)deletez
User-AgentzMozilla/5.0Nzapplication/pdfzFile format not supported)r+   tempfileNamedTemporaryFiler   r   infoget_content_typeNotImplementedErrorwritereadospathjoindirnamenameshutilmove)r   filenamefheadersrequestobjcontent_typefilepaths           r   download_urlrF   E   s      "#4(H		$	$T%	8 A/#tW-gxxz224,,%&ABB	
 ww||BGGOOAFF3X>H
KK!O s   A*DD
)columnsedge_tolrow_tol
column_tol)
process_background
line_scale	copy_text
shift_textline_tol	joint_tolthreshold_blocksizethreshold_constant
iterations
resolutionc                 P    fd}dk(  r |t         |        y  |t        |        y )Nc                     t        |       j                  t        |j                                     }|r)t        dj	                  t        |             d d      y )N,z cannot be used with flavor='')setintersectionkeys
ValueErrorr:   sorted)parser_kwargsinput_kwargsisecflavors      r   check_intersectionz*validate_input.<locals>.check_intersectionp   sZ    =!..s<3D3D3F/GH88F4L)**GxqQ  r,   lattice)stream_kwargslattice_kwargs)kwargsra   rb   s    ` r   validate_inputrg   o   s&     =&1>62r,   c                     |dk(  r1| j                         D ]  }|t        v s| j                  |        | S | j                         D ]  }|t        v s| j                  |        | S )Nrc   )r[   rd   popre   )rf   ra   keys      r   remove_extrark   }   sj    ;;= 	 Cm#

3	  M ;;= 	 Cn$

3	  Mr,   c                       e Zd Zd Zd Zy)TemporaryDirectoryc                 L    t        j                         | _        | j                  S N)r1   mkdtempr<   )selfs    r   	__enter__zTemporaryDirectory.__enter__   s    $$&	yyr,   c                 B    t        j                  | j                         y ro   )r=   rmtreer<   )rq   exc_type	exc_value	tracebacks       r   __exit__zTemporaryDirectory.__exit__   s    dii r,   N)__name__
__module____qualname__rr   rx    r,   r   rm   rm      s    !r,   rm   c                     || z  }|S )zTranslates x2 by x1.

    Parameters
    ----------
    x1 : float
    x2 : float

    Returns
    -------
    x2 : float

    r|   )x1x2s     r   	translater      s     "HBIr,   c                     | |z  } | S )zScales x by scaling factor s.

    Parameters
    ----------
    x : float
    s : float

    Returns
    -------
    x : float

    r|   )xss     r   scaler      s     FAHr,   c                 $   | \  }}}}|\  }}}t        ||      }t        t        t        | |            |      }t        ||      }t        t        t        | |            |      }t        |      t        |      t        |      t        |      f}	|	S )ay  Translates and scales pdf coordinate space to image
    coordinate space.

    Parameters
    ----------
    k : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in PDFMiner coordinate
        space.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, pdf_y) where the
        first two elements are scaling factors and pdf_y is height of
        pdf.

    Returns
    -------
    knew : tuple
        Tuple (x1, y1, x2, y2) representing table bounding box where
        (x1, y1) -> lt and (x2, y2) -> rb in OpenCV coordinate
        space.

    )r   absr   int)
kfactorsr~   y1r   y2scaling_factor_xscaling_factor_ypdf_yknews
             r   	scale_pdfr      s    . NBB07-&	r#	$B	s9eVR()+;	<B	r#	$B	s9eVR()+;	<BGSWc"gs2w/DKr,   c                    |\  }}}i }| j                         D ]  }|\  }	}
}}t        |	|      }	t        t        t        | |
            |      }
t        ||      }t        t        t        | |            |      }t	        | |    \  }}|D cg c]  }t        ||       }}|D cg c]"  }t        t        t        | |            |      $ }}t	        ||      }|||	|
||f<    g }|D ]{  }t        |d   |      t        |d   |      }}	t        t        t        | |d               |      t        t        t        | |d               |      }}
|j                  |	|
||f       } g }|D ]{  }t        |d   |      t        |d   |      }}	t        t        t        | |d               |      t        t        t        | |d               |      }}
|j                  |	|
||f       } |||fS c c}w c c}w )av  Translates and scales image coordinate space to pdf
    coordinate space.

    Parameters
    ----------
    tables : dict
        Dict with table boundaries as keys and list of intersections
        in that boundary as value.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of horizontal line segments.
    factors : tuple
        Tuple (scaling_factor_x, scaling_factor_y, img_y) where the
        first two elements are scaling factors and img_y is height of
        image.

    Returns
    -------
    tables_new : dict
    v_segments_new : dict
    h_segments_new : dict

    r      r"      )r[   r   r   r   zipappend)tables
v_segments
h_segmentsr   r   r   img_y
tables_newr   r~   r   r   r   j_xj_yjjointsv_segments_newvh_segments_newhs                        r   scale_imager      s5   2 18-&J[[] 
.BB2'(3y%,-/?@2'(3y%,-/?@q	?S367auQ()77KNOauSE61-.0@AOOS#'-
BB#$
. N 0qt-.ad<L0MB#i!-.0@A#i!-.0@A  	r2r2./0 N 0qt-.ad<L0MB#i!-.0@A#i!-.0@A  	r2r2./0 ~~55/ 8Os   G/&'G4c                 b   d}t        |D cg c]#  }|j                         j                         s"|% c}      }t        |D cg c]#  }|j                         j                         s"|% c}      }||k  r-t        d | D              }t        d | D              }||k  rdnd}|S c c}w c c}w )aC  Detects if text in table is rotated or not using the current
    transformation matrix (CTM) and returns its orientation.

    Parameters
    ----------
    horizontal_text : list
        List of PDFMiner LTTextLineHorizontal objects.
    vertical_text : list
        List of PDFMiner LTTextLineVertical objects.
    ltchar : list
        List of PDFMiner LTChar objects.

    Returns
    -------
    rotation : string
        '' if text in table is upright, 'anticlockwise' if
        rotated 90 degree anticlockwise and 'clockwise' if
        rotated 90 degree clockwise.

    r   c              3   h   K   | ]*  }|j                   d    dk  xr |j                   d   dkD   , ywr"   r   r   Nmatrix.0ts     r   	<genexpr>zget_rotation.<locals>.<genexpr>+  s0     Ka;AHHQK!O;K   02c              3   h   K   | ]*  }|j                   d    dkD  xr |j                   d   dk   , ywr   r   r   s     r   r   zget_rotation.<locals>.<genexpr>,  s0     OAAHHQK!O?a?Or   anticlockwise	clockwise)lenget_textstripsum)	charshorizontal_textvertical_textrotationr   hlenvlenr   r   s	            r   get_rotationr     s    * H?Caajjl.@.@.BCDD=AaAJJL,>,>,@ABDd{KUKK	OOO&/-&??[O DAs   #B'B'#B,(B,c                 f   | d   | d   f}| d   | d   f}|D cg c]=  }|d   |d   dz
  kD  r-|d   |d   dz   k  r|d   dz
  |d   cxk  r|d   dz   k  rn n|? }}|D cg c]=  }|d   |d   dz
  kD  r-|d   |d   dz   k  r|d   dz
  |d   cxk  r|d   dz   k  rn n|? }}||fS c c}w c c}w )a6  Returns all line segments present inside a bounding box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
        space.
    v_segments : list
        List of vertical line segments.
    h_segments : list
        List of vertical horizontal segments.

    Returns
    -------
    v_s : list
        List of vertical line segments that lie inside table.
    h_s : list
        List of horizontal line segments that lie inside table.

    r   r"   r   r   r|   )	bboxr   r   lbrtr   v_sr   h_ss	            r   segments_in_bboxr   1  s   , q'47	B
q'47	B Q4"Q%!)!r!uqy 0RUQY!A$5S"Q%RS)5S 	
C  Q4"Q%!)!r!uqy 0RUQY!A$5S"Q%RS)5S 	
C 
 8O
s   AB)!AB.c                 6   | d   | d   f}| d   | d   f}|D cg c]l  }|d   dz
  |j                   |j                  z   dz  cxk  r|d   dz   k  r9n n6|d   dz
  |j                  |j                  z   dz  cxk  r|d   dz   k  rn n|n }}|D ch c]  }| }}|D ]d  }|j	                         D ]O  }||k(  r	t        ||      st        ||      t        |      z  dkD  s2t        ||      s?|j                  |       Q f t        |      }	|	S c c}w c c}w )a  Returns all text objects present inside a bounding box.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table, discarding the overlapping ones

    r   r"   r   r          @g?)x0r~   y0r   copybbox_intersectbbox_intersection_area	bbox_areabbox_longerdiscardlist)
r   textr   r   r   t_bboxrestbabbunique_boxess
             r   text_in_bboxr   V  s9   " q'47	B
q'47	B a519+8r!uqy8qEAI!$$+,91	9 	
F  !AD )))+ 	)BRxb"%*2r2Yr]BcI"2r*R(	)) :L) s   A1D	Dreturnc                 2   t        | j                  |j                        }t        | j                  |j                        }t        | j                  |j                        }t        | j
                  |j
                        }||k  s||kD  ry||z
  ||z
  z  }|S )a.  Returns area of the intersection of the bounding boxes of two PDFMiner objects.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    intersection_area : float
        Area of the intersection of the bounding boxes of both objects

            )maxr   minr   r~   r   )r   r   x_lefty_topx_righty_bottomintersection_areas          r   r   r     s     FruuE"%%G255"%% H8e+ 6)eh.>?r,   c                 h    | j                   | j                  z
  | j                  | j                  z
  z  S )zReturns area of the bounding box of a PDFMiner object.

    Parameters
    ----------
    bb : PDFMiner text object

    Returns
    -------
    area : float
        Area of the bounding box of the object

    r~   r   r   r   )r   s    r   r   r     s'     EEBEEMbeebeem,,r,   c                     | j                   |j                  k\  xrO |j                   | j                  k\  xr4 | j                  |j                  k\  xr |j                  | j                  k\  S )a   Returns True if the bounding boxes of two PDFMiner objects intersect.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    overlaps : bool
        True if the bounding boxes intersect

    r   r   r   s     r   r   r     sI     55BEE>RbeeruunR"%%RBEERUUNRr,   c                 h    | j                   | j                  z
  |j                   |j                  z
  k\  S )a3  Returns True if the bounding box of the first PDFMiner object is longer or equal to the second.

    Parameters
    ----------
    ba : PDFMiner text object
    bb : PDFMiner text object

    Returns
    -------
    longer : bool
        True if the bounding box of the first object is longer or equal

    )r~   r   r   s     r   r   r     s'     EEBEEMruuruu}--r,   c                     g }| D ]R  }|s|j                  |       |d   }t        j                  |||      r||z   dz  }||d<   B|j                  |       T |S )zMerges lines which are within a tolerance by calculating a
    moving mean, based on their x or y axis projections.

    Parameters
    ----------
    ar : list
    line_tol : int, optional (default: 2)

    Returns
    -------
    ret : list

    )atolr   )r   npisclose)arrO   r*   atemps        r   merge_close_linesr     sh     C 	JJqMr7Dzz$1qC'B

1	 Jr,   c           	          |s| S t        j                  ddj                  t        t         j                  |             dd| t         j
                        }|S )a  Strips any characters in `strip` that are present in `text`.
    Parameters
    ----------
    text : str
        Text to process and strip.
    strip : str, optional (default: '')
        Characters that should be stripped from `text`.
    Returns
    -------
    stripped : str
    [r   ])flags)resubr:   mapescapeUNICODE)r   r   strippeds      r   
text_stripr     sL     vvRWWSE*+,A.D

H Or,   c                    |dk(  rO| D cg c]C  }t        |t              s1|j                         t        j                  |j
                  d      fE }}nS|dk(  rN| D cg c]C  }t        |t              s1|j                         t        j                  |j                  d      fE }}D cg c]  \  }}t        j                  |d       }}}t        t        |            dkD  r	g }t        |      }	t        |t        d            D ]  \  }
}|
|	k(  rv|D cg c]  }|d   	 }}dj                  |      j                         s=|j                  dd       |j                  d	       |j                  dj                  |             |D cg c]  }|d   	 }}dj                  |      j                         s|j                  dj                  |              dj                  |      }n-dj                  | D cg c]  }|j                          c}      }t!        ||      S c c}w c c}w c c}}w c c}w c c}w c c}w )
a  Flags super/subscripts in text by enclosing them with <s></s>.
    May give false positives.

    Parameters
    ----------
    textline : list
        List of PDFMiner LTChar objects.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    fstring : string

    
horizontalr.   )decimalsverticalr"   r   r   z<s>z</s>)
isinstancer   r   r   roundheightwidthr   rY   r   r   r   r:   r   insertr   r   )textline	direction
strip_textr   dr   sizelflistmin_sizerj   r   fcharsfstrings                 r   flag_font_sizer    s   & L  
a( ZZ\288AHHq9:
 

 
j	  
a( ZZ\288AGGa89
 

 788
d$	#8A8
3q6{Qq6!!Z]3 
	2JCh(-.1!A$..776?((*MM!U+MM&)LL1(-.1!A$..776?((*LL1
	2 ''%.'':A1::<:;gz**;


 	9 / /
 ;s%   AH+AH0-!H5H;I Ic                 
   d}g }|j                   }	 |dk(  r|j                         st        | j                        D 	cg c]  \  }}	|	d   |d   k  r|d   |	d   k  r| }
}}	t        | j                        D cg c]'  \  }}|d   |d   |d   z   dz  cxk  r|d   k  rn n|) }}}|d   }|
D cg c]=  }| j
                  |   |   j                  s || j
                  |   |   j                  f? }}|s"|
d   | j
                  |   d   j                  fg}|j                  D ]  }| j                  |   }|D ]  }t        |t              r|d   |j                  |j                  z   dz  cxk  r|d   k  r>n n;|j                  |j                  z   dz  |d   k  r|j                  ||d   |f        ||d   k(  s|j                  ||d   dz   |f       t        |t               s|j                  ||d   |f         n|dk(  r|j                         st        | j                        D cg c]  \  }}|d   |d   k  r|d   |d   k  r| }}}t        | j                        D cg c]'  \  }}|d   |d   |d   z   dz  cxk  r|d   k  rn n|) }}}|d   }|D cg c]=  }| j
                  |   |   j"                  s || j
                  |   |   j                  f? }}|s"|d   | j
                  d   |   j                  fg}|j                  D ]  }| j                  |   }|D ]  }t        |t              r|d   |j                  |j                  z   dz  cxk  r|d   k  r>n n;|j                  |j                  z   dz  |d   k\  r|j                  |d   ||f        ||d   k(  s|j                  |d   dz
  ||f       t        |t               s|j                  |d   ||f         g }t)        |t+        dd            D ]  \  }}|r8|j                  |d   |d   t-        |D cg c]  }|d   	 c}||      f       @|D cg c]  }|d   j'                          }}|j                  |d   |d   t/        d	j1                  |      |      f        |S c c}	}w c c}}w c c}w c c}}w c c}}w c c}w # t$        $ r dd|j'                         fgcY S w xY wc c}w c c}w )
a9  Splits PDFMiner LTTextLine into substrings if it spans across
    multiple rows/columns.

    Parameters
    ----------
    table : camelot.core.Table
    textline : object
        PDFMiner LTTextLine object.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts.)
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    grouped_chars : list
        List of tuples of the form (idx, text) where idx is the index
        of row/column and text is the an lttextline substring.

    r   r   r   r"   r   r   r   r  r   )r   is_empty	enumeratecolsrowscellsrightr   _objsr   r   r   r   r   r~   r   r   bottom
IndexErrorr   r   r   r  r   r:   )tabler  r  	flag_sizer  idxcut_textr   ir   	x_overlapr   rr_idxcx_cutsrC   rowcuty	y_overlapc_idxy_cutscolgrouped_charsrj   r   r   gcharss                                r   split_textliner*  6  sE   4 CH==DD/$X->->-@ &ejj1AqQ447?tAw!A$ I  &ejj1AqQ4DGd1g-2:ad: E 
 aA3<./Aq@Q@W@WEKKN1%(()F  $Q<Q);)>)>?@~~ :jjm! :C!#v.Fsvv!&;Es1vE!$#&&A 5Q ?$OOQA,<=!  #fRj0 (CFQJ0D E#C0 CFC(89::  *$X->->-@ &ejj1AqQ447?tAw!A$ I  &ejj1AqQ4DGd1g-2:ad: E 
 aA3<./Aq@Q@X@XEKKN1%(()F  $Q<R);)>)>?@~~ :jjm! :C!#v.Fsvv!&;Es1vE!$#&&A 5Q ?$OOSVQ,<=!  #fRj0 (Q!Q0D E#C0 QC(89::$ Mh
1a(89 
U  FF"',-!1-yZ 055!admmo5F5  QQBGGFOZ!HI  i
,
*  /R**,-../ .
 6s   /S "R3#S >,R9*S 6!R? R?8C
S +S /AS ;"SS 8,S
$S 0!S S2C
S =+S )S  S8 S=3"S S54S5c           
      X   dgdz  \  }}t        t        | j                              D ]  }|j                  |j                  z   dz  | j                  |   d   k  s4|j                  |j                  z   dz  | j                  |   d   kD  sdg }	| j
                  D ]  }
|
d   |j                  k  r|
d   |j                  k\  ry|
d   |j                  k  r|j                  n|
d   }|
d   |j                  k\  r|j                  n|
d   }|	j                  t        ||z
        t        |
d   |
d   z
        z         |	j                  d        t        t        t        d |	                  dk(  rx|j                         j                  d      }|j                  |j                  f}| j
                  d   d   | j
                  d   d   f}t        j                  | d| d	|        |}|	j!                  t#        |	            } n dgd
z  \  }}}}|j                  | j                  |   d   kD  r(t        |j                  | j                  |   d   z
        }|j                  | j                  |   d   k  r(t        |j                  | j                  |   d   z
        }|j                  | j
                  |   d   k  r(t        |j                  | j
                  |   d   z
        }|j                  | j
                  |   d   kD  r(t        |j                  | j
                  |   d   z
        }t        |j                  |j                  z
        dk(  rdn!t        |j                  |j                  z
        }t        |j                  |j                  z
        dk(  rdn!t        |j                  |j                  z
        }||z  }|||z   z  |||z   z  z   |z  }|rt%        | ||||      |fS |r||t'        |j(                  ||      fg|fS ||t+        |j                         |      fg|fS )a}  Gets indices of the table cell where given text object lies by
    comparing their y and x-coordinates.

    Parameters
    ----------
    table : camelot.core.Table
    t : object
        PDFMiner LTTextLine object.
    direction : string
        Direction of the PDFMiner LTTextLine object.
    split_text : bool, optional (default: False)
        Whether or not to split a text line if it spans across
        multiple cells.
    flag_size : bool, optional (default: False)
        Whether or not to highlight a substring using <s></s>
        if its size is different from rest of the string. (Useful for
        super and subscripts)
    strip_text : str, optional (default: '')
        Characters that should be stripped from a string before
        assigning it to a cell.

    Returns
    -------
    indices : list
        List of tuples of the form (r_idx, c_idx, text) where r_idx
        and c_idx are row and column indices.
    error : float
        Assignment error, percentage of text area that lies outside
        a cell.
        +-------+
        |       |
        |   [Text bounding box]
        |       |
        +-------+

    r   r   r   r   r"   c                     | dk7  S )Nr   r|   )r   s    r   <lambda>z!get_table_index.<locals>.<lambda>  s
    b r,   
 z does not lie in column range    r         ?)r  r  r  )ranger   r  r   r   r  r~   r   r   r   r   filterr   r   warningswarnindexr   r*  r  r  r   )r  r   r  
split_textr  r  r  r%  r  lt_col_overlapr  leftr  r   
text_range	col_range	y0_offset	y1_offset	x0_offset	x1_offsetXYchareaerrors                           r   get_table_indexrD    s   N 4!8LE53uzz?# DD144K3Aq!11qttadd{c6IEJJM

M 7  NZZ .Q4144<AaDADDL#$Q4144<144QqTD$%aDADDLADDadE"))#dUl*;c!A$1+>N*NO"))"-. 40.ABCqHzz|))$/ddADD\
"ZZ]1-uzz"~a/@A	fAj\)G	{S E"((^)<=E+0 34q.Iy)Yttejj""uzz%0334	ttejj""uzz%0334	ttejj""uzz%0334	ttejj""uzz%0334	144!$$;3&Cqtt,<A144!$$;3&Cqtt,<AUF9y()a9y3H.IJfTEq)yZ 	
 	
  &qww	jQ 	 	 E:ajjlJ#GHI5PPr,   c                     d}	 d}t        | D cg c]  }|d   	 c}      |k7  rt        d      | D ]+  }|d   t        |d         z  }|d   D ]  }||d|z
  z  z  } - 	 |S c c}w # t        $ r d}Y |S w xY w)aw  Calculates a score based on weights assigned to various
    parameters and their error percentages.

    Parameters
    ----------
    error_weights : list
        Two-dimensional list of the form [[p1, e1], [p2, e2], ...]
        where pn is the weight assigned to list of errors en.
        Sum of pn should be equal to 100.

    Returns
    -------
    score : float

    d   r   z&Sum of weights should be equal to 100.r"   )r   r\   r   ZeroDivisionError)error_weights	SCORE_VALscoreewweighterror_percentages         r   compute_accuracyrN    s      I	."1./9<EFF 	9BUSAZ'F$&qE 9 1'7#7889	9 L /  Ls"   A( A#AA( #A( (A76A7c           	          d}g g }}| D ]"  }|D ]  }|j                         dk(  s|dz  } $ d|t        t        |       t        | d         z        z  z  }|S )zCalculates the percentage of empty strings in a
    two-dimensional list.

    Parameters
    ----------
    d : list

    Returns
    -------
    whitespace : float
        Percentage of empty cells.

    r   r   r"   rF  )r   floatr   )r  
whitespacer_nempty_cellsc_nempty_cellsr  r   s         r   compute_whitespacerT  /  ss     J%'NN   	 AwwyBa
	   
U3q6C!I+=%>>?Jr,   c           
         t        | d      5 }t        |      }	t        |	      }
|
j                  st	        d|        t        |||||||      }t               }t        ||      }t        ||      }t        j                  |
      D ]E  }|j                  |       |j                         }|j                  d   }|j                  d   }||f}G fcddd       S # 1 sw Y   yxY w)aM  Returns a PDFMiner LTPage object and page dimension of a single
    page pdf. To get the definitions of kwargs, see
    https://pdfminersix.rtfd.io/en/latest/reference/composable.html.

    Parameters
    ----------
    filename : string
        Path to pdf file.
    line_overlap : float
    char_margin : float
    line_margin : float
    word_margin : float
    boxes_flow : float
    detect_vertical : bool
    all_texts : bool

    Returns
    -------
    layout : object
        PDFMiner LTPage object.
    dim : tuple
        Dimension of pdf page in the form (width, height).

    rbz Text extraction is not allowed: )line_overlapchar_marginline_marginword_margin
boxes_flowdetect_vertical	all_texts)laparamsr   r   N)openr   r   is_extractabler   r   r	   r   r
   r   create_pagesprocess_page
get_resultr   )r?   rW  rX  rY  rZ  r[  r\  r]  r@   parserdocumentr^  rsrcmgrdeviceinterpreterpagelayoutr   r   dims                       r   get_page_layoutrl  G  s    D 
h	 1v&&&-28*=  %###!+
 %&"7X>(&9((2 	"D$$T*&&(FKKNE[[^F&/C	" s{3  s   CCC&c                    |dk(  rt         }n#|dk(  rt        }n|dk(  rt        }n|dk(  rt        }|g }	 | j                  D ]0  }t        |      r|j                  |       !|t        ||      z  }2 	 |S # t        $ r Y |S w xY w)a  Recursively parses pdf layout to get a list of
    PDFMiner text objects.

    Parameters
    ----------
    layout : object
        PDFMiner LTPage object.
    ltype : string
        Specify 'char', 'lh', 'lv' to get LTChar, LTTextLineHorizontal,
        and LTTextLineVertical objects respectively.
    t : list

    Returns
    -------
    t : list
        List of PDFMiner text objects.

    charimager   r   )ltype)	r   r   r   r   r  r   r   get_text_objectsAttributeError)rj  rp  r   LTObjectrC   s        r   rq  rq    s    & 	'		#	#'	/	!%y<< 	8C#x(%c77		8 H  Hs   ?A7 7	BB)rc   )r   )r   )Fr   )FFr   )      ?r1  rt  g?rt  TT)rn  N)Kr8   r   r#   r=   r%   r1   r4  	itertoolsr   operatorr   numpyr   pdfminer.pdfparserr   pdfminer.pdfdocumentr   pdfminer.pdfpager   r   pdfminer.pdfinterpr	   r
   pdfminer.converterr   pdfminer.layoutr   r   r   r   r   r   urllib.requestr   r   urllib.parser   r   r   r   r   rY   r   r   r    r+   rF   rd   re   rg   rk   objectrm   r   r   r   r   r   r   r   rP  r   r   boolr   r   r   r   r  r*  rD  rN  rT  rl  rq  r|   r,   r   <module>r     sU   
 	         ( , $ 8 1 1 0  , . @ @ -+-;<   B &6 A3	! !""B96x>"J'Te 4-U - Sd S".4 ."841+hsn HJbQJ<4 ;|%r,   