
    :Qgd                       d dl mZ d dlmZmZmZmZmZmZm	Z	 d dl
Zd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZmZmZm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) erd dlm*Z* d dl+m,Z, dZ-dZ.	 	 d+	 	 	 	 	 d,dZ/ e)d      	 	 d-	 	 	 	 	 d.d       Z0d Z1e.fd/dZ2e.f	 	 	 	 	 d0dZ3de.f	 	 	 	 	 d1dZ4de.fd1dZ5 e)d      	 	 	 	 	 	 	 	 d2d       Z6d3dZ7 e)d      	 d4	 	 	 	 	 d5d       Z8	 	 	 	 	 	 d6d Z9d7d!Z:	 	 	 	 	 	 	 	 	 	 d8d"Z;	 	 	 	 	 	 	 	 	 	 d9d#Z<d:d$Z=	 	 	 	 	 	 	 	 	 	 d;d%Z>	 	 	 	 	 	 d<d&Z?d=d'Z@	 	 	 	 	 	 d>d(ZAd?d)ZBd@d*ZCy)A    )annotations)TYPE_CHECKINGAnyBinaryIOListOptionalUnioncastN)LTChar	LTTextBox)	PDFObjRef)open_filename)	Rectangle)
PixelSpace
PointSpace)CoordinatesMetadata)remove_control_characters)extract_image_objectsextract_text_objectsopen_pdfminer_pages_generatorrect_to_bbox)
env_config)SORT_MODE_BASICSource)sort_text_regions)requires_dependencies)
TextRegion)DocumentLayoutg{Gz?   c                    t        | d      5 }t        t        |      }t        ||      \  }}||fcd d d        S # 1 sw Y   y xY w)Nrb)filedpi)r   r
   r   process_data_with_pdfminer)filenamer#   fpextracted_layoutlayouts_linkss        q/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/partition/pdf_image/pdfminer_processing.pyprocess_file_with_pdfminerr*   #   sQ     
x	& /"(B*D+
'-  ./ / /s	   $;Aunstructured_inferencec           
        ddl m}m} g }g }|dz  }t        t	        |             D ]  \  }\  }}	|	j
                  |	j                  }}
g }g }g }t        |
|      }|j                  rt        |j                  |||      }t        j                  }g }|	D ]  }t        |j                  |      \  }}}}||||f}t        |      dkD  rOt        |t               r?t#        ||||      }t%        ||      \  }}|D ]  }|j'                  t)        ||              t+        |d      rt-        |      }|D ]  }|j/                         }t1        g t        |j                  |      ||t2        j4                  | }|j                  U|j                  j6                  dkD  so|j'                  |        t9        |      } | D ]o  }!t1        g t        |!j                  |      |dt2        j4                  | }|j                  E|j                  j6                  dkD  s_|j'                  |       q  |D "#cg c]&  }"|"d   D #cg c]  }#|#|z  	 c}#|"d   |"d	   |"d
   d( }$}"}#t;        |t        j<                        }%t;        |t        j>                        }&g |%|&}'tA        |'tB              }'tA        |'      }'|j'                  |'       |j'                  |$        ||fS c c}#w c c}#}"w )zzLoads the image and word objects from a pdf using pdfplumber and the image renderings of the
    pdf pages using pdf2imager   )EmbeddedTextRegionImageTextRegionH   )widthheightget_textNbboxtexturistart_index)r3   r4   urlr6   )")unstructured_inference.inference.elementsr-   r.   	enumerater   r0   r1   r   annotsget_urisr   PDF_ANNOTATION_THRESHOLDr   r3   len
isinstancer    check_annotations_within_elementget_words_from_objappendmap_bbox_and_indexhasattrr   r2   _create_text_regionr   PDFMINERarear   remove_duplicate_elements#EMBEDDED_TEXT_SAME_REGION_THRESHOLD$EMBEDDED_IMAGE_SAME_REGION_THRESHOLDr   r   )(r"   r#   r-   r.   layoutsr(   coefpage_numberpagepage_layoutr0   r1   text_layoutimage_layoutannotation_listcoordinate_systemannotation_thresholdurls_metadataobjx1y1x2y2r3   annotations_within_element_wordsannotinner_text_objects	inner_obj_texttext_regioninner_image_objectsimg_objmetadataxlinksclean_text_layoutclean_image_layoutlayouts(                                           r)   r$   r$   0   sb   
 GM8D,56STX6Y,Z Q$((dK#));+=+=v&
 ;;&t{{F<M{[O)BB.0 '	9C)#((F;NBBB#D?#a'JsI,F-M#(	.* .c6:57 KE!(();E5)IJK sJ'%9#%>"!3 
8I%..0E"5 #%innf=## # 	#
 +#K #''38H8H8M8MPQ8Q#**;7
8 '<C&@#2 	9G"5 #%gllF;## # 	#
 (#K #''38H8H8M8MPQ8Q$++K8	9='	9^ *
  ,4F+;<aT< ('6	
 
 6GG
 7*II
 ;$:'9: #6?; #6*vU#cQ$d M!!1 =
s   9K*K%K*%K*c                H    |j                  | |z  ||z  ||z  ||z  ||      S )zECreates a text region of the specified class with scaled coordinates.)r4   source)from_coords)rV   rW   rX   rY   rK   r4   rk   region_classs           r)   rD   rD      s<    ##
T	
T	
T	
T	 $      c                   t        j                  t        |       dft         j                        }t	        |       D ]:  \  }}|j
                  |j                  |j                  |j                  g||ddf<   < |j                  |      S )z.convert a list of boxes's coords into np array   )dtypeN)
npzerosr=   float32r9   rV   rW   rX   rY   round)bboxesround_tocoordsir3   s        r)   get_coords_from_bboxesrz      sp     XXs6{A&bjj9FV$ <4$''477;q!t< <<!!rn   c           
        t        j                  | dd      \  }}}}t        j                  |dd      \  }}}	}
t        j                  t        j                  |t        j                  |	            t        j                  |t        j                  |            z
  dz   d      t        j                  t        j                  |t        j                  |
            t        j                  |t        j                  |            z
  dz   d      z  }||z
  dz   ||z
  dz   z  }|	|z
  dz   |
|z
  dz   z  }|j                  |      |j                  |      |j                  |      fS )zHcompute intersection area and own areas for two groups of bounding boxesrp      axisr   )rr   splitmaximumminimum	transposeru   )coords1coords2rw   x11y11x12y12x21y21x22y22
inter_area	boxa_area	boxb_areas                 r)   $areas_of_boxes_and_intersection_arear      s+    '115Cc3'115Cc3	Cc*	+bjjbll3>O.P	PST	TWX


BJJsBLL$56CVYIZ9[[^__bcdeJ sQ39q=1IsQ39q=1IH%yx'@)//RZB[[[rn         ?c                    t        | |      }t        ||      }t        |||      \  }}}|t        j                  |t              z  |kD  ||j
                  k  z  S )zacompute if each element from bboxes1 is almost a subregion of one or more elements in
    bboxes2rw   rz   r   rr   r   EPSILON_AREAT)	bboxes1bboxes2	thresholdrw   r   r   r   r   r   s	            r)   &bboxes1_is_almost_subregion_of_bboxes2r      sd    
 %Wx@G$Wx@G'K8($J	9 I|<<yHY[[  rn   c                    t        | |      }t        |||      \  }}}|t        j                  t        ||j
                  z   |z
        z  |kD  S )z#compute iou for a group of elementsr   r   )rv   r   rw   rx   r   r   r   s          r)   boxes_self_iour      sR    #FX>F'K($J	9 L)ikk2IJ2VWW[dddrn   c                   ddl m} ddlm} | j                  }t        t        ||            D ]  \  }\  }}|j                  }	|j                  }
|
j                  d      }|
j                  d      }||f}i }t        |j                  |      rd|j                  j                  vrddd} |d|	||d	|}t        t        t        d
   |      t               }g }|D ]S  }|j"                  t%        t        d
|      |      }n|j"                  }t'        |      |_        |j)                  |       U ||j                  dd  | S )z1Merge an inferred layout with an extracted layoutr   )+merge_inferred_layout_with_extracted_layout)UnstructuredDetectronONNXModelr0   r1   R_50r   )same_region_thresholdsubregion_threshold)inferred_layoutr'   page_image_sizer   N)ra   pdf_objects ).unstructured_inference.inference.layoutelementr   ,unstructured_inference.models.detectron2onnxr   pagesr9   zipelementsimage_metadatagetr>   detection_model
model_pathr   r
   r   r   r4    aggregate_embedded_text_by_blockr   rA   )inferred_document_layoutr'   hi_res_model_name"merge_inferred_with_extracted_pager   inferred_pagesry   inferred_pageextracted_page_layoutr   r   wh
image_sizethreshold_kwargsmerged_layoutr   	layout_elr4   s                      r)   $merge_inferred_with_extracted_layoutr      sm    \-33N5>N,-6 '-11M0 (00&55w'x(V
 }446TUm;;FFF9<UXY: 
+2&
 	
 *$tL/A=*QSbc& 		'I~~%7 $\9 = 5
 !~~6t<INOOI&		' %-q!O'-R $#rn   c                ^   | j                   D ]  }|j                  D cg c],  }|j                  t        j                  k7  s!|j
                  . }}g }i }d}t        |j                        D ]H  \  }}|j                  t        j                  k7  r$|j                  |j
                         |||<   |dz  }J t        ||t        j                        j                  d      dk(  }	t        |j                        D cg c]  \  }}||vs|	||      s| c}}|_         | S c c}w c c}}w )zClean pdfminer elements from inside tables.

    This function removes elements sourced from PDFMiner that are subregions within table elements.
    r   r|   r}   )r   r   rk   r   rE   r3   r9   rA   r   r   -EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLDsum)
documentrM   enon_pdfminer_element_boxeselement_boxeselement_to_subregion_mapsubregion_indicery   element&is_element_subregion_of_other_elementss
             r)   clean_pdfminer_inner_elementsr     s3     
6:mm%cqxxSYSbSbGbaff%c"%c#% #DMM2 	"JAw~~0  .*:$Q'!	" 3*HH cqck 	/ "$--0
122=>VWX>YZ	 
+
< O; &d(
s   "D$D$>D)c                    g }t        |       D ]   \  }}|j                  |j                         " t        ||      }g }t        |       D ]1  \  }}|||dz   df   j	                         r!|j                  |       3 |S )zMRemoves duplicate text elements extracted by PDFMiner from a document layout.r|   N)r9   rA   r3   r   any)r   r   rv   ry   r   ioufiltered_elementss          r)   rG   rG   :  s     F) $
7gll#$ 
+C) *
7q!a%'z?   )*
 rn   c                L   t        |D cg c]  }|j                   c}| j                  gt        j                        j	                  d      }dj                  t        |      D cg c]$  \  }}||   s|j                  s|j                  & c}}      }|S c c}w c c}}w )zgExtracts the text aggregated from the elements of the given layout that lie within the given
    block.r|   r}    )r   r3   r   r   r   joinr9   r4   )ra   r   rU   maskry   r4   s         r)   r   r   P  s     2()c)			@@ 
cqck	 	 88Ik,B]&!StAwSVS[S[SXX]^DK 	*
 ^s   B(B 
6B 
B 
c           	     B   | D cg c]  }t        |j                  d        }}t        ||g      }t        |      D cg c]L  \  }}t	        |      r<| |   j                  d      | |   j                  d      | |   j                  d      dN }}}|S c c}w c c}}w )Nr3   r4   r7   r6   )r4   r7   r6   )r   r   r   r9   r   )
page_linksregionlinklinks_bboxesresultsidxresultrf   s           r)   get_links_in_elementr   a  s    =GHTItxx/0HLH4\F8LG %W- Cv; sO''/c?&&u-%c?..}=	
E  L Is   B ABc                    t        | t              rt        | |||      S | j                         }|g S t        ||||      S )a  
    Extracts URI annotations from a single or a list of PDF object references on a specific page.
    The type of annots (list or not) depends on the pdf formatting. The function detectes the type
    of annots and then pass on to get_uris_from_annots function as a list.

    Args:
        annots (PDFObjRef | list[PDFObjRef]): A single or a list of PDF object references
            representing annotations on the page.
        height (float): The height of the page in the specified coordinate system.
        coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
            the annotations' coordinates.
        page_number (int): The page number from which to extract annotations.

    Returns:
        list[dict]: A list of dictionaries, each containing information about a URI annotation,
        including its coordinates, bounding box, type, URI link, and page number.
    )r>   listget_uris_from_annotsresolve)r:   r1   rR   rL   resolved_annotss        r)   r;   r;   r  sI    . &$#FF4E{SSnn&O	9JKXXrn   c           	        g }| D ]p  }t        |      }t        |t              s |j                  dd      }|rt        |t              st        |      dk7  rS|j                  dd      }|rt        |t              st        |      dk7  rt        ||      \  }	}
}}|	|
f|	|f||f||
ff}t        ||      }d|vrt        |d         }t        |t              sd}d|v r!t        |d   t              st        |d         }d}	 |d	k(  r&t        t        |d
               j                  d      }|dk(  r&t        t        |d               j                  d      }|j                  ||	|
||f|||d       s |S # t        $ r Y +w xY w)a  
    Extracts URI annotations from a list of PDF object references.

    Args:
        annots (list[PDFObjRef]): A list of PDF object references representing annotations on
            a page.
        height (int | float): The height of the page in the specified coordinate system.
        coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
            the annotations' coordinates.
        page_number (int): The page number from which to extract annotations.

    Returns:
        list[dict]: A list of dictionaries, each containing information about a URI annotation,
        including its coordinates, bounding box, type, URI link, and page number.
    SubtypeNz/'Link'Rectrp   )pointssystemASz/'URI'URIzutf-8z/'GoTo'D)coordinatesr3   typer5   rL   )try_resolver>   dictr   r   strr=   r   r   decode	ExceptionrA   )r:   r1   rR   rL   rQ   
annotationannotation_dictsubtyperectrV   rW   rX   rY   r   coordinates_metadatauri_dicturi_typer5   s                     r)   r   r     s   * O -

%j1/40!%%i6*Wi8CLI<U""640z$	2c$i1n%dF3BBr(RHr2hR92$ 

 o%s34(D)(?:hsmY#G8C=)H	8#!+huo">?FFwO9$!+hsm"<=DDWM 	3RR( *	
K-
\   		s   AE;;	FFc                F    	 | j                         S # t        $ r | cY S w xY w)z
    Attempt to resolve a PDF object reference. If successful, returns the resolved object;
    otherwise, returns the original reference.
    )r   r   )r]   s    r)   r   r     s'    
}} s      c                    g }| D ]C  }|d   |k(  st        |d         }|st        ||d         |z  |kD  s3|j                  |       E |S )a  
    Filter annotations that are within or highly overlap with a specified element on a page.

    Args:
        annotation_list (list[dict[str,Any]]): A list of dictionaries, each containing information
            about an annotation.
        element_bbox (tuple[float, float, float, float]): The bounding box coordinates of the
            specified element in the bbox format (x1, y1, x2, y2).
        page_number (int): The page number to which the annotations and element belong.
        annotation_threshold (float, optional): The threshold value (between 0.0 and 1.0)
            that determines the minimum overlap required for an annotation to be considered
            within the element. Default is 0.9.

    Returns:
        list[dict[str,Any]]: A list of dictionaries containing information about annotations
        that are within or highly overlap with the specified element on the given page, based on
        the specified threshold.
    rL   r3   )calculate_bbox_areacalculate_intersection_arearA   )rQ   element_bboxrL   rS   rZ   r   annotation_bbox_sizes          r)   r?   r?     sn    0 "$% >
m$3#6z&7I#J #+L*V:LMPdd&' +11*=> &%rn   c           	        g }g }d}| D ]j  }d}d\  }}}	}
d}t        |      D ]?  \  }}t        |t              r|j                  |       |j	                         }|r,|j                         s|j                  ||||	|
f|d       d}g|s|j                         }|r>|j                         k7  r+|j                         }|j                  ||||	|
f|d       d}t        |      dk(  r<||z   }|j                  }||j                  z
  }
|j                  }	||j                  z
  }n|j                  }	||j                  z
  }
||z  }%|j                  ||||	|
f|d       d}B |t        |      z  }m ||fS )a|  
    Extracts characters and word bounding boxes from a PDF text element.

    Args:
        obj (LTTextBox): The PDF text element from which to extract characters and words.
        height (float): The height of the page in the specified coordinate system.

    Returns:
        tuple[list[LTChar], list[dict[str,Any]]]: A tuple containing two lists:
            - list[LTChar]: A list of LTChar objects representing individual characters.
            - list[dict[str,Any]]]: A list of dictionaries, each containing information about
                a word, including its text, bounding box, and start index in the element's text.
    r    )NNNN)r4   r3   r6   )r9   r>   r   rA   r2   stripisalnumr=   x0y0rV   rW   )rU   r1   
charactersr\   text_len	text_linewordrV   rW   rX   rY   r6   index	charactercharr   s                   r)   r@   r@     s   " JEH +#	/BB )) 4 &	E9)V,!!), ))+

LL!%BB/?P[\ D "llnGDLLNg5"llnGLL!%BB/?P[\ Dt9>"*U"2K"B),,.B"B),,.B"B),,.B!BB+;KX M&	N 	C	N"W+#X urn   c                *   t        |       dk(  rd|d<   d|d<   |S t        j                  |d   d   t        j                  | D cg c]
  }|d   d    c}      z
  dz  |d   d   t        j                  | D cg c]
  }|d   d    c}      z
  dz  z         }t        j                  |d   d   t        j                  | D cg c]
  }|d   d    c}      z
  dz  |d   d	   t        j                  | D cg c]
  }|d   d	    c}      z
  dz  z         }t	        |      }t	        |      }d}||k\  r%t        ||dz         D ]  }|d
z  }|| |   d   z  } n| |   d   }|j                         |d<   | |   d   |d<   |S c c}w c c}w c c}w c c}w )aq  
    Maps a bounding box annotation to the corresponding text and start index within a list of words.

    Args:
        words (list[dict[str,Any]]): A list of dictionaries, each containing information about
            a word, including its text, bounding box, and start index.
        annot (dict[str,Any]): The annotation dictionary to be mapped, which will be updated with
        "text" and "start_index" fields.

    Returns:
        dict: The updated annotation dictionary with "text" representing the mapped text and
            "start_index" representing the start index of the mapped text in the list of words.
    r   r   r4   r6   r3      r|      r   )r=   rr   sqrtarray
try_argminranger   )	r\   r]   r  distance_from_bbox_startdistance_from_bbox_endclosest_startclosest_endr4   r[   s	            r)   rB   rB   K  s    5zQf!m!ww	vq	BHH%%H$d6l1o%HI	IaO=bhhE'JDVQ'JKKPQ
Q	R   WW	vq	BHH%%H$d6l1o%HI	IaO=bhhE'JDVQ'JKKPQ
Q	R 78M34K Dm#}kAo6 	%ACKDE!HV$$D	% ]#F+JJLE&M />E-L+ &I'J &I'Js   F8FF6Fc                    | \  }}}}|\  }}}}	t        ||      }
t        ||      }t        ||      }t        ||	      }|
|k  r||k  rt        |
|||f      }|S y)a  
    Calculate the area of intersection between two bounding boxes.

    Args:
        bbox1 (tuple[float, float, float, float]): The coordinates of the first bounding box
            in the format (x1, y1, x2, y2).
        bbox2 (tuple[float, float, float, float]): The coordinates of the second bounding box
            in the format (x1, y1, x2, y2).

    Returns:
        float: The area of intersection between the two bounding boxes. If there is no
        intersection, the function returns 0.0.
    g        )maxminr   )bbox1bbox2x1_1y1_1x2_1y2_1x1_2y1_2x2_2y2_2x_intersectiony_intersectionx2_intersectiony2_intersectionintersection_areas                  r)   r   r   v  s}    " #D$d"D$dt_Nt_N$oO$oO'N_,L/^_oN
 ! rn   c                *    | \  }}}}||z
  ||z
  z  }|S )a(  
    Calculate the area of a bounding box.

    Args:
        bbox (tuple[float, float, float, float]): The coordinates of the bounding box
            in the format (x1, y1, x2, y2).

    Returns:
        float: The area of the bounding box, computed as the product of its width and height.
    r   )r3   rV   rW   rX   rY   rF   s         r)   r   r     s(     NBBGR DKrn   c                ^    	 t        t        j                  |             S # t        $ r Y yw xY w)a;  
    Attempt to find the index of the minimum value in a NumPy array.

    Args:
        array (np.ndarray): The NumPy array in which to find the minimum value's index.

    Returns:
        int: The index of the minimum value in the array. If the array is empty or an
        IndexError occurs, it returns -1.
    r  )intrr   argmin
IndexError)r  s    r)   r  r    s-    299U#$$ s     	,,)r      )r%   r   r#   r*  return+tuple[List[List['TextRegion']], List[List]])Nr-  )r"   z Optional[Union[bytes, BinaryIO]]r#   r*  r.  r/  )rw   r*  r.  
np.ndarray)r   r0  r   r0  rw   r*  )r   floatrw   r*  r.  r0  )r   'DocumentLayout'r'   zList[List['TextRegion']]r   r   r.  r2  )r   r2  r.  r2  )r   )r   list['TextRegion']r   r1  r.  r3  )ra   z'TextRegion'r   r3  r.  r   )r   r   r   r   r.  r   )
r:   zPDFObjRef | list[PDFObjRef]r1   r1  rR   PixelSpace | PointSpacerL   r*  r.  list[dict[str, Any]])
r:   zlist[PDFObjRef]r1   zint | floatrR   r4  rL   r*  r.  r5  )r]   r   )
rQ   r5  r   !tuple[float, float, float, float]rL   r*  rS   r1  r.  r5  )rU   r   r1   r1  r.  z)tuple[list[LTChar], list[dict[str, Any]]])r\   r5  r]   zdict[str, Any])r  r6  r  r6  r.  r1  )r3   r6  r.  r1  )r  r0  r.  r*  )D
__future__r   typingr   r   r   r   r   r	   r
   numpyrr   pdfminer.layoutr   r   pdfminer.pdftypesr   pdfminer.utilsr   r8   r   "unstructured.documents.coordinatesr   r   unstructured.documents.elementsr   0unstructured.partition.pdf_image.pdf_image_utilsr   /unstructured.partition.pdf_image.pdfminer_utilsr   r   r   r   #unstructured.partition.utils.configr   &unstructured.partition.utils.constantsr   r   $unstructured.partition.utils.sortingr   unstructured.utilsr   r   'unstructured_inference.inference.layoutr   r   DEFAULT_ROUNDr*   r$   rD   rz   r   r   r   r   r   rG   r   r   r;   r   r   r?   r@   rB   r   r   r  r   rn   r)   <module>rG     s   " L L L  - ' ( ? E ? V  ; J B 4DF  
/
/	
/ 1
/ /0-1b"
*b"	b" 1b" 1b"J	 4A " ?L\\",\8;\" *-m!&8;" /2= e /06$.6$.6$ 6$ 	6$ 16$r$N /0    1*# 	""Y'YY /Y 	Y
 Y>DDD /D 	D
 DN!&)!&3!& !&  	!&
 !&HA	AA /AH(V,, D rn   