
    :Qgm              
          d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
 Zdedee   fdZdedee   fdZdeeeeef   dedeeeeef   fdZ eddg      defd       Zy)    N)BinaryIOListTuple)PDFPageAggregator)LAParamsLTContainerLTImageLTItem
LTTextLine)PDFPageInterpreterPDFResourceManager)PDFPage)PSSyntaxError)logger)requires_dependenciesc                  d    t               } t               }t        | |      }t        | |      }||fS )N)laparams)r   r   r   r   )rsrcmgrr   deviceinterpreters       l/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/partition/pdf_image/pdfminer_utils.pyinit_pdfminerr      s4     "GzHw:F$Wf5K;    parent_objectreturnc                     g }t        | t              r|j                  |        |S t        | t              r!| D ]  }|j	                  t        |              |S )zPRecursively extracts image objects from a given parent object in a PDF document.)
isinstancer	   appendr   extendextract_image_objectsr   objectschilds      r   r    r       sX    G-)}%
 N	 
M;	/" 	9ENN078	9 Nr   c                     g }t        | t              r|j                  |        |S t        | t              r!| D ]  }|j	                  t        |              |S )zORecursively extracts text objects from a given parent object in a PDF document.)r   r   r   r   r   extract_text_objectsr!   s      r   r%   r%   %   sX    G-,}%
 N	 
M;	/" 	8ENN/67	8 Nr   rectheightc                 0    | \  }}}}||z
  }||z
  }||||fS )ab  
    Converts a PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
    coordinate system where the vertical axis is measured from the top of the page.

    Args:
        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
            coordinates (x1, y1, x2, y2).
        height (float): The height of the page in the specified coordinate system.

    Returns:
        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
        (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
     )r&   r'   x1y2x2y1s         r   rect_to_bboxr.   2   s2    " NBB	"B	"BBr   pikepdfpypdffpc              #     K   ddl }ddlm} t               \  }}t	        j
                         5 }t        j                  j                  |d      }	 t        j                  |       }t        |      D ]-  \  }}		 |j                  |	       |j                         }
|	|
f / 	 ddd       y# t        $ r t        j                   d       t        j                   d|dz    d        || |	      }|j"                  j%                  |      5 }|j'                  |       ddd       n# 1 sw Y   nxY wt)        t        j                  t%        |d
                  }	|j                  |	       |j                         }
Y w xY w# t        $ r t        j                   d       t        j                   d       |j"                  j%                  |       5 }|j'                  |       ddd       n# 1 sw Y   nxY wt        j                  t%        |d
            }|D ])  }	|j                  |	       |j                         }
|	|
f + Y w xY w# 1 sw Y   yxY ww)zTOpen PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.r   N)get_page_datatmp_filez2Detected invalid dictionary construct for PDFminerzRepairing the PDF page    z ...)page_numberrbzRepairing the PDF document ...)r/   ,unstructured.partition.pdf_image.pypdf_utilsr3   r   tempfileTemporaryDirectoryospathjoinr   	get_pages	enumerateprocess_page
get_resultr   r   infoPdfopensavenext)r1   r/   r3   r   r   tmp_dir_pathtmp_file_pathpagesipagepage_layouterror_page_datapdfs                r   open_pdfminer_pages_generatorrO   I   s     J'/FK		$	$	&  (,\:>	(%%b)E$U+ (46,,T2"("3"3"5K K''!( (  ( % 
6KK TUKK"9!A#d CD&3BA&FO ))/: 0c/0 0 0 1 1$}d2K LMD,,T2"("3"3"5K
6  
	(KKLMKK89!!"% ('( ( (%%d=$&?@E (((.$//1K''(
	(- (  (s   ,I!I&E?7!B*E? I!	I*AE<	D$		E<$D-)AE<9E?;E<<E??AIG(	I(G1-AIIIIII)r;   r9   typingr   r   r   pdfminer.converterr   pdfminer.layoutr   r   r	   r
   r   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.pdfparserr   unstructured.loggerr   unstructured.utilsr   r   r    r%   floatr.   rO   r)   r   r   <module>rY      s    	  ( ( 0 N N E $ , & 4
 
DM 

 
4
3C 

ueU*
+ 5%%&. 	7+,*(*( -*(r   