
    Ig_                    ~   d Z ddlmZ ddlZddlmZmZmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlZddlmZ ddlmZ ddlmZ er ddlZddlZddlZddlZddl Z!dd	lm"Z" dd
l#m$Z$ g dZ%g dZ&	 	 	 	 ddZ' G d de      Z( G d de      Z) G d de      Z* G d de      Z+ G d de      Z, G d de      Z- G d de      Z.y)z(Module contains common parsers for PDFs.    )annotationsN)	TYPE_CHECKINGAnyDictIterableIteratorMappingOptionalSequenceUnion)urlparse)Document)BaseBlobParser)Blob)
PageObject)TextLinearizationConfig)	DCTDecodeDCT	JPXDecode)	LZWDecodeLZWFlateDecodeFlASCII85DecodeA85ASCIIHexDecodeAHxRunLengthDecodeRLCCITTFaxDecodeCCFJBIG2Decodec                    	 ddl m}  |       }d}| D ]6  } ||      \  }}|s|D cg c]  }|d   	 }}dj                  |      z  }8 |S # t        $ r t        d      w xY wc c}w )zExtract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    r   )RapidOCRzc`rapidocr-onnxruntime` package not found, please install it with `pip install rapidocr-onnxruntime`    
)rapidocr_onnxruntimer$   ImportErrorjoin)imagesr$   ocrtextimgresult_s          m/var/www/html/answerous/venv/lib/python3.12/site-packages/langchain_community/document_loaders/parsers/pdf.py!extract_from_images_with_rapidocrr2   5   s    
1 *CD &H	*01$d1g1F1DIIf%%D	&
 K  
1
 	

 2s   A A&A#c                  D    e Zd ZdZ	 	 dddd	 	 	 	 	 	 	 d	dZd
dZddZy)PyPDFParserzLoad `PDF` using `pypdf`Nplain)extraction_modeextraction_kwargsc               D    || _         || _        || _        |xs i | _        y N)passwordextract_imagesr6   r7   )selfr:   r;   r6   r7   s        r1   __init__zPyPDFParser.__init__W   s(     !,.!2!8b    c              #     K   	 ddl d	 fd}|j                         5 } j                  | j                        }t        |j                        D cg c]8  \  }}t         ||       j                  |      z   |j                  |d      : c}}E d{    ddd       y# t        $ r t        d      w xY wc c}}w 7 +# 1 sw Y   yxY ww)
Lazily parse the blob.r   NzE`pypdf` package not found, please install it with `pip install pypdf`c                    j                   j                  d      r| j                         S  | j                  ddj                  ij                  S )zM
            Extract text from image given the version of pypdf.
            3r6    )__version__
startswithextract_textr6   r7   )pagepypdfr<   s    r1   _extract_text_from_pagez7PyPDFParser.lazy_parse.<locals>._extract_text_from_pagen   sY       ++C0((**(t(( $($8$8,, r>   )r:   )rG   sourcerG   page_contentmetadata)rG   z'PageObject'returnstr)
rH   r)   as_bytes_io	PdfReaderr:   	enumeratepagesr   _extract_images_from_pagerK   )r<   blobrI   pdf_file_obj
pdf_readerpage_numberrG   rH   s   `      @r1   
lazy_parsezPyPDFParser.lazy_parsed   s     	
	  
	<(NJ *3:3C3C)D &K !8d!C44T:";(,[I  
	 
	%  	& 	* 
	 
	sP   CB* C7C
=CC
CC
!	C*B??CC

CCc                4   | j                   rd|d   j                         vry|d   d   j                         }g }|D ]  }||   d   dk(  s||   d   dd t        v rg||   d	   ||   d
   }}|j	                  t        j                  ||   j                         t
        j                        j                  ||d             ||   d   dd t        v r#|j	                  ||   j                                t        j                  d        t        |      S )8Extract images from page and get the text with RapidOCR.z/XObjectz
/Resourcesr%   z/Subtypez/Imagez/Filterr&   Nz/Heightz/WidthdtypeUnknown PDF Filter!)r;   keys
get_object_PDF_FILTER_WITHOUT_LOSSappendnp
frombufferget_datauint8reshape_PDF_FILTER_WITH_LOSSwarningswarnr2   )r<   rG   xObjectr+   objheightwidths          r1   rU   z%PyPDFParser._extract_images_from_page   s   ""j\8J8O8O8Q&Q|$Z0;;= 	9Cs|J'833<	*12.2JJ$+CL$;WS\(=SEFMMgcl&;&;&=RXXNVV"E2
 S\),QR04IIMM'#,"7"7"9:MM"78	9 188r>   NF)r:   zOptional[Union[str, bytes]]r;   boolr6   rP   r7   zOptional[Dict[str, Any]]rV   r   rO   Iterator[Document])rG   zpypdf._page.PageObjectrO   rP   __name__
__module____qualname____doc__r=   rZ   rU   rC   r>   r1   r4   r4   T   sP    " 15$9
  '6:9-9 9
 9 49 D9r>   r4   c                  0    e Zd ZdZdddd	dZd
dZddZy)PDFMinerParserzParse `PDF` using `PDFMiner`.T)concatenate_pagesc                    || _         || _        y)a$  Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into one a single
                               document. Otherwise, return one document per page.
        N)r;   r|   )r<   r;   r|   s      r1   r=   zPDFMinerParser.__init__   s     -!2r>   c              #  d  K   | j                   s	 ddlm} |j	                         5 }| j
                  r& ||      }d|j                  i}t        ||       n\ddlm	} |j                  |      }t        |      D ]7  \  }}	 |||g      }|j                  t        |      d}t        ||       9 d	d	d	       y	dd	l}
dd
lm}m} ddlm} ddlm}m} ddlm	} |
j-                         }|j	                         5 }|j                  |      } |       } ||| |             } || |             } |||      } |||      }t        |      D ]  \  }}|j/                  |       |j/                  |       |j1                         | j3                  |j5                               z   }|j7                  d       |j9                  d       |j                  t        |      d}t        ||        	 d	d	d	       y	# t        $ r t        d      w xY w# 1 sw Y   y	xY w# 1 sw Y   y	xY ww)r@   r   )rF   zO`pdfminer` package not found, please install it with `pip install pdfminer.six`rK   rL   )PDFPage)page_numbersrJ   N)PDFPageAggregatorTextConverter)LAParams)PDFPageInterpreterPDFResourceManager)laparams)r;   pdfminer.high_levelrF   r)   rQ   r|   rK   r   pdfminer.pdfpager   	get_pagesrS   rP   iopdfminer.converterr   r   pdfminer.layoutr   pdfminer.pdfinterpr   r   StringIOprocess_pagegetvaluerU   
get_resulttruncateseek)r<   rV   rF   rW   r-   rN   r   rT   ir0   r   r   r   r   r   r   text_iorsrcmgrdevice_for_textdevice_for_imageinterpreter_for_textinterpreter_for_imagerG   contents                           r1   rZ   zPDFMinerParser.lazy_parse   s)     ""< !!# M|))'5D ($++6H"xHH8#--l;E )% 0 M1+LsK.2kk3q6#J&D8LLMM M K0Q0kkmG!!# L|)),7,."/8:"V#4Wxz#R '9'?'S$(:7DT(U%(/ 	LGAt(55d;)66t<%..043Q3Q(3354 G $$Q'LLO*.++s1vFH"(KK	LL L9  !1 M M,L LsM   H0H  H0BH5A	H0>C8H$7	H0 HH0H!H0$H-)H0c           	     p   ddl d
fdg }t        t        t        t	        |                  D ]  }|j
                  d   j                  t        v rx|j                  t        j                  |j
                  j                         t        j                        j                  |j
                  d   |j
                  d   d             |j
                  d   j                  t        v r*|j                  |j
                  j                                t        j                   d	        t#        |      S )r\   r   Nc                    t        | j                  j                        r| S t        | j                  j                        r| D ]  } |      c S  y y r9   )
isinstancelayoutLTImageLTContainer)layout_objectchild	get_imagepdfminers     r1   r   z;PDFMinerParser._extract_images_from_page.<locals>.get_image   sP    -)@)@A$$-)D)DE* ,E$U++, r>   Filterr]   HeightWidthr_   r`   )r   r   rO   r   )r   listfilterrr   mapstreamnamerc   rd   re   rf   rg   rh   ri   rj   rk   rl   r2   )r<   rG   r+   r.   r   r   s       @@r1   rU   z(PDFMinerParser._extract_images_from_page   s    	 tSD%9:; 
	5Czz(#((,DDMM#**"5"5"7rxxHPP

8,cjj.A2
 H%**.CCcjj113434
	5 188r>   NF)r;   rr   r|   rr   rs   )rG   zpdfminer.layout.LTPagerO   rP   ru   rC   r>   r1   r{   r{      s    '	3RV 	32Lh9r>   r{   c                  t    e Zd ZdZ	 	 d	 	 	 	 	 d	dZd
dZ	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZy)PyMuPDFParserzParse `PDF` using `PyMuPDF`.Nc                (    |xs i | _         || _        y)z~Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
        N)text_kwargsr;   )r<   r   r;   s      r1   r=   zPyMuPDFParser.__init__   s     ',",r>   c              #  \  K   ddl }|j                         5 }|j                   |j                  |      }n |j                  |d      }|D cg c]1  }t	        | j                  |||      | j                  |||            3 c}E d{    ddd       yc c}w 7 # 1 sw Y   yxY ww)r@   r   Npdf)r   filetyperL   )fitzrQ   dataopenr   _get_page_content_extract_metadata)r<   rV   r   	file_pathdocrG   s         r1   rZ   zPyMuPDFParser.lazy_parse  s      	 	9yy dii	*diiy5A  
 	 !%!7!7T4!H!33CtD  	 	 	 	s:   B,8B 6BB BB 	B,B  B)%B,c                     |j                   di | j                  | j                  ||      z   }|s/t        j                  d|j
                   d|j                          |S )zq
        Get the text of the page using PyMuPDF and RapidOCR and issue a warning
        if it is empty.
        zWarning: Empty content on page z of document rC   )get_textr   rU   rk   rl   numberrK   )r<   r   rG   rV   r   s        r1   r   zPyMuPDFParser._get_page_content   sg      $--3$"2"23d6T6T7
 
 MM1;;-}T[[M;
 r>   c                   t        |j                  |j                  |j                  t        |      dfi |j                  D ci c]5  }t        |j                  |   t        t        f      r||j                  |   7 c}S c c}w )z,Extract metadata from the document and page.rK   r   rG   total_pages)dictrK   r   lenrN   r   rP   int)r<   r   rG   rV   ks        r1   r   zPyMuPDFParser._extract_metadata3  s{     ++![["3x	
 cll1oSz: 3<<?"
 	
s   :Bc                b   | j                   syddl}|j                         }g }|D ]}  }|d   } |j                  ||      }|j	                  t        j                  |j                  t
        j                        j                  |j                  |j                  d              t        |      S )r\   r%   r   Nr]   r_   )r;   r   
get_imagesPixmaprd   re   rf   samplesrh   ri   ro   rp   r2   )	r<   r   rG   r   img_listimgsr.   xrefpixs	            r1   rU   z'PyMuPDFParser._extract_images_from_pageE  s     ""??$ 	Cq6D$++c4(CKKckk:BBJJ		2	 166r>   rq   )r   Optional[Mapping[str, Any]]r;   rr   rO   Noners   )r   fitz.fitz.DocumentrG   fitz.fitz.PagerV   r   rO   rP   )r   r   rG   r   rV   r   rO   r   )r   r   rG   r   rO   rP   )	rv   rw   rx   ry   r=   rZ   r   r   rU   rC   r>   r1   r   r      s    & 48$-0- - 
	-&%-;CG	&
%
-;
CG
	
$7%7-;7	7r>   r   c                  *    e Zd ZdZdddZddZd	dZy)
PyPDFium2ParserzParse `PDF` with `PyPDFium2`.c                L    	 ddl }|| _        y# t        $ r t        d      w xY w)zInitialize the parser.r   NzKpypdfium2 package not found, please install it with `pip install pypdfium2`)	pypdfium2r)   r;   )r<   r;   r   s      r1   r=   zPyPDFium2Parser.__init__]  s7    	 -  	+ 	s    #c              #    K   ddl }|j                         5 } |j                  |d      }	 t        |      D ]z  \  }}|j	                         }|j                         }|j                          |d| j                  |      z   z  }|j                          |j                  |d}	t        ||	       | 	 |j                          	 ddd       y# |j                          w xY w# 1 sw Y   yxY ww)r@   r   NT)	autocloser'   rJ   rL   )
r   rQ   PdfDocumentrS   get_textpageget_text_rangecloserU   rK   r   )
r<   rV   r   r   rX   rY   rG   	text_pager   rN   s
             r1   rZ   zPyPDFium2Parser.lazy_parseh  s       	#9...yDIJ
#)2:)> L%K $ 1 1 3I'668GOO%td&D&DT&JJJGJJL*.++{KH"(KKL   "	# 	#   "	# 	#s4   C1C%BC6C%	C1C""C%%C.*C1c                    | j                   syddlm} t        |j	                  |j
                  f            }t        t        d |            }t        |      S )r\   r%   r   N)r   c                >    | j                         j                         S r9   )
get_bitmapto_numpy)xs    r1   <lambda>z;PyPDFium2Parser._extract_images_from_page.<locals>.<lambda>  s    ALLN$;$;$= r>   )r;   pypdfium2.rawrawr   get_objectsFPDF_PAGEOBJ_IMAGEr   r2   )r<   rG   pdfium_cr+   s       r1   rU   z)PyPDFium2Parser._extract_images_from_page|  sO    ""(d&&x/J/J.L&MNc=vFG088r>   Nr   )r;   rr   rO   r   rs   )rG   zpypdfium2._helpers.page.PdfPagerO   rP   ru   rC   r>   r1   r   r   Z  s    '	-#(
9r>   r   c                  F    e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 ddZd	dZd
dZd
dZy)PDFPlumberParserzParse `PDF` with `PDFPlumber`.Nc                6    |xs i | _         || _        || _        y)zInitialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        N)r   deduper;   )r<   r   r   r;   s       r1   r=   zPDFPlumberParser.__init__  s      ',",r>   c              #  \  K   ddl }|j                         5 } |j                  |      }|j                  D cg c]  }t	        | j                  |      dz   | j                  |      z   t        |j                  |j                  |j                  dz
  t        |j                        dfi |j                  D ci c]6  }t        |j                  |         t        t        fv r||j                  |   8 c}       c}}E d{    ddd       yc c}w c c}}w 7 # 1 sw Y   yxY ww)r@   r   Nr'   r&   r   rL   )
pdfplumberrQ   r   rT   r   _process_page_contentrU   r   rK   rY   r   rN   typerP   r   )r<   rV   r   r   r   rG   r   s          r1   rZ   zPDFPlumberParser.lazy_parse  s#     	9!*//),C*  II'& % !%!;!;D!A"44T:"; "&*kk)-$($4$4q$8+.syy>	 &)\\ !#CLLO4c
B s||A.	  	 	 	 	sL   D,"D A>D7;D2D>D DD 
	D,DD  D)%D,c                    | j                   r* |j                         j                  di | j                  S  |j                  di | j                  S )z)Process the page content based on dedupe.rC   )r   dedupe_charsrF   r   )r<   rG   s     r1   r   z&PDFPlumberParser._process_page_content  sJ    ;;34$$&33Gd6F6FGG t  44#3#344r>   c                   | j                   syg }|j                  D ]  }|d   d   j                  t        v rc|j	                  t        j                  |d   j                         t
        j                        j                  |d   d   |d   d   d             ~|d   d   j                  t        v r#|j	                  |d   j                                t        j                  d        t        |      S )	r\   r%   r   r   r]   r   r   r_   r`   )r;   r+   r   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   r2   )r<   rG   r+   r.   s       r1   rU   z*PDFPlumberParser._extract_images_from_page  s    "";; 
	5C8}X&++/GGMM#h-"8"8":"((KSSHh/Xw1G
 Xx(--1FFc(m446734
	5 188r>   )NFF)r   r   r   rr   r;   rr   rO   r   rs   )rG   zpdfplumber.page.PagerO   rP   )rv   rw   rx   ry   r=   rZ   r   rU   rC   r>   r1   r   r     sJ    ( 48$	-0- - 	-
 
- :59r>   r   c                  :    e Zd ZdZ	 	 ddd	 	 	 	 	 	 	 ddZddZy)	AmazonTextractPDFParsera{  Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    try to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    N)linearization_configc                  	 ddl }ddlmc m} || _        || _        |%|D cg c]  }|j                  |       c}| _        ng | _        ||| _        n$| j
                  j                  dddd      | _        |s	 ddl}|j                  d	      | _        y|| _        yc c}w # t        $ r t        d      w xY w# t        $ r t        d
      w xY w)a5  Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
            linearization_config: Config to be used for linearization of the output
                                  should be an instance of TextLinearizationConfig from
                                  the `textractor` pkg
        r   NTz# z## *)hide_figure_layouttitle_prefixsection_header_prefixlist_element_prefixzCould not import amazon-textract-caller or amazon-textract-textractor python package. Please install it with `pip install amazon-textract-caller` & `pip install amazon-textract-textractor`.textractzRCould not import boto3 python package. Please install it with `pip install boto3`.)textractcallertextractor.entities.documententitiesdocumenttc
textractorTextract_Featurestextract_featuresr   r   r)   boto3clientboto3_textract_client)r<   r  r  r   r  r  fr  s           r1   r=   z AmazonTextractPDFParser.__init__  s    &	'==DG(DO ,5F*01B((+*& *,&#/,@),0OO,S,S'+!%*/(+	 -T -) -2\\*-E* *0D&E*  	< 	  !B s'   !B$ B>B$ <B< B$ $B9<Cc              #    K   |j                   rt        t        |j                               nd}|ra|j                  dk(  rR|j                  rF| j
                  j                  t        |j                         | j                  | j                        }n_| j
                  j                  |j                         | j                  | j
                  j                  j                  | j                        }| j                  j                  j                  |      }t        |j                         D ]>  \  }}t        |j#                  | j$                        |j&                  |dz   d       @ yw)	zIterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs
        the blob.data is taken
        Ns3)input_documentfeaturesr  )r  r  	call_moder  )configr&   rJ   rL   )pathr   rP   schemenetlocr  call_textractr  r  as_bytesTextract_Call_Mode
FORCE_SYNCr  r   r   rS   rT   r   r   rK   )r<   rV   url_parse_resulttextract_response_jsonr  idxrG   s          r1   rZ   z"AmazonTextractPDFParser.lazy_parseA  s*     8<yy8C		N3d  ''4/ ''%)WW%:%:"499~//&*&@&@ &; &" &*WW%:%:#}}//''44??&*&@&@	 &; &" ??++001GH"8>>2 	IC!]]$2K2K]L$(KKqA 	s   E+E-)NN)r  zOptional[Sequence[int]]r  zOptional[Any]r   z#Optional['TextLinearizationConfig']rO   r   rs   )rv   rw   rx   ry   r=   rZ   rC   r>   r1   r   r     sN    +^ 6: $=0
 EI=02=0 =0
 B=0 
=0~!r>   r   c                  (    e Zd ZdZddZddZddZy)	DocumentIntelligenceParserzjLoads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level.c                J    t        j                  d       || _        || _        y )Na<  langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParserand langchain_community.document_loaders.pdf.DocumentIntelligenceLoader are deprecated. Please upgrade to langchain_community.document_loaders.DocumentIntelligenceLoader for any file parsing purpose using Azure Document Intelligence service.)rk   rl   r  model)r<   r  r  s      r1   r=   z#DocumentIntelligenceParser.__init__i  s#    	
 
r>   c              #     K   |j                   D ]]  }dj                  |j                  D cg c]  }|j                   c}      }t	        ||j
                  |j                  d      }| _ y c c}w w)N rJ   rL   )rT   r*   linesr   r   rK   rY   )r<   rV   r/   pliner   ds          r1   _generate_docsz)DocumentIntelligenceParser._generate_docsu  sd      
	AhhAABG$"kkMMA G
	As   )A5A0
7A5c              #     K   |j                         5 }| j                  j                  | j                  |      }|j	                         }| j                  ||      }|E d{    ddd       y7 # 1 sw Y   yxY ww)r@   N)rQ   r  begin_analyze_documentr  r/   r$  )r<   rV   file_objpollerr/   docss         r1   rZ   z%DocumentIntelligenceParser.lazy_parse  ss       	8[[77

HMF]]_F&&tV4DOO	 	 	 	s/   A=AA1!A/"A1&	A=/A11A:6A=N)r  r   r  rP   )rV   r   r/   r   rO   rt   rs   )rv   rw   rx   ry   r=   r$  rZ   rC   r>   r1   r  r  e  s    A
	r>   r  )r+   z,Sequence[Union[Iterable[np.ndarray], bytes]]rO   rP   )/ry   
__future__r   rk   typingr   r   r   r   r   r	   r
   r   r   urllib.parser   numpyre   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   	fitz.fitzr   r   r   pdfplumber.pager   pypdf._pagerH   pypdfium2._helpers.pager   r   )textractor.data.text_linearization_configr   rj   rc   r2   r4   r{   r   r   r   r   r  rC   r>   r1   <module>r6     s    . " 
 
 
 "  - D B" Q :  "8>G9. G9T\9^ \9~Z7N Z7z,9n ,9^H9~ H9VNn Nb& &r>   