
    :Qg\<                       d dl mZ d dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZmZ d dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZm Z m!Z! d d	l"m#Z# d d
l$m%Z%m&Z& erd dl'm(Z( d dl)m*Z* 	 	 	 d	 	 	 	 	 	 	 	 	 ddZ+	 	 	 	 	 	 	 	 	 	 	 	 ddZ,	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ-ddZ.d Z/	 	 	 d 	 	 	 	 	 	 	 	 	 d!dZ0d"dZ1 ed      Z2d#dZ3d$dZ4d%dZ5d&dZ6	 	 	 d'	 	 	 	 	 	 	 	 	 	 	 d(dZ7y))    )annotationsN)BufferedReaderBytesIOTextIOWrapper)SpooledTemporaryFile)sleep)IOTYPE_CHECKINGAnyOptionalTypeVarcast)CoordinateSystem
PixelSpace)	TYPE_TO_TEXT_ELEMENT_MAPCheckBoxCoordinatesMetadataElementElementMetadataElementTypeListItem	PageBreakText)logger)ENUMERATED_BULLETS_REUNICODE_BULLETS_RE)
PageLayout)LayoutElementc                   t        | t              r|dk(  r| S t        | t              rt        d      S t        | t              s| j	                         }n| }|j                  dd      }|j                  d      }|j                  d      }|j                  d      }|j                  dd	      }	d	}
|	r|	j                  }
|r@t        |t        t        t        t        j                  f      rt        t        |      
      }n
t               }||||
d}|t        j                  k(  r|rt        |fi |S t!        dd|i|S |t"        v rqt        |t              sJ t"        |   } |dd|i|}|t        j$                  k(  rd|j&                  _        |S |t        j*                  k(  rd|j&                  _        |S |t        j,                  t        j.                  t        j0                  t        j2                  t        j4                  t        j6                  fv r>|t        j,                  t        j0                  t        j4                  fv }t9        dd|i|S t;        dd|i|S )zSConverts an unstructured_inference LayoutElement object to an unstructured Element.html )textr"   coordinatestypeprobsourceN)detection_class_prob)r#   coordinate_systemmetadatadetection_origin      checked )
isinstancer   r   dictto_dictgetvalueintstrfloatnumbersNumberr   r   LISTlayout_list_to_list_itemsr   r   HEADLINEr)   category_depthSUB_HEADLINECHECK_BOX_CHECKEDCHECK_BOX_UNCHECKEDRADIO_BUTTON_CHECKEDRADIO_BUTTON_UNCHECKEDCHECKED	UNCHECKEDr   r   )layout_elementr(   infer_list_itemssource_formatlayout_dictr"   r#   element_typer%   
aux_originoriginclass_prob_metadatacommon_kwargs_element_classr-   s                  a/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/partition/common/common.pynormalize_layout_elementrO   !   sr    .'*}/F .),b!!nd+$,,.$??62&D //-0K??6*L??6"D40JF!!
4#sE7>>!BC-5;O-/".'"	M {''', 
   
 
1	1,,,,1,?' 


 ;///56N##2  [55556N##2	%%''((** 
 )),,#
 

  


 	

  


 	
    c                0   | rt        j                  |       ng }t        |      dk(  r| rt        j                  |       ng }g }|D ]N  }t        |j	                               dkD  s t        |j	                         ||||      }|j                  |       P |S )z=Converts a list LayoutElement to a list of ListItem elements.r+   r   )r"   r#   r(   r)   r*   )r   splitlenr   stripr   append)	r"   r#   r(   r)   r*   split_items
list_itemstext_segmentitems	            rN   r:   r:   y   s     8<'--d3K
;18<(..t4" "J# $|!!#$q( !'')'"3!!1D d#$ rP   c                n   ||t        ||      nd}t        | d      r$t        | j                        dkD  r| j                  nd}|r|D cg c]  }|j	                  d       c}nd}|r|D cg c]  }|j	                  d       c}nd}|r|D cg c]  }|j	                  d       c}nd}t        | d      r$t        | j
                        dkD  r| j
                  nd}|r|D cg c]  }|j	                  d       c}nd}|r|D cg c]  }|j	                  d	       c}nd}| j                  j                  r| j                  j                  nd}t        ||||||||||||||

      }| j                  j                  |       |	|	| j                  _
        | S c c}w c c}w c c}w c c}w c c}w )zAdds document metadata to the document element.

    Document metadata includes information like the filename, source url, and page number.
    N)pointssystemlinksr   urlr"   start_indexemphasized_textstag)r#   filenamefiletypepage_numberr^   text_as_html	link_urls
link_textslink_start_indexesemphasized_text_contentsemphasized_text_tagsr<   
image_path	languages)r   hasattrrS   r]   r2   r`   r)   r<   r   updater*   )elementrb   rc   rd   r^   re   r#   r(   rk   r*   rl   kwargscoordinates_metadatar]   linkrf   rg   rh   r`   emphasized_textri   rj   depthr)   s                           rN   add_element_metadataru      s   0 "'8'D	 	$	

   %Wg63w}};MPQ;QGMMW[E7<U3T%3$I9>u5t$((6"5DJHMeDd$((=1DSW 7./C8P8P4QTU4U 	     =MM		V	$M   <LL		U	#L 
 07/?/?/N/NG++TXE(!-!91H  H%#,<)NO 45D 	N
 	Ms   F/F#F('F-
F2c                    g }t               }| D ]U  }t        |      }t        |t              r |D ]	  }||_         |j                  |       >||_        |j                  |       W |S )zRemoves document metadata from the document element.

    Document metadata includes information like the filename, source url, and page number.
    )r   rO   r/   listr)   extendrU   )layout_elementselementsr)   rD   ro   _elements         rN   remove_element_metadatar|      sr    
 !H H) %*>:gt$# -$,!-OOG$'GOOG$% OrP   c                     t        j                         D ]%  } 	 d| j                         j                         v r y' y# t         j                  t         j
                  t         j                  f$ r Y ]w xY w)NsofficeTF)psutilprocess_iternamelowerNoSuchProcessAccessDeniedZombieProcess)procs    rN   _is_soffice_runningr      sp    ##% 	DIIK--// 0  $$f&9&96;O;OP 		s    >2A32A3c                   || d| }ddd|d|| g}	 d}d}t        j                  |d	
      }|j                  j                         j	                         }	||k  rj|	dk(  re||z  }t               rt        |       n?t        j                  |d	
      }|j                  j                         j	                         }	||k  r|	dk(  ret        j                  |	       |j                  dk7  s|	dk(  r]t        j                  d||j                         t        j                  |j                  j                         j	                                yy# t        $ r t        d      w xY w)a  Converts a .doc/.ppt file to a .docx/.pptx file using the libreoffice CLI.

    Parameters
    ----------
    input_filename: str
        The name of the .doc file to convert to .docx
    output_directory: str
        The output directory for the convert .docx file
    target_format: str
        The desired output format
    target_filter: str
        The output filter name to use when converting. See references below
        for details.
    wait_for_soffice_ready_time_out: int
        The max wait time in seconds for soffice to become available to run

    References
    ----------
    https://stackoverflow.com/questions/52277264/convert-doc-to-docx-using-soffice-not-working
    https://git.libreoffice.org/core/+/refs/heads/master/filter/source/config/fragments/filters

    N:r~   z
--headlessz--convert-toz--outdirr   g?T)capture_outputr!   a  soffice command was not found. Please install libreoffice
on your system and try again.

- Install instructions: https://www.libreoffice.org/get-help/install-howto/
- Mac: https://formulae.brew.sh/cask/libreoffice
- Debian: https://wiki.debian.org/LibreOfficez3soffice failed to convert to format %s with code %i)
subprocessrunstdoutdecoderT   r   r   FileNotFoundErrorr   info
returncodeerrorstderr)
input_filenameoutput_directorytarget_formattarget_filterwait_for_soffice_ready_time_outcommand	wait_time
sleep_timeoutputmessages
             rN   convert_office_docr      s_   :  (/=/: 	G
	
=--&&(..0 ::B#I"$j!#E --..0668 ::B" KKABA=RXRcRc	
 	V]]))+1134	 "/  
1
 	

s   B1E E"c                 :   t        | j                         D cg c]  }|duxr |dk7   c}      dk7  rct        | j                               }t	        |      dkD  r)ddj                  |dd        d|d    d}t        |      |d	    d}t        |      yc c}w )
z
    Verify arguments; exactly one of all keyword arguments must not be None.

    Example:
        >>> exactly_one(filename=filename, file=file, text=text, url=url)
    Nr!   r+   zExactly one of z, z and z must be specified.r   )sumvaluesrw   keysrS   join
ValueError)rp   argnamesr   s       rN   exactly_oner   H  s     v}}GS_**GHAMV[[]#u:>'		%*(='>eE"I;NabG !! q
"56G!! NGs   B_Tc                    t        | t              r8| j                  d       t        t	        t
        | j                                     S | S )a  Convert `file` to `BytesIO` when it is a `SpooledTemporaryFile`.

    Note that `file` does not need to be IO[bytes]. It can be `None` or `bytes` and this function
    will not complain.

    In Python <3.11, `SpooledTemporaryFile` does not implement `.readable()` or `.seekable()` which
    triggers an exception when the file is loaded by certain packages. In particular, the stdlib
    `zipfile.Zipfile` raises on opening a `SpooledTemporaryFile` as does `Pandas.read_csv()`.
    r   )r/   r   seekr   r   bytesread)files    rN   spooled_to_bytes_io_if_neededr   [  s9     $,-		!tE499;/00 KrP   c                   t        | t              r| S t        | t              r4| j                  d       | j	                         }| j                  d       |S t        | t
              r| j                         S t        | t        t        f      r0t        | j                  d      5 }|j	                         cddd       S t        d      # 1 sw Y   t        d      xY w)zExtract the bytes from `file` without preventing it from being read again later.

    As a convenience to simplify client code, also returns `file` unchanged if it is already bytes.
    r   rbNzInvalid file-like object type)r/   r   r   r   r   r   getvaluer   r   openr   r   )r   f_bytesfs      rN   convert_to_bytesr   m  s    
 $$,-		!))+		!$ }}$78$))T" 	a668	 	 4
55	 4
55s   #CCc                >    t        t        j                  |             S )z
    Check if the input string contains any emoji characters.

    Parameters:
    - s (str): The input string to check.

    Returns:
    - bool: True if the string contains any emoji, False otherwise.
    )boolemojiemoji_count)ss    rN   contains_emojir     s     !!!$%%rP   c                   t        | dd      }t        | dd      }|r%|j                  }|j                  }|j                  }n<|r4|j	                  d      }|j	                  d      }|j	                  d      }nd}d}d}|||dS )z:Retrieve image metadata and coordinate system from a page.imageNimage_metadataformatwidthheight)r   r   r   )getattrr   r   r   r2   )pager   r   image_formatimage_widthimage_heights         rN   get_page_image_metadatar     s     D'4(ET#3T:N||kk||	%))(3$((1%))(3  rP   c                    |\  }}t        ||      }g }| D ]C  }	t        |	|||r|nd      }
|r|
j                  j                  |       |j	                  |
       E |S )zNConvert OCR layout data into `unstructured` elements with associated metadata.)r   r   r    )r(   rE   rF   )r   rO   r)   rn   rU   )ocr_data
image_sizecommon_metadatarE   rF   r   r   r(   rz   rD   ro   s              rN   ocr_data_to_elementsr     su     !+K"\J H" !*/-+8-f	
 ##O4 ! OrP   )NTr    )
rD   z(LayoutElement | Element | dict[str, Any]r(   Optional[CoordinateSystem]rE   r   rF   Optional[str]returnzElement | list[Element])r"   r   r#   )Optional[tuple[tuple[float, float], ...]]r(   r   r)   Optional[ElementMetadata]r*   r   r   list[Element])
NNNNNNNNNN)ro   r   rb   r   rc   r   rd   zOptional[int]r^   r   re   r   r#   r   r(   r   rk   r   r*   r   rl   zOptional[list[str]]rp   r   r   r   )ry   r   r   r   )docxN
   )
r   r5   r   r5   r   r5   r   r   r   r4   )rp   r   r   None)r   z _T | SpooledTemporaryFile[bytes]r   z_T | BytesIO)r   zbytes | IO[bytes]r   r   )r   r5   r   r   )r   r   r   zdict[str, Any])NTN)r   zlist['LayoutElement']r   ztuple[int | float, int | float]r   r   rE   r   rF   r   r   r   )8
__future__r   r7   r   ior   r   r   tempfiler   timer   typingr	   r
   r   r   r   r   r   r   "unstructured.documents.coordinatesr   r   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   unstructured.loggerr   unstructured.nlp.patternsr   r   'unstructured_inference.inference.layoutr   .unstructured_inference.inference.layoutelementr   rO   r:   ru   r|   r   r   r   r   r   r   r   r   r   r.   rP   rN   <module>r      s9   "   5 5 )  B B   K
 
 
 ' OBL
 59!#)	U
<U
1U
 U
 !	U

 U
p
: 2 (	
 $ @ #"!%"&=A48 $&*%)CCC C 	C
 
C  C ;C 2C C $C #C C CL&  #'+-K5K5K5 K5 !	K5
 &)K5\"  T]$60&: 26!#'#/ / 	
 ! rP   