
    :Qg,                        d Z ddlmZ ddlZddlZddlZddlZddlZddl	m
Z
mZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ  ed
      ZddZg dg ddZef	 	 	 	 	 ddZ	 d	 	 	 ddZddZddZy)z>Helpers used across multiple partitioners to compute metadata.    )annotationsN)AnyCallableIteratorSequence)	ParamSpec)ElementElementMetadata)FileType)apply_lang_metadata)get_call_args_applying_defaults_Pc                    t         j                  j                  |       syt        j                  j                  t         j                  j                  |             }|j                  d      S )a
  Modification time of file at path `filename`, if it exists.

    Returns `None` when `filename` is not a path to a file on the local filesystem.

    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
    "2024-03-05T17:02:53".
    Nz%Y-%m-%dT%H:%M:%S%z)ospathisfiledtdatetimefromtimestampgetmtimestrftime)filenamemodify_dates     c/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/partition/common/metadata.pyget_last_modified_dater      sL     77>>(#++++BGG,<,<X,FGK 566    )	TextUncategorizedTextNarrativeTextListItemBulletedTextTableFigureCaptionCheckBoxr"   )
Titler   r   r   r    r!   r"   r#   r$   r"   )r%   Headerc                   g }| D ]  }|j                   j                  d}t        |dd      }t        |j                   dd      xs d}|sG|ro|d   }t        |d      }t        |j                   dd      xs d}	||k(  r|	|k  s||k7  r!||j                  |g       v r|j                  }n|j                          |ro||j                   _        |j                  |        t        |       S )aK  Sets `.metadata.parent_id` for each element it applies to.

    `parent_id` assignment is based on the element's category and depth. The importance of an
    element's category is determined by a rule set. The rule set trumps category_depth. That is,
    category_depth is only relevant when elements are of the same category.
    Ncategorycategory_depthr   )metadata	parent_idgetattrgetidpopappendlist)
elementsrulesetstackelementr,   element_categoryelement_category_depthtop_elementtop_element_categorytop_element_category_depths
             r   set_element_hierarchyr<   @   s    E $%%1	"7J=!()9)9;KQ!O!TST  #(9K#*;
#C (($ 
  ' %(88.1GG$(88$4H"(MM'NN	IIK- 0 &/"WI$L >r   c                     d fd}|S )a  Post-process element-metadata for this document.

    This decorator adds a post-processing step to a partitioner, primarily to apply metadata that
    is common to all partitioners. It assumes the following responsibilities:

      - Hash element-ids. Computes and applies SHA1 hash element.id when `unique_element_ids`
        argument is False.

      - Element Hierarchy. Computes and applies `parent_id` metadata based on `category_depth`
        etc. added by partitioner.

      - Language metadata. Computes and applies `language` metadata based on a language detection
        model.

      - Apply `filetype` (MIME-type) metadata. There are three cases; first one in this order that
        applies is used:

          - `metadata_file_type` argument is present in call, use that.
          - `file_type` decorator argument is populated, use that.
          - `file_type` decorator argument is omitted or None, don't apply `.metadata.filetype`
            (assume the partitioner will do that for itself, like `partition_image()`.

      - Replace `filename` with `metadata_filename` when present.

      - Replace `last_modified` with `metadata_last_modified` when present.

      - Apply `url` metadata when present.
    c                H     t        j                         d fd       }|S )a  The decorator function itself.

        This function is returned by the `apply_metadata()` function and is the actual decorator.
        Think of `apply_metadata()` as a factory function that configures this decorator, in
        particular by setting its `file_type` value.
        c                     | i |}t        g| i |}t        |      }|j                  d      }|j                  dd      }t        t	        |||            }i }|j                  d      xs }||j
                  |d<   |j                  d      xs |j                  d      }|r||d<   |j                  d	      }	|	r|	|d
<   |j                  d      }
|
r|
|d<   |D ]=  }|j                  j                  r|j                  j                  t        di |       ? |j                  dd      }|du rt        |      }t        |      }|S )N	languagesdetect_language_per_elementF)r3   r@   rA   metadata_file_typefiletypemetadata_filenamer   metadata_last_modifiedlast_modifiedurlunique_element_ids )r    _uniqueify_elements_and_metadatar.   r2   r   	mime_typer+   attached_to_filenameupdater
   _assign_hash_idsr<   )argskwargsr3   	call_argsr@   rA   metadata_kwargsrB   r   rE   rG   r6   rH   	file_typefuncs                r   wrapperz2apply_metadata.<locals>.decorator.<locals>.wrapper   s   T,V,H7NtNvNI 8AH "k2I*3--8UW\*]'#%'0KH /1O "+/C!D!Q	!-.@.J.J
+ !}}%89VY]]:=VH.6
+ &/]]3K%L"%3I0 --&C),& $ L ##88  ''(J/(JKL (1}}5I5'Q!U*+H5 -X6HOr   )rO   z_P.argsrP   z	_P.kwargsreturnlist[Element])	functoolswraps)rT   rU   rS   s   ` r   	decoratorz!apply_metadata.<locals>.decorator   s*     
	O	 
O	b r   )rT   Callable[_P, list[Element]]rV   r[   rI   )rS   rZ   s   ` r   apply_metadatar\   x   s    @Zx r   c                &   | D cg c]  }|j                   j                   }}t        j                  |      D cg c]  \  }}t	        |      D ]  \  }}|	  }}}}t        | |      D ]  \  }}|j                  |        | S c c}w c c}}}w )a.  Converts `.id` of each element from UUID to hash.

    The hash is based on the `.text` of the element, but also on its page-number and sequence number
    on that page. This provides for deterministic results even when the document is split into one
    or more fragments for parallel processing.
    )r+   page_number	itertoolsgroupby	enumeratezip
id_to_hash)	r3   epage_numbers_groupseq_on_pagepage_seq_numbersr6   seq_on_page_counters	            r   rN   rN      s     5==qAJJ**=L= ")),7 Au'. K 	  ),H6F(G 0$$./0 O >s
   BBc                ,    dd}t         ||             S )zEnsure each of `elements` and their metadata are unique instances.

    This prevents hard-to-diagnose bugs downstream when mutating one element unexpectedly also
    mutates others because they are the same instance.
    c              3    K   t               }t               }| D ]  }t        |      |v rt        j                  |      }t        |j                        |v r$t        j                  |j                        |_        |j                  t        |             |j                  t        |j                               |  yw)zLSubstitute deep-copies of any non-unique elements or metadata in `elements`.N)setr/   copydeepcopyr+   add)r3   seen_elementsseen_metadatar6   s       r   iter_unique_elementsz>_uniqueify_elements_and_metadata.<locals>.iter_unique_elements  s     "%%"%% 	G'{m+--0'""#}4#'==1A1A#B bk*b!1!123M	s   B<B>)r3   rW   rV   zIterator[Element])r2   )r3   rs   s     r   rJ   rJ     s     $X.//r   )r   strrV   z
str | None)r3   zSequence[Element]r4   zdict[str, list[str]]rV   rW   )N)rS   zFileType | NonerV   zDCallable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]])r3   rW   rV   rW   ) __doc__
__future__r   rn   r   r   rX   r_   r   typingr   r   r   r   typing_extensionsr   unstructured.documents.elementsr	   r
   unstructured.file_utils.modelr   "unstructured.partition.common.langr   unstructured.utilsr   r   r   HIERARCHY_RULE_SETr<   r\   rN   rJ   rI   r   r   <module>r~      s    D "     	 4 4 ' D 2 B >t_7 
 8 BT00*>00r "&||I|~*0r   