
    :Qg!                        d Z ddlmZ ddlmZmZmZ ddlmZm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZ dddddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	ZddZ G d d
e
      Zy)zdImplementation of chunking by title.

Main entry point is the `@add_chunking_strategy()` decorator.
    )annotations)IterableIteratorOptional)CHUNK_MULTI_PAGE_DEFAULTBoundaryPredicateChunkingOptionsPreChunkCombiner
PreChunkeris_on_next_pageis_title)Element)lazypropertyNcombine_text_under_n_charsinclude_orig_elementsmax_charactersmultipage_sectionsnew_after_n_charsoverlapoverlap_allc          	     R    t         j                  |||||||      }t        | |      S )a	  Uses title elements to identify sections within the document for chunking.

    Splits off into a new CompositeElement when a title is detected or if metadata changes, which
    happens when page numbers or sections change. Cuts off sections once they have exceeded a
    character length of max_characters.

    Parameters
    ----------
    elements
        A list of unstructured elements. Usually the output of a partition function.
    combine_text_under_n_chars
        Combines elements (for example a series of titles) until a section reaches a length of
        n characters. Defaults to `max_characters` which combines chunks whenever space allows.
        Specifying 0 for this argument suppresses combining of small chunks. Note this value is
        "capped" at the `new_after_n_chars` value since a value higher than that would not change
        this parameter's effect.
    include_orig_elements
        When `True` (default), add elements from pre-chunk to the `.metadata.orig_elements` field
        of the chunk(s) formed from that pre-chunk. Among other things, this allows access to
        original-element metadata that cannot be consolidated and is dropped in the course of
        chunking.
    max_characters
        Chunks elements text and text_as_html (if present) into chunks of length
        n characters (hard max)
    multipage_sections
        If True, sections can span multiple pages. Defaults to True.
    new_after_n_chars
        Cuts off new sections once they reach a length of n characters (soft max). Defaults to
        `max_characters` when not specified, which effectively disables any soft window.
        Specifying 0 for this argument causes each element to appear in a chunk by itself (although
        an element with text longer than `max_characters` will be still be split into two or more
        chunks).
    overlap
        Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
        next chunk as a context-preserving mechanism. By default, this only applies to split-chunks
        where an oversized element is divided into multiple chunks by text-splitting.
    overlap_all
        Default: `False`. When `True`, apply overlap between "normal" chunks formed from whole
        elements and not subject to text-splitting. Use this with caution as it entails a certain
        level of "pollution" of otherwise clean semantic chunk boundaries.
    r   )_ByTitleChunkingOptionsnew_chunk_by_title)	elementsr   r   r   r   r   r   r   optss	            X/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/chunking/title.pychunk_by_titler      s>    h #&&#=3%-+ ' D 8T**    r   c                    t        t        j                  | |      |      j                         }|D cg c]  }|j	                         D ]  }|  c}}S c c}}w )z-Implementation of actual "by-title" chunking.)r   )r
   r   iter_pre_chunksiter_combined_pre_chunksiter_chunks)r   r   
pre_chunks	pre_chunkchunks        r   r   r   W   sW     """8T2   $.Si9;P;P;RS%ESESSSs   Ac                  Z     e Zd ZdZedd       Zedd       Zedd       Zd	 fdZ xZ	S )
r   a9  Adds the by-title-specific chunking options to the base case.

    `by_title`-specific options:

    combine_text_under_n_chars
        A remedy to over-chunking caused by elements mis-identified as Title elements.
        Every Title element would start a new chunk and this setting mitigates that, at the
        expense of sometimes violating legitimate semantic boundaries.
    multipage_sections
        Indicates that page-boundaries should not be respected while chunking, i.e. elements
        appearing on two different pages can appear in the same chunk.
    c                0     d fd}t         |             S )a(  The semantic-boundary detectors to be applied to break pre-chunks.

        For the `by_title` strategy these are sections indicated by a title (section-heading), an
        explicit section metadata item (only present for certain document types), and optionally
        page boundaries.
        c               3  P   K   t           j                  st                y y w)N)r   r   r   )selfs   r   iter_boundary_predicateszM_ByTitleChunkingOptions.boundary_predicates.<locals>.iter_boundary_predicatesy   s#     N**%'' +s   #&)returnzIterator[BoundaryPredicate])tuple)r+   r,   s   ` r   boundary_predicatesz+_ByTitleChunkingOptions.boundary_predicatesp   s    	(
 -/00r    c                X    | j                   j                  d      }|| j                  S |S )a  Combine consecutive text pre-chunks if former is smaller than this and both will fit.

        - Does not combine table chunks with text chunks even if they would both fit in the
          chunking window.
        - Does not combine text chunks if together they would exceed the chunking window.
        - Defaults to `max_characters` when not specified.
        - Is reduced to `new_after_n_chars` when it exceeds that value.
        r   )_kwargsgethard_maxr+   	arg_values     r   r   z2_ByTitleChunkingOptions.combine_text_under_n_chars   s-     LL$$%AB	 ) 1t}}@y@r    c                ^    | j                   j                  d      }|t        S t        |      S )z0When False, break pre-chunks on page-boundaries.r   )r1   r2   r   boolr4   s     r   r   z*_ByTitleChunkingOptions.multipage_sections   s.     LL$$%9:	+4+<'Q$y/Qr    c                    t         |           | j                  dk  rt        d| j                         | j                  | j                  kD  r%t        d| j                   d| j                         y)z2Raise ValueError if request option-set is invalid.r   z8'combine_text_under_n_chars' argument must be >= 0, got zR'combine_text_under_n_chars' argument must not exceed `max_characters` value, got z > N)super	_validater   
ValueErrorr3   )r+   	__class__s    r   r:   z!_ByTitleChunkingOptions._validate   s     	 **Q.778:  **T]]:#>>?s4==/S  ;r    )r-   ztuple[BoundaryPredicate, ...])r-   int)r-   r7   )r-   None)
__name__
__module____qualname____doc__r   r/   r   r   r:   __classcell__)r<   s   @r   r   r   b   sR     1 1 A A R R
 r    )r   Iterable[Element]r   Optional[int]r   Optional[bool]r   rE   r   rF   r   rE   r   rE   r   rF   r-   list[Element])r   rD   r   r   r-   rG   )rB   
__future__r   typingr   r   r   unstructured.chunking.baser   r   r	   r
   r   r   r   unstructured.documents.elementsr   unstructured.utilsr   r   r   r    r    r   <module>rN      s   
 # / /   4 + 15,0$()-'+!"&=+=+ !.=+ *	=+
 "=+ '=+ %=+ =+  =+ =+@TGo Gr    