
    :Qg                        d Z ddlmZ ddlmZmZ ddlmZmZ ddl	m
Z
 dddddd	 	 	 	 	 	 	 	 	 	 	 	 	 ddZdd
Z G d d	e      Zy)a  Implementation of baseline chunking.

This is the "plain-vanilla" chunking strategy. All the fundamental chunking behaviors are present in
this strategy and also in all other strategies. Those are:

- Maximally fill each chunk with sequential elements.
- Isolate oversized elements and divide (only) those chunks by text-splitting.
- Overlap when requested.

"Fancier" strategies add higher-level semantic-unit boundaries to be respected. For example, in the
by-title strategy, section boundaries are respected, meaning a chunk never contains text from two
different sections. When a new section is detected the current chunk is closed and a new one
started.
    )annotations)IterableOptional)ChunkingOptions
PreChunker)ElementNinclude_orig_elementsmax_charactersnew_after_n_charsoverlapoverlap_allc               N    t         j                  |||||      }t        | |      S )av  Combine sequential `elements` into chunks, respecting specified text-length limits.

    Produces a sequence of `CompositeElement`, `Table`, and `TableChunk` elements (chunks).

    Parameters
    ----------
    elements
        A list of unstructured elements. Usually the output of a partition function.
    include_orig_elements
        When `True` (default), add elements from pre-chunk to the `.metadata.orig_elements` field
        of the chunk(s) formed from that pre-chunk. Among other things, this allows access to
        original-element metadata that cannot be consolidated and is dropped in the course of
        chunking.
    max_characters
        Hard maximum chunk length. No chunk will exceed this length. A single element that exceeds
        this length will be divided into two or more chunks using text-splitting.
    new_after_n_chars
        A chunk that of this length or greater is not extended to include the next element, even if
        that element would fit without exceeding `max_characters`. A "soft max" length that can be
        used in conjunction with `max_characters` to limit most chunks to a preferred length while
        still allowing larger elements to be included in a single chunk without resorting to
        text-splitting. Defaults to `max_characters` when not specified, which effectively disables
        any soft window. Specifying 0 for this argument causes each element to appear in a chunk by
        itself (although an element with text longer than `max_characters` will be still be split
        into two or more chunks).
    overlap
        Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
        next chunk as a context-preserving mechanism. By default, this only applies to split-chunks
        where an oversized element is divided into multiple chunks by text-splitting.
    overlap_all
        Default: `False`. When `True`, apply overlap between "normal" chunks formed from whole
        elements and not subject to text-splitting. Use this with caution as it produces a certain
        level of "pollution" of otherwise clean semantic chunk boundaries.
    r	   )_BasicChunkingOptionsnew_chunk_elements)elementsr
   r   r   r   r   optss          X/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/chunking/basic.pychunk_elementsr      s8    X !$$3%+ % D 8T**    r   c                    t        j                  | |      D cg c]  }|j                         D ]  }|  c}}S c c}}w )z(Implementation of actual basic chunking.)r   iter_pre_chunksiter_chunks)r   r   	pre_chunkchunks       r   r   r   O   sO     $33HdC**,  	  s   <c                      e Zd ZdZy)r   zOptions for `basic` chunking.N)__name__
__module____qualname____doc__ r   r   r   r   Z   s    'r   )r   Iterable[Element]r
   Optional[bool]r   Optional[int]r   r%   r   r%   r   r$   returnlist[Element])r   r#   r   r   r&   r'   )r!   
__future__r   typingr   r   unstructured.chunking.baser   r   unstructured.documents.elementsr   r   r   r   r"   r   r   <module>r,      s    # % B 3 -1$('+!"&4+4+ *4+ "	4+
 %4+ 4+  4+ 4+n(O (r   