
    Ig(<                        d dl mZ d dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ  G d de      Z G d	 d
      Z G d de      Z G d de      Z G d d      Zy)    )annotationsN)AnyDictListTuple	TypedDictUnion)Document)Language)RecursiveCharacterTextSplitterc                  $     e Zd ZdZd fdZ xZS )MarkdownTextSplitterz=Attempts to split the text along Markdown-formatted headings.c                f    | j                  t        j                        }t        |   dd|i| y)z"Initialize a MarkdownTextSplitter.
separatorsN )get_separators_for_languager   MARKDOWNsuper__init__)selfkwargsr   	__class__s      ^/var/www/html/answerous/venv/lib/python3.12/site-packages/langchain_text_splitters/markdown.pyr   zMarkdownTextSplitter.__init__   s.    55h6G6GH
9J9&9    )r   r   returnNone)__name__
__module____qualname____doc__r   __classcell__)r   s   @r   r   r      s    G: :r   r   c                  8    e Zd ZdZ	 	 d	 	 	 	 	 ddZddZd	dZy)
MarkdownHeaderTextSplitterz4Splitting markdown files based on specified headers.c                H    || _         t        |d d      | _        || _        y)a  Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
        c                    t        | d         S )Nr   )len)splits    r   <lambda>z5MarkdownHeaderTextSplitter.__init__.<locals>.<lambda>*   s    3uQx= r   T)keyreverseN)return_each_linesortedheaders_to_split_onstrip_headersr   r-   r+   r.   s       r   r   z#MarkdownHeaderTextSplitter.__init__   s-     !1 $*%@$$
  +r   c                   g }|D ]  }|r%|d   d   |d   k(  r|d   dxx   d|d   z   z  cc<   *|r||d   d   |d   k7  rnt        |d   d         t        |d         k  rN|d   d   j                  d      d   d   dk(  r.| j                  s"|d   dxx   d|d   z   z  cc<   |d   |d   d<   |j                  |        |D cg c]  }t	        |d   |d          c}S c c}w )	zCombine lines with common metadata into chunks
        Args:
            lines: Line of text / associated header metadata
        metadatacontentz  

r   #page_contentr2   )r&   r'   r.   appendr
   )r   linesaggregated_chunkslinechunks        r   aggregate_lines_to_chunksz4MarkdownHeaderTextSplitter.aggregate_lines_to_chunks/   s8   
 -/ 	/D!%b)*5j9II
 ""%i0FT)_4LL0!%b)*5j9II)"-j9:SjAQ=RR%b))4::4@DQG3N** ""%i0FT)_4LL0484D!"%j1 "((.9	/@ +
 %	"2U:=NO
 	
 
s   C!c                   |j                  d      }g }g }i }g }i }d}d}	|D ]Q  }
|
j                         }dj                  t        t        j
                  |            }|s@|j                  d      r|j                  d      dk(  rd}d}	n+|j                  d      rd}d}	n|j                  |	      rd}d}	|r|j                  |       | j                  D ]3  \  }}|j                  |      st        |      t        |      k(  s|t        |         dk(  sB||j                  d
      }|rD|d   d   |k\  r9|j                         }|d   |v r|j                  |d          |r|d   d   |k\  r9|||t        |      d	 j                         d}|j                  |       |d   ||<   |rA|j                  dj                  |      |j                         d       |j                          | j                  s|j                  |        nX |r|j                  |       nC|rA|j                  dj                  |      |j                         d       |j                          |j                         }T |r#|j                  dj                  |      |d       | j                  s| j!                  |      S |D cg c]  }t#        |d   |d          c}S c c}w )zASplit markdown file
        Args:
            text: Markdown filer4   F z```   Tz~~~ Nr5   r1   levelname)rB   rC   datarD   )r3   r2   r3   r2   r6   )r'   stripjoinfilterstrisprintable
startswithcountr8   r-   r&   popcopyclearr.   r+   r=   r
   )r   textr9   lines_with_metadatacurrent_contentcurrent_metadataheader_stackinitial_metadatain_code_blockopening_fencer;   stripped_lineseprC   current_header_levelpopped_headerheaderr<   s                     r   
split_textz%MarkdownHeaderTextSplitter.split_textY   s    

4 .0%'+- *,+- U	7D JJLM GGF3??M$JKM  ++E2}7J7J57QUV7V$(M$)M"--e4$(M$)M ++M:$)M$&M&&}5 "55 <,	T ++C0 &#c(2mCH6MQT6T '/2yy~, ) ,R 0 9=Q Q -9,<,<,>M  -V48HH 0 4 4]65J K ) ,R 0 9=Q Q &:$($1#c(*$=$C$C$E.
 %++F317(. '+22+/99_+E,<,A,A,C (--/--'..}=c<,f !#**=9$'..'+yy'A(8(=(=(? $))+/446kU	7n && IIo6DTU $$112EFF 1 eI&6zARS  s   =KN)FT)r-   zList[Tuple[str, str]]r+   boolr.   r]   )r9   zList[LineType]r   List[Document]rO   rH   r   r^   )r   r   r   r    r   r=   r\   r   r   r   r#   r#      s:    >
 "'"	+2+ + 	+.(
Txr   r#   c                  &    e Zd ZU dZded<   ded<   y)LineTypezLine type as typed dict.zDict[str, str]r2   rH   r3   Nr   r   r   r    __annotations__r   r   r   ra   ra      s    "Lr   ra   c                  0    e Zd ZU dZded<   ded<   ded<   y)
HeaderTypezHeader type as typed dict.intrB   rH   rC   rD   Nrb   r   r   r   re   re      s    $J
I
Ir   re   c                  t    e Zd ZdZdddddddZ	 	 	 d	 	 	 	 	 dd
ZddZddZddZddZ	ddZ
ddZddZy	)&ExperimentalMarkdownSyntaxTextSplittera  
    An experimental text splitter for handling Markdown syntax.

    This splitter aims to retain the exact whitespace of the original text while
    extracting structured metadata, such as headers. It is a re-implementation of the
    MarkdownHeaderTextSplitter with notable changes to the approach and
    additional features.

    Key Features:
    - Retains the original whitespace and formatting of the Markdown text.
    - Extracts headers, code blocks, and horizontal rules as metadata.
    - Splits out code blocks and includes the language in the "Code" metadata key.
    - Splits text on horizontal rules (`---`) as well.
    - Defaults to sensible splitting behavior, which can be overridden using the
      `headers_to_split_on` parameter.

    Parameters:
    ----------
    headers_to_split_on : List[Tuple[str, str]], optional
        Headers to split on, defaulting to common Markdown headers if not specified.
    return_each_line : bool, optional
        When set to True, returns each line as a separate chunk. Default is False.

    Usage example:
    --------------
    >>> headers_to_split_on = [
    >>>     ("#", "Header 1"),
    >>>     ("##", "Header 2"),
    >>> ]
    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
    >>>     headers_to_split_on=headers_to_split_on
    >>> )
    >>> chunks = splitter.split(text)
    >>> for chunk in chunks:
    >>>     print(chunk)

    This class is currently experimental and subject to change based on feedback and
    further development.
    zHeader 1zHeader 2zHeader 3zHeader 4zHeader 5zHeader 6)r5   z##z###z####z#####z######Nc                    g | _         t        d      | _        g | _        || _        |rt        |      | _        || _        y | j                  | _        || _        y )Nr?   r7   )	chunksr
   current_chunkcurrent_header_stackr.   dictsplittable_headersDEFAULT_HEADER_KEYSr+   r/   s       r   r   z/ExperimentalMarkdownSyntaxTextSplitter.__init__  s[     ')%26;=!*&*+>&?D# !1 '+&>&>D# 0r   c           	        |j                  d      }|ra|j                  d      }| j                  |      }| j                  |      }| j	                  |      }|ry| j                          | j                  s| j                  xj                  |z  c_        t        |j                  d            }|j                  d      }| j                  ||       n|rj| j                          | j                  ||      | j                  _        |j                  d      | j                  j                  d<   | j                          n2|r| j                          n| j                  xj                  |z  c_        |ra| j                          | j                  rb| j                  D 	
cg c]J  }	|	j                  j                         D ]+  }
|
r'|
j!                         st#        |
|	j                        - L c}
}	S | j                  S c c}
}	w )NT)keependsr   r@      Coder6   )
splitlinesrL   _match_header_match_code_match_horz_complete_chunk_docr.   rl   r7   r&   group_resolve_header_stack_resolve_code_chunkr2   r+   rk   isspacer
   )r   rO   	raw_linesraw_lineheader_match
code_match
horz_matchheader_depthheader_textr<   r;   s              r   r\   z1ExperimentalMarkdownSyntaxTextSplitter.split_text&  s   OOTO2	 }}Q'H--h7L))(3J))(3J((*))&&33x?3  #<#5#5a#89*003**<E((*262J2Ji3""/ 7A6F6Fq6I""++F3((*((*""//8;/3 6 	  "    "[[!..99;  dU^^DD  {{s   "AHc                    t        | j                        D ]7  \  }\  }}||k(  s||f| j                  |<   | j                  d |dz    | _         y  | j                  j                  ||f       y )Nr@   )	enumeraterm   r8   )r   r   r   idepth_s         r   r{   z<ExperimentalMarkdownSyntaxTextSplitter._resolve_header_stackQ  sx    &t'@'@A 	MAzq$0<k/J))!,,0,E,EgA,N)		
 	!!((,)DEr   c                d    |}|r,|j                  d      }||z  }| j                  |      r|S |r,y)Nr   r?   )rL   rw   )r   current_liner~   r<   r   s        r   r|   z:ExperimentalMarkdownSyntaxTextSplitter._resolve_code_chunkY  s?     }}Q'HXE)	 
 r   c                V   | j                   j                  }|r|j                         sp| j                  D ]<  \  }}| j                  j                  d|z        }|| j                   j                  |<   > | j                  j                  | j                          t        d      | _         y )Nr5   r?   rj   )
rl   r7   r}   rm   ro   getr2   rk   r8   r
   )r   chunk_contentr   value
header_keys        r   ry   z:ExperimentalMarkdownSyntaxTextSplitter._complete_chunk_docb  s    **77!6!6!8 $ 9 9 @u!4488uE
:?""++J7@ KKt112%26r   c                r    t        j                  d|      }|r|j                  d      | j                  v r|S y )Nz^(#{1,6}) (.*)r@   )rematchrz   ro   )r   r;   r   s      r   rv   z4ExperimentalMarkdownSyntaxTextSplitter._match_headero  s3    *D1U[[^t'>'>>Lr   c                x    dD cg c]  }t        j                  ||       }}t        d |D        d       S c c}w )N)z^```(.*)z^~~~(.*)c              3  &   K   | ]	  }|s|  y wNr   .0r   s     r   	<genexpr>zEExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<genexpr>x       9u5U9   r   r   nextr   r;   rulematchess       r   rw   z2ExperimentalMarkdownSyntaxTextSplitter._match_codev  s9    4NOD288D$'OO994@@ P   7c                x    dD cg c]  }t        j                  ||       }}t        d |D        d       S c c}w )N)z
^\*\*\*+\nz^---+\nz^___+\nc              3  &   K   | ]	  }|s|  y wr   r   r   s     r   r   zEExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<genexpr>~  r   r   r   r   s       r   rx   z2ExperimentalMarkdownSyntaxTextSplitter._match_horzz  sA    -T
%)BHHT4 
 
 994@@
r   )NFT)r-   z"Union[List[Tuple[str, str]], None]r+   r]   r.   r]   r_   )r   rf   r   rH   r   r   )r   rH   r~   z	List[str]r   rH   )r   r   )r;   rH   r   zUnion[re.Match, None])r   r   r   r    rp   r   r\   r{   r|   ry   rv   rw   rx   r   r   r   rh   rh      sy    &R  CG!&"	1?1 1 	1")VF
7AAr   rh   )
__future__r   r   typingr   r   r   r   r   r	   langchain_core.documentsr
   langchain_text_splitters.baser   "langchain_text_splitters.characterr   r   r#   ra   re   rh   r   r   r   <module>r      sZ    " 	 ; ; - 2 M:9 :| |~y  [A [Ar   