
    Ig^-                        d dl mZ d dlZd dlZd dlmZmZ d dlmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlmZ d dlmZ  G d de      Z G d	 d
      Z G d d      Zy)    )annotationsN)BytesIOStringIO)AnyDictIterableListOptionalTuple	TypedDictcast)Document)RecursiveCharacterTextSplitterc                  :    e Zd ZU dZded<   ded<   ded<   ded<   y)	ElementTypezElement type as typed dict.strurlxpathcontentzDict[str, str]metadataN)__name__
__module____qualname____doc____annotations__     Z/var/www/html/answerous/venv/lib/python3.12/site-packages/langchain_text_splitters/html.pyr   r      s    %	HJLr   r   c                  J    e Zd ZdZ	 d	 	 	 d	dZ	 	 	 	 d
dZddZddZddZy)HTMLHeaderTextSplitterzU
    Splitting HTML files based on specified headers.
    Requires lxml package.
    c                2    || _         t        |      | _        y)ay  Create a new HTMLHeaderTextSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)].
            return_each_element: Return each element w/ associated headers.
        N)return_each_elementsortedheaders_to_split_on)selfr$   r"   s      r   __init__zHTMLHeaderTextSplitter.__init__   s     $7 #)*=#> r   c                    g }|D ]:  }|r%|d   d   |d   k(  r|d   dxx   d|d   z   z  cc<   *|j                  |       < |D cg c]  }t        |d   |d          c}S c c}w )zCombine elements with common metadata into chunks

        Args:
            elements: HTML element content with associated identifying info and metadata
        r   r   z  
page_contentr   )appendr   )r%   elementsaggregated_chunkselementchunks        r   aggregate_elements_to_chunksz3HTMLHeaderTextSplitter.aggregate_elements_to_chunks.   s     02 	2G!%b)*59LL
 ""%i0FWY=O4OO0 "((1	2 +
 %	"2U:=NO
 	
 
s   A#c                v    t        j                  |fi |}| j                  t        |j                              S )zSplit HTML from web URL

        Args:
            url: web URL
            **kwargs: Arbitrary additional keyword arguments. These are usually passed
                to the fetch url content request.
        )requestsgetsplit_text_from_filer   r   )r%   r   kwargsrs       r   split_text_from_urlz*HTMLHeaderTextSplitter.split_text_from_urlJ   s1     LL''(();<<r   c                6    | j                  t        |            S zJSplit HTML text string

        Args:
            text: HTML text
        r4   r   r%   texts     r   
split_textz!HTMLHeaderTextSplitter.split_textU        (($88r   c                   	 ddl m} |j                  d      }|j	                  ||      }t        j                  t              j                  dz  }|j	                  |      }|j                  |      } ||      }	|j                  t        |	            }
| j                  D cg c]  }|d   	 c}t        | j                        }dd	i}g }|
j                  d
|      D ]  }|j                  d      s|j                  d      s'|j                  t!        |dj#                  |j                  d|      D cg c]  }|j$                  xs d c}      dj#                  |j                  d|      D cg c]  }|j$                  xs d c}      t'        fd|j                  d|            D ci c]   }||j(                     |j$                  xs d" c}              | j*                  s| j-                  |      S |D cg c]  }t/        |d   |d          c}S # t        $ r}t        d      |d}~ww xY wc c}w c c}w c c}w c c}w c c}w )CSplit HTML file

        Args:
            file: HTML file
        r   etree>Unable to import lxml, please install with `pip install lxml`.Nzutf-8)encodingz!xsl/html_chunks_with_headers.xslthzhttp://www.w3.org/1999/xhtmlz*//*z*[@class='headers']z*[@class='chunk'] z*[@class='xpath']c                     | j                   v S N)tag)xheader_filters    r   <lambda>z=HTMLHeaderTextSplitter.split_text_from_file.<locals>.<lambda>   s    !%%=*@ r   z*[@class='headers']/*)r   r   r   r   r   r   r)   )lxmlrB   ImportError
HTMLParserparsepathlibPath__file__parentXSLT
fromstringr   r$   dictfindallr+   r   joinr<   filterrI   r"   r0   r   )r%   filerB   eparsertree	xslt_path	xslt_tree	transformresult
result_domheaderheader_mappingns_mapr,   r.   noder/   rK   s                     @r   r4   z+HTMLHeaderTextSplitter.split_text_from_file]   s]   	" !!7!3{{4( LL*114WW	KK	*	JJy)	4%%c&k2
 261I1IJvJd667 56 !))&&9 	G45#:   gg -4OO<OQW,X$( !%		R !# -4OO<OQW,X$( !%		R! )/ @ '0G P)	" !% +4884diio2E"		> ''44X>> & eI&6zARS {  	P	& K 
"s5   H$ %I8I4I2%II$	H>-H99H>N)F)r$   List[Tuple[str, str]]r"   bool)r,   zList[ElementType]returnList[Document])r   r   r5   r   rj   rk   r<   r   rj   rk   r[   r   rj   rk   )	r   r   r   r   r&   r0   r7   r=   r4   r   r   r   r    r       sH     %*?2? "?"
)
	
8	=9Hr   r    c                  h    e Zd ZdZ	 d
	 	 	 	 	 	 	 ddZddZddZ	 d
	 	 	 	 	 ddZddZddZ	dd	Z
y)HTMLSectionSplitterz`
    Splitting HTML files based on specified tag and font sizes.
    Requires lxml package.
    Nc                   t        |      | _        |At        j                  t              j
                  dz  j                         | _        || _        yt        j                  |      j                         | _        || _        y)a  Create a new HTMLSectionSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"].
            xslt_path: path to xslt file for document transformation.
            Uses a default if not passed.
            Needed for html contents that using different format and layouts.
        Nzxsl/converting_to_header.xslt)	rW   r$   rQ   rR   rS   rT   absoluter_   r5   )r%   r$   r_   r5   s       r   r&   zHTMLSectionSplitter.__init__   si      $((;#< X&--0OOhj N
  %\\)4==?DNr   c                    g g }}|D ]8  }|j                  |j                         |j                  |j                         : | j                  ||      }t	        di | j
                  }|j                  |      S )zSplit documents.)	metadatasr   )r+   r*   r   create_documentsr   r5   split_documents)r%   	documentstextsrs   docresultstext_splitters          r   ru   z#HTMLSectionSplitter.split_documents   sw    ry 	+CLL))*S\\*	+ '''C6EE,,W55r   c                6    | j                  t        |            S r9   r:   r;   s     r   r=   zHTMLSectionSplitter.split_text   r>   r   c                   |xs i gt        |      z  }g }t        |      D ]  \  }}| j                  |      D ]  }t        j                  ||         }|j
                  j                         D ]'  }	|j
                  |	   dk(  s|d   |j
                  |	<   ) i ||j
                  }t        |j                  |      }
|j                  |
         |S )z&Create documents from a list of texts.#TITLE#Titler)   )
len	enumerater=   copydeepcopyr   keysr   r*   r+   )r%   rw   rs   
_metadatasrv   ir<   r/   r   keynew_docs              r   rt   z$HTMLSectionSplitter.create_documents   s     32$U"3
	 ' 		*GAt. *==A7 >>..0 @C~~c*i7.6w.?s+@ :h9%..9"0B0BXV  )*		* r   c                   	 ddl m}m}  ||d      }t	        | j
                  j                               }g }|j                  dg|z         }t        |      D ]  \  }}	|	}
|dk(  rd}d}g }n(|
j                  j                         }|
j                  }g }|
j                  D ]B  }|dz   t        |      k  r|||dz      k(  r n$t        |t              s2|j!                  |       D d	j#                  |      j                         }|d
k7  s|j!                  |||d        |S # t        $ r}t        d      |d }~ww xY w)Nr   )BeautifulSoupPageElementzzUnable to import BeautifulSoup/PageElement,                     please install with `pip install                     bs4`.zhtml.parserbodyr}   h1    rF   )rd   r   tag_name)bs4r   r   rN   listr$   r   find_allr   r<   stripnamenext_elementsr   
isinstancer   r+   rY   )r%   html_docr   r   r\   soupheaderssectionsr   rd   header_elementcurrent_headercurrent_header_tagsection_contentr.   r   s                   r   split_html_by_headersz)HTMLSectionSplitter.split_html_by_headers   s^   	6 X}5t//446702--7 23"7+ 	IAv*0NAv!*%)"(*!/!4!4!:!:!<%3%8%8""$)77 4q53w<'Gwq1u~,Egs+#**73	4
 hh/557G"}"0#*$6%	4 O  	 		s   D# #	D=,D88D=c                :   | j                   |S 	 ddlm} |j	                         }|j                  t        |      |      }|j                  | j                         }|j                  |      } ||      }t        |      S # t        $ r}t        d      |d }~ww xY w)Nr   rA   rC   )	r_   rM   rB   rN   rO   rP   r   rU   r   )	r%   html_contentrB   r\   r]   r^   r`   ra   rb   s	            r   convert_possible_tags_to_headerz3HTMLSectionSplitter.convert_possible_tags_to_header  s    >>!	" !!#{{8L16:KK/	JJy)	46{  	P	s   B   	B	BBc                   |j                         }| j                  |      }| j                  |      }|D cg c]>  }t        t	        t
        |d         | j                  t        |d            |d   i      @ c}S c c}w )r@   r   r   rd   )r   )getvaluer   r   r   r   r   r$   )r%   r[   file_contentr   sections        r   r4   z(HTMLSectionSplitter.split_text_from_file-  s     }};;LI--l; $

  S'),-,,S1D-EF I

 
	
 

s   AA=rH   )r$   rh   r_   zOptional[str]r5   r   rj   None)rv   zIterable[Document]rj   rk   rl   )rw   z	List[str]rs   zOptional[List[dict]]rj   rk   )r   r   rj   zList[Dict[str, Optional[str]]])r   r   rj   r   rm   )r   r   r   r   r&   ru   r=   rt   r   r   r4   r   r   r   ro   ro      su     $(2 ! 	
 
4
69 CG+?	$*X&
r   ro   )
__future__r   r   rQ   ior   r   typingr   r   r   r	   r
   r   r   r   r2   langchain_core.documentsr   "langchain_text_splitters.characterr   r   r    ro   r   r   r   <module>r      sH    "     N N N  - M) N NbY
 Y
r   