
    "#hy3                        d Z ddlmZ ddlmZmZmZ ddlZddlZddl	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ dZdZdZdZdZdZdZ e h d      Z!dZ"dZ# ejH                  dejJ                        Z&ddhZ' G d de(      Z) G d de)      Z*e"de#fdZ+e"de#fdZ,d Z- G d d e      Z. G d! d"e/      Z0 ed#$      d%        Z1eeeeeefd&Z2d' Z3d( Z4d) Z5efd*Z6eeeeeeede"e#e-fd+Z7y# e$ r	 ddlmZ Y w xY w),zc
Copyright (c) 2011 Jan Pomikalek

This software is licensed as described in the file LICENSE.rst.
    )absolute_import)divisionprint_functionunicode_literalsN)	lru_cache)Cleaner)ContentHandler   )	Paragraph)unicodeignored)is_blankg?F      g333333?g{Gz?F>    pdddldth1h2h3h4h5h6litdthtrulcoldivprebodyformtabletfoottheadcenterlegendoptioncaptioncolgroupfieldsetoptgrouptextarea
blockquoteutf8replaces#   <meta[^>]+charset=["']?([^'"/>\s]+)goodbadc                       e Zd ZdZy)JustextErrorz"Base class for jusText exceptions.N)__name__
__module____qualname____doc__     I/var/www/html/sandstorm/venv/lib/python3.12/site-packages/justext/core.pyr6   r6   1   s    (r<   r6   c                       e Zd Zy)JustextInvalidOptionsN)r7   r8   r9   r;   r<   r=   r?   r?   5   s    r<   r?   c                    t        | t              r| }|r|n|}| j                  ||      } nt        | |||      }	 t        j
                  j                  |t        j
                  j                               }|S # t        $ rA t        j
                  j                  | t        j
                  j                               }Y |S w xY w)zConverts HTML to DOM.)parser)	
isinstancer   encodedecode_htmllxmlhtml
fromstring
HTMLParser
ValueError)rF   default_encodingencodingerrorsdecoded_htmlforced_encodingdoms          r=   html_to_domrP   9   s    $ &.(4D{{?F3"4)98VLHii""<		8L8L8N"O J  H ii""4		0D0D0F"GJHs   =A: :ACCc                    t        | t              r| S |r| j                  ||      S t        j	                  |       }|rK|j                  d      j                  d      }t        t              5  | j                  ||      cddd       S 	 | j                  d      S # 1 sw Y   xY w# t        $ r> 	 | j                  ||      cY S # t        $ r}t        dt        |      z         d}~ww xY ww xY w)zv
    Converts a `html` containing an HTML page into Unicode.
    Tries to guess character encoding from meta tag.
    r
   ASCIINr1   z&Unable to decode the HTML to Unicode: )
rB   r   decodeCHARSET_META_TAG_PATTERNsearchgroupr   LookupErrorUnicodeDecodeErrorr6   )rF   rJ   rK   rL   matchdeclared_encodinges          r=   rD   rD   M   s    
 $ {{8V,,$++D1E!KKN11':[! 	:;;0&9	: 	:V{{6""	: 	:  V	V;;/88! 	VG'RS*TUU	V	Vs<   -B
B' B$'	C.1CC.	C*C%%C**C.c                 ^    dddddddddddddddd}t        di |}|j                  |       S )zRemoves unwanted parts of DOM.FT)head)processing_instructionsremove_unknown_tagssafe_attrs_onlypage_structureannoying_tagsframesmetalinks
javascriptscriptscommentsstyleembeddedforms	kill_tagsr;   )r   
clean_html)rO   optionscleaners      r=   preprocessorrp   k   sX     $)$ G"   Gc""r<   c                   D    e Zd ZdZed        Zd Zd Zd Zd Z	d Z
d Zy	)
ParagraphMakerzg
    A class for converting a HTML page represented as a DOM object into a list
    of paragraphs.
    c                 h     |        }t         j                  j                  ||       |j                  S )zConverts DOM into paragraphs.)rE   saxsaxify
paragraphs)clsroothandlers      r=   make_paragraphszParagraphMaker.make_paragraphs   s)     %g&!!!r<   c                 z    t               | _        g | _        d | _        d| _        d| _        | j                          y NF)PathInfopathrv   	paragraphlinkbr_start_new_pragraphselfs    r=   __init__zParagraphMaker.__init__   s3    J		  "r<   c                     | j                   r?| j                   j                         r%| j                  j                  | j                          t	        | j
                        | _         y N)r   contains_textrv   appendr   r~   r   s    r=   r   z"ParagraphMaker._start_new_pragraph   s>    >>dnn::<OO""4>>2"499-r<   c                    |d   }| j                   j                  |       |t        v s|dk(  rA| j                  r5|dk(  r| j                  xj
                  dz  c_        | j                          y t        |dk(        | _        | j                  r| j                  j                  d       n|dk(  rd| _	        | j                  xj
                  dz  c_        y )Nr
   r    aT)
r~   r   PARAGRAPH_TAGSr   r   
tags_countr   boolappend_textr   )r   nameqnameattrss       r=   startElementNSzParagraphMaker.startElementNS   s    Aw		>!ddltwwt| ))Q.)$$&44<(DGww**3/ 	NN%%*%r<   c                     |d   }| j                   j                          |t        v r| j                          |dk(  rd| _        y y )Nr
   r   F)r~   popr   r   r   )r   r   r   s      r=   endElementNSzParagraphMaker.endElementNS   s?    Aw		>!$$&3;DI r<   c                 $    | j                          y r   )r   r   s    r=   endDocumentzParagraphMaker.endDocument   s      "r<   c                     t        |      ry | j                  j                  |      }| j                  r(| j                  xj                  t        |      z  c_        d| _        y r|   )r   r   r   r   chars_count_in_linkslenr   )r   contenttexts      r=   
characterszParagraphMaker.characters   sH    G~~))'299NN//3t9</r<   N)r7   r8   r9   r:   classmethodrz   r   r   r   r   r   r   r;   r<   r=   rr   rr      s9    
 " "#.+&#r<   rr   c                   D    e Zd Zd Zed        Zed        Zd Zd Zd Z	y)r}   c                     g | _         y r   	_elementsr   s    r=   r   zPathInfo.__init__   s	    r<   c                 F    dj                  d | j                  D              S )N.c              3   &   K   | ]	  }|d      yw)r   Nr;   .0r[   s     r=   	<genexpr>zPathInfo.dom.<locals>.<genexpr>   s     5!5s   joinr   r   s    r=   rO   zPathInfo.dom   s    xx5dnn555r<   c                 L    ddj                  d | j                  D              z   S )N/c              3   ,   K   | ]  }d |dd z    yw)z%s[%d]N   r;   r   s     r=   r   z!PathInfo.xpath.<locals>.<genexpr>   s     G1h2A.Gs   r   r   s    r=   xpathzPathInfo.xpath   s     SXXGGGGGr<   c                     | j                         }|j                  |d      dz   }|||<   ||i f}| j                  j                  |       | S )Nr   r
   )_get_childrengetr   r   )r   tag_namechildrenorder
xpath_parts        r=   r   zPathInfo.append   sR    %%'Xq)A-"r*
j)r<   c                 B    | j                   si S | j                   d   d   S )Nr   r   r   s    r=   r   zPathInfo._get_children   s"    ~~I~~b!!$$r<   c                 :    | j                   j                          | S r   )r   r   r   s    r=   r   zPathInfo.pop   s    r<   N)
r7   r8   r9   r   propertyrO   r   r   r   r   r;   r<   r=   r}   r}      s@     6 6 H H%r<   r}      )maxsizec                 *    t        d | D              } | S )z7Lower-case all words in stoplist and create frozen set.c              3   <   K   | ]  }|j                           y wr   )lower)r   ws     r=   r   z"define_stoplist.<locals>.<genexpr>   s     5q5s   )	frozenset)stoplists    r=   define_stoplistr      s     5H55HOr<   c                    t        |      }| D ]  }t        |      }	|j                  |      }
|j                         }t	        | xr |j
                        |_        ||kD  rd|_        [d|j                  v sd|j                  v rd|_        d|j                  v rd|_        |	|k  r|j                  dkD  rd|_        d|_        |
|k\  r|	|kD  rd|_        d|_        |
|k\  rd|_        d|_         y	)
z&Context-free paragraph classification.r4      ©z&copyselectr   shortr3   neargoodN)r   r   stopwords_densitylinks_densityr   
is_headingheadingcf_classr   dom_pathr   )rv   r   
length_lowlength_highstopwords_lowstopwords_highmax_link_densityno_headingsr   lengthstopword_densitylink_densitys               r=   classify_paragraphsr      s    x(H '	Y$66x@ ..0 [!IY5I5IJ	**!&I	&Gy~~,E!&I+++!&Ij --1%*	"%,	"/#%+	"%/	".!+I!&I3'r<   c                 t    | |z   |k7  r0| |z  } ||    j                   }|t        v r|S |dk(  r|s|S | |z   |k7  r0y)Nr   r4   )
class_typeGOOD_OR_BAD)irv   ignore_neargoodincboundarycs         r=   _get_neighbourr     sT    
c'X
	SqM$$H
??H c'X
 r<   c                      t        | ||dd      S )z
    Return the class of the paragraph at the top end of the short/neargood
    paragraphs block. If ignore_neargood is True, than only 'bad' or 'good'
    can be returned, otherwise 'neargood' can be returned, too.
    r   )r   r   rv   r   s      r=   get_prev_neighbourr   !  s     !Z"bAAr<   c           	      2    t        | ||dt        |            S )z
    Return the class of the paragraph at the bottom end of the short/neargood
    paragraphs block. If ignore_neargood is True, than only 'bad' or 'good'
    can be returned, otherwise 'neargood' can be returned, too.
    r
   )r   r   r   s      r=   get_next_neighbourr   *  s     !Z!S_MMr<   c                    t        |       D ]  \  }}|j                  |_        |j                  r|j                  dk(  s3|dz   }d}|t	        |       k  sI||k  sO| |   j                  dk(  rd|_        i|t	        | |   j
                        z  }|dz  }|t	        |       k  s||k  rO i }t        |       D ]  \  }}|j                  dk7  rt        || d      }t        || d      }|dk(  r|dk(  rd||<   B|dk(  r|dk(  rd||<   R|dk(  rt        || d	      dk(  s|dk(  rt        || d	      dk(  rd||<   d||<    |j                         D ]  \  }}	|	| |   _         t        |       D ]G  \  }}|j                  dk7  rt        || d      }t        || d      }||fd
k(  rd|_        Ad|_        I t        |       D ]  \  }}|j                  r|j                  dk(  r|j                  dk7  s1|dz   }d}|t	        |       k  sG||k  sM| |   j                  dk(  rd|_        g|t	        | |   j
                        z  }|dz  }|t	        |       k  s||k  rO y)zr
    Context-sensitive paragraph classification. Assumes that classify_pragraphs
    has already been called.
    r   r
   r   r3   r   T)r   r4   F)r4   r4   N)		enumerater   r   r   r   r   r   r   items)
rv   max_heading_distancer   r   jdistancenew_classesprev_neighbournext_neighbourr   s
             r=   revise_paragraph_classificationr   3  s    "*- 9(11	!!i&:&:g&EE#j/!h2F&F!}''61'1	$JqM..//HFA #j/!h2F&F K!*- #97*+Az4P+Az4PV#&(@#KNu$5)@"KN%*<Q
\a*bfp*p%*<Q
\a*bfp*p#KN"KN#  !!# %1#$
1 % "*- *9:-+Az4P+Az4PN+~=#(I #)I * "*- 
9!!i&:&:e&C	HZHZ^cHcE#j/!h2F&F!}''61'-	$JqM..//HFA #j/!h2F&F
r<   c           
          t        | |
|	|      } ||      }t        j                  |      }t        ||||||||       t	        ||       |S )u   
    Converts an HTML page into a list of classified paragraphs. Each paragraph
    is represented as instance of class ˙˙justext.paragraph.Paragraph˙˙.
    )rP   rr   rz   r   r   )	html_textr   r   r   r   r   r   r   r   rK   rJ   
enc_errorsrp   rO   rv   s                  r=   justextr   v  s[     i!18Z
HC
s
C//4J
Hj+~'7F#J0DEr<   )8r:   
__future__r   r   r   r   re	lxml.htmlrE   lxml.sax	functoolsr   ImportErrorbackports.functools_lru_cachelxml.html.cleanr   xml.sax.handlerr	   r   r   _compatr   r   utilsr   MAX_LINK_DENSITY_DEFAULTLENGTH_LOW_DEFAULTLENGTH_HIGH_DEFAULTSTOPWORDS_LOW_DEFAULTSTOPWORDS_HIGH_DEFAULTNO_HEADINGS_DEFAULTMAX_HEADING_DISTANCE_DEFAULTr   r   DEFAULT_ENCODINGDEFAULT_ENC_ERRORScompile
IGNORECASErT   r   	Exceptionr6   r?   rP   rD   rp   rr   objectr}   r   r   r   r   r   r   r   r;   r<   r=   <module>r     s   ' A A 	  8# $ *   %          #      %2::&QSUS`S`a uo)9 )	L 	 (8$Oa ( (8$Oa V<#4B^ BJv D 3  :L'7L-@X' 'FBN Fb @F -?'7L-@X9GZ(8%LI  878s   C7 7DD