
    (#hP9                        d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZmZ ddl m!Z!m"Z"  ejF                  e$      Z%ddddddddddddZ&e&jO                         D  ci c]  \  } }|| 
 c}} Z(h dZ)dededefdZ*dTdede+defdZ,	 dUdedee   de-defdZ.dee   dee/e/e/ee+   f   fdZ0	 dUd ed!e+d"e-dee-ee+   f   fd#Z1d ede-fd$Z2	 	 dVd%ed&e+d'e-d"e-def
d(Z3	 	 dWd)eded*e-d+e-dee   f
d,Z4d)ededee   fd-Z5d)eddfd.Z6d)eddfd/Z7d)eddfd0Z8d)eddfd1Z9d)eddfd2Z:d)eddfd3Z;i d4e6d5e6d6e6d7e8d8e8d9e8d:e8d;e8d<e8d=e9d>e9d?e7d@e7dAe7dBe:dCe:dDe:dEe;iZ<d)edFee+   ddfdGZ=	 dXdededHee+   defdIZ>d6dJd@d?dK d=dLdMdN dO	Z?dedefdPZ@dUdQedRe-de+fdSZAyc c}} w )Yz*
Functions to process nodes in HTML code.
    N)deepcopy)ListOptionalTuple)fix_relative_urlsget_base_url)_ElementElement
SubElementXPath
strip_tagstostring)HtmlElement   )duplicate_test)Document	ExtractorCUT_EMPTY_ELEMSMANUALLY_CLEANEDMANUALLY_STRIPPED)
textfiltertrimis_image_element)META_ATTRIBUTESdelete_element#iz#bz#uz#tz#subz#sup)emibstrongukbdsampttvarsubsup>   figuresourcepicturetreeoptionsreturnc                    t        j                         t        j                         }}|j                  s|j	                  g d       n| j                  d      D ]	  }d|_         |j                  r)|D cg c]  }|t        vs| }}|j                  d       t        | |       |j                  dk(  rX| j                  d      Gt        |       }|D ]#  }| j                  |      D ]  }t        |        % | j                  d      +|} n(|D ]#  }| j                  |      D ]  }t        |        % t!        | |j                        S c c}w )z/Prune the tree by discarding unwanted elements.)tabletdthtrz.//figure[descendant::table]divimgrecallz.//p)r   copyr   tablesextendxpathtagimagesPRESERVE_IMG_CLEANINGremover   focusfindr   iterr   
prune_html)	r+   r,   cleaning_liststripping_listelemetcopy
expressionelements	            W/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/htmlprocessing.pytree_cleaningrJ   0   sH    %5$9$9$;=N=S=S=U>M>>89 JJ=> 	DDH	~~$1TqQ>S5STTe$ t^$ }} TYYv%6%B' 	(J99Z0 (w'(	( 99V$D ( 	(J99Z0 (w'(	( dGMM**) Us   6EEr>   c                 |    |dk7  }| j                  d      D ]"  }|j                  t        v st        ||       $ | S )zADelete selected empty elements to save space and processing time.	precisionz-.//processing-instruction()|.//*[not(node())])	keep_tail)r9   r:   r   r   )r+   r>   tailsrH   s       rI   rA   rA   S   sB    [ E::MN 5;;/)7e45 K    nodelistwith_backupc                    |r$t        | j                               }t        |       }|D ]  } ||       D ]v  }|j                  I|j	                         }||j                         }|%|j                  xs ddz   |j                  z   |_        |j                         j                  |       x  |r%t        | j                               }|dz  kD  r| S S | S )z2Prune the HTML tree by removing unwanted sections.     )lentext_contentr   tailgetprevious	getparentr=   )	r+   rP   rQ   old_lenbackuprG   subtreeprevnew_lens	            rI   prune_unwanted_nodesr`   ]   s     d'')*$ 0
!$' 
	0G||'**,<",,.D#!%bC 7',, FDI&&w/
	00 d'')*1,t8&8KrO   links_xpathc                     d | D        D cg c]  }|s|	 }}t        t        t        |            }t        d |D              }t        |      t        |      ||fS c c}w )zCollect heuristics on link textc              3   N   K   | ]  }t        |j                                 y wN)r   rW   ).0rD   s     rI   	<genexpr>z$collect_link_info.<locals>.<genexpr>}   s     L$t0023Ls   #%c              3   ,   K   | ]  }|d k  s	d  yw)
   r   N )re   ls     rI   rf   z$collect_link_info.<locals>.<genexpr>   s     211r6Q2s   
)listmaprV   sum)ra   rE   mylistlengths
shortelemss        rI   collect_link_inforq   y   s^     MLRAPQaRFR3sF#$G222Jw<Vj&88	 Ss
   AArH   textfavor_precisionc                    | j                  d      }|sdg fS g }t        |      dk(  rN|rdnd}t        |d   j                               }t        |      |kD  rt        |      t        |      dz  kD  rdg fS | j                  d	k(  r| j                         d
nd}n| j                         d}nd}t        |      }||k  rKt        |      \  }	}
}}|
dk(  rd|fS t        j                  d|	|||
       |	|dz  kD  s|
dkD  r||
z  dkD  rd|fS d|fS )z>Remove sections which are rich in links (probably boilerplate).//refFr   rh   d   r   g?Tp<      i,  u8   list link text/total: %s/%s – short elems/total: %s/%s皙?)	findallrV   r   rW   r:   getnextrq   LOGGERdebug)rH   rr   rs   ra   rn   len_threshold	link_textlimitlenelemlenlinklenelemnumrp   s               rI   link_density_testr      s5    //(+KbyF
;1-3Q4467	y>M)c)ns4y3.N8O{{c *22??$H H$iG/@/M,*fa<<F	
 Ws]"w{zG7Kc7Q<&=rO   c                     | j                  d      }|syt        t        | j                                     }|dk  ryt	        |      \  }}}}|dk(  ryt
        j                  d||       |dk  r|d|z  kD  S |d	|z  kD  S )
z=Remove tables which are rich in links (probably boilerplate).ru   F   r   Tztable link text: %s / total: %si  rz   g      ?)r{   rV   r   rW   rq   r}   r~   )rH   ra   r   r   r   _s         rI   link_density_test_tablesr      s    //(+K$w++-./G}.{;GWa!|
LL2GWE&-n7S7]"Q'C'M:QQrO   r]   tagnamebacktrackingc                 j   g }|rdnd}|rdnd}| j                  |      D ]k  }t        |j                               }t        |||      \  }	}
|	s-|s1|
s4dt	        |      cxk  r|k  sIn Lt	        |      |k\  s[|j                  |       m t        j                  |      D ]  }t        |        | S )z{Determine the link density of elements with respect to their length,
    and remove the elements identified as boilerplate.r   rv   r      r   )	r@   r   rW   r   rV   appenddictfromkeysr   )r]   r   r   rs   	deletionsr   depth_thresholdrD   elemtextresulttemplists              rI   delete_by_link_densityr      s     I*CM*aOW% 	#))+,,T8_MCM1M1D	_,T"	# i( t NrO   rD   comments_fixpreserve_spacesc                    | j                   dk(  rt        |       r| S | j                   dk(  s&t        |       dk(  r| j                  s| j                  sy|s1| j                   dk(  r"|st        | j                        xs d| _        | S | j                  s?t        |       dk(  r1| j                  dc| _        | _        |r| j                   dk(  rd| _         |sHt        | j                        xs d| _        | j                  rt        | j                        xs d| _        | j                  st        |       s|j                  rt        | |      ry| S )z3Convert, format, and probe potential text elements.graphicdoner   NlbrS   rw   )	r:   r   rV   rr   rX   r   r   dedupr   )rD   r,   r   r   s       rI   handle_textnoder      s	    xx9!1$!7xx6c$i1nTYYtyy DHH,TYY/4DI 99Ta  $yy"	49DHH,DH O+t	99TYY/4DI
 IItMMnT7;KrO   c                    | j                   dk(  s&t        |       dk(  r| j                  s| j                  syt	        | j                        xs dt	        | j                        xs dc| _        | _        | j                   dk7  r1| j                  s%| j                  r| j                  dc| _        | _        | j                  s| j                  r$t        |       s|j                  rt        | |      ry| S )zBConvert, format, and probe potential text elements (light format).r   r   Nr   )r:   rV   rr   rX   r   r   r   r   )rD   r,   s     rI   process_noder     s    xx6c$i1nTYYtyy  		?2dDO4KtDIty xx4		dii#yy$	49 yyDIId.w2OKrO   c                 $   | j                  d| j                         d| _        d}| j                  ddd      D ]U  }|j                  dv r>|j                  dt        |j                         d|        |j                  dk(  r|dz  }d	|_        W y
)zGConvert <ul> and <ol> to <list> and underlying <li> elements to <item>.rendrk   r   dddtli)r   r   -itemN)setr:   r@   str)rD   r   subelems      rI   convert_listsr      s    HHVTXXDH	A99T4. ;;,&KK3w{{#3"4AaS 9:{{d"QrO   c                     d}| j                   dk(  rXt        |       dk(  r| d   j                   dk(  rd}| j                  d      }|r#d}|D ]  }|j                  j	                           |rd| _         y
d	| _         y
)z?Convert quoted elements while accounting for nested structures.Fprer   r   spanTz#.//span[starts-with(@class,'hljs')]codequoteN)r:   rV   r9   attribclear)rD   	code_flag
code_elemsr   s       rI   convert_quotesr   0  sy    Ixx5 t9>d1gkkV3IZZ EF
I% '$$&'"vDHDHrO   c                 ~    | j                   j                          | j                  d| j                         d| _        y)z$Add head tags and delete attributes.r   headN)r   r   r   r:   rD   s    rI   convert_headingsr   A  s+    KKHHVTXXDHrO   c                     d| _         y)zConvert <br> and <hr> to <lb>r   N)r:   r   s    rI   convert_line_breaksr   H  s	    DHrO   c                 6    d| _         | j                  dd       y)z7Convert <del>, <s>, <strike> to <del rend="overstrike">delr   
overstrikeN)r:   r   r   s    rI   convert_deletionsr   M  s    DHHHV\"rO   c                 L    d| _         | j                  d      D ]	  }d|_          y)zHandle details and summary.r3   summaryr   N)r:   r@   )rD   r   s     rI   convert_detailsr   S  s(    DH99Y' rO   dlolulh1h2h3h4h5h6brhr
blockquoter   qr   sstrikedetailsbase_urlc                     d| _         | j                  d      }| j                  j                          |r!|rt	        ||      }| j                  d|       yy)z7Replace link tags and href attributes, delete the rest.refhreftargetN)r:   getr   r   r   r   )rD   r   r   s      rI   convert_linkr   q  sL    DHXXfFKK&x8F6"	 rO   urlc                    |j                   s=d}|j                  r|dz  }| j                  |      D ]	  }d|_         t	        | d       n2|xr t        |      }| j                  dd      D ]  }t        ||        |j                  rm| j                  t        j                               D ]F  }|j                  j                          |j                  dt        |j                            d|_        H nt	        | gt        j                           | j                  t        j                               D ]  }t        |j                     |        |j                  r| j                  d      D ]	  }d|_         | S )	zBSimplify markup and convert relevant HTML tags to an XML standard.z).//*[self::div or self::li or self::p]//az|.//table//ar   ar   hir4   r   )linksr7   r9   r:   r   r   r@   r   
formattingREND_TAG_MAPPINGkeysr   r   r   CONVERSIONSr;   )r+   r,   r   
xpath_exprrD   r   s         rI   convert_tagsr   }  sY   
 ==@
>>.(JJJz* 	DDH	 	4 ,<,IIc5) 	)Dx(	) II.3356 	DKKHHV-dhh78DH	
 	42*//12 		+**,- $DHHd#$ ~~IIe$ 	!D DH	! KrO   r   c                 D    dt        | j                  dd      dd         S )Nhr   r   r   )intr   r   s    rI   <lambda>r     s%    1S&$!7!;<=> rO   r   r   c                 4    t         | j                  dd         S )Nr   r   )HTML_TAG_MAPPINGr   r   s    rI   r   r     s    '(>? rO   )	rk   r   r   r   r   r   r4   r   r   c                    | j                  t        j                               D ]  }t        t        |j                           }t        |      r ||      |_        n||_        |j                  dk(  r-|j                  d|j                  j                  dd             {|j                  j                           d| _        t        d      }|j                  |        |S )zConvert XML to simplified HTML.r   r   r   rS   bodyhtml)r@   HTML_CONVERSIONSr   r   r:   callabler   r   popr   r
   r   )r+   rD   
conversionroots       rI   convert_to_htmlr     s    		*//12  %c$((m4
J!$'DH!DH88s?HHVT[[__Xr:;KK  DH6?DKKKrO   documentwith_metadatac                     t        | j                        }|rFt        d      }t        D ]   }t	        | |      x}st        |d||       " |j                  d|       t        |dd      j                         S )z1Convert the document to HTML and return a string.r   meta)namecontentr   Tunicode)pretty_printencoding)	r   r   r
   r   getattrr   insertr   strip)r   r   	html_treer   r   values         rI   build_html_outputr    sw    .Iv# 	CD$//u/4dEB	C 	D!ID9EKKMMrO   )balanced)F)FF)TFrd   )B__doc__loggingr6   r   typingr   r   r   courlan.urlutilsr   r   
lxml.etreer	   r
   r   r   r   r   	lxml.htmlr   deduplicationr   settingsr   r   r   r   r   utilsr   r   r   xmlr   r   	getLogger__name__r}   r   itemsr   r<   rJ   r   rA   boolr`   r   rq   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  )kvs   00rI   <module>r     s$     ( ( < Q Q ! )  6 5 0 
		8	$ 			
  &6%;%;%=>TQAqD> 7  +  +i  +K  +F[  k  CH
!%e;?89k"9
3S$s)#$9 >C%% #%6:%
4c?%PRk Rd R, !	  	
 D !	+
++ + 	+
 h+\x ) 8J ( T  0 0d 0"8  h 4 
#H # #( t -- 	- 	
	
 	
 	
 	
 	
 	
 	
 	
 . 
>  
  	!" #$ %.	#{ 	#hsm 	# 	# AE$
$ )$08$$P >

?
 ( x (N N N# Nu ?s   G