
    (#h^K              	       r   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZ ddlmZmZ dd	lmZmZ  ej(                  e      Z ej.                  d
      ZdedefdZh dZddhZh dZh dZh dZh dZ  ej.                  dejB                         ej.                  dejB                         ej.                  dejB                         ej.                  dejB                         ej.                  dejB                         ej.                  dejB                        dZ"ddhZ#ddhZ$d ede%fd!Z& G d" d#      Z' G d$ d%      Z( ej.                  d&ejB                         ej.                  d'ejB                        d(Z) ej.                  d)ejB                        Z*d*ede+fd+Z,i fded,ede+fd-Z-y).a  Minimalistic fork of readability-lxml code

This is a python port of a ruby port of arc90's readability project

http://lab.arc90.com/experiments/readability/

Given a html document, it pulls out the main body text and cleans it up.

Ruby port by starrhorne and iterationlabs
Python port by gfxmonk

For list of contributors see
https://github.com/timbertson/python-readability
https://github.com/buriy/python-readability

License of forked code: Apache-2.0.
    N)sqrt)
attrgetter)AnyDictOptionalSet)tostring)HtmlElementfragment_fromstring   )	load_htmltrimz\.( |$)stringreturnc                 &    t        | t        d      S )Nxml)encodingmethod)r	   str)r   s    Y/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/readability_lxml.py	_tostringr   &   s    FS77    >
   apdloluldivimgpretable
blockquoter   article>   tdr    r"   >	   ddr   dtlir   r   formasideaddress>
   h1h2h3h4h5h6thnavfooterheader>   r   r   r'   r   embedinputzcombx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitterz#and|article|body|column|main|shadowzKarticle|body|content|entry|hentry|main|page|pagination|post|text|blog|storyzbutton|combx|comment|com-|contact|figure|foot|footer|footnote|form|input|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widgetz.<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)z+https?:\/\/(?:www\.)?(?:youtube|vimeo)\.com)unlikelyCandidatesReokMaybeItsACandidateRe
positiveRe
negativeRedivToPElementsRevideoRebodyhtmlr   r   elemc                 F    t        t        | j                                     S )z7Return the length of the element with all its contents.)lenr   text_content)r?   s    r   text_lengthrC   W   s    tD%%'())r   c                   ,    e Zd ZdZddgZdededdfdZy)	Candidatez,Defines a class to score candidate elements.scorer?   r   Nc                      || _         || _        y N)rF   r?   )selfrF   r?   s      r   __init__zCandidate.__init__a   s    !
!%	r   )__name__
__module____qualname____doc__	__slots__floatr
   rJ    r   r   rE   rE   \   s)    2&!I&e &; &4 &r   rE   c            	           e Zd ZdZg dZddedededdfdZdefd	Z	d
e
eef   dedefdZd
e
eef   dee   fdZdedefdZde
eef   fdZdedefdZdedefdZddZddZded
e
eef   defdZy)Documentz,Class to build a etree document out of html.docmin_text_lengthretry_lengthrU   rV   rW   r   Nc                 .    || _         || _        || _        y)a  Generate the document

        :param doc: string of the html content.
        :param min_text_length: Set to a higher value for more precise detection of longer texts.
        :param retry_length: Set to a lower value for better detection of very small texts.

        The Document class is not re-enterable.
        It is designed to create a new Document() for each HTML file to process it.

        API method:
        .summary() -- cleaned up content
        NrT   )rI   rU   rV   rW   s       r   rJ   zDocument.__init__k   s     .(r   c                 @   | j                   j                  dd      D ]  }|j                           d}	 |r| j                          | j	                          | j                         }| j                  |      }|r| j                  ||      }n\|du rd}t        j                  d       ut        j                  d       | j                   j                  d      }||n| j                   }| j                  ||      }t        |xs d      }|r|| j                  k  rd}|S )	z
        Given a HTML file, extracts the text of the article.

        Warning: It mutates internal DOM representation of the HTML document,
        so it is better to call other API methods before this one.
        scriptstyleTFz5Ended up stripping too much - going for a safer parsez=Ruthless and lenient parsing did not work. Returning raw htmlr=    )rU   iter	drop_treeremove_unlikely_candidates&transform_misused_divs_into_paragraphsscore_paragraphsselect_best_candidateget_articleLOGGERdebugfindsanitizerA   rW   )	rI   r?   ruthless
candidatesbest_candidater#   r=   cleaned_articlearticle_lengths	            r   summaryzDocument.summary|   s    HHMM(G4 	DNN	 //1779..0J!77
CN**:~Ft#$HLLO S xx}}V,"&"2$"mmGZ@O !6B7NNT->->> ""r   ri   rj   c                    t        d|j                  dz        }t        d      }|j                  j	                         }|t        |      n|j                  g}|D ]  }d}||j                  k(  s||v r||   j                  |k\  rd}nf|j                  dk(  rW| j                  |      }	|j                  xs d}
t        |
      }|dkD  r|	d	k  s|dk  r|	d
k(  rt        j                  |
      rd}|s|j                  |        |S )N
   皙?z<div/>FTr   r\   P   g      ?r   )maxrF   r   r?   	getparentlisttagget_link_densitytextrA   	DOT_SPACEsearchappend)rI   ri   rj   sibling_score_thresholdoutputparentsiblingssiblingrz   link_densitynode_contentnode_lengths               r   rc   zDocument.get_article   s    #&b.*>*>*D"E$X.$$..0#)#54<N<O<O;P 	'G F.---:%w'--1HH##44W=&||1r!,/  "$$t+#r)(A-%,,\: "Fg&7	'< r   c                 >   |sy t        |j                         t        d      d      }t        j	                  t
        j                        r?|d d D ]7  }t        j                  d|j                  j                  |j                         9 t        t        |            S )NrF   T)keyreverse   zTop 5: %s %s)sortedvaluesr   rd   isEnabledForloggingDEBUGre   r?   ru   rF   nextr]   )rI   ri   sorted_candidates	candidates       r   rb   zDocument.select_best_candidate   s    "Z%8$
 w}}-.r2 R	^Y^^-?-?QRD*+,,r   r?   c                 l    t        |      xs d}t        d |j                  d      D              }||z  S )Nr   c              3   2   K   | ]  }t        |        y wrH   )rC   ).0links     r   	<genexpr>z,Document.get_link_density.<locals>.<genexpr>   s     M+d+Ms   z.//a)rC   sumfindall)rI   r?   total_lengthlink_lengths       r   rv   zDocument.get_link_density   s5    "4(-AMV8LMM\))r   c                    i }| j                   j                  ddd      D ]  }|j                         }||j                         }t        |j	                               }t        |      }|| j                  k  rZ||fD ]  }|||vs| j                  |      ||<     dt        |j                  d            z   t        |dz  d      z   }||   xj                  |z  c_
        |||   xj                  |dz  z  c_
         |j                         D ],  \  }}	|	xj                  d| j                  |      z
  z  c_
        . |S )	Nr   r    r$   r   ,d         )rU   r]   rs   r   rB   rA   rV   
score_nodesplitminrF   itemsrv   )
rI   ri   r?   parent_nodegrand_parent_node	elem_textelem_text_lennoderF   r   s
             r   ra   zDocument.score_paragraphs   sY   
HHMM#ud3 	AD..*K" + 5 5 7T..01I	NM t333$&78 =#J(>'+t'<Jt$= IOOC011C9Lq4QQE {#))U2) ,,-33uqy@3/	A8  *//1 	?OD)OOq4#8#8#>>>O	? r   c                     d}t        d |j                  d      |j                  d      f      D ]=  }t        d   j                  |      r|dz  }t        d   j                  |      s9|dz  }? |S )Nr   classidr:      r9   )filtergetREGEXESry   )rI   r?   weight	attributes       r   class_weightzDocument.class_weight  sr    txx'8$((4.&IJ 	I|$++I6"|$++I6"		
 r   c                     | j                  |      }t        |j                        }|j                         }|t        v r|dz  }n)|t
        v r|dz  }n|t        v r|dz  }n|t        v r|dz  }t        ||      S )Nr   r   )	r   r   ru   lower
DIV_SCORESBLOCK_SCORESBAD_ELEM_SCORESSTRUCTURE_SCORESrE   )rI   r?   rF   ru   names        r   r   zDocument.score_node  s|    !!$'$((myy{:QJE\!QJE_$QJE%%QJE%%r   c           
         | j                   j                  d      D ]  }dj                  t        d |j	                  d      |j	                  d      f            }t        |      dk  rM|j                  t        vs`t        d   j                  |      syt        d   j                  |      r|j                           y )Nz.//* r   r   r   r7   r8   )rU   r   joinr   r   rA   ru   
FRAME_TAGSr   ry   r^   )rI   r?   attrss      r   r_   z#Document.remove_unlikely_candidates  s    HH$$V, 
	!DHHVD488G+<dhhtn*MNOE5zA~
*23::5A !9:AA%H  
	!r   c                    | j                   j                  d      D ]H  }t        d   j                  dj	                  t        t        t        |                        rBd|_        J | j                   j                  d      D ]  }|j                  rP|j                  j                         r6t        d      }|j                  d c|_	        |_	        |j                  d|       t        t        |      d      D ]  \  }}|j                  rS|j                  j                         r9t        d      }|j                  d c|_	        |_        |j                  |d	z   |       |j                  d
k(  su|j!                            y )Nz.//divr;   r\   r   z<p/>r   T)r   r   br)rU   r   r   ry   r   mapr   rt   ru   rw   stripr   insertr   	enumeratetailr^   )rI   r?   p_elemposchilds        r   r`   z/Document.transform_misused_divs_into_paragraphs)  s0   HH$$X. 	D -.55ItDz23 	 HH$$X. 	&DyyTYY__.,V4)-D&TYAv&$Yt_dC &
U::%**"2"2"408F.3jj$+FKKKa099$OO%&	&r   r   c                 X   |j                  dddddd      D ];  }| j                  |      dk  s| j                  |      dkD  s,|j                          = |j                  d	d
      D ]  }|j                           |j                  d      D ]M  }d|j                  v r-t
        d   j                  |j                  d         rd|_        >|j                          O t               }t        |j                  d            D ]  }||v r	| j                  |      }||v r||   j                  nd}||z   dk  r3t        j                  d|j                  ||       |j                          j|j                         j!                  d      dk  sd}t"        D 	ci c]   }	|	t%        |j'                  d|	             " }
}	|
dxx   dz  cc<   |
dxx   t%        |j'                  d            z  cc<   t)        |      }| j                  |      }|j+                         }|||v r||   j                  nd}|
d   r|
d   d|
d   dz  z   kD  rd|
d    d}nx|
d   |
d   kD  r|j                  t,        vrd}nW|
d   |
d   d z  kD  rd!}nE|| j.                  k  r|
d   dk(  rd"| d#}n&|| j.                  k  r|
d   d$kD  rd"| d%}n|d&k  r|d'kD  r
d(|d)d*| }n|d&k\  r|d+kD  r
d(|d)d*| }n|
d,   dk(  r|d-k  s|
d,   dkD  rd.}n|sd/}g }|j1                         D ]"  }t)        |      }|s|j3                  |        n t%        |      dz   }|j1                  d0      D ]1  }t)        |      }|s|j3                  |       t%        |      |k\  s1 n |r6t5        |      d1kD  r(d2}|j7                  |j                  d3d4d5d6             nd2}|s|j                          t        j                  d7||j                  |xs d8       	 || _        t;        | j8                        S c c}	w )9Nr+   r,   r-   r.   r/   r0   r   gQ?r(   textareaiframesrcr<   VIDEOz6//table|//ul|//div|//aside|//header|//footer|//sectionz+Removed %s with score %6.3f and weight %-3sr   ro   Tz.//r'   r   r6   z.//input[@type="hidden"]r   r   r   g?ztoo many images ()zmore <li>s than <p>sr   zless than 3x <p>s than <input>sztoo short content length z without a single imager   z and too many imagesr   rp   ztoo many links z.3fz for its weight g      ?r5   K   z<<embed>s with too short content length, or too many <embed>sz
no content)	precedingi  Fr!   r   r   sectionz0Removed %6.3f %s with weight %s cause it has %s.r\   )r]   r   rv   r^   attribr   ry   rw   setreversedxpathrF   rd   re   ru   rB   countTEXT_CLEAN_ELEMSrA   r   rC   rs   	LIST_TAGSrV   itersiblingsrz   r   updaterU   r   )rI   r   ri   r4   r?   allowedr   rF   	to_removekindcountscontent_lengthr   r   reasonr~   sibsib_content_lengthlimits                      r   rg   zDocument.sanitizeF  s   iidD$dC 	#F  (1,0E0Ef0MPT0T  "	# IIfj1 	DNN	 IIh' 	!D#	(:(A(A$++eBT(U#	 		! %(EJJOP
 ]	D w&&t,F.2j.@Jt$**aE~!AHH	  ""$**3/"4 	FV>BD#dllS<899  t#w3t||4N'O#PP "-T!2#44T:"nn.* '*4 #;/55  #;6%=1vc{S7H3H#H0qAFD\F3K/DHHI4M3FG_sa8>F#d&:&::ve}PQ?Q88HH_`F#d&:&::ve}q?P3N3CCWX  b[\C%7),s);;KF8T  r\lS&8),s);;KF8T  Wo*~/BvHH W  ()F  "H#002 "-8-=*-$OO,>?!"  MA-E#0040@ &-8-=*-$OO,>?"8}5 %&  CMD$8$)	tyy$y'QR %INN$LLJ"o]	~ ""_s   %P')r      )r   N)rK   rL   rM   rN   rO   r
   intrJ   r   rm   r   rE   rc   r   rb   rP   rv   ra   r   r   r_   r`   rg   rQ   r   r   rS   rS   f   s   6:I)K )# )RU )`d )"*# *#X'd;	+A&B 'T] 'bm 'R	-[)5K0L 	-QYZcQd 	-*[ *U *
"${I'="> "H  &{ &y &!&:p#[ p#d;	;Q6R p#WZ p#r   rS   z-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remotez+and|article|body|column|content|main|shadow)unlikelyCandidatesokMaybeItsACandidatezdisplay:\s*noner   c                     d| j                   v r&t        j                  | j                  dd            ryd| j                   v ry| j                  d      dk(  rd| j                  dd      vryy	)
zT
    Checks if the node is visible by considering style, attributes, and class.
    r[   r\   Fhiddenzaria-hiddentruezfallback-imager   T)r   DISPLAY_NONEry   r   )r   s    r   is_node_visibler     sl    
 $++,"5"5dhhw6K"L4;;xx&(-=TXXF . r   optionsc                    t        |       }|y|j                  dd      }|j                  dd      }|j                  dt              }t        |j	                  d            }|j                  d |j	                  d	      D               d
}|D ]  } ||      s|j                  dd       d|j                  dd       }	t        d   j                  |	      rt        d   j                  |	      sd|j	                  d      rvt        |j                         j                               }
|
|k  r|t        |
|z
        z  }||kD  s y y)z]
    Decides whether or not the document is reader-able without parsing the whole thing.
    Fmin_content_length   	min_score   visibility_checkerz.//p | .//pre | .//articlec              3   <   K   | ]  }|j                           y wrH   )rs   )r   r   s     r   r   z)is_probably_readerable.<locals>.<genexpr>  s     Ed!Es   z	.//div/brg        r   r\   r   r   r   r   z./parent::li/pT)r   r   r   r   r   r   REGEXPSry   rA   rB   r   r   )r>   r   rU   r   r   r   nodesrF   r   class_and_idtext_content_lengths              r   is_probably_readerabler     sR    D/C
{ %93?K,I %9?K		678E	LLEcii.DEEE !$'((7B/0$((42D1EF'(//=g"G

&
G ::&'!$"3"3"5";";"=>!33),>>??9'* r   ).rN   r   remathr   operatorr   typingr   r   r   r   
lxml.etreer	   	lxml.htmlr
   r   utilsr   r   	getLoggerrK   rd   compilerx   r   r   DIV_TO_P_ELEMSr   r   r   r   r   Ir   r   r   r   rC   rE   rS   r   r   boolr   r   rQ   r   r   <module>r     s  $  	   + +  6 "			8	$ BJJz"	8k 8c 8 Y
*RX <  'BJJ 	[
 )bjj)OQSQUQUV"**V
 "** 	y
 #

9244 rzzH"$$O#( f
4L	*k *c *
& &P# P#r
 %"** 	A
 'BJJ6 rzz,bdd3+ $   <> % %s %$ %r   