
    (#h                        d Z ddlZddlmZmZ ddlmZmZmZ ddl	m
Z
mZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$  ejJ                  e&      Z'da(dZ)dedefdZ*dededede+de,dedeee+e,f   fdZ-dee+   fdZ.dedee+   defdZ/dede+de+defd Z0dededeee+e,f   fd!Z1dededeee+e,f   fd"Z2y)#z.
Functions grounding on third-party software.
    N)AnyTuple)ParagraphMakerclassify_paragraphsrevise_paragraph_classification)get_stoplistget_stoplists)_ElementElement
strip_tagstostring)HtmlElement   )basic_cleaning)convert_tagsprune_unwanted_nodestree_cleaning)Document)JUSTEXT_LANGUAGES)fromstring_bytestrim)TEI_VALID_TAGS)OVERALL_DISCARD_XPATHz.//aside|.//audio|.//button|.//fieldset|.//figure|.//footer|.//iframe|.//input|.//label|.//link|.//nav|.//noindex|.//noscript|.//object|.//option|.//select|.//source|.//svg|.//time	htmlinputreturnc                     	 t        | dd      }t        |j                               }||S t               S # t        $ r*}t
        j                  d|       t               cY d}~S d}~ww xY w)z6Safety net: try with the generic algorithm readability      )min_text_lengthretry_lengthNzreadability_lxml failed: %s)ReadabilityDocumentr   summaryr   	ExceptionLOGGERwarning)r   docr"   errs       Q/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/external.pytry_readabilityr)       s^    !)RcR"3;;=1!-w@;=@ 4c:}s   *7 	7 	A* A%A*%A*treebackup_treebodytextlen_textoptionsc                     |j                   dk(  r||j                  dz  kD  r|||fS d\  }}|j                   dk(  rt        |t              }t	        |      }t        t        |dd      j                  d            }	t        |	      }
t        j                  d|
|       |
d	|fv rd
}n|d	k(  r|
d	kD  rd}n|d|
z  kD  rd
}n|
d|z  kD  r|	j                  d      sd}n|j                  d      s|
|j                  dz  kD  rd}nt        |j                  d            t        |j                  d            kD  r|
|j                  dz  kD  rd}n]|j                   dk(  r*|j                  d      s|j                  d      r|
|kD  rd}n$t        j                  d||
|j                         d
}|r'||	|
}}}t        j                  d|j                         n t        j                  d|j                         |j                  t              s||j                  k  rat        j                  d|j                         t!        | |      \  }}}t#        |      }|r$|d|z  kD  st        j                  d|       |||}}}|r|st%        ||      \  }}}|||fS )zZDecide whether to choose own or external extraction
       based on a series of heuristicsrecall
   )FF	precisionr-   zutf-8)methodencodingz0extracted length: %s (algorithm) %s (extraction)r   FT   {z.//p//text()z.//tablez.//pz.//headz.//h2|.//h3|.//h4zextraction values: %s %s for %szusing generic algorithm: %szusing custom extraction: %sz3unclean document triggering justext examination: %s   zusing justext, length: %s)focusmin_extracted_sizer   r   r)   r   r   decodelenr$   debug
startswithxpathfindallsourceSANITIZED_XPATHjustext_rescueboolsanitize_tree)r*   r+   r,   r-   r.   r/   use_readability	jt_resulttemppost_algo	algo_textlen_algobody2text2	len_text2s                 r(   compare_extractionrN   -   sv    }} X0J0JR0O%OT8##!-OY}}#*;8MN $K0MXmFWMTTU\]^I9~H LLCXxXAx= 	Q8a<	AL	 	AL	 )=)=c)BZZ'Hw7Q7QTU7U,U	T\\*%	&T\\&-A)B	BxRYRlRlopRpGp	(	"4::i+@]EXEXYlEmrz  ~F  sF6(GNN[ ,iHd2GNNC2GNNC zz/"h1K1K&KJGNN["0w"?uiK	AiK/LL4i@#(%$D y,T7;dHx    c                      t               } t               D ]  }| j                  t        |              t	        |       at
        S )z8Retrieve and return the content of all JusText stoplists)setr	   updater   tupleJT_STOPLIST)stoplistlanguages     r(   jt_stoplist_initrW   o   s<     uH!O 0X./0/KrO   rU   c           
      l    t        j                  |       }t        ||dddddd       t        |d       |S )z(Customized version of JusText processing2      g?g?g      ?T)r   make_paragraphsr   r   )r*   rU   
paragraphss      r(   custom_justextr]   y   s8    //5J
Hb#sCtL#J4rO   urltarget_languagec                 p   t        d      }|t        v rt        t        |         }nt        xs
 t	               }	 t        | |      }|D ]=  }|j                  rt        d      |j                  c}|_        |j                  |       ? |S # t        $ r"}t        j                  d||       Y d}~|S d}~ww xY w)z9Second safety net: try with the generic algorithm justextr,   pzjustext %s %sN)r   r   r   rT   rW   r]   is_boilerplater-   appendr#   r$   error)	r*   r^   r_   result_bodyjustext_stoplistr\   	paragraphelemr'   s	            r(   try_justextri      s     &/K++'(9/(JK&<*:*<
%#D*:;
 $ 	%I''%clINNOD$)t$	%   0_c3// 0s   B
 
	B5B00B5c                     t        |       } t        | |j                  |j                        }t	        dj                  |j                                     }||t        |      fS )z1Try to use justext algorithm as a second fallback )r   ri   r^   langr   joinitertextr<   )r*   r/   rH   	temp_texts       r(   rC   rC      sQ     $Dgkk7<<@MSXXm44678I)S^33rO   c                 p   t        | |      }|j                  du rt        |d       t        |d       t        ||      }|j	                  ddd      D ]P  }|j
                  dk(  rd|_        |j
                  dv s)|j
                  dk(  r|j                  d	d
       d|_        R t        |j	                  d            D cg c]  }|j
                   c}D cg c]  }|t        vr| }}t        |g|  t        dj                  |j                                     }||t        |      fS c c}w c c}w )zLConvert and sanitize the output from the generic algorithm (post-processing)Faspantdthtrrow)rs   rt   roleheadcell*rk   )r   linksr   r   itertagrQ   r   r   rm   rn   r<   )r*   r/   cleaned_treerh   elementtagnamesanitization_listr-   s           r(   rE   rE      s+    !w/L}}<%|V$g6L!!$d3  88tDHXX%xx4(DH 47|7H7H7M3NOO.( 	 
 |0/0..012Ds4y(( Ps   D.D3)3__doc__loggingtypingr   r   justext.corer   r   r   justext.utilsr   r	   
lxml.etreer
   r   r   r   	lxml.htmlr   baseliner   htmlprocessingr   r   r   readability_lxmlr   r!   settingsr   utilsr   r   xmlr   xpathsr   	getLogger__name__r$   rT   rB   r)   strintrN   rW   r]   ri   rC   rE    rO   r(   <module>r      sp     ^ ] 5 > > ! % M M = ' )  )			8	$ I
{ 
{ 
? [ ? { ? ( ? Z] ? il ? wz ?   @E  FN  PS  UX  FX  @Y ? D%*  c
 s k  c h 04 4s 4uXsC=O7P 4) )c )eKc<Q6R )rO   