
    (#hD                        d Z 	 ddlZdZddlZddlZ	 ddlZdZddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlmZ 	 ddlZdZ	 ddlZdZ	 ddlZdZ	 dd	lmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'  ejP                  e)      Z*ddhZ+ ejX                  dejZ                        Z. ejX                  dejZ                        Z/ ejX                  d      Z0 e$ddddd      Z1 ejX                  dejd                  ejf                  z        Z4 ejX                  d      Z5 ejX                  d      Z6h dZ7ddhZ8dZ9 ejX                  d      Z: ejX                  dejv                        Z<de=de=fd Z>d!e=de?fd"Z@d#e=deeA   fd$ZBdee=eAf   deAfd%ZCd&eAde?fd'ZDd(eAd&eAdeAfd)ZEd*eAdee#   fd+ZFd*edee#   fd,ZG e	d-.      d/eAdeAfd0       ZHd1eAdeAfd2ZIdTd1eAd3ed4   deAfd5ZJ e	d6.      dUd7eAd8e?d9e?deeA   fd:       ZKdUd;eAd8e?d9e?deeA   fd<ZLd=e!de!fd>ZM e	d6.      d1eAdeAfd?       ZNd@e!de?fdAZOdBeeA   de?fdCZPdDedEeQdefdFZRdGeQdHede?fdIZSdVd=e#dJeAdKe?de?fdLZTdMeAdNeAdeeA   fdOZUdMeAdNeAdJeAdPedee?ef   f
dQZVd@e!de?fdRZWd1eeA   de?fdSZXy# e$ r dZY w xY w# e$ r dZY w xY w# e$ r dZY yw xY w# e$ r dZY w xY w# e$ r dZY w xY w# e$ r dZY w xY w)Wzj
Module bundling functions related to HTML and text processing,
content filtering and language detection.
    NTF)	lru_cache)islice)AnyListLiteralOptionalTupleUnion	normalize)detect)
from_bytes)_Element)HtmlElement
HTMLParser
fromstring)HTTPResponseutf-8utf_8z^< ?! ?DOCTYPE.+?/ ?>z(<html.*?)\s*/>z(<!--.*?-->|<[^>]*>))collect_idsdefault_doctypeencodingremove_comments
remove_pisz(?<![p{P}>])\n)flagsz^https?://|/+$z3[^\s]+\.(avif|bmp|gif|hei[cf]|jpe?g|png|webp)(\b|$)>   phitdrefcellheaditemquotecodepre)zhttp-equiv="content-language"zproperty="og:locale"z
([a-z]{2})z\W*(Drucken|E-?Mail|Facebook|Flipboard|Google|Instagram|Linkedin|Mail|PDF|Pinterest|Pocket|Print|QQ|Reddit|Twitter|WeChat|WeiBo|Whatsapp|Xing|Mehr zum Thema:?|More on this.{,8}$)$filecontentreturnc                 4   t        | t              s| S t        r| dd dk(  r	 t        j                  |       S t        r| dd dk(  r	 t        j                  |       S t        r	 t        j                  |       S t        r	 t        j                  |       S | S # t
        $ r t        j                  d       Y ~w xY w# t        j                  $ r t        j                  d       Y w xY w# t        j                  $ r Y w xY w# t        j                  $ r Y | S w xY w)z
    Don't trust response headers and try to decompress a binary string
    with a cascade of installed packages. Use magic numbers when available.
    N   s   zinvalid GZ file   s   (/zinvalid ZSTD file)
isinstancebytesHAS_GZIPgzip
decompress	ExceptionLOGGERwarningHAS_ZSTD	zstandard	ZstdError
HAS_BROTLIbrotlierrorHAS_ZLIBzlib)r&   s    N/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/utils.pyhandle_compressed_filer<   ^   s   
 k5) KO6	.??;// KO'::	0''44 	$$[11 	??;//
 -  	.NN,-	. "" 	0NN./	0 || 		 zz 	 		sG   B B9 "C' >D  B65B69(C$#C$'C=<C= DDdatac                 F    	 | j                  d       y# t        $ r Y yw xY w)zLSimple heuristic to determine if a bytestring uses standard unicode encodingzUTF-8FT)decodeUnicodeDecodeError)r=   s    r;   isutf8rA      s,    G   s    	  bytesobjectc                    t        |       rdgS g }t        /t        |       d   }||j                  |j                                t	        |       dk  rt        |       }n!t        | dd | dd z         xs t        |       }t	        |      dkD  r)|j                  |D cg c]  }|j                   c}       |D cg c]  }|t        vs| c}S c c}w c c}w )z="Read all input or first chunk and return a list of encodingsr   Nr   i'  i  ixr   )	rA   cchardet_detectappendlowerlenr   extendr   UNICODE_ALIASES)rB   guessescchardet_guessdetection_resultsrgs         r;   detect_encodingrO      s     kyG"(5jA%NN>//12
;%&{3&{5D'9K<O'OP 4&{3 	 !,=>q

>?;!!?":A;; ?;s   C<C
Cc                    t        | t              r| S d}t        |       } t        |       D ]  }	 | j	                  |      } n |xs t        | dd      S # t
        t        f$ r t        j                  d|       d}Y Sw xY w)zCheck if the bytestring could be GZip and eventually decompress it,
       guess bytestring encoding and try to decode to Unicode string.
       Resort to destructive conversion otherwise.Nzwrong encoding detected: %sr   replace)r   errors)	r+   strr<   rO   r?   LookupErrorr@   r1   r2   )r&   htmltextguessed_encodings      r;   decode_filerW      s     +s#H )5K+K8 	"))*:;H
  Ks;KK /0 	NN8:JKH	s   A'A?>A?	beginningc                 
    d| vS )zOAssess if the object is proper HTML (awith a corresponding tag or declaration).html )rX   s    r;   is_dubious_htmlr\      s    ""    
htmlstringc                 8   d|v r3| j                  d      \  }}}t        j                  d|d      dz   |z   } t        t	        | j                                     D ]=  \  }}d|v r,|j                  d      rt        j                  d| d      }  | S |d	kD  s< | S  | S )
z>Repair faulty HTML strings to make then palatable for libxml2.doctype
    )countz<htmlz/>z\1>   )	partitionDOCTYPE_TAGsub	enumerateiter
splitlinesendswithFAULTY_HTML)r^   rX   	firstline_restilines          r;   repair_faulty_htmlrs      s     I'11$7	1d __R!_<tCdJ
T*"7"7"9:; 4d?t}}T2$1EJ  q5 r]   
htmlobjectc                     d}	 t        | j                  dd      t              }|S # t        $ r!}t        j                  d|       Y d}~|S d}~ww xY w)z!Try to pass bytes to LXML parser.Nutf8surrogatepassparserzlxml parser bytestring %s)r   encodeHTML_PARSERr0   r1   r8   )rt   treeerrs      r;   fromstring_bytesr~      sU    D7*++FOD[Y K  70#66K7s   !' 	AAAc                    t        | t              r| S t        | t              st        | d      r| j                  } t        | t
        t        f      st        dt        |             d}t        |       } | dd j                         }t        |      }t        | |      } d}	 t        | t              }|t)        |      d	k  r|st!        |       }|3|du r/t)        |      d
k  r!t$        j'                  dt)        |             d}|S # t        $ r t!        |       }d}Y lt"        $ r }t$        j'                  d|       Y d}~d}~ww xY w)zLoad object given as input and validate its type
    (accepted: lxml.html tree, trafilatura/urllib3 response, bytestring and string)
    r=   zincompatible input typeN2   Frx   Tzlxml parsing failed: %src   re   z9parsed tree length: %s, wrong data type or not valid HTML)r+   r   r   hasattrr=   r,   rS   	TypeErrortyperW   rF   r\   rs   r   r{   
ValueErrorr~   r0   r1   r8   rG   )rt   r|   rX   
check_flagfallback_parser}   s         r;   	load_htmlr      s2   
 *k**l+wz6/J__
j5#,/14
3CDDDZ(J3B%%'I +J#J	:JN5*[9 	D	A~
+ J$.3t9q=GT	
 K!  
+ 5.445s    D EE%E  Ei @  )maxsizecharc                 J    | j                         s| j                         r| S dS )z3Return a character if it belongs to certain classesrb   )isprintableisspace)r   s    r;   return_printables_and_spacesr   
  s"     ##%4?R?r]   stringc                 @    dj                  t        t        |             S )z6Prevent non-printable and XML invalid character errorsrb   )joinmapr   r   s    r;   remove_control_charactersr     s    7733V<==r]   unicodeform)NFCNFDNFKCNFKDc                     t        ||       S )z;Normalize the given string to the specified unicode format.r   )r   r   s     r;   normalize_unicoder     s    [&))r]   i   rr   preserve_spacetrailing_spacec                    t        | j                  dd      j                  dd      j                  dd            }|st        t        j	                  d|            }t        t        t        j                  |            rd}|S |rB| d	   j                         rdnd
}| d   j                         rdnd
}d
j                  |||g      }|S )zmRemove HTML space entities, then discard incompatible unicode
       and invalid XML characters on line levelz&#13;z&#10;ra   z&nbsp;     Nr   rb   )
r   rQ   trimLINES_TRIMMINGrh   allr   rS   r   r   )rr   r   r   new_linespace_beforespace_afters         r;   line_processingr     s     )gt)D)L)LWVZ)[)c)cdlnv)wxH **4:;s3;;)*H
 O	 "&q'//"33L!%b!1!1!3#KwwhDEHOr]   textc                     |rt        | d      S 	 dj                  t        dfd| j                         D                    j	                  dd      S # t
        $ r Y yw xY w)z<Convert text and discard incompatible and invalid charactersTra   Nc              3   6   K   | ]  }t        |        y w)N)r   ).0lr   s     r;   	<genexpr>zsanitize.<locals>.<genexpr>6  s     &eaq.'I&es   u   ␤rb   )r   r   filterrk   rQ   AttributeError)r   r   r   s    ` r;   sanitizer   /  sc     t^T::yy&eSWSbSbSd&efgoopxz|}} s   AA 	A A r|   c                 R   | j                         D ]  }|j                         }||j                  nd}|j                  t        v xs |t        v }|j                  t        v xs |t        v xs |}|j
                  D ]S  }d|v s|j
                  |   r"|j                  dd      d   | j                  vs9|j
                  j                  |       U |j                  rt        |j                  ||      |_	        |j                  st        |j                  ||      |_         | S )z?Trims spaces, removes control characters and normalizes unicoderb   :rc   r   )rj   	getparenttagSPACING_PROTECTEDFORMATTING_PROTECTEDattribsplitnsmappopr   r   tail)r|   elemparent
parent_tagr   r   	attributes          r;   sanitize_treer   ;  s   		 L!#)#5VZZ2
 %66Y*HY:Y%99qZK_=_qcq  	/Ii{{9-a1H1KSWS]S]1]KKOOI.	/
 99 NNKDI99 NNKDI%L& Kr]   c                     	 dj                  | j                               j                         S # t        t        f$ r Y yw xY w)z/Remove unnecessary spaces within a text string.r   rb   )r   r   stripr   r   r   s    r;   r   r   S  s<    xx'--//I& s   ,/ A Aelementc                     dD ]!  }| j                  |d      }t        |      s! y | j                  j                         D ]$  \  }}|j	                  d      st        |      s$ y y)z*Check if an element is a valid img element)data-srcsrcrb   Tr   F)getis_image_filer   items
startswith)r   attrr   values       r;   is_image_elementr   ]  si    # kk$# #>>//1 	KD%z*}U/C	 r]   imagesrcc                 `    | t        |       dkD  ryt        t        j                  |             S )zCheck if the observed string corresponds to a valid image extension.
       Use a length threshold and apply a regex on the content.i    F)rG   boolIMAGE_EXTENSIONsearch)r   s    r;   r   r   k  s.     3x=4/&&x011r]   iterablenc              #      K   t        |       }t        t        ||            x}r| t        t        ||            x}ryyw)zChunk data into smaller pieces.N)rj   tupler   )r   r   iteratorbatchs       r;   make_chunksr   s  sF      H~H!,-
-%
- !,-
-%
-s   >AAmy_lenoptionsc                     | |j                   k  r!t        j                  d|j                         y| |j                  kD  r"t        j                  d| |j                         yy)z=Check if the document length is within acceptable boundaries.ztoo small/incorrect for URL %sFztoo large: length %s for URL %sT)min_file_sizer1   r8   urlmax_file_size)r   r   s     r;   is_acceptable_lengthr   {  sO    %%%5w{{C%%%6Lr]   target_languagestrictc                 P   t         D ]G  }| j                  d| d      }|st        fd|D              r yt        j	                  d|        y |r>| j                  d      }|r+t        fd|D              ryt        j	                  d	       yt        j	                  d
       y)zrCheck HTML meta-elements for language information and split
       the result in case there are several languages.z	.//meta[@z][@content]c              3      K   | ]9  }t         j                  |j                  d d      j                               v  ; yw)contentrb   NRE_HTML_LANGr   r   rF   r   r   r   s     r;   r   z"check_html_lang.<locals>.<genexpr>  s6     l^b?l&8&8)R9P9V9V9X&YYl   ?ATz%s lang attr failedFz//html[@lang]c              3      K   | ]9  }t         j                  |j                  d d      j                               v  ; yw)langrb   Nr   r   s     r;   r   z"check_html_lang.<locals>.<genexpr>  s6     i[_?l&8&8&"9M9S9S9U&VVir   zHTML lang failedzNo relevant lang elements found)TARGET_LANG_ATTRSfindallanyr1   debugxpath)r|   r   r   r   elemss    `   r;   check_html_langr     s     " yk:;lfkllLL.5 

?+ichiiLL+,
LL23r]   	temp_texttemp_commentsc                     t         du rFt        |       t        |      kD  rt        j                  |       nt        j                  |      \  }}|S t        j                  d       d}|S )zARun external component (if installed) for language identificationTz3Language detector not installed, skipping detectionN)LANGID_FLAGrG   	py3langidclassifyr1   r2   )r   r   resultro   s       r;   language_classifierr     sd    d 9~M 22 y)##M2 	 M 	LMMr]   docmetac                     |[t        | |      |_        |j                  >|j                  |k7  r/t        j                  d|j                  |j                         d|fS d|fS )zFFilter text based on language detection and store relevant informationzwrong language: %s %sTF)r   languager1   r2   r   )r   r   r   r   s       r;   language_filterr     sc     ".y-H 'G,<,<,ONN2G4D4DgkkR= '>r]   c                     | j                   | j                  n| j                   }| xsC |j                         xs1 t        t	        t
        j                  |j                                     S )zFilter out unwanted text)r   r   r   r   r   	RE_FILTERmatchrk   )r   testtexts     r;   
textfilterr    sN    &||3w||H<a8++-aS(J]J]J_5`1aar]   c                 >    t        |       xr | j                          S )zJDetermine if a string is only composed of spaces and/or control characters)r   r   r   s    r;   text_chars_testr    s     <0 000r]   )r   )FF)F)Y__doc__r.   r-   ImportErrorloggingrer:   r9   	functoolsr   	itertoolsr   typingr   r   r   r   r	   r
   unicodedatar   r7   r6   r4   r3   r   r   cchardetr   rD   charset_normalizerr   
lxml.etreer   	lxml.htmlr   r   r   urllib3.responser   	getLogger__name__r1   rI   compileIrg   rm   HTML_STRIP_TAGSr{   UNICODE	MULTILINEr   URL_BLACKLIST_REGEXr   r   r   r   r   
IGNORECASEr  r,   r<   r   rA   rS   rO   rW   r\   rs   r~   r   r   r   r   r   r   r   r   r   r   intr   r   r   r   r   r  r  r[   r]   r;   <module>r     s  
H  	H    = = !JH
K
2 *  9 9 ) 
		8	$G$bjj0"$$7bjj+RTT2"**45 UEGeivz{-RZZ5LM bjj!23  "**STP UO  N rzz-( BJJ [  ]],	" "% "J 4 < <49 <2LU5#:. L3 L2#s #t #
3 3 3   +)> *# *(;"7 *Z 5@s @s @ @
>c >c >
*c *8T0U *be *
 4# t T ^fgj^k  (	3 	 	d 	W_`cWd 	 X 0 4   h 4 2HSM 2d 2# # #  s t +  T VZ 03 s x} s 3  WZ _deiknen_o "b bT b1HSM 1d 1u  H  H  J  H  K  Osi   J J# J1 J? 
K K J J #J.-J.1J<;J<?K
	K
KKK&%K&