
    (#hq&                        d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ  ej8                  e      Z ej>                  d      Z  ej>                  dejB                        Z" ej>                  d      Z# ej>                  d      Z$ ej>                  d      Z% ej>                  d      Z& ej>                  d      Z' ej>                  d      Z( ej>                  d      Z)g dZ* G d d      Z+dddefde,dee,   de-de.de/de	e,   fdZ0de,d ee,   de-fd!Z1d"e,de	e,   fd#Z2d$ee,   d"e,de	e,   fd%Z3y)&z#
Deriving link info from sitemaps.
    N)islice)sleep)CallableListSetOptionalPattern)	clean_urlextract_domainfilter_urlsfix_relative_urlsget_hostinfolang_filter   )is_similar_domain)	fetch_urlis_live_page)	MAX_LINKSMAX_SITEMAPS_SEENz.<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>z<xhtml:link.+?>zhref=["\'](.+?)["\']zg(?:blogger|blogpost|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\.z^.{0,5}<\?xml|<sitemap|<urlsetz\.xml(\..{2,4})?$|\.xml[?#]zhttps?://[^\s<"]+z
\?.*$|#.*$z\.xml\b)zsitemap.xmlzsitemap.xml.gzsitemapzsitemap_index.xmlzsitemap_news.xmlc                       e Zd ZdZg dZ	 	 ddededee   dee   ded	dfd
Z	ddZ
ded	dfdZdee   dedeegdf   d	dfdZddZddZddZy)SitemapObjectzCStore all necessary information on sitemap download and processing.)	base_urlcontentcurrent_urldomainexternalseensitemap_urlstarget_langurlsNr   r   sitemapsurlsr    r   returnc                     || _         d| _        || _        || _        d| _        t               | _        || _        || _        g | _	        y )N )
r   r   r   r   r   setr   r   r    r!   )selfr   r   r"   r    r   s         Q/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/sitemaps.py__init__zSitemapObject.__init__@   sH     &!& "!e	'3*5!	    c                     t         j                  d| j                         t        | j                        xs d| _        | j
                  j                  | j                         y)z!Fetch a sitemap over the network.zfetching sitemap: %sr%   N)LOGGERdebugr   r   r   r   addr'   s    r(   fetchzSitemapObject.fetchR   sD    +T-=-=> !1!128b		d&&'r*   linkc                 L   || j                   k(  ryt        | j                  |      }t        || j                        xs d}|rt        || j                        syt        |d      }|t        j                  d|       y| j                  sMt        j                  |      s8t        | j                  |      s"t        j                  d| j                  |       yt        j                  |      r| j                   j#                  |       y| j$                  j#                  |       y)z^Examine a link and determine if it's valid and if it leads to
        a sitemap or a web page.Nr%   T)fastzcouldn't extract domain: %sz-link discarded, diverging domain names: %s %s)r   r   r   r
   r    r   r   r,   errorr   WHITELISTED_PLATFORMSsearchr   r   warningDETECT_SITEMAP_LINKr   appendr!   )r'   r1   	newdomains      r(   handle_linkzSitemapObject.handle_linkX   s     4### 5t//06B;tT-=-=>"4d3	LL6=
 )00;%dkk9=NN?i %%d+$$T*IIT"r*   regexindexhandlerc                    fdt        |j                  | j                        t              D        D ]
  } ||        t        j                  dt        | j                        t        | j                        | j                         y)zJExtract links from the content using pre-defined regex, index and handler.c              3   (   K   | ]	  }|     y w)N ).0mr=   s     r(   	<genexpr>z.SitemapObject.extract_links.<locals>.<genexpr>~   s      
AeH
s   z%%s sitemaps and %s links found for %sN)
r   finditerr   r   r,   r-   lenr   r!   r   )r'   r<   r=   r>   matchs     `  r(   extract_linkszSitemapObject.extract_linksz   sl    
$U^^DLL%A9M
 	E EN	 	3!!"		N		
r*   c                      d j                   vryt        j                  d j                   dt        j                        dt
        ddf fd} j                  t        d|       y)	z7Extract links corresponding to a given target language.z	hreflang=Nzhreflang=[\"'](z.*?|x-default)[\"']attrsr#   c                     j                  |       r-t        j                  |       }|rj                  |d          yyy)z!Examine language code attributes.r   N)r6   HREFLANG_REGEXr;   )rJ   
lang_match
lang_regexr'   s     r(   handle_lang_linkzASitemapObject.extract_sitemap_langlinks.<locals>.handle_lang_link   s@      '+2259
$$Z]3  (r*   r   )r   recompiler    DOTALLstrrH   XHTML_REGEX)r'   rO   rN   s   ` @r(   extract_sitemap_langlinksz'SitemapObject.extract_sitemap_langlinks   sb    dll*ZZt//00CDbii

	4C 	4D 	4 	;+;<r*   c                 F    | j                  t        d| j                         y)z=Extract sitemap links and web page links from a sitemap file.r   N)rH   
LINK_REGEXr;   r/   s    r(   extract_sitemap_linksz#SitemapObject.extract_sitemap_links   s    4++	
r*   c                 V   t        | j                  | j                        }|syt        j	                  | j                        s"| j                  t        d| j                         y| j                  )| j                          | j                  s| j                  ry| j                          y)z5Download a sitemap and extract the links it contains.Nr   )is_plausible_sitemapr   r   SITEMAP_FORMATrG   rH   DETECT_LINKSr;   r    rU   r   r!   rX   )r'   	plausibles     r(   processzSitemapObject.process   s    ()9)94<<H	##DLL1|Q0@0@A'**,  DII""$r*   )NF)r#   N)__name__
__module____qualname____doc__	__slots__rS   r   r   boolr)   r0   r;   r	   intr   rH   rU   rX   r^   rA   r*   r(   r   r   2   s    I
I" &*"" " 3i	"
 c]" " 
"$( #  #  #D
S\
*-
8@#8M
	
=$
%r*   r   Fg       @urlr    r   
sleep_timemax_sitemapsr#   c                    t        |       \  }}|t        j                  d|        g S t        |      st        j                  d|        g S d}| j	                  d      r| g}ng }t        |       t        |      dz   kD  r| }t        |||||      }	|	j                  s+t        |      xs t        D 
cg c]	  }
| d|
  c}
|	_        |	j                  rt        |	j                        |k  r|	j                  j                         |	_        |	j                          |	j                          |	j                  D cg c]  }||	j                  vs| c}|	_        t        |	j                        |k  rt        |       |	j                  rt        |	j                        |k  r|rt!        |	j"                  |      |	_        t        j%                  dt        |	j"                        |       |	j"                  S c c}
w c c}w )ax  Look for sitemaps for the given URL and gather links.

    Args:
        url: Webpage or sitemap URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
                     (two-letter string, ISO 639-1 format).
        external: Similar hosts only or external URLs
                  (boolean, defaults to False).
        sleep_time: Wait between requests on the same website.
        max_sitemaps: Maximum number of sitemaps to process.

    Returns:
        The extracted links as a list (sorted list of unique links).

    Nzinvalid URL: %sz*base URL unreachable, dropping sitemap: %s)z.gzr   z.xml   /z%s sitemap links found for %s)r   r,   r7   r   endswithrF   r   r   find_robots_sitemapsGUESSESr   popr   r0   r^   r   r   r!   r-   )rf   r    r   rg   rh   
domainnamebaseurl	urlfiltersitemapurlsr   gss               r(   sitemap_searchrv      s   . 's+J(#.	 CSI	I
||./es8c'lQ&&IGZk8TG 3G<  
&-A
!"wiqA

 

3w||#4|#C%22668 ++ 
q/DA 
 w|||+* 

3w||#4|#C "7<<;
LL0#gll2CZP<<+A
 
s   1G38G8G8contentsc                    |yt         j                  d|       } t        j                  |       r%t	        |t
              r*t        j                  |      rd|dd j                         v rt        j                  d|        yy)zLCheck if the sitemap corresponds to an expected format,
    i.e. TXT or XML.NFr%   z<html   znot a valid XML sitemap: %sT)SCRUB_REGEXsubPOTENTIAL_SITEMAPr6   
isinstancerS   r[   rG   lowerr,   r7   )rf   rw   s     r(   rZ   rZ      sr      //"c
"C 	  %Hc*.2F2Fx2Phtn**,,4c:r*   rq   c                 6    t        | dz         }t        ||       S )zUGuess the location of the robots.txt file and try to extract
    sitemap URLs from itz/robots.txt)r   extract_robots_sitemaps)rq   	robotstxts     r(   rm   rm     s      'M12I"9g66r*   r   c                 P   | t        |       dkD  rg S g }| j                         D ]  }|j                  d      }|dk\  r|d| }|j                         }|s1|j	                  dd      }t        |      dk(  sR|d   j                         j                         |d<   |d   dk(  s|j                  |d   j                                 t        t        j                  |            }|D cg c]  }|st        ||       }}t        j                  d	t        |             |S c c}w )
z.Read a robots.txt file and find sitemap links.Ni'  #r   :r   rj   r   z%s sitemaps found in robots.txt)rF   
splitlinesfindstripsplitr~   r9   listdictfromkeysr   r,   r-   )r   rq   
candidateslinei
line_partsurs   s           r(   r   r     s    C	NU2	J$$& 9IIcN68Dzz|ZZQ'
z?a&qM//1779JqM!}	)!!*Q-"5"5"789 dmmJ/0J:DJQ$Wa0JKJ
LL2C4DE Ks   *D#2D#)4rb   loggingrP   	itertoolsr   timer   typingr   r   r   r   r	   courlanr
   r   r   r   r   r   deduplicationr   	downloadsr   r   settingsr   r   	getLoggerr_   r,   rQ   rW   rR   rT   rL   r5   r[   r8   r\   rz   r|   rn   r   rS   rd   floatre   rv   rZ   rm   r   rA   r*   r(   <module>r      s    	   9 9  - . 2 
		8	$RZZIJ
bjj+RYY734"

n  => bjj!?@ rzz./bjj'BJJz* ~% ~%F "&)C	C#C C 	C
 C 
#YCLc Xc] t *7# 7$s) 7x} s tCy r*   