
    (#hd/                        d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZmZmZmZmZ 	 ddlZddlmZmZ d	d
lmZmZ d	dlmZmZmZ d	dlm Z  d	dl!m"Z"m#Z#m$Z$  ejJ                  e&      Z' edd      Z(dZ)dZ*dZ+ G d d      Z,de-de-de	ee-   ee-   f   fdZ.de-de	ee-   ee-   ee-   f   fdZ/de-de-dee   fdZ0de-dee   fdZ1de-dee-   de2fd Z3d!ee-   de2fd"Z4	 d5de-d#e,d$ee-   ddfd%Z5d&ee   d#e,ddfd'Z6	 	 	 	 	 d6d(e-d)ee-   d*ee   d!eee-      d+eee-      d,ee-   de,fd-Z7	 d7d#e,d.e2de,fd/Z8e*e+ddde ddfde-d0e9d1e9d!eee-      d2eee-      d)ee-   d3ed*ee   d,ee-   de	ee-   ee-   f   fd4Z:y# e$ r Y Sw xY w)8zC
Functions dedicated to website navigation and crawling/spidering.
    N)ConfigParser)sleep)ListOptionalTuple)RobotFileParser)UrlStoreextract_linksfix_relative_urlsget_base_urlis_navigation_pageis_not_crawlable)XPathtostring   )baselineprune_unwanted_nodes)Responsefetch_response	fetch_url)DEFAULT_CONFIG)LANGID_FLAGdecode_file	load_htmlF)
compressedstrictz/robots.txt
   i c                       e Zd ZdZg dZ	 	 	 ddedee   dee   dee   ddf
d	Zdedefd
Z	dedefdZ
deddfdZdeee      dee   fdZdedefdZy)CrawlParametersz6Store necessary information to manage a focused crawl.)	startbaselangrulesrefi	known_numis_onprune_xpathNr    r"   r#   r(   returnc                     || _         | j                  |      | _        | j                  |      | _        || _        |xs t        | j                        | _        d| _        d| _	        d| _
        || _        y )Nr   T)r    _get_base_urlr!   _get_referencer$   r"   	get_rulesr#   r%   r&   r'   r(   )selfr    r"   r#   r(   s        O/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/spider.py__init__zCrawlParameters.__init__0   sh      
++E2	++E2#'	050M4999M

*5    c                 <    t        |      }|st        d|       |S )z#Set reference domain for the crawl.zcannot start crawl: )r   
ValueError)r.   r    r!   s      r/   r+   zCrawlParameters._get_base_urlA   s'     '3E7;<<r1   c                 X    |j                  d      dk\  r|j                  dd      d   S |S )zDetermine the reference URL./   r   r   )countrsplit)r.   r    s     r/   r,   zCrawlParameters._get_referenceH   s-    */++c*:a*?u||C#A&JUJr1   	url_storec                     t        |j                  | j                              | _        t	        |j                  | j                              | _        y)z*Adjust crawl data based on URL store info.N)boolfind_unvisited_urlsr!   r'   lenfind_known_urlsr&   )r.   r9   s     r/   update_metadatazCrawlParameters.update_metadataL   s8    )77		BC
Y66tyyABr1   todoc                 r    |sg S |D cg c]#  }|| j                   k7  s| j                  |v s"|% c}S c c}w )z.Prepare the todo list, excluding invalid URLs.)r    r$   )r.   r@   us      r/   filter_listzCrawlParameters.filter_listQ   s2    IEa1

?txx1}EEEs   444linkc                     | j                    xs | j                   j                  d|      xr | j                  |v xr t        |       S )z9Run checks: robots.txt rules, URL type and crawl breadth.*)r#   	can_fetchr$   r   )r.   rD   s     r/   is_valid_linkzCrawlParameters.is_valid_linkW   sI     ^>tzz33C> +D +$T**	
r1   NNN)__name__
__module____qualname____doc__	__slots__strr   r   r0   r+   r,   r	   r?   r   rC   r;   rH    r1   r/   r   r   ,   s    <cI
 #+/%)66 sm6 (	6
 c]6 
6"3 3 KC KC KC Cd C
Fc 3 FS	 F
# 
$ 
r1   r   
htmlstringhomepager)   c                    d| vrd| vr| |fS t        |       }|| |fS |j                  d      }|r|d   nd}|rd|vrt        j                  d|       | |fS |j	                  d      d   j                         j                         j                  d	d      }|j                  d
      st        |      }t        ||      }t        |      }|t        j                  d|       yt        j                  d|       ||fS )z:Check if there could be a redirection by meta-refresh tag.z	"refresh"z	"REFRESH"z@.//meta[@http-equiv="refresh" or @http-equiv="REFRESH"]/@contentr    ;zno redirect found: %sr   zurl=httpzfailed redirect: %s)NNzsuccessful redirect: %s)r   xpathlogginginfosplitstriplowerreplace
startswithr   r   r   warning)rQ   rR   	html_treeresultsresulturl2base_urlnewhtmlstrings           r/   refresh_detectionrf   `   s    *$J)F8##*%I8## ooJG #WQZFS&,h78##<<Q%%'--/77CD??6"% 40dOM-t4LL*D1$r1   c                 H   t        | d      }|r|j                  sy|j                  | dfvr,t        j                  d|j                         |j                  } t        |j                        }t        ||       \  }}|yt        j                  d|       ||t        |      fS )zBCheck if the homepage is redirected and return appropriate values.FdecoderI   r5   zfollowed homepage redirect: %szfetching homepage OK: %s)	r   dataurlrX   rY   r   rf   debugr   )rR   responserQ   new_htmlstringnew_homepages        r/   probe_alternative_homepagerp      s     hu5H8== ||Hc?*5x||D<< X]]+J $5Z#J NLMM,l;<l)CCCr1   
robots_urlrj   c                     t               }|j                  |        	 |j                  |j                                |S # t        $ r }t
        j                  d|       Y d}~yd}~ww xY w)zEParse a robots.txt file with the standard library urllib.robotparser.zcannot read robots.txt: %sN)r   set_urlparse
splitlines	ExceptionLOGGERerror)rq   rj   r#   excs       r/   parse_robotsrz      s[     E	MM*DOO%& L  137s   > 	A'A""A'rd   c                 J    | t         z   }t        |      }|rt        ||      S dS )z?Attempt to fetch and parse robots.txt file for a given website.N)ROBOTS_TXT_URLr   rz   )rd   rq   rj   s      r/   r-   r-      s*    N*JZ D-1<
D);t;r1   languagec                     | r=|r;t         r5t        |       \  }}}t        j                  |      \  }}t	        ||k(        S y)zRun a baseline extraction and use a language detector to
    check if the content matches the target language.
    Return True if language checks are bypassed.T)r   r   	py3langidclassifyr;   )rQ   r}   _textrb   s        r/   is_target_languager      sB     h;j)
4&&t,	Fh&''r1   r@   c                 &    t        d | D              S )z6Probe if there are still navigation URLs in the queue.c              3   2   K   | ]  }t        |        y w)N)r   ).0rk   s     r/   	<genexpr>z&is_still_navigation.<locals>.<genexpr>   s     73!#&7s   )any)r@   s    r/   is_still_navigationr      s    7$777r1   paramsrk   c           	      h   t        | |j                        sy| r|j                  t        |j                  t              r|j                  g|_        t        |       }|Ft        ||j                  D cg c]  }t        |       c}      }t        |      j                         } g g }}t        | |xs |j                  d|j                  dd      D ]B  }|j                  |      st        |      r|j                  |       2|j                  |       D t        j!                  ||       yc c}w )zExamine the HTML code and process the retrieved internal links.
    Extract and filter new internal links after an optional language check.
    Store the links in todo-list while prioritizing the navigation ones.NFT)pagecontentrk   external_boolr}   with_navr   )urls
appendleft)r   r"   r(   
isinstancerO   r   r   r   r   ri   r
   r!   rH   r   append	URL_STOREadd_urls)rQ   r   rk   treexlinkslinks_priorityrD   s           r/   process_linksr      s    j&++6f((4f((#."("4"4!5F$'ASAS.TAuQx.TUD!$..0J>E6;;  ##D)d#!!$'LL En=' /Us   3D/
rm   c                     | | j                   syt        j                  | j                  gd       t	        t        | j                         ||j                         y)z2Convert urllib3 response object and extract links.NT)visited)rj   r   r   rk   r   r   r!   )rm   r   s     r/   process_responser      sG    
 x}}~t4 +hmm,ffkkBr1   r    r"   r#   knownr(   c                    t        | |||      }t        j                  |xs g d       t        j                  |j                  |             t        j	                  |j
                  |j                         |s1t        j                  |j                  gd       t        |d      }|S |j                  t               |S )zInitialize crawl by setting variables, copying values to the
    URL store and retrieving the initial page if the crawl starts.T)r   r   )r   F)initial)
r   r   r   rC   store_rulesr!   r#   r    
crawl_pager?   )r    r"   r#   r@   r   r(   r   s          r/   
init_crawlr      s     UD%=F EKR6F..t45&++v||4 >FD1 M 	y)Mr1   r   c                    t         j                  | j                        }|s6d| _        t	        t         j                  | j                              | _        | S | xj                  dz  c_        |r:t        |      \  }}}|rB|r@|r>t         j                  |g       t        || |       nt        |d      }t        ||        | j                  t                | S )z6Examine a webpage, extract navigation links and links.Fr   )rk   rh   )r   get_urlr!   r'   r=   r>   r&   r%   rp   r   r   r   r   r?   )r   r   rk   rQ   rR   new_base_urlrm   s          r/   r   r     s     

FKK
(Cy88EF
HHMH-G-L*
Hl(|z**f#6!#e46* 9%Mr1   max_seen_urlsmax_known_urlsknown_linksconfigc	                 N   t        | |||||      }	t        j                  |	j                  |j	                  dd            }
|	j
                  r_|	j                  |k  rP|	j                  |k  rAt        |	      }	t        |
       |	j
                  r|	j                  |k  r|	j                  |k  rAt        t        j                  t        j                  |	j                                    }t        t        j                  t        j                  |	j                                    }||fS )a  Basic crawler targeting pages of interest within a website.

    Args:
        homepage: URL of the page to first page to fetch, preferably the homepage of a website.
        max_seen_urls: maximum number of pages to visit, stop iterations at this number or at the exhaustion of pages on the website, whichever comes first.
        max_known_urls: stop if the total number of pages "known" exceeds this number.
        todo: provide a previously generated list of pages to visit / crawl frontier.
        known_links: provide a list of previously known pages.
        lang: try to target links according to language heuristics.
        config: use a different configuration (configparser format).
        rules: provide politeness rules (urllib.robotparser.RobotFileParser() format).
        prune_xpath: remove unwanted elements from the HTML pages using XPath.

    Returns:
        List of pages to visit, deque format, possibly empty if there are no further pages to visit.
        Set of known links.

    DEFAULT
SLEEP_TIME)default)r   r   get_crawl_delayr!   getfloatr'   r%   r&   r   r   listdictfromkeysr<   r>   )rR   r   r   r@   r   r"   r   r#   r(   r   
sleep_times              r/   focused_crawlerr   2  s    : $t[+NF**V__YE + J 	M1f6F6F6WF#j 	M1f6F6F6W i;;FKKHIJDt}}Y%>%>v{{%KLMKr1   )rT   )NNNNN)F);rM   rX   configparserr   timer   typingr   r   r   urllib.robotparserr   courlanr	   r
   r   r   r   r   r   ImportError
lxml.etreer   r   corer   r   	downloadsr   r   r   settingsr   utilsr   r   r   	getLoggerrJ   rw   r   r|   MAX_SEEN_URLSMAX_KNOWN_URLSr   rO   rf   rp   rz   r-   r;   r   r   r   r   r   r   intr   rP   r1   r/   <module>r      s	    %  ( ( . 	 ' 0 : : $ 6 6 
		8	$e4	1
 1
h##"#
8C=(3-'(#LDD
8C=(3-#67D2S  0I < < 9 <3 (3- D 8d3i 8D 8 #>#>#> 
##> 
	#>LCx CC 
C  '+ $!%!%
3- O$ 49
	
 DI # :  B '( $'+)'+!%... . 49
	.
 $s)$. 3-. . O$. #. 49d3i .u  		s   F FF