
    #h2N                        d Z ddlZddlZddlZddlZddlZ	 ddlZdZ	 ddl	Z	dZ
ddlmZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-  ej\                  e/      Z0 G d d      Z1 e1       Z2 G d de      Z3 G d d      Z4 G d d      Z5 G d d      Z6de7de6fdZ8y# e$ r dZY w xY w# e$ r dZ
Y w xY w)zd
Defines a URL store which holds URLs along with relevant information and entails crawling helpers.
    NTF)defaultdictdeque)datetime	timedelta)Enum)
itemgetter)Lock)AnyDefaultDictDequeDictListOptionalTupleUnion)RobotFileParser   )normalize_url)filter_links)lang_filtervalidate_url)clear_caches)get_base_urlget_host_and_pathis_known_linkc                   `    e Zd ZdZdZddeddfdZededefd       Z	dedefd	Z
dedefd
Zy)
CompressorzYUse system information on available compression modules and define corresponding methods.)
compressordecompressorcompressionreturnNc                     |rt         rt        j                  n#|rt        rt        j                  n| j
                  | _        |rt         rt        j                  | _        y |rt        rt        j                  | _        y | j
                  | _        y N)	HAS_BZ2bz2compressHAS_ZLIBzlib
_identicalr   
decompressr   )selfr    s     M/var/www/html/sandstorm/venv/lib/python3.12/site-packages/courlan/urlstore.py__init__zCompressor.__init__:   sj     w LL"-( 	 w NN 	 %0H 	 CG// 	    datac                     | S )zReturn unchanged data. )r/   s    r,   r)   zCompressor._identicalF   s	     r.   c                 N    | j                  t        j                  |d            S )z9Pickle the data and compress it if a method is available.   )protocol)r   pickledumpsr+   r/   s     r,   r&   zCompressor.compressK   s    v||D1=>>r.   c                 J    t        j                  | j                  |            S )zADecompress the data if a method is available and load the object.)r5   loadsr   r7   s     r,   r*   zCompressor.decompressO   s    ||D--d344r.   T)__name__
__module____qualname____doc__	__slots__boolr-   staticmethodr
   r)   r&   bytesr*   r1   r.   r,   r   r   6   se    _.I

D 

D 

    ?S ?S ?5u 5 5r.   r   c                       e Zd ZdZdZdZdZy)Statez0Record state information about a domain or host.r         N)r;   r<   r=   r>   OPENALL_VISITEDBUSTEDr1   r.   r,   rD   rD   W   s    6DKFr.   rD   c                   <    e Zd ZdZdZej                  fdeddfdZy)DomainEntryz7Class to record host-related information and URL paths.)countrulesstate	timestamptotaltuplesrN   r!   Nc                 h    d| _         d | _        || _        d | _        d| _        t               | _        y )Nr   )rL   rM   rN   rO   rP   r   rQ   )r+   rN   s     r,   r-   zDomainEntry.__init__b   s.    
04
!
(,
+07r.   )r;   r<   r=   r>   r?   rD   rG   r-   r1   r.   r,   rK   rK   ^   s$    =KI&+jj 3e 3T 3r.   rK   c                   4    e Zd ZdZdZdededdfdZdefdZy)	UrlPathTuplezBClass storing information for URL paths relative to a domain/host.)urlpathvisitedrU   rV   r!   Nc                 >    |j                  d      | _        || _        y )Nutf-8)encoderU   rV   )r+   rU   rV   s      r,   r-   zUrlPathTuple.__init__o   s    %nnW5$r.   c                 8    | j                   j                  d      S )zGet the URL path as string.rX   )rU   decoder+   s    r,   pathzUrlPathTuple.paths   s    ||""7++r.   )	r;   r<   r=   r>   r?   strr@   r-   r]   r1   r.   r,   rT   rT   k   s0    H&I% %d %t %,c ,r.   rT   c                   Z   e Zd ZdZdZ	 	 	 	 	 dBdedee   dededed	dfd
Z	 dCde	e   ded	e
eee   f   fdZded	ee   fdZdDdZ	 	 	 dEdedeee      dee   deee      d	df
dZ	 dFde	e   dee   d	e	eeef      fdZ	 	 	 dGdee	e      dee	e      ded	dfdZ	 	 	 dHdedededee   ded	dfdZd e	e   d	dfd!ZdDd"Zd	e	e   fd#Zd	e	e   fd$Zded	efd%Zd	efd&Zded	e	e   fd'Zded	e	e   fd(Zde	e   d	e	e   fd)Z de	e   d	e	eeef      fd*Z!ded	efd+Z"ded	efd,Z#dIded-ed	ee   fd.Z$	 	 dJd/e%d0ed	e	e   fd1Z&	 dKd0ed/ed	e	e   fd2Z'd3ed4ee(   d	dfd5Z)d3ed	ee(   fd6Z*dLd3ed7e%d	e%fd8Z+d	e	e   fd9Z,d	efd:Z-d;e%d	efd<Z.d	e	e   fd=Z/dDd>Z0dDd?Z1d@ed	dfdAZ2y)MUrlStorezNDefines a class to store domain-classified URLs and perform checks against it.)
compresseddonelanguagestricttrailing_slashurldict_lockNra   rc   rd   trailingverboser!   c                     | _         d _        | _        | _        | _        t        t               _        t                _	        dt        dt        dd f fd}|rit        j                  j                  d      sIt        j                  t        j                  |       t        j                  t        j                   |       y y y )NFnumframer!   c                     t         j                  dt        j                               j	                          t        j                  d       y )Nz<Processing interrupted, dumping unvisited URLs from %s hostsr   )LOGGERdebuglenrf   print_unvisited_urlssysexit)rk   rl   r+   s     r,   dump_unvisited_urlsz.UrlStore.__init__.<locals>.dump_unvisited_urls   s7    LLNDLL! %%'HHQKr.   win)ra   rb   rc   rd   re   r   rK   rf   r	   rg   r
   rr   platform
startswithsignalSIGINTSIGTERM)r+   ra   rc   rd   rh   ri   rt   s   `      r,   r-   zUrlStore.__init__   s     !+	'/"$,6A+6N6
	S 	 	 	 3<<2259MM&--)<=MM&..*=> :7r.   r/   rV   c                 l   t        t              }t        j                  |      D ]  }	 t	        |      \  }}|du rt
        j                  d|       t        | j                  Jt        || j                  | j                  | j                        du rt
        j                  d|       t        t        || j                  | j                  | j                        }t        |      \  }}||   j                  t        ||              |S # t         t        f$ r t
        j#                  d|       Y w xY w)NFzInvalid URL: %szWrong language: %s)rd   rc   re   zDiscarding URL: %s)r   r   dictfromkeysr   rn   ro   
ValueErrorrc   r   rd   re   r   r   appendrT   	TypeErrorwarning)	r+   r/   rV   	inputdicturlvalidation_result
parsed_urlhostinforU   s	            r,   _buffer_urlszUrlStore._buffer_urls   s!    <Gu;M	==& 	:C:0<S0A-!:$-LL!2C8$$ MM-#T]]DKK9L9L 
 LL!5s;$$*;;!]]#'#6#6	
 %6j$A!'(#**<+IJ3	:8  z* :3S9:s   CD

%D32D3domainc                     || j                   v rQ| j                  r,t        j                  | j                   |   j                        S | j                   |   j                  S t               S r#   )rf   ra   
COMPRESSORr*   rQ   r   r+   r   s     r,   
_load_urlszUrlStore._load_urls   sQ    T\\!!,,T\\&-A-H-HII<<'...wr.   c                     | j                   sHt        d | j                  j                         D              r| j                  5  d| _         d d d        y y y # 1 sw Y   y xY w)Nc              3   V   K   | ]!  }|j                   t        j                  k7   # y wr#   )rN   rD   rG   .0vs     r,   	<genexpr>z%UrlStore._set_done.<locals>.<genexpr>   s      V1EJJ!6 Vs   ')T)rb   allrf   valuesrg   r\   s    r,   	_set_donezUrlStore._set_done   sS    yyS V@S@S@U VV ! 	! ! Wy! !s   AAto_rightrO   to_leftc                 x   |j                  d      rd|dd  z   }|| j                  v rS|}nP|j                  d      r?d|dd  z   }|| j                  v r)| j                  |   | j                  |<   | j                  |= || j                  v rZ| j                  |   j                  t        j                  u ry | j                  |      }|D ch c]  }|j                          c}nt               }t               ||j                  fd|D               ||j                  fd|D               | j                  5  | j                  r(t        j                  |      | j                  |   _        n|| j                  |   _        t!        |      | j                  |   _        ||| j                  |   _        t'        d	 |D              r#t        j(                  | j                  |   _        n5t        j*                  | j                  |   _        | j,                  rd
| _        d d d        y c c}w # 1 sw Y   y xY w)Nzhttp://https   zhttps://httpr3   c              3   X   K   | ]!  }t        |j                               r| # y wr#   r   r]   r   tknowns     r,   r   z'UrlStore._store_urls.<locals>.<genexpr>   s      Ra=53QR   **c              3   X   K   | ]!  }t        |j                               r| # y wr#   r   r   s     r,   r   z'UrlStore._store_urls.<locals>.<genexpr>   s      U!mAFFHe6TAUr   c              3   4   K   | ]  }|j                     y wr#   )rV   )r   us     r,   r   z'UrlStore._store_urls.<locals>.<genexpr>   s     +199+   F)rw   rf   rN   rD   rI   r   r]   r   setextend
extendleftrg   ra   r   r&   rQ   rp   rP   rO   r   rH   rG   rb   )	r+   r   r   rO   r   	candidateurlsr   r   s	           @r,   _store_urlszUrlStore._store_urls   s    Y'&*,IDLL("z*+IDLL('+||I'>V$LL+ T\\!||F#))U\\9??6*D'+,!QVVX,E7DEE KKR8RROOUwUUZZ 	&.8.A.A$.GV$+.2V$+),TDLL &$1:V$.+d++-2->->V$*-2ZZV$*99 %DI	& 	& -	& 	&s   	H+<C&H00H9r   switchc                 >   d }i }t         j                  |      }t        |      D ]g  }t        |      \  }}||k7  r9|}| j	                  |      D 	ci c]  }	|	j                         |	j                   }}	||v sT|dk(  s|dk(  s_||   se||= i t        |      S c c}	w )Nr   rE   )r|   r}   sortedr   r   r]   rV   list)
r+   r   r   last_domainknown_pathsremaining_urlsr   r   rU   r   s
             r,   _search_urlszUrlStore._search_urls  s     &*13t,.) 
	(C 1# 6Hg;&&<@OOH<UVqqvvx2VV+%!!G0D"3'
	( N## Ws   "B
appendleftc                     |r;| j                  ||      j                         D ]  \  }}| j                  ||        |r<| j                  ||      j                         D ]  \  }}| j                  ||        yy)zAdd a list of URLs to the (possibly) existing one.
        Optional: append certain URLs to the left,
        specify if the URLs have already been visited.)r   )r   N)r   itemsr   )r+   r   r   rV   host	urltupless         r,   add_urlszUrlStore.add_urls  s     #'#4#4T7#C#I#I#K ;i  	 :;#'#4#4Z#I#O#O#Q :i  y 9: r.   
htmlstringr   externallangwith_navc           	          t        |      }| j                  |      }t        ||||xs | j                  || j                  |      \  }}	| j                  ||	       y)zJFind links in a HTML document, filter them and add them to the data store.)r   r   r   r   rM   rd   r   )r   r   N)r   	get_rulesr   rc   rd   r   )
r+   r   r   r   r   r   base_urlrM   linkslinks_prioritys
             r,   add_from_htmlzUrlStore.add_from_html-  s_      $x( ,!&;;!
~ 	5^<r.   domainsc                    | j                   5  |D ])  }t        t        j                        | j                  |<   + 	 ddd       | j                          t        j                         }t        j                  d|       y# 1 sw Y   DxY w)z)Declare domains void and prune the store.)rN   Nz'%s objects in GC after UrlStore.discard)
rg   rK   rD   rI   rf   r   gccollectrn   ro   )r+   r   drk   s       r,   discardzUrlStore.discardD  so    ZZ 	B B"-ELL"AQB	B 	jjl>D	B 	Bs   /B  B	c                     | j                   5  t        t              | _        ddd       t	                t        j                         }t        j                  d|       y# 1 sw Y   >xY w)zRe-initialize the URL store.Nz UrlStore reset, %s objects in GC)	rg   r   rK   rf   r   r   r   rn   ro   )r+   rk   s     r,   resetzUrlStore.resetM  sI    ZZ 	4&{3DL	4jjl7=		4 	4s   AA(c                 H    t        | j                  j                               S )z#Return all known domains as a list.)r   rf   keysr\   s    r,   get_known_domainszUrlStore.get_known_domainsW  s    DLL%%'((r.   c                     | j                   j                         D cg c]%  \  }}|j                  t        j                  k(  s$|' c}}S c c}}w )ziFind all domains for which there are unvisited URLs
        and potentially adjust done meta-information.)rf   r   rN   rD   rG   )r+   r   r   s      r,   get_unvisited_domainszUrlStore.get_unvisited_domains[  s8     #ll002Ldaagg6KLLLs   %AAc                 t    || j                   v r*| j                   |   j                  t        j                  k7  S y)z9Tell if all known URLs for the website have been visited.F)rf   rN   rD   rG   r   s     r,   is_exhausted_domainzUrlStore.is_exhausted_domain`  s/    T\\!<<'--;;r.   c                 4    t        | j                               S )zFReturn the number of websites for which there are still URLs to visit.)rp   r   r\   s    r,   unvisited_websites_numberz"UrlStore.unvisited_websites_numberg  s    4--/00r.   c                 l    | j                  |      D cg c]  }||j                         z    c}S c c}w )zLGet all already known URLs for the given domain (ex. "https://example.org").)r   r]   r+   r   r   s      r,   find_known_urlszUrlStore.find_known_urlsm  s*    +/??6+BCa!CCCs   1c                     | j                  |      s=| j                  |      D cg c]"  }|j                  r||j                         z   $ c}S g S c c}w )z,Get all unvisited URLs for the given domain.)r   r   rV   r]   r   s      r,   find_unvisited_urlszUrlStore.find_unvisited_urlsq  sF    ''//3v/FX!aiiFQVVX%XX	 Ys
   AAc                 (    | j                  |d      S )z:Take a list of URLs and return the currently unknown ones.r   r   r   r+   r   s     r,   filter_unknown_urlszUrlStore.filter_unknown_urlsw        a 00r.   c                 (    | j                  |d      S )z<Take a list of URLs and return the currently unvisited ones.rE   r   r   r   s     r,   filter_unvisited_urlszUrlStore.filter_unvisited_urls{  r   r.   c                 :    t        | j                  |g             S )z0Check if the given URL has already been visited.)r@   r   )r+   r   s     r,   has_been_visitedzUrlStore.has_been_visited  s    22C59:::r.   c                     t        |      \  }}|| j                  |      D ch c]  }|j                          c}v S c c}w )z/Check if the given URL has already been stored.)r   r   r]   )r+   r   r   rU   r   s        r,   is_knownzUrlStore.is_known  s9    -c2'T__X-FG1668GGGGs   >
as_visitedc                 "   | j                  |      s| j                  |      }|D ]  }|j                  r|rdd|_        | j                  5  | j                  |   xj
                  dz  c_        ddd       | j                  ||t        j                                ||j                         z   c S  | j                  5  t        j                  | j                  |   _        ddd       | j                          y# 1 sw Y   xY w# 1 sw Y   &xY w)zSRetrieve a single URL and consider it to be visited (with corresponding timestamp).Tr   NrO   )r   r   rV   rg   rf   rL   r   r   nowr]   rD   rH   rN   r   )r+   r   r   
url_tuplesr   s        r,   get_urlzUrlStore.get_url  s     ''/0J! /{{!&*!ZZ < LL066!;6<((x||~(V!CHHJ../ ZZ 	;).):):DLL &	;< <
	; 	;s   
#C9=#D9D	D
time_limitmax_urlsc                    g }| j                   j                         D ]  \  }}|j                  t        j                  k7  r$|j
                  r3t        j                         |j
                  z
  j                         |kD  sc| j                  |      }|w|j                  |       t        |      |k\  s n | j                          |S )zaGet a list of immediately downloadable URLs according to the given
        time limit per domain.)rf   r   rN   rD   rG   rO   r   r   total_secondsr   r   rp   r   )r+   r   r   r   websiteentryr   s          r,   get_download_urlszUrlStore.get_download_urls  s     "ll002 	NGU{{ejj(OOLLNU__4CCE
Rll7+?KK$4yH,	 	r.   c                    | j                         }|sg S |t        |      z  xs d}g }|D ]b  }| j                  |      }g }|D ]  }	t        |      |k\  st        |      t        |      z   |k\  r nl|	j                  r:|j	                  |	j                                d|	_        | j                  5  | j                  |   xj                  dz  c_        ddd        t        j                         }
| j                  |   j                  }|r|
|z
  j                         |kD  rd}n!|t        |
|z
  j                         d      z
  }|D ]  }|j	                  |||z   f       ||z  } |
t        d||z
        z   }| j                  |||       e | j!                          t#        |t%        d            S # 1 sw Y   }xY w)	zcGet up to the specified number of URLs along with a suitable
        backoff schedule (in seconds).r   TNg        z.2fr   r   )key)r   rp   r   rV   r   r]   rg   rf   rL   r   r   rO   r   floatr   r   r   r   r   )r+   r   r   	potential
per_domaintargetsr   r   urlpathsr   r   original_timestampschedule_secsrU   
total_diffs                  r,   establish_download_schedulez$UrlStore.establish_download_schedule  s    ..0	IY/41
+- "	GF0J"$H! 
8MZ/Gs8}4A{{OOCHHJ/"&CK 8V,22a728 8
8 ,,.C!%f!5!?!?&,,;;=
J # *U00??A#F. ! $ ,v/?@A+, yMJ,FGGJVZ:FE"	GH 	g:a=11/8 8s   0#F77Gr   rM   c                 n    | j                   rt        j                  |      }|| j                  |   _        y)z)Store crawling rules for a given website.N)ra   r   r&   rf   rM   )r+   r   rM   s      r,   store_ruleszUrlStore.store_rules  s*    ??''.E&+W#r.   c                     || j                   v rQ| j                  r,t        j                  | j                   |   j                        S | j                   |   j                  S y)z7Return the stored crawling rules for the given website.N)rf   ra   r   r*   rM   )r+   r   s     r,   r   zUrlStore.get_rules  sM    dll"!,,T\\'-B-H-HII<<(...r.   defaultc                 v    d}| j                  |      }	 |j                  d      }|xs |S # t        $ r Y w xY w)zBReturn the delay as extracted from robots.txt, or a given default.N*)r   crawl_delayAttributeError)r+   r   r  delayrM   s        r,   get_crawl_delayzUrlStore.get_crawl_delay  sK    w'	%%c*E   		s   , 	88c                 p    | j                   j                         D cg c]  }|j                   c}S c c}w )z2Return all download counts for the hosts in store.)rf   r   rL   )r+   r   s     r,   get_all_countszUrlStore.get_all_counts  s'    !%!4!4!67A777s   3c                 V    t        d | j                  j                         D              S )z!Find number of all URLs in store.c              3   4   K   | ]  }|j                     y wr#   )rP   r   s     r,   r   z,UrlStore.total_url_number.<locals>.<genexpr>  s     :q177:r   )sumrf   r   r\   s    r,   total_url_numberzUrlStore.total_url_number  s     :DLL$7$7$9:::r.   	thresholdc                 \    t        fd| j                  j                         D              S )z^Find out if the download limit (in seconds) has been reached for one of the websites in store.c              3   <   K   | ]  }|j                   k\    y wr#   )rL   )r   r   r  s     r,   r   z6UrlStore.download_threshold_reached.<locals>.<genexpr>  s     GA177i'Gs   )anyrf   r   )r+   r  s    `r,   download_threshold_reachedz#UrlStore.download_threshold_reached  s!    G1D1D1FGGGr.   c                 l    g }| j                   D ]"  }|j                  | j                  |             $ |S )z Return a list of all known URLs.)rf   r   r   )r+   r   r   s      r,   	dump_urlszUrlStore.dump_urls  s6    ll 	6FKK,,V45	6r.   c                 |    | j                   D ]-  }t        dj                  | j                  |            d       / y)z"Print all unvisited URLs in store.
TflushN)rf   printjoinr   r   s     r,   rq   zUrlStore.print_unvisited_urls  s5    ll 	KF$))D44V<=TJ	Kr.   c                     | j                   D ]c  }t        dj                  | j                  |      D cg c],  }| |j	                          dt        |j                         . c}      d       e yc c}w )z5Print all URLs in store (URL + TAB + visited or not).r  	Tr  N)rf   r  r  r   r]   r^   rV   r   s      r,   
print_urlszUrlStore.print_urls   sr    ll 		F		 "&!8 "(1668*Bs199~.>? 		s   1A4filenamec                 v    | ` t        |d      5 }t        j                  | |       ddd       y# 1 sw Y   yxY w)zWrite the URL store to disk.wbN)rg   openr5   dump)r+   r  outputs      r,   writezUrlStore.write/  s6    J(D! 	&VKKf%	& 	& 	&s   /8)FNFTF)F)r!   N)NNNr#   )NNF)FNTr:   )g      $@i'  )d   
   )r3   )3r;   r<   r=   r>   r?   r@   r   r^   r-   r   r   r   rT   r   r   r   r   r   intr   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r	  r  r  r  rq   r  r$  r1   r.   r,   r`   r`   x   s   TI !"&?? 3-? 	?
 ? ? 
?< 05 I (, 	S%--	. D |)< ! 37(,154&4& 5./4& H%	4&
 %-.4& 
4&n 8<$I$'/}$	eCHo	$4 %)*.	:tCy!: T#Y': 	:
 
:( "== = 	=
 sm= = 
=.EtCy ET E>)49 )MtCy M
# $ 13 1Dc Dd3i D# $s) 1S	 1d3i 11$s) 1U38_8M 1;C ;D ;HC HD Hc t x} . !  
c	0 682222/222	c22l,3 ,x/H ,T , /)B 	 s 	 U 	 5 	 8S	 8;# ;HE Hd H49 K
&c &d &r.   r`   r  r!   c                     t        | d      5 }t        j                  |      }ddd       t               _        |S # 1 sw Y   xY w)zLoad a URL store from disk.rbN)r!  r5   loadr	   rg   )r  r#  	url_stores      r,   
load_storer,  6  s@    	h	 (KK'	(fIO( (s	   <A)9r>   r   loggingr5   rx   rr   r%   r$   ImportErrorr(   r'   collectionsr   r   r   r   enumr   operatorr   	threadingr	   typingr
   r   r   r   r   r   r   r   urllib.robotparserr   cleanr   corer   filtersr   r   metar   urlutilsr   r   r   	getLoggerr;   rn   r   r   rD   rK   rT   r`   r^   r,  r1   r.   r,   <module>r;     s    
    
GH
 + (   	 	 	 /    .  D D 
		8	$5 5< \
D 
3 
3
, 
,{& {&|  O  G  Hs"   C C' C$#C$'C10C1