
    #hG                     .   d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZmZmZmZ ddlmZ  ej$                  d      Z ej$                  d      Z ej$                  d	      Z ej$                  d
ej,                        Z	 d!dedede
e	d   e	eef   f   fdZ	 d"dedeee      dedee   fdZdedefdZdedefdZdede	eef   fdZdede	ee   ef   fdZdededefdZ dee   dee   dee   fdZ!d#dedededefdZ"dedee   defd Z#y)$zD
Functions related to URL manipulation and extraction of URL parts.
    N)unescape)AnyListOptionalSetTupleUnion)urljoinurlsplit
urlunsplitSplitResult)get_tldz{(?:(?:f|ht)tp)s?://(?:[^/?#]{,63}\.)?([^/?#.]{4,63}\.[^/?#]{2,63}|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-f:]{16,})(?:/|$)z(?<=\D):\d+z^www[0-9]*\.z(?:feed(?:burner|proxy))urlfastreturnNNc                 ^   | rt        | t              sy|r\t        j                  |       }|rEt        j                  d|d   j                  d      d         }|j                  d      d   }|r||fS t        | dd	      }|y|j                  t        j                  d|j                        fS )
z0Cached function to extract top-level domain infor       @.r   T)	as_objectfail_silently)
isinstancestrDOMAIN_REGEXmatchSTRIP_PORT_REGEXsubsplitr   domainCLEAN_FLD_REGEXfld)r   r   domain_matchfull_domainclean_matchtldinfos         M/var/www/html/sandstorm/venv/lib/python3.12/site-packages/courlan/urlutils.pyget_tldinfor*      s     jc*#))#.*..r<?3H3H3Mb3QRK%++C03K"K//cT>G>>?..r7;;???    	blacklistc                 V    |
t               }t        | |      \  }}|r
||vr||vr|S dS )z;Extract domain name information using top-level domain infoNr   )setr*   )r   r,   r   r"   r&   s        r)   extract_domainr0   1   sI     E	%c5FK 62{)7S 	 r+   c                     t        | t              rt        t        |             }|S t        | t              r| }|S t        dt        |             )z3Parse a string or use urllib.parse object directly.zwrong input type:)r   r   r   r   r   	TypeErrortype)r   
parsed_urls     r)   _parser5   A   sO    #shsm,

 	 
C	%
  +T#Y77r+   c                 r    t        |       }|j                  r|j                  dz   }nd}||j                  z   S )ziStrip URL of some of its parts to get base URL.
    Accepts strings and urllib.parse ParseResult objects.z://r   )r5   schemenetloc)r   r4   r7   s      r)   get_base_urlr9   L   s<     J""U*J%%%%r+   c                     t        |       }t        |      }t        dd|j                  |j                  |j
                  g      }|dk(  rd}|r|st        d|        ||fS )zvDecompose URL in two parts: protocol + host/domain and path.
    Accepts strings and urllib.parse ParseResult objects.r   /zincomplete URL: )r5   r9   r   pathqueryfragment
ValueError)r   r4   hostnamepathvals       r)   get_host_and_pathrB   W   sp     JJ'H	R*"2"2J4G4GHG "}7+C5122Wr+   c                 :    t        | d      }t        |       }||fS )zXConvenience function returning domain and host info (protocol + host/domain) from a URL.Tr.   )r0   r9   )r   
domainnamebase_urls      r)   get_hostinforF   g   s#    $/JC Hxr+   baseurlc                     |j                  d      r|S t        |       j                  }t        |      }|j                  |dfvr)|j                  r|S t	        |j                  d            S t        | |      S )z8Prepend protocol and host information to relative links.{r   http)r7   )
startswithr   r8   r7   r   _replacer
   )rG   r   base_netloc	split_urls       r)   fix_relative_urlsrO   n   ss    
~~c
7#**KIR00J),,F,;<<7C  r+   	link_list	urlfilterc                     |t        t        |             S | D cg c]	  }||v s| }}|s%| D cg c]  }t        j                  |      s| }}t        t        |            S c c}w c c}w )zDReturn a list of links corresponding to the given substring pattern.)sortedr/   FEED_WHITELIST_REGEXsearch)rP   rQ   lfiltered_lists       r)   filter_urlsrX   ~   so    c)n%% )<1Y!^Q<M<$-Pq1E1L1LQ1OPP#m$%%	 = Qs   	A&A&A+A+	referenceignore_suffixc                 Z    t        |d      \  }}t        | d      \  }}|r||k7  S ||k7  S )zjDetermine if a link leads to another host, takes a reference URL and
    a URL as input, returns a booleanTr.   )r*   )r   rY   rZ   stripped_refrefstripped_domainr"   s          r)   is_externalr_      s>     $ID9L#)#D9OV,..S=r+   linkknown_linksc                    | |v ry| d   dk(  r| j                  d      n| dz   }||v ry| j                  d      rH| j                  d      rd| dd z   nd| dd z   }|d   dk(  r|j                  d      n|dz   }||v s||v ryy	)
zDCompare the link and its possible variants to the existing URL base.Tr   r;   rJ   httpsN      F)rstriprK   )r`   ra   
slash_testprotocol_tests       r)   is_known_linkri      s     { &*"X_S!$*J[  v!%!9FT"1Xwab?Q 	
 R C'   %$ 	
 K':+Dr+   )F)NF)T)$__doc__rehtmlr   typingr   r   r   r   r   r	   urllib.parser
   r   r   r   tldr   compiler   r   r#   IrT   r   boolr*   r0   r5   r9   rB   rF   rO   rX   r_   ri    r+   r)   <module>rt      s   
  9 9 C C  rzz 2::n- "**_-!rzz"=rttD  !@	@@
5eCHo-.@. BG	!#c(+:>c]   &c &c &3 5c?   c  eHSM3$67  !s ! ! ! 	&49 	&# 	&49 	&S S    #c( t r+   