
    #hZ                         d Z ddlZddlZddlmZmZmZmZ ddlm	Z	 ddl
mZmZ ddlmZmZmZmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZm Z   ejB                  e"      Z# ejH                  dejJ                        Z& ejH                  dejJ                        Z' ejH                  dejJ                        Z(	 	 	 	 	 d&de)de*de*dee)   de*de*deee)e)f      fdZ+	 	 d'dddddddddde)dee)   de*de*dee)   de*de*de*de*dee)   dee)   dee)   fdZ,ddddddd d!e)dee)   d"ee)   d#ee	   d$e*de*de*dee)   deee)   ee)   f   fd%Z-y)(z0
Core functions needed to make the module work.
    N)ListOptionalSetTuple)RobotFileParser   )normalize_url	scrub_url)	basic_filterdomain_filterextension_filteris_navigation_pageis_not_crawlablelang_filterpath_filtertype_filtervalidate_url)redirection_test)	BLACKLIST)extract_domainget_base_urlfix_relative_urlsis_externalis_known_linkz<a [^<>]+?>zhreflang=["\']?([a-z-]+)zhref=["\']?([^ ]+?)(["\' >])FTurlstrictwith_redirectslanguagewith_navtrailing_slashreturnc                    	 t        |       du rt        j                  d|        t        t	        |       } |rt        |       } t        | ||      du rt        j                  d|        t        |,t        | |||      du rt        j                  d|        t        t        |       \  }}|du rt        j                  d|        t        t        |j                        du rt        j                  d|        t        t        |j                        du rt        j                  d	|        t        |r>t        |j                  |j                        du rt        j                  d
|        t        t        ||||      } |rt!        | t"        d      }nt!        | d      }|t        j                  d	|        y	 | |fS # t$        t        f$ r t        j                  d|        Y yw xY w)a(  Check links for appropriateness and sanity
    Args:
        url: url to check
        strict: set to True for stricter filtering
        with_redirects: set to True for redirection test (per HTTP HEAD request)
        language: set target language (ISO 639-1 codes)
        with_nav: set to True to include navigation pages instead of discarding them
        trailing_slash: set to False to trim trailing slashes

    Returns:
        A tuple consisting of canonical URL and extracted domain

    Raises:
        ValueError, handled in exception.
    Fzrejected, basic filter: %s)r   r   zrejected, type filter: %sNzrejected, lang filter: %szrejected, validation test: %szrejected, extension filter: %szrejected, domain name: %szrejected, path filter: %sT)	blacklistfast)r$   zdiscarded URL: %s)r   LOGGERdebug
ValueErrorr
   r   r   r   r   r   pathr   netlocr   queryr	   r   r   AttributeError)	r   r   r   r   r   r    validation_test
parsed_urldomains	            I/var/www/html/sandstorm/venv/lib/python3.12/site-packages/courlan/core.py	check_urlr0   *   s   4>%LL5s; n "3'C s6H=FLL4c:  C6>BeKLL4c: '33&7#e#LL8#> JOO,5LL93? **+u4LL4c: k*//:3C3CDMLL4c: J.I #C94HF#Cd3F>LL4c:  ;	 J' (#.s   F7F? ?%G'&G')	no_filterr   r   r    r   	redirects	referencebase_urlpagecontentexternal_boolr1   r2   r3   r4   c          	      6   |
rt        d      t        |      }
|xs |
}t               t               }}| s|S |	xs |
}	d t        j	                  |       D        D ]  }d|v rd|v r|du rh|fd|v rbt
        j                  |      }|s.|d   j                  |      s	|d   dk(  sKt        j                  |      }|sc|j                  |d          xt        j                  |      }|s|j                  |d           |D ]l  }|j                  d	      st        ||      }|du r+t        ||||||
      }|8|d   }|t        ||	d      k7  rOt        ||      r\|j                  |       n t        j                  dt!        |      t!        |             |S )az  Filter links in a HTML document using a series of heuristics
    Args:
        pagecontent: whole page in binary format
        url: full URL of the original page
        external_bool: set to True for external links only, False for
                  internal links only
        no_filter: override settings and bypass checks to return all possible URLs
        language: set target language (ISO 639-1 codes)
        strict: set to True for stricter filtering
        trailing_slash: set to False to trim trailing slashes
        with_nav: set to True to include navigation pages instead of discarding them
        redirects: set to True for redirection test (per HTTP HEAD request)
        reference: provide a host reference for external/internal evaluation

    Returns:
        A set containing filtered HTTP links checked for sanity and consistency.

    Raises:
        Nothing.
    ,'base_url' is deprecated, use 'url' instead.c              3   &   K   | ]	  }|d      yw)r   N ).0ms     r/   	<genexpr>z extract_links.<locals>.<genexpr>   s     F!1Fs   relnofollowFhreflangr   z	x-defaulthttp)r   r    r   r   r   r   T)r   r3   ignore_suffixu!   %s links found – %s valid links)r'   r   setFIND_LINKS_REGEXfinditerHREFLANG_REGEXsearch
startswith
LINK_REGEXaddr   r0   r   r   r%   infolen)r5   r   r6   r1   r   r   r    r   r2   r3   r4   
candidates
validlinkslink	langmatch	linkmatchcheckeds                    r/   extract_linksrS      s   D GHHC H
/C UCE
J %XI G/88EF -D=Z4/("6:;M&--d3I!''1Yq\[5P&--d3	NN9Q<0 #))$/Iy|,!-&  v&$S$/D-!(!G 1:DIT!  z*t14 KK3S_c*oV    )langrulesexternalr   r   r4   
htmlstringrU   rV   rW   c                    |rt        d      g g }	}t        | |||||      D ]P  }
t        |
      s||j                  d|
      s#t	        |
      r|	j                  |
       @|j                  |
       R ||	fS )zPFind links in a HTML document, filter and prioritize them for crawling purposes.r8   )r5   r   r6   r   r   r   *)r'   rS   r   	can_fetchr   append)rX   r   rU   rV   rW   r   r   r4   linkslinks_priorityrO   s              r/   filter_linksr_      s     GHH>E  D!eooc4&@d#!!$'LL#& .  rT   )FFNFT)NF).__doc__loggingretypingr   r   r   r   urllib.robotparserr   cleanr	   r
   filtersr   r   r   r   r   r   r   r   r   networkr   settingsr   urlutilsr   r   r   r   r   	getLogger__name__r%   compileIrD   rF   rI   strboolr0   rS   r_   r:   rT   r/   <module>rp      s_  
  	 - - . +
 
 
 &   
		8	$2::nbdd3 7>RZZ7>

  "Z	ZZ Z sm	Z
 Z Z eCHoZ~ ]
 "#"]]	#] ]
 ] sm] ] ] ] ] }] sm] 	X]H '+"%!%!	#%! 3-	%!
 O$%! %! %! %! sm%! 49d3i %!rT   