
    #hC                        d Z ddlZddlZddlmZmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZmZmZ ddlmZ  ej(                  e      Z ej.                  d	      Z ej.                  d
      Z ej.                  d      Z ej.                  d      Z ej.                  d      Z ej.                  d      Z ej.                  d      Z ej.                  d      Z ej.                  d      Z  ej.                  d      Z!d$de"dee"   dee"   fdZ#de"de"fdZ$	 d%de"de%dee"   de"fdZ&de"de"fdZ'de"de"fdZ(d$de"dee"   de"fd Z)	 	 	 d&d!eee"f   de%dee"   d"e%de"f
d#Z*y)'z0
Functions performing URL trimming and cleaning
    N)OptionalUnion)parse_qsquote	urlencode
urlunsplitSplitResult   )is_valid_url)ALLOWED_PARAMSLANG_PARAMSTARGET_LANGS)_parsez	https?://zZ(https?://[^">&? ]+?)(?:https?://)|(?:https?://[^/]+?/[^/]+?[&?]u(rl)?=)(https?://[^"> ]+)z)https?://.+?(https?://.+?)(?:https?://|$)z(?<=\w):(?:80|443)z/+z^(?:/\.\.(?![^/]))+z</?[a-z]{,4}?>|{.+?}z/\&$z(.*?)[<>"\s]z^(?:dc|fbc|gc|twc|yc|ysc)lid|^(?:click|gbra|msclk|igsh|partner|wbra)id|^(?:ads?|mc|ga|gs|itm|mc|mkt|ml|mtm|oly|pk|utm|vero)_|(?:\b|_)(?:aff|affi|affiliate|campaign|cl?id|eid|ga|gl|kwd|keyword|medium|ref|referr?er|session|source|uid|xtor)urllanguagereturnc                 Z    	 t        t        |       d|      S # t        t        f$ r Y yw xY w)z4Helper function: chained scrubbing and normalizationFN)normalize_url	scrub_urlAttributeError
ValueError)r   r   s     J/var/www/html/sandstorm/venv/lib/python3.12/site-packages/courlan/clean.py	clean_urlr   0   s1    Ys^UH==J' s    **c                    dj                  | j                               j                  d      } | j                  d      r"| j	                  dd      j	                  dd      } t
        j                  d|       } t        j                  d| j	                  dd            } t        j                  |       }t        |      dkD  rd| vrt        j                  d	t        |      |        t        j                  |       }|r*t        |d         r|d   } t        j                  d
|        n@t         j                  |       }|r)t        |d         r|d   } t        j                  d
|        t"        j                  |       }|r|d   } t        |       dkD  r&t        j                  d| dd dz   t        |              | j%                  d      dk(  s| j%                  d      dkD  r| j'                  d      } | S )z@Strip unnecessary parts and make sure only one URL is considered z  	
z	<![CDATA[z]]>z&amp;&r
   zweb.archive.orgzdouble url: %s %sztaking url: %si  z$invalid-looking link %s of length %dN2   u   …/   z://)joinsplitstrip
startswithreplaceREMAINING_MARKUPsubTRAILING_AMP	PROTOCOLSfindalllenLOGGERdebug	SELECTIONmatchr   
MIDDLE_URLTRAILING_PARTScountrstrip)r   	protocolsr.   s      r   r   r   8   s    ''#))+

$
$	KC ~~k"kk+r*225"= 

r3
'C 

2s{{7C8
9C !!#&I
9~/s:(#i.#>$\%(+(CLL)3/$$S)EeAh/Ah-s3   %EAh
3x#~;S"X=MsSVxX yy~cii.2jjoJ    querystringstrictc                 l   | syt        |       }i }t        |      D ]  }|j                         }|r|t        vr|t        vr&t
        j                  |      r<|t        v r?|t        v r7t        ||   d         t        |   vrt        j                  d||       t        ||   ||<    t        |d      S )zStrip unwanted query elementsr   r   zbad lang: %s %sT)doseq)r   sortedlowerr   r   TRACKERS_REsearchr   strr+   r,   r   r   )r5   r6   r   qdictnewqdictqelemteststrs          r   clean_queryrB   j   s     [!EH '++-n,1K( $;&E%LO$L,BBLL*He<,%'( XT**r4   stringc                 L   d| vr| S g }| j                  d      D ]S  }|j                         j                  d      r!	 |j                  d      j	                  d      }|j                  |       U dj                  |      S # t
        $ r t        j                  d|       Y Ew xY w)z@Probe for punycode in lower-cased hostname and try to decode it.zxn--.utf8idnazinvalid utf/idna string: %s)
r!   r:   r#   encodedecodeUnicodeErrorr+   r,   appendr    )rC   partsparts      r   decode_punycoderN      s    VES! ::<""6*B{{6*11&9 	T 88E?	   B:DABs    BB#"B#url_partc                     t        | d      S )zbNormalize URLs parts (specifically path and fragment) while
    accounting for certain characters.z/%!=:,-)safe)r   )rO   s    r   normalize_partrR      s     	**r4   fragmentc                     d| v r3d| v rt        | d|      } t        |       S t        j                  |       rd} t        |       S )zNLook for trackers in URL fragments using query analysis, normalize the output.=r   Fr   )rB   r;   r<   rR   )rS   r   s     r   normalize_fragmentrV      sK    
h(?"8UH=H (## )H(##r4   
parsed_urltrailing_slashc           	      t   t        |       } | j                  j                         }t        | j                  j                               }	 | j
                  dv rt        j                  d|      }t        t        j                  dt        j                  d| j                                    }t        | j                  ||      xs d}|r|sd}n4|s2|s0t        |      dkD  r"|j!                  d      r|j#                  d      }|rdnt%        | j&                  |      }t)        |||||f      S # t        $ r Y w xY w)zFTakes a URL string or a parsed URL and returns a normalized URL string)P   i  r   r   r
   )r   schemer:   rN   netlocport	NETLOC_REr&   r   rR   PATH2PATH1pathrB   queryr*   endswithr2   rV   rS   r   )	rW   r6   r   rX   r[   r\   newpathnewquerynewfragments	            r   r   r      s    
#J$$&FZ..4467F??i']]2v.F
 UYYr599S*//+JKLG:++VX>D"HL1S!..%"$6z7J7JH$UKvvw+FGG'  s   
$D+ +	D76D7)N)FN)FNT)+__doc__loggingretypingr   r   urllib.parser   r   r   r   r	   filtersr   settingsr   r   r   urlutilsr   	getLogger__name__r+   compiler(   r-   r/   r^   r`   r_   r%   r'   r0   r;   r=   r   r   boolrB   rN   rR   rV   r    r4   r   <module>rt      s    	 " L L ! ? ?  
		8	$ BJJ|$	BJJa	 RZZDE
BJJ,-	 	

5

)* 2::56 rzz'"O,
 bjjA3 (3- 8C= /3 /3 /f GK++"+6>sm++BC C $+S +S +$ $ $ $ "	"Hk3&'"H"H sm"H 	"H
 	"Hr4   