
    "#h|"                     *   U d Z ddlZddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
	 ddlmZ ddlmZ ddlmZmZmZ dd	lmZ  ej.                  e      Zd
dhZee   ed<    e
j:                  e
j<                  j>                         e
j@                  jC                  ddg d      Z" e
jF                  e"      Z$ eddd
d      Z% ejL                  dejN                        Z( ejL                  dejN                        Z) G d d      Z*dede+fdZ,de-de+fdZ.de-dee   fdZ/de	e-ef   defdZ0d edefd!Z1d"edee   fd#Z2d$ede+fd%Z3d&ed$edefd'Z4d(edee   fd)Z5d(e	e-eef   dee   fd*Z6d+ed,ee   defd-Z7d.edefd/Z8y# e$ r dZY fw xY w)0z7
Module bundling functions related to HTML processing.
    N)datetime)AnyListOptionalSetUnion)detect)
from_bytes)HtmlElement
HTMLParser
fromstring   )MAX_FILE_SIZEutf-8utf_8UNICODE_ALIASES   )i  i  i  i  i  )totalconnectstatus_forcelist)retriesFT)collect_idsdefault_doctypeencoding
remove_pisz^< ?! ?DOCTYPE.+?/ ?>z(<html.*?)\s*/>c                   8    e Zd ZdZg dZdedededededd	fd
Zy	)	Extractorz0Defines a class to store all extraction options.	extensiveformatmaxminoriginalextensive_searchmax_datemin_dateoriginal_dateoutputformatreturnNc                 J    || _         || _        || _        || _        || _        y )Nr   )selfr$   r%   r&   r'   r(   s         K/var/www/html/sandstorm/venv/lib/python3.12/site-packages/htmldate/utils.py__init__zExtractor.__init__4   s'      0'%%+    )	__name__
__module____qualname____doc__	__slots__boolr   strr-    r.   r,   r   r   /   sI    6AI,, , 	,
 , , 
,r.   r   datar)   c                 .    | rt        |       t        kD  ryy)z6Check if the input object is suitable to be processed.TF)lenr   r7   s    r,   is_wrong_documentr;   C   s    3t9},r.   c                 F    	 | j                  d       y# t        $ r Y yw xY w)zLSimple heuristic to determine if a bytestring uses standard unicode encodingzUTF-8FT)decodeUnicodeDecodeErrorr:   s    r,   isutf8r?   J   s,    G   s    	  bytesobjectc                 Z   t        |       rdgS g }t        /t        |       d   }||j                  |j                                t	        | dd       xs t	        |       }|j                  |D cg c]  }|j                   c}       |D cg c]  }|t        vs| c}S c c}w c c}w )z<Read all input or first chunk and return a list of encodingsr   Nr   i:  )r?   cchardet_detectappendlowerr
   extendr   r   )r@   guessescchardet_guessdetection_resultsrgs         r,   detect_encodingrK   S   s     kyG"(5jA%NN>//12 #;v#67R:k;RNN(9:1AJJ:; ;!!?":A;; ; <s   0B#B(B(filecontentc                     t        | t              r| S d}t        |       D ]  }	 | j                  |      } n |xs t        | dd      S # t        t
        f$ r t        j                  d|       d}Y Sw xY w)znGuess bytestring encoding and try to decode to Unicode string.
    Resort to destructive conversion otherwise.Nzwrong encoding detected: %sr   replace)r   errors)
isinstancer5   rK   r=   LookupErrorr>   LOGGERwarning)rL   htmltextguessed_encodings      r,   decode_filerV   i   s     +s#H+K8 	"))*:;H
  Ks;KK /0 	NN8:JKH	s   A

'A43A4responsec                     t        | t        j                  j                        st	        | d      r| j
                  }t        |      S | }t        |      S )zRead the urllib3 object corresponding to the server response, then
    try to guess its encoding and decode it to return a unicode stringr7   )rP   urllib3rW   HTTPResponsehasattrr7   rV   )rW   resp_contents     r,   decode_responser]   }   sL     (G,,99:ghPV>W}} |$$  |$$r.   urlc                 r   	 t         j                  d| d      }|j                  dk7  r"t        j	                  d|j                  |        yt        |j                        rt        j	                  d|        yt        |j                        S # t        $ r!}t        j	                  d| |       Y d}~yd}~ww xY w)	a,  Fetches page using urllib3 and decodes the response.

    Args:
        url: URL of the page to fetch.

    Returns:
        HTML code as string, or Urllib3 response object (headers + body), or empty string in case
        the result is invalid, or None if there was a problem with the network.

    GET   )timeout   z!not a 200 response: %s for URL %szincorrect input data for URL %szdownload error: %s %sN)		HTTP_POOLrequeststatusrR   errorr;   r7   r]   	Exception)r^   rW   errs      r,   	fetch_urlrj      s    2 $$UC$<
 ??c!LL<hoosS
 	 x}}-LL:C@  #8==11  8,c377 8s   B 	B6B11B6	beginningc                 
    d| vS )zOAssess if the object is proper HTML (awith a corresponding tag or declaration).htmlr6   )rk   s    r,   is_dubious_htmlrn      s    ""r.   
htmlstringc                 8   d|v r3| j                  d      \  }}}t        j                  d|d      dz   |z   } t        t	        | j                                     D ]=  \  }}d|v r,|j                  d      rt        j                  d| d      }  | S |d	kD  s< | S  | S )
z>Repair faulty HTML strings to make then palatable for libxml2.doctype
 r   )countz<htmlz/>z\1>   )	partitionDOCTYPE_TAGsub	enumerateiter
splitlinesendswithFAULTY_HTML)ro   rk   	firstline_restilines          r,   repair_faulty_htmlr      s     I'11$7	1d __R!_<tCdJ
T*"7"7"9:; 4d?t}}T2$1EJ  q5 r.   
htmlobjectc                     d}	 t        | j                  d      t              }|S # t        $ r!}t        j                  d|       Y d}~|S d}~ww xY w)z!Try to pass bytes to LXML parser.Nutf8parserzlxml parser bytestring %s)r   encodeHTML_PARSERrh   rR   rg   )r   treeri   s      r,   fromstring_bytesr      sS    D7*++F3KH K  70#66K7s    & 	AAAc                    t        | t              r| S t        | t        t        f      st	        dt        |             t        | t              rD| j                  d      r3d| vr/t        j                  d|        t        |       } | t        d|       d}t        |       } | dd j                         }t        | |      } d}	 t        | t        	      }|t'        |      dk  r|st!        |       }|:t)        |      r/t'        |      dk  r!t        j%                  dt'        |             d}|S # t        $ r d
}t!        |       }Y st"        $ r }t        j%                  d|       Y d}~d}~ww xY w)zkLoad object given as input and validate its type
    (accepted: lxml.html tree, bytestring and string)
    zincompatible input type: %shttp zURL detected, downloading: %sNzURL couldn't be processed: %s2   Fr   Tzlxml parsing failed: %sr   ru   z9parsed tree length: %s, wrong data type or not valid HTML)rP   r   bytesr5   	TypeErrortype
startswithrR   debugrj   
ValueErrorrV   rD   r   r   r   r   rh   rg   r9   rn   )r   r   rk   fallback_parseri   s        r,   	load_htmlr      s[   
 *k*j5#,/5tJ7GHH 	:s#!!&)z!4jAz*
<jIIDZ(J3B%%'I#J	:JN5*[9 	D	A~
+ OI63t9q=GT	
 K!  ,
+ 5.445s   D- -E-E-E((E-r   elemlistc                 z    | j                  |      D ]&  }|j                         }||j                  |       ( | S )zDelete selected elements.)rz   	getparentremove)r   r   elementparents       r,   
clean_htmlr      sA    99X& #""$MM'"# Kr.   stringc                 \    dj                  | j                               j                         S )z7Remove superfluous space and normalize remaining space.r   )joinsplitstrip)r   s    r,   	trim_textr     s     88FLLN#))++r.   )9r2   loggingrer   typingr   r   r   r   r   rY   cchardetr	   rB   ImportErrorcharset_normalizerr
   	lxml.htmlr   r   r   settingsr   	getLoggerr/   rR   r   r5   __annotations__disable_warnings
exceptionsInsecureRequestWarningutilRetryRETRY_STRATEGYPoolManagerrd   r   compileIrw   r}   r   r4   r;   r   r?   rK   rV   r]   rj   rn   r   r   r   r   r   r6   r.   r,   <module>r      s2    	  2 2 2 * 9 9 # 
		8	$$g.S .   ++BB C##
. $ 
  G7	uw4 bjj0"$$7bjj+RTT2, ,(C D  4 < <49 <,LU5#:. L3 L(%c %c %3 8C= :#s #t #
3 3 3   +)> 1%sK 78 1Xk=R 1h[ DI + ,c ,c ,_  Os   F FF