
    (#hC                     x   d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZ ddlZddlZdd
lmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( 	 ddl)m*Z* ejV                  jY                  d      Z-	 ddl/Z/ e/j`                         Z1e1je                  e/jf                  e/jh                         e1je                  e/jf                  e/jj                         dZ6 ejn                  e8      Z9 ejt                  ejv                  jx                         da=da>da?dedeej                  ef   fdZAej                  j                  d      ZDd ed      z   dz   ZEeEeDd<   g dZFh dZG G d d      ZHdedeeeeI      eeI   f   fd ZJ	 dEded!eeeIeIf      deeIeIf   fd"ZKdedej                  j                  fd#ZM	 dFded$eNdeej                  ef   fd%ZOd&eId$eNd'eNdedeeH   f
d(ZPd&eId)eHd*e#deNfd+ZQd&eId)eHd,eNd*e#deeeHeIf      f
d-ZRde"dfd&eId$eNded*ee#   deeI   f
d.ZSddde"d/d&eId,eNd$eNd'eNdedeeH   fd0ZTd&eIdeNfd1ZUd&eIdeNfd2ZVd&eIdeNfd3ZW	 	 	 	 	 dGd4eeI   d5eeeI      d6eeI   d7ee   d8eNd9eNdefd:ZX	 dHd7ed;eYdeeeI   ef   fd<ZZ	 dId=eeI   d>e[d?eeIgef   d@e[deeeIef   ddf   f
dAZ\	 dEd=eeI   d>e[d*ee#   deeeIeIf   ddf   fdBZ]	 dEd=eeI   d>e[d*ee#   deeeIeHf   ddf   fdCZ^d&eId$eNd'eNdedeeH   f
dDZ_y# e.$ r dZ-Y w xY w# e.$ r dZ6Y ?w xY w)JzG
All functions needed to steer and execute downloads of web documents.
    N)ThreadPoolExecutoras_completed)ConfigParser)partial)version)BytesIO)sleep)	AnyCallableDict	GeneratorListOptionalSetTupleUnion)UrlStore)redirection_test   )DEFAULT_CONFIG	Extractor)URL_BLACKLIST_REGEXdecode_fileis_acceptable_lengthmake_chunks)SOCKSProxyManager
http_proxyTFargsreturnc                  x    t         rt        nt        j                  }t         rdt         ini }d|d<    |di || S )zCConfigure urllib3 download pool according to user-defined settings.	proxy_url2   	num_pools )	PROXY_URLr   urllib3PoolManager)r   manager_classmanager_argss      R/var/www/html/sandstorm/venv/lib/python3.12/site-packages/trafilatura/downloads.pycreate_poolr+   C   s=    )2%8K8KM/8K+bL "L0<0400    )accept_encodingztrafilatura/trafilaturaz( (+https://github.com/adbar/trafilatura)
User-Agent)i  i  i  i  i  i  i  i  i	  i
  i  i  i  i  i  i  iV  >   #   6   :   ;   <   @   B   M   R   S   [   c                       e Zd ZdZg dZdedededdfdZde	fd	Z
defd
Zdeeef   ddfdZde	ddfdZdeeef   fdZy)Responsez5Store information gathered in a HTTP response object.dataheadershtmlstatusurlr>   rA   rB   r   Nc                 J    || _         d | _        d | _        || _        || _        y Nr=   )selfr>   rA   rB   s       r*   __init__zResponse.__init__l   s%    	15#'	r,   c                     | j                   d uS rD   )r>   rE   s    r*   __bool__zResponse.__bool__s   s    yy$$r,   c                 H    | j                   xs t        | j                        S rD   )r@   r   r>   rH   s    r*   __repr__zResponse.__repr__v   s    yy2K		22r,   
headerdictc                 ~    |j                         D ci c]  \  }}|j                         | c}}| _        yc c}}w )z#Store response headers if required.N)itemslowerr?   )rE   rL   kvs       r*   store_headerszResponse.store_headersy   s0     2<1A1A1CDA	1DDs   9decodec                 X    |r(| j                   rt        | j                         | _        yyy)z9Decode the bytestring in data and store a string in html.N)r>   r   r@   )rE   rS   s     r*   decode_datazResponse.decode_data~   s"    dii#DII.DI  6r,   c                 V    | j                   D ci c]  }|t        | |       c}S c c}w )z,Convert the response object to a dictionary.)	__slots__getattr)rE   attrs     r*   as_dictzResponse.as_dict   s&    6:nnEdgdD))EEEs   &)__name__
__module____qualname____doc__rW   bytesintstrrF   boolrI   rK   r   rR   rU   rZ   r$   r,   r*   r<   r<   h   s    ;<IU C c d %$ %3# 3ES#X E4 E
/$ /4 /
Fc3h Fr,   r<   configc                     | j                  ddd      j                         }|r|j                         nd}| j                  dd      xs d}||fS )zARead and extract HTTP header strings from the configuration file.DEFAULTUSER_AGENTS )fallbackNCOOKIE)getstrip
splitlines)rc   myagents
agent_listmycookies       r*   _parse_configrp      sU     zz)]Rz@FFHH*2$$&J zz)X.6$Hxr,   r?   c                     | t         k7  r1t        |       \  }}i }|rt        j                  |      |d<   |r||d<   |xs t        S )z1Internal function to decide on user-agent string.r/   Cookie)r   rp   randomchoiceDEFAULT_HEADERS)rc   r?   rm   ro   s       r*   _determine_headersrv      sN     *62($*MM($;GL! (GH%o%r,   c           	          t         s[t        j                  j                  | j	                  dd      | j	                  dd      d| j	                  dd      dz  t
              a t         S )z5Define a retry strategy according to the config file.re   MAX_REDIRECTSr   DOWNLOAD_TIMEOUT   )totalredirectconnectbackoff_factorstatus_forcelist)RETRY_STRATEGYr&   utilRetrygetintFORCE_STATUSrc   s    r*   _get_retry_strategyr      sc      ++--	?;]]? !==4FG!K) , 	
 r,   no_sslc                     |rt         nt        }|s@t        | j                  dd      |rdnt	        j
                         |rdnd      }|r|a |S |a|S )zXCreate a urllib3 pool manager according to options in the config file and HTTPS setting.re   ry   N	CERT_NONECERT_REQUIRED)timeoutca_certs	cert_reqs)NO_CERT_POOL	HTTP_POOLr+   r   certifiwhere)rc   r   pools      r*   _initiate_poolr      sZ    
 "<yDMM)-?@#T%+k
 L K IKr,   rB   with_headersc                    	 t        ||      }|j                  d| t        |      t        |      d      }t	               }|j                  d      D ]<  }|j                  |       t        |      |j                  dd      kD  s3t        d       |j                          t        t        |      |j                  |j                               }|r|j                  |j                          |S # t"        j$                  j&                  $ r' t(        j+                  d	|        t-        | d
||      cY S t.        $ r!}	t(        j1                  d| |	       Y d}	~	yd}	~	ww xY w)zPInternal function to robustly send a request (SSL or not) and return its result.)r   GETF)r?   retriespreload_contenti   re   MAX_FILE_SIZEzMAX_FILE_SIZE exceededzretrying after SSLError: %sTzdownload error: %s %sN)r   requestrv   r   	bytearraystreamextendlenr   
ValueErrorrelease_connr<   r_   rA   geturlrR   r?   r&   
exceptionsSSLErrorLOGGERwarning_send_urllib_request	Exceptionerror)
rB   r   r   rc   pool_managerresponser>   chunkresperrs
             r*   r   r      s8   8%fV<  ''&v.'/! ( 
 {__U+ 	;EKK4y6==ODD !9::	; 	 dX__hoo6GHx//0&& E4c:#C|VDD 8,c3778s&   BC. A(C. .AE1E9EEr   optionsc                     t        |j                  xs |j                  xs d      }|j                  dk7  r"t        j                  d|j                  |        yt        ||      syy)z2Check if the response conforms to formal criteria.rg      z!not a 200 response: %s for URL %sFT)r   r@   r>   rA   r   r   r   )rB   r   r   lentests       r*   _is_suitable_responser      sR    (--68==6B7G#8(//3O1r,   rS   c                 >    t        | ||      r|r|j                  S |S y)z:Internal function to run safety checks on response result.N)r   r@   )rB   r   rS   r   s       r*   _handle_responser      s$     S(G4 &x}}4H4r,   c                     |r|j                   n|}t        | d||      }|r3|j                  r'|st        |      }t	        | ||      r|j
                  S y)a  Downloads a web page and seamlessly decodes the response.

    Args:
        url: URL of the page to fetch.
        no_ssl: Do not try to establish a secure connection (to prevent SSLError).
        config: Pass configuration values for output control.
        options: Extraction options (supersedes config).

    Returns:
        Unicode string or None in case of failed downloads and invalid results.

    T)rS   r   rc   r   N)rc   fetch_responser>   r   r   r@   )rB   r   rc   r   r   s        r*   	fetch_urlr     sP    $  'W^^FFc$vfMHHMMv.G h8== r,   )rS   r   r   rc   c                    t         st        nt        }t        j	                  d|         || |||      }|st        j	                  d|        y|j                  |       |S )a  Downloads a web page and returns a full response object.

    Args:
        url: URL of the page to fetch.
        decode: Use html attribute to decode the data (boolean).
        no_ssl: Don't try to establish a secure connection (to prevent SSLError).
        with_headers: Keep track of the response headers.
        config: Pass configuration values for output control.

    Returns:
        Response object or None in case of failed downloads and invalid results.

    zsending request: %szrequest failed: %sN)
HAS_PYCURLr   _send_pycurl_requestr   debugrU   )rB   rS   r   r   rc   dl_functionr   s          r*   r   r   #  sV    * /9&>RK
LL&,3f=H)3/ Or,   c                    d}t        j                         }|j                  t         j                  | j	                  d             |j                  t         j
                  d       |j                  t         j                  d       |j                  t         j                  d       |j                  |j                  d       t        r$|j                  t         j                  t               	 |j                          |j                  |j                        dk  }|j#                          |S # t         j                  $ r#}t        j!                  d| |       d}Y d}~Cd}~ww xY w)	z+Send a basic HTTP HEAD request with pycurl.Futf-8
   r   Ti  zpycurl HEAD error: %s %sN)pycurlCurlsetoptURLencodeCONNECTTIMEOUTSSL_VERIFYPEERSSL_VERIFYHOSTNOBODYr%   	PRE_PROXYperformgetinfoRESPONSE_CODEr   r   r   close)rB   page_existscurlr   s       r*   _pycurl_is_live_pager   B  s    K;;=DKK

CJJw/0KK%%r*KK%%q)KK%%q)KKT"F$$i0ll4#5#56<
 	JJL << /c:s   -.D- -E# EE#c                 v    	 t        |       }y# t        $ r!}t        j                  d| |       Y d}~yd}~ww xY w)zGUse courlan redirection test (based on urllib3) to send a HEAD request.zurllib3 HEAD error: %s %sNFT)r   r   r   r   )rB   _r   s      r*   _urllib3_is_live_pager   ^  s<    S!   0#s;s    	838c                 F    t         rt        |       nd}|xs t        |       S )zCSend a HTTP HEAD request without taking anything else into account.F)r   r   r   )rB   results     r*   is_live_pager   h  s"    *4!#&%F/*3//r,   	inputlist	blacklist
url_filter	url_storecompressionverbosec                 <   |t        |d|      }t        t        j                  |             } |r(| D cg c]  }t	        j
                  d|      |vs| } }|r%| D cg c]  t        fd|D              s } }|j                  |        |S c c}w c c}w )zMFilter, convert input URLs and add them to domain-aware processing dictionaryF)
compressedstrictr   rg   c              3   &   K   | ]  }|v  
 y wrD   r$   ).0fus     r*   	<genexpr>z)add_to_compressed_dict.<locals>.<genexpr>  s     0LAa0Ls   )r   listdictfromkeysr   subanyadd_urls)r   r   r   r   r   r   r   s         `r*   add_to_compressed_dictr   o  s     E7S	T]]9-.I 
$7$;$;B$B)$SA
	 
  )M1S0L0L-LQM	My!

 Ns   BB!B;B
sleep_timec                 h    	 | j                  |d      }|s| j                  r	 || fS t        |       2)zRDetermine threading strategy and draw URLs respecting domain-based back-off rules.i )
time_limitmax_urls)get_download_urlsdoner	   )r   r   
bufferlists      r*   load_download_bufferr     sC     00JQV0W
y   	j	 r,   r   download_threadsworker	chunksizec           	   #   
  K   t        |      5 }t        | |      D ]I  }|D ci c]  }|j                  ||      | }}t        |      D ]  }||   |j	                         f  K 	 ddd       yc c}w # 1 sw Y   yxY ww)z3Use a thread pool to perform a series of downloads.)max_workersN)r   r   submitr   r   )	r   r   r   r   executorr   rB   future_to_urlfutures	            r*   _buffered_downloadsr     s      
(8	9 =X Y7 	=EJOP3X__VS93>PMP&}5 =#F+V]]_<<=	== =P= =s,   BA7A2+A7)	B2A77B <Bc                 >    t        t        |      }t        | ||      S )z3Download queue consumer, single- or multi-threaded.)r   )r   r   r   )r   r   r   r   s       r*   buffered_downloadsr     s      Y0Fz+;VDDr,   c                 f    |r|j                   nt        }t        t        |      }t	        | ||      S )z7Download queue consumer, returns full Response objects.r   )rc   r   r   r   r   )r   r   r   rc   r   s        r*   buffered_response_downloadsr    s-      'W^^NF^F3Fz+;VDDr,   c                    t        |      j                         D cg c]  \  }}| d|  }}}t        j                         }|j	                  t        j
                  | j                  d             |j	                  t        j                  t               |j	                  t        j                  |       |j	                  t        j                  d       |j	                  t        j                  |j                  dd             |j	                  t        j                  |j                  dd             |j	                  t        j                  |j                  dd             |j	                  t        j                  |j                  dd             |j	                  t        j                   d       |du rA|j	                  t        j"                  d	       |j	                  t        j$                  d	       n2|j	                  t        j&                  t)        j*                                |r4t-               }|j	                  t        j.                  |j0                         t2        r$|j	                  t        j4                  t2               	 |j7                         }	tE        |	|jG                  |jH                        |jG                  |jJ                              }|jM                          |ri }jO                         jQ                  dd      jS                         D ]=  }d|vr|jU                  dd      \  }}|jW                         ||jW                         <   ? |jY                  |       |S c c}}w # t        j8                  $ rd}
t:        j9                  d
| |
       |du r?|
j<                  d	   t>        v r*t:        jA                  d| |
       tC        | d||      cY d}
~
S Y d}
~
yd}
~
ww xY w)zDExperimental function using libcurl and pycurl to speed up downloadsz: r   r   re   rx   ry   r   Tr   zpycurl error: %s %sFzretrying after SSL error: %s %sNz
iso-8859-1replace)errors:)-rv   rN   r   r   r   r   r   SHARE
CURL_SHARE
HTTPHEADERFOLLOWLOCATION	MAXREDIRSr   r   TIMEOUTMAXFILESIZENOSIGNALr   r   CAINFOr   r   r   HEADERFUNCTIONwriter%   r   
perform_rbr   r   r   CURL_SSL_ERRORSr   r   r<   r   r   EFFECTIVE_URLr   getvaluerS   rl   splitrk   rR   )rB   r   r   rc   headercontent
headerlistr   headerbytesbufferbytesr   r   respheaderslinenamevalues                   r*   r   r     s    7I6P6V6V6X#2676("WIJ  ;;=DKK

CJJw/0KKj)KK!!:.KK%%q)KK  &--	?"KLKK%%v}}Y@R'STKKi9K LMKK""FMM)_$MNKK#~F))1-F))1-FMM7==?3iF));+<+<=F$$i0oo'" T\\$"4"45t||DDVDV7WD 	JJL   ")),y)ITTV
	6D
 $**S!,KD%(-K

%
	6 	;'KYN << *C5
 U?sxx{o=LL:CE'T<HH s$   M5M; ;O2AO-"O2-O2rD   )F)NNNFF)g      @)i'  )`r^   loggingosrs   concurrent.futuresr   r   configparserr   	functoolsr   importlib.metadatar   ior   timer	   typingr
   r   r   r   r   r   r   r   r   r   r&   courlanr   courlan.networkr   settingsr   r   utilsr   r   r   r   urllib3.contrib.socksr   environrj   r%   ImportErrorr   	CurlSharer  r   SH_SHARELOCK_DATA_DNSLOCK_DATA_SSL_SESSIONr   	getLoggerr[   r   disable_warningsr   InsecureRequestWarningr   r   r   r'   r+   r   make_headersru   
USER_AGENTr   r  r<   ra   rp   rv   r   r   rb   r   r   r   r   r   r   r   r   r   r   floatr   r`   r   r   r  r   r$   r,   r*   <module>r9     s    	  ? %  &  
 
 
    , / V V7

|,I!!!#J foov';';<foov'C'CD J
 
		8	$   ++BB C	1 1g&9&93&> ? 1 ,,++D+AW]++.XX  !+ ( ?F FD ,  5$s)1Dhsm1S+T   ?C&&#+DcN#;&	#s(^& 1C1C & */"&
7#$."	""*."8D"h"J	s 	h 	 	t 		 *.9BeHcM"# )#'		  i 	
 c]> )	  	
   h>c d 8s t 0c 0d 0 %) $$(CyC!  !	
   6 .1	!	!%*	!
49h	!  	=S	== cUCZ = 	=
 uS#Xd*+=" $(ES	EE i E uS#Xd*+	E $(	ES		E	E i 	E uS(]#T4/0		ES	SS*.S8DShSc  I  Js%   4!L  A#L.  L+*L+.L98L9