
    *#h~                     b   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dl m!Z! d	d
l"m#Z# d	dl$m%Z% d	dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d	dl-m.Z. d	dl/m0Z0 ddl1m2Z2  e.e3      Z4g dZ5i e%D  ci c](  } | jl                  jo                  d      | jp                  * c} ddiZ9e%D  ch c]  } | jp                   c} Z: ejv                  d      Z<e=j}                  d      de=j}                  d      de=j}                  d      de=j}                  d      de=j}                  d      de=j}                  d      de=j}                  d      de=j}                  d       d!iZ?d"d#iZ@ eAd$  ee?e@      D              ZB G d% d&eC      ZDd' ZEd( ZFdUd)eGd*ee2   fd+ZHd, ZId- ZJd. ZKdUd*ee2   d/eLfd0ZMdUd*ee2   d/eNfd1ZOdUd*ee2   d/eLfd2ZPdUd3ZQd4 ZRd5eGd/eGfd6ZSd/eeG   fd7ZTdUd)eGd*ee2   d/eeG   fd8ZU	 dUd)eGd*ee2   d/eeGeeGeeGef   f   f   fd9ZV	 dUd)eGd*ee2   d/eeGeeGeeGef   f   f   fd:ZWdVdd;d<eGd*ee2   fd=ZXdUd5eGd*ee2   d/eeG   fd>ZYd?dd@d*ee2   fdAZZdUd*ee2   fdBZ[ G dC dD e\ e                   Z]d5eeGee]f   fdEZ^dd;d*ee2   fdFZ_dd;d*ee2   fdGZ`dUd*ee2   fdHZadUd*ee2   fdIZbdUd*ee2   fdJZcdWd*ee2   fdKZddUd*ee2   fdLZe G dM dNe      Zf G dO dPef      Zg G dQ dRef      Zh G dS dT      Ziyc c} w c c} w )X    N)TimeoutError)BytesIO)chain)PathPurePosixPath)	AnyCallableDict	GeneratorIterableListOptionalTupleUnion)ElementTree)ClientError   )config)COMPRESSION_FILESYSTEMS)"get_authentication_headers_for_urlget_datasets_user_agent	http_headis_local_pathis_relative_pathurl_or_path_join)
get_logger)
map_nested   )DownloadConfig)txtcsvjsonjsonltsvconllconlluorigparquetpklpicklerelxml.zipz(?<!:):/504B0304504B0506504B0708425A68bz21F8BgzipFD377A585A00xz04224D18lz428B52FFDzstds   Rar!rarc              #   2   K   | ]  }t        |        y wN)len).0magic_numbers     i/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/download/streaming_download_manager.py	<genexpr>rC   K   s       s   c                       e Zd Zy)NonStreamableDatasetErrorN)__name__
__module____qualname__     rB   rE   rE   Q   s    rJ   rE   c                     t        |       j                  d      ^} }t        |       r t        j                  j
                  | g| S t        j
                  | g| } dj                  | g|z         S )u#  
    This function extends os.path.join to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xjoin function allows you to apply the join on the first path of the chain.

    Example::

        >>> xjoin("zip://folder1::https://host.com/archive.zip", "file.txt")
        zip://folder1/file.txt::https://host.com/archive.zip
    ::)strsplitr   ospathjoin	posixpath)apbs      rB   xjoinrV   U   sb    & FLLEAQww||A"""NN1!q!yy!q!!rJ   c                 >   t        |       j                  d      ^} }t        |       r7t        j                  j                  t        |       j                               } nt        j
                  |       } | j                  d      r| dz  } dj                  | g|z         S )u#  
    This function extends os.path.dirname to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xdirname function allows you to apply the dirname on the first path of the chain.

    Example::

        >>> xdirname("zip://folder1/file.txt::https://host.com/archive.zip")
        zip://folder1::https://host.com/archive.zip
    rL   ://)rM   rN   r   rO   rP   dirnamer   as_posixrR   endswithrQ   rS   rU   s     rB   xdirnamer^   p   s|    & FLLEAQGGOODG,,./a  	zz#	T	99aS1WrJ   urlpathdownload_configc                    t        |       j                  d      ^}}t        |      rt        j                  j                  |      S t        | |      \  } }t        j                  | |      ^}}|j                  |      S )a  Extend `os.path.exists` function to support both local and remote files.

    Args:
        urlpath (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    rL   r`   storage_options)	_as_strrN   r   rO   rP   exists!_prepare_path_and_storage_optionsfsspecget_fs_token_paths)r_   r`   main_hop	rest_hopsrd   fs_s          rB   xexistsrn      sp     #7+11$7HyXww~~h''#DW^m#n **7OTQyy""rJ   c                     t        |       j                  d      ^} }t        |       r6t        j                  j                  t        |       j                               S t        j
                  |       S )u  
    This function extends os.path.basename to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xbasename function allows you to apply the basename on the first path of the chain.

    Example::

        >>> xbasename("zip://folder1/file.txt::https://host.com/archive.zip")
        file.txt
    rL   )	rM   rN   r   rO   rP   basenamer   r[   rR   r]   s     rB   	xbasenamerq      sU    & FLLEAQwwQ 0 0 233!!!$$rJ   c                 F   t        |       j                  d      ^} }t        |       r6t        j                  j                  t        |       j                               S t        j                  |       \  } }dj                  | j                  d      r| dz   n| g|z         |fS )u,  
    This function extends os.path.split to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xsplit function allows you to apply the xsplit on the first path of the chain.

    Example::

        >>> xsplit("zip://folder1/file.txt::https://host.com/archive.zip")
        ('zip://folder1::https://host.com/archive.zip', 'file.txt')
    rL   rX   rY   )
rM   rN   r   rO   rP   r   r[   rR   rQ   r\   )rS   rU   tails      rB   xsplitrt      s    & FLLEAQww}}T!W--/00//!$4yyajjo!d(1=ABDHHrJ   c                    t        |       j                  d      ^} }t        |       r6t        j                  j                  t        |       j                               S t        j
                  |       \  } }dj                  | g|z         |fS )u8  
    This function extends os.path.splitext to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xsplitext function allows you to apply the splitext on the first path of the chain.

    Example::

        >>> xsplitext("zip://folder1/file.txt::https://host.com/archive.zip")
        ('zip://folder1/file::https://host.com/archive.zip', '.txt')
    rL   )
rM   rN   r   rO   rP   splitextr   r[   rR   rQ   )rS   rU   exts      rB   	xsplitextrx      sr    & FLLEAQwwQ 0 0 233##A&3yy!q!3&&rJ   returnc                    t        |       j                  d      ^}}t        |      rt        j                  j                  |       S t        | |      \  } }t        j                  | |      ^}}|j                  |      S )zExtend `os.path.isfile` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    rL   rb   rc   )	rM   rN   r   rO   rP   isfilerg   rh   ri   )rP   r`   rj   rk   rd   rl   rm   s          rB   xisfiler|      sm     t9??40HyXww~~d## A$Xg ho**4QQyy""rJ   c                    t        |       j                  d      ^}}t        |      rt        j                  j                  |       S t        | |      \  } }t        j                  | |      ^}}|j                  |      }|1t        | |      5 }t        |j                               }ddd       |S |S # 1 sw Y   |S xY w)zExtend `os.path.getsize` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `int`: optional
    rL   rb   rc   N)rM   rN   r   rO   rP   getsizerg   rh   ri   sizexopenr?   read)	rP   r`   rj   rk   rd   rl   rm   r   fs	            rB   xgetsizer     s     t9??40HyXwwt$$ A$Xg ho**4QQwwx <t_= %1668}%t%s   B66C c                 N   t        |       j                  d      ^}}t        |      rt        j                  j                  |       S t        | |      \  } }t        j                  | |      ^}}|j                  d      d   }|j                  d      sy|j                  |      S )zExtend `os.path.isdir` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    rL   rb   rc   ://r   /T)
rM   rN   r   rO   rP   isdirrg   rh   ri   strip)rP   r`   rj   rk   rd   rl   rm   
inner_paths           rB   xisdirr     s     t9??40HyXww}}T"" A$Xg ho**4QQ^^E*1-
$xx
##rJ   c                 z   t        |       j                  d      ^}}t        |      rB|r!t        j                  j                  ||      S t        j                  j                  |      S |r2t        j
                  |t        |      j                  d      d         S t        j                  j                  |      S )zExtend `os.path.relpath` function to support remote files.

    Args:
        path (`str`): URL path.
        start (`str`): Start URL directory path.

    Returns:
        `str`
    rL   )startr   )rM   rN   r   rO   rP   relpathrR   )rP   r   rj   rk   s       rB   xrelpathr   4  s     t9??40HyX9>rwwxu5]BGGOOT\D]]OTy  U1A1A$1G1JKsZ\ZaZaZiZijrZssrJ   c                 Z    | j                   t        j                  fd}|| _         y )Nc                  :   d }t        ddz         D ]  }	  | i |} |S  t        d      |# t        t        f$ r\}|}t        j	                  dt
        j                   d| d d       t        j                  t
        j                         Y d }~d }~ww xY w)Nr   z4Got disconnected from remote data host. Retrying in zsec [r   ]zServer Disconnected)
ranger   r   loggerwarningr   STREAMING_READ_RETRY_INTERVALtimesleepConnectionError)argskwargsdisconnect_errretryouterrmax_retriesr   s         rB   read_with_retriesz?_add_retries_to_file_obj_read_method.<locals>.read_with_retriesI  s    1kAo. 	MEAD+F+ 
	M ""78nL  . A!$J6KoKoJppuv{u||}  J  ~K  KL  M 

6??@@As   /BABB)r   r   STREAMING_READ_MAX_RETRIES)file_objr   r   r   s     @@rB   $_add_retries_to_file_obj_read_methodr   E  s%    ==D33K  &HMrJ   rP   c                 d    | j                  d      d   }dD ]  }|j                  |      d   } |S )Nr-   z?-_r   )rN   )rP   	extensionsymbs      rB   _get_path_extensionr   \  s>    

3#I  -OOD)!,	-rJ   c                    	 | j                  d       | j	                  t
              }| j                  d       t        t
              D ]W  }t        j                  |dt
        |z
         }||c S t        j                  |dt
        |z
         }|Jt        d| d       y# t        t        j                  f$ r Y yw xY w)zQread the magic number from a file-like object and return the compression protocolr   NzCompression protocol 'z' not implemented.)seekAttributeErrorioUnsupportedOperationr   MAGIC_NUMBER_MAX_LENGTHr   $MAGIC_NUMBER_TO_COMPRESSION_PROTOCOLget0MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOLNotImplementedError)r   rA   icompressions       rB   *_get_extraction_protocol_with_magic_numberr   f  s    	q	 6612LFF1I*+ `:>>|LiNehiNi?jk"FJJ<XuZqtuZuKvw"%(>{mK]&^__`	 B334 s   B# #B?>B?c                    t        |       } | j                  d      d   }t        |      }|t        v s|dv s|j	                  d      ry |t
        v r	t
        |   S t        | |      \  } }	 t        j                  | fi |xs i 5 }t        |      cd d d        S # 1 sw Y   y xY w# t        $ r0 | j                  t        j                        rt        | dz         d  w xY w)NrL   r   tgztarz.tar.gzz.tar.bz2z.tar.xzrb   S
If the repo is private or gated, make sure to log in with `huggingface-cli login`.)rM   rN   r   BASE_KNOWN_EXTENSIONSr\   !COMPRESSION_EXTENSION_TO_PROTOCOLrg   rh   openr   FileNotFoundError
startswithr   HF_ENDPOINT)r_   r`   rP   r   rd   r   s         rB   _get_extraction_protocolr   x  s    'lG==q!D#D)I**&==;<	7	70;;@ZijG_	[[<_%:< 	A=a@	A 	A 	A f001#pp s*   +B& B	B& B#B& #B& &9Cc                     g }i }| j                  d      D ]4  }t        ||      \  }}|j                  |       |j                  |       6 dj	                  |      fS )NrL   rb   )rN   ,_prepare_single_hop_path_and_storage_optionsappendupdaterQ   )r_   r`   prepared_urlpathprepared_storage_optionshoprd   s         rB   rg   rg     sm     !}}T" 9KCapq_$ ''89 99%&77rJ   c                 l   |dn|j                   }d| v r| j                  d      d   nd}|||j                  v r|j                  |   }nV|R||j                  vrD|j                  j                         D ci c]  \  }}|t	        j
                         vr||  }}}ni }|r||i}|dv ri t        | |      dt               idd	id
|j                  |i       ||<   d| v rnt        |       }d}|j                  j                         D ]D  \  }	}
|	j                  d      s| d|
z   z  } |j                  }d|i|j                  |i       ||<   F d| v r	d| vr| dz  } | j                  d      rd||   d   d<   | |fS |dk(  r(|t        j                  d|j                  |i       ||<   | |fS c c}}w )a\  
    Prepare the URL and the kwargs that must be passed to the HttpFileSystem or to requests.get/head

    In particular it resolves google drive URLs
    It also adds the authentication headers for the Hugging Face Hub, for both https:// and hf:// paths.

    Storage options are formatted in the form {protocol: storage_options_for_protocol}
    Nr   r   file)httphttps)tokenz
user-agent	trust_envT)headersclient_kwargszdrive.google.comdownload_warningz	&confirm=cookieszconfirm=z
&confirm=tz"https://raw.githubusercontent.com/identityr   zAccept-Encodinghf)r   endpoint)r   rN   rd   itemsrh   available_protocolsr   r   r   r   r   r   r   r   )r_   r`   r   protocolrd   option_nameoption_valueresponser   kvs              rB   r   r     s:    $+D1F1FE*/7*:w}}U#A&H"x?3R3R'R)99(C		$9X9X)X .=-L-L-R-R-T
)\&"<"<">> %
 
 #_5$$4WEJ57 *40%
 ""8R0%
! ( )HG ((..0 j1<< 23{Q.G&..G1:G0iGZGZ[cegGh0iOH-	j (Zw-F|#GBCFPOH%i01BC O## 
T	**%
 !!(B/%
!
 O##M
s   8#F0rb   r   c                   t        |       }|j                  d      ^}}t        |      rt        ||g|i |S t	        ||      \  } }i ||xs i }	 t        j                  | g|d|i|j                         }	t        |	       |	S # t        $ r }
t        |
      dk(  rt        d      |
 d}
~
wt        $ r0 | j                  t        j                        rt        | dz         d w xY w)a  Extend `open` function to support remote files using `fsspec`.

    It also has a retry mechanism in case connection fails.
    The `args` and `kwargs` are passed to `fsspec.open`, except `token` which is used for queries to private repos on huggingface.co

    Args:
        file (`str`): Path name of the file to be opened.
        mode (`str`, *optional*, default "r"): Mode in which the file is opened.
        *args: Arguments to be passed to `fsspec.open`.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs: Keyword arguments to be passed to `fsspec.open`.

    Returns:
        file object
    rL   rb   modezCannot seek streaming HTTP filezStreaming is not possible for this dataset because data host server doesn't support HTTP range requests. You can still load this dataset in non-streaming mode by passing `streaming=False` (default)Nr   )re   rN   r   r   rg   rh   
ValueErrorrM   rE   r   r   r   r   r   )r   r   r`   r   r   file_strrj   rk   rd   r   es              rB   r   r     s   " t}H#>>$/HyXHd4T4V44=hXghD/22?0b2F;;t@@$@@EEG  )2O!  q666+y 
  ??6--.#mm s   )B 	C+B//<C+c                    t        |       j                  d      ^}}t        |      rt        j                  |       S t        | |      \  } }t        j                  | |      ^}}|j                  d      d   }|j                  d      r|j                  |      st        d|        |j	                  |      }|D 	cg c]$  }	t        j                  j                  |	d         & c}	S c c}	w )	zExtend `os.listdir` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `list` of `str`
    rL   rb   rc   r   r   r   zDirectory doesn't exist: name)re   rN   r   rO   listdirrg   rh   ri   r   r   r   rP   rp   )
rP   r`   rj   rk   rd   rl   rm   r   objectsobjs
             rB   xlistdirr     s     #4=..t4HyXzz$ !B$Xg ho**4QQ^^E*1-
C *)=#&?v$FGG**Z(9@A#  V-AAAs   ?)C+F)	recursiver`   c                   t        |       j                  d      ^}}t        |      rt        j                  ||      S t	        | |      \  } }t        j                  | |      ^}}|j                  d      d   }|j                  |      }	t        |j                  t              r|j                  n|j                  d   }
|	D cg c]  }dj                  |
 d| g|z          c}S c c}w )a  Extend `glob.glob` function to support remote files.

    Args:
        urlpath (`str`): URL path with shell-style wildcard patterns.
        recursive (`bool`, default `False`): Whether to match the "**" pattern recursively to zero or more
            directories or subdirectories.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `list` of `str`
    rL   )r   rb   rc   r   r   r   )re   rN   r   globrg   rh   ri   
isinstancer   rM   rQ   )r_   r   r`   rj   rk   rd   rl   rm   r   globbed_pathsr   globbed_paths               rB   xglobr     s     #7+11$7HyXyyY77 $EW^m#n **7OTQ
 ^^E*1-

+",R[[#">2;;BKKPRO\ijL		hZs<.9:YFGjjjs   !C*c              +   >  K   t        |       j                  d      ^}}t        |      rt        j                  |fi |E d{    yt        | |      \  } }t        j                  | |      ^}}|j                  d      d   }|j                  d      r|j                  |      sg S t        |j                  t              r|j                  n|j                  d   }	 |j                  |fi |D ]%  \  }
}}dj                  |	 d|
 g|z         ||f ' y7 ׭w)	au  Extend `os.walk` function to support remote files.

    Args:
        urlpath (`str`): URL root path.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs: Additional keyword arguments forwarded to the underlying filesystem.


    Yields:
        `tuple`: 3-tuple (dirpath, dirnames, filenames).
    rL   Nrb   rc   r   r   r   r   )re   rN   r   rO   walkrg   rh   ri   r   r   r   r   rM   rQ   )r_   r`   r   rj   rk   rd   rl   rm   r   r   dirpathdirnames	filenamess                rB   xwalkr   9  s     #7+11$7HyX778.v... $EW^m#n **7OTQ^^E*1-
C *)=I",R[[#">2;;BKKPRO,3BGGJ,I&,I 	Z(GXy))zWI67)CDhPYYY	Z 	/s   ADDCDc                        e Zd ZdZ fdZddee   fdZddee   fdZd Z	e
dd       Ze
defd	       Ze
defd
       Ze
defd       Zd Zdeedf   dd fdZdedd fdZ fdZ xZS )xPathzHExtension of `pathlib.Path` to support both local paths and remote URLs.c                     t         |          }|j                  d      ^}}t        |      r|S |j	                  dd      }t
        j                  d|      }||j                  d      rdz  }|S dz  }|S )NrL   \r   r   rX   rY    )super__str__rN   r   replace#SINGLE_SLASH_AFTER_PROTOCOL_PATTERNsubr\   )selfpath_strrj   rk   path_as_posix	__class__s        rB   r   zxPath.__str__W  s    7?$'~~d39"O ((s3;??}U!7!7!<D CEDrJ   r`   c                 .    t        t        |       |      S )zExtend `pathlib.Path.exists` method to support both local and remote files.

        Args:
            download_config : mainly use token or storage_options to support different platforms and auth types.

        Returns:
            `bool`
        rb   )rn   rM   )r  r`   s     rB   rf   zxPath.existsa  s     s4y/BBrJ   c              #   6  K   | j                         }|j                  d      ^}}t        |      r#t        |      j	                  |      E d{    y|rD|d   }t        ||      \  }}|j                  d      d   |i}dj                  ||g|dd       }nd}t        j                  t        ||      |      ^}}	|j	                  t        ||            }
|
D ]7  } t        |       dj                  |j                   d| g|z                9 y7 ȭw)a]  Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Args:
            pattern (`str`): Pattern that resulting paths must match.
            download_config : mainly use token or storage_options to support different platforms and auth types.

        Yields:
            [`xPath`]
        rL   Nr   rb   r   r   rc   )r[   rN   r   r   r   rg   rQ   rh   ri   rV   typer   )r  patternr`   
posix_pathrj   rk   r_   rd   rl   rm   r   r   s               rB   r   z
xPath.globl  s      ]]_
)//59"H~**7333 #A,+LWfu+v(#*==#7#:O"L!YY''JIabM'JK
"&..uZ/I[jkFB
 GGE(G$<=M - ] d4jr{{m3|n,M+NQZ+Z![\\]! 4s   ADDC	Dc                 .     | j                   d|z   fi |S )zRglob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Args:
            pattern (`str`): Pattern that resulting paths must match.

        Yields:
            [`xPath`]
        z**/)r   )r  r  r   s      rB   rglobzxPath.rglob  s     tyy3F33rJ   ry   c                 R     t        |       t        | j                                     S )zName function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            [`xPath`]
        )r  r^   r[   r  s    rB   parentzxPath.parent  s      tDz(4==?344rJ   c                 l    t        | j                         j                  d      d         j                  S )zName function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        rL   r   )r   r[   rN   r   r  s    rB   r   z
xPath.name  +     T]]_2248;<AAArJ   c                 l    t        | j                         j                  d      d         j                  S )zStem function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        rL   r   )r   r[   rN   stemr  s    rB   r  z
xPath.stem  r  rJ   c                 l    t        | j                         j                  d      d         j                  S )zSuffix function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        rL   r   )r   r[   rN   suffixr  s    rB   r  zxPath.suffix  s+     T]]_2248;<CCCrJ   c                 2    t        t        |       g|i |S )a  Extend :func:`xopen` to support argument of type :obj:`~pathlib.Path`.

        Args:
            **args: Arguments passed to :func:`fsspec.open`.
            **kwargs: Keyword arguments passed to :func:`fsspec.open`.

        Returns:
            `io.FileIO`: File-like object.
        )r   rM   )r  r   r   s      rB   r   z
xPath.open  s     SY0000rJ   rT   .c                 T     t        |       t        | j                         g|       S )zExtend :func:`xjoin` to support argument of type :obj:`~pathlib.Path`.

        Args:
            *p (`tuple` of `str`): Other path components.

        Returns:
            [`xPath`]
        )r  rV   r[   r  rT   s     rB   joinpathzxPath.joinpath  s%     tDz%4!455rJ   c                 $    | j                  |      S r>   )r  r  s     rB   __truediv__zxPath.__truediv__  s    }}QrJ   c           	      R   t        |       j                  d      ^}}t        |      r' t        |       t        t        |   |                  S  t        |       dj                   t        |       t        |      j                  |            j                         g|z               S )NrL   )	rM   rN   r   r  r   with_suffixrQ   r   r[   )r  r  rj   rk   r  s       rB   r  zxPath.with_suffix  s    "4yt49"4:c%'"5f"=>??tDz$))ZT$Zh0G0S0STZ0[%\%e%e%g$hkt$tuvvrJ   r>   )ry   r   )rF   rG   rH   __doc__r   r   r   rf   r   r  propertyr  rM   r   r  r  r   r   r  r  r  __classcell__)r  s   @rB   r   r   T  s    R	Ch~&> 	C]Xn-E ]@	4 5 5 Bc B B Bc B B D D D
1	65c? 	6w 	6 S  W  w wrJ   r   c                 r    t        | t              rt        |       S t        t        t        |                   S r>   )r   r   rM   )rP   s    rB   re   re     s(    "4/3t9JSs4y9I5JJrJ   c                    dd l }t        | d      r |j                  | g|i |S t        |       }  |j                  t	        | d|      g|i |S Nr   r   rbrb   )r5   hasattrr   rM   r   )filepath_or_bufferr`   r   r   r5   s        rB   
xgzip_openr&    s_    !6*tyy+=d=f== !34tyy14Yk\`kdjkkrJ   c                    dd l }t        | d      r |j                  | g|i |S t        |       }  |j                  t	        | d|      g|i |S r"  )numpyr$  loadrM   r   )r%  r`   r   r   nps        rB   xnumpy_loadr+    s_    !6*rww);D;F;; !34rwwu/WiZ^ibhiirJ   c                     dd l }t        | d      r |j                  | fi |S t        |       } |j	                  dd      dk(  rt        | |      |d<    |j                  t        | d|      fi |S )Nr   r   r   inferrb   r#  )pandasr$  read_csvrM   r   r   r   r%  r`   r   pds       rB   xpandas_read_csvr2    sz    !6*r{{-888 !34::mW-8$<=Oap$qF=!r{{5!3T?[f_effrJ   c           
         dd l }t        | d      r	  |j                  | fi |S t        |       } 	  |j                  t        | d|      fi |S # t        $ r-  |j                  t	        | j                               fi |cY S w xY w# t        $ r9  |j                  t	        t        | d|      j                               fi |cY S w xY wr"  )r.  r$  
read_excelr   r   r   rM   r   r0  s       rB   xpandas_read_excelr5     s    !6*	O 2==!3>v>> !!34	 2=='94Q`!alekll  	O 2==);)@)@)B!CNvNN	O  	 2==0$X]]_`dj 	s"   A B	 3BB	?C
Cc                     dd l m} t        | d      r |j                  | fi |S  |j                  t	        | d|      fi |S r"  )scipy.ior   r$  loadmatr   )r%  r`   r   sios       rB   xsio_loadmatr:    sH    !6*s{{-888s{{5!3T?[f_effrJ   c                     t        | d      rt        j                  | |      S t        | d|      5 }t        j                  ||      cddd       S # 1 sw Y   yxY w)a  Extend `xml.etree.ElementTree.parse` function to support remote files.

    Args:
        source: File path or file object.
        parser (`XMLParser`, *optional*, default `XMLParser`): Parser instance.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `xml.etree.ElementTree.Element`: Root element of the given source document.
    r   )parserr#  rb   N)r$  ETparser   )sourcer<  r`   r   s       rB   	xet_parser@    sP     vvxxv..64A 	.Q88Af-	. 	. 	.s   AAc                 
   t        | d      r*t        j                  j                  j                  | fi |S t        | d|      5 }t        j                  j                  j                  |fi |cddd       S # 1 sw Y   yxY w)a  Extend `xml.dom.minidom.parse` function to support remote files.

    Args:
        filename_or_file (`str` or file): File path or file object.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs (optional): Additional keyword arguments passed to `xml.dom.minidom.parse`.

    Returns:
        :obj:`xml.dom.minidom.Document`: Parsed document.
    r   r#  rb   N)r$  r,   domminidomr>  r   )filename_or_filer`   r   r   s       rB   xxml_dom_minidom_parserE  -  sm     (ww$$%5@@@#T?K 	6q77??((5f5	6 	6 	6s   *A99Bc                   "    e Zd ZdZdefdZd Zy)_IterableFromGeneratorzkUtility class to create an iterable from a generator function, in order to reset the generator when needed.	generatorc                 .    || _         || _        || _        y r>   rH  r   r   )r  rH  r   r   s       rB   __init__z_IterableFromGenerator.__init__B  s    "	rJ   c              #   l   K    | j                   | j                  i | j                  E d {    y 7 wr>   rJ  r  s    rB   __iter__z_IterableFromGenerator.__iter__G  s'     !4>>499<<<<s   *424N)rF   rG   rH   r  r	   rK  rM  rI   rJ   rB   rG  rG  ?  s    u( 
=rJ   rG  c                       e Zd ZdZed        Zed        Zedee	ddf   fd       Z
e	 ddedee   dee	ddf   fd	       Zedd
       Zeddee   dd fd       Zy)ArchiveIterablezIAn iterable of (path, fileobj) from a TAR archive, used by `iter_archive`c              #   $  K   t        j                  | d      }|D ]o  }|j                  }|j                         s |#t        j
                  j                  |      j                  d      rR|j                  |      }||f g |_	        q ~y w)Nzr|*)fileobjr   r-   __)
tarfiler   r   isregrO   rP   rp   r   extractfilemembers)r   streamtarinfo	file_pathr   s        rB   	_iter_tarzArchiveIterable._iter_tarN  s     ae4 	 GI==? ww	*55kB))'2HX%%FN	  s   BBc              #   ,  K   t        j                  |       }|j                         D ]h  }|j                  }|j	                         r |#t
        j                  j                  |      j                  d      rR|j                  |      }||f j y w)NrR  )
zipfileZipFileinfolistfilenameis_dirrO   rP   rp   r   r   )r   zipfmemberrZ  r   s        rB   	_iter_zipzArchiveIterable._iter_zip_  s     q!mmo 
	&FI}} ww	*55kByy(HX%%
	&s   BBry   Nc              #      K   t        |      }|dk(  r| j                  |      E d {    y | j                  |      E d {    y 7 7 w)Nr.   )r   rd  r[  )clsr   r   s      rB   _iter_from_fileobjz"ArchiveIterable._iter_from_fileobjn  sE     @C%}}Q'''}}Q''' ('s!   %AAAA	A	Ar_   r`   c              #      K   t        ||      }t        |d|      5 }|dk(  r| j                  |      E d {    n| j                  |      E d {    d d d        y 7 '7 # 1 sw Y   y xY ww)Nrb   r#  r.   )r   r   rd  r[  )rf  r_   r`   r   r   s        rB   _iter_from_urlpathz"ArchiveIterable._iter_from_urlpathv  so      /wX7D/B 	,ae#==+++==+++		, 	,++		, 	,s>   A0A$A A$A"A$	A0 A$"A$$A-)A0c                 (     | | j                   |      S r>   )rg  )rf  rQ  s     rB   from_bufzArchiveIterable.from_buf  s    3))733rJ   c                 *     | | j                   ||      S r>   )ri  )rf  urlpath_or_bufr`   s      rB   from_urlpathzArchiveIterable.from_urlpath  s    3))>?KKrJ   r>   )ry   rO  )rF   rG   rH   r  staticmethodr[  rd  classmethodr   r   rg  rM   r   r   ri  rk  rn  rI   rJ   rB   rO  rO  K  s    S   & & (itT0A&B ( ( GK,,,4^,D,	5$$	%, , 4 4 L8N;S L_p L LrJ   rO  c                   v    e Zd ZdZe	 ddeeee   f   dee	   de
eddf   fd       Zeddee	   dd fd       Zy)	FilesIterablez8An iterable of paths from a list of directories or filesNurlpathsr`   ry   c           
   #     K   t        |t              s|g}|D ]  }t        ||      r t        |      j	                  d      r+| 0t        ||      rt        ||      D ]  \  }}}t        |D cg c]  }|j	                  d      r| c}      |d d  t        |      j	                  d      rPt        |      D ]"  }|j	                  d      rt        ||       $  t        |       y c c}w w)Nrb   rR  )
r   listr|   rq   r   r   r   sortedrV   r   )	rf  rs  r`   r_   r   r   r   rZ   r`  s	            rB   _iter_from_urlpathsz!FilesIterable._iter_from_urlpaths  s      (D) zH 	1Gw@W%00=A49'Sb4c 70GXy #))qgQXQcQcdoQp')q"rHQK )44[A $*9$5 7#..{;$#GX66	77 (00+	1 *rs   A0C72C2	C2A*C7c                 *     | | j                   ||      S r>   )rw  )rf  rs  r`   s      rB   from_urlpathszFilesIterable.from_urlpaths  s    3**HoFFrJ   r>   )rF   rG   rH   r  rp  r   rM   r   r   r   r   rw  ry  rI   rJ   rB   rr  rr    s{    BZ^1S$s)^,1?G?W1	3d?	#1 18 Gh~6N GZi G GrJ   rr  c            
           e Zd ZdZdZ	 	 	 	 ddee   dee   dee   dee   fdZe	d	        Z
d
 ZdedefdZd ZdedefdZd Zdeeej$                  f   dee   fdZdeeee   f   dee   fdZy)StreamingDownloadManagera  
    Download manager that uses the "::" separator to navigate through (possibly remote) compressed archives.
    Contrary to the regular `DownloadManager`, the `download` and `extract` methods don't actually download nor extract
    data, but they rather return the path or url that could be opened using the `xopen` function which extends the
    built-in `open` function to stream data from remote files.
    TNdataset_namedata_dirr`   	base_pathc                     || _         || _        |xs t        j                  j	                  d      | _        |xs
 t               | _        y )Nr-   )_dataset_name	_data_dirrO   rP   abspath
_base_pathr   r`   )r  r|  r}  r`   r~  s        rB   rK  z!StreamingDownloadManager.__init__  s<     *!#;rwws';.B.2BrJ   c                     | j                   S r>   )r  r  s    rB   
manual_dirz#StreamingDownloadManager.manual_dir  s    ~~rJ   c                 6    t        | j                  |d      }|S )aU  Normalize URL(s) of files to stream data from.
        This is the lazy version of `DownloadManager.download` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input url_or_urls.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        T	map_tuple)r   	_downloadr  url_or_urlss     rB   downloadz!StreamingDownloadManager.download  s    " !MrJ   r_   ry   c                 ^    t        |      }t        |      rt        | j                  |      }|S r>   )rM   r   r   r  )r  r_   s     rB   r  z"StreamingDownloadManager._download  s(    g,G$&t@GrJ   c                 6    t        | j                  |d      }|S )a  Add extraction protocol for given url(s) for streaming.

        This is the lazy version of `DownloadManager.extract` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        Tr  )r   _extract)r  r  rs  s      rB   extractz StreamingDownloadManager.extract  s    & dmm[DIrJ   c                    t        |      }t        || j                        }|j                  d      d   }t	        |      }|dv s|j                  d      rt        d| d      ||S |t        v rUt        j                  j                  |j                  d      d         }d|v r|d |j                  d       n|}| d	| d| S | d
| S )Nrb   rL   r   r   r   z+Extraction protocol for TAR archives like 'z' is not implemented in streaming mode. Please use `dl_manager.iter_archive` instead.

Example usage:

	url = dl_manager.download(url)
	tar_archive_iterator = dl_manager.iter_archive(url)

	for filename, file in tar_archive_iterator:
		...r-   r   z://::)rM   r   r`   rN   r   r\   r   !SINGLE_FILE_COMPRESSION_PROTOCOLSrO   rP   rp   rindex)r  r_   r   rP   r   
inner_files         rB   r  z!StreamingDownloadManager._extract  s    g,+GTEYEYZ}}T"1%'-	&$--8Z*[%=gY G   N::))'--*=a*@AJAD
AR$<j&7&7&<=XbJZs:,b	::ZuWI..rJ   c                 B    | j                  | j                  |            S )a0  Prepare given `url_or_urls` for streaming (add extraction protocol).

        This is the lazy version of `DownloadManager.download_and_extract` for streaming.

        Is equivalent to:

        ```
        urls = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) to stream from data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.
        )r  r  r  s     rB   download_and_extractz-StreamingDownloadManager.download_and_extract  s    $ ||DMM+677rJ   rm  c                     t        |d      rt        j                  |      S t        j                  || j                        S )aN  Iterate over files within an archive.

        Args:
            urlpath_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        r   rb   )r$  rO  rk  rn  r`   )r  rm  s     rB   iter_archivez%StreamingDownloadManager.iter_archive)  s<    ( >6*"++N;;"//PTPdPd/eerJ   rs  c                 D    t         j                  || j                        S )a  Iterate over files.

        Args:
            urlpaths (`str` or `list` of `str`):
                Root paths.

        Yields:
            str: File URL path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        rb   )rr  ry  r`   )r  rs  s     rB   
iter_filesz#StreamingDownloadManager.iter_filesB  s     " **8TEYEY*ZZrJ   )NNNN)rF   rG   rH   r  is_streamingr   rM   r   rK  r  r  r  r  r  r  r  r   r   BufferedReaderr   r   r  r   r  rI   rJ   rB   r{  r{    s     L '+"&48#'
Csm
C 3-
C ".1	
C
 C=
C  (  ,/ / /68(f5b6G6G1G+H fXV[_ f2[5d3i#8 [Xc] [rJ   r{  r>   )r)NN)jr   r   rO   rR   rerT  r   xml.dom.minidomr,   r]  asyncior   r   	itertoolsr   pathlibr   r   typingr   r	   r
   r   r   r   r   r   r   	xml.etreer   r=  rh   aiohttp.client_exceptionsr   r   r   filesystemsr   utils.file_utilsr   r   r   r   r   r   utils.loggingr   utils.py_utilsr   r`   r   rF   r   r   r   lstripr   r   r  compiler   bytesfromhexr   r   maxr   	ExceptionrE   rV   r^   rM   rn   rq   rt   rx   boolr|   intr   r   r   r   r   r   r   rg   r   r   r   r   r   r  r   re   r&  r+  r2  r5  r:  r@  rE  rG  rO  rr  r{  )fs_classs   0rB   <module>r     si    	 	  	         ' Y Y Y '  1  1  ' ' + 
H	 %I`aXx  %x'8'88a% 
5	% ! H_$_8X%6%6$_ !&0bjj&= # 
MM*u	MM*u	MM*u	MM(U	MM&6	MM.!4	MM*u	MM*v	( $ U4 0  BDtu  		 	"6>#S #8N+C #(%4I6'6#8N#; #t #&H^$<  0$(>": $d $,t"&.c c `Xc] `$c H^<T `hil`m 6 ?C	8	8#+N#;	8
3S$sCx.())*	8 ?C6$6$#+N#;6$
3S$sCx.())*6$r*RV * *x7O *ZB3 B.)A BTRUY B0 !&SW k8P k:ZH^$< Z6EwDL EwPK%T5() K W[ l8N;S l X\ jH^<T j	g(>:R 	gH^<T $gh~6N g.H^4L .$6h~>V 6$	=X 	=<L, <L~"G* "GJd[ d[s b %`s   $-L'L,