
    *#h}V                     2   d Z ddlZddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z,  e$e-      Z.g dZ/e0jc                  d      de0jc                  d      de0jc                  d      de0jc                  d      de0jc                  d      de0jc                  d      de0jc                  d      de0jc                  d      diZ2dd iZ3 e4d!  ee2e3      D              Z5 G d" d#ejl                        Z7 G d$ d%e      Z8d&e9d'e9fd(Z:d'ee9   fd)Z;d&e9d'ee9   fd*Z< G d+ d,e      Z= G d- d.e=      Z> G d/ d0e=      Z? G d1 d2      Z@y)3zDownload manager interface.    N)datetime)partial)chain)CallableDict	GeneratorIterableListOptionalTupleUnion   )config)DeprecatedEnum
deprecated)cached_pathget_from_cachehash_url_to_filenameis_relative_pathurl_or_path_join)get_size_checksum_dict)
get_loggeris_progress_bar_enabledtqdm)NestedDataStructure
map_nestedsize_str   )DownloadConfig)txtcsvjsonjsonltsvconllconlluorigparquetpklpicklerelxml504B0304zip504B0506504B0708425A68bz21F8BgzipFD377A585A00xz04224D18lz428B52FFDzstds   Rar!rarc              #   2   K   | ]  }t        |        y wN)len).0magic_numbers     _/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/download/download_manager.py	<genexpr>rB   F   s       s   c                       e Zd ZdZdZdZdZy)DownloadModea)  `Enum` for how to treat pre-existing downloads and data.

    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    reuse_dataset_if_existsreuse_cache_if_existsforce_redownloadN)__name__
__module____qualname____doc__REUSE_DATASET_IF_EXISTSREUSE_CACHE_IF_EXISTSFORCE_REDOWNLOAD     rA   rD   rD   L   s     83)rP   rD   c                   (    e Zd ZdZdZdZed        Zy)GenerateModerE   rF   rG   c                      y)NzUse 'DownloadMode' instead.rO   selfs    rA   help_messagezGenerateMode.help_messagef   s    ,rP   N)rH   rI   rJ   rL   rM   rN   propertyrV   rO   rP   rA   rR   rR   a   s&    73)- -rP   rR   pathreturnc                 d    | j                  d      d   }dD ]  }|j                  |      d   } |S )N.z?-_r   )split)rX   	extensionsymbs      rA   _get_path_extensionr`   k   s>    

3#I  -OOD)!,	-rP   c                    	 | j                  d       | j	                  t
              }| j                  d       t        t
              D ]W  }t        j                  |dt
        |z
         }||c S t        j                  |dt
        |z
         }|Jt        d| d       y# t        t        j                  f$ r Y yw xY w)zQread the magic number from a file-like object and return the compression protocolr   NzCompression protocol 'z' not implemented.)seekAttributeErrorioUnsupportedOperationreadMAGIC_NUMBER_MAX_LENGTHrange$MAGIC_NUMBER_TO_COMPRESSION_PROTOCOLget0MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOLNotImplementedError)fr@   icompressions       rA   *_get_extraction_protocol_with_magic_numberrp   u   s    	q	 6612LFF1I*+ `:>>|LiNehiNi?jk"FJJ<XuZqtuZuKvw"%(>{mK]&^__`	 B334 s   B# #B?>B?c                     t        |       } t        |       }|t        v s|dv s| j                  d      ry t	        | d      5 }t        |      cd d d        S # 1 sw Y   y xY w)N)tgztar)z.tar.gzz.tar.bz2z.tar.xzrb)strr`   BASE_KNOWN_EXTENSIONSendswithopenrp   )rX   r^   rm   s      rA   _get_extraction_protocolry      sa    t9D#D)I 	**&==;<	dD	 =Q9!<= = =s   AAc                   "    e Zd ZdZdefdZd Zy)_IterableFromGeneratorzkUtility class to create an iterable from a generator function, in order to reset the generator when needed.	generatorc                 .    || _         || _        || _        y r=   r|   argskwargs)rU   r|   r   r   s       rA   __init__z_IterableFromGenerator.__init__   s    "	rP   c              #   l   K    | j                   | j                  i | j                  E d {    y 7 wr=   r~   rT   s    rA   __iter__z_IterableFromGenerator.__iter__   s'     !4>>499<<<<s   *424N)rH   rI   rJ   rK   r   r   r   rO   rP   rA   r{   r{      s    u( 
=rP   r{   c                       e Zd ZdZed        Zed        Zedee	ddf   fd       Z
ededee	ddf   fd       Zedd	       Zedd
       Zy)ArchiveIterablezIAn iterable of (path, fileobj) from a TAR archive, used by `iter_archive`c              #   $  K   t        j                  | d      }|D ]o  }|j                  }|j                         s |#t        j
                  j                  |      j                  d      rR|j                  |      }||f g |_	        q ~y w)Nzr|*)fileobjmoder[   __)
tarfilerx   nameisregosrX   basename
startswithextractfilemembers)rm   streamtarinfo	file_pathfile_objs        rA   	_iter_tarzArchiveIterable._iter_tar   s     ae4 	 GI==? ww	*55kB))'2HX%%FN	  s   BBc              #   ,  K   t        j                  |       }|j                         D ]h  }|j                  }|j	                         r |#t
        j                  j                  |      j                  d      rR|j                  |      }||f j y wNr   )
zipfileZipFileinfolistfilenameis_dirr   rX   r   r   rx   )rm   zipfmemberr   r   s        rA   	_iter_zipzArchiveIterable._iter_zip   s     q!mmo 
	&FI}} ww	*55kByy(HX%%
	&s   BBrY   Nc              #      K   t        |      }|dk(  r| j                  |      E d {    y | j                  |      E d {    y 7 7 w)Nr.   )rp   r   r   )clsrm   ro   s      rA   _iter_from_fileobjz"ArchiveIterable._iter_from_fileobj   sE     @C%}}Q'''}}Q''' ('s!   %AAAA	A	Aurlpathc              #      K   t        |      }t        |d      5 }|dk(  r| j                  |      E d {    n| j                  |      E d {    d d d        y 7 '7 # 1 sw Y   y xY ww)Nrt   r.   )ry   rx   r   r   )r   r   ro   rm   s       rA   _iter_from_pathzArchiveIterable._iter_from_path   si     .w7'4  	,Ae#==+++==+++		, 	,++		, 	,s>   A,A AA AA 	A,A A  A)%A,c                 (     | | j                   |      S r=   )r   )r   r   s     rA   from_bufzArchiveIterable.from_buf   s    3))733rP   c                 (     | | j                   |      S r=   )r   )r   urlpath_or_bufs     rA   	from_pathzArchiveIterable.from_path   s    3&&77rP   )rY   r   )rH   rI   rJ   rK   staticmethodr   r   classmethodr   r   r   ru   r   r   r   rO   rP   rA   r   r      s    S   & & (itT0A&B ( ( ,c ,itT8I.J , , 4 4 8 8rP   r   c                   X    e Zd ZdZedeeee   f   deeddf   fd       Z	edd       Z
y)FilesIterablez8An iterable of paths from a list of directories or filesurlpathsrY   Nc           
   #   r  K   t        |t              s|g}|D ]  }t        j                  j	                  |      r4t        j                  j                  |      j                  d      rR| Wt        j                  |      D ]  \  }}}t        |D cg c]  }|j                  d      r| c}      |d d  t        j                  j                  |      j                  d      rdt        |      D ]6  }|j                  d      rt        j                  j                  ||       8   y c c}w wr   )

isinstancelistr   rX   isfiler   r   walksortedjoin)r   r   r   dirpathdirnames	filenamesdirnamer   s           rA   _iter_from_pathszFilesIterable._iter_from_paths   s    (D) zH 	>Gww~~g&77##G,77D46GGG4D >0GXy #))qgQXQcQcdoQp')q"rHQKww''0;;KH $*9$5 >#..{;$ ggll7H==	>>	> *rs   BD7D2+D2/BD7c                 (     | | j                   |      S r=   )r   )r   r   s     rA   
from_pathszFilesIterable.from_paths   s    3''22rP   )rY   r   )rH   rI   rJ   rK   r   r   ru   r
   r   r   r   rO   rP   rA   r   r      sS    B>c49n(= >)CQUW[OB\ > >. 3 3rP   r   c            
       (   e Zd ZdZ	 	 	 	 	 ddee   dee   dee   dee   fdZed        Z	ed	        Z
ed
        ZdedefdZ ed      d        Zd ZdededefdZdeeej*                  f   fdZdeeee   f   fdZddZd Zd Zd Zd Zy)DownloadManagerFNdataset_namedata_dirdownload_config	base_pathc                     || _         || _        |xs t        j                  j	                  d      | _        i | _        || _        |xs
 t               | _	        i | _
        i | _        y)a4  Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of dataset this instance will be used for. If
                provided, downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other
                download options
            base_path (`str`):
                base path that is used when relative paths are used to
                download files. This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        r[   N)_dataset_name	_data_dirr   rX   abspath
_base_path_recorded_sizes_checksumsrecord_checksumsr   r   downloaded_pathsextracted_paths)rU   r   r   r   r   r   s         rA   r   zDownloadManager.__init__  s\    2 *!#;rwws';Z\& 0.B.2B "!rP   c                     | j                   S r=   )r   rT   s    rA   
manual_dirzDownloadManager.manual_dir$  s    ~~rP   c                 V    t        d | j                  j                         D              S )z+Returns the total size of downloaded files.c              3   &   K   | ]	  }|d      yw)	num_bytesNrO   )r?   checksums_dicts     rA   rB   z2DownloadManager.downloaded_size.<locals>.<genexpr>+  s     m>>+.ms   )sumr   valuesrT   s    rA   downloaded_sizezDownloadManager.downloaded_size(  s$     mTEcEcEjEjElmmmrP   c                     ddl m |j                  j                         j	                  d      t        d      fdt        fd| t                      }|S )a  Ship the files using Beam FileSystems to the pipeline temp dir.

        Args:
            downloaded_path_or_paths (`str` or `list[str]` or `dict[str, str]`):
                Nested structure containing the
                downloaded path(s).
            pipeline ([`utils.beam_utils.BeamPipeline`]):
                Apache Beam Pipeline.

        Returns:
            `str` or `list[str]` or `dict[str, str]`
        r   )upload_local_to_remotetemp_locationzFYou need to specify 'temp_location' in PipelineOptions to upload filesc           
      (   t        j                  t        j                  t        j
                  j                  |             }t        j                  d|  dt        t        j
                  j                  |              d| d        | |       |S )Nz
Uploading z (z) to r[   )	posixpathr   r   DOWNLOADED_DATASETS_DIRr   rX   r   loggerinfor   getsize)local_file_pathremote_file_path
remote_dirr   s     rA   uploadz8DownloadManager.ship_files_with_pipeline.<locals>.uploadA  s    (~~F::BGG<L<L_<]  KK_-R9Y0Z/[[`aq`rrst #?4DE##rP   c                      |       S r=   rO   )r   r   s    rA   <lambda>z:DownloadManager.ship_files_with_pipeline.<locals>.<lambda>L  s    F?$; rP   disable_tqdm)utils.beam_utilsr   _optionsget_all_optionsrj   
ValueErrorr   r   )downloaded_path_or_pathspipelineuploaded_path_or_pathsr   r   r   s      @@@rA   ship_files_with_pipelinez(DownloadManager.ship_files_with_pipeline-  sb     	>&&668<<_M
eff	$ ",;$466"

 &%rP   url_or_urlsr   c           	          d}t        t        t        |j                         |j                                     |dt	                      D ]2  \  }}t        || j                        | j                  t        |      <   4 y)z)Record size/checksum of downloaded files.   zComputing checksums)delaydescdisable)record_checksumN)	r   r   r.   flattenr   r   r   r   ru   )rU   r   r   r   urlrX   s         rA   _record_sizes_checksumsz'DownloadManager._record_sizes_checksumsR  sw    [((*,D,L,L,NOP&/11	
 		IC 8Nd&;&;8D**3s84		rP   zCUse `.download`/`.download_and_extract` with `fsspec` URLs instead.c                 ,  	 | j                   j                  xs t        j                  	| j                   j                  }	fd}t        ||t                      }t        |      }t        |      }t        |j                         |j                               D ]J  \  }}	 t        |	dd|       d}|r| j                   j                  s2 |||       t        |	dd|       L | j                  ||       |j                  S # t        $ r d}Y _w xY w)a  
        Download given urls(s) by calling `custom_download`.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.
            custom_download (`Callable[src_url, dst_path]`):
                The source URL and destination path. For example
                `tf.io.gfile.copy`, that lets you download from  Google storage.

        Returns:
            downloaded_path(s): `str`, The downloaded paths matching the given input
                `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download_custom('s3://my-bucket/data.zip', custom_download_for_my_private_bucket)
        ```
        c                 V    t         j                  j                  t        |             S r=   )r   rX   r   r   )r   	cache_dirs    rA   url_to_downloaded_pathz?DownloadManager.download_custom.<locals>.url_to_downloaded_pathy  s    77<<	+?+DEErP   r   TF)r   local_files_onlyuse_etagmax_retries)r   r   r   DOWNLOADED_DATASETS_PATHr  r   r   r   r.   r   r   FileNotFoundErrorforce_downloadr   data)
rU   r   custom_downloadr  r   r   r   rX   cachedr   s
            @rA   download_customzDownloadManager.download_custom`  s   , ((22Uf6U6U	**66	F $."KBYB[>[$
  *+6#67O#P [0024L4T4T4VW 	IC9teal  T11@@T*9teal	 	$$[2JK',,, % s   !DDDc           	      0   | j                   j                         }d|_        |j                  d|_        t	        | j
                  |      }t        j                         }t        ||d|j                  t                d      }t        j                         |z
  }t        j                  d|j                         dz   d	       t        |      }t        |      }| j                  j!                  t#        t%        |j'                         |j'                                            t        j                         }| j)                  ||       t        j                         |z
  }t        j                  d
|j                         dz   d	       |j*                  S )ay  Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        FDownloading datar   TzDownloading data files)	map_tuplenum_procr   r   zDownloading took <   z minzChecksum Computation took )r   copyextract_compressed_filedownload_descr   	_downloadr   nowr   r  r   r   r   total_secondsr   r   updatedictr.   r   r   r  )rU   r   r   download_func
start_timer   durations          rA   downloadzDownloadManager.download  sV   & ..33527/((0,>O)P\\^
#-$--466)$
  <<>J.'(>(>(@B(F'GtLM)+6#67O#P $$T#k.A.A.CE]EeEeEg*h%ij\\^
$$[2JK<<>J.01G1G1IR1O0PPTUV',,,rP   url_or_filenamerY   c                 t    t        |      }t        |      rt        | j                  |      }t	        ||      S )Nr  )ru   r   r   r   r   )rU   r  r   s      rA   r  zDownloadManager._download  s1    o.O,.tPO?OLLrP   path_or_bufc                 n    t        |d      rt        j                  |      S t        j                  |      S )aK  Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        rf   )hasattrr   r   r   )rU   r  s     rA   iter_archivezDownloadManager.iter_archive  s0    ( ;'"++K88",,[99rP   pathsc                 ,    t         j                  |      S )a  Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        )r   r   )rU   r!  s     rA   
iter_fileszDownloadManager.iter_files  s    " ''..rP   c           	         |dk7  rt        j                  dt               | j                  j	                         }d|_        |j                  d|_        t        t        t        |      ||j                  t                d      }t        |      }t        |      }| j                  j                  t        t!        |j#                         |j#                                            |j$                  S )ak  Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.
            num_proc (`int`):
                Use multi-processing if `num_proc` > 1 and the length of
                `path_or_paths` is larger than `num_proc`.

                <Deprecated version="2.6.2">

                Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.

                </Deprecated>

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        r   z'num_proc' was deprecated in version 2.6.2 and will be removed in 3.0.0. Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.Tr
  r  zExtracting data files)r  r   r   )warningswarnFutureWarningr   r  r  r  r   r   r   r  r   r   r   r  r  r.   r   r  )rU   path_or_pathsr  r   r   s        rA   extractzDownloadManager.extract  s    6 |#MM b ..33526/((0,>O)$KA$--466(
 ,M:-o>##D]-B-B-DoF]F]F_)`$ab###rP   c                 B    | j                  | j                  |            S )a  Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        )r)  r  )rU   r   s     rA   download_and_extractz$DownloadManager.download_and_extract%  s      ||DMM+677rP   c                 6    | j                   j                         S r=   )r   r  rT   s    rA   get_recorded_sizes_checksumsz,DownloadManager.get_recorded_sizes_checksums7  s    --2244rP   c                 v   t        | j                  j                               t        | j                  j                               z
  }t	        | j                  j                               D ]L  \  }}||v st        j                  j                  |      s+t        j                  |       | j                  |= N y r=   )
setr   r   r   r   itemsr   rX   r   remove)rU   paths_to_deletekeyrX   s       rA   delete_extracted_filesz&DownloadManager.delete_extracted_files:  s    d2299;<s4CXCXC_C_Ca?bbd2288:; 	.IC&277>>$+?		$((-	.rP   c                 R    | j                   j                  r| j                          y y r=   )r   delete_extractedr4  rT   s    rA   manage_extracted_filesz&DownloadManager.manage_extracted_filesA  s"    00'') 1rP   )NNNNT)r   )rH   rI   rJ   is_streamingr   ru   r   r   rW   r   r   r   r   r   r   r   r  r  r  r   rd   BufferedReaderr   r
   r#  r)  r+  r-  r4  r7  rO   rP   rA   r   r      s9   L '+"&48#'!"sm!" 3-!" ".1	!"
 C=!"F   n n "& "&H3F bu  UV.- W.-`.-`M M~ MRU M:c23D3D.D(E :2/c49n 5 /&/$b8$5.*rP   r   )ArK   enumrd   r   r   r   r%  r   r   	functoolsr   	itertoolsr   typingr   r   r   r	   r
   r   r   r    r   utils.deprecation_utilsr   r   utils.file_utilsr   r   r   r   r   utils.info_utilsr   utils.loggingr   r   r   utils.py_utilsr   r   r   r   r   rH   r   rv   bytesfromhexri   rk   maxrg   EnumrD   rR   ru   r`   rp   ry   r{   r   r   r   rO   rP   rA   <module>rH     s    "  	 	        T T T  @ t t 5 E E F F + 
H	   
MM*u	MM*u	MM*u	MM(U	MM&6	MM.!4	MM*u	MM*v	( $ U4 0  BDtu  *499 **-> -c c `Xc] `$=3 =8C= =	=X 	=:8, :8z3* 3@E* E*rP   