
    *#hl                     z   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,  e-e$j\                        Z/ e&j`                  e1      Z2 G d de-      Z3 G d de4      Z5dZ6e$j\                  ddge$jn                  g de$jp                  g diZ9dZ:ejv                   ejx                  d      k  rddgZ=ndd gZ=e$j\                  e$jn                  e$jp                  gZ>e>D  ci c]0  } | e9|    D cg c]  }e=D ]  }|j                  |e:!        c}}2 c}}} Z@e$j\                  d"giZAe6gZBe@eAgZCejv                   ejx                  d      k  rg d#ZDnd$d%gZDd&ZEg d'ZFd(e-d)eGfd*ZHd+eeee-f   d)ee-eee-   d,f   f   fd-ZId.e-d(e-d)eGfd/ZJd.e-d(e-d)eGfd0ZK	 dAd1e
e-gee-   f   d2e-d)ee-ee-   f   fd3ZLd1e
e-gee-   f   d)ee-   fd4ZM	 	 dBd(e-d2e-d5eee-      d6ee   d)ee-   f
d7ZNdCd2e-d6ee   d)ee-ee-   f   fd8ZO	 dCd2e-d6ee   d)ee-   fd9ZP	 dCd:e-d6ee   d)ee-   fd;ZQ	 	 dDd<ee-   d6ee   d)ee-   fd=ZR G d> d,ee-         ZS G d? d@ee-eSf         ZTyc c}}w c c}}} w )E    N)partial)	has_magic)PathPurePath)CallableDictListOptionalSetTupleUnion)get_fs_token_paths)HTTPFileSystem)HfFileSystem)version)
thread_map   )config)DownloadConfig)!_prepare_path_and_storage_options	xbasenamexjoin)Split)logging)is_local_pathis_relative_path)glob_pattern_to_regexstring_to_dictc                       e Zd Zy)UrlN__name__
__module____qualname__     P/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/data_files.pyr    r           r&   r    c                       e Zd Zy)EmptyDatasetErrorNr!   r%   r&   r'   r*   r*   "   r(   r&   r*   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*traintraining)
validationvaliddevval)testtestingeval
evaluationz-._ 0-9z2023.9.0z{keyword}[{sep}/]**z**[{sep}/]{keyword}[{sep}/]**z**/*[{sep}/]{keyword}[{sep}/]**)keywordsepz**)zmetadata.csv**/metadata.csvzmetadata.jsonl**/metadata.jsonlr7   r8   z*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonpatternreturnc                 4     t         fdt        D              S )Nc              3   &   K   | ]  }|v  
 y wNr%   ).0wilcard_characterr9   s     r'   	<genexpr>z%contains_wildcards.<locals>.<genexpr>^   s     Y0A G+Ys   )anyWILDCARD_CHARACTERS)r9   s   `r'   contains_wildcardsrC   ]   s    YEXYYYr&   patternsDataFilesListc           	         t        | t              r@| j                         D ci c]$  \  }}t        |      t        |t              r|n|g& c}}S t        | t              r	t
        | giS t        | t              rt        d | D              r| D ]W  }t        |t              r8t        |      dk(  r*d|v r&t        |j                  d      t        t        f      rKt        d|        | D cg c]  }|d   	 }}t        t        |            t        |      k7  rt        d|       | D ci c]-  }t        |d         t        |d   t              r|d   n|d   g/ c}S t
        | iS t        t	        |             S c c}}w c c}w c c}w )a/  
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    c              3   <   K   | ]  }t        |t                y wr=   )
isinstancedict)r>   r9   s     r'   r@   z$sanitize_patterns.<locals>.<genexpr>o   s     AWz'4(As      splitpathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got z*Some splits are duplicated in data_files: )rH   rI   itemsstrlistSANITIZED_DEFAULT_SPLITrA   lenget
ValueErrorsetsanitize_patterns)rD   keyvaluer9   splitss        r'   rU   rU   a   s    (D!ZbZhZhZjkJCQVC:eT#:%Gkk	Hc	"'(44	Hd	#AAA# 	w-G)7*"7;;v#6dD$wx  xA  B 	 7??7gg&?F?3v;3v;. #MfX!VWW  ( GG$%*WV_VZ:[wvbijpbqarr 
 ,X66 h003 l @s   )E=,F,2Fmatched_rel_pathc                 <   t        |       j                  j                  D cg c]  }|j                  d      s| }}t        |      j                  j                  D cg c]  }|j                  d      s| }}t	        |      t	        |      k7  S c c}w c c}w )u  
    When a path matches a pattern, we additionnally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    __)r   parentparts
startswithrQ   )rY   r9   partdata_dirs_to_ignore_in_pathdata_dirs_to_ignore_in_patterns        r'   "_is_inside_unrequested_special_dirrb      s    8 5==M4N4U4U4[4["uD_c_n_nos_t4"u"u7?7H7O7O7U7U%otY]YhYhimYnd%o"%o*+s3Q/RRR #v%os   BB B7Bc                 T   t        |       j                  D cg c]&  }|j                  d      st        |      dhk(  r%|( }}t        |      j                  D cg c]&  }|j                  d      st        |      dhk(  r%|( }}t	        |      t	        |      k7  S c c}w c c}w )u:  
    When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    .)r   r]   r^   rT   rQ   )rY   r9   r_   hidden_directories_in_pathhidden_directories_in_patterns        r'   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dirrg      s    l ""2399"T__S=QZ]^bZchkglZl" " "'*00%DOOC4HQTUYQZ_b^cQc%! % )*c2O.PPP"%s!   B B B B%3B%B%pattern_resolver	base_pathc                     t         D ]  }|j                  dd      }	  | |      }t        |      dkD  s-|D ch c],  }t	        t        |      t        t        |                  d   . }}t        D cg c]  }||v st        |       c}t        |t        t              z
        z   }|D ci c]  }||j                  |      g c}c S  t        D ]e  }	g }
|	j                         D ]6  \  }}|D ],  }	  | |      }t        |      dkD  s|
j                  |        6 8 |
sQ|
D ci c]  }||	|   
 c}c S  t        d d|        # t        $ r Y Rw xY wc c}w c c}w c c}w # t        $ r Y w xY wc c}w )a+  
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    z{split}*r   rK   )rK   Couldn't resolve pattern  with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorrQ   r   r   r   DEFAULT_SPLITSrN   sortedrT   formatALL_DEFAULT_PATTERNSrM   append)rh   ri   split_patternr9   
data_filesprX   rK   sorted_splitspatterns_dictnon_empty_splitsrD   s               r'   _get_data_files_patternsr|      s    , [''	37	)'2J z?Q $  y|-B9]C[-\]^ef F   6DWEuPVSZWZ`^,,[ M MZZ5EM00u0=>>ZZ[  . O,224 	OE8# !1'!:J z?Q&$++E2	 =MNEE=//NNO 7yP`Oab
cc3 ! 		  X [ )  Os@   E1E3	E"=E"-E'5E,)E;	EE,	E8	7E8	c                     g }t         D ]*  }	  | |      }t        |      dkD  r|j                  |       , |r|S t        d d|        # t        $ r Y Mw xY w)zM
    Get the supported metadata patterns from a directory or repository.
    r   rl   rm   )METADATA_PATTERNSrQ   ru   rp   )rh   non_empty_patternsr9   metadata_filess       r'   _get_metadata_files_patternsr     s~     $ 	-g6N>"Q&"))'2	 !!
7yP`Oab
cc	 ! 		s   'A	AAallowed_extensionsdownload_configc                    t        |       rt        ||       } nAt        |       r4t        j                  j                  |       d   t        j                  z   }nd}t        | |      \  } }t        | |      \  }}}|j                  d      d   j                  d      d   xs |j                  }| j                  d      d   j                  d      d   }t        t              t        |       hz
  }	t        |j                  t               r|j                  n|j                  d   }
|
dk7  r|
dz   nd}|j#                  | d	
      j%                         D cg c]  \  }}|d   dk(  rt        |      |	vrt'        t        j                  j)                  ||      t        j                  j)                  ||            s`t+        t        j                  j)                  ||      t        j                  j)                  ||            s|j-                  |      r|n||z    }}}|D cg c]3  }t/        fdt        |      j                  d      dd D              r|5 }}t1        |      t1        |      k  r>t3        t        |      t        |      z
        }t4        j7                  d|  d|        n|}|s$d|  d}|dt3               z  }t9        |      |S c c}}w c c}w )a 	  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob.

    glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
    For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,
    resulting in **.json being equivalent to **/*.json.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicilty mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r   storage_optionsz::z://fileT)detailtypeNc              3   ,   K   | ]  }d |z   v   yw)rd   Nr%   )r>   suffixr   s     r'   r@   z"resolve_pattern.<locals>.<genexpr>o  s     g&3<#55gs   rd   r   z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )r   r   r   osrL   
splitdriver6   r   r   rK   root_markerrT   FILES_TO_IGNOREr   rH   protocolrN   globrM   rb   relpathrg   r^   rA   rQ   rO   loggerinforp   )r9   ri   r   r   r   fs_fs_base_path
fs_patternfiles_to_ignorer   protocol_prefixfilepathr   matched_pathsoutinvalid_matched_files	error_msgs     `               r'   resolve_patternr   !  s   b  	7+	w	GG&&w/2RVV;		@ZijG_!'?KHB1??4(+11%8<NLt$Q'--e4R8J/*i.@-AAO(c:r{{AH*2f*<h&"O !gggdg;AACHd<6!x 72GGOOHl3RWW__ZQ]5^
 PGGOOHl3RWW__ZQ]5^
 ''8oPX>XXM  % *
gIhDWD]D]^aDbcdceDfgg 
 

 s8c-(($(]);c#h)F$G!KK27);hi~h  A &wiq1	)9$?Q:R9STTI	**J=
s   CK'8K$c                 |    t        t        | |      }	 t        ||       S # t        $ r t	        d|  d      dw xY w)uh
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {"train": ["**"]}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    ri   r   )ri   The directory at z doesn't contain any data filesN)r   r   r|   rp   r*   ri   r   resolvers      r'   get_data_patternsr     sP    h )_]Hj'IFF j"3I;>] ^_eiijs   ! ;c                 x    t        t        | |      }	 t        |      S # t        $ r t        d|  d      dw xY w)zE
    Get the supported metadata patterns from a local directory.
    r   r   z" doesn't contain any metadata fileN)r   r   r   rp   r   s      r'   get_metadata_patternsr     sM     )_]Hm+H55 m"3I;>` abhllms   
 9	data_filec                 r   t        | |      \  } }t        | |      \  }}}t        |t              r)|j	                  |       }|j
                  |j                  fS t        |t              r| j                  t        j                        rt        t        j                  |j                        }d| t        t        j                        dz   d  j                  ddd      z   } |j	                  |       }|j
                  |j                  fS |j                  |       }dD ]  }||v st        ||         fc S  y	)
Nr   r   )endpointtokenzhf://r   z	/resolve/@)ETagetagmtimer%   )r   r   rH   r   resolve_pathrepo_idrevisionr   r^   r   HF_ENDPOINTr   rQ   ro   r   rN   )	r   r   r   r   r   resolved_pathhffsr   rV   s	            r'   _get_single_origin_metadatar     s!    "C9^m!nI!)_MHB1"l#	2%%}'='=>>	B	'I,@,@ASAS,TV%7%7?T?TUiF,>,>(?!(C(EFNN{\_abcc	)))4%%}'='=>>779D( %$;S	N$$% r&   rw   c           
          t        t        t        |      | |t        j                  dt        |       dk  xs t        j                                S )Nr   zResolving data files   )max_workers
tqdm_classdescdisable)r   r   r   r   tqdmrQ   is_progress_bar_enabled)rw   r   r   s      r'   _get_origin_metadatar     sI    
 +_M<<#J2%NW-L-L-N)N r&   c                   j    e Zd ZdZdee   deee      f fdZd Ze		 	 	 ddee   de
j                  j                  dee   d	eee      d
ee   dd fd       Ze		 	 	 ddee   dee   d	eee      d
ee   dd f
d       Ze		 	 	 ddee   dee   d	eee      d
ee   dd f
d       Zdee   dd fdZ xZS )rE   a  
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    rw   origin_metadatac                 2    t         |   |       || _        y r=   )super__init__r   )selfrw   r   	__class__s      r'   r   zDataFilesList.__init__   s    $.r&   c                 P    t        g | || j                  |j                  z         S r=   )rE   r   )r   others     r'   __add__zDataFilesList.__add__$  s(    _t_e_d.B.BUEZEZ.Z[[r&   rD   dataset_infori   r   r   r:   c                     d|j                    d|j                   d|xs d j                  d      }| j                  ||||      S )Nzhf://datasets/r   /r   ri   r   r   )idsharstripfrom_patterns)clsrD   r   ri   r   r   s         r'   from_hf_repozDataFilesList.from_hf_repo'  s]     %\__$5Q|7G7G6H)/WYIZ[bbcfg	  	>Pbq ! 
 	
r&   c                     ||n%t               j                         j                         }| j                  ||||      S Nr   )r   resolveas_posixr   )r   rD   ri   r   r   s        r'   from_local_or_remotez"DataFilesList.from_local_or_remote5  sE     "+!6IDFNN<L<U<U<W	  	>Pbq ! 
 	
r&   c           	         ||n%t               j                         j                         }g }|D ]!  }	 |j                  t	        ||||             # t        ||      } | ||      S # t
        $ r t        |      s Y Qw xY w)Nr   r   )r   r   r   extendr   rp   r   r   )r   rD   ri   r   r   rw   r9   r   s           r'   r   zDataFilesList.from_patternsB  s     "+!6IDFNN<L<U<U<W	
 	G!!#"++=(7		 /z?[://	 %  ) *s   A))B B
extensionsc                     dj                  d |D              }t        j                  d| d      }t        | D cg c]  }|j	                  |      s| c}| j
                        S c c}w )N|c              3   &   K   | ]	  }d |z     yw)\Nr%   )r>   exts     r'   r@   z2DataFilesList.filter_extensions.<locals>.<genexpr>]  s     <#4#:<s   z.*(z	)(\..+)?$)r   )joinrecompilerE   matchr   )r   r   r9   r   s       r'   filter_extensionszDataFilesList.filter_extensions\  s_    ((<<<**s7):67(,I9i0HYI 00
 	
Is   A)A)NNN)r"   r#   r$   __doc__r	   rN   r   r   r   classmethodhuggingface_hubhf_apiDatasetInfor
   r   r   r   r   r   __classcell__)r   s   @r'   rE   rE     s   "/49 /tE#J?O /\ 
 $(2648
s)
 &,,88
 C=	

 %T#Y/
 ".1
 

 
  $(2648

s)

 C=

 %T#Y/	


 ".1

 


 

  $(26480s)0 C=0 %T#Y/	0
 ".10 
0 02
DI 
/ 
r&   c                   r   e Zd ZdZe	 	 	 ddeeeee   e	f   f   de
e   de
ee      de
e   dd f
d       Ze	 	 	 ddeeeee   e	f   f   d	ej                  j                  de
e   de
ee      de
e   dd fd
       Ze	 	 	 ddeeeee   e	f   f   de
e   de
ee      de
e   dd f
d       Zdee   dd fdZy)DataFilesDicta  
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see ``DataFilesList``.

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    NrD   ri   r   r   r:   c                      |        }|j                         D ]3  \  }}t        |t              st        j                  ||||      n|||<   5 |S r   )rM   rH   rE   r   r   rD   ri   r   r   r   rV   patterns_for_keys           r'   r   z"DataFilesDict.from_local_or_remoteu  sj     e%-^^%5 
	!C! ""2MB 22$''9$3	 3  & H
	 
r&   r   c                      |        }|j                         D ]4  \  }}t        |t              st        j                  |||||      n|||<   6 |S )N)r   ri   r   r   )rM   rH   rE   r   )	r   rD   r   ri   r   r   r   rV   r   s	            r'   r   zDataFilesDict.from_hf_repo  sm     e%-^^%5 	!C! ""2MB **$!-''9$3 +  & H	 
r&   c                      |        }|j                         D ]3  \  }}t        |t              st        j                  ||||      n|||<   5 |S r   )rM   rH   rE   r   r   s           r'   r   zDataFilesDict.from_patterns  sj     e%-^^%5 
	!C! ""2MB ++$''9$3	 ,  & H
	 
r&   r   c                 ~     t        |              }| j                         D ]  \  }}|j                  |      ||<    |S r=   )r   rM   r   )r   r   r   rV   data_files_lists        r'   r   zDataFilesDict.filter_extensions  sD    d4jl$(JJL 	E C&88DCH	E
r&   r   )r"   r#   r$   r   r   r   rN   r   r	   rE   r
   r   r   r   r   r   r   r   r   r%   r&   r'   r   r   e  s     $(2648sE$s)]":;;< C= %T#Y/	
 ".1 
 * 
 $(2648sE$s)]":;;< &,,88 C=	
 %T#Y/ ".1 
 .  $(2648sE$s)]":;;< C= %T#Y/	
 ".1 
 *DI / r&   r   )r   )NNr=   )@   N)Ur   r   	functoolsr   r   r   pathlibr   r   typingr   r   r	   r
   r   r   r   r   fsspecr   fsspec.implementations.httpr   r   	packagingr   tqdm.contrib.concurrentr   r   r   downloadr   #download.streaming_download_managerr   r   r   rX   r   utilsr   utils.file_utilsr   r   utils.py_utilsr   r   rN   TRAINrP   
get_loggerr"   r   r    rp   r*   SPLIT_PATTERN_SHARDED
VALIDATIONTESTSPLIT_KEYWORDSNON_WORDS_CHARSFSSPEC_VERSIONparse#KEYWORDS_IN_PATH_NAME_BASE_PATTERNSrq   rs   #DEFAULT_PATTERNS_SPLIT_IN_PATH_NAMEDEFAULT_PATTERNS_ALLrn   rt   r~   rB   r   boolrC   rU   rb   rg   r|   r   r   r   r   r   r   rE   r   )rK   r5   r9   s   000r'   <module>r     s=   	 	   " D D D  % 6 (  .  $ d d   = A ekk*  
		H	%	# 		) 	 a  
KK':&	;	JJ9
 	=7==44+@Ba*b'+@Bc*d'++u//<  ' ' 	 
%e,:  	wO<< ' # 
KK$  ,, '  
=7==44 	  Z Z Z#1dD#o 6 #14U4PS9VeKeEf@f;g #1LS Ss St SB;QVY ;Qdg ;Qlp ;Q~ DF'dud3i/0'd=@'d	#tCy.'dTd8SE49<L3M dRVWZR[ d( /304	\\\ !c+\ n-	\
 
#Y\~Xj Xjx7O Xj[_`ceijmen`n[o Xjz 15mmn-m 
#Ym  15n- 3Z0 04S	 n- 3Z	T
DI T
nXDm+, Xa's   >J6!J0-J60J6