
    *#hBZ                     2   d dl Z d dlZd dlZd dlZd dlZd dlmc mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZm Z  d dl!m"Z"  ee#      Z$dZ%d Z& G d de      Z' ed       G d de             Z(y)    N)ArgumentParser)Path)Optional)config)BaseDatasetsCLICommand)DownloadConfig)DownloadManager)MockDownloadManager)dataset_module_factoryimport_main_class)
deprecated)
get_loggerset_verbosity_warning)
map_nestedzutf-8c                     t        | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  	      S N)
DummyDataCommandpath_to_datasetauto_generaten_lines
json_fieldxml_tagmatch_text_fileskeep_uncompressed	cache_direncoding)argss    Y/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/commands/dummy_data.pydummy_data_command_factoryr      sV    
 
    c                        e Zd Z fdZ fdZ fdZ	 	 	 	 	 ddedee   dee   dee   d	ee   d
e	fdZ
	 	 	 	 ddedededee   dee   dee   d	ee   d
efdZedefd       Zd Z xZS )!DummyDataGeneratorDownloadManagerc                 N    t        |   |i | || _        g | _        g | _        y r   )super__init__mock_download_managerdownloaded_dummy_pathsexpected_dummy_paths)selfr&   r   kwargs	__class__s       r   r%   z*DummyDataGeneratorDownloadManager.__init__*   s-    $)&)%:"&(#$&!r    c                     t         |   |      }| j                  j                  |      }t        | j                  j
                  |d       t        | j                  j
                  |d       |S NT)	map_tuple)r$   downloadr&   r   r'   appendr(   r)   url_or_urlsoutputdummy_outputr+   s       r   r/   z*DummyDataGeneratorDownloadManager.download0   s]    !+.11::;G4..55vN4,,33\TRr    c                     t         |   t         | 	  |            }| j                  j                  |      }t	        | j
                  j                  |d       t	        | j                  j                  |d       |S r-   )r$   extractr/   r&   r   r'   r0   r(   r1   s       r   download_and_extractz6DummyDataGeneratorDownloadManager.download_and_extract7   sf    !1+!>?11::;G4..55vN4,,33\TRr       r   r   r   r   r   returnc                    t        j                  t         j                  j                  | j                  j
                  | j                  j                  | j                  j                  d      d       d}d| j                  _        t        | j                  | j                        D ]~  \  }}t         j                  j                  | j                  j
                  | j                  j                  | j                  j                  |      }	|| j                  ||	|||||      z  } |dk(  rt        j                  d       |dkD  S )N
dummy_dataTexist_okr   Fr   r   r   r   r   zDummy data generation failed: no dummy files were created. Make sure the data files format is supported by the auto-generation.)osmakedirspathjoinr&   datasets_scripts_dirdataset_namedummy_data_folderload_existing_dummy_datazipr'   r(   _create_dummy_dataloggererror)
r)   r   r   r   r   r   totalsrc_pathrelative_dst_pathdst_paths
             r   auto_generate_dummy_data_folderzADummyDataGeneratorDownloadManager.auto_generate_dummy_data_folder>   s.    	GGLL**??**77**<<	 	
 >C"";+.t/J/JDLeLe+f 	'H'ww||**??**77**<<!	H T,,%!1! -  E	  A:LLW qyr    rL   rN   c                    |xs t         }t        j                  j                  |      rt        j                  d|        t        |      j                  g d}t        fd|D              }	|Nt        j                  j                  |      }
|j                  d      D ]  }|	t        j                  |
|      z  }	 |	rt        |      j                  j                  dd       t        ||      5 }t        |d|      5 }g }t        |      D ]  \  }}||k\  r n|j!                  |        |j#                  d	j%                  |      j'                                d d d        d d d        y
dv rt        ||      5 }t)        j*                  |      }|||   }t-        |t.              rmt1        d |j3                         D              s&t5        dt7        |j9                                d      |j;                         D ci c]  \  }}||d |  }}}n|d | }|||i}t        |      j                  j                  dd       t        |d|      5 }t)        j<                  ||       d d d        d d d        y
t        fddD              r/|t        j?                  d       y
| jA                  |||||       y
t        j?                  d| d       yt        j                  jC                  |      rd}t        jD                  |      D ]  \  }}}|D ]  }|jG                  d      rt        j                  j%                  ||      }t        j                  j%                  |t        |      jI                  |            }|| jK                  |||||||      z  }  |S y # 1 sw Y   QxY w# 1 sw Y   y
xY wc c}}w # 1 sw Y   ]xY w# 1 sw Y   y
xY w)Nz#Trying to generate dummy data file )z.txtz.csvz.jsonlz.tsvc              3   &   K   | ]  }|v  
 y wr    .0	extensiondst_path_extensionss     r   	<genexpr>zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>w   s     +vQZI9L,L+v   ,Tr=   parentsr   w    z.jsonc              3   <   K   | ]  }t        |t                y wr   )
isinstancelist)rT   vs     r   rW   zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>   s     "S1:a#6"Ss   zCouldn't parse columns z\. Maybe specify which json field must be used to read the data with --json_field <my_field>.c              3   &   K   | ]  }|v  
 y wr   rR   rS   s     r   rW   zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>   s     X)Y"55XrX   )z.xmlz.txmzEFound xml file but 'xml_tag' is set to None. Please provide --xml_tag)r   r   zCouldn't generate dummy file 'z9'. Ignore that if this file is not useful for dummy data.r   .r>   )&DEFAULT_ENCODINGr?   rA   isfilerI   debugr   suffixesanybasenamesplitfnmatchparentmkdiropen	enumerater0   writerB   stripjsonloadra   dictallvalues
ValueErrorrb   keysitemsdumpwarning_create_xml_dummy_dataisdirwalk
startswithrelative_torH   )r)   rL   rN   r   r   r   r   r   line_by_line_extensionsis_line_by_line_text_file	file_namepatternsrc_filedst_filefirst_linesiline	json_datakrc   first_json_datarK   rA   _filesnamesrc_file_pathdst_file_pathrV   s                               @r   rH   z4DummyDataGeneratorDownloadManager._create_dummy_datah   s    //77>>(#LL>xjIJ"&x."9"9&H#(++v^u+v(v%+GG,,X6	/55c: UG-G1TT-U )X%%++T4+H(X6 E(hh? E8&('0': 5GAt G| %'..t45 !rww{';'A'A'CDEE //(X6 =( $		( 3I!-$-j$9	!)T2""S	@P@P@R"SS","9$y~~?O:P9Q RQ !Q# 
 GPooFW*Xda1ak>*X*X*3HW*=!-+5*GN))//t/Lhh? =8		/8<=#=& XGWXX?NN#jk  //(GU\go/pNN0
:ww WW]]8$E"$''("3 a! D??3/(*T4(@(*XtM?R?^?^_g?h(i!8!8))$+'1$+-=%- "9 " 	 L! %UE EE  +Y= =#=& s]   6N=AN0!N=BO
O	AOO4O0N:	5N==O	OO	OO%c                    t        |      j                  j                  dd       t        | |      5 }d}g }t	        j
                  |d      D ]^  \  }}	|dk(  r|j                  |	       |j                         }
|	j                  |k(  s=||k  r|dz  }H|sK|d	   j                  |	       ` t	        j                  	
      j                  ||       d d d        y # 1 sw Y   y xY w)NTrZ   r\   r   )startend)eventsr   r_   )element)r   rn   ro   rp   ET	iterparser0   poptagremoveElementTreerr   )rL   rN   r   r   r   r   n_liner[   eventelemr   s              r   r~   z8DummyDataGeneratorDownloadManager._create_xml_dummy_data   s    X##T4#@(X. 	L(FG!||H=MN 
9tG#NN4(Axx7*!G+"aKF& ' 2 24 8
9 NN4(..x(.K	L 	L 	Ls   AC!C!=C!!C*c                 <   t         j                  j                  || j                  j                        }t         j                  j                  |d      }d}t
        j                  d| d       t        j                  |d||       t        j                  |       y )Nr;   z"Compressing dummy data folder to 'z.zip'rG   )
r?   rA   rB   r&   rE   rI   infoshutilmake_archivermtree)r)   r   root_dir	base_namebase_dirs        r   !compress_autogenerated_dummy_datazCDummyDataGeneratorDownloadManager.compress_autogenerated_dummy_data   sq    77<<1K1K1]1]^GGLL<8	85IJIuhAi r    )r8   NNNN)NNNN)__name__
__module____qualname__r%   r/   r7   intr   strboolrO   rH   staticmethodrf   r~   r   __classcell__)r+   s   @r   r"   r"   )   s   ' $(!%*."&(( SM( #	(
 #3-( 3-( 
(^ %)!%*."&RR R 	R
 SMR #R #3-R 3-R 
Rh DEP` L L$!r    r"   zThe `datasets` repository does not host the dataset scripts anymore. Therefore, dummy data is no longer needed to test their loading with CI.c                       e Zd Zedefd       Zdededede	e   de	e   de	e   d	ed
e	e   de	e   fdZ
d Zde	e   fdZd Zy)r   parserc                    | j                  dd      }|j                  ddd       |j                  dt        d	d
       |j                  dt        d d       |j                  dt        d d       |j                  dt        d d       |j                  ddd       |j                  dt        d d       |j                  dt        d dt                |j                  dt        d       |j                  t               y )Nr;   zGenerate dummy data.)helpz--auto_generate
store_truez!Automatically generate dummy data)actionr   z	--n_linesr8   zBNumber of lines or samples to keep when auto-generating dummy data)typedefaultr   z--json_fieldzOptional, json field to read the data from when auto-generating dummy data. In the json data files, this field must point to a list of samples as json objects (ex: the 'data' field for squad-like files)z	--xml_tagz[Optional, xml tag name of the samples inside the xml files when auto-generating dummy data.z--match_text_fileszOptional, a comma separated list of file patterns that looks for line-by-line text files other than *.txt or *.csv. Example: --match_text_files *.labelz--keep_uncompressedzWhether to leave the dummy data folders uncompressed when auto-generating dummy data. Useful for debugging for to do manual adjustements before compressing.z--cache_dirzKCache directory to download and cache files when auto-generating dummy dataz
--encodingz=Encoding to use when auto-generating dummy data. Defaults to r   z/Path to the dataset (example: ./datasets/squad))r   r   )func)
add_parseradd_argumentr   r   rf   set_defaultsr   )r   test_parsers     r   register_subcommandz$DummyDataCommand.register_subcommand   sL   '';Q'R  !2<Nq r  c13w 	! 	
 	   ^	 	! 	
 	  n	 	! 	
 	    k	 	! 	
 	  ! p 	! 	

 	  ^	 	! 	
 	  PQaPbc	 	! 	
 	  !2Ct u  &@ Ar    r   r   r   r   r   r   r   r   r   c
                     || _         t        j                  j                  |      r8|j	                  t        j
                  d      j                  d      d   | _        n7|j	                  t        j
                  d      j                  d      d   | _        t        j                  j                  |xs t        j                        }|| _        || _        || _        || _        || _        || _        || _        |	| _        y )N/r   )_path_to_datasetr?   rA   r   replaceseprl   _dataset_name
expanduserr   HF_DATASETS_CACHE_auto_generate_n_lines_json_field_xml_tag_match_text_files_keep_uncompressed
_cache_dir	_encoding)
r)   r   r   r   r   r   r   r   r   r   s
             r   r%   zDummyDataCommand.__init__	  s     !077==)!0!8!8!E!K!KC!PQS!TD!0!8!8!E!K!KC!PQS!TDGG&&y'LF4L4LM	+%!1"3#!r    c           	         t                t        | j                        }t        |j                        }|j
                  xs d g}g }t        j                         5 }|D ]  }|r|j                  nd } |||j                  |      }|r|j                  n|j                  j                  }	t        | j                  ||	dd      }
| j                  r.|j                  | j!                  ||
| j"                               | j%                  ||
        | j                  rJ| j"                  s>t'        |      rt)        d| j                   d       nt)        d	| j                   d       d d d        y # 1 sw Y   y xY w)
N)config_namehashr   TF)rD   r   versionuse_local_dummy_datarF   )dataset_buildermock_dl_managerr   )r   r   z>Automatic dummy data generation succeeded for all configs of ''z<Automatic dummy data generation failed for some configs of ')r   r   r   r   module_pathBUILDER_CONFIGStempfileTemporaryDirectoryr   r   r   r   r
   r   r   r0   _autogenerate_dummy_datar   _print_dummy_data_instructionsrw   print)r)   dataset_modulebuilder_clsbuilder_configsauto_generate_resultstmp_dirbuilder_configr   r   r   r   s              r   runzDummyDataCommand.run$  s}   /0E0EF'(B(BC &55?$ "((* 	sg"1 5Cn11"-+NL_L_kr"s4B.00H^H^HfHf"5!%!3!3)#)--2# &&)0055,;,;.2.E.E 6  77(7 8 +0 ""4+B+B,-Z[_[p[pZqqrstXY]YnYnXoopqr;	s 	s 	ss   DE33E<r9   c                    | j                   r8t        j                  j                  | j                   t        j
                        nt        j                  }t        |      }t        | j                  ||      }|j                  |       d|_        |j                  | j                  | j                  | j                  | j                   | j"                         |sVt        j                  j                  |j$                  |j&                        }|j)                  |       d|_        i }t        j*                  |j                   d       	 |j                  |      }	|	D ]8  }
|j-                  |
d       |
j.                  j0                  ||
j2                  <   : 	 t5        d |j7                         D              r.t8        j;                  d	|j                  j2                   d
       y|D cg c]  }||   dk(  s| }}t8        j;                  d| d|j                  j2                   d
       yt        j                  j                  | jB                  |jD                        }t8        jG                  d| d       y c c}w # t<        $ rC}t8        j?                  d|j                  j2                   dtA        |      z          Y d }~yd }~ww xY w)N)r   )rD   r&   download_configFr>   Tr<   )check_duplicate_keysc              3   &   K   | ]	  }|d kD    yw)r   NrR   )rT   
n_exampless     r   rW   z<DummyDataCommand._autogenerate_dummy_data.<locals>.<genexpr>q  s     V*zA~Vs   zEDummy data generation done and dummy data test succeeded for config 'z''.r   zCDummy data generation done but dummy data test failed since splits z have 0 examples for config 'z&Failed to load dummy data for config 'z''.
Original error:
z#Dummy data generated in directory 'zg' but kept uncompressed. Please compress this directory into a zip file to use it for dummy data tests.)$r   r?   rA   rB   r   DOWNLOADED_DATASETS_DIRDOWNLOADED_DATASETS_PATHr   r"   r   _split_generatorsrF   rO   r   r   r   r   r   rC   rD   r   r@   _prepare_split
split_infonum_examplesr   rw   rx   rI   r}   OSErrorrJ   r   r   rE   r   )r)   r   r   r   dl_cache_dirr   
dl_managerpath_do_datasetn_examples_per_splitsplit_generatorssplit_generator
split_nameempty_splitsegenerated_dummy_data_dirs                  r   r   z)DummyDataCommand._autogenerate_dummy_dataK  s     GGLL&*H*HI00 	
 )<@6++?ds

 	))*538022MM''MM!33^^ 	3 	
 ! ggll?+O+OQ`QmQmnO88I7;O4#% KK22TB!#2#D#D_#U '7 iO#22?Y^2_APA[A[AhAh()=)=>i V8L8S8S8UVVNN_`o`v`v`{`{_||  A   6J$'1MablMmqrMr
$L $ NN]^j]k  lI  JY  J`  J`  Je  Je  If  fi  j !')ww||D4I4I?KlKl'm$KK56N5O Pa a$  <_=S=S=X=X<YYop!f s%   AJ -J ;J 	K9KKc           
         t         j                  j                  | j                  |j                        }t
        j                  d| d       t        j                  |d       	 |j                  |      }t               }g }|j                  }D ]  }	t
        j                  d	|	j                           |j#                  |	j                          |	j$                  }
 |j&                  d5i |
}	 d
}|j                  d|j                  j                    dnd}|d|z   | j                   d| d| dz   z  }|D ]  \  }} |d| dz  } dj                  |      }t+        |      dkD  rxt+        |      dk(  r;t-        t/        |            |k(  r$dt-        t/        |             d| d| dz  }|}n&dj                  |      }d| d| dz  }|d| dz  }|d| d z  }t+        |      dk(  rBt-        t/        |            |k(  r+d!| d"| d#| d$z  }|d%| d&| d$z  }|d'| d(| d)| d*z  }n*d+| d,| d#| d-z  }|d.| d/| d$z  }|d0| d(| d)| d*z  }|d1| d2| d3z  }|d4z  }t        |       y # t        $ r>}t        d| j                   d|j                   d|j                   d       Y d }~Fd }~ww xY w# t        $ r&}|j)                  |j                         Y d }~^d }~ww xY w)6Nz$Creating dummy folder structure for z... Tr<   zDataset z with config a   seems to already open files in the method `_split_generators(...)`. You might consider to instead only open files in the method `_generate_examples(...)` instead. If this is not possible the dummy data has to be created with less guidance. Make sure you create the file re   z/Collecting dummy data file paths to create for zU
==============================DUMMY DATA INSTRUCTIONS==============================
zconfig z of r^   z(- In order to create the dummy data for z, please go into the folder 'z' with `cd z` . 

za- It appears that the function `_generate_examples(...)` expects one or more files in the folder z using the function `glob.glob(...)`. In this case, please refer to the `_generate_examples(...)` method to see under which filename the dummy data files should be created. 

z, r   r_   z1- Please create a single dummy data file called 'z' from the folder 'zV'. Make sure that the dummy data file provides at least one example for the split(s) 'z' 

z0- Please create the following dummy data files 'z'

z- For each of the splits 'zU', make sure that one or more of the dummy data files provide at least one example 

z- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have to create other files in addition to 'zG'. In this case please refer to the `_generate_examples(...)` method 

z@- After the dummy data file is created, it should be zipped to 'z.zip' with the command `zip z.zip z` 

z- You can now delete the file 'z' with the command `rm z- To get the file 'z;' back for further changes to the dummy data, simply unzip z.zip with the command `unzip z.zip` 

zP- After all dummy data files are created, they should be zipped recursively to 'z.zip' with the command `zip -r z/` 

z!- You can now delete the folder 'z' with the command `rm -r z- To get the folder 'z'- Make sure you have created the file 'z
.zip' in 'z' 
zT===================================================================================
rR   )r?   rA   rB   r   rE   rI   r   r@   r   FileNotFoundErrorr   r   r   filenamesetdummy_file_namer   r0   
gen_kwargs_generate_examplesaddlennextiter)r)   r   r   rE   generator_splitsr   files_to_createsplit_namesr  rl   r  	generatordummy_data_guidance_printconfig_stringkeyrecordfiles_strings                    r   r   z/DummyDataCommand._print_dummy_data_instructions  s(   GGLL)>)>@a@ab:;L:MTRS
%5	.@@Q %)99% 	0EKKI%**VWuzz*))J:::HZHI0,i)CRCYCYCego4499:$?km  *>#$++,,IJ[I\\ghygz  {C  DD) $- KC *  0Q  Ra  Qb  bS  .T  T)+	04 ii,!#?#q(T$2G-HO-[)/`aefjkzf{a|`}  ~Q  Rc  Qd  dz  {F  zG  GM  .N  N).#yy9)/_`l_m  nA  BS  AT  TY  .Z  Z))/I+  Wn  .o  o)%  ,r  s  r@  @I  *J  J%1$d?.C)D)W%+kl{k|  }Y  Zi  Yj  jo  p  o@  @F  *G  G%%1/1BBYZiYjjpq% &+>>O  PK  L[  K\  \y  zI  yJ  JT  *U  U%%+{  }L  |M  Ml  m|  l}  }B  CR  BS  SZ  *[  [%%3O3DD^_n^oouv% &+@@Q  RM  N]  M^  ^{  |K  {L  LV  *W  W%!5o5FjQbPccgh	
! 	"_4!'(Q ! 	4--.mO<R<R;S  Tc  de  dn  dn  co  op  q 	@ % 0##AJJ//0s1   &I; 0AK;	K3J==K	K4K//K4N)r   r   r   r   r   r   r   r   r   r   r%   r   r   r   rR   r    r   r   r      s     *BN *B *BX"" " 	"
 SM" #" #3-"  " C=" 3-"6%sN8_ghl_m 8tO)r    r   ))rm   rt   r?   r   r   xml.etree.ElementTreeetreer   r   argparser   pathlibr   typingr   datasetsr   datasets.commandsr   !datasets.download.download_configr   "datasets.download.download_managerr	   'datasets.download.mock_download_managerr
   datasets.loadr   r    datasets.utils.deprecation_utilsr   datasets.utils.loggingr   r   datasets.utils.py_utilsr   r   rI   rf   r   r"   r   rR   r    r   <module>r!     s      	   " " #    4 < > G C 7 D . 
H	 l! l!^  Ty)- y)y)r    