
    *#ht                        d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	Z
d dlZd dlmZ d dlmZ ej                   j"                  j%                  e      Ze G d dej*                               Z G d	 d
ej.                        Zy)    N)InitVar	dataclass)StringIO)Optionalrequire_storage_cast)
table_castc                       e Zd ZU dZdZeej                     ed<   dZ	e
ed<   dZeee
      ed<   dZee
   ed<   d	Zeed
<   dZeed<   dZe
ed<   d Zy)
TextConfigzBuilderConfig for text files.Nfeatureszutf-8encoding
deprecatederrorsencoding_errorsi   	chunksizeFkeep_linebreaksline	sample_byc                 Z    |dk7  r&t        j                  d| dt               || _        y y )Nr   z'errors' was deprecated in favor of 'encoding_errors' in version 2.14.0 and will be removed in 3.0.0.
You can remove this warning by passing 'encoding_errors=z
' instead.)warningswarnFutureWarningr   )selfr   s     `/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/packaged_modules/text/text.py__post_init__zTextConfig.__post_init__   s<    \!MMKKQ(R\^
 $*D  "    )__name__
__module____qualname____doc__r   r   datasetsFeatures__annotations__r   strr   r   r   r   intr   boolr   r    r   r   r   r      sh    ',0Hhx(()0Hc%1FGHSM"1%)OXc])Is!OT!Is*r   r   c                   Z    e Zd ZeZd Zd Zdej                  dej                  fdZ	d Z
y)Textc                 V    t        j                  | j                  j                        S )N)r   )r!   DatasetInfoconfigr   )r   s    r   _infoz
Text._info*   s    ##T[[-A-ABBr   c                    | j                   j                  s"t        d| j                   j                         |j                  | j                   j                        }t	        |t
        t        t        f      re|}t	        |t
              r|g}|D cg c]  }|j                  |       }}t        j                  t        j                  j                  d|i      gS g }|j                         D ]^  \  }}t	        |t
              r|g}|D cg c]  }|j                  |       }}|j                  t        j                  |d|i             ` |S c c}w c c}w )a  The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].

        If str or List[str], then the dataset returns only the 'train' split.
        If dict, then keys should be from the `datasets.Split` enum.
        z=At least one data file must be specified, but got data_files=files)name
gen_kwargs)r,   
data_files
ValueErrordownload_and_extract
isinstancer$   listtuple
iter_filesr!   SplitGeneratorSplitTRAINitemsappend)r   
dl_managerr2   r/   filesplits
split_names          r   _split_generatorszText._split_generators-   s=    {{%%\]a]h]h]s]s\tuvv44T[[5K5KL
j3e"45E%%=BCTZ**40CEC++1E1ESZ\aRbcdd!+!1!1!3 	aJ%%=BCTZ**40CECMM(11zwX]N^_`		a
  D Ds   EEpa_tablereturnc                    | j                   j                  u| j                   j                  j                  }t        d | j                   j                  j	                         D              r|j                  |      }|S t        ||      }|S |j                  t        j                  dt        j                         i            S )Nc              3   4   K   | ]  }t        |         y w)Nr   ).0features     r   	<genexpr>z#Text._cast_table.<locals>.<genexpr>G   s     b+G44bs   text)
r,   r   arrow_schemaallvaluescastr	   paschemastring)r   rC   rP   s      r   _cast_tablezText._cast_tableD   s    ;;+[[))66FbDKKDXDXD_D_Dabb#==0 O &h7O==FBIIK+@!ABBr   c           
   #     K   | j                   j                  t        | j                   j                        ndg}t        t        j
                  j                  |            D ]  \  }}t        || j                   j                  | j                   j                        5 }| j                   j                  dk(  rd}	 |j                  | j                   j                        }|snQ||j                         z  }t        |      j                         }| j                   j                   s|D cg c]  }|j#                  d       }}t$        j&                  j)                  t%        j*                  |      g|      }	||f| j-                  |	      f |dz  }| j                   j                  dk(  rd}d	}	 |j                  | j                   j                        }
|
sn||
z  }||j                         z  }|j/                  d
      }t$        j&                  j)                  t%        j*                  |d d D cg c]  }|s|	 c}      g|      }	||f| j-                  |	      f |dz  }|d   }|rt$        j&                  j)                  t%        j*                  |g      g|      }	||f| j-                  |	      f nt| j                   j                  dk(  r[|j                         }t$        j&                  j)                  t%        j*                  |g      g|      }	|| j-                  |	      f d d d         y c c}w c c}w # 1 sw Y   xY ww)NrJ   )r   r   r   r   
)names   	paragraph z

document)r,   r   r6   	enumerate	itertoolschainfrom_iterableopenr   r   r   readr   readliner   	readlinesr   rstriprO   Tablefrom_arraysarrayrR   split)r   r/   pa_table_namesfile_idxr?   f	batch_idxbatchr   rC   	new_batchexamplerJ   s                r   _generate_tableszText._generate_tablesQ   s    7;{{7K7K7Wdkk223^d]e'	(E(Ee(LM -	?NHddT[[%9%9$++B]B]^ +?bc;;((F2 !I !t{{'<'< =$!- ( 9 9 ;#{{::CH$I4T[[%6$IE$I#%88#7#7%8IQ_#7#`  (3T5E5Eh5OOO!Q	  [[**k9 !IE$%FF4;;+@+@$A	(!*- %F 3#%88#7#7XXeCRj&T7Gw&TUV^l $8 $  (3T5E5Eh5OOO!Q	 %b	   #%88#7#75'9J8KSa#7#b'3T5E5Eh5OOO[[**j8668D!xx33RXXtf5E4Fn3]H"D$4$4X$>>>W+? +?-	? %J$ 'U;+? +?sE   BM BM,MC3M7M?MC4M7M
MM	MN)r   r   r   r   BUILDER_CONFIG_CLASSr-   rB   rO   rd   rR   ro   r'   r   r   r)   r)   '   s6    %C.CBHH C C/?r   r)   )r\   r   dataclassesr   r   ior   typingr   pyarrowrO   r!   datasets.features.featuresr   datasets.tabler	   utilslogging
get_loggerr   loggerBuilderConfigr   ArrowBasedBuilderr)   r'   r   r   <module>r}      sq      *     ; % 
			*	*8	4 *'' * **Y?8%% Y?r   