
    *#h                         d dl Z d dlmZmZmZ d dlZd dlmZ	 ddl
mZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddl m!Z! dedee"   fdZ# G d de!      Z$ G d d      Z%y)    N)BinaryIOOptionalUnion   )AudioDatasetFeaturesImage
NamedSplitValueconfig)FeatureType_visit)query_table)_PACKAGED_DATASETS_MODULES)Parquet)logging)NestedDataStructureLikePathLike   )AbstractDatasetReaderfeaturesreturnc                     t         j                  dt        ddffd}t        | |       t         j                  u rdS S )a  
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is 1,000 but we lower it to 100 for image datasets.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        ds_config_info (`datasets.info.DatasetInfo`):
            Dataset info from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default.
    featurer   Nc                 (   t        | t              rt        t        j                        y t        | t
              rt        t        j                        y t        | t              r+| j                  dk(  rt        t        j                        y y y )Nbinary)

isinstancer
   minr   )PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETSr   )PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETSr   dtype*PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS)r   
batch_sizes    P/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/io/parquet.pyset_batch_sizez-get_writer_batch_size.<locals>.set_batch_size&   sk    gu%Z)Y)YZJ'Z)Y)YZJ'GMMX,EZ)Z)Z[J -F'    )npinfr   r   )r   r&   r$   s     @r%   get_writer_batch_sizer*      sF    & J\ \ \ 8^$'47Z7r'   c                   h     e Zd Z	 	 	 	 	 	 d
dee   dee   dee   dede	de	dee
   f fdZd	 Z xZS )ParquetDatasetReaderpath_or_pathssplitr   	cache_dirkeep_in_memory	streamingnum_procc           
          t        
|   |f||||||d| t        |t              r|n| j                  |i}t
        d   d   }	t        d||||	d|| _        y )N)r.   r   r/   r0   r1   r2   parquetr   )r/   
data_filesr   hash )super__init__r   dictr.   r   r   builder)selfr-   r.   r   r/   r0   r1   r2   kwargsr6   	__class__s             r%   r9   zParquetDatasetReader.__init__5   s     			
)		
 		
 *4M4)Htzz[hNi))4Q7 
$	

 
r'   c                 6   | j                   r(| j                  j                  | j                        }|S d }d }d }d }| j                  j	                  ||||| j
                         | j                  j                  | j                  || j                        }|S )N)r.   )download_configdownload_modeverification_mode	base_pathr2   )r.   rB   	in_memory)r1   r;   as_streaming_datasetr.   download_and_preparer2   
as_datasetr0   )r<   datasetr@   rA   rB   rC   s         r%   readzParquetDatasetReader.readT   s    >>ll77djj7IG& ! #O M $ILL-- /+"3# .  ll--jj4EQUQdQd . G r'   )NNNFFN)__name__
__module____qualname__r   r   r   r   r	   strboolintr9   rI   __classcell__)r>   s   @r%   r,   r,   4   sx     '+'+$"&
.x8
 
#
 8$	

 
 
 
 3-
>r'   r,   c                   T    e Zd Z	 d
dedeeef   dee   fdZ	defdZ
dededefd	Zy)ParquetDatasetWriterNrH   path_or_bufr$   c                 j    || _         || _        |xs t        |j                        | _        || _        y N)rH   rS   r*   r   r$   parquet_writer_kwargs)r<   rH   rS   r$   rV   s        r%   r9   zParquetDatasetWriter.__init__n   s3     &$O(=g>N>N(O%:"r'   r   c                    | j                   r| j                   nt        j                  }t        | j                  t
        t        t        j                  f      r@t        | j                  d      5 } | j                  d||d| j                  }d d d        |S  | j                  d| j                  |d| j                  }|S # 1 sw Y   S xY w)Nzwb+)file_objr$   r7   )r$   r   DEFAULT_MAX_BATCH_SIZEr   rS   rM   bytesosr   open_writerV   )r<   r$   bufferwrittens       r%   writezParquetDatasetWriter.writez   s    (,T__V=Z=Z
d&&eR[[(ABd&&. l&%$++kv*kPTPjPjkl  "dkkq4+;+;
qVZVpVpqG	l s   . CCrX   c           	      R   d}|j                  dd      }| j                  j                  j                  }t	        j
                  |fd|i|}t        j                  t        dt        | j                        |      dt        j                          d      D ]}  }t        | j                  j                  t        |||z         | j                  j                  | j                  j                  nd      }	|j                  |	       ||	j                   z  } |j#                          |S )	zWrites the pyarrow table as Parquet to a binary file handle.

        Caller is responsible for opening and closing the handle.
        r   rS   Nschemabaz"Creating parquet from Arrow format)unitdisabledesc)tablekeyindices)poprH   r   arrow_schemapqParquetWriterr   tqdmrangelenis_progress_bar_enabledr   _dataslice_indiceswrite_tablenbytesclose)
r<   rX   r$   rV   r_   _rb   writeroffsetbatchs
             r%   r]   zParquetDatasetWriter._write   s   
 !%%mT:&&33!!(S6S=RSll!S&
377995	
 	$F  ll((&&:"56151F1F1R--X\E
 u%u||#G	$ 	r'   rU   )rJ   rK   rL   r   r   r   r   r   rO   r9   r`   r]   r7   r'   r%   rR   rR   m   s]    
 %)	
;
; 8X-.
; SM	
;s x S VY r'   rR   )&r[   typingr   r   r   numpyr(   pyarrow.parquetr4   rl    r   r   r	   r
   r   r   r   features.featuresr   r   
formattingr   packaged_modulesr    packaged_modules.parquet.parquetr   utilsr   utils.typingr   r   abcr   rO   r*   r,   rR   r7   r'   r%   <module>r      sb    	 , ,   I I I 3 $ 9 6  < & 8H  8#  8F60 6r0 0r'   