
    *#ha                         d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ 	 ddlmZ dd	lmZ d
 Zd Zd Zd Z	 ddZd Z G d d      Z G d d      Zd Zy# e$ r dZY ;w xY w)zTF-specific utils import.    N)partial)ceil)uuid4)get_context)SharedMemory   )configc                    t        | t              r| S t        j                  rdd l}nt        d      | d   }i }|j                         D ]  \  }}t        |t        j                        r*t        j                  | D cg c]  }||   	 c}      ||<   Jt        ||j                        r&|j                  | D cg c]  }||   	 c}      ||<   t        j                  | D cg c]  }||   	 c}      ||<    |S c c}w c c}w c c}w )Nr   FCalled a Tensorflow-specific function but Tensorflow is not installed.)
isinstancedictr	   TF_AVAILABLE
tensorflowImportErroritemsnpndarraystackTensorarray)featurestffirstbatchkvfs          T/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/utils/tf_utils.pyminimal_tf_collate_fnr   $   s    (D!			bccQKEE :1a$xxx 8!1 89E!H299%xxx 8!1 89E!Hxxx 8!1 89E!H: L !9 8 8s   =C=
9D
#D
c                 :    t        |       }d|v r|d   |d<   |d= |S )Nlabellabels)r   )r   r   s     r   #minimal_tf_collate_fn_with_renamingr#   8   s-    !(+E%.h'NL    c                 ,   t         j                  j                  |       rt        | j                        S t         j                  j                  |       xs@ t         j                  j                  |       xs t         j                  j                  |       S N)patypesis_listis_numeric_pa_type
value_type
is_integeris_floating
is_decimal)pa_types    r   r*   r*   @   sc    	xx !'"4"45588w'h288+?+?+HhBHHL_L_`gLhhr$   c                 @   ddl m}m}m} ddlm} t        | |      rt        | j                        S t        | t              rt        | d         S t        | |      rt         |        j                        S t        | |      rt         |              S t        | |      ryy)Nr   )
ClassLabelSequenceValue)_ArrayXDr   TF) r1   r2   r3   features.featuresr4   r   is_numeric_featurefeaturelistr*   storage_dtype)r8   r1   r2   r3   r4   s        r   r7   r7   F   s    ..,'8$!'//22	GT	"!'!*--	GX	&!')"9"9::	GU	#!'),,	GZ	(r$   c                 Z   t        | t        j                        s| j                         } d}t        | t        j                        r|| j                            }d}n}t        j                  t        j                  |       dk(        r|| d   | d   dz    }nCt        | t        j                        r||    }n#t        dj                  t        |                   |+|j                         D 	
ci c]  \  }	}
|	|v s|	dv r|	|
 }}	}
|rft        t        |j                               d         }t        |      D 	
cg c])  }|j                         D 	
ci c]  \  }	}
|	|
|    c}
}	+ }}	}}
 ||fi |}|rJi }|j                         D ]3  \  }}t        j                   ||         }|j#                  |      }|||<   5 |S g }|j                         D ]?  \  }}t        j                   ||         }|j#                  |      }|j%                  |       A |S c c}
}	w c c}
}	w c c}
}	}w )NTF   r   zUnexpected type for indices: {})r!   	label_idsr"   )r   r   r   numpyintegeritemalldiffRuntimeErrorformattyper   lenr9   valuesranger   astypeappend)indicesdatasetcols_to_retain
collate_fncollate_fn_argscolumns_to_np_typesreturn_dict
is_batchedr   keyvalueactual_sizei	out_batchcol
cast_dtyper   s                    r   np_get_batchr[   X   s    grzz*--/J'2::&'
	 A%	&
WR[1_5	GRZZ	( <CCDMRSS! $kkm
Un$/O(O J
 
 $u||~.q12JOP[J\]]Q%++-@JC#uQx-@]]u00E	288: 	#OCHHU3Z(ELL,E"IcN		#  	288: 	$OCHHU3Z(ELL,EU#		$
 5
 A]s   /H=H&H %H& H&c	           	      $    t         j                  rddlnt        d      t	        d      rj
                  nft	        j                  j                  d      r!j                  j                  j                  n%t               dkD  rt        j                  d       dt        t         |||d	      j                         D 	cg c]  }	j                  j!                  |	       c}	j#                  j%                  dj&                        g
      fd       }
j(                  j*                  j-                  t                     }|rJHj/                  dj1                  dj&                              } fd}|j3                  ||      }n!|r|j5                  |j7                               }||j9                  ||      }|j;                  |
      }|fd}nfd}|j;                  |      S c c}	w )a  Create a tf.data.Dataset from the underlying Dataset. This is a single-process method - the multiprocess
    equivalent is multiprocess_dataset_to_tf.

    Args:
        dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
        cols_to_retain (`List[str]`): Dataset column(s) to load in the
            tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
            that do not exist in the original dataset.
        collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
            lists of samples into a batch.
        collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
            `collate_fn`. Can be empty.
        columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
        output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
            `tf.TensorSpec` objects.
        shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
            validation/evaluation.
        batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that
            the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.
        drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
            defaults to the same setting as shuffle.

    Returns:
        `tf.data.Dataset`
    r   Nr   random_index_shuffleindex_shufflei zto_tf_dataset() can be memory-inefficient on versions of TensorFlow older than 2.9. If you are iterating over a dataset with a very large number of samples, consider upgrading to TF >= 2.9.F)rM   rN   rO   rP   rQ   rR   )input_signaturec                     j                  | g      }t        j                               D ci c]  \  }}|||    c}}S c c}}w )N)inpTout)py_function	enumeratekeys)rL   outputrW   rT   rQ   	getter_fnr   touts       r   fetch_functionz%dataset_to_tf.<locals>.fetch_function   sR    	   

 .77J7O7O7Q-RS61cVAYSSSs   A   r=   )dtype)rU   c                     j                  | dk(        r(j                  j                  ddj                        }  || t	              dz
        }| |fS )Nr=   rj   l            )shapemaxvalrl   r<   )indexseed	max_index)
reduce_allrandomuniformint64rG   )staterp   shuffled_indexrM   r]   r   s      r   scan_random_indexz(dataset_to_tf.<locals>.scan_random_index   sZ    }}Ub[) 		))U"(()S1EUXY`UadeUefN.((r$   )drop_remainderc           
          | j                         D ci c]%  \  }}|j                  ||   j                        ' c}}S c c}}w r&   r   ensure_shapern   
input_dictrT   valoutput_signaturer   s      r   ensure_shapesz$dataset_to_tf.<locals>.ensure_shapes   sA    [e[k[k[mnxsTWC.>s.C.I.IJJnnns   *Ac                     | j                         D ci c](  \  }}|j                  ||   j                  dd        * c}}S c c}}w )Nr<   r|   r~   s      r   r   z$dataset_to_tf.<locals>.ensure_shapes   sJ    _i_o_o_qrS[SVX[C.>s.C.I.I!".MNNrrrs   -A)r	   r   r   r   hasattrr]   rt   experimentalr^   rG   warningswarnr   r[   rH   dtypesas_dtypefunction
TensorSpecrv   dataDatasetrI   fillcastscanshufflecardinalityr   map)rM   rN   rO   rP   rQ   r   r   
batch_sizerz   rl   ri   
tf_dataset	base_seedry   r   rg   r]   r   rh   s   `   ``         @@@@r   dataset_to_tfr      s   H bcc r)*!66	''	9!yy55CCw<*$MM*
  $%'/I 4G3M3M3OP%BIIu%PD[["--bhh"?!@[AT BT &&s7|4J'3GGD"(((CGD		)  __Y0AB
	''
(>(>(@A
%%j%P
/J	o
	s >>-((W Qs   "Hc                   *    e Zd Zd Zd Zd Zd Zd Zy)SharedMemoryContextc                      g | _         g | _        y r&   )created_shmsopened_shmsselfs    r   __init__zSharedMemoryContext.__init__   s    r$   c                     t        t        |      ||      }|r| j                  j                  |       |S | j                  j                  |       |S )N)sizenamecreate)r   intr   rK   r   )r   r   r   r   shms        r   get_shmzSharedMemoryContext.get_shm   sM    D	VD$$S) 
 ##C(
r$   c                     | j                  |t        j                  |      t        j                  |      j                  z  |      }t        j
                  |||j                        S )N)r   r   r   )rl   buffer)r   r   prodrl   itemsizer   buf)r   r   rn   rl   r   r   s         r   	get_arrayzSharedMemoryContext.get_array
  sI    ll2775>BHHUO<T<T+T]cldzz%uSWW==r$   c                     | S r&    r   s    r   	__enter__zSharedMemoryContext.__enter__      r$   c                     | j                   D ]"  }|j                          |j                          $ | j                  D ]  }|j                           y r&   )r   closeunlinkr   )r   exc_type	exc_value	tracebackr   s        r   __exit__zSharedMemoryContext.__exit__  sI    $$ 	CIIKJJL	 ## 	CIIK	r$   N)__name__
__module____qualname__r   r   r   r   r   r   r$   r   r   r      s    >r$   r   c                   >    e Zd Zd Zd Zd Zed        Zed        Zy)NumpyMultiprocessingGeneratorc                    || _         || _        || _        || _        |j	                         D cg c]*  \  }}|t
        j                  t
        j                  fv s)|, c}}| _        |j	                         D ci c]+  \  }}||| j                  vr|nt        j                  d      - c}}| _
        || _        || _        || _        |	| _        |
| _        |j	                         D ci c]U  \  }}||| j                  vrt!        |j"                  j$                        n!t!        |j"                  j$                        dz   W c}}| _        y c c}}w c c}}w c c}}w )NU1r<   )rM   rN   rO   rP   r   r   unicode_str_string_columnsrl   rQ   r   r   r   rz   num_workersr   rn   rankcolumns_to_ranks)r   rM   rN   rO   rP   rQ   r   r   r   rz   r   rY   rl   specs                 r   r   z&NumpyMultiprocessingGenerator.__init__  s>    ,$.5H5N5N5PtzsETY^`^i^ikmkrkr]sTsst 2779$
U #T%8%88bhhtnL$
  !1$,& .335!
T D4G4G)GTZZ__%SQUQ[Q[Q`Q`MadeMee!
 u$
!
s   *EE;0E*AEc              #     K   t        | j                  t        t        t	        | j
                        | j                  z                    }| j                  | j
                  | j                  | j                  || j                        \  }}}t        d      }g }g }g }t        |      D 	cg c]  }	|j                          }
}	t        |      D 	cg c]  }	|j                          }}	| j
                  | j                  | j                  | j                  | j                   | j"                  | j$                  d}t'               5 }t        |      D ]  }t)        t+                     }d| d| d d }|j-                  |       | j"                  j/                         D ci c]0  \  }}||j1                  | d| d|ft2        j4                  d      2 }}}|j-                  |       ||   }||k(  r||}nd }||||
|   ||   d	|}|j7                  | j8                  |d
      }|j;                          |j-                  |        d}|sht        |      D ]U  }|
|   j=                  d      st?        d      |
|   jA                          ||   }tC        d |jE                         D              rd} nt'               5 }|j/                         D ci c]0  \  }}||j1                  ||    d| || j                   |   d      2 }}}|j/                         D ci c]  \  }}|t3        jF                  |       }}}| j$                  D ];  }||   jI                  d||   jJ                  d          jM                  d      ||<   = 	 d d d         ||   jO                          X |sh|D ]  }|jQ                           	 d d d        y c c}	w c c}	w c c}}w c c}}w c c}}w # 1 sw Y   dxY w# 1 sw Y   y xY ww)Nspawn)rM   rN   rO   rP   rQ   r   r   dw__
   _shapeTrn   rl   r   )worker_namerL   extra_batcharray_ready_eventarray_loaded_event)targetkwargsdaemonF<   )timeoutzData loading worker timed out!c              3   L   K   | ]  }t        j                  |d k          yw)r   N)r   any).0rn   s     r   	<genexpr>z9NumpyMultiprocessingGenerator.__iter__.<locals>.<genexpr>w  s     P266%!),Ps   "$Ur=   ))minr   r   r   rG   rM   r   distribute_batchesrz   r   r   rI   EventrN   rO   rP   rQ   r   r   r   strr   rK   r   r   r   rv   Processworker_loopstartwaitTimeoutErrorclearr   rH   copyviewrn   squeezesetjoin)r   r   per_worker_batchesfinal_batchfinal_batch_workerctxnamesshape_arraysworkersr   array_ready_eventsarray_loaded_events	base_argsshm_ctxrW   worker_random_idr   rY   r   worker_shape_arraysworker_indicesfinal_batch_argworker_kwargsworkerend_signal_receivedarray_shapesbatch_shm_ctxrn   arraysarr
string_cols                                  r   __iter__z&NumpyMultiprocessingGenerator.__iter__<  sJ    $**CS5F5X0Y,Z[>B>U>ULL$//4+>+>T\\?
;K); '"383EFaciikFF49+4FGqsyy{GG ||"11//#33#'#;#; $ 5 5"11
	 !" H	g;' '#&uw<  #A3a(8'9:3B?[) &*%:%:%@%@%B'!T **k]!C5+GPTw^`^f^fos*tt'# ' ##$78!3A!6**{/F&1O&*O#.-#2);A)>*=a*@!  ! D,<,<][_`v&5'8 #(){+ %1A-a055b5A*+KLL&q)//1#/?LP,:M:M:OPP /3+ -. - /;.@.@.B" !+U  !8!8#(8*AcU 3&+&*&>&>s&C',	 "9 " " " EKLLN!SS#rwws|"3!S!S*.*=*= J &z 2 7 7!F:<N<T<TUW<X;Y8Z [ c cdf g #:.& !L'*..0K%1 *R " OH	 H	 GG"'X" "T aH	 H	s   B(P*OPO'APAO?5O!
C0O?O35O'O3" O-AO3$O?6O?P!O?'O33O<8O??PPc                     | S r&   r   r   s    r   __call__z&NumpyMultiprocessingGenerator.__call__  r   r$   c                 @   	
 dt         j                  d<   t        j                  rdd l}nt        d      |j                  j                  g d       
 	f
d}t               5 }|j                         D ci c]0  \  }}||j                  	 d| d|ft        j                  d	
      2 c}}|D ]
  } ||        | ||       j                         D ]
  \  }}d|d d   
j                          d d d        y c c}}w # 1 sw Y   y xY w)N3TF_CPP_MIN_LOG_LEVELr   r   GPUc           	        
 t        | 	
d      }i }t               5 }j                         D ]|  \  }}||   }|v r-|j                  d      j	                  |j
                  dz         }|j
                  |   d d  |j                   d| |j
                  |d      ||<   |||   d d  ~ j                          j                          j                          d d d        y # 1 sw Y   y xY w)NT)rL   rM   rN   rO   rP   rQ   rR   r   )r=   r   r   )
r[   r   r   r   reshapern   r   r   r   r   )rL   r   
out_arraysr   rY   rZ   r   r   r   rO   rP   rN   rQ   rM   r   r   r   s          r   send_batch_to_parentzGNumpyMultiprocessingGenerator.worker_loop.<locals>.send_batch_to_parent  s    -% /$7 E J$& +- (;'@'@'B /OC!#JEn, !&

4 0 8 8u9L M+0;;L%a(&3&=&=&-q.ekk\` '> 'JsO */JsOA&/ "%%'"'')"((*%+ + +s   C C))C2r   r   Fr   r=   )osenvironr	   r   r   r   set_visible_devicesr   r   r   r   rv   r   )rM   rN   rO   rP   rQ   r   r   rL   r   r   r   r   r   r  r   rY   r   r   r   r   s   ````` `  ```       @r   r   z)NumpyMultiprocessingGenerator.worker_loop  s$    .1

)*#fgg
		%%b%0	+ 	+B !" 	$g "2!7!7!9C W&&+auF'CD7Z\ZbZbkp&qqL
 ! ,$U+,&$[1*002 
Ua!!#	$ 	$	$ 	$s   2D5D;A
DDDc                    t        j                  t        |             }|rt         j                  j	                  |       t        |      }|||z  z
  }t        j
                  ||g      \  }}|st        |      dk(  rd }|j                  d|      }t        |      }	|	|	|z  z
  }
t        j
                  ||
g      \  }}|j                  d||      }t        j
                  ||j                  d   d      }|D cg c]  }t        j                  |d       }}t        t        |            D ]4  }t        j                  ||   ||   j                  dd      gd      ||<   6 |t        |      }nd }|||fS c c}w )Nr   r=   r<   )axis)r   arangerG   rt   r   splitr	  rn   r   rI   concatenate)rM   r   rz   r   r   rL   num_samplesincomplete_batch_cutofflast_incomplete_batchnum_batchesfinal_batches_cutofffinal_batchesper_worker_indicesr   rW   incomplete_batch_worker_idxs                   r   r   z0NumpyMultiprocessingGenerator.distribute_batches  s   ))CL)IIg&'l #.z1I"J)+'<S;T)U&&S!671<$(!//"j1'l*kK.GH!#'4H3I!J//"k:>XXgw}}Q/?aHRdebjj;ees=)* 	uA$&NN4Fq4I=YZK[KcKcdegiKj3krs$tq!	u !,*-m*<'*.'!#8:UUU fs   :E9N)	r   r   r   r   r  r  staticmethodr   r   r   r$   r   r   r     sA     
D_B E$ E$N V Vr$   r   c
                    t         j                  rddl}
nt        d      t	        | |||||||||	
      }|
j
                  j                  j                  ||      }|rt        t        |       |z        }n t        t        t        |       |z              }|j                  |
j
                  j                  j                  |            S )ao  Create a tf.data.Dataset from the underlying Dataset. This is a multi-process method - the single-process
    equivalent is dataset_to_tf.

    Args:
        dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
        cols_to_retain (`List[str]`): Dataset column(s) to load in the
            tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
            that do not exist in the original dataset.
        collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
            lists of samples into a batch.
        collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
            `collate_fn`. Can be empty.
        columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
        output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
            `tf.TensorSpec` objects.
        shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
            validation/evaluation.
        batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that
            the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.
        drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
            defaults to the same setting as shuffle.
        num_workers (`int`): Number of workers to use for loading the dataset. Should be >= 1.

    Returns:
        `tf.data.Dataset`
    r   Nr   )
rM   rN   rO   rP   rQ   r   r   r   rz   r   )r   )r	   r   r   r   r   r   r   from_generatorr   rG   r   applyr   assert_cardinality)rM   rN   rO   rP   rQ   r   r   r   rz   r   r   data_generatorr   dataset_lengths                 r   multiprocess_dataset_to_tfr#  	  s    L bcc2%'/)%N //Qa/bJS\Z78T#g,";<=BGG00CCNSTTr$   )F)__doc__r  r   	functoolsr   mathr   uuidr   r?   r   pyarrowr'   multiprocessr   multiprocess.shared_memoryr   r   r5   r	   r   r#   r*   r7   r[   r   r   r   r#  r   r$   r   <module>r+     s      	       $7 (i& ej-`n)b @mV mV`=UW  Ls   A$ $A.-A.