
    +#h:                     Z   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlZd dlZ	d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZ d dlmZ  ej6                  d       dZdZd	Zd
ZdZ d
Z!eee!dZ"d Z#d Z$d Z%d Z&d Z'd Z(d Z) G d de      Z*d Z+d Z,e G d d             Z-d Z.d Z/y)    N)	dataclass)Dataset)
AutoConfigAutoModelForCausalLMAutoModelForSeq2SeqLMAutoTokenizerDataCollatorForSeq2SeqPreTrainedTokenizerSeq2SeqTrainerSeq2SeqTrainingArgumentsTrainerTrainingArgumentsset_seed)TrainerCallbackignoreiz[SEP]z[PAD]z</s>z<s>)	sep_token	pad_token	unk_tokenc                    | j                         D ]	  }d|_         t        | j                  j                        }t        | j                  j                        D ]*  \  }}|||z
  k\  s|j                         D ]	  }d|_         , | j                  j                  j                         D ]	  }d|_         | j                  j                         D ]	  }d|_         | S )NFT)
parametersrequires_gradlentransformerh	enumerateln_flm_head)modelunfreeze_last_n	parameterNUM_DECODER_LAYERSims         _/var/www/html/sandstorm/venv/lib/python3.12/site-packages/dsp/modules/finetuning/finetune_hf.py_freeze_model_layersr%   3   s    %%' (	"'	( U..001%++--. /1"_44\\^ /	*.	'// &&++668 '	"&	']]--/ '	"&	'L    c                     g }dd l }t        |       5 }|D ]"  }|j                  |j                  |             $ 	 d d d        t	        j
                  |      }|S # 1 sw Y    xY w)Nr   )ujsonopenappendloadsr   	from_list)pathLr(   flinedatasets         r$   
_load_datar2   G   sc    
A	d (q 	(DHHU[[&'	(( "GN( (s   (AA#c                 :    |r|  d} | S |  d|j                    } | S )N )r   text	tokenizerencoder_decoder_modeldecoder_only_model	rationales        r$   preprocess_promptr;   S   s2    .dV1:DK 8<fAi>Q>Q=R4SDKr&   c                 J    |r| n|  |j                    } | j                         S N)	eos_tokenlstripr5   s        r$   preprocess_completionr@   X   s'    (4y7J7J6K.LD;;=r&   c                     | j                  fd      } | D cg c]
  }|d   	| }}t        dt        |       dt        |               | j                  d       } | S c c}w )Nc           	      Z    t        | d   d         t        | d   d         dS )Npromptr:   
completion)rC   rD   )r;   r@   )xconfigr9   r8   r7   s    r$   <lambda>z"_preprocess_data.<locals>.<lambda>^   sO    #AhK<QSegmnygz{+AlOYH]_qsy  {F  tG  H% r&   rD   z)# examples skipped due to parsing error: z / c                     | d   S )NrD    rE   s    r$   rG   z"_preprocess_data.<locals>.<lambda>d   s
    q r&   )mapprintr   filter)r1   r7   r8   r9   rF   rE   skippeds    ````  r$   _preprocess_datarO   ]   sk    kk  G "=QQ|_%<q=G=	5c'l^3s7|n
UVnn67GN >s
   
A(A(c                 x   d }dfd	}dfd	}|r. || d      } || d      }||d}	| j                  |d|		      }
nJ|rH| j                  d
       } | j                  fd      }  || d      }d|i}	| j                  |d|		      }
t        d	        t        dt        
j                                |
S )Nc                     | j                  fdd      }t        |d   D cg c]  }t        |       c}      }|S c c}w )Nc                      |          S r=   rI   )rE   columnr7   s    r$   rG   z>_tokenize_dataset.<locals>.get_dataset_stats.<locals>.<lambda>j   s    1V91E r&   T)batched	input_ids)rK   maxr   )r1   r7   rS   tokenized_inputsrE   
max_lengths    ``   r$   get_dataset_statsz,_tokenize_dataset.<locals>.get_dataset_statsi   sB    ";;'Et;T*:;*GHQ#a&HI
 Is   ArX   c           	           | d   ||d      } | d   ||d      }|d   }|dk(  r4|D cg c]'  }|D cg c]  }|j                   k7  r|nt         c}) }}}||d<   |S c c}w c c}}w )	NrC   TrX   padding
truncationrD   )text_targetrX   r\   r]   rU   rX   labels)pad_token_idIGNORE_INDEX)	samplemax_source_lengthmax_target_lengthr\   model_inputsr_   labellr7   s	           r$   get_tokens_seq2seqz-_tokenize_dataset.<locals>.get_tokens_seq2seqn   s     !1>OY`mqr vl';HYcjw{|$ l"iop`eUZ[PQQ)"8"88lJ[pFp!'X \ps   	A*A%A*%A*c           	          | d   ||d      }t        j                  |d         } | d   |d      }|d   D cg c]  }t        |       }}t        ||      D ]  \  }}	t        g|	z  |d |	  |dk(  r4|D 
cg c]'  }|D 
cg c]  }
|
j
                  k7  r|
nt         c}
) }}}
||d<   |S c c}w c c}
w c c}
}w )	NcombinedTr[   rU   rC   )rX   r]   rX   r_   )copydeepcopyr   zipra   r`   )rb   rX   r\   re   r_   promptstokensprompt_lensrf   
source_lenrg   r7   s              r$   get_tokens_causalz,_tokenize_dataset.<locals>.get_tokens_causal}   s     
!3
T[hlm |K89F8,PTU181EFvs6{FF!$V[!9 	=E:".*!<E+:	= l"iop`eUZ[PQQ)"8"88lJ[pFp!'X G \ps   B3=	B=B8$B=8B=rC   rD   )rc   rd   T)rT   	fn_kwargsc                 "    d| d   dz   | d   z   iS )Nrj   rC   r4   rD   rI   )examples    r$   rG   z#_tokenize_dataset.<locals>.<lambda>   s!    z78;Ls;RU\]iUj;j.k r&   c                 L    t         | d         d         j                  k  S )Nrj   rU   )r   model_max_length)rE   r7   s    r$   rG   z#_tokenize_dataset.<locals>.<lambda>   s&    3y:/G/T+UYbYsYs+s r&   rj   zDataset statistics: zKeys of tokenized dataset: )rX   )rK   rM   rL   listfeatures)r1   r7   r8   r9   rY   rh   rr   rc   rd   kwargstokenized_datasetrX   s    `          r$   _tokenize_datasetr|   h   s    
$ -gy(K-gy,O(9Qbc#KK(:DTZK[	++kl..!st&w	:F
,#KK(94SYKZ	 
)*	'->-G-G(H'I
JKr&   c           	         |\  }}t        |t              r|d   }|j                  |d      }t        j                  |t
        k7  ||j                        }|j                  |d      }| j                  ||d      }|j                         D 	ci c]  \  }}	|t        |	dz  d       }}}	|D 
cg c]$  }
t        j                  |
|j                  k7        & }}
t        j                  |      |d<   |S c c}	}w c c}
w )Nr   T)skip_special_tokens)predictions
referencesuse_stemmerd      gen_len)
isinstancetuplebatch_decodenpwherera   r`   computeitemsroundcount_nonzeromean)metric
eval_predsr7   predsr_   decoded_predsdecoded_labelsresultkvpredprediction_lenss               r$   _compute_metricsr      s    ME6%a**5d*KM XXf,fi6L6LMF++F+MN^^.^b^cF/5||~>tq!aq3w"">F>TYZDr''	0F0F(FGZOZ0F9M ?Zs   C61)C<c                       e Zd Zd Zy)PeftSavingCallbackc                    |j                   }|d   j                  |       t        j                  j	                  |j                   d      }t        j                  j                  |      rt        j                  |       y d  y )Nr   zpytorch_model.bin)best_model_checkpointsave_pretrainedosr-   joinexistsremove)selfargsstatecontrolrz   peft_model_pathpytorch_model_paths          r$   on_train_endzPeftSavingCallback.on_train_end   s[    55w''8WW\\%*E*EGZ[)+8J)K		$%QUr&   N)__name__
__module____qualname__r   rI   r&   r$   r   r      s    Vr&   r   c                 ~   t        |       }t        d#i d|d   d|d   d|d   d|d   ddd	|d
   d|d   ddd|d    dddddddddd|d   ddddd|d   d|d   }t        | ||d   |d   |fd |d!   rt        gnd "      }|j	                          |j
                  j                  S )$N)r7   r   
output_dirper_device_train_batch_size
batch_sizegradient_accumulation_stepsper_device_eval_batch_sizepredict_with_generateTlearning_ratelrnum_train_epochsepochs	log_levelerrorlogging_dir/logslogging_strategystepslogging_steps  evaluation_strategyepochsave_strategysave_total_limitload_best_model_at_end	report_totensorboardfp16bf16traintestc                     t        |       S r=   )r   )rE   r   r7   s    r$   rG   z _train_seq2seq.<locals>.<lambda>   s    "261i"H r&   peft)r   r7   r   train_dataseteval_datasetdata_collatorcompute_metrics	callbacksrI   )r	   r   r   r   r   r   r   r   r7   r{   r   rF   r   training_argstrainers    ` `    r$   _train_seq2seqr      sX   *YeLM - ,'$*<$8 %++H$I $*,#7	
 # Tl  )  l+,E2 !  $   )   $!"  #$ F^%& F^'M. '0&v.#H*0.%&d	G MMO==...r&   c                 j   |j                  |       }|j                  t        |             |dkD  r|j                         j                  j
                  }|j                         j                  j
                  }|d|  j                  dd      }|d|  j                  dd      }||| d ||| d yy)z
    Resize tokenizer and embedding.
    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    r   NT)dimkeepdim)add_special_tokensresize_token_embeddingsr   get_input_embeddingsweightdataget_output_embeddingsr   )special_tokens_dictr7   r   num_new_tokensinput_embeddingsoutput_embeddingsinput_embeddings_avgoutput_embeddings_avgs           r$   $smart_tokenizer_and_embedding_resizer      s    
 112EFN	!!#i.1 557>>CC!779@@EE/0@.AFF1VZF[ 12BN? C H HQX\ H ]-A.)*.C>/*+ r&   c                   "    e Zd ZU dZeed<   d Zy) DataCollatorForSupervisedDatasetz6
    Collate examples for supervised fine-tuning.
    r7   c                    | j                   j                  }t        fddD              \  }}t        j                  |      t        j                  |      }}t        j
                  j                  j                  j                  |d|      }t        j
                  j                  j                  j                  |dt              }t        |||j                  |            S )Nc              3   N   K   | ]  }D cg c]  }||   	 c}  y c c}w wr=   rI   ).0keyinstance	instancess      r$   	<genexpr>z<DataCollatorForSupervisedDataset.__call__.<locals>.<genexpr>  s$     !oPS"KX8C="K!o"Ks   	% %)rU   r_   T)batch_firstpadding_value)rU   r_   attention_mask)r7   r`   r   torchtensornnutilsrnnpad_sequencera   dictne)r   r   r`   rU   r_   s    `   r$   __call__z)DataCollatorForSupervisedDataset.__call__  s    ~~22!!oWn!oo	6!LL3U\\&5I6	HHNN&&33I4_k3l	##00TYe0fiy||T`Gabbr&   N)r   r   r   __doc__r
   __annotations__r   rI   r&   r$   r   r      s     #"	cr&   r   c           
      h   t        |      }t        d!i d|d   d|d   d|d   d|d   d|d   d	|d
   ddd|d    dddddddddd|d
   ddddd|d   d|d   }t        | |||d   |d   ||d   rt        gnd        }|j	                          |j
                  j                  S )"N)r7   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Tr   r   r   r   r   r   r   )r   r7   r   r   r   r   r   rI   )r   r   r   r   r   r   r   r   s           r$   _train_causalr     sH   4yIM & ,'$*<$8 %++H$I $*,#7	
 Tl  )  l+,E2 !  $   )  $   !" F^#$ F^%M, '0&v.#*0.%&dG MMO==...r&   c                 v   t        d       t        j                  j                  d|d         }t        j                  j	                  |      rwt        j
                  | d      }t        |d       d   }t        t        j                  j                  |d            5 }t        j                  |      }d d d        d	   }nLt        j                  |d
       ||d<   ||d<   t        t        j                  j                  |d   d      d      5 }t        j                  ||       d d d        t        j                  |      j                  d   d   }	d|	v xs d|	v }
d|	v xs d|	v }|
s|s
J d|        |d   r	|
sJ d       |d   r|d   rJ d       |
rt        nt         }|d   rb|j                  |d      }|
rt"        j$                  nt"        j&                  }t)        |dddd !      }t+        ||      }|j-                          n\|d   rF|j                  |      }t/        |j0                        }|j3                  |j5                                n|j                  |      }t7        j                  |      }|rt9        t:        ||       t=        |       }t?        |||
||      }tA        |||
|      }|jC                  d "      }tE        d#|        tG        j                  d$      }|
rtI        |||||      }n|rtK        |||||      }tE        d%        |S # 1 sw Y   mxY w# 1 sw Y   xY w)&N*   z../finetuning_ckptssavez/checkpoint*c                 <    t        | j                  d      d         S )N-)intsplitrJ   s    r$   rG   zfinetune_hf.<locals>.<lambda>?  s    QWWS\"5E1F r&   )r   r  ztrainer_state.jsonr   T)exist_oktargetr   zcompiler_config.jsonwarchitecturesr   ConditionalGenerationT5WithLMHeadModelCausalLMGPT2LMHeadModelz!Unknown HuggingFace model class: fidz3Model must be encoder-decoder for Fusion in Decoderr   z&FiD and PEFT can't be trained togetherauto)
device_mapF       g?)	task_typeinference_moder
lora_alphalora_dropout)	test_sizezFinetuning dataset: rougezBest checkpoint of model: )&r   r   r-   r   r   globsortedr)   jsonloadmakedirsdumpr   from_pretrained__dict__r   r   TaskTypeSEQ_2_SEQ_LM	CAUSAL_LM
LoraConfigget_peft_modelprint_trainable_parametersFiDT5rF   load_t5
state_dictr   r   SPECIAL_TOKENS_DICTr2   rO   r|   train_test_splitrL   evaluater   r   )	data_pathr  rF   r   ckpts
final_ckptr/   r   r   architecturer8   r9   AutoModelClassr   r  peft_configt5r7   r1   r{   r   s                        r$   finetune_hfr6  7  s0   RL3VF^DJ	ww~~j!		ZL56E'FGK
"'',,z+?@A 	!QIIaLE	! %&= > 	J.!x)|"'',,vl35KLcR 	!VWIIfa 	! "11&9BB?STUV!8L!H rNaeqNq(L8`>OS_>_$(:h>_`f_g<hh:%=$9p;pp9%=v`8``6 3H.Ma&>"226f2ME1F--HL^L^I$yRS`bqtuK"5+6E,,.e}#33F;bii(bmmo.&66v> "11&9	01DiQVW Y'"7I7LN`bhi-gyBWYkl->>>M$%6$789 w' $25)EVX^`f$g!$1%DUW]_e$f!	&'<&=
>?  o	! 	!	! 	!s    L!L.!L+.L8)0rk   r  r  r   warningsdataclassesr   r.  numpyr   r   datasetsr   transformersr   r   r   r   r	   r
   r   r   r   r   r   transformers.trainer_callbackr   filterwarningsra   DEFAULT_SEP_TOKENDEFAULT_PAD_TOKENDEFAULT_EOS_TOKENDEFAULT_BOS_TOKENDEFAULT_UNK_TOKENr,  r%   r2   r;   r@   rO   r|   r   r   r   r   r   r   r6  rI   r&   r$   <module>rC     s       	  !        :
    !     "" # (	

6r"V V)/XD& c c c$'/T@!r&   