
    ##h:                        d dl mZmZmZ ddlmZmZmZ  e       rd dlZd dl	m
Z
 d dlZd dlmZ d dl	mZ  e       rd dlmZ  ej&                  e      Zej,                  dej.                  fd       Zd!d	ej2                  d
edeej2                  ej2                  f   fdZej,                  dej.                  dej.                  dej.                  dej.                  fd       Zej:                  fdej2                  dej2                  dej2                  dej2                  d
ee   dej<                  dej2                  fdZej@                  dej:                  fdej2                  dej2                  dej2                  dej2                  d
eeeef      dej<                  dej2                  fd       Z! G d de
jD                        Z#	 	 	 	 	 d"dZ$	 	 d#d Z%y)$    )ListOptionalTuple   )is_accelerate_availableis_torch_availableloggingN)
functional)init_empty_weights
BLOCK_SIZEc                    t        j                  d      }||z  t        j                  d|      z   }t        j                  | |z         j	                  t         j
                        }t        j                  t        j                  |            dz  }||z  }|j	                  |j                  j                        }t        j                  ||z   |       t        j                  ||z   |       y )Nr   axisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	x_ptry_ptrs_ptrr   pidoffsxsys	            f/var/www/html/sandstorm/venv/lib/python3.12/site-packages/transformers/integrations/finegrained_fp8.pyact_quant_kernelr$   $   s    
--Q
Cbii:66D
  ,A
rvvayE!A	AA	U[[##$AHHUT\1HHUS[!    r    
block_sizereturnc                 f     j                         sJ  j                  d   |z  dk(  sJ t        j                   t        j                        }  j
                  g  j                         d d  j                  d      |z  dt        j                  i} fd}t        |    |||       ||fS )Nr   r   r   c                 T    t        j                  j                         | d         fS )Nr   )tritoncdivnumel)metar    s    r#   gridzact_quant.<locals>.grid6   s"    AGGItL'9:<<r%   )r   )	is_contiguousshapetorch
empty_likefloat8_e4m3fn	new_emptysizer   r$   )r    r&   r"   r!   r0   s   `    r#   	act_quantr8   0   s    ??772;#q(((%"5"56ARQVVXcr]RAFF2J*$<REMMRA= T1az:a4Kr%   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc                    t        j                  d      }t        j                  ||      }t        j                  ||      }||z  }||z  }||z  }t        ||z
  |      }|||z  z   }||z  |z  } ||z  t        j                  d|      z   |z  }!| |z  t        j                  d|      z   |z  }"t        j                  d|      }#| |!dddf   |
z  |#dddf   |z  z   z   }$||#dddf   |z  |"dddf   |z  z   z   }%||!|z  z   }&|"|z  }'||'|z  z   }(t        j
                  ||ft         j                        })t        dt        j                  ||            D ]  }*t        j                  |$|#dddf   ||*|z  z
  k  d      }+t        j                  |%|#dddf   ||*|z  z
  k  d      },|*|z  }-|-|	z  }.t        j                  |&|.|z  z         }/t        j                  |(|.|z  z         }0|)t        j                  |+|,      |/dddf   z  |0dddf   z  z  })|$||z  z  }$|%||z  z  }% |j                  j                  t         j                  k(  r |)j                  t         j                        }1nf|j                  j                  t         j                  k(  r |)j                  t         j                        }1n|)j                  t         j                        }1||z  t        j                  d|      z   }2| |z  t        j                  d|      z   }3|||2dddf   z  z   ||3dddf   z  z   }4|2dddf   |k  |3dddf   |k  z  }5t        j                  |4|1|5       y)zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
    r   r   Nr*   g        )maskother)r>   )r   r   r-   minr   zerosr   ranger   dotr   r   bfloat16r   float16r   )6ABCAsBsMNKgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nr9   r:   r;   r<   r   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_masks6                                                         r#   _w8a8_block_fp8_matmulrw   >   sb   J --Q
C<(I<(I#i/&&H\)Ky;.=L3-.E##4E|#bii<&@@AEG|#bii<&@@AEGYYq,'F'!T'"Y.a91LLMF&D/I-a0@90LLMF7[((G'!H8k))G((L,7rzzJK1bgga./ +GGFa1q<7G3G!GsSGGF41q<7G3G!GsSl"W$ggg+ 556ggg+ 556rvva|c!T'l2Sq\AA,**,**+ 	wwR[[(NN2;;'	
		rzz	)NN2::&NN2::&l"RYYq,%??Gl"RYYq,%??GWQW---	GD!G<L0LLFag"wtQw'7!';<FHHVQV$r%   rF   rG   rI   rJ   output_dtypec                    t        |      dk(  sJ |d   |d   }}| j                  d   |j                  d   k(  sJ | j                  dd |j                  dd k(  r| j                         sJ t        j                  | j                  d   |      |j                  d   k(  sJ | j                         | j                  d   z  |j                  dk(  r|j                         r|j                  dk(  sJ |j                  \  }t        j                  |      |j                  d   k(  sJ t        j                  ||      |j                  d   k(  sJ | j                  dd fz   }	| j                  |	|      }
d}|k  r!t        j                        }t        |d      }|}||z  dk(  sJ |}fd	}t        |   | ||
|||||| j                  d
      | j                  d      |j                  d      |j                  d      |
j                  d
      |
j                  d      |j                  d
      |j                  d      |j                  d      |j                  d      |||d       |
S )a  This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    r   r      r)   Nr*         c                 l    t        j                  | d         t        j                  | d         z  fS )Nr9   r:   )r,   r-   )METArK   rL   s    r#   r0   z*w8a8_block_fp8_matmul_triton.<locals>.grid   s1    AtN34v{{1d>FZ7[[]]r%      )r9   r:   r;   r<   )lenr2   r1   r,   r-   r.   ndimr6   next_power_of_2r   rw   stride)rF   rG   rI   rJ   r&   rx   block_nblock_krM   C_shaperH   r9   r;   r:   r0   rK   rL   s                  @@r#   w8a8_block_fp8_matmul_tritonr      sG   . z?a!!}jmWG772;!''"+%%%773B<288CR=(Q__->>>;;qwwr{G,<<<		QWWR[ A66Q;1??,A==77DAq;;q'"bhhqk111;;q'"bhhqk111ggcrlaT!G	G<0AL<--a0<,L\!Q&&&L^ 4 			

									
		"
		"
		!
		!!!!16 Hr%   input_qweight_qinput_scaleweight_scalec                 ~   | j                   dk(  r| j                  nd| j                  d   | j                  d   f\  }}}|j                  d   }	| j                  d|      }
|j                  |j                  d   d      }|	|d   z  }||d   z  }t        j                  ||z  |	ft        j
                  | j                        }t        |      D ]  }||d   z  }||d   z   }t        |      D ]  }||d   z  }||d   z   }|
dd||f   }|||||f   }|dd||dz   f   }|||f   }t        j                  ||j                         t        j                  dt        j
                  | j                        ||      |z  }|dd||fxx   |z  cc<     |j                  |||	      }|j                  |      S )a  
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
       rz   r   r)   r   deviceN)scale_ascale_b	out_dtype)r   r2   viewr3   rA   r   r   rB   
_scaled_mmttensorr   )r   r   r   r   r&   rx   
batch_sizeseq_len
hidden_dimout_featuresinput_reshapedinput_scale_reshapednum_weight_blocks_mnum_weight_blocks_noutputim_startm_endjn_startn_endinput_blockweight_blockcurr_input_scalecurr_weight_scaleblock_results                             r#   w8a8_block_fp8_matmul_compiler      s   ( 8?||q7HgmmqRYR_R_`aRbdkdqdqrsdtNu#J>>!$L \\"j1N&++K,=,=a,@"E&*Q-7$
15[[*w.=U]][b[i[ijF&' 5jm#*Q-'*+ 	5A*Q-'Gjm+E )GEM)9:K#GEM75=$@AL  4Aq1q5yLA ,QT 2    NN$!LL%--W-* ##  1gem#$4$/	5	5: [[Wl;F99\""r%   c                        e Zd Zej                  Z	 	 	 	 	 d	dedededee	eef      f fdZ
dej                  dej                  fdZ xZS )
	FP8Linearin_featuresr   biasr&   c                    t         
|   ||       || _        || _        t        j
                  j                  t	        j                  ||t        j                  |            | _
        | j                  j                         dk(  rb||d   z   dz
  |d   z  }||d   z   dz
  |d   z  }	t        j                  t	        j                  ||	t        j                  |            | _        n| j                  dd        || _        || _        |r8t        j                  t	        j                  | j                              | _        y | j                  dd        y )Nr   rz   r   weight_scale_invr   )super__init__r   r   r3   nn	Parameteremptyr   r   weightelement_sizer   r   register_parameterr&   activation_schemer   )selfr   r   r   r   r&   r   r   scale_out_featuresscale_in_features	__class__s             r#   r   zFP8Linear.__init__)  s     	l3&(hh((\;V_VeVent)uv;;##%*".A">"BzRS}!T!,z!}!<q!@ZPQ] R$&LL.0A_ef%D! ##$6=$!2U[[1B1B%CDDI##FD1r%   inputr'   c           	      b   | j                   j                         dkD  r+t        j                  || j                   | j                        S t        || j                  d         \  }}t        j                  j                  |j                        5  t        || j                   || j                  | j                  |j                        }d d d        t        j                  j                          | j                  | j                  z   }j                  |j                        S # 1 sw Y   ^xY w)Nrz   )rx   r*   )r   r   Flinearr   r8   r&   r3   cudar   r   r   r   synchronizer   )r   r   qinputscaler   s        r#   forwardzFP8Linear.forwardK  s    ;;##%)88E4;;		:: &eT__Q-?@MFE ""5<<0 5KK))OO!& JJ""$yy$$))+995;;9// s   :D%%D.)FNNNdynamic)__name__
__module____qualname__r3   r5   r   intboolr   r   r   Tensorr   __classcell__)r   s   @r#   r   r   &  ss    E 04# 2 2  2 	 2 U38_- 2D0U\\ 0ell 0r%   r   c                   	 |g }| j                         D ]<  \  }}|j                  |       t        |t        j                        r||xs g vrdj                  |      	t        	fd|xs g D              st               5  t        |j                  |j                  |j                  du|j                  j                  |j                  j                  |j                  |j                         | j"                  |<   d}ddd       t%        t'        |j)                                     dkD  rt+        ||||||      \  }}|j-                  d       ? | |fS # 1 sw Y   ZxY w)	z%Replace Linear layers with FP8Linear.N.c              3   &   K   | ]  }|v  
 y w)N ).0keycurrent_key_name_strs     r#   	<genexpr>z+_replace_with_fp8_linear.<locals>.<genexpr>u  s     ]ss22]s   )r   r   r   r   r   r   r&   Tr   )has_been_replacedr)   )named_childrenappend
isinstancer   Linearjoinanyr   r   r   r   r   r   r   r   r   weight_block_size_modulesr   listchildren_replace_with_fp8_linearpop)
modeltp_planmodules_to_not_convertcurrent_key_namequantization_configr   namemodule_r   s
            @r#   r   r   d  sW    ,,. !f%fbii(T:P:VTV-W#&88,<#= ]?U?[Y[]]') 
-+4$*$6$6%+%8%8#[[4%}}33$mm11*=*O*O#6#H#H,ENN4( )-%
- tFOO%&'!+#;& #"3$ A  	R ;!> ###3
- 
-s   A>EE#	c                     |dgn|}|j                   |j                  |j                          t        t        |            }t	        | | j
                  ||      \  } }|st        j                  d       | S )z:Helper function to replace model layers with FP8 versions.lm_head)r   r   r   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.)r   extendr   setr   _tp_planloggerwarning)r   r   r   r   s       r#   replace_with_fp8_linearr     s     -C,Ji[Pf11=%%&9&P&PQ!#&<"=>75/	 E <	

 Lr%   )r{   )NNNNF)NN)&typingr   r   r   utilsr   r   r	   r3   torch.nnr   r,   triton.languagelanguager   r
   r   
accelerater   
get_loggerr   r   jit	constexprr$   r   r   r8   rw   r   r   r   compiler   r   r   r   r   r   r%   r#   <module>r      s'    ) ( H H  (- 
		H	% bll  
 
3 
u||U\\?Y9Z 
 Q%4 ,,5Q%6 ,,7Q%8 ,,9Q%: ,,;Q% Q%t !&M||M||M 	M 		M
 S	M ++M \\Mb  -1 %>#\\>#ll># ># ,,	>#
 sCx)># ++># \\># >#B;0		 ;0@ +$`  r%   