
    zIg)                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z d dlZd dlmZmZmZmZ d dlmZ d dlmZmZ ddlmZ ddlmZ dd	lmZmZ  ej<                  d
ej>                          ej@                  e!      Z" G d d      Z# G d de#      Z$ G d de#      Z% G d de#      Z& G d de#      Z'd Z( G d d      Z)d/dZ* G d d      Z+ G d d      Z,d Z-d Z.e!d k(  r e.       Z/e/j`                  re"jc                  ejd                         e/jf                  Z4e/jj                  Z6ee/jn                     Z7ejp                  js                  e6      r!e"ju                  d!e6 d"        e;d!e6 d"      e/jx                  r'e/jz                  d#k(  re"j}                  d$       d%e/_<         ej~                  e4      Z@e/jz                  d#k(  r e&e/j                  e/j                  &      ZCne/jz                  d'k(  r+ e'e/j                  e/jx                  e/j                  e7(      ZCnOe/jz                  d)k(  r e$       ZCn8e/jz                  d*k(  r e%e/j                  +      ZCn eEd,e/jz                          e,e@e/j                  e/j                  eC-      ZGeGj                          eGj                  j                  e6d.       yy)0    )annotationsN)
GraphProto
ModelProto	NodeProtoTensorProto)version)quantize_matmul_4bitsquantize_qdq_matmul_4bits   )CalibrationDataReader)	ONNXModel)QuantFormatattribute_to_kwargz2%(asctime)s %(name)s [%(levelname)s] - %(message)s)formatlevelc                      e Zd Zd Zy)WeightOnlyQuantConfigc                     || _         || _        y)a  This is the Base class for Weight Only Quant Configuration.

        Args:
            algorithm:
                weight only quantize algorithm name.
            quant_format: QuantFormat{QOperator, QDQ}.
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
        N	algorithmquant_format)selfr   r   s      l/var/www/html/answerous/venv/lib/python3.12/site-packages/onnxruntime/quantization/matmul_4bits_quantizer.py__init__zWeightOnlyQuantConfig.__init__    s     #(    N)__name__
__module____qualname__r    r   r   r   r      s    )r   r   c                  8     e Zd Zdej                  f fd	Z xZS )RTNWeightOnlyQuantConfigNc                r    |t         j                  k(  sJ d       |i }t        |   d|       || _        y)aF  
        This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
        RTN is the most straightforward way to quantize weight using scale maps.

        Args:
            ratios:
                percentile of clip. Defaults to {}.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        z"RTN only supports QOperator formatNRTNr   )r   	QOperatorsuperr   ratios)r   r&   r   	__class__s      r   r   z!RTNWeightOnlyQuantConfig.__init__/   sJ    " {444Z6ZZ4>F% 	 	
 r   r   r   r   r   r$   r   __classcell__r'   s   @r   r!   r!   .   s      ** r   r!   c                  D     e Zd Zdddddej                  f	 d fdZ xZS )GPTQWeightOnlyQuantConfigg{Gz?   FTc                    |t         j                  k(  sJ d       t        |   d|       || _        || _        || _        || _        || _        || _	        y)a  
        This is a class for GPTQ algorithm Weight Only Quant Configuration.
        GPTQ algorithm provides more accurate quantization but requires more computational resources.

        Args:
            calibration_data_reader:
                a calibration data reader. It enumerates calibration data and generates inputs for the original model.
            percdamp:
                percent of the average Hessian diagonal to use for dampening.
            block_size (int, optional):
                channel number in one block to execute a GPTQ quantization iteration.
            actorder (bool, optional):
                whether rearrange Hessian matrix considering the diag's value.
            mse (bool, optional):
                whether get scale and zero point with mse error.
            perchannel (bool, optional):
                whether quantize weight per-channel.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        z#GPTQ only supports QOperator formatGPTQr   N)
r   r$   r%   r   calibration_data_readerpercdamp
block_sizeactordermse
perchannel)	r   r0   r1   r2   r3   r4   r5   r   r'   s	           r   r   z"GPTQWeightOnlyQuantConfig.__init__L   se    @ {444[6[[4% 	 	
 (?$ $ $r   )r0   r   r(   r*   s   @r   r,   r,   K   s/      **+%!6+% +%r   r,   c                  <     e Zd Zdddej                  f fd	Z xZS )HQQWeightOnlyQuantConfigr-      r   c                    |t         j                  k(  sJ d       t        |   d|       || _        || _        || _        y)a&  
        This is a class for HQQ algorithm Weight Only Quant Configuration.
        HQQ algorithm quant weight without needing calibrate data.

        Args:
            block_size (int, optional):
                channel number in one block to execute a HQQ quantization iteration.
            bits (int, optional):
                how many bits to represent weight.
            axis (int, optional):
                0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        z"HQQ only supports QOperator formatHQQr   N)r   r$   r%   r   r2   bitsaxis)r   r2   r;   r<   r   r'   s        r   r   z!HQQWeightOnlyQuantConfig.__init__{   sN    . {444Z6ZZ4% 	 	
 %		r   r(   r*   s   @r   r7   r7   z   s!      ** r   r7   c                  H     e Zd Zdddej                  f	 	 	 	 	 d fdZ xZS )DefaultWeightOnlyQuantConfigr-   FNc                `    t         |   d|       || _        || _        d| _        || _        y)a  
        This is a class for weight only affine quantization configuration.

        Args:
            block_size (int, optional):
                channel number in one block to execute an affine quantization iteration.
            is_symmetric (bool, optional):
                whether quantize weight symmetrically.
            accuracy_level (int, optional):
                Accuracy level of the 4-bit quantized MatMul computation.
                Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details.
                (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits)
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
        DEFAULTr   r8   N)r%   r   r2   is_symmetricr;   accuracy_level)r   r2   rA   rB   r   r'   s        r   r   z%DefaultWeightOnlyQuantConfig.__init__   s5    0 	9<H$(	,r   )r2   intrA   boolrB   
int | Noner(   r*   s   @r   r>   r>      s=     "%) **-- - #	- -r   r>   c                P    t        |t        j                  | |z        z        | k(  S N)rC   npceil)val1val2s     r   is_divisiblerL      s$    tbggdTk**+t33r   c                  ^    e Zd Z	 	 ddZe	 	 	 d	 	 	 	 	 d	d       Zed        Z	 d
dZddZy)HQQWeightOnlyQuantizerc                    || _         y rG   configr   rQ   s     r   r   zHQQWeightOnlyQuantizer.__init__   s     r   Nc                   dd l |dddddn|}|d   |d   |d	   |d
   f\  }}}	}
| j                  rj                  nj                  }| j	                  |      }|j	                  |      }|j	                  |      }|dk(  rfd}n|ffd	}d}t        |
      D ]  }j                  ||z  |z         j                  |d   |d         }||z
  |z  } |||z
  |      }j                  |||z
  |z  z
  |d      }||	z  }t        j                  ||z
        j                               }|r t        |t        j                  |d             ||k  r|} n ~~~~||fS )Nr   gffffff?g      $@g)\(?   )lp_normbetakappaitersrU   rV   rW   rX   r   c                    j                  |       j                  j                  j                  j	                  |       d|z  z
        z  S )N      ?)signnn
functionalreluabs)xrV   torchs     r   	shrink_opz:HQQWeightOnlyQuantizer.optimize_weights.<locals>.shrink_op   s=    zz!}uxx':':'?'?		!sUYz@Y'ZZZr   c           
         j                  |       j                  j                  j                  j	                  |       d|z  j                  j	                  |       dz   |dz
        z  z
        z  S )NrZ   g:0yE>r   )r[   r\   r]   r^   r_   pow)r`   rV   pra   s      r   rb   z:HQQWeightOnlyQuantizer.optimize_weights.<locals>.shrink_op   sg    zz!}uxx':':'?'?IIaLC$J%))EIIaL4<OQRUVQV2W#WW(  r   g     @Tr<   keepdim   )ra   is_cudafloat16float32torangeroundclampmeanfloatr_   printrH   )tensorscalezeromin_maxr<   
opt_paramsverboserU   rV   rW   rX   dtypew_frb   
best_erroriw_qw_rw_ecurrent_errorra   s                       @r   optimize_weightsz'HQQWeightOnlyQuantizer.optimize_weights   s    	R\RdcD2Njt
y!vww	'
#ue "(U]]iiwwu~a<[
 &- 
 
u 	A++cEkD0177
GAJOC:&CC#It,C::cS3Y%$77dD:QDEMD!%))C#I"6";";"=>Ma-34z)*
	 c3d{r   c           	        | j                   d   |j                   d   k(  r|j                  }| j                  } |dv rA| j                         dz  |z  }t        |      D ]  }| dd xxx ||d |   ||z  z  z  ccc  y t	        d      )Nr   )   r8      r   zOnly 2,4,8 bits are supported.)shapeTelement_sizerm   NotImplementedError)pack_tensorori_int_tensorr;   compress_ratiojs        r   pack_on_row_fast_248bitz.HQQWeightOnlyQuantizer.pack_on_row_fast_248bit  s    Q>#7#7#::+--N%--K9(557!;tCN>* UAB>!2C^2C#DQR#TTU &&FGGr   c                <   dd l }|j                         }	|	j                  }
||
|   |z  z
  |z  }|dk(  r+|j                  j                  j                  |	d|fdd      }	n,|j                  j                  j                  |	ddd|fdd      }	|	j                  }|-|r+|dk(  r|	j                  d|g      n|	j                  |dg      }	|du r#|	j                         |	j                         }}d}n,|	j                  |d      d   }|	j                  |d      d   }d|z  dz
  }d}||g}|||z
  z  j                  d	
      }||z
  }|dk(  j                         j                         dkD  r|||dk(  <   ||z  j                  d	
      }| |z  }|r|j                  |      }|r| j                  |	||||      \  }}|j                  |	|z  |z         j                  |d   |d         }|j                  |      j                         }d|z  }|dk(  r+|j                  |d   d      }|j                  |d   d      }n*|j                  d|d         }|j                  d|d         }~	~~||j                  |j                         |j                  |j                         fS )Nr   r   constantFTrf   r   g     @)max)rs   rt   ru   rv   r<   rZ   )ra   rq   r   r\   r]   padreshapeminr   ro   sumitemrn   r   rC   rl   ry   )r   rs   r;   channel_wise
group_sizeoptimize
round_zeror<   ra   weight	ori_shapepad_lenr   _min_maxmax_vmin_vrv   rt   min_max_axisru   r}   s                         r   quantize_internalz(HQQWeightOnlyQuantizer.quantize_internal  s    	LL		$* <<
J19XX((,,Va\:qQFXX((,,VaAw5GUVWF ":>!)V^^R$45&..ZdfhYiJjF 5 vzz|$DH::4:6q9D::4:6q9D4!%. $+&--#-6d{A""$))+a/.3L*+\)00S09Euu};;t$D //vUQU_fmq/rKE4 kk&5.4/066wqz71:Nkk% $$&e19MM%(B/E<<a"-DMM"eBi0E<<E"I.DD$EHHV\\*DGGFLL,AAAr   c                n	   |j                   dk7  r|gS ddl}t        j                  d|j                   d       |j
                  d   }t        ||      \  }}|t        j                  d       |gS t        j                  j                  |      }t        |j                        dk7  rt        j                  d	       |gS |j                  |      }|j                  j                         r|j                         }| j                  |j                   | j"                  j$                  | j"                  j&                  
      \  }	}
}|	j)                         }	|
j)                         }
|j)                         }|j+                  |	j                  d   |	j                  d   dz  f|j,                  |	j.                        }| j1                  ||	| j"                  j$                         |
j3                         j5                         }|j3                         j5                         }|j7                  d      }|j7                  d      }|j                  \  }}| j"                  j&                  }|dz  }||z   dz
  |z  }|j7                  |||      }t        j                  j9                  |j3                         j5                               }|j                  dz   |_        |j
                  D ].  }|j                  |k(  s|j
                  j;                  |        n t        j                  j9                  |      }|j                  dz   |_        |j<                  j?                  ||g       |j
                  d   |j                  |j                  g}t        j                  j9                  |      }|j                  dz   |_        |j<                  j?                  |g       |jA                  |j                         i }|j                  \  }}||d<   ||d<   | j"                  j$                  |d<   | j"                  j&                  |d<   t        jB                  jD                  	 d||jF                  d   g|j                  r|j                  dz   nddd|}t        j                  d|j                   d       |gS )
        If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
        If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
        MatMulr   Nstart to quantize  ...r   2MatMul doesn't have const weight. Skip to quantizer   )MatMul weight is not 2D. Skip to quantize)r;   r   )ry   devicer   _Q4_scales_zero_pointsKNr;   r2    com.microsoftinputsoutputsnamedomaincomplete quantization of MatMulNBits)$op_typera   loggerinfor   inputget_initializeronnxnumpy_helperto_arraylenr   
from_numpycudais_availabler   r   rQ   r;   r2   
contiguouszerosuint8r   r   cpunumpyr   
from_arrayremoveinitializerextendappendhelper	make_nodeoutput)r   nodegraph_stackra   input_bb_pbbs_graphb_arrayb_array_torchquant_weight_torchscales_torchzero_points_torchpacked_torchscaleszero_pointsrowscolsr2   	blob_sizek_blocksb_quantr   scales_tensorinput_names	zp_tensorkwargsmatmul_q4_nodes                              r   quantizezHQQWeightOnlyQuantizer.quantizeP  s   
 <<8#6M(489**Q-(+>h<KKLM6M##,,T2w}}"KKCD6M((1::""$)..0M>B>T>TOO$++"2"2t{{?U?U ?U ?
;L*; 0::<#..0-88:{{%%a(*<*B*B1*E*JK++%,, # 

 	$$\3Et{{GWGWX!!#))+'++-335#!))"-"((
d[[++
!O	:%)j8#++D(IF##..|/?/?/A/G/G/IJyy5(^^ 	EzzW$%%e,	
 ))44V<!YY2##Wm$<=zz!}gllM4F4FG%%00=	^3	##YK09>>*]]
dss))v#{{55|..
[[^$&*iiU"R"
 
 	/		{$?@r   )rQ   r7   )r   NF)rv   z	list[int]r<   rC   rw   dict)r8   T@   TTr   r   r   r   list[GraphProto]returnzlist[NodeProto])	r   r   r   r   staticmethodr   r   r   r   r   r   r   rN   rN      sw    (  6 	6
 6 6 6p 	H 	H fg?BBR r   rN   c                    t        t        |      dz
  dd      D ]/  }||   }|j                  D ]  }|j                  | k(  s||fc c S  1 y)Nr   r   )NN)rm   r   r   r   )r   
graph_pathgidgraphrs   s        r   r   r     s\    S_q("b1 %3'' 	%F{{d"u}$	%%
 r   c                  $    e Zd ZddZddZddZy)DefaultWeightOnlyQuantizerc                    || _         y rG   rP   rR   s     r   r   z#DefaultWeightOnlyQuantizer.__init__  s	    r   c           
     z   t        |j                        dk7  rt        d      |j                  \  }}| j                  j                  }||z   dz
  |z  }| j                  j
                  t        j                  k(  r|dz  }||z  }||z
  }|dkD  rt        j                  |d|fdfd      }t        j                  |||fd      }	t        j                  ||dz   dz  z  d      }
t        j                  ||z  |j                        }t        |	|||
|||| j                  j                         nt        j                  ||z  dz   dz  d      }	t        j                  ||z  dz   dz  d      }
t        j                  ||f|j                        }t        |	|||
|||| j                  j                         |	||
fS )	z!4b quantize fp32 weight to a blobr   z9Current int4 block quantization only supports 2D tensors!r   r   )r   r   r   r   )ry   )r   r   
ValueErrorrQ   r2   r   r   r$   rH   r   r   ry   r	   rA   r
   )r   
fp32weightr   r   r2   r   r   padded_rowsr   packed
zero_pointr   s               r   int4_block_quantz+DefaultWeightOnlyQuantizer.int4_block_quant  s    z A%XYY%%
d[[++
:%)j8;;##{'<'<<"aI"Z/K!D(G{VVJ!Wv0F
S
 XXtXy9IF$8a<A*=">gNJXXthz7G7GHF!
FJ
D$PTP[P[PhPh XXtd{Q14GDF4(?Q#61"<GLJXXx.j6F6FGF%
FJ
D$PTP[P[PhPh 
++r   c                6   |j                   dk7  r|gS t        j                  d|j                   d       | j                  j
                  rt        j                  nt        j                  }|j                  d   }t        ||      \  }}|t        j                  d       |gS t        j                  j                  |      }t        |j                        dk7  rt        j                  d       |gS | j!                  |      \  }}	}
| j                  j"                  t$        j&                  k(  r[t        j                  j)                  ||j                  dz         }t        j                  j)                  |	|j                  d	z         }nut        j*                  j-                  |j                  d
z   ||j                  |j/                         d      }t        j                  j)                  |	|j                  dz         }|j                  D ].  }|j                  |k(  s|j                  j1                  |        n |j2                  j5                  ||g       g }| j                  j"                  t$        j&                  k(  ri|j                  d   |j                  |j                  g}| j                  j
                  sdt        j                  j)                  |
|j                  dz         }|j7                  |j                         |j2                  j5                  |g       i }|j                  \  }}||d<   ||d<   d|d<   | j                  j8                  |d<   | j                  j:                  | j                  j:                  |d<   t        j*                  j<                  	 d||j>                  d   g|j                  r|j                  dz   nddd|}|j7                  |       n|j                  |j                  g}|j                  dz   g}|j                  d   |d   g}|j>                  d   g}| j                  j
                  st        j*                  j-                  |j                  dz   ||	j                  |
j/                         d      }|j7                  |j                         |j2                  j5                  |g       d| j                  j8                  d}t        j*                  j<                  	 d|||j                  r|j                  d
z   ndd|}t        j*                  j=                  d|||j                  r|j                  dz   nd      }|j5                  ||g       t        j                  d|j                   d       |S ) r   r   r   r   r   r   r   r   r   r   _DQ_Q4T
_DQ_scalesr   r   r   r   r8   r;   r2   rB   r   r   r   _output_DQ_zero_points)r<   r2   )r   r   r   
_matmul_Q4r   r   )DequantizeLinear) r   r   r   r   rQ   rA   r   INT4UINT4r   r   r   r   r   r   r   r   r   r   r$   r   r   make_tensortobytesr   r   r   r   r2   rB   r   r   )r   r   r   qtyper   b_tensorb_graph	b_ndarrayr   r   r   r   r   r   output_nodesr   r   r   r   r   r   dq_input_namesdq_output_namesmatmul_input_namesmatmul_output_names	dq_kwargsdq_nodematmul_nodes                               r   r   z#DefaultWeightOnlyQuantizer.quantize  s    <<8#6M(489$(KK$<$<  +BSBS**Q-+G[A'KKLM6M%%..x8	y1$KKCD6M&*&;&;I&F#;;##{'<'<<''2268==5;PQG --88QZAZ[Mkk--hmmh.Fy`f`n`n`prvwG --88Q]A]^M]] 	EzzW$$$U+	
 	""G]#;<;;##{'<'<<::a=',,8J8JKK;;++ --88hmmVdFde	""9>>2##**I;7F"JD$F3KF3KF6N#';;#9#9F< {{))5+/;;+E+E'(![[22"Q(*.))TYY&& N /%llM,>,>?N&||i78O"&**Q-1C!D#';;q>"2;;++ KK33MM$55ufllKL_L_Lacg	 %%inn5##**I;7!"$++2H2HIIkk++"%'-1YYTYY)B	
 G ++//)+15TYY-	 0 K + 67/		{$?@r   N)rQ   r>   )r   znpt.ArrayLiker   z)tuple[np.ndarray, np.ndarray, np.ndarray]r   )r   r   r   r   r   r   r   r   r   r   r     s     ,D]r   r   c                  f    e Zd ZdZddddej
                  df	 	 	 	 	 	 	 	 	 d
dZddZd Zd Z	d	 Z
y)MatMul4BitsQuantizerao  
    Perform 4b quantization of constant MatMul weights.
    If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which relaces the
    MatMul node.
    If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is
    replaced by the DequantizeLinear + MatMul nodes.
    r-   FNc                   |g }t        |t              rt        t        j                  |            n
t        |      | _        t        |t              r|nd | _        || _        || _        || _	        t        |      | _        d | _        |t        ||||      }|| _        |j                  dk(  rt!        | j                        | _        y |j                  dk(  rt#        | j                        | _        y y )Nr2   rA   rB   r   r:   r@   )
isinstancestrr   r   loadmodel
model_pathr2   rA   rB   setnodes_to_excludenode_quantizerr>   algo_configr   rN   r   )r   r  r2   rA   rB   r!  r   r#  s           r   r   zMatMul4BitsQuantizer.__init__=  s     #!4>uc4JYtyy/0PYZ_P`
#-eS#9%t$(, #$4 5"6%)-)	K '  E)"89I9I"JD""i/"<T=M=M"ND 0r   c                   g }|d   }|j                   D ]|  }|j                  D cg c]R  }|j                  t        j                  j
                  k(  s'|j                  t        j                  j                  k(  r|T }}t        |      rVi }|j                  D ]  }|j                  t        j                  j
                  k(  r9|j                  |j                         |j                  | j                  |      i}n|j                  t        j                  j                  k(  rTg }	|j                  D ]4  }
|j                  |
       |	j                  | j                  |      g       6 |j                  |	i}nt        |      }|j                  |        t        j                   j"                  |j$                  |j&                  |j(                  fd|j                  i|}g }|j                  | j*                  v r't,        j/                  d|j                   d       |g}n^| j0                  6| j0                  j2                  dk(  r| j4                  j7                  ||      }n| j4                  j7                  ||      }|j                  |        |j9                  d       |j                   j                  |       |j;                          |S c c}w )Nr   r   zexclude to quantize z$ as specified by nodes_to_exclude...r:   r   )r   	attributetyper   AttributeProtoGRAPHGRAPHSr   r   gr   _process_subgraphgraphsr   r   updater   r   r   r   r   r!  r   r   r#  r   r"  r   
ClearFieldpop)r   r   	new_nodesr   r   attrgraph_attrsr   kvvaluesubgraph	out_nodess               r   r+  z&MatMul4BitsQuantizer._process_subgraph]  si   	BJJ "	(D !NN99 3 3 9 99TYY$J]J]JdJd=d K 
 ; NN &DyyD$7$7$=$==#**4662"ii)?)?)LMd&9&9&@&@@ "(, PH'..x8!LL$*@*@*M)NOP #ii//5MM"%& {{,,LL$**dkk@D		MS IyyD1112499+=abc!F	!!-$2B2B2L2LPU2U //88{K	 //88{K	Y'E"	(H 	 

)$Ms   AKc           	     h   i }d| j                   | j                  rdndd}| j                  j                  j                  j                  D ]_  }|j
                  dv st        |j                  D cg c]  }| j                  j                  |      du ! c}      rQ|||j                  <   a |S c c}w )z3Generate weight only quant configuration for nodes.r8   symasym)r;   r   scheme)r   N)
r2   rA   r  r   r   r   allr   r   r   )r   q4_node_configtemplate_config_q4r   r|   s        r   _generate_q4_node_configz-MatMul4BitsQuantizer._generate_q4_node_config  s    //#00ef

 JJ$$**// 	CD||z)4::VaDJJ66q9TAVW0BN499-	C  Ws   0$B/
c                ^     fd}i } j                    j                   |d<    j                         } j                  j                  }t        j                  d| d       |dk(  r\ddlm}  j                  j                  |d	<    |d j                   j                  n j                  j                  |d
| _
        n|dk(  rddlm}  j                  j                  |d<    j                  j                  |d<    j                  j                  |d<    j                  j                  |d<    j                  j                   |d<   d|d<    |       } |d j                   j                  n j                  j                  ||d| _
        t        j                  d| d       y)u  4b quantize a model with RTN or GPTQ algorithm. Please refer to
        https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md
        for more details on weight only quantization using Intel® Neural Compressor.
        c               3  z   K   t        j                  j                  j                        } | D ]  }|d f 
 y wrG   )copydeepcopyr#  r0   )data_readerdatar   s     r   inc_dataloaderz<MatMul4BitsQuantizer.int4_quant_algo.<locals>.inc_dataloader  s:     --(8(8(P(PQK# !Dj !s   8;NrB   zstart to quantize model with z algorithm...r#   r   )rtn_quantizer&   )r  weight_configr/   )gptq_quantizer1   	blocksizer3   r4   r5   r   	n_samples)r  rG  
dataloaderz$complete quantization of model with z algorithm.r   )rB   r>  r#  r   r   r   .neural_compressor.adaptor.ox_utils.weight_onlyrF  r&   r  r  rH  r1   r2   r3   r4   r5   )r   rE  r   weight_only_node_configr   rF  rH  rK  s   `       r   int4_quant_algoz$MatMul4BitsQuantizer.int4_quant_algo  s   	!
 *'+':':F#$"&"?"?"A$$..	3I;mLMS#//66F8% )-)Ddoo$**JZJZ5 DJ
 & T!%!1!1!:!:F:"&"2"2"="=F;!%!1!1!:!:F: ,,00F5M#'#3#3#>#>F< "$F;')J& )-)Ddoo$**JZJZ5% 	DJ 	:9+[QRr   c                x   | j                   j                  dv r| j                  j                         g}| j                   j                  t
        j                  k(  r| j                  j                  dd       n{| j                  j                         }|D ]\  }|j                  dv s|j                  dk  s"t        j                  d       | j                  j                  |j                  d       ^ | j                  |       | j                  j                          y 	 t        j                   d       d
d l}t        j,                  |j.                        t        j,                  d      k\  sJ d       | j1                          y # t"        $ r)}t%        j&                  | d       t)        d	      |d }~ww xY w)N)r:   r@   r   r   )Nzai.onnxr      zThe opset of the input model is under 21 and doesn't support int4 data type. Force to update it to opset 21, but the generated model may not be a valid model.neural_compressor.zLneural-compressor is not correctly installed. Please check your environment.r   z2.3.2zGRequire neural-compressor >= 2.3.2 to support weight only quantization!)r#  r   r  r   r   r   r$   set_opset_importopset_importr   r   r   warningr+  clean_initializers	importlibimport_module	ExceptionloggingerrorRuntimeErrorrQ  parse__version__rN  )r   r   rT  opseterQ  s         r   processzMatMul4BitsQuantizer.process  s}   %%);;::++-.K ,,0E0EE

++OQ?#zz668) FE||'<<QSASp 

33ELL"EF "";/JJ))+''(;< %==!2!>!>?7==D  YXY    "  1g&"bs   !F 	F9$F44F9)
r  zModelProto | strr2   rC   rA   rD   rB   rE   r#  zWeightOnlyQuantConfig | None)r   r   )r   r   r   __doc__r   r$   r   r+  r>  rN  ra  r   r   r   r  r  4  st     "%) **48OO O 	O
 #O 2O@+Z-S^$#r   r  c                &    | j                         dv S )N)true1)lower)r4  s    r   ort_convert_str_to_boolrg    s    ;;=M))r   c            
     h   t        j                  d      } | j                  ddd       | j                  ddd       | j                  d	d
dt        d       | j                  ddt        g dd       | j                  ddt        d       | j                  dd
dddt
        dd
gd       | j                  dd
t        d       | j                  ddd
d !       | j                  d
"       | j                  d#d$t        d
g d%&       | j                  d'd(t        d(d)gd*       | j                         S )+Na
  Blockwise int4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into into blocks, where each block is a
continguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
)descriptionz--input_modelTzPath to the input model file)requiredhelpz--output_modelzPath to the output model filez--block_sizeF    zBlock size for quantization)rj  defaultr&  rk  z--quant_methodrm  )rm  hqqrtngptquW   the algorithm used to quantize weight, 
rtn and gptq leverage Intel® Neural Compressor)rm  r&  choicesrk  z--bitsr8   z#the target bits to represent weight)rm  r&  rk  z--symmetric?zWIndicate whether to quantize the model symmetrically, symmetric is not supported by hqq)rj  rm  constnargsr&  rq  rk  z--accuracy_levelzAccuracy level of the 4-bit quantized MatMul computation. Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).)rj  r&  rk  z-vz	--verbose
store_true)rj  action)rx   z--nodes_to_exclude+zBSpecify the nodes to be excluded from quantization with node names)rt  r&  rj  rm  rk  z--quant_formatr$   QDQzQuantFormat {QOperator, QDQ}QOperator format quantizes the model with quantized operators directly.QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.)argparseArgumentParseradd_argumentrC   r  rg  set_defaults
parse_args)parsers    r   r}  r}    s{   $$F $=[\
(4>]^
Spq
1g   !#<ab
$uf  	 q	   kE,O
&
Q   e$Y   r   __main__zfile z already existsrn  zAsymmetric is not supportted by hqq, will force to symmetric=FalseF)r2   r;   rm  r  ro  rp  )r2   z!Unsupported quantization method: )r  rB   r!  r#  T)r   r   r   ztuple[TensorProto, GraphProto])J
__future__r   ry  rA  rW  rZ  osr   rH   numpy.typingtypingnptr   onnx.onnx_pbr   r   r   r   	packagingr   onnxruntime.capi._pybind_stater	   r
   	calibrater   
onnx_modelr   quant_utilsr   r   basicConfigINFO	getLoggerr   r   r   r!   r,   r7   r>   rL   rN   r   r   r  rg  r}  argsrx   setLevelDEBUGinput_modelinput_model_pathoutput_modeloutput_model_pathr   pathexistsr[  rY  	symmetricquant_methodrU  r  r  r2   r;   quant_configrB   r   r!  quantra  save_model_to_filer   r   r   <module>r     s   #     	    G G  [ , ! 8   OW^WcWc d			8	$) )4 :,% 5 ,%^ 4  F-#8 -@4a  a HC CLw# w#t*;| z<D||&''))t001L	ww~~'(u./?@% 12/BCC~~$++u4Z[DII&'EE!/4??QUQZQZ[			i	'3..%	
 
		e	#/1			f	$0DOOL<T=N=N<OPQQ **.. 	E 
MMO	KK""#4d;S r   