
    IgI                        d Z ddlmZ ddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlm Z   ejB                  e"      Z#dZ$dZ%dZ&dZ'dZ(dZ)dZ*dZ+dZ,dZ- G d de	j\                        Z/ G d dej`                  jb                        Z2 G d d      Z3d Z4 G d  d!ej`                  jb                        Z5 G d" d#e	j\                        Z6 G d$ d%e	j\                        Z7 G d& d'e	j\                        Z8 G d( d)e	j\                        Z9 G d* d+e	j\                        Z: G d, d-e	j\                        Z; G d. d/e	j\                        Z< G d0 d1e	j\                        Z=d2 Z>ej~                  j                  d3        ZAej~                  j                  d4        ZBej~                  j                  d5        ZC G d6 d7e	j\                        ZD G d8 d9e	j\                        ZE G d: d;e      ZFd<ZGd=ZH ed>eG       G d? d@eF             ZI edAeG       G dB dCeF             ZJ G dD dEe	j\                        ZK G dF dGe	j\                        ZL G dH dIe	j\                        ZM edJeG       G dK dLeF             ZN edMeG       G dN dOeF             ZO edPeG       G dQ dReF             ZPy)SzPyTorch DeBERTa model.    )Sequence)OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)softmax_backward_data)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr   zmicrosoft/deberta-basez!lsanochkin/deberta-large-feedbackz' Paris'z0.54z#Palak/microsoft_deberta-large_squadz' a nice puppet'gQ?      c                   4     e Zd Z fdZd Zed        Z xZS )ContextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        |j                        | _        || _	        y N)
super__init__r   Linearpooler_hidden_sizedenseStableDropoutpooler_dropoutdropoutconfigselfr(   	__class__s     i/var/www/html/answerous/venv/lib/python3.12/site-packages/transformers/models/deberta/modeling_deberta.pyr!   zContextPooler.__init__9   sE    YYv88&:S:ST
$V%:%:;    c                     |d d df   }| j                  |      }| j                  |      }t        | j                  j                     |      }|S Nr   )r'   r$   r   r(   pooler_hidden_act)r*   hidden_statescontext_tokenpooled_outputs       r,   forwardzContextPooler.forward?   sM     &ad+]3

=1t{{<<=mLr-   c                 .    | j                   j                  S r   )r(   hidden_sizer*   s    r,   
output_dimzContextPooler.output_dimI   s    {{&&&r-   )__name__
__module____qualname__r!   r4   propertyr8   __classcell__r+   s   @r,   r   r   8   s!     ' 'r-   r   c                   @    e Zd ZdZed        Zed        Zed        Zy)XSoftmaxa  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
        mask (`torch.IntTensor`):
            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax

    Example:

    ```python
    >>> import torch
    >>> from transformers.models.deberta.modeling_deberta import XSoftmax

    >>> # Make a tensor
    >>> x = torch.randn([4, 20, 100])

    >>> # Create a mask
    >>> mask = (x > 0).int()

    >>> # Specify the dimension to apply softmax
    >>> dim = -1

    >>> y = XSoftmax.apply(x, mask, dim)
    ```c                 r   || _         |j                  t        j                         }|j	                  |t        j
                  t        j                  |j                        j                              }t        j                  || j                         }|j                  |d       | j                  |       |S r/   )dimtotorchboolmasked_filltensorfinfodtypeminsoftmaxmasked_fill_save_for_backward)ctxinputmaskrB   rmaskoutputs         r,   r4   zXSoftmax.forwardj   s    ''%**%&""5%,,u{{5;;7O7S7S*TUvsww/E1%f%r-   c                 Z    | j                   \  }t        | ||| j                  |      }|d d fS r   )saved_tensorsr   rB   )rN   grad_outputrR   	inputGrads       r,   backwardzXSoftmax.backwardu   s2    %%	)#{FCGGVT	$$$r-   c                    dd l mc m} ddlm}m} | j                  d||j                  d         }| j                  d| j                  d| j                  dt        j                  dt        j                  	      
      |      |j                  d         } || ||| j                  dt        j                  t        j                  |j                         j                               j                        
            }	 || |	|      }	 || |	|| j                  dt        j                  dt        j                  	      
            S )Nr   )rF   rK   CastLong)to_iSubConstantr   rI   )value_tBool)torch.onnx.symbolic_helperonnxsymbolic_helpertorch.onnx.symbolic_opset9rF   rK   opcast_pytorch_to_onnxrD   rG   int64rH   typerI   rJ   rE   )
gr*   rP   rB   sym_helprF   rK   mask_cast_valuer_maskrR   s
             r,   symboliczXSoftmax.symbolic{   s   55C$$vt(2O2OPV2W$XDDZau{{1STVef..v6  

 tVQTT*ell5;;tyy{O`O`ObCcCgCg6hTi
 FC(1ffadd:u||TU]b]g]gGhd.ijjr-   N)r9   r:   r;   __doc__staticmethodr4   rW   rm    r-   r,   r@   r@   N   sE    6   % %
 k kr-   r@   c                       e Zd Zd Zy)DropoutContextc                 <    d| _         d | _        d| _        d| _        y )Nr   r   T)r'   rP   scale
reuse_maskr7   s    r,   r!   zDropoutContext.__init__   s    	
r-   N)r9   r:   r;   r!   rp   r-   r,   rr   rr      s    r-   rr   c                    t        |t              s|}d }n5|j                  }||j                  z  }|j                  r|j
                  nd }|dkD  rI|Gdt        j                  |       j                  d|z
        z
  j                  t        j                        }t        |t              r|j
                  ||_        ||fS )Nr   r   )
isinstancerr   r'   rt   ru   rP   rD   
empty_like
bernoulli_rC   rE   )rO   local_contextr'   rP   s       r,   get_maskr{      s    m^4''=&&&%2%=%=}!!4{t|E$$U+66q7{CCGG

S-0%!%M=r-   c            	           e Zd ZdZed        Zed        Zedej                  j                  dej                  j                  deeef   dej                  j                  fd       Zy	)
XDropoutzlOptimized dropout function to save computation and memory by using mask operation instead of multiplication.c                     t        ||      \  }}dd|z
  z  | _        |dkD  r0| j                  |       |j                  |d      | j                  z  S |S )Ng      ?r   r   )r{   rt   rM   rF   )rN   rO   	local_ctxrP   r'   s        r,   r4   zXDropout.forward   sX     	2g1w;'	Q;!!$'$$T1-		99Lr-   c                     | j                   dkD  r/| j                  \  }|j                  |d      | j                   z  d fS |d fS )Nr   r   )rt   rT   rF   )rN   rU   rP   s      r,   rW   zXDropout.backward   sF    99q=''GT**43cii?EE$$r-   ri   rO   r   returnc                 v    ddl m} |}t        |t              r|j                  }d}|j	                  | |||      S )Nr   )symbolic_opset12T)
torch.onnxr   rw   rr   r'   )ri   rO   r   r   	dropout_ptrains         r,   rm   zXDropout.symbolic   s=    /	i0!))I  ''5)UCCr-   N)r9   r:   r;   rn   ro   r4   rW   rD   _CGraphValuer   floatrr   rm   rp   r-   r,   r}   r}      s    v  % % DEHHNN D588>> DeESaLaFb Dglgogogugu D Dr-   r}   c                   <     e Zd ZdZ fdZd Zd ZddZd Z xZ	S )r%   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                 L    t         |           || _        d| _        d | _        y r/   )r    r!   	drop_probcountcontext_stack)r*   r   r+   s     r,   r!   zStableDropout.__init__   s$    "
!r-   c                     | j                   r3| j                  dkD  r$t        j                  || j	                               S |S )zr
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        r   )trainingr   r}   applyget_context)r*   xs     r,   r4   zStableDropout.forward   s5     ==T^^a/>>!T%5%5%788r-   c                      d| _         d | _        y r/   )r   r   r7   s    r,   clear_contextzStableDropout.clear_context   s    
!r-   c                 v    | j                   g | _         d| _        | j                   D ]  }||_        ||_         y r/   )r   r   ru   rt   )r*   ru   rt   cs       r,   init_contextzStableDropout.init_context   s@    %!#D
## 	A%ALAG	r-   c                 >   | j                   | j                  t        | j                         k\  r#| j                   j                  t	                      | j                   | j                     }| j
                  |_        | xj                  dz  c_        |S | j
                  S )Nr   )r   r   lenappendrr   r   r'   )r*   rN   s     r,   r   zStableDropout.get_context   sw    )zzS!3!344"")).*:;$$TZZ0C..CKJJ!OJJ>>!r-   )Tr   )
r9   r:   r;   rn   r!   r4   r   r   r   r=   r>   s   @r,   r%   r%      s!    "	"	"r-   r%   c                   *     e Zd ZdZd fd	Zd Z xZS )DebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).c                     t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        y r   )
r    r!   r   	ParameterrD   onesweightzerosbiasvariance_epsilon)r*   sizeepsr+   s      r,   r!   zDebertaLayerNorm.__init__  sH    ll5::d#34LLT!23	 #r-   c                 X   |j                   }|j                         }|j                  dd      }||z
  j                  d      j                  dd      }||z
  t	        j
                  || j                  z         z  }|j                  |      }| j                  |z  | j                  z   }|S )NT)keepdim   )
rI   r   meanpowrD   sqrtr   rC   r   r   )r*   r1   
input_typer   varianceys         r,   r4   zDebertaLayerNorm.forward	  s    "((
%++-!!"d!3!D(--a055b$5G&-HtG\G\<\1]]%((4KK-'$))3r-   )g-q=r9   r:   r;   rn   r!   r4   r=   r>   s   @r,   r   r      s    L$r-   r   c                   $     e Zd Z fdZd Z xZS )DebertaSelfOutputc                     t         |           t        j                  |j                  |j                        | _        t        |j                  |j                        | _        t        |j                        | _        y r   )r    r!   r   r"   r6   r$   r   layer_norm_eps	LayerNormr%   hidden_dropout_probr'   r)   s     r,   r!   zDebertaSelfOutput.__init__  sX    YYv1163E3EF
)&*<*<f>S>ST$V%?%?@r-   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r$   r'   r   r*   r1   input_tensors      r,   r4   zDebertaSelfOutput.forward  7    

=1]3}|'CDr-   r9   r:   r;   r!   r4   r=   r>   s   @r,   r   r     s    Ar-   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )DebertaAttentionc                 p    t         |           t        |      | _        t	        |      | _        || _        y r   )r    r!   DisentangledSelfAttentionr*   r   rR   r(   r)   s     r,   r!   zDebertaAttention.__init__#  s-    -f5	'/r-   c                 z    | j                  ||||||      }|r|\  }}||}| j                  ||      }	|r|	fS |	S )N)query_statesrelative_posrel_embeddings)r*   rR   )
r*   r1   attention_maskoutput_attentionsr   r   r   self_output
att_matrixattention_outputs
             r,   r4   zDebertaAttention.forward)  sh     ii%%)   
 &1#K(L;;{LA$j11##r-   FNNNr   r>   s   @r,   r   r   "  s      $r-   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DebertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r    r!   r   r"   r6   intermediate_sizer$   rw   
hidden_actstrr   intermediate_act_fnr)   s     r,   r!   zDebertaIntermediate.__init__H  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r-   r1   r   c                 J    | j                  |      }| j                  |      }|S r   )r$   r   r*   r1   s     r,   r4   zDebertaIntermediate.forwardP  s&    

=100?r-   )r9   r:   r;   r!   rD   Tensorr4   r=   r>   s   @r,   r   r   G  s#    9U\\ ell r-   r   c                   $     e Zd Z fdZd Z xZS )DebertaOutputc                    t         |           t        j                  |j                  |j
                        | _        t        |j
                  |j                        | _	        t        |j                        | _        || _        y r   )r    r!   r   r"   r   r6   r$   r   r   r   r%   r   r'   r(   r)   s     r,   r!   zDebertaOutput.__init__W  s_    YYv779K9KL
)&*<*<f>S>ST$V%?%?@r-   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r,   r4   zDebertaOutput.forward^  r   r-   r   r>   s   @r,   r   r   V  s    r-   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )DebertaLayerc                     t         |           t        |      | _        t	        |      | _        t        |      | _        y r   )r    r!   r   	attentionr   intermediater   rR   r)   s     r,   r!   zDebertaLayer.__init__f  s3    )&1/7#F+r-   c                     | j                  ||||||      }|r|\  }}| j                  |      }	| j                  |	|      }
|r|
fS |
S )Nr   r   r   r   )r   r   rR   )r*   r1   r   r   r   r   r   r   r   intermediate_outputlayer_outputs              r,   r4   zDebertaLayer.forwardl  sr      >>/%%) * 
 +;(j"//0@A{{#68HI *--r-   )NNNFr   r>   s   @r,   r   r   e  s    ,  r-   r   c                   H     e Zd ZdZ fdZd Zd ZddZ	 	 	 	 	 ddZ xZ	S )	DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        t        |dd      | _	        | j                  rdt        |dd      | _
        | j                  dk  r|j                  | _
        t        j                  | j                  dz  |j                        | _        d| _        y c c}w )Nrelative_attentionFmax_relative_positionsr   r   r   )r    r!   r   
ModuleListrangenum_hidden_layersr   layergetattrr   r   max_position_embeddings	Embeddingr6   r   gradient_checkpointing)r*   r(   _r+   s      r,   r!   zDebertaEncoder.__init__  s    ]]%H`H`Ba#bQL$8#bc
")&2F"N""*1&:RTV*WD'**Q..4.L.L+"$,,t/J/JQ/NPVPbPb"cD&+# $cs   Cc                 R    | j                   r| j                  j                  }|S d }|S r   )r   r   r   )r*   r   s     r,   get_rel_embeddingz DebertaEncoder.get_rel_embedding  s0    7;7N7N,,33 UYr-   c                     |j                         dk  rE|j                  d      j                  d      }||j                  d      j                  d      z  }|S |j                         dk(  r|j                  d      }|S )Nr   r   r   r   )rB   	unsqueezesqueeze)r*   r   extended_attention_masks      r,   get_attention_maskz!DebertaEncoder.get_attention_mask  s    1$&4&>&>q&A&K&KA&N#47N7V7VWY7Z7d7deg7hhN  !Q&+55a8Nr-   c                     | j                   rL|J||j                  d      n|j                  d      }t        ||j                  d      |j                        }|S )Nr   )r   r   build_relative_positiondevice)r*   r1   r   r   qs        r,   get_rel_poszDebertaEncoder.get_rel_pos  sX    ""|';)5)A!!"%}GYGYZ\G]A21m6H6H6LmNbNbcLr-   c           
      ~   | j                  |      }| j                  |||      }|rdnd }|rdnd }	t        |t              r|d   }
n|}
| j	                         }t        | j                        D ]  \  }}|r||fz   }| j                  r.| j                  r"| j                  |j                  |
|||||      }n ||
|||||      }|r|\  }}|8|}t        |t              r(|dz   t        | j                        k  r||dz      nd }
n|}
|s|	fz   }	 |r||fz   }|st        d |||	fD              S t        |||	      S )Nrp   r   )r   r   r   r   r   c              3   &   K   | ]	  }||  y wr   rp   ).0vs     r,   	<genexpr>z)DebertaEncoder.forward.<locals>.<genexpr>  s     hqZ[Zghs   last_hidden_stater1   
attentions)r   r   rw   r   r   	enumerater   r   r   _gradient_checkpointing_func__call__r   tupler   )r*   r1   r   output_hidden_statesr   r   r   return_dictall_hidden_statesall_attentionsnext_kvr   ilayer_moduleatt_ms                  r,   r4   zDebertaEncoder.forward  s    00@''|\R"6BD0dmX.#A&G#G//1(4 #	;OA|#$58H$H!**t}} $ A A ))"  "%! !-"!-!-#1&7! !'4$u',mX667!ec$**o6MmAE2SWG' !/5(!:G#	;J   1]4D Dh]4E~$Vhhh+;LYg
 	
r-   )NN)TFNNT)
r9   r:   r;   rn   r!   r   r   r   r4   r=   r>   s   @r,   r   r     s1    B	, "A
r-   r   c                 .   t        j                  | t         j                  |      }t        j                  |t         j                  |      }|dddf   |j                  dd      j	                  | d      z
  }|d| ddf   }|j                  d      }|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    rI   r   Nr   r   r   )rD   arangelongviewrepeatr   )
query_sizekey_sizer   q_idsk_idsrel_pos_idss         r,   r   r     s    " LL5::fEELLFCE4.5::a#4#;#;J#JJKkzk1n-K''*Kr-   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   r   r   expandr   )c2p_posquery_layerr   s      r,   c2p_dynamic_expandr#    sI    >>;++A.0@0@0C[EUEUVWEXZfZkZklnZopqqr-   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   r   r  )r!  r"  	key_layers      r,   p2c_dynamic_expandr&  
  sG    >>;++A.0@0@0CY^^TVEWYbYgYghjYklmmr-   c                     | j                  |j                         d d | j                  d      |j                  d      fz         S )Nr   r   r  )	pos_indexp2c_attr%  s      r,   pos_dynamic_expandr*    s=    GLLN2A.)..2DinnUWFX1YYZZr-   c                   >     e Zd ZdZ fdZd Z	 	 	 	 ddZd Z xZS )r   a  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                    t         |           |j                  |j                  z  dk7  r&t	        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                  dz  d      | _
        t        j                  t        j                  | j                  t        j                              | _        t        j                  t        j                  | j                  t        j                              | _        |j"                  |j"                  ng | _        t%        |d	d      | _        t%        |d
d      | _        | j(                  rbt        j                  |j                  |j                  d      | _        t        j                  |j                  |j                  d      | _        | j&                  rt%        |dd      | _        | j.                  dk  r|j0                  | _        t3        |j4                        | _        d| j"                  v r1t        j                  |j                  | j                  d      | _        d| j"                  v r/t        j                  |j                  | j                        | _        t3        |j<                        | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   Fr   r^   r   talking_headr   r   r   c2pp2c) r    r!   r6   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   r"   in_projr   rD   r   r   q_biasv_biaspos_att_typer   r   r/  head_logits_projhead_weights_projr   r   r%   r   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probr'   r)   s     r,   r!   z"DisentangledSelfAttention.__init__  sU    : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W !558P8PPyy!3!3T5G5G!5KRWXll5;;0B0B5;;#WXll5;;0B0B5;;#WX393F3F3RF//XZ")&2F"N#FNEB$&IIf.H.H&JdJdkp$qD!%'YYv/I/I6KeKelq%rD"""*1&:RTV*WD'**Q..4.L.L+,V-G-GHD))) "		&*<*<d>P>PW\ ])))"$))F,>,>@R@R"S$V%H%HIr-   c                     |j                         d d | j                  dfz   }|j                  |      }|j                  dddd      S )Nr   r   r   r   r   )r   r2  r  permute)r*   r   new_x_shapes      r,   transpose_for_scoresz.DisentangledSelfAttention.transpose_for_scoresB  sF    ffhsmt'?'?&DDFF;yyAq!$$r-   c                    |9| j                  |      }| j                  |      j                  dd      \  }}	}
n1d }| j                   j                  j                  | j                  dz  d      }t        d      D cg c]C  }t        j                  t        | j                        D cg c]  }||dz  |z       c}d      E }}}dgdz  } ||d   |d   |j                  |d   j                              }t        dd      D cg c]/  } |||   ||   |j                  ||   j                              1 c}\  }}|||fD cg c]  }| j                  |       c}\  }}	}
|| j                  | j                  ddddf         z   }|
| j                  | j                  ddddf         z   }
d}dt        | j                        z   }t        j                  t        j                  |j!                  d      t        j"                        |z        }||j                  |j                        z  }t        j$                  ||	j'                  dd	            }| j(                  r&| j+                  |      }| j-                  ||	|||      }|||z   }| j.                  r5| j1                  |j3                  dd
dd            j3                  dddd
      }t4        j7                  ||d      }| j9                  |      }| j.                  r5| j;                  |j3                  dd
dd            j3                  dddd
      }t        j$                  ||
      }|j3                  dd
dd      j=                         }|j!                         dd	 dz   }|j?                  |      }|r||fS |S c c}w c c}}w c c}w c c}w )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   r   rB   c                     |5t        j                  || j                               |j                         z   S t        j                  || j                               S r   )rD   matmult)wbr   s      r,   linearz1DisentangledSelfAttention.forward.<locals>.linearr  s@    = <<13351ACCE99 <<133511r-   r   r^   r   r   r   )r   ) r7  rD  chunkr   r2  r   rD   catrC   rI   r8  r9  r   r:  r   rG   r   r   rH  	transposer   r=  disentangled_att_biasr/  r;  rB  r@   r   r'   r<  
contiguousr  )r*   r1   r   r   r   r   r   qpr"  r%  value_layerrL  wskr  qkvwqkvbr   r  r   rel_attscale_factorrt   attention_scoresattention_probscontext_layernew_context_layer_shapes                              r,   r4   z!DisentangledSelfAttention.forwardG  s   L m,B262K2KB2O2U2UVW]_2U2`/KK2 $$**4+C+Ca+GQ*OBhmnohpqcdEIIeD<T<T6UVr!a%!)}V\]^qDq6A:DtAwQtAw}})MNA]bcdfg]hiXYF47DG]-=-=DGMM-=-RSiDAqZ[]^`aYb2cTU43L3LQ3O2c/KK!D$=$=dkk$PTVW->X$YY!D$=$=dkk$PTVW->X$YY3t0011

5<<(8(8(<EKKPS__`!EHH;3D3DH$EE <<Y5H5HR5PQ""!--n=N00iWegstG/'9 #445E5M5MaQRTUWX5YZbbcdfgijlmn"..)9>2N,,7"44_5L5LQPQSTVW5XYaabcefhiklmO_kB%--aAq9DDF"/"4"4"6s";e"C%**+BC!?33  O Wq j2cs$   +N1,N,>N14N7N<,N1c           	         |7|j                  d      }t        ||j                  d      |j                        }|j                         dk(  r!|j	                  d      j	                  d      }nT|j                         dk(  r|j	                  d      }n/|j                         dk7  rt        d|j                                t        t        |j                  d      |j                  d            | j                        }|j                         j                  |j                        }|| j                  |z
  | j                  |z   d d f   j	                  d      }d}d| j                  v r| j                  |      }	| j                  |	      }	t        j                  ||	j!                  d	d            }
t        j"                  ||z   d|dz  dz
        }t        j$                  |
d	t'        |||      
      }
||
z  }d| j                  v r| j)                  |      }| j                  |      }|t        j*                  t        j,                  |j                  d	      t        j.                        |z        z  }|j                  d      |j                  d      k7  r6t        |j                  d      |j                  d      |j                        }n|}t        j"                  | |z   d|dz  dz
        }t        j                  ||j!                  d	d      j                  |j0                              }t        j$                  |d	t3        |||      
      j!                  d	d      }|j                  d      |j                  d      k7  rA|d d d d d d df   j	                  d	      }t        j$                  |dt5        |||      
      }||z  }|S )Nr   r   r   r   r      z2Relative position ids must be of dim 2 or 3 or 4. r0  r   )rB   indexr1  r^   )r   r   r   rB   r   r3  rJ   maxr   r  rC   r:  r>  rD  rD   rH  rO  clampgatherr#  r?  r   rG   r   rI   r&  r*  )r*   r"  r%  r   r   rY  r   att_spanscorepos_key_layerc2p_attr!  pos_query_layerr_posp2c_posr)  r(  s                    r,   rP  z/DisentangledSelfAttention.disentangled_att_bias  sk     $A21innR6H+J\J\]L"'11!4>>qAL1$'11!4L1$QR^RbRbRdQefggs;++B/1CDdFaFab#((*--k.@.@A'''(2T5P5PS[5[[]^^

)A, 	  D%%% MM.9M 55mDMll;0G0GB0OPGkk,"91hlQ>NOGll7:LWVaco:pqGWE D%%%"oon=O"77HOuzz%,,7K7KB7OW\WbWb*cfr*rssO#y~~b'99/	r0BINNSUDVXcXjXjk$kk5&8"3Q1q8HIGll9o.G.GB.O.R.RYbYhYh.R.ijGllR'9';PY'ZiB  #y~~b'99(Aq!4>>rB	,,wB>PQZ\cen>opWEr-   r   )	r9   r:   r;   rn   r!   rD  r4   rP  r=   r>   s   @r,   r   r     s.    !JF%  Y!v1r-   r   c                   *     e Zd ZdZ fdZddZ xZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        |dd      }t        |d|j                        | _        t        j                  |j                  | j                  |      | _        t        |dd      | _	        | j                  sd | _
        n/t        j                  |j                  | j                        | _
        |j                  dkD  r/t        j                  |j                  | j                        | _        | j                  |j                  k7  r1t        j                  | j                  |j                  d      | _        t!        |j                  |j"                        | _        t'        |j(                        | _        || _        | j/                  d	t1        j2                  |j                        j5                  d
      d       y )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr.  position_ids)r   r   )
persistent)r    r!   r   r6   ro  r   r   
vocab_sizeword_embeddingsrq  position_embeddingsr   type_vocab_sizetoken_type_embeddingsr"   
embed_projr   r   r   r%   r   r'   r(   register_bufferrD   r  r   )r*   r(   rn  r+   s      r,   r!   zDebertaEmbeddings.__init__  sf   v~q9%f.>@R@RS!||F,=,=t?R?R`lm%,V5Ld%S"))'+D$')||F4R4RTXTgTg'hD$!!A%)+f6L6LdNaNa)bD&&"4"44 ii(;(;V=O=OV[\DO)&*<*<f>S>ST$V%?%?@ 	ELL)G)GHOOPWXej 	 	
r-   c                    ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                   | j                  |j	                               }nt        j                  |      }|}	| j                  r|	|z  }	| j                  j                  dkD  r| j                  |      }
|	|
z  }	| j                  | j                  j                  k7  r| j                  |	      }	| j!                  |	      }	||j#                         |	j#                         k7  rD|j#                         dk(  r |j%                  d      j%                  d      }|j'                  d      }|j)                  |	j*                        }|	|z  }	| j-                  |	      }	|	S )Nr   r   r  r   r_  r   )r   rr  rD   r   r  r   ru  rv  
zeros_likerq  r(   rw  rx  ro  r6   ry  r   rB   r   r   rC   rI   r'   )r*   	input_idstoken_type_idsrr  rP   inputs_embedsinput_shape
seq_lengthrv  
embeddingsrx  s              r,   r4   zDebertaEmbeddings.forward  s    #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M##/"&":":<;L;L;N"O"'"2"2="A"
%%--J;;&&*$($>$>~$N!//J$++"9"994J^^J/
xxzZ^^--88:?<<?2215D~~a(77:++,D#d*J\\*-
r-   )NNNNNr   r>   s   @r,   rl  rl    s    Q
4,r-   rl  c                   (    e Zd ZdZeZdZdgZdZd Z	y)DebertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertarv  Tc                 :   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyy)zInitialize the weights.g        )r   stdN)rw   r   r"   r   datanormal_r(   initializer_ranger   zero_r   rp  )r*   modules     r,   _init_weightsz$DebertaPreTrainedModel._init_weights-  s    fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> . .r-   N)
r9   r:   r;   rn   r   config_classbase_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr  rp   r-   r,   r  r  "  s(    
 !L!*?)@&&*#?r-   r  a  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.


    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                   \    e Zd Z fdZd Zd Zd Z eej                  d             e
eee      	 	 	 	 	 	 	 	 ddeej                      deej                      d	eej                      d
eej                      deej                      dee   dee   dee   deeef   fd              Z xZS )DebertaModelc                     t         |   |       t        |      | _        t	        |      | _        d| _        || _        | j                          y r/   )	r    r!   rl  r  r   encoderz_stepsr(   	post_initr)   s     r,   r!   zDebertaModel.__init__}  s@     +F3%f-r-   c                 .    | j                   j                  S r   r  ru  r7   s    r,   get_input_embeddingsz!DebertaModel.get_input_embeddings  s    ...r-   c                 &    || j                   _        y r   r  r*   new_embeddingss     r,   set_input_embeddingsz!DebertaModel.set_input_embeddings  s    *8'r-   c                     t        d      )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r*   heads_to_prunes     r,   _prune_headszDebertaModel._prune_heads  s    
 ""[\\r-   batch_size, sequence_length
checkpointoutput_typer  r}  r   r~  rr  r  r   r  r  r   c	           	      |   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }	n!||j                         d d }	nt	        d      ||j                  n|j                  }
|t        j                  |	|
      }|&t        j                  |	t        j                  |
      }| j                  |||||      }| j                  ||d||      }|d	   }| j                  d	kD  r|d
   }t        | j                        D cg c]  }| j                  j                   d    }}|d   }| j                  j#                         }| j                  j%                  |      }| j                  j'                  |      }|d	d  D ]!  } |||d|||      }|j)                  |       # |d   }|s|f||rd	d  z   S dd  z   S t+        ||r|j,                  nd |j.                        S c c}w )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r  )r}  r~  rr  rP   r  T)r  r   r  r   r   Fr   r   r  )r(   r   r  use_return_dictr3  %warn_if_padding_and_no_attention_maskr   r   rD   r   r   r  r  r  r  r   r   r   r   r   r   r   r1   r  )r*   r}  r   r~  rr  r  r   r  r  r  r   embedding_outputencoder_outputsencoded_layersr1   r   layersr   r   rel_posr   sequence_outputs                         r,   r4   zDebertaModel.forward  sz   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN??)%' + 
 ,,!%/# ' 
 )+<<!*2.M6;DLL6IJdll((,JFJ)"-L!\\;;=N!\\<<^LNll../?@G 	4$!"&+!-!(#1  %%l3	4 ),#%>R8\(]]]XY8\(]]]-;O/77UY&11
 	
+ Ks    H9)NNNNNNNN)r9   r:   r;   r!   r  r  r  r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r  x  s   
/9] ++C+J+JKh+ij&#$ -11515/304,0/3&*N
ELL)N
 !.N
 !.	N

 u||,N
  -N
 $D>N
 'tnN
 d^N
 
uo%	&N
 kN
r-   r  z5DeBERTa Model with a `language modeling` head on top.c                       e Zd ZddgZ fdZd Zd Z eej                  d             e
eeedee      	 	 	 	 	 	 	 	 	 dd	eej$                     d
eej$                     deej$                     deej$                     deej$                     deej$                     dee   dee   dee   deeef   fd              Z xZS )DebertaForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r    r!   r  r  DebertaOnlyMLMHeadclsr  r)   s     r,   r!   zDebertaForMaskedLM.__init__  s4     #F+%f- 	r-   c                 B    | j                   j                  j                  S r   )r  predictionsdecoderr7   s    r,   get_output_embeddingsz(DebertaForMaskedLM.get_output_embeddings  s    xx##+++r-   c                     || j                   j                  _        |j                  | j                   j                  _        y r   )r  r  r  r   r  s     r,   set_output_embeddingsz(DebertaForMaskedLM.set_output_embeddings  s,    '5$$2$7$7!r-   r  z[MASK])r  r  r  rP   expected_outputexpected_lossr}  r   r~  rr  r  labelsr   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r~  rr  r  r   r  r  r   r   r   losslogitsr1   r  )
r(   r  r  r  r	   r  rt  r   r1   r  )r*   r}  r   r~  rr  r  r  r   r  r  outputsr  prediction_scoresmasked_lm_lossloss_fctrR   s                   r,   r4   zDebertaForMaskedLM.forward  s    8 &1%<k$++B]B],,))%'/!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r-   	NNNNNNNNN)r9   r:   r;   _tied_weights_keysr!   r  r  r   r  r  r   _CHECKPOINT_FOR_MASKED_LMr   r  _MASKED_LM_EXPECTED_OUTPUT_MASKED_LM_EXPECTED_LOSSr   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r    s/   :<Z[,8 ++C+J+JKh+ij,"$2. -11515/304)-,0/3&*1
ELL)1
 !.1
 !.	1

 u||,1
  -1
 &1
 $D>1
 'tn1
 d^1
 
un$	%1
 k1
r-   r  c                   $     e Zd Z fdZd Z xZS )DebertaPredictionHeadTransformc                    t         |           t        |d|j                        | _        t        j                  |j                  | j                        | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  | j                  |j                        | _        y )Nro  )r   )r    r!   r   r6   ro  r   r"   r$   rw   r   r   r   transform_act_fnr   r   r)   s     r,   r!   z'DebertaPredictionHeadTransform.__init__=  s    %f.>@R@RSYYv1143F3FG
f''-$*6+<+<$=D!$*$5$5D!d&9&9v?T?TUr-   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r$   r  r   r   s     r,   r4   z&DebertaPredictionHeadTransform.forwardH  s4    

=1--m<}5r-   r   r>   s   @r,   r  r  <  s    	Vr-   r  c                   *     e Zd Z fdZd Zd Z xZS )DebertaLMPredictionHeadc                    t         |           t        |      | _        t	        |d|j
                        | _        t        j                  | j                  |j                  d      | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y )Nro  Fr.  )r    r!   r  	transformr   r6   ro  r   r"   rt  r  r   rD   r   r   r)   s     r,   r!   z DebertaLMPredictionHead.__init__P  s    7?%f.>@R@RS yy!4!4f6G6GeTLLV->->!?@	 !IIr-   c                 :    | j                   | j                  _         y r   )r   r  r7   s    r,   _tie_weightsz$DebertaLMPredictionHead._tie_weights^  s     IIr-   c                 J    | j                  |      }| j                  |      }|S r   )r  r  r   s     r,   r4   zDebertaLMPredictionHead.forwarda  s$    }5]3r-   )r9   r:   r;   r!   r  r4   r=   r>   s   @r,   r  r  O  s    &&r-   r  c                   $     e Zd Z fdZd Z xZS )r  c                 B    t         |           t        |      | _        y r   )r    r!   r  r  r)   s     r,   r!   zDebertaOnlyMLMHead.__init__i  s    26:r-   c                 (    | j                  |      }|S r   )r  )r*   r  r  s      r,   r4   zDebertaOnlyMLMHead.forwardm  s     ,,_=  r-   r   r>   s   @r,   r  r  h  s    ;!r-   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   v    e Zd Z fdZd Zd Z eej                  d             e	e
ee      	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   deeef   fd              Z xZS ) DebertaForSequenceClassificationc                 |   t         |   |       t        |dd      }|| _        t	        |      | _        t        |      | _        | j                  j                  }t        j                  ||      | _        t        |dd       }|| j                  j                  n|}t        |      | _        | j!                          y )N
num_labelsr   cls_dropout)r    r!   r   r  r  r  r   poolerr8   r   r"   
classifierr(   r   r%   r'   r  )r*   r(   r  r8   drop_outr+   s        r,   r!   z)DebertaForSequenceClassification.__init__z  s     V\15
$#F+#F+[[++
))J
;6=$76>6F4;;22H$X. 	r-   c                 6    | j                   j                         S r   )r  r  r7   s    r,   r  z5DebertaForSequenceClassification.get_input_embeddings  s    ||0022r-   c                 :    | j                   j                  |       y r   )r  r  r  s     r,   r  z5DebertaForSequenceClassification.set_input_embeddings  s    )).9r-   r  r  r}  r   r~  rr  r  r  r   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }| j                  |      }d}|| j                   j                  | j                  dk(  rXt        j                         }|j                  d      j                  |j                        } |||j                  d            }n_|j                         dk(  s|j                  d      dk(  r|dk\  j                         }|j!                         }|j                  d      dkD  rt#        j$                  |d|j'                  |j                  d      |j                  d                  }t#        j$                  |d|j                  d            }t)               } ||j                  d| j                        j+                         |j                  d            }nIt#        j,                  d      j                  |      }n#t        j.                  d      } ||      |z  j1                  d      j3                          }n| j                   j                  dk(  rIt               }| j                  dk(  r& ||j5                         |j5                               }n |||      }n| j                   j                  dk(  r=t)               } ||j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt7               } |||      }|	s|f|
dd z   }||f|z   S |S t9        |||
j:                  |
j<                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r~  r   rr  r  r   r  r  r   r   r   
regressionsingle_label_classificationmulti_label_classificationr  )r(   r  r  r  r'   r  problem_typer  r   r
   r  rC   rI   rB   r   nonzeror  rD   rc  r   r	   r   rG   
LogSoftmaxsumr   r   r   r   r1   r  )r*   r}  r   r~  rr  r  r  r   r  r  r  encoder_layerr3   r  r  loss_fnlabel_indexlabeled_logitsr  log_softmaxrR   s                        r,   r4   z(DebertaForSequenceClassification.forward  s   0 &1%<k$++B]B],,))%'/!5#  	
  
M2]3/{{''/??a' jjlG#[[_//=F"66;;r?;DZZ\Q&&++b/Q*>#)Q;"7"7"9K#[[]F"''*Q.)."A{'9'9+:J:J1:Mv{{[\~'^* "'fa9I9I"9M!N#3#5'(;(;B(P(V(V(XZ`ZeZefhZij$||A11&9"$--"3K)&1F:??CIIKKD))\9"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'fG4I4IV]VhVh
 	
r-   r  )r9   r:   r;   r!   r  r  r   r  r  r   r  r   r  r   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r  r  s'   $3: ++C+J+JKh+ij&,$ -11515/304)-,0/3&*M
ELL)M
 !.M
 !.	M

 u||,M
  -M
 &M
 $D>M
 'tnM
 d^M
 
u..	/M
 kM
r-   r  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                   j    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
ee   dee   dee   deee	f   fd              Z xZS )DebertaForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   )r    r!   r  r  r  r   Dropoutr   r'   r"   r6   r  r  r)   s     r,   r!   z&DebertaForTokenClassification.__init__  si      ++#F+zz&"<"<=))F$6$68I8IJ 	r-   r  r  r}  r   r~  rr  r  r  r   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r   r  )r(   r  r  r'   r  r	   r  r  r   r1   r  )r*   r}  r   r~  rr  r  r  r   r  r  r  r  r  r  r  rR   s                   r,   r4   z%DebertaForTokenClassification.forward  s    , &1%<k$++B]B],,))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$fG4I4IV]VhVh
 	
r-   r  )r9   r:   r;   r!   r   r  r  r   r  r   r  r   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r    s   	 ++C+J+JKh+ij&)$ -11515/304)-,0/3&*-
ELL)-
 !.-
 !.	-

 u||,-
  --
 &-
 $D>-
 'tn-
 d^-
 
u++	,-
 k-
r-   r  z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ eej                  d             eee	e
eeee      	 	 	 	 	 	 	 	 	 	 ddeej"                     deej"                     deej"                     deej"                     deej"                     d	eej"                     d
eej"                     dee   dee   dee   deee	f   fd              Z xZS )DebertaForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r    r!   r  r  r  r   r"   r6   
qa_outputsr  r)   s     r,   r!   z$DebertaForQuestionAnswering.__init__9  sS      ++#F+))F$6$68I8IJ 	r-   r  )r  r  r  r  r  qa_target_start_indexqa_target_end_indexr}  r   r~  rr  r  start_positionsend_positionsr   r  r  r   c           
      &   |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r   rF  )ignore_indexr   )r  start_logits
end_logitsr1   r  )r(   r  r  r  splitr   rQ  r   r   rb  r	   r   r1   r  )r*   r}  r   r~  rr  r  r  r  r   r  r  r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossrR   s                         r,   r4   z#DebertaForQuestionAnswering.forwardC  s   B &1%<k$++B]B],,))%'/!5#  	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r-   )
NNNNNNNNNN)r9   r:   r;   r!   r   r  r  r   _CHECKPOINT_FOR_QAr   r  _QA_EXPECTED_OUTPUT_QA_EXPECTED_LOSS_QA_TARGET_START_INDEX_QA_TARGET_END_INDEXr   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r  1  s@    ++C+J+JKh+ij%0$+'40 -11515/3042604,0/3&*F
ELL)F
 !.F
 !.	F

 u||,F
  -F
 "%,,/F
  -F
 $D>F
 'tnF
 d^F
 
u22	3F
 kF
r-   r  )Qrn   collections.abcr   typingr   r   r   rD   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   configuration_debertar   
get_loggerr9   loggerr  r  r  r  r  r  r  r  r  r  Moduler   autogradFunctionr@   rr   r{   r}   r%   r   r   r   r   r   r   r   r   jitscriptr#  r&  r*  r   rl  r  DEBERTA_START_DOCSTRINGr  r  r  r  r  r  r  r  r  rp   r-   r,   <module>r#     s!    $ ) )    A A !  . 2 u u 0 
		H	%!.  @ ' !  ; (    'BII ',<ku~~&& <k~ &$Du~~&& $DN."BII ."bryy (		 !$ryy !$J")) BII  299  Db
RYY b
J2 r r n n [ [		 DI		 IX?_ ?2 ") X gl
) l
	l
^ QSjkM
/ M
 lM
`RYY &bii 2! !  l
'= l
l
^  ?
$: ?
?
D  [
"8 [
[
r-   