
import inspect
import os
from typing import Optional, Tuple

import torch
import torch.nn.functional as F

from .utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal


if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input

    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)


def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
    """
    Retrieves indexing data required to repad unpadded (ragged) tensors.

    Arguments:
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

    Return:
        indices (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input sequence.
        cu_seqlens (`torch.Tensor`):
            The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        max_seqlen_in_batch (`int`):
            Maximum sequence length in batch.
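
    Example (illustrative sketch of the returned values; assumes a toy mask with two
    sequences of lengths 3 and 2 padded to length 4):

        >>> mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
        >>> indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
        >>> # indices    -> tensor([0, 1, 2, 4, 5])               (flat positions of the 5 valid tokens)
        >>> # cu_seqlens -> tensor([0, 3, 5], dtype=torch.int32)
        >>> # max_seqlen -> 3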
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def _upad_input(
    query_layer: torch.Tensor,
    key_layer: torch.Tensor,
    value_layer: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
):
    """
    Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches.

    This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary
    tensors for query, key, value tensors.

    Arguments:
        query_layer (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
        query_length (`int`):
            Target length.

    Return:
        query_layer (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
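
    Example (shape-level sketch; assumes `flash_attn` is installed and uses a toy batch of
    two sequences with lengths 3 and 2 padded to a key/value length of 4):

        >>> mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
        >>> q = torch.randn(2, 4, 8, 64)  # (batch_size, query_length, num_heads, head_dim)
        >>> k, v = torch.randn(2, 4, 8, 64), torch.randn(2, 4, 8, 64)
        >>> q_u, k_u, v_u, idx_q, (cu_q, cu_k), (max_q, max_k) = _upad_input(q, k, v, mask, query_length=4)
        >>> # q_u, k_u, v_u -> shape (5, 8, 64): the 5 valid tokens of both sequences, concatenated
        >>> # cu_q == cu_k  -> tensor([0, 3, 5], dtype=torch.int32); max_q == max_k == 3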
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k)
    value_layer = index_first_axis(
        value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
    )
    if query_length == kv_seq_len:
        query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k)
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=query_layer.device)
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -query_length: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )


def prepare_fa2_from_position_ids(query, key, value, position_ids):
    """
    This function returns necessary arguments to call `flash_attn_varlen_func`.
    All three query, key, value states will be flattened.
    Cumulative lengths of each example in the batch will be extracted from position_ids.

    NOTE: ideally cumulative lengths should be prepared at the data collator stage

    Arguments:
        query (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        position_ids (`torch.Tensor`):
            Index tensor of shape (batch_size, sequence_length) giving each token's position inside its own (packed) sequence; a position of 0 marks the start of a new example.

    Return:
        query (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
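
    Example (illustrative sketch for two examples of lengths 3 and 2 packed into a single
    row, which is the layout this helper is meant to handle):

        >>> position_ids = torch.tensor([[0, 1, 2, 0, 1]])
        >>> q = k = v = torch.randn(1, 5, 8, 64)  # (batch_size, seq_len, num_heads, head_dim)
        >>> q_f, k_f, v_f, idx_q, (cu, _), (max_len, _) = prepare_fa2_from_position_ids(q, k, v, position_ids)
        >>> # q_f, k_f, v_f -> shape (5, 8, 64); cu -> tensor([0, 3, 5], dtype=torch.int32); max_len -> 3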
    """
    query = query.view(-1, query.size(-2), query.size(-1))
    key = key.view(-1, key.size(-2), key.size(-1))
    value = value.view(-1, value.size(-2), value.size(-1))
    position_ids = position_ids.flatten()
    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

    cu_seq_lens = torch.cat(
        (
            indices_q[position_ids == 0],
            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
        )
    )

    max_length = position_ids.max() + 1

    return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


def _flash_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    position_ids: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: bool = None,
):
    """
    Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
    first unpad the input, then compute the attention scores and re-pad the final attention scores.

    Args:
        query_states (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key_states (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value_states (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        attention_mask (`torch.Tensor`):
            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
            position of padding tokens and 1 for the position of non-padding tokens.
        dropout (`float`):
            Attention dropout
        softmax_scale (`float`, *optional*):
            The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
        use_top_left_mask (`bool`, defaults to `False`):
            flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference.
        softcap (`float`, *optional*):
            Softcap for the attention logits, used e.g. in gemma2.
        deterministic (`bool`, *optional*):
            Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
    """
    if not use_top_left_mask:
        causal = is_causal
    else:
        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1.
        causal = is_causal and query_length != 1

    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
    use_sliding_windows = (
        _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window
    )
    flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}

    if is_flash_attn_greater_or_equal("2.4.1"):
        if deterministic is None:
            deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
        flash_kwargs["deterministic"] = deterministic

    if softcap is not None:
        flash_kwargs["softcap"] = softcap

    # Contains at least one padding token in the sequence
    if attention_mask is not None:
        batch_size = query_states.shape[0]
        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = _upad_input(
            query_states, key_states, value_states, attention_mask, query_length
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)

    # If position_ids is provided and the positions are not simply increasing, the batch is packed:
    # use the varlen kernel to prevent cross-example attention and allow a padding-free layout.
    elif position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():
        batch_size = query_states.size(0)
        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
            query_states, key_states, value_states, position_ids
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1))

    else:
        attn_output = flash_attn_func(
            query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs
        )

    return attn_output
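

# Illustrative usage sketch, kept as a comment so importing this module has no side effects.
# It shows how an attention layer would typically call `_flash_attention_forward`; the shapes,
# dtype and device below are toy assumptions, not values taken from this file.
#
#     batch_size, seq_len, num_heads, head_dim = 2, 16, 8, 64
#     q = torch.randn(batch_size, seq_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
#     k = torch.randn_like(q)
#     v = torch.randn_like(q)
#     attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long, device="cuda")
#     attention_mask[1, -4:] = 0  # the last 4 positions of the second example are padding
#
#     out = _flash_attention_forward(
#         q, k, v, attention_mask,
#         query_length=seq_len,
#         is_causal=True,
#         dropout=0.0,
#         use_top_left_mask=False,
#     )
#     # out -> (batch_size, seq_len, num_heads, head_dim); padded positions are zero-filled by `pad_input`.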