
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...utils import logging
from ..gemma.modeling_gemma import (
    GemmaForCausalLM,
    GemmaForSequenceClassification,
    GemmaForTokenClassification,
)
from ..granite.modeling_granite import (
    GraniteAttention,
    GraniteFlashAttention2,
    GraniteSdpaAttention,
)
from ..llama.modeling_llama import (
    LlamaDecoderLayer,
    LlamaModel,
    LlamaPreTrainedModel,
)
from ..phi3.modeling_phi3 import (
    Phi3MLP,
    Phi3RMSNorm,
    Phi3RotaryEmbedding,
)
from .configuration_glm import GlmConfig


logger = logging.get_logger(__name__)


class GlmRMSNorm(Phi3RMSNorm):
    pass


class GlmRotaryEmbedding(Phi3RotaryEmbedding):
    pass


class GlmMLP(Phi3MLP):
    pass


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., 0::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Interleave the rotary frequencies instead of the usual half-split layout
    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)

    # Keep the second half of the head dim untouched; only the first half is rotated
    q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :]
    k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :]

    # Apply rotary embeddings on the first half
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)

    # Concatenate the rotated and pass-through halves back together
    q_embed = torch.cat([q_embed, q_pass], dim=-1)
    k_embed = torch.cat([k_embed, k_pass], dim=-1)
    return q_embed, k_embed


class GlmAttention(GraniteAttention):
    def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        # GLM uses no bias on the output projection and a 1/sqrt(head_dim) scaling
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.scaling = 1 / math.sqrt(self.head_dim)


class GlmFlashAttention2(GlmAttention, GraniteFlashAttention2):
    pass


class GlmSdpaAttention(GraniteSdpaAttention):
    pass


GLM_ATTENTION_CLASSES = {
    "eager": GlmAttention,
    "flash_attention_2": GlmFlashAttention2,
    "sdpa": GlmSdpaAttention,
}


class GlmDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
        super().__init__()

        self.mlp = GlmMLP(config)
        self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)


class GlmPreTrainedModel(LlamaPreTrainedModel):
    pass


class GlmModel(GlmPreTrainedModel, LlamaModel):
    def __init__(self, config: GlmConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # GLM applies rotary embeddings to only half of each attention head
        self.rotary_emb = GlmRotaryEmbedding(
            dim=config.head_dim // 2,
            max_position_embeddings=config.max_position_embeddings,
            base=config.rope_theta,
        )
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()


class GlmForCausalLM(GemmaForCausalLM):
    def __init__(self, config: GlmConfig):
        super().__init__(config)
        self.model = GlmModel(config)
        self.post_init()


class GlmForSequenceClassification(GemmaForSequenceClassification):
    def __init__(self, config: GlmConfig):
        super().__init__(config)
        self.model = GlmModel(config)
        self.post_init()


class GlmForTokenClassification(GemmaForTokenClassification):
    def __init__(self, config: GlmConfig):
        super().__init__(config)
        self.model = GlmModel(config)
        self.post_init()


__all__ = [
    "GlmPreTrainedModel",
    "GlmModel",
    "GlmForCausalLM",
    "GlmForSequenceClassification",
    "GlmForTokenClassification",
]