
    Ig[                     ,   d dl mZ d dlmZmZmZ d dlZd dlZd dlm	Z	 d dl
mZmZ d dlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ  ej0                  e      Z G d de      Z G d de      Z G d de      Ze G d de             Z G d de      Zy)    )	dataclass)OptionalTupleUnionN)CrossEntropyLoss)InstructBlipQFormerConfigInstructBlipVisionConfig)$InstructBlipForConditionalGeneration/InstructBlipForConditionalGenerationModelOutput   )PretrainedConfig)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)logging   )CONFIG_MAPPINGc                       e Zd Zy)InstructBlipVideoVisionConfigN__name__
__module____qualname__     |/var/www/html/answerous/venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.pyr   r   )       r   r   c                       e Zd Zy)InstructBlipVideoQFormerConfigNr   r   r   r   r   r   -   r   r   r   c                   P     e Zd ZdZdZ	 	 	 	 	 d fd	Zededede	fd       Z
 xZS )	InstructBlipVideoConfiga
  
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig.from_text_vision_configs(vision_config, qformer_config, text_config)
    ```instructblipvideoc                 f   t        |   di | |i }t        j                  d       |i }t        j                  d       |i }t        j                  d       t	        di || _        t        di || _        d|v r|d   nd}t        |   di || _	        | j                  j                  | _
        | j                  j                  | _        || _        || _        | j
                  j                  | j                  _        | j                  j                   t"        v | _        d| _        d| _        y )	NzZvision_config is None. initializing the InstructBlipVideoVisionConfig with default values.z\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.zTtext_config is None. Initializing the text config with default values (`OPTConfig`).
model_typeoptg      ?g{Gz?r   )super__init__loggerinfor   vision_configr   qformer_configr   text_configtie_word_embeddingsis_encoder_decodernum_query_tokensvideo_token_indexhidden_sizeencoder_hidden_sizer"   r   use_decoder_only_language_modelinitializer_factorinitializer_range)	selfr(   r)   r*   r-   r.   kwargstext_model_type	__class__s	           r   r%   z InstructBlipVideoConfig.__init__k   s"    	"6" MKKtu!NKKvwKKKno:K]K<N~N7C{7R+l3X])/:I[I#'#3#3#G#G "&"2"2"E"E 0!2262D2D2P2P//3/?/?/J/JNo/o,"%!%r   r(   r)   r*   c                 n     | d|j                         |j                         |j                         d|S )a  
        Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from a InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        )r(   r)   r*   r   )to_dict)clsr(   r)   r*   r5   s        r    from_vision_qformer_text_configsz8InstructBlipVideoConfig.from_vision_qformer_text_configs   sD       
'//1)113#++-
 	
 	
r   )NNN    N)r   r   r   __doc__r"   r%   classmethodr   r   r   r;   __classcell__)r7   s   @r   r   r   1   sW    5n %J $&L 
4
 7
 &	
 
r   r   c                       e Zd Zy)4InstructBlipVideoForConditionalGenerationModelOutputNr   r   r   r   rA   rA      s    r   rA   c                   &   e Zd Z	 	 	 	 	 	 	 	 	 	 ddej                  dej                  deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   deej                     dee   dedee	e
f   fdZ ej                         	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dedej                  fd       Zy))InstructBlipVideoForConditionalGenerationNpixel_valuesqformer_input_idsqformer_attention_mask	input_idsattention_maskdecoder_input_idsdecoder_attention_maskoutput_attentionsoutput_hidden_stateslabelsreturn_dictinterpolate_pos_encodingreturnc           
      H	   ||n| j                   j                  }|j                  \  }}}}}|j                  ||z  |||      }| j	                  |||	||      }|d   }t        j                  |j                         dd t
        j                  |j                        }| j                  j                  |j                  d   dd      }t        j                  |j                         dd t
        j                  |j                        }|t        j                  |      }|j                  |d      }|j                  |d      }t        j                  ||gd      }| j                  |||||||	|      }|d   ddd|j                  d      ddf   }| j!                  |      }|j                  || j                   j"                  |z  d      }t        j                  |j                         dd t
        j                  |j                        } | j$                  j'                         |      }|t        j                  |      }t)        | j                   d	d      K|| j                   j*                  k(  j-                  d      j/                  |      }|j1                         ||<   nyt2        j5                  d
       t        j                  ||j7                  |j                        gd      }t        j                  ||j7                  |j                        gd      }| j                   j8                  r| j%                  ||||	|      }|r|j:                  n|d   }d}|
|
j7                  |j                        }
|dd|
j                  d       dddf   }|dddddf   j=                         } |
dddf   j=                         j7                  |j                        }!t?        d      }" |"| jA                  d| j                   jB                  jD                        |!jA                  d            }n?| j%                  ||||||	||
      }|r|jF                  n|d   }|r|j:                  n|d   }|s||||f}#||f|#z   S |#S tI        |||||      S )a0
  
        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```N)rD   rK   rL   rN   rO   r   dtypedevicedim   )rG   rH   query_embedsencoder_hidden_statesencoder_attention_maskrK   rL   rN   r.   K  Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.)inputs_embedsrH   rK   rL   rN   .mean)	reduction)r]   rH   rI   rJ   rK   rL   rN   rM   )losslogitsvision_outputsqformer_outputslanguage_model_outputs)%configuse_return_dictshapereshapevision_modeltorchonessizelongrU   query_tokensexpand	ones_likerepeat_interleavecatqformerlanguage_projectionr-   language_modelget_input_embeddingsgetattrr.   	unsqueeze	expand_asflattenr&   warning_oncetor1   ra   
contiguousr   viewr*   
vocab_sizer`   rA   )$r4   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   
batch_sizeframeschannelheightwidthrb   image_embedsimage_attention_maskrn   query_attention_maskquery_outputsquery_outputlanguage_model_inputslanguage_model_attention_maskr]   special_image_maskoutputsra   r`   shift_logitsshift_labelsloss_fctoutputs$                                       r   forwardz1InstructBlipVideoForConditionalGeneration.forward   s   N &1%<k$++B]B] 6B5G5G2
FGVU#++J,?&RWX**%/!5#%= + 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a'1%".#7/!5# % 	
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j$++JfJfioJoqs t(-

!&&("-UZZH]HdHd)
% C++@@B9M!"__Y7N 4;; 3T:F"+t{{/L/L"L!W!WXZ![!e!efs!t0E0M0M0OM,-z
 "II'<m>N>NOdOkOk>l&mstuM"YY.0A0AB_BfBf0ghnoN ;;66))+-"3%9' * G (3W^^
FD!6==1FKKN?#4a 78%c3B3k2==?%c12g99;>>v}}M ,f= 1 1"dkk6M6M6X6X Y[g[l[lmo[pq))+-"3'="3%9' * 	G $/7<<GAJD'2W^^
FnmWEF)-)9TGf$EvEC))#*
 	
r   c                 6
   t        | d      r| j                          |j                  \  }}	}
}}|j                  ||	z  |
||      }| j	                  |d|      j
                  }t        j                  |j                         dd t        j                  |j                        }| j                  j                  |j                  d   dd      }t        j                  |j                         dd t        j                  |j                        }|t        j                  |      }|j                  |	d      }|j                  |	d      }t        j                  ||gd	      }| j!                  |||||d
      }|j
                  ddd|j                  d	      ddf   }| j#                  |      }|j                  || j$                  j&                  |	z  d      }t        j                  |j                         dd t        j                  |j                        }|^t        j(                  | j$                  j*                  j,                  gg      j/                  |d	      j1                  |j                        }|t        j                  |      } | j3                         |      }t5        | j$                  dd      K|| j$                  j6                  k(  j9                  d      j;                  |      }|j=                         ||<   nt>        jA                  d       t        j                  ||j1                  |j                        gd	      }t        j                  ||j1                  |j                        gd	      }| jB                  j$                  jD                  sM|jG                  dd      |j                  d	   z   d	z
  |d<   |jG                  dd      |j                  d	   z   |d<    | jB                  jH                  d||d|}| jB                  j$                  jD                  s| j$                  j*                  jJ                  d   dk(  rdn| j$                  j*                  j,                  }t        j(                  |gg      j/                  |d	      j1                  |j                        }tM        |t        jN                        s*t        j                  ||jP                  gd      |_(        |S t        j                  ||gd      }|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        hf_device_mapT)rN   rO   NrR   rS   r   rV   rX   )rG   rH   rY   rZ   r[   rN   r.   r\   
max_length   
min_length)r]   rH   LLaMAForCausalLMr   r   ))hasattr_preprocess_acceleraterg   rh   ri   last_hidden_staterj   rk   rl   rm   rU   rn   ro   rp   rq   rr   rs   rt   re   r-   
LongTensorr*   bos_token_idrepeatr|   rv   rw   r.   rx   ry   rz   r&   r{   ru   r,   getgeneratearchitectures
isinstanceTensor	sequences)r4   rD   rE   rF   rG   rH   rO   generate_kwargsr   r   r   r   r   r   r   rn   r   r   r   r   language_attention_maskr]   r   r   r   
bos_tokenss                             r   r   z2InstructBlipVideoForConditionalGeneration.generatel  s   > 4)'') 6B5G5G2
FGVU#++J,?&RWX((%= ) 
 
	 	
  %zz,*;*;*=cr*B%**]i]p]pq((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a'1%".#7 % 
 %66q:PL<M<Ma<P:PRS7ST $ 8 8 F !6 = =j$++JfJfioJoqs t"'**!&&("-UZZH]HdHd#
   4;;#:#:#G#G"H!IJ
A&L''( 
 !"__Y7N3113I> 4;; 3T:F"+t{{/L/L"L!W!WXZ![!e!efs!t0E0M0M0OM,-z
 "II'<m>N>NOdOkOk>l&mstuM"YY(.*;*;<S<Z<Z*[\bcN &&--@@#''b9<Q<W<WXY<ZZ]^^  - 1@0C0CLRS0TWlWrWrstWu0u-.$%%.. 
')
 
 ""))<< ;;**88;?QQ [[,,99 
 ))L>*:;BB:qQTTUaUhUhiJgu||4$)IIz7;L;L.MSU$V!   ))Z$9rBr   )
NNNNNNNNNF)NNNNF)r   r   r   rj   FloatTensorr   r   boolr   r   rA   r   no_gradr   r   r   r   rC   rC      s   
 >B15598<=A,0/3-1&*).{
''{
 !,,{
 !))9)9 :	{

 E--.{
 !!1!12{
 $E$4$45{
 !))9)9 :{
 $D>{
 'tn{
 ))*{
 d^{
 #'{
 
uJJ	K{
z U]]_ 9==A0459).A''A $E$4$45A !))9)9 :	A
 E,,-A !!1!12A #'A 
		A Ar   rC   ) dataclassesr   typingr   r   r   rj   torch.utils.checkpointtorch.nnr   ;transformers.models.instructblip.configuration_instructblipr   r	   6transformers.models.instructblip.modeling_instructblipr
   r   configuration_utilsr   models.auto.modeling_autor   utilsr   autor   
get_loggerr   r&   r   r   r   rA   rC   r   r   r   <module>r      s     " ) )   %
 4 J  ! 
		H	%	$< 		%> 	u
. u
p 	;j 	 	@0T @r   