
    ##hd                     L	   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlZd dlZd dlmZ dd	lmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# dd
l$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*  e       rd dl+Z,d dl-Z, ej\                   ej\                  e,j^                        j`                         ej\                  d      k\  re,jb                  jd                  Z3ne,jb                  Z3 e       rd dl4mZ5 d dl6m7Z7 e3jp                  e7jp                  e3jr                  e7jr                  e3jt                  e7jt                  e3jv                  e7jv                  e3jx                  e7jx                  e3jz                  e7jz                  iZ>er e       rd dl?Z? e!j                  eA      ZBedej                  deDd   eDej                     eDd   f   ZEeeDd   ddeDd   eDd   eDeDd      eDeDd      eDeDd      f   ZF G d de      ZG G d de      ZH G d de      ZIe G d d             ZJeKeLeeMeLeDeK   f   f   ZNd ZO G d de      ZPd ZQd ZRdeDfdZSd  ZTd! ZUd"ej                  d#eVfd$ZWdhd%eMd#eDeE   fd&ZXdeeDeE   eEf   d#eEfd'ZYdeeDeE   eEf   d#eEfd(ZZd#eFfd)Z[d#ej                  fd*Z\	 did"ej                  d+eeeMe]eMd,f   f      d#eGfd-Z^	 did"ej                  d.eeeGeLf      d#eMfd/Z_did"ej                  d0eGd#e]eMeMf   fd1Z`d2e]eMeMf   d3eMd4eMd#e]eMeMf   fd5Zad6eKeLeeDe]f   f   d#eVfd7Zbd6eKeLeeDe]f   f   d#eVfd8Zcd9eeKeLeeDe]f   f      d#eVfd:Zdd9eeKeLeeDe]f   f      d#eVfd;Zedid"eeLdf   d<eef   d#dfd=Zgdjd>eJfd?Zhd@eLdAefdBZi	 did@eLdAee   fdCZjd@eLdAefdDZkd@eLdAefdEZlejeiekeldFZm	 	 	 	 dkdGeeLdHf   dIeeM   dJeeM   dKeLdAee   d#ej                  fdLZo	 dideeDe]eLdf   d<eef   d#edeDd   eDeDd      f   fdMZp	 	 	 	 	 	 	 	 	 	 	 	 dldNeeV   dOeef   dPeeV   dQeeefeDef   f      dReeefeDef   f      dSeeV   dTeeM   dUeeV   dVeeKeLeMf      dWeeV   dXeeKeLeMf      dYedZ   fd[Zq G d\ d]      Zrd^eHd_e]eHd,f   d9eDeK   d#dfd`ZsdaeDeL   dbeDeL   fdcZt edde       G df dg             Zuy)m    N)Iterable)redirect_stdout)	dataclass)BytesIO)TYPE_CHECKINGCallableOptionalUnion)version   )ExplicitEnumis_av_availableis_cv2_availableis_decord_availableis_jax_tensoris_numpy_arrayis_tf_tensoris_torch_availableis_torch_tensoris_torchvision_availableis_vision_availableis_yt_dlp_availableloggingrequires_backendsto_numpy)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDIMAGENET_STANDARD_MEANIMAGENET_STANDARD_STDOPENAI_CLIP_MEANOPENAI_CLIP_STDz9.1.0)io)InterpolationModezPIL.Image.Imageztorch.Tensorz
np.ndarrayc                       e Zd ZdZdZy)ChannelDimensionchannels_firstchannels_lastN)__name__
__module____qualname__FIRSTLAST     U/var/www/html/sandstorm/venv/lib/python3.12/site-packages/transformers/image_utils.pyr%   r%   f   s    EDr.   r%   c                       e Zd ZdZdZy)AnnotationFormatcoco_detectioncoco_panopticN)r(   r)   r*   COCO_DETECTIONCOCO_PANOPTICr-   r.   r/   r1   r1   k   s    %N#Mr.   r1   c                   d    e Zd Zej                  j
                  Zej                  j
                  Zy)AnnotionFormatN)r(   r)   r*   r1   r4   valuer5   r-   r.   r/   r7   r7   p   s$    %44::N$2288Mr.   r7   c                   6    e Zd ZU eed<   eed<   eed<   eed<   y)VideoMetadatatotal_num_framesfpsdurationvideo_backendN)r(   r)   r*   int__annotations__floatstrr-   r.   r/   r:   r:   u   s    	JOr.   r:   c                 b    t               xr$ t        | t        j                  j                        S N)r   
isinstancePILImageimgs    r/   is_pil_imagerJ      s     EZSYY__%EEr.   c                        e Zd ZdZdZdZdZdZy)	ImageTypepillowtorchnumpy
tensorflowjaxN)r(   r)   r*   rF   TORCHNUMPY
TENSORFLOWJAXr-   r.   r/   rL   rL      s    
CEEJ
Cr.   rL   c                 >   t        |       rt        j                  S t        |       rt        j                  S t        |       rt        j                  S t        |       rt        j                  S t        |       rt        j                  S t        dt        |              )NzUnrecognised image type )rJ   rL   rF   r   rR   r   rS   r   rT   r   rU   
ValueErrortypeimages    r/   get_image_typer[      su    E}}ueE###U}}
/U}=
>>r.   c                     t        |       xs2 t        |       xs% t        |       xs t        |       xs t	        |       S rD   )rJ   r   r   r   r   rH   s    r/   is_valid_imager]      s8    vs 3vs7Kv|\_O`vdqrudvvr.   imagesc                 .    | xr t        d | D              S )Nc              3   2   K   | ]  }t        |        y wrD   )r]   ).0rZ   s     r/   	<genexpr>z*is_valid_list_of_images.<locals>.<genexpr>   s     DE./D   all)r^   s    r/   is_valid_list_of_imagesrf      s    DcDVDDDr.   c                 r    t        | t        t        f      r| D ]  }t        |      r y yt	        |       syy)NFT)rE   listtuplevalid_imagesr]   )imgsrI   s     r/   rj   rj      s?    $u& 	C$	  D!r.   c                 L    t        | t        t        f      rt        | d         S y)Nr   F)rE   rh   ri   r]   rH   s    r/   
is_batchedrm      s"    #e}%c!f%%r.   rZ   returnc                     | j                   t        j                  k(  ryt        j                  |       dk\  xr t        j                  |       dk  S )zV
    Checks to see whether the pixel values have already been rescaled to [0, 1].
    Fr   r   )dtypenpuint8minmaxrY   s    r/   is_scaled_imageru      s>     {{bhh 66%=A4"&&-1"44r.   expected_ndimsc           	      (   t        |       r| S t        |       r| gS t        |       rU| j                  |dz   k(  rt	        |       } | S | j                  |k(  r| g} | S t        d|dz    d| d| j                   d      t        dt        |        d      )a  
    Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1.
    If the input is a batch of images, it is converted to a list of images.

    Args:
        images (`ImageInput`):
            Image of images to turn into a list of images.
        expected_ndims (`int`, *optional*, defaults to 3):
            Expected number of dimensions for a single input image. If the input image has a different number of
            dimensions, an error is raised.
    r   z%Invalid image shape. Expected either z or z dimensions, but got z dimensions.ztInvalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray, but got .)rm   rJ   r]   ndimrh   rW   rX   )r^   rv   s     r/   make_list_of_imagesrz      s     & Fxf;;.1,,&\F  [[N*XF 	 78J7K4P^O_ `KK=. 
 	  $V~Q	0 r.   c                 <   t        | t        t        f      r=t        d | D              r+t        d | D              r| D cg c]  }|D ]  }|  c}}S t        | t        t        f      rXt	        |       rMt        | d         s| d   j                  dk(  r| S | d   j                  dk(  r| D cg c]  }|D ]  }|  c}}S t        |       r7t        |       s| j                  dk(  r| gS | j                  dk(  rt        |       S t        d|        c c}}w c c}}w )a|  
    Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
    If the input is a nested list of images, it is converted to a flat list of images.
    Args:
        images (`Union[List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        list: A list of images or a 4d array of images.
    c              3   H   K   | ]  }t        |t        t        f        y wrD   rE   rh   ri   ra   images_is     r/   rb   z+make_flat_list_of_images.<locals>.<genexpr>        K
8dE]3K    "c              3   2   K   | ]  }t        |        y wrD   rf   r~   s     r/   rb   z+make_flat_list_of_images.<locals>.<genexpr>        Ih'1Irc   r         z*Could not make a flat list of images from 	rE   rh   ri   re   rf   rJ   ry   r]   rW   )r^   img_listrI   s      r/   make_flat_list_of_imagesr      s    	6D%=)KFKKI&II$*?h?s???&4-(-DV-Lq	"fQinn&9M!9>>Q(.CH(C3CCCCCf6;;!#38O;;!<
A&J
KK @ Ds    D.Dc                     t        | t        t        f      r&t        d | D              rt        d | D              r| S t        | t        t        f      rYt	        |       rNt        | d         s| d   j                  dk(  r| gS | d   j                  dk(  r| D cg c]  }t        |       c}S t        |       r9t        |       s| j                  dk(  r| ggS | j                  dk(  rt        |       gS t        d      c c}w )z
    Ensure that the output is a nested list of images.
    Args:
        images (`Union[List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        list: A list of list of images or a list of 4d array of images.
    c              3   H   K   | ]  }t        |t        t        f        y wrD   r}   r~   s     r/   rb   z-make_nested_list_of_images.<locals>.<genexpr>  r   r   c              3   2   K   | ]  }t        |        y wrD   r   r~   s     r/   rb   z-make_nested_list_of_images.<locals>.<genexpr>  r   rc   r   r   r   z]Invalid input type. Must be a single image, a list of images, or a list of batches of images.r   )r^   rZ   s     r/   make_nested_list_of_imagesr   	  s     	6D%=)KFKKI&II &4-(-DV-Lq	"fQinn&98O!9>>Q-34EDK44 f6;;!#3H:;;!L>!
t
uu 5s   C;c                    t        | t        t        f      r|t        | d   t        t        f      rct        | d   d         rRt	        | d   d         s?| d   d   j
                  dk(  r*| D cg c]  }|D cg c]  }|D ]  }|  c}} } }}}| S t        | t        t        f      r\t        | d         rNt	        | d         s| d   j
                  dk(  r| gS | d   j
                  dk(  r]| D cg c]  }t        |       c}S t        |       r9t	        |       s| j
                  dk(  r| ggS | j
                  dk(  rt        |       gS t        d|        c c}}w c c}}}w c c}w )z
    Ensure that the input is a list of videos.
    Args:
        videos (`VideoInput`):
            Video or videos to turn into a list of videos.
    Returns:
        list: A list of videos.
    r   r   r   z"Could not make batched video from )rE   rh   ri   r]   rJ   ry   rW   )videosbatched_videos
batch_listvideos       r/   make_batched_videosr   -  sN    &4-(Zq	D%=-QVdeklmenopeqVrF1IaL)fQil.?.?1.Dmstt[i~V:V%uVuVtFt	FT5M	*~fQi/Hq	"fQinn&98OAY^^q -34EDK44		6;;!#3H:[[AL>!
9&B
CC! Wt 5s   -
E7EE0E$Ec                     t        |       st        dt        |              t               r9t	        | t
        j                  j                        rt        j                  |       S t        |       S )NzInvalid image type: )
r]   rW   rX   r   rE   rF   rG   rq   arrayr   rH   s    r/   to_numpy_arrayr   L  sP    #/S	{;<<C!Axx}C=r.   num_channels.c                     ||nd}t        |t              r|fn|}| j                  dk(  rd\  }}n-| j                  dk(  rd\  }}nt        d| j                         | j                  |   |v rD| j                  |   |v r3t
        j                  d| j                   d       t        j                  S | j                  |   |v rt        j                  S | j                  |   |v rt        j                  S t        d      )	a[  
    Infers the channel dimension format of `image`.

    Args:
        image (`np.ndarray`):
            The image to infer the channel dimension of.
        num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
            The number of channels of the image.

    Returns:
        The channel dimension of the image.
    r   r   r   )r      r   z(Unsupported number of image dimensions: z4The channel dimension is ambiguous. Got image shape z,. Assuming channels are the first dimension.z(Unable to infer channel dimension format)
rE   r?   ry   rW   shapeloggerwarningr%   r+   r,   )rZ   r   	first_dimlast_dims       r/   infer_channel_dimension_formatr   U  s     $0#;<L&0s&CL?LzzQ"	8	q"	8CEJJ<PQQ{{9-%++h2G<2WB5;;-O{|	
  %%%	Y	<	/%%%	X	,	.$$$
?
@@r.   input_data_formatc                     |t        |       }|t        j                  k(  r| j                  dz
  S |t        j                  k(  r| j                  dz
  S t        d|       )a  
    Returns the channel dimension axis of the image.

    Args:
        image (`np.ndarray`):
            The image to get the channel dimension axis of.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the image. If `None`, will infer the channel dimension from the image.

    Returns:
        The channel dimension axis of the image.
    r   r   Unsupported data format: )r   r%   r+   ry   r,   rW   )rZ   r   s     r/   get_channel_dimension_axisr   z  sd      :5A,222zzA~	.33	3zzA~
01B0CD
EEr.   channel_dimc                     |t        |       }|t        j                  k(  r| j                  d   | j                  d   fS |t        j                  k(  r| j                  d   | j                  d   fS t        d|       )a  
    Returns the (height, width) dimensions of the image.

    Args:
        image (`np.ndarray`):
            The image to get the dimensions of.
        channel_dim (`ChannelDimension`, *optional*):
            Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.

    Returns:
        A tuple of the image's height and width.
    r   )r   r%   r+   r   r,   rW   )rZ   r   s     r/   get_image_sizer     s{     4U;&,,,{{2B//	(--	-{{2B//4[MBCCr.   
image_size
max_height	max_widthc                 x    | \  }}||z  }||z  }t        ||      }t        ||z        }t        ||z        }	||	fS )a  
    Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
    Important, even if image_height < max_height and image_width < max_width, the image will be resized
    to at least one of the edges be equal to max_height or max_width.

    For example:
        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)

    Args:
        image_size (`Tuple[int, int]`):
            The image to resize.
        max_height (`int`):
            The maximum allowed height.
        max_width (`int`):
            The maximum allowed width.
    )rs   r?   )
r   r   r   heightwidthheight_scalewidth_scale	min_scale
new_height	new_widths
             r/   #get_image_size_for_max_height_widthr     sV    , MFE&Le#KL+.IVi'(JEI%&Iy  r.   
annotationc                     t        | t              rId| v rEd| v rAt        | d   t        t        f      r(t	        | d         dk(  st        | d   d   t              ryy)Nimage_idannotationsr   TFrE   dictrh   ri   lenr   s    r/   "is_valid_annotation_coco_detectionr     s`    :t$*$Z'z-04-@ 
=)*a/:j>WXY>Z\`3a r.   c                     t        | t              rMd| v rId| v rEd| v rAt        | d   t        t        f      r(t	        | d         dk(  st        | d   d   t              ryy)Nr   segments_info	file_namer   TFr   r   s    r/   !is_valid_annotation_coco_panopticr     sh    :t$*$z):%z/2T5MB 
?+,1Z
?@[\]@^`d5e r.   r   c                 &    t        d | D              S )Nc              3   2   K   | ]  }t        |        y wrD   )r   ra   anns     r/   rb   z3valid_coco_detection_annotations.<locals>.<genexpr>  s     N31#6Nrc   rd   r   s    r/    valid_coco_detection_annotationsr     s    N+NNNr.   c                 &    t        d | D              S )Nc              3   2   K   | ]  }t        |        y wrD   )r   r   s     r/   rb   z2valid_coco_panoptic_annotations.<locals>.<genexpr>  s     M#05Mrc   rd   r   s    r/   valid_coco_panoptic_annotationsr     s    MMMMr.   timeoutc                    t        t        dg       t        | t              r| j	                  d      s| j	                  d      rHt
        j                  j                  t        t        j                  | |      j                              } nt        j                  j                  |       r t
        j                  j                  |       } n| j	                  d      r| j                  d      d   } 	 t!        j"                  | j%                               }t
        j                  j                  t        |            } n2t        | t
        j                  j                        r| } nt+        d      t
        j,                  j/                  |       } | j1                  d      } | S # t&        $ r}t)        d|  d	|       d
}~ww xY w)a3  
    Loads `image` to a PIL Image.

    Args:
        image (`str` or `PIL.Image.Image`):
            The image to convert to the PIL Image format.
        timeout (`float`, *optional*):
            The timeout value in seconds for the URL request.

    Returns:
        `PIL.Image.Image`: A PIL Image.
    visionhttp://https://r   zdata:image/,r   zIncorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got z. Failed with NzuIncorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image.RGB)r   
load_imagerE   rB   
startswithrF   rG   openr   requestsgetcontentospathisfilesplitbase64decodebytesencode	ExceptionrW   	TypeErrorImageOpsexif_transposeconvert)rZ   r   b64es       r/   r   r     sv    j8*-%I&%*:*::*F IINN78<<w+O+W+W#XYEWW^^E"IINN5)E.C(+((8		ws|4
 
E399??	+ D
 	
 LL''.EMM% EL    i  jo  ip  p~  @  ~A  B s   2AF" "	G+F<<Gmetadatac           	         | j                   }| j                  }|-|+t        ||z  |z        }||kD  rt        d| d| d| d      |"t	        j
                  d|||z  t              }|S t	        j
                  d|t              }|S )a`  
    A default sampling function that replicates the logic used in get_uniform_frame_indices,
    while optionally handling `fps` if `num_frames` is not provided.

    Args:
        metadata (`VideoMetadata`):
            `VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly.
        fps (`int`, *optional*):
            Desired frames per second. Takes priority over num_frames if both are provided.

    Returns:
        `np.ndarray`: Array of frame indices to sample.
    z When loading the video with fps=z, we computed num_frames=z  which exceeds total_num_frames=z. Check fps or video metadata.r   )rp   )r;   r<   r?   rW   rq   arange)r   
num_framesr<   kwargsr;   	video_fpsindicess          r/   default_sample_indices_fnr     s       00I co)I5;<
((23%7PQ[P\ ]22B1CCac 
 ))A/1AJ1NVYZ N ))A/s;Nr.   
video_pathsample_indices_fnc                    t        t        dg       ddl}|j                  |       }t	        |j                  |j                              }|j                  |j                        }|r||z  nd}t        t	        |      t        |      t        |      d      } |dd|i|}	d}
g }|j                         r|j                         \  }}|snk|
|	v rI|j                  \  }}}|j                  ||j                        }|j                  |d|d|d|f          |r|
dz  }
|
|k\  rn|j                         r|j!                          |	|_        t%        j&                  |      |fS )	av  
    Decode a video using the OpenCV backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    cv2r   Nopencvr;   r<   r=   r>   r   r   r-   )r   read_video_opencvr   VideoCapturer?   r   CAP_PROP_FRAME_COUNTCAP_PROP_FPSr:   rA   isOpenedreadr   cvtColorCOLOR_BGR2RGBappendreleaseframes_indicesrq   stack)r   r   r   r   r   r;   r   r=   r   r   indexframessuccessframer   r   channels                    r/   r   r   ?  s_   2 '%1Z(E599S%=%=>?		#**+I/8)+aH-.E)4DuU]nvH  <<V<GEF
..
G%*[["FE7LL(9(9:EMM%&!E'1W9 <=>QJE$$ ..
 
MMO%H88FX%%r.   c                 X   t        t        dg       ddlm}m}  ||  |d            }|j                         }t        |      }|r||z  nd}t        t        |      t        |      t        |      d      }	 |dd|	i|}
|j                  |
      j                         }|
|	_        ||	fS )a  
    Decode a video using the Decord backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    decordr   )VideoReadercpu)urictxr   r   r-   )r   read_video_decordr	  r
  r  get_avg_fpsr   r:   r?   rA   	get_batchasnumpyr  )r   r   r   r
  r  vrr   r;   r=   r   r   r  s               r/   r  r  x  s    2 '(4'	Q	0B I2w/8)+aH-.E)4DuU]nvH  <<V<G\\'"**,F%H8r.   c                    t        t        dg       ddl}|j                  |       }|j                  j
                  d   j                  }|j                  j
                  d   j                  }|r||z  nd}t        t        |      t        |      t        |      d      } |dd|i|}	g }
|j                  d       |	d   }t        |j                  d            D ](  \  }}||kD  r n|dk\  s||	v s|
j                  |       * t        j                   |
D cg c]  }|j#                  d	
       c}      }|	|_        ||fS c c}w )a}  
    Decode the video with PyAV decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    avr   Npyavr   r   r   )r   rgb24)formatr-   )r   read_video_pyavr  r   streamsr   r  average_rater:   r?   rA   seek	enumeratedecoder   rq   r  
to_ndarrayr  )r   r   r   r  	containerr;   r   r=   r   r   r  	end_indexir  xr   s                   r/   r  r    sB   2 ov.
#I ((..q188!!''*77I/8)+aH-.E)4DuU]ntH  <<V<GFNN1Ii..Q.78 !5y=6a7lMM% 	! HHFCqall'l2CDE%H(? Ds   Ec                 <   t        j                  | dddd      \  }}}|d   }|j                  d      }|r||z  nd}t        t	        |      t        |      t        |      d	      }	 |dd
|	i|}
||
   j                         j                         }|
|	_        ||	fS )a  
    Decode the video with torchvision decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    g        NsecTHWC)	start_ptsend_ptspts_unitoutput_formatr   r   torchvisionr   r   r-   )	torchvision_io
read_videosizer:   r?   rA   
contiguousrO   r  )r   r   r   r   _infor   r;   r=   r   r   s              r/   read_video_torchvisionr1    s    0 $..NE1d [!Izz!}/8)+aH-.)x#	H  <<V<G'N%%'--/E%H(?r.   )r	  r   r  r*  r   
VideoInputr   r<   backendc                 Z   |t        d      |fd}|}| j                  d      s| j                  d      rt               st        d      t	        t
        dg       dd	lm} t               }t        |      5   |       5 }	|	j                  | g       ddd       ddd       |j                         }
t        |
      }n| j                  d
      s| j                  d      r)t        t        j                  |       j                        }n_t        j                   j#                  |       r| }n=t%        |       s$t'        | t(        t*        f      rt%        | d         rd}nt-        d      | j                  d
      xs | j                  d      }|r|dv rt        d      || S t/               s|dk(  s-t1               s|dk(  st3               s|dk(  st5               s|dk(  rt        d| d| d      t6        |   } |||fi |\  } }| |fS # 1 sw Y   yxY w# 1 sw Y   ~xY w)a  
    Loads `video` to a numpy array.

    Args:
        video (`str` or `VideoInput`):
            The video to convert to the numpy array format. Can be a link to video or local path.
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly. If not passed, the whole video is loaded.
        fps (`int`, *optional*):
            Number of frames to sample per second. Should be passed only when `num_frames=None`.
            If not specified and `num_frames==None`, all frames are sampled.
        backend (`str`, *optional*, defaults to `"opencv"`):
            The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniformt sampling with fps is performed, otherwise `sample_indices_fn` has priority over other args.
            The function expects at input the all args along with all kwargs passed to `load_video` and should output valid
            indices at which the video should be sampled. For example:

            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, Dict]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - Metadata dictionary.
    Nzc`num_frames`, `fps`, and `sample_indices_fn` are mutually exclusive arguments, please use only one!c                 "    t        | fd|S )N)r   r<   )r   )r   	fn_kwargsr<   r   s     r/   sample_indices_fn_funcz*load_video.<locals>.sample_indices_fn_func=  s    ,Xc*RUcYbccr.   zhttps://www.youtube.comzhttp://www.youtube.comzETo load a video from YouTube url you have  to install `yt_dlp` first.yt_dlpr   )	YoutubeDLr   r   zVIncorrect format used for video. Should be an url linking to an video or a local path.)r   r*  zlIf you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backendr	  r  r   r*  zYou chose backend=zf for loading the video but the required library is not found in your environment Make sure to install z before loading the video.)rW   r   r   ImportErrorr   
load_videor8  r9  r   r   downloadgetvaluer   r   r   r   r   r   r]   rE   rh   ri   r   r   r   r   r   VIDEO_DECODERS)r   r   r<   r3  r   r   r7  r9  bufferf	bytes_objfile_objvideo_is_urlvideo_decoderr   s    ``            r/   r;  r;    s.   N :16G6Oq
 	

  	d 312e6F6FG_6`"$eff*xj1$V$ 	 ik 	 QJJw	  	 OO%	9%			)	$(8(8(D8<<.667				:edE]#CW\]^W_H`pqq ##I.N%2B2B:2NL#<<z
 	
  !"w(':!g&7 "w(':(*w-/G 	 *$$+9,FH
 	

 #7+M#H.?J6JOE8(?K	  	  	  	 s$   H 	HH H	H  H*c                 <   t        | t        t        f      rjt        |       rDt        | d   t        t        f      r+| D cg c]  }|D cg c]  }t	        ||       c} c}}S | D cg c]  }t	        ||       c}S t	        | |      S c c}w c c}}w c c}w )a  Loads images, handling different levels of nesting.

    Args:
      images: A single image, a list of images, or a list of lists of images to load.
      timeout: Timeout for loading images.

    Returns:
      A single image, a list of images, a list of lists of images.
    r   r   )rE   rh   ri   r   r   )r^   r   image_grouprZ   s       r/   load_imagesrG  r  s     &4-(v;:fQi$?eklVa[QEZw7QllDJK5Jug6KK&'22	 RlKs    	B	BB*BB
do_rescalerescale_factordo_normalize
image_mean	image_stddo_padsize_divisibilitydo_center_crop	crop_size	do_resizer-  resamplePILImageResamplingc                     | r|t        d      |r|t        d      |r||t        d      |r|t        d      |	r|
|t        d      yy)a  
    Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
    Raises `ValueError` if arguments incompatibility is caught.
    Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
    sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
    existing arguments when possible.

    Nz=`rescale_factor` must be specified if `do_rescale` is `True`.zzDepending on the model, `size_divisibility`, `size_divisor`, `pad_size` or `size` must be specified if `do_pad` is `True`.zP`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.z<`crop_size` must be specified if `do_center_crop` is `True`.zA`size` and `resample` must be specified if `do_resize` is `True`.)rW   )rH  rI  rJ  rK  rL  rM  rN  rO  rP  rQ  r-  rR  s               r/   validate_preprocess_argumentsrU    s    , n,XYY#+ I
 	
 +y/@kll)+WXXdlh&6\]] '7yr.   c                       e Zd ZdZd ZddZd Zdej                  de	e
ef   dej                  fd	Zdd
Zd ZddZddZd Zd ZddZy)ImageFeatureExtractionMixinzD
    Mixin that contain utilities for preparing image features.
    c                     t        |t        j                  j                  t        j                  f      s$t        |      st        dt        |       d      y y )Nz	Got type zS which is not supported, only `PIL.Image.Image`, `np.array` and `torch.Tensor` are.)rE   rF   rG   rq   ndarrayr   rW   rX   selfrZ   s     r/   _ensure_format_supportedz4ImageFeatureExtractionMixin._ensure_format_supported  sQ    %#))//2::!>?X]H^DK= )& &  I_?r.   Nc                    | j                  |       t        |      r|j                         }t        |t        j
                        r|'t        |j                  d   t        j                        }|j                  dk(  r$|j                  d   dv r|j                  ddd      }|r|dz  }|j                  t        j                        }t        j                  j                  |      S |S )a"  
        Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
        needed.

        Args:
            image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
                The image to convert to the PIL Image format.
            rescale (`bool`, *optional*):
                Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
                default to `True` if the image type is a floating type, `False` otherwise.
        r   r   r   r   r      )r\  r   rO   rE   rq   rY  flatfloatingry   r   	transposeastyperr   rF   rG   	fromarray)r[  rZ   rescales      r/   to_pil_imagez(ImageFeatureExtractionMixin.to_pil_image  s     	%%e,5!KKMEeRZZ($UZZ]BKK@zzQ5;;q>V#;1a0LL*E99&&u--r.   c                     | j                  |       t        |t        j                  j                        s|S |j	                  d      S )z
        Converts `PIL.Image.Image` to RGB format.

        Args:
            image (`PIL.Image.Image`):
                The image to convert.
        r   )r\  rE   rF   rG   r   rZ  s     r/   convert_rgbz'ImageFeatureExtractionMixin.convert_rgb  s8     	%%e,%1L}}U##r.   rZ   scalern   c                 .    | j                  |       ||z  S )z7
        Rescale a numpy image by scale amount
        )r\  )r[  rZ   rh  s      r/   rd  z#ImageFeatureExtractionMixin.rescale  s     	%%e,u}r.   c                    | j                  |       t        |t        j                  j                        rt	        j
                  |      }t        |      r|j                         }|'t        |j                  d   t        j                        n|}|r/| j                  |j                  t        j                        d      }|r"|j                  dk(  r|j                  ddd      }|S )a  
        Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
        dimension.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to convert to a NumPy array.
            rescale (`bool`, *optional*):
                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
                default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
            channel_first (`bool`, *optional*, defaults to `True`):
                Whether or not to permute the dimensions of the image to put the channel dimension first.
        r   p?r   r   r   )r\  rE   rF   rG   rq   r   r   rO   r_  integerrd  rb  float32ry   ra  )r[  rZ   rd  channel_firsts       r/   r   z*ImageFeatureExtractionMixin.to_numpy_array  s     	%%e,eSYY__-HHUOE5!KKME;B?*UZZ]BJJ7PWLLbjj!99EEUZZ1_OOAq!,Er.   c                     | j                  |       t        |t        j                  j                        r|S t	        |      r|j                  d      }|S t        j                  |d      }|S )z
        Expands 2-dimensional `image` to 3 dimensions.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to expand.
        r   )axis)r\  rE   rF   rG   r   	unsqueezerq   expand_dimsrZ  s     r/   rr  z'ImageFeatureExtractionMixin.expand_dims  s_     	%%e, eSYY__-L5!OOA&E  NN5q1Er.   c                    | j                  |       t        |t        j                  j                        r| j	                  |d      }nw|rut        |t
        j                        r0| j                  |j                  t
        j                        d      }n+t        |      r | j                  |j                         d      }t        |t
        j                        rt        |t
        j                        s.t        j                  |      j                  |j                        }t        |t
        j                        st        j                  |      j                  |j                        }nt        |      rddl}t        ||j                        s?t        |t
        j                        r |j                   |      }n |j"                  |      }t        ||j                        s?t        |t
        j                        r |j                   |      }n |j"                  |      }|j$                  dk(  r)|j&                  d   dv r||ddddf   z
  |ddddf   z  S ||z
  |z  S )a  
        Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
        if it's a PIL Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to normalize.
            mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
                The mean (per channel) to use for normalization.
            std (`List[float]` or `np.ndarray` or `torch.Tensor`):
                The standard deviation (per channel) to use for normalization.
            rescale (`bool`, *optional*, defaults to `False`):
                Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
                happen automatically.
        T)rd  rk  r   Nr   r   )r\  rE   rF   rG   r   rq   rY  rd  rb  rm  r   rA   r   rp   rN   Tensor
from_numpytensorry   r   )r[  rZ   meanstdrd  rN   s         r/   	normalizez%ImageFeatureExtractionMixin.normalize$  s     	%%e,eSYY__-''t'<E %,U\\"**%=yI 'U[[]I>eRZZ(dBJJ/xx~,,U[[9c2::.hhsm**5;;7U#dELL1dBJJ/+5++D1D'5<<-Dc5<<0c2::.*%**3/C&%,,s+C::?u{{1~7DD$//3q$}3EEEDLC''r.   c                    ||nt         j                  }| j                  |       t        |t        j
                  j
                        s| j                  |      }t        |t              rt        |      }t        |t              st        |      dk(  r|rt        |t              r||fn	|d   |d   f}n|j                  \  }}||k  r||fn||f\  }}	t        |t              r|n|d   }
||
k(  r|S |
t        |
|	z  |z        }}|.||
k  rt        d| d|       ||kD  rt        ||z  |z        |}}||k  r||fn||f}|j                  ||      S )a  
        Resizes `image`. Enforces conversion of input to PIL.Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to resize.
            size (`int` or `Tuple[int, int]`):
                The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
                matched to this.

                If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
                `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
                this number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
            resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                The filter to user for resampling.
            default_to_square (`bool`, *optional*, defaults to `True`):
                How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
                square (`size`,`size`). If set to `False`, will replicate
                [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
                with support for resizing only the smallest edge and providing an optional `max_size`.
            max_size (`int`, *optional*, defaults to `None`):
                The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
                greater than `max_size` after being resized according to `size`, then the image is resized again so
                that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller
                edge may be shorter than `size`. Only used if `default_to_square` is `False`.

        Returns:
            image: A resized `PIL.Image.Image`.
        r   r   zmax_size = zN must be strictly greater than the requested size for the smaller edge size = )rR  )rS  BILINEARr\  rE   rF   rG   re  rh   ri   r?   r   r-  rW   resize)r[  rZ   r-  rR  default_to_squaremax_sizer   r   shortlongrequested_new_short	new_shortnew_longs                r/   r|  z"ImageFeatureExtractionMixin.resizeX  sy   <  (389K9T9T%%e,%1%%e,EdD!;DdC CIN '1$'<d|47DQRGBT %

v16&ufovuot.8s.Cda#// L&93?RUY?Y\a?a;b8	'#66()( 4@@DvG   (*.1(Y2F2Q.RT\8	05	8,hPYEZ||D8|44r.   c                    | j                  |       t        |t              s||f}t        |      st        |t        j
                        rP|j                  dk(  r| j                  |      }|j                  d   dv r|j                  dd n|j                  dd }n|j                  d   |j                  d   f}|d   |d   z
  dz  }||d   z   }|d   |d   z
  dz  }||d   z   }t        |t        j                  j                        r|j                  ||||f      S |j                  d   dv rdnd}|sKt        |t        j
                        r|j                  ddd      }t        |      r|j                  ddd      }|dk\  r!||d   k  r|dk\  r||d   k  r|d||||f   S |j                  dd	 t        |d   |d         t        |d   |d         fz   }	t        |t        j
                        rt	        j                   ||	
      }
nt        |      r|j#                  |	      }
|	d	   |d   z
  dz  }||d   z   }|	d   |d   z
  dz  }||d   z   }|
d||||f<   ||z  }||z  }||z  }||z  }|
dt        d|      t%        |
j                  d	   |      t        d|      t%        |
j                  d   |      f   }
|
S )a  
        Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
        size given, it will be padded (so the returned result has the size asked).

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
                The image to resize.
            size (`int` or `Tuple[int, int]`):
                The size to which crop the image.

        Returns:
            new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
            height, width).
        r   r   r   r   NTF.r   )r   r   )r\  rE   ri   r   rq   rY  ry   rr  r   r-  rF   rG   cropra  permutert   
zeros_like	new_zerosrs   )r[  rZ   r-  image_shapetopbottomleftrightrn  	new_shape	new_imagetop_pad
bottom_padleft_pad	right_pads                  r/   center_cropz'ImageFeatureExtractionMixin.center_crop  s    	%%e,$&$<D 5!Zrzz%BzzQ((/-2[[^v-E%++ab/5;;WYXY?K ::a=%**Q-8K1~Q'A-tAwAa(Q.tAw eSYY__-::tS%899 !&A& 8e %,1a0u%aA. !8+a.0TQY5KXYNCZc&j$u*455 KK$DG[^(Dc$q'S^_`SaFb'cc	eRZZ(e9=IU#	2IR=;q>1a7{1~-
bMKN2q8{1~-	AF	#wz)8I+==>w'Qs9??2#6??QPST]TcTcdfTginPoAoo
	 r.   c                     | j                  |       t        |t        j                  j                        r| j	                  |      }|dddddddf   S )a  
        Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
        `image` to a NumPy array if it's a PIL Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
                be first.
        Nr   )r\  rE   rF   rG   r   rZ  s     r/   flip_channel_orderz.ImageFeatureExtractionMixin.flip_channel_order  sI     	%%e,eSYY__-''.ETrT1aZ  r.   c                     ||nt         j                  j                  }| j                  |       t	        |t         j                  j                        s| j                  |      }|j                  ||||||      S )a  
        Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
        counter clockwise around its centre.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
                rotating.

        Returns:
            image: A rotated `PIL.Image.Image`.
        )rR  expandcenter	translate	fillcolor)rF   rG   NEARESTr\  rE   re  rotate)r[  rZ   anglerR  r  r  r  r  s           r/   r  z"ImageFeatureExtractionMixin.rotate  sn      (389J9J%%e,%1%%e,E||HVFicl  
 	
r.   rD   )NT)F)NTN)Nr   NNN)r(   r)   r*   __doc__r\  re  rg  rq   rY  r
   rA   r?   rd  r   rr  ry  r|  r  r  r  r-   r.   r/   rW  rW    sj    <$RZZ eSj0A bjj @(2(hA5FIV!"
r.   rW  annotation_formatsupported_annotation_formatsc                     | |vrt        dt         d|       | t        j                  u rt	        |      st        d      | t        j
                  u rt        |      st        d      y y )NzUnsupported annotation format: z must be one of zInvalid COCO detection annotations. Annotations must a dict (single image) or list of dicts (batch of images) with the following keys: `image_id` and `annotations`, with the latter being a list of annotations in the COCO format.zInvalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts (batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with the latter being a list of annotations in the COCO format.)rW   r  r1   r4   r   r5   r   )r  r  r   s      r/   validate_annotationsr    s    
  <<:6(BRSoRpqrr,;;;/<B  ,:::.{;M  < ;r.   valid_processor_keyscaptured_kwargsc                     t        |      j                  t        |             }|r+dj                  |      }t        j	                  d| d       y y )Nz, zUnused or unrecognized kwargs: rx   )set
differencejoinr   r   )r  r  unused_keysunused_key_strs       r/   validate_kwargsr  )  sJ    o&11#6J2KLK;/88HJK r.   T)frozenc                       e Zd ZU dZdZee   ed<   dZee   ed<   dZ	ee   ed<   dZ
ee   ed<   dZee   ed<   dZee   ed<   d	 Zy)
SizeDictz>
    Hashable dictionary to store image size information.
    Nr   r   longest_edgeshortest_edger   r   c                 P    t        | |      rt        | |      S t        d| d      )NzKey z not found in SizeDict.)hasattrgetattrKeyError)r[  keys     r/   __getitem__zSizeDict.__getitem__>  s.    44%%cU"9:;;r.   )r(   r)   r*   r  r   r	   r?   r@   r   r  r  r   r   r  r-   r.   r/   r  r  1  sb     !FHSM E8C="&L(3-&#'M8C=' $J$#Ix}#<r.   r  )r   rD   )NN)NNr   N)NNNNNNNNNNNN)vr   r   collections.abcr   
contextlibr   dataclassesr   r"   r   typingr   r   r	   r
   rO   rq   r   	packagingr   utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   utils.constantsr   r   r   r   r    r!   	PIL.ImagerF   PIL.ImageOpsparse__version__base_versionrG   
ResamplingrS  r*  r+  torchvision.transformsr#   r  BOXr{  HAMMINGBICUBICLANCZOSpil_torch_interpolation_mappingrN   
get_loggerr(   r   rY  rh   
ImageInputr2  r%   r1   r7   r:   r   rB   r?   AnnotationTyperJ   rL   r[   r]   rf   rj   rm   boolru   rz   r   r   r   r   ri   r   r   r   r   r   r   r   r   rA   r   r   r   r  r  r1  r>  r   r;  rG  rU  rW  r  r  r  r-   r.   r/   <module>r     s    	 $ & !  ; ;       "  w}}]W]]3??3@@A]W]]SZE[[ YY11 YY!4< &&(9(A(A""$5$9$9''):)C)C&&(9(A(A&&(9(A(A&&(9(A(A+
'  
		H	% rzz>48I3JDQSQ[Q[L\^bcq^rr

 		 !l	n	 	
| 
$| $
9\ 9
    c5c4:!5667F ?wED E	52:: 5$ 5$ $D<L $N L$z"J./ L LF!v$z"J./!v!vHD: D>2::  NR"A::"A%-eCsCx4H.I%J"A"AL TXF::F*259I39N3O*PFF0D"** D3C DuUXZ]U] D0!c3h!! ! 38_	!>4U4;=O8O3P UY $sE$+<N7N2O TX  O(4U4QV;EW@W;X2Y O^b ON$sE$PU+DV?V:W1X N]a N)eC!223 )huo )Yj )X   F6&6&6&v -1(()(V000f---b  )	 !%,0al"#aa 
#a 	a
  )a XXaJ TX3$s$5563AI%3
d#45tDAR<S7TTU3, "&&*#'6:59!'+%)*. $%)/3&^&^UO&^ 4.&^ ud5k123	&^
 eT%[012&^ TN&^  }&^ TN&^ S#X'&^ ~&^ 4S>
"&^ +,&^T\
 \
~
'"'(8#(="> d 
	2L$s) Ld3i L $< < <r.   