
    "h>                        d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ defd	Z e
j6                         d
efd       Z e
j6                         d        Z e
j6                         defd       Z d Z! e
j6                         d        Z" e
j6                  d      d        Z# e	d       G d d             Z$ G d de      Z%y)    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                     d S )Nc                 *    | j                         rdS dS )N)       r   )r   r   r   )is_int8)lhsTyperhsTypes     \/var/www/html/sandstorm/venv/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py<lambda>zmin_dot_size.<locals>.<lambda>   s    GOO4EL <      r   s    r   min_dot_sizer      s    WWr   binaryc                    t         j                  j                  d| j                          dd      t         j                  j                  t         j                  j                  t              d|       g}|D ]  }t         j                  j                  |      s#t         j                  j                  |      sCt        j                  |dgt        j                        }|mt        j                  d|j                  d      t        j                   	      }|||j#                  d
      fc S  t%        d|        )NTRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr#   resultversions        r   _path_to_binaryr@      s     	

 06;
RWW__X.v>E
  177>>#277>>##6,,c;-?
HYHYZF!))$=v}}W?U]_]i]ij&a 0001 fX.
//r   c                  j    t        j                  t        d      d   dg      j                  d      } | S )Nptxasr   r$   r&   )r4   r5   r@   r9   )r?   s    r   get_ptxas_versionrC   &   s2    %%w'?'BK&PQXXY`aGNr   returnc                     t        | t              sJ t        t        | j	                  d            \  }}|dk(  r|dk  rd|z   S |dk(  ry|dk(  rd|z   S |dk(  rd	|z   S t        d
| z         )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P   U      F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapintsplitr<   )cuda_versionmajorminors      r   ptx_get_versionrW   ,   s    
 lC(((sL..s34LE5{19:aZ{Ez{Ez
X[gg
hhr   c                 T    | j                   }|t        d      \  }}t        |      }|S )NrB   )ptx_versionr@   rW   )optionsrY   _rT   s       r   get_ptx_version_from_optionsr\   ?   s1    %%K)'2<%l3r   c                 >    t        |       }t        d|      }d| }|S )NS   z+ptx)r\   min)rZ   rY   llvm_ptx_versionfeaturess       r   get_featuresrb   G   s.    .w7K 2{+&'(HOr   c                     t        | d      5 }t        j                  |j                               j	                         cd d d        S # 1 sw Y   y xY w)Nrb)openhashlibsha256read	hexdigest)r.   fs     r   	file_hashrk   U   s>    	dD	 4Q~~affh'1134 4 4s   1AAT)frozenc                   J   e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   dZ	eed	<   dZ
eed
<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZee   ed<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   dZeed <   dZeed!<   d" Zd# Z y)$CUDAOptions   	num_warpsr)   num_ctas   
num_stagesr   num_buffers_warp_specnum_consumer_groupsreg_dec_producerreg_inc_consumerNmaxnreg)r)   r)   r)   cluster_dimsrY   Tenable_fp_fusion)fp8e5fp8e4b15supported_fp8_dtypesr   deprecated_fp8_dtypestf32default_dot_input_precision)r   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsFdebugcudabackend_namesanitize_overflowc                    t        t              j                  dz  }| j                  i nt	        | j                        }|j                  dd       s%t        j                  dt        |dz              |d<   t        j                  | dt        |j                                      | j                  dkD  r| j                  | j                  dz
  z  dk(  sJ d       y )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcr   r   r)   znum_warps must be a power of 2)r   r1   parentr   dictr,   r*   getenvrP   object__setattr__tupleitemsrp   )selfdefault_libdirr   s      r   __post_init__zCUDAOptions.__post_init__t   s    h..6 ,,4b$t?O?O:P{D1')yy1H#n_pNpJq'rK$4k6G6G6I0JK~~!t~~!9K'LQR&R 	0/	0R&Rr   c           	      ^   t        | j                        }t        d t        |d         D              |d<   dj	                  t        |j                               D cg c]  \  }}| d|  c}}      }t        j                  |j                  d            j                         S c c}}w )Nc              3   <   K   | ]  \  }}|t        |      f  y wN)rk   ).0kvs      r   	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>   s     (htq!!Yq\):(hs   r   r[   -r&   )
r   __dict__r   sortedr/   r   rf   rg   encoderi   )r   	hash_dictnamevalkeys        r   hashzCUDAOptions.hash}   s    '	#((hviXeNfGg(h#h	- hh	@Q9RSID#4&#ST~~cjj12<<>> Ts   B)
)!__name__
__module____qualname__rp   rR   __annotations__rq   rs   rt   ru   rv   rw   rx   r   ry   r   rY   rz   boolr}   r   rP   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   rn   rn   [   s    IsHcJ!"3"  cc "GXc]!#L%#K!d!'<%*<(*5:*'--/I %*I*.!4.KE4L#"t"0?r   rn   c                        e Zd Zedefd       Zdeddf fdZdefdZd Z	d Z
deeef   fd	Zd
 Zed        Zed        Zed        Zed        Zed        Zd Z ej.                         d        Z xZS )CUDABackendr   c                      | j                   dk(  S )Nr   )backendr   s    r   supports_targetzCUDABackend.supports_target   s    ~~''r   rD   Nc                     t         |   |       |j                  | _        t	        | j                  t
              sJ d| _        y )Ncubin)super__init__arch
capabilityrO   rR   
binary_ext)r   r   	__class__s     r   r   zCUDABackend.__init__   s6      ++$//3///!r   c                    t         j                  j                         D ci c]  }||v s|||    }}d|vrPt        t         j                        }| j
                  dk\  r|j                  d       t        t        |            |d<   d|vr| j
                  dk\  rd|d<   d|vrt        j                  dd	      d	k(  |d<   | j
                  dk(  rd
nd|d<   t        di |S c c}w )Nr}   Y   fp8e4nvr~   Z   )r|   rz   TRITON_DEFAULT_FP_FUSION1i   @r   r   r   )rn   __dataclass_fields__keyssetr}   r   addr   r   r*   r   )r   optsr   argsr}   s        r   parse_optionszCUDABackend.parse_options   s    $/$D$D$I$I$KYqqTXy47
YY!-#&{'G'G#H "$$((3+08L1M+ND'("$."$0>,-T)')yy1KS'QUX'XD#$9=B9NTU,-"T"" Zs
   	C"C"c                     |j                   |j                  |j                  |j                  d   |j                  d   |j                  d   fS )Nr   r)      )rp   rq   sharedry   )r   metadatas     r   pack_metadatazCUDABackend.pack_metadata   sO    OO!!!$!!!$!!!$
 	
r   c                     dd l mc mc m} | j                  dk\  r|j
                  n|j                  t        | j                        d}|S )Nr   rI   )convert_custom_typesr   )	triton.language.extra.cudalanguageextrar   r   convert_custom_float8_sm80convert_custom_float8_sm70r   r   )r   r   codegen_fnss      r   get_codegen_implementationz&CUDABackend.get_codegen_implementation   sA    11 04"/DD++$JiJi(5

 r   c                     ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   s     r   get_module_mapzCUDABackend.get_module_map   s    819==r   c                 .    t        j                  |       y r   )r   load_dialects)r   ctxs     r   r   zCUDABackend.load_dialects   s    S!r   c                    t        j                  | j                        }|j                          t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j
                  j                  |       t        j
                  j                  |       t        j                  j                  |       |j!                  |        | S r   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_combineadd_canonicalizeradd_reorder_broadcastadd_cseadd_licmadd_symbol_dceadd_loop_unrollrun)modr   optpms       r   	make_ttirzCUDABackend.make_ttir   s    __S[[)
!!"%..r2#''+))"-b!r"$$R(##B'
s
r   c                 
   t        j                         }|j                  <|j                  d   |_        |j                  d   |_        |j                  d   |_        t        j                  j                  dd      dk(  rOt        j                         }t        j                  || j                        }| j                  j                  d       t        j                  | j                        }|j!                          t"        j$                  j'                  |d| |j(                  d	|j*                         t"        j,                  j/                  |       |d
z  dk\  rt"        j,                  j1                  |       t         j"                  j2                  j5                  ||       t"        j,                  j7                  |       t"        j,                  j9                  |       t"        j,                  j;                  |       t"        j,                  j7                  |       t"        j,                  j=                  ||dk\         t"        j>                  jA                  |       |d
z  dk\  r[t"        j,                  jC                  |       t"        j,                  jE                  |       t"        j,                  jG                  ||jH                         t"        j,                  jK                  ||jH                         t"        j,                  jM                  ||jH                         t"        j,                  jO                  ||jP                  |jH                  |jR                  |jT                         t"        j,                  jW                  ||jX                         t"        j,                  j[                  ||jH                         t"        j,                  j]                  |       t"        j,                  j=                  ||dk\         t"        j,                  j7                  |       t"        j,                  j_                  |       t"        j,                  ja                  |       t"        j>                  jA                  |       t"        j>                  jc                  |       |d
z  dk\  rRt         j"                  j2                  je                  |       t         j"                  j2                  jg                  |       t"        j>                  ji                  |       |jk                  |        |j                  |j                  |j
                  f|d<   | S )Nr   r)   r   MLIR_ENABLE_REMARK0r   Tzcuda:r   rM      rI   	   ry   )6r   ClusterInfory   clusterDimXclusterDimYclusterDimZr*   r+   r,   r   
source_mgrr   source_mgr_diagr   printOpOnDiagnosticr   r   r   r   add_convert_to_ttgpuirrp   rq   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r   add_optimize_accumulator_init add_combine_tensor_select_and_ifadd_ws_task_partitionru   add_taskid_propagateadd_ws_data_partitionadd_ws_code_partitionrt   rv   rw   add_pipeliners   add_ws_loweringadd_prefetchadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringr   r   )r   r   r   r   cluster_infosrcMgrdiagr   s           r   
make_ttgirzCUDABackend.make_ttgir   s   ))+''*'7'7':L$'*'7'7':L$'*'7'7':L$::>>.4;__&F%%fckk:DKK++D1__S[[)
**2zl/CS]]TVX[XdXde##B'q NN))"-,,R>44R833B7,,R044R800Z25EFb!q NN88<NN;;B?NN00S5L5LMNN//C4K4KLNN00S5L5LMNN00S5N5NPSPgPg141E1EsG[G[]NN''CNN;NN**2s/F/FG##B'00Z25EF44R82226//3b!$$R(q MM##77;MM##44R8''+
s$0$<$<l>V>VXdXpXp#q 
r   c                    t        |      }| j                  d      }||dxx   |z  cc<   | }t        j                  |j                        }|j                          t        j                  j                  dd      dk(  rOt        j                         }t        j                  ||j                        }	|j                  j                  d       t        j                  j                  j!                  |       t        j                  j#                  |       t        j$                  j'                  |       t        j$                  j)                  |       t        j                  j+                  |       t        j                  j                  j-                  |||       t        j                  j.                  j1                  |       t        j$                  j3                  |       t        j4                  j7                  |       t        j4                  j9                  |       t        j4                  j;                  |       t        j                  j                  dd      dk(  rt        j<                  j?                  |       |jA                  |       t        jB                          t        j                         }
t        jD                  ||
      }|dk(  rd	nd
| }tG        |      }d}t        jH                  ||||       t        jJ                  |       |jL                  R|jO                         D ]?  }|jQ                         r|jS                         s%|jU                  |jL                         A |jV                  r4|jV                  D cg c]  \  }}|	 }}}t        jX                  ||       t        jZ                  |t        j\                         | j                  d      |d<   t_        |      }~~
|S c c}}w )Nz"triton_gpu.num-warp-groups-per-ctarp   r   r   r   TTRITON_DISABLE_LINE_INFOr   sm_90asm_nvptx64-nvidia-cudaztriton_gpu.sharedr   )0r\   get_int_attrr   r   r   r   r*   r+   r,   r   r   r   r   r   r   r   %add_decompose_unsupported_conversionsr  convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryadd_to_llvmirr   add_nvgpu_to_llvmadd_arith_to_llvmirr   r   r   r   llvmiradd_di_scoper   init_targets	to_modulerb   attach_datalayoutset_nvvm_reflect_ftzrx   get_functionsis_declarationis_external_linkageset_nvvm_maxnregr   link_extern_libsoptimize_moduleOPTIMIZE_O3rP   )srcr   rZ   r   rY   num_warp_groupsr   r   r  r  r   llvm_modprocra   tripler   r   r.   r=   rets                       r   	make_llirzCUDABackend.make_llir   s   27; **+OP&[!_4!__S[[)
::>>.4;__&F%%fckk:DKK++D1CCBG77;$$R(**2.11"5++B
KH11"5**2.''+b!$$R(::>>4c:cAMM&&r*
s,,.>>#w/%+x3zl1C(&xx@##H- ??&++- 8'')a.C.C.E&&w78 .5.A.ABltTTBEB!!(E2Xt'7'78 !--.AB(m
 Cs   P c           	         t        |      }d}|dk(  rdnd| }t        |      }t        j                  | |||dg|j                  d      }t        j                  d|      }	t        |	      dk(  sJ |	d	   |d
<   |dz   d|dz   }t        j                  dd| |t
        j                        }t        j                  dd|      }t        j                  j                  dd      dk(  rt        d       t        |       |S )Nr  r   r  r  znvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r)   r   r   rM   rF   z\.version \d+\.\d+z	.version r'   z,\s*debug|debug,\s*r"   NVPTX_ENABLE_DUMPr   r   z // -----// NVPTX Dump //----- //)r\   rb   r   translate_to_asmrz   r7   findalllensubr:   r*   r+   r,   print)
r0  r   r   r   rY   r4  r3  ra   r5  namess
             r   make_ptxzCUDABackend.make_ptx=  s   237&%+x3zl1C$##CxBSATVYVjVjlqr

FL5zQ 8$b);r>*:;ff*i},EsRTR^R^_ff+R5::>>-s3s:45#J
r   c                 v   t        d      \  }}t        j                  ddd      5 }t        j                  ddd      5 }|j                  |        |j	                          |j
                  dz   }t        j                  j                  d	      rg nd
g}	|j                  rg ndg}
|dk(  rdnd}t        j                  j                  dd      dk(  rddgng }|g|	|
d|d| | |j
                  d|}	 t        j                  |dd|       t        j                  j                  |j
                        rt        j                  |j
                         t        j                  j                  |j
                        rt        j                  |j
                         t!        |d!      5 }|j#                         }d d d        t        j                  j                  |      rt        j                  |       d d d        d d d        S # t        j                  $ r}t!        |j
                        5 }|j#                         }d d d        n# 1 sw Y   nxY wt        j                  j                  |j
                        rt        j                  |j
                         |j$                  dk(  rd}n2|j$                  dt&        j(                  z   k(  rd}nd|j$                   }t+        | d ddj-                  |       d       d }~ww xY w# 1 sw Y   UxY w# 1 sw Y   &xY w# 1 sw Y   S xY w)"NrB   Fwz.ptx)deletemodesuffixrz.logz.or  z	-lineinfoz--fmad=falser   ar"   DISABLE_PTXAS_OPTr   r   z--opt-levelz-vz--gpu-name=sm_z-oT)check	close_fdsr%      z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command:  
rd   )r@   tempfileNamedTemporaryFilewriteflushr   r*   r+   r,   rz   r4   r   r.   r2   removeCalledProcessErrorre   rh   
returncodesignalSIGSEGVr<   r/   )r0  r   r   r   rB   r[   fsrcflogfbin	line_infofmadrD  	opt_level	ptxas_cmdelog_filelogerrorrj   r   s                       r   
make_cubinzCUDABackend.make_cubinS  s   "7+q((CO '	 SW''u3vN'	 RVJJsOJJL99t#D jjnn-GH{mI--2N3CD&",S"F02

?RTW0X\_0_,egI!$(*.1:>LZLY_X`<acgclclnrtxINydS77>>$)),IIdii(77>>$)),IIdii($ dD! !Q!ww~~d#		$O'	  '	 P + 00 N$))_ *"--/C* * *77>>$)),IIdii(<<3&?E\\S6>>%994E=all^LE"eW -77:e <558XXi5H4I$M N NN"! !I'	  '	  '	 P sz   L. B:L!;B)H$L!0L<L!=L.L"L7I		LIB6LLL!LL!!L+	&L..L8c                 b      fd|d<    fd|d<    fd|d<    fd|d<    fd	|d
<   y )Nc                 *    j                  | |      S r   )r   r0  r   rZ   r   s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8W/U r   r   c                 @    j                  | |j                        S r   )r  r   re  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>      XwX\XgXg0h r   ttgirc                 @    j                  | |j                        S r   )r6  r   re  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8WVZVeVe/f r   llirc                 @    j                  | |j                        S r   )r?  r   re  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  s    dmmC7TXTcTc.d r   ptxc                 @    j                  | |j                        S r   )rb  r   re  s     r   r   z(CUDABackend.add_stages.<locals>.<lambda>  rg  r   r   r   )r   stagesrZ   s   ` `r   
add_stageszCUDABackend.add_stages  s0    Uvhwfvduhwr   c                 8    t               }| d| j                   S )Nr   )rC   r   )r   r?   s     r   r   zCUDABackend.hash  s     #%!DOO,--r   )r   r   r   staticmethodr   r   r   r
   r   r   r   r   rP   r   r   r   r   r  r6  r?  rb  ro  	functools	lru_cacher   __classcell__)r   s   @r   r   r      s    (	 ( ("y "T "#S #"
>S*_ 5 >"   2 2h : :x  * * *Xi Y. .r   r   )&triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   dataclassesr	   rr  typingr
   r   r   r   typesr   rf   r7   rN  rU  r*   r4   pathlibr   r   rs  rP   r@   rC   rR   rW   r\   rb   rk   rn   r   r   r   r   <module>r{     s    ; 8 8 !  - -   	   	  X X 0C 0 0   
 iS i i$ 
 
 T4 4
 $%? %? %?PF.+ F.r   