
    *#hr7                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ  ee      Z G d
 d      Z G d de
      Z G d dee
      Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d  d!e      Z' G d" d#      Z(y)$    N)ABCabstractmethod)Path)DictListOptionalTypeUnion   )config   )FileLock)
get_loggerc                   Z    e Zd Zddee   fdZdedefdZdededefd	Zdd
ededefdZ	y)ExtractManagerN	cache_dirc                     |r.t         j                  j                  |t        j                        nt        j
                  | _        t        | _        y N)	ospathjoinr   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr   s     S/var/www/html/sandstorm/venv/lib/python3.12/site-packages/datasets/utils/extract.py__init__zExtractManager.__init__   s6    FOBGGLLF$A$ABU[UsUs 	 #    r   returnc                     ddl m} t        j                  j	                  |      }t        j                  j                  | j                   ||            S )Nr   )hash_url_to_filename)
file_utilsr#   r   r   abspathr   r   )r   r   r#   abs_paths       r   _get_output_pathzExtractManager._get_output_path   s:    4 77??4(ww||D,,.B8.LMMr    output_pathforce_extractc                     |xsY t         j                  j                  |       xr7 t         j                  j                  |      xr t        j                  |       S r   )r   r   isfileisdirlistdir)r   r(   r)   s      r   _do_extractzExtractManager._do_extract%   sI     
{++lRWW]];5O5kTVT^T^_jTk0l	
r    
input_pathc                     | j                   j                  |      }|s|S | j                  |      }| j                  ||      r| j                   j	                  |||       |S r   )r   infer_extractor_formatr'   r.   extract)r   r/   r)   extractor_formatr(   s        r   r2   zExtractManager.extract*   s]    >>@@L++J7K7NN"":{<LMr    r   F)
__name__
__module____qualname__r   strr   r'   boolr.   r2    r    r   r   r      s\    #(3- #NS NS N
s 
4 
D 

# d s r    r   c                   v    e Zd Zeedeeef   defd              Z	e
edeeef   deeef   ddfd              Zy)BaseExtractorr   r!   c                      y r   r:   clsr   kwargss      r   is_extractablezBaseExtractor.is_extractable5        	r    r/   r(   Nc                      y r   r:   )r/   r(   s     r   r2   zBaseExtractor.extract:   rB   r    )r5   r6   r7   classmethodr   r
   r   r8   r9   rA   staticmethodr2   r:   r    r   r<   r<   4   sw    %c	"2     E$), 5s;K PT   r    r<   c                   p    e Zd ZU g Zee   ed<   edee	e
f   defd       Zed	dee	e
f   dedefd       Zy)
MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                 h    t        | d      5 }|j                  |      cd d d        S # 1 sw Y   y xY w)Nrb)openread)r   rI   fs      r   read_magic_numberz*MagicNumberBaseExtractor.read_magic_numberC   s0    $ 	/66-.	/ 	/ 	/s   (1magic_numberr!   c                     s/t        d | j                  D              }	 | j                  ||      t	        fd| j                  D              S # t        $ r Y yw xY w)Nc              3   2   K   | ]  }t        |        y wr   )len).0cls_magic_numbers     r   	<genexpr>z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>K   s     %f@Pc*:&;%fs   Fc              3   @   K   | ]  }j                  |        y wr   )
startswith)rT   rU   rP   s     r   rV   z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>P   s     gAQ<**+;<gs   )maxrH   rO   OSErrorany)r?   r   rP   rI   s     ` r   rA   z'MagicNumberBaseExtractor.is_extractableH   sd    "%%fTWTeTe%f"f"44T;NO gUXUfUfggg  s   A 	AANr    )r5   r6   r7   rH   r   bytes__annotations__rE   r
   r   r8   intrO   rD   r9   rA   r:   r    r   rG   rG   @   sq    !#M4;#/dCi 0 /s / / h%c	"2 h% hRV h hr    rG   c                   r    e Zd Zedeeef   defd       Ze	d        Z
e	deeef   deeef   ddfd       Zy)	TarExtractorr   r!   c                 ,    t        j                  |      S r   )tarfile
is_tarfiler>   s      r   rA   zTarExtractor.is_extractableT   s    !!$''r    c              #   >  K   dt         dt         fddt         dt         dt        ffddt         dt        ffd} |      }| D ]  } |j                  |      r$t        j	                  d|j                   d       :|j                         r9 |||      r0t        j	                  d|j                   d	|j                          |j                         r9 |||      r0t        j	                  d|j                   d
|j                          |  yw)a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r!   c                 z    t         j                  j                  t         j                  j                  |             S r   )r   r   realpathr%   )r   s    r   resolvedz*TarExtractor.safemembers.<locals>.resolvedd   s$    77##BGGOOD$9::r    basec                 p     t         j                  j                  ||             j                  |       S r   )r   r   r   rX   )r   ri   rh   s     r   badpathz)TarExtractor.safemembers.<locals>.badpathg   s+    T4 89DDTJJJr    c                      t         j                  j                  |t         j                  j                  | j                                    } | j
                  |      S )N)ri   )r   r   r   dirnamenamelinkname)infori   tiprk   rh   s      r   badlinkz)TarExtractor.safemembers.<locals>.badlinkk   s>    277<<bggoodii.HIJC4==s33r    zExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r8   r9   rn   loggererrorissymro   islnk)membersr(   rr   ri   finfork   rh   s        @@r   safememberszTarExtractor.safemembersX   s    	;3 	;3 	;	K# 	KS 	KT 	K	4 	4 	4
 $ 	Euzz4(~ejj\9STU75$#7~ejj\9QRWR`R`Qabc75$#7~ejj\9STYTbTbScde	s   DDr/   r(   Nc                     t        j                  |d       t        j                  |       }|j	                  |t
        j                  ||             |j                          y )NTexist_ok)rw   )r   makedirsrc   rL   
extractallra   ry   close)r/   r(   tar_files      r   r2   zTarExtractor.extract|   sI    
K$/<<
+K1I1I(T_1`ar    )r5   r6   r7   rD   r
   r   r8   r9   rA   rE   ry   r2   r:   r    r   ra   ra   S   s|    (%c	"2 ( ( ( ! !F E$), 5s;K PT  r    ra   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)GzipExtractors   r/   r(   r!   Nc                     t        j                  | d      5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY wNrK   wb)gziprL   shutilcopyfileobj)r/   r(   	gzip_fileextracted_files       r   r2   zGzipExtractor.extract   s]    YYz4( 	>Ik4( >N""9n=>	> 	>> >	> 	>!   AAAA	AA!	r5   r6   r7   rH   rE   r
   r   r8   r2   r:   r    r   r   r      sC     MM>E$), >5s;K >PT > >r    r   c                   |     e Zd Zg dZed
deeef   dede	f fd       Z
edeeef   deeef   ddfd	       Z xZS )ZipExtractor)s   PKs   PKs   PKr   rP   r!   c                    t         |   ||      ry	 ddlm}m}m}m}m}m}m	}	m
}
m}m} t        |d      5 } |	|      }|r||   dk(  r||   dk(  r||   dk(  r
	 d d d        y||   ||   k(  ry|j                  ||          |j                         ||   k(  rO||   |
k\  rG|j!                  |
      }t#        |      |
k(  r(t%        j&                  ||      }||   |k(  r
	 d d d        yd d d        y# 1 sw Y   yxY w# t(        $ r Y yw xY w)NrP   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirrK   F)superrA   zipfiler   r   r   r   r   r   r   r   r   r   rL   seektellrM   rS   structunpack	Exception)r?   r   rP   r   r   r   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__s                    r   rA   zZipExtractor.is_extractable   s4   7!$\!B	   dD! 0R$R01Q66);LPQ;QV\]hVimnVn#	0 0
   01VO5LL{ 34779{(;;y@QUc@c#%77>#:D"4yN:*0--8H$*O#*=#9=M#M+/0 0 0 0  		sA   $C? $C3C? &A;C3!C? *C? 3C<8C? <C? ?	D
Dr/   r(   Nc                     t        j                  |d       t        j                  | d      5 }|j	                  |       |j                          d d d        y # 1 sw Y   y xY w)NTr{   r)r   r}   r   ZipFiler~   r   )r/   r(   zip_files      r   r2   zZipExtractor.extract   sM    
K$/__Z- 	,NN	 	 	s   "AA"r\   )r5   r6   r7   rH   rD   r
   r   r8   r]   r9   rA   rE   r2   __classcell__)r   s   @r   r   r      sz    M "%c	"2 "% "RV " "H E$), 5s;K PT  r    r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)XzExtractors   7zXZ r/   r(   r!   Nc                     t        j                  |       5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr   )lzmarL   r   r   r/   r(   compressed_filer   s       r   r2   zXzExtractor.extract   sd    YYz" 	Dok4( DN""?NCD	D 	DD D	D 	Ds!   AAAA	AA r   r:   r    r   r   r      sI    01MDE$), D5s;K DPT D Dr    r   c                   F    e Zd ZddgZedeeef   deeef   ddfd       Zy)RarExtractors   Rar! s   Rar! r/   r(   r!   Nc                     t         j                  st        d      dd l}t	        j
                  |d       |j                  |       }|j                  |       |j                          y )NzPlease pip install rarfiler   Tr{   )	r   RARFILE_AVAILABLEImportErrorrarfiler   r}   RarFiler~   r   )r/   r(   r   rfs       r   r2   zRarExtractor.extract   sK    '':;;
K$/__Z(
k"

r    r   r:   r    r   r   r      sG    (*ABME$), 5s;K PT  r    r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)ZstdExtractors   (/r/   r(   r!   Nc                 
   t         j                  st        d      dd l}|j	                         }t        | d      5 }t        |d      5 }|j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)NzPlease pip install zstandardr   rK   r   )r   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorrL   copy_stream)r/   r(   zstddctxifhofhs         r   r2   zZstdExtractor.extract   sy    ))<== $$&*d# 	'sDd,C 	'sS#&	' 	' 	' 	' 	' 	's#   A9	A-A9-A6	2A99Br   r:   r    r   r   r      sD    ()M'E$), '5s;K 'PT ' 'r    r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)Bzip2Extractors   BZhr/   r(   r!   Nc                     t        j                  | d      5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY wr   )bz2rL   r   r   r   s       r   r2   zBzip2Extractor.extract   sf    XXj$' 	D?k4( DN""?NCD	D 	DD D	D 	Dr   r   r:   r    r   r   r      sI    $%MDE$), D5s;K DPT D Dr    r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)SevenZipExtractors   7z'r/   r(   r!   Nc                     t         j                  st        d      dd l}t	        j
                  |d       |j                  | d      5 }|j                  |       d d d        y # 1 sw Y   y xY w)NzPlease pip install py7zrr   Tr{   r   )r   PY7ZR_AVAILABLEr   py7zrr   r}   SevenZipFiler~   )r/   r(   r   archives       r   r2   zSevenZipExtractor.extract   s]    %%899
K$/
C0 	,G{+	, 	, 	,s   	A$$A-r   r:   r    r   r   r      sD    01M,E$), ,5s;K ,PT , ,r    r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)Lz4Extractors   "Mr/   r(   r!   Nc                    t         j                  st        d      dd l}|j                  j                  | d      5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)NzPlease pip install lz4r   rK   r   )r   LZ4_AVAILABLEr   	lz4.frameframerL   r   r   )r/   r(   lz4r   r   s        r   r2   zLz4Extractor.extract  s    ##677YY^^J- 	Dk4( DN""?NCD	D 	DD D	D 	Ds#   A=	A1 A=1A:	6A==Br   r:   r    r   r   r     sI    ()MDE$), D5s;K DPT D Dr    r   c                      e Zd ZU eeeeeee	e
ed	Zeeee   f   ed<   ed        Zedeeef   defd       Zeddeeef   dedefd	       Zedeeef   defd
       Ze	 	 ddeeef   deeef   dee   dee   ddf
d       Zy)r   )	tarr   zipxzrarr   r   7zr   
extractorsc                 V    t        d | j                  j                         D              S )Nc              3   t   K   | ]0  }t        |t              r|j                  D ]  }t        |        2 y wr   )
issubclassrG   rH   rS   )rT   r   extractor_magic_numbers      r   rV   z9Extractor._get_magic_number_max_length.<locals>.<genexpr>  sD      
)%=>*3*A*A	
 ' &'
'
s   68)rY   r   values)r?   s    r   _get_magic_number_max_lengthz&Extractor._get_magic_number_max_length  s)     
 ^^224
 
 	
r    r   rI   c                 P    	 t         j                  | |      S # t        $ r Y yw xY w)N)rI   r    )rG   rO   rZ   )r   rI   s     r   _read_magic_numberzExtractor._read_magic_number&  s0    	+==dXk=ll 		s    	%%return_extractorr!   c                     t        j                  dt               | j                  |      }|r|sdS d| j                  |   fS |sdS dS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.categoryTF)FN)warningswarnFutureWarningr1   r   )r?   r   r   r3   s       r   rA   zExtractor.is_extractable-  sU    4"	

 55d;/4]dCNNK[<\5]],u?-?r    c                     | j                         }| j                  ||      }| j                  j                         D ]  \  }}|j	                  ||      s|c S  y )Nr   )r   r   r   itemsrA   )r?   r   magic_number_max_lengthrP   r3   r   s         r   r1   z Extractor.infer_extractor_format9  s_    "%"B"B"D--d4KL+.>>+?+?+A 	('i''<'H''	(r    Nr/   r(   r3   r   c                    t        j                  t         j                  j                  |      d       t	        t        |      j                  d            }t        |      5  t        j                  |d       |s|dk7  rd|dk7  st        |t              s%t        j                  dt               |dk7  r|n|}n| j                  |   }|j                  ||      cd d d        S t        j                  dt               | j                  j!                         D ]0  }|j#                  |      s|j                  ||      c cd d d        S  	 d d d        y # 1 sw Y   y xY w)	NTr{   z.lock)ignore_errors
deprecatedzsParameter 'extractor' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'extractor_format' instead.r   ztParameter 'extractor_format' was made required in version 2.4.0 and not passing it will raise an exception in 3.0.0.)r   r}   r   rm   r8   r   with_suffixr   r   rmtree
isinstancer   r   r   r   r2   r   rA   )r?   r/   r(   r3   r   	lock_paths         r   r2   zExtractor.extractA  s8    	BGGOOK04@[)55g>?	i  	JMM+T:9#<,J?OQT4UMM:!.
 .7,-F	L\I #/? @I (([A	J 	J **
 "%!6!6!8 JI //
;(00[II+	J 	J&J'	J 	J 	Js    #A9E&A	E0EEE!r4   )Nr   )r5   r6   r7   ra   r   r   r   r   r   r   r   r   r   r   r8   r	   r<   r^   rD   r   rE   r
   r   r_   r   r9   rA   r1   r   r2   r:   r    r   r   r     sL    
2JS$}--. 
 
 
 tSy!1    	@%c	"2 	@d 	@W[ 	@ 	@ (%c	*: (s ( ( 
 +/-9J$)$J 49%J #3-	J
 M*J 
J Jr    r   ))r   r   r   r   r   r   rc   r   r   abcr   r   pathlibr   typingr   r   r   r	   r
    r   filelockr   loggingr   r5   rs   r   r<   rG   ra   r   r   r   r   r   r   r   r   r   r:   r    r   <module>r      s    
   	      #  4 4    
H	 <	C 	h}c h&.= .b>, >1+ 1hD* D+ ', 'D- D,0 ,D+ DRJ RJr    