
    :Qg                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
Z
d dlZd dlmZmZ  e j                   d      Zdededefd	Zd
edee   fdZd Zd Zd Z	 ddededej0                  dedef
dZd ZdefdZdefdZd de	ej<                  ee   f   dee    de	edf   fdZ!d deee      dee    de	edf   fdZ"d deee      dee    de	edf   fdZ#deee      defdZ$d Z%y)!    N)Path)ListOptionalUnion)elements_from_jsonelements_to_textzunstructured.evaldocpathoutput_typereturnc                     	 |dk(  rt        t        |             }|S |dk(  rt        |       }|S t        d| d      # t        $ r}t        j                  d|         |d}~ww xY w)z
    Convert given input document (path) into cct-ready. The function only support conversion
    from `json` or `txt` file.
    jsontxtzZFile type not supported. Expects one of `json` or `txt`,                     but received z	 instead.zCould not read the file N)r   r   _read_text_file
ValueErrorloggererror)r	   r
   
output_cctes       W/var/www/html/answerous/venv/lib/python3.12/site-packages/unstructured/metrics/utils.py_prepare_output_cctr      s    
& )*<W*EFJ  E!(1J  ""-i9   /y9:s   > > > 	A&A!!A&dirc                     g }t        j                  |       D ]s  \  }}}|D ]h  }t         j                  j                  ||       }|dk(  r|j	                  |       :|j	                  t         j                  j                  ||             j u |S )z
    Recursively lists all files in the given directory and its subdirectories.
    Returns a list of all files found, with each file's path relative to the
    initial directory.
    .)oswalkpathrelpathappendjoin)r   listdirdirpath_	filenamesfilenamerelative_paths          r   _listdir_recursiver&   %   s     G!# FI! 	FHGGOOGS9M#x(rww||M8DE	FF N    c                 4    ddddd}| j                  |      S )a  
    Renames aggregated columns in a DataFrame based on a predefined mapping.

    Parameters:
    df (pandas.DataFrame): The DataFrame with aggregated columns to rename.

    Returns:
    pandas.DataFrame: A new DataFrame with renamed aggregated columns.
    meanstdevpstdevcount)_mean_stdev_pstdev_count)columns)rename)df
rename_maps     r   _rename_aggregated_columnsr5   7   s$     "WU\]J99Z9((r'   c                  L    t        j                  | d      j                         S )zs
    Concatenates multiple pandas DataFrame objects along the columns (side-by-side)
    and resets the index.
       )axis)pdconcatreset_index)r3   s    r   _format_grouping_outputr<   E   s    
 99Ra ,,..r'   c                    t        |       dk(  ry| j                  j                         }|D cg c]*  }t        t        |      t        d | |   D                    , c}t	        j
                  dj                  fdt        |      D                     t	        j
                  dt              z  dt        |      dz
  z  z          | j                         D ]  \  }}g |D ]@  }t        |t              rj                  |d       'j                  t        |             B t	        j
                  dj                  fd	t        t                    D                      yc c}w )
zD
    Displays the evaluation metrics in a formatted text table.
    r   Nc              3   D   K   | ]  }t        t        |              y wN)lenstr).0items     r   	<genexpr>z_display.<locals>.<genexpr>U   s     CST^Cs     c              3   L   K   | ]  \  }}|j                  |           y wr?   ljust)rB   iheader
col_widthss      r   rD   z_display.<locals>.<genexpr>W   s"     X	6Z]3X   !$-r7   z.3fc              3   L   K   | ]  }|   j                  |           y wr?   rG   )rB   rI   rK   formatted_rows     r   rD   z_display.<locals>.<genexpr>a   s%     ^q]1%++JqM:^rL   )r@   r1   tolistmaxclickechor   	enumeratesumiterrows
isinstancefloatr   rA   range)r3   headersrJ   r"   rowrC   rK   rO   s         @@r   _displayr\   M   s(    2w!|jj!GSZIOCKC6
CCDJ 
JJsxxXYwEWXXY	JJsS_$sc'lQ.>'??@++- 	
3 	0D$&$$Sz3$$SY/		0
 	

HH^E#mJ\D]^^	
	
s   /E+	directoryr$   r3   mode	overwritec                    |dvrt        d      | rt        |       j                  d       d|j                  v r|d   j	                  t
              |d<   d|j                  v r#d|j                  v r|j                  ddgd       |st        | |      }|j                  t        j                  j                  | |      d	|d
|dk(         y)z
    Save the metrics report to tsv file. The function allows an option 1) to choose `mode`
    as `w` (write) or `a` (append) and 2) to `overwrite` the file if filename existed or not.
    )waz/Mode not supported. Mode must be one of [w, a].T)exist_okr,   r$   	connector)byinplace	Fra   )sepr^   indexrJ   N)r   r   mkdirr1   astypeintsort_values_get_non_duplicated_filenameto_csvr   r   r   )r]   r$   r3   r^   r_   s        r   _write_to_filerp   e   s     :JKKYt,"**k((-7RZZK2::$=
;
3TB/	8DII
Y)t$eUY]`U`  r'   c                 P    t        j                  d|       }|rt        |d         S y)a  
    A function that defines the sorting method for duplicated file names. For example,
    with filename.ext filename (1).ext filename (2).ext filename (10).ext - this function
    extracts the integer in the bracket and sort those numbers ascendingly.
    z(\d+)r   )refindallrl   )r$   numberss     r   _sorting_keyrv   {   s+     jj8,G72; r'   c           	         |j                  dd      \  }}dt        j                  |       dt        j                  |       d}t        | D cg c]  }t        j                  ||      s| c}t
              }g }|D ]D  }t        j                  d|      }	|	s|j                  t        |	j                  d                   F |j                          d}
|D ]  }||
k(  r|
dz  }
 n |dz   t        |
      z   d	z   |z   S c c}w )
z
    Checks the duplicity of the file name from the list and run the numerical check
    of the minimum number needed as extension to not overwrite the exising file.
    Returns a string of file name in the format of `filename (<min number>).ext`.
    r   r7   ^z(?: \((\d+)\))?\.$)keyz	\((\d+)\)z (z).)rsplitrs   escapesortedmatchrv   searchr   rl   groupsortrA   )	file_listtarget_filenameoriginal_filename	extensionpatternfduplicated_filesru   filer~   counternumbers               r   _uniquity_filer      s    $3#9#9#q#A y299./00A"))IBVAWWXYG)LQrxx7KqLR^_G  0		,-NN3u{{1~./0
 LLNG WqLG	 t#c'l2T9IEE# Ms   D*Dc                 D    t        t        j                  |       |      }|S )zs
    Helper function to calls the `_uniquity_file` function. Takes in directory and file name
    to check on.
    )r   r   r    )r   r$   s     r   rn   rn      s    
 bjjox8HOr'   scoresroundingc                 j    t        |       dk(  ryt        j                  |       }|s|S t        ||      S )z
    Find mean from the list. Returns None if no element in the list.

    Args:
        rounding (int): optional argument that allows user to define decimal points. Default at 3.
    r   N)r@   
statisticsr)   round)r   r   r)   s      r   r-   r-      s6     6{a??6"Dx  r'   c                     | D cg c]  }||	 } }t        |       dk  ry|st        j                  |       S t        t        j                  |       |      S c c}w )z
    Find standard deviation from the list.
    Returns None if only 0 or 1 element in the list.

    Args:
        rounding (int): optional argument that allows user to define decimal points. Default at 3.
    Nr7   )r@   r   r*   r   r   r   scores      r   r.   r.      s\     "(=5+<e=F=
6{a''!!&)844 >
   AAc                     | D cg c]  }||	 } }t        |       dk  ry|st        j                  |       S t        t        j                  |       |      S c c}w )z
    Find population standard deviation from the list.
    Returns None if only 0 or 1 element in the list.

    Args:
        rounding (int): optional argument that allows user to define decimal points. Default at 3.
    Nr7   )r@   r   r+   r   r   s      r   r/   r/      s\     "(=5+<e=F=
6{a  ((""6*H55 >r   c                     t        |       S )z,
    Returns the row count of the list.
    )r@   )r   s    r   r0   r0      s     v;r'   c                    t         j                  j                  |       st        d|  d      	 t	        | d      5 }|j                         }ddd       |S # 1 sw Y   S xY w# t        $ r}t        d|  d|       d}~ww xY w)zG
    Reads the contents of a text file and returns it as a string.
    zThe file at z does not exist.ignore)errorsNz+An error occurred when reading the file at z: )r   r   existsFileNotFoundErroropenreadOSErrorIOError)r   r   textr   s       r   r   r      s    
 77>>$,tf4D EFFQ$x( 	A668D		 QCD6A3OPPQs4   A% A	A% A"A% "A% %	B.A??B)ra   T)   )&loggingr   rs   r   pathlibr   typingr   r   r   rR   pandasr9   unstructured.staging.baser   r   	getLoggerr   rA   r   r&   r5   r<   r\   	DataFrameboolrp   rv   r   rn   SeriesrX   rl   r-   r.   r/   r0   r    r'   r   <module>r      s    	 	   ( (   J			.	/ 3 3 *C DI $)/
2 Y]!')||;>QU, F# F83 !%		4;./ !8C= !QVW\^bWbQc !54( 5HSM 5%PUW[P[J\ 5$6D%) 6Xc] 65QVX\Q\K] 6 4( U Qr'   