
    6Ig@                         d dl Z d dlmZ d dlZd dlZd dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlZd dlZd dlmZ d d	lmZ d dlZ e         G d
 d      Zy)    N)embedding_functions)ListDictOptional)CrossEncoder)RecursiveCharacterTextSplitter)ThreadPoolExecutoras_completed)PyPDFLoader)datetime)load_dotenvc                      e Zd Zd$dedee   dedededededefd	Zd
 Z	deeef   fdZ
d Zdedee   fdZd Zd%dedededededededefdZd&dedee   dee   defdZd ZdeddfdZd'd eded!ed"edee   f
d#Zy)(HybridSearchcollection_nameuserfolder_nameduckdb_pathuuid_storage_path
chunk_sizechunk_overlapalphac	                    t        j                  d      dz   t        |j                        z   dz   t        |      z   }	|| _        || _        || _        |	dz   |z   | _        t        dd      | _	        | j                         | _        |	dz   }
t         j                  j                  |
      st        j                  |
       t        j                   |
      | _        t%        j&                  t        j                  d      d	
      }| j"                  j)                  ||      | _        t-        j.                  |	dz   |z         | _        | j3                          t5        ||t6        d      | _        y)z5
        Initialize the hybrid search system
        DB_PATH/z$cross-encoder/ms-marco-MiniLM-L-6-v2i   )
max_lengthz
/chroma_db)pathOPENAI_API_KEYztext-embedding-3-small)api_key
model_name)nameembedding_functionF)r   r   length_functionis_separator_regexN)osgetenvstr	user_coder   r   r   r   r   reranker_model_load_uuid_storage	pdf_uuidsr   existsmakedirschromadbPersistentClientchroma_clientr   OpenAIEmbeddingFunctionget_or_create_collectionchroma_collectionduckdbconnectconn_setup_duckdbr   lentext_splitter)selfr   r   r   r   r   r   r   r   r   persist_directory	openai_efs               3/var/www/html/answerous/dependencies/chromautils.py__init__zHybridSearch.__init__   sF    ))I&s*3t~~+>>sB3{CSS.$*!(->!>*+Q^ab002
 $L0ww~~/0KK)*%66<MN'??II.//
	 "&!3!3!L!L ( "M "
 NN73;{#:;	 <!'$	
    c                     | j                   j                  |D cg c]	  }||d   f c}      }t        ||      D ]
  \  }}||d<    |S c c}w )Ncontentreranker_score)r(   predictzip)r9   queryresultschunkscoresscores         r<   rerankerzHybridSearch.rerankerA   s`    $$,,U\-]EueI6F.G-]^0 	*LE5$)E"#	* 	 .^s   A
returnc                     t         j                  j                  | j                        r5t	        | j                  d      5 }t        j                  |      cddd       S i S # 1 sw Y   i S xY w)z8
        Load the UUID storage from a JSON file
        rN)r$   r   r+   r   openjsonloadr9   files     r<   r)   zHybridSearch._load_uuid_storageH   sV     77>>$001d,,c2 'dyy' '	'	s    A!!A+c                     t        | j                  d      5 }t        j                  | j                  |d       ddd       y# 1 sw Y   yxY w)z6
        Save the UUID storage to a JSON file
        w   )indentN)rM   r   rN   dumpr*   rP   s     r<   _save_uuid_storagezHybridSearch._save_uuid_storageQ   s?     $((#. 	6$IIdnnd15	6 	6 	6s   #AApdf_namec                 `    | j                   j                         D ]  \  }}|d   |k(  s|c S  y)z8
        Retrieve the UUID for a given PDF name
        rX   N)r*   items)r9   rX   uuiddetailss       r<   get_pdf_uuidzHybridSearch.get_pdf_uuidX   s;     "^^113 	MD'z"h.	 r>   c                      j                   j                  d        j                   j                  d        j                   j                  d        j                   j                  d        j                   j                  d       	  j                   j                  d fdd	       y
# t        j                  $ r Y y
w xY w)z2
        Set up DuckDB tables and indexes
        aG  
            CREATE TABLE IF NOT EXISTS documents (
                chunk_id VARCHAR PRIMARY KEY,
                pdf_uuid VARCHAR,
                content TEXT,
                page_number INTEGER,
                chunk_number INTEGER,
                created_at TIMESTAMP,
                metadata JSON
            )
        z
            CREATE TABLE IF NOT EXISTS terms (
                term VARCHAR,
                chunk_id VARCHAR,
                tf INTEGER,
                UNIQUE(term, chunk_id)
            )
        z
            CREATE TABLE IF NOT EXISTS doc_stats (
                collection_name VARCHAR PRIMARY KEY,
                total_chunks INTEGER,
                avg_chunk_length DOUBLE
            )
        z>CREATE INDEX IF NOT EXISTS idx_pdf_uuid ON documents(pdf_uuid)z2CREATE INDEX IF NOT EXISTS idx_term ON terms(term)
bm25_scorec                 .    j                  | ||||      S )N)_bm25_score)tfdfdoc_lenavg_doc_len
total_docsr9   s        r<   <lambda>z,HybridSearch._setup_duckdb.<locals>.<lambda>   s    $$RWk:N r>   DOUBLE)return_typeN)r5   executecreate_functionr3   CatalogException)r9   s   `r<   r6   zHybridSearch._setup_duckdba   s    
 			 
 
	 			  	 			  	 			Z[		NO
	II%%O$	 &  && 		s   
!B, ,CCrb   rc   rd   re   rf   k1bc                     t        j                  ||z
  dz   |dz   z  dz         }||dz   z  ||d|z
  ||z  |z  z   z  z   z  }	t        ||	z        S )z-Calculate BM25 score for a term in a document      ?   )nplogfloat)
r9   rb   rc   rd   re   rf   rm   rn   idftf_adjusteds
             r<   ra   zHybridSearch._bm25_score   se    ffj2o+S9A=>R!V}bAEAK+<U4U.V)VWS;&''r>   Npdf_pathpdf_uuidmetadatac                    |t        t        j                               }|i }t        j                  j                  |      j                  d      d   }|||d| j                  |<   | j                          t        |      }|j                         }| j                  j                  d|g       | j                  j                  d|g       | j                  j                  d|i      d	   }|r| j                  j                  |
       g }t!               5 }	g }
t#        |      D ]4  \  }}|
j%                  |	j'                  | j(                  |||||             6 t+        |
      D ]  }|j-                           	 ddd       t/        j0                  |      }| j                  j                  d       | j                  j                  d| j2                  g       |S # 1 sw Y   bxY w)zG
        Process and upsert a PDF document into both databases
        N.r   )rX   rw   ry   (DELETE FROM documents WHERE pdf_uuid = ?WDELETE FROM terms WHERE chunk_id IN (SELECT chunk_id FROM documents WHERE pdf_uuid = ?)rx   whereidsr   z-INSERT INTO documents SELECT * FROM chunks_dfz
            INSERT OR REPLACE INTO doc_stats 
            SELECT 
                ? as collection_name,
                COUNT(*) as total_chunks,
                AVG(LENGTH(content)) as avg_chunk_length
            FROM documents
        )r&   r[   uuid4r$   r   basenamesplitr*   rW   r   rO   r5   rj   r2   getdeleter	   	enumerateappendsubmitprocess_pager
   resultpd	DataFramer   )r9   rw   rx   ry   rX   loaderpagesexisting_ids
all_chunksexecutorfuturespage_numpagefuture	chunks_dfs                  r<   
upsert_pdfzHybridSearch.upsert_pdf   s    4::<(HH 77##H-33C8; !  $
x 
 	! X& 			6J	
 			eJ	

 --11X8N1OPUV""))l); 
! 		 XG #,E"2 s$xt/@/@$RZ\dfpqrs 'w/   		  LL,			IJ 			  ""#	% 3		  		 s   A%GG(c                    | j                   j                  |j                        }t        |      D ]  \  }}| d| d| }	t	        j
                         }
||||
j                         d|}| j                  j                  |	g|g|g       |j                  |	|||||
|d       |j                         j                         }t        j                  |      j                         }g }|j                         D ]  \  }}|j                  ||	|d        |st        j                   |      }| j"                  j%                  d        y)	z|
        Process a single page and add chunks to databases.
        This method will run in parallel for each page.
        z-pz-c)rx   page_numberchunk_number
created_at)r   	documents	metadatas)chunk_idrx   r@   r   r   r   ry   )termr   rb   z'INSERT INTO terms SELECT * FROM term_dfN)r8   
split_textpage_contentr   r   now	isoformatr2   addr   lowerr   r   Seriesvalue_countsrZ   r   r5   rj   )r9   r   r   rx   ry   r   chunks	chunk_numrF   r   r   chunk_metadataterms	term_freq	term_datar   freqterm_dfs                     r<   r   zHybridSearch.process_page   sh   
 ##..t/@/@A )& 1 -	MIu"2hZr)=H!J %' )(224	
 N ""&&J ')* '  $$ ' )(*  KKM'')E		%(557II'oo/ 
d    ("  ,,y1		!!"KL[-	Mr>   c                 ^   | j                   j                  d|g       | j                   j                  d|g       	 | j                  j                  d|i      d   }|r| j                  j	                  |       t        d	| d
       y# t
        $ r}t        d|        Y d}~,d}~ww xY w)z
        Delete all data associated with a given PDF UUID from DuckDB and ChromaDB.
        
        Args:
            pdf_uuid (str): The unique identifier of the PDF to delete.
        r|   r}   rx   r~   r   r   z$Error while deleting from ChromaDB: Nz"All data associated with PDF UUID z has been deleted.)r5   rj   r2   r   r   	Exceptionprint)r9   rx   r   es       r<   delete_collection_by_uuidz&HybridSearch.delete_collection_by_uuid  s     			6J	
 			eJ	
	>1155!8, 6 L &&--,-? 	28*<NOP  	>8<==	>s   ?B 	B,B''B,rD   top_kfinal_nc                    | j                   j                  |gd|i|      }|j                         j                         }dj	                  dj                  |D cg c]  }d c}            }	| j                  j                  |	|| j                  g|z   |||gz         j                         }
|d   d   }|rHt        |      }t        |      }t        |d   d   |      D ci c]  \  }}|d	||z
  ||z
  z  z
   }}}ni }|
j                         D ci c]  \  }}|d
   |d    }}}|rtt        |j                               }t        |j                               }||kD  r-|j                         D ci c]  \  }}|||z
  ||z
  z   }}}n|D ci c]  }|d }}t!        |j#                               t!        |j#                               z  }g }|D ]  }|j%                  |d      }|j%                  |d      }||z  d	|z
  |z  z   }| j                  j                  d|g      j                         j&                  d   }|j)                  ||d   t+        |d         t+        |d         |d   |||d        |r| j-                  ||      }t/        |d d      d| S c c}w c c}}w c c}}w c c}}w c c}w )z=
        Perform hybrid search within a specific PDF
        rx   )query_textsr   	n_resultsa#  
        WITH pdf_stats AS (
            -- Calculate statistics specific to the PDF
            SELECT COUNT(*) AS total_chunks, AVG(LENGTH(content)) AS avg_chunk_length
            FROM documents
            WHERE pdf_uuid = ?  -- Filter by the specific PDF
        ),
        doc_lengths AS (
            -- Get the length of each chunk within the specific PDF
            SELECT d.chunk_id, LENGTH(d.content) AS doc_len
            FROM documents d
            WHERE d.pdf_uuid = ?  -- Filter by the specific PDF
        ),
        term_stats AS (
            -- Calculate term statistics specific to the PDF
            SELECT 
                t.term,
                t.chunk_id,
                t.tf,
                COUNT(*) OVER (PARTITION BY t.term) AS df  -- Document frequency within this PDF
            FROM terms t
            JOIN documents d ON t.chunk_id = d.chunk_id
            WHERE t.term IN ({}) AND d.pdf_uuid = ?  -- Filter terms by the specific PDF
        ),
        scores AS (
            -- Calculate BM25 score for each chunk in the specific PDF
            SELECT 
                t.chunk_id,
                SUM(bm25_score(
                    t.tf, 
                    t.df, 
                    dl.doc_len, 
                    ps.avg_chunk_length,   -- Use PDF-level average chunk length
                    ps.total_chunks        -- Use PDF-level total chunks
                )) AS bm25_score
            FROM term_stats t
            JOIN doc_lengths dl ON t.chunk_id = dl.chunk_id
            CROSS JOIN pdf_stats ps  -- Join with PDF-specific statistics
            GROUP BY t.chunk_id
        )
        SELECT 
            d.chunk_id,
            d.content,
            d.page_number,
            d.chunk_number,
            d.metadata,
            COALESCE(s.bm25_score, 0) AS bm25_score  -- Default to 0 if no BM25 score
        FROM documents d
        LEFT JOIN scores s ON d.chunk_id = s.chunk_id
        WHERE d.pdf_uuid = ?  -- Filter by the specific PDF
        ORDER BY bm25_score DESC  -- Sort by BM25 score
        LIMIT ?
        ,?	distancesr   r   rq   r   r_   rp   z*SELECT * FROM documents WHERE chunk_id = ?r@   r   r   ry   )r   r@   r   r   ry   combined_scorevector_scorer_   c                     | d   S )NrA    )xs    r<   rg   z%HybridSearch.search.<locals>.<lambda>  s    a8H6I r>   T)keyreverseN)r2   rD   r   r   formatjoinr5   rj   r   fetchdfminmaxrC   iterrowsvaluesrZ   setkeysr   ilocr   intrI   sorted)r9   rD   rx   r   r   r   vector_resultsquery_terms_
bm25_querybm25_resultsvector_distancesmin_distancemax_distancer   distancevector_scoresrowbm25_scoresmin_bm25_scoremax_bm25_scorerH   r   combined_resultsr   r_   r   chunk_detailss                               r<   searchzHybridSearch.search3  sA   
 //55x( 6 
 kkm))+4h F388+6QS678i 	l yy((t++,{:hRW=XX
 ') 	 *+6q9/0L/0L +.nU.CA.FHX*Y&Hh !x,6<,;VWWWM 
 M
 '//1
3 
OS..
 
  !3!3!56N !3!3!56N. ,7+<+<+>'% u~5.>:YZZ  >IIx}II ++-.[5E5E5G1HH
" 	H(,,Xq9L$15J #\1QY*4LLN !II--<
 giQ M
 ##$(3"=#?@ #M.$A B)*5"0 ,(	% 		0 #}}U4DE&,ISWXYaZabbG 7
 Js   	J$"J)J/6J5
J;)z	search.dbzpdf_uuids.jsoni     rp   )g      ?g      ?)NN)      rp   )__name__
__module____qualname__r&   r   r   r   rt   r=   rI   r)   rW   r   r]   r6   ra   r   r   r   r   r   r>   r<   r   r      sd   *
 *
4: *
C *
^a *
  EH *
  il *
  DG *
  V[ *
XDdO 6S Xc] 3j(c (s (S (u (Z] (ch (sx (  EJ (B3 B(3- BRZ[_R` Blo BH4MlQ# Q$ Q:EcC Ec3 Ecs Ec# Eccghlcm Ecr>   r   )r-   chromadb.utilsr   r3   pandasr   numpyrr   typingr   r   r   sentence_transformersr   langchain.text_splitterr   concurrent.futuresr	   r
   langchain.document_loadersr   r[   r$   r   dotenvr   rN   r   r   r>   r<   <module>r      sH     .    ' ' . B ? 2  	    dc dcr>   