o
    fg                     @   s&  d dl Z d dlZd dlmZ d dlZd dlZd dlZd dlZd dlZ	d dl
Zd dlZd dlmZ d dlmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm'Z' d dl(m)Z) e)  dZ*e+dZ,G dd dZ-dS )    N)embedding_functions)BeautifulSoup)ListDictOptional)OpenAI)CrossEncoder)Document)PyPDFLoader
TextLoader)RecursiveCharacterTextSplitter)UnstructuredMarkdownLoader)ThreadPoolExecutoras_completed)encode_imageparse_scorm_content)datetime)load_dotenvzgpt-4o-2024-08-06OPENAI_MODELc                   @   sZ  e Zd ZdHdededed	ed
edededefddZdd Zdeeef fddZ	dd Z
dedee fddZdd ZdIdeded ed!ed"ed#ed$edefd%d&Zefd'd(Zefd)d*Zd+d, Zd-d. ZdJd0eded1ed2ee d3ee d4ee defd5d6Zd7d8 Zd9d: Zd;d< Zd=d> Zd2edd/fd?d@ZdKdCed2edDedEedee f
dFdGZd/S )LHybridSearch	search.dbdocument_uuids.json           ?collection_name	user_codefolder_nameduckdb_pathuuid_storage_path
chunk_sizechunk_overlapalphac	                 C   s*  t dd t| d t| }	t dd t| d t| }
t | _|| _|| _|| _|	d | | _|
| _	t
ddd| _|  | _|	d }t j|sTt | tj|d| _tjt d	t d
d}| jj||d| _t|	d | | _t j|	|| _|   d| _t||t dd| _!dS )z5
        Initialize the hybrid search system
        DB_PATH/STORAGE_PATHz$cross-encoder/ms-marco-MiniLM-L-6-v2i   )
max_lengthz
/chroma_db)pathOPENAI_API_KEYEMBEDDING_MODEL)api_key
model_name)nameembedding_functionffffff?F)r    r!   length_functionis_separator_regexN)"osgetenvstrr   openai_clientr   r    r!   r   document_upload_pathr   reranker_model_load_uuid_storagedocument_uuidsr'   existsmakedirschromadbPersistentClientchroma_clientr   OpenAIEmbeddingFunctionget_or_create_collectionchroma_collectionduckdbconnectconnjoin
duckdbpath_setup_duckdbSCORE_THRESHOLDr   lentext_splitter)selfr   r   r   r   r   r    r!   r"   r#   UPLOAD_PATHpersist_directory	openai_ef rN   8/var/www/html/answerous_api/dependencies/HybridSearch.py__init__"   s@   ""

zHybridSearch.__init__c                    s:   | j  fdd|D }t||D ]\}}||d< q|S )Nc                    s   g | ]} |d  fqS )contentrN   ).0chunkqueryrN   rO   
<listcomp>U   s    z)HybridSearch.reranker.<locals>.<listcomp>reranker_score)r6   predictzip)rJ   rU   resultsscoresscorerS   rN   rT   rO   rerankerS   s   
zHybridSearch.rerankerreturnc                 C   sH   t j| jr"t| jd}t|W  d   S 1 sw   Y  i S )z8
        Load the UUID storage from a JSON file
        rN)r1   r'   r9   r   openjsonloadrJ   filerN   rN   rO   r7   [   s
    zHybridSearch._load_uuid_storagec                 C   sD   t | jd}tj| j|dd W d   dS 1 sw   Y  dS )z6
        Save the UUID storage to a JSON file
        w   )indentN)r`   r   ra   dumpr8   rc   rN   rN   rO   _save_uuid_storaged   s   "zHybridSearch._save_uuid_storagedocument_namec                 C   s,   | j  D ]\}}|d |kr|  S qdS )z=
        Retrieve the UUID for a given document name
        sanitized_nameN)r8   items)rJ   rj   uuiddetailsrN   rN   rO   get_document_uuidk   s
   zHybridSearch.get_document_uuidc              	      s   t  jC}|d |d |d |d |d z|jd fddd	d
 W n
 t jy8   Y n	w W d   dS W d   dS 1 sLw   Y  dS )z2
        Set up DuckDB tables and indexes
        z
                CREATE TABLE IF NOT EXISTS documents (
                    chunk_id VARCHAR PRIMARY KEY,
                    document_uuid VARCHAR,
                    content TEXT,
                    metadata JSON
                )
            z
                CREATE TABLE IF NOT EXISTS terms (
                    term VARCHAR,
                    chunk_id VARCHAR,
                    tf INTEGER,
                    UNIQUE(term, chunk_id)
                )
            z
                CREATE TABLE IF NOT EXISTS doc_stats (
                    collection_name VARCHAR PRIMARY KEY,
                    total_chunks INTEGER,
                    avg_chunk_length DOUBLE
                )
            zHCREATE INDEX IF NOT EXISTS idx_document_uuid ON documents(document_uuid)z2CREATE INDEX IF NOT EXISTS idx_term ON terms(term)
bm25_scorec                    s     | ||||S )N)_bm25_score)tfdfdoc_lenavg_doc_len
total_docsrJ   rN   rO   <lambda>   s   z,HybridSearch._setup_duckdb.<locals>.<lambda>DOUBLE)return_typeN)rA   rB   rE   executecreate_functionCatalogException)rJ   rC   rN   rw   rO   rF   t   s&   





	


-"zHybridSearch._setup_duckdb      ?      ?rr   rs   rt   ru   rv   k1bc           
      C   sR   t || d |d  d }||d  ||d| || |     }	t||	 S )z-Calculate BM25 score for a term in a documentr      )nplogfloat)
rJ   rr   rs   rt   ru   rv   r   r   idftf_adjustedrN   rN   rO   rq      s   (zHybridSearch._bm25_scorec                 C   sf   t |}d| d| d}| jjjj|dd|ddd| d	d
dgdgddid}|jd jjS )Nz+
        You are given a image from a pdf: z
        Along with the image you are provided with text from the pdf that is present around it.
        Your task is to generate a detailed description of that image using the image and the context provided.
        
        <context>
        a6  
        </context>
        
        Just write the description in a natural flow, do not begin like "this image...".
        Give a detailed description, without leaving any information.
        The response must be 2-3 passage long.
        Do not assume anything, or add anything extra on your own.
        usertexttyper   	image_urlzdata:image/jpeg;base64,low)urldetail)r   r   rolerQ   r   modelmessagesresponse_formatr   )r   r4   chatcompletionscreatechoicesmessagerQ   )rJ   pdf_name
image_pathcontextr   base64_imagepromptresponserN   rN   rO   get_image_description   s.   
z"HybridSearch.get_image_descriptionc                 C   sP   d| d| d| d}| j jjj|dd|dgdgd	did
}|jd jjS )Nz-
        You are given a table from the pdf: a   in html format.
        Along with the tab;e you are provided with text from the pdf that is present around it.
        Your task is to generate a detailed description of that table using the image and the context provided.
        
        <context>
        z,
        </context>

        <html>
        a  
        </html>
        
        Just write the description in a natural flow, do not begin like "this table...".
        Give a detailed description, without leaving any information.
        The response must be 2-3 passage long.
        You can decide the length of description based on the relevance of the context provided.
        Do not assume anything, or add anything extra on your own.
        r   r   r   r   r   r   r   )r4   r   r   r   r   r   rQ   )rJ   r   htmlr   r   r   r   rN   rN   rO   get_table_description   s&   

z"HybridSearch.get_table_descriptionc                    s   g } fdd}t  8}g }	|D ]\}
}t|D ]\}}|	|||
|||||| qqt|	D ]	}||  q2W d    |S 1 sGw   Y  |S )Nc                    s   g }d}t td| | t| | d |d d D ]}	t|	|v r)||t|	 7 }qd|}t||d }
|d|  d| d }|
| t	
 ||| |d	d
S )Nr   r   

imager$   _z.jpeg)pager'   page_contentmetadata)rangemaxminr3   rD   fitzPixmapextract_imagesaver	   r   )page_numkxrefdocimg_path
chunk_dict	xrefs_allr   bufferr   pixr'   rj   rJ   rN   rO   process_image_chunk   s   .

z7HybridSearch.handle_images.<locals>.process_image_chunk)r   	enumerateappendsubmitr   result)rJ   rj   r   r   r   r   image_chunksr   executorfuturesr   xrefsr   r   futurerN   r   rO   handle_images   s    


zHybridSearch.handle_imagesc                    s   g }t j|dd}tdd | D } fdd}t .}g }	|D ]}
|
j}|	||||
||| q"t|	D ]	}||	  q9W d    |S 1 sNw   Y  |S )Nall)pagesc                 S   s   g | ]}t |qS rN   )int)rR   prN   rN   rO   rV     s    z.HybridSearch.handle_tables.<locals>.<listcomp>c           	         s|   g }d}t td| | t| | d |D ]}t||v r%||t| 7 }qd|}|j }t |||| |ddS )Nr   r   r   )r   r   r   )	r   r   r   r3   rD   rs   to_htmlr	   r   )	r   tablerj   r   	last_pager   r   r   r   rw   rN   rO   process_table_chunk!  s   &

z7HybridSearch.handle_tables.<locals>.process_table_chunk)
camelotread_pdfr   keysr   r   r   r   r   r   )rJ   document_pathrj   r   table_chunkstablesr   r   r   r   r   r   r   rN   rw   rO   handle_tables  s$   
zHybridSearch.handle_tablesNr   rk   document_uuidr   
upload_dirc                 C   sN  |du r
t t }|du ri }||||d| j|< |   g }g }g }	|dr| j d| d}
| j d| d}tj|
dd tj|dd t	|}|
 }i }|D ]$}d	|jd
< d|jd< |t |jd g  |t |jd  |j qU| |||}n}|dr| ||}g }| D ]\}}|d }|dd}t||d}|| qt |}t|D ]	\}}||jd< qn>|drt|}|
 }t |}t|D ]	\}}||jd< qnt|}|
 }t |}t|D ]	\}}||jd< qt| j}|d|g |d|g | jjd|idd }|r)| jj|d g }|	r9| |	|| t d |rG| !||| t d t" ,}g }t|D ]\}}||#| j$||| qQt%|D ]}|&  qgW d   n	1 szw   Y  t'(|}t d |d |d|g W d   |S 1 sw   Y  |S )zL
        Process and upsert a document document into both databases
        N)rj   rk   r   r   z.pdfr$   z/imagesz/tablesT)exist_okr   r   r'   r   z.ziprQ   scorm)sourcer   r   z.txt-DELETE FROM documents WHERE document_uuid = ?\DELETE FROM terms WHERE chunk_id IN (SELECT chunk_id FROM documents WHERE document_uuid = ?)r   whereidsr   zImages processedzTables processedzText Chunks processedz-INSERT INTO documents SELECT * FROM chunks_dfa  
                INSERT OR REPLACE INTO doc_stats 
                SELECT 
                    ? as collection_name,
                    COUNT(*) as total_chunks,
                    AVG(LENGTH(content)) as avg_chunk_length
                FROM documents
            ))r3   rm   uuid4r8   ri   endswithr5   r1   r:   r
   rb   r   
setdefaultr   r   r   extract_scorm_contentrl   r	   r   split_documentsr   r   r   rA   rB   rE   r{   r@   getdeleteprocess_image_chunksprintprocess_table_chunksr   r   process_chunkr   r   pd	DataFrame)rJ   r   rj   rk   r   r   r   chunksr   r   r   
table_pathloaderr   rS   	documents	file_nametext_contentr   irC   existing_ids
all_chunksr   r   	chunk_numr   	chunks_dfrN   rN   rO   upsert_document;  s   










33zHybridSearch.upsert_documentc              	   C   s  | j |j}t|D ]u\}}| d|jd  d| d}||jd |dd}| jj|g|g|gd |||||d | 	 }	t
|	 }
g }|
 D ]\}}||||d	 qP|rt
|}t| j}|d
 W d   n1 s{w   Y  qdS )|
        Process a single page and add chunks to databases.
        This method will run in parallel for each page.
        -pr   -cz-tr   )r   page_numberchunk_numberr   r   r   	metadataschunk_idr   rQ   r   termr  rr   'INSERT INTO terms SELECT * FROM term_dfN)rI   
split_textr   r   r   r@   addr   lowersplitr   Seriesvalue_countsrl   r   rA   rB   rE   r{   )rJ   parent_chunkr   r   r   r   rS   r  chunk_metadataterms	term_freq	term_datar  freqterm_dfrC   rN   rN   rO   r     sF   	

zHybridSearch.process_chunkc              	   C     t |D ]|\}}| d|jd  d| d}||jd d|jd |d}| jj|g|jg|gd ||||j|d	 |j  }t	|
 }	g }
|	 D ]\}}|
|||d
 qP|
rt|
}t| j}|d W d   n1 s{w   Y  qdS )r   r   r   r   -ir   r'   )r   r   r   r'   r   r   r  r  r  Nr   r   r@   r  r   r   r	  r
  r   r  r  rl   r   rA   rB   rE   r{   )rJ   r   r   r   r   rS   r  r  r  r  r  r  r  r  rC   rN   rN   rO   r     F   	

z!HybridSearch.process_image_chunksc              	   C   r  )r   r   r   r   r  r   r   )r   r   r   r   r   r   r  r  r  Nr  )rJ   r   r   r   r   rS   r  r  r  r  r  r  r  r  rC   rN   rN   rO   r   %  r  z!HybridSearch.process_table_chunksc              
   C   s  t j|t jt j|d }zt|d}|| W d    n1 s(w   Y  W n tyG } zt	d|  W Y d }~nd }~ww t
|}i }|D ]0}t|try|dd}	|dd}
t|
d}|jd	d
d}d|||d||	< qPt	d|  qP|S )Nr   r_   z'Error while extracting SCORM document: idunknownr    zhtml.parser T)	separatorstripr   )r   r'   rQ   r   zSkipping invalid chunk: )r1   r'   rD   splitextbasenamezipfileZipFile
extractall	Exceptionr   r   
isinstancedictr   r   get_text)rJ   r   r   extract_pathzip_refer   r   rS   r   html_contentsoupr   rN   rN   rO   r   V  s2   "

z"HybridSearch.extract_scorm_contentc              
   C   s   t | j}|d|g |d|g W d   n1 sw   Y  z| jjd|idd }|r9| jj|d W n tyS } ztd|  W Y d}~nd}~ww td	| d
 dS )z
        Delete all data associated with a given document UUID from DuckDB and ChromaDB.
        
        Args:
            document_uuid (str): The unique identifier of the document to delete.
        r   r   Nr   r   r   r   z$Error while deleting from ChromaDB: z'All data associated with document UUID z has been deleted.)	rA   rB   rE   r{   r@   r   r   r#  r   )rJ   r   rC   r   r)  rN   rN   rO   delete_collection_by_uuidw  s0   z&HybridSearch.delete_collection_by_uuid      rU   top_kfinal_nc                    s  |rj j|gdd|ii|d}n	j j|g|d}|  }dddd |D }j||jg| |||g 	 }	|d	 d
 }
i }|
rrt
|
}t|
}t|d d
 |
D ]\}}|jkrpd|| ||   ||< q[ni }d}dd |	 D }|rt
| t|   kr fdd| D }ndd |D }fdd| D }t| t| B }|sdS g }tj[}|D ]P}|r||d
}||d
}|| d| |  }|d|g	 }|jrddd}n|jd
 }|||d r	|d nd|d r|d nd|||d qW d   n	1 s'w   Y  |s1dS ||}t|dd ddd| S )zB
        Perform hybrid search within a specific document
        r   z$eq)query_textsr   	n_results)r1  r2  aF  
        WITH document_stats AS (
            -- Calculate statistics specific to the document
            SELECT COUNT(*) AS total_chunks, AVG(LENGTH(content)) AS avg_chunk_length
            FROM documents
            WHERE document_uuid = ?  -- Filter by the specific document
        ),
        doc_lengths AS (
            -- Get the length of each chunk within the specific document
            SELECT d.chunk_id, LENGTH(d.content) AS doc_len
            FROM documents d
            WHERE d.document_uuid = ?  -- Filter by the specific document
        ),
        term_stats AS (
            -- Calculate term statistics specific to the document
            SELECT 
                t.term,
                t.chunk_id,
                t.tf,
                COUNT(*) OVER (PARTITION BY t.term) AS df  -- Document frequency within this document
            FROM terms t
            JOIN documents d ON t.chunk_id = d.chunk_id
            WHERE t.term IN ({}) AND d.document_uuid = ?  -- Filter terms by the specific document
        ),
        scores AS (
            -- Calculate BM25 score for each chunk in the specific document
            SELECT 
                t.chunk_id,
                SUM(bm25_score(
                    t.tf, 
                    t.df, 
                    dl.doc_len, 
                    ps.avg_chunk_length,   -- Use document-level average chunk length
                    ps.total_chunks        -- Use document-level total chunks
                )) AS bm25_score
            FROM term_stats t
            JOIN doc_lengths dl ON t.chunk_id = dl.chunk_id
            CROSS JOIN document_stats ps  -- Join with document-specific statistics
            GROUP BY t.chunk_id
        )
        SELECT 
            d.chunk_id,
            d.content,
            d.metadata,
            COALESCE(s.bm25_score, 0) AS bm25_score  -- Default to 0 if no BM25 score
        FROM documents d
        LEFT JOIN scores s ON d.chunk_id = s.chunk_id
        WHERE d.document_uuid = ?  -- Filter by the specific document
        ORDER BY bm25_score DESC  -- Sort by BM25 score
        LIMIT ?
        ,c                 S   s   g | ]}d qS )?rN   )rR   r   rN   rN   rO   rV     s    z'HybridSearch.search.<locals>.<listcomp>	distancesr   r   r   r.   c                 S   s   i | ]\}}|d  |d qS )r  rp   rN   )rR   r   rowrN   rN   rO   
<dictcomp>  s    z'HybridSearch.search.<locals>.<dictcomp>c                    s"   i | ]\}}||    qS rN   rN   rR   r  r\   )max_bm25_scoremin_bm25_scorerN   rO   r7    s    c                 S   s   i | ]}|d qS )r   rN   )rR   r  rN   rN   rO   r7    s    c                    s    i | ]\}}| j kr||qS rN   )rG   r8  rw   rN   rO   r7    s    Fz*SELECT * FROM documents WHERE chunk_id = ?r  )rQ   r   rQ   r   )r  rQ   r   combined_scorevector_scorerp   Nc                 S   s   | d S )NrW   rN   )xrN   rN   rO   rx   3  s    z%HybridSearch.search.<locals>.<lambda>T)keyreverse)r@   rU   r	  r
  formatrD   rC   r{   r   fetchdfr   r   rY   rG   iterrowsvaluesrl   setr   rA   rB   rE   r   emptyilocr   r]   sorted)rJ   rU   r   r/  r0  r"   vector_resultsquery_terms
bm25_querybm25_resultsvector_distancesvector_scoresmin_distancemax_distancer  distancerG   bm25_scoresr   combined_resultsrC   r<  rp   r;  chunk_details_dfchunk_detailsrN   )r9  r:  rJ   rO   search  s   
24


zHybridSearch.search)r   r   r   r   r   )r~   r   )NNN)r-  r.  r   )__name__
__module____qualname__r3   r   r   rP   r]   r   r7   ri   r   ro   rF   rq   gpt_4o_minir   r   r   r   r   r   r   r   r   r,  r   rU  rN   rN   rN   rO   r   !   s(    ,1		,3(%"4 211!(r   ).rA   r;   chromadb.utilsr   r1   rm   r   ra   pandasr   numpyr   r   bs4r   typingr   r   r   r   openair   sentence_transformersr   langchain.schema.documentr	   langchain.document_loadersr
   r   langchain.text_splitterr   $langchain_community.document_loadersr   concurrent.futuresr   r   dependencies.helperr   r   r   dotenvr   gpt_4or2   rY  r   rN   rN   rN   rO   <module>   s6    
