import duckdb
import chromadb
from chromadb.utils import embedding_functions

import os
import uuid
import fitz
import json
import pandas as pd
import numpy as np
import zipfile
from bs4 import BeautifulSoup
from typing import List, Dict, Optional

import camelot
from openai import OpenAI
from sentence_transformers import CrossEncoder
from langchain.schema.document import Document
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from concurrent.futures import ThreadPoolExecutor, as_completed
from dependencies.helper import encode_image, parse_scorm_content

from datetime import datetime
from dotenv import load_dotenv

# Load environment variables (OPENAI_API_KEY, DB_PATH, STORAGE_PATH, OPENAI_MODEL,
# EMBEDDING_MODEL) from a local .env file.
load_dotenv()

# Model identifiers used for LLM description calls.
gpt_4o='gpt-4o-2024-08-06'  # pinned GPT-4o snapshot; not referenced elsewhere in this file
gpt_4o_mini=os.getenv("OPENAI_MODEL")  # default chat model; None if OPENAI_MODEL is unset

class HybridSearch:
    """Hybrid (vector + BM25) document search over ChromaDB and DuckDB.

    Documents are chunked and indexed twice: embeddings go to a persistent
    ChromaDB collection, and per-term frequencies go to DuckDB tables that
    back a BM25 scorer. ``search`` fuses both scores and reranks the fused
    candidates with a cross-encoder.
    """

    def __init__(self, collection_name: str, user_code: str, folder_name: str, duckdb_path: str = "search.db", uuid_storage_path: str = "document_uuids.json", chunk_size: int = 1000, chunk_overlap: int = 200, alpha: float = 0.5):
        """
        Initialize the hybrid search system.

        Args:
            collection_name: ChromaDB collection to use or create.
            user_code: per-user path component under DB_PATH / STORAGE_PATH.
            folder_name: per-folder path component under DB_PATH / STORAGE_PATH.
            duckdb_path: DuckDB filename created inside the derived DB directory.
            uuid_storage_path: JSON filename for the document-UUID registry.
            chunk_size: character length of text chunks.
            chunk_overlap: character overlap between consecutive chunks.
            alpha: accepted for compatibility; the fusion weight actually used
                is the one passed to ``search`` per call — this value is not stored.
        """
        # Derived per-user/per-folder storage roots. NOTE: a missing
        # DB_PATH/STORAGE_PATH env var makes os.getenv return None and this
        # concatenation raises TypeError.
        DB_PATH = os.getenv("DB_PATH")+"/"+str(user_code)+"/"+str(folder_name)
        UPLOAD_PATH = os.getenv("STORAGE_PATH")+"/"+str(user_code)+"/"+str(folder_name)
        
        self.openai_client = OpenAI()
        self.collection_name = collection_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.uuid_storage_path = DB_PATH+"/"+uuid_storage_path
        self.document_upload_path = UPLOAD_PATH
        # Cross-encoder used by ``reranker`` to re-score fused results.
        self.reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
        # Load UUIDs from local storage
        self.document_uuids = self._load_uuid_storage()

        # Initialize ChromaDB with OpenAI embeddings
        # self.chroma_client = chromadb.Client()
        
        persist_directory = DB_PATH+"/chroma_db"
        if not os.path.exists(persist_directory):
            os.makedirs(persist_directory)
        self.chroma_client = chromadb.PersistentClient(path=persist_directory)
        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"),
            model_name=os.getenv("EMBEDDING_MODEL")
        )
        
        # Get or create ChromaDB collection
        self.chroma_collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=openai_ef
        )
        
        # Initialize DuckDB. ``self.conn`` is the long-lived connection used by
        # ``search``; other methods open short-lived connections on duckdbpath.
        self.conn = duckdb.connect(DB_PATH+'/'+duckdb_path)
        self.duckdbpath = os.path.join(DB_PATH, duckdb_path)
        self._setup_duckdb()
        # Minimum normalized score a chunk must reach in ``search`` filtering.
        self.SCORE_THRESHOLD = 0.7
        
        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False
        )

    def reranker(self, query, results):
        # rerank the results with original query and documents returned from Chroma
        scores = self.reranker_model.predict([(query, chunk['content']) for chunk in results])
        for score, chunk in zip(scores, results):
            chunk['reranker_score']=score
        # get the highest scoring document
        return results
        
    def _load_uuid_storage(self) -> Dict[str, Dict]:
        """
        Load the UUID storage from a JSON file
        """
        if os.path.exists(self.uuid_storage_path):
            with open(self.uuid_storage_path, 'r') as file:
                return json.load(file)
        return {}

    def _save_uuid_storage(self):
        """
        Save the UUID storage to a JSON file
        """
        with open(self.uuid_storage_path, 'w') as file:
            json.dump(self.document_uuids, file, indent=4)
    
    def get_document_uuid(self, document_name: str) -> Optional[str]:
        """
        Retrieve the UUID for a given document name
        """
        for uuid, details in self.document_uuids.items():
            if details["sanitized_name"] == document_name:
                return uuid
        return None

    def _setup_duckdb(self):
        """
        Set up DuckDB tables and indexes.

        Creates the ``documents``, ``terms`` and ``doc_stats`` tables (if
        missing) plus supporting indexes, and registers the ``bm25_score`` SQL
        function backed by :meth:`_bm25_score`. Runs on a fresh connection to
        ``self.duckdbpath`` rather than the long-lived ``self.conn``.
        """
        with duckdb.connect(self.duckdbpath) as conn:
            # Chunk store: one row per indexed chunk (text/image/table).
            conn.execute("""
                CREATE TABLE IF NOT EXISTS documents (
                    chunk_id VARCHAR PRIMARY KEY,
                    document_uuid VARCHAR,
                    content TEXT,
                    metadata JSON
                )
            """)

            # Inverted index for BM25: raw term frequency per (term, chunk).
            conn.execute("""
                CREATE TABLE IF NOT EXISTS terms (
                    term VARCHAR,
                    chunk_id VARCHAR,
                    tf INTEGER,
                    UNIQUE(term, chunk_id)
                )
            """)

            # Corpus-level statistics consumed by the BM25 scorer.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS doc_stats (
                    collection_name VARCHAR PRIMARY KEY,
                    total_chunks INTEGER,
                    avg_chunk_length DOUBLE
                )
            """)

            # Create indexes
            conn.execute("CREATE INDEX IF NOT EXISTS idx_document_uuid ON documents(document_uuid)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_term ON terms(term)")

            # Check if 'bm25_score' function already exists
            try:
                # Register BM25 scoring function if not already registered
                conn.create_function(
                    'bm25_score',
                    lambda tf, df, doc_len, avg_doc_len, total_docs: 
                        self._bm25_score(tf, df, doc_len, avg_doc_len, total_docs),
                    return_type="DOUBLE"
                )
            except duckdb.CatalogException:
                # Already registered on this database file; any other
                # registration error still propagates.
                pass
    
    def _bm25_score(self, tf: int, df: int, doc_len: int, avg_doc_len: float, total_docs: int, k1: float = 1.5, b: float = 0.75) -> float:
        """Calculate BM25 score for a term in a document"""
        idf = np.log((total_docs - df + 0.5) / (df + 0.5) + 1)
        tf_adjusted = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
        return float(idf * tf_adjusted)
    
    def get_image_description(self, pdf_name, image_path, context, model=gpt_4o_mini):
        """
        Describe an image extracted from a PDF using a vision-capable chat model.

        Args:
            pdf_name: source PDF name, quoted in the prompt.
            image_path: path to the image file; sent base64-encoded at 'low' detail.
            context: surrounding page text given to the model.
            model: chat model id; defaults to the module-level ``gpt_4o_mini``
                (read from the OPENAI_MODEL env var at import time).

        Returns:
            The model's plain-text description.
        """
        base64_image = encode_image(image_path)
        prompt=f"""
        You are given a image from a pdf: {pdf_name}
        Along with the image you are provided with text from the pdf that is present around it.
        Your task is to generate a detailed description of that image using the image and the context provided.
        
        <context>
        {context}
        </context>
        
        Just write the description in a natural flow, do not begin like "this image...".
        Give a detailed description, without leaving any information.
        The response must be 2-3 passage long.
        Do not assume anything, or add anything extra on your own.
        """
        response = self.openai_client.chat.completions.create(
            model = model,
            messages = [
                {
                    'role': 'user',
                    'content': [
                        {
                            'type': 'text',
                            'text': prompt
                        },
                        {
                            'type': 'image_url',
                            'image_url':{
                                'url':  f"data:image/jpeg;base64,{base64_image}",
                                'detail': 'low'
                            }
                        }
                    ]
                }
            ],
            response_format={"type":"text"},
        )
        return response.choices[0].message.content
    
    def get_table_description(self, pdf_name, html, context, model=gpt_4o_mini):
        """
        Describe a table extracted from a PDF (given as HTML) via a chat model.

        Args:
            pdf_name: source PDF name, quoted in the prompt.
            html: the table rendered as HTML (camelot DataFrame output).
            context: surrounding page text given to the model.
            model: chat model id; defaults to the module-level ``gpt_4o_mini``.

        Returns:
            The model's plain-text description.
        """
        # FIX: the prompt previously contained the typo "tab;e" and told the
        # model to describe the table "using the image", although this method
        # supplies HTML, not an image.
        prompt=f"""
        You are given a table from the pdf: {pdf_name} in html format.
        Along with the table you are provided with text from the pdf that is present around it.
        Your task is to generate a detailed description of that table using the html and the context provided.
        
        <context>
        {context}
        </context>

        <html>
        {html}
        </html>
        
        Just write the description in a natural flow, do not begin like "this table...".
        Give a detailed description, without leaving any information.
        The response must be 2-3 passage long.
        You can decide the length of description based on the relevance of the context provided.
        Do not assume anything, or add anything extra on your own.
        """
        response = self.openai_client.chat.completions.create(
            model = model,
            messages = [
                {
                    'role': 'user',
                    'content': [
                        {
                            'type': 'text',
                            'text': prompt
                        },
                    ]
                }
            ],
            response_format={"type":"text"},
        )
        return response.choices[0].message.content

    def handle_images(self, document_name, xrefs_all, doc, img_path, chunk_dict):
        """
        Extract referenced PDF images, save them as JPEGs, and return one
        LLM-described Document per image (processed concurrently).

        Args:
            document_name: PDF name, quoted in the description prompt.
            xrefs_all: list of (page_num, [image xref, ...]) pairs.
            doc: open fitz (PyMuPDF) document the xrefs belong to.
            img_path: directory the extracted JPEGs are written into.
            chunk_dict: page-number(str) -> list of page texts, used as context.
        """
        image_chunks = []
        # Function to process each image chunk
        def process_image_chunk(page_num, k, xref, doc, img_path, chunk_dict, xrefs_all):
            # Collect context from nearby pages (one page either side).
            context = []
            buffer = 1
            # NOTE(review): the exclusive range bound is ``xrefs_all[-1][0]``,
            # the index of the LAST page, so the final page's own text never
            # contributes context — confirm whether ``xrefs_all[-1][0] + 1``
            # was intended.
            for page in range(max(0, page_num - buffer), min(page_num + buffer + 1, xrefs_all[-1][0])):
                if str(page) in chunk_dict:
                    context += chunk_dict[str(page)]
            context = "\n\n".join(context)

            # Extract image and save
            pix = fitz.Pixmap(doc.extract_image(xref)['image'])
            path = img_path + f"/{page_num}_{k}.jpeg"
            pix.save(path)

            # Create the Document object
            return Document(
                page_content=self.get_image_description(document_name, path, context),
                metadata={"page": page_num, "path": path}
            )
        # Describe images concurrently; each task issues its own OpenAI call.
        with ThreadPoolExecutor() as executor:
            futures = []
            for page_num, xrefs in xrefs_all:
                for k, xref in enumerate(xrefs):
                    futures.append(executor.submit(process_image_chunk, page_num, k, xref, doc, img_path, chunk_dict, xrefs_all))
            
            # Collect the results as they complete
            for future in as_completed(futures):
                image_chunks.append(future.result())
        
        return image_chunks

    def handle_tables(self, document_path, document_name, chunk_dict):
        """
        Detect tables with camelot and return one LLM-described Document per
        table (the table HTML is preserved in metadata), processed concurrently.

        Args:
            document_path: path of the PDF to scan.
            document_name: PDF name, quoted in the description prompt.
            chunk_dict: page-number(str) -> list of page texts, used as context.
        """
        table_chunks = []
        tables = camelot.read_pdf(document_path, pages='all')
        last_page = max([int(p) for p in chunk_dict.keys()])

        def process_table_chunk(page_num, table, document_name, chunk_dict, last_page):
            # buffer = 0: context is only the table's own page.
            context = []
            buffer = 0
            # NOTE(review): camelot page numbers are 1-based while chunk_dict
            # keys come from PyPDFLoader 'page' metadata (0-based here) —
            # confirm the context lookup targets the intended page.
            for page in range(max(0, page_num - buffer), min(page_num + buffer + 1, last_page)):
                if str(page) in chunk_dict:
                    context += chunk_dict[str(page)]
            context = "\n\n".join(context)
            html = table.df.to_html()
            return Document(
                page_content=self.get_table_description(document_name, html, context),
                metadata={"page": page_num, "html": html}
            )

        # Describe tables concurrently; each task issues its own OpenAI call.
        with ThreadPoolExecutor() as executor:
            futures = []
            for table in tables:
                page_num = table.page
                futures.append(
                    executor.submit(process_table_chunk, page_num, table, document_name, chunk_dict, last_page)
                )
            
            for future in as_completed(futures):
                table_chunks.append(future.result())
        
        return table_chunks

    def upsert_document(self, document_path: str, document_name:str, sanitized_name: str, document_uuid: Optional[str] = None, metadata: Optional[Dict] = None, upload_dir: Optional[str] = None) -> str:
        """
        Process a document (PDF / SCORM .zip / .txt / markdown) and upsert its
        chunks into both DuckDB (BM25 side) and ChromaDB (vector side).

        Args:
            document_path: path to the source file; the extension selects the loader.
            document_name: display name, used in storage paths and LLM prompts.
            sanitized_name: name stored in the UUID registry for lookups.
            document_uuid: existing UUID to re-upsert under; generated if None.
            metadata: extra metadata persisted in the UUID registry.
            upload_dir: extraction directory for SCORM packages.

        Returns:
            The document's UUID.
        """
        if document_uuid is None:
            document_uuid = str(uuid.uuid4())
        
        if metadata is None:
            metadata = {}
        
        # Record the document in the local UUID registry before indexing.
        self.document_uuids[document_uuid] = {
            "document_name": document_name,
            "sanitized_name": sanitized_name,
            "document_path": document_path,
            "metadata": metadata
        }
        self._save_uuid_storage()

        chunks = []
        table_chunks = []
        image_chunks = []
        # Load and pre-chunk the document according to its file type.
        if document_path.endswith('.pdf'):
            img_path = f"{self.document_upload_path}/{document_name}/images"
            table_path = f"{self.document_upload_path}/{document_name}/tables"
            os.makedirs(img_path, exist_ok=True)
            os.makedirs(table_path, exist_ok=True)

            loader = PyPDFLoader(document_path)
            chunks = loader.load()
            # chunk_dict maps page number (as str) -> list of page texts; used
            # as surrounding context when describing tables (and images).
            chunk_dict = {}
            for chunk in chunks:
                chunk.metadata['type'] = "text"
                chunk.metadata['path'] = None
                chunk_dict.setdefault(str(chunk.metadata['page']), [])
                chunk_dict[str(chunk.metadata['page'])].append(chunk.page_content)

            # Image extraction is currently disabled (image_chunks stays empty);
            # tables are detected and described via the LLM.
            table_chunks = self.handle_tables(document_path, document_name, chunk_dict)
        elif document_path.endswith('.zip'):
            chunk_dict = self.extract_scorm_content(document_path, upload_dir)

            documents = []
            for file_name, chunk in chunk_dict.items():
                text_content = chunk["content"]  # Extracted text content
                # FIX: use a fresh name — the original rebound the ``metadata``
                # parameter (already persisted to the registry above).
                scorm_metadata = {"source": file_name, "type": "scorm"}
                documents.append(Document(page_content=text_content, metadata=scorm_metadata))

            # Split the SCORM documents into chunks and assign pseudo page numbers.
            chunks = RecursiveCharacterTextSplitter().split_documents(documents)
            for i, chunk in enumerate(chunks):
                chunk.metadata['page'] = i

        elif document_path.endswith('.txt'):
            loader = TextLoader(document_path)
            chunks = loader.load()
            chunks = RecursiveCharacterTextSplitter().split_documents(chunks)
            for i, chunk in enumerate(chunks):
                chunk.metadata['page'] = i

        else:
            # Fallback: treat anything else as markdown.
            loader = UnstructuredMarkdownLoader(document_path)
            chunks = loader.load()
            chunks = RecursiveCharacterTextSplitter().split_documents(chunks)
            for i, chunk in enumerate(chunks):
                chunk.metadata['page'] = i
        
        with duckdb.connect(self.duckdbpath) as conn:
            # Remove any previously indexed chunks for this document.
            # FIX: ``terms`` must be deleted FIRST — its subquery resolves
            # chunk_ids from ``documents``, so deleting ``documents`` first (as
            # the original code did) left the subquery empty and orphaned every
            # term row of a re-upserted document.
            conn.execute(
                "DELETE FROM terms WHERE chunk_id IN (SELECT chunk_id FROM documents WHERE document_uuid = ?)",
                [document_uuid]
            )
            conn.execute(
                "DELETE FROM documents WHERE document_uuid = ?",
                [document_uuid]
            )
            
            existing_ids = self.chroma_collection.get(where={"document_uuid": document_uuid})["ids"]
            if existing_ids:
                self.chroma_collection.delete(ids=existing_ids)

            # Rows destined for the DuckDB ``documents`` table are accumulated
            # here and bulk-inserted below.
            all_chunks = []
            
            if image_chunks:
                self.process_image_chunks(image_chunks, document_uuid, all_chunks)
                print("Images processed")

            if table_chunks:
                self.process_table_chunks(table_chunks, document_uuid, all_chunks)
                print("Tables processed")

            # Text chunks are split/embedded concurrently; each worker appends
            # its rows to ``all_chunks`` (list.append is atomic in CPython).
            with ThreadPoolExecutor() as executor:
                futures = [
                    executor.submit(self.process_chunk, chunk, document_uuid, all_chunks)
                    for chunk in chunks
                ]
                for future in as_completed(futures):
                    future.result()  # re-raise any worker exception

            # Batch insert chunk rows; DuckDB resolves the local DataFrame
            # ``chunks_df`` by name (replacement scan).
            chunks_df = pd.DataFrame(all_chunks)
            print("Text Chunks processed")
            conn.execute("INSERT INTO documents SELECT * FROM chunks_df")
            
            # Refresh corpus statistics used by the BM25 scorer.
            # NOTE(review): the document UUID is stored in the ``collection_name``
            # column and the stats span ALL documents — confirm this is intended.
            conn.execute("""
                INSERT OR REPLACE INTO doc_stats 
                SELECT 
                    ? as collection_name,
                    COUNT(*) as total_chunks,
                    AVG(LENGTH(content)) as avg_chunk_length
                FROM documents
            """, [document_uuid])
        
        return document_uuid

    def process_chunk(self, parent_chunk, document_uuid, all_chunks):
        """
        Split one loaded document page/chunk into sub-chunks and index each.

        For every sub-chunk: adds the text to ChromaDB, appends a row to the
        shared ``all_chunks`` list (bulk-inserted into DuckDB ``documents`` by
        the caller), and inserts per-term frequencies into ``terms`` for BM25.
        Invoked concurrently from ``upsert_document``'s thread pool;
        ``list.append`` on the shared list is atomic in CPython.
        """
        chunks = self.text_splitter.split_text(parent_chunk.page_content)
        for chunk_num, chunk in enumerate(chunks):
            # Deterministic id: <doc uuid>-p<page>-c<chunk index>-t (t = text).
            chunk_id = f"{document_uuid}-p{parent_chunk.metadata['page']}-c{chunk_num}-t"

            # Prepare chunk metadata
            chunk_metadata = {
                "document_uuid": document_uuid,
                "page_number": parent_chunk.metadata['page'],
                "chunk_number": chunk_num,
                "type": "text"
            }
            
            # Store in ChromaDB
            self.chroma_collection.add(
                ids=[chunk_id],
                documents=[chunk],
                metadatas=[chunk_metadata]
            )
            
            # Queue the row for the caller's batch INSERT into DuckDB.
            all_chunks.append({
                "chunk_id": chunk_id,
                "document_uuid": document_uuid,
                "content": chunk,
                "metadata": chunk_metadata,
                # **chunk_metadata
            })
            
            # Naive whitespace tokenization for BM25 term frequencies.
            terms = chunk.lower().split()
            term_freq = pd.Series(terms).value_counts()
            
            term_data = []
            for term, freq in term_freq.items():
                term_data.append({
                    "term": term,
                    "chunk_id": chunk_id,
                    "tf": freq
                })
            
            if term_data:
                # Short-lived per-insert connection (safe across worker
                # threads); DuckDB resolves ``term_df`` from the enclosing
                # Python scope (replacement scan).
                term_df = pd.DataFrame(term_data)
                with duckdb.connect(self.duckdbpath) as conn:
                    conn.execute("INSERT INTO terms SELECT * FROM term_df")
    def process_image_chunks(self, image_chunks, document_uuid, all_chunks):
        """
        Index LLM-described image chunks into ChromaDB and queue DuckDB rows.

        Mirrors ``process_chunk`` but stores the image path in the metadata and
        uses the '-i' id suffix. Called sequentially from ``upsert_document``.
        """
        for chunk_num, chunk in enumerate(image_chunks):
            # Deterministic id: <doc uuid>-p<page>-c<index>-i (i = image).
            chunk_id = f"{document_uuid}-p{chunk.metadata['page']}-c{chunk_num}-i"
            
            # Prepare chunk metadata
            chunk_metadata = {
                "document_uuid": document_uuid,
                "page_number": chunk.metadata['page'],
                "type": "image",
                "path": chunk.metadata['path'],
                "chunk_number": chunk_num
            }
            
            # Store in ChromaDB
            self.chroma_collection.add(
                ids=[chunk_id],
                documents=[chunk.page_content],
                metadatas=[chunk_metadata]
            )
            
            # Queue the row for the caller's batch INSERT into DuckDB.
            all_chunks.append({
                "chunk_id": chunk_id,
                "document_uuid": document_uuid,
                "content": chunk.page_content,
                "metadata": chunk_metadata
            })
            
            # Naive whitespace tokenization for BM25 term frequencies.
            terms = chunk.page_content.lower().split()
            term_freq = pd.Series(terms).value_counts()
            
            term_data = []
            for term, freq in term_freq.items():
                term_data.append({
                    "term": term,
                    "chunk_id": chunk_id,
                    "tf": freq
                })
            
            if term_data:
                # DuckDB resolves ``term_df`` from the enclosing Python scope.
                term_df = pd.DataFrame(term_data)
                with duckdb.connect(self.duckdbpath) as conn:
                    conn.execute("INSERT INTO terms SELECT * FROM term_df")

    def process_table_chunks(self, table_chunks, document_uuid, all_chunks):
        """
        Index LLM-described table chunks into ChromaDB and queue DuckDB rows.

        Mirrors ``process_chunk`` but stores the table HTML in the metadata.
        Called sequentially from ``upsert_document``.
        """
        for chunk_num, chunk in enumerate(table_chunks):
            # FIX: the suffix was '-i', identical to the image-chunk id pattern,
            # so a table and an image with the same page and index collided on
            # the ``documents`` PRIMARY KEY (and the ChromaDB id). '-tb' keeps
            # table ids unique.
            chunk_id = f"{document_uuid}-p{chunk.metadata['page']}-c{chunk_num}-tb"
            
            # Prepare chunk metadata
            chunk_metadata = {
                "document_uuid": document_uuid,
                "page_number": chunk.metadata['page'],
                "type": "table",
                "html": chunk.metadata['html'],
                "chunk_number": chunk_num
            }
            
            # Store in ChromaDB
            self.chroma_collection.add(
                ids=[chunk_id],
                documents=[chunk.page_content],
                metadatas=[chunk_metadata]
            )
            
            # Queue the row for the caller's batch INSERT into DuckDB.
            all_chunks.append({
                "chunk_id": chunk_id,
                "document_uuid": document_uuid,
                "content": chunk.page_content,
                "metadata": chunk_metadata
            })
            
            # Naive whitespace tokenization for BM25 term frequencies.
            terms = chunk.page_content.lower().split()
            term_freq = pd.Series(terms).value_counts()
            
            term_data = []
            for term, freq in term_freq.items():
                term_data.append({
                    "term": term,
                    "chunk_id": chunk_id,
                    "tf": freq
                })
            
            if term_data:
                # DuckDB resolves ``term_df`` from the enclosing Python scope.
                term_df = pd.DataFrame(term_data)
                with duckdb.connect(self.duckdbpath) as conn:
                    conn.execute("INSERT INTO terms SELECT * FROM term_df")

    def extract_scorm_content(self, document_path, upload_dir):
        """Unzip a SCORM package and return its pages as plain-text chunks.

        Returns a dict keyed by each parsed chunk's ``id`` (``"unknown"`` when
        missing), where every value carries the extraction path and the HTML
        stripped down to text.
        """
        package_name = os.path.splitext(os.path.basename(document_path))[0]
        extract_path = os.path.join(upload_dir, package_name)

        # Best-effort extraction: a bad archive is logged and parsing of
        # whatever exists at extract_path is still attempted.
        try:
            with zipfile.ZipFile(document_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
        except Exception as e:
            print(f"Error while extracting SCORM document: {e}")

        chunk_dict = {}
        for chunk in parse_scorm_content(extract_path):
            if not isinstance(chunk, dict):  # Ensure chunk is a dict
                print(f"Skipping invalid chunk: {chunk}")  # Debugging
                continue
            file_name = chunk.get("id", "unknown")
            soup = BeautifulSoup(chunk.get("text", ""), "html.parser")
            text_content = soup.get_text(separator=" ", strip=True)
            chunk_dict[file_name] = {
                "type": "scorm",
                "path": extract_path,
                "content": text_content,
                "page_content": text_content
            }

        return chunk_dict

    def delete_collection_by_uuid(self, document_uuid: str) -> None:
        """
        Delete all data associated with a given document UUID from DuckDB and ChromaDB.

        Args:
            document_uuid (str): The unique identifier of the document to delete.
        """
        # Delete from DuckDB.
        # FIX: ``terms`` must be deleted FIRST — its subquery resolves chunk_ids
        # from ``documents``, so the original order (documents first) made the
        # subquery empty and left every term row orphaned.
        with duckdb.connect(self.duckdbpath) as conn:
            conn.execute(
                "DELETE FROM terms WHERE chunk_id IN (SELECT chunk_id FROM documents WHERE document_uuid = ?)",
                [document_uuid]
            )
            conn.execute(
                "DELETE FROM documents WHERE document_uuid = ?",
                [document_uuid]
            )

        # Delete from ChromaDB; best-effort — errors are logged, not raised.
        try:
            existing_ids = self.chroma_collection.get(
                where={"document_uuid": document_uuid}
            )["ids"]
            if existing_ids:
                self.chroma_collection.delete(ids=existing_ids)
        except Exception as e:
            print(f"Error while deleting from ChromaDB: {e}")

        print(f"All data associated with document UUID {document_uuid} has been deleted.")

    def search(self, query: str, document_uuid: str, top_k: int = 15, final_n: int = 5, alpha = 0.5) -> List[Dict]:
        """
        Perform hybrid (vector + BM25) search, optionally scoped to one document.

        Args:
            query: natural-language query string.
            document_uuid: restrict results to this document. A falsy value
                removes the filter on the vector side only — the BM25 SQL still
                filters on it, so pass a real UUID for meaningful BM25 results.
            top_k: candidates fetched from each ranker before fusion.
            final_n: number of results returned after reranking.
            alpha: fusion weight: alpha * vector_score + (1 - alpha) * bm25_score.

        Returns:
            The top ``final_n`` result dicts sorted by cross-encoder reranker
            score, or ``False`` when nothing matched (callers rely on the
            falsy return).
        """
        # --- Vector retrieval (ChromaDB) ---
        if document_uuid:
            vector_results = self.chroma_collection.query(
                query_texts=[query],
                where={"document_uuid": {"$eq": document_uuid}},
                n_results=top_k
            )
        else:
            vector_results = self.chroma_collection.query(
                query_texts=[query],
                n_results=top_k
            )

        # --- BM25 retrieval (DuckDB) ---
        query_terms = query.lower().split()

        bm25_query = """
        WITH document_stats AS (
            -- Calculate statistics specific to the document
            SELECT COUNT(*) AS total_chunks, AVG(LENGTH(content)) AS avg_chunk_length
            FROM documents
            WHERE document_uuid = ?  -- Filter by the specific document
        ),
        doc_lengths AS (
            -- Get the length of each chunk within the specific document
            SELECT d.chunk_id, LENGTH(d.content) AS doc_len
            FROM documents d
            WHERE d.document_uuid = ?  -- Filter by the specific document
        ),
        term_stats AS (
            -- Calculate term statistics specific to the document
            SELECT 
                t.term,
                t.chunk_id,
                t.tf,
                COUNT(*) OVER (PARTITION BY t.term) AS df  -- Document frequency within this document
            FROM terms t
            JOIN documents d ON t.chunk_id = d.chunk_id
            WHERE t.term IN ({}) AND d.document_uuid = ?  -- Filter terms by the specific document
        ),
        scores AS (
            -- Calculate BM25 score for each chunk in the specific document
            SELECT 
                t.chunk_id,
                SUM(bm25_score(
                    t.tf, 
                    t.df, 
                    dl.doc_len, 
                    ps.avg_chunk_length,   -- Use document-level average chunk length
                    ps.total_chunks        -- Use document-level total chunks
                )) AS bm25_score
            FROM term_stats t
            JOIN doc_lengths dl ON t.chunk_id = dl.chunk_id
            CROSS JOIN document_stats ps  -- Join with document-specific statistics
            GROUP BY t.chunk_id
        )
        SELECT 
            d.chunk_id,
            d.content,
            d.metadata,
            COALESCE(s.bm25_score, 0) AS bm25_score  -- Default to 0 if no BM25 score
        FROM documents d
        LEFT JOIN scores s ON d.chunk_id = s.chunk_id
        WHERE d.document_uuid = ?  -- Filter by the specific document
        ORDER BY bm25_score DESC  -- Sort by BM25 score
        LIMIT ?
        """.format(','.join(['?' for _ in query_terms]))  # Format for terms in the query

        # Positional parameters in placeholder order: document_stats,
        # doc_lengths, IN(<terms>), term_stats filter, outer WHERE, LIMIT.
        # FIX: the doc_lengths placeholder previously received
        # ``self.collection_name`` instead of the document UUID, so the
        # doc-length CTE matched nothing and every BM25 score collapsed to 0.
        bm25_results = self.conn.execute(
            bm25_query,
            [document_uuid, document_uuid] + query_terms + [document_uuid, document_uuid, top_k]
        ).fetchdf()

        # --- Normalize vector scores (distance -> [0, 1] similarity) ---
        vector_distances = vector_results['distances'][0]
        vector_scores = {}
        if vector_distances:
            min_distance = min(vector_distances)
            max_distance = max(vector_distances)
            # NOTE(review): this keeps chunks whose raw DISTANCE is >= the
            # threshold, i.e. the LESS similar candidates. Behavior preserved
            # as-is — confirm whether ``<=`` (or thresholding the similarity)
            # was intended.
            for chunk_id, distance in zip(vector_results['ids'][0], vector_distances):
                if (distance) >= self.SCORE_THRESHOLD:
                    vector_scores[chunk_id] = 1 - (distance - min_distance) / (max_distance - min_distance)

        # --- Normalize BM25 scores to [0, 1] (min-max) ---
        bm25_scores = {
            row['chunk_id']: row['bm25_score']
            for _, row in bm25_results.iterrows()
        }
        if bm25_scores:
            min_bm25_score = min(bm25_scores.values())
            max_bm25_score = max(bm25_scores.values())

            if max_bm25_score > min_bm25_score:  # Avoid division by zero
                bm25_scores = {
                    chunk_id: (score - min_bm25_score) / (max_bm25_score - min_bm25_score)
                    for chunk_id, score in bm25_scores.items()
                }
            else:
                # All scores equal: assign a neutral normalized score.
                bm25_scores = {chunk_id: 0.5 for chunk_id in bm25_scores}
            # Drop chunks below the score threshold.
            bm25_scores = {
                chunk_id: score for chunk_id, score in bm25_scores.items() if score >= self.SCORE_THRESHOLD
            }

        # --- Fuse the two candidate sets ---
        all_chunks = set(vector_scores.keys()) | set(bm25_scores.keys())
        if not all_chunks:
            return False

        combined_results = []
        with duckdb.connect(self.duckdbpath) as conn:
            for chunk_id in all_chunks:
                if chunk_id:
                    vector_score = vector_scores.get(chunk_id, 0)
                    bm25_score = bm25_scores.get(chunk_id, 0)

                    # Linear score fusion.
                    combined_score = alpha * vector_score + (1 - alpha) * bm25_score

                    # Fetch chunk content/metadata from DuckDB.
                    chunk_details_df = conn.execute(
                        "SELECT * FROM documents WHERE chunk_id = ?",
                        [chunk_id]
                    ).fetchdf()

                    # Tolerate a chunk id with no backing row.
                    if chunk_details_df.empty:
                        chunk_details = {"content": "", "metadata": ""}
                    else:
                        chunk_details = chunk_details_df.iloc[0]

                    combined_results.append({
                        'chunk_id': chunk_id,
                        'content': chunk_details['content'] if chunk_details['content'] else '',
                        # FIX: metadata was previously gated on ['content']
                        # (copy-paste bug), discarding metadata whenever the
                        # content happened to be empty.
                        'metadata': chunk_details['metadata'] if chunk_details['metadata'] else '',
                        'combined_score': combined_score,
                        'vector_score': vector_score,
                        'bm25_score': bm25_score
                    })

        if not combined_results:
            return False

        # Rerank with the cross-encoder and return the best ``final_n``.
        combined_results = self.reranker(query, combined_results)
        return sorted(combined_results, key=lambda x: x['reranker_score'], reverse=True)[:final_n]