From 3ba585ffc023c875a259802922eead37d6a8b5e8 Mon Sep 17 00:00:00 2001 From: Saurab-Shrestha9639*969**9858//852 Date: Mon, 10 Jun 2024 16:37:31 +0545 Subject: [PATCH] Added hybrid search with updated chunk size for semantic embedding --- private_gpt/__main__.py | 3 +- .../components/vector_store/hybrid_fn.py | 151 ++++++++++++++++++ .../vector_store/vector_store_component.py | 9 ++ private_gpt/server/chat/chat_service.py | 31 ++-- private_gpt/server/ingest/ingest_service.py | 49 +++++- scripts/setup | 11 +- settings.yaml | 8 +- 7 files changed, 241 insertions(+), 21 deletions(-) create mode 100644 private_gpt/components/vector_store/hybrid_fn.py diff --git a/private_gpt/__main__.py b/private_gpt/__main__.py index 7a864e6..0ab422b 100644 --- a/private_gpt/__main__.py +++ b/private_gpt/__main__.py @@ -8,7 +8,8 @@ from fastapi_pagination import add_pagination from private_gpt.settings.settings import settings from fastapi.staticfiles import StaticFiles from private_gpt.constants import UPLOAD_DIR - +# import nest_asyncio +# nest_asyncio.apply() # Set log_config=None to do not use the uvicorn logging configuration, and # use ours instead. For reference, see below: # https://github.com/tiangolo/fastapi/discussions/7457#discussioncomment-5141108 diff --git a/private_gpt/components/vector_store/hybrid_fn.py b/private_gpt/components/vector_store/hybrid_fn.py new file mode 100644 index 0000000..2d62768 --- /dev/null +++ b/private_gpt/components/vector_store/hybrid_fn.py @@ -0,0 +1,151 @@ +from llama_index.core.vector_stores import VectorStoreQueryResult + +from typing import Any, List, Tuple +import torch +from transformers import AutoTokenizer, AutoModelForMaskedLM + +doc_tokenizer = AutoTokenizer.from_pretrained( + "naver/efficient-splade-VI-BT-large-doc" +) +doc_model = AutoModelForMaskedLM.from_pretrained( + "naver/efficient-splade-VI-BT-large-doc" +) + +query_tokenizer = AutoTokenizer.from_pretrained( + "naver/efficient-splade-VI-BT-large-query" +) +query_model = AutoModelForMaskedLM.from_pretrained( + "naver/efficient-splade-VI-BT-large-query" +) + +def sparse_doc_vectors( + texts: List[str], +) -> Tuple[List[List[int]], List[List[float]]]: + """ + Computes vectors from logits and attention mask using ReLU, log, and max operations. + """ + tokens = doc_tokenizer( + texts, truncation=True, padding=True, return_tensors="pt" + ) + if torch.cuda.is_available(): + tokens = tokens.to("cuda") + + output = doc_model(**tokens) + logits, attention_mask = output.logits, tokens.attention_mask + relu_log = torch.log(1 + torch.relu(logits)) + weighted_log = relu_log * attention_mask.unsqueeze(-1) + tvecs, _ = torch.max(weighted_log, dim=1) + + # extract the vectors that are non-zero and their indices + indices = [] + vecs = [] + for batch in tvecs: + indices.append(batch.nonzero(as_tuple=True)[0].tolist()) + vecs.append(batch[indices[-1]].tolist()) + + return indices, vecs + + +def sparse_query_vectors( + texts: List[str], +) -> Tuple[List[List[int]], List[List[float]]]: + """ + Computes vectors from logits and attention mask using ReLU, log, and max operations. 
+ """ + # TODO: compute sparse vectors in batches if max length is exceeded + tokens = query_tokenizer( + texts, truncation=True, padding=True, return_tensors="pt" + ) + if torch.cuda.is_available(): + tokens = tokens.to("cuda") + + output = query_model(**tokens) + logits, attention_mask = output.logits, tokens.attention_mask + relu_log = torch.log(1 + torch.relu(logits)) + weighted_log = relu_log * attention_mask.unsqueeze(-1) + tvecs, _ = torch.max(weighted_log, dim=1) + + # extract the vectors that are non-zero and their indices + indices = [] + vecs = [] + for batch in tvecs: + indices.append(batch.nonzero(as_tuple=True)[0].tolist()) + vecs.append(batch[indices[-1]].tolist()) + + return indices, vecs + +def relative_score_fusion( + dense_result: VectorStoreQueryResult, + sparse_result: VectorStoreQueryResult, + alpha: float = 0.5, # passed in from the query engine + top_k: int = 2, # passed in from the query engine i.e. similarity_top_k +) -> VectorStoreQueryResult: + """ + Fuse dense and sparse results using relative score fusion. + """ + # sanity check + assert dense_result.nodes is not None + assert dense_result.similarities is not None + assert sparse_result.nodes is not None + assert sparse_result.similarities is not None + + # deconstruct results + sparse_result_tuples = list( + zip(sparse_result.similarities, sparse_result.nodes) + ) + sparse_result_tuples.sort(key=lambda x: x[0], reverse=True) + + dense_result_tuples = list( + zip(dense_result.similarities, dense_result.nodes) + ) + dense_result_tuples.sort(key=lambda x: x[0], reverse=True) + + # track nodes in both results + all_nodes_dict = {x.node_id: x for x in dense_result.nodes} + for node in sparse_result.nodes: + if node.node_id not in all_nodes_dict: + all_nodes_dict[node.node_id] = node + + # normalize sparse similarities from 0 to 1 + sparse_similarities = [x[0] for x in sparse_result_tuples] + max_sparse_sim = max(sparse_similarities) + min_sparse_sim = min(sparse_similarities) + sparse_similarities = [ + (x - min_sparse_sim) / (max_sparse_sim - min_sparse_sim) + for x in sparse_similarities + ] + sparse_per_node = { + sparse_result_tuples[i][1].node_id: x + for i, x in enumerate(sparse_similarities) + } + + # normalize dense similarities from 0 to 1 + dense_similarities = [x[0] for x in dense_result_tuples] + max_dense_sim = max(dense_similarities) + min_dense_sim = min(dense_similarities) + dense_similarities = [ + (x - min_dense_sim) / (max_dense_sim - min_dense_sim) + for x in dense_similarities + ] + dense_per_node = { + dense_result_tuples[i][1].node_id: x + for i, x in enumerate(dense_similarities) + } + + # fuse the scores + fused_similarities = [] + for node_id in all_nodes_dict: + sparse_sim = sparse_per_node.get(node_id, 0) + dense_sim = dense_per_node.get(node_id, 0) + fused_sim = alpha * (sparse_sim + dense_sim) + fused_similarities.append((fused_sim, all_nodes_dict[node_id])) + + fused_similarities.sort(key=lambda x: x[0], reverse=True) + fused_similarities = fused_similarities[:top_k] + + # create final response object + return VectorStoreQueryResult( + nodes=[x[1] for x in fused_similarities], + similarities=[x[0] for x in fused_similarities], + ids=[x[1].node_id for x in fused_similarities], + ) \ No newline at end of file diff --git a/private_gpt/components/vector_store/vector_store_component.py b/private_gpt/components/vector_store/vector_store_component.py index 316e4c6..b4e08ce 100644 --- a/private_gpt/components/vector_store/vector_store_component.py +++ 
b/private_gpt/components/vector_store/vector_store_component.py @@ -13,6 +13,7 @@ from llama_index.core.vector_stores.types import ( from private_gpt.open_ai.extensions.context_filter import ContextFilter from private_gpt.paths import local_data_path from private_gpt.settings.settings import Settings +from .hybrid_fn import sparse_query_vectors, sparse_doc_vectors, relative_score_fusion logger = logging.getLogger(__name__) @@ -119,6 +120,11 @@ class VectorStoreComponent: QdrantVectorStore( client=client, collection_name="make_this_parameterizable_per_api_call", + enable_hybrid=True, + batch_size=20, + sparse_doc_fn=sparse_doc_vectors, + sparse_query_fn=sparse_query_vectors, + # hybrid_fusion_fn=relative_score_fusion, ), # TODO ) case _: @@ -144,6 +150,9 @@ class VectorStoreComponent: if self.settings.vectorstore.database != "qdrant" else None ), + sparse_top_k=12, + vector_store_query_mode="hybrid", + alpha=0.5 ) def close(self) -> None: diff --git a/private_gpt/server/chat/chat_service.py b/private_gpt/server/chat/chat_service.py index c3d70a1..9a13cda 100644 --- a/private_gpt/server/chat/chat_service.py +++ b/private_gpt/server/chat/chat_service.py @@ -16,6 +16,9 @@ from llama_index.core.storage import StorageContext from llama_index.core.types import TokenGen from pydantic import BaseModel +from llama_index.core import get_response_synthesizer +from llama_index.core.query_engine import RetrieverQueryEngine + from private_gpt.components.embedding.embedding_component import EmbeddingComponent from private_gpt.components.llm.llm_component import LLMComponent from private_gpt.components.node_store.node_store_component import NodeStoreComponent @@ -26,6 +29,7 @@ from private_gpt.open_ai.extensions.context_filter import ContextFilter from private_gpt.server.chunks.chunks_service import Chunk from private_gpt.settings.settings import Settings +from private_gpt.paths import models_path class Completion(BaseModel): response: str @@ -36,7 +40,7 @@ class CompletionGen(BaseModel): response: TokenGen sources: list[Chunk] | None = None - +reranker_path = models_path / 'reranker' @dataclass class ChatEngineInput: system_message: ChatMessage | None = None @@ -126,9 +130,16 @@ class ChatService: ) node_postprocessors.append(rerank_postprocessor) - return CondensePlusContextChatEngine.from_defaults( - system_prompt=system_prompt, + response_synthesizer = get_response_synthesizer(structured_answer_filtering=True, llm=self.llm_component.llm) + + custom_query_engine = RetrieverQueryEngine( retriever=vector_index_retriever, + response_synthesizer=response_synthesizer + ) + + return ContextChatEngine.from_defaults( + system_prompt=system_prompt, + retriever=custom_query_engine, llm=self.llm_component.llm, # Takes no effect at the moment node_postprocessors=node_postprocessors, ) @@ -189,16 +200,15 @@ class ChatService: system_prompt = ( """ You are a helpful assistant named QuickGPT by Quickfox Consulting. - Your responses must be strictly and exclusively based on the context documents provided. - You are not allowed to use any information, knowledge, or external sources outside of the given context documents. - If the answer to a query is not present in the context documents, - you should respond with "I do not have enough information in the provided context to answer this question." + Engage in a two-way conversation, ensuring that your responses are strictly and exclusively based on the relevant context documents provided. - Your responses should be relevant, informative, and easy to understand. 
+ Do not use any prior knowledge or external sources or make assumptions, inferences, or draw upon any prior knowledge beyond what is explicitly stated in the relevant context documents. + If the answer to a query is not present in the relevant context documents, respond with "I do not have enough information in the provided context to answer this question." + + Your responses must be relevant, informative, and easy to understand. Aim to deliver high-quality answers that are respectful and helpful, using clear and concise language. - Focus on providing accurate and reliable answers based solely on the given context. - Do not make assumptions, inferences, or draw upon any prior knowledge beyond what is explicitly stated in the context documents. + Consider previous queries only if the latest query is directly related to them. Address only the most recent query unless it explicitly builds upon a previous one. """ ) chat_history = ( @@ -209,7 +219,6 @@ class ChatService: use_context=use_context, context_filter=context_filter, ) - # chat_engine = chat_engine.as_chat_engine(chat_mode="react", llm=self.llm_component.llm, verbose=True) # configuring ReAct Chat engine wrapped_response = chat_engine.chat( message=last_message if last_message is not None else "", chat_history=chat_history, diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py index fcb2606..5156266 100644 --- a/private_gpt/server/ingest/ingest_service.py +++ b/private_gpt/server/ingest/ingest_service.py @@ -1,11 +1,12 @@ import logging import tempfile from pathlib import Path -from typing import TYPE_CHECKING, AnyStr, BinaryIO +from typing import TYPE_CHECKING, AnyStr, BinaryIO, Sequence, Any, List from injector import inject, singleton -from llama_index.core.node_parser import SentenceWindowNodeParser, SemanticSplitterNodeParser +from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter from llama_index.core.storage import StorageContext +from llama_index.core.schema import BaseNode , ObjectType , TextNode from private_gpt.components.embedding.embedding_component import EmbeddingComponent from private_gpt.components.ingest.ingest_component import get_ingestion_component @@ -17,12 +18,44 @@ from private_gpt.components.vector_store.vector_store_component import ( from private_gpt.server.ingest.model import IngestedDoc from private_gpt.settings.settings import settings + +from llama_index.core.extractors import ( + QuestionsAnsweredExtractor, + TitleExtractor, +) if TYPE_CHECKING: from llama_index.core.storage.docstore.types import RefDocInfo logger = logging.getLogger(__name__) +DEFAULT_CHUNK_SIZE = 512 +SENTENCE_CHUNK_OVERLAP = 20 + +class SafeSemanticSplitter(SemanticSplitterNodeParser): + + safety_chunker: SentenceSplitter = SentenceSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=SENTENCE_CHUNK_OVERLAP) + + def _parse_nodes( + self, + nodes, + show_progress: bool = False, + **kwargs + ) -> List[BaseNode]: + all_nodes: List[BaseNode] = super()._parse_nodes(nodes=nodes, show_progress=show_progress, **kwargs) + all_good = True + for node in all_nodes: + if node.get_type() == ObjectType.TEXT: + node: TextNode= node + if self.safety_chunker._token_size(node.text) > self.safety_chunker.chunk_size: + logging.info("Chunk size too big after semantic chunking: switching to static chunking") + all_good = False + break + if not all_good: + all_nodes = self.safety_chunker._parse_nodes(nodes, show_progress=show_progress, **kwargs) + return all_nodes + + 
@singleton class IngestService: @inject @@ -39,14 +72,22 @@ class IngestService: docstore=node_store_component.doc_store, index_store=node_store_component.index_store, ) - node_parser = SemanticSplitterNodeParser.from_defaults( + # splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128) + node_parser = SafeSemanticSplitter.from_defaults( embed_model=embedding_component.embedding_model, + # sentence_splitter=splitter, + include_metadata=True, + include_prev_next_rel=True, ) self.ingest_component = get_ingestion_component( self.storage_context, embed_model=embedding_component.embedding_model, - transformations=[node_parser, embedding_component.embedding_model], + transformations=[ + node_parser, + TitleExtractor(nodes=1, llm=self.llm_service.llm), + QuestionsAnsweredExtractor(questions=1,llm=self.llm_service.llm), + embedding_component.embedding_model], settings=settings(), ) diff --git a/scripts/setup b/scripts/setup index 1f2454a..72e670f 100755 --- a/scripts/setup +++ b/scripts/setup @@ -4,6 +4,7 @@ import argparse from huggingface_hub import hf_hub_download, snapshot_download from transformers import AutoTokenizer +from sentence_transformers import SentenceTransformer from private_gpt.paths import models_path, models_cache_path from private_gpt.settings.settings import settings @@ -46,4 +47,12 @@ AutoTokenizer.from_pretrained( ) print("Tokenizer downloaded!") -print("Setup done") +# Download Reranker +# print(f"Downloading reranker {settings().rag.rerank.model}") + +# reranker_path = r'D:/QuickGPT/privateGPT/models/reranker' +# rerank_postprocessor = SentenceTransformer( +# settings().rag.rerank.model +# ) +# rerank_postprocessor.save(reranker_path) +print("Setup done") \ No newline at end of file diff --git a/settings.yaml b/settings.yaml index d3780fc..0d0efb5 100644 --- a/settings.yaml +++ b/settings.yaml @@ -51,14 +51,14 @@ rag: #This value is disabled by default. If you enable this settings, the RAG will only use articles that meet a certain percentage score. rerank: enabled: true - model: mixedbread-ai/mxbai-embed-large-v1 + model: avsolatorio/GIST-Embedding-v0 top_n: 2 llamacpp: # llm_hf_repo_id: bartowski/Meta-Llama-3-8B-Instruct-GGUF # llm_hf_model_file: Meta-Llama-3-8B-Instruct-Q6_K.gguf llm_hf_repo_id: qwp4w3hyb/Hermes-2-Pro-Llama-3-8B-iMat-GGUF - llm_hf_model_file: hermes-2-pro-llama-3-8b-imat-Q6_K.gguf + llm_hf_model_file: hermes-2-pro-llama-3-8b-imat-Q4_K_S.gguf tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) @@ -68,11 +68,11 @@ llamacpp: embedding: # Should be matching the value above in most cases mode: huggingface - ingest_mode: parallel + ingest_mode: pipeline embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5 huggingface: - embedding_hf_model_name: mixedbread-ai/mxbai-embed-large-v1 + embedding_hf_model_name: BAAI/bge-large-en access_token: ${HUGGINGFACE_TOKEN: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} vectorstore:
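
The two SPLADE encoders defined in hybrid_fn.py each return one (token-indices, weights) pair per input string. A minimal sanity check, assuming the two naver/efficient-splade models above can be downloaded and torch/transformers are installed:

# Quick check of the sparse encoders added in hybrid_fn.py (illustrative only).
from private_gpt.components.vector_store.hybrid_fn import sparse_doc_vectors, sparse_query_vectors

doc_indices, doc_values = sparse_doc_vectors(["PrivateGPT indexes local documents."])
qry_indices, qry_values = sparse_query_vectors(["what does privategpt index?"])

# One sparse vector per input text: vocabulary ids with non-zero SPLADE weight, plus those weights.
print(len(doc_indices[0]), len(doc_values[0]))
print(len(qry_indices[0]), len(qry_values[0]))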
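
relative_score_fusion min-max normalises each result's scores to [0, 1] and then scores every node as alpha * (sparse + dense), so with the default alpha = 0.5 a node found by both retrievers gets the mean of its two normalised scores. A small sketch with made-up nodes and scores to show the arithmetic:

# Made-up nodes and similarities, purely to illustrate the fusion in hybrid_fn.py.
from llama_index.core.schema import TextNode
from llama_index.core.vector_stores import VectorStoreQueryResult
from private_gpt.components.vector_store.hybrid_fn import relative_score_fusion

a = TextNode(id_="a", text="dense-only hit")
b = TextNode(id_="b", text="hit returned by both retrievers")
c = TextNode(id_="c", text="sparse-only hit")

dense = VectorStoreQueryResult(nodes=[a, b], similarities=[0.90, 0.30], ids=["a", "b"])
sparse = VectorStoreQueryResult(nodes=[b, c], similarities=[14.0, 5.0], ids=["b", "c"])

fused = relative_score_fusion(dense, sparse, alpha=0.5, top_k=2)
print(list(zip(fused.ids, fused.similarities)))  # the two best nodes after fusion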
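
Outside PrivateGPT's dependency injection, the Qdrant hybrid wiring used in vector_store_component.py looks roughly like the sketch below. The collection name, in-memory client, sample document and MockEmbedding are placeholders for the components PrivateGPT actually injects; sparse_top_k, vector_store_query_mode and alpha mirror the values set in get_retriever.

# Standalone sketch (not PrivateGPT code) of hybrid dense + sparse retrieval with Qdrant.
import qdrant_client
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.core.embeddings import MockEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from private_gpt.components.vector_store.hybrid_fn import sparse_doc_vectors, sparse_query_vectors

Settings.embed_model = MockEmbedding(embed_dim=384)  # stand-in for the real dense embedder
client = qdrant_client.QdrantClient(location=":memory:")  # placeholder for the configured client

vector_store = QdrantVectorStore(
    client=client,
    collection_name="hybrid_demo",
    enable_hybrid=True,
    batch_size=20,
    sparse_doc_fn=sparse_doc_vectors,
    sparse_query_fn=sparse_query_vectors,
)
index = VectorStoreIndex.from_documents(
    [Document(text="Hybrid search combines dense and sparse retrieval.")],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
)
retriever = index.as_retriever(
    similarity_top_k=2,                # dense candidates kept
    sparse_top_k=12,                   # sparse candidates kept before fusion
    vector_store_query_mode="hybrid",
    alpha=0.5,                         # 0.0 = sparse only, 1.0 = dense only
)
print(retriever.retrieve("what does hybrid search combine?"))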
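
SafeSemanticSplitter keeps the semantically produced chunks only if every one of them fits the 512-token budget; if any chunk is too large, the whole batch is re-split with the static SentenceSplitter. The guard it relies on, shown in isolation (it uses the same private _token_size helper as the patch):

# The size guard behind SafeSemanticSplitter's fallback, in isolation.
from llama_index.core.node_parser import SentenceSplitter

safety_chunker = SentenceSplitter(chunk_size=512, chunk_overlap=20)

def exceeds_budget(chunk_text: str) -> bool:
    # Same check as in SafeSemanticSplitter._parse_nodes: token count vs. chunk_size.
    return safety_chunker._token_size(chunk_text) > safety_chunker.chunk_size

print(exceeds_budget("a short chunk"))   # False: semantic chunks are kept
print(exceeds_budget("word " * 2000))    # True: would trigger the static re-split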
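
The updated ingestion transformations run in a fixed order: chunking, the two LLM-based metadata extractors, then the dense embedding model. A rough sketch of that order with stand-in components (MockLLM, MockEmbedding and the plain SentenceSplitter are placeholders for the LLM, embedder and SafeSemanticSplitter that IngestService wires in):

# Illustrative only: mirrors the transformation order configured in IngestService.
from llama_index.core import Document
from llama_index.core.embeddings import MockEmbedding
from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.llms import MockLLM
from llama_index.core.node_parser import SentenceSplitter

llm = MockLLM()                              # placeholder for PrivateGPT's LLM component
embed_model = MockEmbedding(embed_dim=384)   # placeholder for the HuggingFace embedder

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),  # stand-in for SafeSemanticSplitter
        TitleExtractor(nodes=1, llm=llm),                    # adds an LLM-generated title to node metadata
        QuestionsAnsweredExtractor(questions=1, llm=llm),    # adds "questions this chunk answers" metadata
        embed_model,                                         # dense embeddings are computed last
    ]
)
nodes = pipeline.run(documents=[Document(text="Hybrid search combines dense and sparse retrieval.")])
print(nodes[0].metadata)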