Added hybrid search with updated chunk size for semantic embedding

Saurab-Shrestha 2024-06-10 16:37:31 +05:45
parent ebe43082cd
commit 3ba585ffc0
7 changed files with 241 additions and 21 deletions

View file

@@ -8,7 +8,8 @@ from fastapi_pagination import add_pagination
from private_gpt.settings.settings import settings
from fastapi.staticfiles import StaticFiles
from private_gpt.constants import UPLOAD_DIR
# import nest_asyncio
# nest_asyncio.apply()
# Set log_config=None to do not use the uvicorn logging configuration, and
# use ours instead. For reference, see below:
# https://github.com/tiangolo/fastapi/discussions/7457#discussioncomment-5141108

View file

@@ -0,0 +1,151 @@
from llama_index.core.vector_stores import VectorStoreQueryResult
from typing import Any, List, Tuple

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

doc_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)
doc_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)

query_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)
query_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)


def sparse_doc_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes vectors from logits and attention mask using ReLU, log, and max operations.
    """
    tokens = doc_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    if torch.cuda.is_available():
        tokens = tokens.to("cuda")

    output = doc_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the vectors that are non-zero and their indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs
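As a quick sanity check on the document-side SPLADE expansion above, the function can be called directly. A minimal sketch, not part of the commit; it assumes the naver/efficient-splade checkpoints download successfully and that the model and its inputs end up on the same device:

# Hypothetical smoke test for sparse_doc_vectors (editorial example, not in the commit).
if __name__ == "__main__":
    indices, values = sparse_doc_vectors(
        [
            "PrivateGPT ingests documents into Qdrant.",
            "Hybrid search mixes dense and sparse scores.",
        ]
    )
    # One (token-id list, weight list) pair per input text.
    for ids, weights in zip(indices, values):
        print(len(ids), "non-zero vocabulary terms; top weight:", max(weights))
        # Map the ids back to wordpieces to inspect the expansion.
        print(doc_tokenizer.convert_ids_to_tokens(ids)[:10])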
def sparse_query_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes vectors from logits and attention mask using ReLU, log, and max operations.
    """
    # TODO: compute sparse vectors in batches if max length is exceeded
    tokens = query_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    if torch.cuda.is_available():
        tokens = tokens.to("cuda")

    output = query_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the vectors that are non-zero and their indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs
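The TODO above notes that long query lists are currently encoded in a single forward pass. A hypothetical wrapper, not part of the commit, could slice the input and reuse the function unchanged (`batch_size` is an illustrative parameter):

def sparse_query_vectors_batched(
    texts: List[str], batch_size: int = 32
) -> Tuple[List[List[int]], List[List[float]]]:
    """Illustrative only: call sparse_query_vectors on fixed-size slices and concatenate."""
    all_indices: List[List[int]] = []
    all_vecs: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        idx, vec = sparse_query_vectors(texts[start : start + batch_size])
        all_indices.extend(idx)
        all_vecs.extend(vec)
    return all_indices, all_vecs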
def relative_score_fusion(
    dense_result: VectorStoreQueryResult,
    sparse_result: VectorStoreQueryResult,
    alpha: float = 0.5,  # passed in from the query engine
    top_k: int = 2,  # passed in from the query engine i.e. similarity_top_k
) -> VectorStoreQueryResult:
    """
    Fuse dense and sparse results using relative score fusion.
    """
    # sanity check
    assert dense_result.nodes is not None
    assert dense_result.similarities is not None
    assert sparse_result.nodes is not None
    assert sparse_result.similarities is not None

    # deconstruct results
    sparse_result_tuples = list(
        zip(sparse_result.similarities, sparse_result.nodes)
    )
    sparse_result_tuples.sort(key=lambda x: x[0], reverse=True)

    dense_result_tuples = list(
        zip(dense_result.similarities, dense_result.nodes)
    )
    dense_result_tuples.sort(key=lambda x: x[0], reverse=True)

    # track nodes in both results
    all_nodes_dict = {x.node_id: x for x in dense_result.nodes}
    for node in sparse_result.nodes:
        if node.node_id not in all_nodes_dict:
            all_nodes_dict[node.node_id] = node

    # normalize sparse similarities from 0 to 1
    sparse_similarities = [x[0] for x in sparse_result_tuples]
    max_sparse_sim = max(sparse_similarities)
    min_sparse_sim = min(sparse_similarities)
    sparse_similarities = [
        (x - min_sparse_sim) / (max_sparse_sim - min_sparse_sim)
        for x in sparse_similarities
    ]
    sparse_per_node = {
        sparse_result_tuples[i][1].node_id: x
        for i, x in enumerate(sparse_similarities)
    }

    # normalize dense similarities from 0 to 1
    dense_similarities = [x[0] for x in dense_result_tuples]
    max_dense_sim = max(dense_similarities)
    min_dense_sim = min(dense_similarities)
    dense_similarities = [
        (x - min_dense_sim) / (max_dense_sim - min_dense_sim)
        for x in dense_similarities
    ]
    dense_per_node = {
        dense_result_tuples[i][1].node_id: x
        for i, x in enumerate(dense_similarities)
    }

    # fuse the scores
    fused_similarities = []
    for node_id in all_nodes_dict:
        sparse_sim = sparse_per_node.get(node_id, 0)
        dense_sim = dense_per_node.get(node_id, 0)
        fused_sim = alpha * (sparse_sim + dense_sim)
        fused_similarities.append((fused_sim, all_nodes_dict[node_id]))

    fused_similarities.sort(key=lambda x: x[0], reverse=True)
    fused_similarities = fused_similarities[:top_k]

    # create final response object
    return VectorStoreQueryResult(
        nodes=[x[1] for x in fused_similarities],
        similarities=[x[0] for x in fused_similarities],
        ids=[x[1].node_id for x in fused_similarities],
    )
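To see how the fusion behaves, here is a small self-contained check with made-up node IDs and scores. It only exercises relative_score_fusion as defined above (editorial example, not part of the commit; this is the function the commented-out hybrid_fusion_fn hook below would plug into QdrantVectorStore):

from llama_index.core.schema import TextNode

# Two toy result sets that share node "a"; the scores are arbitrary.
dense = VectorStoreQueryResult(
    nodes=[TextNode(text="dense hit", id_="a"), TextNode(text="dense only", id_="b")],
    similarities=[0.9, 0.4],
    ids=["a", "b"],
)
sparse = VectorStoreQueryResult(
    nodes=[TextNode(text="sparse hit", id_="a"), TextNode(text="sparse only", id_="c")],
    similarities=[12.0, 3.0],
    ids=["a", "c"],
)

fused = relative_score_fusion(dense, sparse, alpha=0.5, top_k=2)
# Node "a" ranks first: it receives the maximum normalized score (1.0) from both sides.
print(fused.ids, fused.similarities)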

View file

@@ -13,6 +13,7 @@ from llama_index.core.vector_stores.types import (
from private_gpt.open_ai.extensions.context_filter import ContextFilter
from private_gpt.paths import local_data_path
from private_gpt.settings.settings import Settings
from .hybrid_fn import sparse_query_vectors, sparse_doc_vectors, relative_score_fusion

logger = logging.getLogger(__name__)
@@ -119,6 +120,11 @@ class VectorStoreComponent:
                QdrantVectorStore(
                    client=client,
                    collection_name="make_this_parameterizable_per_api_call",
                    enable_hybrid=True,
                    batch_size=20,
                    sparse_doc_fn=sparse_doc_vectors,
                    sparse_query_fn=sparse_query_vectors,
                    # hybrid_fusion_fn=relative_score_fusion,
                ),  # TODO
            )
        case _:
@@ -144,6 +150,9 @@ class VectorStoreComponent:
                if self.settings.vectorstore.database != "qdrant"
                else None
            ),
            sparse_top_k=12,
            vector_store_query_mode="hybrid",
            alpha=0.5
        )

    def close(self) -> None:
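Taken together, these two hunks make the Qdrant store index SPLADE sparse vectors alongside the dense embeddings and query it in hybrid mode. A standalone sketch of the same wiring, assuming a local Qdrant path and a stand-in dense embedder (privateGPT itself routes this through VectorStoreComponent and its settings):

# Illustrative hybrid setup mirroring the configuration above (not part of the commit).
import qdrant_client
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

client = qdrant_client.QdrantClient(path="./local_qdrant")  # assumed local storage path
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")  # stand-in dense embedder

vector_store = QdrantVectorStore(
    client=client,
    collection_name="make_this_parameterizable_per_api_call",
    enable_hybrid=True,
    batch_size=20,
    sparse_doc_fn=sparse_doc_vectors,      # SPLADE helpers added by this commit
    sparse_query_fn=sparse_query_vectors,
)
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

# Dense + sparse retrieval, fused by the vector store (or by relative_score_fusion
# if hybrid_fusion_fn is passed in above).
retriever = index.as_retriever(
    similarity_top_k=2,
    sparse_top_k=12,
    vector_store_query_mode="hybrid",
    alpha=0.5,
)
nodes = retriever.retrieve("What does the contract say about termination?")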

View file

@@ -16,6 +16,9 @@ from llama_index.core.storage import StorageContext
from llama_index.core.types import TokenGen
from pydantic import BaseModel
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

from private_gpt.components.embedding.embedding_component import EmbeddingComponent
from private_gpt.components.llm.llm_component import LLMComponent
from private_gpt.components.node_store.node_store_component import NodeStoreComponent
@@ -26,6 +29,7 @@ from private_gpt.open_ai.extensions.context_filter import ContextFilter
from private_gpt.server.chunks.chunks_service import Chunk
from private_gpt.settings.settings import Settings
from private_gpt.paths import models_path

class Completion(BaseModel):
    response: str
@@ -36,7 +40,7 @@ class CompletionGen(BaseModel):
    response: TokenGen
    sources: list[Chunk] | None = None

reranker_path = models_path / 'reranker'

@dataclass
class ChatEngineInput:
    system_message: ChatMessage | None = None
@@ -126,9 +130,16 @@ class ChatService:
            )
            node_postprocessors.append(rerank_postprocessor)

-        return CondensePlusContextChatEngine.from_defaults(
-            system_prompt=system_prompt,
+        response_synthesizer = get_response_synthesizer(structured_answer_filtering=True, llm=self.llm_component.llm)
+        custom_query_engine = RetrieverQueryEngine(
            retriever=vector_index_retriever,
+            response_synthesizer=response_synthesizer
+        )
+        return ContextChatEngine.from_defaults(
+            system_prompt=system_prompt,
+            retriever=custom_query_engine,
            llm=self.llm_component.llm,  # Takes no effect at the moment
            node_postprocessors=node_postprocessors,
        )
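The replacement above routes retrieval through an explicit query engine with structured answer filtering. In isolation, the pairing looks roughly like this (a sketch; `index` and `llm` stand in for the project's vector index and LLM component and are not defined in the commit):

# Illustrative retriever + filtering synthesizer + query engine pairing.
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

retriever = index.as_retriever(similarity_top_k=2)  # `index` assumed from context
synthesizer = get_response_synthesizer(
    structured_answer_filtering=True,  # let the LLM drop retrieved chunks it judges irrelevant
    llm=llm,                           # `llm` assumed from context
)
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=synthesizer)
response = query_engine.query("Summarize the uploaded contract.")
print(response.response)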
@@ -189,16 +200,15 @@ class ChatService:
        system_prompt = (
            """
            You are a helpful assistant named QuickGPT by Quickfox Consulting.
-            Your responses must be strictly and exclusively based on the context documents provided.
-            You are not allowed to use any information, knowledge, or external sources outside of the given context documents.
-            If the answer to a query is not present in the context documents,
-            you should respond with "I do not have enough information in the provided context to answer this question."
-            Your responses should be relevant, informative, and easy to understand.
+            Engage in a two-way conversation, ensuring that your responses are strictly and exclusively based on the relevant context documents provided.
+            Do not use any prior knowledge or external sources or make assumptions, inferences, or draw upon any prior knowledge beyond what is explicitly stated in the relevant context documents.
+            If the answer to a query is not present in the relevant context documents, respond with "I do not have enough information in the provided context to answer this question."
+            Your responses must be relevant, informative, and easy to understand.
            Aim to deliver high-quality answers that are respectful and helpful, using clear and concise language.
-            Focus on providing accurate and reliable answers based solely on the given context.
-            Do not make assumptions, inferences, or draw upon any prior knowledge beyond what is explicitly stated in the context documents.
+            Consider previous queries only if the latest query is directly related to them. Address only the most recent query unless it explicitly builds upon a previous one.
            """
        )

        chat_history = (
@@ -209,7 +219,6 @@ class ChatService:
            use_context=use_context,
            context_filter=context_filter,
        )
-        # chat_engine = chat_engine.as_chat_engine(chat_mode="react", llm=self.llm_component.llm, verbose=True) # configuring ReAct Chat engine
        wrapped_response = chat_engine.chat(
            message=last_message if last_message is not None else "",
            chat_history=chat_history,

View file

@@ -1,11 +1,12 @@
import logging
import tempfile
from pathlib import Path
-from typing import TYPE_CHECKING, AnyStr, BinaryIO
+from typing import TYPE_CHECKING, AnyStr, BinaryIO, Sequence, Any, List

from injector import inject, singleton
-from llama_index.core.node_parser import SentenceWindowNodeParser, SemanticSplitterNodeParser
+from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
from llama_index.core.storage import StorageContext
from llama_index.core.schema import BaseNode, ObjectType, TextNode

from private_gpt.components.embedding.embedding_component import EmbeddingComponent
from private_gpt.components.ingest.ingest_component import get_ingestion_component
@@ -17,12 +18,44 @@ from private_gpt.components.vector_store.vector_store_component import (
from private_gpt.server.ingest.model import IngestedDoc
from private_gpt.settings.settings import settings
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)

if TYPE_CHECKING:
    from llama_index.core.storage.docstore.types import RefDocInfo

logger = logging.getLogger(__name__)
DEFAULT_CHUNK_SIZE = 512
SENTENCE_CHUNK_OVERLAP = 20
class SafeSemanticSplitter(SemanticSplitterNodeParser):
    safety_chunker: SentenceSplitter = SentenceSplitter(
        chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=SENTENCE_CHUNK_OVERLAP
    )

    def _parse_nodes(
        self,
        nodes,
        show_progress: bool = False,
        **kwargs,
    ) -> List[BaseNode]:
        all_nodes: List[BaseNode] = super()._parse_nodes(
            nodes=nodes, show_progress=show_progress, **kwargs
        )
        all_good = True
        for node in all_nodes:
            if node.get_type() == ObjectType.TEXT:
                node: TextNode = node
                if self.safety_chunker._token_size(node.text) > self.safety_chunker.chunk_size:
                    logging.info("Chunk size too big after semantic chunking: switching to static chunking")
                    all_good = False
                    break
        if not all_good:
            all_nodes = self.safety_chunker._parse_nodes(nodes, show_progress=show_progress, **kwargs)
        return all_nodes
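A quick way to exercise the fallback logic in SafeSemanticSplitter (a sketch, not part of the commit; the embedding model name is an assumption, and whether the static-chunking path triggers depends on the input text):

from llama_index.core import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")  # stand-in embedder
parser = SafeSemanticSplitter.from_defaults(
    embed_model=embed_model,
    include_metadata=True,
    include_prev_next_rel=True,
)
docs = [Document(text="First topic sentence. " * 50 + "A very different topic. " * 50)]
nodes = parser.get_nodes_from_documents(docs, show_progress=False)
# If any semantic chunk exceeds the 512-token ceiling, the whole batch is re-split
# with the static SentenceSplitter defined on safety_chunker.
print(f"{len(nodes)} chunks produced")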
@singleton
class IngestService:
    @inject
@@ -39,14 +72,22 @@ class IngestService:
            docstore=node_store_component.doc_store,
            index_store=node_store_component.index_store,
        )

-        node_parser = SemanticSplitterNodeParser.from_defaults(
+        # splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)
+        node_parser = SafeSemanticSplitter.from_defaults(
            embed_model=embedding_component.embedding_model,
+            # sentence_splitter=splitter,
+            include_metadata=True,
+            include_prev_next_rel=True,
        )

        self.ingest_component = get_ingestion_component(
            self.storage_context,
            embed_model=embedding_component.embedding_model,
-            transformations=[node_parser, embedding_component.embedding_model],
+            transformations=[
+                node_parser,
+                TitleExtractor(nodes=1, llm=self.llm_service.llm),
+                QuestionsAnsweredExtractor(questions=1, llm=self.llm_service.llm),
+                embedding_component.embedding_model],
            settings=settings(),
        )
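The transformations list above is what the ingestion component runs over every document: semantic splitting, LLM-based title and question extraction, then embedding. Conceptually it is the same chain a LlamaIndex IngestionPipeline would apply (a sketch, not part of the commit; node_parser, llm, and embed_model are assumed to be the components configured above):

from llama_index.core import Document
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor

pipeline = IngestionPipeline(
    transformations=[
        node_parser,                                        # SafeSemanticSplitter (assumed in scope)
        TitleExtractor(nodes=1, llm=llm),                   # adds a document_title metadata key
        QuestionsAnsweredExtractor(questions=1, llm=llm),   # adds questions_this_excerpt_can_answer
        embed_model,                                        # attaches embeddings to each node
    ]
)
nodes = pipeline.run(documents=[Document(text="...")])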

View file

@@ -4,6 +4,7 @@ import argparse
from huggingface_hub import hf_hub_download, snapshot_download
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

from private_gpt.paths import models_path, models_cache_path
from private_gpt.settings.settings import settings

@@ -46,4 +47,12 @@ AutoTokenizer.from_pretrained(
)
print("Tokenizer downloaded!")

# Download Reranker
# print(f"Downloading reranker {settings().rag.rerank.model}")
# reranker_path = r'D:/QuickGPT/privateGPT/models/reranker'
# rerank_postprocessor = SentenceTransformer(
#     settings().rag.rerank.model
# )
# rerank_postprocessor.save(reranker_path)

print("Setup done")

View file

@@ -51,14 +51,14 @@ rag:
  #This value is disabled by default. If you enable this settings, the RAG will only use articles that meet a certain percentage score.
  rerank:
    enabled: true
-    model: mixedbread-ai/mxbai-embed-large-v1
+    model: avsolatorio/GIST-Embedding-v0
    top_n: 2

llamacpp:
  # llm_hf_repo_id: bartowski/Meta-Llama-3-8B-Instruct-GGUF
  # llm_hf_model_file: Meta-Llama-3-8B-Instruct-Q6_K.gguf
  llm_hf_repo_id: qwp4w3hyb/Hermes-2-Pro-Llama-3-8B-iMat-GGUF
-  llm_hf_model_file: hermes-2-pro-llama-3-8b-imat-Q6_K.gguf
+  llm_hf_model_file: hermes-2-pro-llama-3-8b-imat-Q4_K_S.gguf
  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
  top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)

@@ -68,11 +68,11 @@ llamacpp:
embedding:
  # Should be matching the value above in most cases
  mode: huggingface
-  ingest_mode: parallel
+  ingest_mode: pipeline
  embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5

huggingface:
-  embedding_hf_model_name: mixedbread-ai/mxbai-embed-large-v1
+  embedding_hf_model_name: BAAI/bge-large-en
  access_token: ${HUGGINGFACE_TOKEN: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}

vectorstore: