From def2b7f71f3dfca3513a6f842cbd4257da31961a Mon Sep 17 00:00:00 2001
From: Robert Hirsch
Date: Thu, 6 Jun 2024 21:07:07 +0200
Subject: [PATCH] added window_size setting for ingestion

---
 private_gpt/server/ingest/ingest_service.py | 5 +++--
 private_gpt/settings/settings.py            | 4 ++++
 settings.yaml                               | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py
index f9ae472..97c4e00 100644
--- a/private_gpt/server/ingest/ingest_service.py
+++ b/private_gpt/server/ingest/ingest_service.py
@@ -39,13 +39,14 @@ class IngestService:
             docstore=node_store_component.doc_store,
             index_store=node_store_component.index_store,
         )
-        node_parser = SentenceWindowNodeParser.from_defaults()
+        self._settings = settings()
+        node_parser = SentenceWindowNodeParser.from_defaults(window_size=self._settings.vectorstore.inject_win_size)
 
         self.ingest_component = get_ingestion_component(
             self.storage_context,
             embed_model=embedding_component.embedding_model,
             transformations=[node_parser, embedding_component.embedding_model],
-            settings=settings(),
+            settings=self._settings,
         )
 
     def _ingest_data(self, file_name: str, file_data: AnyStr) -> list[IngestedDoc]:
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 408b1ec..abb7ad3 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -120,6 +120,10 @@ class LLMSettings(BaseModel):
 
 class VectorstoreSettings(BaseModel):
     database: Literal["chroma", "qdrant", "postgres"]
+    inject_win_size: int = Field(
+        3,
+        description="How many sentences on either side to capture, when parsing files",
+    )
 
 
 class NodeStoreSettings(BaseModel):
diff --git a/settings.yaml b/settings.yaml
index 1765fe4..5577fe9 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -74,6 +74,7 @@ huggingface:
 
 vectorstore:
   database: qdrant
+  inject_win_size: 2
 
 nodestore:
   database: simple