Make the sentence-window size used during ingestion configurable.

IngestService now calls settings() once, keeps the result on self._settings,
and passes vectorstore.inject_win_size as window_size to
SentenceWindowNodeParser.from_defaults(). The same settings object is reused
for get_ingestion_component(). The new VectorstoreSettings field defaults to 3;
settings.yaml sets it to 2.

diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py
index f9ae472..97c4e00 100644
--- a/private_gpt/server/ingest/ingest_service.py
+++ b/private_gpt/server/ingest/ingest_service.py
@@ -39,13 +39,14 @@ class IngestService:
             docstore=node_store_component.doc_store,
             index_store=node_store_component.index_store,
         )
-        node_parser = SentenceWindowNodeParser.from_defaults()
+        self._settings = settings()
+        node_parser = SentenceWindowNodeParser.from_defaults(window_size=self._settings.vectorstore.inject_win_size)
 
         self.ingest_component = get_ingestion_component(
             self.storage_context,
             embed_model=embedding_component.embedding_model,
             transformations=[node_parser, embedding_component.embedding_model],
-            settings=settings(),
+            settings=self._settings,
         )
 
     def _ingest_data(self, file_name: str, file_data: AnyStr) -> list[IngestedDoc]:
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 408b1ec..abb7ad3 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -120,6 +120,10 @@ class LLMSettings(BaseModel):
 
 class VectorstoreSettings(BaseModel):
     database: Literal["chroma", "qdrant", "postgres"]
+    inject_win_size: int = Field(
+        3,
+        description="How many sentences on either side to capture, when parsing files",
+    )
 
 
 class NodeStoreSettings(BaseModel):
diff --git a/settings.yaml b/settings.yaml
index 1765fe4..5577fe9 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -74,6 +74,7 @@ huggingface:
 
 vectorstore:
   database: qdrant
+  inject_win_size: 2
 
 nodestore:
   database: simple