feat(settings): Configurable context_window and tokenizer (#1437)

2025-12-22 07:40:12 +01:00 · 2023-12-21 14:49:35 +01:00 · 2023-12-21 14:49:35 +01:00 · 4780540870
commit 4780540870
parent 6eeb95ec7f
4 changed files with 43 additions and 7 deletions
--- a/scripts/setup
+++ b/scripts/setup
@@ -3,6 +3,7 @@ import os
 import argparse

 from huggingface_hub import hf_hub_download, snapshot_download
+from transformers import AutoTokenizer

 from private_gpt.paths import models_path, models_cache_path
 from private_gpt.settings.settings import settings
@@ -15,8 +16,9 @@ if __name__ == '__main__':
    resume_download = args.resume

 os.makedirs(models_path, exist_ok=True)
-embedding_path = models_path / "embedding"

+# Download Embedding model
+embedding_path = models_path / "embedding"
 print(f"Downloading embedding {settings().local.embedding_hf_model_name}")
 snapshot_download(
    repo_id=settings().local.embedding_hf_model_name,
@@ -24,9 +26,9 @@ snapshot_download(
    local_dir=embedding_path,
 )
 print("Embedding model downloaded!")
-print("Downloading models for local execution...")

 # Download LLM and create a symlink to the model file
+print(f"Downloading LLM {settings().local.llm_hf_model_file}")
 hf_hub_download(
    repo_id=settings().local.llm_hf_repo_id,
    filename=settings().local.llm_hf_model_file,
@@ -34,6 +36,14 @@ hf_hub_download(
    local_dir=models_path,
    resume_download=resume_download,
 )
-
 print("LLM model downloaded!")
+
+# Download Tokenizer
+print(f"Downloading tokenizer {settings().llm.tokenizer}")
+AutoTokenizer.from_pretrained(
+    pretrained_model_name_or_path=settings().llm.tokenizer,
+    cache_dir=models_cache_path,
+)
+print("Tokenizer downloaded!")
+
 print("Setup done")