feat(settings): Configurable context_window and tokenizer (#1437)

This commit is contained in:
Iván Martínez 2023-12-21 14:49:35 +01:00 committed by GitHub
parent 6eeb95ec7f
commit 4780540870
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 43 additions and 7 deletions

View file

@@ -3,6 +3,7 @@ import os
import argparse
from huggingface_hub import hf_hub_download, snapshot_download
from transformers import AutoTokenizer
from private_gpt.paths import models_path, models_cache_path
from private_gpt.settings.settings import settings
@@ -15,8 +16,9 @@ if __name__ == '__main__':
# Whether interrupted downloads should be resumed, taken from the parsed
# command-line arguments (the parser itself is outside this view).
resume_download = args.resume
# Create the models directory; exist_ok=True makes this a no-op when it is
# already present.
os.makedirs(models_path, exist_ok=True)
# NOTE(review): embedding_path is assigned twice with the same value in this
# view — presumably the removed and the added side of the diff hunk rendered
# together; confirm against the actual file.
embedding_path = models_path / "embedding"
# Download Embedding model
embedding_path = models_path / "embedding"
print(f"Downloading embedding {settings().local.embedding_hf_model_name}")
snapshot_download(
repo_id=settings().local.embedding_hf_model_name,
@@ -24,9 +26,9 @@ snapshot_download(
local_dir=embedding_path,
)
print("Embedding model downloaded!")
print("Downloading models for local execution...")
# Download LLM and create a symlink to the model file
print(f"Downloading LLM {settings().local.llm_hf_model_file}")
hf_hub_download(
repo_id=settings().local.llm_hf_repo_id,
filename=settings().local.llm_hf_model_file,
@@ -34,6 +36,14 @@ hf_hub_download(
local_dir=models_path,
resume_download=resume_download,
)
print("LLM model downloaded!")
# Download Tokenizer
# The tokenizer to fetch is named by the `llm.tokenizer` setting.
print(f"Downloading tokenizer {settings().llm.tokenizer}")
# The tokenizer object returned by from_pretrained is discarded; the call is
# made here for its download side effect only.
# NOTE(review): presumably the files are stored under models_cache_path via
# cache_dir so later loads can reuse them — confirm against the transformers
# from_pretrained documentation.
AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=settings().llm.tokenizer,
cache_dir=models_cache_path,
)
print("Tokenizer downloaded!")
print("Setup done")