Ingestion Speedup Multiple strategy (#1309)

This commit is contained in:
lopagela 2023-11-25 20:12:09 +01:00 committed by GitHub
parent 546ba33e6f
commit bafdd3baf1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 515 additions and 195 deletions

106
scripts/ingest_folder.py Normal file → Executable file
View file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
@ -8,7 +10,51 @@ from private_gpt.server.ingest.ingest_watcher import IngestWatcher
logger = logging.getLogger(__name__)
ingest_service = global_injector.get(IngestService)
class LocalIngestWorker:
def __init__(self, ingest_service: IngestService) -> None:
self.ingest_service = ingest_service
self.total_documents = 0
self.current_document_count = 0
self._files_under_root_folder: list[Path] = list()
def _find_all_files_in_folder(self, root_path: Path) -> None:
"""Search all files under the root folder recursively.
Count them at the same time
"""
for file_path in root_path.iterdir():
if file_path.is_file():
self.total_documents += 1
self._files_under_root_folder.append(file_path)
elif file_path.is_dir():
self._find_all_files_in_folder(file_path)
def ingest_folder(self, folder_path: Path) -> None:
# Count total documents before ingestion
self._find_all_files_in_folder(folder_path)
self._ingest_all(self._files_under_root_folder)
def _ingest_all(self, files_to_ingest: list[Path]) -> None:
logger.info("Ingesting files=%s", [f.name for f in files_to_ingest])
self.ingest_service.bulk_ingest([(str(p.name), p) for p in files_to_ingest])
def ingest_on_watch(self, changed_path: Path) -> None:
logger.info("Detected change in at path=%s, ingesting", changed_path)
self._do_ingest_one(changed_path)
def _do_ingest_one(self, changed_path: Path) -> None:
try:
if changed_path.exists():
logger.info(f"Started ingesting file={changed_path}")
self.ingest_service.ingest(changed_path.name, changed_path)
logger.info(f"Completed ingesting file={changed_path}")
except Exception:
logger.exception(
f"Failed to ingest document: {changed_path}, find the exception attached"
)
parser = argparse.ArgumentParser(prog="ingest_folder.py")
parser.add_argument("folder", help="Folder to ingest")
@ -37,53 +83,17 @@ if args.log_file:
)
logger.addHandler(file_handler)
if __name__ == "__main__":
total_documents = 0
current_document_count = 0
root_path = Path(args.folder)
if not root_path.exists():
raise ValueError(f"Path {args.folder} does not exist")
ingest_service = global_injector.get(IngestService)
worker = LocalIngestWorker(ingest_service)
worker.ingest_folder(root_path)
def count_documents(folder_path: Path) -> None:
global total_documents
for file_path in folder_path.iterdir():
if file_path.is_file():
total_documents += 1
elif file_path.is_dir():
count_documents(file_path)
def _recursive_ingest_folder(folder_path: Path) -> None:
global current_document_count, total_documents
for file_path in folder_path.iterdir():
if file_path.is_file():
current_document_count += 1
progress_msg = f"Document {current_document_count} of {total_documents} ({(current_document_count / total_documents) * 100:.2f}%)"
logger.info(progress_msg)
_do_ingest(file_path)
elif file_path.is_dir():
_recursive_ingest_folder(file_path)
def _do_ingest(changed_path: Path) -> None:
try:
if changed_path.exists():
logger.info(f"Started ingesting {changed_path}")
ingest_service.ingest(changed_path.name, changed_path)
logger.info(f"Completed ingesting {changed_path}")
except Exception:
logger.exception(
f"Failed to ingest document: {changed_path}, find the exception attached"
)
path = Path(args.folder)
if not path.exists():
raise ValueError(f"Path {args.folder} does not exist")
# Count total documents before ingestion
count_documents(path)
_recursive_ingest_folder(path)
if args.watch:
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
watcher = IngestWatcher(args.folder, _do_ingest)
watcher.start()
if args.watch:
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
watcher.start()