mirror of
https://github.com/zylon-ai/private-gpt.git
synced 2025-12-22 04:30:11 +01:00
Ingestion Speedup Multiple strategy (#1309)
This commit is contained in:
parent
546ba33e6f
commit
bafdd3baf1
13 changed files with 515 additions and 195 deletions
106
scripts/ingest_folder.py
Normal file → Executable file
106
scripts/ingest_folder.py
Normal file → Executable file
|
|
@ -1,3 +1,5 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
|
@ -8,7 +10,51 @@ from private_gpt.server.ingest.ingest_watcher import IngestWatcher
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ingest_service = global_injector.get(IngestService)
|
||||
|
||||
class LocalIngestWorker:
|
||||
def __init__(self, ingest_service: IngestService) -> None:
|
||||
self.ingest_service = ingest_service
|
||||
|
||||
self.total_documents = 0
|
||||
self.current_document_count = 0
|
||||
|
||||
self._files_under_root_folder: list[Path] = list()
|
||||
|
||||
def _find_all_files_in_folder(self, root_path: Path) -> None:
|
||||
"""Search all files under the root folder recursively.
|
||||
Count them at the same time
|
||||
"""
|
||||
for file_path in root_path.iterdir():
|
||||
if file_path.is_file():
|
||||
self.total_documents += 1
|
||||
self._files_under_root_folder.append(file_path)
|
||||
elif file_path.is_dir():
|
||||
self._find_all_files_in_folder(file_path)
|
||||
|
||||
def ingest_folder(self, folder_path: Path) -> None:
|
||||
# Count total documents before ingestion
|
||||
self._find_all_files_in_folder(folder_path)
|
||||
self._ingest_all(self._files_under_root_folder)
|
||||
|
||||
def _ingest_all(self, files_to_ingest: list[Path]) -> None:
|
||||
logger.info("Ingesting files=%s", [f.name for f in files_to_ingest])
|
||||
self.ingest_service.bulk_ingest([(str(p.name), p) for p in files_to_ingest])
|
||||
|
||||
def ingest_on_watch(self, changed_path: Path) -> None:
|
||||
logger.info("Detected change in at path=%s, ingesting", changed_path)
|
||||
self._do_ingest_one(changed_path)
|
||||
|
||||
def _do_ingest_one(self, changed_path: Path) -> None:
|
||||
try:
|
||||
if changed_path.exists():
|
||||
logger.info(f"Started ingesting file={changed_path}")
|
||||
self.ingest_service.ingest(changed_path.name, changed_path)
|
||||
logger.info(f"Completed ingesting file={changed_path}")
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Failed to ingest document: {changed_path}, find the exception attached"
|
||||
)
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(prog="ingest_folder.py")
|
||||
parser.add_argument("folder", help="Folder to ingest")
|
||||
|
|
@ -37,53 +83,17 @@ if args.log_file:
|
|||
)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
total_documents = 0
|
||||
current_document_count = 0
|
||||
root_path = Path(args.folder)
|
||||
if not root_path.exists():
|
||||
raise ValueError(f"Path {args.folder} does not exist")
|
||||
|
||||
ingest_service = global_injector.get(IngestService)
|
||||
worker = LocalIngestWorker(ingest_service)
|
||||
worker.ingest_folder(root_path)
|
||||
|
||||
def count_documents(folder_path: Path) -> None:
|
||||
global total_documents
|
||||
for file_path in folder_path.iterdir():
|
||||
if file_path.is_file():
|
||||
total_documents += 1
|
||||
elif file_path.is_dir():
|
||||
count_documents(file_path)
|
||||
|
||||
|
||||
def _recursive_ingest_folder(folder_path: Path) -> None:
|
||||
global current_document_count, total_documents
|
||||
for file_path in folder_path.iterdir():
|
||||
if file_path.is_file():
|
||||
current_document_count += 1
|
||||
progress_msg = f"Document {current_document_count} of {total_documents} ({(current_document_count / total_documents) * 100:.2f}%)"
|
||||
logger.info(progress_msg)
|
||||
_do_ingest(file_path)
|
||||
elif file_path.is_dir():
|
||||
_recursive_ingest_folder(file_path)
|
||||
|
||||
|
||||
def _do_ingest(changed_path: Path) -> None:
|
||||
try:
|
||||
if changed_path.exists():
|
||||
logger.info(f"Started ingesting {changed_path}")
|
||||
ingest_service.ingest(changed_path.name, changed_path)
|
||||
logger.info(f"Completed ingesting {changed_path}")
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Failed to ingest document: {changed_path}, find the exception attached"
|
||||
)
|
||||
|
||||
|
||||
path = Path(args.folder)
|
||||
if not path.exists():
|
||||
raise ValueError(f"Path {args.folder} does not exist")
|
||||
|
||||
# Count total documents before ingestion
|
||||
count_documents(path)
|
||||
|
||||
_recursive_ingest_folder(path)
|
||||
if args.watch:
|
||||
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
|
||||
watcher = IngestWatcher(args.folder, _do_ingest)
|
||||
watcher.start()
|
||||
if args.watch:
|
||||
logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
|
||||
watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
|
||||
watcher.start()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue