mirror of
https://github.com/zylon-ai/private-gpt.git
synced 2025-12-22 20:12:55 +01:00
378 lines
14 KiB
Python
378 lines
14 KiB
Python
import os
|
|
import logging
|
|
import traceback
|
|
|
|
import aiofiles
|
|
from pathlib import Path
|
|
from typing import Literal
|
|
|
|
from private_gpt.users.models.document import MakerCheckerActionType, MakerCheckerStatus
|
|
from sqlalchemy.orm import Session
|
|
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File, status, Security, Body, Form
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel, Field
|
|
|
|
from private_gpt.users import crud, models, schemas
|
|
from private_gpt.users.api import deps
|
|
from private_gpt.users.constants.role import Role
|
|
|
|
from private_gpt.server.ingest.ingest_service import IngestService
|
|
from private_gpt.server.ingest.model import IngestedDoc
|
|
from private_gpt.server.utils.auth import authenticated
|
|
from private_gpt.constants import UPLOAD_DIR
|
|
|
|
# Router for the ingestion endpoints: mounted under /v1, and every route
# declared on it runs the `authenticated` dependency first.
ingest_router = APIRouter(
    prefix="/v1",
    dependencies=[Depends(authenticated)],
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
class IngestTextBody(BaseModel):
    # Request body of the text-ingestion endpoint.
    # (Kept as `#` comments: a class docstring would be published by pydantic
    # as the schema description.)

    # Name under which the text is ingested.
    file_name: str = Field(examples=["Avatar: The Last Airbender"])

    # Raw text to ingest; the example is a single string built from adjacent literals.
    text: str = Field(
        examples=[
            (
                "Avatar is set in an Asian and Arctic-inspired world in which some "
                "people can telekinetically manipulate one of the four elements—water, "
                "earth, fire or air—through practices known as 'bending', inspired by "
                "Chinese martial arts."
            )
        ]
    )
|
|
|
|
|
|
class IngestResponse(BaseModel):
    # Response envelope returned by the ingestion endpoints in this module.

    # Fixed discriminator: the only accepted value is "list".
    object: Literal["list"]

    # Fixed model tag: the only accepted value is "private-gpt".
    model: Literal["private-gpt"]

    # The ingested documents carried by the response.
    data: list[IngestedDoc]
|
|
|
|
class DeleteFilename(BaseModel):
    # Request body of the delete-by-filename endpoint.

    # Name of the file whose documents should be deleted.
    filename: str
|
|
|
|
# @ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
|
|
# def ingest(request: Request, file: UploadFile) -> IngestResponse:
|
|
# """Ingests and processes a file.
|
|
|
|
# Deprecated. Use ingest/file instead.
|
|
# """
|
|
# return ingest_file(request, file)
|
|
|
|
|
|
@ingest_router.post("/ingest/file1", tags=["Ingestion"])
def ingest_file(request: Request, file: UploadFile = File(...)) -> IngestResponse:
    """Ingests and processes a file, storing its chunks to be used as context.

    The context obtained from files is later used in
    `/chat/completions`, `/completions`, and `/chunks` APIs.

    Most common document
    formats are supported, but you may be prompted to install an extra dependency to
    manage a specific file type.

    A file can generate different Documents (for example a PDF generates one Document
    per page). All Documents IDs are returned in the response, together with the
    extracted Metadata (which is later used to improve context retrieval). Those IDs
    can be used to filter the context used to create responses in
    `/chat/completions`, `/completions`, and `/chunks` APIs.
    """
    # Error contract:
    #   400 - no usable file name was supplied.
    #   500 - the upload could not be written to UPLOAD_DIR or ingested.
    # NOTE: another handler named `ingest_file` is declared further down in
    # this module; this one stays reachable through its registered route.
    service = request.state.injector.get(IngestService)
    if file.filename is None:
        raise HTTPException(400, "No file name provided")

    # The file name is client-controlled. Keep only its last path component
    # (treating backslashes as separators too) so that a value such as
    # "../../etc/passwd" cannot make us write outside UPLOAD_DIR.
    safe_name = Path(file.filename.replace("\\", "/")).name
    if safe_name in ("", ".", ".."):
        raise HTTPException(400, "No file name provided")

    upload_path = Path(f"{UPLOAD_DIR}/{safe_name}")
    try:
        with open(upload_path, "wb") as f:
            f.write(file.file.read())
        with open(upload_path, "rb") as f:
            ingested_documents = service.ingest_bin_data(safe_name, f)
    except Exception as e:
        # A bare dict does not match the declared IngestResponse, so report
        # the failure as a proper HTTP error and keep the details in the log.
        logger.exception(f"There was an error uploading the file(s): {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="There was an error uploading the file(s)",
        ) from e
    finally:
        file.file.close()
    return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
|
|
|
|
|
@ingest_router.post("/ingest/text", tags=["Ingestion"])
def ingest_text(request: Request, body: IngestTextBody) -> IngestResponse:
    """Ingests and processes a text, storing its chunks to be used as context.

    The context obtained from files is later used in
    `/chat/completions`, `/completions`, and `/chunks` APIs.

    A Document will be generated with the given text. The Document
    ID is returned in the response, together with the
    extracted Metadata (which is later used to improve context retrieval). That ID
    can be used to filter the context used to create responses in
    `/chat/completions`, `/completions`, and `/chunks` APIs.
    """
    # The service is resolved from the per-request injector before validation,
    # so a broken injector surfaces ahead of the 400 below.
    ingest_service = request.state.injector.get(IngestService)

    # An empty name is rejected up front.
    if not body.file_name:
        raise HTTPException(400, "No file name provided")

    documents = ingest_service.ingest_text(body.file_name, body.text)
    return IngestResponse(object="list", model="private-gpt", data=documents)
|
|
|
|
|
|
@ingest_router.get("/ingest/list", tags=["Ingestion"])
def list_ingested(request: Request) -> IngestResponse:
    """Lists already ingested Documents including their Document ID and metadata.

    Those IDs can be used to filter the context used to create responses
    in `/chat/completions`, `/completions`, and `/chunks` APIs.
    """
    # Resolve the ingest service from the per-request injector and wrap
    # whatever it lists in the standard response envelope.
    ingest_service = request.state.injector.get(IngestService)
    return IngestResponse(
        object="list",
        model="private-gpt",
        data=ingest_service.list_ingested(),
    )
|
|
|
|
|
|
@ingest_router.delete("/ingest/{doc_id}", tags=["Ingestion"])
def delete_ingested(request: Request, doc_id: str) -> None:
    """Delete the specified ingested Document.

    The `doc_id` can be obtained from the `GET /ingest/list` endpoint.
    The document will be effectively deleted from your storage context.
    """
    # Hand the id straight to the ingest service resolved for this request.
    request.state.injector.get(IngestService).delete(doc_id)
|
|
|
|
|
|
@ingest_router.post("/ingest/file/delete", tags=["Ingestion"])
def delete_file(
    request: Request,
    delete_input: DeleteFilename,
    log_audit: models.Audit = Depends(deps.get_audit_logger),
    db: Session = Depends(deps.get_db),
    current_user: models.User = Security(
        deps.get_current_user,
        scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"], Role.OPERATOR["name"]],
    ),
) -> dict:
    """Delete the specified filename.

    The `filename` can be obtained from the `GET /ingest/list` endpoint.
    The document will be effectively deleted from your storage context.
    """
    # Error contract:
    #   404 - no document record exists for the given filename.
    #   500 - any unexpected failure while deleting.
    filename = delete_input.filename
    service = request.state.injector.get(IngestService)
    try:
        document = crud.documents.get_by_filename(db, file_name=filename)
        if not document:
            # Previously this fell through and returned None despite `-> dict`.
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Document '{filename}' not found.",
            )
        # Read the id while the row still exists; it is needed after removal.
        document_id = document.id

        doc_ids = service.get_doc_ids_by_filename(filename)
        try:
            # Best effort: failing to purge the ingested chunks or the stored
            # copy must not block removal of the database record.
            for doc_id in doc_ids or []:
                service.delete(doc_id)
            if Path(filename).name == filename:
                os.remove(Path(f"{UPLOAD_DIR}/{filename}"))
            else:
                # Only plain base names are removed from disk, so a crafted
                # name can never delete something outside UPLOAD_DIR.
                logger.warning(f"Skipping disk removal for non-plain file name '{filename}'")
        except Exception:
            logger.exception("Unable to delete file from the static directory")

        log_audit(
            model='Document',
            action='delete',
            details={
                "detail": f"{filename}",
                'user': current_user.username,
            },
            user_id=current_user.id
        )
        # Remove the association rows before the document they reference.
        # NOTE(review): assumes crud.documents.remove commits the session, so
        # both deletions land in the same transaction - confirm.
        db.execute(models.document_department_association.delete().where(
            models.document_department_association.c.document_id == document_id
        ))
        crud.documents.remove(db=db, id=document_id)
        return {"status": "SUCCESS", "message": f"'{filename}' deleted successfully."}
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 404 above) intact.
        raise
    except Exception as e:
        logger.exception(
            f"Unexpected error deleting documents with filename '{filename}': {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Internal Server Error")
|
|
|
|
|
|
def _parse_department_ids(raw_ids):
    """Turn a comma-separated id string into a list of ints, defaulting to [1]."""
    raw_ids = raw_ids if raw_ids else "1"
    try:
        # Blank tokens (e.g. the trailing one in "1,2,") are skipped.
        parsed = [int(token) for token in raw_ids.split(",") if token.strip()]
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid department ids.",
        ) from exc
    # A string made only of separators falls back to the same default as "".
    return parsed or [1]


async def create_documents(
    db: Session,
    file_name: str = None,
    current_user: models.User = None,
    departments: schemas.DocumentDepartmentList = Depends(),
    log_audit: models.Audit = None,
):
    """Create a `Document` record and link it to its departments.

    A document row is created in PENDING status with an INSERT maker/checker
    action, one `document_department_association` row is inserted per
    department id, and the creation is reported through `log_audit`.

    Args:
        db: Database session used for every read and write.
        file_name: Name of the file being registered.
        current_user: User performing the upload; `id` and `username` are read.
        departments: Carries `departments_ids` (comma-separated ids, department
            1 when empty) and `doc_type_id`.
        log_audit: Callable invoked with the audit details.

    Returns:
        The object returned by `crud.documents.create`.

    Raises:
        HTTPException: 409 if a document with this file name already exists,
            400 if `departments_ids` contains a non-integer entry.
    """
    file_ingested = crud.documents.get_by_filename(
        db, file_name=file_name)
    if file_ingested:
        raise HTTPException(
            status_code=409,
            detail="File already exists. Choose a different file.",
        )

    # Parse before writing anything, so a malformed list cannot leave a
    # document row behind without its department links.
    department_ids = _parse_department_ids(departments.departments_ids)

    logger.info(
        f"{file_name} uploaded by {current_user.id} action {MakerCheckerActionType.INSERT.value} and status {MakerCheckerStatus.PENDING.value}")
    docs_in = schemas.DocumentMakerCreate(
        filename=file_name,
        uploaded_by=current_user.id,
        action_type=MakerCheckerActionType.INSERT,
        status=MakerCheckerStatus.PENDING,
        doc_type_id=departments.doc_type_id,
    )
    document = crud.documents.create(db=db, obj_in=docs_in)

    for department_id in department_ids:
        db.execute(models.document_department_association.insert().values(
            document_id=document.id, department_id=department_id))

    log_audit(
        model='Document',
        action='create',
        details={
            'filename': f"{file_name}",
            'user': f"{current_user.username}",
            'departments': f"{department_ids}"
        },
        user_id=current_user.id
    )
    return document
|
|
|
|
|
|
async def _register_and_ingest(service, db, source, current_user, departments, log_audit):
    """Register `source` as a document, copy it into UPLOAD_DIR and ingest it."""
    source_path = Path(source)
    file_name = source_path.name
    # `Path.name` is never None; an unusable path yields an empty string.
    if not file_name:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No file name provided",
        )
    upload_path = Path(f"{UPLOAD_DIR}/{file_name}")

    # Open the source first so an unreadable file fails before a record is created.
    with open(source_path, "rb") as source_file:
        await create_documents(db, file_name, current_user, departments, log_audit)
        # Opening the destination with "wb" truncates it, so a source that
        # already sits at that path must not be copied onto itself.
        if source_path.resolve() != upload_path.resolve():
            with open(upload_path, "wb") as target:
                target.write(source_file.read())
            source_file.seek(0)
        documents = service.ingest_bin_data(file_name, source_file)
    return file_name, documents


async def common_ingest_logic(
    request: Request,
    db: Session,
    ocr_file,
    original_file: str = None,
    current_user: models.User = None,
    departments: schemas.DocumentDepartmentList = Depends(),
    log_audit: models.Audit = None,
):
    """Register and ingest an OCR output file and, optionally, its original.

    Each file is registered through `create_documents`, copied into
    UPLOAD_DIR under its base name and passed to the ingest service.

    Args:
        request: Current request; the ingest service comes from its injector.
        db: Database session forwarded to `create_documents`.
        ocr_file: Path of the OCR output file to ingest.
        original_file: Optional path of the original file, ingested as well.
        current_user: User performing the upload.
        departments: Department selection forwarded to `create_documents`.
        log_audit: Audit callable forwarded to `create_documents`.

    Returns:
        The documents returned by the last ingestion performed: those of
        `original_file` when it is given, otherwise those of `ocr_file`.

    Raises:
        HTTPException: Errors raised while registering a file (such as the 409
            for a duplicate) are passed through unchanged; any other failure
            becomes a 500.
    """
    service = request.state.injector.get(IngestService)
    try:
        file_name, ingested_documents = await _register_and_ingest(
            service, db, ocr_file, current_user, departments, log_audit)

        # Handling Original File
        if original_file:
            logger.info(f"ORIGINAL PDF FILE PATH IS :: {original_file}")
            file_name, ingested_documents = await _register_and_ingest(
                service, db, original_file, current_user, departments, log_audit)

        logger.info(
            f"{file_name} is uploaded by the {current_user.username}.")
        return ingested_documents

    except HTTPException:
        # Deliberate HTTP errors keep their status code; just record them.
        logger.warning("Ingestion rejected", exc_info=True)
        raise

    except Exception as e:
        logger.exception(f"Unable to ingest file: {e}")
        raise HTTPException(
            status_code=500,
            detail="Internal Server Error: Unable to ingest file.",
        ) from e
|
|
|
|
|
|
async def ingest(request: Request, file_path: str) -> IngestResponse:
    """Ingests and processes a file, storing its chunks to be used as context.

    The file at `file_path` is copied into UPLOAD_DIR under its base name,
    ingested from that copy, and the copy is removed afterwards.

    Args:
        request: Current request; the ingest service comes from its injector.
        file_path: Path of the file to ingest.

    Returns:
        An `IngestResponse` on success. On failure a dict with a single
        `"message"` key is returned instead; this is kept as-is so existing
        callers are not broken.
    """
    service = request.state.injector.get(IngestService)

    # Computed outside the `try` so the cleanup below can always refer to it.
    file_name = Path(file_path).name
    upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
    staged = False

    try:
        with open(file_path, 'rb') as file:
            # From here on the staging copy is ours to clean up.
            staged = True
            with upload_path.open('wb') as f:
                f.write(file.read())

        with upload_path.open('rb') as f:
            # Called without `await`, like every other call site in this
            # module: the result is used directly as the document list.
            ingested_documents = service.ingest_bin_data(file_name, f)

    except Exception as e:
        logger.exception(f"There was an error uploading the file(s): {e}")
        return {"message": f"There was an error uploading the file(s)\n {e}"}

    finally:
        # Only remove what this call created: if the source could not even be
        # opened, a same-named file already in UPLOAD_DIR is left alone.
        if staged:
            upload_path.unlink(missing_ok=True)

    return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
|
|
|
|
|
|
|
@ingest_router.post("/ingest/file", response_model=IngestResponse, tags=["Ingestion"])
async def ingest_file(
    request: Request,
    departments: schemas.DocumentDepartmentList = Depends(),
    file: UploadFile = File(...),
    log_audit: models.Audit = Depends(deps.get_audit_logger),
    db: Session = Depends(deps.get_db),
    current_user: models.User = Security(
        deps.get_current_user,
        scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"], Role.OPERATOR["name"]],
    ),
) -> IngestResponse:
    """Ingests and processes a file, storing its chunks to be used as context."""
    # Error contract:
    #   400 - no usable file name was supplied.
    #   HTTP errors raised by create_documents (e.g. 409 for a duplicate file
    #         name) are passed through unchanged.
    #   500 - any other failure while storing or ingesting the upload.
    service = request.state.injector.get(IngestService)
    try:
        if file.filename is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No file name provided",
            )
        # The file name is client-controlled. Keep only its last path
        # component (treating backslashes as separators too) so the upload
        # cannot be written outside UPLOAD_DIR.
        original_filename = Path(file.filename.replace("\\", "/")).name
        if original_filename in ("", ".", ".."):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No file name provided",
            )
        logger.info(f"Original file name is: {original_filename}")
        upload_path = Path(f"{UPLOAD_DIR}/{original_filename}")

        # Read the payload before touching the database or the disk.
        contents = await file.read()

        # Register the document before writing: create_documents rejects
        # duplicates, and writing first would overwrite the stored copy of
        # the existing document even though the upload is then refused.
        await create_documents(db, original_filename, current_user, departments, log_audit)

        async with aiofiles.open(upload_path, 'wb') as f:
            await f.write(contents)

        with open(upload_path, "rb") as f:
            ingested_documents = service.ingest_bin_data(original_filename, f)

        logger.info(f"{original_filename} is uploaded by {current_user.username} in {departments.departments_ids}")
        response = IngestResponse(
            object="list", model="private-gpt", data=ingested_documents
        )
        return response

    except HTTPException:
        # Deliberate HTTP errors keep their status code; just record them.
        logger.warning("Upload rejected", exc_info=True)
        raise

    except Exception as e:
        logger.exception(f"There was an error uploading the file(s): {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Internal Server Error: Unable to ingest file.",
        ) from e
|