private-gpt/private_gpt/components/ingest/ingest_helper.py
kpcrash 5f8b29c571
Update ingest_helper.py to use chardet
Fixes errors related to character mapping in existing code.
2025-02-04 18:13:36 -05:00


import logging
from pathlib import Path

import chardet
from llama_index.core.readers import StringIterableReader
from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.json import JSONReader
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)


# Inspired by the `llama_index.core.readers.file.base` module
def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
    try:
        from llama_index.readers.file.docs import (  # type: ignore
            DocxReader,
            HWPReader,
            PDFReader,
        )
        from llama_index.readers.file.epub import EpubReader  # type: ignore
        from llama_index.readers.file.image import ImageReader  # type: ignore
        from llama_index.readers.file.ipynb import IPYNBReader  # type: ignore
        from llama_index.readers.file.markdown import MarkdownReader  # type: ignore
        from llama_index.readers.file.mbox import MboxReader  # type: ignore
        from llama_index.readers.file.slides import PptxReader  # type: ignore
        from llama_index.readers.file.tabular import PandasCSVReader  # type: ignore
        from llama_index.readers.file.video_audio import (  # type: ignore
            VideoAudioReader,
        )
    except ImportError as e:
        raise ImportError("`llama-index-readers-file` package not found") from e

    default_file_reader_cls: dict[str, type[BaseReader]] = {
        ".hwp": HWPReader,
        ".pdf": PDFReader,
        ".docx": DocxReader,
        ".pptx": PptxReader,
        ".ppt": PptxReader,
        ".pptm": PptxReader,
        ".jpg": ImageReader,
        ".png": ImageReader,
        ".jpeg": ImageReader,
        ".mp3": VideoAudioReader,
        ".mp4": VideoAudioReader,
        ".csv": PandasCSVReader,
        ".epub": EpubReader,
        ".md": MarkdownReader,
        ".mbox": MboxReader,
        ".ipynb": IPYNBReader,
    }
    return default_file_reader_cls


# Patching the default file reader to support other file types
FILE_READER_CLS = _try_loading_included_file_formats()
FILE_READER_CLS.update(
    {
        ".json": JSONReader,
    }
)
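
# Illustrative only (not upstream code): additional formats can be registered the
# same way as the ".json" entry, by mapping an extension to a BaseReader subclass,
# e.g. with a hypothetical reader class:
#
#   FILE_READER_CLS.update({".log": MyLogReader})  # MyLogReader is a made-up name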


class IngestionHelper:
    """Helper class to transform a file into a list of documents.

    This class should be used to transform a file into a list of documents.
    These methods are thread-safe (and multiprocessing-safe).
    """

    @staticmethod
    def transform_file_into_documents(
        file_name: str, file_data: Path
    ) -> list[Document]:
        documents = IngestionHelper._load_file_to_documents(file_name, file_data)
        for document in documents:
            document.metadata["file_name"] = file_name
        IngestionHelper._exclude_metadata(documents)
        return documents
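
    # Illustrative usage (hypothetical caller and file name, not part of this module):
    #
    #   docs = IngestionHelper.transform_file_into_documents(
    #       "report.pdf", Path("/tmp/report.pdf")
    #   )
    #   assert all(d.metadata["file_name"] == "report.pdf" for d in docs)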

    # Modification to provide better support for file ingestion with charmap issues
    @staticmethod
    def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
        logger.debug("Transforming file_name=%s into documents", file_name)
        extension = Path(file_name).suffix
        reader_cls = FILE_READER_CLS.get(extension)
        if reader_cls is None:
            logger.debug(
                "No reader found for extension=%s, using default string reader",
                extension,
            )
            # Read as plain text
            string_reader = StringIterableReader()
            return string_reader.load_data([IngestionHelper._read_all_text(file_data)])

        logger.debug("Specific reader found for extension=%s", extension)
        documents = reader_cls().load_data(file_data)
        return documents
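
    # For example (illustrative file names): "notes.pdf" resolves to PDFReader via
    # FILE_READER_CLS above, while an unmapped extension such as "notes.txt" falls
    # through to StringIterableReader backed by _read_all_text below.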

    # New helper called from _load_file_to_documents
    @staticmethod
    def _read_all_text(file_data: Path) -> str:
        try:
            # Read raw bytes first
            raw_bytes = file_data.read_bytes()

            # Use chardet to detect the encoding
            detected = chardet.detect(raw_bytes)
            encoding = detected["encoding"] or "utf-8"  # Fall back to utf-8 if detection fails
            confidence = detected.get("confidence", 0)
            logger.debug(
                f"Detected encoding {encoding} with confidence {confidence} for {file_data}"
            )

            # Try the detected encoding first
            try:
                text = raw_bytes.decode(encoding)
            except UnicodeDecodeError:
                # If the detected encoding fails, try common encodings
                for fallback_encoding in ["utf-8", "cp1252", "iso-8859-1", "latin1"]:
                    try:
                        text = raw_bytes.decode(fallback_encoding)
                        logger.debug(
                            f"Successfully decoded with fallback encoding: {fallback_encoding}"
                        )
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    # If all encodings fail, decode as utf-8 with the 'replace' error handler
                    text = raw_bytes.decode("utf-8", errors="replace")
                    logger.warning(f"Falling back to UTF-8 with replacement for {file_data}")

            # Clean up the text
            cleaned_text = (
                text.encode("utf-8", errors="replace")
                .decode("utf-8")
                .replace("\udc58", "")  # Remove specific problematic Unicode chars
                .replace("\x00", "")  # Remove null bytes
                .replace("\ufffd", "")  # Remove the replacement character
                .strip()  # Remove leading/trailing whitespace
            )

            if not cleaned_text:
                logger.warning(f"Cleaned text is empty for {file_data}")
            return cleaned_text
        except Exception as e:
            logger.error(f"Error processing file {file_data}: {str(e)}")
            raise
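
    # Example of the decode cascade above (illustrative values): bytes such as
    # b"caf\xe9" (cp1252/latin-1 for "café") are not valid UTF-8; the detection and
    # fallback steps decode them to "café" instead of raising UnicodeDecodeError
    # for the whole file.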

    @staticmethod
    def _exclude_metadata(documents: list[Document]) -> None:
        logger.debug("Excluding metadata from count=%s documents", len(documents))
        for document in documents:
            document.metadata["doc_id"] = document.doc_id
            # We don't want the Embeddings search to receive this metadata
            document.excluded_embed_metadata_keys = ["doc_id"]
            # We don't want the LLM to receive these metadata in the context
            document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
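

# Minimal manual check of the ingestion flow; a sketch, not upstream code. It
# assumes the optional reader dependencies are installed and that the path given
# on the command line (or the hypothetical "sample.txt") exists.
if __name__ == "__main__":
    import sys

    sample = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("sample.txt")
    docs = IngestionHelper.transform_file_into_documents(sample.name, sample)
    for doc in docs:
        # doc_id is copied into metadata by _exclude_metadata above
        print(doc.doc_id, doc.metadata.get("file_name"), len(doc.get_content()))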