private-gpt/private_gpt/components/ingest/ingest_helper.py
kpcrash 5f8b29c571
Update ingest_helper.py to use chardet
Fixes errors related to character mapping in existing code.
2025-02-04 18:13:36 -05:00


import logging
from pathlib import Path

import chardet
from llama_index.core.readers import StringIterableReader
from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.json import JSONReader
from llama_index.core.schema import Document

logger = logging.getLogger(__name__)


# Inspired by the `llama_index.core.readers.file.base` module
def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
    try:
        from llama_index.readers.file.docs import (  # type: ignore
            DocxReader,
            HWPReader,
            PDFReader,
        )
        from llama_index.readers.file.epub import EpubReader  # type: ignore
        from llama_index.readers.file.image import ImageReader  # type: ignore
        from llama_index.readers.file.ipynb import IPYNBReader  # type: ignore
        from llama_index.readers.file.markdown import MarkdownReader  # type: ignore
        from llama_index.readers.file.mbox import MboxReader  # type: ignore
        from llama_index.readers.file.slides import PptxReader  # type: ignore
        from llama_index.readers.file.tabular import PandasCSVReader  # type: ignore
        from llama_index.readers.file.video_audio import (  # type: ignore
            VideoAudioReader,
        )
    except ImportError as e:
        raise ImportError("`llama-index-readers-file` package not found") from e

    default_file_reader_cls: dict[str, type[BaseReader]] = {
        ".hwp": HWPReader,
        ".pdf": PDFReader,
        ".docx": DocxReader,
        ".pptx": PptxReader,
        ".ppt": PptxReader,
        ".pptm": PptxReader,
        ".jpg": ImageReader,
        ".png": ImageReader,
        ".jpeg": ImageReader,
        ".mp3": VideoAudioReader,
        ".mp4": VideoAudioReader,
        ".csv": PandasCSVReader,
        ".epub": EpubReader,
        ".md": MarkdownReader,
        ".mbox": MboxReader,
        ".ipynb": IPYNBReader,
    }
    return default_file_reader_cls


# Patching the default file reader to support other file types
FILE_READER_CLS = _try_loading_included_file_formats()
FILE_READER_CLS.update(
    {
        ".json": JSONReader,
    }
)
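
# Illustrative only (not upstream code): additional formats can be registered the
# same way as the ".json" entry, by mapping an extension to a BaseReader subclass,
# e.g. with a hypothetical reader class:
#
#   FILE_READER_CLS.update({".log": MyLogReader})  # MyLogReader is a made-up name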


class IngestionHelper:
    """Helper class to transform a file into a list of documents.

    This class should be used to transform a file into a list of documents.
    These methods are thread-safe (and multiprocessing-safe).
    """

    @staticmethod
    def transform_file_into_documents(
        file_name: str, file_data: Path
    ) -> list[Document]:
        documents = IngestionHelper._load_file_to_documents(file_name, file_data)
        for document in documents:
            document.metadata["file_name"] = file_name
        IngestionHelper._exclude_metadata(documents)
        return documents
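
    # Illustrative usage (hypothetical caller and file name, not part of this module):
    #
    #   docs = IngestionHelper.transform_file_into_documents(
    #       "report.pdf", Path("/tmp/report.pdf")
    #   )
    #   assert all(d.metadata["file_name"] == "report.pdf" for d in docs)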

    # Modification to provide better support for file ingestion with charmap issues
    @staticmethod
    def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
        logger.debug("Transforming file_name=%s into documents", file_name)
        extension = Path(file_name).suffix
        reader_cls = FILE_READER_CLS.get(extension)
        if reader_cls is None:
            logger.debug(
                "No reader found for extension=%s, using default string reader",
                extension,
            )
            # Read as plain text
            string_reader = StringIterableReader()
            return string_reader.load_data([IngestionHelper._read_all_text(file_data)])

        logger.debug("Specific reader found for extension=%s", extension)
        documents = reader_cls().load_data(file_data)
        return documents
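
    # For example (illustrative file names): "notes.pdf" resolves to PDFReader via
    # FILE_READER_CLS above, while an unmapped extension such as "notes.txt" falls
    # through to StringIterableReader backed by _read_all_text below.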

    # New helper called from _load_file_to_documents
    @staticmethod
    def _read_all_text(file_data: Path) -> str:
        try:
            # Read raw bytes first
            raw_bytes = file_data.read_bytes()

            # Use chardet to detect the encoding
            detected = chardet.detect(raw_bytes)
            encoding = detected["encoding"] or "utf-8"  # Fall back to utf-8 if detection fails
            confidence = detected.get("confidence", 0)
            logger.debug(
                f"Detected encoding {encoding} with confidence {confidence} for {file_data}"
            )

            # Try the detected encoding first
            try:
                text = raw_bytes.decode(encoding)
            except UnicodeDecodeError:
                # If the detected encoding fails, try common encodings
                for fallback_encoding in ["utf-8", "cp1252", "iso-8859-1", "latin1"]:
                    try:
                        text = raw_bytes.decode(fallback_encoding)
                        logger.debug(
                            f"Successfully decoded with fallback encoding: {fallback_encoding}"
                        )
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    # If all encodings fail, decode as utf-8 with the 'replace' error handler
                    text = raw_bytes.decode("utf-8", errors="replace")
                    logger.warning(f"Falling back to UTF-8 with replacement for {file_data}")

            # Clean up the text
            cleaned_text = (
                text.encode("utf-8", errors="replace")
                .decode("utf-8")
                .replace("\udc58", "")  # Remove specific problematic Unicode chars
                .replace("\x00", "")  # Remove null bytes
                .replace("\ufffd", "")  # Remove the replacement character
                .strip()  # Remove leading/trailing whitespace
            )

            if not cleaned_text:
                logger.warning(f"Cleaned text is empty for {file_data}")
            return cleaned_text
        except Exception as e:
            logger.error(f"Error processing file {file_data}: {str(e)}")
            raise
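
    # Example of the decode cascade above (illustrative values): bytes such as
    # b"caf\xe9" (cp1252/latin-1 for "café") are not valid UTF-8; the detection and
    # fallback steps decode them to "café" instead of raising UnicodeDecodeError
    # for the whole file.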

    @staticmethod
    def _exclude_metadata(documents: list[Document]) -> None:
        logger.debug("Excluding metadata from count=%s documents", len(documents))
        for document in documents:
            document.metadata["doc_id"] = document.doc_id
            # We don't want the Embeddings search to receive this metadata
            document.excluded_embed_metadata_keys = ["doc_id"]
            # We don't want the LLM to receive these metadata in the context
            document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
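

# Minimal manual check of the ingestion flow; a sketch, not upstream code. It
# assumes the optional reader dependencies are installed and that the path given
# on the command line (or the hypothetical "sample.txt") exists.
if __name__ == "__main__":
    import sys

    sample = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("sample.txt")
    docs = IngestionHelper.transform_file_into_documents(sample.name, sample)
    for doc in docs:
        # doc_id is copied into metadata by _exclude_metadata above
        print(doc.doc_id, doc.metadata.get("file_name"), len(doc.get_content()))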