From 6af9fb8b42ca9067d5eb72edc76e4acabe2498f5 Mon Sep 17 00:00:00 2001 From: Robert Hirsch Date: Thu, 6 Jun 2024 21:00:46 +0200 Subject: [PATCH] While ingesting, some files led to a crash due to an encoding error. Even with UTF-8 files, some characters still caused decoding to fail. --- private_gpt/components/ingest/ingest_helper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py index a110907..a3dbecf 100644 --- a/private_gpt/components/ingest/ingest_helper.py +++ b/private_gpt/components/ingest/ingest_helper.py @@ -89,10 +89,16 @@ class IngestionHelper: ) # Read as a plain text string_reader = StringIterableReader() - return string_reader.load_data([file_data.read_text()]) + return string_reader.load_data([file_data.read_text(errors='replace')]) logger.debug("Specific reader found for extension=%s", extension) - return reader_cls().load_data(file_data) + try: + res = reader_cls().load_data(file_data) + except: + string_reader = StringIterableReader() + res = string_reader.load_data([file_data.read_text(errors='replace')]) + pass + return res @staticmethod def _exclude_metadata(documents: list[Document]) -> None: