While ingesting, some files caused a crash due to an encoding error. Even with UTF-8, some characters still caused decoding to fail.

This commit is contained in:
Robert Hirsch 2024-06-06 21:00:46 +02:00
parent 757a8c79fd
commit 6af9fb8b42
No known key found for this signature in database
GPG key ID: A9D9D1205DBED12C

View file

@ -89,10 +89,16 @@ class IngestionHelper:
)
# Read as a plain text
string_reader = StringIterableReader()
return string_reader.load_data([file_data.read_text()])
return string_reader.load_data([file_data.read_text(errors='replace')])
logger.debug("Specific reader found for extension=%s", extension)
return reader_cls().load_data(file_data)
try:
res = reader_cls().load_data(file_data)
except:
string_reader = StringIterableReader()
res = string_reader.load_data([file_data.read_text(errors='replace')])
pass
return res
@staticmethod
def _exclude_metadata(documents: list[Document]) -> None: