Added pymupdf4llm for pdf parsing to markdown

2025-12-22 23:22:57 +01:00 · 2024-06-11 16:08:16 +05:45 · 2024-06-11 16:08:16 +05:45 · b3df8cae65
commit b3df8cae65
parent 3ba585ffc0
9 changed files with 126 additions and 20 deletions
--- a/private_gpt/components/ingest/ingest_helper.py
+++ b/private_gpt/components/ingest/ingest_helper.py
@@ -17,6 +17,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
            HWPReader,
            PDFReader,
        )
+        from pymupdf4llm import LlamaMarkdownReader  # type: ignore
        from llama_index.readers.file.epub import EpubReader  # type: ignore
        from llama_index.readers.file.image import ImageReader  # type: ignore
        from llama_index.readers.file.ipynb import IPYNBReader  # type: ignore
@@ -32,7 +33,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:

    default_file_reader_cls: dict[str, type[BaseReader]] = {
        ".hwp": HWPReader,
-        ".pdf": PDFReader,
+        ".pdf": LlamaMarkdownReader,
        ".docx": DocxReader,
        ".pptx": PptxReader,
        ".ppt": PptxReader,
@@ -102,4 +103,4 @@ class IngestionHelper:
            # We don't want the Embeddings search to receive this metadata
            document.excluded_embed_metadata_keys = ["doc_id"]
            # We don't want the LLM to receive these metadata in the context
-            document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
+            document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page"]