Added pymupdf4llm for pdf parsing to markdown

This commit is contained in:
Saurab-Shrestha 2024-06-11 16:08:16 +05:45
parent 3ba585ffc0
commit b3df8cae65
9 changed files with 126 additions and 20 deletions

View file

@ -17,6 +17,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
HWPReader,
PDFReader,
)
+ from pymupdf4llm import LlamaMarkdownReader # type: ignore
from llama_index.readers.file.epub import EpubReader # type: ignore
from llama_index.readers.file.image import ImageReader # type: ignore
from llama_index.readers.file.ipynb import IPYNBReader # type: ignore
@ -32,7 +33,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
default_file_reader_cls: dict[str, type[BaseReader]] = {
".hwp": HWPReader,
- ".pdf": PDFReader,
+ ".pdf": LlamaMarkdownReader,
".docx": DocxReader,
".pptx": PptxReader,
".ppt": PptxReader,
@ -102,4 +103,4 @@ class IngestionHelper:
# We don't want the Embeddings search to receive this metadata
document.excluded_embed_metadata_keys = ["doc_id"]
# We don't want the LLM to receive these metadata in the context
- document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
+ document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page"]