mirror of
https://github.com/zylon-ai/private-gpt.git
synced 2025-12-22 23:22:57 +01:00
Added pymupdf4llm for pdf parsing to markdown
This commit is contained in:
parent
3ba585ffc0
commit
b3df8cae65
9 changed files with 126 additions and 20 deletions
|
|
@@ -17,6 +17,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
|
|||
HWPReader,
|
||||
PDFReader,
|
||||
)
|
||||
+ from pymupdf4llm import LlamaMarkdownReader # type: ignore
|
||||
from llama_index.readers.file.epub import EpubReader # type: ignore
|
||||
from llama_index.readers.file.image import ImageReader # type: ignore
|
||||
from llama_index.readers.file.ipynb import IPYNBReader # type: ignore
|
||||
|
|
@@ -32,7 +33,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
|
|||
|
||||
default_file_reader_cls: dict[str, type[BaseReader]] = {
|
||||
".hwp": HWPReader,
|
||||
- ".pdf": PDFReader,
|
||||
+ ".pdf": LlamaMarkdownReader,
|
||||
".docx": DocxReader,
|
||||
".pptx": PptxReader,
|
||||
".ppt": PptxReader,
|
||||
|
|
@@ -102,4 +103,4 @@ class IngestionHelper:
|
|||
# We don't want the Embeddings search to receive this metadata
|
||||
document.excluded_embed_metadata_keys = ["doc_id"]
|
||||
# We don't want the LLM to receive these metadata in the context
|
||||
- document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
|
||||
+ document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue