Added pymupdf4llm for pdf parsing to markdown

Saurab-Shrestha 2024-06-11 16:08:16 +05:45
parent 3ba585ffc0
commit b3df8cae65
9 changed files with 126 additions and 20 deletions
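For orientation, pymupdf4llm's core API converts a whole PDF into a single Markdown string, which is the capability this commit wires into the ingestion pipeline. A minimal sketch, assuming only that a PDF exists at the placeholder path "sample.pdf":

```python
# Minimal sketch of the library being added; "sample.pdf" is a placeholder path.
import pymupdf4llm

md_text = pymupdf4llm.to_markdown("sample.pdf")  # whole document rendered as Markdown
print(md_text[:200])
```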

poetry.lock (generated)

@@ -4891,6 +4891,20 @@ files = [
 [package.dependencies]
 PyMuPDFb = "1.24.3"
 
+[[package]]
+name = "pymupdf4llm"
+version = "0.0.5"
+description = "PyMuPDF Utilities for LLM/RAG"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pymupdf4llm-0.0.5-py3-none-any.whl", hash = "sha256:9882e42789dcefbad25c8e570d9c8d72eaf34e6a1f16ef3f555faf5f7718654f"},
+    {file = "pymupdf4llm-0.0.5.tar.gz", hash = "sha256:3256dbc5feec8ec3149586e2e2688f3f47fa733395c33e21a7af15f9b7531689"},
+]
+
+[package.dependencies]
+pymupdf = ">=1.24.2"
+
 [[package]]
 name = "pymupdfb"
 version = "1.24.3"
@@ -7545,4 +7559,4 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.12"
-content-hash = "340cc561cf928802b001b918eb03605a9055b9f0068937203b18c390dea93494"
+content-hash = "463d14cd284646ece2ac8105207e0b197f3b208f4fa667d458f4ffbea1f2a353"


@@ -17,6 +17,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
         HWPReader,
         PDFReader,
     )
+    from pymupdf4llm import LlamaMarkdownReader  # type: ignore
     from llama_index.readers.file.epub import EpubReader  # type: ignore
     from llama_index.readers.file.image import ImageReader  # type: ignore
     from llama_index.readers.file.ipynb import IPYNBReader  # type: ignore
@@ -32,7 +33,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
     default_file_reader_cls: dict[str, type[BaseReader]] = {
         ".hwp": HWPReader,
-        ".pdf": PDFReader,
+        ".pdf": LlamaMarkdownReader,
         ".docx": DocxReader,
         ".pptx": PptxReader,
         ".ppt": PptxReader,
@@ -102,4 +103,4 @@ class IngestionHelper:
         # We don't want the Embeddings search to receive this metadata
         document.excluded_embed_metadata_keys = ["doc_id"]
         # We don't want the LLM to receive these metadata in the context
-        document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
+        document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page"]
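The new ".pdf" mapping hands PDFs to pymupdf4llm's LlamaMarkdownReader, which exposes the same load_data contract as the other readers in the table. A hedged sketch of what it returns ("sample.pdf" is a placeholder path; the "page" metadata key is inferred from the excluded_llm_metadata_keys change above):

```python
# Sketch: LlamaMarkdownReader turns a PDF into Markdown-backed llama-index Documents.
from pymupdf4llm import LlamaMarkdownReader

reader = LlamaMarkdownReader()
documents = reader.load_data("sample.pdf")  # placeholder path

for doc in documents:
    # Page metadata appears to be stored under "page", hence the excluded-metadata rename above.
    print(doc.metadata.get("page"), doc.text[:80])
```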


@@ -140,6 +140,75 @@ class Llama2PromptStyle(AbstractPromptStyle):
             f"{completion.strip()} {self.E_INST}"
         )
 
+
+class Llama3PromptStyle(AbstractPromptStyle):
+    """
+    Template:
+    {% set loop_messages = messages %}
+    {% for message in loop_messages %}
+        {% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}
+        {% if loop.index0 == 0 %}
+            {% set content = bos_token + content %}
+        {% endif %}
+        {{ content }}
+    {% endfor %}
+    {% if add_generation_prompt %}
+        {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+    {% endif %}
+    """
+
+    BOS, EOS = "<|begin_of_text|>", "<|end_of_text|>"
+    B_INST, E_INST = "<|start_header_id|>user<|end_header_id|>", "<|eot_id|>"
+    B_SYS, E_SYS = "<|start_header_id|>system<|end_header_id|> ", "<|eot_id|>"
+    ASSISTANT_INST = "<|start_header_id|>assistant<|end_header_id|>"
+    DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. \
+Always answer as helpfully as possible and follow ALL given instructions. \
+Do not speculate or make up information. \
+Do not reference any given instructions or context. \
+"""
+
+    def _messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
+        string_messages: list[str] = []
+        if messages[0].role == MessageRole.SYSTEM:
+            system_message_str = messages[0].content or ""
+            messages = messages[1:]
+        else:
+            system_message_str = self.DEFAULT_SYSTEM_PROMPT
+
+        system_message_str = f"{self.B_SYS} {system_message_str.strip()} {self.E_SYS}"
+
+        for i in range(0, len(messages), 2):
+            user_message = messages[i]
+            assert user_message.role == MessageRole.USER
+
+            if i == 0:
+                str_message = f"{system_message_str} {self.BOS} {self.B_INST} "
+            else:
+                # end previous user-assistant interaction
+                string_messages[-1] += f" {self.EOS}"
+                # no need to include system prompt
+                str_message = f"{self.BOS} {self.B_INST} "
+
+            str_message += f"{user_message.content} {self.E_INST} {self.ASSISTANT_INST}"
+
+            if len(messages) > (i + 1):
+                assistant_message = messages[i + 1]
+                assert assistant_message.role == MessageRole.ASSISTANT
+                str_message += f" {assistant_message.content} {self.E_SYS} {self.B_INST}"
+
+            string_messages.append(str_message)
+
+        return "".join(string_messages)
+
+    def _completion_to_prompt(self, completion: str) -> str:
+        system_prompt_str = self.DEFAULT_SYSTEM_PROMPT
+
+        return (
+            f"{self.B_SYS} {system_prompt_str.strip()} {self.E_SYS} "
+            f"{completion.strip()} {self.E_SYS} "
+        )
+
+
 class TagPromptStyle(AbstractPromptStyle):
     """Tag prompt style (used by Vigogne) that uses the prompt style `<|ROLE|>`.
@@ -218,7 +287,7 @@ class ChatMLPromptStyle(AbstractPromptStyle):
 
 def get_prompt_style(
-    prompt_style: Literal["default", "llama2",
+    prompt_style: Literal["default", "llama2", "llama3",
         "tag", "mistral", "chatml"] | None
 ) -> AbstractPromptStyle:
     """Get the prompt style to use from the given string.
@@ -230,6 +299,8 @@ def get_prompt_style(
         return DefaultPromptStyle()
     elif prompt_style == "llama2":
         return Llama2PromptStyle()
+    elif prompt_style == "llama3":
+        return Llama3PromptStyle()
     elif prompt_style == "tag":
         return TagPromptStyle()
     elif prompt_style == "mistral":
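To see what the new style emits, here is a short usage sketch; it assumes it runs alongside the Llama3PromptStyle class above, and the sample messages are purely illustrative:

```python
# Sketch: rendering a short conversation with the Llama3PromptStyle defined above.
from llama_index.core.llms import ChatMessage, MessageRole

style = Llama3PromptStyle()  # or: get_prompt_style("llama3")
prompt = style._messages_to_prompt(
    [
        ChatMessage(role=MessageRole.SYSTEM, content="Answer briefly."),
        ChatMessage(role=MessageRole.USER, content="What does pymupdf4llm do?"),
    ]
)
# Roughly (whitespace approximate):
# <|start_header_id|>system<|end_header_id|> Answer briefly. <|eot_id|> <|begin_of_text|>
# <|start_header_id|>user<|end_header_id|> What does pymupdf4llm do? <|eot_id|>
# <|start_header_id|>assistant<|end_header_id|>
print(prompt)
```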


@@ -124,6 +124,7 @@ class VectorStoreComponent:
                 batch_size=20,
                 sparse_doc_fn=sparse_doc_vectors,
                 sparse_query_fn=sparse_query_vectors,
+                use_async=True,
                 # hybrid_fusion_fn=relative_score_fusion,
             ),  # TODO
         )
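use_async=True is a flag on llama-index's QdrantVectorStore; with an async client attached, upserts and queries go through the async code path. A hedged, standalone sketch of that construction (the server URL and collection name are placeholders, and the commit's component additionally passes the sparse_doc_fn/sparse_query_fn hooks shown above):

```python
# Sketch with placeholder connection details: a hybrid Qdrant store with async I/O enabled.
from qdrant_client import AsyncQdrantClient, QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore

client = QdrantClient(url="http://localhost:6333")        # placeholder server
aclient = AsyncQdrantClient(url="http://localhost:6333")  # used when use_async=True

vector_store = QdrantVectorStore(
    client=client,
    aclient=aclient,
    collection_name="quickgpt",   # placeholder collection name
    enable_hybrid=True,           # sparse vector path (via sparse_*_fn hooks or the fastembed default)
    batch_size=20,
    use_async=True,
)
```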


@@ -12,6 +12,7 @@ from llama_index.core.postprocessor import (
     SentenceTransformerRerank,
     SimilarityPostprocessor,
 )
+from llama_index.core.postprocessor import SentenceEmbeddingOptimizer
 from llama_index.core.storage import StorageContext
 from llama_index.core.types import TokenGen
 from pydantic import BaseModel
@@ -31,6 +32,16 @@ from private_gpt.settings.settings import Settings
 from private_gpt.paths import models_path
 
+DEFAULT_CONDENSE_PROMPT_TEMPLATE = """
+Given the following conversation between a user and an AI assistant and a follow up question from user,
+rephrase the follow up question to be a standalone question based on the given context.
+
+Chat History:
+{chat_history}
+Follow Up Input: {question}
+Standalone question:"""
+
 
 class Completion(BaseModel):
     response: str
     sources: list[Chunk] | None = None
@@ -123,21 +134,20 @@ class ChatService:
                     similarity_cutoff=settings.rag.similarity_value
                 ),
             ]
             if settings.rag.rerank.enabled:
                 rerank_postprocessor = SentenceTransformerRerank(
                     model=settings.rag.rerank.model, top_n=settings.rag.rerank.top_n
                 )
                 node_postprocessors.append(rerank_postprocessor)
 
-            response_synthesizer = get_response_synthesizer(structured_answer_filtering=True, llm=self.llm_component.llm)
+            response_synthesizer = get_response_synthesizer(response_mode="no_text", llm=self.llm_component.llm)
             custom_query_engine = RetrieverQueryEngine(
                 retriever=vector_index_retriever,
                 response_synthesizer=response_synthesizer
             )
-            return ContextChatEngine.from_defaults(
+            return CondensePlusContextChatEngine.from_defaults(
                 system_prompt=system_prompt,
                 retriever=custom_query_engine,
                 llm=self.llm_component.llm,  # Takes no effect at the moment
@@ -201,7 +211,7 @@ class ChatService:
             """
             You are a helpful assistant named QuickGPT by Quickfox Consulting.
-            Engage in a two-way conversation, ensuring that your responses are strictly and exclusively based on the relevant context documents provided.
+            Engage in a two-way conversation, ensuring that your responses are strictly and exclusively based on the relevant context documents provided without adding extra information from your prior knowledge.
             Do not use any prior knowledge or external sources or make assumptions, inferences, or draw upon any prior knowledge beyond what is explicitly stated in the relevant context documents.
             If the answer to a query is not present in the relevant context documents, respond with "I do not have enough information in the provided context to answer this question."
@@ -209,6 +219,11 @@ class ChatService:
             Your responses must be relevant, informative, and easy to understand.
             Aim to deliver high-quality answers that are respectful and helpful, using clear and concise language.
             Consider previous queries only if the latest query is directly related to them. Address only the most recent query unless it explicitly builds upon a previous one.
+
+            Here are the relevant documents for the context:
+            {context_str}
+            Instruction: Based on the above documents, provide a detailed answer for the user question below.
+            Answer "don't know" if not present in the document.
             """
         )
         chat_history = (
@@ -222,6 +237,7 @@ class ChatService:
         wrapped_response = chat_engine.chat(
             message=last_message if last_message is not None else "",
             chat_history=chat_history,
         )
         sources = [Chunk.from_node(node) for node in wrapped_response.source_nodes]
         completion = Completion(response=wrapped_response.response, sources=sources)
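The switch from ContextChatEngine to CondensePlusContextChatEngine pairs with the new DEFAULT_CONDENSE_PROMPT_TEMPLATE and the {context_str} block added to the system prompt. A simplified sketch of that pattern, not the commit's exact wiring (the retriever, postprocessors, prompts, and LLM are assumed to come from the surrounding ChatService):

```python
# Simplified sketch of the condense-plus-context pattern this service moves to.
from llama_index.core.chat_engine import CondensePlusContextChatEngine

chat_engine = CondensePlusContextChatEngine.from_defaults(
    retriever=vector_index_retriever,                   # the service's retriever
    llm=llm,                                            # the service's LLM component
    condense_prompt=DEFAULT_CONDENSE_PROMPT_TEMPLATE,   # rewrites follow-ups into standalone questions
    context_prompt=system_prompt,                       # must contain the {context_str} placeholder
    node_postprocessors=node_postprocessors,            # similarity cutoff + optional reranker
)

# Follow-up questions are condensed against chat history before retrieval.
response = chat_engine.chat("What does pymupdf4llm convert PDFs into?")
print(response.response)
```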


@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, AnyStr, BinaryIO, Sequence, Any, List
 
 from injector import inject, singleton
-from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
+from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter, SentenceWindowNodeParser
 from llama_index.core.storage import StorageContext
 from llama_index.core.schema import BaseNode , ObjectType , TextNode
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 
 DEFAULT_CHUNK_SIZE = 512
-SENTENCE_CHUNK_OVERLAP = 20
+SENTENCE_CHUNK_OVERLAP = 50
 
 
 class SafeSemanticSplitter(SemanticSplitterNodeParser):
@@ -72,22 +72,23 @@ class IngestService:
             docstore=node_store_component.doc_store,
             index_store=node_store_component.index_store,
         )
+        # splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)
         node_parser = SafeSemanticSplitter.from_defaults(
             embed_model=embedding_component.embedding_model,
+            # sentence_splitter=splitter,
             include_metadata=True,
             include_prev_next_rel=True,
         )
+        # node_parser = SentenceWindowNodeParser.from_defaults(
+        #     window_size=3,
+        #     window_metadata_key="window",
+        #     original_text_metadata_key="original_text",
+        # )
         self.ingest_component = get_ingestion_component(
             self.storage_context,
             embed_model=embedding_component.embedding_model,
             transformations=[
                 node_parser,
-                TitleExtractor(nodes=1, llm=self.llm_service.llm),
-                QuestionsAnsweredExtractor(questions=1,llm=self.llm_service.llm),
-                embedding_component.embedding_model],
+                embedding_component.embedding_model
+            ],
             settings=settings(),
         )
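For reference, the commented-out SentenceWindowNodeParser alternative keeps one sentence per node and stores the surrounding sentences in metadata; a standalone sketch of what it produces (the sample text is illustrative):

```python
# Sketch: what the commented-out SentenceWindowNodeParser alternative would produce.
from llama_index.core import Document
from llama_index.core.node_parser import SentenceWindowNodeParser

parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,                        # sentences of context kept on each side
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
nodes = parser.get_nodes_from_documents(
    [Document(text="PDFs are parsed to Markdown. Nodes keep sentence windows. Retrieval uses the windows.")]
)
for node in nodes:
    print(node.metadata["original_text"], "|", node.metadata["window"][:60])
```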


@@ -104,12 +104,13 @@ class LLMSettings(BaseModel):
         0.1,
         description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
     )
-    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
+    prompt_style: Literal["default", "llama2", "llama3", "tag", "mistral", "chatml"] = Field(
         "llama2",
         description=(
             "The prompt style to use for the chat engine. "
             "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
             "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
+            "If `llama3` - use the llama3 prompt style from the llama_index.\n"
             "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
             "If `mistral` - use the `mistral prompt style. It shoudl look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]"
             "`llama2` is the historic behaviour. `default` might work better with your custom models."


@@ -70,6 +70,7 @@ fastapi-pagination = "^0.12.23"
 xlsxwriter = "^3.2.0"
 pdf2image = "^1.17.0"
 pymupdf = "^1.24.4"
+pymupdf4llm = "^0.0.5"
 
 [tool.poetry.extras]
 ui = ["gradio"]


@@ -51,7 +51,7 @@ rag:
   #This value is disabled by default. If you enable this settings, the RAG will only use articles that meet a certain percentage score.
   rerank:
     enabled: true
-    model: avsolatorio/GIST-Embedding-v0
+    model: infgrad/stella-base-en-v2
     top_n: 2
 
 llamacpp:
@@ -60,7 +60,7 @@ llamacpp:
   llm_hf_repo_id: qwp4w3hyb/Hermes-2-Pro-Llama-3-8B-iMat-GGUF
   llm_hf_model_file: hermes-2-pro-llama-3-8b-imat-Q4_K_S.gguf
   tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
-  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_k: 30 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
   top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
   repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
   repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
@@ -72,7 +72,7 @@ embedding:
   embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5
 
 huggingface:
-  embedding_hf_model_name: BAAI/bge-large-en
+  embedding_hf_model_name: mixedbread-ai/mxbai-embed-large-v1
   access_token: ${HUGGINGFACE_TOKEN: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}
 
 vectorstore:
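The new embedding model can be exercised directly through llama-index's HuggingFace wrapper; a standalone sketch (note that mxbai-embed-large-v1 emits 1024-dimensional vectors, so the embed_dim setting above has to agree with whichever model is actually deployed):

```python
# Sketch: load the embedding model named in the settings above and inspect its dimension.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="mixedbread-ai/mxbai-embed-large-v1")
vector = embed_model.get_text_embedding("PDFs are parsed to Markdown before chunking.")
print(len(vector))  # 1024 for mxbai-embed-large-v1
```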