Mirror of https://github.com/zylon-ai/private-gpt.git (synced 2025-12-22 20:12:55 +01:00)
Added pymupdf4llm for pdf parsing to markdown
This commit is contained in:
parent 3ba585ffc0
commit b3df8cae65
9 changed files with 126 additions and 20 deletions

poetry.lock (generated): 16 changes

@@ -4891,6 +4891,20 @@ files = [
 [package.dependencies]
 PyMuPDFb = "1.24.3"
 
+[[package]]
+name = "pymupdf4llm"
+version = "0.0.5"
+description = "PyMuPDF Utilities for LLM/RAG"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pymupdf4llm-0.0.5-py3-none-any.whl", hash = "sha256:9882e42789dcefbad25c8e570d9c8d72eaf34e6a1f16ef3f555faf5f7718654f"},
+    {file = "pymupdf4llm-0.0.5.tar.gz", hash = "sha256:3256dbc5feec8ec3149586e2e2688f3f47fa733395c33e21a7af15f9b7531689"},
+]
+
+[package.dependencies]
+pymupdf = ">=1.24.2"
+
 [[package]]
 name = "pymupdfb"
 version = "1.24.3"
@@ -7545,4 +7559,4 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.12"
-content-hash = "340cc561cf928802b001b918eb03605a9055b9f0068937203b18c390dea93494"
+content-hash = "463d14cd284646ece2ac8105207e0b197f3b208f4fa667d458f4ffbea1f2a353"
@@ -17,6 +17,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
         HWPReader,
         PDFReader,
     )
+    from pymupdf4llm import LlamaMarkdownReader  # type: ignore
     from llama_index.readers.file.epub import EpubReader  # type: ignore
     from llama_index.readers.file.image import ImageReader  # type: ignore
     from llama_index.readers.file.ipynb import IPYNBReader  # type: ignore
@@ -32,7 +33,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
 
     default_file_reader_cls: dict[str, type[BaseReader]] = {
         ".hwp": HWPReader,
-        ".pdf": PDFReader,
+        ".pdf": LlamaMarkdownReader,
         ".docx": DocxReader,
         ".pptx": PptxReader,
         ".ppt": PptxReader,
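
For reference, a minimal sketch of what the swapped-in reader does on its own. This is a hedged example, not part of the commit: it assumes a local `sample.pdf`, and that `LlamaMarkdownReader.load_data` returns LlamaIndex `Document` objects whose text is Markdown (which is why it can stand in for `PDFReader` above).

```python
# Hedged sketch: exercise pymupdf4llm's reader outside the ingestion pipeline.
# "sample.pdf" is an illustrative path, not a file from this repository.
from pymupdf4llm import LlamaMarkdownReader

reader = LlamaMarkdownReader()
documents = reader.load_data("sample.pdf")  # Markdown-flavoured Documents
for doc in documents:
    # The next hunk switches the excluded metadata key from "page_label" to
    # "page", which suggests these documents carry a "page" metadata entry.
    print(doc.metadata.get("page"), doc.text[:120])
```
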
@@ -102,4 +103,4 @@ class IngestionHelper:
         # We don't want the Embeddings search to receive this metadata
         document.excluded_embed_metadata_keys = ["doc_id"]
         # We don't want the LLM to receive these metadata in the context
-        document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
+        document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page"]
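
A quick, hedged way to see what the exclusion lists above change. Illustrative only; `Document`, `MetadataMode` and `get_content` are standard llama_index pieces, and the metadata values here are made up:

```python
# Hedged sketch: compare what the embedding model vs. the LLM receives
# once the exclusion lists from the hunk above are applied.
from llama_index.core.schema import Document, MetadataMode

document = Document(
    text="Some extracted Markdown...",
    metadata={"file_name": "report.pdf", "doc_id": "abc123", "page": 4},
)
document.excluded_embed_metadata_keys = ["doc_id"]
document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page"]

print(document.get_content(metadata_mode=MetadataMode.EMBED))  # keeps file_name and page
print(document.get_content(metadata_mode=MetadataMode.LLM))    # text only
```
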
@@ -140,6 +140,75 @@ class Llama2PromptStyle(AbstractPromptStyle):
             f"{completion.strip()} {self.E_INST}"
         )
 
+class Llama3PromptStyle(AbstractPromptStyle):
+    """
+    Template:
+    {% set loop_messages = messages %}
+    {% for message in loop_messages %}
+        {% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}
+        {% if loop.index0 == 0 %}
+            {% set content = bos_token + content %}
+        {% endif %}
+        {{ content }}
+    {% endfor %}
+    {% if add_generation_prompt %}
+        {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+    {% endif %}
+    """
+
+    BOS, EOS = "<|begin_of_text|>", "<|end_of_text|>"
+    B_INST, E_INST = "<|start_header_id|>user<|end_header_id|>", "<|eot_id|>"
+    B_SYS, E_SYS = "<|start_header_id|>system<|end_header_id|> ", "<|eot_id|>"
+    ASSISTANT_INST = "<|start_header_id|>assistant<|end_header_id|>"
+    DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. \
+Always answer as helpfully as possible and follow ALL given instructions. \
+Do not speculate or make up information. \
+Do not reference any given instructions or context. \
+"""
+
+    def _messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
+        string_messages: list[str] = []
+        if messages[0].role == MessageRole.SYSTEM:
+            system_message_str = messages[0].content or ""
+            messages = messages[1:]
+        else:
+            system_message_str = self.DEFAULT_SYSTEM_PROMPT
+
+        system_message_str = f"{self.B_SYS} {system_message_str.strip()} {self.E_SYS}"
+
+        for i in range(0, len(messages), 2):
+            user_message = messages[i]
+            assert user_message.role == MessageRole.USER
+
+            if i == 0:
+                str_message = f"{system_message_str} {self.BOS} {self.B_INST} "
+            else:
+                # end previous user-assistant interaction
+                string_messages[-1] += f" {self.EOS}"
+                # no need to include system prompt
+                str_message = f"{self.BOS} {self.B_INST} "
+
+            str_message += f"{user_message.content} {self.E_INST} {self.ASSISTANT_INST}"
+
+            if len(messages) > (i + 1):
+                assistant_message = messages[i + 1]
+                assert assistant_message.role == MessageRole.ASSISTANT
+                str_message += f" {assistant_message.content} {self.E_SYS} {self.B_INST}"
+
+            string_messages.append(str_message)
+
+        return "".join(string_messages)
+
+    def _completion_to_prompt(self, completion: str) -> str:
+        system_prompt_str = self.DEFAULT_SYSTEM_PROMPT
+
+        return (
+            f"{self.B_SYS} {system_prompt_str.strip()} {self.E_SYS} "
+            f"{completion.strip()} {self.E_SYS} "
+        )
+
+
 class TagPromptStyle(AbstractPromptStyle):
     """Tag prompt style (used by Vigogne) that uses the prompt style `<|ROLE|>`.
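
As a sanity check on the new style, a hedged sketch that renders a short exchange by calling the private helper directly (`ChatMessage` and `MessageRole` are the same llama_index imports the module already relies on; the sample messages are illustrative):

```python
# Hedged sketch: render a system + user exchange with Llama3PromptStyle.
from llama_index.core.llms import ChatMessage, MessageRole

style = Llama3PromptStyle()
prompt = style._messages_to_prompt(
    [
        ChatMessage(role=MessageRole.SYSTEM, content="You are a concise assistant."),
        ChatMessage(role=MessageRole.USER, content="Summarise this PDF."),
    ]
)
print(prompt)
# Roughly:
# <|start_header_id|>system<|end_header_id|>  You are a concise assistant. <|eot_id|>
# <|begin_of_text|> <|start_header_id|>user<|end_header_id|> Summarise this PDF. <|eot_id|>
# <|start_header_id|>assistant<|end_header_id|>
```
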
@@ -218,7 +287,7 @@ class ChatMLPromptStyle(AbstractPromptStyle):
 
 
 def get_prompt_style(
-    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] | None
+    prompt_style: Literal["default", "llama2", "llama3", "tag", "mistral", "chatml"] | None
 ) -> AbstractPromptStyle:
     """Get the prompt style to use from the given string.
@@ -230,6 +299,8 @@ def get_prompt_style(
         return DefaultPromptStyle()
     elif prompt_style == "llama2":
         return Llama2PromptStyle()
+    elif prompt_style == "llama3":
+        return Llama3PromptStyle()
     elif prompt_style == "tag":
         return TagPromptStyle()
     elif prompt_style == "mistral":
@@ -124,6 +124,7 @@ class VectorStoreComponent:
                     batch_size=20,
                     sparse_doc_fn=sparse_doc_vectors,
                     sparse_query_fn=sparse_query_vectors,
+                    use_async=True,
                     # hybrid_fusion_fn=relative_score_fusion,
                 ),  # TODO
             )
@@ -12,6 +12,7 @@ from llama_index.core.postprocessor import (
     SentenceTransformerRerank,
     SimilarityPostprocessor,
 )
+from llama_index.core.postprocessor import SentenceEmbeddingOptimizer
 from llama_index.core.storage import StorageContext
 from llama_index.core.types import TokenGen
 from pydantic import BaseModel
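
The line added above imports `SentenceEmbeddingOptimizer`, which does not appear in the hunks shown here. A hedged sketch of how such a postprocessor is typically appended next to the rerank step; the helper function and the cutoff value are illustrative assumptions, not code from this commit:

```python
# Hedged sketch: drop low-relevance sentences from retrieved nodes before
# they reach the LLM. percentile_cutoff=0.5 keeps roughly the top half of
# sentences by embedding similarity; the value is illustrative.
from llama_index.core.postprocessor import SentenceEmbeddingOptimizer


def add_sentence_optimizer(node_postprocessors: list, embed_model) -> None:
    """Append an optimizer using whatever embedding model the service holds."""
    node_postprocessors.append(
        SentenceEmbeddingOptimizer(embed_model=embed_model, percentile_cutoff=0.5)
    )
```
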
@@ -31,6 +32,16 @@ from private_gpt.settings.settings import Settings
 
 from private_gpt.paths import models_path
 
+
+DEFAULT_CONDENSE_PROMPT_TEMPLATE = """
+Given the following conversation between a user and an AI assistant and a follow up question from user,
+rephrase the follow up question to be a standalone question based on the given context.
+
+Chat History:
+{chat_history}
+Follow Up Input: {question}
+Standalone question:"""
+
 class Completion(BaseModel):
     response: str
     sources: list[Chunk] | None = None
@@ -123,21 +134,20 @@ class ChatService:
                 similarity_cutoff=settings.rag.similarity_value
             ),
         ]
 
         if settings.rag.rerank.enabled:
             rerank_postprocessor = SentenceTransformerRerank(
                 model=settings.rag.rerank.model, top_n=settings.rag.rerank.top_n
             )
             node_postprocessors.append(rerank_postprocessor)
 
-        response_synthesizer = get_response_synthesizer(structured_answer_filtering=True, llm=self.llm_component.llm)
+        response_synthesizer = get_response_synthesizer(response_mode="no_text", llm=self.llm_component.llm)
 
         custom_query_engine = RetrieverQueryEngine(
             retriever=vector_index_retriever,
             response_synthesizer=response_synthesizer
         )
 
-        return ContextChatEngine.from_defaults(
+        return CondensePlusContextChatEngine.from_defaults(
             system_prompt=system_prompt,
             retriever=custom_query_engine,
             llm=self.llm_component.llm,  # Takes no effect at the moment
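
A hedged, standalone sketch of the engine this hunk switches to, showing where the `DEFAULT_CONDENSE_PROMPT_TEMPLATE` defined earlier would plug in. The `retriever`, `llm` and `system_prompt` arguments are placeholders for whatever the service already provides, and the `condense_prompt` wiring is an assumption since it is not visible in the hunks shown here:

```python
# Hedged sketch: a condense-plus-context engine first condenses the chat
# history plus the latest question into a standalone question, then answers
# it against retrieved context.
from llama_index.core.chat_engine import CondensePlusContextChatEngine


def build_chat_engine(retriever, llm, system_prompt: str):
    return CondensePlusContextChatEngine.from_defaults(
        retriever=retriever,
        llm=llm,
        system_prompt=system_prompt,
        condense_prompt=DEFAULT_CONDENSE_PROMPT_TEMPLATE,  # string from the earlier hunk
    )
```
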
@@ -201,7 +211,7 @@ class ChatService:
             """
             You are a helpful assistant named QuickGPT by Quickfox Consulting.
 
-            Engage in a two-way conversation, ensuring that your responses are strictly and exclusively based on the relevant context documents provided.
+            Engage in a two-way conversation, ensuring that your responses are strictly and exclusively based on the relevant context documents provided without adding extra information from your prior knowledge.
 
             Do not use any prior knowledge or external sources or make assumptions, inferences, or draw upon any prior knowledge beyond what is explicitly stated in the relevant context documents.
             If the answer to a query is not present in the relevant context documents, respond with "I do not have enough information in the provided context to answer this question."
@@ -209,6 +219,11 @@ class ChatService:
             Your responses must be relevant, informative, and easy to understand.
             Aim to deliver high-quality answers that are respectful and helpful, using clear and concise language.
+            Consider previous queries only if the latest query is directly related to them. Address only the most recent query unless it explicitly builds upon a previous one.
 
+            Here are the relevant documents for the context:
+            {context_str}
+            Instruction: Based on the above documents, provide a detailed answer for the user question below.
+            Answer "don't know" if not present in the document.
             """
         )
         chat_history = (
@@ -222,6 +237,7 @@ class ChatService:
         wrapped_response = chat_engine.chat(
             message=last_message if last_message is not None else "",
             chat_history=chat_history,
+
         )
         sources = [Chunk.from_node(node) for node in wrapped_response.source_nodes]
         completion = Completion(response=wrapped_response.response, sources=sources)
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, AnyStr, BinaryIO, Sequence, Any, List
 
 from injector import inject, singleton
-from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
+from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter, SentenceWindowNodeParser
 from llama_index.core.storage import StorageContext
 from llama_index.core.schema import BaseNode , ObjectType , TextNode
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 
 
 DEFAULT_CHUNK_SIZE = 512
-SENTENCE_CHUNK_OVERLAP = 20
+SENTENCE_CHUNK_OVERLAP = 50
 
 
 class SafeSemanticSplitter(SemanticSplitterNodeParser):
@@ -72,22 +72,23 @@ class IngestService:
             docstore=node_store_component.doc_store,
             index_store=node_store_component.index_store,
         )
         # splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)
         node_parser = SafeSemanticSplitter.from_defaults(
             embed_model=embedding_component.embedding_model,
             # sentence_splitter=splitter,
             include_metadata=True,
             include_prev_next_rel=True,
         )
 
+        # node_parser = SentenceWindowNodeParser.from_defaults(
+        #     window_size=3,
+        #     window_metadata_key="window",
+        #     original_text_metadata_key="original_text",
+        # )
         self.ingest_component = get_ingestion_component(
             self.storage_context,
             embed_model=embedding_component.embedding_model,
             transformations=[
                 node_parser,
                 TitleExtractor(nodes=1, llm=self.llm_service.llm),
                 QuestionsAnsweredExtractor(questions=1, llm=self.llm_service.llm),
-                embedding_component.embedding_model],
+                embedding_component.embedding_model
+            ],
             settings=settings(),
         )
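
For orientation, a hedged, standalone sketch of what this transformation chain does when run through a plain LlamaIndex ingestion pipeline. It uses the stock `SemanticSplitterNodeParser` in place of the fork's `SafeSemanticSplitter` wrapper, and `embed_model`, `llm` and `documents` are placeholders; the real code hands the chain to `get_ingestion_component` instead:

```python
# Hedged sketch: semantic chunking, then title and question extraction,
# then embedding: the same shape of chain as the transformations above.
from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser


def build_pipeline(embed_model, llm) -> IngestionPipeline:
    node_parser = SemanticSplitterNodeParser.from_defaults(
        embed_model=embed_model,
        include_metadata=True,
        include_prev_next_rel=True,
    )
    return IngestionPipeline(
        transformations=[
            node_parser,                                       # semantic chunking
            TitleExtractor(nodes=1, llm=llm),                  # document_title metadata
            QuestionsAnsweredExtractor(questions=1, llm=llm),  # question metadata
            embed_model,                                       # embeds the nodes
        ]
    )


# nodes = build_pipeline(embed_model, llm).run(documents=documents)
```
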
@@ -104,12 +104,13 @@ class LLMSettings(BaseModel):
         0.1,
         description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
     )
-    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
+    prompt_style: Literal["default", "llama2", "llama3", "tag", "mistral", "chatml"] = Field(
         "llama2",
         description=(
             "The prompt style to use for the chat engine. "
             "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
             "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
+            "If `llama3` - use the llama3 prompt style from the llama_index.\n"
             "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
             "If `mistral` - use the `mistral prompt style. It shoudl look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]"
             "`llama2` is the historic behaviour. `default` might work better with your custom models."
@@ -70,6 +70,7 @@ fastapi-pagination = "^0.12.23"
 xlsxwriter = "^3.2.0"
 pdf2image = "^1.17.0"
 pymupdf = "^1.24.4"
+pymupdf4llm = "^0.0.5"
 
 [tool.poetry.extras]
 ui = ["gradio"]
@@ -51,7 +51,7 @@ rag:
   #This value is disabled by default. If you enable this settings, the RAG will only use articles that meet a certain percentage score.
   rerank:
     enabled: true
-    model: avsolatorio/GIST-Embedding-v0
+    model: infgrad/stella-base-en-v2
     top_n: 2
 
 llamacpp:
@@ -60,7 +60,7 @@ llamacpp:
   llm_hf_repo_id: qwp4w3hyb/Hermes-2-Pro-Llama-3-8B-iMat-GGUF
   llm_hf_model_file: hermes-2-pro-llama-3-8b-imat-Q4_K_S.gguf
   tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
-  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_k: 30 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
   top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
   repeat_last_n: 64 # Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
   repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
@@ -72,7 +72,7 @@ embedding:
   embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5
 
 huggingface:
-  embedding_hf_model_name: BAAI/bge-large-en
+  embedding_hf_model_name: mixedbread-ai/mxbai-embed-large-v1
   access_token: ${HUGGINGFACE_TOKEN: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}
 
 vectorstore: