This blog post demonstrates how to build a Retrieval-Augmented Generation (RAG) pipeline for PDF parsing, using Qdrant as the vector database and ChatOllama with the deepseek-r1:8b model, all served via Streamlit. Start by installing the required packages:
pip install streamlit qdrant-client langchain_huggingface langchain_community langchain_ollama sentence-transformers
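Besides the Python packages, the app expects a Qdrant instance on localhost:6333 and an Ollama server with the model available. Assuming Docker and Ollama are already installed, the standard quickstart commands are:
# Start a local Qdrant instance (assumes Docker is installed)
docker run -p 6333:6333 qdrant/qdrant
# Pull the model for Ollama (Ollama serves on port 11434 by default)
ollama pull deepseek-r1:8b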
The following code imports the necessary modules:
import streamlit as st
import re
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableSequence
from langchain_ollama.chat_models import ChatOllama
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import StrOutputParser
from sentence_transformers import SentenceTransformer, util
Next, we initialize the Qdrant client (running on localhost) and configure the LLM using ChatOllama with the deepseek-r1:8b model; you can optionally swap in Llama3.2 or any other locally available model.
# Initialize Qdrant client
qdrant_client = QdrantClient(host="localhost", port=6333)
# Llama3.2 / Deepseek Model Configuration
llm = ChatOllama(
    # model="llama3.2-vision:11b",  # Uncomment to use Llama3.2
    model="deepseek-r1:8b",
    base_url="http://127.0.0.1:11434"  # Replace with appropriate server details
)
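Optionally, you can verify that the model is reachable before building the pipeline; ChatOllama's invoke returns a message whose content attribute holds the reply:
# Optional sanity check: confirm Ollama responds before wiring up the pipeline
print(llm.invoke("Reply with OK").content)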
This block defines helper functions to load a PDF, split it into chunks, clean the text, embed the chunks using Sentence Transformers, and store the vectors in a Qdrant collection.
def replace_t_with_space(documents):
    # Replace newline and tab characters with spaces in each document's text
    return [doc.page_content.replace("\n", " ").replace("\t", " ") for doc in documents]

def encode_pdf_to_qdrant(path, collection_name, chunk_size=1000, chunk_overlap=200):
    loader = PyPDFLoader(path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)
    cleaned_texts = replace_t_with_space(texts)
    document_objects = [Document(page_content=text) for text in cleaned_texts]
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectors = embeddings.embed_documents([doc.page_content for doc in document_objects])
    if qdrant_client.collection_exists(collection_name):
        qdrant_client.delete_collection(collection_name)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=len(vectors[0]), distance="Cosine")
    )
    points = [
        PointStruct(id=i, vector=vector, payload={"text": document_objects[i].page_content})
        for i, vector in enumerate(vectors)
    ]
    qdrant_client.upsert(collection_name=collection_name, points=points)
    st.write(f"PDF data from '{path}' stored in Qdrant under '{collection_name}'.")
The SimpleRAG class handles document retrieval from Qdrant (via the query_qdrant helper above) and uses a LangChain pipeline to generate answers based on the retrieved context.
class SimpleRAG:
    def __init__(self, llm, path, collection_name, chunk_size=1000, chunk_overlap=200, n_retrieved=2):
        self.path = path
        self.llm = llm
        self.collection_name = collection_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.n_retrieved = n_retrieved
        # Encode the PDF into a Qdrant collection
        encode_pdf_to_qdrant(path, collection_name, chunk_size, chunk_overlap)

    def retrieve_documents(self, query):
        docs = query_qdrant(self.collection_name, query, self.n_retrieved)
        return docs

    def generate_answer(self, query):
        documents = self.retrieve_documents(query)
        st.write("Retrieved documents:", documents)
        context = "\n\n".join(documents)
        prompt_template = """Answer the following question based on the provided context. If the answer is not in the context, say 'I don't know.'
Context:
{context}
Question: {question}
Answer:"""
        prompt = ChatPromptTemplate.from_template(prompt_template)
        chain = RunnableSequence(
            RunnablePassthrough(),
            prompt,
            self.llm,
            StrOutputParser()
        )
        answer = chain.invoke({"context": context, "question": query})
        return answer
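The RunnableSequence can equivalently be written with LangChain's pipe syntax: prompt | self.llm | StrOutputParser(). For a quick standalone check, the class can also be exercised outside Streamlit; this is a minimal sketch assuming a hypothetical local file named sample.pdf and both services running (outside a streamlit run session, the st.write calls fall back to console output):
# Minimal standalone sketch (hypothetical file name; assumes Qdrant and Ollama are running)
rag = SimpleRAG(llm=llm, path="sample.pdf", collection_name="demo_collection", n_retrieved=2)
print(rag.generate_answer("What is this document about?"))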
This section preprocesses text by removing punctuation and normalizing case, and defines an evaluation function that computes cosine similarity between the generated answer and a ground truth answer using Sentence Transformers.
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.strip()

def evaluate_answer(predicted, ground_truth):
    predicted = preprocess_text(predicted)
    ground_truth = preprocess_text(ground_truth)
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings_pred = model.encode(predicted, convert_to_tensor=True)
    embeddings_gt = model.encode(ground_truth, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings_pred, embeddings_gt).item()
    return similarity
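One caveat: deepseek-r1 is a reasoning model, and its raw output typically wraps a chain of thought in <think>...</think> tags, which can drag the similarity score down. A small optional helper (not part of the original app) strips them before scoring:
# Hypothetical helper: remove deepseek-r1's <think>...</think> reasoning before evaluation
def strip_think_tags(text):
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

# Example: semantically close answers score near 1.0 (exact value depends on the model)
predicted = strip_think_tags("<think>Scanning the context...</think>The key skills are account management and client relations.")
score = evaluate_answer(predicted, "Key skills include account management and client relations.")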
The following code snippet is the main part of the application that creates the interactive web interface for the RAG pipeline:
# Streamlit Interface
st.title("PDF Parsing RAG - using locally running LLM: deepseek-r1 and VectorDB: Qdrant")
st.write("This app demonstrates a Retrieval-Augmented Generation pipeline using Qdrant and ChatOllama.")
# PDF Upload Section
uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
if uploaded_pdf is not None:
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_pdf.read())
    pdf_path = "temp.pdf"
else:
    st.info("Please upload a PDF file to proceed.")
    st.stop()
# User Input Section
collection_name = st.text_input("Qdrant Collection Name", value="my_collection")
chunk_size = st.number_input("Chunk Size", value=1000, min_value=100)
chunk_overlap = st.number_input("Chunk Overlap", value=200, min_value=0)
n_retrieved = st.number_input("Number of documents (or text chunks) to retrieve", value=2, min_value=1)
query = st.text_input("Enter your query")
# Evaluation Section: Checkbox and Ground Truth Text Area
evaluate_flag = st.checkbox("Evaluate the answer using [sentence-transformers/all-MiniLM-L6-v2]", value=False)
if evaluate_flag:
    default_ground_truth = ("The key skills of the candidate are Account Management, Client Relations, Project Management, Negotiation and Communication, Technical Proficiency, Leadership and Recognition etc.")
    true_answer = st.text_area("Enter the ground truth answer for evaluation", value=default_ground_truth)
else:
    true_answer = ""
# Run RAG on button click
if st.button("Run RAG"):
    if query.strip() == "":
        st.error("Please enter a query.")
    else:
        rag = SimpleRAG(
            llm=llm,
            path=pdf_path,
            collection_name=collection_name,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            n_retrieved=n_retrieved
        )
        answer = rag.generate_answer(query)
        st.write("### Answer:")
        st.write(answer)
        if evaluate_flag and true_answer.strip() != "":
            score = evaluate_answer(answer, true_answer)
            st.write("### Evaluation Score (Cosine Similarity):", score)
import streamlit as st
import re
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableSequence
from langchain_ollama.chat_models import ChatOllama
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import StrOutputParser
from sentence_transformers import SentenceTransformer, util
# -------------------------------
# Initialize Qdrant client
# -------------------------------
qdrant_client = QdrantClient(host="localhost", port=6333)
# -------------------------------
# Llama3.2 / DeepSeek Model Configuration
# -------------------------------
llm = ChatOllama(
    # model="llama3.2-vision:11b",
    model="deepseek-r1:8b",
    base_url="http://127.0.0.1:11434"  # Replace with the appropriate server details
)
# -------------------------------
# Prompt for MultiQueryRetriever (not used in this simplified example)
# -------------------------------
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI assistant. Generate five alternative versions of the question to improve document retrieval. Separate them with newlines.
Original question: {question}"""
)
# -------------------------------
# Helper Functions
# -------------------------------
def replace_t_with_space(documents):
    # Replace newline and tab characters with spaces in each document's text
    return [doc.page_content.replace("\n", " ").replace("\t", " ") for doc in documents]

def encode_pdf_to_qdrant(path, collection_name, chunk_size=1000, chunk_overlap=200):
    loader = PyPDFLoader(path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)
    cleaned_texts = replace_t_with_space(texts)
    document_objects = [Document(page_content=text) for text in cleaned_texts]
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectors = embeddings.embed_documents([doc.page_content for doc in document_objects])
    if qdrant_client.collection_exists(collection_name):
        qdrant_client.delete_collection(collection_name)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=len(vectors[0]), distance="Cosine")
    )
    points = [
        PointStruct(id=i, vector=vector, payload={"text": document_objects[i].page_content})
        for i, vector in enumerate(vectors)
    ]
    qdrant_client.upsert(collection_name=collection_name, points=points)
    # st.write(f"PDF data from '{path}' stored in Qdrant under '{collection_name}'.")

def query_qdrant(collection_name, query, n_retrieved=2):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    query_vector = embeddings.embed_query(query)
    results = qdrant_client.search(collection_name, query_vector=query_vector, limit=n_retrieved)
    retrieved_texts = [result.payload["text"] for result in results]
    return retrieved_texts
# -------------------------------
# SimpleRAG Class
# -------------------------------
class SimpleRAG:
    def __init__(self, llm, path, collection_name, chunk_size=1000, chunk_overlap=200, n_retrieved=2):
        self.path = path
        self.llm = llm
        self.collection_name = collection_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.n_retrieved = n_retrieved
        # Encode the PDF into a Qdrant collection
        encode_pdf_to_qdrant(path, collection_name, chunk_size, chunk_overlap)

    def retrieve_documents(self, query):
        """Retrieve documents from the Qdrant collection based on the query."""
        docs = query_qdrant(self.collection_name, query, self.n_retrieved)
        return docs

    def generate_answer(self, query):
        """Generate an answer to the query using retrieved documents and the LLM chain."""
        # Retrieve documents from Qdrant
        documents = self.retrieve_documents(query)
        st.write("Retrieved documents:", documents)
        # Combine documents into a single context string
        context = "\n\n".join(documents)
        # Define a prompt template for prompt composition
        prompt_template = """Answer the following question based on the provided context. If the answer is not in the context, say 'I don't know.'
Context:
{context}
Question: {question}
Answer:"""
        # Create a ChatPromptTemplate using the prompt template
        prompt = ChatPromptTemplate.from_template(prompt_template)
        # Create a LangChain pipeline for the RAG process
        chain = RunnableSequence(
            RunnablePassthrough(),  # Pass through the context and question
            prompt,                 # Use the ChatPromptTemplate
            self.llm,               # Pass the processed input to the language model
            StrOutputParser(),      # Parse the LLM's output as a string
        )
        # Invoke the chain with the query
        answer = chain.invoke({"context": context, "question": query})
        return answer
# -------------------------------
# Text Preprocessing and Evaluation Functions
# -------------------------------
def preprocess_text(text):
    """Preprocess text: remove punctuation, convert to lowercase, and strip whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.strip()

def evaluate_answer(predicted, ground_truth):
    """
    Evaluate the predicted answer against the ground truth using cosine similarity.
    Returns a similarity score between 0.0 and 1.0.
    """
    predicted = preprocess_text(predicted)
    ground_truth = preprocess_text(ground_truth)
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings_pred = model.encode(predicted, convert_to_tensor=True)
    embeddings_gt = model.encode(ground_truth, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings_pred, embeddings_gt).item()
    return similarity
# -------------------------------
# Streamlit Interface
# -------------------------------
st.title("PDF Parsing RAG - using locally running LLM: deepseek-r1 and VectorDB: Qdrant")
st.write("This app demonstrates a Retrieval-Augmented Generation pipeline using Qdrant and ChatOllama.")
# PDF Upload Section
uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
if uploaded_pdf is not None:
    # Save the uploaded PDF temporarily
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_pdf.read())
    pdf_path = "temp.pdf"
else:
    st.info("Please upload a PDF file to proceed.")
    st.stop()
# User Input Section
collection_name = st.text_input("Qdrant Collection Name", value="my_collection")
chunk_size = st.number_input("Chunk Size", value=1000, min_value=100)
chunk_overlap = st.number_input("Chunk Overlap", value=200, min_value=0)
n_retrieved = st.number_input("Number of documents (or text chunks) to retrieve from the Qdrant collection based on the query", value=2, min_value=1)
query = st.text_input("Enter your query")
# Evaluation Section: Checkbox and Ground Truth Text Area appears if checked
evaluate_flag = st.checkbox("Evaluate the answer using [sentence-transformers/all-MiniLM-L6-v2]", value=False)
if evaluate_flag:
    default_ground_truth = ("The key skills of the candidate are Account Management, Client Relations, Project Management, Negotiation and Communication, Technical Proficiency, Leadership and Recognition etc.")
    true_answer = st.text_area("Enter the ground truth answer for evaluation", value=default_ground_truth)
else:
    true_answer = ""
# When the user clicks the "Run RAG" button:
if st.button("Run RAG"):
    if query.strip() == "":
        st.error("Please enter a query.")
    else:
        # Initialize and run the RAG pipeline
        rag = SimpleRAG(
            llm=llm,
            path=pdf_path,
            collection_name=collection_name,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            n_retrieved=n_retrieved
        )
        answer = rag.generate_answer(query)
        st.write("### Answer:")
        st.write(answer)
        # If evaluation is enabled and ground truth is provided, display the evaluation score.
        if evaluate_flag and true_answer.strip() != "":
            score = evaluate_answer(answer, true_answer)
            st.write("### Evaluation Score (Cosine Similarity):", score)
Save the complete code above as RAG_PDF_Parsing.py. To launch the app, open your terminal, navigate to the project directory, and run:
streamlit run RAG_PDF_Parsing.py
Once the app starts, open your browser:
Local URL: http://localhost:8501
Network URL: http://192.168.1.11:8501
The RAG_PDF_Parsing.py program demonstrates how to integrate Qdrant, LangChain, and ChatOllama (using the deepseek-r1:8b model) to build a Retrieval-Augmented Generation pipeline. By splitting a PDF into manageable chunks, embedding the text, storing the vectors in a vector database, and retrieving the most relevant context for each query, the app generates answers in real time. The optional evaluation module computes the cosine similarity between the generated answer and a provided ground-truth answer.