A detailed guide on building a food item recognition and calorie estimation app
Before running the program, install the following Python packages with pip:

- streamlit (pip install streamlit)
- requests (pip install requests)
- Pillow (pip install Pillow)

The base64, io, and pathlib modules are part of the Python standard library, so no installation is needed for them.

import streamlit as st
from pathlib import Path
import base64
import requests
from PIL import Image
import io
This block imports the necessary modules for building the Streamlit UI, processing images, handling API requests, and encoding images.
# Endpoint of the locally running Ollama server's chat API, which serves the
# LLaMA Vision model.
# NOTE(review): the name is spelled LAMA_API_URL (single L) — kept as-is
# because extract_text_from_image references this exact name.
LAMA_API_URL = "http://localhost:11434/api/chat"
The API endpoint is set to send requests to the local LLaMA Vision model.
def encode_image_to_base64(image: Image.Image) -> str:
    """Convert a PIL image to a base64-encoded JPEG string.

    Args:
        image: The PIL image to encode.

    Returns:
        The JPEG bytes of the image encoded as a base64 ASCII string,
        suitable for embedding in a JSON payload.
    """
    buffer = io.BytesIO()
    # JPEG cannot store an alpha channel or palette data. The uploader
    # accepts PNGs (often mode "RGBA" or "P"), which would make save()
    # raise "cannot write mode RGBA as JPEG" — convert to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
This function converts a PIL image to a base64 string, which can then be embedded in the JSON payload.
def extract_text_from_image(messages: list, image: Image.Image) -> str:
    """Send the chat history plus an image to the LLaMA API and return the reply.

    Args:
        messages: Chat history as a list of {"role", "content"} dicts.
            The image is attached to a copy of the latest message, so the
            caller's history is NOT mutated.
        image: The PIL image to analyze.

    Returns:
        The model's text reply, or a human-readable error string if the
        request fails or returns a non-200 status.
    """
    base64_image = encode_image_to_base64(image)
    # Attach the image to a shallow copy of the last message instead of
    # mutating the caller's dict: otherwise st.session_state.chat_history
    # accumulates base64 blobs that get re-sent on every subsequent turn.
    messages = messages[:-1] + [{**messages[-1], 'images': [base64_image]}]
    payload = {
        "model": "llama3.2-vision:11b",
        "stream": False,
        "messages": messages
    }
    try:
        # A timeout keeps the Streamlit app from hanging forever when the
        # local Ollama server is down or the model is still loading.
        response = requests.post(
            LAMA_API_URL,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=120,
        )
    except requests.exceptions.RequestException as exc:
        return f"Error: could not reach the LLaMA API - {exc}"
    # Extract the text content from the response.
    if response.status_code == 200:
        return response.json().get('message', {}).get('content', 'No response from the model.')
    return f"Error: {response.status_code} - {response.text}"
This function prepares the payload with the encoded image and chat history, sends it to the LLaMA API, and returns the model's response.
# --- Streamlit UI ---
# Page header: title plus a one-line description of what the app does.
st.title("Food Item Recognition and Calorie Estimation")
st.write("Upload an image to list food items and calculate their total calorie count using LLaMA Vision.")
This section sets up the title and introduction for the Streamlit app.
# Initialize the chat history once per browser session. Streamlit re-runs
# this script on every interaction, so st.session_state is used to persist
# the conversation; the system prompt seeds the assistant's behavior.
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = [
        {"role": "system", "content": "You are a helpful assistant specializing in food recognition and calorie estimation."}
    ]
# Replay the stored conversation (the system message is intentionally hidden).
st.subheader("Chat History")
for message in st.session_state.chat_history:
    if message['role'] == 'user':
        st.write(f"**You:** {message['content']}")
    elif message['role'] == 'assistant':
        st.write(f"**Assistant:** {message['content']}")
This block initializes the chat history and displays previous messages on the UI.
# Image uploader; accepted types mirror what PIL can open and what the
# JPEG encoder expects downstream.
uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
if uploaded_image:
    # Show a preview of the uploaded image.
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_container_width=True)
    # Editable prompt, pre-filled with the default food/calorie request.
    user_input = st.text_input("Enter your prompt for the assistant:",
    value="List down all food items in the image and calculate total calorie count.")
    if st.button("Submit"):
        # Record the user's turn first so the API sees it as the latest
        # message in the history.
        st.session_state.chat_history.append({"role": "user", "content": user_input})
        st.write("Processing the image...")
        # Blocking call to the local LLaMA Vision model.
        response = extract_text_from_image(st.session_state.chat_history, image)
        # Persist and display the assistant's reply.
        st.session_state.chat_history.append({"role": "assistant", "content": response})
        st.subheader("Response:")
        st.write(response)
This section creates an image uploader, collects user input, processes the image by calling the LLaMA API, and displays the model's response.
import streamlit as st
from pathlib import Path
import base64
import requests
from PIL import Image
import io
# Endpoint of the locally running Ollama server's chat API, which serves the
# LLaMA Vision model.
# NOTE(review): the name is spelled LAMA_API_URL (single L) — kept as-is
# because extract_text_from_image references this exact name.
LAMA_API_URL = "http://localhost:11434/api/chat"
def encode_image_to_base64(image: Image.Image) -> str:
    """Convert a PIL image to a base64-encoded JPEG string.

    Args:
        image: The PIL image to encode.

    Returns:
        The JPEG bytes of the image encoded as a base64 ASCII string,
        suitable for embedding in a JSON payload.
    """
    buffer = io.BytesIO()
    # JPEG cannot store an alpha channel or palette data. The uploader
    # accepts PNGs (often mode "RGBA" or "P"), which would make save()
    # raise "cannot write mode RGBA as JPEG" — convert to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
def extract_text_from_image(messages: list, image: Image.Image) -> str:
    """Send the chat history plus an image to the LLaMA API and return the reply.

    Args:
        messages: Chat history as a list of {"role", "content"} dicts.
            The image is attached to a copy of the latest message, so the
            caller's history is NOT mutated.
        image: The PIL image to analyze.

    Returns:
        The model's text reply, or a human-readable error string if the
        request fails or returns a non-200 status.
    """
    base64_image = encode_image_to_base64(image)
    # Attach the image to a shallow copy of the last message instead of
    # mutating the caller's dict: otherwise st.session_state.chat_history
    # accumulates base64 blobs that get re-sent on every subsequent turn.
    messages = messages[:-1] + [{**messages[-1], 'images': [base64_image]}]
    payload = {
        "model": "llama3.2-vision:11b",
        "stream": False,
        "messages": messages
    }
    try:
        # A timeout keeps the Streamlit app from hanging forever when the
        # local Ollama server is down or the model is still loading.
        response = requests.post(
            LAMA_API_URL,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=120,
        )
    except requests.exceptions.RequestException as exc:
        return f"Error: could not reach the LLaMA API - {exc}"
    # Extract the text content from the response.
    if response.status_code == 200:
        return response.json().get('message', {}).get('content', 'No response from the model.')
    return f"Error: {response.status_code} - {response.text}"
# --- Streamlit UI ---
# Page header: title plus a one-line description of what the app does.
st.title("Food Item Recognition and Calorie Estimation")
st.write("Upload an image to list food items and calculate their total calorie count using LLaMA Vision.")
# Initialize the chat history once per browser session. Streamlit re-runs
# this script on every interaction, so st.session_state is used to persist
# the conversation; the system prompt seeds the assistant's behavior.
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = [
        {"role": "system", "content": "You are a helpful assistant specializing in food recognition and calorie estimation."}
    ]
# Replay the stored conversation (the system message is intentionally hidden).
st.subheader("Chat History")
for message in st.session_state.chat_history:
    if message['role'] == 'user':
        st.write(f"**You:** {message['content']}")
    elif message['role'] == 'assistant':
        st.write(f"**Assistant:** {message['content']}")
# Image uploader; accepted types mirror what PIL can open and what the
# JPEG encoder expects downstream.
uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
if uploaded_image:
    # Show a preview of the uploaded image.
    image = Image.open(uploaded_image)
    st.image(image, caption="Uploaded Image", use_container_width=True)
    # Editable prompt, pre-filled with the default food/calorie request.
    user_input = st.text_input("Enter your prompt for the assistant:",
    value="List down all food items in the image and calculate total calorie count.")
    if st.button("Submit"):
        # Record the user's turn first so the API sees it as the latest
        # message in the history.
        st.session_state.chat_history.append({"role": "user", "content": user_input})
        st.write("Processing the image...")
        # Blocking call to the local LLaMA Vision model.
        response = extract_text_from_image(st.session_state.chat_history, image)
        # Persist and display the assistant's reply.
        st.session_state.chat_history.append({"role": "assistant", "content": response})
        st.subheader("Response:")
        st.write(response)
This is the complete Python code for the Streamlit application. Save it as food_calorie_calculator.py and launch it from a terminal with:

streamlit run food_calorie_calculator.py
You can now view your Streamlit app in your browser.
Local URL: http://localhost:8501
Network URL: http://192.168.1.11:8501
This application integrates computer vision and natural language processing by allowing users to upload an image, process it via a LLaMA Vision model, and retrieve text responses. The program demonstrates effective use of Streamlit for building a user-friendly web interface combined with API-driven AI responses.