Containers provide reproducible, isolated environments for running local AI services. Docker simplifies deployment to a single docker compose up command, while Kubernetes handles multi-node GPU clusters with automatic scheduling and scaling. This guide covers practical container deployments from simple single-machine Docker setups to production Kubernetes clusters with GPU scheduling, including all the GPU passthrough configuration that makes container-based AI workloads possible.
Prerequisites
- Linux host (recommended), Windows with WSL2, or macOS with Docker Desktop
- Docker Engine 24.0+ or Docker Desktop 4.25+
- NVIDIA GPU with drivers installed (for GPU acceleration)
- Docker Compose v2 (included with modern Docker)
Part 1: Docker Fundamentals for AI
Installing Docker
# Quick install on Linux
curl -fsSL https://get.docker.com | sh
sudo usermod -aG docker $USER
newgrp docker # Apply group without logout
# Verify
docker --version
docker compose version
NVIDIA Container Toolkit Setup
The NVIDIA Container Toolkit is required for GPU access in Docker containers.
# Add NVIDIA repository
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Install
sudo apt update
sudo apt install -y nvidia-container-toolkit
# Configure Docker runtime
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
# Verify GPU access in containers
docker run --rm --gpus all nvidia/cuda:12.6.0-base-ubuntu24.04 nvidia-smi
For Fedora/RHEL:
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
sudo dnf install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
AMD GPU in Docker
AMD GPUs don’t need a special toolkit. Pass the device nodes directly:
docker run --rm \
--device /dev/kfd \
--device /dev/dri \
-v ollama:/root/.ollama \
-p 11434:11434 \
ollama/ollama:rocm
Part 2: Ollama in Docker
Basic Ollama Container
# CPU only
docker run -d \
-v ollama:/root/.ollama \
-p 11434:11434 \
--name ollama \
--restart unless-stopped \
ollama/ollama
# NVIDIA GPU
docker run -d \
--gpus all \
-v ollama:/root/.ollama \
-p 11434:11434 \
--name ollama \
--restart unless-stopped \
ollama/ollama
# Pull and run a model
docker exec ollama ollama pull llama3.1:8b
docker exec -it ollama ollama run llama3.1:8b
Ollama with Custom Configuration
docker run -d \
--gpus all \
-v ollama:/root/.ollama \
-v /data/models:/models \
-p 11434:11434 \
-e OLLAMA_HOST=0.0.0.0:11434 \
-e OLLAMA_NUM_PARALLEL=4 \
-e OLLAMA_MAX_LOADED_MODELS=2 \
-e OLLAMA_KEEP_ALIVE=30m \
--name ollama \
--restart unless-stopped \
ollama/ollama
Specific GPU Assignment
# Use only GPU 0
docker run -d \
--gpus '"device=0"' \
-v ollama:/root/.ollama \
-p 11434:11434 \
--name ollama-gpu0 \
ollama/ollama
# Use GPUs 0 and 1
docker run -d \
--gpus '"device=0,1"' \
-v ollama:/root/.ollama \
-p 11435:11434 \
--name ollama-gpu01 \
ollama/ollama
Part 3: Docker Compose Stacks
Stack 1: Ollama + Open WebUI (Personal)
The most common local AI stack. A complete ChatGPT replacement.
# docker-compose.yml
services:
ollama:
image: ollama/ollama
container_name: ollama
volumes:
- ollama_data:/root/.ollama
ports:
- "11434:11434"
environment:
- OLLAMA_HOST=0.0.0.0:11434
- OLLAMA_KEEP_ALIVE=30m
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
restart: unless-stopped
open-webui:
image: ghcr.io/open-webui/open-webui:main
container_name: open-webui
ports:
- "3000:8080"
environment:
- OLLAMA_BASE_URL=http://ollama:11434
volumes:
- webui_data:/app/backend/data
depends_on:
- ollama
restart: unless-stopped
volumes:
ollama_data:
webui_data:
# Launch the stack
docker compose up -d
# Pull a model
docker exec ollama ollama pull llama3.1:8b
# Open http://localhost:3000
Stack 2: Ollama + Open WebUI + HTTPS (Team)
Production-ready stack with Nginx reverse proxy and Let’s Encrypt SSL.
# docker-compose.yml
services:
ollama:
image: ollama/ollama
container_name: ollama
volumes:
- ollama_data:/root/.ollama
environment:
- OLLAMA_HOST=0.0.0.0:11434
- OLLAMA_NUM_PARALLEL=4
- OLLAMA_MAX_LOADED_MODELS=2
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
restart: unless-stopped
open-webui:
image: ghcr.io/open-webui/open-webui:main
container_name: open-webui
environment:
- OLLAMA_BASE_URL=http://ollama:11434
- WEBUI_AUTH=true
- WEBUI_SECRET_KEY=${WEBUI_SECRET_KEY}
volumes:
- webui_data:/app/backend/data
depends_on:
- ollama
restart: unless-stopped
nginx:
image: nginx:alpine
container_name: nginx
ports:
- "80:80"
- "443:443"
volumes:
- ./nginx.conf:/etc/nginx/conf.d/default.conf:ro
- certbot_data:/etc/letsencrypt:ro
- certbot_www:/var/www/certbot:ro
depends_on:
- open-webui
restart: unless-stopped
certbot:
image: certbot/certbot
container_name: certbot
volumes:
- certbot_data:/etc/letsencrypt
- certbot_www:/var/www/certbot
entrypoint: "/bin/sh -c 'trap exit TERM; while :; do certbot renew; sleep 12h; done'"
restart: unless-stopped
volumes:
ollama_data:
webui_data:
certbot_data:
certbot_www:
# nginx.conf
server {
listen 80;
server_name ai.yourdomain.com;
location /.well-known/acme-challenge/ {
root /var/www/certbot;
}
location / {
return 301 https://$host$request_uri;
}
}
server {
listen 443 ssl;
server_name ai.yourdomain.com;
ssl_certificate /etc/letsencrypt/live/ai.yourdomain.com/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/ai.yourdomain.com/privkey.pem;
client_max_body_size 100M;
location / {
proxy_pass http://open-webui:8080;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# WebSocket support
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_read_timeout 300s;
}
}
Stack 3: vLLM + Open WebUI (High Throughput)
For serving multiple concurrent users with better throughput than Ollama.
# docker-compose.yml
services:
vllm:
image: vllm/vllm-openai:latest
container_name: vllm
command: >
--model meta-llama/Llama-3.1-8B-Instruct
--host 0.0.0.0
--port 8000
--max-model-len 8192
--gpu-memory-utilization 0.9
environment:
- HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
volumes:
- hf_cache:/root/.cache/huggingface
ports:
- "8000:8000"
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
restart: unless-stopped
open-webui:
image: ghcr.io/open-webui/open-webui:main
container_name: open-webui
ports:
- "3000:8080"
environment:
- OPENAI_API_BASE_URL=http://vllm:8000/v1
- OPENAI_API_KEY=unused
- OLLAMA_BASE_URL=
volumes:
- webui_data:/app/backend/data
depends_on:
- vllm
restart: unless-stopped
volumes:
hf_cache:
webui_data:
Stack 4: Multi-Model with GPU Assignment
Run different models on different GPUs.
# docker-compose.yml
services:
ollama-chat:
image: ollama/ollama
container_name: ollama-chat
volumes:
- ollama_chat:/root/.ollama
ports:
- "11434:11434"
environment:
- OLLAMA_HOST=0.0.0.0:11434
- NVIDIA_VISIBLE_DEVICES=0
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]
restart: unless-stopped
ollama-code:
image: ollama/ollama
container_name: ollama-code
volumes:
- ollama_code:/root/.ollama
ports:
- "11435:11434"
environment:
- OLLAMA_HOST=0.0.0.0:11434
- NVIDIA_VISIBLE_DEVICES=1
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['1']
capabilities: [gpu]
restart: unless-stopped
open-webui:
image: ghcr.io/open-webui/open-webui:main
container_name: open-webui
ports:
- "3000:8080"
environment:
- OLLAMA_BASE_URL=http://ollama-chat:11434
- OPENAI_API_BASE_URLS=http://ollama-code:11434/v1
volumes:
- webui_data:/app/backend/data
depends_on:
- ollama-chat
- ollama-code
restart: unless-stopped
volumes:
ollama_chat:
ollama_code:
webui_data:
Part 4: Production Docker Practices
Health Checks
services:
ollama:
image: ollama/ollama
healthcheck:
test: ["CMD-SHELL", "ollama list || exit 1"] # the ollama/ollama image does not ship curl, so an HTTP-based check would always fail
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
# ...
open-webui:
image: ghcr.io/open-webui/open-webui:main
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 3
depends_on:
ollama:
condition: service_healthy
# ...
Resource Limits
services:
ollama:
image: ollama/ollama
deploy:
resources:
limits:
memory: 32G # Limit total memory
reservations:
memory: 16G
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
# ...
Backup and Persistence
# Backup Ollama models
docker run --rm \
-v ollama_data:/source:ro \
-v $(pwd)/backups:/backup \
alpine tar czf /backup/ollama-models-$(date +%Y%m%d).tar.gz -C /source .
# Backup Open WebUI data
docker run --rm \
-v webui_data:/source:ro \
-v $(pwd)/backups:/backup \
alpine tar czf /backup/webui-data-$(date +%Y%m%d).tar.gz -C /source .
# Restore
docker run --rm \
-v ollama_data:/target \
-v $(pwd)/backups:/backup \
alpine tar xzf /backup/ollama-models-20260407.tar.gz -C /target
Pre-loading Models on Container Start
# Dockerfile.ollama-preloaded
FROM ollama/ollama
# Copy a script that pulls models on first start
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
#!/bin/bash
# entrypoint.sh
ollama serve &
sleep 5
# Pull models if not already present
ollama pull llama3.1:8b 2>/dev/null || true
ollama pull qwen2.5-coder:7b 2>/dev/null || true
ollama pull nomic-embed-text 2>/dev/null || true
# Keep server running
wait
Logging
services:
ollama:
image: ollama/ollama
logging:
driver: json-file
options:
max-size: "50m"
max-file: "3"
# ...
# View logs
docker compose logs -f ollama
docker compose logs -f --tail=100 open-webui
# Export logs
docker compose logs ollama > ollama.log
Part 5: Kubernetes for Local AI
Kubernetes manages multi-node GPU clusters with automatic scheduling, scaling, and self-healing.
Prerequisites
- Kubernetes cluster (k3s, kubeadm, or managed)
- NVIDIA GPU Operator or NVIDIA Device Plugin
- Storage class for model persistence
Installing NVIDIA GPU Operator
The GPU Operator installs everything needed for GPU workloads in Kubernetes.
# Add NVIDIA Helm repo
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update
# Install GPU Operator
helm install gpu-operator nvidia/gpu-operator \
--namespace gpu-operator \
--create-namespace \
--set driver.enabled=true \
--set toolkit.enabled=true
# Verify GPU nodes
kubectl get nodes -l nvidia.com/gpu.present=true
kubectl describe node <gpu-node> | grep nvidia.com/gpu
Alternative: NVIDIA Device Plugin Only
If you already have NVIDIA drivers installed on nodes:
# Install just the device plugin
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml
# Verify
kubectl get pods -n kube-system | grep nvidia
Deploying Ollama on Kubernetes
# ollama-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: ollama
labels:
app: ollama
spec:
replicas: 1
selector:
matchLabels:
app: ollama
template:
metadata:
labels:
app: ollama
spec:
containers:
- name: ollama
image: ollama/ollama
ports:
- containerPort: 11434
env:
- name: OLLAMA_HOST
value: "0.0.0.0:11434"
- name: OLLAMA_KEEP_ALIVE
value: "30m"
resources:
limits:
nvidia.com/gpu: 1
memory: "32Gi"
requests:
nvidia.com/gpu: 1
memory: "16Gi"
volumeMounts:
- name: ollama-data
mountPath: /root/.ollama
volumes:
- name: ollama-data
persistentVolumeClaim:
claimName: ollama-pvc
---
apiVersion: v1
kind: Service
metadata:
name: ollama
spec:
selector:
app: ollama
ports:
- port: 11434
targetPort: 11434
type: ClusterIP
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ollama-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
storageClassName: local-path # Adjust for your cluster
kubectl apply -f ollama-deployment.yaml
# Pull a model
kubectl exec -it deploy/ollama -- ollama pull llama3.1:8b
Open WebUI on Kubernetes
# open-webui-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: open-webui
labels:
app: open-webui
spec:
replicas: 1
selector:
matchLabels:
app: open-webui
template:
metadata:
labels:
app: open-webui
spec:
containers:
- name: open-webui
image: ghcr.io/open-webui/open-webui:main
ports:
- containerPort: 8080
env:
- name: OLLAMA_BASE_URL
value: "http://ollama:11434"
- name: WEBUI_AUTH
value: "true"
- name: WEBUI_SECRET_KEY
valueFrom:
secretKeyRef:
name: webui-secret
key: secret-key
volumeMounts:
- name: webui-data
mountPath: /app/backend/data
resources:
limits:
memory: "2Gi"
requests:
memory: "512Mi"
volumes:
- name: webui-data
persistentVolumeClaim:
claimName: webui-pvc
---
apiVersion: v1
kind: Service
metadata:
name: open-webui
spec:
selector:
app: open-webui
ports:
- port: 8080
targetPort: 8080
type: ClusterIP
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: webui-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: open-webui
annotations:
nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
nginx.ingress.kubernetes.io/proxy-body-size: "100m"
spec:
rules:
- host: ai.yourdomain.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: open-webui
port:
number: 8080
tls:
- hosts:
- ai.yourdomain.com
secretName: webui-tls
GPU Scheduling Strategies
# Request specific GPU type
resources:
limits:
nvidia.com/gpu: 1
nodeSelector:
nvidia.com/gpu.product: NVIDIA-GeForce-RTX-4090
# Request multiple GPUs
resources:
limits:
nvidia.com/gpu: 2
# Time-slicing (share GPU between pods)
# Requires NVIDIA GPU Operator with time-slicing config
apiVersion: v1
kind: ConfigMap
metadata:
name: time-slicing-config
data:
any: |-
version: v1
flags:
migStrategy: none
sharing:
timeSlicing:
resources:
- name: nvidia.com/gpu
replicas: 4 # Each physical GPU appears as 4 logical GPUs
Scaling with Multiple GPU Nodes
# vLLM with tensor parallelism across GPUs
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-70b
spec:
replicas: 1
selector:
matchLabels:
app: vllm-70b
template:
metadata:
labels:
app: vllm-70b
spec:
containers:
- name: vllm
image: vllm/vllm-openai:latest
args:
- "--model"
- "meta-llama/Llama-3.1-70B-Instruct"
- "--tensor-parallel-size"
- "2"
- "--max-model-len"
- "8192"
- "--gpu-memory-utilization"
- "0.9"
ports:
- containerPort: 8000
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-secret
key: token
resources:
limits:
nvidia.com/gpu: 2
memory: "128Gi"
volumeMounts:
- name: model-cache
mountPath: /root/.cache/huggingface
volumes:
- name: model-cache
persistentVolumeClaim:
claimName: model-cache-pvc
Monitoring Containerized AI
Prometheus + Grafana Stack
# Add to docker-compose.yml
prometheus:
image: prom/prometheus
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
ports:
- "9090:9090"
restart: unless-stopped
grafana:
image: grafana/grafana
volumes:
- grafana_data:/var/lib/grafana
ports:
- "3001:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
restart: unless-stopped
nvidia-exporter:
image: utkuozdemir/nvidia_gpu_exporter
devices:
- /dev/nvidiactl
- /dev/nvidia0
volumes:
- /usr/bin/nvidia-smi:/usr/bin/nvidia-smi
- /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1
ports:
- "9835:9835"
restart: unless-stopped
# prometheus.yml
scrape_configs:
- job_name: 'nvidia-gpu'
static_configs:
- targets: ['nvidia-exporter:9835']
- job_name: 'ollama'
static_configs:
- targets: ['ollama:11434']
# Liveness check only: Ollama has no native Prometheus metrics endpoint.
# /api/tags returns JSON (not Prometheus exposition format), so this target
# will report a scrape format error while still confirming the API is reachable.
metrics_path: /api/tags
Troubleshooting
GPU Not Accessible in Container
# Verify NVIDIA Container Toolkit
docker info | grep -i nvidia
# If missing, reconfigure
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
# Test GPU access
docker run --rm --gpus all nvidia/cuda:12.6.0-base-ubuntu24.04 nvidia-smi
Container OOM Killed
# Check container events
docker inspect --format='{{.State.OOMKilled}}' ollama
# Increase memory limits in compose file
# Or use a smaller model
docker exec ollama ollama run phi3:mini
Model Files Not Persisting
# Verify volume is mounted correctly
docker inspect ollama | grep -A 10 Mounts
# Check volume contents
docker run --rm -v ollama_data:/data alpine ls -la /data/
# If empty, models need to be re-pulled
docker exec ollama ollama pull llama3.1:8b
Open WebUI Can’t Connect to Ollama
# Check network connectivity between containers
docker exec open-webui curl -s http://ollama:11434/api/tags
# If using host network mode for Ollama
# Use host.docker.internal instead:
# OLLAMA_BASE_URL=http://host.docker.internal:11434
# Check container logs
docker compose logs ollama open-webui
Next Steps
- Set up the web interface: Open WebUI + Ollama detailed guide
- Deploy for your organization: Enterprise Local AI guide
- Platform-specific setup: Linux guide for host preparation
- Choose your models: Model selection guide