The Problem
Let’s say you're running the following PyTorch script:
import torch
import time
def max_out_vram():
    """Keep allocating 512 MB tensors, one every three seconds, forever.

    Each tensor is created with ``torch.empty`` on the CUDA device when one
    is available (otherwise on the CPU) and kept alive in a list so it is
    never freed. The running total is printed after every allocation. The
    loop has no exit condition: it only stops when an allocation raises or
    the process is killed.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # 512 MiB expressed as a count of 4-byte float32 elements.
    elements_per_chunk = 512 * 1024 * 1024 // 4

    allocated = []
    while True:
        # torch.empty returns uninitialised storage; nothing is written to it.
        chunk = torch.empty(
            (elements_per_chunk,), dtype=torch.float32, device=device
        )
        allocated.append(chunk)
        print(f"Allocated {len(allocated) * 512} MB")
        time.sleep(3)
max_out_vram()
You expect to see VRAM usage go up in nvtop. But it doesn’t.
Why?
The Underlying Reasons
1. Memory Allocation Is Lazy
PyTorch (like CUDA) often uses lazy memory allocation. Calling torch.empty() reserves memory, but doesn’t actually commit it until you write to it.
So the VRAM isn’t physically used — yet.
2. No GPU Kernel Is Launched
Tools like nvtop, and APIs like NVML, often detect GPU usage based on active CUDA contexts and kernel launches.
If your script doesn’t run a kernel, NVML may not treat it as "active".
3. We Track Active Usage
GPU Pools rely on runtime activity (kernel execution and context tracking) to attribute resource usage.
If you only allocate memory without touching it or running a kernel, the system might not record that as actual usage.
What can we run instead?
The following program achieves a similar objective: it incrementally allocates more memory until capacity is exceeded. Unlike the script above, it launches GPU kernels that actually read and write the allocated memory.
import torch
import time
import sys
import subprocess
try:
from tqdm import tqdm
except ImportError:
subprocess.check_call([sys.executable, "-m", "pip", "install", "tqdm"])
finally:
from tqdm import tqdm
# ==== CONFIG ====
USE_FP16 = False      # Use float16 to test larger sizes
START_N = 1000        # Initial matrix size
STEP_N = 1000         # Increment per step
MAX_ITER = 3          # Matrix additions per step
SLEEP_BETWEEN = 0.5   # Delay between steps in seconds


def is_cuda_oom(exc):
    """Return True if *exc* is a CUDA out-of-memory error.

    Recognises the dedicated ``torch.cuda.OutOfMemoryError`` class when the
    installed PyTorch provides it, and otherwise falls back to matching the
    "CUDA out of memory" text in the exception message.
    """
    oom_type = getattr(torch.cuda, "OutOfMemoryError", None)
    if oom_type is not None and isinstance(exc, oom_type):
        return True
    return "CUDA out of memory" in str(exc)


def probe_vram():
    """Ramp up GPU memory use with real kernel work until allocation fails.

    Each step allocates three ``n x n`` tensors on the GPU (two filled with
    ones, one output buffer), runs ``MAX_ITER`` element-wise additions with a
    synchronize after each so the kernels have actually executed, frees the
    tensors, and grows ``n`` by ``STEP_N``. The loop ends on the first CUDA
    out-of-memory error; any other ``RuntimeError`` is re-raised. A summary
    with the largest successful size and peak allocated memory is printed.

    Exits the process with an error message when no CUDA device is available,
    because the device-property and memory queries below require one.
    """
    # ==== SETUP ====
    if not torch.cuda.is_available():
        sys.exit("❌ No CUDA device available; this probe requires a GPU.")

    start_time = time.time()
    torch.cuda.empty_cache()
    device = torch.device("cuda")
    dtype = torch.float16 if USE_FP16 else torch.float32
    # Read the element size from the dtype instead of hard-coding it twice.
    bytes_per_element = torch.empty(0, dtype=dtype).element_size()
    props = torch.cuda.get_device_properties(device)
    total_vram = props.total_memory

    print(f"\n🔍 Probing VRAM on GPU: {props.name}")
    print(f"🔢 Total VRAM: {total_vram / 1024**2:.2f} MiB")
    print(f"⚙️ Dtype: {'float16' if USE_FP16 else 'float32'} (Element size: {bytes_per_element} bytes)\n")

    # ==== RAMP-UP LOOP ====
    n = START_N
    last_success_n = None
    while True:
        print(f"🚀 Trying matrix size: {n} x {n}")
        try:
            # Attempt allocation
            x = torch.ones((n, n), dtype=dtype, device=device)
            y = torch.ones((n, n), dtype=dtype, device=device)
            z = torch.empty_like(x)
            for i in range(MAX_ITER):
                torch.add(x, y, out=z)
                # Wait for the kernel to finish before reading memory stats.
                torch.cuda.synchronize()
                used = torch.cuda.memory_allocated(device) / 1024**2
                print(f" ✅ Iteration {i+1}: Used GPU memory: {used:.2f} MiB")
        except RuntimeError as e:
            if not is_cuda_oom(e):
                raise
            print(f"\n💥 OOM at matrix size: {n} x {n}")
            break

        last_success_n = n
        # Drop this step's tensors and return cached blocks to the driver so
        # the next, larger step starts from a clean slate.
        del x, y, z
        torch.cuda.empty_cache()
        n += STEP_N
        time.sleep(SLEEP_BETWEEN)

    # ==== SUMMARY ====
    peak = torch.cuda.max_memory_allocated(device) / 1024**2
    if last_success_n is None:
        # The very first step already failed; there is no size to report.
        print("\n⚠️ No matrix size fit in GPU memory.")
    else:
        print(f"\n✅ Largest successful matrix: {last_success_n} x {last_success_n}")
    print(f"📈 Peak GPU memory used: {peak:.2f} MiB")
    print("🟢 Finished probing.")
    print(f"⏱️ Total time: {time.time() - start_time:.2f} seconds")


if __name__ == "__main__":
    probe_vram()