Spaces:

rahul7star
/

Flash-Attn-Demo

Running

App Files Files Community

Flash-Attn-Demo / app.py

rahul7star

Update app.py

5479a3e verified about 1 month ago

raw

history blame contribute delete

3.89 kB

	import gradio as gr
	import torch
	from kernels import get_kernel
	import io
	import contextlib

	# ============================================================
	# ⚙️ Setup
	# ============================================================
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	if torch.cuda.is_available():
	flash_attn = get_kernel("kernels-community/vllm-flash-attn3")
	flash_attn_func = flash_attn.flash_attn_func
	else:
	flash_attn_func = None # CPU fallback

	# ============================================================
	# 🧠 Reference attention (PyTorch SDPA)
	# ============================================================
	def reference_attention(query, key, value, causal=False):
	query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
	with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
	out = torch.nn.functional.scaled_dot_product_attention(query, key, value, is_causal=causal)
	return out.transpose(1, 2).contiguous()

	# ============================================================
	# 🎛️ Test function
	# ============================================================
	def run_flash_attention(B=2, S=5, H=4, D=8, seed=42):
	B, S, H, D = int(B), int(S), int(H), int(D)
	torch.manual_seed(int(seed))

	dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
	q = k = v = torch.randn(B, S, H, D, device=device, dtype=dtype)

	log = io.StringIO()
	with contextlib.redirect_stdout(log):
	print(f"Running FlashAttention Tests on device: {device}")
	print(f"Input shape: B={B}, S={S}, H={H}, D={D}\n")

	# Always run PyTorch reference attention
	out_ref = reference_attention(q, k, v)
	print(f"✅ Reference attention OK: {out_ref.shape}\n")

	# Run FlashAttention if CUDA available
	if flash_attn_func is not None:
	try:
	out_flash, _ = flash_attn_func(q, k, v, causal=False)
	print("⚡ FlashAttention (non-causal):")
	print(f" Output: {out_flash.shape}")
	print(f" Close to reference: {torch.allclose(out_flash, out_ref, atol=1e-2, rtol=1e-3)}\n")

	out_flash_causal, _ = flash_attn_func(q, k, v, causal=True)
	out_ref_causal = reference_attention(q, k, v, causal=True)
	print("⚡ FlashAttention (causal):")
	print(f" Output: {out_flash_causal.shape}")
	print(f" Close to reference: {torch.allclose(out_flash_causal, out_ref_causal, atol=1e-2, rtol=1e-3)}\n")

	except Exception as e:
	print("❌ FlashAttention test failed:")
	print(str(e))
	else:
	print("⚠️ CUDA not available — FlashAttention test skipped.")
	print("Using PyTorch SDPA (reference) only.\n")

	return log.getvalue()

	# ============================================================
	# 🧩 Gradio UI
	# ============================================================
	with gr.Blocks(title="Flash Attention Kernel Tester") as demo:
	gr.Markdown("## ⚡ Flash Attention Kernel Tester")
	gr.Markdown("Compare PyTorch SDPA vs FlashAttention implementations interactively. Works on both CPU and CUDA.")

	with gr.Row():
	B = gr.Slider(1, 8, value=2, step=1, label="Batch Size (B)")
	S = gr.Slider(2, 10, value=5, step=1, label="Sequence Length (S)")
	with gr.Row():
	H = gr.Slider(1, 8, value=4, step=1, label="Number of Heads (H)")
	D = gr.Slider(4, 64, value=8, step=4, label="Head Dim (D)")
	seed = gr.Number(value=42, label="Random Seed")

	run_btn = gr.Button("🚀 Run Tests")
	output = gr.Textbox(label="Console Output", lines=25, show_copy_button=True)

	run_btn.click(run_flash_attention, inputs=[B, S, H, D, seed], outputs=output)

	demo.launch()