Spaces: Running on ZeroGPU (#4)

- Clean up (207380d371e725816ae5c3f53d531910ba67504d)
- Clean up (2f7dd3f5db9e9b315769dff39950a17399dbbaf5)
- ruff (6228bae0defd157d867ee7c11dbc471d1302990e)
- gradio==5.47.2 (2ca03b40be4b7e3e269f5fe923006ed6bd5bd037)
- Clean up (05517058ea0a3f027c86838ca74b4a813d5b36af)
- Clean up (b3503590167f6ef34dd1c47eb02e9a1f4109cc44)
- Clean up (899c724afb8a9537d8153c201537982d958706ff)
- Add missing type annotation (fd135add43ba4849b42d5a6a48a6145f7c4890af)
- Fix type annotation (9827da7e076f2799498620fd469d7e3c9480e9f7)
- Rename (9487a4249ff9574b8eb19f9a555b4da404db399b)
- Clean up (bec66180eb459c8dfca06ed7289d5690100a1915)
- Clean up (57b535c8bbd55e17113e91fb848e12474776fa77)
- Fix (c5ddb5172dcf01b92d44239d9600ece930b6b0b7)
- Add missing type annotation (dd289a80dcbb51991c3a7f262e3078ad7784ad96)
- Use tempfile (bb8d6819946800c97f7f0b34bb84195b582482f4)
- Clean up (5f16955badf395d39d104cd2a02c4ed0e37c9d9e)
- Add missing type annotation (a594ef6a99311692f11a0f39d1cee6b070a37b87)
- Clean up (8224854fa85dbb96e43f4955cbb8ceb49f94bc75)
- Fix (74ad55952b232b45cca92948c7665a60ad2d3eab)
- Clean up (4e68fb424493d6c682e55f08aa5ac524e1962173)
- ZeroGPU (ffce5fe119880498bc912e544612c92893db305b)
Co-authored-by: hysts <[email protected]>
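
The commits above migrate the demo to ZeroGPU: models are loaded once at import time, per-user session state is kept on the CPU between calls, and only the GPU-bound handlers request a GPU through the `spaces` package. A minimal sketch of that pattern (illustrative names, not the app's real handlers; the real ones are in the app.py diff below):

    import spaces
    import torch

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(4, 2).eval()  # stand-in for the SAM3 models loaded at import time

    @spaces.GPU  # a ZeroGPU device is attached only while this function runs
    def run_inference(x: torch.Tensor) -> torch.Tensor:
        # move the work to the GPU for the duration of the call and hand back CPU tensors,
        # so nothing stored between calls keeps CUDA memory alive
        with torch.no_grad():
            return model.to(DEVICE)(x.to(DEVICE)).cpu()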
@@ -4,7 +4,7 @@ emoji: 🐠
 colorFrom: yellow
 colorTo: gray
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.47.2
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py: the removed (pre-ZeroGPU) side of this diff is truncated in the capture, so the hunks below reproduce only the added and context lines. The removed code relied on module-level globals (_GLOBAL_DEVICE, _GLOBAL_DTYPE, _GLOBAL_TRACKER_PROCESSOR), typing.Optional annotations, a GLOBAL_STATE gr.State threaded through every handler, and a hard-coded /tmp/sam3_playback.mp4 output path.
@@ -1,37 +1,106 @@
 import colorsys
 import gc
+import tempfile
+from collections import defaultdict
+from collections.abc import Iterator, Mapping, Sequence
+from typing import Any

 import cv2
 import gradio as gr
 import numpy as np
+import spaces
 import torch
 from gradio.themes import Soft
 from PIL import Image, ImageDraw, ImageFont
 from transformers import Sam3TrackerVideoModel, Sam3TrackerVideoProcessor, Sam3VideoModel, Sam3VideoProcessor

+MODEL_ID = "facebook/sam3"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16

+TRACKER_MODEL = Sam3TrackerVideoModel.from_pretrained(MODEL_ID, torch_dtype=DTYPE).to(DEVICE).eval()
+TRACKER_PROCESSOR = Sam3TrackerVideoProcessor.from_pretrained(MODEL_ID)

+TEXT_VIDEO_MODEL = Sam3VideoModel.from_pretrained(MODEL_ID).to(DEVICE, dtype=DTYPE).eval()
+TEXT_VIDEO_PROCESSOR = Sam3VideoProcessor.from_pretrained(MODEL_ID)
+print("Models loaded successfully!")

+MAX_SECONDS = 8.0
+
+
+def to_device_recursive(obj: Any, device: str | torch.device) -> Any:  # noqa: ANN401
+    """Return a new object where all torch.Tensors reachable from `obj` are moved to the given device.
+
+    - Does NOT mutate the original object.
+    - Handles:
+        * torch.Tensor
+        * Mapping (e.g. dict, defaultdict, OrderedDict, etc.)
+        * Sequence (e.g. list, tuple) except str/bytes
+        * Custom classes with attributes (__dict__)
+    - Tries to preserve container types where reasonable.
+    """
+    device = torch.device(device)
+    memo = {}
+
+    def _convert(x: Any) -> Any:  # noqa: ANN401, C901
+        obj_id = id(x)
+        if obj_id in memo:
+            return memo[obj_id]
+
+        # 1. Tensor
+        if isinstance(x, torch.Tensor):
+            y = x.to(device)
+            memo[obj_id] = y
+            return y
+
+        # 2. Mapping (dict, defaultdict, etc.)
+        if isinstance(x, Mapping):
+            # Special case: defaultdict
+            if isinstance(x, defaultdict):
+                y = defaultdict(x.default_factory)
+                memo[obj_id] = y
+                for k, v in x.items():
+                    y[k] = _convert(v)
+                return y
+
+            # Try to rebuild the same type using (key, value) pairs
+            try:
+                y = type(x)((k, _convert(v)) for k, v in x.items())
+                memo[obj_id] = y
+                return y
+            except TypeError:
+                # Fallback: plain dict
+                y = {k: _convert(v) for k, v in x.items()}
+                memo[obj_id] = y
+                return y
+
+        # 3. Sequence (list/tuple/etc.) but not str/bytes
+        if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)):
+            if isinstance(x, list):
+                y = [_convert(v) for v in x]
+            elif isinstance(x, tuple):
+                y = type(x)(_convert(v) for v in x)
+            else:
+                try:
+                    y = type(x)(_convert(v) for v in x)
+                except TypeError:
+                    y = [_convert(v) for v in x]
+            memo[obj_id] = y
+            return y

+        # 4. Custom object with attributes (__dict__)
+        if hasattr(x, "__dict__") and not isinstance(x, type):
+            new_obj = x.__class__.__new__(x.__class__)
+            memo[obj_id] = new_obj
+            for name, value in vars(x).items():
+                setattr(new_obj, name, _convert(value))
+            return new_obj

+        # 5. Everything else → keep as-is
+        memo[obj_id] = x
+        return x
+
+    return _convert(obj)


 def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
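
A quick illustration of what to_device_recursive is for (a sketch, not part of the diff): it builds a device-moved copy of an arbitrarily nested session object without mutating the original, so the app can shuttle its inference session between the CPU (while stored in gr.State) and the ZeroGPU device (while a handler runs).

    import torch

    session = {"features": [torch.zeros(2, 3)], "meta": {"fps": 24.0}}
    target = "cuda" if torch.cuda.is_available() else "cpu"
    moved = to_device_recursive(session, target)
    assert session["features"][0].device.type == "cpu"        # original left untouched
    print(moved["features"][0].device, moved["meta"]["fps"])  # tensors moved, plain values kept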
@@ -106,10 +175,10 @@ def pastel_color_for_prompt(prompt_text: str) -> tuple[int, int, int]:


 class AppState:
+    def __init__(self) -> None:
         self.reset()

+    def reset(self) -> None:
         self.video_frames: list[Image.Image] = []
         self.inference_session = None
         self.video_fps: float | None = None
@@ -130,7 +199,7 @@ class AppState:
         self.pending_box_start_obj_id: int | None = None
         self.active_tab: str = "point_box"

+    def __repr__(self) -> str:
         return f"AppState(video_frames={len(self.video_frames)}, video_fps={self.video_fps}, masks_by_frame={len(self.masks_by_frame)}, color_by_obj={len(self.color_by_obj)})"

     @property
@@ -139,23 +208,20 @@ class AppState:


 def init_video_session(
+    state: AppState, video: str | dict, active_tab: str = "point_box"
 ) -> tuple[AppState, int, int, Image.Image, str]:
+    state.video_frames = []
+    state.masks_by_frame = {}
+    state.color_by_obj = {}
+    state.color_by_prompt = {}
+    state.text_prompts_by_frame_obj = {}
+    state.clicks_by_frame_obj = {}
+    state.boxes_by_frame_obj = {}
+    state.composited_frames = {}
+    state.inference_session = None
+    state.active_tab = active_tab
+
+    video_path: str | None = None
     if isinstance(video, dict):
         video_path = video.get("name") or video.get("path") or video.get("data")
     elif isinstance(video, str):
@@ -170,7 +236,6 @@ def init_video_session(
     if len(frames) == 0:
         raise gr.Error("No frames could be loaded from the video.")

     trimmed_note = ""
     fps_in = info.get("fps")
     max_frames_allowed = int(MAX_SECONDS * fps_in) if fps_in else len(frames)
@@ -179,44 +244,49 @@
         trimmed_note = f" (trimmed to {int(MAX_SECONDS)}s = {len(frames)} frames)"
     if isinstance(info, dict):
         info["num_frames"] = len(frames)
+    state.video_frames = frames
+    state.video_fps = float(fps_in) if fps_in else None

     raw_video = [np.array(frame) for frame in frames]

     if active_tab == "text":
+        processor = TEXT_VIDEO_PROCESSOR
+        state.inference_session = processor.init_video_session(
             video=frames,
+            inference_device="cpu",
+            inference_state_device="cpu",
             processing_device="cpu",
             video_storage_device="cpu",
+            dtype=DTYPE,
         )
     else:
+        processor = TRACKER_PROCESSOR
+        state.inference_session = processor.init_video_session(
             video=raw_video,
+            inference_device="cpu",
+            inference_state_device="cpu",
             processing_device="cpu",
+            video_storage_device="cpu",
+            dtype=DTYPE,
         )

+    state.inference_session.inference_device = DEVICE
+    state.inference_session.processing_device = DEVICE
+    state.inference_session.cache.inference_device = DEVICE
+
     first_frame = frames[0]
     max_idx = len(frames) - 1
     if active_tab == "text":
         status = (
+            f"Loaded {len(frames)} frames @ {state.video_fps or 'unknown'} fps{trimmed_note}. "
+            f"Device: {DEVICE}, dtype: bfloat16. Ready for text prompting."
         )
     else:
         status = (
+            f"Loaded {len(frames)} frames @ {state.video_fps or 'unknown'} fps{trimmed_note}. "
+            f"Device: {DEVICE}, dtype: bfloat16. Video session initialized."
         )
+    return state, 0, max_idx, first_frame, status


 def compose_frame(state: AppState, frame_idx: int) -> Image.Image:
Remaining app.py hunks (added lines summarized; the removed counterparts are truncated in this capture):

- @@ -288,7 +358,7 @@ compose_frame: the bare `except` around ImageFont.truetype(...) becomes `except OSError:`.
- @@ -340,7 +410,7 @@ and @@ -348,19 +418,18 @@ _get_prompt_for_obj: the return annotation becomes `str | None`, and the inference-session lookup is guarded with hasattr()/membership checks on `obj_id_to_prompt_id` and `prompts` before `state.inference_session.prompts[prompt_id].strip()` is returned.
- @@ -375,6 +444,7 @@ _ensure_color_for_obj gains a `-> None` annotation.
- @@ -383,12 +453,13 @@, @@ -417,29 +488,28 @@, @@ -485,23 +555,26 @@ on_image_click: decorated with `@spaces.GPU`, now returns `tuple[Image.Image, AppState]`, uses the module-level TRACKER_MODEL / TRACKER_PROCESSOR, and wraps the work in `state.inference_session = to_device_recursive(state.inference_session, DEVICE)` on entry and `to_device_recursive(state.inference_session, "cpu")` before returning. The box branch completes the pending box, builds `box = [[[x_min, y_min, x_max, y_max]]]`, calls `processor.add_inputs_to_inference_session(inference_session=state.inference_session, frame_idx=ann_frame_idx, obj_ids=ann_obj_id, input_boxes=box)`, and records the box in `state.boxes_by_frame_obj`.
- @@ -509,7 +582,9 @@, @@ -593,7 +668,10 @@ on_text_prompt: decorated with `@spaces.GPU`, returns `tuple[Image.Image, str, str, AppState]`, uses TEXT_VIDEO_MODEL / TEXT_VIDEO_PROCESSOR, and moves the session to DEVICE before `processor.add_text_prompt(...)` and back to "cpu" before returning.
- @@ -610,32 +688,35 @@ through @@ -715,183 +795,185 @@ propagate_masks: the `GLOBAL_STATE: gr.State` argument becomes `state: AppState`, the function is decorated with `@spaces.GPU` and turned into a generator yielding `(state, status, gr.update(...))`. Both the text branch (TEXT_VIDEO_MODEL/PROCESSOR, `add_text_prompt` for every stored prompt, then `model.propagate_in_video_iterator(inference_session=..., start_frame_idx=earliest_frame, max_frame_num_to_track=frames_to_track)` with `processor.postprocess_outputs(...)`) and the point/box branch (TRACKER_MODEL/PROCESSOR with `processor.post_process_masks(...)`) keep the session on DEVICE while tracking and, every 30 processed frames, move it back to "cpu", yield a progress update, and move it to DEVICE again; the final yield reports "Propagated masks across N frames." and leaves the session on "cpu" (see the sketch after this list). The same hunk rewrites reset_prompts and reset_session to take and return the AppState: reset_prompts clears the session's prompt and tracking attributes (prompts, prompt_input_ids, prompt_embeddings, prompt_attention_masks, obj_id_to_prompt_id, obj_id_to_score, obj_first_frame_idx, removed/suppressed object-id sets, and so on) plus the app-side masks and colors, while reset_session re-initializes the processor session from the preserved frames and clears clicks, boxes, prompts, masks, and colors before recomputing the preview and slider ranges.
- @@ -899,10 +981,8 @@ _on_video_change_pointbox / _on_video_change_text: typed wrappers around init_video_session that return the updated state plus slider, preview, and status updates (the text variant short-circuits when no video is given).
- @@ -934,15 +1014,13 @@, @@ -969,12 +1047,9 @@, @@ -1000,17 +1075,13 @@, @@ -1032,105 +1103,101 @@ Gradio UI: `with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate"))`, a single `app_state = gr.State(AppState())`, simplified gr.Video / gr.Image / gr.Slider constructors, gr.Examples fed `inputs=[app_state, video_in_*]`, and every .change/.select/.click wiring rewritten with explicit `fn=`, `inputs=`, and `outputs=` lists that pass app_state through (the sync helpers gain type annotations).
- @@ -1144,44 +1211,52 @@ _render_video: writes the playback MP4 with cv2.VideoWriter into `tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)` instead of a fixed /tmp/sam3_playback.mp4, and the render, propagate, and reset buttons of both tabs are wired to _render_video, propagate_masks, and reset_session with app_state as input and the matching preview, slider, and status components as outputs.
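
A minimal sketch of the progress-streaming pattern propagate_masks uses under @spaces.GPU (illustrative, simplified from the real handler; `state.session` and `state.frames` stand in for the app's AppState fields, and to_device_recursive is the helper defined above):

    import spaces
    import torch

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    @spaces.GPU
    def propagate(state):
        # keep heavy tensors on the GPU while working, but hand Gradio a CPU-only
        # snapshot every time we yield, since the yielded state outlives the GPU lease
        state.session = to_device_recursive(state.session, DEVICE)
        for i, frame in enumerate(state.frames):
            ...  # run one tracking step on `frame`
            if (i + 1) % 30 == 0:
                state.session = to_device_recursive(state.session, "cpu")
                yield state, f"{i + 1}/{len(state.frames)} frames"
                state.session = to_device_recursive(state.session, DEVICE)
        state.session = to_device_recursive(state.session, "cpu")
        yield state, "done"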
pyproject.toml (new file)
@@ -0,0 +1,60 @@
+[project]
+name = "sam3-video-segmentation"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "accelerate>=1.11.0",
+    "gradio>=5.49.1",
+    "imageio[pyav]>=2.37.2",
+    "kernels>=0.11.0",
+    "opencv-python>=4.12.0.88",
+    "spaces>=0.42.1",
+    "torch==2.8.0",
+    "torchvision>=0.23.0",
+    "transformers",
+]
+
+[tool.ruff]
+line-length = 119
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = [
+    "COM812",  # missing-trailing-comma
+    "D203",  # one-blank-line-before-class
+    "D213",  # multi-line-summary-second-line
+    "E501",  # line-too-long
+    "SIM117",  # multiple-with-statements
+    #
+    "D100",  # undocumented-public-module
+    "D101",  # undocumented-public-class
+    "D102",  # undocumented-public-method
+    "D103",  # undocumented-public-function
+    "D104",  # undocumented-public-package
+    "D105",  # undocumented-magic-method
+    "D107",  # undocumented-public-init
+    "EM101",  # raw-string-in-exception
+    "FBT001",  # boolean-type-hint-positional-argument
+    "FBT002",  # boolean-default-value-positional-argument
+    "PGH003",  # blanket-type-ignore
+    "PLR0913",  # too-many-arguments
+    "PLR0915",  # too-many-statements
+    "TRY003",  # raise-vanilla-args
+]
+unfixable = [
+    "F401",  # unused-import
+]
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"*.ipynb" = ["T201", "T203"]
+
+[tool.ruff.format]
+docstring-code-format = true
+
+[tool.uv.sources]
+transformers = { git = "https://github.com/huggingface/transformers.git", rev = "69f003696b" }
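
The dependency list pins transformers to a specific git revision, presumably because the SAM3 video classes that app.py imports are newer than the latest release. A quick runtime check (a sketch, assuming the pinned build is installed):

    import transformers
    from transformers import Sam3TrackerVideoModel, Sam3VideoModel  # ImportError on an older release

    print(transformers.__version__, Sam3TrackerVideoModel.__name__, Sam3VideoModel.__name__)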
The diff for this file is too large to render. See raw diff.