Spaces:

furbola
/

chaskick

Running

App Files Community

Mirko Trasciatti committed on Nov 10, 2025

Commit

e7cbaa4

1 Parent(s): 962d5a0

Clean rebuild: Updated README, app.py, and requirements with pydantic fix

Browse files

Files changed (2) hide show

README.md +1 -1
app.py +54 -17

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🎥
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 5.6.0
 app_file: app.py
 pinned: false
 license: apache-2.0

 colorFrom: blue
 colorTo: purple
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """
-SAM2 Video Segmentation Space - Minimal Working Version
 """
 import gradio as gr
@@ -7,7 +8,9 @@ import torch
 import numpy as np
 import cv2
 import tempfile
 import os
 from transformers import Sam2VideoModel, Sam2VideoProcessor
 from PIL import Image
 import spaces
@@ -25,7 +28,9 @@ def initialize_model():
     if torch.cuda.is_available():
         device = torch.device("cuda")
-        dtype = torch.float32
     elif torch.backends.mps.is_available():
         device = torch.device("mps")
         dtype = torch.float32
@@ -33,7 +38,7 @@ def initialize_model():
         device = torch.device("cpu")
         dtype = torch.float32
-    print(f"Loading SAM2 model on {device}...")
     model = Sam2VideoModel.from_pretrained(MODEL_NAME).to(device, dtype=dtype)
     processor = Sam2VideoProcessor.from_pretrained(MODEL_NAME)
@@ -43,7 +48,7 @@ def initialize_model():
 def load_video_cv2(video_path):
-    """Load video using OpenCV."""
     cap = cv2.VideoCapture(video_path)
     frames = []
     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
@@ -52,6 +57,7 @@ def load_video_cv2(video_path):
         ret, frame = cap.read()
         if not ret:
             break
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         frames.append(Image.fromarray(frame_rgb))
@@ -68,27 +74,33 @@ def segment_video_simple(video_file, point_x, point_y, frame_idx, remove_bg):
         initialize_model()
     try:
         if video_file is None:
             return None, "❌ Error: No video file provided"
         video_path = str(video_file)
         if not os.path.exists(video_path):
-            return None, f"❌ Error: Video file not found"
         # Convert inputs
         point_x = int(float(point_x))
         point_y = int(float(point_y))
         frame_idx = int(float(frame_idx))
-        # Load video
         video_frames, video_info = load_video_cv2(video_path)
         fps = video_info.get('fps', 30.0)
         # Initialize inference session
         inference_session = processor.init_video_session(
             video=video_frames,
             inference_device=device,
-            dtype=torch.float32,
         )
         # Add annotation
@@ -100,8 +112,11 @@ def segment_video_simple(video_file, point_x, point_y, frame_idx, remove_bg):
             input_labels=[[[1]]],
         )
-        # Run inference
-        model(inference_session=inference_session, frame_idx=frame_idx)
         # Propagate through video
         video_segments = {}
@@ -114,7 +129,7 @@ def segment_video_simple(video_file, point_x, point_y, frame_idx, remove_bg):
             video_segments[sam2_output.frame_idx] = video_res_masks
         # Create output video
-        output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
         first_frame = np.array(video_frames[0])
         height, width = first_frame.shape[:2]
@@ -147,14 +162,16 @@ def segment_video_simple(video_file, point_x, point_y, frame_idx, remove_bg):
         out.release()
         if os.path.exists(output_path):
             return output_path, f"✅ Success! Processed {len(video_segments)} frames"
         else:
-            return None, "❌ Error: Output file was not created"
     except Exception as e:
         import traceback
-        traceback.print_exc()
         return None, f"❌ Error: {str(e)}"
@@ -163,16 +180,25 @@ def create_app():
     initialize_model()
     with gr.Blocks(title="SAM2 Video Background Remover") as app:
-        gr.Markdown("# 🎥 SAM2 Video Background Remover")
-        gr.Markdown("Remove backgrounds from videos by tracking objects with SAM2")
         with gr.Row():
             with gr.Column():
                 video_input = gr.File(label="Upload Video", file_types=["video"])
                 with gr.Row():
-                    point_x = gr.Textbox(label="Point X", value="360")
-                    point_y = gr.Textbox(label="Point Y", value="640")
                 frame_idx = gr.Textbox(label="Frame Index", value="0")
                 remove_bg = gr.Checkbox(label="Remove Background", value=True)
@@ -188,10 +214,21 @@ def create_app():
             inputs=[video_input, point_x, point_y, frame_idx, remove_bg],
             outputs=[output_video, status_text]
         )
     return app
 if __name__ == "__main__":
     app = create_app()
-    app.launch(share=True)

 """
+SAM2 Video Segmentation Space - SIMPLIFIED VERSION
+Removes background from videos by tracking specified objects.
 """
 import gradio as gr
 import numpy as np
 import cv2
 import tempfile
+import json
 import os
+from typing import List, Tuple, Optional, Dict, Any
 from transformers import Sam2VideoModel, Sam2VideoProcessor
 from PIL import Image
 import spaces
     if torch.cuda.is_available():
         device = torch.device("cuda")
+        dtype = torch.float32  # Use float32 for universal GPU compatibility
+        print(f"CUDA available: {torch.cuda.is_available()}")
+        print(f"CUDA device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
     elif torch.backends.mps.is_available():
         device = torch.device("mps")
         dtype = torch.float32
         device = torch.device("cpu")
         dtype = torch.float32
+    print(f"Loading SAM2 model on {device} with dtype {dtype}...")
     model = Sam2VideoModel.from_pretrained(MODEL_NAME).to(device, dtype=dtype)
     processor = Sam2VideoProcessor.from_pretrained(MODEL_NAME)
 def load_video_cv2(video_path):
+    """Load video using OpenCV to preserve orientation."""
     cap = cv2.VideoCapture(video_path)
     frames = []
     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
         ret, frame = cap.read()
         if not ret:
             break
+        # Convert BGR to RGB
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         frames.append(Image.fromarray(frame_rgb))
         initialize_model()
     try:
+        # Handle video_file - gr.File passes it as a string path directly
         if video_file is None:
             return None, "❌ Error: No video file provided"
+        # gr.File returns the file path as a string
         video_path = str(video_file)
         if not os.path.exists(video_path):
+            return None, f"❌ Error: Video file not found: {video_path}"
+        print(f"Processing video from: {video_path}")
         # Convert inputs
         point_x = int(float(point_x))
         point_y = int(float(point_y))
         frame_idx = int(float(frame_idx))
+        # Load video using OpenCV to preserve orientation
         video_frames, video_info = load_video_cv2(video_path)
         fps = video_info.get('fps', 30.0)
         # Initialize inference session
+        dtype = torch.float32  # Use float32 for universal compatibility
         inference_session = processor.init_video_session(
             video=video_frames,
             inference_device=device,
+            dtype=dtype,
         )
         # Add annotation
             input_labels=[[[1]]],
         )
+        # Run inference on first frame
+        outputs = model(
+            inference_session=inference_session,
+            frame_idx=frame_idx,
+        )
         # Propagate through video
         video_segments = {}
             video_segments[sam2_output.frame_idx] = video_res_masks
         # Create output video
+        output_path = tempfile.mktemp(suffix=".mp4")
         first_frame = np.array(video_frames[0])
         height, width = first_frame.shape[:2]
         out.release()
+        # Return the video file path (Gradio will handle it)
         if os.path.exists(output_path):
             return output_path, f"✅ Success! Processed {len(video_segments)} frames"
         else:
+            return None, f"❌ Error: Output file was not created"
     except Exception as e:
         import traceback
+        error_details = traceback.format_exc()
+        print(f"Error in segment_video_simple: {error_details}")
         return None, f"❌ Error: {str(e)}"
     initialize_model()
     with gr.Blocks(title="SAM2 Video Background Remover") as app:
+        gr.Markdown("""
+        # 🎥 SAM2 Video Background Remover
+        Remove backgrounds from videos by tracking objects with Meta's SAM2.
+        **How to use:**
+        1. Upload a video
+        2. Enter X, Y coordinates of the object to track (from first frame)
+        3. Click "Process Video"
+        """)
         with gr.Row():
             with gr.Column():
+                # Using gr.File instead of gr.Video for better API compatibility
                 video_input = gr.File(label="Upload Video", file_types=["video"])
                 with gr.Row():
+                    point_x = gr.Textbox(label="Point X", value="320")
+                    point_y = gr.Textbox(label="Point Y", value="240")
                 frame_idx = gr.Textbox(label="Frame Index", value="0")
                 remove_bg = gr.Checkbox(label="Remove Background", value=True)
             inputs=[video_input, point_x, point_y, frame_idx, remove_bg],
             outputs=[output_video, status_text]
         )
+        gr.Markdown("""
+        ### Tips:
+        - Point X, Y: Coordinates of the object in the video
+        - For a 720x1280 portrait video, center is typically X=360, Y=640
+        - For a 1920x1080 landscape video, center is typically X=960, Y=540
+        - Frame Index: Usually 0 (first frame)
+        - Processing time depends on video length (CPU processing is slow)
+        - Portrait and landscape videos are both supported!
+        """)
     return app
 if __name__ == "__main__":
     app = create_app()
+    app.launch(share=True, show_error=True)