diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..4d06260e737045cf5025b38dbd80cbb3c88bdc83
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,49 @@
+# Dockerfile for Hugging Face Spaces (CPU-only image).
+
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# System libraries required by OpenCV and for building/installing SAM2.
+# libgl1 is used instead of libgl1-mesa-glx: the latter was only a transitional
+# package and is no longer shipped by current Debian releases, so it cannot be
+# installed once the floating python:3.10-slim tag moves to a newer base.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    git \
+    libgl1 \
+    libglib2.0-0 \
+    libgomp1 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Python dependencies first: this layer is reused until requirements.txt changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Vendored SAM2 sources, installed in editable mode (the directory must stay in the image).
+COPY sam2_repo sam2_repo
+RUN pip install --no-cache-dir -e ./sam2_repo
+
+# Download the tiny checkpoint (lightest model for CPU) before copying the
+# application code, so editing the app does not trigger a new download.
+COPY download_model.py .
+RUN mkdir -p checkpoints && python download_model.py tiny
+
+# Application code and the static demo pages.
+COPY app.py web_demo.html web_demo_advanced.html ./
+
+# Hugging Face Spaces routes traffic to port 7860.
+ENV PORT=7860
+EXPOSE 7860
+
+# The shell is only needed to expand ${PORT}; `exec` replaces it with uvicorn
+# so the server runs as PID 1 and receives SIGTERM directly.
+CMD ["sh", "-c", "exec python -m uvicorn app:app --host 0.0.0.0 --port \"${PORT}\""]
+
diff --git a/README.md b/README.md
index 11394d1107ef4c653343fc0748478a871bec9440..9db51607b7209ff303885e6e34dfe4ba1ad0372f 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,174 @@
---
-title: Sam2 Api
-emoji: 📈
+title: SAM2 Segmentation API
+emoji: 🎯
colorFrom: purple
-colorTo: pink
+colorTo: blue
sdk: docker
+app_port: 7860
pinned: false
license: apache-2.0
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# 🎯 SAM2 Segmentation API
+
+Мощный REST API для сегментации объектов на изображениях с использованием Meta SAM2 (Segment Anything Model 2).
+
+## ✨ Возможности
+
+- **🎯 Box Prompts** - выделение прямоугольником
+- **🖌️ Brush Prompts** - рисование кистью (зеленый = объект, красный = фон, белый = объект)
+- **📍 Point Prompts** - клики по объектам
+- **🔥 Batch API** - обработка множественных объектов за один запрос
+- **🖼️ Extract Objects** - автоматическое извлечение объектов с прозрачностью
+- **⚡ REST API** - полная документация в Swagger UI
+
+## 🚀 Быстрый старт
+
+### Web интерфейс
+
+После запуска Space откройте:
+
+- **Простой интерфейс**: `/web` - Box промпты
+- **Продвинутый**: `/web/advanced` - Box + Brush промпты
+- **API документация**: `/docs` - Swagger UI
+
+### API Endpoints
+
+#### POST `/segment/batch` - Батчинг API (рекомендуется)
+
+Обрабатывает множественные объекты за один запрос.
+
+**Пример запроса:**
+```json
+{
+ "image": "data:image/jpeg;base64,...",
+ "prompts": [
+ {
+ "id": 0,
+ "type": "mask",
+ "data": "data:image/png;base64,...",
+ "label": "person",
+ "selected": true
+ }
+ ],
+ "options": {
+ "extract_objects": true,
+ "include_masks": false,
+ "clean_masks": true
+ }
+}
+```
+
+#### POST `/segment` - Простая сегментация
+
+С box промптом:
+```bash
+curl -X POST "https://YOUR-SPACE.hf.space/segment?box_x1=50&box_y1=50&box_x2=300&box_y2=400&extract_objects=true" \
+ -F "file=@image.jpg"
+```
+
+## 📊 Производительность
+
+⚠️ **CPU Version**: Работает на бесплатном CPU tier Hugging Face Spaces. Скорость обработки: ~5-10 секунд на изображение.
+
+Для более быстрой обработки рекомендуется перейти на GPU (Settings → Hardware).
+
+## 🎨 Форматы масок
+
+API поддерживает несколько форматов масок:
+
+- **🟢 Зеленый** (R<100, G>150, B<100) - foreground (объект)
+- **⚪ Белый** (R>200, G>200, B>200) - foreground (объект)
+- **🔴 Красный** (R>150, G<100, B<100) - background (исключить)
+
+## 🔧 Технологии
+
+- Meta SAM2 2.1 (Segment Anything Model)
+- FastAPI
+- PyTorch
+- OpenCV
+- Pydantic
+
+## 📝 Примеры использования
+
+### Python
+
+```python
+import requests
+import base64
+
+# Загрузить изображение
+with open("image.jpg", "rb") as f:
+ image_b64 = base64.b64encode(f.read()).decode()
+
+# Отправить запрос
+response = requests.post(
+ "https://YOUR-SPACE.hf.space/segment/batch",
+ json={
+ "image": f"data:image/jpeg;base64,{image_b64}",
+ "prompts": [{
+ "id": 0,
+ "type": "box",
+ "data": "",
+ "bbox": {"x_min": 0.1, "y_min": 0.2, "x_max": 0.5, "y_max": 0.8},
+ "label": "person",
+ "selected": True
+ }],
+ "options": {"extract_objects": True}
+ }
+)
+
+result = response.json()
+print(f"Обработано объектов: {len(result['results'])}")
+```
+
+### JavaScript
+
+```javascript
+const response = await fetch('https://YOUR-SPACE.hf.space/segment/batch', {
+ method: 'POST',
+ headers: {'Content-Type': 'application/json'},
+ body: JSON.stringify({
+ image: imageBase64,
+ prompts: [{
+ id: 0,
+ type: "box",
+ data: "",
+ bbox: {x_min: 0.1, y_min: 0.2, x_max: 0.5, y_max: 0.8},
+ label: "person",
+ selected: true
+ }],
+ options: {extract_objects: true}
+ })
+});
+
+const result = await response.json();
+console.log(`Обработано: ${result.results.length} объектов`);
+```
+
+## 📚 Документация
+
+Полная интерактивная документация доступна по адресу `/docs` после запуска Space.
+
+## 🤝 Поддержка
+
+- Модель: SAM 2.1 Hiera Tiny (для CPU)
+- Форматы изображений: JPG, PNG, WEBP, BMP
+- Максимальный размер: рекомендуется до 2048x2048px для разумной скорости
+
+## ⚡ Оптимизация для мобильных приложений
+
+1. Уменьшайте размер изображения перед отправкой (1024x1024)
+2. Используйте `include_masks: false` если контуры не нужны
+3. Кэшируйте результаты на клиенте
+4. Используйте батчинг API для множественных объектов
+
+## 📄 Лицензия
+
+Apache 2.0
+
+## 🔗 Ссылки
+
+- [SAM2 GitHub](https://github.com/facebookresearch/sam2)
+- [SAM2 Paper](https://arxiv.org/abs/2408.00714)
+
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7175636070147a04a7c3466706ce20ca7f7fc56
--- /dev/null
+++ b/app.py
@@ -0,0 +1,1109 @@
+"""
+REST API сервер для сегментации изображений через SAM2.
+Уставший сеньор кодит это в 3 часа ночи, поэтому код местами будет грязный.
+"""
+
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, File, UploadFile, HTTPException, Query, Body
+from fastapi.responses import JSONResponse, HTMLResponse, FileResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from PIL import Image
+import numpy as np
+import torch
+import io
+import os
+import base64
+import cv2
+from typing import List, Dict, Any, Optional, Literal
+import logging
+from datetime import datetime
+import json
+
+# Configure logging up front: log output is the main debugging aid for this service.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Module-level model state. Both start as None and are assigned by load_model(),
+# so the model does not have to be loaded again for every request.
+predictor = None
+device = None
+
+# ===== Pydantic модели для батчинг API =====
+
+# Request schema for a bounding box: four required float coordinates
+# (top-left corner x_min/y_min, bottom-right corner x_max/y_max).
+# NOTE(review): the class docstring and the Field descriptions are presumably
+# published in the generated API schema, so they are left verbatim — confirm
+# before rewording or translating them.
+class BBoxModel(BaseModel):
+ """Bounding box в нормализованных координатах (0.0 - 1.0) или пиксельных"""
+ x_min: float = Field(..., description="X координата левого верхнего угла")
+ y_min: float = Field(..., description="Y координата левого верхнего угла")
+ x_max: float = Field(..., description="X координата правого нижнего угла")
+ y_max: float = Field(..., description="Y координата правого нижнего угла")
+
+# Request schema for one segmentation prompt.
+# - id, type and data are required; type is restricted to "mask", "box" or "points".
+# - data is required even for box prompts (it has no default), so callers must
+#   send at least an empty string.
+# - bbox and label are optional (default None); selected defaults to True.
+# NOTE(review): docstring and Field descriptions are presumably published in the
+# generated API schema, so they are left verbatim.
+class PromptModel(BaseModel):
+ """Промпт для сегментации одного объекта"""
+ id: int = Field(..., description="Уникальный ID объекта")
+ type: Literal["mask", "box", "points"] = Field(..., description="Тип промпта")
+ data: str = Field(..., description="Данные промпта (base64 для mask, JSON для points)")
+ bbox: Optional[BBoxModel] = Field(None, description="Опциональный bounding box")
+ label: Optional[str] = Field(None, description="Метка объекта (person, car, etc)")
+ selected: bool = Field(True, description="Обрабатывать ли этот промпт")
+
+# Per-request segmentation switches. Defaults: extract_objects=True,
+# include_masks=False, clean_masks=True.
+# NOTE(review): docstring and Field descriptions are presumably published in the
+# generated API schema, so they are left verbatim.
+class SegmentOptionsModel(BaseModel):
+ """Опции сегментации"""
+ extract_objects: bool = Field(True, description="Вернуть вырезанные объекты")
+ include_masks: bool = Field(False, description="Включить контуры масок")
+ clean_masks: bool = Field(True, description="Очистить маски от артефактов")
+
+# Body of a batch segmentation request: a required image string, a required
+# list of prompts, and options that fall back to a default SegmentOptionsModel
+# instance (built via default_factory) when omitted.
+# NOTE(review): docstring and Field descriptions are presumably published in the
+# generated API schema, so they are left verbatim.
+class BatchSegmentRequest(BaseModel):
+ """Запрос на батчинг сегментацию"""
+ image: str = Field(..., description="Изображение в base64 (с data URL или без)")
+ prompts: List[PromptModel] = Field(..., description="Массив промптов")
+ options: Optional[SegmentOptionsModel] = Field(default_factory=SegmentOptionsModel)
+
+# Result for a single segmented object. id, bbox, area, center and confidence
+# are required; label, extracted_image, contours and mask_rle default to None.
+# NOTE(review): the docstring is presumably published in the generated API
+# schema, so it is left verbatim.
+class SegmentResultModel(BaseModel):
+ """Результат сегментации одного объекта"""
+ id: int
+ label: Optional[str] = None
+ bbox: Dict[str, Any]
+ area: int
+ center: Dict[str, int]
+ confidence: float
+ extracted_image: Optional[str] = None
+ contours: Optional[List[Dict[str, Any]]] = None
+ mask_rle: Optional[Dict[str, Any]] = None
+
+# Response of the batch endpoint: a success flag, the image size as a
+# str -> int mapping, and one SegmentResultModel per returned object.
+# NOTE(review): the docstring is presumably published in the generated API
+# schema, so it is left verbatim.
+class BatchSegmentResponse(BaseModel):
+ """Ответ батчинг сегментации"""
+ success: bool
+ image_size: Dict[str, int]
+ results: List[SegmentResultModel]
+
+def save_batch_request_log(request_data: dict, response_data: dict, image_width: int, image_height: int):
+ """
+ Сохраняет запрос батчинга для аудита и дебага.
+ Создает папку с timestamp и сохраняет только метаданные:
+ 1. Лог запроса (request.json) - параметры без base64
+ 2. Лог ответа (response.json) - результаты без base64
+ 3. Краткую сводку (summary.json)
+
+ ⚠️ Изображения и маски НЕ сохраняются для безопасности!
+ """
+ try:
+ # Создаем корневую папку для логов
+ logs_dir = "batch_logs"
+ os.makedirs(logs_dir, exist_ok=True)
+
+ # Создаем папку с timestamp
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3] # Миллисекунды
+ request_dir = os.path.join(logs_dir, timestamp)
+ os.makedirs(request_dir, exist_ok=True)
+
+ logger.info(f"📁 Сохраняю лог запроса в: {request_dir}")
+
+ # Сохраняем запрос (без base64 для безопасности)
+ request_log = {
+ "timestamp": timestamp,
+ "image_size": {
+ "width": image_width,
+ "height": image_height
+ },
+ "prompts": [
+ {
+ "id": p.get("id"),
+ "type": p.get("type"),
+ "label": p.get("label"),
+ "bbox": p.get("bbox"),
+ "selected": p.get("selected"),
+ "data_length": len(p.get("data", "")) # Длина вместо самих данных
+ }
+ for p in request_data.get("prompts", [])
+ ],
+ "options": request_data.get("options", {})
+ }
+
+ request_path = os.path.join(request_dir, "request.json")
+ with open(request_path, "w", encoding="utf-8") as f:
+ json.dump(request_log, f, indent=2, ensure_ascii=False)
+ logger.info(f" ✓ Сохранен лог запроса: {request_path}")
+
+ # 4. Сохраняем ответ (без base64 объектов)
+ response_log = {
+ "timestamp": timestamp,
+ "success": response_data.get("success"),
+ "image_size": response_data.get("image_size"),
+ "results": [
+ {
+ "id": r.get("id"),
+ "label": r.get("label"),
+ "bbox": r.get("bbox"),
+ "area": r.get("area"),
+ "center": r.get("center"),
+ "confidence": r.get("confidence"),
+ "has_extracted_image": "extracted_image" in r,
+ "has_contours": "contours" in r
+ }
+ for r in response_data.get("results", [])
+ ]
+ }
+
+ response_path = os.path.join(request_dir, "response.json")
+ with open(response_path, "w", encoding="utf-8") as f:
+ json.dump(response_log, f, indent=2, ensure_ascii=False)
+ logger.info(f" ✓ Сохранен лог ответа: {response_path}")
+
+ # 3. Создаем summary файл
+ summary = {
+ "timestamp": timestamp,
+ "processed_prompts": len(response_data.get("results", [])),
+ "total_prompts": len(request_data.get("prompts", [])),
+ "selected_prompts": len([p for p in request_data.get("prompts", []) if p.get("selected", True)]),
+ "image_size": f"{image_width}x{image_height}",
+ "prompt_types": [p.get("type") for p in request_data.get("prompts", [])],
+ "files": {
+ "request": "request.json",
+ "response": "response.json"
+ }
+ }
+
+ summary_path = os.path.join(request_dir, "summary.json")
+ with open(summary_path, "w", encoding="utf-8") as f:
+ json.dump(summary, f, indent=2, ensure_ascii=False)
+
+ logger.info(f"✅ Лог запроса сохранен: {request_dir}")
+
+ except Exception as e:
+ logger.error(f"❌ Ошибка при сохранении лога: {e}")
+ # Не прерываем обработку запроса если не удалось сохранить лог
+
+def load_model(checkpoint_path: str = "checkpoints/sam2.1_hiera_tiny.pt"):
+ """
+ Загружает модель SAM2.
+ Вызывается один раз при старте сервера.
+ """
+ global predictor, device
+
+ try:
+ from sam2.build_sam import build_sam2
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ # Проверяем CUDA
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ logger.info(f"Используем устройство: {device}")
+
+ if device == "cpu":
+ logger.warning("CUDA недоступна, работаем на CPU (будет медленно как черепаха)")
+
+ # Определяем конфиг по имени файла чекпоинта
+ # Указываем путь относительно configs/ директории в пакете sam2
+ checkpoint_name = os.path.basename(checkpoint_path)
+ if "tiny" in checkpoint_name:
+ config = "configs/sam2.1/sam2.1_hiera_t.yaml"
+ elif "small" in checkpoint_name:
+ config = "configs/sam2.1/sam2.1_hiera_s.yaml"
+ elif "base_plus" in checkpoint_name:
+ config = "configs/sam2.1/sam2.1_hiera_b+.yaml"
+ elif "large" in checkpoint_name:
+ config = "configs/sam2.1/sam2.1_hiera_l.yaml"
+ else:
+ logger.warning(f"Неизвестный тип модели, пробую tiny конфиг")
+ config = "configs/sam2.1/sam2.1_hiera_t.yaml"
+
+ logger.info(f"Загружаю модель из {checkpoint_path}")
+ logger.info(f"Конфиг: {config}")
+
+ sam2_model = build_sam2(config, checkpoint_path, device=device)
+ predictor = SAM2ImagePredictor(sam2_model)
+
+ logger.info("✓ Модель загружена успешно")
+
+ except Exception as e:
+ logger.error(f"Не удалось загрузить модель: {e}")
+ logger.error("Убедись что SAM2 установлен (./install_sam2.sh)")
+ raise
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ """Загружаем модель при старте, выгружаем при остановке"""
+ # Startup
+ checkpoint_dir = "checkpoints"
+ if os.path.exists(checkpoint_dir):
+ checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pt")]
+ if checkpoints:
+ checkpoint_path = os.path.join(checkpoint_dir, checkpoints[0])
+ load_model(checkpoint_path)
+ else:
+ logger.error("Нет чекпоинтов в директории checkpoints/")
+ logger.error("Запусти: python download_model.py")
+ else:
+ logger.error("Директория checkpoints/ не найдена")
+
+ yield # Сервер работает
+
+ # Shutdown (если нужна очистка)
+
+# Создаем FastAPI приложение с lifespan
+app = FastAPI(
+ title="SAM2 Segmentation API",
+ description="API для автоматической сегментации объектов на изображениях",
+ version="1.0.0",
+ lifespan=lifespan
+)
+
+# Добавляем CORS для работы с веб-интерфейсом
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"], # В продакшене указать конкретные домены
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+@app.get("/")
+async def root():
+ """Главная страница - информация об API"""
+ return {
+ "message": "SAM2 Segmentation API работает",
+ "version": "2.0.0",
+ "web_ui": {
+ "simple": "/web - Box промпты",
+ "advanced": "/web/advanced - Box + Brush промпты (рисование)"
+ },
+ "docs": "/docs",
+ "endpoints": {
+ "POST /segment": "Сегментация изображения (поддерживает points, box, mask via query params)",
+ "POST /segment/batch": "🔥 Батчинг сегментация (JSON API для множественных объектов)",
+ "POST /segment/auto": "Автоматическая сегментация всех объектов",
+ "GET /health": "Проверка здоровья сервиса"
+ }
+ }
+
+@app.get("/web", response_class=HTMLResponse)
+async def web_interface():
+ """Веб-интерфейс для тестирования Box Prompts (простой)"""
+ web_demo_path = os.path.join(os.path.dirname(__file__), "web_demo.html")
+ if os.path.exists(web_demo_path):
+ with open(web_demo_path, "r", encoding="utf-8") as f:
+ return f.read()
+ else:
+ return "
Веб-интерфейс не найден
Файл web_demo.html отсутствует
"
+
+@app.get("/web/advanced", response_class=HTMLResponse)
+async def web_interface_advanced():
+ """Продвинутый веб-интерфейс с Box + Brush промптами"""
+ web_demo_path = os.path.join(os.path.dirname(__file__), "web_demo_advanced.html")
+ if os.path.exists(web_demo_path):
+ with open(web_demo_path, "r", encoding="utf-8") as f:
+ return f.read()
+ else:
+ return "
Продвинутый интерфейс не найден
Файл web_demo_advanced.html отсутствует
"
+
+@app.get("/health")
+async def health():
+ """Проверка что всё ок"""
+ return {
+ "status": "healthy" if predictor is not None else "model not loaded",
+ "device": str(device) if device else "unknown"
+ }
+
+def process_image(image_bytes: bytes) -> np.ndarray:
+ """Конвертирует байты в numpy array"""
+ image = Image.open(io.BytesIO(image_bytes))
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+ return np.array(image)
+
+def masks_to_coords(masks: np.ndarray, include_contours: bool = False) -> List[Dict[str, Any]]:
+ """
+ Конвертирует маски в координаты bounding box и контуров.
+ masks: (N, H, W) - N масок
+ include_contours: если True, добавляет контуры масок
+ """
+ results = []
+
+ for i, mask in enumerate(masks):
+ # Находим координаты пикселей маски
+ y_coords, x_coords = np.where(mask > 0)
+
+ if len(x_coords) == 0:
+ continue
+
+ # Bounding box
+ x_min, x_max = int(x_coords.min()), int(x_coords.max())
+ y_min, y_max = int(y_coords.min()), int(y_coords.max())
+
+ # Площадь сегмента
+ area = int(mask.sum())
+
+ segment_data = {
+ "segment_id": i,
+ "bbox": {
+ "x_min": x_min,
+ "y_min": y_min,
+ "x_max": x_max,
+ "y_max": y_max,
+ "width": x_max - x_min,
+ "height": y_max - y_min
+ },
+ "area": area,
+ "center": {
+ "x": int(x_coords.mean()),
+ "y": int(y_coords.mean())
+ }
+ }
+
+ # Добавляем контуры если нужно
+ if include_contours:
+ try:
+ # Конвертируем маску в uint8 (защита от булевых масок)
+ if mask.dtype == bool:
+ mask_uint8 = mask.astype(np.uint8) * 255
+ else:
+ mask_uint8 = (mask * 255).astype(np.uint8)
+
+ # Находим контуры с иерархией для поддержки "дыр"
+ # RETR_CCOMP: находит внешние контуры И внутренние дыры (holes)
+ # CHAIN_APPROX_NONE: сохраняет ВСЕ точки для pixel-perfect результата
+ contours, hierarchy = cv2.findContours(mask_uint8, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+ except Exception as e:
+ logger.warning(f"Ошибка при извлечении контуров: {e}, использую fallback")
+ # Fallback на простое извлечение без иерархии
+ contours, hierarchy = cv2.findContours(mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+ hierarchy = None
+
+ # Конвертируем контуры в список точек с учетом иерархии
+ contour_data = []
+
+ if hierarchy is not None and len(contours) > 0:
+ hierarchy = hierarchy[0] # OpenCV возвращает hierarchy в странном формате
+
+ for i, contour in enumerate(contours):
+ try:
+ # Небольшое упрощение только для очень больших контуров
+ if len(contour) > 1000:
+ arc_length = cv2.arcLength(contour, True)
+ if arc_length > 0: # Защита от деления на 0
+ epsilon = 0.0005 * arc_length
+ approx = cv2.approxPolyDP(contour, epsilon, True)
+ else:
+ approx = contour
+ else:
+ approx = contour
+
+ # Конвертируем в список [x, y]
+ points = [[int(point[0][0]), int(point[0][1])] for point in approx]
+
+ if len(points) > 2:
+ # hierarchy[i] = [Next, Previous, First_Child, Parent]
+ # Если Parent == -1, это внешний контур
+ # Если Parent >= 0, это дыра (hole) внутри родительского контура
+ is_hole = hierarchy[i][3] != -1
+
+ contour_data.append({
+ "points": points,
+ "is_hole": is_hole
+ })
+ except Exception as e:
+ logger.warning(f"Ошибка при обработке контура {i}: {e}")
+ continue
+ else:
+ # Fallback если hierarchy не вернулась
+ for contour in contours:
+ try:
+ if len(contour) > 1000:
+ arc_length = cv2.arcLength(contour, True)
+ if arc_length > 0:
+ epsilon = 0.0005 * arc_length
+ approx = cv2.approxPolyDP(contour, epsilon, True)
+ else:
+ approx = contour
+ else:
+ approx = contour
+
+ points = [[int(point[0][0]), int(point[0][1])] for point in approx]
+ if len(points) > 2:
+ contour_data.append({
+ "points": points,
+ "is_hole": False
+ })
+ except Exception as e:
+ logger.warning(f"Ошибка при обработке контура: {e}")
+ continue
+
+ segment_data["contours"] = contour_data if len(contour_data) > 0 else []
+
+ # Также добавляем RLE (Run-Length Encoding) для компактного представления
+ # Это полезно если нужно восстановить точную маску
+ segment_data["mask_rle"] = mask_to_rle(mask)
+
+ results.append(segment_data)
+
+ return results
+
+def mask_to_rle(mask: np.ndarray) -> Dict[str, Any]:
+ """
+ Конвертирует бинарную маску в RLE (Run-Length Encoding)
+ Компактное представление маски
+ """
+ # Конвертируем в int если это bool
+ if mask.dtype == bool:
+ pixels = mask.astype(np.uint8).flatten()
+ else:
+ pixels = mask.flatten()
+
+ pixels = np.concatenate([[0], pixels, [0]])
+ runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
+ runs[1::2] -= runs[::2]
+
+ return {
+ "counts": [int(x) for x in runs], # Конвертируем numpy int в Python int
+ "size": [int(x) for x in mask.shape] # Конвертируем в Python int
+ }
+
+def convert_to_native_types(obj):
+ """
+ Рекурсивно конвертирует numpy типы в нативные Python типы
+ Нужно для сериализации в JSON через FastAPI
+ """
+ if isinstance(obj, np.integer):
+ return int(obj)
+ elif isinstance(obj, np.floating):
+ return float(obj)
+ elif isinstance(obj, np.ndarray):
+ return obj.tolist()
+ elif isinstance(obj, np.bool_):
+ return bool(obj)
+ elif isinstance(obj, dict):
+ return {key: convert_to_native_types(value) for key, value in obj.items()}
+ elif isinstance(obj, list):
+ return [convert_to_native_types(item) for item in obj]
+ return obj
+
+def clean_mask(mask: np.ndarray, min_area: int = 100) -> np.ndarray:
+ """
+ Очищает маску от мелких артефактов и дыр.
+
+ mask: бинарная маска (H, W)
+ min_area: минимальная площадь компонента в пикселях
+
+ Returns: очищенная маска
+ """
+ # Конвертируем в uint8 если нужно
+ if mask.dtype == bool:
+ mask_uint8 = mask.astype(np.uint8) * 255
+ else:
+ mask_uint8 = (mask * 255).astype(np.uint8)
+
+ # Морфологическое закрытие для удаления мелких дыр
+ kernel = np.ones((3, 3), np.uint8)
+ mask_uint8 = cv2.morphologyEx(mask_uint8, cv2.MORPH_CLOSE, kernel, iterations=2)
+
+ # Морфологическое открытие для удаления мелких шумов
+ mask_uint8 = cv2.morphologyEx(mask_uint8, cv2.MORPH_OPEN, kernel, iterations=1)
+
+ # Находим все связанные компоненты
+ num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(mask_uint8, connectivity=8)
+
+ # Создаем чистую маску
+ clean_mask = np.zeros_like(mask_uint8)
+
+ # Оставляем только большие компоненты
+ for i in range(1, num_labels): # Пропускаем фон (0)
+ area = stats[i, cv2.CC_STAT_AREA]
+ if area >= min_area:
+ clean_mask[labels == i] = 255
+
+ # Если ничего не осталось, возвращаем самый большой компонент
+ if clean_mask.sum() == 0 and num_labels > 1:
+ # Находим самый большой компонент
+ largest_component = 1 + np.argmax([stats[i, cv2.CC_STAT_AREA] for i in range(1, num_labels)])
+ clean_mask[labels == largest_component] = 255
+
+ return (clean_mask > 127).astype(bool)
+
+def extract_object_image(image: np.ndarray, mask: np.ndarray, clean: bool = True) -> str:
+ """
+ Вырезает объект из изображения по маске и возвращает base64 PNG с прозрачностью.
+
+ image: RGB изображение (H, W, 3)
+ mask: бинарная маска (H, W)
+ clean: применить постобработку для удаления артефактов
+
+ Returns: base64 строка PNG изображения с альфа-каналом
+ """
+ # Конвертируем маску в bool если нужно
+ if mask.dtype != bool:
+ mask = mask > 0.5
+
+ # Очищаем маску от артефактов
+ if clean:
+ mask = clean_mask(mask, min_area=100)
+
+ # Создаем RGBA изображение
+ h, w = image.shape[:2]
+ rgba = np.zeros((h, w, 4), dtype=np.uint8)
+ rgba[:, :, :3] = image # RGB каналы
+ rgba[:, :, 3] = (mask * 255).astype(np.uint8) # Alpha канал из маски
+
+ # Конвертируем в PIL Image
+ pil_image = Image.fromarray(rgba, 'RGBA')
+
+ # Конвертируем в base64
+ buffer = io.BytesIO()
+ pil_image.save(buffer, format='PNG')
+ buffer.seek(0)
+ img_base64 = base64.b64encode(buffer.read()).decode('utf-8')
+
+ return f"data:image/png;base64,{img_base64}"
+
+@app.post("/segment")
+async def segment_image(
+ file: UploadFile = File(...),
+ point_x: List[float] = Query(None, description="X координаты точек промпта"),
+ point_y: List[float] = Query(None, description="Y координаты точек промпта"),
+ point_labels: List[int] = Query(None, description="Лейблы точек (1=foreground, 0=background)"),
+ box_x1: float = Query(None, description="X координата левого верхнего угла бокса"),
+ box_y1: float = Query(None, description="Y координата левого верхнего угла бокса"),
+ box_x2: float = Query(None, description="X координата правого нижнего угла бокса"),
+ box_y2: float = Query(None, description="Y координата правого нижнего угла бокса"),
+ mask_data: str = Query(None, description="Base64 закодированная маска (PNG с альфа-каналом)"),
+ include_masks: bool = Query(True, description="Включить контуры масок в ответ"),
+ extract_objects: bool = Query(False, description="Вернуть вырезанные объекты как base64 PNG"),
+):
+ """
+ Сегментирует изображение по промпту (точкам, боксу, маске или их комбинации).
+
+ Поддерживаемые промпты:
+ - Точки (point_x, point_y, point_labels) - клики пользователя
+ - Бокс (box_x1, box_y1, box_x2, box_y2) - прямоугольное выделение
+ - Маска (mask_data) - нарисованная кистью маска (зеленый=foreground, красный=background)
+ - Комбинация промптов - для максимальной точности
+
+ Если промпты не указаны, сегментирует центральный объект.
+ Если include_masks=True, возвращает контуры масок для точной отрисовки.
+ Если extract_objects=True, возвращает готовые вырезанные объекты как base64 PNG.
+ """
+ if predictor is None:
+ raise HTTPException(status_code=503, detail="Модель не загружена, перезапусти сервер")
+
+ try:
+ # Читаем изображение
+ image_bytes = await file.read()
+ image = process_image(image_bytes)
+
+ logger.info(f"Обрабатываю изображение: {image.shape}")
+ logger.info(f"Параметры: include_masks={include_masks}, extract_objects={extract_objects}")
+
+ # Устанавливаем изображение в предиктор
+ predictor.set_image(image)
+
+ # Подготавливаем промпты
+ points = None
+ labels = None
+ box = None
+
+ # Проверяем наличие точек
+ if point_x and point_y:
+ if len(point_x) != len(point_y):
+ raise HTTPException(status_code=400, detail="Количество X и Y координат должно совпадать")
+ points = np.array([[x, y] for x, y in zip(point_x, point_y)])
+ labels = np.array(point_labels) if point_labels else np.ones(len(points))
+ logger.info(f"Промпт: {len(points)} точек")
+
+ # Проверяем наличие бокса
+ if all(v is not None for v in [box_x1, box_y1, box_x2, box_y2]):
+ box = np.array([box_x1, box_y1, box_x2, box_y2])
+ logger.info(f"Промпт: бокс [{box_x1:.1f}, {box_y1:.1f}, {box_x2:.1f}, {box_y2:.1f}]")
+
+ # Валидация бокса
+ if box_x2 <= box_x1 or box_y2 <= box_y1:
+ raise HTTPException(
+ status_code=400,
+ detail="Некорректный бокс: x2 должен быть больше x1, y2 больше y1"
+ )
+
+ # Проверяем наличие нарисованной маски
+ if mask_data:
+ logger.info("Обрабатываю нарисованную маску...")
+ try:
+ # Декодируем base64
+ if ',' in mask_data:
+ mask_data = mask_data.split(',')[1] # Убираем data:image/png;base64,
+
+ mask_bytes = base64.b64decode(mask_data)
+ mask_image = Image.open(io.BytesIO(mask_bytes)).convert('RGBA')
+ mask_array = np.array(mask_image)
+
+ # Извлекаем foreground и background пиксели
+ # Поддерживаем несколько форматов:
+ # 1. Зеленый (R<100, G>150, B<100) - классический foreground
+ # 2. Белый/светлый (R>200, G>200, B>200) - часто используется фронтами
+ # 3. Красный (R>150, G<100, B<100) - background
+
+ green_mask = (mask_array[:, :, 0] < 100) & (mask_array[:, :, 1] > 150) & (mask_array[:, :, 2] < 100) & (mask_array[:, :, 3] > 0)
+ white_mask = (mask_array[:, :, 0] > 200) & (mask_array[:, :, 1] > 200) & (mask_array[:, :, 2] > 200) & (mask_array[:, :, 3] > 0)
+ red_mask = (mask_array[:, :, 0] > 150) & (mask_array[:, :, 1] < 100) & (mask_array[:, :, 2] < 100) & (mask_array[:, :, 3] > 0)
+
+ # Объединяем зеленые и белые как foreground
+ foreground_mask = green_mask | white_mask
+
+ # Сэмплируем точки из закрашенных областей
+ mask_points = []
+ mask_labels = []
+
+ # Foreground точки (зеленые + белые)
+ foreground_coords = np.argwhere(foreground_mask)
+ if len(foreground_coords) > 0:
+ # Масштабируем к размеру исходного изображения
+ scale_y = image.shape[0] / mask_array.shape[0]
+ scale_x = image.shape[1] / mask_array.shape[1]
+
+ # Сэмплируем до 20 точек равномерно (меньше = стабильнее)
+ step = max(1, len(foreground_coords) // 20)
+ sampled = foreground_coords[::step][:20] # Максимум 20 точек
+
+ for y, x in sampled:
+ mask_points.append([x * scale_x, y * scale_y])
+ mask_labels.append(1) # foreground
+
+ # Background точки (красные)
+ red_coords = np.argwhere(red_mask)
+ if len(red_coords) > 0:
+ scale_y = image.shape[0] / mask_array.shape[0]
+ scale_x = image.shape[1] / mask_array.shape[1]
+
+ step = max(1, len(red_coords) // 20)
+ sampled = red_coords[::step][:20] # Максимум 20 точек
+
+ for y, x in sampled:
+ mask_points.append([x * scale_x, y * scale_y])
+ mask_labels.append(0) # background
+
+ if mask_points:
+ # Объединяем с существующими точками
+ if points is not None:
+ points = np.vstack([points, np.array(mask_points)])
+ labels = np.concatenate([labels, np.array(mask_labels)])
+ else:
+ points = np.array(mask_points)
+ labels = np.array(mask_labels)
+
+ logger.info(f"Промпт из маски: {len(mask_points)} точек ({np.sum(np.array(mask_labels) == 1)} foreground, {np.sum(np.array(mask_labels) == 0)} background)")
+ else:
+ logger.warning("Маска пустая или не содержит foreground (зеленых/белых) или background (красных) пикселей")
+
+ except Exception as e:
+ logger.error(f"Ошибка обработки маски: {e}")
+ raise HTTPException(status_code=400, detail=f"Некорректная маска: {str(e)}")
+
+ # Делаем предсказание с промптами
+ if points is not None or box is not None:
+ logger.info(f"Используем промпты: points={points is not None}, box={box is not None}")
+
+ # Если много точек (>10), используем single mask для стабильности
+ # Если мало точек или только box, используем multimask для вариативности
+ use_multimask = True
+ if points is not None and len(points) > 10:
+ use_multimask = False
+ logger.info("Много точек, используем single mask mode для стабильности")
+
+ masks, scores, logits = predictor.predict(
+ point_coords=points,
+ point_labels=labels,
+ box=box,
+ multimask_output=use_multimask,
+ )
+
+ # Если multimask, выбираем лучшую по score
+ if use_multimask and len(masks) > 1:
+ best_idx = np.argmax(scores)
+ masks = masks[best_idx:best_idx+1]
+ scores = scores[best_idx:best_idx+1]
+ logger.info(f"Выбрана маска {best_idx} с confidence {scores[0]:.3f}")
+ else:
+ # Автоматическая сегментация - берем центральную точку
+ logger.info("Промпты не указаны, сегментирую центральный объект")
+ h, w = image.shape[:2]
+ point = np.array([[w // 2, h // 2]])
+ label = np.array([1])
+
+ masks, scores, logits = predictor.predict(
+ point_coords=point,
+ point_labels=label,
+ multimask_output=True,
+ )
+
+ # Конвертируем маски в координаты (с контурами если нужно)
+ segments = masks_to_coords(masks, include_contours=include_masks)
+
+ logger.info(f"Найдено сегментов: {len(segments)}, масок: {len(masks)}")
+ logger.info(f"extract_objects = {extract_objects}")
+
+ # Добавляем confidence scores
+ for i, seg in enumerate(segments):
+ seg["confidence"] = float(scores[i]) if i < len(scores) else 0.0
+
+ # Если нужно - вырезаем объект и добавляем base64
+ logger.info(f"Обрабатываю сегмент {i}: extract_objects={extract_objects}, i < len(masks) = {i < len(masks)}")
+ if extract_objects and i < len(masks):
+ logger.info(f"Вырезаю объект {i}...")
+ seg["extracted_image"] = extract_object_image(image, masks[i])
+ logger.info(f"✓ Вырезан объект {i}, размер маски: {masks[i].sum()} пикселей")
+ else:
+ logger.warning(f"❌ Пропускаю объект {i}: extract_objects={extract_objects}")
+
+ result = {
+ "success": True,
+ "image_size": {
+ "width": int(image.shape[1]),
+ "height": int(image.shape[0])
+ },
+ "segments_count": len(segments),
+ "segments": segments
+ }
+
+ # Конвертируем все numpy типы в нативные Python типы
+ return convert_to_native_types(result)
+
+ except Exception as e:
+ logger.error(f"Ошибка при сегментации: {e}")
+ raise HTTPException(status_code=500, detail=f"Ошибка обработки: {str(e)}")
+
@app.post("/segment/auto")
async def segment_auto(
    file: UploadFile = File(...),
    points_per_side: int = Query(32, ge=1, description="Количество точек на сторону для автосегментации"),
    include_masks: bool = Query(True, description="Включить контуры масок в ответ"),
):
    """
    Automatically segment every object found on an uploaded image.

    The image is probed with a ``points_per_side`` x ``points_per_side`` grid of
    positive point prompts. Every prompt yields one mask; masks whose score is above
    0.5 are converted with ``masks_to_coords`` and then de-duplicated by the distance
    between their centres.

    Args:
        file: Uploaded image.
        points_per_side: Number of grid points along each axis (must be >= 1).
            The predictor is called ``points_per_side ** 2`` times, so large values
            are expensive.
        include_masks: Forwarded to ``masks_to_coords`` as ``include_contours``.

    Returns:
        A dict with ``success``, ``image_size``, ``segments_count`` and ``segments``,
        passed through ``convert_to_native_types``.

    Raises:
        HTTPException: 503 when the model is not loaded, 500 on a processing error.
    """
    if predictor is None:
        raise HTTPException(status_code=503, detail="Модель не загружена")

    try:
        image_bytes = await file.read()
        image = process_image(image_bytes)

        logger.info(f"Автосегментация изображения: {image.shape}")

        predictor.set_image(image)

        # Build the prompt grid from cell centres. A plain linspace(0, w, n) would put
        # prompts on the image border and at x == w / y == h, which lie outside the image.
        h, w = image.shape[:2]
        cell_centres = (np.arange(points_per_side) + 0.5) / points_per_side
        x_coords = cell_centres * w
        y_coords = cell_centres * h

        all_segments = []
        segment_id = 0
        positive_label = np.array([1])  # the same "foreground" label is used for every prompt

        # Ask the model for a single mask at every grid point.
        for y in y_coords:
            for x in x_coords:
                masks, scores, _ = predictor.predict(
                    point_coords=np.array([[x, y]]),
                    point_labels=positive_label,
                    multimask_output=False,
                )

                if masks.shape[0] > 0 and scores[0] > 0.5:  # confidence threshold
                    for seg in masks_to_coords(masks, include_contours=include_masks):
                        seg["segment_id"] = segment_id
                        seg["confidence"] = float(scores[0])
                        all_segments.append(seg)
                        segment_id += 1

        # Approximate de-duplication: a segment is dropped when its centre lies within
        # 50 px of the centre of a segment that was kept earlier.
        unique_segments = []
        for seg in all_segments:
            is_duplicate = False
            for unique_seg in unique_segments:
                dx = seg["center"]["x"] - unique_seg["center"]["x"]
                dy = seg["center"]["y"] - unique_seg["center"]["y"]
                if (dx**2 + dy**2) ** 0.5 < 50:
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_segments.append(seg)

        result = {
            "success": True,
            "image_size": {
                "width": int(image.shape[1]),
                "height": int(image.shape[0])
            },
            "segments_count": len(unique_segments),
            "segments": unique_segments
        }

        # Convert all numpy types into native Python types.
        return convert_to_native_types(result)

    except HTTPException:
        # Keep the status code of any HTTP error raised inside the block instead of
        # re-wrapping it as a 500.
        raise
    except Exception as e:
        # logger.exception records the traceback together with the message.
        logger.exception(f"Ошибка при автосегментации: {e}")
        raise HTTPException(status_code=500, detail=f"Ошибка обработки: {str(e)}")
+
def _strip_data_url_prefix(payload):
    """Return the base64 part of ``payload``, dropping a ``data:...;base64,`` header if present."""
    return payload.split(',', 1)[1] if ',' in payload else payload


def _sample_mask_points(pixel_mask, scale_x, scale_y, max_points=20):
    """Return up to ``max_points`` evenly strided ``[x, y]`` points from the True pixels of ``pixel_mask``, scaled to image coordinates."""
    coords = np.argwhere(pixel_mask)  # rows of (y, x)
    if len(coords) == 0:
        return []
    step = max(1, len(coords) // max_points)
    return [[x * scale_x, y * scale_y] for y, x in coords[::step][:max_points]]


def _batch_prompt_inputs(prompt, image):
    """Translate one batch prompt into ``(points, labels, box)`` for ``predictor.predict``; an element is None when the prompt does not supply it."""
    img_h, img_w = image.shape[:2]
    points = labels = box = None

    if prompt.type == "mask":
        # Decode the painted mask and turn its coloured pixels into point prompts.
        mask_bytes = base64.b64decode(_strip_data_url_prefix(prompt.data))
        mask_array = np.array(Image.open(io.BytesIO(mask_bytes)).convert('RGBA'))
        red_ch, green_ch, blue_ch, alpha_ch = (mask_array[:, :, c] for c in range(4))
        painted = alpha_ch > 0

        # Supported colours:
        #   green (R<100, G>150, B<100)   - foreground
        #   white (R>200, G>200, B>200)   - foreground
        #   red   (R>150, G<100, B<100)   - background
        green_mask = (red_ch < 100) & (green_ch > 150) & (blue_ch < 100) & painted
        white_mask = (red_ch > 200) & (green_ch > 200) & (blue_ch > 200) & painted
        red_mask = (red_ch > 150) & (green_ch < 100) & (blue_ch < 100) & painted

        # The mask may have a different resolution than the image.
        scale_y = img_h / mask_array.shape[0]
        scale_x = img_w / mask_array.shape[1]

        foreground = _sample_mask_points(green_mask | white_mask, scale_x, scale_y)
        background = _sample_mask_points(red_mask, scale_x, scale_y)
        if foreground or background:
            points = np.array(foreground + background)
            labels = np.array([1] * len(foreground) + [0] * len(background))

    elif prompt.type == "box":
        bbox_data = prompt.bbox
        if bbox_data:
            x1, y1 = bbox_data.x_min, bbox_data.y_min
            x2, y2 = bbox_data.x_max, bbox_data.y_max

            # Coordinates whose maxima are <= 1.0 are treated as normalised (0-1).
            if x2 <= 1.0 and y2 <= 1.0:
                x1 *= img_w
                x2 *= img_w
                y1 *= img_h
                y2 *= img_h

            box = np.array([x1, y1, x2, y2])

    elif prompt.type == "points":
        # Expected payload: JSON in the form [[x, y, label], ...]; label defaults to 1.
        import json

        points_list = []
        labels_list = []
        for point in json.loads(prompt.data):
            x, y = point[0], point[1]
            label = point[2] if len(point) > 2 else 1

            # A point with both coordinates <= 1.0 is treated as normalised.
            if x <= 1.0 and y <= 1.0:
                x *= img_w
                y *= img_h

            points_list.append([x, y])
            labels_list.append(label)

        # An empty list is reported as "no data" instead of being sent to the model.
        if points_list:
            points = np.array(points_list)
            labels = np.array(labels_list)

    return points, labels, box


@app.post("/segment/batch", response_model=BatchSegmentResponse)
async def segment_batch(request: BatchSegmentRequest = Body(...)):
    """
    Batch segmentation of several objects on one image.

    Takes a base64-encoded image and a list of prompts (mask / box / points).
    Each prompt flagged as ``selected`` is processed independently; a failure in
    one prompt is logged and does not abort the others.

    Args:
        request: Body with ``image`` (base64, optionally a data URL), ``prompts``
            and ``options`` (``clean_masks``, ``extract_objects``, ``include_masks``).

    Returns:
        A dict with ``success``, ``image_size`` and ``results`` (one entry per prompt
        that produced a non-empty mask), passed through ``convert_to_native_types``.

    Raises:
        HTTPException: 503 when the model is not loaded, 400 when ``image`` is not
            valid base64, 500 on any other processing error.
    """
    if predictor is None:
        raise HTTPException(status_code=503, detail="Модель не загружена, перезапусти сервер")

    try:
        # Decode the image; malformed base64 is a client error, not a server error.
        try:
            image_bytes = base64.b64decode(_strip_data_url_prefix(request.image))
        except ValueError as e:  # binascii.Error is a ValueError subclass
            raise HTTPException(status_code=400, detail=f"Некорректный base64 в поле image: {e}")

        image = process_image(image_bytes)

        logger.info(f"Батчинг сегментация: {image.shape}, промптов: {len(request.prompts)}")

        # The image is set on the predictor once and reused for every prompt.
        predictor.set_image(image)

        results = []

        # Only prompts flagged as selected are processed.
        selected_prompts = [p for p in request.prompts if p.selected]
        logger.info(f"Обрабатываем {len(selected_prompts)} из {len(request.prompts)} промптов")

        for prompt in selected_prompts:
            logger.info(f"Обрабатываю промпт #{prompt.id}, тип: {prompt.type}, label: {prompt.label}")

            try:
                points, labels, box = _batch_prompt_inputs(prompt, image)

                if points is None and box is None:
                    logger.warning(f"✗ Промпт #{prompt.id}: нет данных для сегментации")
                    continue

                # With more than 10 points a single mask is requested instead of candidates.
                use_multimask = not (points is not None and len(points) > 10)

                masks, scores, _ = predictor.predict(
                    point_coords=points,
                    point_labels=labels,
                    box=box,
                    multimask_output=use_multimask,
                )

                # Among several candidates keep the best-scoring one.
                if use_multimask and len(masks) > 1:
                    best_idx = np.argmax(scores)
                    masks = masks[best_idx:best_idx+1]
                    scores = scores[best_idx:best_idx+1]

                mask = masks[0]
                score = float(scores[0])

                if request.options.clean_masks:
                    mask = clean_mask(mask, min_area=100)

                y_coords, x_coords = np.where(mask > 0)

                if len(x_coords) == 0:
                    logger.warning(f"✗ Промпт #{prompt.id} не дал результата")
                    continue

                x_min, x_max = int(x_coords.min()), int(x_coords.max())
                y_min, y_max = int(y_coords.min()), int(y_coords.max())

                result = {
                    "id": prompt.id,
                    "label": prompt.label,
                    "bbox": {
                        "x_min": x_min,
                        "y_min": y_min,
                        "x_max": x_max,
                        "y_max": y_max,
                        "width": x_max - x_min,
                        "height": y_max - y_min
                    },
                    # Count of positive pixels: independent of whether the mask stores
                    # True/1 or another non-zero value such as 255.
                    "area": int(len(x_coords)),
                    "center": {
                        "x": int(x_coords.mean()),
                        "y": int(y_coords.mean())
                    },
                    "confidence": score
                }

                # Optionally attach the cut-out object.
                if request.options.extract_objects:
                    result["extracted_image"] = extract_object_image(
                        image, mask, clean=request.options.clean_masks
                    )

                # Optionally attach contours. They are derived from the same (possibly
                # cleaned) mask as bbox/area so all fields describe the same region.
                # NOTE(review): assumes clean_mask returns an array masks_to_coords accepts - confirm.
                if request.options.include_masks:
                    segments = masks_to_coords(np.asarray(mask)[np.newaxis, ...], include_contours=True)
                    if segments:
                        result["contours"] = segments[0].get("contours", [])
                        result["mask_rle"] = segments[0].get("mask_rle", {})

                results.append(result)
                logger.info(f"✓ Промпт #{prompt.id} обработан, confidence: {score:.3f}")

            except Exception as e:
                # Best effort: log and carry on with the remaining prompts.
                logger.error(f"✗ Ошибка обработки промпта #{prompt.id}: {e}")
                continue

        response = {
            "success": True,
            "image_size": {
                "width": int(image.shape[1]),
                "height": int(image.shape[0])
            },
            "results": results
        }

        logger.info(f"Батчинг завершен: обработано {len(results)} объектов")

        # Save an audit record of the request; a logging failure must not fail the request.
        # NOTE(review): the dumped request still contains the base64 image and prompt data;
        # confirm save_batch_request_log strips them before writing.
        try:
            # model_dump() is the pydantic v2 API; fall back to dict() on pydantic v1.
            dump_request = getattr(request, "model_dump", None) or request.dict
            save_batch_request_log(dump_request(), response, image.shape[1], image.shape[0])
        except Exception as e:
            logger.warning(f"Не удалось сохранить лог запроса: {e}")

        return convert_to_native_types(response)

    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) instead of re-wrapping them as 500.
        raise
    except Exception as e:
        logger.exception(f"Ошибка при батчинг сегментации: {e}")
        raise HTTPException(status_code=500, detail=f"Ошибка обработки: {str(e)}")
+
if __name__ == "__main__":
    import os

    import uvicorn

    # The listening port is taken from the PORT environment variable (set on
    # HF Spaces) and falls back to 8000 when it is not defined.
    listen_port = int(os.getenv("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
diff --git a/download_model.py b/download_model.py
new file mode 100755
index 0000000000000000000000000000000000000000..82fce62e0a161b7a70ac6989840c43b47484d75a
--- /dev/null
+++ b/download_model.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Скрипт для скачивания модели SAM2.
+Блин, Facebook не может нормально в pip packaging, поэтому качаем руками.
+"""
+
+import os
+import urllib.request
+import sys
+
# Directory for checkpoints; created at import time so downloads have a target.
CHECKPOINT_DIR = "checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Available models: download URL, local file name and approximate size.
MODELS = {
    "tiny": {
        "url": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_tiny.pt",
        "filename": "sam2.1_hiera_tiny.pt",
        "size": "~39MB"
    },
    "small": {
        "url": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_small.pt",
        "filename": "sam2.1_hiera_small.pt",
        "size": "~46MB"
    },
    "base_plus": {
        "url": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_base_plus.pt",
        "filename": "sam2.1_hiera_base_plus.pt",
        "size": "~81MB"
    },
    "large": {
        "url": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt",
        "filename": "sam2.1_hiera_large.pt",
        "size": "~224MB"
    }
}


def download_model(model_name="tiny"):
    """Download a SAM 2.1 checkpoint into ``CHECKPOINT_DIR`` and print progress.

    The file is first written to ``<filename>.part`` and renamed to its final name
    only after the transfer has finished. An interrupted download therefore never
    leaves a truncated file that a later run would mistake for a complete checkpoint.

    Args:
        model_name: Key of ``MODELS`` ("tiny", "small", "base_plus" or "large").

    Returns:
        Path of the checkpoint file (already present or freshly downloaded).

    Raises:
        SystemExit: with code 1 when ``model_name`` is unknown or the download fails.
    """
    if model_name not in MODELS:
        print(f"Неизвестная модель: {model_name}")
        print(f"Доступные: {', '.join(MODELS.keys())}")
        sys.exit(1)

    model_info = MODELS[model_name]
    filepath = os.path.join(CHECKPOINT_DIR, model_info["filename"])

    if os.path.exists(filepath):
        print(f"Модель уже скачана: {filepath}")
        return filepath

    print(f"Качаю {model_name} модель ({model_info['size']})...")
    print(f"URL: {model_info['url']}")

    def progress_hook(block_num, block_size, total_size):
        # urlretrieve reports blocks transferred so far; cap at 100% because the
        # last block can overshoot the total size.
        downloaded = block_num * block_size
        if total_size > 0:
            percent = min(100, downloaded * 100 / total_size)
            sys.stdout.write(f"\rПрогресс: {percent:.1f}%")
            sys.stdout.flush()

    partial_path = filepath + ".part"
    try:
        urllib.request.urlretrieve(
            model_info["url"],
            partial_path,
            reporthook=progress_hook
        )
        # Atomic rename: the final name only ever refers to a complete file.
        os.replace(partial_path, filepath)
        print(f"\n✓ Модель скачана: {filepath}")
        return filepath
    except Exception as e:
        print(f"\n✗ Ошибка при скачивании: {e}")
        sys.exit(1)
    finally:
        # Also runs on KeyboardInterrupt/SystemExit, which `except Exception` does not catch.
        if os.path.exists(partial_path):
            os.remove(partial_path)


if __name__ == "__main__":
    model_name = sys.argv[1] if len(sys.argv) > 1 else "tiny"
    download_model(model_name)
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..42b894ca10faaa31c21a48a93d7653f94dba16a2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+fastapi==0.115.0
+uvicorn[standard]==0.32.0
+python-multipart==0.0.12
+Pillow==11.0.0
+numpy==2.1.0
+torch==2.6.0
+torchvision==0.21.0
+opencv-python==4.10.0.84
+pydantic==2.9.0
+
diff --git a/sam2_repo/README.md b/sam2_repo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..85a7eb958bced5495ff990c2bcbe7d99662c660f
--- /dev/null
+++ b/sam2_repo/README.md
@@ -0,0 +1,224 @@
+# SAM 2: Segment Anything in Images and Videos
+
+**[AI at Meta, FAIR](https://ai.meta.com/research/)**
+
+[Nikhila Ravi](https://nikhilaravi.com/), [Valentin Gabeur](https://gabeur.github.io/), [Yuan-Ting Hu](https://scholar.google.com/citations?user=E8DVVYQAAAAJ&hl=en), [Ronghang Hu](https://ronghanghu.com/), [Chaitanya Ryali](https://scholar.google.com/citations?user=4LWx24UAAAAJ&hl=en), [Tengyu Ma](https://scholar.google.com/citations?user=VeTSl0wAAAAJ&hl=en), [Haitham Khedr](https://hkhedr.com/), [Roman Rädle](https://scholar.google.de/citations?user=Tpt57v0AAAAJ&hl=en), [Chloe Rolland](https://scholar.google.com/citations?hl=fr&user=n-SnMhoAAAAJ), [Laura Gustafson](https://scholar.google.com/citations?user=c8IpF9gAAAAJ&hl=en), [Eric Mintun](https://ericmintun.github.io/), [Junting Pan](https://junting.github.io/), [Kalyan Vasudev Alwala](https://scholar.google.co.in/citations?user=m34oaWEAAAAJ&hl=en), [Nicolas Carion](https://www.nicolascarion.com/), [Chao-Yuan Wu](https://chaoyuan.org/), [Ross Girshick](https://www.rossgirshick.info/), [Piotr Dollár](https://pdollar.github.io/), [Christoph Feichtenhofer](https://feichtenhofer.github.io/)
+
+[[`Paper`](https://ai.meta.com/research/publications/sam-2-segment-anything-in-images-and-videos/)] [[`Project`](https://ai.meta.com/sam2)] [[`Demo`](https://sam2.metademolab.com/)] [[`Dataset`](https://ai.meta.com/datasets/segment-anything-video)] [[`Blog`](https://ai.meta.com/blog/segment-anything-2)] [[`BibTeX`](#citing-sam-2)]
+
+
+
+**Segment Anything Model 2 (SAM 2)** is a foundation model towards solving promptable visual segmentation in images and videos. We extend SAM to video by considering images as a video with a single frame. The model design is a simple transformer architecture with streaming memory for real-time video processing. We build a model-in-the-loop data engine, which improves model and data via user interaction, to collect [**our SA-V dataset**](https://ai.meta.com/datasets/segment-anything-video), the largest video segmentation dataset to date. SAM 2 trained on our data provides strong performance across a wide range of tasks and visual domains.
+
+
+
+## Latest updates
+
+**12/11/2024 -- full model compilation for a major VOS speedup and a new `SAM2VideoPredictor` to better handle multi-object tracking**
+
+- We now support `torch.compile` of the entire SAM 2 model on videos, which can be turned on by setting `vos_optimized=True` in `build_sam2_video_predictor`, leading to a major speedup for VOS inference.
+- We update the implementation of `SAM2VideoPredictor` to support independent per-object inference, allowing us to relax the assumption of prompting for multi-object tracking and adding new objects after tracking starts.
+- See [`RELEASE_NOTES.md`](RELEASE_NOTES.md) for full details.
+
+**09/30/2024 -- SAM 2.1 Developer Suite (new checkpoints, training code, web demo) is released**
+
+- A new suite of improved model checkpoints (denoted as **SAM 2.1**) are released. See [Model Description](#model-description) for details.
+ * To use the new SAM 2.1 checkpoints, you need the latest model code from this repo. If you have installed an earlier version of this repo, please first uninstall the previous version via `pip uninstall SAM-2`, pull the latest code from this repo (with `git pull`), and then reinstall the repo following [Installation](#installation) below.
+- The training (and fine-tuning) code has been released. See [`training/README.md`](training/README.md) on how to get started.
+- The frontend + backend code for the SAM 2 web demo has been released. See [`demo/README.md`](demo/README.md) for details.
+
+## Installation
+
+SAM 2 needs to be installed first before use. The code requires `python>=3.10`, as well as `torch>=2.5.1` and `torchvision>=0.20.1`. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. You can install SAM 2 on a GPU machine using:
+
+```bash
+git clone https://github.com/facebookresearch/sam2.git && cd sam2
+
+pip install -e .
+```
+If you are installing on Windows, it's strongly recommended to use [Windows Subsystem for Linux (WSL)](https://learn.microsoft.com/en-us/windows/wsl/install) with Ubuntu.
+
+To use the SAM 2 predictor and run the example notebooks, `jupyter` and `matplotlib` are required and can be installed by:
+
+```bash
+pip install -e ".[notebooks]"
+```
+
+Note:
+1. It's recommended to create a new Python environment via [Anaconda](https://www.anaconda.com/) for this installation and install PyTorch 2.5.1 (or higher) via `pip` following https://pytorch.org/. If you have a PyTorch version lower than 2.5.1 in your current environment, the installation command above will try to upgrade it to the latest PyTorch version using `pip`.
+2. The step above requires compiling a custom CUDA kernel with the `nvcc` compiler. If it isn't already available on your machine, please install the [CUDA toolkits](https://developer.nvidia.com/cuda-toolkit-archive) with a version that matches your PyTorch CUDA version.
+3. If you see a message like `Failed to build the SAM 2 CUDA extension` during installation, you can ignore it and still use SAM 2 (some post-processing functionality may be limited, but it doesn't affect the results in most cases).
+
+Please see [`INSTALL.md`](./INSTALL.md) for FAQs on potential issues and solutions.
+
+## Getting Started
+
+### Download Checkpoints
+
+First, we need to download a model checkpoint. All the model checkpoints can be downloaded by running:
+
+```bash
+cd checkpoints && \
+./download_ckpts.sh && \
+cd ..
+```
+
+or individually from:
+
+- [sam2.1_hiera_tiny.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_tiny.pt)
+- [sam2.1_hiera_small.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_small.pt)
+- [sam2.1_hiera_base_plus.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_base_plus.pt)
+- [sam2.1_hiera_large.pt](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt)
+
+(note that these are the improved checkpoints denoted as SAM 2.1; see [Model Description](#model-description) for details.)
+
+Then SAM 2 can be used in a few lines as follows for image and video prediction.
+
+### Image prediction
+
+SAM 2 has all the capabilities of [SAM](https://github.com/facebookresearch/segment-anything) on static images, and we provide image prediction APIs that closely resemble SAM for image use cases. The `SAM2ImagePredictor` class has an easy interface for image prompting.
+
+```python
+import torch
+from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+checkpoint = "./checkpoints/sam2.1_hiera_large.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ predictor.set_image(your_image)
+ masks, _, _ = predictor.predict(point_coords=your_points, point_labels=your_labels)
+```
+
+Please refer to the examples in [image_predictor_example.ipynb](./notebooks/image_predictor_example.ipynb) (also in Colab [here](https://colab.research.google.com/github/facebookresearch/sam2/blob/main/notebooks/image_predictor_example.ipynb)) for static image use cases.
+
+SAM 2 also supports automatic mask generation on images just like SAM. Please see [automatic_mask_generator_example.ipynb](./notebooks/automatic_mask_generator_example.ipynb) (also in Colab [here](https://colab.research.google.com/github/facebookresearch/sam2/blob/main/notebooks/automatic_mask_generator_example.ipynb)) for automatic mask generation in images.
+
+### Video prediction
+
+For promptable segmentation and tracking in videos, we provide a video predictor with APIs for example to add prompts and propagate masklets throughout a video. SAM 2 supports video inference on multiple objects and uses an inference state to keep track of the interactions in each video.
+
+```python
+import torch
+from sam2.build_sam import build_sam2_video_predictor
+
+checkpoint = "./checkpoints/sam2.1_hiera_large.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+predictor = build_sam2_video_predictor(model_cfg, checkpoint)
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ state = predictor.init_state(your_video)
+
+ # add new prompts and instantly get the output on the same frame
+ frame_idx, object_ids, masks = predictor.add_new_points_or_box(state, your_prompts)
+
+ # propagate the prompts to get masklets throughout the video
+ for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
+ ...
+```
+
+Please refer to the examples in [video_predictor_example.ipynb](./notebooks/video_predictor_example.ipynb) (also in Colab [here](https://colab.research.google.com/github/facebookresearch/sam2/blob/main/notebooks/video_predictor_example.ipynb)) for details on how to add click or box prompts, make refinements, and track multiple objects in videos.
+
+## Load from 🤗 Hugging Face
+
+Alternatively, models can also be loaded from [Hugging Face](https://huggingface.co/models?search=facebook/sam2) (requires `pip install huggingface_hub`).
+
+For image prediction:
+
+```python
+import torch
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ predictor.set_image(your_image)
+ masks, _, _ = predictor.predict(point_coords=your_points, point_labels=your_labels)
+```
+
+For video prediction:
+
+```python
+import torch
+from sam2.sam2_video_predictor import SAM2VideoPredictor
+
+predictor = SAM2VideoPredictor.from_pretrained("facebook/sam2-hiera-large")
+
+with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ state = predictor.init_state(your_video)
+
+ # add new prompts and instantly get the output on the same frame
+ frame_idx, object_ids, masks = predictor.add_new_points_or_box(state, your_prompts)
+
+ # propagate the prompts to get masklets throughout the video
+ for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
+ ...
+```
+
+## Model Description
+
+### SAM 2.1 checkpoints
+
+The table below shows the improved SAM 2.1 checkpoints released on September 29, 2024.
+| **Model** | **Size (M)** | **Speed (FPS)** | **SA-V test (J&F)** | **MOSE val (J&F)** | **LVOS v2 (J&F)** |
+| :------------------: | :----------: | :--------------------: | :-----------------: | :----------------: | :---------------: |
+| sam2.1_hiera_tiny ([config](sam2/configs/sam2.1/sam2.1_hiera_t.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_tiny.pt)) | 38.9 | 91.2 | 76.5 | 71.8 | 77.3 |
+| sam2.1_hiera_small ([config](sam2/configs/sam2.1/sam2.1_hiera_s.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_small.pt)) | 46 | 84.8 | 76.6 | 73.5 | 78.3 |
+| sam2.1_hiera_base_plus ([config](sam2/configs/sam2.1/sam2.1_hiera_b+.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_base_plus.pt)) | 80.8 | 64.1 | 78.2 | 73.7 | 78.2 |
+| sam2.1_hiera_large ([config](sam2/configs/sam2.1/sam2.1_hiera_l.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt)) | 224.4 | 39.5 | 79.5 | 74.6 | 80.6 |
+
+### SAM 2 checkpoints
+
+The previous SAM 2 checkpoints released on July 29, 2024 can be found as follows:
+
+| **Model** | **Size (M)** | **Speed (FPS)** | **SA-V test (J&F)** | **MOSE val (J&F)** | **LVOS v2 (J&F)** |
+| :------------------: | :----------: | :--------------------: | :-----------------: | :----------------: | :---------------: |
+| sam2_hiera_tiny ([config](sam2/configs/sam2/sam2_hiera_t.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)) | 38.9 | 91.5 | 75.0 | 70.9 | 75.3 |
+| sam2_hiera_small ([config](sam2/configs/sam2/sam2_hiera_s.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt)) | 46 | 85.6 | 74.9 | 71.5 | 76.4 |
+| sam2_hiera_base_plus ([config](sam2/configs/sam2/sam2_hiera_b+.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt)) | 80.8 | 64.8 | 74.7 | 72.8 | 75.8 |
+| sam2_hiera_large ([config](sam2/configs/sam2/sam2_hiera_l.yaml), [checkpoint](https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt)) | 224.4 | 39.7 | 76.0 | 74.6 | 79.8 |
+
+Speed measured on an A100 with `torch 2.5.1, cuda 12.4`. See `benchmark.py` for an example on benchmarking (compiling all the model components). Compiling only the image encoder can be more flexible and also provide (a smaller) speed-up (set `compile_image_encoder: True` in the config).
+## Segment Anything Video Dataset
+
+See [sav_dataset/README.md](sav_dataset/README.md) for details.
+
+## Training SAM 2
+
+You can train or fine-tune SAM 2 on custom datasets of images, videos, or both. Please check the training [README](training/README.md) on how to get started.
+
+## Web demo for SAM 2
+
+We have released the frontend + backend code for the SAM 2 web demo (a locally deployable version similar to https://sam2.metademolab.com/demo). Please see the web demo [README](demo/README.md) for details.
+
+## License
+
+The SAM 2 model checkpoints, SAM 2 demo code (front-end and back-end), and SAM 2 training code are licensed under [Apache 2.0](./LICENSE), however the [Inter Font](https://github.com/rsms/inter?tab=OFL-1.1-1-ov-file) and [Noto Color Emoji](https://github.com/googlefonts/noto-emoji) used in the SAM 2 demo code are made available under the [SIL Open Font License, version 1.1](https://openfontlicense.org/open-font-license-official-text/).
+
+## Contributing
+
+See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
+
+## Contributors
+
+The SAM 2 project was made possible with the help of many contributors (alphabetical):
+
+Karen Bergan, Daniel Bolya, Alex Bosenberg, Kai Brown, Vispi Cassod, Christopher Chedeau, Ida Cheng, Luc Dahlin, Shoubhik Debnath, Rene Martinez Doehner, Grant Gardner, Sahir Gomez, Rishi Godugu, Baishan Guo, Caleb Ho, Andrew Huang, Somya Jain, Bob Kamma, Amanda Kallet, Jake Kinney, Alexander Kirillov, Shiva Koduvayur, Devansh Kukreja, Robert Kuo, Aohan Lin, Parth Malani, Jitendra Malik, Mallika Malhotra, Miguel Martin, Alexander Miller, Sasha Mitts, William Ngan, George Orlin, Joelle Pineau, Kate Saenko, Rodrick Shepard, Azita Shokrpour, David Soofian, Jonathan Torres, Jenny Truong, Sagar Vaze, Meng Wang, Claudette Ward, Pengchuan Zhang.
+
+Third-party code: we use a GPU-based connected component algorithm adapted from [`cc_torch`](https://github.com/zsef123/Connected_components_PyTorch) (with its license in [`LICENSE_cctorch`](./LICENSE_cctorch)) as an optional post-processing step for the mask predictions.
+
+## Citing SAM 2
+
+If you use SAM 2 or the SA-V dataset in your research, please use the following BibTeX entry.
+
+```bibtex
+@article{ravi2024sam2,
+ title={SAM 2: Segment Anything in Images and Videos},
+ author={Ravi, Nikhila and Gabeur, Valentin and Hu, Yuan-Ting and Hu, Ronghang and Ryali, Chaitanya and Ma, Tengyu and Khedr, Haitham and R{\"a}dle, Roman and Rolland, Chloe and Gustafson, Laura and Mintun, Eric and Pan, Junting and Alwala, Kalyan Vasudev and Carion, Nicolas and Wu, Chao-Yuan and Girshick, Ross and Doll{\'a}r, Piotr and Feichtenhofer, Christoph},
+ journal={arXiv preprint arXiv:2408.00714},
+ url={https://arxiv.org/abs/2408.00714},
+ year={2024}
+}
+```
diff --git a/sam2_repo/checkpoints/download_ckpts.sh b/sam2_repo/checkpoints/download_ckpts.sh
new file mode 100755
index 0000000000000000000000000000000000000000..eedee8eee153f17c6db3b92de5492fa0a11ec3b7
--- /dev/null
+++ b/sam2_repo/checkpoints/download_ckpts.sh
@@ -0,0 +1,59 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Downloads the four SAM 2.1 checkpoints into the current directory and stops
# at the first download that fails.

# Use either wget or curl to download the checkpoints.
# CMD is an array so that the tool and its flags stay separate words when expanded.
if command -v wget &> /dev/null; then
    CMD=(wget)
elif command -v curl &> /dev/null; then
    # -f makes curl exit non-zero on an HTTP error; without it an error page would
    # be saved under the checkpoint name and the failure branch below would never run.
    CMD=(curl -f -L -O)
else
    echo "Please install wget or curl to download the checkpoints."
    exit 1
fi

# The previous SAM 2 checkpoints are not downloaded by this script. They live under
# SAM2_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/072824" as
#   sam2_hiera_tiny.pt, sam2_hiera_small.pt, sam2_hiera_base_plus.pt, sam2_hiera_large.pt

# Base URL and file names of the SAM 2.1 checkpoints
SAM2p1_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824"
sam2p1_checkpoints=(
    "sam2.1_hiera_tiny.pt"
    "sam2.1_hiera_small.pt"
    "sam2.1_hiera_base_plus.pt"
    "sam2.1_hiera_large.pt"
)

# SAM 2.1 checkpoints
for ckpt in "${sam2p1_checkpoints[@]}"; do
    ckpt_url="${SAM2p1_BASE_URL}/${ckpt}"
    echo "Downloading ${ckpt} checkpoint..."
    "${CMD[@]}" "$ckpt_url" || { echo "Failed to download checkpoint from $ckpt_url"; exit 1; }
done

echo "All checkpoints are downloaded successfully."
diff --git a/sam2_repo/pyproject.toml b/sam2_repo/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..f84317dbbfa6ba4f2d972cab2e2e0d0bdf07f003
--- /dev/null
+++ b/sam2_repo/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+ "setuptools>=61.0",
+ "torch>=2.5.1",
+ ]
+build-backend = "setuptools.build_meta"
diff --git a/sam2_repo/sam2/__init__.py b/sam2_repo/sam2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0712dd03cb280ab94ba04f8a32aa8ddc8aa3db4a
--- /dev/null
+++ b/sam2_repo/sam2/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from hydra import initialize_config_module
+from hydra.core.global_hydra import GlobalHydra
+
+# Initialize Hydra with the `sam2` package as its config module, unless Hydra's
+# global state has already been initialized.
+if not GlobalHydra.instance().is_initialized():
+    initialize_config_module("sam2", version_base="1.2")
diff --git a/sam2_repo/sam2/__pycache__/__init__.cpython-313.pyc b/sam2_repo/sam2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d8d6bb54369cec12ba2462a5690e2a264ed4e43
Binary files /dev/null and b/sam2_repo/sam2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/__pycache__/build_sam.cpython-313.pyc b/sam2_repo/sam2/__pycache__/build_sam.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4512a51dfbb0f12b28190eee4a73cfd5a7bc1e35
Binary files /dev/null and b/sam2_repo/sam2/__pycache__/build_sam.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/__pycache__/sam2_image_predictor.cpython-313.pyc b/sam2_repo/sam2/__pycache__/sam2_image_predictor.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae3e4b4483a1a1bcb63c9d64d8e088b38a081f8a
Binary files /dev/null and b/sam2_repo/sam2/__pycache__/sam2_image_predictor.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/automatic_mask_generator.py b/sam2_repo/sam2/automatic_mask_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..065e469e27c2d3af40d51d072031e828692c799b
--- /dev/null
+++ b/sam2_repo/sam2/automatic_mask_generator.py
@@ -0,0 +1,454 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Adapted from https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area # type: ignore
+
+from sam2.modeling.sam2_base import SAM2Base
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+from sam2.utils.amg import (
+ area_from_rle,
+ batch_iterator,
+ batched_mask_to_box,
+ box_xyxy_to_xywh,
+ build_all_layer_point_grids,
+ calculate_stability_score,
+ coco_encode_rle,
+ generate_crop_boxes,
+ is_box_near_crop_edge,
+ mask_to_rle_pytorch,
+ MaskData,
+ remove_small_regions,
+ rle_to_mask,
+ uncrop_boxes_xyxy,
+ uncrop_masks,
+ uncrop_points,
+)
+
+
+class SAM2AutomaticMaskGenerator:
+ def __init__(
+ self,
+ model: SAM2Base,
+ points_per_side: Optional[int] = 32,
+ points_per_batch: int = 64,
+ pred_iou_thresh: float = 0.8,
+ stability_score_thresh: float = 0.95,
+ stability_score_offset: float = 1.0,
+ mask_threshold: float = 0.0,
+ box_nms_thresh: float = 0.7,
+ crop_n_layers: int = 0,
+ crop_nms_thresh: float = 0.7,
+ crop_overlap_ratio: float = 512 / 1500,
+ crop_n_points_downscale_factor: int = 1,
+ point_grids: Optional[List[np.ndarray]] = None,
+ min_mask_region_area: int = 0,
+ output_mode: str = "binary_mask",
+ use_m2m: bool = False,
+ multimask_output: bool = True,
+ **kwargs,
+ ) -> None:
+ """
+ Using a SAM 2 model, generates masks for the entire image.
+ Generates a grid of point prompts over the image, then filters
+ low quality and duplicate masks. The default settings are chosen
+ for SAM 2 with a HieraL backbone.
+
+ Arguments:
+ model (Sam): The SAM 2 model to use for mask prediction.
+ points_per_side (int or None): The number of points to be sampled
+ along one side of the image. The total number of points is
+ points_per_side**2. If None, 'point_grids' must provide explicit
+ point sampling.
+ points_per_batch (int): Sets the number of points run simultaneously
+ by the model. Higher numbers may be faster but use more GPU memory.
+ pred_iou_thresh (float): A filtering threshold in [0,1], using the
+ model's predicted mask quality.
+ stability_score_thresh (float): A filtering threshold in [0,1], using
+ the stability of the mask under changes to the cutoff used to binarize
+ the model's mask predictions.
+ stability_score_offset (float): The amount to shift the cutoff when
+ calculated the stability score.
+ mask_threshold (float): Threshold for binarizing the mask logits
+ box_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks.
+ crop_n_layers (int): If >0, mask prediction will be run again on
+ crops of the image. Sets the number of layers to run, where each
+ layer has 2**i_layer number of image crops.
+ crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+ suppression to filter duplicate masks between different crops.
+ crop_overlap_ratio (float): Sets the degree to which crops overlap.
+ In the first crop layer, crops will overlap by this fraction of
+ the image length. Later layers with more crops scale down this overlap.
+ crop_n_points_downscale_factor (int): The number of points-per-side
+ sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ point_grids (list(np.ndarray) or None): A list over explicit grids
+ of points used for sampling, normalized to [0,1]. The nth grid in the
+ list is used in the nth crop layer. Exclusive with points_per_side.
+ min_mask_region_area (int): If >0, postprocessing will be applied
+ to remove disconnected regions and holes in masks with area smaller
+ than min_mask_region_area. Requires opencv.
+ output_mode (str): The form masks are returned in. Can be 'binary_mask',
+ 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+ For large resolutions, 'binary_mask' may consume large amounts of
+ memory.
+ use_m2m (bool): Whether to add a one step refinement using previous mask predictions.
+ multimask_output (bool): Whether to output multimask at each point of the grid.
+ """
+
+ assert (points_per_side is None) != (
+ point_grids is None
+ ), "Exactly one of points_per_side or point_grid must be provided."
+ if points_per_side is not None:
+ self.point_grids = build_all_layer_point_grids(
+ points_per_side,
+ crop_n_layers,
+ crop_n_points_downscale_factor,
+ )
+ elif point_grids is not None:
+ self.point_grids = point_grids
+ else:
+ raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+ assert output_mode in [
+ "binary_mask",
+ "uncompressed_rle",
+ "coco_rle",
+ ], f"Unknown output_mode {output_mode}."
+ if output_mode == "coco_rle":
+ try:
+ from pycocotools import mask as mask_utils # type: ignore # noqa: F401
+ except ImportError as e:
+ print("Please install pycocotools")
+ raise e
+
+ self.predictor = SAM2ImagePredictor(
+ model,
+ max_hole_area=min_mask_region_area,
+ max_sprinkle_area=min_mask_region_area,
+ )
+ self.points_per_batch = points_per_batch
+ self.pred_iou_thresh = pred_iou_thresh
+ self.stability_score_thresh = stability_score_thresh
+ self.stability_score_offset = stability_score_offset
+ self.mask_threshold = mask_threshold
+ self.box_nms_thresh = box_nms_thresh
+ self.crop_n_layers = crop_n_layers
+ self.crop_nms_thresh = crop_nms_thresh
+ self.crop_overlap_ratio = crop_overlap_ratio
+ self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+ self.min_mask_region_area = min_mask_region_area
+ self.output_mode = output_mode
+ self.use_m2m = use_m2m
+ self.multimask_output = multimask_output
+
+ @classmethod
+ def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2AutomaticMaskGenerator":
+ """
+ Load a pretrained model from the Hugging Face hub.
+
+ Arguments:
+ model_id (str): The Hugging Face repository ID.
+ **kwargs: Additional arguments to pass to the model constructor.
+
+ Returns:
+ (SAM2AutomaticMaskGenerator): The loaded model.
+ """
+ from sam2.build_sam import build_sam2_hf
+
+ sam_model = build_sam2_hf(model_id, **kwargs)
+ return cls(sam_model, **kwargs)
+
+ @torch.no_grad()
+ def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+ """
+ Generates masks for the given image.
+
+ Arguments:
+ image (np.ndarray): The image to generate masks for, in HWC uint8 format.
+
+ Returns:
+ list(dict(str, any)): A list over records for masks. Each record is
+ a dict containing the following keys:
+ segmentation (dict(str, any) or np.ndarray): The mask. If
+ output_mode='binary_mask', is an array of shape HW. Otherwise,
+ is a dictionary containing the RLE.
+ bbox (list(float)): The box around the mask, in XYWH format.
+ area (int): The area in pixels of the mask.
+ predicted_iou (float): The model's own prediction of the mask's
+ quality. This is filtered by the pred_iou_thresh parameter.
+ point_coords (list(list(float))): The point coordinates input
+ to the model to generate this mask.
+ stability_score (float): A measure of the mask's quality. This
+ is filtered on using the stability_score_thresh parameter.
+ crop_box (list(float)): The crop of the image used to generate
+ the mask, given in XYWH format.
+ """
+
+ # Generate masks
+ mask_data = self._generate_masks(image)
+
+ # Encode masks
+ if self.output_mode == "coco_rle":
+ mask_data["segmentations"] = [
+ coco_encode_rle(rle) for rle in mask_data["rles"]
+ ]
+ elif self.output_mode == "binary_mask":
+ mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+ else:
+ mask_data["segmentations"] = mask_data["rles"]
+
+ # Write mask records
+ curr_anns = []
+ for idx in range(len(mask_data["segmentations"])):
+ ann = {
+ "segmentation": mask_data["segmentations"][idx],
+ "area": area_from_rle(mask_data["rles"][idx]),
+ "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+ "predicted_iou": mask_data["iou_preds"][idx].item(),
+ "point_coords": [mask_data["points"][idx].tolist()],
+ "stability_score": mask_data["stability_score"][idx].item(),
+ "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+ }
+ curr_anns.append(ann)
+
+ return curr_anns
+
+ def _generate_masks(self, image: np.ndarray) -> MaskData:
+ orig_size = image.shape[:2]
+ crop_boxes, layer_idxs = generate_crop_boxes(
+ orig_size, self.crop_n_layers, self.crop_overlap_ratio
+ )
+
+ # Iterate over image crops
+ data = MaskData()
+ for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
+ crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
+ data.cat(crop_data)
+
+ # Remove duplicate masks between crops
+ if len(crop_boxes) > 1:
+ # Prefer masks from smaller crops
+ scores = 1 / box_area(data["crop_boxes"])
+ scores = scores.to(data["boxes"].device)
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ scores,
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.crop_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+ data.to_numpy()
+ return data
+
+ def _process_crop(
+ self,
+ image: np.ndarray,
+ crop_box: List[int],
+ crop_layer_idx: int,
+ orig_size: Tuple[int, ...],
+ ) -> MaskData:
+ # Crop the image and calculate embeddings
+ x0, y0, x1, y1 = crop_box
+ cropped_im = image[y0:y1, x0:x1, :]
+ cropped_im_size = cropped_im.shape[:2]
+ self.predictor.set_image(cropped_im)
+
+ # Get points for this crop
+ points_scale = np.array(cropped_im_size)[None, ::-1]
+ points_for_image = self.point_grids[crop_layer_idx] * points_scale
+
+ # Generate masks for this crop in batches
+ data = MaskData()
+ for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+ batch_data = self._process_batch(
+ points, cropped_im_size, crop_box, orig_size, normalize=True
+ )
+ data.cat(batch_data)
+ del batch_data
+ self.predictor.reset_predictor()
+
+ # Remove duplicates within this crop.
+ keep_by_nms = batched_nms(
+ data["boxes"].float(),
+ data["iou_preds"],
+ torch.zeros_like(data["boxes"][:, 0]), # categories
+ iou_threshold=self.box_nms_thresh,
+ )
+ data.filter(keep_by_nms)
+
+ # Return to the original image frame
+ data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+ data["points"] = uncrop_points(data["points"], crop_box)
+ data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+
+ return data
+
+ def _process_batch(
+ self,
+ points: np.ndarray,
+ im_size: Tuple[int, ...],
+ crop_box: List[int],
+ orig_size: Tuple[int, ...],
+ normalize=False,
+ ) -> MaskData:
+ orig_h, orig_w = orig_size
+
+ # Run model on this batch
+ points = torch.as_tensor(
+ points, dtype=torch.float32, device=self.predictor.device
+ )
+ in_points = self.predictor._transforms.transform_coords(
+ points, normalize=normalize, orig_hw=im_size
+ )
+ in_labels = torch.ones(
+ in_points.shape[0], dtype=torch.int, device=in_points.device
+ )
+ masks, iou_preds, low_res_masks = self.predictor._predict(
+ in_points[:, None, :],
+ in_labels[:, None],
+ multimask_output=self.multimask_output,
+ return_logits=True,
+ )
+
+ # Serialize predictions and store in MaskData
+ data = MaskData(
+ masks=masks.flatten(0, 1),
+ iou_preds=iou_preds.flatten(0, 1),
+ points=points.repeat_interleave(masks.shape[1], dim=0),
+ low_res_masks=low_res_masks.flatten(0, 1),
+ )
+ del masks
+
+ if not self.use_m2m:
+ # Filter by predicted IoU
+ if self.pred_iou_thresh > 0.0:
+ keep_mask = data["iou_preds"] > self.pred_iou_thresh
+ data.filter(keep_mask)
+
+ # Calculate and filter by stability score
+ data["stability_score"] = calculate_stability_score(
+ data["masks"], self.mask_threshold, self.stability_score_offset
+ )
+ if self.stability_score_thresh > 0.0:
+ keep_mask = data["stability_score"] >= self.stability_score_thresh
+ data.filter(keep_mask)
+ else:
+ # One step refinement using previous mask predictions
+ in_points = self.predictor._transforms.transform_coords(
+ data["points"], normalize=normalize, orig_hw=im_size
+ )
+ labels = torch.ones(
+ in_points.shape[0], dtype=torch.int, device=in_points.device
+ )
+ masks, ious = self.refine_with_m2m(
+ in_points, labels, data["low_res_masks"], self.points_per_batch
+ )
+ data["masks"] = masks.squeeze(1)
+ data["iou_preds"] = ious.squeeze(1)
+
+ if self.pred_iou_thresh > 0.0:
+ keep_mask = data["iou_preds"] > self.pred_iou_thresh
+ data.filter(keep_mask)
+
+ data["stability_score"] = calculate_stability_score(
+ data["masks"], self.mask_threshold, self.stability_score_offset
+ )
+ if self.stability_score_thresh > 0.0:
+ keep_mask = data["stability_score"] >= self.stability_score_thresh
+ data.filter(keep_mask)
+
+ # Threshold masks and calculate boxes
+ data["masks"] = data["masks"] > self.mask_threshold
+ data["boxes"] = batched_mask_to_box(data["masks"])
+
+ # Filter boxes that touch crop boundaries
+ keep_mask = ~is_box_near_crop_edge(
+ data["boxes"], crop_box, [0, 0, orig_w, orig_h]
+ )
+ if not torch.all(keep_mask):
+ data.filter(keep_mask)
+
+ # Compress to RLE
+ data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+ data["rles"] = mask_to_rle_pytorch(data["masks"])
+ del data["masks"]
+
+ return data
+
+ @staticmethod
+ def postprocess_small_regions(
+ mask_data: MaskData, min_area: int, nms_thresh: float
+ ) -> MaskData:
+ """
+ Removes small disconnected regions and holes in masks, then reruns
+ box NMS to remove any new duplicates.
+
+ Edits mask_data in place.
+
+ Requires open-cv as a dependency.
+ """
+ if len(mask_data["rles"]) == 0:
+ return mask_data
+
+ # Filter small disconnected regions and holes
+ new_masks = []
+ scores = []
+ for rle in mask_data["rles"]:
+ mask = rle_to_mask(rle)
+
+ mask, changed = remove_small_regions(mask, min_area, mode="holes")
+ unchanged = not changed
+ mask, changed = remove_small_regions(mask, min_area, mode="islands")
+ unchanged = unchanged and not changed
+
+ new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+ # Give score=0 to changed masks and score=1 to unchanged masks
+ # so NMS will prefer ones that didn't need postprocessing
+ scores.append(float(unchanged))
+
+ # Recalculate boxes and remove any new duplicates
+ masks = torch.cat(new_masks, dim=0)
+ boxes = batched_mask_to_box(masks)
+ keep_by_nms = batched_nms(
+ boxes.float(),
+ torch.as_tensor(scores),
+ torch.zeros_like(boxes[:, 0]), # categories
+ iou_threshold=nms_thresh,
+ )
+
+ # Only recalculate RLEs for masks that have changed
+ for i_mask in keep_by_nms:
+ if scores[i_mask] == 0.0:
+ mask_torch = masks[i_mask].unsqueeze(0)
+ mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+ mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
+ mask_data.filter(keep_by_nms)
+
+ return mask_data
+
+ def refine_with_m2m(self, points, point_labels, low_res_masks, points_per_batch):
+ new_masks = []
+ new_iou_preds = []
+
+ for cur_points, cur_point_labels, low_res_mask in batch_iterator(
+ points_per_batch, points, point_labels, low_res_masks
+ ):
+ best_masks, best_iou_preds, _ = self.predictor._predict(
+ cur_points[:, None, :],
+ cur_point_labels[:, None],
+ mask_input=low_res_mask[:, None, :],
+ multimask_output=False,
+ return_logits=True,
+ )
+ new_masks.append(best_masks)
+ new_iou_preds.append(best_iou_preds)
+ masks = torch.cat(new_masks, dim=0)
+ return masks, torch.cat(new_iou_preds, dim=0)
diff --git a/sam2_repo/sam2/benchmark.py b/sam2_repo/sam2/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6519534c8619e04b9a632859a5128ad2cee34c13
--- /dev/null
+++ b/sam2_repo/sam2/benchmark.py
@@ -0,0 +1,92 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import time
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from sam2.build_sam import build_sam2_video_predictor
+
+# Only cuda supported
+assert torch.cuda.is_available()
+device = torch.device("cuda")
+
+# Enter a CUDA bfloat16 autocast context at module level. No matching __exit__
+# is called in this script, so it stays active for everything below.
+torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+if torch.cuda.get_device_properties(0).major >= 8:
+    # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+# Config and checkpoint
+sam2_checkpoint = "checkpoints/sam2.1_hiera_base_plus.pt"
+model_cfg = "configs/sam2.1/sam2.1_hiera_b+.yaml"
+
+# Build video predictor with vos_optimized=True setting
+predictor = build_sam2_video_predictor(
+    model_cfg, sam2_checkpoint, device=device, vos_optimized=True
+)
+
+
+# Initialize with video
+video_dir = "notebooks/videos/bedroom"
+# scan all the JPEG frame names in this directory
+frame_names = [
+    p
+    for p in os.listdir(video_dir)
+    if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+]
+# Sort numerically by file stem; every matching file name must parse as an integer.
+frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+inference_state = predictor.init_state(video_path=video_dir)
+
+
+# Number of runs, warmup etc
+# `runs` is the total loop count; the first `warm_up` iterations are reported
+# separately and excluded from the final FPS figure.
+warm_up, runs = 5, 25
+verbose = True
+# frame_names is only used here, to count frames for the FPS computation.
+num_frames = len(frame_names)
+total, count = 0, 0
+torch.cuda.empty_cache()
+
+# We will select an object with a click.
+# See video_predictor_example.ipynb for more detailed explanation
+ann_frame_idx, ann_obj_id = 0, 1
+# Add a positive click at (x, y) = (210, 350)
+# For labels, `1` means positive click
+points = np.array([[210, 350]], dtype=np.float32)
+labels = np.array([1], np.int32)
+
+_, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
+    inference_state=inference_state,
+    frame_idx=ann_frame_idx,
+    obj_id=ann_obj_id,
+    points=points,
+    labels=labels,
+)
+
+# Warmup and then average FPS over several runs
+with torch.autocast("cuda", torch.bfloat16):
+    with torch.inference_mode():
+        for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"):
+            # Wall-clock time for one full propagation pass over the video.
+            start = time.time()
+            # Start tracking
+            for (
+                out_frame_idx,
+                out_obj_ids,
+                out_mask_logits,
+            ) in predictor.propagate_in_video(inference_state):
+                # Outputs are discarded; only the elapsed time is measured.
+                pass
+
+            end = time.time()
+            total += end - start
+            count += 1
+            if i == warm_up - 1:
+                # Report the warm-up throughput, then reset the accumulators so
+                # the final figure covers only the remaining runs.
+                print("Warmup FPS: ", count * num_frames / total)
+                total = 0
+                count = 0
+
+# Average frames per second over the post-warm-up runs.
+print("FPS: ", count * num_frames / total)
diff --git a/sam2_repo/sam2/build_sam.py b/sam2_repo/sam2/build_sam.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a3bef1e566d86c3ba0fd75f425530bc6505e9bf
--- /dev/null
+++ b/sam2_repo/sam2/build_sam.py
@@ -0,0 +1,174 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import torch
+from hydra import compose
+from hydra.utils import instantiate
+from omegaconf import OmegaConf
+
+import sam2
+
+# Check if the user is running Python from the parent directory of the sam2 repo
+# (i.e. the directory where this repo is cloned into) -- this is not supported since
+# it could shadow the sam2 package and cause issues.
+if os.path.isdir(os.path.join(sam2.__path__[0], "sam2")):
+    # If the user has "sam2/sam2" in their path, they are likely importing the repo itself
+    # as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory).
+    # This typically happens because the user is running Python from the parent directory
+    # that contains the sam2 repo they cloned.
+    raise RuntimeError(
+        "You're likely running Python from the parent directory of the sam2 repository "
+        "(i.e. the directory where https://github.com/facebookresearch/sam2 is cloned into). "
+        "This is not supported since the `sam2` Python package could be shadowed by the "
+        "repository name (the repository is also named `sam2` and contains the Python package "
+        "in `sam2/sam2`). Please run Python from another directory (e.g. from the repo dir "
+        "rather than its parent dir, or from your home directory) after installing SAM 2."
+    )
+
+
+# Maps a Hugging Face model repository ID to a pair of
+# (Hydra config name, checkpoint filename inside that repository).
+# Covers the SAM 2 and SAM 2.1 tiny / small / base-plus / large variants.
+HF_MODEL_ID_TO_FILENAMES = {
+    "facebook/sam2-hiera-tiny": (
+        "configs/sam2/sam2_hiera_t.yaml",
+        "sam2_hiera_tiny.pt",
+    ),
+    "facebook/sam2-hiera-small": (
+        "configs/sam2/sam2_hiera_s.yaml",
+        "sam2_hiera_small.pt",
+    ),
+    "facebook/sam2-hiera-base-plus": (
+        "configs/sam2/sam2_hiera_b+.yaml",
+        "sam2_hiera_base_plus.pt",
+    ),
+    "facebook/sam2-hiera-large": (
+        "configs/sam2/sam2_hiera_l.yaml",
+        "sam2_hiera_large.pt",
+    ),
+    "facebook/sam2.1-hiera-tiny": (
+        "configs/sam2.1/sam2.1_hiera_t.yaml",
+        "sam2.1_hiera_tiny.pt",
+    ),
+    "facebook/sam2.1-hiera-small": (
+        "configs/sam2.1/sam2.1_hiera_s.yaml",
+        "sam2.1_hiera_small.pt",
+    ),
+    "facebook/sam2.1-hiera-base-plus": (
+        "configs/sam2.1/sam2.1_hiera_b+.yaml",
+        "sam2.1_hiera_base_plus.pt",
+    ),
+    "facebook/sam2.1-hiera-large": (
+        "configs/sam2.1/sam2.1_hiera_l.yaml",
+        "sam2.1_hiera_large.pt",
+    ),
+}
+
+
+def build_sam2(
+ config_file,
+ ckpt_path=None,
+ device="cuda",
+ mode="eval",
+ hydra_overrides_extra=[],
+ apply_postprocessing=True,
+ **kwargs,
+):
+
+ if apply_postprocessing:
+ hydra_overrides_extra = hydra_overrides_extra.copy()
+ hydra_overrides_extra += [
+ # dynamically fall back to multi-mask if the single mask is not stable
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
+ ]
+ # Read config and init model
+ cfg = compose(config_name=config_file, overrides=hydra_overrides_extra)
+ OmegaConf.resolve(cfg)
+ model = instantiate(cfg.model, _recursive_=True)
+ _load_checkpoint(model, ckpt_path)
+ model = model.to(device)
+ if mode == "eval":
+ model.eval()
+ return model
+
+
+def build_sam2_video_predictor(
+ config_file,
+ ckpt_path=None,
+ device="cuda",
+ mode="eval",
+ hydra_overrides_extra=[],
+ apply_postprocessing=True,
+ vos_optimized=False,
+ **kwargs,
+):
+ hydra_overrides = [
+ "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",
+ ]
+ if vos_optimized:
+ hydra_overrides = [
+ "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictorVOS",
+ "++model.compile_image_encoder=True", # Let sam2_base handle this
+ ]
+
+ if apply_postprocessing:
+ hydra_overrides_extra = hydra_overrides_extra.copy()
+ hydra_overrides_extra += [
+ # dynamically fall back to multi-mask if the single mask is not stable
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
+ # the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
+ "++model.binarize_mask_from_pts_for_mem_enc=true",
+ # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
+ "++model.fill_hole_area=8",
+ ]
+ hydra_overrides.extend(hydra_overrides_extra)
+
+ # Read config and init model
+ cfg = compose(config_name=config_file, overrides=hydra_overrides)
+ OmegaConf.resolve(cfg)
+ model = instantiate(cfg.model, _recursive_=True)
+ _load_checkpoint(model, ckpt_path)
+ model = model.to(device)
+ if mode == "eval":
+ model.eval()
+ return model
+
+
+def _hf_download(model_id):
+ from huggingface_hub import hf_hub_download
+
+ config_name, checkpoint_name = HF_MODEL_ID_TO_FILENAMES[model_id]
+ ckpt_path = hf_hub_download(repo_id=model_id, filename=checkpoint_name)
+ return config_name, ckpt_path
+
+
+def build_sam2_hf(model_id, **kwargs):
+ config_name, ckpt_path = _hf_download(model_id)
+ return build_sam2(config_file=config_name, ckpt_path=ckpt_path, **kwargs)
+
+
+def build_sam2_video_predictor_hf(model_id, **kwargs):
+ config_name, ckpt_path = _hf_download(model_id)
+ return build_sam2_video_predictor(
+ config_file=config_name, ckpt_path=ckpt_path, **kwargs
+ )
+
+
+def _load_checkpoint(model, ckpt_path):
+ if ckpt_path is not None:
+ sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
+ missing_keys, unexpected_keys = model.load_state_dict(sd)
+ if missing_keys:
+ logging.error(missing_keys)
+ raise RuntimeError()
+ if unexpected_keys:
+ logging.error(unexpected_keys)
+ raise RuntimeError()
+ logging.info("Loaded checkpoint sucessfully")
diff --git a/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d7172f9b0b663aaaace97fed7e2a08db75150461
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml
@@ -0,0 +1,116 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 112
+ num_heads: 2
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [896, 448, 224, 112]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_l.yaml b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_l.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..23073ea7a95901be656b3c6d1a66ce8736ab7ad3
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_l.yaml
@@ -0,0 +1,120 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 144
+ num_heads: 2
+ stages: [2, 6, 36, 4]
+ global_att_blocks: [23, 33, 43]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ window_spec: [8, 4, 16, 8]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [1152, 576, 288, 144]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_s.yaml b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_s.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fd8d40465b18b3de39b0a565aca712306306c4ed
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_s.yaml
@@ -0,0 +1,119 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 11, 2]
+ global_att_blocks: [7, 10, 13]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_t.yaml b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_t.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e762aec932f26436d13798f3feb3ec82c360a943
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2.1/sam2.1_hiera_t.yaml
@@ -0,0 +1,121 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 7, 2]
+ global_att_blocks: [5, 7, 9]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ # SAM decoder
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ # HieraT does not currently support compilation, should always be set to False
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml b/sam2_repo/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9b6faa79f47ee576faf007bffd23fb6649bd881d
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml
@@ -0,0 +1,339 @@
+# @package _global_
+
+scratch:
+ resolution: 1024
+ train_batch_size: 1
+ num_train_workers: 10
+ num_frames: 8
+ max_num_objects: 3
+ base_lr: 5.0e-6
+ vision_lr: 3.0e-06
+ phases_per_epoch: 1
+ num_epochs: 40
+
+dataset:
+ # PATHS to Dataset
+ img_folder: null # PATH to MOSE JPEGImages folder
+ gt_folder: null # PATH to MOSE Annotations folder
+ file_list_txt: training/assets/MOSE_sample_train_list.txt # Optional PATH to filelist containing a subset of videos to be used for training
+ multiplier: 2
+
+# Video transforms
+vos:
+ train_transforms:
+ - _target_: training.dataset.transforms.ComposeAPI
+ transforms:
+ - _target_: training.dataset.transforms.RandomHorizontalFlip
+ consistent_transform: True
+ - _target_: training.dataset.transforms.RandomAffine
+ degrees: 25
+ shear: 20
+ image_interpolation: bilinear
+ consistent_transform: True
+ - _target_: training.dataset.transforms.RandomResizeAPI
+ sizes: ${scratch.resolution}
+ square: true
+ consistent_transform: True
+ - _target_: training.dataset.transforms.ColorJitter
+ consistent_transform: True
+ brightness: 0.1
+ contrast: 0.03
+ saturation: 0.03
+ hue: null
+ - _target_: training.dataset.transforms.RandomGrayscale
+ p: 0.05
+ consistent_transform: True
+ - _target_: training.dataset.transforms.ColorJitter
+ consistent_transform: False
+ brightness: 0.1
+ contrast: 0.05
+ saturation: 0.05
+ hue: null
+ - _target_: training.dataset.transforms.ToTensorAPI
+ - _target_: training.dataset.transforms.NormalizeAPI
+ mean: [0.485, 0.456, 0.406]
+ std: [0.229, 0.224, 0.225]
+
+trainer:
+ _target_: training.trainer.Trainer
+ mode: train_only
+ max_epochs: ${times:${scratch.num_epochs},${scratch.phases_per_epoch}}
+ accelerator: cuda
+ seed_value: 123
+
+ model:
+ _target_: training.model.sam2.SAM2Train
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 112
+ num_heads: 2
+ drop_path_rate: 0.1
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [896, 448, 224, 112]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: ${scratch.resolution}
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ no_obj_embed_spatial: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: true
+ proj_tpos_enc_in_obj_ptrs: true
+ use_signed_tpos_enc_to_obj_ptrs: true
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ # compile_image_encoder: False
+
+ ####### Training specific params #######
+ # box/point input and corrections
+ prob_to_use_pt_input_for_train: 0.5
+ prob_to_use_pt_input_for_eval: 0.0
+ prob_to_use_box_input_for_train: 0.5 # 0.5*0.5 = 0.25 prob to use box instead of points
+ prob_to_use_box_input_for_eval: 0.0
+ prob_to_sample_from_gt_for_train: 0.1 # with a small prob, sampling correction points from GT mask instead of prediction errors
+ num_frames_to_correct_for_train: 2 # iteratively sample on random 1~2 frames (always include the first frame)
+ num_frames_to_correct_for_eval: 1 # only iteratively sample on first frame
+ rand_frames_to_correct_for_train: True # randomly sample how many frames to correct, from #init-cond-frames up to 2 (num_frames_to_correct_for_train)
+ add_all_frames_to_correct_as_cond: True # when a frame receives a correction click, it becomes a conditioning frame (even if it's not initially a conditioning frame)
+ # maximum 2 initial conditioning frames
+ num_init_cond_frames_for_train: 2
+ rand_init_cond_frames_for_train: True # randomly use 1 or 2 initial conditioning frames
+ num_correction_pt_per_frame: 7
+ use_act_ckpt_iterative_pt_sampling: false
+
+
+
+ num_init_cond_frames_for_eval: 1 # only mask on the first frame
+ forward_backbone_per_frame_for_eval: True
+
+
+ data:
+ train:
+ _target_: training.dataset.sam2_datasets.TorchTrainMixedDataset
+ phases_per_epoch: ${scratch.phases_per_epoch}
+ batch_sizes:
+ - ${scratch.train_batch_size}
+
+ datasets:
+ - _target_: training.dataset.utils.RepeatFactorWrapper
+ dataset:
+ _target_: training.dataset.utils.ConcatDataset
+ datasets:
+ - _target_: training.dataset.vos_dataset.VOSDataset
+ transforms: ${vos.train_transforms}
+ training: true
+ video_dataset:
+ _target_: training.dataset.vos_raw_dataset.PNGRawDataset
+ img_folder: ${dataset.img_folder}
+ gt_folder: ${dataset.gt_folder}
+ file_list_txt: ${dataset.file_list_txt}
+ sampler:
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
+ num_frames: ${scratch.num_frames}
+ max_num_objects: ${scratch.max_num_objects}
+ multiplier: ${dataset.multiplier}
+ shuffle: True
+ num_workers: ${scratch.num_train_workers}
+ pin_memory: True
+ drop_last: True
+ collate_fn:
+ _target_: training.utils.data_utils.collate_fn
+ _partial_: true
+ dict_key: all
+
+ optim:
+ amp:
+ enabled: True
+ amp_dtype: bfloat16
+
+ optimizer:
+ _target_: torch.optim.AdamW
+
+ gradient_clip:
+ _target_: training.optimizer.GradientClipper
+ max_norm: 0.1
+ norm_type: 2
+
+ param_group_modifiers:
+ - _target_: training.optimizer.layer_decay_param_modifier
+ _partial_: True
+ layer_decay_value: 0.9
+ apply_to: 'image_encoder.trunk'
+ overrides:
+ - pattern: '*pos_embed*'
+ value: 1.0
+
+ options:
+ lr:
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.CosineParamScheduler
+ start_value: ${scratch.base_lr}
+ end_value: ${divide:${scratch.base_lr},10}
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.CosineParamScheduler
+ start_value: ${scratch.vision_lr}
+ end_value: ${divide:${scratch.vision_lr},10}
+ param_names:
+ - 'image_encoder.*'
+ weight_decay:
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.ConstantParamScheduler
+ value: 0.1
+ - scheduler:
+ _target_: fvcore.common.param_scheduler.ConstantParamScheduler
+ value: 0.0
+ param_names:
+ - '*bias*'
+ module_cls_names: ['torch.nn.LayerNorm']
+
+ loss:
+ all:
+ _target_: training.loss_fns.MultiStepMultiMasksAndIous
+ weight_dict:
+ loss_mask: 20
+ loss_dice: 1
+ loss_iou: 1
+ loss_class: 1
+ supervise_all_iou: true
+ iou_use_l1_loss: true
+ pred_obj_scores: true
+ focal_gamma_obj_score: 0.0
+ focal_alpha_obj_score: -1.0
+
+ distributed:
+ backend: nccl
+ find_unused_parameters: True
+
+ logging:
+ tensorboard_writer:
+ _target_: training.utils.logger.make_tensorboard_logger
+ log_dir: ${launcher.experiment_log_dir}/tensorboard
+ flush_secs: 120
+ should_log: True
+ log_dir: ${launcher.experiment_log_dir}/logs
+ log_freq: 10
+
+ # initialize from a SAM 2.1 checkpoint (see checkpoint_path below)
+ checkpoint:
+ save_dir: ${launcher.experiment_log_dir}/checkpoints
+ save_freq: 0 # 0 = only the last checkpoint is saved.
+ model_weight_initializer:
+ _partial_: True
+ _target_: training.utils.checkpoint_utils.load_state_dict_into_model
+ strict: True
+ ignore_unexpected_keys: null
+ ignore_missing_keys: null
+
+ state_dict:
+ _target_: training.utils.checkpoint_utils.load_checkpoint_and_apply_kernels
+ checkpoint_path: ./checkpoints/sam2.1_hiera_base_plus.pt # PATH to SAM 2.1 checkpoint
+ ckpt_state_dict_keys: ['model']
+
+launcher:
+ num_nodes: 1
+ gpus_per_node: 8
+ experiment_log_dir: null # Path to log directory, defaults to ./sam2_logs/${config_name}
+
+# SLURM args if running on a cluster
+submitit:
+ partition: null
+ account: null
+ qos: null
+ cpus_per_task: 10
+ use_cluster: false
+ timeout_hour: 24
+ name: null
+ port_range: [10000, 65000]
+
diff --git a/sam2_repo/sam2/configs/sam2/sam2_hiera_b+.yaml b/sam2_repo/sam2/configs/sam2/sam2_hiera_b+.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0f435af02fc88e2d3b7bff06f8cf8013cc079c24
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2/sam2_hiera_b+.yaml
@@ -0,0 +1,113 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 112
+ num_heads: 2
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [896, 448, 224, 112]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/configs/sam2/sam2_hiera_l.yaml b/sam2_repo/sam2/configs/sam2/sam2_hiera_l.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1092802b1d24be6fedf78939f45b0d021d4ec560
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2/sam2_hiera_l.yaml
@@ -0,0 +1,117 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 144
+ num_heads: 2
+ stages: [2, 6, 36, 4]
+ global_att_blocks: [23, 33, 43]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ window_spec: [8, 4, 16, 8]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [1152, 576, 288, 144]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/configs/sam2/sam2_hiera_s.yaml b/sam2_repo/sam2/configs/sam2/sam2_hiera_s.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..174e414f1467d80e94a34e9525dc373058f8caaa
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2/sam2_hiera_s.yaml
@@ -0,0 +1,116 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 11, 2]
+ global_att_blocks: [7, 10, 13]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/configs/sam2/sam2_hiera_t.yaml b/sam2_repo/sam2/configs/sam2/sam2_hiera_t.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..121447aabd5318fac20efc2bc00d7c406ca26f01
--- /dev/null
+++ b/sam2_repo/sam2/configs/sam2/sam2_hiera_t.yaml
@@ -0,0 +1,118 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 7, 2]
+ global_att_blocks: [5, 7, 9]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ # SAM decoder
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ # HieraT does not currently support compilation, should always be set to False
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/csrc/connected_components.cu b/sam2_repo/sam2/csrc/connected_components.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ced21eb32eaaadb818d441c1322b99d1bf068f45
--- /dev/null
+++ b/sam2_repo/sam2/csrc/connected_components.cu
@@ -0,0 +1,289 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+
+// This source code is licensed under the license found in the
+// LICENSE file in the root directory of this source tree.
+
+// adapted from https://github.com/zsef123/Connected_components_PyTorch
+// with license found in the LICENSE_cctorch file in the root directory.
+#include
+#include
+#include
+#include
+#include
+#include
+
+// 2d
+#define BLOCK_ROWS 16
+#define BLOCK_COLS 16
+
+namespace cc2d {
+
+// Return bit `pos` of `bitmap` as 0 or 1 (shift right, mask with 1).
+// NOTE(review): `template` has no parameter list in this listing -- it looks
+// stripped; `T` is presumably the template type parameter. Confirm upstream.
+template
+__device__ __forceinline__ unsigned char hasBit(T bitmap, unsigned char pos) {
+ return (bitmap >> pos) & 1;
+}
+
+// Follow the links stored in `s_buf` starting from `n` until reaching an entry
+// that holds its own index, and return that index. Does not modify `s_buf`.
+__device__ int32_t find(const int32_t* s_buf, int32_t n) {
+ while (s_buf[n] != n)
+ n = s_buf[n];
+ return n;
+}
+
+// Same walk as find(), but on every step the starting entry `s_buf[id]` is
+// overwritten with the node just reached, so it finally points at the returned
+// index. Only the starting entry is rewritten, not every node on the path.
+__device__ int32_t find_n_compress(int32_t* s_buf, int32_t n) {
+ const int32_t id = n;
+ while (s_buf[n] != n) {
+ n = s_buf[n];
+ s_buf[id] = n;
+ }
+ return n;
+}
+
+// Merge the sets containing `a` and `b`: resolve both to their representative
+// via find(), then store the smaller index into the larger one's slot with
+// atomicMin. Loops until the two representatives are equal or a store is
+// observed to have hit a slot that still held its own index.
+__device__ void union_(int32_t* s_buf, int32_t a, int32_t b) {
+ bool done;
+ do {
+ a = find(s_buf, a);
+ b = find(s_buf, b);
+
+ if (a < b) {
+ // atomicMin returns the value previously stored at s_buf[b]. If it was
+ // not `b`, the slot no longer held its own index (presumably a concurrent
+ // update), so continue from that previous value.
+ int32_t old = atomicMin(s_buf + b, a);
+ done = (old == b);
+ b = old;
+ } else if (b < a) {
+ // Mirror case: hang `a` under the smaller `b`.
+ int32_t old = atomicMin(s_buf + a, b);
+ done = (old == a);
+ a = old;
+ } else
+ done = true;
+
+ } while (!done);
+}
+
+// Kernel: each thread maps to one even-coordinate position (thread indices are
+// multiplied by 2) and seeds label[idx] with its own flat index row * W + col.
+// Positions at odd rows/cols are not written here.
+__global__ void
+init_labeling(int32_t* label, const uint32_t W, const uint32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row < H && col < W)
+ label[idx] = idx;
+}
+
+// Kernel: for the even-coordinate position (row, col), build a bitmask `P` from
+// the pixels at idx, idx + W (row below) and idx + 1 (column to the right),
+// clear bits according to image-border conditions, then union this position's
+// label with the labels at idx - 2W - 2, idx - 2W, idx - 2W + 2 and idx - 2
+// when the corresponding bit is set and the tested neighbour pixel is non-zero.
+__global__ void
+merge(uint8_t* img, int32_t* label, const uint32_t W, const uint32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ uint32_t P = 0;
+
+ // Each non-zero pixel contributes the 0x777 pattern at a different shift.
+ if (img[idx])
+ P |= 0x777;
+ if (row + 1 < H && img[idx + W])
+ P |= 0x777 << 4;
+ if (col + 1 < W && img[idx + 1])
+ P |= 0x777 << 1;
+
+ // Column-border masks: clear bits 0/4/8/12 at the first column, and the
+ // upper bits of each nibble when within one or two columns of the right edge.
+ if (col == 0)
+ P &= 0xEEEE;
+ if (col + 1 >= W)
+ P &= 0x3333;
+ else if (col + 2 >= W)
+ P &= 0x7777;
+
+ // Row-border masks: drop the low nibble on the first row; keep only the low
+ // byte when there is no row below.
+ if (row == 0)
+ P &= 0xFFF0;
+ if (row + 1 >= H)
+ P &= 0xFF;
+
+ if (P > 0) {
+ // If need check about top-left pixel(if flag the first bit) and hit the
+ // top-left pixel
+ if (hasBit(P, 0) && img[idx - W - 1]) {
+ union_(label, idx, idx - 2 * W - 2); // top left block
+ }
+
+ // Bits 1 and 2 gate the two pixels directly above (idx - W, idx - W + 1).
+ if ((hasBit(P, 1) && img[idx - W]) || (hasBit(P, 2) && img[idx - W + 1]))
+ union_(label, idx, idx - 2 * W); // top bottom block
+
+ // Bit 3 gates the pixel above and two columns to the right.
+ if (hasBit(P, 3) && img[idx + 2 - W])
+ union_(label, idx, idx - 2 * W + 2); // top right block
+
+ // Bits 4 and 8 gate the two pixels immediately to the left.
+ if ((hasBit(P, 4) && img[idx - 1]) || (hasBit(P, 8) && img[idx + W - 1]))
+ union_(label, idx, idx - 2); // just left block
+ }
+}
+
+// Kernel: run find_n_compress() on the label entry of every even-coordinate
+// position inside the image.
+__global__ void compression(int32_t* label, const int32_t W, const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row < H && col < W)
+ find_n_compress(label, idx);
+}
+
+// Kernel: take y = label[idx] + 1 at the even-coordinate position and write it
+// to each in-bounds pixel among idx, idx + 1, idx + W, idx + W + 1 whose `img`
+// value is non-zero; pixels with a zero `img` value get label 0.
+__global__ void final_labeling(
+ const uint8_t* img,
+ int32_t* label,
+ const int32_t W,
+ const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ // +1 so that 0 can be used as the label of zero-valued pixels below.
+ int32_t y = label[idx] + 1;
+
+ if (img[idx])
+ label[idx] = y;
+ else
+ label[idx] = 0;
+
+ if (col + 1 < W) {
+ if (img[idx + 1])
+ label[idx + 1] = y;
+ else
+ label[idx + 1] = 0;
+
+ if (row + 1 < H) {
+ if (img[idx + W + 1])
+ label[idx + W + 1] = y;
+ else
+ label[idx + W + 1] = 0;
+ }
+ }
+
+ if (row + 1 < H) {
+ if (img[idx + W])
+ label[idx + W] = y;
+ else
+ label[idx + W] = 0;
+ }
+}
+
+// Kernel: one thread per pixel (no *2 here). For every pixel with a positive
+// label y, atomically increment count_init[y - 1].
+__global__ void init_counting(
+ const int32_t* label,
+ int32_t* count_init,
+ const int32_t W,
+ const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ int32_t y = label[idx];
+ if (y > 0) {
+ int32_t count_idx = y - 1;
+ atomicAdd(count_init + count_idx, 1);
+ }
+}
+
+// Kernel: one thread per pixel. Copies the count accumulated for the pixel's
+// label (count_init[y - 1]) into count_final[idx]; pixels with label <= 0 get 0.
+__global__ void final_counting(
+ const int32_t* label,
+ const int32_t* count_init,
+ int32_t* count_final,
+ const int32_t W,
+ const int32_t H) {
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+ const uint32_t idx = row * W + col;
+
+ if (row >= H || col >= W)
+ return;
+
+ int32_t y = label[idx];
+ if (y > 0) {
+ int32_t count_idx = y - 1;
+ count_final[idx] = count_init[count_idx];
+ } else {
+ count_final[idx] = 0;
+ }
+}
+
+} // namespace cc2d
+
+// Host entry point. Validates that `inputs` is a CUDA uint8 tensor of shape
+// [N, 1, H, W] with even H and W, allocates three int32 tensors of the same
+// shape, runs the cc2d kernels once per batch element, and returns
+// {labels, counts_final}.
+// NOTE(review): angle-bracket spans appear stripped throughout this function in
+// this listing (the `std::vector` element type, the `data_ptr()` template
+// argument, and the kernel launch configuration inside `<<>>`). Confirm against
+// upstream before building.
+std::vector get_connected_componnets(
+ const torch::Tensor& inputs) {
+ AT_ASSERTM(inputs.is_cuda(), "inputs must be a CUDA tensor");
+ AT_ASSERTM(inputs.ndimension() == 4, "inputs must be [N, 1, H, W] shape");
+ AT_ASSERTM(
+ inputs.scalar_type() == torch::kUInt8, "inputs must be a uint8 type");
+
+ const uint32_t N = inputs.size(0);
+ const uint32_t C = inputs.size(1);
+ const uint32_t H = inputs.size(2);
+ const uint32_t W = inputs.size(3);
+
+ AT_ASSERTM(C == 1, "inputs must be [N, 1, H, W] shape");
+ AT_ASSERTM((H % 2) == 0, "height must be an even number");
+ AT_ASSERTM((W % 2) == 0, "width must be an even number");
+
+ // label must be uint32_t
+ // NOTE(review): the comment above disagrees with the code -- the tensors are
+ // created with torch::kInt32 and the kernels take int32_t*.
+ auto label_options =
+ torch::TensorOptions().dtype(torch::kInt32).device(inputs.device());
+ torch::Tensor labels = torch::zeros({N, C, H, W}, label_options);
+ torch::Tensor counts_init = torch::zeros({N, C, H, W}, label_options);
+ torch::Tensor counts_final = torch::zeros({N, C, H, W}, label_options);
+
+ // `grid`/`block` cover half the image in each dimension (rounded up), matching
+ // the kernels that multiply their thread indices by 2.
+ dim3 grid = dim3(
+ ((W + 1) / 2 + BLOCK_COLS - 1) / BLOCK_COLS,
+ ((H + 1) / 2 + BLOCK_ROWS - 1) / BLOCK_ROWS);
+ dim3 block = dim3(BLOCK_COLS, BLOCK_ROWS);
+ // `grid_count`/`block_count` cover the full image for the per-pixel counting
+ // kernels. (W + BLOCK_COLS) / BLOCK_COLS yields one extra block when W is a
+ // multiple of BLOCK_COLS; the kernels bounds-check row/col.
+ dim3 grid_count =
+ dim3((W + BLOCK_COLS) / BLOCK_COLS, (H + BLOCK_ROWS) / BLOCK_ROWS);
+ dim3 block_count = dim3(BLOCK_COLS, BLOCK_ROWS);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ for (int n = 0; n < N; n++) {
+ // Element offset of batch item n (C == 1 is asserted above).
+ uint32_t offset = n * H * W;
+
+ cc2d::init_labeling<<>>(
+ labels.data_ptr() + offset, W, H);
+ cc2d::merge<<>>(
+ inputs.data_ptr() + offset,
+ labels.data_ptr() + offset,
+ W,
+ H);
+ cc2d::compression<<>>(
+ labels.data_ptr() + offset, W, H);
+ cc2d::final_labeling<<>>(
+ inputs.data_ptr() + offset,
+ labels.data_ptr() + offset,
+ W,
+ H);
+
+ // get the counting of each pixel
+ cc2d::init_counting<<>>(
+ labels.data_ptr() + offset,
+ counts_init.data_ptr() + offset,
+ W,
+ H);
+ cc2d::final_counting<<>>(
+ labels.data_ptr() + offset,
+ counts_init.data_ptr() + offset,
+ counts_final.data_ptr() + offset,
+ W,
+ H);
+ }
+
+ // returned values are [labels, counts]
+ std::vector outputs;
+ outputs.push_back(labels);
+ outputs.push_back(counts_final);
+ return outputs;
+}
+
+// Python binding: exposes get_connected_componnets under the same name
+// (the "componnets" spelling is part of the exported identifier).
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def(
+ "get_connected_componnets",
+ &get_connected_componnets,
+ "get_connected_componnets");
+}
diff --git a/sam2_repo/sam2/modeling/__init__.py b/sam2_repo/sam2/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/sam2_repo/sam2/modeling/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/sam2_repo/sam2/modeling/__pycache__/__init__.cpython-313.pyc b/sam2_repo/sam2/modeling/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f2d48abdb00e8d0f0b054dde510c6ba0c9242e8
Binary files /dev/null and b/sam2_repo/sam2/modeling/__pycache__/__init__.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/__pycache__/memory_attention.cpython-313.pyc b/sam2_repo/sam2/modeling/__pycache__/memory_attention.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..caad394f7def7359395e77e6c3b9860bf744c33d
Binary files /dev/null and b/sam2_repo/sam2/modeling/__pycache__/memory_attention.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/__pycache__/memory_encoder.cpython-313.pyc b/sam2_repo/sam2/modeling/__pycache__/memory_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c98a0be5f016ee105c54f0a0e8ef09f1d5aa2b3
Binary files /dev/null and b/sam2_repo/sam2/modeling/__pycache__/memory_encoder.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/__pycache__/position_encoding.cpython-313.pyc b/sam2_repo/sam2/modeling/__pycache__/position_encoding.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f1ae19d6e077b4d03231ac5738c8f76ae11d660
Binary files /dev/null and b/sam2_repo/sam2/modeling/__pycache__/position_encoding.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/__pycache__/sam2_base.cpython-313.pyc b/sam2_repo/sam2/modeling/__pycache__/sam2_base.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92957e9842ae025f89775e76a721a73a0ca9007a
Binary files /dev/null and b/sam2_repo/sam2/modeling/__pycache__/sam2_base.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/__pycache__/sam2_utils.cpython-313.pyc b/sam2_repo/sam2/modeling/__pycache__/sam2_utils.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22a4cb18c2e349e031548f89fa4c4ea93603d7f5
Binary files /dev/null and b/sam2_repo/sam2/modeling/__pycache__/sam2_utils.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/backbones/__init__.py b/sam2_repo/sam2/modeling/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/sam2_repo/sam2/modeling/backbones/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/sam2_repo/sam2/modeling/backbones/__pycache__/__init__.cpython-313.pyc b/sam2_repo/sam2/modeling/backbones/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..098d8dd33d84133403e099a182cc68c25f3a6c0d
Binary files /dev/null and b/sam2_repo/sam2/modeling/backbones/__pycache__/__init__.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/backbones/__pycache__/hieradet.cpython-313.pyc b/sam2_repo/sam2/modeling/backbones/__pycache__/hieradet.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..807f5de0bbb16bf8ac7bddccf7233e559558aa25
Binary files /dev/null and b/sam2_repo/sam2/modeling/backbones/__pycache__/hieradet.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/backbones/__pycache__/image_encoder.cpython-313.pyc b/sam2_repo/sam2/modeling/backbones/__pycache__/image_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1f50129cd206e7cc5d3247f7a09c808358cc392
Binary files /dev/null and b/sam2_repo/sam2/modeling/backbones/__pycache__/image_encoder.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/backbones/__pycache__/utils.cpython-313.pyc b/sam2_repo/sam2/modeling/backbones/__pycache__/utils.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e393ab13034367b6fb1a52e803947f37785f40b
Binary files /dev/null and b/sam2_repo/sam2/modeling/backbones/__pycache__/utils.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/backbones/hieradet.py b/sam2_repo/sam2/modeling/backbones/hieradet.py
new file mode 100644
index 0000000000000000000000000000000000000000..19ac77b61d8e1345a301686d39ef2ab6e4b035fb
--- /dev/null
+++ b/sam2_repo/sam2/modeling/backbones/hieradet.py
@@ -0,0 +1,317 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from functools import partial
+from typing import List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from iopath.common.file_io import g_pathmgr
+
+from sam2.modeling.backbones.utils import (
+ PatchEmbed,
+ window_partition,
+ window_unpartition,
+)
+
+from sam2.modeling.sam2_utils import DropPath, MLP
+
+
+def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor:
+ if pool is None:
+ return x
+ # (B, H, W, C) -> (B, C, H, W)
+ x = x.permute(0, 3, 1, 2)
+ x = pool(x)
+ # (B, C, H', W') -> (B, H', W', C)
+ x = x.permute(0, 2, 3, 1)
+ if norm:
+ x = norm(x)
+
+ return x
+
+
+class MultiScaleAttention(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ dim_out: int,
+ num_heads: int,
+ q_pool: nn.Module = None,
+ ):
+ super().__init__()
+
+ self.dim = dim
+ self.dim_out = dim_out
+ self.num_heads = num_heads
+ self.q_pool = q_pool
+ self.qkv = nn.Linear(dim, dim_out * 3)
+ self.proj = nn.Linear(dim_out, dim_out)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ B, H, W, _ = x.shape
+ # qkv with shape (B, H * W, 3, nHead, C)
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1)
+ # q, k, v with shape (B, H * W, nheads, C)
+ q, k, v = torch.unbind(qkv, 2)
+
+ # Q pooling (for downsample at stage changes)
+ if self.q_pool:
+ q = do_pool(q.reshape(B, H, W, -1), self.q_pool)
+ H, W = q.shape[1:3] # downsampled shape
+ q = q.reshape(B, H * W, self.num_heads, -1)
+
+ # Torch's SDPA expects [B, nheads, H*W, C] so we transpose
+ x = F.scaled_dot_product_attention(
+ q.transpose(1, 2),
+ k.transpose(1, 2),
+ v.transpose(1, 2),
+ )
+ # Transpose back
+ x = x.transpose(1, 2)
+ x = x.reshape(B, H, W, -1)
+
+ x = self.proj(x)
+
+ return x
+
+
+class MultiScaleBlock(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ dim_out: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ drop_path: float = 0.0,
+ norm_layer: Union[nn.Module, str] = "LayerNorm",
+ q_stride: Tuple[int, int] = None,
+ act_layer: nn.Module = nn.GELU,
+ window_size: int = 0,
+ ):
+ super().__init__()
+
+ if isinstance(norm_layer, str):
+ norm_layer = partial(getattr(nn, norm_layer), eps=1e-6)
+
+ self.dim = dim
+ self.dim_out = dim_out
+ self.norm1 = norm_layer(dim)
+
+ self.window_size = window_size
+
+ self.pool, self.q_stride = None, q_stride
+ if self.q_stride:
+ self.pool = nn.MaxPool2d(
+ kernel_size=q_stride, stride=q_stride, ceil_mode=False
+ )
+
+ self.attn = MultiScaleAttention(
+ dim,
+ dim_out,
+ num_heads=num_heads,
+ q_pool=self.pool,
+ )
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+ self.norm2 = norm_layer(dim_out)
+ self.mlp = MLP(
+ dim_out,
+ int(dim_out * mlp_ratio),
+ dim_out,
+ num_layers=2,
+ activation=act_layer,
+ )
+
+ if dim != dim_out:
+ self.proj = nn.Linear(dim, dim_out)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ shortcut = x # B, H, W, C
+ x = self.norm1(x)
+
+ # Skip connection
+ if self.dim != self.dim_out:
+ shortcut = do_pool(self.proj(x), self.pool)
+
+ # Window partition
+ window_size = self.window_size
+ if window_size > 0:
+ H, W = x.shape[1], x.shape[2]
+ x, pad_hw = window_partition(x, window_size)
+
+ # Window Attention + Q Pooling (if stage change)
+ x = self.attn(x)
+ if self.q_stride:
+ # Shapes have changed due to Q pooling
+ window_size = self.window_size // self.q_stride[0]
+ H, W = shortcut.shape[1:3]
+
+ pad_h = (window_size - H % window_size) % window_size
+ pad_w = (window_size - W % window_size) % window_size
+ pad_hw = (H + pad_h, W + pad_w)
+
+ # Reverse window partition
+ if self.window_size > 0:
+ x = window_unpartition(x, window_size, pad_hw, (H, W))
+
+ x = shortcut + self.drop_path(x)
+ # MLP
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+
+class Hiera(nn.Module):
+ """
+ Reference: https://arxiv.org/abs/2306.00989
+ """
+
+ def __init__(
+ self,
+ embed_dim: int = 96, # initial embed dim
+ num_heads: int = 1, # initial number of heads
+ drop_path_rate: float = 0.0, # stochastic depth
+ q_pool: int = 3, # number of q_pool stages
+ q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages
+ stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
+ dim_mul: float = 2.0, # dim_mul factor at stage shift
+ head_mul: float = 2.0, # head_mul factor at stage shift
+ window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
+ # window size per stage, when not using global att.
+ window_spec: Tuple[int, ...] = (
+ 8,
+ 4,
+ 14,
+ 7,
+ ),
+ # global attn in these blocks
+ global_att_blocks: Tuple[int, ...] = (
+ 12,
+ 16,
+ 20,
+ ),
+ weights_path=None,
+ return_interm_layers=True, # return feats from every stage
+ ):
+ super().__init__()
+
+ assert len(stages) == len(window_spec)
+ self.window_spec = window_spec
+
+ depth = sum(stages)
+ self.q_stride = q_stride
+ self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
+ assert 0 <= q_pool <= len(self.stage_ends[:-1])
+ self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool]
+ self.return_interm_layers = return_interm_layers
+
+ self.patch_embed = PatchEmbed(
+ embed_dim=embed_dim,
+ )
+ # Which blocks have global att?
+ self.global_att_blocks = global_att_blocks
+
+ # Windowed positional embedding (https://arxiv.org/abs/2311.05613)
+ self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size
+ self.pos_embed = nn.Parameter(
+ torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size)
+ )
+ self.pos_embed_window = nn.Parameter(
+ torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0])
+ )
+
+ dpr = [
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
+ ] # stochastic depth decay rule
+
+ cur_stage = 1
+ self.blocks = nn.ModuleList()
+
+ for i in range(depth):
+ dim_out = embed_dim
+ # lags by a block, so first block of
+ # next stage uses an initial window size
+ # of previous stage and final window size of current stage
+ window_size = self.window_spec[cur_stage - 1]
+
+ if self.global_att_blocks is not None:
+ window_size = 0 if i in self.global_att_blocks else window_size
+
+ if i - 1 in self.stage_ends:
+ dim_out = int(embed_dim * dim_mul)
+ num_heads = int(num_heads * head_mul)
+ cur_stage += 1
+
+ block = MultiScaleBlock(
+ dim=embed_dim,
+ dim_out=dim_out,
+ num_heads=num_heads,
+ drop_path=dpr[i],
+ q_stride=self.q_stride if i in self.q_pool_blocks else None,
+ window_size=window_size,
+ )
+
+ embed_dim = dim_out
+ self.blocks.append(block)
+
+ self.channel_list = (
+ [self.blocks[i].dim_out for i in self.stage_ends[::-1]]
+ if return_interm_layers
+ else [self.blocks[-1].dim_out]
+ )
+
+ if weights_path is not None:
+ with g_pathmgr.open(weights_path, "rb") as f:
+ chkpt = torch.load(f, map_location="cpu")
+ logging.info("loading Hiera", self.load_state_dict(chkpt, strict=False))
+
+ def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
+ h, w = hw
+ window_embed = self.pos_embed_window
+ pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic")
+ pos_embed = pos_embed + window_embed.tile(
+ [x // y for x, y in zip(pos_embed.shape, window_embed.shape)]
+ )
+ pos_embed = pos_embed.permute(0, 2, 3, 1)
+ return pos_embed
+
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+ x = self.patch_embed(x)
+ # x: (B, H, W, C)
+
+ # Add pos embed
+ x = x + self._get_pos_embed(x.shape[1:3])
+
+ outputs = []
+ for i, blk in enumerate(self.blocks):
+ x = blk(x)
+ if (i == self.stage_ends[-1]) or (
+ i in self.stage_ends and self.return_interm_layers
+ ):
+ feats = x.permute(0, 3, 1, 2)
+ outputs.append(feats)
+
+ return outputs
+
+ def get_layer_id(self, layer_name):
+ # https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
+ num_layers = self.get_num_layers()
+
+ if layer_name.find("rel_pos") != -1:
+ return num_layers + 1
+ elif layer_name.find("pos_embed") != -1:
+ return 0
+ elif layer_name.find("patch_embed") != -1:
+ return 0
+ elif layer_name.find("blocks") != -1:
+ return int(layer_name.split("blocks")[1].split(".")[1]) + 1
+ else:
+ return num_layers + 1
+
+ def get_num_layers(self) -> int:
+ return len(self.blocks)
diff --git a/sam2_repo/sam2/modeling/backbones/image_encoder.py b/sam2_repo/sam2/modeling/backbones/image_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..37e9266bc98596e97ca303118c910ed24f6cee2c
--- /dev/null
+++ b/sam2_repo/sam2/modeling/backbones/image_encoder.py
@@ -0,0 +1,134 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# Couples a backbone `trunk` with a `neck` and packages the neck's outputs
+# (features and positional encodings) into a dict.
+class ImageEncoder(nn.Module):
+ def __init__(
+ self,
+ trunk: nn.Module,
+ neck: nn.Module,
+ scalp: int = 0,
+ ):
+ # trunk: module whose output is fed to `neck`; must expose `channel_list`.
+ # neck: module returning a (features, pos) pair; must expose
+ # `backbone_channel_list`.
+ # scalp: number of trailing entries dropped from both lists in forward().
+ super().__init__()
+ self.trunk = trunk
+ self.neck = neck
+ self.scalp = scalp
+ # Fail fast when the trunk's output channels and the neck's expected input
+ # channels disagree.
+ assert (
+ self.trunk.channel_list == self.neck.backbone_channel_list
+ ), f"Channel dims of trunk and neck do not match. Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}"
+
+ def forward(self, sample: torch.Tensor):
+ # Returns a dict with:
+ # "vision_features": the last entry of the (possibly truncated) feature list
+ # "vision_pos_enc": the list of positional encodings from the neck
+ # "backbone_fpn": the (possibly truncated) feature list
+ # Forward through backbone
+ features, pos = self.neck(self.trunk(sample))
+ if self.scalp > 0:
+ # Discard the lowest resolution features
+ features, pos = features[: -self.scalp], pos[: -self.scalp]
+
+ src = features[-1]
+ output = {
+ "vision_features": src,
+ "vision_pos_enc": pos,
+ "backbone_fpn": features,
+ }
+ return output
+
+
+class FpnNeck(nn.Module):
+ """
+ A modified variant of Feature Pyramid Network (FPN) neck
+ (we remove output conv and also do bicubic interpolation similar to ViT
+ pos embed interpolation)
+ """
+
+ def __init__(
+ self,
+ position_encoding: nn.Module,
+ d_model: int,
+ backbone_channel_list: List[int],
+ kernel_size: int = 1,
+ stride: int = 1,
+ padding: int = 0,
+ fpn_interp_model: str = "bilinear",
+ fuse_type: str = "sum",
+ fpn_top_down_levels: Optional[List[int]] = None,
+ ):
+ # NOTE(review): the docstring below lists `trunk` and `neck_norm`, which are
+ # not parameters of this constructor. Actual parameters:
+ # position_encoding: module called on each output map in forward().
+ # d_model: out_channels of every lateral conv.
+ # backbone_channel_list: in_channels of the lateral convs, one conv per entry.
+ # kernel_size / stride / padding: passed to each lateral nn.Conv2d.
+ # fpn_interp_model: `mode` passed to F.interpolate for the top-down path.
+ # fuse_type: "sum" or "avg"; "avg" halves the fused lateral + top-down map.
+ # fpn_top_down_levels: levels that receive top-down features; None = all.
+ """Initialize the neck
+ :param trunk: the backbone
+ :param position_encoding: the positional encoding to use
+ :param d_model: the dimension of the model
+ :param neck_norm: the normalization to use
+ """
+ super().__init__()
+ self.position_encoding = position_encoding
+ self.convs = nn.ModuleList()
+ self.backbone_channel_list = backbone_channel_list
+ self.d_model = d_model
+ # One lateral conv (wrapped in a Sequential under the name "conv") per
+ # backbone channel size, in the order given.
+ for dim in backbone_channel_list:
+ current = nn.Sequential()
+ current.add_module(
+ "conv",
+ nn.Conv2d(
+ in_channels=dim,
+ out_channels=d_model,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ ),
+ )
+
+ self.convs.append(current)
+ self.fpn_interp_model = fpn_interp_model
+ assert fuse_type in ["sum", "avg"]
+ self.fuse_type = fuse_type
+
+ # levels to have top-down features in its outputs
+ # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3
+ # have top-down propagation, while outputs of level 0 and level 1 have only
+ # lateral features from the same backbone level.
+ if fpn_top_down_levels is None:
+ # default is to have top-down features on all levels
+ fpn_top_down_levels = range(len(self.convs))
+ self.fpn_top_down_levels = list(fpn_top_down_levels)
+
+ def forward(self, xs: List[torch.Tensor]):
+ # Takes one feature map per conv and returns two lists of the same length:
+ # the fused feature maps and their positional encodings.
+
+ out = [None] * len(self.convs)
+ pos = [None] * len(self.convs)
+ assert len(xs) == len(self.convs)
+ # fpn forward pass
+ # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
+ prev_features = None
+ # forward in top-down order (from low to high resolution)
+ n = len(self.convs) - 1
+ for i in range(n, -1, -1):
+ x = xs[i]
+ # The conv for input i is stored at index n - i, i.e. self.convs is in the
+ # reverse order of xs.
+ lateral_features = self.convs[n - i](x)
+ if i in self.fpn_top_down_levels and prev_features is not None:
+ # Upsample the previous (coarser) output by 2x in float32 before fusing.
+ top_down_features = F.interpolate(
+ prev_features.to(dtype=torch.float32),
+ scale_factor=2.0,
+ mode=self.fpn_interp_model,
+ align_corners=(
+ None if self.fpn_interp_model == "nearest" else False
+ ),
+ antialias=False,
+ )
+ prev_features = lateral_features + top_down_features
+ if self.fuse_type == "avg":
+ prev_features /= 2
+ else:
+ # No top-down input for this level: use the lateral features alone.
+ prev_features = lateral_features
+ x_out = prev_features
+ out[i] = x_out
+ pos[i] = self.position_encoding(x_out).to(x_out.dtype)
+
+ return out, pos
diff --git a/sam2_repo/sam2/modeling/backbones/utils.py b/sam2_repo/sam2/modeling/backbones/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..930b1b7622e7b0e7270120dcafccc242ef0f4f28
--- /dev/null
+++ b/sam2_repo/sam2/modeling/backbones/utils.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Some utilities for backbones, in particular for windowing"""
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def window_partition(x, window_size):
+ """
+ Partition into non-overlapping windows with padding if needed.
+ Args:
+ x (tensor): input tokens with [B, H, W, C].
+ window_size (int): window size.
+ Returns:
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
+ (Hp, Wp): padded height and width before partition
+ """
+ B, H, W, C = x.shape
+
+ # Amount needed to round H and W up to the next multiple of window_size
+ # (0 when already a multiple).
+ pad_h = (window_size - H % window_size) % window_size
+ pad_w = (window_size - W % window_size) % window_size
+ if pad_h > 0 or pad_w > 0:
+ # F.pad takes pairs starting from the last dim: (C: 0, 0), (W: 0, pad_w),
+ # (H: 0, pad_h) -- i.e. pad only at the end of W and H.
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+ Hp, Wp = H + pad_h, W + pad_w
+
+ # Split H and W into (num_windows, window_size) pairs, then bring the two
+ # window-count axes together and fold them into the batch dimension.
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+ windows = x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)
+ return windows, (Hp, Wp)
+
+
+def window_unpartition(windows, window_size, pad_hw, hw):
+ # NOTE(review): the docstring's first Args entry is named `x`, but the
+ # parameter it describes is `windows`.
+ """
+ Window unpartition into original sequences and removing padding.
+ Args:
+ x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+ window_size (int): window size.
+ pad_hw (Tuple): padded height and width (Hp, Wp).
+ hw (Tuple): original height and width (H, W) before padding.
+ Returns:
+ x: unpartitioned sequences with [B, H, W, C].
+ """
+ Hp, Wp = pad_hw
+ H, W = hw
+ # Recover the batch size: total windows divided by windows per image.
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+ x = windows.reshape(
+ B, Hp // window_size, Wp // window_size, window_size, window_size, -1
+ )
+ # Interleave window-count and within-window axes back into (Hp, Wp).
+ x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, Hp, Wp, -1)
+
+ # Crop away the padding added at the end of H and W, if any.
+ if Hp > H or Wp > W:
+ x = x[:, :H, :W, :]
+ return x
+
+
+class PatchEmbed(nn.Module):
+ """
+ Image to Patch Embedding.
+ """
+
+ def __init__(
+ self,
+ kernel_size: Tuple[int, ...] = (7, 7),
+ stride: Tuple[int, ...] = (4, 4),
+ padding: Tuple[int, ...] = (3, 3),
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ ):
+ """
+ Args:
+ kernel_size (Tuple): kernel size of the projection layer.
+ stride (Tuple): stride of the projection layer.
+ padding (Tuple): padding size of the projection layer.
+ in_chans (int): Number of input image channels.
+ embed_dim (int): embed_dim (int): Patch embedding dimension.
+ """
+ super().__init__()
+ # The embedding is a single strided convolution.
+ self.proj = nn.Conv2d(
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # Applies the projection conv and returns the result in channels-last
+ # layout (B, H, W, C).
+ x = self.proj(x)
+ # B C H W -> B H W C
+ x = x.permute(0, 2, 3, 1)
+ return x
diff --git a/sam2_repo/sam2/modeling/memory_attention.py b/sam2_repo/sam2/modeling/memory_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b07f9d87e3d8194ca5e11fc20f01604d591a59d
--- /dev/null
+++ b/sam2_repo/sam2/modeling/memory_attention.py
@@ -0,0 +1,169 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional
+
+import torch
+from torch import nn, Tensor
+
+from sam2.modeling.sam.transformer import RoPEAttention
+
+from sam2.modeling.sam2_utils import get_activation_fn, get_clones
+
+
+# One pre-norm layer: self-attention on `tgt`, cross-attention from `tgt` to
+# `memory`, then a two-layer feed-forward block; each step is added back to
+# `tgt` through its own dropout.
+class MemoryAttentionLayer(nn.Module):
+
+ def __init__(
+ self,
+ activation: str,
+ cross_attention: nn.Module,
+ d_model: int,
+ dim_feedforward: int,
+ dropout: float,
+ pos_enc_at_attn: bool,
+ pos_enc_at_cross_attn_keys: bool,
+ pos_enc_at_cross_attn_queries: bool,
+ self_attention: nn.Module,
+ ):
+ # activation: name resolved to a callable via get_activation_fn.
+ # cross_attention / self_attention: attention modules called as
+ # module(q, k, v=...) / module(q=..., k=..., v=...).
+ # d_model / dim_feedforward: feature size and hidden size of the MLP.
+ # dropout: rate used for all dropout modules in this layer.
+ # pos_enc_at_*: whether the positional encoding is added at that input.
+ super().__init__()
+ self.d_model = d_model
+ self.dim_feedforward = dim_feedforward
+ self.dropout_value = dropout
+ self.self_attn = self_attention
+ self.cross_attn_image = cross_attention
+
+ # Implementation of Feedforward model
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
+ self.dropout = nn.Dropout(dropout)
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+ self.norm1 = nn.LayerNorm(d_model)
+ self.norm2 = nn.LayerNorm(d_model)
+ self.norm3 = nn.LayerNorm(d_model)
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(dropout)
+ self.dropout3 = nn.Dropout(dropout)
+
+ self.activation_str = activation
+ self.activation = get_activation_fn(activation)
+
+ # Where to add pos enc
+ self.pos_enc_at_attn = pos_enc_at_attn
+ self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
+ self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
+
+ def _forward_sa(self, tgt, query_pos):
+ # Self-Attention
+ tgt2 = self.norm1(tgt)
+ # Queries and keys optionally carry the positional encoding; values never do.
+ q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
+ tgt2 = self.self_attn(q, k, v=tgt2)
+ tgt = tgt + self.dropout1(tgt2)
+ return tgt
+
+ def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
+ # `num_k_exclude_rope` is only forwarded when positive, and then the
+ # cross-attention module must be a RoPEAttention.
+ kwds = {}
+ if num_k_exclude_rope > 0:
+ assert isinstance(self.cross_attn_image, RoPEAttention)
+ kwds = {"num_k_exclude_rope": num_k_exclude_rope}
+
+ # Cross-Attention
+ tgt2 = self.norm2(tgt)
+ tgt2 = self.cross_attn_image(
+ q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
+ k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
+ v=memory,
+ **kwds,
+ )
+ tgt = tgt + self.dropout2(tgt2)
+ return tgt
+
+ def forward(
+ self,
+ tgt,
+ memory,
+ pos: Optional[Tensor] = None,
+ query_pos: Optional[Tensor] = None,
+ num_k_exclude_rope: int = 0,
+ ) -> torch.Tensor:
+ # `pos` / `query_pos` may be None only if the corresponding pos_enc_at_*
+ # flag is False; otherwise they are added to tensors in the helpers above.
+
+ # Self-Attn, Cross-Attn
+ tgt = self._forward_sa(tgt, query_pos)
+ tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
+ # MLP
+ tgt2 = self.norm3(tgt)
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
+ tgt = tgt + self.dropout3(tgt2)
+ return tgt
+
+
+class MemoryAttention(nn.Module):
+ def __init__(
+ self,
+ d_model: int,
+ pos_enc_at_input: bool,
+ layer: nn.Module,
+ num_layers: int,
+ batch_first: bool = True, # Do layers expect batch first input?
+ ):
+ super().__init__()
+ self.d_model = d_model
+ self.layers = get_clones(layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = nn.LayerNorm(d_model)
+ self.pos_enc_at_input = pos_enc_at_input
+ self.batch_first = batch_first
+
+ def forward(
+ self,
+ curr: torch.Tensor, # self-attention inputs
+ memory: torch.Tensor, # cross-attention inputs
+ curr_pos: Optional[Tensor] = None, # pos_enc for self-attention inputs
+ memory_pos: Optional[Tensor] = None, # pos_enc for cross-attention inputs
+ num_obj_ptr_tokens: int = 0, # number of object pointer *tokens*
+ ):
+ if isinstance(curr, list):
+ assert isinstance(curr_pos, list)
+ assert len(curr) == len(curr_pos) == 1
+ curr, curr_pos = (
+ curr[0],
+ curr_pos[0],
+ )
+
+ assert (
+ curr.shape[1] == memory.shape[1]
+ ), "Batch size must be the same for curr and memory"
+
+ output = curr
+ if self.pos_enc_at_input and curr_pos is not None:
+ output = output + 0.1 * curr_pos
+
+ if self.batch_first:
+ # Convert to batch first
+ output = output.transpose(0, 1)
+ curr_pos = curr_pos.transpose(0, 1)
+ memory = memory.transpose(0, 1)
+ memory_pos = memory_pos.transpose(0, 1)
+
+ for layer in self.layers:
+ kwds = {}
+ if isinstance(layer.cross_attn_image, RoPEAttention):
+ kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}
+
+ output = layer(
+ tgt=output,
+ memory=memory,
+ pos=memory_pos,
+ query_pos=curr_pos,
+ **kwds,
+ )
+ normed_output = self.norm(output)
+
+ if self.batch_first:
+ # Convert back to seq first
+ normed_output = normed_output.transpose(0, 1)
+ curr_pos = curr_pos.transpose(0, 1)
+
+ return normed_output
diff --git a/sam2_repo/sam2/modeling/memory_encoder.py b/sam2_repo/sam2/modeling/memory_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f60202dfaba87232c3870fb2101b5322a119d985
--- /dev/null
+++ b/sam2_repo/sam2/modeling/memory_encoder.py
@@ -0,0 +1,181 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from sam2.modeling.sam2_utils import DropPath, get_clones, LayerNorm2d
+
+
+class MaskDownSampler(nn.Module):
+    """
+    Downsample a single-channel mask by `total_stride` in steps of `stride`.
+
+    Every step is Conv2d(stride=stride) -> LayerNorm2d -> activation and multiplies
+    the channel count by stride**2. A final 1x1 convolution projects the result to
+    `embed_dim` channels.
+    """
+
+    def __init__(
+        self,
+        embed_dim=256,
+        kernel_size=4,
+        stride=4,
+        padding=0,
+        total_stride=16,
+        activation=nn.GELU,
+    ):
+        super().__init__()
+        # total_stride must be an exact integer power of stride.
+        num_layers = int(math.log2(total_stride) // math.log2(stride))
+        assert stride**num_layers == total_stride
+
+        channel_growth = stride**2
+        stages = []
+        in_chans = 1  # the incoming mask has a single channel
+        for _ in range(num_layers):
+            out_chans = in_chans * channel_growth
+            stages.extend(
+                [
+                    nn.Conv2d(
+                        in_chans,
+                        out_chans,
+                        kernel_size=kernel_size,
+                        stride=stride,
+                        padding=padding,
+                    ),
+                    LayerNorm2d(out_chans),
+                    activation(),
+                ]
+            )
+            in_chans = out_chans
+
+        # Final projection to the embedding width.
+        stages.append(nn.Conv2d(in_chans, embed_dim, kernel_size=1))
+        self.encoder = nn.Sequential(*stages)
+
+    def forward(self, x):
+        """Apply the downsampling stack to `x` and return its output."""
+        return self.encoder(x)
+
+
+# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt)
+class CXBlock(nn.Module):
+    r"""ConvNeXt-style residual block.
+
+    Data flow as implemented below:
+    Conv2d (depthwise when `use_dwconv`) -> LayerNorm2d on (N, C, H, W) ->
+    permute to (N, H, W, C) -> Linear(dim, 4 * dim) -> GELU -> Linear(4 * dim, dim) ->
+    optional per-channel scale `gamma` -> permute back to (N, C, H, W) ->
+    drop path -> add the block input.
+
+    Args:
+        dim (int): Number of input (and output) channels.
+        kernel_size (int): Kernel size of the first convolution. Default: 7
+        padding (int): Padding of the first convolution. Default: 3
+        drop_path (float): Stochastic depth rate; values <= 0 disable it. Default: 0.0
+        layer_scale_init_value (float): Init value for the `gamma` layer scale;
+            values <= 0 disable the scale. Default: 1e-6.
+        use_dwconv (bool): Use `groups=dim` (depthwise) in the first convolution.
+    """
+
+    def __init__(
+        self,
+        dim,
+        kernel_size=7,
+        padding=3,
+        drop_path=0.0,
+        layer_scale_init_value=1e-6,
+        use_dwconv=True,
+    ):
+        super().__init__()
+        conv_groups = dim if use_dwconv else 1
+        self.dwconv = nn.Conv2d(
+            dim, dim, kernel_size=kernel_size, padding=padding, groups=conv_groups
+        )
+        self.norm = LayerNorm2d(dim, eps=1e-6)
+        # The pointwise (1x1) convolutions are realised as Linear layers on
+        # channels-last data.
+        self.pwconv1 = nn.Linear(dim, 4 * dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(4 * dim, dim)
+        if layer_scale_init_value > 0:
+            self.gamma = nn.Parameter(
+                layer_scale_init_value * torch.ones(dim), requires_grad=True
+            )
+        else:
+            self.gamma = None
+        if drop_path > 0.0:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+    def forward(self, x):
+        """Return `x` plus the (optionally scaled and dropped) block branch."""
+        residual = x
+        branch = self.norm(self.dwconv(x))
+        branch = branch.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
+        branch = self.pwconv2(self.act(self.pwconv1(branch)))
+        if self.gamma is not None:
+            branch = self.gamma * branch
+        branch = branch.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+        return residual + self.drop_path(branch)
+
+
+class Fuser(nn.Module):
+    """
+    Optional 1x1 input projection followed by a sequence of layers.
+
+    The layer sequence is whatever `get_clones(layer, num_layers)` returns; the
+    projection is an identity unless `input_projection` is set, in which case
+    `dim` is required and a Conv2d(dim, dim, kernel_size=1) is used.
+    """
+
+    def __init__(self, layer, num_layers, dim=None, input_projection=False):
+        super().__init__()
+        self.proj = nn.Identity()
+        self.layers = get_clones(layer, num_layers)
+        if not input_projection:
+            return
+
+        assert dim is not None
+        self.proj = nn.Conv2d(dim, dim, kernel_size=1)
+
+    def forward(self, x):
+        """Project `x` (normally (N, C, H, W)) and feed it through each layer in order."""
+        out = self.proj(x)
+        for block in self.layers:
+            out = block(out)
+        return out
+
+
+class MemoryEncoder(nn.Module):
+    """
+    Fuse pixel features with a downsampled mask and attach a positional encoding.
+
+    Arguments:
+      out_dim: number of output channels. When it differs from `in_dim`, a 1x1
+        convolution maps `in_dim` -> `out_dim`; otherwise the output projection is
+        an identity.
+      mask_downsampler: module applied to the (optionally sigmoid-ed) masks.
+      fuser: module applied to the sum of projected pixel features and masks.
+      position_encoding: module called on the final features to produce their
+        positional encoding.
+      in_dim: number of channels of `pix_feat`.
+    """
+
+    def __init__(
+        self,
+        out_dim,
+        mask_downsampler,
+        fuser,
+        position_encoding,
+        in_dim=256,  # in_dim of pix_feats
+    ):
+        super().__init__()
+
+        self.mask_downsampler = mask_downsampler
+
+        self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
+        self.fuser = fuser
+        self.position_encoding = position_encoding
+        self.out_proj = nn.Identity()
+        if out_dim != in_dim:
+            self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)
+
+    def forward(
+        self,
+        pix_feat: torch.Tensor,
+        masks: torch.Tensor,
+        skip_mask_sigmoid: bool = False,
+    ) -> dict:
+        """
+        Encode `pix_feat` together with `masks`.
+
+        Arguments:
+          pix_feat: pixel features with `in_dim` channels.
+          masks: mask input for `mask_downsampler`. A sigmoid is applied first
+            unless `skip_mask_sigmoid` is set.
+          skip_mask_sigmoid: pass the masks to the downsampler unchanged.
+
+        Returns:
+          dict with two entries:
+            "vision_features": the fused, output-projected features.
+            "vision_pos_enc": a one-element list holding the positional encoding
+              of those features, cast to the features' dtype.
+        """
+        ## Process masks
+        # sigmoid, so that less domain shift from gt masks which are bool
+        if not skip_mask_sigmoid:
+            masks = torch.sigmoid(masks)
+        masks = self.mask_downsampler(masks)
+
+        ## Fuse pix_feats and downsampled masks
+        # The pixel features may live on another device; move them next to the masks.
+        pix_feat = pix_feat.to(masks.device)
+
+        x = self.pix_feat_proj(pix_feat)
+        x = x + masks
+        x = self.fuser(x)
+        x = self.out_proj(x)
+
+        pos = self.position_encoding(x).to(x.dtype)
+
+        return {"vision_features": x, "vision_pos_enc": [pos]}
diff --git a/sam2_repo/sam2/modeling/position_encoding.py b/sam2_repo/sam2/modeling/position_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..2241d4cf1a4495b4c67dc35cbed1c606357b9b7a
--- /dev/null
+++ b/sam2_repo/sam2/modeling/position_encoding.py
@@ -0,0 +1,239 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Optional, Tuple
+
+import numpy as np
+
+import torch
+from torch import nn
+
+
+class PositionEmbeddingSine(nn.Module):
+    """
+    Sine/cosine positional embedding for 2-D grids, plus helpers that encode
+    individual points and boxes with the same frequencies.
+
+    Half of `num_pos_feats` channels encode the y position and half the x position.
+    Grid encodings are memoised per (H, W) in `self.cache`.
+    """
+
+    def __init__(
+        self,
+        num_pos_feats,
+        temperature: int = 10000,
+        normalize: bool = True,
+        scale: Optional[float] = None,
+        # Following settings only relevant
+        # for warmping up cache for compilation
+        warmup_cache: bool = True,
+        image_size: int = 1024,
+        strides: Tuple[int] = (4, 8, 16, 32),
+    ):
+        super().__init__()
+        assert num_pos_feats % 2 == 0, "Expecting even model width"
+        self.num_pos_feats = num_pos_feats // 2  # channels per axis
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        self.scale = 2 * math.pi if scale is None else scale
+
+        self.cache = {}
+        if not (warmup_cache and torch.cuda.is_available()):
+            return
+
+        # Pre-populate the cache on CUDA for the expected feature-map sizes.
+        cuda_device = torch.device("cuda")
+        for stride in strides:
+            side = image_size // stride
+            self._pe(1, cuda_device, side, side)
+
+    def _dim_t(self, device):
+        """Per-channel divisor: temperature ** (2 * (i // 2) / num_pos_feats)."""
+        idx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=device)
+        return self.temperature ** (2 * (idx // 2) / self.num_pos_feats)
+
+    @staticmethod
+    def _sin_cos(angles):
+        """Interleave sin of the even channels with cos of the odd channels (last dim)."""
+        return torch.stack(
+            (angles[..., 0::2].sin(), angles[..., 1::2].cos()), dim=-1
+        ).flatten(-2)
+
+    def _encode_xy(self, x, y):
+        """Encode 1-D tensors of (already normalized) x and y positions."""
+        # The positions are expected to be normalized
+        assert len(x) == len(y) and x.ndim == y.ndim == 1
+        scaled_x = x * self.scale
+        scaled_y = y * self.scale
+
+        divisor = self._dim_t(x.device)
+        pos_x = self._sin_cos(scaled_x[:, None] / divisor)
+        pos_y = self._sin_cos(scaled_y[:, None] / divisor)
+        return pos_x, pos_y
+
+    @torch.no_grad()
+    def encode_boxes(self, x, y, w, h):
+        """Return [pos_y, pos_x, h, w] concatenated along dim 1 for each box."""
+        pos_x, pos_y = self._encode_xy(x, y)
+        return torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
+
+    encode = encode_boxes  # Backwards compatibility
+
+    @torch.no_grad()
+    def encode_points(self, x, y, labels):
+        """Return [pos_y, pos_x, label] along the last dim for (batch, n) shaped inputs."""
+        (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
+        assert bx == by and nx == ny and bx == bl and nx == nl
+        flat_x, flat_y = self._encode_xy(x.flatten(), y.flatten())
+        pos_x = flat_x.reshape(bx, nx, -1)
+        pos_y = flat_y.reshape(by, ny, -1)
+        return torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)
+
+    @torch.no_grad()
+    def _pe(self, B, device, *cache_key):
+        """Build (or fetch from cache) the (B, C, H, W) grid encoding for cache_key=(H, W)."""
+        H, W = cache_key
+        if cache_key in self.cache:
+            # Cached entries hold a single sample; replicate it across the batch.
+            return self.cache[cache_key].to(device)[None].repeat(B, 1, 1, 1)
+
+        # 1-based row / column indices broadcast over the whole grid.
+        rows = torch.arange(1, H + 1, dtype=torch.float32, device=device)
+        cols = torch.arange(1, W + 1, dtype=torch.float32, device=device)
+        y_embed = rows.view(1, -1, 1).repeat(B, 1, W)
+        x_embed = cols.view(1, 1, -1).repeat(B, H, 1)
+
+        if self.normalize:
+            # Divide by the last row / column index, then stretch to `scale`.
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        divisor = self._dim_t(device)
+        pos_x = self._sin_cos(x_embed[:, :, :, None] / divisor)
+        pos_y = self._sin_cos(y_embed[:, :, :, None] / divisor)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        self.cache[cache_key] = pos[0]
+        return pos
+
+    @torch.no_grad()
+    def forward(self, x: torch.Tensor):
+        """Return the grid encoding matching `x`'s batch size and last two dims."""
+        batch = x.shape[0]
+        height, width = x.shape[-2], x.shape[-1]
+        return self._pe(batch, x.device, height, width)
+
+
+class PositionEmbeddingRandom(nn.Module):
+    """
+    Positional encoding using random spatial frequencies.
+
+    A (2, num_pos_feats) Gaussian matrix is drawn once at construction and stored
+    as the buffer `positional_encoding_gaussian_matrix`; encodings are the sine and
+    cosine of the coordinates projected through that matrix.
+    """
+
+    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+        """
+        Arguments:
+          num_pos_feats: number of random frequencies; encodings have twice as
+            many channels (sin and cos).
+          scale: multiplier for the random matrix; None or non-positive means 1.0.
+        """
+        super().__init__()
+        if scale is None or scale <= 0.0:
+            scale = 1.0
+        self.register_buffer(
+            "positional_encoding_gaussian_matrix",
+            scale * torch.randn((2, num_pos_feats)),
+        )
+
+    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+        """Positionally encode points that are normalized to [0,1]."""
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        coords = 2 * coords - 1
+        coords = coords @ self.positional_encoding_gaussian_matrix
+        coords = 2 * np.pi * coords
+        # outputs d_1 x ... x d_n x C shape
+        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
+
+    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+        """Generate positional encoding for a grid of the specified size."""
+        h, w = size
+        device: Any = self.positional_encoding_gaussian_matrix.device
+        grid = torch.ones((h, w), device=device, dtype=torch.float32)
+        # Cell centres: cumulative index minus one half, normalized by the grid size.
+        y_embed = grid.cumsum(dim=0) - 0.5
+        x_embed = grid.cumsum(dim=1) - 0.5
+        y_embed = y_embed / h
+        x_embed = x_embed / w
+
+        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
+        return pe.permute(2, 0, 1)  # C x H x W
+
+    def forward_with_coords(
+        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+    ) -> torch.Tensor:
+        """
+        Positionally encode points that are not normalized to [0,1].
+
+        Arguments:
+          coords_input: B x N x 2 tensor of (x, y) coordinates; it is not modified.
+          image_size: (H, W) used to normalize y and x respectively.
+
+        Returns:
+          B x N x C encoding.
+        """
+        coords = coords_input.clone()
+        # Normalizing in place on an integer tensor would truncate the quotient
+        # (mostly to 0) before the float cast below, so promote integers first.
+        # Floating-point inputs keep their dtype for the division, as before.
+        if not coords.is_floating_point():
+            coords = coords.to(torch.float)
+        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
+        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
+        return self._pe_encoding(coords.to(torch.float))  # B x N x C
+
+
+# Rotary Positional Encoding, adapted from:
+# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
+# 2. https://github.com/naver-ai/rope-vit
+# 3. https://github.com/lucidrains/rotary-embedding-torch
+
+
+def init_t_xy(end_x: int, end_y: int):
+    """
+    Return the x and y grid coordinate of every cell of an end_x-wide, end_y-high
+    grid flattened in row-major order, as two float32 tensors of length end_x * end_y.
+    """
+    flat_index = torch.arange(end_x * end_y, dtype=torch.float32)
+    col = (flat_index % end_x).float()
+    row = torch.div(flat_index, end_x, rounding_mode="floor").float()
+    return col, row
+
+
+def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
+    """
+    Build complex rotary factors for an end_x by end_y grid.
+
+    The same dim // 4 frequencies, 1 / theta ** (k / dim) for k = 0, 4, 8, ..., are
+    used for both axes. Each grid cell gets unit-magnitude complex numbers whose
+    angles are coordinate * frequency; the x factors and the y factors are
+    concatenated along the last dim.
+    """
+    exponents = torch.arange(0, dim, 4)[: (dim // 4)].float() / dim
+    base_freqs = 1.0 / (theta**exponents)
+
+    t_x, t_y = init_t_xy(end_x, end_y)
+    angles_x = torch.outer(t_x, base_freqs)
+    angles_y = torch.outer(t_y, base_freqs)
+    cis_x = torch.polar(torch.ones_like(angles_x), angles_x)
+    cis_y = torch.polar(torch.ones_like(angles_y), angles_y)
+    return torch.cat([cis_x, cis_y], dim=-1)
+
+
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+    """
+    View `freqs_cis` with singleton leading dims so it broadcasts against `x`.
+
+    `freqs_cis` must have exactly the shape of `x`'s last two dims; the result has
+    `x.ndim` dims with size 1 everywhere except those last two.
+    """
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+    trailing = (x.shape[-2], x.shape[-1])
+    assert freqs_cis.shape == trailing
+    target_shape = [1] * (ndim - 2) + list(trailing)
+    return freqs_cis.view(*target_shape)
+
+
+def apply_rotary_enc(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+    repeat_freqs_k: bool = False,
+):
+    """
+    Rotate queries and keys by the complex factors in `freqs_cis`.
+
+    Consecutive pairs along the last dim of `xq` / `xk` are treated as complex
+    numbers and multiplied by `freqs_cis`. When `xk` has size 0 along dim -2 it is
+    returned untouched. With `repeat_freqs_k`, the factors are tiled along the
+    sequence dim so they cover a key sequence that is an integer multiple of the
+    query sequence.
+
+    Returns:
+      (rotated xq, rotated xk), each with the dtype and device of its input.
+    """
+
+    def _as_complex(t):
+        # Pair up the last dim and reinterpret every pair as one complex value.
+        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))
+
+    xq_ = _as_complex(xq)
+    has_keys = xk.shape[-2] != 0
+    xk_ = _as_complex(xk) if has_keys else None
+
+    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xq_out = xq_out.type_as(xq).to(xq.device)
+    if xk_ is None:
+        # no keys to rotate, due to dropout
+        return xq_out, xk
+
+    # repeat freqs along seq_len dim to match k seq_len
+    if repeat_freqs_k:
+        r = xk_.shape[-2] // xq_.shape[-2]
+        if freqs_cis.is_cuda:
+            tile = [1] * (freqs_cis.ndim - 2) + [r, 1]
+            freqs_cis = freqs_cis.repeat(*tile)
+        else:
+            # torch.repeat on complex numbers may not be supported on non-CUDA devices
+            # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
+            freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
+
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out, xk_out.type_as(xk).to(xk.device)
diff --git a/sam2_repo/sam2/modeling/sam/__init__.py b/sam2_repo/sam2/modeling/sam/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/sam2_repo/sam2/modeling/sam/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/sam2_repo/sam2/modeling/sam/__pycache__/__init__.cpython-313.pyc b/sam2_repo/sam2/modeling/sam/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1d298a625e5358d30dbc18880b8b15333e06724
Binary files /dev/null and b/sam2_repo/sam2/modeling/sam/__pycache__/__init__.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/sam/__pycache__/mask_decoder.cpython-313.pyc b/sam2_repo/sam2/modeling/sam/__pycache__/mask_decoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2aacd9d52efd5d1c67b45e8c8eec4b3a019b863
Binary files /dev/null and b/sam2_repo/sam2/modeling/sam/__pycache__/mask_decoder.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/sam/__pycache__/prompt_encoder.cpython-313.pyc b/sam2_repo/sam2/modeling/sam/__pycache__/prompt_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53ee2e2ad292c950349a7e077a34e0477415897f
Binary files /dev/null and b/sam2_repo/sam2/modeling/sam/__pycache__/prompt_encoder.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/sam/__pycache__/transformer.cpython-313.pyc b/sam2_repo/sam2/modeling/sam/__pycache__/transformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9d532bcf1305cc59b6aca7dd99820ee7c560b7f
Binary files /dev/null and b/sam2_repo/sam2/modeling/sam/__pycache__/transformer.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/modeling/sam/mask_decoder.py b/sam2_repo/sam2/modeling/sam/mask_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bebc0366b2703ffcb80a44bfd19cce8339b4fed
--- /dev/null
+++ b/sam2_repo/sam2/modeling/sam/mask_decoder.py
@@ -0,0 +1,295 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional, Tuple, Type
+
+import torch
+from torch import nn
+
+from sam2.modeling.sam2_utils import LayerNorm2d, MLP
+
+
+class MaskDecoder(nn.Module):
+    """
+    Predict masks, mask-quality (IoU) scores, output mask tokens and object-score
+    logits from image and prompt embeddings, using a transformer.
+    """
+
+    def __init__(
+        self,
+        *,
+        transformer_dim: int,
+        transformer: nn.Module,
+        num_multimask_outputs: int = 3,
+        activation: Type[nn.Module] = nn.GELU,
+        iou_head_depth: int = 3,
+        iou_head_hidden_dim: int = 256,
+        use_high_res_features: bool = False,
+        iou_prediction_use_sigmoid=False,
+        dynamic_multimask_via_stability=False,
+        dynamic_multimask_stability_delta=0.05,
+        dynamic_multimask_stability_thresh=0.98,
+        pred_obj_scores: bool = False,
+        pred_obj_scores_mlp: bool = False,
+        use_multimask_token_for_obj_ptr: bool = False,
+    ) -> None:
+        """
+        Predicts masks given an image and prompt embeddings, using a
+        transformer architecture.
+
+        Arguments:
+          transformer_dim (int): the channel dimension of the transformer
+          transformer (nn.Module): the transformer used to predict masks
+          num_multimask_outputs (int): the number of masks to predict
+            when disambiguating masks; one extra token is always added for
+            the single-mask output
+          activation (nn.Module): the type of activation to use when
+            upscaling masks
+          iou_head_depth (int): the depth of the MLP used to predict
+            mask quality
+          iou_head_hidden_dim (int): the hidden dimension of the MLP
+            used to predict mask quality
+          use_high_res_features (bool): create the `conv_s0` / `conv_s1` 1x1
+            convolutions and add the `high_res_features` passed to `forward`
+            into the upscaling path
+          iou_prediction_use_sigmoid (bool): forwarded as `sigmoid_output`
+            to the IoU prediction MLP
+          dynamic_multimask_via_stability (bool): outside training, when a
+            single mask is requested, fall back to the best multimask output
+            if the single-mask output has a low stability score
+          dynamic_multimask_stability_delta (float): logit margin used to
+            compute the stability score
+          dynamic_multimask_stability_thresh (float): minimum stability
+            score for keeping the single-mask output
+          pred_obj_scores (bool): add an object-score token and prediction head
+          pred_obj_scores_mlp (bool): use a 3-layer MLP instead of a linear
+            layer for the object-score head
+          use_multimask_token_for_obj_ptr (bool): with multimask output,
+            return the multimask tokens instead of the single-mask token
+        """
+        super().__init__()
+        self.transformer_dim = transformer_dim
+        self.transformer = transformer
+
+        self.num_multimask_outputs = num_multimask_outputs
+
+        self.iou_token = nn.Embedding(1, transformer_dim)
+        self.num_mask_tokens = num_multimask_outputs + 1
+        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
+
+        self.pred_obj_scores = pred_obj_scores
+        if self.pred_obj_scores:
+            self.obj_score_token = nn.Embedding(1, transformer_dim)
+        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
+
+        # Two stride-2 transposed convolutions: 4x spatial upscaling overall.
+        self.output_upscaling = nn.Sequential(
+            nn.ConvTranspose2d(
+                transformer_dim, transformer_dim // 4, kernel_size=2, stride=2
+            ),
+            LayerNorm2d(transformer_dim // 4),
+            activation(),
+            nn.ConvTranspose2d(
+                transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2
+            ),
+            activation(),
+        )
+        self.use_high_res_features = use_high_res_features
+        if use_high_res_features:
+            self.conv_s0 = nn.Conv2d(
+                transformer_dim, transformer_dim // 8, kernel_size=1, stride=1
+            )
+            self.conv_s1 = nn.Conv2d(
+                transformer_dim, transformer_dim // 4, kernel_size=1, stride=1
+            )
+
+        # One hypernetwork MLP per mask token.
+        self.output_hypernetworks_mlps = nn.ModuleList(
+            [
+                MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
+                for _ in range(self.num_mask_tokens)
+            ]
+        )
+
+        self.iou_prediction_head = MLP(
+            transformer_dim,
+            iou_head_hidden_dim,
+            self.num_mask_tokens,
+            iou_head_depth,
+            sigmoid_output=iou_prediction_use_sigmoid,
+        )
+        if self.pred_obj_scores:
+            self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
+            if pred_obj_scores_mlp:
+                self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)
+
+        # When outputting a single mask, optionally we can dynamically fall back to the best
+        # multimask output token if the single mask output token gives low stability scores.
+        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
+        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
+        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh
+
+    def forward(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+        dense_prompt_embeddings: torch.Tensor,
+        multimask_output: bool,
+        repeat_image: bool,
+        high_res_features: Optional[List[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Predict masks given image and prompt embeddings.
+
+        Arguments:
+          image_embeddings (torch.Tensor): the embeddings from the image encoder
+          image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
+          sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
+          dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
+          multimask_output (bool): Whether to return multiple masks or a single
+            mask.
+          repeat_image (bool): repeat `image_embeddings` along the batch dim once
+            per prompt; otherwise its batch size must already match the prompts.
+          high_res_features (list of torch.Tensor or None): the two feature maps
+            (feat_s0, feat_s1) added during upscaling; read only when the decoder
+            was built with `use_high_res_features=True`.
+
+        Returns:
+          torch.Tensor: batched predicted masks
+          torch.Tensor: batched predictions of mask quality
+          torch.Tensor: batched SAM token for mask output
+          torch.Tensor: batched object score logits
+        """
+        masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks(
+            image_embeddings=image_embeddings,
+            image_pe=image_pe,
+            sparse_prompt_embeddings=sparse_prompt_embeddings,
+            dense_prompt_embeddings=dense_prompt_embeddings,
+            repeat_image=repeat_image,
+            high_res_features=high_res_features,
+        )
+
+        # Select the correct mask or masks for output
+        if multimask_output:
+            masks = masks[:, 1:, :, :]
+            iou_pred = iou_pred[:, 1:]
+        elif self.dynamic_multimask_via_stability and not self.training:
+            masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred)
+        else:
+            masks = masks[:, 0:1, :, :]
+            iou_pred = iou_pred[:, 0:1]
+
+        if multimask_output and self.use_multimask_token_for_obj_ptr:
+            sam_tokens_out = mask_tokens_out[:, 1:]  # [b, 3, c] shape
+        else:
+            # Take the mask output token. Here we *always* use the token for single mask output.
+            # At test time, even if we track after 1-click (and using multimask_output=True),
+            # we still take the single mask token here. The rationale is that we always track
+            # after multiple clicks during training, so the past tokens seen during training
+            # are always the single mask token (and we'll let it be the object-memory token).
+            sam_tokens_out = mask_tokens_out[:, 0:1]  # [b, 1, c] shape
+
+        # Prepare output
+        return masks, iou_pred, sam_tokens_out, object_score_logits
+
+    def predict_masks(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+        dense_prompt_embeddings: torch.Tensor,
+        repeat_image: bool,
+        high_res_features: Optional[List[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Predicts masks for every mask token. See 'forward' for the arguments.
+
+        Returns:
+          torch.Tensor: masks for all mask tokens (single-mask token first)
+          torch.Tensor: predicted mask quality for all mask tokens
+          torch.Tensor: the transformer outputs at the mask-token positions
+          torch.Tensor: object score logits (a constant 10.0 when the decoder
+            was built without `pred_obj_scores`)
+        """
+        # Concatenate output tokens
+        s = 0  # index of the IoU token; shifts by one when an object-score token leads
+        if self.pred_obj_scores:
+            output_tokens = torch.cat(
+                [
+                    self.obj_score_token.weight,
+                    self.iou_token.weight,
+                    self.mask_tokens.weight,
+                ],
+                dim=0,
+            )
+            s = 1
+        else:
+            output_tokens = torch.cat(
+                [self.iou_token.weight, self.mask_tokens.weight], dim=0
+            )
+        output_tokens = output_tokens.unsqueeze(0).expand(
+            sparse_prompt_embeddings.size(0), -1, -1
+        )
+        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
+
+        # Expand per-image data in batch direction to be per-mask
+        if repeat_image:
+            src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
+        else:
+            assert image_embeddings.shape[0] == tokens.shape[0]
+            src = image_embeddings
+        src = src + dense_prompt_embeddings
+        assert (
+            image_pe.size(0) == 1
+        ), "image_pe should have size 1 in batch dim (from `get_dense_pe()`)"
+        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
+        b, c, h, w = src.shape
+
+        # Run the transformer
+        hs, src = self.transformer(src, pos_src, tokens)
+        iou_token_out = hs[:, s, :]
+        mask_tokens_out = hs[:, s + 1 : (s + 1 + self.num_mask_tokens), :]
+
+        # Upscale mask embeddings and predict masks using the mask tokens
+        src = src.transpose(1, 2).view(b, c, h, w)
+        if not self.use_high_res_features:
+            upscaled_embedding = self.output_upscaling(src)
+        else:
+            # Run the upscaling stack by hand so the high-resolution features can
+            # be added after each transposed convolution.
+            dc1, ln1, act1, dc2, act2 = self.output_upscaling
+            feat_s0, feat_s1 = high_res_features
+            upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
+            upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
+
+        hyper_in_list: List[torch.Tensor] = []
+        for i in range(self.num_mask_tokens):
+            hyper_in_list.append(
+                self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])
+            )
+        hyper_in = torch.stack(hyper_in_list, dim=1)
+        b, c, h, w = upscaled_embedding.shape
+        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
+
+        # Generate mask quality predictions
+        iou_pred = self.iou_prediction_head(iou_token_out)
+        if self.pred_obj_scores:
+            assert s == 1
+            object_score_logits = self.pred_obj_score_head(hs[:, 0, :])
+        else:
+            # Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1
+            object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1)
+
+        return masks, iou_pred, mask_tokens_out, object_score_logits
+
+    def _get_stability_scores(self, mask_logits):
+        """
+        Compute stability scores of the mask logits based on the IoU between upper and
+        lower thresholds.
+
+        The score is (#logits > +delta) / (#logits > -delta) over the last two dims,
+        and 1.0 where the denominator is zero.
+        """
+        mask_logits = mask_logits.flatten(-2)
+        stability_delta = self.dynamic_multimask_stability_delta
+        area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
+        area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float()
+        stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0)
+        return stability_scores
+
+    def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
+        """
+        When outputting a single mask, if the stability score from the current single-mask
+        output (based on output token 0) falls below a threshold, we instead select from
+        multi-mask outputs (based on output token 1~3) the mask with the highest predicted
+        IoU score. This is intended to ensure a valid mask for both clicking and tracking.
+
+        Returns:
+          (mask logits, IoU scores), each with a singleton mask dim.
+        """
+        # The best mask from multimask output tokens (1~3)
+        multimask_logits = all_mask_logits[:, 1:, :, :]
+        multimask_iou_scores = all_iou_scores[:, 1:]
+        best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
+        batch_inds = torch.arange(
+            multimask_iou_scores.size(0), device=all_iou_scores.device
+        )
+        best_multimask_logits = multimask_logits[batch_inds, best_scores_inds]
+        best_multimask_logits = best_multimask_logits.unsqueeze(1)
+        best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds]
+        best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1)
+
+        # The mask from singlemask output token 0 and its stability score
+        singlemask_logits = all_mask_logits[:, 0:1, :, :]
+        singlemask_iou_scores = all_iou_scores[:, 0:1]
+        stability_scores = self._get_stability_scores(singlemask_logits)
+        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh
+
+        # Dynamically fall back to best multimask output upon low stability scores.
+        mask_logits_out = torch.where(
+            is_stable[..., None, None].expand_as(singlemask_logits),
+            singlemask_logits,
+            best_multimask_logits,
+        )
+        iou_scores_out = torch.where(
+            is_stable.expand_as(singlemask_iou_scores),
+            singlemask_iou_scores,
+            best_multimask_iou_scores,
+        )
+        return mask_logits_out, iou_scores_out
diff --git a/sam2_repo/sam2/modeling/sam/prompt_encoder.py b/sam2_repo/sam2/modeling/sam/prompt_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57876264b51f8c5236867359350e32d590efcb5
--- /dev/null
+++ b/sam2_repo/sam2/modeling/sam/prompt_encoder.py
@@ -0,0 +1,202 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Tuple, Type
+
+import torch
+from torch import nn
+
+from sam2.modeling.position_encoding import PositionEmbeddingRandom
+
+from sam2.modeling.sam2_utils import LayerNorm2d
+
+
+class PromptEncoder(nn.Module):
+    """Turn point, box and mask prompts into sparse and dense embeddings."""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        image_embedding_size: Tuple[int, int],
+        input_image_size: Tuple[int, int],
+        mask_in_chans: int,
+        activation: Type[nn.Module] = nn.GELU,
+    ) -> None:
+        """
+        Encodes prompts for input to SAM's mask decoder.
+
+        Arguments:
+          embed_dim (int): The prompts' embedding dimension
+          image_embedding_size (tuple(int, int)): The spatial size of the
+            image embedding, as (H, W).
+          input_image_size (tuple(int, int)): The padded size of the image as
+            input to the image encoder, as (H, W).
+          mask_in_chans (int): The number of hidden channels used for
+            encoding input masks.
+          activation (nn.Module): The activation to use when encoding
+            input masks.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.input_image_size = input_image_size
+        self.image_embedding_size = image_embedding_size
+        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
+
+        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
+        self.point_embeddings = nn.ModuleList(
+            [nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)]
+        )
+        self.not_a_point_embed = nn.Embedding(1, embed_dim)
+
+        embed_h, embed_w = image_embedding_size[0], image_embedding_size[1]
+        self.mask_input_size = (4 * embed_h, 4 * embed_w)
+        # Two stride-2 convolutions (4x spatial reduction) then a 1x1 projection.
+        quarter_chans = mask_in_chans // 4
+        self.mask_downscaling = nn.Sequential(
+            nn.Conv2d(1, quarter_chans, kernel_size=2, stride=2),
+            LayerNorm2d(quarter_chans),
+            activation(),
+            nn.Conv2d(quarter_chans, mask_in_chans, kernel_size=2, stride=2),
+            LayerNorm2d(mask_in_chans),
+            activation(),
+            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
+        )
+        self.no_mask_embed = nn.Embedding(1, embed_dim)
+
+    def get_dense_pe(self) -> torch.Tensor:
+        """
+        Returns the positional encoding used to encode point prompts,
+        applied to a dense set of points the shape of the image encoding.
+
+        Returns:
+          torch.Tensor: Positional encoding with shape
+            1x(embed_dim)x(embedding_h)x(embedding_w)
+        """
+        dense_pe = self.pe_layer(self.image_embedding_size)
+        return dense_pe.unsqueeze(0)
+
+    def _embed_points(
+        self,
+        points: torch.Tensor,
+        labels: torch.Tensor,
+        pad: bool,
+    ) -> torch.Tensor:
+        """
+        Embeds point prompts.
+
+        With `pad`, one extra point at the origin with label -1 is appended per
+        batch entry. Label -1 replaces the positional encoding with the
+        "not a point" embedding; labels 0-3 add the matching learned embedding.
+        """
+        points = points + 0.5  # Shift to center of pixel
+        if pad:
+            extra_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
+            extra_label = -torch.ones((labels.shape[0], 1), device=labels.device)
+            points = torch.cat([points, extra_point], dim=1)
+            labels = torch.cat([labels, extra_label], dim=1)
+        point_embedding = self.pe_layer.forward_with_coords(
+            points, self.input_image_size
+        )
+
+        point_embedding = torch.where(
+            (labels == -1).unsqueeze(-1),
+            torch.zeros_like(point_embedding) + self.not_a_point_embed.weight,
+            point_embedding,
+        )
+        # Labels 0..3 map one-to-one onto the learned point embeddings.
+        for label_id, label_embed in enumerate(self.point_embeddings):
+            point_embedding = torch.where(
+                (labels == label_id).unsqueeze(-1),
+                point_embedding + label_embed.weight,
+                point_embedding,
+            )
+        return point_embedding
+
+    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
+        """Embeds box prompts as two corner points each (learned embeddings 2 and 3)."""
+        boxes = boxes + 0.5  # Shift to center of pixel
+        corners = boxes.reshape(-1, 2, 2)
+        corner_embedding = self.pe_layer.forward_with_coords(
+            corners, self.input_image_size
+        )
+        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
+        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
+        return corner_embedding
+
+    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
+        """Embeds mask inputs."""
+        return self.mask_downscaling(masks)
+
+    def _get_batch_size(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+    ) -> int:
+        """
+        Gets the batch size of the output given the batch size of the input prompts.
+
+        The first prompt that is present wins, in the order points, boxes, masks;
+        with no prompt at all the batch size is 1.
+        """
+        if points is not None:
+            return points[0].shape[0]
+        if boxes is not None:
+            return boxes.shape[0]
+        if masks is not None:
+            return masks.shape[0]
+        return 1
+
+    def _get_device(self) -> torch.device:
+        """Device of the first learned point embedding."""
+        return self.point_embeddings[0].weight.device
+
+    def forward(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Embeds different types of prompts, returning both sparse and dense
+        embeddings.
+
+        Arguments:
+          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
+            and labels to embed.
+          boxes (torch.Tensor or none): boxes to embed
+          masks (torch.Tensor or none): masks to embed
+
+        Returns:
+          torch.Tensor: sparse embeddings for the points and boxes, with shape
+            BxNx(embed_dim), where N is determined by the number of input points
+            and boxes.
+          torch.Tensor: dense embeddings for the masks, in the shape
+            Bx(embed_dim)x(embed_H)x(embed_W)
+        """
+        bs = self._get_batch_size(points, boxes, masks)
+
+        # Start from an empty (bs, 0, embed_dim) tensor and append what is present.
+        sparse_embeddings = torch.empty(
+            (bs, 0, self.embed_dim), device=self._get_device()
+        )
+        if points is not None:
+            coords, labels = points
+            # A padding point is only appended when no box accompanies the points.
+            point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
+            sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
+        if boxes is not None:
+            box_embeddings = self._embed_boxes(boxes)
+            sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
+
+        if masks is None:
+            # No mask prompt: broadcast the learned "no mask" embedding over the grid.
+            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
+                bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
+            )
+        else:
+            dense_embeddings = self._embed_masks(masks)
+
+        return sparse_embeddings, dense_embeddings
diff --git a/sam2_repo/sam2/modeling/sam/transformer.py b/sam2_repo/sam2/modeling/sam/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9fe9a3fbc5cce4f1abe8ee0ae3a8602bbe2ff1b
--- /dev/null
+++ b/sam2_repo/sam2/modeling/sam/transformer.py
@@ -0,0 +1,311 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from functools import partial
+from typing import Tuple, Type
+
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from sam2.modeling.position_encoding import apply_rotary_enc, compute_axial_cis
+from sam2.modeling.sam2_utils import MLP
+
+
class TwoWayTransformer(nn.Module):
    """Stack of `TwoWayAttentionBlock`s followed by a final token-to-image attention."""

    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        A transformer decoder that attends to an input image using
        queries whose positional embedding is supplied.

        Args:
          depth (int): number of layers in the transformer
          embedding_dim (int): the channel dimension for the input embeddings
          num_heads (int): the number of heads for multihead attention. Must
            divide embedding_dim
          mlp_dim (int): the channel dimension internal to the MLP block
          activation (nn.Module): the activation to use in the MLP block
          attention_downsample_rate (int): forwarded as the downsample rate of
            the cross-attention layers in every block and of the final
            token-to-image attention
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim

        # Blocks are created in index order; only block 0 is built with
        # `skip_first_layer_pe=True`.
        self.layers = nn.ModuleList(
            [
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    skip_first_layer_pe=(block_idx == 0),
                )
                for block_idx in range(depth)
            ]
        )

        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # The unpack is kept only as a 4-D shape check; the values are unused.
        _batch, _channels, _height, _width = image_embedding.shape

        # BxCxHxW -> BxHWxC, i.e. one token per spatial location.
        image_tokens = image_embedding.flatten(2).permute(0, 2, 1)
        image_pos = image_pe.flatten(2).permute(0, 2, 1)

        # The prompt tokens start as the queries and also serve as the query
        # positional encoding in every block.
        tokens, image_feats = point_embedding, image_tokens
        for block in self.layers:
            tokens, image_feats = block(
                queries=tokens,
                keys=image_feats,
                query_pe=point_embedding,
                key_pe=image_pos,
            )

        # Final attention from the prompt tokens to the image, then layer norm.
        attended = self.final_attn_token_to_image(
            q=tokens + point_embedding,
            k=image_feats + image_pos,
            v=image_feats,
        )
        tokens = self.norm_final_attn(tokens + attended)

        return tokens, image_feats
+
+
class TwoWayAttentionBlock(nn.Module):
    """One block of token self-attention, token->image attention, MLP and image->token attention."""

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        A transformer block with four layers: (1) self-attention of sparse
        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
        block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.

        Arguments:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block
          activation (nn.Module): the activation of the mlp block
          attention_downsample_rate (int): downsample rate passed to the two
            cross-attention layers (the self-attention uses the default rate)
          skip_first_layer_pe (bool): skip the PE on the first layer
        """
        super().__init__()

        # Sub-modules are created in a fixed order so that parameter
        # registration and random initialisation stay reproducible.
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLP(
            embedding_dim, mlp_dim, embedding_dim, num_layers=2, activation=activation
        )
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """Run the four sub-layers and return the updated `(queries, keys)` pair."""
        # (1) Self-attention over the tokens. When the PE is skipped the
        # attention output replaces the tokens (no residual connection).
        if not self.skip_first_layer_pe:
            tokens_with_pe = queries + query_pe
            queries = queries + self.self_attn(
                q=tokens_with_pe, k=tokens_with_pe, v=queries
            )
        else:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        queries = self.norm1(queries)

        # (2) Tokens attend to the image embedding.
        token_update = self.cross_attn_token_to_image(
            q=queries + query_pe, k=keys + key_pe, v=keys
        )
        queries = self.norm2(queries + token_update)

        # (3) MLP on the tokens.
        queries = self.norm3(queries + self.mlp(queries))

        # (4) Image embedding attends to the tokens (query/key roles swapped).
        image_update = self.cross_attn_image_to_token(
            q=keys + key_pe, k=queries + query_pe, v=queries
        )
        keys = self.norm4(keys + image_update)

        return queries, keys
+
+
class Attention(nn.Module):
    """
    Multi-head attention whose query/key/value projections may reduce the
    channel width (by `downsample_rate`) before attention is applied; the
    output projection maps back to `embedding_dim`.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
        dropout: float = 0.0,
        kv_in_dim: int = None,
    ) -> None:
        """
        Args:
          embedding_dim (int): channel width of the query input and of the output
          num_heads (int): number of attention heads; must divide
            `embedding_dim // downsample_rate`
          downsample_rate (int): integer divisor applied to `embedding_dim` to
            obtain the internal (projected) width
          dropout (float): attention dropout probability, used in training mode only
          kv_in_dim (int or None): channel width of the key/value inputs;
            falls back to `embedding_dim` when None
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.kv_in_dim = embedding_dim if kv_in_dim is None else kv_in_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert (
            self.internal_dim % num_heads == 0
        ), "num_heads must divide embedding_dim."

        # Projection layers, created in q, k, v, out order.
        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

        self.dropout_p = dropout

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        """Reshape B x N_tokens x C into B x N_heads x N_tokens x C_per_head."""
        batch, tokens, channels = x.shape
        per_head = channels // num_heads
        return x.reshape(batch, tokens, num_heads, per_head).transpose(1, 2)

    def _recombine_heads(self, x: Tensor) -> Tensor:
        """Inverse of `_separate_heads`: back to B x N_tokens x C."""
        batch, heads, tokens, per_head = x.shape
        return x.transpose(1, 2).reshape(batch, tokens, heads * per_head)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        """Project the inputs, run scaled dot-product attention per head, and project back."""
        q_heads = self._separate_heads(self.q_proj(q), self.num_heads)
        k_heads = self._separate_heads(self.k_proj(k), self.num_heads)
        v_heads = self._separate_heads(self.v_proj(v), self.num_heads)

        # Dropout is disabled outside of training mode.
        attn_dropout = self.dropout_p if self.training else 0.0
        attended = F.scaled_dot_product_attention(
            q_heads, k_heads, v_heads, dropout_p=attn_dropout
        )

        return self.out_proj(self._recombine_heads(attended))
+
+
class RoPEAttention(Attention):
    """Attention with rotary position encoding."""

    def __init__(
        self,
        *args,
        rope_theta=10000.0,
        # Whether the rotary frequencies of q are repeated to match the length
        # of k; this is needed for cross-attention to memories.
        rope_k_repeat=False,
        feat_sizes=(64, 64),  # [w, h] for stride 16 feats at 1024 resolution
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        # Factory for the axial rotary table, bound to the per-head width.
        per_head_dim = self.internal_dim // self.num_heads
        self.compute_cis = partial(
            compute_axial_cis, dim=per_head_dim, theta=rope_theta
        )

        # Precompute the table for the expected feature size. It is moved to
        # CUDA eagerly when available; `forward` re-homes it to the device of
        # the actual inputs.
        initial_cis = self.compute_cis(end_x=feat_sizes[0], end_y=feat_sizes[1])
        if torch.cuda.is_available():
            initial_cis = initial_cis.to("cuda")
        self.freqs_cis = initial_cis
        self.rope_k_repeat = rope_k_repeat

    def forward(
        self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0
    ) -> Tensor:
        """
        Attention with rotary encoding applied to q and to the leading
        `k.size(-2) - num_k_exclude_rope` key tokens; the trailing
        `num_k_exclude_rope` key tokens are left unrotated.
        """
        # Project and split into heads.
        q = self._separate_heads(self.q_proj(q), self.num_heads)
        k = self._separate_heads(self.k_proj(k), self.num_heads)
        v = self._separate_heads(self.v_proj(v), self.num_heads)

        # The query tokens are treated as a square grid; `side` is a float.
        num_q_tokens = q.shape[-2]
        side = math.sqrt(num_q_tokens)
        self.freqs_cis = self.freqs_cis.to(q.device)
        if self.freqs_cis.shape[0] != num_q_tokens:
            # The cached table does not match this sequence length: rebuild it.
            self.freqs_cis = self.compute_cis(end_x=side, end_y=side).to(q.device)
        if num_q_tokens != k.shape[-2]:
            # Differing q/k lengths are only supported with frequency repetition.
            assert self.rope_k_repeat

        num_k_rope = k.size(-2) - num_k_exclude_rope
        # The rotated keys are written back in place into the leading slice of k.
        q, k[:, :, :num_k_rope] = apply_rotary_enc(
            q,
            k[:, :, :num_k_rope],
            freqs_cis=self.freqs_cis,
            repeat_freqs_k=self.rope_k_repeat,
        )

        # Dropout is disabled outside of training mode.
        attn_dropout = self.dropout_p if self.training else 0.0
        attended = F.scaled_dot_product_attention(q, k, v, dropout_p=attn_dropout)

        return self.out_proj(self._recombine_heads(attended))
diff --git a/sam2_repo/sam2/modeling/sam2_base.py b/sam2_repo/sam2/modeling/sam2_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9f4e515b0d161942bf2bb64560056b3efbe6dac
--- /dev/null
+++ b/sam2_repo/sam2/modeling/sam2_base.py
@@ -0,0 +1,909 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.distributed
+import torch.nn.functional as F
+
+from torch.nn.init import trunc_normal_
+
+from sam2.modeling.sam.mask_decoder import MaskDecoder
+from sam2.modeling.sam.prompt_encoder import PromptEncoder
+from sam2.modeling.sam.transformer import TwoWayTransformer
+from sam2.modeling.sam2_utils import get_1d_sine_pe, MLP, select_closest_cond_frames
+
# A large negative value used as the placeholder mask score for missing
# objects; `_forward_sam_heads` writes it into the mask logits wherever the
# object score logit is not positive.
NO_OBJ_SCORE = -1024.0
+
+
+class SAM2Base(torch.nn.Module):
def __init__(
    self,
    image_encoder,
    memory_attention,
    memory_encoder,
    num_maskmem=7,  # default: 1 input frame + 6 previous frames
    image_size=512,
    backbone_stride=16,  # stride of the image backbone output
    sigmoid_scale_for_mem_enc=1.0,  # scale factor for the mask sigmoid probability
    sigmoid_bias_for_mem_enc=0.0,  # bias factor for the mask sigmoid probability
    # During evaluation, whether to binarize the sigmoid mask logits on frames
    # that were interacted with through clicks.
    binarize_mask_from_pts_for_mem_enc=False,
    # On frames with a mask input, whether to output the input mask directly
    # without running the SAM prompt encoder + mask decoder.
    use_mask_input_as_output_without_sam=False,
    # Maximum number of conditioning frames taking part in memory attention
    # (-1 means no limit). With more conditioning frames than this limit, only
    # the temporally closest `max_cond_frames_in_attn` ones are cross-attended
    # to when tracking a frame, which gives temporal locality and avoids GPU OOM.
    max_cond_frames_in_attn=-1,
    # On the first frame, whether to add the no-memory embedding directly to
    # the image feature (instead of using the transformer encoder).
    directly_add_no_mem_embed=False,
    # Whether to use high-resolution feature maps in the SAM mask decoder.
    use_high_res_features_in_sam=False,
    # Whether to output multiple (3) masks for the first click on initial
    # conditioning frames.
    multimask_output_in_sam=False,
    # Minimum and maximum number of clicks for which multimask output is used
    # (only relevant when `multimask_output_in_sam=True`). Both default to 1,
    # i.e. only the first click gives multimask output; a box counts as two points.
    multimask_min_pt_num=1,
    multimask_max_pt_num=1,
    # Whether to also use multimask output for tracking, not just for the first
    # click on initial conditioning frames (only relevant when
    # `multimask_output_in_sam=True`).
    multimask_output_for_tracking=False,
    # Whether to use multimask tokens for the object pointer; only relevant when
    # both use_obj_ptrs_in_encoder=True and multimask_output_for_tracking=True.
    use_multimask_token_for_obj_ptr: bool = False,
    # Whether to use a sigmoid to restrict the IoU prediction to [0-1].
    iou_prediction_use_sigmoid=False,
    # Temporal stride of the memory bank during evaluation (the `r` parameter in
    # XMem and Cutie, which use r=5). For r>1, the (self.num_maskmem - 1)
    # non-conditioning memory frames consist of (self.num_maskmem - 2) nearest
    # frames from every r-th frame, plus the last frame.
    memory_temporal_stride_for_eval=1,
    # Whether to apply non-overlapping constraints on the object masks in the
    # memory encoder during evaluation (to avoid/alleviate superposing masks).
    non_overlap_masks_for_mem_enc=False,
    # Whether to cross-attend to object pointers from other frames (based on
    # SAM output tokens) in the encoder.
    use_obj_ptrs_in_encoder=False,
    # Maximum number of object pointers from other frames in encoder cross
    # attention (only relevant when `use_obj_ptrs_in_encoder=True`).
    max_obj_ptrs_in_encoder=16,
    # Whether to add temporal positional encoding to the object pointers in the
    # encoder (only relevant when `use_obj_ptrs_in_encoder=True`).
    add_tpos_enc_to_obj_ptrs=True,
    # Whether to add an extra linear projection for the temporal positional
    # encoding in the object pointers, to avoid potential interference with the
    # spatial positional encoding (only relevant when both
    # `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`).
    proj_tpos_enc_in_obj_ptrs=False,
    # Whether to use signed distance (instead of unsigned absolute distance) in
    # the temporal positional encoding of the object pointers (only relevant
    # when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`).
    use_signed_tpos_enc_to_obj_ptrs=False,
    # Whether to only attend to object pointers in the past (before the current
    # frame) in the encoder during evaluation (only relevant when
    # `use_obj_ptrs_in_encoder=True`); this might keep pointer information from
    # too far in the future from distracting the initial tracking.
    only_obj_ptrs_in_the_past_for_eval=False,
    # Whether to predict if there is an object in the frame.
    pred_obj_scores: bool = False,
    # Whether to use an MLP to predict object scores.
    pred_obj_scores_mlp: bool = False,
    # Only relevant if pred_obj_scores=True and use_obj_ptrs_in_encoder=True:
    # whether to have a fixed no-object pointer when no object is present, or to
    # use it as an additive embedding with the obj_ptr produced by the decoder.
    fixed_no_obj_ptr: bool = False,
    # Soft no-object, i.e. mix in no_obj_ptr softly, hoping to make recovery
    # easier after a mistake and to mitigate accumulation of errors.
    soft_no_obj_ptr: bool = False,
    use_mlp_for_obj_ptr_proj: bool = False,
    # Add a no-object embedding to spatial frames.
    no_obj_embed_spatial: bool = False,
    # Extra arguments used to construct the SAM mask decoder; if not None, a dict
    # of kwargs passed into the `MaskDecoder` class.
    sam_mask_decoder_extra_args=None,
    compile_image_encoder: bool = False,
):
    """
    Store the sub-networks and configuration flags, create the learned
    memory / no-object embeddings, and build the SAM prompt encoder and mask
    decoder via `_build_sam_heads`.

    Statement order matters here: sub-modules and parameters are registered,
    and randomly initialised, in the order in which they are created.
    """
    super().__init__()

    # ---- Part 1: the image backbone -------------------------------------
    self.image_encoder = image_encoder
    self.use_high_res_features_in_sam = use_high_res_features_in_sam
    # Levels 0, 1, 2 for the high-res setting, or just level 2 by default.
    if use_high_res_features_in_sam:
        self.num_feature_levels = 3
    else:
        self.num_feature_levels = 1
    self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
    self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
    if use_obj_ptrs_in_encoder:
        # A conv layer that downsamples the mask prompt to stride 4 (the same
        # stride as the low-res SAM mask logits) and changes its scale from
        # 0~1 to the SAM logit scale, so it can be fed into the SAM mask
        # decoder to generate a pointer.
        self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
    self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
    # Projecting the temporal encoding requires that encoding to be enabled.
    assert add_tpos_enc_to_obj_ptrs or not proj_tpos_enc_in_obj_ptrs
    self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
    self.use_signed_tpos_enc_to_obj_ptrs = use_signed_tpos_enc_to_obj_ptrs
    self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval

    # ---- Part 2: memory attention ---------------------------------------
    # Conditions the current frame's visual features on memories (and object
    # pointers) from past frames.
    self.memory_attention = memory_attention
    self.hidden_dim = image_encoder.neck.d_model

    # ---- Part 3: memory encoder for the previous frame's outputs --------
    self.memory_encoder = memory_encoder
    # When the memory encoder has an `out_proj` with a weight, memories are
    # compressed along the channel dim and that projection sets the width.
    compresses_memory = hasattr(self.memory_encoder, "out_proj") and hasattr(
        self.memory_encoder.out_proj, "weight"
    )
    if compresses_memory:
        self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
    else:
        self.mem_dim = self.hidden_dim
    self.num_maskmem = num_maskmem  # number of memories accessible

    # Temporal encoding of the memories.
    self.maskmem_tpos_enc = torch.nn.Parameter(
        torch.zeros(num_maskmem, 1, 1, self.mem_dim)
    )
    trunc_normal_(self.maskmem_tpos_enc, std=0.02)

    # A single token (and its positional encoding) indicating that there is no
    # memory embedding from previous frames.
    self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
    self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
    for no_mem_param in (self.no_mem_embed, self.no_mem_pos_enc):
        trunc_normal_(no_mem_param, std=0.02)
    self.directly_add_no_mem_embed = directly_add_no_mem_embed

    # A sigmoid is applied to the raw output mask logits (turning their range
    # from (-inf, +inf) into (0, 1)) before they reach the memory encoder.
    self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
    self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
    self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
    self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
    self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval

    # On frames with a mask input, whether to output the input mask directly
    # without using the SAM prompt encoder + mask decoder.
    self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
    self.multimask_output_in_sam = multimask_output_in_sam
    self.multimask_min_pt_num = multimask_min_pt_num
    self.multimask_max_pt_num = multimask_max_pt_num
    self.multimask_output_for_tracking = multimask_output_for_tracking
    self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
    self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid

    # ---- Part 4: SAM-style prompt encoder and mask decoder --------------
    self.image_size = image_size
    self.backbone_stride = backbone_stride
    self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
    self.pred_obj_scores = pred_obj_scores
    self.pred_obj_scores_mlp = pred_obj_scores_mlp
    self.fixed_no_obj_ptr = fixed_no_obj_ptr
    self.soft_no_obj_ptr = soft_no_obj_ptr
    if self.fixed_no_obj_ptr:
        # A fixed no-object pointer needs both object scores and object pointers.
        assert self.pred_obj_scores
        assert self.use_obj_ptrs_in_encoder
    if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
        self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
        trunc_normal_(self.no_obj_ptr, std=0.02)
    self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj
    if no_obj_embed_spatial:
        self.no_obj_embed_spatial = torch.nn.Parameter(torch.zeros(1, self.mem_dim))
        trunc_normal_(self.no_obj_embed_spatial, std=0.02)
    else:
        self.no_obj_embed_spatial = None

    self._build_sam_heads()
    self.max_cond_frames_in_attn = max_cond_frames_in_attn

    # ---- Optional model compilation --------------------------------------
    if compile_image_encoder:
        # Compile the forward function (not the full module) so that
        # checkpoints can still be loaded.
        print(
            "Image encoder compilation is enabled. First forward pass will be slow."
        )
        self.image_encoder.forward = torch.compile(
            self.image_encoder.forward,
            mode="max-autotune",
            fullgraph=True,
            dynamic=False,
        )
+
@property
def device(self):
    """Device of the first parameter yielded by `self.parameters()`."""
    first_parameter = next(self.parameters())
    return first_parameter.device
+
def forward(self, *args, **kwargs):
    """
    Not supported on the base model: always raises.

    Raises:
        NotImplementedError: unconditionally; the message points to
            SAM2VideoPredictor (inference) and SAM2Train (training/fine-tuning).
    """
    # The two literals are concatenated at compile time, so the first one must
    # end with its own sentence separator.
    raise NotImplementedError(
        "Please use the corresponding methods in SAM2VideoPredictor for inference or SAM2Train for training/fine-tuning. "
        "See notebooks/video_predictor_example.ipynb for an inference example."
    )
+
def _build_sam_heads(self):
    """Build SAM-style prompt encoder and mask decoder, plus the object-pointer projections."""
    self.sam_prompt_embed_dim = self.hidden_dim
    self.sam_image_embedding_size = self.image_size // self.backbone_stride
    grid = self.sam_image_embedding_size

    # PromptEncoder and MaskDecoder come from SAM; hyperparameters such as
    # `mask_in_chans=16` are taken from the SAM code.
    self.sam_prompt_encoder = PromptEncoder(
        embed_dim=self.sam_prompt_embed_dim,
        image_embedding_size=(grid, grid),
        input_image_size=(self.image_size, self.image_size),
        mask_in_chans=16,
    )

    # The transformer is created after the prompt encoder and before the
    # decoder, so the construction order of the sub-modules is unchanged.
    decoder_transformer = TwoWayTransformer(
        depth=2,
        embedding_dim=self.sam_prompt_embed_dim,
        mlp_dim=2048,
        num_heads=8,
    )
    extra_decoder_args = self.sam_mask_decoder_extra_args or {}
    self.sam_mask_decoder = MaskDecoder(
        num_multimask_outputs=3,
        transformer=decoder_transformer,
        transformer_dim=self.sam_prompt_embed_dim,
        iou_head_depth=3,
        iou_head_hidden_dim=256,
        use_high_res_features=self.use_high_res_features_in_sam,
        iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
        pred_obj_scores=self.pred_obj_scores,
        pred_obj_scores_mlp=self.pred_obj_scores_mlp,
        use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
        **extra_decoder_args,
    )

    if not self.use_obj_ptrs_in_encoder:
        self.obj_ptr_proj = torch.nn.Identity()
    else:
        # A linear projection on SAM output tokens to turn them into object
        # pointers. The Linear layer is always created first, even when the
        # MLP variant replaces it right below.
        self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        if self.use_mlp_for_obj_ptr_proj:
            self.obj_ptr_proj = MLP(
                self.hidden_dim, self.hidden_dim, self.hidden_dim, 3
            )

    if not self.proj_tpos_enc_in_obj_ptrs:
        self.obj_ptr_tpos_proj = torch.nn.Identity()
    else:
        # A linear projection on the temporal positional encoding in object
        # pointers, to avoid potential interference with the spatial
        # positional encoding.
        self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
+
+ def _forward_sam_heads(
+ self,
+ backbone_features,
+ point_inputs=None,
+ mask_inputs=None,
+ high_res_features=None,
+ multimask_output=False,
+ ):
+ """
+ Forward SAM prompt encoders and mask heads.
+
+ Inputs:
+ - backbone_features: image features of [B, C, H, W] shape
+ - point_inputs: a dictionary with "point_coords" and "point_labels", where
+ 1) "point_coords" has [B, P, 2] shape and float32 dtype and contains the
+ absolute pixel-unit coordinate in (x, y) format of the P input points
+ 2) "point_labels" has shape [B, P] and int32 dtype, where 1 means
+ positive clicks, 0 means negative clicks, and -1 means padding
+ - mask_inputs: a mask of [B, 1, H*16, W*16] shape, float or bool, with the
+ same spatial size as the image.
+ - high_res_features: either 1) None or 2) or a list of length 2 containing
+ two feature maps of [B, C, 4*H, 4*W] and [B, C, 2*H, 2*W] shapes respectively,
+ which will be used as high-resolution feature maps for SAM decoder.
+ - multimask_output: if it's True, we output 3 candidate masks and their 3
+ corresponding IoU estimates, and if it's False, we output only 1 mask and
+ its corresponding IoU estimate.
+
+ Outputs:
+ - low_res_multimasks: [B, M, H*4, W*4] shape (where M = 3 if
+ `multimask_output=True` and M = 1 if `multimask_output=False`), the SAM
+ output mask logits (before sigmoid) for the low-resolution masks, with 4x
+ the resolution (1/4 stride) of the input backbone_features.
+ - high_res_multimasks: [B, M, H*16, W*16] shape (where M = 3
+ if `multimask_output=True` and M = 1 if `multimask_output=False`),
+ upsampled from the low-resolution masks, with shape size as the image
+ (stride is 1 pixel).
+ - ious, [B, M] shape, where (where M = 3 if `multimask_output=True` and M = 1
+ if `multimask_output=False`), the estimated IoU of each output mask.
+ - low_res_masks: [B, 1, H*4, W*4] shape, the best mask in `low_res_multimasks`.
+ If `multimask_output=True`, it's the mask with the highest IoU estimate.
+ If `multimask_output=False`, it's the same as `low_res_multimasks`.
+ - high_res_masks: [B, 1, H*16, W*16] shape, the best mask in `high_res_multimasks`.
+ If `multimask_output=True`, it's the mask with the highest IoU estimate.
+ If `multimask_output=False`, it's the same as `high_res_multimasks`.
+ - obj_ptr: [B, C] shape, the object pointer vector for the output mask, extracted
+ based on the output token from the SAM mask decoder.
+ """
+ B = backbone_features.size(0)
+ device = backbone_features.device
+ assert backbone_features.size(1) == self.sam_prompt_embed_dim
+ assert backbone_features.size(2) == self.sam_image_embedding_size
+ assert backbone_features.size(3) == self.sam_image_embedding_size
+
+ # a) Handle point prompts
+ if point_inputs is not None:
+ sam_point_coords = point_inputs["point_coords"]
+ sam_point_labels = point_inputs["point_labels"]
+ assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
+ else:
+ # If no points are provide, pad with an empty point (with label -1)
+ sam_point_coords = torch.zeros(B, 1, 2, device=device)
+ sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
+
+ # b) Handle mask prompts
+ if mask_inputs is not None:
+ # If mask_inputs is provided, downsize it into low-res mask input if needed
+ # and feed it as a dense mask prompt into the SAM mask encoder
+ assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
+ if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
+ sam_mask_prompt = F.interpolate(
+ mask_inputs.float(),
+ size=self.sam_prompt_encoder.mask_input_size,
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ else:
+ sam_mask_prompt = mask_inputs
+ else:
+ # Otherwise, simply feed None (and SAM's prompt encoder will add
+ # a learned `no_mask_embed` to indicate no mask input in this case).
+ sam_mask_prompt = None
+
+ sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
+ points=(sam_point_coords, sam_point_labels),
+ boxes=None,
+ masks=sam_mask_prompt,
+ )
+ (
+ low_res_multimasks,
+ ious,
+ sam_output_tokens,
+ object_score_logits,
+ ) = self.sam_mask_decoder(
+ image_embeddings=backbone_features,
+ image_pe=self.sam_prompt_encoder.get_dense_pe(),
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ repeat_image=False, # the image is already batched
+ high_res_features=high_res_features,
+ )
+ if self.pred_obj_scores:
+ is_obj_appearing = object_score_logits > 0
+
+ # Mask used for spatial memories is always a *hard* choice between obj and no obj,
+ # consistent with the actual mask prediction
+ low_res_multimasks = torch.where(
+ is_obj_appearing[:, None, None],
+ low_res_multimasks,
+ NO_OBJ_SCORE,
+ )
+
+ # convert masks from possibly bfloat16 (or float16) to float32
+ # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
+ low_res_multimasks = low_res_multimasks.float()
+ high_res_multimasks = F.interpolate(
+ low_res_multimasks,
+ size=(self.image_size, self.image_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ sam_output_token = sam_output_tokens[:, 0]
+ if multimask_output:
+ # take the best mask prediction (with the highest IoU estimation)
+ best_iou_inds = torch.argmax(ious, dim=-1)
+ batch_inds = torch.arange(B, device=device)
+ low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ if sam_output_tokens.size(1) > 1:
+ sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
+ else:
+ low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
+
+ # Extract object pointer from the SAM output token (with occlusion handling)
+ obj_ptr = self.obj_ptr_proj(sam_output_token)
+ if self.pred_obj_scores:
+ # Allow *soft* no obj ptr, unlike for masks
+ if self.soft_no_obj_ptr:
+ lambda_is_obj_appearing = object_score_logits.sigmoid()
+ else:
+ lambda_is_obj_appearing = is_obj_appearing.float()
+
+ if self.fixed_no_obj_ptr:
+ obj_ptr = lambda_is_obj_appearing * obj_ptr
+ obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+
+ return (
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ )
+
+ def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
+ """
+ Directly turn binary `mask_inputs` into a output mask logits without using SAM.
+ (same input and output shapes as in _forward_sam_heads above).
+ """
+ # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
+ out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05
+ mask_inputs_float = mask_inputs.float()
+ high_res_masks = mask_inputs_float * out_scale + out_bias
+ low_res_masks = F.interpolate(
+ high_res_masks,
+ size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ # a dummy IoU prediction of all 1's under mask input
+ ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
+ if not self.use_obj_ptrs_in_encoder:
+ # all zeros as a dummy object pointer (of shape [B, C])
+ obj_ptr = torch.zeros(
+ mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device
+ )
+ else:
+ # produce an object pointer using the SAM decoder from the mask input
+ _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
+ backbone_features=backbone_features,
+ mask_inputs=self.mask_downsample(mask_inputs_float),
+ high_res_features=high_res_features,
+ )
+ # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem;
+ # Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying
+ # on the object_scores from the SAM decoder.
+ is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
+ is_obj_appearing = is_obj_appearing[..., None]
+ lambda_is_obj_appearing = is_obj_appearing.float()
+ object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
+ if self.pred_obj_scores:
+ if self.fixed_no_obj_ptr:
+ obj_ptr = lambda_is_obj_appearing * obj_ptr
+ obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+
+ return (
+ low_res_masks,
+ high_res_masks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ )
+
def forward_image(self, img_batch: torch.Tensor):
    """Run the image encoder on `img_batch` and return its output dict."""
    backbone_out = self.image_encoder(img_batch)
    if not self.use_high_res_features_in_sam:
        return backbone_out

    # Precompute the projected level 0 and level 1 features of the SAM decoder
    # so they are not recomputed on every SAM click.
    fpn_levels = backbone_out["backbone_fpn"]
    fpn_levels[0] = self.sam_mask_decoder.conv_s0(fpn_levels[0])
    fpn_levels[1] = self.sam_mask_decoder.conv_s1(fpn_levels[1])
    return backbone_out
+
+ def _prepare_backbone_features(self, backbone_out):
+ """Prepare and flatten visual features."""
+ backbone_out = backbone_out.copy()
+ assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
+ assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
+
+ feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
+ vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
+
+ feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
+ # flatten NxCxHxW to HWxNxC
+ vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
+ vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
+
+ return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
+
+ def _prepare_memory_conditioned_features(
+ self,
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ output_dict,
+ num_frames,
+ track_in_reverse=False, # tracking in reverse time order (for demo usage)
+ ):
+ """Fuse the current frame's visual feature map with previous memory."""
+ B = current_vision_feats[-1].size(1) # batch size on this frame
+ C = self.hidden_dim
+ H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
+ device = current_vision_feats[-1].device
+ # The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images.
+ # In this case, we skip the fusion with any memory.
+ if self.num_maskmem == 0: # Disable memory and skip fusion
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+ return pix_feat
+
+ num_obj_ptr_tokens = 0
+ tpos_sign_mul = -1 if track_in_reverse else 1
+ # Step 1: condition the visual features of the current frame on previous memories
+ if not is_init_cond_frame:
+ # Retrieve the memories encoded with the maskmem backbone
+ to_cat_memory, to_cat_memory_pos_embed = [], []
+ # Add conditioning frames's output first (all cond frames have t_pos=0 for
+ # when getting temporal positional embedding below)
+ assert len(output_dict["cond_frame_outputs"]) > 0
+ # Select a maximum number of temporally closest cond frames for cross attention
+ cond_outputs = output_dict["cond_frame_outputs"]
+ selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
+ frame_idx, cond_outputs, self.max_cond_frames_in_attn
+ )
+ t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
+ # Add last (self.num_maskmem - 1) frames before current frame for non-conditioning memory
+ # the earliest one has t_pos=1 and the latest one has t_pos=self.num_maskmem-1
+ # We also allow taking the memory frame non-consecutively (with stride>1), in which case
+ # we take (self.num_maskmem - 2) frames among every stride-th frames plus the last frame.
+ stride = 1 if self.training else self.memory_temporal_stride_for_eval
+ for t_pos in range(1, self.num_maskmem):
+ t_rel = self.num_maskmem - t_pos # how many frames before current frame
+ if t_rel == 1:
+ # for t_rel == 1, we take the last frame (regardless of r)
+ if not track_in_reverse:
+ # the frame immediately before this frame (i.e. frame_idx - 1)
+ prev_frame_idx = frame_idx - t_rel
+ else:
+ # the frame immediately after this frame (i.e. frame_idx + 1)
+ prev_frame_idx = frame_idx + t_rel
+ else:
+ # for t_rel >= 2, we take the memory frame from every r-th frames
+ if not track_in_reverse:
+ # first find the nearest frame among every r-th frames before this frame
+ # for r=1, this would be (frame_idx - 2)
+ prev_frame_idx = ((frame_idx - 2) // stride) * stride
+ # then seek further among every r-th frames
+ prev_frame_idx = prev_frame_idx - (t_rel - 2) * stride
+ else:
+ # first find the nearest frame among every r-th frames after this frame
+ # for r=1, this would be (frame_idx + 2)
+ prev_frame_idx = -(-(frame_idx + 2) // stride) * stride
+ # then seek further among every r-th frames
+ prev_frame_idx = prev_frame_idx + (t_rel - 2) * stride
+ out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
+ if out is None:
+ # If an unselected conditioning frame is among the last (self.num_maskmem - 1)
+ # frames, we still attend to it as if it's a non-conditioning frame.
+ out = unselected_cond_outputs.get(prev_frame_idx, None)
+ t_pos_and_prevs.append((t_pos, out))
+
+ for t_pos, prev in t_pos_and_prevs:
+ if prev is None:
+ continue # skip padding frames
+ # "maskmem_features" might have been offloaded to CPU in demo use cases,
+ # so we load it back to GPU (it's a no-op if it's already on GPU).
+ feats = prev["maskmem_features"].to(device, non_blocking=True)
+ to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
+ # Spatial positional encoding (it might have been offloaded to CPU in eval)
+ maskmem_enc = prev["maskmem_pos_enc"][-1].to(device)
+ maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1)
+ # Temporal positional encoding
+ maskmem_enc = (
+ maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
+ )
+ to_cat_memory_pos_embed.append(maskmem_enc)
+
+ # Construct the list of past object pointers
+ if self.use_obj_ptrs_in_encoder:
+ max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
+ # First add those object pointers from selected conditioning frames
+ # (optionally, only include object pointers in the past during evaluation)
+ if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
+ ptr_cond_outputs = {
+ t: out
+ for t, out in selected_cond_outputs.items()
+ if (t >= frame_idx if track_in_reverse else t <= frame_idx)
+ }
+ else:
+ ptr_cond_outputs = selected_cond_outputs
+ pos_and_ptrs = [
+ # Temporal pos encoding contains how far away each pointer is from current frame
+ (
+ (
+ (frame_idx - t) * tpos_sign_mul
+ if self.use_signed_tpos_enc_to_obj_ptrs
+ else abs(frame_idx - t)
+ ),
+ out["obj_ptr"],
+ )
+ for t, out in ptr_cond_outputs.items()
+ ]
+ # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before current frame
+ for t_diff in range(1, max_obj_ptrs_in_encoder):
+ t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
+ if t < 0 or (num_frames is not None and t >= num_frames):
+ break
+ out = output_dict["non_cond_frame_outputs"].get(
+ t, unselected_cond_outputs.get(t, None)
+ )
+ if out is not None:
+ pos_and_ptrs.append((t_diff, out["obj_ptr"]))
+ # If we have at least one object pointer, add them to the across attention
+ if len(pos_and_ptrs) > 0:
+ pos_list, ptrs_list = zip(*pos_and_ptrs)
+ # stack object pointers along dim=0 into [ptr_seq_len, B, C] shape
+ obj_ptrs = torch.stack(ptrs_list, dim=0)
+ # a temporal positional embedding based on how far each object pointer is from
+ # the current frame (sine embedding normalized by the max pointer num).
+ if self.add_tpos_enc_to_obj_ptrs:
+ t_diff_max = max_obj_ptrs_in_encoder - 1
+ tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
+ obj_pos = torch.tensor(pos_list).to(
+ device=device, non_blocking=True
+ )
+ obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
+ obj_pos = self.obj_ptr_tpos_proj(obj_pos)
+ obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
+ else:
+ obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
+ if self.mem_dim < C:
+ # split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C
+ obj_ptrs = obj_ptrs.reshape(
+ -1, B, C // self.mem_dim, self.mem_dim
+ )
+ obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
+ obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
+ to_cat_memory.append(obj_ptrs)
+ to_cat_memory_pos_embed.append(obj_pos)
+ num_obj_ptr_tokens = obj_ptrs.shape[0]
+ else:
+ num_obj_ptr_tokens = 0
+ else:
+ # for initial conditioning frames, encode them without using any previous memory
+ if self.directly_add_no_mem_embed:
+ # directly add no-mem embedding (instead of using the transformer encoder)
+ pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
+ pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
+ return pix_feat_with_mem
+
+ # Use a dummy token on the first frame (to avoid empty memory input to tranformer encoder)
+ to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
+ to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]
+
+ # Step 2: Concatenate the memories and forward through the transformer encoder
+ memory = torch.cat(to_cat_memory, dim=0)
+ memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
+
+ pix_feat_with_mem = self.memory_attention(
+ curr=current_vision_feats,
+ curr_pos=current_vision_pos_embeds,
+ memory=memory,
+ memory_pos=memory_pos_embed,
+ num_obj_ptr_tokens=num_obj_ptr_tokens,
+ )
+ # reshape the output (HW)BC => BCHW
+ pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
+ return pix_feat_with_mem
+
+ def _encode_new_memory(
+ self,
+ current_vision_feats,
+ feat_sizes,
+ pred_masks_high_res,
+ object_score_logits,
+ is_mask_from_pts,
+ ):
+ """Encode the current image and its prediction into a memory feature."""
+ B = current_vision_feats[-1].size(1) # batch size on this frame
+ C = self.hidden_dim
+ H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
+ # top-level feature, (HW)BC => BCHW
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+ if self.non_overlap_masks_for_mem_enc and not self.training:
+ # optionally, apply non-overlapping constraints to the masks (it's applied
+ # in the batch dimension and should only be used during eval, where all
+ # the objects come from the same video under batch size 1).
+ pred_masks_high_res = self._apply_non_overlapping_constraints(
+ pred_masks_high_res
+ )
+ # scale the raw mask logits with a temperature before applying sigmoid
+ binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
+ if binarize and not self.training:
+ mask_for_mem = (pred_masks_high_res > 0).float()
+ else:
+ # apply sigmoid on the raw mask logits to turn them into range (0, 1)
+ mask_for_mem = torch.sigmoid(pred_masks_high_res)
+ # apply scale and bias terms to the sigmoid probabilities
+ if self.sigmoid_scale_for_mem_enc != 1.0:
+ mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
+ if self.sigmoid_bias_for_mem_enc != 0.0:
+ mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
+ maskmem_out = self.memory_encoder(
+ pix_feat, mask_for_mem, skip_mask_sigmoid=True # sigmoid already applied
+ )
+ maskmem_features = maskmem_out["vision_features"]
+ maskmem_pos_enc = maskmem_out["vision_pos_enc"]
+ # add a no-object embedding to the spatial memory to indicate that the frame
+ # is predicted to be occluded (i.e. no object is appearing in the frame)
+ if self.no_obj_embed_spatial is not None:
+ is_obj_appearing = (object_score_logits > 0).float()
+ maskmem_features += (
+ 1 - is_obj_appearing[..., None, None]
+ ) * self.no_obj_embed_spatial[..., None, None].expand(
+ *maskmem_features.shape
+ )
+
+ return maskmem_features, maskmem_pos_enc
+
+ def _track_step(
+ self,
+ frame_idx,
+ is_init_cond_frame,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ point_inputs,
+ mask_inputs,
+ output_dict,
+ num_frames,
+ track_in_reverse,
+ prev_sam_mask_logits,
+ ):
+ current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
+ # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
+ if len(current_vision_feats) > 1:
+ high_res_features = [
+ x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
+ for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
+ ]
+ else:
+ high_res_features = None
+ if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
+ # When use_mask_input_as_output_without_sam=True, we directly output the mask input
+ # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0)
+ pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
+ sam_outputs = self._use_mask_as_output(
+ pix_feat, high_res_features, mask_inputs
+ )
+ else:
+ # fused the visual feature with previous memory features in the memory bank
+ pix_feat = self._prepare_memory_conditioned_features(
+ frame_idx=frame_idx,
+ is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats[-1:],
+ current_vision_pos_embeds=current_vision_pos_embeds[-1:],
+ feat_sizes=feat_sizes[-1:],
+ output_dict=output_dict,
+ num_frames=num_frames,
+ track_in_reverse=track_in_reverse,
+ )
+ # apply SAM-style segmentation head
+ # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder,
+ # e.g. in demo where such logits come from earlier interaction instead of correction sampling
+ # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead)
+ if prev_sam_mask_logits is not None:
+ assert point_inputs is not None and mask_inputs is None
+ mask_inputs = prev_sam_mask_logits
+ multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
+ sam_outputs = self._forward_sam_heads(
+ backbone_features=pix_feat,
+ point_inputs=point_inputs,
+ mask_inputs=mask_inputs,
+ high_res_features=high_res_features,
+ multimask_output=multimask_output,
+ )
+
+ return current_out, sam_outputs, high_res_features, pix_feat
+
+ def _encode_memory_in_output(
+ self,
+ current_vision_feats,
+ feat_sizes,
+ point_inputs,
+ run_mem_encoder,
+ high_res_masks,
+ object_score_logits,
+ current_out,
+ ):
+ if run_mem_encoder and self.num_maskmem > 0:
+ high_res_masks_for_mem_enc = high_res_masks
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ current_vision_feats=current_vision_feats,
+ feat_sizes=feat_sizes,
+ pred_masks_high_res=high_res_masks_for_mem_enc,
+ object_score_logits=object_score_logits,
+ is_mask_from_pts=(point_inputs is not None),
+ )
+ current_out["maskmem_features"] = maskmem_features
+ current_out["maskmem_pos_enc"] = maskmem_pos_enc
+ else:
+ current_out["maskmem_features"] = None
+ current_out["maskmem_pos_enc"] = None
+
def track_step(
    self,
    frame_idx,
    is_init_cond_frame,
    current_vision_feats,
    current_vision_pos_embeds,
    feat_sizes,
    point_inputs,
    mask_inputs,
    output_dict,
    num_frames,
    track_in_reverse=False,  # tracking in reverse time order (for demo usage)
    # Whether to run the memory encoder on the predicted masks. Sometimes we might want
    # to skip the memory encoder with `run_mem_encoder=False`. For example,
    # in demo we might call `track_step` multiple times for each user click,
    # and only encode the memory when the user finalizes their clicks. And in ablation
    # settings like SAM training on static images, we don't need the memory encoder.
    run_mem_encoder=True,
    # The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
    prev_sam_mask_logits=None,
):
    """Run one tracking step on a frame and return its output dict.

    The returned dict starts from what ``self._track_step`` produced and gains
    ``"pred_masks"``, ``"pred_masks_high_res"`` and ``"obj_ptr"``. Outside
    training it also gains ``"object_score_logits"``. The memory entries are
    written by ``self._encode_memory_in_output``.
    """
    current_out, sam_outputs, _, _ = self._track_step(
        frame_idx,
        is_init_cond_frame,
        current_vision_feats,
        current_vision_pos_embeds,
        feat_sizes,
        point_inputs,
        mask_inputs,
        output_dict,
        num_frames,
        track_in_reverse,
        prev_sam_mask_logits,
    )

    # Only the last four of the seven SAM outputs are needed here.
    _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

    current_out.update(
        pred_masks=low_res_masks,
        pred_masks_high_res=high_res_masks,
        obj_ptr=obj_ptr,
    )
    if not self.training:
        # Only add this in inference (to avoid unused param in activation checkpointing;
        # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
        current_out["object_score_logits"] = object_score_logits

    # Finally encode the predicted mask into a new memory feature
    # (that can be used in future frames).
    self._encode_memory_in_output(
        current_vision_feats,
        feat_sizes,
        point_inputs,
        run_mem_encoder,
        high_res_masks,
        object_score_logits,
        current_out,
    )
    return current_out
+
+ def _use_multimask(self, is_init_cond_frame, point_inputs):
+ """Whether to use multimask output in the SAM head."""
+ num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
+ multimask_output = (
+ self.multimask_output_in_sam
+ and (is_init_cond_frame or self.multimask_output_for_tracking)
+ and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
+ )
+ return multimask_output
+
+ def _apply_non_overlapping_constraints(self, pred_masks):
+ """
+ Apply non-overlapping constraints to the object scores in pred_masks. Here we
+ keep only the highest scoring object at each spatial location in pred_masks.
+ """
+ batch_size = pred_masks.size(0)
+ if batch_size == 1:
+ return pred_masks
+
+ device = pred_masks.device
+ # "max_obj_inds": object index of the object with the highest score at each location
+ max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
+ # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
+ batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
+ keep = max_obj_inds == batch_obj_inds
+ # suppress overlapping regions' scores below -10.0 so that the foreground regions
+ # don't overlap (here sigmoid(-10.0)=4.5398e-05)
+ pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
+ return pred_masks
diff --git a/sam2_repo/sam2/modeling/sam2_utils.py b/sam2_repo/sam2/modeling/sam2_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e16caae3a9a49e451b2d03d1ee60c47f8e9ed23c
--- /dev/null
+++ b/sam2_repo/sam2/modeling/sam2_utils.py
@@ -0,0 +1,323 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import copy
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from sam2.utils.misc import mask_to_box
+
+
def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num):
    """
    Pick at most `max_cond_frame_num` entries of `cond_frame_outputs` that are
    temporally closest to the frame at `frame_idx`. The pick order is
    - a) the closest conditioning frame before `frame_idx` (if any);
    - b) the closest conditioning frame at or after `frame_idx` (if any);
    - c) the remaining frames by increasing distance to `frame_idx`, until
      `max_cond_frame_num` frames are selected.

    With `max_cond_frame_num == -1`, or when there are no more than
    `max_cond_frame_num` conditioning frames, `cond_frame_outputs` itself is
    returned as the selection.

    Outputs:
    - selected_outputs: selected items (keys & values) from `cond_frame_outputs`.
    - unselected_outputs: items (keys & values) not selected in `cond_frame_outputs`.
    """
    if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num:
        return cond_frame_outputs, {}

    assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames"
    selected_outputs = {}

    # a) the closest conditioning frame before `frame_idx` (if any)
    earlier = [t for t in cond_frame_outputs if t < frame_idx]
    if earlier:
        t_before = max(earlier)
        selected_outputs[t_before] = cond_frame_outputs[t_before]

    # b) the closest conditioning frame at or after `frame_idx` (if any)
    later = [t for t in cond_frame_outputs if t >= frame_idx]
    if later:
        t_after = min(later)
        selected_outputs[t_after] = cond_frame_outputs[t_after]

    # c) fill the remaining slots with the nearest leftover frames
    slots_left = max_cond_frame_num - len(selected_outputs)
    leftovers = [t for t in cond_frame_outputs if t not in selected_outputs]
    leftovers.sort(key=lambda t: abs(t - frame_idx))
    for t in leftovers[:slots_left]:
        selected_outputs[t] = cond_frame_outputs[t]

    unselected_outputs = {}
    for t, value in cond_frame_outputs.items():
        if t not in selected_outputs:
            unselected_outputs[t] = value

    return selected_outputs, unselected_outputs
+
+
def get_1d_sine_pe(pos_inds, dim, temperature=10000):
    """
    Get 1D sine positional embedding as in the original Transformer paper.

    The result appends a trailing axis of size `2 * (dim // 2)` to `pos_inds`:
    the first half holds the sines, the second half the cosines.
    """
    half_dim = dim // 2
    freq_idx = torch.arange(half_dim, dtype=torch.float32, device=pos_inds.device)
    exponent = 2 * (freq_idx // 2) / half_dim
    divisor = temperature**exponent

    angles = pos_inds.unsqueeze(-1) / divisor
    return torch.cat([angles.sin(), angles.cos()], dim=-1)
+
+
def get_activation_fn(activation):
    """Return the functional activation named by ``activation``.

    Supported names are ``"relu"``, ``"gelu"`` and ``"glu"``.

    Raises:
        RuntimeError: if ``activation`` is not one of the supported names.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    # The message lists every accepted name, including "glu".
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
+
+
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones
+
+
class DropPath(nn.Module):
    """Randomly zero whole samples (entries along dim 0) during training.

    Each sample is kept with probability ``1 - drop_prob``. When
    ``scale_by_keep`` is set, kept samples are divided by the keep
    probability. In eval mode, or with ``drop_prob == 0``, the input is
    returned as is.
    """

    # adapted from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
    def __init__(self, drop_prob=0.0, scale_by_keep=True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        is_active = self.training and self.drop_prob != 0.0
        if not is_active:
            return x

        keep_prob = 1 - self.drop_prob
        # one Bernoulli draw per sample, broadcast over all remaining dims
        mask_shape = (x.shape[0], *([1] * (x.ndim - 1)))
        keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
        if self.scale_by_keep and keep_prob > 0.0:
            keep_mask.div_(keep_prob)
        return x * keep_mask
+
+
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
class MLP(nn.Module):
    """Stack of ``num_layers`` linear layers with an activation in between.

    The activation is applied after every layer except the last. With
    ``sigmoid_output`` set, a sigmoid is applied to the final output.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        activation: nn.Module = nn.ReLU,
        sigmoid_output: bool = False,
    ) -> None:
        super().__init__()
        self.num_layers = num_layers
        hidden_dims = [hidden_dim] * (num_layers - 1)
        in_dims = [input_dim] + hidden_dims
        out_dims = hidden_dims + [output_dim]
        self.layers = nn.ModuleList(
            [nn.Linear(d_in, d_out) for d_in, d_out in zip(in_dims, out_dims)]
        )
        self.sigmoid_output = sigmoid_output
        # `activation` is a module class; it is instantiated here
        self.act = activation()

    def forward(self, x):
        last_idx = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last_idx:
                x = self.act(x)
        if self.sigmoid_output:
            x = F.sigmoid(x)
        return x
+
+
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
    """Normalize over dim 1, then apply a learned per-channel scale and shift.

    The mean and (biased) variance are taken over dim 1 at every position.
    ``weight`` and ``bias`` have ``num_channels`` entries and are broadcast
    over two trailing dimensions.
    """

    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mean = x.mean(1, keepdim=True)
        centered = x - mean
        variance = centered.pow(2).mean(1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.eps)
        scale = self.weight[:, None, None]
        shift = self.bias[:, None, None]
        return scale * normalized + shift
+
+
def sample_box_points(
    masks: torch.Tensor,
    noise: float = 0.1,  # SAM default
    noise_bound: int = 20,  # SAM default
    top_left_label: int = 2,
    bottom_right_label: int = 3,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Sample a noised version of the top left and bottom right corners of the
    bounding box of each mask in `masks`.

    Inputs:
    - masks: [B, 1, H, W] masks, dtype=torch.Tensor
    - noise: noise as a fraction of box width and height, dtype=float
    - noise_bound: maximum amount of noise (in pure pixels), dtype=int
    - top_left_label: label assigned to the top left corner point
    - bottom_right_label: label assigned to the bottom right corner point

    Returns:
    - box_coords: [B, num_pt, 2], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.float
    - box_labels: [B, num_pt], label 2 is reserved for top left and 3 for bottom right corners, dtype=torch.int32
    """
    device = masks.device
    box_coords = mask_to_box(masks)
    B, _, H, W = masks.shape
    box_labels = torch.tensor(
        [top_left_label, bottom_right_label], dtype=torch.int, device=device
    ).repeat(B)
    if noise > 0.0:
        if not isinstance(noise_bound, torch.Tensor):
            noise_bound = torch.tensor(noise_bound, device=device)
        bbox_w = box_coords[..., 2] - box_coords[..., 0]
        bbox_h = box_coords[..., 3] - box_coords[..., 1]
        # the per-axis noise amplitude is a fraction of the box size, capped by `noise_bound`
        max_dx = torch.min(bbox_w * noise, noise_bound)
        max_dy = torch.min(bbox_h * noise, noise_bound)
        # uniform noise in [-1, 1), scaled per coordinate
        box_noise = 2 * torch.rand(B, 1, 4, device=device) - 1
        box_noise = box_noise * torch.stack((max_dx, max_dy, max_dx, max_dy), dim=-1)

        box_coords = box_coords + box_noise
        img_bounds = (
            torch.tensor([W, H, W, H], device=device) - 1
        )  # uncentered pixel coords
        box_coords.clamp_(torch.zeros_like(img_bounds), img_bounds)  # In place clamping

    box_coords = box_coords.reshape(-1, 2, 2)  # always 2 points
    box_labels = box_labels.reshape(-1, 2)
    return box_coords, box_labels
+
+
def sample_random_points_from_errors(gt_masks, pred_masks, num_pt=1):
    """
    Draw `num_pt` random points (with labels) independently from the error
    regions between `gt_masks` and `pred_masks`.

    Inputs:
    - gt_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool
    - pred_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool or None
      (None is treated as an all-empty prediction)
    - num_pt: int, number of points to sample independently for each of the B error maps

    Outputs:
    - points: [B, num_pt, 2], dtype=torch.float, contains (x, y) coordinates of each sampled point
    - labels: [B, num_pt], dtype=torch.int32, where 1 means positive clicks and 0 means
      negative clicks
    """
    if pred_masks is None:
        pred_masks = torch.zeros_like(gt_masks)
    assert gt_masks.dtype == torch.bool and gt_masks.size(1) == 1
    assert pred_masks.dtype == torch.bool and pred_masks.shape == gt_masks.shape
    assert num_pt >= 0

    batch, _, height, width = gt_masks.shape
    device = gt_masks.device

    # False positives: a click here gets a negative label to correct the FP error.
    false_pos = ~gt_masks & pred_masks
    # False negatives: a click here gets a positive label to correct the FN error.
    false_neg = gt_masks & ~pred_masks
    # Per-mask flag telling whether the prediction matches the ground truth exactly.
    exact_match = torch.all((gt_masks == pred_masks).flatten(2), dim=2)
    exact_match = exact_match[..., None, None]

    # Random scores per pixel: channel 0 is the FP map, channel 1 the FN map.
    scores = torch.rand(batch, num_pt, height, width, 2, device=device)
    # Scores outside the candidate regions are zeroed, so the argmax lands in
    # an FP region (negative click) or an FN region (positive click). If the
    # prediction is entirely correct, a negative click is sampled from the
    # background instead.
    scores[..., 0] *= false_pos | (exact_match & ~gt_masks)
    scores[..., 1] *= false_neg
    flat_idx = scores.flatten(2).argmax(dim=2)

    # The lowest bit of the flat index is the channel, i.e. the label.
    labels = (flat_idx % 2).to(torch.int32)
    pixel_idx = flat_idx // 2
    col = pixel_idx % width
    row = pixel_idx // width
    points = torch.stack([col, row], dim=2).to(torch.float)
    return points, labels
+
+
def sample_one_point_from_error_center(gt_masks, pred_masks, padding=True):
    """
    Sample 1 point (along with its label) from the center of the error regions,
    that is, the point with the largest distance to the boundary of its error
    region. The false-negative and false-positive regions are compared and the
    one with the larger maximum distance provides the point.
    This is the RITM sampling method from https://github.com/saic-vul/ritm_interactive_segmentation/blob/master/isegm/inference/clicker.py

    Inputs:
    - gt_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool
    - pred_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool or None
      (None is treated as an all-empty prediction)
    - padding: if True, pad with boundary of 1 px for distance transform

    Outputs:
    - points: [B, 1, 2], dtype=torch.float, contains (x, y) coordinates of each sampled point
    - labels: [B, 1], dtype=torch.int32, where 1 means positive clicks and 0 means negative clicks
    """
    import cv2

    if pred_masks is None:
        pred_masks = torch.zeros_like(gt_masks)
    assert gt_masks.dtype == torch.bool and gt_masks.size(1) == 1
    assert pred_masks.dtype == torch.bool and pred_masks.shape == gt_masks.shape

    batch, _, _, width = gt_masks.shape
    device = gt_masks.device

    # False positives get a negative click, false negatives a positive one.
    false_pos = (~gt_masks & pred_masks).cpu().numpy()
    false_neg = (gt_masks & ~pred_masks).cpu().numpy()

    def boundary_distance(region):
        # Flattened distance of every pixel of `region` to the region boundary.
        if padding:
            region = np.pad(region, ((1, 1), (1, 1)), "constant")
        dist = cv2.distanceTransform(region.astype(np.uint8), cv2.DIST_L2, 0)
        if padding:
            dist = dist[1:-1, 1:-1]
        return dist.reshape(-1)

    # Results are assembled on the CPU and moved to `device` at the end.
    points = torch.zeros(batch, 1, 2, dtype=torch.float)
    labels = torch.ones(batch, 1, dtype=torch.int32)
    for b in range(batch):
        fn_dist = boundary_distance(false_neg[b, 0])
        fp_dist = boundary_distance(false_pos[b, 0])

        # Deepest point of each region; the FN point wins only if strictly deeper.
        fn_best = np.argmax(fn_dist)
        fp_best = np.argmax(fp_dist)
        is_positive = fn_dist[fn_best] > fp_dist[fp_best]
        chosen = fn_best if is_positive else fp_best
        points[b, 0, 0] = chosen % width  # x
        points[b, 0, 1] = chosen // width  # y
        labels[b, 0] = int(is_positive)

    return points.to(device), labels.to(device)
+
+
def get_next_point(gt_masks, pred_masks, method):
    """Sample the next correction click with the sampler named by `method`.

    `"uniform"` delegates to `sample_random_points_from_errors` and `"center"`
    to `sample_one_point_from_error_center`; any other value raises ValueError.
    """
    if method not in ("uniform", "center"):
        raise ValueError(f"unknown sampling method {method}")
    if method == "uniform":
        return sample_random_points_from_errors(gt_masks, pred_masks)
    return sample_one_point_from_error_center(gt_masks, pred_masks)
diff --git a/sam2_repo/sam2/sam2_hiera_b+.yaml b/sam2_repo/sam2/sam2_hiera_b+.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0f435af02fc88e2d3b7bff06f8cf8013cc079c24
--- /dev/null
+++ b/sam2_repo/sam2/sam2_hiera_b+.yaml
@@ -0,0 +1,113 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 112
+ num_heads: 2
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [896, 448, 224, 112]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory: on initial conditioning frames, add the no-memory embedding directly (no memory attention pass)
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/sam2_hiera_l.yaml b/sam2_repo/sam2/sam2_hiera_l.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1092802b1d24be6fedf78939f45b0d021d4ec560
--- /dev/null
+++ b/sam2_repo/sam2/sam2_hiera_l.yaml
@@ -0,0 +1,117 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 144
+ num_heads: 2
+ stages: [2, 6, 36, 4]
+ global_att_blocks: [23, 33, 43]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ window_spec: [8, 4, 16, 8]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [1152, 576, 288, 144]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/sam2_hiera_s.yaml b/sam2_repo/sam2/sam2_hiera_s.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..174e414f1467d80e94a34e9525dc373058f8caaa
--- /dev/null
+++ b/sam2_repo/sam2/sam2_hiera_s.yaml
@@ -0,0 +1,116 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 11, 2]
+ global_att_blocks: [7, 10, 13]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/sam2_hiera_t.yaml b/sam2_repo/sam2/sam2_hiera_t.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..121447aabd5318fac20efc2bc00d7c406ca26f01
--- /dev/null
+++ b/sam2_repo/sam2/sam2_hiera_t.yaml
@@ -0,0 +1,118 @@
+# @package _global_
+
+# Model
+model:
+ _target_: sam2.modeling.sam2_base.SAM2Base
+ image_encoder:
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+ scalp: 1
+ trunk:
+ _target_: sam2.modeling.backbones.hieradet.Hiera
+ embed_dim: 96
+ num_heads: 1
+ stages: [1, 2, 7, 2]
+ global_att_blocks: [5, 7, 9]
+ window_pos_embed_bkg_spatial_size: [7, 7]
+ neck:
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 256
+ normalize: true
+ scale: null
+ temperature: 10000
+ d_model: 256
+ backbone_channel_list: [768, 384, 192, 96]
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
+ fpn_interp_model: nearest
+
+ memory_attention:
+ _target_: sam2.modeling.memory_attention.MemoryAttention
+ d_model: 256
+ pos_enc_at_input: true
+ layer:
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+ activation: relu
+ dim_feedforward: 2048
+ dropout: 0.1
+ pos_enc_at_attn: false
+ self_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ d_model: 256
+ pos_enc_at_cross_attn_keys: true
+ pos_enc_at_cross_attn_queries: false
+ cross_attention:
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
+ rope_theta: 10000.0
+ feat_sizes: [64, 64]
+ rope_k_repeat: True
+ embedding_dim: 256
+ num_heads: 1
+ downsample_rate: 1
+ dropout: 0.1
+ kv_in_dim: 64
+ num_layers: 4
+
+ memory_encoder:
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
+ out_dim: 64
+ position_encoding:
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+ num_pos_feats: 64
+ normalize: true
+ scale: null
+ temperature: 10000
+ mask_downsampler:
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
+ kernel_size: 3
+ stride: 2
+ padding: 1
+ fuser:
+ _target_: sam2.modeling.memory_encoder.Fuser
+ layer:
+ _target_: sam2.modeling.memory_encoder.CXBlock
+ dim: 256
+ kernel_size: 7
+ padding: 3
+ layer_scale_init_value: 1e-6
+ use_dwconv: True # depth-wise convs
+ num_layers: 2
+
+ num_maskmem: 7
+ image_size: 1024
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+ # SAM decoder
+ sigmoid_scale_for_mem_enc: 20.0
+ sigmoid_bias_for_mem_enc: -10.0
+ use_mask_input_as_output_without_sam: true
+ # Memory
+ directly_add_no_mem_embed: true
+ # use high-resolution feature map in the SAM mask decoder
+ use_high_res_features_in_sam: true
+ # output 3 masks on the first click on initial conditioning frames
+ multimask_output_in_sam: true
+ # SAM heads
+ iou_prediction_use_sigmoid: True
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+ use_obj_ptrs_in_encoder: true
+ add_tpos_enc_to_obj_ptrs: false
+ only_obj_ptrs_in_the_past_for_eval: true
+ # object occlusion prediction
+ pred_obj_scores: true
+ pred_obj_scores_mlp: true
+ fixed_no_obj_ptr: true
+ # multimask tracking settings
+ multimask_output_for_tracking: true
+ use_multimask_token_for_obj_ptr: true
+ multimask_min_pt_num: 0
+ multimask_max_pt_num: 1
+ use_mlp_for_obj_ptr_proj: true
+ # Compilation flag
+ # HieraT does not currently support compilation, should always be set to False
+ compile_image_encoder: False
diff --git a/sam2_repo/sam2/sam2_image_predictor.py b/sam2_repo/sam2/sam2_image_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ce53af5924504c07216df52b2d2eefaeec7ae9
--- /dev/null
+++ b/sam2_repo/sam2/sam2_image_predictor.py
@@ -0,0 +1,466 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL.Image import Image
+
+from sam2.modeling.sam2_base import SAM2Base
+
+from sam2.utils.transforms import SAM2Transforms
+
+
+class SAM2ImagePredictor:
+ def __init__(
+ self,
+ sam_model: SAM2Base,
+ mask_threshold=0.0,
+ max_hole_area=0.0,
+ max_sprinkle_area=0.0,
+ **kwargs,
+ ) -> None:
+ """
+ Uses SAM-2 to calculate the image embedding for an image, and then
+ allow repeated, efficient mask prediction given prompts.
+
+ Arguments:
+ sam_model (Sam-2): The model to use for mask prediction.
+ mask_threshold (float): The threshold to use when converting mask logits
+ to binary masks. Masks are thresholded at 0 by default.
+ max_hole_area (int): If max_hole_area > 0, we fill small holes in up to
+ the maximum area of max_hole_area in low_res_masks.
+ max_sprinkle_area (int): If max_sprinkle_area > 0, we remove small sprinkles up to
+ the maximum area of max_sprinkle_area in low_res_masks.
+ """
+ super().__init__()
+ self.model = sam_model
+ self._transforms = SAM2Transforms(
+ resolution=self.model.image_size,
+ mask_threshold=mask_threshold,
+ max_hole_area=max_hole_area,
+ max_sprinkle_area=max_sprinkle_area,
+ )
+
+ # Predictor state
+ self._is_image_set = False
+ self._features = None
+ self._orig_hw = None
+ # Whether the predictor is set for single image or a batch of images
+ self._is_batch = False
+
+ # Predictor config
+ self.mask_threshold = mask_threshold
+
+ # Spatial dim for backbone feature maps
+ self._bb_feat_sizes = [
+ (256, 256),
+ (128, 128),
+ (64, 64),
+ ]
+
+ @classmethod
+ def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2ImagePredictor":
+ """
+ Load a pretrained model from the Hugging Face hub.
+
+ Arguments:
+ model_id (str): The Hugging Face repository ID.
+ **kwargs: Additional arguments to pass to the model constructor.
+
+ Returns:
+ (SAM2ImagePredictor): The loaded model.
+ """
+ from sam2.build_sam import build_sam2_hf
+
+ sam_model = build_sam2_hf(model_id, **kwargs)
+ return cls(sam_model, **kwargs)
+
+ @torch.no_grad()
+ def set_image(
+ self,
+ image: Union[np.ndarray, Image],
+ ) -> None:
+ """
+ Calculates the image embeddings for the provided image, allowing
+ masks to be predicted with the 'predict' method.
+
+ Arguments:
+ image (np.ndarray or PIL Image): The input image to embed in RGB format. The image should be in HWC format if np.ndarray, or WHC format if PIL Image
+ with pixel values in [0, 255].
+ image_format (str): The color format of the image, in ['RGB', 'BGR'].
+ """
+ self.reset_predictor()
+ # Transform the image to the form expected by the model
+ if isinstance(image, np.ndarray):
+ logging.info("For numpy array image, we assume (HxWxC) format")
+ self._orig_hw = [image.shape[:2]]
+ elif isinstance(image, Image):
+ w, h = image.size
+ self._orig_hw = [(h, w)]
+ else:
+ raise NotImplementedError("Image format not supported")
+
+ input_image = self._transforms(image)
+ input_image = input_image[None, ...].to(self.device)
+
+ assert (
+ len(input_image.shape) == 4 and input_image.shape[1] == 3
+ ), f"input_image must be of size 1x3xHxW, got {input_image.shape}"
+ logging.info("Computing image embeddings for the provided image...")
+ backbone_out = self.model.forward_image(input_image)
+ _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out)
+ # Add no_mem_embed, which is added to the lowest rest feat. map during training on videos
+ if self.model.directly_add_no_mem_embed:
+ vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed
+
+ feats = [
+ feat.permute(1, 2, 0).view(1, -1, *feat_size)
+ for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1])
+ ][::-1]
+ self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
+ self._is_image_set = True
+ logging.info("Image embeddings computed.")
+
+ @torch.no_grad()
+ def set_image_batch(
+ self,
+ image_list: List[Union[np.ndarray]],
+ ) -> None:
+ """
+ Calculates the image embeddings for the provided image batch, allowing
+ masks to be predicted with the 'predict_batch' method.
+
+ Arguments:
+ image_list (List[np.ndarray]): The input images to embed in RGB format. The image should be in HWC format if np.ndarray
+ with pixel values in [0, 255].
+ """
+ self.reset_predictor()
+ assert isinstance(image_list, list)
+ self._orig_hw = []
+ for image in image_list:
+ assert isinstance(
+ image, np.ndarray
+ ), "Images are expected to be an np.ndarray in RGB format, and of shape HWC"
+ self._orig_hw.append(image.shape[:2])
+ # Transform the image to the form expected by the model
+ img_batch = self._transforms.forward_batch(image_list)
+ img_batch = img_batch.to(self.device)
+ batch_size = img_batch.shape[0]
+ assert (
+ len(img_batch.shape) == 4 and img_batch.shape[1] == 3
+ ), f"img_batch must be of size Bx3xHxW, got {img_batch.shape}"
+ logging.info("Computing image embeddings for the provided images...")
+ backbone_out = self.model.forward_image(img_batch)
+ _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out)
+ # Add no_mem_embed, which is added to the lowest rest feat. map during training on videos
+ if self.model.directly_add_no_mem_embed:
+ vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed
+
+ feats = [
+ feat.permute(1, 2, 0).view(batch_size, -1, *feat_size)
+ for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1])
+ ][::-1]
+ self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
+ self._is_image_set = True
+ self._is_batch = True
+ logging.info("Image embeddings computed.")
+
+ def predict_batch(
+ self,
+ point_coords_batch: List[np.ndarray] = None,
+ point_labels_batch: List[np.ndarray] = None,
+ box_batch: List[np.ndarray] = None,
+ mask_input_batch: List[np.ndarray] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ normalize_coords=True,
+ ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
+ """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
+ It returns a tuple of lists of masks, ious, and low_res_masks_logits.
+ """
+ assert self._is_batch, "This function should only be used when in batched mode"
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image_batch(...) before mask prediction."
+ )
+ num_images = len(self._features["image_embed"])
+ all_masks = []
+ all_ious = []
+ all_low_res_masks = []
+ for img_idx in range(num_images):
+ # Transform input prompts
+ point_coords = (
+ point_coords_batch[img_idx] if point_coords_batch is not None else None
+ )
+ point_labels = (
+ point_labels_batch[img_idx] if point_labels_batch is not None else None
+ )
+ box = box_batch[img_idx] if box_batch is not None else None
+ mask_input = (
+ mask_input_batch[img_idx] if mask_input_batch is not None else None
+ )
+ mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts(
+ point_coords,
+ point_labels,
+ box,
+ mask_input,
+ normalize_coords,
+ img_idx=img_idx,
+ )
+ masks, iou_predictions, low_res_masks = self._predict(
+ unnorm_coords,
+ labels,
+ unnorm_box,
+ mask_input,
+ multimask_output,
+ return_logits=return_logits,
+ img_idx=img_idx,
+ )
+ masks_np = masks.squeeze(0).float().detach().cpu().numpy()
+ iou_predictions_np = (
+ iou_predictions.squeeze(0).float().detach().cpu().numpy()
+ )
+ low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy()
+ all_masks.append(masks_np)
+ all_ious.append(iou_predictions_np)
+ all_low_res_masks.append(low_res_masks_np)
+
+ return all_masks, all_ious, all_low_res_masks
+
+ def predict(
+ self,
+ point_coords: Optional[np.ndarray] = None,
+ point_labels: Optional[np.ndarray] = None,
+ box: Optional[np.ndarray] = None,
+ mask_input: Optional[np.ndarray] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ normalize_coords=True,
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """
+ Predict masks for the given input prompts, using the currently set image.
+
+ Arguments:
+ point_coords (np.ndarray or None): A Nx2 array of point prompts to the
+ model. Each point is in (X,Y) in pixels.
+ point_labels (np.ndarray or None): A length N array of labels for the
+ point prompts. 1 indicates a foreground point and 0 indicates a
+ background point.
+ box (np.ndarray or None): A length 4 array given a box prompt to the
+ model, in XYXY format.
+ mask_input (np.ndarray): A low resolution mask input to the model, typically
+ coming from a previous prediction iteration. Has form 1xHxW, where
+ for SAM, H=W=256.
+ multimask_output (bool): If true, the model will return three masks.
+ For ambiguous input prompts (such as a single click), this will often
+ produce better masks than a single prediction. If only a single
+ mask is needed, the model's predicted quality score can be used
+ to select the best mask. For non-ambiguous prompts, such as multiple
+ input prompts, multimask_output=False can give better results.
+ return_logits (bool): If true, returns un-thresholded masks logits
+ instead of a binary mask.
+ normalize_coords (bool): If true, the point coordinates will be normalized to the range [0,1] and point_coords is expected to be wrt. image dimensions.
+
+ Returns:
+ (np.ndarray): The output masks in CxHxW format, where C is the
+ number of masks, and (H, W) is the original image size.
+ (np.ndarray): An array of length C containing the model's
+ predictions for the quality of each mask.
+ (np.ndarray): An array of shape CxHxW, where C is the number
+ of masks and H=W=256. These low resolution logits can be passed to
+ a subsequent iteration as mask input.
+ """
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image(...) before mask prediction."
+ )
+
+ # Transform input prompts
+
+ mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts(
+ point_coords, point_labels, box, mask_input, normalize_coords
+ )
+
+ masks, iou_predictions, low_res_masks = self._predict(
+ unnorm_coords,
+ labels,
+ unnorm_box,
+ mask_input,
+ multimask_output,
+ return_logits=return_logits,
+ )
+
+ masks_np = masks.squeeze(0).float().detach().cpu().numpy()
+ iou_predictions_np = iou_predictions.squeeze(0).float().detach().cpu().numpy()
+ low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy()
+ return masks_np, iou_predictions_np, low_res_masks_np
+
+ def _prep_prompts(
+ self, point_coords, point_labels, box, mask_logits, normalize_coords, img_idx=-1
+ ):
+
+ unnorm_coords, labels, unnorm_box, mask_input = None, None, None, None
+ if point_coords is not None:
+ assert (
+ point_labels is not None
+ ), "point_labels must be supplied if point_coords is supplied."
+ point_coords = torch.as_tensor(
+ point_coords, dtype=torch.float, device=self.device
+ )
+ unnorm_coords = self._transforms.transform_coords(
+ point_coords, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
+ )
+ labels = torch.as_tensor(point_labels, dtype=torch.int, device=self.device)
+ if len(unnorm_coords.shape) == 2:
+ unnorm_coords, labels = unnorm_coords[None, ...], labels[None, ...]
+ if box is not None:
+ box = torch.as_tensor(box, dtype=torch.float, device=self.device)
+ unnorm_box = self._transforms.transform_boxes(
+ box, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx]
+ ) # Bx2x2
+ if mask_logits is not None:
+ mask_input = torch.as_tensor(
+ mask_logits, dtype=torch.float, device=self.device
+ )
+ if len(mask_input.shape) == 3:
+ mask_input = mask_input[None, :, :, :]
+ return mask_input, unnorm_coords, labels, unnorm_box
+
+ @torch.no_grad()
+ def _predict(
+ self,
+ point_coords: Optional[torch.Tensor],
+ point_labels: Optional[torch.Tensor],
+ boxes: Optional[torch.Tensor] = None,
+ mask_input: Optional[torch.Tensor] = None,
+ multimask_output: bool = True,
+ return_logits: bool = False,
+ img_idx: int = -1,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Predict masks for the given input prompts, using the currently set image.
+ Input prompts are batched torch tensors and are expected to already be
+ transformed to the input frame using SAM2Transforms.
+
+ Arguments:
+ point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+ model. Each point is in (X,Y) in pixels.
+ point_labels (torch.Tensor or None): A BxN array of labels for the
+ point prompts. 1 indicates a foreground point and 0 indicates a
+ background point.
+ boxes (np.ndarray or None): A Bx4 array given a box prompt to the
+ model, in XYXY format.
+ mask_input (np.ndarray): A low resolution mask input to the model, typically
+ coming from a previous prediction iteration. Has form Bx1xHxW, where
+ for SAM, H=W=256. Masks returned by a previous iteration of the
+ predict method do not need further transformation.
+ multimask_output (bool): If true, the model will return three masks.
+ For ambiguous input prompts (such as a single click), this will often
+ produce better masks than a single prediction. If only a single
+ mask is needed, the model's predicted quality score can be used
+ to select the best mask. For non-ambiguous prompts, such as multiple
+ input prompts, multimask_output=False can give better results.
+ return_logits (bool): If true, returns un-thresholded masks logits
+ instead of a binary mask.
+
+ Returns:
+ (torch.Tensor): The output masks in BxCxHxW format, where C is the
+ number of masks, and (H, W) is the original image size.
+ (torch.Tensor): An array of shape BxC containing the model's
+ predictions for the quality of each mask.
+ (torch.Tensor): An array of shape BxCxHxW, where C is the number
+ of masks and H=W=256. These low res logits can be passed to
+ a subsequent iteration as mask input.
+ """
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image(...) before mask prediction."
+ )
+
+ if point_coords is not None:
+ concat_points = (point_coords, point_labels)
+ else:
+ concat_points = None
+
+ # Embed prompts
+ if boxes is not None:
+ box_coords = boxes.reshape(-1, 2, 2)
+ box_labels = torch.tensor([[2, 3]], dtype=torch.int, device=boxes.device)
+ box_labels = box_labels.repeat(boxes.size(0), 1)
+ # we merge "boxes" and "points" into a single "concat_points" input (where
+ # boxes are added at the beginning) to sam_prompt_encoder
+ if concat_points is not None:
+ concat_coords = torch.cat([box_coords, concat_points[0]], dim=1)
+ concat_labels = torch.cat([box_labels, concat_points[1]], dim=1)
+ concat_points = (concat_coords, concat_labels)
+ else:
+ concat_points = (box_coords, box_labels)
+
+ sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder(
+ points=concat_points,
+ boxes=None,
+ masks=mask_input,
+ )
+
+ # Predict masks
+ batched_mode = (
+ concat_points is not None and concat_points[0].shape[0] > 1
+ ) # multi object prediction
+ high_res_features = [
+ feat_level[img_idx].unsqueeze(0)
+ for feat_level in self._features["high_res_feats"]
+ ]
+ low_res_masks, iou_predictions, _, _ = self.model.sam_mask_decoder(
+ image_embeddings=self._features["image_embed"][img_idx].unsqueeze(0),
+ image_pe=self.model.sam_prompt_encoder.get_dense_pe(),
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ repeat_image=batched_mode,
+ high_res_features=high_res_features,
+ )
+
+ # Upscale the masks to the original image resolution
+ masks = self._transforms.postprocess_masks(
+ low_res_masks, self._orig_hw[img_idx]
+ )
+ low_res_masks = torch.clamp(low_res_masks, -32.0, 32.0)
+ if not return_logits:
+ masks = masks > self.mask_threshold
+
+ return masks, iou_predictions, low_res_masks
+
+ def get_image_embedding(self) -> torch.Tensor:
+ """
+ Returns the image embeddings for the currently set image, with
+ shape 1xCxHxW, where C is the embedding dimension and (H,W) are
+ the embedding spatial dimension of SAM (typically C=256, H=W=64).
+ """
+ if not self._is_image_set:
+ raise RuntimeError(
+ "An image must be set with .set_image(...) to generate an embedding."
+ )
+ assert (
+ self._features is not None
+ ), "Features must exist if an image has been set."
+ return self._features["image_embed"]
+
+ @property
+ def device(self) -> torch.device:
+ return self.model.device
+
+ def reset_predictor(self) -> None:
+ """
+ Resets the image embeddings and other state variables.
+ """
+ self._is_image_set = False
+ self._features = None
+ self._orig_hw = None
+ self._is_batch = False
diff --git a/sam2_repo/sam2/sam2_video_predictor.py b/sam2_repo/sam2/sam2_video_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a7e1a01c4d6e89db0453ce982ea8a31b16651c8
--- /dev/null
+++ b/sam2_repo/sam2/sam2_video_predictor.py
@@ -0,0 +1,1223 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+from collections import OrderedDict
+
+import torch
+import torch.nn.functional as F
+
+from tqdm import tqdm
+
+from sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base
+from sam2.utils.misc import concat_points, fill_holes_in_mask_scores, load_video_frames
+
+
+class SAM2VideoPredictor(SAM2Base):
+ """The predictor class to handle user interactions and manage inference states."""
+
+    def __init__(
+        self,
+        fill_hole_area=0,
+        non_overlap_masks=False,
+        clear_non_cond_mem_around_input=False,
+        add_all_frames_to_correct_as_cond=False,
+        **kwargs,
+    ):
+        """
+        Store the interaction-related options; everything else goes to the base class.
+
+        Arguments:
+            fill_hole_area: when greater than 0, predicted mask scores are passed
+                through `fill_holes_in_mask_scores` with this value.
+            non_overlap_masks: whether to apply non-overlapping constraints on the
+                output object masks.
+            clear_non_cond_mem_around_input: whether to clear non-conditioning memory
+                of the surrounding frames (which may contain outdated information)
+                after adding correction clicks. Note that this would only apply to
+                *single-object tracking* unless `clear_non_cond_mem_for_multi_obj`
+                is also set to True.
+            add_all_frames_to_correct_as_cond: if True, any frame that receives a
+                later correction click is also appended to the conditioning frame
+                list; if False, the conditioning frame list only contains the
+                initial conditioning frames.
+            **kwargs: forwarded unchanged to the base-class constructor.
+        """
+        super().__init__(**kwargs)
+        # Output post-processing options.
+        self.fill_hole_area = fill_hole_area
+        self.non_overlap_masks = non_overlap_masks
+        # Memory / conditioning-frame policies used when the user adds corrections.
+        self.clear_non_cond_mem_around_input = clear_non_cond_mem_around_input
+        self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
+
+    @torch.inference_mode()
+    def init_state(
+        self,
+        video_path,
+        offload_video_to_cpu=False,
+        offload_state_to_cpu=False,
+        async_loading_frames=False,
+    ):
+        """
+        Load a video and build the inference-state dictionary used by this predictor.
+
+        Arguments:
+            video_path: location of the video, forwarded to `load_video_frames`.
+            offload_video_to_cpu: whether to offload the video frames to CPU memory
+                (saves GPU memory with only a very small overhead).
+            offload_state_to_cpu: whether to offload the inference state to CPU
+                memory (saves GPU memory at the cost of a lower tracking fps; e.g. in
+                a test case of a 768x768 model, fps dropped from 27 to 24 when
+                tracking one object and from 24 to 21 when tracking two objects).
+            async_loading_frames: forwarded to `load_video_frames`.
+
+        Returns:
+            (dict): the freshly created inference state, with the image feature of
+            frame 0 already cached.
+        """
+        compute_device = self.device  # device of the model
+        images, video_height, video_width = load_video_frames(
+            video_path=video_path,
+            image_size=self.image_size,
+            offload_video_to_cpu=offload_video_to_cpu,
+            async_loading_frames=async_loading_frames,
+            compute_device=compute_device,
+        )
+        # Per-frame outputs are stored on CPU only when state offloading is requested.
+        storage_device = torch.device("cpu") if offload_state_to_cpu else compute_device
+
+        inference_state = {
+            "images": images,
+            "num_frames": len(images),
+            "offload_video_to_cpu": offload_video_to_cpu,
+            "offload_state_to_cpu": offload_state_to_cpu,
+            # the original video height and width, used for resizing final output scores
+            "video_height": video_height,
+            "video_width": video_width,
+            "device": compute_device,
+            "storage_device": storage_device,
+            # inputs on each frame
+            "point_inputs_per_obj": {},
+            "mask_inputs_per_obj": {},
+            # visual features on a small number of recently visited frames for quick interactions
+            "cached_features": {},
+            # values that don't change across frames (so we only need to hold one copy of them)
+            "constants": {},
+            # mapping between client-side object id and model-side object index
+            "obj_id_to_idx": OrderedDict(),
+            "obj_idx_to_id": OrderedDict(),
+            "obj_ids": [],
+            # per-object tracking results
+            "output_dict_per_obj": {},
+            # temporary storage for new outputs produced while the user interacts with a
+            # frame (merged into the per-object outputs before propagation starts)
+            "temp_output_dict_per_obj": {},
+            # metadata for each tracked frame (e.g. which direction it was tracked in)
+            "frames_tracked_per_obj": {},
+        }
+        # Warm up the visual backbone and cache the image feature on frame 0
+        self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
+        return inference_state
+
+    @classmethod
+    def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2VideoPredictor":
+        """
+        Build a video predictor from a checkpoint hosted on the Hugging Face hub.
+
+        Arguments:
+            model_id (str): The Hugging Face repository ID.
+            **kwargs: Additional arguments passed on to
+                `build_sam2_video_predictor_hf`.
+
+        Returns:
+            (SAM2VideoPredictor): The loaded model.
+        """
+        # Imported inside the method, exactly as the original does.
+        from sam2.build_sam import build_sam2_video_predictor_hf as build_from_hub
+
+        return build_from_hub(model_id, **kwargs)
+
+    def _obj_id_to_idx(self, inference_state, obj_id):
+        """
+        Map a client-side object id to its model-side object index.
+
+        An id that has not been seen before is registered on the spot: it receives
+        the next free index and empty per-object input/output containers are created
+        for it in `inference_state`. New objects are always accepted, including
+        after tracking has started, so this method never raises for unknown ids.
+
+        Arguments:
+            inference_state: the state dict created by `init_state`.
+            obj_id: the client-side object id (any hashable value).
+
+        Returns:
+            (int): the model-side index of the object.
+        """
+        id_to_idx = inference_state["obj_id_to_idx"]
+        obj_idx = id_to_idx.get(obj_id, None)
+        if obj_idx is not None:
+            return obj_idx
+
+        # The former `allow_new_object` flag was hard-coded to True, which made its
+        # error branch unreachable; new objects are simply always registered.
+        obj_idx = len(id_to_idx)  # next free object slot
+        id_to_idx[obj_id] = obj_idx
+        inference_state["obj_idx_to_id"][obj_idx] = obj_id
+        inference_state["obj_ids"] = list(id_to_idx)
+        # set up input and output structures for this object
+        inference_state["point_inputs_per_obj"][obj_idx] = {}
+        inference_state["mask_inputs_per_obj"][obj_idx] = {}
+        inference_state["output_dict_per_obj"][obj_idx] = {
+            "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+        }
+        inference_state["temp_output_dict_per_obj"][obj_idx] = {
+            "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+        }
+        inference_state["frames_tracked_per_obj"][obj_idx] = {}
+        return obj_idx
+
+    def _obj_idx_to_id(self, inference_state, obj_idx):
+        """Look up the client-side object id registered for a model-side object index."""
+        idx_to_id = inference_state["obj_idx_to_id"]
+        return idx_to_id[obj_idx]
+
+    def _get_obj_num(self, inference_state):
+        """Count the unique object ids registered so far in this session."""
+        registered = inference_state["obj_idx_to_id"]
+        return len(registered)
+
+ @torch.inference_mode()
+ def add_new_points_or_box(
+ self,
+ inference_state,
+ frame_idx,
+ obj_id,
+ points=None,
+ labels=None,
+ clear_old_points=True,
+ normalize_coords=True,
+ box=None,
+ ):
+ """
+ Add point and/or box prompts for one object on one frame and re-predict its mask.
+
+ Arguments:
+ inference_state: the state dict created by `init_state`.
+ frame_idx: index of the frame that receives the prompts.
+ obj_id: client-side object id; an id not seen before is registered as a new object.
+ points: point coordinates in (x, y) order (tensor or array-like); must be passed together with `labels`.
+ labels: one label per point (tensor or array-like).
+ clear_old_points: if True, points previously stored for this frame are discarded; if False, the new points are combined with them via `concat_points`.
+ normalize_coords: if True, the coordinates are divided by the video width/height; the coordinates are scaled by `self.image_size` afterwards.
+ box: optional box prompt holding 4 values; it is reshaped to two corner points that are prepended with labels 2 and 3. Requires `clear_old_points=True`.
+
+ Returns:
+ (frame_idx, obj_ids, video_res_masks), where `video_res_masks` is the output consolidated over all registered objects and resized to the original video resolution.
+
+ Raises:
+ ValueError: if only one of `points`/`labels` is given, if neither `points` nor `box` is given, or if `box` is combined with `clear_old_points=False`.
+ """
+ # Note: the lookup below also registers `obj_id` if it is unknown.
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+ point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+ mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+ if (points is not None) != (labels is not None):
+ raise ValueError("points and labels must be provided together")
+ if points is None and box is None:
+ raise ValueError("at least one of points or box must be provided as input")
+
+ # Normalize the inputs to batched tensors: float32 points and int32 labels
+ # (empty tensors stand in for missing points/labels when only a box is given).
+ if points is None:
+ points = torch.zeros(0, 2, dtype=torch.float32)
+ elif not isinstance(points, torch.Tensor):
+ points = torch.tensor(points, dtype=torch.float32)
+ if labels is None:
+ labels = torch.zeros(0, dtype=torch.int32)
+ elif not isinstance(labels, torch.Tensor):
+ labels = torch.tensor(labels, dtype=torch.int32)
+ if points.dim() == 2:
+ points = points.unsqueeze(0) # add batch dimension
+ if labels.dim() == 1:
+ labels = labels.unsqueeze(0) # add batch dimension
+
+ # If `box` is provided, we add it as the first two points with labels 2 and 3
+ # along with the user-provided points (consistent with how SAM 2 is trained).
+ if box is not None:
+ if not clear_old_points:
+ raise ValueError(
+ "cannot add box without clearing old points, since "
+ "box prompt must be provided before any point prompt "
+ "(please use clear_old_points=True instead)"
+ )
+ if not isinstance(box, torch.Tensor):
+ box = torch.tensor(box, dtype=torch.float32, device=points.device)
+ box_coords = box.reshape(1, 2, 2)
+ box_labels = torch.tensor([2, 3], dtype=torch.int32, device=labels.device)
+ box_labels = box_labels.reshape(1, 2)
+ points = torch.cat([box_coords, points], dim=1)
+ labels = torch.cat([box_labels, labels], dim=1)
+
+ if normalize_coords:
+ video_H = inference_state["video_height"]
+ video_W = inference_state["video_width"]
+ points = points / torch.tensor([video_W, video_H]).to(points.device)
+ # scale the (normalized) coordinates by the model's internal image size
+ points = points * self.image_size
+ points = points.to(inference_state["device"])
+ labels = labels.to(inference_state["device"])
+
+ if not clear_old_points:
+ point_inputs = point_inputs_per_frame.get(frame_idx, None)
+ else:
+ point_inputs = None
+ point_inputs = concat_points(point_inputs, points, labels)
+
+ # Point prompts replace any mask prompt previously stored for this frame.
+ point_inputs_per_frame[frame_idx] = point_inputs
+ mask_inputs_per_frame.pop(frame_idx, None)
+ # If this frame hasn't been tracked before, we treat it as an initial conditioning
+ # frame, meaning that the inputs points are to generate segments on this frame without
+ # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+ # the input points will be used to correct the already tracked masks.
+ obj_frames_tracked = inference_state["frames_tracked_per_obj"][obj_idx]
+ is_init_cond_frame = frame_idx not in obj_frames_tracked
+ # whether to track in reverse time order
+ if is_init_cond_frame:
+ reverse = False
+ else:
+ reverse = obj_frames_tracked[frame_idx]["reverse"]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ # Add a frame to conditioning output if it's an initial conditioning frame or
+ # if the model sees all frames receiving clicks/mask as conditioning frames.
+ is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+ # Get any previously predicted mask logits on this object and feed it along with
+ # the new clicks into the SAM mask decoder.
+ prev_sam_mask_logits = None
+ # lookup temporary output dict first, which contains the most recent output
+ # (if not found, then lookup conditioning and non-conditioning frame output)
+ prev_out = obj_temp_output_dict[storage_key].get(frame_idx)
+ if prev_out is None:
+ prev_out = obj_output_dict["cond_frame_outputs"].get(frame_idx)
+ if prev_out is None:
+ prev_out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx)
+
+ if prev_out is not None and prev_out["pred_masks"] is not None:
+ device = inference_state["device"]
+ prev_sam_mask_logits = prev_out["pred_masks"].to(device, non_blocking=True)
+ # Clamp the scale of prev_sam_mask_logits to avoid rare numerical issues.
+ prev_sam_mask_logits = torch.clamp(prev_sam_mask_logits, -32.0, 32.0)
+ current_out, _ = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict, # run on the slice of a single object
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=is_init_cond_frame,
+ point_inputs=point_inputs,
+ mask_inputs=None,
+ reverse=reverse,
+ # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+ # at the beginning of `propagate_in_video` (after user finalize their clicks). This
+ # allows us to enforce non-overlapping constraints on all objects before encoding
+ # them into memory.
+ run_mem_encoder=False,
+ prev_sam_mask_logits=prev_sam_mask_logits,
+ )
+ # Add the output to the output dict (to be used as future memory)
+ obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+ # Resize the output mask to the original video resolution
+ obj_ids = inference_state["obj_ids"]
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
+    def add_new_points(self, *args, **kwargs):
+        """
+        Deprecated method. Please use `add_new_points_or_box` instead.
+
+        All positional and keyword arguments are forwarded unchanged to
+        `add_new_points_or_box`, and its return value is returned as-is. A
+        `DeprecationWarning` is emitted on every call so that callers are told
+        about the replacement.
+        """
+        warnings.warn(
+            "`add_new_points` is deprecated; please use `add_new_points_or_box` instead.",
+            DeprecationWarning,
+            stacklevel=2,  # attribute the warning to the caller, not to this wrapper
+        )
+        return self.add_new_points_or_box(*args, **kwargs)
+
+ @torch.inference_mode()
+ def add_new_mask(
+ self,
+ inference_state,
+ frame_idx,
+ obj_id,
+ mask,
+ ):
+ """
+ Add a mask prompt for one object on one frame and re-predict its mask.
+
+ Arguments:
+ inference_state: the state dict created by `init_state`.
+ frame_idx: index of the frame that receives the mask.
+ obj_id: client-side object id; an id not seen before is registered as a new object.
+ mask: a 2-D mask (tensor or array-like); non-tensor input is converted to a bool tensor.
+
+ Returns:
+ (frame_idx, obj_ids, video_res_masks), where `video_res_masks` is the output consolidated over all registered objects and resized to the original video resolution.
+ """
+ # Note: the lookup below also registers `obj_id` if it is unknown.
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+ point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+ mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+ if not isinstance(mask, torch.Tensor):
+ mask = torch.tensor(mask, dtype=torch.bool)
+ assert mask.dim() == 2
+ mask_H, mask_W = mask.shape
+ mask_inputs_orig = mask[None, None] # add batch and channel dimension
+ mask_inputs_orig = mask_inputs_orig.float().to(inference_state["device"])
+
+ # resize the mask if it doesn't match the model's image size
+ if mask_H != self.image_size or mask_W != self.image_size:
+ mask_inputs = torch.nn.functional.interpolate(
+ mask_inputs_orig,
+ size=(self.image_size, self.image_size),
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ # re-binarize the interpolated mask at 0.5
+ mask_inputs = (mask_inputs >= 0.5).float()
+ else:
+ mask_inputs = mask_inputs_orig
+
+ # A mask prompt replaces any point prompts previously stored for this frame.
+ mask_inputs_per_frame[frame_idx] = mask_inputs
+ point_inputs_per_frame.pop(frame_idx, None)
+ # If this frame hasn't been tracked before, we treat it as an initial conditioning
+ # frame, meaning that the input mask is used to generate segments on this frame without
+ # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+ # the input mask will be used to correct the already tracked masks.
+ obj_frames_tracked = inference_state["frames_tracked_per_obj"][obj_idx]
+ is_init_cond_frame = frame_idx not in obj_frames_tracked
+ # whether to track in reverse time order
+ if is_init_cond_frame:
+ reverse = False
+ else:
+ reverse = obj_frames_tracked[frame_idx]["reverse"]
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ # Add a frame to conditioning output if it's an initial conditioning frame or
+ # if the model sees all frames receiving clicks/mask as conditioning frames.
+ is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+ storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+ current_out, _ = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict, # run on the slice of a single object
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=is_init_cond_frame,
+ point_inputs=None,
+ mask_inputs=mask_inputs,
+ reverse=reverse,
+ # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+ # at the beginning of `propagate_in_video` (after user finalize their clicks). This
+ # allows us to enforce non-overlapping constraints on all objects before encoding
+ # them into memory.
+ run_mem_encoder=False,
+ )
+ # Add the output to the output dict (to be used as future memory)
+ obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+ # Resize the output mask to the original video resolution
+ obj_ids = inference_state["obj_ids"]
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
+    def _get_orig_video_res_output(self, inference_state, any_res_masks):
+        """
+        Bring mask scores to the original video resolution for final output.
+
+        The scores are moved to the compute device, bilinearly resized to
+        (video_height, video_width) unless they already have that size, and, when
+        `self.non_overlap_masks` is set, passed through
+        `_apply_non_overlapping_constraints`.
+
+        Returns:
+            (any_res_masks, video_res_masks): the input scores on the compute device
+            and their video-resolution counterpart.
+        """
+        target_hw = (inference_state["video_height"], inference_state["video_width"])
+        any_res_masks = any_res_masks.to(inference_state["device"], non_blocking=True)
+
+        video_res_masks = any_res_masks
+        if any_res_masks.shape[-2:] != target_hw:
+            # Only interpolate when the spatial size actually differs.
+            video_res_masks = torch.nn.functional.interpolate(
+                any_res_masks,
+                size=target_hw,
+                mode="bilinear",
+                align_corners=False,
+            )
+        if self.non_overlap_masks:
+            video_res_masks = self._apply_non_overlapping_constraints(video_res_masks)
+        return any_res_masks, video_res_masks
+
+    def _consolidate_temp_output_across_obj(
+        self,
+        inference_state,
+        frame_idx,
+        is_cond,
+        consolidate_at_video_res=False,
+    ):
+        """
+        Merge the per-object outputs on one frame into a single batched mask tensor.
+
+        For every registered object the most recent output on `frame_idx` is looked
+        up, first in `temp_output_dict_per_obj` (under the conditioning or
+        non-conditioning key selected by `is_cond`) and then in
+        `output_dict_per_obj` (conditioning outputs before non-conditioning ones).
+        Objects without any output on this frame keep the NO_OBJ_SCORE placeholder.
+
+        Arguments:
+            inference_state: the state dict created by `init_state`.
+            frame_idx: the frame to consolidate.
+            is_cond: whether to read the temporary conditioning outputs.
+            consolidate_at_video_res: if True, consolidate at the original video
+                resolution and return the result under "pred_masks_video_res";
+                otherwise consolidate at `self.image_size // 4` under "pred_masks".
+
+        Returns:
+            (dict): a single-entry dict mapping the mask key to a float32 tensor of
+            shape (num_objects, 1, H, W) on the storage device.
+        """
+        num_objs = self._get_obj_num(inference_state)
+        storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+        # Optionally consolidate at the original video resolution (to provide a
+        # better editing experience for mask prompts).
+        if consolidate_at_video_res:
+            out_hw = (inference_state["video_height"], inference_state["video_width"])
+            mask_key = "pred_masks_video_res"
+        else:
+            side = self.image_size // 4
+            out_hw = (side, side)
+            mask_key = "pred_masks"
+
+        # Prefill with a large negative value (NO_OBJ_SCORE) to represent missing objects.
+        merged_masks = torch.full(
+            size=(num_objs, 1, *out_hw),
+            fill_value=NO_OBJ_SCORE,
+            dtype=torch.float32,
+            device=inference_state["storage_device"],
+        )
+        for obj_idx in range(num_objs):
+            temp_outputs = inference_state["temp_output_dict_per_obj"][obj_idx]
+            saved_outputs = inference_state["output_dict_per_obj"][obj_idx]
+            # Most recent output first, then previously saved outputs.
+            out = temp_outputs[storage_key].get(frame_idx, None)
+            if out is None:
+                out = saved_outputs["cond_frame_outputs"].get(frame_idx, None)
+            if out is None:
+                out = saved_outputs["non_cond_frame_outputs"].get(frame_idx, None)
+            if out is None:
+                # Nothing known about this object on this frame: keep the placeholder.
+                continue
+
+            obj_mask = out["pred_masks"]
+            if obj_mask.shape[-2:] != merged_masks.shape[-2:]:
+                # Resize first if the object mask has a different resolution
+                obj_mask = torch.nn.functional.interpolate(
+                    obj_mask,
+                    size=merged_masks.shape[-2:],
+                    mode="bilinear",
+                    align_corners=False,
+                )
+            merged_masks[obj_idx : obj_idx + 1] = obj_mask
+
+        return {mask_key: merged_masks}
+
+ @torch.inference_mode()
+ def propagate_in_video_preflight(self, inference_state):
+ """
+ Prepare inference_state and consolidate temporary outputs before tracking.
+
+ For every object, the temporary outputs produced by user interactions are moved
+ into `output_dict_per_obj` (running the memory encoder for outputs whose
+ "maskmem_features" is still None) and the temporary storage is cleared.
+
+ Raises:
+ RuntimeError: if no object has been registered, or if an object ends up without any conditioning frame output.
+ """
+ # Check and make sure that every object has received input points or masks.
+ batch_size = self._get_obj_num(inference_state)
+ if batch_size == 0:
+ raise RuntimeError(
+ "No input points or masks are provided for any object; please add inputs first."
+ )
+
+ # Consolidate per-object temporary outputs in "temp_output_dict_per_obj" and
+ # add them into "output_dict_per_obj".
+ for obj_idx in range(batch_size):
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+ for is_cond in [False, True]:
+ # Separately consolidate conditioning and non-conditioning temp outputs
+ storage_key = (
+ "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+ )
+ # Find all the frames that contain temporary outputs for this object
+ # (these should be the frames that have just received clicks or mask inputs
+ # via `add_new_points_or_box` or `add_new_mask`)
+ for frame_idx, out in obj_temp_output_dict[storage_key].items():
+ # Run memory encoder on the temporary outputs (if the memory feature is missing)
+ if out["maskmem_features"] is None:
+ # upsample the stored mask scores to the model's input resolution first
+ high_res_masks = torch.nn.functional.interpolate(
+ out["pred_masks"].to(inference_state["device"]),
+ size=(self.image_size, self.image_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+ maskmem_features, maskmem_pos_enc = self._run_memory_encoder(
+ inference_state=inference_state,
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ high_res_masks=high_res_masks,
+ object_score_logits=out["object_score_logits"],
+ # these frames are what the user interacted with
+ is_mask_from_pts=True,
+ )
+ out["maskmem_features"] = maskmem_features
+ out["maskmem_pos_enc"] = maskmem_pos_enc
+
+ obj_output_dict[storage_key][frame_idx] = out
+ if self.clear_non_cond_mem_around_input:
+ # clear non-conditioning memory of the surrounding frames
+ self._clear_obj_non_cond_mem_around_input(
+ inference_state, frame_idx, obj_idx
+ )
+
+ # clear temporary outputs in `temp_output_dict_per_obj`
+ obj_temp_output_dict[storage_key].clear()
+
+ # check and make sure that every object has received input points or masks
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ if len(obj_output_dict["cond_frame_outputs"]) == 0:
+ obj_id = self._obj_idx_to_id(inference_state, obj_idx)
+ raise RuntimeError(
+ f"No input points or masks are provided for object id {obj_id}; please add inputs first."
+ )
+ # edge case: if an output is added to "cond_frame_outputs", we remove any prior
+ # output on the same frame in "non_cond_frame_outputs"
+ for frame_idx in obj_output_dict["cond_frame_outputs"]:
+ obj_output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
+
+ @torch.inference_mode()
+ def propagate_in_video(
+ self,
+ inference_state,
+ start_frame_idx=None,
+ max_frame_num_to_track=None,
+ reverse=False,
+ ):
+ """
+ Propagate the input points across frames to track in the entire video.
+
+ This is a generator: it runs `propagate_in_video_preflight` and then yields one
+ result per processed frame.
+
+ Arguments:
+ inference_state: the state dict created by `init_state`.
+ start_frame_idx: frame to start from; defaults to the earliest frame that has a conditioning output for any object.
+ max_frame_num_to_track: how far (in frames) to move away from the start frame; defaults to the number of frames in the video. The start frame itself is processed in addition.
+ reverse: if True, frames are processed in decreasing index order; nothing is processed when reverse tracking starts from frame 0.
+
+ Yields:
+ (frame_idx, obj_ids, video_res_masks) for each processed frame, with the masks of all objects resized to the original video resolution.
+ """
+ self.propagate_in_video_preflight(inference_state)
+
+ obj_ids = inference_state["obj_ids"]
+ num_frames = inference_state["num_frames"]
+ batch_size = self._get_obj_num(inference_state)
+
+ # set start index, end index, and processing order
+ if start_frame_idx is None:
+ # default: start from the earliest frame with input points
+ start_frame_idx = min(
+ t
+ for obj_output_dict in inference_state["output_dict_per_obj"].values()
+ for t in obj_output_dict["cond_frame_outputs"]
+ )
+ if max_frame_num_to_track is None:
+ # default: track all the frames in the video
+ max_frame_num_to_track = num_frames
+ if reverse:
+ end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0)
+ if start_frame_idx > 0:
+ processing_order = range(start_frame_idx, end_frame_idx - 1, -1)
+ else:
+ processing_order = [] # skip reverse tracking if starting from frame 0
+ else:
+ end_frame_idx = min(
+ start_frame_idx + max_frame_num_to_track, num_frames - 1
+ )
+ processing_order = range(start_frame_idx, end_frame_idx + 1)
+
+ for frame_idx in tqdm(processing_order, desc="propagate in video"):
+ # one entry per object, filled in the loop below
+ pred_masks_per_obj = [None] * batch_size
+ for obj_idx in range(batch_size):
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ # We skip those frames already in consolidated outputs (these are frames
+ # that received input clicks or mask). Note that we cannot directly run
+ # batched forward on them via `_run_single_frame_inference` because the
+ # number of clicks on each object might be different.
+ if frame_idx in obj_output_dict["cond_frame_outputs"]:
+ storage_key = "cond_frame_outputs"
+ current_out = obj_output_dict[storage_key][frame_idx]
+ device = inference_state["device"]
+ pred_masks = current_out["pred_masks"].to(device, non_blocking=True)
+ if self.clear_non_cond_mem_around_input:
+ # clear non-conditioning memory of the surrounding frames
+ self._clear_obj_non_cond_mem_around_input(
+ inference_state, frame_idx, obj_idx
+ )
+ else:
+ storage_key = "non_cond_frame_outputs"
+ current_out, pred_masks = self._run_single_frame_inference(
+ inference_state=inference_state,
+ output_dict=obj_output_dict,
+ frame_idx=frame_idx,
+ batch_size=1, # run on the slice of a single object
+ is_init_cond_frame=False,
+ point_inputs=None,
+ mask_inputs=None,
+ reverse=reverse,
+ run_mem_encoder=True,
+ )
+ obj_output_dict[storage_key][frame_idx] = current_out
+
+ # remember that (and in which direction) this frame was tracked for this object
+ inference_state["frames_tracked_per_obj"][obj_idx][frame_idx] = {
+ "reverse": reverse
+ }
+ pred_masks_per_obj[obj_idx] = pred_masks
+
+ # Resize the output mask to the original video resolution (we directly use
+ # the mask scores on GPU for output to avoid any CPU conversion in between)
+ if len(pred_masks_per_obj) > 1:
+ all_pred_masks = torch.cat(pred_masks_per_obj, dim=0)
+ else:
+ all_pred_masks = pred_masks_per_obj[0]
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, all_pred_masks
+ )
+ yield frame_idx, obj_ids, video_res_masks
+
+ @torch.inference_mode()
+ def clear_all_prompts_in_frame(
+ self, inference_state, frame_idx, obj_id, need_output=True
+ ):
+ """
+ Remove all input points or mask in a specific frame for a given object.
+
+ Arguments:
+ inference_state: the state dict created by `init_state`.
+ frame_idx: the frame whose prompts are removed.
+ obj_id: client-side object id. NOTE(review): the id is resolved through `_obj_id_to_idx`, which registers ids it has not seen before — confirm this is intended for unknown ids.
+ need_output: if False, return None right after clearing.
+
+ Returns:
+ None when `need_output` is False; otherwise (frame_idx, obj_ids, video_res_masks) with the masks of all objects on this frame at the original video resolution.
+ """
+ obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+
+ # Clear the conditioning information on the given frame
+ inference_state["point_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+ inference_state["mask_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+
+ temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+ temp_output_dict_per_obj[obj_idx]["cond_frame_outputs"].pop(frame_idx, None)
+ temp_output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].pop(frame_idx, None)
+
+ # Remove the frame's conditioning output (possibly downgrading it to non-conditioning)
+ obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+ out = obj_output_dict["cond_frame_outputs"].pop(frame_idx, None)
+ if out is not None:
+ # The frame is not a conditioning frame anymore since it's not receiving inputs,
+ # so we "downgrade" its output (if exists) to a non-conditioning frame output.
+ obj_output_dict["non_cond_frame_outputs"][frame_idx] = out
+ inference_state["frames_tracked_per_obj"][obj_idx].pop(frame_idx, None)
+
+ if not need_output:
+ return
+ # Finally, output updated masks per object (after removing the inputs above)
+ obj_ids = inference_state["obj_ids"]
+ # Read the temporary conditioning outputs if any object still has one on this frame.
+ is_cond = any(
+ frame_idx in obj_temp_output_dict["cond_frame_outputs"]
+ for obj_temp_output_dict in temp_output_dict_per_obj.values()
+ )
+ consolidated_out = self._consolidate_temp_output_across_obj(
+ inference_state,
+ frame_idx,
+ is_cond=is_cond,
+ consolidate_at_video_res=True,
+ )
+ _, video_res_masks = self._get_orig_video_res_output(
+ inference_state, consolidated_out["pred_masks_video_res"]
+ )
+ return frame_idx, obj_ids, video_res_masks
+
+    @torch.inference_mode()
+    def reset_state(self, inference_state):
+        """
+        Remove all input points or masks in all frames throughout the video.
+
+        Tracking inputs and results are reset first; afterwards every registered
+        object id and all per-object containers are dropped from `inference_state`
+        (the containers are emptied in place, not replaced).
+        """
+        self._reset_tracking_results(inference_state)
+        # Remove all object ids together with their per-object bookkeeping.
+        per_object_keys = (
+            "obj_id_to_idx",
+            "obj_idx_to_id",
+            "obj_ids",
+            "point_inputs_per_obj",
+            "mask_inputs_per_obj",
+            "output_dict_per_obj",
+            "temp_output_dict_per_obj",
+            "frames_tracked_per_obj",
+        )
+        for key in per_object_keys:
+            inference_state[key].clear()
+
+    def _reset_tracking_results(self, inference_state):
+        """
+        Reset all tracking inputs and results across the video.
+
+        The registered objects are kept; only their per-frame inputs, outputs,
+        temporary outputs and tracked-frame metadata are emptied in place.
+        """
+        # Per-object prompt inputs.
+        for key in ("point_inputs_per_obj", "mask_inputs_per_obj"):
+            for per_frame_inputs in inference_state[key].values():
+                per_frame_inputs.clear()
+        # Per-object consolidated and temporary outputs.
+        for key in ("output_dict_per_obj", "temp_output_dict_per_obj"):
+            for per_obj_outputs in inference_state[key].values():
+                per_obj_outputs["cond_frame_outputs"].clear()
+                per_obj_outputs["non_cond_frame_outputs"].clear()
+        # Per-object record of which frames were tracked.
+        for tracked_frames in inference_state["frames_tracked_per_obj"].values():
+            tracked_frames.clear()
+
+ def _get_image_feature(self, inference_state, frame_idx, batch_size):
+ """
+ Compute the image features on a given frame.
+
+ The backbone output of the most recently requested frame is cached in
+ `inference_state["cached_features"]`; a cache miss replaces the whole cache
+ with the new frame. The image and the backbone outputs are then expanded along
+ the first dimension to `batch_size`.
+
+ Returns:
+ A tuple starting with the expanded image, followed by the values returned by `self._prepare_backbone_features`.
+ """
+ # Look up in the cache first
+ image, backbone_out = inference_state["cached_features"].get(
+ frame_idx, (None, None)
+ )
+ if backbone_out is None:
+ # Cache miss -- we will run inference on a single image
+ device = inference_state["device"]
+ image = inference_state["images"][frame_idx].to(device).float().unsqueeze(0)
+ backbone_out = self.forward_image(image)
+ # Cache the most recent frame's feature (for repeated interactions with
+ # a frame; we can use an LRU cache for more frames in the future).
+ inference_state["cached_features"] = {frame_idx: (image, backbone_out)}
+
+ # expand the features to have the same dimension as the number of objects
+ expanded_image = image.expand(batch_size, -1, -1, -1)
+ # Copy the two containers so the in-place replacements below do not touch the cached `backbone_out`.
+ expanded_backbone_out = {
+ "backbone_fpn": backbone_out["backbone_fpn"].copy(),
+ "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
+ }
+ for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]):
+ expanded_backbone_out["backbone_fpn"][i] = feat.expand(
+ batch_size, -1, -1, -1
+ )
+ for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]):
+ pos = pos.expand(batch_size, -1, -1, -1)
+ expanded_backbone_out["vision_pos_enc"][i] = pos
+
+ features = self._prepare_backbone_features(expanded_backbone_out)
+ features = (expanded_image,) + features
+ return features
+
+ def _run_single_frame_inference(
+ self,
+ inference_state,
+ output_dict,
+ frame_idx,
+ batch_size,
+ is_init_cond_frame,
+ point_inputs,
+ mask_inputs,
+ reverse,
+ run_mem_encoder,
+ prev_sam_mask_logits=None,
+ ):
+ """
+ Run tracking on a single frame based on current inputs and previous memory.
+
+ Arguments:
+ inference_state: the state dict created by `init_state`.
+ output_dict: the outputs used as memory, passed to `track_step` as-is.
+ frame_idx: the frame to run on.
+ batch_size: number of objects the image features are expanded to.
+ is_init_cond_frame: forwarded to `track_step`.
+ point_inputs: point prompts for this frame, or None.
+ mask_inputs: mask prompt for this frame, or None; must not be given together with `point_inputs`.
+ reverse: forwarded to `track_step` as `track_in_reverse`.
+ run_mem_encoder: forwarded to `track_step`.
+ prev_sam_mask_logits: optional previous mask logits, forwarded to `track_step`.
+
+ Returns:
+ (compact_current_out, pred_masks_gpu): a reduced output dict whose mask tensors were moved to the storage device, and the predicted masks before that move (after optional hole filling).
+ """
+ # Retrieve correct image features
+ (
+ _,
+ _,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ ) = self._get_image_feature(inference_state, frame_idx, batch_size)
+
+ # point and mask should not appear as input simultaneously on the same frame
+ assert point_inputs is None or mask_inputs is None
+ current_out = self.track_step(
+ frame_idx=frame_idx,
+ is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats,
+ current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes,
+ point_inputs=point_inputs,
+ mask_inputs=mask_inputs,
+ output_dict=output_dict,
+ num_frames=inference_state["num_frames"],
+ track_in_reverse=reverse,
+ run_mem_encoder=run_mem_encoder,
+ prev_sam_mask_logits=prev_sam_mask_logits,
+ )
+
+ # optionally offload the output to CPU memory to save GPU space
+ storage_device = inference_state["storage_device"]
+ maskmem_features = current_out["maskmem_features"]
+ if maskmem_features is not None:
+ # memory features are stored in bfloat16
+ maskmem_features = maskmem_features.to(torch.bfloat16)
+ maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
+ pred_masks_gpu = current_out["pred_masks"]
+ # potentially fill holes in the predicted masks
+ if self.fill_hole_area > 0:
+ pred_masks_gpu = fill_holes_in_mask_scores(
+ pred_masks_gpu, self.fill_hole_area
+ )
+ pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True)
+ # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+ maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out)
+ # object pointer is a small tensor, so we always keep it on GPU memory for fast access
+ obj_ptr = current_out["obj_ptr"]
+ object_score_logits = current_out["object_score_logits"]
+ # make a compact version of this frame's output to reduce the state size
+ compact_current_out = {
+ "maskmem_features": maskmem_features,
+ "maskmem_pos_enc": maskmem_pos_enc,
+ "pred_masks": pred_masks,
+ "obj_ptr": obj_ptr,
+ "object_score_logits": object_score_logits,
+ }
+ return compact_current_out, pred_masks_gpu
+
+    def _run_memory_encoder(
+        self,
+        inference_state,
+        frame_idx,
+        batch_size,
+        high_res_masks,
+        object_score_logits,
+        is_mask_from_pts,
+    ):
+        """
+        Re-encode a frame's memory from `high_res_masks`.
+
+        This is typically needed after non-overlapping constraints changed the object
+        scores, which makes any previously encoded memory stale. Returns the new
+        memory features (bfloat16, on the storage device) together with the
+        positional encoding expanded from the session-wide cached copy.
+        """
+        # Only the vision features and their sizes are needed from the 5-tuple.
+        frame_features = self._get_image_feature(inference_state, frame_idx, batch_size)
+        _, _, vision_feats, _, feat_sizes = frame_features
+
+        new_feats, new_pos_enc = self._encode_new_memory(
+            current_vision_feats=vision_feats,
+            feat_sizes=feat_sizes,
+            pred_masks_high_res=high_res_masks,
+            object_score_logits=object_score_logits,
+            is_mask_from_pts=is_mask_from_pts,
+        )
+
+        # Optionally offload the features to the storage device, stored as bfloat16.
+        target_device = inference_state["storage_device"]
+        new_feats = new_feats.to(torch.bfloat16).to(target_device, non_blocking=True)
+
+        # The positional encoding does not change across frames; route it through
+        # the cache so only one copy is kept in the session.
+        shared_pos_enc = self._get_maskmem_pos_enc(
+            inference_state, {"maskmem_pos_enc": new_pos_enc}
+        )
+        return new_feats, shared_pos_enc
+
+    def _get_maskmem_pos_enc(self, inference_state, current_out):
+        """
+        Return `maskmem_pos_enc` for `current_out`, backed by a session-wide cache.
+
+        The encoding is the same across frames and objects, so a single-object slice
+        is stored once in `inference_state["constants"]` and expanded to the batch
+        size of `current_out` on every call. Returns None when `current_out` carries
+        no positional encoding.
+        """
+        constants = inference_state["constants"]
+        # Either a list of tensors or None.
+        pos_enc_out = current_out["maskmem_pos_enc"]
+        if pos_enc_out is None:
+            return None
+
+        if "maskmem_pos_enc" in constants:
+            cached = constants["maskmem_pos_enc"]
+        else:
+            assert isinstance(pos_enc_out, list)
+            # Keep the slice of one object only; it is identical for all objects.
+            cached = [level[0:1].clone() for level in pos_enc_out]
+            constants["maskmem_pos_enc"] = cached
+
+        # Expand the cached single-object copy to the actual batch size.
+        num_objs = pos_enc_out[0].size(0)
+        return [level.expand(num_objs, -1, -1, -1) for level in cached]
+
+    @torch.inference_mode()
+    def remove_object(self, inference_state, obj_id, strict=False, need_output=True):
+        """
+        Drop `obj_id` from the tracking state.
+
+        With `strict=True` a RuntimeError is raised when the id is unknown; otherwise
+        an unknown id is a no-op. Returns `(obj_ids, updated_frames)`: the remaining
+        object ids and a list of `(frame_idx, video_res_masks)` pairs for the frames
+        on which the removed object had inputs (filled only when `need_output` is
+        True and other objects remain).
+        """
+        updated_frames = []
+        idx_to_remove = inference_state["obj_id_to_idx"].get(obj_id, None)
+
+        if idx_to_remove is None:
+            # Unknown object id: only strict mode treats this as an error.
+            if strict:
+                raise RuntimeError(
+                    f"Cannot remove object id {obj_id} as it doesn't exist. "
+                    f"All existing object ids: {inference_state['obj_ids']}."
+                )
+            return inference_state["obj_ids"], updated_frames
+
+        if len(inference_state["obj_id_to_idx"]) == 1:
+            # Removing the last object is the same as resetting the whole state.
+            self.reset_state(inference_state)
+            return inference_state["obj_ids"], updated_frames
+
+        # Step 0: clear this object's prompts on every frame where it has point or
+        # mask input (required, since this may downgrade conditioning frames to
+        # non-conditioning ones). The old id mappings must still be in place here.
+        frames_with_inputs = set()
+        for inputs_key in ("point_inputs_per_obj", "mask_inputs_per_obj"):
+            frames_with_inputs.update(inference_state[inputs_key][idx_to_remove])
+        for frame_idx in frames_with_inputs:
+            self.clear_all_prompts_in_frame(
+                inference_state, frame_idx, obj_id, need_output=False
+            )
+
+        # Step 1: rebuild the id <-> index mappings without the removed object.
+        prev_obj_ids = inference_state["obj_ids"]
+        prev_indices = list(range(len(prev_obj_ids)))
+        kept_prev_indices = prev_indices.copy()
+        kept_prev_indices.remove(idx_to_remove)
+        kept_obj_ids = [prev_obj_ids[prev] for prev in kept_prev_indices]
+        prev_to_new = {prev: new for new, prev in enumerate(kept_prev_indices)}
+        inference_state["obj_id_to_idx"] = {
+            kept_id: new for new, kept_id in enumerate(kept_obj_ids)
+        }
+        inference_state["obj_idx_to_id"] = dict(enumerate(kept_obj_ids))
+        inference_state["obj_ids"] = kept_obj_ids
+
+        # Step 2: shift the object index used as key in every per-object storage.
+        def _reindex(storage):
+            # Detach every old entry first, then re-insert the surviving ones
+            # under their new index.
+            detached = [(prev, storage.pop(prev)) for prev in prev_indices]
+            storage.update(
+                (prev_to_new[prev], value)
+                for prev, value in detached
+                if prev in prev_to_new
+            )
+
+        for per_obj_key in (
+            "point_inputs_per_obj",
+            "mask_inputs_per_obj",
+            "output_dict_per_obj",
+            "temp_output_dict_per_obj",
+            "frames_tracked_per_obj",
+        ):
+            _reindex(inference_state[per_obj_key])
+
+        # Step 3: collect fresh outputs on the frames touched in Step 0; they may
+        # now show masks of objects previously occluded by the removed one.
+        if need_output:
+            temp_outputs = inference_state["temp_output_dict_per_obj"]
+            for frame_idx in frames_with_inputs:
+                is_cond = any(
+                    frame_idx in per_obj_out["cond_frame_outputs"]
+                    for per_obj_out in temp_outputs.values()
+                )
+                merged_out = self._consolidate_temp_output_across_obj(
+                    inference_state,
+                    frame_idx,
+                    is_cond=is_cond,
+                    consolidate_at_video_res=True,
+                )
+                _, video_res_masks = self._get_orig_video_res_output(
+                    inference_state, merged_out["pred_masks_video_res"]
+                )
+                updated_frames.append((frame_idx, video_res_masks))
+
+        return inference_state["obj_ids"], updated_frames
+
+    def _clear_non_cond_mem_around_input(self, inference_state, frame_idx):
+        """
+        Drop the non-conditioning memory of the frames surrounding `frame_idx`.
+
+        After correction clicks, the neighbouring frames' non-conditioning memories
+        may still describe the outdated object appearance and confuse the model, so
+        they are removed for every object in a window of
+        `memory_temporal_stride_for_eval * num_maskmem` frames on both sides of the
+        interacted frame (bounds included).
+        """
+        half_window = self.memory_temporal_stride_for_eval * self.num_maskmem
+        stale_frames = range(frame_idx - half_window, frame_idx + half_window + 1)
+        for obj_idx in range(self._get_obj_num(inference_state)):
+            per_obj_out = inference_state["output_dict_per_obj"][obj_idx]
+            non_cond_outputs = per_obj_out["non_cond_frame_outputs"]
+            for stale_idx in stale_frames:
+                # Frames without stored output are simply ignored.
+                non_cond_outputs.pop(stale_idx, None)
+
+
+class SAM2VideoPredictorVOS(SAM2VideoPredictor):
+ """
+ Optimized for the VOS setting.
+
+ On construction, the `forward` methods of the memory encoder, memory attention,
+ SAM prompt encoder and SAM mask decoder are wrapped with `torch.compile`. The
+ overridden methods below clone tensors around those compiled modules, as their
+ docstrings state, "to enable compilation".
+ """
+
+ def __init__(self, *args, **kwargs):
+ # Build the regular predictor first, then swap in the compiled forwards.
+ super().__init__(*args, **kwargs)
+ self._compile_all_components()
+
+ def _compile_all_components(self):
+ """Replace the `forward` of four sub-modules with `torch.compile`d versions."""
+ print("Compiling all components for VOS setting. First time may be very slow.")
+ self.memory_encoder.forward = torch.compile(
+ self.memory_encoder.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=False,
+ )
+
+ self.memory_attention.forward = torch.compile(
+ self.memory_attention.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=True, # Num. of memories varies
+ )
+
+ self.sam_prompt_encoder.forward = torch.compile(
+ self.sam_prompt_encoder.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=False, # Accuracy regression on True
+ )
+
+ self.sam_mask_decoder.forward = torch.compile(
+ self.sam_mask_decoder.forward,
+ mode="max-autotune",
+ fullgraph=True,
+ dynamic=False, # Accuracy regression on True
+ )
+
+ def forward_image(self, img_batch: torch.Tensor):
+ """
+ Identical to the corresponding method in the parent (SAM2VideoPredictor), but
+ cloning the backbone features and pos encoding to enable compilation.
+
+ Returns the image encoder output dict with its "backbone_fpn" and
+ "vision_pos_enc" entries replaced by clones.
+ """
+ backbone_out = self.image_encoder(img_batch)
+ if self.use_high_res_features_in_sam:
+ # precompute projected level 0 and level 1 features in SAM decoder
+ # to avoid running it again on every SAM click
+ backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(
+ backbone_out["backbone_fpn"][0]
+ )
+ backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(
+ backbone_out["backbone_fpn"][1]
+ )
+ # Clone to help torch.compile
+ # NOTE(review): "vision_pos_enc" is indexed with the "backbone_fpn" range, so
+ # both lists are assumed to have the same length - confirm.
+ for i in range(len(backbone_out["backbone_fpn"])):
+ backbone_out["backbone_fpn"][i] = backbone_out["backbone_fpn"][i].clone()
+ backbone_out["vision_pos_enc"][i] = backbone_out["vision_pos_enc"][
+ i
+ ].clone()
+ return backbone_out
+
+ def _forward_sam_heads(
+ self,
+ backbone_features,
+ point_inputs=None,
+ mask_inputs=None,
+ high_res_features=None,
+ multimask_output=False,
+ ):
+ """
+ Identical to the corresponding method in the parent (SAM2VideoPredictor), but
+ cloning the outputs of prompt_encoder and mask_decoder to enable compilation.
+
+ Returns a 7-tuple: (low_res_multimasks, high_res_multimasks, ious,
+ low_res_masks, high_res_masks, obj_ptr, object_score_logits).
+ """
+ B = backbone_features.size(0)
+ device = backbone_features.device
+ # the backbone features must match the SAM prompt embedding layout
+ assert backbone_features.size(1) == self.sam_prompt_embed_dim
+ assert backbone_features.size(2) == self.sam_image_embedding_size
+ assert backbone_features.size(3) == self.sam_image_embedding_size
+
+ # a) Handle point prompts
+ if point_inputs is not None:
+ sam_point_coords = point_inputs["point_coords"]
+ sam_point_labels = point_inputs["point_labels"]
+ assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
+ else:
+ # If no points are provided, pad with an empty point (with label -1)
+ sam_point_coords = torch.zeros(B, 1, 2, device=device)
+ sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
+
+ # b) Handle mask prompts
+ if mask_inputs is not None:
+ # If mask_inputs is provided, downsize it into low-res mask input if needed
+ # and feed it as a dense mask prompt into the SAM mask encoder
+ assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
+ if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
+ sam_mask_prompt = F.interpolate(
+ mask_inputs.float(),
+ size=self.sam_prompt_encoder.mask_input_size,
+ align_corners=False,
+ mode="bilinear",
+ antialias=True, # use antialias for downsampling
+ )
+ else:
+ sam_mask_prompt = mask_inputs
+ else:
+ # Otherwise, simply feed None (and SAM's prompt encoder will add
+ # a learned `no_mask_embed` to indicate no mask input in this case).
+ sam_mask_prompt = None
+
+ sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
+ points=(sam_point_coords, sam_point_labels),
+ boxes=None,
+ masks=sam_mask_prompt,
+ )
+ # Clone image_pe and the outputs of sam_prompt_encoder
+ # to enable compilation
+ sparse_embeddings = sparse_embeddings.clone()
+ dense_embeddings = dense_embeddings.clone()
+ image_pe = self.sam_prompt_encoder.get_dense_pe().clone()
+ (
+ low_res_multimasks,
+ ious,
+ sam_output_tokens,
+ object_score_logits,
+ ) = self.sam_mask_decoder(
+ image_embeddings=backbone_features,
+ image_pe=image_pe,
+ sparse_prompt_embeddings=sparse_embeddings,
+ dense_prompt_embeddings=dense_embeddings,
+ multimask_output=multimask_output,
+ repeat_image=False, # the image is already batched
+ high_res_features=high_res_features,
+ )
+ # Clone the output of sam_mask_decoder
+ # to enable compilation
+ low_res_multimasks = low_res_multimasks.clone()
+ ious = ious.clone()
+ sam_output_tokens = sam_output_tokens.clone()
+ object_score_logits = object_score_logits.clone()
+
+ # `is_obj_appearing` is only defined when `pred_obj_scores` is set; its later
+ # use below is guarded by the same flag
+ if self.pred_obj_scores:
+ is_obj_appearing = object_score_logits > 0
+
+ # Mask used for spatial memories is always a *hard* choice between obj and no obj,
+ # consistent with the actual mask prediction
+ low_res_multimasks = torch.where(
+ is_obj_appearing[:, None, None],
+ low_res_multimasks,
+ NO_OBJ_SCORE,
+ )
+
+ # convert masks from possibly bfloat16 (or float16) to float32
+ # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
+ low_res_multimasks = low_res_multimasks.float()
+ high_res_multimasks = F.interpolate(
+ low_res_multimasks,
+ size=(self.image_size, self.image_size),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ # default to the first output token; replaced below when several are available
+ sam_output_token = sam_output_tokens[:, 0]
+ if multimask_output:
+ # take the best mask prediction (with the highest IoU estimation)
+ best_iou_inds = torch.argmax(ious, dim=-1)
+ batch_inds = torch.arange(B, device=device)
+ low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+ if sam_output_tokens.size(1) > 1:
+ sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
+ else:
+ low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
+
+ # Extract object pointer from the SAM output token (with occlusion handling)
+ obj_ptr = self.obj_ptr_proj(sam_output_token)
+ if self.pred_obj_scores:
+ # Allow *soft* no obj ptr, unlike for masks
+ if self.soft_no_obj_ptr:
+ lambda_is_obj_appearing = object_score_logits.sigmoid()
+ else:
+ lambda_is_obj_appearing = is_obj_appearing.float()
+
+ # the pointer itself is only scaled down when `fixed_no_obj_ptr` is set;
+ # the `no_obj_ptr` term is added in both cases
+ if self.fixed_no_obj_ptr:
+ obj_ptr = lambda_is_obj_appearing * obj_ptr
+ obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+
+ return (
+ low_res_multimasks,
+ high_res_multimasks,
+ ious,
+ low_res_masks,
+ high_res_masks,
+ obj_ptr,
+ object_score_logits,
+ )
+
+ def _encode_new_memory(
+ self,
+ current_vision_feats,
+ feat_sizes,
+ pred_masks_high_res,
+ object_score_logits,
+ is_mask_from_pts,
+ ):
+ """
+ Identical to the corresponding method in the parent (SAM2VideoPredictor), but
+ cloning the memories and their pos enc to enable compilation.
+
+ Returns `(maskmem_features, maskmem_pos_enc)`, where `maskmem_pos_enc` is a
+ list of tensors.
+ """
+ B = current_vision_feats[-1].size(1) # batch size on this frame
+ C = self.hidden_dim
+ H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
+ # top-level feature, (HW)BC => BCHW
+ pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+ if self.non_overlap_masks_for_mem_enc and not self.training:
+ # optionally, apply non-overlapping constraints to the masks (it's applied
+ # in the batch dimension and should only be used during eval, where all
+ # the objects come from the same video under batch size 1).
+ pred_masks_high_res = self._apply_non_overlapping_constraints(
+ pred_masks_high_res
+ )
+ # scale the raw mask logits with a temperature before applying sigmoid
+ binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
+ if binarize and not self.training:
+ mask_for_mem = (pred_masks_high_res > 0).float()
+ else:
+ # apply sigmoid on the raw mask logits to turn them into range (0, 1)
+ mask_for_mem = torch.sigmoid(pred_masks_high_res)
+ # apply scale and bias terms to the sigmoid probabilities
+ if self.sigmoid_scale_for_mem_enc != 1.0:
+ mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
+ if self.sigmoid_bias_for_mem_enc != 0.0:
+ mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
+ maskmem_out = self.memory_encoder(
+ pix_feat, mask_for_mem, skip_mask_sigmoid=True # sigmoid already applied
+ )
+ # Clone the feats and pos_enc to enable compilation
+ maskmem_features = maskmem_out["vision_features"].clone()
+ maskmem_pos_enc = [m.clone() for m in maskmem_out["vision_pos_enc"]]
+ # add a no-object embedding to the spatial memory to indicate that the frame
+ # is predicted to be occluded (i.e. no object is appearing in the frame)
+ if self.no_obj_embed_spatial is not None:
+ is_obj_appearing = (object_score_logits > 0).float()
+ # in-place update; it acts on the clone made above, not on the encoder output
+ maskmem_features += (
+ 1 - is_obj_appearing[..., None, None]
+ ) * self.no_obj_embed_spatial[..., None, None].expand(
+ *maskmem_features.shape
+ )
+
+ return maskmem_features, maskmem_pos_enc
diff --git a/sam2_repo/sam2/sam2_video_predictor_legacy.py b/sam2_repo/sam2/sam2_video_predictor_legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7e01ccf972491904b013526333826b337354db1
--- /dev/null
+++ b/sam2_repo/sam2/sam2_video_predictor_legacy.py
@@ -0,0 +1,1172 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+from collections import OrderedDict
+
+import torch
+
+from tqdm import tqdm
+
+from sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base
+from sam2.utils.misc import concat_points, fill_holes_in_mask_scores, load_video_frames
+
+
+class SAM2VideoPredictor(SAM2Base):
+ """The predictor class to handle user interactions and manage inference states."""
+
+    def __init__(
+        self,
+        fill_hole_area=0,
+        non_overlap_masks=False,
+        clear_non_cond_mem_around_input=False,
+        clear_non_cond_mem_for_multi_obj=False,
+        add_all_frames_to_correct_as_cond=False,
+        **kwargs,
+    ):
+        """
+        Store the interaction-related options; all other `kwargs` are forwarded to
+        the parent class constructor.
+
+        Arguments:
+            fill_hole_area: stored as `self.fill_hole_area`.
+            non_overlap_masks: whether to apply non-overlapping constraints on the
+                output object masks.
+            clear_non_cond_mem_around_input: whether to clear the non-conditioning
+                memory of the surrounding frames (which may contain outdated
+                information) after adding correction clicks. This only applies to
+                *single-object tracking* unless `clear_non_cond_mem_for_multi_obj`
+                is also set to True.
+            clear_non_cond_mem_for_multi_obj: whether to also clear that memory when
+                tracking several objects (only effective when
+                `clear_non_cond_mem_around_input` is True).
+            add_all_frames_to_correct_as_cond: if True, any frame that receives a
+                later correction click is also appended to the conditioning frame
+                list; if False, the conditioning frame list is restricted to the
+                initial conditioning frames.
+        """
+        super().__init__(**kwargs)
+        # Mask post-processing options.
+        self.fill_hole_area = fill_hole_area
+        self.non_overlap_masks = non_overlap_masks
+        # Options controlling how correction clicks interact with the memory.
+        self.clear_non_cond_mem_around_input = clear_non_cond_mem_around_input
+        self.clear_non_cond_mem_for_multi_obj = clear_non_cond_mem_for_multi_obj
+        self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
+
+    @torch.inference_mode()
+    def init_state(
+        self,
+        video_path,
+        offload_video_to_cpu=False,
+        offload_state_to_cpu=False,
+        async_loading_frames=False,
+    ):
+        """
+        Load the video at `video_path` and build a fresh inference state for it.
+
+        Arguments:
+            video_path: passed to `load_video_frames`.
+            offload_video_to_cpu: keep the video frames in CPU memory; saves GPU
+                memory with only a very small overhead.
+            offload_state_to_cpu: keep the inference state in CPU memory; saves GPU
+                memory at the cost of a lower tracking fps.
+            async_loading_frames: passed to `load_video_frames`.
+
+        Returns:
+            The inference state dict, with the image feature of frame 0 already
+            cached.
+        """
+        model_device = self.device  # device of the model
+        frames, orig_height, orig_width = load_video_frames(
+            video_path=video_path,
+            image_size=self.image_size,
+            offload_video_to_cpu=offload_video_to_cpu,
+            async_loading_frames=async_loading_frames,
+            compute_device=model_device,
+        )
+        state_device = torch.device("cpu") if offload_state_to_cpu else model_device
+
+        inference_state = {
+            "images": frames,
+            "num_frames": len(frames),
+            # offloading switches (see the docstring for their trade-offs)
+            "offload_video_to_cpu": offload_video_to_cpu,
+            "offload_state_to_cpu": offload_state_to_cpu,
+            # original video size, used for resizing the final output scores
+            "video_height": orig_height,
+            "video_width": orig_width,
+            "device": model_device,
+            "storage_device": state_device,
+            # inputs on each frame
+            "point_inputs_per_obj": {},
+            "mask_inputs_per_obj": {},
+            # visual features of a few recently visited frames, for quick interactions
+            "cached_features": {},
+            # values that don't change across frames (a single copy is enough)
+            "constants": {},
+            # mapping between client-side object id and model-side object index
+            "obj_id_to_idx": OrderedDict(),
+            "obj_idx_to_id": OrderedDict(),
+            "obj_ids": [],
+            # the model's tracking results and states on each frame
+            "output_dict": {
+                "cond_frame_outputs": {},  # dict containing {frame_idx: }
+                "non_cond_frame_outputs": {},  # dict containing {frame_idx: }
+            },
+            # per-object slices (views) sharing memory with "output_dict"
+            "output_dict_per_obj": {},
+            # temporary storage for new outputs while the user interacts with a
+            # frame (merged into "output_dict" before propagation starts)
+            "temp_output_dict_per_obj": {},
+            # frames that already hold consolidated outputs from click or mask
+            # inputs (used directly during tracking)
+            "consolidated_frame_inds": {
+                "cond_frame_outputs": set(),  # set containing frame indices
+                "non_cond_frame_outputs": set(),  # set containing frame indices
+            },
+            # metadata for each tracked frame (e.g. the direction it was tracked in)
+            "tracking_has_started": False,
+            "frames_already_tracked": {},
+        }
+        # Warm up the visual backbone and cache the image feature on frame 0.
+        self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
+        return inference_state
+
+    @classmethod
+    def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2VideoPredictor":
+        """
+        Build a predictor from a pretrained checkpoint on the Hugging Face hub.
+
+        Arguments:
+            model_id (str): The Hugging Face repository ID.
+            **kwargs: Extra arguments handed to `build_sam2_video_predictor_hf`.
+
+        Returns:
+            (SAM2VideoPredictor): The loaded model.
+        """
+        # Imported at call time rather than at module import.
+        from sam2.build_sam import build_sam2_video_predictor_hf
+
+        return build_sam2_video_predictor_hf(model_id, **kwargs)
+
+    def _obj_id_to_idx(self, inference_state, obj_id):
+        """
+        Translate a client-side object id into the model-side object index.
+
+        An unseen id is registered (with empty input and output slots) as long as
+        tracking has not started; afterwards a RuntimeError is raised.
+        """
+        id_to_idx = inference_state["obj_id_to_idx"]
+        known_idx = id_to_idx.get(obj_id, None)
+        if known_idx is not None:
+            return known_idx
+
+        # New objects may only be added *before* the tracking starts.
+        if inference_state["tracking_has_started"]:
+            raise RuntimeError(
+                f"Cannot add new object id {obj_id} after tracking starts. "
+                f"All existing object ids: {inference_state['obj_ids']}. "
+                f"Please call 'reset_state' to restart from scratch."
+            )
+
+        # Take the next free object slot and record it in both mappings.
+        new_idx = len(id_to_idx)
+        id_to_idx[obj_id] = new_idx
+        inference_state["obj_idx_to_id"][new_idx] = obj_id
+        inference_state["obj_ids"] = list(id_to_idx)
+
+        # Set up empty input and output structures for this object.
+        for inputs_key in ("point_inputs_per_obj", "mask_inputs_per_obj"):
+            inference_state[inputs_key][new_idx] = {}
+        for outputs_key in ("output_dict_per_obj", "temp_output_dict_per_obj"):
+            inference_state[outputs_key][new_idx] = {
+                "cond_frame_outputs": {},  # dict containing {frame_idx: }
+                "non_cond_frame_outputs": {},  # dict containing {frame_idx: }
+            }
+        return new_idx
+
+    def _obj_idx_to_id(self, inference_state, obj_idx):
+        """Translate a model-side object index back into the client-side object id."""
+        idx_to_id = inference_state["obj_idx_to_id"]
+        return idx_to_id[obj_idx]
+
+    def _get_obj_num(self, inference_state):
+        """Return how many unique object ids this session has received so far."""
+        registered = inference_state["obj_idx_to_id"]
+        return len(registered)
+
+    @torch.inference_mode()
+    def add_new_points_or_box(
+        self,
+        inference_state,
+        frame_idx,
+        obj_id,
+        points=None,
+        labels=None,
+        clear_old_points=True,
+        normalize_coords=True,
+        box=None,
+    ):
+        """
+        Add point and/or box prompts for one object on one frame and re-predict
+        that object's mask on the frame.
+
+        Arguments:
+            inference_state: the session state dict.
+            frame_idx: index of the frame that receives the prompts.
+            obj_id: client-side object id; an unseen id is registered if tracking
+                has not started yet.
+            points: point coordinates, a tensor or array-like; a 2-D input gets a
+                leading batch dimension added.
+            labels: point labels, a tensor or array-like; a 1-D input gets a leading
+                batch dimension added. Must be given together with `points`.
+            clear_old_points: when False, the new points are appended to the points
+                already stored for this frame; when True they replace them.
+            normalize_coords: when True, coordinates are divided by the original
+                video (width, height). In both cases they are then multiplied by
+                the model's image size.
+            box: optional box, reshaped to two points that are prepended to
+                `points` with labels 2 and 3. Requires `clear_old_points=True`.
+
+        Returns:
+            `(frame_idx, obj_ids, video_res_masks)`.
+
+        Raises:
+            ValueError: if only one of `points`/`labels` is given, if neither
+                `points` nor `box` is given, or if `box` is combined with
+                `clear_old_points=False`.
+        """
+        # Validate the arguments before touching `inference_state`:
+        # `_obj_id_to_idx` registers unseen object ids, so validating afterwards
+        # would leave a prompt-less object behind whenever the call is rejected.
+        if (points is not None) != (labels is not None):
+            raise ValueError("points and labels must be provided together")
+        if points is None and box is None:
+            raise ValueError("at least one of points or box must be provided as input")
+        if box is not None and not clear_old_points:
+            raise ValueError(
+                "cannot add box without clearing old points, since "
+                "box prompt must be provided before any point prompt "
+                "(please use clear_old_points=True instead)"
+            )
+
+        obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+        point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+        mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+        # Convert the prompts to tensors; missing points/labels become empty tensors.
+        if points is None:
+            points = torch.zeros(0, 2, dtype=torch.float32)
+        elif not isinstance(points, torch.Tensor):
+            points = torch.tensor(points, dtype=torch.float32)
+        if labels is None:
+            labels = torch.zeros(0, dtype=torch.int32)
+        elif not isinstance(labels, torch.Tensor):
+            labels = torch.tensor(labels, dtype=torch.int32)
+        if points.dim() == 2:
+            points = points.unsqueeze(0)  # add batch dimension
+        if labels.dim() == 1:
+            labels = labels.unsqueeze(0)  # add batch dimension
+
+        # If `box` is provided, we add it as the first two points with labels 2 and 3
+        # along with the user-provided points (consistent with how SAM 2 is trained).
+        if box is not None:
+            if inference_state["tracking_has_started"]:
+                warnings.warn(
+                    "You are adding a box after tracking starts. SAM 2 may not always be "
+                    "able to incorporate a box prompt for *refinement*. If you intend to "
+                    "use box prompt as an *initial* input before tracking, please call "
+                    "'reset_state' on the inference state to restart from scratch.",
+                    category=UserWarning,
+                    stacklevel=2,
+                )
+            if not isinstance(box, torch.Tensor):
+                box = torch.tensor(box, dtype=torch.float32, device=points.device)
+            box_coords = box.reshape(1, 2, 2)
+            box_labels = torch.tensor([2, 3], dtype=torch.int32, device=labels.device)
+            box_labels = box_labels.reshape(1, 2)
+            points = torch.cat([box_coords, points], dim=1)
+            labels = torch.cat([box_labels, labels], dim=1)
+
+        if normalize_coords:
+            video_H = inference_state["video_height"]
+            video_W = inference_state["video_width"]
+            points = points / torch.tensor([video_W, video_H]).to(points.device)
+        # scale the (normalized) coordinates by the model's internal image size
+        points = points * self.image_size
+        points = points.to(inference_state["device"])
+        labels = labels.to(inference_state["device"])
+
+        if not clear_old_points:
+            point_inputs = point_inputs_per_frame.get(frame_idx, None)
+        else:
+            point_inputs = None
+        point_inputs = concat_points(point_inputs, points, labels)
+
+        # Point prompts replace any mask prompt stored for this frame.
+        point_inputs_per_frame[frame_idx] = point_inputs
+        mask_inputs_per_frame.pop(frame_idx, None)
+        # If this frame hasn't been tracked before, we treat it as an initial conditioning
+        # frame, meaning that the inputs points are to generate segments on this frame without
+        # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+        # the input points will be used to correct the already tracked masks.
+        is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"]
+        # whether to track in reverse time order
+        if is_init_cond_frame:
+            reverse = False
+        else:
+            reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"]
+        obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+        obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+        # Add a frame to conditioning output if it's an initial conditioning frame or
+        # if the model sees all frames receiving clicks/mask as conditioning frames.
+        is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+        storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+        # Get any previously predicted mask logits on this object and feed it along with
+        # the new clicks into the SAM mask decoder.
+        prev_sam_mask_logits = None
+        # lookup temporary output dict first, which contains the most recent output
+        # (if not found, then lookup conditioning and non-conditioning frame output)
+        prev_out = obj_temp_output_dict[storage_key].get(frame_idx)
+        if prev_out is None:
+            prev_out = obj_output_dict["cond_frame_outputs"].get(frame_idx)
+            if prev_out is None:
+                prev_out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx)
+
+        if prev_out is not None and prev_out["pred_masks"] is not None:
+            device = inference_state["device"]
+            prev_sam_mask_logits = prev_out["pred_masks"].to(device, non_blocking=True)
+            # Clamp the scale of prev_sam_mask_logits to avoid rare numerical issues.
+            prev_sam_mask_logits = torch.clamp(prev_sam_mask_logits, -32.0, 32.0)
+        current_out, _ = self._run_single_frame_inference(
+            inference_state=inference_state,
+            output_dict=obj_output_dict,  # run on the slice of a single object
+            frame_idx=frame_idx,
+            batch_size=1,  # run on the slice of a single object
+            is_init_cond_frame=is_init_cond_frame,
+            point_inputs=point_inputs,
+            mask_inputs=None,
+            reverse=reverse,
+            # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+            # at the beginning of `propagate_in_video` (after user finalize their clicks). This
+            # allows us to enforce non-overlapping constraints on all objects before encoding
+            # them into memory.
+            run_mem_encoder=False,
+            prev_sam_mask_logits=prev_sam_mask_logits,
+        )
+        # Add the output to the output dict (to be used as future memory)
+        obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+        # Resize the output mask to the original video resolution
+        obj_ids = inference_state["obj_ids"]
+        consolidated_out = self._consolidate_temp_output_across_obj(
+            inference_state,
+            frame_idx,
+            is_cond=is_cond,
+            run_mem_encoder=False,
+            consolidate_at_video_res=True,
+        )
+        _, video_res_masks = self._get_orig_video_res_output(
+            inference_state, consolidated_out["pred_masks_video_res"]
+        )
+        return frame_idx, obj_ids, video_res_masks
+
+    def add_new_points(self, *args, **kwargs):
+        """
+        Deprecated alias of `add_new_points_or_box`.
+
+        Emits a `DeprecationWarning` pointing at the replacement, then forwards all
+        positional and keyword arguments unchanged and returns its result.
+        """
+        # stacklevel=2 attributes the warning to the caller of this alias.
+        warnings.warn(
+            "'add_new_points' is deprecated; please use 'add_new_points_or_box' instead.",
+            category=DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.add_new_points_or_box(*args, **kwargs)
+
+    @torch.inference_mode()
+    def add_new_mask(
+        self,
+        inference_state,
+        frame_idx,
+        obj_id,
+        mask,
+    ):
+        """
+        Add a mask prompt for one object on one frame and return the updated masks.
+
+        The mask replaces any point inputs previously stored for this object on
+        `frame_idx`. Inference is run for this single object without the memory
+        encoder, the result is stored in the object's temporary output dict, and the
+        outputs of all objects on this frame are consolidated at video resolution.
+
+        Args:
+            inference_state: the per-video inference state dict.
+            frame_idx: index of the frame the mask belongs to.
+            obj_id: object id, resolved to an index via `self._obj_id_to_idx`.
+            mask: a 2-D mask; anything that is not a `torch.Tensor` is converted
+                with `torch.tensor(mask, dtype=torch.bool)`.
+
+        Returns:
+            A tuple `(frame_idx, obj_ids, video_res_masks)` where `obj_ids` is
+            `inference_state["obj_ids"]` and `video_res_masks` are the consolidated
+            mask scores returned by `_get_orig_video_res_output`.
+        """
+        obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+        point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+        mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+        # Normalize the input into a float tensor of shape (1, 1, H, W) on the
+        # inference device.
+        if not isinstance(mask, torch.Tensor):
+            mask = torch.tensor(mask, dtype=torch.bool)
+        assert mask.dim() == 2
+        mask_H, mask_W = mask.shape
+        mask_inputs_orig = mask[None, None]  # add batch and channel dimension
+        mask_inputs_orig = mask_inputs_orig.float().to(inference_state["device"])
+
+        # resize the mask if it doesn't match the model's image size
+        if mask_H != self.image_size or mask_W != self.image_size:
+            mask_inputs = torch.nn.functional.interpolate(
+                mask_inputs_orig,
+                size=(self.image_size, self.image_size),
+                align_corners=False,
+                mode="bilinear",
+                antialias=True,  # use antialias for downsampling
+            )
+            # re-binarize after the bilinear resize (values are 0.0 or 1.0 again)
+            mask_inputs = (mask_inputs >= 0.5).float()
+        else:
+            mask_inputs = mask_inputs_orig
+
+        # A frame holds either a mask input or point inputs for an object, never
+        # both: storing the mask drops any points recorded on the same frame.
+        mask_inputs_per_frame[frame_idx] = mask_inputs
+        point_inputs_per_frame.pop(frame_idx, None)
+        # If this frame hasn't been tracked before, we treat it as an initial conditioning
+        # frame, meaning that the input mask is used to generate segments on this frame
+        # without using any memory from other frames, like in SAM. Otherwise (if it has
+        # been tracked), the input mask is used to correct the already tracked masks.
+        is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"]
+        # whether to track in reverse time order
+        if is_init_cond_frame:
+            reverse = False
+        else:
+            reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"]
+        obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+        obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+        # Add a frame to conditioning output if it's an initial conditioning frame or
+        # if the model sees all frames receiving clicks/mask as conditioning frames.
+        is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
+        storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+
+        current_out, _ = self._run_single_frame_inference(
+            inference_state=inference_state,
+            output_dict=obj_output_dict,  # run on the slice of a single object
+            frame_idx=frame_idx,
+            batch_size=1,  # run on the slice of a single object
+            is_init_cond_frame=is_init_cond_frame,
+            point_inputs=None,
+            mask_inputs=mask_inputs,
+            reverse=reverse,
+            # Skip the memory encoder when adding clicks or mask. We execute the memory encoder
+            # at the beginning of `propagate_in_video` (after user finalize their clicks). This
+            # allows us to enforce non-overlapping constraints on all objects before encoding
+            # them into memory.
+            run_mem_encoder=False,
+        )
+        # Add the output to the output dict (to be used as future memory)
+        obj_temp_output_dict[storage_key][frame_idx] = current_out
+
+        # Resize the output mask to the original video resolution
+        obj_ids = inference_state["obj_ids"]
+        consolidated_out = self._consolidate_temp_output_across_obj(
+            inference_state,
+            frame_idx,
+            is_cond=is_cond,
+            run_mem_encoder=False,
+            consolidate_at_video_res=True,
+        )
+        _, video_res_masks = self._get_orig_video_res_output(
+            inference_state, consolidated_out["pred_masks_video_res"]
+        )
+        return frame_idx, obj_ids, video_res_masks
+
+ def _get_orig_video_res_output(self, inference_state, any_res_masks):
+ """
+ Resize the object scores to the original video resolution (video_res_masks)
+ and apply non-overlapping constraints for final output.
+ """
+ device = inference_state["device"]
+ video_H = inference_state["video_height"]
+ video_W = inference_state["video_width"]
+ any_res_masks = any_res_masks.to(device, non_blocking=True)
+ if any_res_masks.shape[-2:] == (video_H, video_W):
+ video_res_masks = any_res_masks
+ else:
+ video_res_masks = torch.nn.functional.interpolate(
+ any_res_masks,
+ size=(video_H, video_W),
+ mode="bilinear",
+ align_corners=False,
+ )
+ if self.non_overlap_masks:
+ video_res_masks = self._apply_non_overlapping_constraints(video_res_masks)
+ return any_res_masks, video_res_masks
+
+    def _consolidate_temp_output_across_obj(
+        self,
+        inference_state,
+        frame_idx,
+        is_cond,
+        run_mem_encoder,
+        consolidate_at_video_res=False,
+    ):
+        """
+        Consolidate the per-object temporary outputs in `temp_output_dict_per_obj` on
+        a frame into a single output for all objects, including
+        1) fill any missing objects either from `output_dict_per_obj` (if they exist in
+           `output_dict_per_obj` for this frame) or leave them as placeholder values
+           (if they don't exist in `output_dict_per_obj` for this frame);
+        2) if specified, rerun memory encoder after apply non-overlapping constraints
+           on the object scores.
+
+        Args:
+            inference_state: the per-video inference state dict.
+            frame_idx: index of the frame to consolidate.
+            is_cond: selects which temporary storage to read first
+                ("cond_frame_outputs" if True, else "non_cond_frame_outputs").
+            run_mem_encoder: if True, re-encode memory from the consolidated masks
+                and store it in the result; incompatible with
+                `consolidate_at_video_res=True`.
+            consolidate_at_video_res: if True, masks are consolidated at
+                (video_height, video_width) under the key "pred_masks_video_res";
+                otherwise at `self.image_size // 4` under the key "pred_masks".
+
+        Returns:
+            A dict with keys "maskmem_features", "maskmem_pos_enc", the consolidated
+            mask key described above, "obj_ptr" and "object_score_logits".
+        """
+        batch_size = self._get_obj_num(inference_state)
+        storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+        # Optionally, we allow consolidating the temporary outputs at the original
+        # video resolution (to provide a better editing experience for mask prompts).
+        if consolidate_at_video_res:
+            assert not run_mem_encoder, "memory encoder cannot run at video resolution"
+            consolidated_H = inference_state["video_height"]
+            consolidated_W = inference_state["video_width"]
+            consolidated_mask_key = "pred_masks_video_res"
+        else:
+            consolidated_H = consolidated_W = self.image_size // 4
+            consolidated_mask_key = "pred_masks"
+
+        # Initialize `consolidated_out`. Its "maskmem_features" and "maskmem_pos_enc"
+        # will be added when rerunning the memory encoder after applying non-overlapping
+        # constraints to object scores. Its "pred_masks" are prefilled with a large
+        # negative value (NO_OBJ_SCORE) to represent missing objects.
+        # Note: the masks live on the storage device, while the object pointers and
+        # object score logits are allocated on the inference device.
+        consolidated_out = {
+            "maskmem_features": None,
+            "maskmem_pos_enc": None,
+            consolidated_mask_key: torch.full(
+                size=(batch_size, 1, consolidated_H, consolidated_W),
+                fill_value=NO_OBJ_SCORE,
+                dtype=torch.float32,
+                device=inference_state["storage_device"],
+            ),
+            "obj_ptr": torch.full(
+                size=(batch_size, self.hidden_dim),
+                fill_value=NO_OBJ_SCORE,
+                dtype=torch.float32,
+                device=inference_state["device"],
+            ),
+            "object_score_logits": torch.full(
+                size=(batch_size, 1),
+                # default to 10.0 for object_score_logits, i.e. assuming the object is
+                # present as sigmoid(10)=1, same as in `predict_masks` of `MaskDecoder`
+                fill_value=10.0,
+                dtype=torch.float32,
+                device=inference_state["device"],
+            ),
+        }
+        # Lazily computed dummy pointer, shared by all objects missing on this frame.
+        empty_mask_ptr = None
+        for obj_idx in range(batch_size):
+            obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
+            obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
+            out = obj_temp_output_dict[storage_key].get(frame_idx, None)
+            # If the object doesn't appear in "temp_output_dict_per_obj" on this frame,
+            # we fall back and look up its previous output in "output_dict_per_obj".
+            # We look up both "cond_frame_outputs" and "non_cond_frame_outputs" in
+            # "output_dict_per_obj" to find a previous output for this object.
+            if out is None:
+                out = obj_output_dict["cond_frame_outputs"].get(frame_idx, None)
+            if out is None:
+                out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx, None)
+            # If the object doesn't appear in "output_dict_per_obj" either, we skip it
+            # and leave its mask scores to the default scores (i.e. the NO_OBJ_SCORE
+            # placeholder above) and set its object pointer to be a dummy pointer.
+            if out is None:
+                # Fill in dummy object pointers for those objects without any inputs or
+                # tracking outcomes on this frame (only do it under `run_mem_encoder=True`,
+                # i.e. when we need to build the memory for tracking).
+                if run_mem_encoder:
+                    if empty_mask_ptr is None:
+                        empty_mask_ptr = self._get_empty_mask_ptr(
+                            inference_state, frame_idx
+                        )
+                    # fill object pointer with a dummy pointer (based on an empty mask)
+                    consolidated_out["obj_ptr"][obj_idx : obj_idx + 1] = empty_mask_ptr
+                continue
+            # Add the temporary object output mask to consolidated output mask
+            obj_mask = out["pred_masks"]
+            consolidated_pred_masks = consolidated_out[consolidated_mask_key]
+            if obj_mask.shape[-2:] == consolidated_pred_masks.shape[-2:]:
+                consolidated_pred_masks[obj_idx : obj_idx + 1] = obj_mask
+            else:
+                # Resize first if temporary object mask has a different resolution
+                resized_obj_mask = torch.nn.functional.interpolate(
+                    obj_mask,
+                    size=consolidated_pred_masks.shape[-2:],
+                    mode="bilinear",
+                    align_corners=False,
+                )
+                consolidated_pred_masks[obj_idx : obj_idx + 1] = resized_obj_mask
+            consolidated_out["obj_ptr"][obj_idx : obj_idx + 1] = out["obj_ptr"]
+            consolidated_out["object_score_logits"][obj_idx : obj_idx + 1] = out[
+                "object_score_logits"
+            ]
+
+        # Optionally, apply non-overlapping constraints on the consolidated scores
+        # and rerun the memory encoder
+        if run_mem_encoder:
+            device = inference_state["device"]
+            # The "pred_masks" key is guaranteed here: `run_mem_encoder=True` is
+            # rejected above when consolidating at video resolution.
+            high_res_masks = torch.nn.functional.interpolate(
+                consolidated_out["pred_masks"].to(device, non_blocking=True),
+                size=(self.image_size, self.image_size),
+                mode="bilinear",
+                align_corners=False,
+            )
+            if self.non_overlap_masks_for_mem_enc:
+                high_res_masks = self._apply_non_overlapping_constraints(high_res_masks)
+            maskmem_features, maskmem_pos_enc = self._run_memory_encoder(
+                inference_state=inference_state,
+                frame_idx=frame_idx,
+                batch_size=batch_size,
+                high_res_masks=high_res_masks,
+                object_score_logits=consolidated_out["object_score_logits"],
+                is_mask_from_pts=True,  # these frames are what the user interacted with
+            )
+            consolidated_out["maskmem_features"] = maskmem_features
+            consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
+
+        return consolidated_out
+
+ def _get_empty_mask_ptr(self, inference_state, frame_idx):
+ """Get a dummy object pointer based on an empty mask on the current frame."""
+ # A dummy (empty) mask with a single object
+ batch_size = 1
+ mask_inputs = torch.zeros(
+ (batch_size, 1, self.image_size, self.image_size),
+ dtype=torch.float32,
+ device=inference_state["device"],
+ )
+
+ # Retrieve correct image features
+ (
+ _,
+ _,
+ current_vision_feats,
+ current_vision_pos_embeds,
+ feat_sizes,
+ ) = self._get_image_feature(inference_state, frame_idx, batch_size)
+
+ # Feed the empty mask and image feature above to get a dummy object pointer
+ current_out = self.track_step(
+ frame_idx=frame_idx,
+ is_init_cond_frame=True,
+ current_vision_feats=current_vision_feats,
+ current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes,
+ point_inputs=None,
+ mask_inputs=mask_inputs,
+ output_dict={},
+ num_frames=inference_state["num_frames"],
+ track_in_reverse=False,
+ run_mem_encoder=False,
+ prev_sam_mask_logits=None,
+ )
+ return current_out["obj_ptr"]
+
+    @torch.inference_mode()
+    def propagate_in_video_preflight(self, inference_state):
+        """
+        Prepare inference_state and consolidate temporary outputs before tracking.
+
+        Marks tracking as started, merges every per-object temporary output into
+        `inference_state["output_dict"]` (re-running the memory encoder on the
+        consolidated masks), refreshes the per-object output slices, clears the
+        temporary outputs, and finally checks that the consolidated frame indices
+        match the frames that currently hold point or mask inputs.
+
+        Raises:
+            AssertionError: if a consolidated conditioning frame is missing from
+                `output_dict`, or if the consolidated frames differ from the frames
+                with inputs.
+        """
+        # Tracking has started and we don't allow adding new objects until session is reset.
+        inference_state["tracking_has_started"] = True
+        batch_size = self._get_obj_num(inference_state)
+
+        # Consolidate per-object temporary outputs in "temp_output_dict_per_obj" and
+        # add them into "output_dict".
+        temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+        output_dict = inference_state["output_dict"]
+        # "consolidated_frame_inds" contains indices of those frames where consolidated
+        # temporary outputs have been added (either in this call or any previous calls
+        # to `propagate_in_video_preflight`).
+        consolidated_frame_inds = inference_state["consolidated_frame_inds"]
+        for is_cond in [False, True]:
+            # Separately consolidate conditioning and non-conditioning temp outputs
+            storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+            # Find all the frames that contain temporary outputs for any objects
+            # (these should be the frames that have just received clicks or mask inputs
+            # via `add_new_points_or_box` or `add_new_mask`)
+            temp_frame_inds = set()
+            for obj_temp_output_dict in temp_output_dict_per_obj.values():
+                temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
+            consolidated_frame_inds[storage_key].update(temp_frame_inds)
+            # consolidate the temporary output across all objects on this frame
+            for frame_idx in temp_frame_inds:
+                consolidated_out = self._consolidate_temp_output_across_obj(
+                    inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True
+                )
+                # merge them into "output_dict" and also create per-object slices
+                output_dict[storage_key][frame_idx] = consolidated_out
+                self._add_output_per_object(
+                    inference_state, frame_idx, consolidated_out, storage_key
+                )
+                # Clearing is enabled by `clear_non_cond_mem_around_input`; with more
+                # than one object it additionally requires
+                # `clear_non_cond_mem_for_multi_obj`.
+                clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
+                    self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
+                )
+                if clear_non_cond_mem:
+                    # clear non-conditioning memory of the surrounding frames
+                    self._clear_non_cond_mem_around_input(inference_state, frame_idx)
+
+            # clear temporary outputs in `temp_output_dict_per_obj`
+            for obj_temp_output_dict in temp_output_dict_per_obj.values():
+                obj_temp_output_dict[storage_key].clear()
+
+        # edge case: if an output is added to "cond_frame_outputs", we remove any prior
+        # output on the same frame in "non_cond_frame_outputs"
+        for frame_idx in output_dict["cond_frame_outputs"]:
+            output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
+        for obj_output_dict in inference_state["output_dict_per_obj"].values():
+            for frame_idx in obj_output_dict["cond_frame_outputs"]:
+                obj_output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
+        for frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
+            assert frame_idx in output_dict["cond_frame_outputs"]
+            consolidated_frame_inds["non_cond_frame_outputs"].discard(frame_idx)
+
+        # Make sure that the frame indices in "consolidated_frame_inds" are exactly those frames
+        # with either points or mask inputs (which should be true under a correct workflow).
+        all_consolidated_frame_inds = (
+            consolidated_frame_inds["cond_frame_outputs"]
+            | consolidated_frame_inds["non_cond_frame_outputs"]
+        )
+        input_frames_inds = set()
+        for point_inputs_per_frame in inference_state["point_inputs_per_obj"].values():
+            input_frames_inds.update(point_inputs_per_frame.keys())
+        for mask_inputs_per_frame in inference_state["mask_inputs_per_obj"].values():
+            input_frames_inds.update(mask_inputs_per_frame.keys())
+        assert all_consolidated_frame_inds == input_frames_inds
+
+    @torch.inference_mode()
+    def propagate_in_video(
+        self,
+        inference_state,
+        start_frame_idx=None,
+        max_frame_num_to_track=None,
+        reverse=False,
+    ):
+        """
+        Propagate the input points across frames to track in the entire video.
+
+        This is a generator: it first runs `propagate_in_video_preflight`, then walks
+        the selected frame range and yields one result per frame.
+
+        Args:
+            inference_state: the per-video inference state dict.
+            start_frame_idx: frame to start from; defaults to the smallest frame
+                index present in `output_dict["cond_frame_outputs"]`.
+            max_frame_num_to_track: maximum distance (in frames) to track away from
+                the start frame; defaults to the number of frames in the video.
+            reverse: if True, frames are visited in decreasing index order. Nothing
+                is processed in reverse mode when the start frame is 0.
+
+        Yields:
+            Tuples `(frame_idx, obj_ids, video_res_masks)`.
+
+        Raises:
+            RuntimeError: if there is no conditioning frame output after preflight.
+        """
+        self.propagate_in_video_preflight(inference_state)
+
+        output_dict = inference_state["output_dict"]
+        consolidated_frame_inds = inference_state["consolidated_frame_inds"]
+        obj_ids = inference_state["obj_ids"]
+        num_frames = inference_state["num_frames"]
+        batch_size = self._get_obj_num(inference_state)
+        if len(output_dict["cond_frame_outputs"]) == 0:
+            raise RuntimeError("No points are provided; please add points first")
+        # Same enabling rule as in `propagate_in_video_preflight`.
+        clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
+            self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
+        )
+
+        # set start index, end index, and processing order
+        if start_frame_idx is None:
+            # default: start from the earliest frame with input points
+            start_frame_idx = min(output_dict["cond_frame_outputs"])
+        if max_frame_num_to_track is None:
+            # default: track all the frames in the video
+            max_frame_num_to_track = num_frames
+        if reverse:
+            # clamp at the first frame; the range below includes `end_frame_idx`
+            end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0)
+            if start_frame_idx > 0:
+                processing_order = range(start_frame_idx, end_frame_idx - 1, -1)
+            else:
+                processing_order = []  # skip reverse tracking if starting from frame 0
+        else:
+            # clamp at the last frame; the range below includes `end_frame_idx`
+            end_frame_idx = min(
+                start_frame_idx + max_frame_num_to_track, num_frames - 1
+            )
+            processing_order = range(start_frame_idx, end_frame_idx + 1)
+
+        for frame_idx in tqdm(processing_order, desc="propagate in video"):
+            # We skip those frames already in consolidated outputs (these are frames
+            # that received input clicks or mask). Note that we cannot directly run
+            # batched forward on them via `_run_single_frame_inference` because the
+            # number of clicks on each object might be different.
+            if frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
+                storage_key = "cond_frame_outputs"
+                current_out = output_dict[storage_key][frame_idx]
+                pred_masks = current_out["pred_masks"]
+                if clear_non_cond_mem:
+                    # clear non-conditioning memory of the surrounding frames
+                    self._clear_non_cond_mem_around_input(inference_state, frame_idx)
+            elif frame_idx in consolidated_frame_inds["non_cond_frame_outputs"]:
+                storage_key = "non_cond_frame_outputs"
+                current_out = output_dict[storage_key][frame_idx]
+                pred_masks = current_out["pred_masks"]
+            else:
+                # A frame without user input: run batched tracking for all objects
+                # and encode the result into memory.
+                storage_key = "non_cond_frame_outputs"
+                current_out, pred_masks = self._run_single_frame_inference(
+                    inference_state=inference_state,
+                    output_dict=output_dict,
+                    frame_idx=frame_idx,
+                    batch_size=batch_size,
+                    is_init_cond_frame=False,
+                    point_inputs=None,
+                    mask_inputs=None,
+                    reverse=reverse,
+                    run_mem_encoder=True,
+                )
+                output_dict[storage_key][frame_idx] = current_out
+            # Create slices of per-object outputs for subsequent interaction with each
+            # individual object after tracking.
+            self._add_output_per_object(
+                inference_state, frame_idx, current_out, storage_key
+            )
+            # Record the tracking direction; `add_new_mask` reads it back for frames
+            # that were already tracked.
+            inference_state["frames_already_tracked"][frame_idx] = {"reverse": reverse}
+
+            # Resize the output mask to the original video resolution (we directly use
+            # the mask scores on GPU for output to avoid any CPU conversion in between)
+            _, video_res_masks = self._get_orig_video_res_output(
+                inference_state, pred_masks
+            )
+            yield frame_idx, obj_ids, video_res_masks
+
+ def _add_output_per_object(
+ self, inference_state, frame_idx, current_out, storage_key
+ ):
+ """
+ Split a multi-object output into per-object output slices and add them into
+ `output_dict_per_obj`. The resulting slices share the same tensor storage.
+ """
+ maskmem_features = current_out["maskmem_features"]
+ assert maskmem_features is None or isinstance(maskmem_features, torch.Tensor)
+
+ maskmem_pos_enc = current_out["maskmem_pos_enc"]
+ assert maskmem_pos_enc is None or isinstance(maskmem_pos_enc, list)
+
+ output_dict_per_obj = inference_state["output_dict_per_obj"]
+ for obj_idx, obj_output_dict in output_dict_per_obj.items():
+ obj_slice = slice(obj_idx, obj_idx + 1)
+ obj_out = {
+ "maskmem_features": None,
+ "maskmem_pos_enc": None,
+ "pred_masks": current_out["pred_masks"][obj_slice],
+ "obj_ptr": current_out["obj_ptr"][obj_slice],
+ "object_score_logits": current_out["object_score_logits"][obj_slice],
+ }
+ if maskmem_features is not None:
+ obj_out["maskmem_features"] = maskmem_features[obj_slice]
+ if maskmem_pos_enc is not None:
+ obj_out["maskmem_pos_enc"] = [x[obj_slice] for x in maskmem_pos_enc]
+ obj_output_dict[storage_key][frame_idx] = obj_out
+
+    @torch.inference_mode()
+    def clear_all_prompts_in_frame(
+        self, inference_state, frame_idx, obj_id, need_output=True
+    ):
+        """
+        Remove all input points or mask in a specific frame for a given object.
+
+        If no object has inputs left on `frame_idx` afterwards, the frame loses its
+        conditioning status: its conditioning output (if any) is moved to the
+        non-conditioning outputs, and all tracking results are reset once no
+        conditioning frame remains.
+
+        Args:
+            inference_state: the per-video inference state dict.
+            frame_idx: index of the frame whose prompts are removed.
+            obj_id: object id, resolved to an index via `self._obj_id_to_idx`.
+            need_output: if False, return None right after updating the state.
+
+        Returns:
+            None when `need_output` is False; otherwise a tuple
+            `(frame_idx, obj_ids, video_res_masks)` with the updated masks.
+        """
+        obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+
+        # Clear the conditioning information on the given frame
+        inference_state["point_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+        inference_state["mask_inputs_per_obj"][obj_idx].pop(frame_idx, None)
+
+        temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+        temp_output_dict_per_obj[obj_idx]["cond_frame_outputs"].pop(frame_idx, None)
+        temp_output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].pop(frame_idx, None)
+
+        # Check and see if there are still any inputs left on this frame
+        batch_size = self._get_obj_num(inference_state)
+        frame_has_input = False
+        for obj_idx2 in range(batch_size):
+            if frame_idx in inference_state["point_inputs_per_obj"][obj_idx2]:
+                frame_has_input = True
+                break
+            if frame_idx in inference_state["mask_inputs_per_obj"][obj_idx2]:
+                frame_has_input = True
+                break
+
+        # If this frame has no remaining inputs for any objects, we further clear its
+        # conditioning frame status
+        if not frame_has_input:
+            output_dict = inference_state["output_dict"]
+            consolidated_frame_inds = inference_state["consolidated_frame_inds"]
+            consolidated_frame_inds["cond_frame_outputs"].discard(frame_idx)
+            consolidated_frame_inds["non_cond_frame_outputs"].discard(frame_idx)
+            # Remove the frame's conditioning output (possibly downgrading it to non-conditioning)
+            out = output_dict["cond_frame_outputs"].pop(frame_idx, None)
+            if out is not None:
+                # The frame is not a conditioning frame anymore since it's not receiving inputs,
+                # so we "downgrade" its output (if exists) to a non-conditioning frame output.
+                output_dict["non_cond_frame_outputs"][frame_idx] = out
+                inference_state["frames_already_tracked"].pop(frame_idx, None)
+            # Similarly, do it for the sliced output on each object.
+            for obj_idx2 in range(batch_size):
+                obj_output_dict = inference_state["output_dict_per_obj"][obj_idx2]
+                obj_out = obj_output_dict["cond_frame_outputs"].pop(frame_idx, None)
+                if obj_out is not None:
+                    obj_output_dict["non_cond_frame_outputs"][frame_idx] = obj_out
+
+            # If all the conditioning frames have been removed, we also clear the tracking outputs
+            if len(output_dict["cond_frame_outputs"]) == 0:
+                self._reset_tracking_results(inference_state)
+
+        if not need_output:
+            return
+        # Finally, output updated masks per object (after removing the inputs above)
+        obj_ids = inference_state["obj_ids"]
+        # Read from the conditioning temp storage if any object still has a temporary
+        # conditioning output on this frame, otherwise from the non-conditioning one.
+        is_cond = any(
+            frame_idx in obj_temp_output_dict["cond_frame_outputs"]
+            for obj_temp_output_dict in temp_output_dict_per_obj.values()
+        )
+        consolidated_out = self._consolidate_temp_output_across_obj(
+            inference_state,
+            frame_idx,
+            is_cond=is_cond,
+            run_mem_encoder=False,
+            consolidate_at_video_res=True,
+        )
+        _, video_res_masks = self._get_orig_video_res_output(
+            inference_state, consolidated_out["pred_masks_video_res"]
+        )
+        return frame_idx, obj_ids, video_res_masks
+
+ @torch.inference_mode()
+ def reset_state(self, inference_state):
+ """Remove all input points or mask in all frames throughout the video."""
+ self._reset_tracking_results(inference_state)
+ # Remove all object ids
+ inference_state["obj_id_to_idx"].clear()
+ inference_state["obj_idx_to_id"].clear()
+ inference_state["obj_ids"].clear()
+ inference_state["point_inputs_per_obj"].clear()
+ inference_state["mask_inputs_per_obj"].clear()
+ inference_state["output_dict_per_obj"].clear()
+ inference_state["temp_output_dict_per_obj"].clear()
+
+ def _reset_tracking_results(self, inference_state):
+ """Reset all tracking inputs and results across the videos."""
+ for v in inference_state["point_inputs_per_obj"].values():
+ v.clear()
+ for v in inference_state["mask_inputs_per_obj"].values():
+ v.clear()
+ for v in inference_state["output_dict_per_obj"].values():
+ v["cond_frame_outputs"].clear()
+ v["non_cond_frame_outputs"].clear()
+ for v in inference_state["temp_output_dict_per_obj"].values():
+ v["cond_frame_outputs"].clear()
+ v["non_cond_frame_outputs"].clear()
+ inference_state["output_dict"]["cond_frame_outputs"].clear()
+ inference_state["output_dict"]["non_cond_frame_outputs"].clear()
+ inference_state["consolidated_frame_inds"]["cond_frame_outputs"].clear()
+ inference_state["consolidated_frame_inds"]["non_cond_frame_outputs"].clear()
+ inference_state["tracking_has_started"] = False
+ inference_state["frames_already_tracked"].clear()
+
+ def _get_image_feature(self, inference_state, frame_idx, batch_size):
+ """Compute the image features on a given frame."""
+ # Look up in the cache first
+ image, backbone_out = inference_state["cached_features"].get(
+ frame_idx, (None, None)
+ )
+ if backbone_out is None:
+ # Cache miss -- we will run inference on a single image
+ device = inference_state["device"]
+ image = inference_state["images"][frame_idx].to(device).float().unsqueeze(0)
+ backbone_out = self.forward_image(image)
+ # Cache the most recent frame's feature (for repeated interactions with
+ # a frame; we can use an LRU cache for more frames in the future).
+ inference_state["cached_features"] = {frame_idx: (image, backbone_out)}
+
+ # expand the features to have the same dimension as the number of objects
+ expanded_image = image.expand(batch_size, -1, -1, -1)
+ expanded_backbone_out = {
+ "backbone_fpn": backbone_out["backbone_fpn"].copy(),
+ "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
+ }
+ for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]):
+ expanded_backbone_out["backbone_fpn"][i] = feat.expand(
+ batch_size, -1, -1, -1
+ )
+ for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]):
+ pos = pos.expand(batch_size, -1, -1, -1)
+ expanded_backbone_out["vision_pos_enc"][i] = pos
+
+ features = self._prepare_backbone_features(expanded_backbone_out)
+ features = (expanded_image,) + features
+ return features
+
+    def _run_single_frame_inference(
+        self,
+        inference_state,
+        output_dict,
+        frame_idx,
+        batch_size,
+        is_init_cond_frame,
+        point_inputs,
+        mask_inputs,
+        reverse,
+        run_mem_encoder,
+        prev_sam_mask_logits=None,
+    ):
+        """
+        Run tracking on a single frame based on current inputs and previous memory.
+
+        Args:
+            inference_state: the per-video inference state dict.
+            output_dict: previous outputs passed to `track_step` as memory.
+            frame_idx: index of the frame to process.
+            batch_size: number of objects processed together on this frame.
+            is_init_cond_frame: forwarded to `track_step`.
+            point_inputs: point prompts, or None.
+            mask_inputs: mask prompt, or None; must not be given together with
+                `point_inputs`.
+            reverse: forwarded to `track_step` as `track_in_reverse`.
+            run_mem_encoder: forwarded to `track_step`.
+            prev_sam_mask_logits: forwarded to `track_step`.
+
+        Returns:
+            A tuple `(compact_current_out, pred_masks_gpu)`: a reduced output dict
+            whose masks and memory features were moved to the storage device, and
+            the predicted mask scores before that move.
+
+        Raises:
+            AssertionError: if both `point_inputs` and `mask_inputs` are given.
+        """
+        # Retrieve correct image features
+        (
+            _,
+            _,
+            current_vision_feats,
+            current_vision_pos_embeds,
+            feat_sizes,
+        ) = self._get_image_feature(inference_state, frame_idx, batch_size)
+
+        # point and mask should not appear as input simultaneously on the same frame
+        assert point_inputs is None or mask_inputs is None
+        current_out = self.track_step(
+            frame_idx=frame_idx,
+            is_init_cond_frame=is_init_cond_frame,
+            current_vision_feats=current_vision_feats,
+            current_vision_pos_embeds=current_vision_pos_embeds,
+            feat_sizes=feat_sizes,
+            point_inputs=point_inputs,
+            mask_inputs=mask_inputs,
+            output_dict=output_dict,
+            num_frames=inference_state["num_frames"],
+            track_in_reverse=reverse,
+            run_mem_encoder=run_mem_encoder,
+            prev_sam_mask_logits=prev_sam_mask_logits,
+        )
+
+        # optionally offload the output to CPU memory to save GPU space
+        storage_device = inference_state["storage_device"]
+        maskmem_features = current_out["maskmem_features"]
+        if maskmem_features is not None:
+            # stored as bfloat16 to reduce the size of the kept memory features
+            maskmem_features = maskmem_features.to(torch.bfloat16)
+            maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
+        pred_masks_gpu = current_out["pred_masks"]
+        # potentially fill holes in the predicted masks
+        if self.fill_hole_area > 0:
+            pred_masks_gpu = fill_holes_in_mask_scores(
+                pred_masks_gpu, self.fill_hole_area
+            )
+        pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True)
+        # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+        maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out)
+        # object pointer is a small tensor, so we always keep it on GPU memory for fast access
+        obj_ptr = current_out["obj_ptr"]
+        object_score_logits = current_out["object_score_logits"]
+        # make a compact version of this frame's output to reduce the state size
+        compact_current_out = {
+            "maskmem_features": maskmem_features,
+            "maskmem_pos_enc": maskmem_pos_enc,
+            "pred_masks": pred_masks,
+            "obj_ptr": obj_ptr,
+            "object_score_logits": object_score_logits,
+        }
+        return compact_current_out, pred_masks_gpu
+
+ def _run_memory_encoder(
+ self,
+ inference_state,
+ frame_idx,
+ batch_size,
+ high_res_masks,
+ object_score_logits,
+ is_mask_from_pts,
+ ):
+ """
+ Run the memory encoder on `high_res_masks`. This is usually after applying
+ non-overlapping constraints to object scores. Since their scores changed, their
+ memory also need to be computed again with the memory encoder.
+ """
+ # Retrieve correct image features
+ _, _, current_vision_feats, _, feat_sizes = self._get_image_feature(
+ inference_state, frame_idx, batch_size
+ )
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ current_vision_feats=current_vision_feats,
+ feat_sizes=feat_sizes,
+ pred_masks_high_res=high_res_masks,
+ object_score_logits=object_score_logits,
+ is_mask_from_pts=is_mask_from_pts,
+ )
+
+ # optionally offload the output to CPU memory to save GPU space
+ storage_device = inference_state["storage_device"]
+ maskmem_features = maskmem_features.to(torch.bfloat16)
+ maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
+ # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+ maskmem_pos_enc = self._get_maskmem_pos_enc(
+ inference_state, {"maskmem_pos_enc": maskmem_pos_enc}
+ )
+ return maskmem_features, maskmem_pos_enc
+
+ def _get_maskmem_pos_enc(self, inference_state, current_out):
+ """
+ `maskmem_pos_enc` is the same across frames and objects, so we cache it as
+ a constant in the inference session to reduce session storage size.
+ """
+ model_constants = inference_state["constants"]
+ # "out_maskmem_pos_enc" should be either a list of tensors or None
+ out_maskmem_pos_enc = current_out["maskmem_pos_enc"]
+ if out_maskmem_pos_enc is not None:
+ if "maskmem_pos_enc" not in model_constants:
+ assert isinstance(out_maskmem_pos_enc, list)
+ # only take the slice for one object, since it's same across objects
+ maskmem_pos_enc = [x[0:1].clone() for x in out_maskmem_pos_enc]
+ model_constants["maskmem_pos_enc"] = maskmem_pos_enc
+ else:
+ maskmem_pos_enc = model_constants["maskmem_pos_enc"]
+ # expand the cached maskmem_pos_enc to the actual batch size
+ batch_size = out_maskmem_pos_enc[0].size(0)
+ expanded_maskmem_pos_enc = [
+ x.expand(batch_size, -1, -1, -1) for x in maskmem_pos_enc
+ ]
+ else:
+ expanded_maskmem_pos_enc = None
+ return expanded_maskmem_pos_enc
+
+    @torch.inference_mode()
+    def remove_object(self, inference_state, obj_id, strict=False, need_output=True):
+        """
+        Remove an object id from the tracking state. If strict is True, we check whether
+        the object id actually exists and raise an error if it doesn't exist.
+
+        Args:
+            inference_state: the per-video inference state dict.
+            obj_id: id of the object to remove.
+            strict: if True, an unknown `obj_id` raises instead of being ignored.
+            need_output: if True, updated masks are collected for the frames on
+                which the removed object had point or mask inputs.
+
+        Returns:
+            A tuple `(obj_ids, updated_frames)`: the object ids in
+            `inference_state["obj_ids"]` after the call, and a list of
+            `(frame_idx, video_res_masks)` pairs. The list is empty when the object
+            did not exist, when it was the last remaining object, or when
+            `need_output` is False.
+
+        Raises:
+            RuntimeError: if `strict` is True and `obj_id` does not exist.
+        """
+        old_obj_idx_to_rm = inference_state["obj_id_to_idx"].get(obj_id, None)
+        updated_frames = []
+        # Check whether this object_id to remove actually exists and possibly raise an error.
+        if old_obj_idx_to_rm is None:
+            if not strict:
+                return inference_state["obj_ids"], updated_frames
+            raise RuntimeError(
+                f"Cannot remove object id {obj_id} as it doesn't exist. "
+                f"All existing object ids: {inference_state['obj_ids']}."
+            )
+
+        # If this is the only remaining object id, we simply reset the state.
+        if len(inference_state["obj_id_to_idx"]) == 1:
+            self.reset_state(inference_state)
+            return inference_state["obj_ids"], updated_frames
+
+        # There are still remaining objects after removing this object id. In this case,
+        # we need to delete the object storage from inference state tensors.
+        # Step 0: clear the input on those frames where this object id has point or mask input
+        # (note that this step is required as it might downgrade conditioning frames to
+        # non-conditioning ones)
+        obj_input_frames_inds = set()
+        obj_input_frames_inds.update(
+            inference_state["point_inputs_per_obj"][old_obj_idx_to_rm]
+        )
+        obj_input_frames_inds.update(
+            inference_state["mask_inputs_per_obj"][old_obj_idx_to_rm]
+        )
+        for frame_idx in obj_input_frames_inds:
+            self.clear_all_prompts_in_frame(
+                inference_state, frame_idx, obj_id, need_output=False
+            )
+
+        # Step 1: Update the object id mapping (note that it must be done after Step 0,
+        # since Step 0 still requires the old object id mappings in inference_state)
+        old_obj_ids = inference_state["obj_ids"]
+        old_obj_inds = list(range(len(old_obj_ids)))
+        remain_old_obj_inds = old_obj_inds.copy()
+        remain_old_obj_inds.remove(old_obj_idx_to_rm)
+        new_obj_ids = [old_obj_ids[old_idx] for old_idx in remain_old_obj_inds]
+        new_obj_inds = list(range(len(new_obj_ids)))
+        # build new mappings
+        old_idx_to_new_idx = dict(zip(remain_old_obj_inds, new_obj_inds))
+        inference_state["obj_id_to_idx"] = dict(zip(new_obj_ids, new_obj_inds))
+        inference_state["obj_idx_to_id"] = dict(zip(new_obj_inds, new_obj_ids))
+        inference_state["obj_ids"] = new_obj_ids
+
+        # Step 2: For per-object tensor storage, we shift their obj_idx in the dict keys.
+        # (note that "consolidated_frame_inds" doesn't need to be updated in this step as
+        # it's already handled in Step 0)
+        def _map_keys(container):
+            # Pop every old index first and re-insert the survivors afterwards, so
+            # that new keys never collide with old keys that are still present.
+            new_kvs = []
+            for k in old_obj_inds:
+                v = container.pop(k)
+                if k in old_idx_to_new_idx:
+                    new_kvs.append((old_idx_to_new_idx[k], v))
+            container.update(new_kvs)
+
+        _map_keys(inference_state["point_inputs_per_obj"])
+        _map_keys(inference_state["mask_inputs_per_obj"])
+        _map_keys(inference_state["output_dict_per_obj"])
+        _map_keys(inference_state["temp_output_dict_per_obj"])
+
+        # Step 3: For packed tensor storage, we index the remaining ids and rebuild the per-object slices.
+        def _slice_state(output_dict, storage_key):
+            # NOTE(review): indexes "maskmem_features"/"maskmem_pos_enc" directly, so
+            # this presumably relies on every stored output having them set (not
+            # None) -- confirm against the code paths that fill `output_dict`.
+            for frame_idx, out in output_dict[storage_key].items():
+                out["maskmem_features"] = out["maskmem_features"][remain_old_obj_inds]
+                out["maskmem_pos_enc"] = [
+                    x[remain_old_obj_inds] for x in out["maskmem_pos_enc"]
+                ]
+                # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
+                out["maskmem_pos_enc"] = self._get_maskmem_pos_enc(inference_state, out)
+                out["pred_masks"] = out["pred_masks"][remain_old_obj_inds]
+                out["obj_ptr"] = out["obj_ptr"][remain_old_obj_inds]
+                out["object_score_logits"] = out["object_score_logits"][
+                    remain_old_obj_inds
+                ]
+                # also update the per-object slices
+                self._add_output_per_object(
+                    inference_state, frame_idx, out, storage_key
+                )
+
+        _slice_state(inference_state["output_dict"], "cond_frame_outputs")
+        _slice_state(inference_state["output_dict"], "non_cond_frame_outputs")
+
+        # Step 4: Further collect the outputs on those frames in `obj_input_frames_inds`, which
+        # could show an updated mask for objects previously occluded by the object being removed
+        if need_output:
+            temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+            for frame_idx in obj_input_frames_inds:
+                is_cond = any(
+                    frame_idx in obj_temp_output_dict["cond_frame_outputs"]
+                    for obj_temp_output_dict in temp_output_dict_per_obj.values()
+                )
+                consolidated_out = self._consolidate_temp_output_across_obj(
+                    inference_state,
+                    frame_idx,
+                    is_cond=is_cond,
+                    run_mem_encoder=False,
+                    consolidate_at_video_res=True,
+                )
+                _, video_res_masks = self._get_orig_video_res_output(
+                    inference_state, consolidated_out["pred_masks_video_res"]
+                )
+                updated_frames.append((frame_idx, video_res_masks))
+
+        return inference_state["obj_ids"], updated_frames
+
+ def _clear_non_cond_mem_around_input(self, inference_state, frame_idx):
+ """
+ Remove the non-conditioning memory around the input frame. When users provide
+ correction clicks, the surrounding frames' non-conditioning memories can still
+ contain outdated object appearance information and could confuse the model.
+
+ This method clears those non-conditioning memories surrounding the interacted
+ frame to avoid giving the model both old and new information about the object.
+ """
+ r = self.memory_temporal_stride_for_eval
+ frame_idx_begin = frame_idx - r * self.num_maskmem
+ frame_idx_end = frame_idx + r * self.num_maskmem
+ output_dict = inference_state["output_dict"]
+ non_cond_frame_outputs = output_dict["non_cond_frame_outputs"]
+ for t in range(frame_idx_begin, frame_idx_end + 1):
+ non_cond_frame_outputs.pop(t, None)
+ for obj_output_dict in inference_state["output_dict_per_obj"].values():
+ obj_output_dict["non_cond_frame_outputs"].pop(t, None)
diff --git a/sam2_repo/sam2/utils/__init__.py b/sam2_repo/sam2/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5277f46157403e47fd830fc519144b97ef69d4ae
--- /dev/null
+++ b/sam2_repo/sam2/utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/sam2_repo/sam2/utils/__pycache__/__init__.cpython-313.pyc b/sam2_repo/sam2/utils/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1b58d862696a11817829b23aae3d8da833a998e
Binary files /dev/null and b/sam2_repo/sam2/utils/__pycache__/__init__.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/utils/__pycache__/misc.cpython-313.pyc b/sam2_repo/sam2/utils/__pycache__/misc.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1f148ceefef0549ef3f3258a7c41cd1a4c1b6d1
Binary files /dev/null and b/sam2_repo/sam2/utils/__pycache__/misc.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/utils/__pycache__/transforms.cpython-313.pyc b/sam2_repo/sam2/utils/__pycache__/transforms.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ee57d423ed8b565ab4f16369a22ab1ef501f2cd
Binary files /dev/null and b/sam2_repo/sam2/utils/__pycache__/transforms.cpython-313.pyc differ
diff --git a/sam2_repo/sam2/utils/amg.py b/sam2_repo/sam2/utils/amg.py
new file mode 100644
index 0000000000000000000000000000000000000000..986842960cf5deca00614b7b1cde1ab77dad7e6e
--- /dev/null
+++ b/sam2_repo/sam2/utils/amg.py
@@ -0,0 +1,348 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from copy import deepcopy
+from itertools import product
+from typing import Any, Dict, Generator, ItemsView, List, Tuple
+
+import numpy as np
+import torch
+
+# Very lightly adapted from https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/utils/amg.py
+
+
+class MaskData:
+    """
+    Batched container for masks and the per-mask data that accompanies them.
+
+    Every entry is a list, numpy array, or torch tensor; `filter` and `cat`
+    are applied to all entries at once.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        assert all(
+            isinstance(value, (list, np.ndarray, torch.Tensor))
+            for value in kwargs.values()
+        ), "MaskData only supports list, numpy arrays, and torch tensors."
+        self._stats = dict(**kwargs)
+
+    def __setitem__(self, key: str, item: Any) -> None:
+        supported = (list, np.ndarray, torch.Tensor)
+        assert isinstance(
+            item, supported
+        ), "MaskData only supports list, numpy arrays, and torch tensors."
+        self._stats[key] = item
+
+    def __delitem__(self, key: str) -> None:
+        del self._stats[key]
+
+    def __getitem__(self, key: str) -> Any:
+        return self._stats[key]
+
+    def items(self) -> ItemsView[str, Any]:
+        return self._stats.items()
+
+    def filter(self, keep: torch.Tensor) -> None:
+        """
+        Keep only the rows selected by `keep` in every entry, in place.
+
+        `keep` is used as a tensor index for tensors, converted to numpy for
+        arrays, and for lists is treated either as a boolean mask (bool dtype)
+        or as a sequence of indices (any other dtype).
+        """
+        for key, current in self._stats.items():
+            if current is None:
+                # Nothing to select from; the entry stays None.
+                continue
+            if isinstance(current, torch.Tensor):
+                selected = current[torch.as_tensor(keep, device=current.device)]
+            elif isinstance(current, np.ndarray):
+                selected = current[keep.detach().cpu().numpy()]
+            elif isinstance(current, list):
+                if keep.dtype == torch.bool:
+                    selected = [current[i] for i in range(len(current)) if keep[i]]
+                else:
+                    selected = [current[i] for i in keep]
+            else:
+                raise TypeError(
+                    f"MaskData key {key} has an unsupported type {type(current)}."
+                )
+            self._stats[key] = selected
+
+    def cat(self, new_stats: "MaskData") -> None:
+        """
+        Append the entries of `new_stats` to this container along the first axis.
+
+        Keys that are missing here (or currently None) receive a deep copy of the
+        incoming value.
+        """
+        for key, incoming in new_stats.items():
+            existing = self._stats.get(key)
+            if existing is None:
+                self._stats[key] = deepcopy(incoming)
+            elif isinstance(incoming, torch.Tensor):
+                self._stats[key] = torch.cat([existing, incoming], dim=0)
+            elif isinstance(incoming, np.ndarray):
+                self._stats[key] = np.concatenate([existing, incoming], axis=0)
+            elif isinstance(incoming, list):
+                self._stats[key] = existing + deepcopy(incoming)
+            else:
+                raise TypeError(
+                    f"MaskData key {key} has an unsupported type {type(incoming)}."
+                )
+
+    def to_numpy(self) -> None:
+        """Replace every tensor entry by a float numpy array on the CPU."""
+        for key, current in self._stats.items():
+            if not isinstance(current, torch.Tensor):
+                continue
+            self._stats[key] = current.float().detach().cpu().numpy()
+
+
+def is_box_near_crop_edge(
+    boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
+) -> torch.Tensor:
+    """
+    Flag boxes that touch an edge of the crop without touching the image edge.
+
+    `boxes` are given in crop coordinates and are shifted into image coordinates
+    first. A coordinate counts as "near" an edge when it is within `atol` of the
+    matching coordinate of `crop_box` / `orig_box`. Returns one boolean per box.
+    """
+    device = boxes.device
+    crop_edges = torch.as_tensor(crop_box, dtype=torch.float, device=device).unsqueeze(0)
+    image_edges = torch.as_tensor(orig_box, dtype=torch.float, device=device).unsqueeze(0)
+    abs_boxes = uncrop_boxes_xyxy(boxes, crop_box).float()
+    touches_crop = torch.isclose(abs_boxes, crop_edges, atol=atol, rtol=0)
+    touches_image = torch.isclose(abs_boxes, image_edges, atol=atol, rtol=0)
+    # Only crop edges that are not also image edges count.
+    return (touches_crop & ~touches_image).any(dim=1)
+
+
+def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor:
+    """Return a copy of `box_xyxy` with entries 2 and 3 turned into width and height."""
+    converted = deepcopy(box_xyxy)
+    x0, y0 = converted[0], converted[1]
+    converted[2] = converted[2] - x0
+    converted[3] = converted[3] - y0
+    return converted
+
+
+def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
+    """
+    Yield aligned slices of at most `batch_size` items from each sequence in `args`.
+
+    All sequences must have the same length; the last batch may be shorter.
+    """
+    assert len(args) > 0 and all(
+        len(seq) == len(args[0]) for seq in args
+    ), "Batched iteration must have inputs of all the same size."
+    full_batches, remainder = divmod(len(args[0]), batch_size)
+    n_batches = full_batches + (1 if remainder != 0 else 0)
+    for batch_idx in range(n_batches):
+        start = batch_idx * batch_size
+        yield [seq[start : start + batch_size] for seq in args]
+
+
+def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]:
+    """
+    Encode a batch of binary masks (B x H x W) as uncompressed RLE dicts, in the
+    format expected by pycoco tools.
+
+    Each result has "size" ([h, w]) and "counts" (run lengths over the
+    column-major flattened mask, starting with a run of zeros).
+    """
+    n_masks, height, width = tensor.shape
+    # Column-major (Fortran) order, with h and w flattened together.
+    flat = tensor.permute(0, 2, 1).flatten(1)
+
+    # (mask index, position) pairs where neighbouring pixels differ.
+    transitions = (flat[:, 1:] ^ flat[:, :-1]).nonzero()
+
+    def _scalar_like(value, ref):
+        return torch.tensor([value], dtype=ref.dtype, device=ref.device)
+
+    encoded = []
+    for mask_idx in range(n_masks):
+        in_this_mask = transitions[:, 0] == mask_idx
+        run_starts = transitions[in_this_mask, 1] + 1
+        edges = torch.cat(
+            [
+                _scalar_like(0, run_starts),
+                run_starts,
+                _scalar_like(height * width, run_starts),
+            ]
+        )
+        run_lengths = (edges[1:] - edges[:-1]).detach().cpu().tolist()
+        # A mask that starts with foreground gets an empty leading zero-run.
+        leading = [0] if flat[mask_idx, 0] != 0 else []
+        encoded.append({"size": [height, width], "counts": leading + run_lengths})
+    return encoded
+
+
+def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray:
+    """Decode an uncompressed RLE dict into a boolean mask of shape (h, w)."""
+    height, width = rle["size"]
+    flat = np.empty(height * width, dtype=bool)
+    cursor = 0
+    # Runs alternate background / foreground, starting with background.
+    for run_idx, run_len in enumerate(rle["counts"]):
+        stop = cursor + run_len
+        flat[cursor:stop] = bool(run_idx % 2)
+        cursor = stop
+    # The runs are column-major, so reshape as (w, h) and transpose to C order.
+    return flat.reshape(width, height).transpose()
+
+
+def area_from_rle(rle: Dict[str, Any]) -> int:
+    """Sum the odd-indexed (foreground) run lengths of an uncompressed RLE."""
+    return sum(
+        run_len for run_idx, run_len in enumerate(rle["counts"]) if run_idx % 2 == 1
+    )
+
+
+def calculate_stability_score(
+    masks: torch.Tensor, mask_threshold: float, threshold_offset: float
+) -> torch.Tensor:
+    """
+    Compute the stability score of each mask in a batch of mask logits.
+
+    The score is the IoU between the two binary masks obtained by thresholding
+    the logits at `mask_threshold + threshold_offset` and at
+    `mask_threshold - threshold_offset`.
+    """
+
+    def _pixels_above(cutoff):
+        # Accumulate in int16 then int32 to avoid an unnecessary cast to int64.
+        return (masks > cutoff).sum(-1, dtype=torch.int16).sum(-1, dtype=torch.int32)
+
+    # The high-threshold mask is always contained in the low-threshold one, so
+    # its area is the intersection and the low-threshold area is the union.
+    intersections = _pixels_above(mask_threshold + threshold_offset)
+    unions = _pixels_above(mask_threshold - threshold_offset)
+    return intersections / unions
+
+
+def build_point_grid(n_per_side: int) -> np.ndarray:
+    """
+    Build an `n_per_side` x `n_per_side` grid of (x, y) points evenly spaced in
+    [0,1]x[0,1], returned as an array of shape (n_per_side**2, 2).
+    """
+    # Half a cell of margin on each side centres the points in their cells.
+    margin = 1 / (2 * n_per_side)
+    axis = np.linspace(margin, 1 - margin, n_per_side)
+    xs = np.tile(axis, (n_per_side, 1))  # x varies along columns
+    ys = np.tile(axis.reshape(-1, 1), (1, n_per_side))  # y varies along rows
+    return np.stack([xs, ys], axis=-1).reshape(-1, 2)
+
+
+def build_all_layer_point_grids(
+    n_per_side: int, n_layers: int, scale_per_layer: int
+) -> List[np.ndarray]:
+    """
+    Build one point grid per crop layer (layers 0..n_layers inclusive).
+
+    Layer i uses `int(n_per_side / scale_per_layer**i)` points per side.
+    """
+    return [
+        build_point_grid(int(n_per_side / (scale_per_layer**layer)))
+        for layer in range(n_layers + 1)
+    ]
+
+
+def generate_crop_boxes(
+    im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
+) -> Tuple[List[List[int]], List[int]]:
+    """
+    Build the crop boxes for every layer, each as an `[x0, y0, x1, y1]` list.
+
+    Layer 0 is the full image; layer i (i >= 1) splits each side into 2**i
+    overlapping crops, i.e. (2**i)**2 boxes. Returns the boxes together with the
+    layer index of each box.
+    """
+    im_h, im_w = im_size
+    short_side = min(im_h, im_w)
+
+    # Layer 0: the original image.
+    crop_boxes = [[0, 0, im_w, im_h]]
+    layer_idxs = [0]
+
+    def _side_length(full_len, n_crops, overlap):
+        # Crop length such that n_crops crops overlapping by `overlap` span full_len.
+        return int(math.ceil((overlap * (n_crops - 1) + full_len) / n_crops))
+
+    for layer in range(1, n_layers + 1):
+        per_side = 2**layer
+        overlap = int(overlap_ratio * short_side * (2 / per_side))
+
+        crop_w = _side_length(im_w, per_side, overlap)
+        crop_h = _side_length(im_h, per_side, overlap)
+
+        x_starts = [int((crop_w - overlap) * k) for k in range(per_side)]
+        y_starts = [int((crop_h - overlap) * k) for k in range(per_side)]
+
+        # Boxes are emitted in XYXY form, clipped to the image bounds.
+        for x0 in x_starts:
+            for y0 in y_starts:
+                crop_boxes.append(
+                    [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)]
+                )
+                layer_idxs.append(layer)
+
+    return crop_boxes, layer_idxs
+
+
+def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+    """Shift XYXY boxes from crop coordinates by the crop's top-left corner."""
+    x0, y0, _, _ = crop_box
+    shift = torch.tensor([[x0, y0] * 2], device=boxes.device)
+    if boxes.dim() == 3:
+        # Boxes carry an extra channel dimension; broadcast the shift over it.
+        shift = shift.unsqueeze(1)
+    return boxes + shift
+
+
+def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+    """Shift (x, y) points from crop coordinates by the crop's top-left corner."""
+    x0, y0, _, _ = crop_box
+    shift = torch.tensor([[x0, y0]], device=points.device)
+    if points.dim() == 3:
+        # Points carry an extra channel dimension; broadcast the shift over it.
+        shift = shift.unsqueeze(1)
+    return points + shift
+
+
+def uncrop_masks(
+    masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int
+) -> torch.Tensor:
+    """
+    Zero-pad masks computed on a crop back to the `orig_h` x `orig_w` image.
+
+    If the crop already covers the whole image, `masks` is returned unchanged.
+    """
+    x0, y0, x1, y1 = crop_box
+    if (x0, y0, x1, y1) == (0, 0, orig_w, orig_h):
+        return masks
+    # Padding order is (left, right, top, bottom) over the last two dimensions.
+    padding = (x0, orig_w - x1, y0, orig_h - y1)
+    return torch.nn.functional.pad(masks, padding, value=0)
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Remove small disconnected regions ("islands") or small holes ("holes") from
+    a mask, using 8-connected components with area below `area_thresh`.
+
+    Returns the mask and a flag telling whether it was modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    filling_holes = mode == "holes"
+    # For "holes" the mask is inverted, so the holes are what gets labelled.
+    labelled_input = (filling_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(labelled_input, 8)
+    areas = stats[1:, -1]  # row 0 is the background label
+    too_small = [
+        label for label, area in enumerate(areas, start=1) if area < area_thresh
+    ]
+    if not too_small:
+        return mask, False
+
+    if filling_holes:
+        # Label 0 is the original foreground; add the small holes to it.
+        keep_labels = [0] + too_small
+    else:
+        keep_labels = [
+            label
+            for label in range(n_labels)
+            if label != 0 and label not in too_small
+        ]
+        # If every region is below threshold, keep the largest one.
+        if not keep_labels:
+            keep_labels = [int(np.argmax(areas)) + 1]
+    return np.isin(regions, keep_labels), True
+
+
+def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Convert an uncompressed RLE dict into a pycocotools RLE whose "counts" is a
+    str, so the result can be serialized with json.
+    """
+    from pycocotools import mask as mask_utils  # type: ignore
+
+    height, width = uncompressed_rle["size"]
+    encoded = mask_utils.frPyObjects(uncompressed_rle, height, width)
+    # frPyObjects yields bytes for "counts"; decode them for JSON.
+    encoded["counts"] = encoded["counts"].decode("utf-8")
+    return encoded
+
+
+def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor:
+    """
+    Compute XYXY boxes around masks; an empty mask yields [0,0,0,0].
+
+    For input shape C1xC2x...xHxW the output shape is C1xC2x...x4.
+    """
+    # torch.max raises on empty inputs, so answer those directly.
+    if torch.numel(masks) == 0:
+        return torch.zeros(*masks.shape[:-2], 4, device=masks.device)
+
+    orig_shape = masks.shape
+    height, width = orig_shape[-2:]
+    has_batch_dims = len(orig_shape) > 2
+    # Collapse the leading dimensions so the code below works on CxHxW.
+    masks = masks.flatten(0, -3) if has_batch_dims else masks.unsqueeze(0)
+
+    def _extent(occupied, length):
+        # `occupied` marks which rows/columns contain any mask pixel. Returns the
+        # first and last occupied index; with nothing occupied, first == length
+        # and last == 0, which is detected as empty below.
+        positions = occupied * torch.arange(length, device=occupied.device)[None, :]
+        last, _ = torch.max(positions, dim=-1)
+        positions = positions + length * (~occupied)
+        first, _ = torch.min(positions, dim=-1)
+        return first, last
+
+    rows_hit, _ = torch.max(masks, dim=-1)
+    top_edges, bottom_edges = _extent(rows_hit, height)
+    cols_hit, _ = torch.max(masks, dim=-2)
+    left_edges, right_edges = _extent(cols_hit, width)
+
+    # For an empty mask the right edge ends up left of the left edge; zero those.
+    is_empty = (right_edges < left_edges) | (bottom_edges < top_edges)
+    boxes = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1)
+    boxes = boxes * (~is_empty).unsqueeze(-1)
+
+    # Restore the original leading dimensions.
+    return boxes.reshape(*orig_shape[:-2], 4) if has_batch_dims else boxes[0]
diff --git a/sam2_repo/sam2/utils/misc.py b/sam2_repo/sam2/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..b65ee825732ff85137805be650edd4cbe8e6f6d4
--- /dev/null
+++ b/sam2_repo/sam2/utils/misc.py
@@ -0,0 +1,349 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import warnings
+from threading import Thread
+
+import numpy as np
+import torch
+from PIL import Image
+from tqdm import tqdm
+
+
+def get_sdpa_settings():
+    """
+    Choose scaled-dot-product-attention settings for the current environment.
+
+    Returns a tuple `(old_gpu, use_flash_attn, math_kernel_on)`. Without CUDA the
+    result is `(True, False, True)`. With CUDA the values are derived from the
+    compute-capability major version of device 0 and the installed PyTorch version.
+    """
+    if not torch.cuda.is_available():
+        return True, False, True
+
+    cc_major = torch.cuda.get_device_properties(0).major
+    old_gpu = cc_major < 7
+    # only use Flash Attention on Ampere (8.0) or newer GPUs
+    use_flash_attn = cc_major >= 8
+    if not use_flash_attn:
+        warnings.warn(
+            "Flash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.",
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+    # Flash Attention v2 is only available on PyTorch 2.2+ and v1 cannot handle
+    # all cases, so the math kernel is kept on for older PyTorch versions.
+    major_minor = tuple(int(part) for part in torch.__version__.split(".")[:2])
+    predates_fa2 = major_minor < (2, 2)
+    if predates_fa2:
+        warnings.warn(
+            f"You are using PyTorch {torch.__version__} without Flash Attention v2 support. "
+            "Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).",
+            category=UserWarning,
+            stacklevel=2,
+        )
+    math_kernel_on = predates_fa2 or not use_flash_attn
+    return old_gpu, use_flash_attn, math_kernel_on
+
+
+def get_connected_components(mask):
+    """
+    Label the connected components (8-connectivity) of binary masks.
+
+    Inputs:
+    - mask: A binary mask tensor of shape (N, 1, H, W), where 1 is foreground and 0 is
+            background.
+
+    Outputs:
+    - labels: A tensor of shape (N, 1, H, W) with the connected component label of
+              each foreground pixel and 0 for background pixels.
+    - counts: A tensor of shape (N, 1, H, W) with the area of the component each
+              foreground pixel belongs to and 0 for background pixels.
+    """
+    from sam2 import _C
+
+    # The extension is handed a contiguous uint8 tensor.
+    # NOTE(review): "componnets" is the spelling used for the extension function
+    # here; presumably it matches the compiled symbol - verify before renaming.
+    mask_u8 = mask.to(torch.uint8).contiguous()
+    return _C.get_connected_componnets(mask_u8)
+
+
+def mask_to_box(masks: torch.Tensor):
+    """
+    Compute the bounding box of each input mask.
+
+    Inputs:
+    - masks: [B, 1, H, W] masks, dtype=torch.Tensor
+
+    Returns:
+    - box_coords: [B, 1, 4], the (x, y) coordinates of the top-left and
+      bottom-right box corners, dtype=torch.Tensor
+    """
+    B, _, h, w = masks.shape
+    device = masks.device
+    col_ids = torch.arange(w, device=device, dtype=torch.int32)
+    row_ids = torch.arange(h, device=device, dtype=torch.int32)
+    # Per-pixel x / y coordinate maps, broadcast to the mask shape without copying.
+    grid_xs = col_ids.view(1, 1, 1, w).expand(B, 1, h, w)
+    grid_ys = row_ids.view(1, 1, h, 1).expand(B, 1, h, w)
+
+    def _reduce(grid, fill, reducer):
+        # Pixels outside the mask take `fill`, chosen so it never wins the reduction
+        # unless the mask is empty.
+        values, _ = reducer(torch.where(masks, grid, fill).flatten(-2), dim=-1)
+        return values
+
+    min_xs = _reduce(grid_xs, w, torch.min)
+    max_xs = _reduce(grid_xs, -1, torch.max)
+    min_ys = _reduce(grid_ys, h, torch.min)
+    max_ys = _reduce(grid_ys, -1, torch.max)
+    return torch.stack((min_xs, min_ys, max_xs, max_ys), dim=-1)
+
+
+def _load_img_as_tensor(img_path, image_size):
+    """
+    Load one image file as a (3, image_size, image_size) tensor scaled to [0, 1].
+
+    The image is converted to RGB and resized to `image_size` x `image_size`.
+
+    Returns:
+    - img: the image tensor (channels first)
+    - video_height, video_width: the size of the image before resizing
+
+    Raises:
+    - RuntimeError if the decoded pixel data is not uint8.
+    """
+    # The context manager releases the underlying file even if decoding or
+    # resizing raises, instead of leaving the handle to the garbage collector.
+    with Image.open(img_path) as img_pil:
+        video_width, video_height = img_pil.size  # the original video size
+        img_np = np.array(img_pil.convert("RGB").resize((image_size, image_size)))
+    if img_np.dtype == np.uint8:  # np.uint8 is expected for JPEG images
+        img_np = img_np / 255.0
+    else:
+        raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}")
+    img = torch.from_numpy(img_np).permute(2, 0, 1)
+    return img, video_height, video_width
+
+
+class AsyncVideoFrameLoader:
+ """
+ A list-like container of video frames that are loaded asynchronously, so that
+ session start is not blocked on decoding every frame.
+
+ The constructor loads frame 0 synchronously and then starts a daemon thread that
+ walks every index; `__getitem__` returns the cached tensor or loads it on demand.
+ """
+
+ def __init__(
+ self,
+ img_paths,
+ image_size,
+ offload_video_to_cpu,
+ img_mean,
+ img_std,
+ compute_device,
+ ):
+ # img_paths: indexable collection of image file paths, one per frame.
+ # image_size: frames are resized to image_size x image_size when loaded.
+ # offload_video_to_cpu: when falsy, each loaded frame is moved to compute_device.
+ # img_mean / img_std: subtracted from / divided into every loaded frame.
+ self.img_paths = img_paths
+ self.image_size = image_size
+ self.offload_video_to_cpu = offload_video_to_cpu
+ self.img_mean = img_mean
+ self.img_std = img_std
+ # items in `self.images` will be loaded asynchronously
+ self.images = [None] * len(img_paths)
+ # catch and raise any exceptions in the async loading thread
+ self.exception = None
+ # video_height and video_width are filled in when the first image is loaded
+ self.video_height = None
+ self.video_width = None
+ self.compute_device = compute_device
+
+ # load the first frame to fill video_height and video_width and also
+ # to cache it (since it's most likely where the user will click)
+ self.__getitem__(0)
+
+ # load the rest of frames asynchronously without blocking the session start
+ def _load_frames():
+ try:
+ for n in tqdm(range(len(self.images)), desc="frame loading (JPEG)"):
+ self.__getitem__(n)
+ except Exception as e:
+ # Stored rather than raised here; `__getitem__` re-raises it to the caller.
+ self.exception = e
+
+ # daemon=True: the loader thread does not keep the process alive on exit.
+ self.thread = Thread(target=_load_frames, daemon=True)
+ self.thread.start()
+
+ def __getitem__(self, index):
+ # Return the normalized frame at `index`, loading and caching it if needed.
+ # Raises RuntimeError (chained to the original error) once the loader thread
+ # has recorded a failure.
+ if self.exception is not None:
+ raise RuntimeError("Failure in frame loading thread") from self.exception
+
+ img = self.images[index]
+ if img is not None:
+ return img
+
+ # NOTE(review): `self.images` is written by the caller's thread and by the
+ # loader thread without a lock; looks benign (worst case a frame is decoded
+ # twice) - confirm.
+ img, video_height, video_width = _load_img_as_tensor(
+ self.img_paths[index], self.image_size
+ )
+ self.video_height = video_height
+ self.video_width = video_width
+ # normalize by mean and std
+ img -= self.img_mean
+ img /= self.img_std
+ # Normalization above happens before the optional transfer to compute_device.
+ if not self.offload_video_to_cpu:
+ img = img.to(self.compute_device, non_blocking=True)
+ self.images[index] = img
+ return img
+
+ def __len__(self):
+ # Number of frames (loaded or not).
+ return len(self.images)
+
+
+def load_video_frames(
+    video_path,
+    image_size,
+    offload_video_to_cpu,
+    img_mean=(0.485, 0.456, 0.406),
+    img_std=(0.229, 0.224, 0.225),
+    async_loading_frames=False,
+    compute_device=torch.device("cuda"),
+):
+    """
+    Load the video frames from video_path. The frames are resized to image_size as in
+    the model and are loaded to GPU if offload_video_to_cpu=False. This is used by the demo.
+
+    Dispatch:
+    - `bytes`, or a str path whose extension is ".mp4" (any letter case): handled by
+      `load_video_frames_from_video_file` (`async_loading_frames` is not used there).
+    - a str path to an existing directory: handled by `load_video_frames_from_jpg_images`.
+
+    Raises:
+    - NotImplementedError for any other input.
+    """
+    is_bytes = isinstance(video_path, bytes)
+    is_str = isinstance(video_path, str)
+    # Compare the extension case-insensitively so that e.g. ".Mp4" is accepted too.
+    is_mp4_path = is_str and os.path.splitext(video_path)[-1].lower() == ".mp4"
+    if is_bytes or is_mp4_path:
+        return load_video_frames_from_video_file(
+            video_path=video_path,
+            image_size=image_size,
+            offload_video_to_cpu=offload_video_to_cpu,
+            img_mean=img_mean,
+            img_std=img_std,
+            compute_device=compute_device,
+        )
+    elif is_str and os.path.isdir(video_path):
+        return load_video_frames_from_jpg_images(
+            video_path=video_path,
+            image_size=image_size,
+            offload_video_to_cpu=offload_video_to_cpu,
+            img_mean=img_mean,
+            img_std=img_std,
+            async_loading_frames=async_loading_frames,
+            compute_device=compute_device,
+        )
+    else:
+        raise NotImplementedError(
+            "Only MP4 video and JPEG folder are supported at this moment"
+        )
+
+
+def load_video_frames_from_jpg_images(
+    video_path,
+    image_size,
+    offload_video_to_cpu,
+    img_mean=(0.485, 0.456, 0.406),
+    img_std=(0.229, 0.224, 0.225),
+    async_loading_frames=False,
+    compute_device=torch.device("cuda"),
+):
+    """
+    Load the video frames from a directory of JPEG files (".jpg" format).
+
+    The frames are resized to image_size x image_size and are loaded to GPU if
+    `offload_video_to_cpu` is `False` and to CPU if `offload_video_to_cpu` is `True`.
+
+    You can load a frame asynchronously by setting `async_loading_frames` to `True`.
+
+    Frame files are ordered by the integer value of their base name (e.g.
+    "00000.jpg"), so a non-numeric JPEG name makes the sort fail with ValueError.
+
+    Returns `(images, video_height, video_width)`; with `async_loading_frames`,
+    `images` is an `AsyncVideoFrameLoader` instead of a tensor.
+
+    Raises:
+    - NotImplementedError if `video_path` is not a str path to an existing directory.
+    - RuntimeError if the directory contains no JPEG files.
+    """
+    if isinstance(video_path, str) and os.path.isdir(video_path):
+        jpg_folder = video_path
+    else:
+        # The hint names the input video and output directory as placeholders so
+        # the suggested ffmpeg command is usable.
+        raise NotImplementedError(
+            "Only JPEG frames are supported at this moment. For video files, you may use "
+            "ffmpeg (https://ffmpeg.org/) to extract frames into a folder of JPEG files, such as \n"
+            "```\n"
+            "ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'\n"
+            "```\n"
+            "where `-q:v` generates high-quality JPEG frames and `-start_number 0` asks "
+            "ffmpeg to start the JPEG file from 00000.jpg."
+        )
+
+    frame_names = [
+        p
+        for p in os.listdir(jpg_folder)
+        if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+    ]
+    frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+    num_frames = len(frame_names)
+    if num_frames == 0:
+        raise RuntimeError(f"no images found in {jpg_folder}")
+    img_paths = [os.path.join(jpg_folder, frame_name) for frame_name in frame_names]
+    # Shape (3, 1, 1) so the statistics broadcast over (3, H, W) frames.
+    img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
+    img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
+
+    if async_loading_frames:
+        lazy_images = AsyncVideoFrameLoader(
+            img_paths,
+            image_size,
+            offload_video_to_cpu,
+            img_mean,
+            img_std,
+            compute_device,
+        )
+        return lazy_images, lazy_images.video_height, lazy_images.video_width
+
+    images = torch.zeros(num_frames, 3, image_size, image_size, dtype=torch.float32)
+    # video_height / video_width end up holding the size of the last frame read.
+    for n, img_path in enumerate(tqdm(img_paths, desc="frame loading (JPEG)")):
+        images[n], video_height, video_width = _load_img_as_tensor(img_path, image_size)
+    if not offload_video_to_cpu:
+        images = images.to(compute_device)
+        img_mean = img_mean.to(compute_device)
+        img_std = img_std.to(compute_device)
+    # normalize by mean and std
+    images -= img_mean
+    images /= img_std
+    return images, video_height, video_width
+
+
+def load_video_frames_from_video_file(
+    video_path,
+    image_size,
+    offload_video_to_cpu,
+    img_mean=(0.485, 0.456, 0.406),
+    img_std=(0.229, 0.224, 0.225),
+    compute_device=torch.device("cuda"),
+):
+    """
+    Load every frame of a video file via decord, resized to image_size x image_size.
+
+    Returns `(images, video_height, video_width)`: the normalized frames stacked
+    along dim 0 (channels first), and the size of the first frame at the video's
+    own resolution. Frames are moved to `compute_device` unless
+    `offload_video_to_cpu` is truthy.
+    """
+    import decord
+
+    # Shape (3, 1, 1) so the statistics broadcast over (N, 3, H, W).
+    img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
+    img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
+    decord.bridge.set_bridge("torch")
+    # A first reader, without resizing, yields the original height and width.
+    first_frame = decord.VideoReader(video_path).next()
+    video_height, video_width, _ = first_frame.shape
+    # A second reader decodes all frames directly at the model resolution.
+    resized_reader = decord.VideoReader(video_path, width=image_size, height=image_size)
+    channel_first = [frame.permute(2, 0, 1) for frame in resized_reader]
+
+    images = torch.stack(channel_first, dim=0).float() / 255.0
+    if not offload_video_to_cpu:
+        images = images.to(compute_device)
+        img_mean = img_mean.to(compute_device)
+        img_std = img_std.to(compute_device)
+    # normalize by mean and std
+    images -= img_mean
+    images /= img_std
+    return images, video_height, video_width
+
+
+def fill_holes_in_mask_scores(mask, max_area):
+    """
+    Post-process mask scores by filling small holes with area up to `max_area`.
+
+    Holes are connected components of the background (scores <= 0); they are
+    given a small positive score (0.1) so they become foreground. If computing
+    the connected components fails, a warning is issued and the input mask is
+    returned unchanged.
+    """
+    assert max_area > 0, "max_area must be positive"
+
+    try:
+        labels, areas = get_connected_components(mask <= 0)
+        small_hole = (labels > 0) & (areas <= max_area)
+        return torch.where(small_hole, 0.1, mask)
+    except Exception as err:
+        # Skip the hole-filling step if the CUDA kernel fails.
+        warnings.warn(
+            f"{err}\n\nSkipping the post-processing step due to the error above. You can "
+            "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
+            "functionality may be limited (which doesn't affect the results in most cases; see "
+            "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
+            category=UserWarning,
+            stacklevel=2,
+        )
+        return mask
+
+
+def concat_points(old_point_inputs, new_points, new_labels):
+    """
+    Append new points and labels after the previous point inputs (along dim 1).
+
+    With no previous inputs, the new points and labels are used as they are.
+    """
+    if old_point_inputs is None:
+        return {"point_coords": new_points, "point_labels": new_labels}
+
+    merged_points = torch.cat([old_point_inputs["point_coords"], new_points], dim=1)
+    merged_labels = torch.cat([old_point_inputs["point_labels"], new_labels], dim=1)
+    return {"point_coords": merged_points, "point_labels": merged_labels}
diff --git a/sam2_repo/sam2/utils/transforms.py b/sam2_repo/sam2/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc17bebfab104b659c5469e8434cf357ae7e24b6
--- /dev/null
+++ b/sam2_repo/sam2/utils/transforms.py
@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.transforms import Normalize, Resize, ToTensor
+
+
+class SAM2Transforms(nn.Module):
+    """
+    Image, prompt, and mask transforms for SAM2: resizing/normalizing input images,
+    mapping point and box prompts to the model resolution, and post-processing
+    predicted masks.
+    """
+
+    def __init__(
+        self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0
+    ):
+        """
+        Transforms for SAM2.
+
+        - resolution: side length images are resized to, and the scale applied to
+          normalized prompt coordinates.
+        - mask_threshold: score separating foreground from background in
+          `postprocess_masks`.
+        - max_hole_area / max_sprinkle_area: largest background hole / foreground
+          speck removed by `postprocess_masks`; 0 disables the respective step.
+        """
+        super().__init__()
+        self.resolution = resolution
+        self.mask_threshold = mask_threshold
+        self.max_hole_area = max_hole_area
+        self.max_sprinkle_area = max_sprinkle_area
+        self.mean = [0.485, 0.456, 0.406]
+        self.std = [0.229, 0.224, 0.225]
+        self.to_tensor = ToTensor()
+        self.transforms = torch.jit.script(
+            nn.Sequential(
+                Resize((self.resolution, self.resolution)),
+                Normalize(self.mean, self.std),
+            )
+        )
+
+    def __call__(self, x):
+        """Convert one image to a tensor, then resize and normalize it."""
+        x = self.to_tensor(x)
+        return self.transforms(x)
+
+    def forward_batch(self, img_list):
+        """Transform every image in `img_list` and stack the results along dim 0."""
+        img_batch = [self.transforms(self.to_tensor(img)) for img in img_list]
+        img_batch = torch.stack(img_batch, dim=0)
+        return img_batch
+
+    def transform_coords(
+        self, coords: torch.Tensor, normalize=False, orig_hw=None
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with length 2 in the last dimension, holding (x, y).
+        The coordinates can be in absolute image or normalized coordinates.
+        If the coords are in absolute image coordinates, normalize should be set to
+        True and original image size (`orig_hw`, as (h, w)) is required.
+
+        Returns
+            The coordinates scaled to the model input resolution (normalized
+            coordinates multiplied by `self.resolution`), as expected by the SAM2
+            model. The input tensor is not modified.
+        """
+        if normalize:
+            assert orig_hw is not None
+            h, w = orig_hw
+            if coords.is_floating_point():
+                coords = coords.clone()
+            else:
+                # Integer pixel coordinates must become floating point first:
+                # writing x / w back into an integer tensor would truncate it.
+                coords = coords.to(torch.float32)
+            coords[..., 0] = coords[..., 0] / w
+            coords[..., 1] = coords[..., 1] / h
+
+        coords = coords * self.resolution  # unnormalize coords
+        return coords
+
+    def transform_boxes(
+        self, boxes: torch.Tensor, normalize=False, orig_hw=None
+    ) -> torch.Tensor:
+        """
+        Expects a tensor of shape Bx4. The coordinates can be in absolute image or normalized coordinates,
+        if the coords are in absolute image coordinates, normalize should be set to True and original image size is required.
+
+        Returns the boxes as a Bx2x2 tensor of corner points, scaled like
+        `transform_coords`.
+        """
+        boxes = self.transform_coords(boxes.reshape(-1, 2, 2), normalize, orig_hw)
+        return boxes
+
+    def postprocess_masks(self, masks: torch.Tensor, orig_hw) -> torch.Tensor:
+        """
+        Perform PostProcessing on output masks: optionally fill small holes and
+        remove small specks, then resize the masks to `orig_hw` with bilinear
+        interpolation. If the connected-components step fails, a warning is issued
+        and the unmodified masks are resized instead.
+        """
+        from sam2.utils.misc import get_connected_components
+
+        masks = masks.float()
+        input_masks = masks
+        mask_flat = masks.flatten(0, 1).unsqueeze(1)  # flatten as 1-channel image
+        try:
+            if self.max_hole_area > 0:
+                # Holes are those connected components in background with area <= self.max_hole_area
+                # (background regions are those with mask scores <= self.mask_threshold)
+                labels, areas = get_connected_components(
+                    mask_flat <= self.mask_threshold
+                )
+                is_hole = (labels > 0) & (areas <= self.max_hole_area)
+                is_hole = is_hole.reshape_as(masks)
+                # We fill holes with a positive mask score (threshold + 10.0) to change them to foreground.
+                masks = torch.where(is_hole, self.mask_threshold + 10.0, masks)
+
+            if self.max_sprinkle_area > 0:
+                # Sprinkles are small foreground components; they are found on the
+                # masks as they were before hole filling (`mask_flat`).
+                labels, areas = get_connected_components(
+                    mask_flat > self.mask_threshold
+                )
+                is_hole = (labels > 0) & (areas <= self.max_sprinkle_area)
+                is_hole = is_hole.reshape_as(masks)
+                # We give them a negative mask score (threshold - 10.0) to change them to background.
+                masks = torch.where(is_hole, self.mask_threshold - 10.0, masks)
+        except Exception as e:
+            # Skip the post-processing step if the CUDA kernel fails
+            warnings.warn(
+                f"{e}\n\nSkipping the post-processing step due to the error above. You can "
+                "still use SAM 2 and it's OK to ignore the error above, although some post-processing "
+                "functionality may be limited (which doesn't affect the results in most cases; see "
+                "https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            masks = input_masks
+
+        masks = F.interpolate(masks, orig_hw, mode="bilinear", align_corners=False)
+        return masks
diff --git a/sam2_repo/setup.py b/sam2_repo/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..78a634cddb19615c45601681ffbcd1f29af66f47
--- /dev/null
+++ b/sam2_repo/setup.py
@@ -0,0 +1,174 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+
+from setuptools import find_packages, setup
+
+# Package metadata
+NAME = "SAM-2"
+VERSION = "1.0"
+DESCRIPTION = "SAM 2: Segment Anything in Images and Videos"
+URL = "https://github.com/facebookresearch/sam2"
+AUTHOR = "Meta AI"
+AUTHOR_EMAIL = "segment-anything@meta.com"
+LICENSE = "Apache 2.0"
+
+# Read the contents of README file
+with open("README.md", "r", encoding="utf-8") as f:
+ LONG_DESCRIPTION = f.read()
+
+# Required dependencies
+REQUIRED_PACKAGES = [
+ "torch>=2.5.1",
+ "torchvision>=0.20.1",
+ "numpy>=1.24.4",
+ "tqdm>=4.66.1",
+ "hydra-core>=1.3.2",
+ "iopath>=0.1.10",
+ "pillow>=9.4.0",
+]
+
+EXTRA_PACKAGES = {
+ "notebooks": [
+ "matplotlib>=3.9.1",
+ "jupyter>=1.0.0",
+ "opencv-python>=4.7.0",
+ "eva-decord>=0.6.1",
+ ],
+ "interactive-demo": [
+ "Flask>=3.0.3",
+ "Flask-Cors>=5.0.0",
+ "av>=13.0.0",
+ "dataclasses-json>=0.6.7",
+ "eva-decord>=0.6.1",
+ "gunicorn>=23.0.0",
+ "imagesize>=1.4.1",
+ "pycocotools>=2.0.8",
+ "strawberry-graphql>=0.243.0",
+ ],
+ "dev": [
+ "black==24.2.0",
+ "usort==1.0.2",
+ "ufmt==2.0.0b2",
+ "fvcore>=0.1.5.post20221221",
+ "pandas>=2.2.2",
+ "scikit-image>=0.24.0",
+ "tensorboard>=2.17.0",
+ "pycocotools>=2.0.8",
+ "tensordict>=0.6.0",
+ "opencv-python>=4.7.0",
+ "submitit>=1.5.1",
+ ],
+}
+
+# By default, we also build the SAM 2 CUDA extension.
+# You may turn off CUDA build with `export SAM2_BUILD_CUDA=0`.
+BUILD_CUDA = os.getenv("SAM2_BUILD_CUDA", "1") == "1"
+# By default, we allow SAM 2 installation to proceed even with build errors.
+# You may force stopping on errors with `export SAM2_BUILD_ALLOW_ERRORS=0`.
+BUILD_ALLOW_ERRORS = os.getenv("SAM2_BUILD_ALLOW_ERRORS", "1") == "1"
+
+# Catch and skip errors during extension building and print a warning message
+# (note that this message only shows up under verbose build mode
+# "pip install -v -e ." or "python setup.py build_ext -v")
+CUDA_ERROR_MSG = (
+ "{}\n\n"
+ "Failed to build the SAM 2 CUDA extension due to the error above. "
+ "You can still use SAM 2 and it's OK to ignore the error above, although some "
+ "post-processing functionality may be limited (which doesn't affect the results in most cases; "
+ "(see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).\n"
+)
+
+
+def get_extensions():
+ if not BUILD_CUDA:
+ return []
+
+ try:
+ from torch.utils.cpp_extension import CUDAExtension
+
+ srcs = ["sam2/csrc/connected_components.cu"]
+ compile_args = {
+ "cxx": [],
+ "nvcc": [
+ "-DCUDA_HAS_FP16=1",
+ "-D__CUDA_NO_HALF_OPERATORS__",
+ "-D__CUDA_NO_HALF_CONVERSIONS__",
+ "-D__CUDA_NO_HALF2_OPERATORS__",
+ ],
+ }
+ ext_modules = [CUDAExtension("sam2._C", srcs, extra_compile_args=compile_args)]
+ except Exception as e:
+ if BUILD_ALLOW_ERRORS:
+ print(CUDA_ERROR_MSG.format(e))
+ ext_modules = []
+ else:
+ raise e
+
+ return ext_modules
+
+
+try:
+ from torch.utils.cpp_extension import BuildExtension
+
+ class BuildExtensionIgnoreErrors(BuildExtension):
+
+ def finalize_options(self):
+ try:
+ super().finalize_options()
+ except Exception as e:
+ print(CUDA_ERROR_MSG.format(e))
+ self.extensions = []
+
+ def build_extensions(self):
+ try:
+ super().build_extensions()
+ except Exception as e:
+ print(CUDA_ERROR_MSG.format(e))
+ self.extensions = []
+
+ def get_ext_filename(self, ext_name):
+ try:
+ return super().get_ext_filename(ext_name)
+ except Exception as e:
+ print(CUDA_ERROR_MSG.format(e))
+ self.extensions = []
+ return "_C.so"
+
+ cmdclass = {
+ "build_ext": (
+ BuildExtensionIgnoreErrors.with_options(no_python_abi_suffix=True)
+ if BUILD_ALLOW_ERRORS
+ else BuildExtension.with_options(no_python_abi_suffix=True)
+ )
+ }
+except Exception as e:
+ cmdclass = {}
+ if BUILD_ALLOW_ERRORS:
+ print(CUDA_ERROR_MSG.format(e))
+ else:
+ raise e
+
+
+# Setup configuration
+setup(
+ name=NAME,
+ version=VERSION,
+ description=DESCRIPTION,
+ long_description=LONG_DESCRIPTION,
+ long_description_content_type="text/markdown",
+ url=URL,
+ author=AUTHOR,
+ author_email=AUTHOR_EMAIL,
+ license=LICENSE,
+ packages=find_packages(exclude="notebooks"),
+ include_package_data=True,
+ install_requires=REQUIRED_PACKAGES,
+ extras_require=EXTRA_PACKAGES,
+ python_requires=">=3.10.0",
+ ext_modules=get_extensions(),
+ cmdclass=cmdclass,
+)
diff --git a/web_demo.html b/web_demo.html
new file mode 100644
index 0000000000000000000000000000000000000000..65580d6742fad5134f6f4a3c9123e3a4071837db
--- /dev/null
+++ b/web_demo.html
@@ -0,0 +1,596 @@
+
+
+
+
+
+ SAM2 Box Prompt Demo
+
+
+
+
+
🎯 SAM2 Box Prompt Demo
+
Выдели объект прямоугольником → Получи точную сегментацию
+
+
+
+
📸 Загрузи изображение
+
Кликни или перетащи фото сюда
+
+
+
+
+
+
+ 📝 Как пользоваться:
+ 1. Зажми левую кнопку мыши и нарисуй прямоугольник вокруг объекта
+ 2. Нажми "Сегментировать" чтобы получить результат
+ 3. Справа увидишь вырезанный объект с прозрачностью
+