---
library_name: transformers
pipeline_tag: mask-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- facebook/sam3
---

This tiny model is intended for debugging. It is randomly initialized, using a configuration adapted from [facebook/sam3](https://huggingface.co/facebook/sam3).

### Example usage:

```python
import requests
import torch
from PIL import Image
from transformers import Sam3Model, Sam3Processor

model_id = "yujiepan/sam3-tiny-random"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Sam3Model.from_pretrained(model_id).to(device)
processor = Sam3Processor.from_pretrained(model_id)

kitchen_url = "http://images.cocodataset.org/val2017/000000136466.jpg"
kitchen_image = Image.open(requests.get(kitchen_url, stream=True).raw).convert("RGB")

# Segment "handle" but exclude the oven handle using a negative box
text = "handle"
# Negative box covering the oven handle area (xyxy): [40, 183, 318, 204]
oven_handle_box = [40, 183, 318, 204]
input_boxes = [[oven_handle_box]]

inputs = processor(
    images=kitchen_image,
    text=text,
    input_boxes=input_boxes,
    input_boxes_labels=[[0]],  # 0 = negative (exclude this region)
    return_tensors="pt",
).to(device)

with torch.no_grad():
    outputs = model(**inputs)

# Post-process results
results = processor.post_process_instance_segmentation(
    outputs,
    threshold=0.5,
    mask_threshold=0.5,
    target_sizes=inputs.get("original_sizes").tolist(),
)[0]
print(results)  # segments pot handles but excludes the oven handle
```
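The easiest way to see what `post_process_instance_segmentation` returns is simply to print it as above. For programmatic inspection, here is a minimal sketch, assuming (this card does not guarantee it) that `results` is a dict exposing `masks`, `boxes`, and `scores` entries:

```python
# Minimal inspection sketch for the post-processed output.
# Assumption: `results` (from the snippet above) is a dict with "masks",
# "boxes" and "scores"; fall back to listing the keys if the layout differs.
if isinstance(results, dict):
    print("available keys:", list(results.keys()))
    for key in ("masks", "boxes", "scores"):
        value = results.get(key)
        if value is not None and hasattr(value, "shape"):
            print(f"{key}: shape={tuple(value.shape)}")
else:
    print(type(results))
```

Since this checkpoint is randomly initialized, the detections themselves are meaningless; the snippet is only useful for checking shapes and dtypes while debugging.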
### Code to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    Sam3Processor,
    set_seed,
)
from transformers.models.sam3.modeling_sam3 import Sam3Config, Sam3Model

source_model_id = "facebook/sam3"
save_folder = "/tmp/yujiepan/sam3-tiny-random"

processor = Sam3Processor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

HIDDEN_SIZE = 16
INTERMEDIATE_SIZE = 32
NUM_ATTENTION_HEADS = 2

# Shrink the detector sub-configs
config_json['detector_config']['detr_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['detr_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['geometry_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['mask_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['text_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'projection_dim': HIDDEN_SIZE,
    'num_hidden_layers': 2,
})
config_json['detector_config']['vision_config']['backbone_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'fpn_hidden_size': HIDDEN_SIZE,
    'global_attn_indexes': [1, 3, 5, 7],
    'num_hidden_layers': 8,
})
config_json['detector_config']['vision_config'].update({
    'fpn_hidden_size': HIDDEN_SIZE,
})

# Shrink the tracker sub-configs
config_json['tracker_config']['mask_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'iou_head_hidden_dim': HIDDEN_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['tracker_config'].update({
    'mask_downsampler_embed_dim': HIDDEN_SIZE,
    'memory_attention_feed_forward_hidden_size': HIDDEN_SIZE,
    'memory_attention_hidden_size': HIDDEN_SIZE,
    'memory_encoder_hidden_size': HIDDEN_SIZE,
    'memory_fuser_embed_dim': HIDDEN_SIZE,
    'memory_fuser_intermediate_dim': INTERMEDIATE_SIZE,
})
config_json['tracker_config']['prompt_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['tracker_config']['vision_config']['backbone_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'global_attn_indexes': [1, 3, 5, 7],
    'num_hidden_layers': 8,
})
config_json['tracker_config']['vision_config'].update({
    'fpn_hidden_size': HIDDEN_SIZE,
})

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = Sam3Config.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)

torch.set_default_dtype(torch.float32)
model = Sam3Model(config)
set_seed(42)
model = model.cpu()
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.1)
        print(name, p.shape)
model.save_pretrained(save_folder)
# print(list(model.state_dict().keys()))

# There is some bug in model.save_pretrained; re-save the model weights here.
import safetensors.torch
safetensors.torch.save_file(
    tensors=model.state_dict(),
    filename=f"{save_folder}/model.safetensors"
)
```
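As a quick sanity check (not part of the creation script above), the freshly saved tiny checkpoint can be reloaded and run on a dummy prompt. The blank image and text-only prompt below are arbitrary illustrative choices:

```python
# Sanity-check sketch: reload the tiny checkpoint, count parameters,
# and run a dummy forward pass. Not part of the original creation script.
import torch
from PIL import Image
from transformers import Sam3Processor
from transformers.models.sam3.modeling_sam3 import Sam3Model

save_folder = "/tmp/yujiepan/sam3-tiny-random"  # same path as above
reloaded = Sam3Model.from_pretrained(save_folder)
reloaded_processor = Sam3Processor.from_pretrained(save_folder)

num_params = sum(p.numel() for p in reloaded.parameters())
print(f"parameters: {num_params / 1e6:.2f}M")  # should be tiny

dummy_image = Image.new("RGB", (1024, 1024))  # arbitrary blank image
inputs = reloaded_processor(images=dummy_image, text="object", return_tensors="pt")
with torch.no_grad():
    outputs = reloaded(**inputs)
print(type(outputs).__name__)
```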
### Printing the model:

```text
Sam3Model(
  (vision_encoder): Sam3VisionModel(
    (backbone): Sam3ViTModel(
      (embeddings): Sam3ViTEmbeddings(
        (patch_embeddings): Sam3ViTPatchEmbeddings(
          (projection): Conv2d(3, 16, kernel_size=(14, 14), stride=(14, 14), bias=False)
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (layer_norm): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
      (layers): ModuleList(
        (0-7): 8 x Sam3ViTLayer(
          (layer_norm1): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
          (rotary_emb): Sam3ViTRotaryEmbedding()
          (attention): Sam3ViTRoPEAttention(
            (q_proj): Linear(in_features=16, out_features=16, bias=True)
            (k_proj): Linear(in_features=16, out_features=16, bias=True)
            (v_proj): Linear(in_features=16, out_features=16, bias=True)
            (o_proj): Linear(in_features=16, out_features=16, bias=True)
          )
          (layer_norm2): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
          (mlp): Sam3MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=16, out_features=32, bias=True)
            (fc2): Linear(in_features=32, out_features=16, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (neck): Sam3VisionNeck(
      (position_encoding): Sam3SinePositionEmbedding()
      (fpn_layers): ModuleList(
        (0): Sam3FPNLayer(
          (scale_layers): ModuleList(
            (0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2))
            (1): GELU(approximate='none')
            (2): ConvTranspose2d(8, 4, kernel_size=(2, 2), stride=(2, 2))
          )
          (proj1): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (1): Sam3FPNLayer(
          (scale_layers): ModuleList(
            (0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2))
          )
          (proj1): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (2): Sam3FPNLayer(
          (scale_layers): ModuleList()
          (proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (3): Sam3FPNLayer(
          (scale_layers): ModuleList(
            (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
          )
          (proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
          (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
      )
    )
  )
  (text_encoder): CLIPTextModelWithProjection(
    (text_model): CLIPTextTransformer(
      (embeddings): CLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 16)
        (position_embedding): Embedding(32, 16)
      )
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=16, out_features=16, bias=True)
              (v_proj): Linear(in_features=16, out_features=16, bias=True)
              (q_proj): Linear(in_features=16, out_features=16, bias=True)
              (out_proj): Linear(in_features=16, out_features=16, bias=True)
            )
            (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): GELUActivation()
              (fc1): Linear(in_features=16, out_features=32, bias=True)
              (fc2): Linear(in_features=32, out_features=16, bias=True)
            )
            (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
      (final_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    )
    (text_projection): Linear(in_features=16, out_features=16, bias=False)
  )
  (text_projection): Linear(in_features=16, out_features=16, bias=True)
  (geometry_encoder): Sam3GeometryEncoder(
    (position_encoding): Sam3SinePositionEmbedding()
    (label_embed): Embedding(2, 16)
    (cls_embed): Embedding(1, 16)
    (boxes_direct_project): Linear(in_features=4, out_features=16, bias=True)
    (boxes_pool_project): Conv2d(16, 16, kernel_size=(7, 7), stride=(1, 1))
    (boxes_pos_enc_project): Linear(in_features=18, out_features=16, bias=True)
    (vision_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (final_proj): Linear(in_features=16, out_features=16, bias=True)
    (prompt_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-2): 3 x Sam3GeometryEncoderLayer(
        (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (self_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): Sam3MLP(
          (activation_fn): ReLU()
          (fc1): Linear(in_features=16, out_features=32, bias=True)
          (fc2): Linear(in_features=32, out_features=16, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
    )
    (output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  )
  (detr_encoder): Sam3DetrEncoder(
    (layers): ModuleList(
      (0-5): 6 x Sam3DetrEncoderLayer(
        (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (self_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): Sam3MLP(
          (activation_fn): ReLU()
          (fc1): Linear(in_features=16, out_features=32, bias=True)
          (fc2): Linear(in_features=32, out_features=16, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (detr_decoder): Sam3DetrDecoder(
    (layers): ModuleList(
      (0-5): 6 x Sam3DetrDecoderLayer(
        (self_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (self_attn_dropout): Dropout(p=0.1, inplace=False)
        (self_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (text_cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (text_cross_attn_dropout): Dropout(p=0.1, inplace=False)
        (text_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (vision_cross_attn): Sam3Attention(
          (q_proj): Linear(in_features=16, out_features=16, bias=True)
          (k_proj): Linear(in_features=16, out_features=16, bias=True)
          (v_proj): Linear(in_features=16, out_features=16, bias=True)
          (o_proj): Linear(in_features=16, out_features=16, bias=True)
        )
        (vision_cross_attn_dropout): Dropout(p=0.1, inplace=False)
        (vision_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): Sam3MLP(
          (activation_fn): ReLU()
          (fc1): Linear(in_features=16, out_features=32, bias=True)
          (fc2): Linear(in_features=32, out_features=16, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (box_head): Sam3DecoderMLP(
      (layer1): Linear(in_features=16, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=16, bias=True)
      (layer3): Linear(in_features=16, out_features=4, bias=True)
    )
    (query_embed): Embedding(200, 16)
    (reference_points): Embedding(200, 4)
    (presence_token): Embedding(1, 16)
    (presence_head): Sam3DecoderMLP(
      (layer1): Linear(in_features=16, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=16, bias=True)
      (layer3): Linear(in_features=16, out_features=1, bias=True)
    )
    (presence_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (ref_point_head): Sam3DecoderMLP(
      (layer1): Linear(in_features=32, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=16, bias=True)
    )
    (box_rpb_embed_x): Sam3DecoderMLP(
      (layer1): Linear(in_features=2, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=2, bias=True)
    )
    (box_rpb_embed_y): Sam3DecoderMLP(
      (layer1): Linear(in_features=2, out_features=16, bias=True)
      (layer2): Linear(in_features=16, out_features=2, bias=True)
    )
    (position_encoding): Sam3SinePositionEmbedding()
  )
  (mask_decoder): Sam3MaskDecoder(
    (pixel_decoder): Sam3PixelDecoder(
      (conv_layers): ModuleList(
        (0-2): 3 x Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (norms): ModuleList(
        (0-2): 3 x GroupNorm(8, 16, eps=1e-05, affine=True)
      )
    )
    (mask_embedder): Sam3MaskEmbedder(
      (layers): ModuleList(
        (0-2): 3 x Linear(in_features=16, out_features=16, bias=True)
      )
      (activation): ReLU()
    )
    (instance_projection): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
    (semantic_projection): Conv2d(16, 1, kernel_size=(1, 1), stride=(1, 1))
    (prompt_cross_attn): Sam3Attention(
      (q_proj): Linear(in_features=16, out_features=16, bias=True)
      (k_proj): Linear(in_features=16, out_features=16, bias=True)
      (v_proj): Linear(in_features=16, out_features=16, bias=True)
      (o_proj): Linear(in_features=16, out_features=16, bias=True)
    )
    (prompt_cross_attn_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (prompt_cross_attn_dropout): Dropout(p=0.0, inplace=False)
  )
  (dot_product_scoring): Sam3DotProductScoring(
    (text_mlp): Sam3DecoderMLP(
      (layer1): Linear(in_features=16, out_features=32, bias=True)
      (layer2): Linear(in_features=32, out_features=16, bias=True)
    )
    (text_mlp_dropout): Dropout(p=0.1, inplace=False)
    (text_mlp_out_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (text_proj): Linear(in_features=16, out_features=16, bias=True)
    (query_proj): Linear(in_features=16, out_features=16, bias=True)
  )
)
```
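For reference, the tree above is simply the default `torch.nn.Module` repr of the model; it can be reproduced with something like:

```python
from transformers import Sam3Model

# Printing the module yields the architecture tree shown above.
print(Sam3Model.from_pretrained("yujiepan/sam3-tiny-random"))
```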