Fix cached error
app.py CHANGED
@@ -50,10 +50,12 @@ happy_file_path = "assets/happy.jpg"
 def generate_activations(image):
     prompt = "<image>"
     inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
-    global topk_indices, cached_tensor
+    global topk_indices
+
+    cached_list = []
 
     def hook(module: torch.nn.Module, _, outputs):
-        global topk_indices, cached_tensor
+        global topk_indices
         # Maybe unpack tuple outputs
         if isinstance(outputs, tuple):
             unpack_outputs = list(outputs)
@@ -72,7 +74,7 @@ def generate_activations(image):
     result = torch.zeros_like(latents)
     # results (bs, seq, num_latents)
     result.scatter_(-1, topk.indices, topk.values)
-    cached_tensor = result.detach().cpu()
+    cached_list.append(result.detach().cpu())
     topk_indices = (
         latents.squeeze(0).mean(dim=0).topk(k=100).indices.detach().cpu()
     )
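For reference, the scatter_ call in this hunk densifies the SAE's top-k activations: start from a zero tensor shaped like the latents, then write each kept value back at its latent index. A minimal self-contained sketch with toy shapes (the real app has 131072 latents):

import torch

# Toy stand-in for SAE latents: (batch, seq, num_latents).
latents = torch.randn(1, 4, 16)
topk = latents.topk(k=3, dim=-1)

# Zeros everywhere except the top-k positions, which keep their values.
result = torch.zeros_like(latents)
result.scatter_(-1, topk.indices, topk.values)

# Every value written by scatter_ can be read back at its index.
assert torch.equal(result.gather(-1, topk.indices), topk.values)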
@@ -91,10 +93,9 @@ def generate_activations(image):
     handle.remove()
 
     torch.cuda.empty_cache()
-    return topk_indices
+    return topk_indices, cached_list[0]
 
-def visualize_activations(image, feature_num):
-    global cached_tensor
+def visualize_activations(image, feature_num, cached_tensor):
     base_img_tokens = 576
     patch_size = 24
     # Using Cached tensor
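The reworked generate_activations collects the hook's output in a function-local cached_list rather than a module-level global, removes the hook handle when done, and returns the cache alongside topk_indices. A minimal sketch of that capture pattern, assuming a toy two-layer model (capture_once and the hooked layer are illustrative, not the app's real LLaVA model):

import torch

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())

def capture_once(x):
    cached_list = []  # local, so nothing leaks across calls or sessions

    def hook(module, _inputs, outputs):
        # Maybe unpack tuple outputs, as in app.py.
        out = outputs[0] if isinstance(outputs, tuple) else outputs
        cached_list.append(out.detach().cpu())

    handle = model[0].register_forward_hook(hook)
    try:
        model(x)
    finally:
        handle.remove()  # always detach the hook, mirroring handle.remove() above
    return cached_list[0]

print(capture_once(torch.randn(2, 8)).shape)  # torch.Size([2, 8])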
@@ -191,6 +192,7 @@ def generate_with_clamp(feature_idx, feature_strength, text, image, chat_history
 
 
 with gr.Blocks() as demo:
+    cached_tensor = gr.State()
     gr.Markdown(
         """
 # Large Multi-modal Models Can Interpret Features in Large Multi-modal Models
@@ -210,12 +212,12 @@ with gr.Blocks() as demo:
     with gr.Row():
         clear_btn = gr.ClearButton([image, topk_features], value="Clear")
         submit_btn = gr.Button("Submit", variant="primary")
-        submit_btn.click(generate_activations, inputs=[image], outputs=[topk_features])
+        submit_btn.click(generate_activations, inputs=[image], outputs=[topk_features, cached_tensor])
     with gr.Column():
         output = gr.Image(label="Activation Visualization")
         feature_num = gr.Slider(1, 131072, 1, 1, label="Feature Number", interactive=True)
         visualize_btn = gr.Button("Visualize", variant="primary")
-        visualize_btn.click(visualize_activations, inputs=[image, feature_num], outputs=[output])
+        visualize_btn.click(visualize_activations, inputs=[image, feature_num, cached_tensor], outputs=[output])
 
     dummy_text = gr.Textbox(visible=False, label="Explanation")
     gr.Examples(
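This wiring is the crux of the fix. A module-level cached_tensor is shared by every visitor to a Space (and, on ZeroGPU, a global mutated inside a GPU worker may not survive back in the main process), so one session could read another's cache or a stale None. gr.State holds one value per browser session and is threaded through events as an ordinary input/output: a callback fills it by returning an extra value, and later callbacks receive it as an extra argument. A minimal sketch of the same pattern, with hypothetical compute/reuse callbacks:

import gradio as gr

def compute(x):
    # The second return value lands in the gr.State component.
    return f"processed {x}", x

def reuse(cached):
    return f"cached value: {cached}" if cached is not None else "nothing cached yet"

with gr.Blocks() as demo:
    cached = gr.State()  # per-session, unlike a module-level global
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    gr.Button("Compute").click(compute, inputs=[inp], outputs=[out, cached])
    gr.Button("Reuse").click(reuse, inputs=[cached], outputs=[out])

if __name__ == "__main__":
    demo.launch()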
@@ -261,7 +263,6 @@ with gr.Blocks() as demo:
 
 
 if __name__ == "__main__":
-    cached_tensor = None
     tokenizer = AutoTokenizer.from_pretrained("llava-hf/llama3-llava-next-8b-hf")
     sae = load_single_sae("lmms-lab/llama3-llava-next-8b-hf-sae-131k", "model.layers.24")
     model, processor = maybe_load_llava_model(