Spaces:

mknolan
/

model-loading-diagnostic

Runtime error

App Files Files Community

mknolan committed on Mar 15

Commit

de08500

verified ·

1 Parent(s): 961e80c

Upload debug_model_loading.py with huggingface_hub

Browse files

Files changed (1) hide show

debug_model_loading.py +108 -0

debug_model_loading.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import torch
+import os
+import sys
+import traceback
+import requests
+import json
+import platform
+# Report banner, followed by section 1: interpreter, PyTorch and host details.
+banner_rule = "=" * 50
+print(banner_rule)
+print("DETAILED MODEL LOADING DIAGNOSTIC")
+print(banner_rule)
+print("\n1. SYSTEM INFORMATION:")
+# Each entry is (label, value); printed as "label: value" in this order.
+system_facts = (
+    ("Python version", sys.version),
+    ("PyTorch version", torch.__version__),
+    ("Platform", platform.platform()),
+    ("Processor", platform.processor()),
+)
+for fact_label, fact_value in system_facts:
+    print(f"{fact_label}: {fact_value}")
+# Section 2: print the GPU-visibility and Hugging Face cache variables,
+# showing "Not set" for any that are absent from the environment.
+print("\n2. ENVIRONMENT VARIABLES:")
+relevant_vars = ["CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "TRANSFORMERS_CACHE", "HF_HOME"]
+for var in relevant_vars:
+    current_value = os.getenv(var, "Not set")
+    print(f"{var}: {current_value}")
+# Section 3: report CUDA availability; when present, list the devices, time a
+# matrix multiplication on the GPU and print memory statistics.
+print("\n3. GPU DETECTION:")
+print(f"CUDA available: {torch.cuda.is_available()}")
+if not torch.cuda.is_available():
+    print("CUDA is not available. This is a critical issue for model loading.")
+else:
+    try:
+        print(f"CUDA version: {torch.version.cuda}")
+        print(f"GPU count: {torch.cuda.device_count()}")
+        for gpu_index in range(torch.cuda.device_count()):
+            gpu_name = torch.cuda.get_device_name(gpu_index)
+            print(f"GPU {gpu_index}: {gpu_name}")
+        # Time a single 1000x1000 matmul with a pair of CUDA timing events.
+        print("\nTesting GPU with tensor operations...")
+        operand = torch.rand(1000, 1000, device="cuda")
+        timer_start = torch.cuda.Event(enable_timing=True)
+        timer_stop = torch.cuda.Event(enable_timing=True)
+        timer_start.record()
+        # The product stays bound so it is still allocated when memory is reported below.
+        product = operand @ operand
+        timer_stop.record()
+        # Wait for the device to finish before reading the elapsed time between the events.
+        torch.cuda.synchronize()
+        elapsed_ms = timer_start.elapsed_time(timer_stop)
+        print(f"GPU tensor operation completed in {elapsed_ms:.2f} ms")
+        # Memory figures are bytes divided by 1e9; total memory is read for device 0 only.
+        bytes_per_gb = 1e9
+        total_bytes = torch.cuda.get_device_properties(0).total_memory
+        print(f"\nTotal GPU memory: {total_bytes / bytes_per_gb:.2f} GB")
+        print(f"Allocated GPU memory: {torch.cuda.memory_allocated() / bytes_per_gb:.2f} GB")
+        print(f"Reserved GPU memory: {torch.cuda.memory_reserved() / bytes_per_gb:.2f} GB")
+    except Exception as gpu_error:
+        # Best-effort diagnostic: report the failure and carry on with the next section.
+        print(f"Error testing GPU: {str(gpu_error)}")
+        traceback.print_exc()
+# Section 4: check that the Hugging Face Hub API answers and that the model
+# repository is visible, by requesting the Hub's model-info endpoint.
+print("\n4. HUGGINGFACE HUB CONNECTIVITY:")
+try:
+    print("Testing connection to HuggingFace Hub...")
+    # requests applies no timeout by default, so an unresponsive host would
+    # stall the whole diagnostic. With an explicit timeout the stall becomes an
+    # exception that the handler below reports.
+    response = requests.get(
+        "https://huggingface.co/api/models/OpenGVLab/InternViT-6B-224px",
+        timeout=30,
+    )
+    if response.status_code == 200:
+        print("Successfully connected to HuggingFace Hub")
+        model_info = response.json()
+        print("Model exists: OpenGVLab/InternViT-6B-224px")
+        # The download count is printed only when the response includes it.
+        if 'downloads' in model_info:
+            print(f"Downloads: {model_info['downloads']}")
+    else:
+        # Show the status and raw body so the cause of the failure is visible.
+        print(f"Failed to connect to HuggingFace Hub: Status code {response.status_code}")
+        print(response.text)
+except Exception as e:
+    # Best-effort diagnostic: report connection, timeout or JSON errors and continue.
+    print(f"Error connecting to HuggingFace Hub: {str(e)}")
+    traceback.print_exc()
+# Section 5: import transformers, then load the processor and the model,
+# printing a confirmation after each step so a failure can be pinned to one stage.
+# NOTE(review): neither from_pretrained call passes trust_remote_code; if this
+# repository relies on custom modeling code the load presumably fails here.
+# Confirm against the model card before changing it.
+print("\n5. ATTEMPTING MODEL LOADING:")
+try:
+    print("Importing transformers...")
+    from transformers import AutoModel, AutoProcessor
+    print("✓ Transformers imported successfully")
+
+    checkpoint_id = "OpenGVLab/InternViT-6B-224px"
+
+    print("\nLoading AutoProcessor...")
+    processor = AutoProcessor.from_pretrained(checkpoint_id)
+    print("✓ AutoProcessor loaded successfully")
+
+    print("\nLoading AutoModel...")
+    model = AutoModel.from_pretrained(checkpoint_id)
+    print("✓ AutoModel loaded successfully")
+
+    # The model is moved to the GPU only when CUDA is available.
+    if torch.cuda.is_available():
+        print("\nMoving model to CUDA...")
+        model = model.to("cuda")
+        print("✓ Model moved to CUDA successfully")
+
+    print("\nModel loading SUCCESSFUL")
+    parameter_total = sum(weights.numel() for weights in model.parameters())
+    print(f"Model parameters: {parameter_total:,}")
+except Exception as load_error:
+    # Any failure in the steps above is reported with its full traceback.
+    print(f"\n❌ ERROR LOADING MODEL: {str(load_error)}")
+    print("\nDetailed traceback:")
+    traceback.print_exc()
+# Closing banner marking the end of the report.
+closing_rule = "=" * 50
+print(f"\n{closing_rule}")
+print("DIAGNOSTIC COMPLETE")
+print(closing_rule)