Spaces:
Sleeping
Sleeping
Evgueni Poloukarov
Claude
committed on
Commit
·
73e9c10
1
Parent(s):
d2f9ff2
fix: revert PyTorch + redirect cache to /tmp for A100
Browse files
- Revert torch>=2.4.0 back to torch>=2.0.0 (2.4+ broke L40)
- Add cache redirection to /tmp to prevent 50GB storage limit exceeded
(most common cause of silent A100 failures per HF forums)
Environment variables set before imports:
- TORCH_HOME=/tmp/torch_cache
- HF_HOME=/tmp/hf_home
- TRANSFORMERS_CACHE=/tmp/transformers_cache
- HUB_DIR=/tmp/torch_hub
Co-Authored-By: Claude <[email protected]>
- app.py +8 -0
- requirements.txt +2 -2
app.py
CHANGED
|
@@ -12,6 +12,14 @@ FORCE REBUILD: Optimized for 96GB VRAM with memory profiling diagnostics
|
|
| 12 |
import os
|
| 13 |
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
import sys
|
| 16 |
print(f"[STARTUP] Python version: {sys.version}", flush=True)
|
| 17 |
print(f"[STARTUP] Python path: {sys.path[:3]}", flush=True)
|
|
|
|
| 12 |
import os
|
| 13 |
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
| 14 |
|
| 15 |
+
# Redirect ALL caches to /tmp to prevent 50GB storage limit exceeded
|
| 16 |
+
# This is the most common cause of "no logs" silent failures on A100 Spaces
|
| 17 |
+
# See: https://discuss.huggingface.co/t/how-to-fix-workload-evicted-storage-limit-exceeded-50g-error-in-huggingface-spaces/169258
|
| 18 |
+
os.environ['TORCH_HOME'] = '/tmp/torch_cache'
|
| 19 |
+
os.environ['HF_HOME'] = '/tmp/hf_home'
|
| 20 |
+
os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
|
| 21 |
+
os.environ['HUB_DIR'] = '/tmp/torch_hub'
|
| 22 |
+
|
| 23 |
import sys
|
| 24 |
print(f"[STARTUP] Python version: {sys.version}", flush=True)
|
| 25 |
print(f"[STARTUP] Python path: {sys.path[:3]}", flush=True)
|
requirements.txt
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
gradio==4.44.0
|
| 3 |
|
| 4 |
# Core ML/Data
|
| 5 |
-
torch>=2.4.0
|
| 6 |
transformers>=4.35.0
|
| 7 |
chronos-forecasting>=1.2.0
|
| 8 |
datasets>=2.14.0
|
|
@@ -19,4 +19,4 @@ altair>=5.0.0
|
|
| 19 |
python-dotenv
|
| 20 |
tqdm
|
| 21 |
|
| 22 |
-
# Cache bust: v1.
|
|
|
|
| 2 |
gradio==4.44.0
|
| 3 |
|
| 4 |
# Core ML/Data
|
| 5 |
+
torch>=2.0.0
|
| 6 |
transformers>=4.35.0
|
| 7 |
chronos-forecasting>=1.2.0
|
| 8 |
datasets>=2.14.0
|
|
|
|
| 19 |
python-dotenv
|
| 20 |
tqdm
|
| 21 |
|
| 22 |
+
# Cache bust: v1.7.0 - Revert PyTorch + cache to /tmp for A100
|