Alikestocode committed on
Commit
f43bdac
·
1 Parent(s): 83a232d

Fix syntax error: correct indentation in BitsAndBytes fallback block

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -252,17 +252,18 @@ def load_pipeline(model_name: str):
252
  if FLASH_ATTN_AVAILABLE:
253
  model_kwargs["attn_implementation"] = "flash_attention_2"
254
 
255
- pipe = pipeline(
256
- task="text-generation",
257
- model=repo,
258
- tokenizer=tokenizer,
259
- trust_remote_code=True,
260
- device_map="auto",
261
  model_kwargs=model_kwargs,
262
- use_cache=True,
263
  token=HF_TOKEN,
264
  torch_dtype=torch.bfloat16,
265
  )
 
266
  pipe.model.eval()
267
 
268
  # Apply torch.compile for kernel fusion (~10-20% speedup after first call)
@@ -272,11 +273,11 @@ def load_pipeline(model_name: str):
272
  except Exception:
273
  pass
274
 
275
- PIPELINES[model_name] = pipe
276
  _schedule_background_warm(model_name)
277
- return pipe
278
- except Exception as exc:
279
- print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
280
 
281
  # Fallback to bfloat16/fp16/fp32
282
  for dtype in (torch.bfloat16, torch.float16, torch.float32):
 
252
  if FLASH_ATTN_AVAILABLE:
253
  model_kwargs["attn_implementation"] = "flash_attention_2"
254
 
255
+ pipe = pipeline(
256
+ task="text-generation",
257
+ model=repo,
258
+ tokenizer=tokenizer,
259
+ trust_remote_code=True,
260
+ device_map="auto",
261
  model_kwargs=model_kwargs,
262
+ use_cache=True,
263
  token=HF_TOKEN,
264
  torch_dtype=torch.bfloat16,
265
  )
266
+
267
  pipe.model.eval()
268
 
269
  # Apply torch.compile for kernel fusion (~10-20% speedup after first call)
 
273
  except Exception:
274
  pass
275
 
276
+ PIPELINES[model_name] = pipe
277
  _schedule_background_warm(model_name)
278
+ return pipe
279
+ except Exception as exc:
280
+ print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
281
 
282
  # Fallback to bfloat16/fp16/fp32
283
  for dtype in (torch.bfloat16, torch.float16, torch.float32):