Alikestocode committed on
Commit
f43bdac
·
1 Parent(s): 83a232d

Fix syntax error: correct indentation in BitsAndBytes fallback block

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -252,17 +252,18 @@ def load_pipeline(model_name: str):
252
  if FLASH_ATTN_AVAILABLE:
253
  model_kwargs["attn_implementation"] = "flash_attention_2"
254
 
255
- pipe = pipeline(
256
- task="text-generation",
257
- model=repo,
258
- tokenizer=tokenizer,
259
- trust_remote_code=True,
260
- device_map="auto",
261
  model_kwargs=model_kwargs,
262
- use_cache=True,
263
  token=HF_TOKEN,
264
  torch_dtype=torch.bfloat16,
265
  )
 
266
  pipe.model.eval()
267
 
268
  # Apply torch.compile for kernel fusion (~10-20% speedup after first call)
@@ -272,11 +273,11 @@ def load_pipeline(model_name: str):
272
  except Exception:
273
  pass
274
 
275
- PIPELINES[model_name] = pipe
276
  _schedule_background_warm(model_name)
277
- return pipe
278
- except Exception as exc:
279
- print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
280
 
281
  # Fallback to bfloat16/fp16/fp32
282
  for dtype in (torch.bfloat16, torch.float16, torch.float32):
 
252
  if FLASH_ATTN_AVAILABLE:
253
  model_kwargs["attn_implementation"] = "flash_attention_2"
254
 
255
+ pipe = pipeline(
256
+ task="text-generation",
257
+ model=repo,
258
+ tokenizer=tokenizer,
259
+ trust_remote_code=True,
260
+ device_map="auto",
261
  model_kwargs=model_kwargs,
262
+ use_cache=True,
263
  token=HF_TOKEN,
264
  torch_dtype=torch.bfloat16,
265
  )
266
+
267
  pipe.model.eval()
268
 
269
  # Apply torch.compile for kernel fusion (~10-20% speedup after first call)
 
273
  except Exception:
274
  pass
275
 
276
+ PIPELINES[model_name] = pipe
277
  _schedule_background_warm(model_name)
278
+ return pipe
279
+ except Exception as exc:
280
+ print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
281
 
282
  # Fallback to bfloat16/fp16/fp32
283
  for dtype in (torch.bfloat16, torch.float16, torch.float32):