JustQuiteMadMax committed
Commit 27ebbf9 · verified · 1 Parent(s): fba7f09

Upload app.py with huggingface_hub

Files changed (1): app.py (+15 -3)
app.py CHANGED

@@ -11,6 +11,7 @@ import spaces
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from analytics import AnalyticsLogger
 from kernels import get_kernel
 
 #vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
@@ -18,7 +19,11 @@ from kernels import get_kernel
 #torch._dynamo.config.disable = True
 
 MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"
+logger = AnalyticsLogger()
 
+def _begin_analytics_session():
+    # Called once per client on app load
+    _ = logger.start_session(MODEL_ID)
 
 def load_model():
     """Lazy-load model & tokenizer (for zeroGPU)."""
@@ -28,7 +33,7 @@ def load_model():
         MODEL_ID,
         dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
         device_map="auto",  # if device == "cuda" else None,
-        attn_implementation="flash_attention_2",# "kernels-community/vllm-flash-attn3", # #
+        attn_implementation="flash_attention_2",# "kernels-community/vllm-flash-attn3", # #
     )  # .cuda()
     print(f"Selected device:", device)
     return model, tokenizer, device
@@ -54,12 +59,13 @@ def append_example_message(x: gr.SelectData, history):
 
 @spaces.GPU
 def bot(
-    history: list[dict[str, str]],
+    history: list[dict[str, str]]
     # max_tokens,
     # temperature,
     # top_p,
 ):
-
+    user_message = history[-1]["content"]
+    print('User message:', user_message)
     # [{"role": "system", "content": system_message}] +
     # Build conversation
     max_tokens = 4096
@@ -103,6 +109,9 @@ def bot(
         history[-1]["content"] += new_text
         yield history
 
+    assistant_message = history[-1]["content"]
+    logger.log_interaction(user=user_message, answer=assistant_message)
+
 
 # --- drop-in UI compatible with older Gradio versions ---
 import os, tempfile, time
@@ -130,6 +139,9 @@ def _clear_chat():
     return "", []
 
 with gr.Blocks(theme=THEME, css=CSS, fill_height=True) as demo:
+    demo.load(fn=_begin_analytics_session, inputs=None, outputs=None)
+
+
     # Header (no gr.Box to avoid version issues)
     gr.HTML(
         """