robinhad committed (verified)
Commit 5725e7b · Parent(s): 911b2ae

Update app.py

Files changed (1)
  app.py: +88 -48
app.py CHANGED
@@ -9,55 +9,71 @@ import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
-MODEL_ID = "le-llm/lapa-v0.1-reasoning-only"
+torch._dynamo.config.disable = True
 
+MODEL_ID = "le-llm/lapa-v0.1-reasoning-only"
 
 
 def load_model():
     """Lazy-load model & tokenizer (for zeroGPU)."""
-    device = "cuda"# if torch.cuda.is_available() else "cpu"
+    device = "cuda"  # if torch.cuda.is_available() else "cpu"
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        #torch_dtype=torch.bfloat16, #if device == "cuda" else torch.float32,
-        device_map="auto"# if device == "cuda" else None,
-    ) #.cuda()
+        torch_dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
+        device_map="auto",  # if device == "cuda" else None,
+    )  # .cuda()
     print(f"Selected device:", device)
     return model, tokenizer, device
 
+
 # Load model/tokenizer each request → allows zeroGPU to cold start & then release
 model, tokenizer, device = load_model()
 
+
+def user(user_message, history: list):
+    return "", history + [{"role": "user", "content": user_message}]
+
+
+def append_example_message(x: gr.SelectData, history):
+    print(x)
+    print(x.value)
+    print(x.value["text"])
+    if x.value["text"] is not None:
+        history.append({"role": "user", "content": x.value["text"]})
+
+    return history
+
+
 @spaces.GPU
-def respond(
-    message,
+def bot(
     history: list[dict[str, str]],
-    max_tokens,
-    temperature,
-    top_p,
+    # max_tokens,
+    # temperature,
+    # top_p,
 ):
-
+
     # [{"role": "system", "content": system_message}] +
     # Build conversation
-    messages = history + [
-        {"role": "user", "content": message}
-    ]
+    max_tokens = 4096
+    temperature = 0.7
+    top_p = 0.95
 
-    input_text = tokenizer.apply_chat_template(
-        messages,
+    input_text: str = tokenizer.apply_chat_template(
+        history,
         tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=True,
+        # enable_thinking=True,
     )
-    input_text += "<think>"  # TODO: remove short term fix
-    print(input_text)
-    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)  # .to(device)
 
+    input_text = input_text.replace(tokenizer.bos_token, "", 1)
+    print(input_text)
+    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)  # .to(device)
+    print("Decoded input:", tokenizer.decode(inputs["input_ids"][0]))
+    print([{id: tokenizer.decode([id])} for id in inputs["input_ids"][0]])
     # Streamer setup
     streamer = TextIteratorStreamer(
-        tokenizer,
-        skip_special_tokens=True,
-        skip_prompt=True
+        tokenizer, skip_prompt=True  # skip_special_tokens=True  # ,
     )
 
     # Run model.generate in background thread
@@ -66,25 +82,64 @@ def respond(
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
+        top_k=64,
         do_sample=True,
-        eos_token_id=tokenizer.eos_token_id,
+        # eos_token_id=tokenizer.eos_token_id,
         streamer=streamer,
     )
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
+    history.append({"role": "assistant", "content": ""})
     # Yield tokens as they come in
-    partial_output = ""
     for new_text in streamer:
-        partial_output += new_text
-        yield partial_output
+        history[-1]["content"] += new_text
+        yield history
+
 
+import gradio as gr
+import random
+import time
+
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot(
+        type="messages",
+        allow_tags=["think"],
+        examples=[
+            {"text": i}
+            for i in [
+                "хто тримає цей район?",
+                "Напиши історію про Івасика-Телесика",
+                "Яка найвища гора в Україні?",
+                "Як звали батька Тараса Григоровича Шевченка?",
+                # "Як можна заробити нелегально швидко гроші?"],
+                "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест",
+                "Дай відповідь на питання\nЧому у качки жовті ноги?",
+            ]
+        ],
+    )
+    msg = gr.Textbox(label="Message", autofocus=True)
+    send_btn = gr.Button("Send")
+    # clear = gr.Button("Clear")
 
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
+    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
+        bot, chatbot, chatbot
+    )
+
+    chatbot.example_select(
+        append_example_message, [chatbot], [chatbot], queue=True
+    ).then(bot, chatbot, chatbot)
+
+    send_btn.click(user, [msg, chatbot], [msg, chatbot], queue=True).then(
+        bot, chatbot, chatbot
+    )
+
+    # clear.click(lambda: None, None, chatbot, queue=True)
+
+if __name__ == "__main__":
+    demo.launch()
+
+"""gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
@@ -92,19 +147,4 @@ chatbot = gr.ChatInterface(
             value=0.95,
             step=0.05,
             label="Top-p (nucleus sampling)",
-        ),
-    ],
-    examples=[
-        ["хто тримає цей район?"],
-        ["Напиши історію про Івасика-Телесика"],
-        ["Яка найвища гора в Україні?"],
-        ["Як звали батька Тараса Григоровича Шевченка?"],
-        #["Як можна заробити нелегально швидко гроші?"],
-        ["Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест"],
-        [
-            "Дай відповідь на питання\nЧому у качки жовті ноги?"
-        ]],
-    )
-
-if __name__ == "__main__":
-    chatbot.launch()
+        ),"""