crevans committed
Commit 2f404b2 · verified · 1 Parent(s): a8fbc80

Upload app.py

Files changed (1)
  1. app.py +53 -34
app.py CHANGED
@@ -60,15 +60,13 @@ try:
     print("✅ Google Cloud STT/TTS clients initialized.")
 except Exception as e:
     print(f"🛑 CRITICAL: Could not initialize Google Cloud clients. {e}")
-    # This will fail if the secret was not set correctly.
 
 # ==============================================================================
 # 4. HELPER FUNCTIONS (STT AND TTS)
 # ==============================================================================
 
 def transcribe_audio(audio_filepath: str, language_code: str):
-    if not audio_filepath:
-        return ""
+    if not audio_filepath: return ""
     print(f"Loading audio file: {audio_filepath}")
     try:
         audio = AudioSegment.from_file(audio_filepath)
@@ -96,7 +94,7 @@ def transcribe_audio(audio_filepath: str, language_code: str):
     finally:
         if audio_filepath and os.path.exists(audio_filepath):
             try: os.remove(audio_filepath)
-            except OSError: pass # Ignore if file deletion fails
+            except OSError: pass
 
 def synthesize_speech(text, voice_code):
     print(f"Synthesizing speech with requested code: {voice_code}...")
@@ -123,7 +121,6 @@ def synthesize_speech(text, voice_code):
     voice = texttospeech.VoiceSelectionParams(**voice_params)
     audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
 
-    # Diagnostic check
     if not voice_code.startswith("en"):
         try:
             print(f"--- Listing available voices for language code: {selected_language_code} ---")
@@ -148,47 +145,54 @@ def synthesize_speech(text, voice_code):
     return None
 
 # ==============================================================================
-# 4. CORE CHAT FUNCTION (AS A GENERATOR) - DUAL RESPONSE
+# 4. CORE CHAT FUNCTION (AS A GENERATOR) - *** UPDATED FOR GRADIO 4.x ***
 # ==============================================================================
 def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
+    """
+    Main function for the Gradio app. Handles filepath audio input, uses 'yield',
+    and generates BOTH a translation and a conversational reply.
+    HISTORY is now a list of dictionaries: [{"role": "user", "content": ...}]
+    """
     user_audio_path = audio_input
     if user_audio_path is None:
         yield history, None, None
         return
     print(f"Received audio filepath: {user_audio_path}")
 
+    # ----- STAGE 1: Transcribe User -----
     transcribed_text = transcribe_audio(user_audio_path, input_lang)
     if transcribed_text is None:
         transcribed_text = "[Error: Transcription failed internally]"
 
-    history.append((transcribed_text, None))
-    yield history, None, None
+    # --- HISTORY FIX 1 ---
+    # Append the user's transcribed text to the history in the new format
+    history.append({"role": "user", "content": transcribed_text})
+    yield history, None, None # Update UI with transcribed text
 
     if transcribed_text.startswith("["):
         return
 
+    # ----- STAGE 2: Get N-ATLaS Response (RUN 1: CONVERSATION) -----
     print("Generating N-ATLaS response (Run 1: Conversation)...")
+
     if output_voice.startswith("ha"): lang = "Hausa"
     elif output_voice.startswith("yo"): lang = "Yoruba"
     elif output_voice.startswith("ig"): lang = "Igbo"
     else: lang = "Nigerian English"
 
     system_prompt = f"You are a helpful, friendly assistant. Listen to what the user says and respond naturally. You must respond ONLY in {lang}."
-    messages = []
-    for user_msg, assistant_msg in history:
-        user_content = str(user_msg) if user_msg is not None else "[empty]"
-        messages.append({"role": "user", "content": user_content})
-        if assistant_msg:
-            if "**Conversational Reply:**" in str(assistant_msg):
-                reply_text = str(assistant_msg).split("---")[0].replace("**Conversational Reply:**\n", "").strip()
-                messages.append({"role": "assistant", "content": reply_text})
-            else:
-                messages.append({"role": "assistant", "content": str(assistant_msg)})
 
+    # --- HISTORY FIX 2 ---
+    # The history is already in the correct format. Just make a copy.
+    messages = list(history)
+
+    # Add the final system prompt
     conversation_messages = messages + [{"role": "system", "content": system_prompt}]
     conversation_prompt = tokenizer.apply_chat_template(conversation_messages, tokenize=False, add_generation_prompt=True)
+
     inputs = tokenizer(conversation_prompt, return_tensors="pt").to(model.device)
     input_length = inputs.input_ids.shape[1]
+
     outputs = model.generate(
         **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
         pad_token_id=tokenizer.eos_token_id, do_sample=True, temperature=0.7, top_p=0.9
@@ -196,28 +200,46 @@ def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
     conversational_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
     print(f" -> Conversational Reply: {conversational_text}")
 
+    # ----- STAGE 3: Get N-ATLaS Response (RUN 2: TRANSLATION) -----
     print("Generating N-ATLaS response (Run 2: Translation)...")
+
     translation_system_prompt = f"Translate the following text to {lang}:"
-    translation_messages = [{"role": "system", "content": translation_system_prompt}, {"role": "user", "content": transcribed_text}]
+    translation_messages = [
+        {"role": "system", "content": translation_system_prompt},
+        {"role": "user", "content": transcribed_text}
+    ]
     translation_prompt = tokenizer.apply_chat_template(translation_messages, tokenize=False, add_generation_prompt=True)
+
     inputs = tokenizer(translation_prompt, return_tensors="pt").to(model.device)
     input_length = inputs.input_ids.shape[1]
+
     outputs = model.generate(
         **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
         pad_token_id=tokenizer.eos_token_id, do_sample=False, temperature=0.1, top_p=0.9
     )
-    translation_text = tokenizer.decode(outputs[0][input_length:], skip__special_tokens=True).strip()
+    translation_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
     print(f" -> Direct Translation: {translation_text}")
 
+    # ----- STAGE 4: Synthesize and Format Response -----
    bot_audio_path = synthesize_speech(conversational_text, output_voice)
-    bot_response_string = f"**Conversational Reply:**\n{conversational_text}\n\n---\n**Direct Translation:**\n{translation_text}"
-    final_user_text = transcribed_text if transcribed_text is not None else "[Error]"
-    history[-1] = (final_user_text, bot_response_string)
+
+    bot_response_string = f"""
+**Conversational Reply:**
+{conversational_text}
 
+---
+**Direct Translation:**
+{translation_text}
+"""
+    # --- HISTORY FIX 3 ---
+    # Append the bot's complete response to the history
+    history.append({"role": "assistant", "content": bot_response_string})
+
+    # Yield the final history, the bot's audio, and clear the mic input
     yield history, bot_audio_path, None
 
 # ==============================================================================
-# 5. GRADIO UI (using Blocks) - Modern Gradio 4.x compatible
+# 5. GRADIO UI (using Blocks) - *** UPDATED FOR GRADIO 4.x ***
 # ==============================================================================
 with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     gr.Markdown("# 🇳🇬 N-ATLaS Multilingual Voice Test")
@@ -249,11 +271,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
         value="en-NG"
     )
 
-    # --- FIX for Gradio 4.x warning ---
+    # --- UI FIX 1 ---
+    # Set type="messages" for the Chatbot component
     chatbot = gr.Chatbot(label="Conversation", height=400, type="messages")
 
     mic_input = gr.Audio(
-        sources=["microphone"],
+        sources=["microphone"], # Use 'sources' (plural) for Gradio 4.x
         type="filepath",
         label="3. Press record and speak"
     )
@@ -263,12 +286,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     )
     submit_btn = gr.Button("Submit Audio")
 
-    # --- FIX for Gradio 4.x history ---
-    # We must now use the 'type="messages"' format: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
-    # This requires modifying the chat function to handle this new format.
-
-    # Let's revert to the simpler gr.State() for now to avoid a full rewrite,
-    # as the warning is not critical. But we'll fix the launch.
+    # --- UI FIX 2 ---
+    # Initialize history as an empty list (Gradio 4.x handles this)
     chat_history = gr.State([])
 
     submit_btn.click(
@@ -278,5 +297,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     )
 
     print("Launching Gradio interface...")
-    # Add share=True back, as it's good practice for sharing
-    iface.launch(share=True)
+    # No share=True needed on Spaces, and queue is enabled by default
+    iface.launch()
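
The core of this commit is the switch from Gradio's legacy tuple-based chat history to the `type="messages"` format, where history is a plain list of `{"role": ..., "content": ...}` dicts. A minimal, self-contained sketch of that pattern (the echo reply and component names below are illustrative placeholders, not the app's actual model call):

```python
import gradio as gr

# Minimal sketch of the Gradio 4.x "messages" history pattern.
# History is a list of {"role": ..., "content": ...} dicts, so the same
# object can feed gr.Chatbot(type="messages") and, later, a tokenizer
# chat template without any format conversion.
def chat_step(user_text, history):
    history.append({"role": "user", "content": user_text})
    yield history, history          # show the user turn immediately
    reply = f"Echo: {user_text}"    # placeholder for model.generate(...)
    history.append({"role": "assistant", "content": reply})
    yield history, history          # show the assistant turn

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages", label="Conversation")
    box = gr.Textbox(label="Say something")
    state = gr.State([])
    box.submit(chat_step, inputs=[box, state], outputs=[chatbot, state])

if __name__ == "__main__":
    demo.launch()
```

Because the stored history is already role/content dicts, the one-liner `messages = list(history)` in HISTORY FIX 2 can replace the old ten-line loop that reparsed tuple pairs and stripped the `**Conversational Reply:**` markup back out of previous bot turns.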
 
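Two decoding regimes drive the dual response: the conversational run samples (`do_sample=True, temperature=0.7, top_p=0.9`), while the translation run sets `do_sample=False`, which selects greedy decoding. In 🤗 Transformers, `temperature` and `top_p` are ignored when sampling is off (recent versions log a warning about unused sampling flags), so the translation output is deterministic. A sketch of the two configurations, with values copied from the diff:

```python
from transformers import GenerationConfig

# Run 1 (conversation): sampling on, so temperature/top_p shape the output.
conversation_cfg = GenerationConfig(
    max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9
)

# Run 2 (translation): do_sample=False means greedy decoding; the
# temperature=0.1 / top_p=0.9 passed in the diff have no effect here
# and could simply be dropped.
translation_cfg = GenerationConfig(max_new_tokens=256, do_sample=False)

# Usage, assuming `model`, `tokenizer`, and `prompt` exist as in app.py:
# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# outputs = model.generate(
#     **inputs, generation_config=conversation_cfg,
#     eos_token_id=tokenizer.eos_token_id,
#     pad_token_id=tokenizer.eos_token_id,
# )
```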