Upload app.py

app.py CHANGED
@@ -60,15 +60,13 @@ try:
     print("✅ Google Cloud STT/TTS clients initialized.")
 except Exception as e:
     print(f"🔥 CRITICAL: Could not initialize Google Cloud clients. {e}")
-    # This will fail if the secret was not set correctly.
 
 # ==============================================================================
 # 4. HELPER FUNCTIONS (STT AND TTS)
 # ==============================================================================
 
 def transcribe_audio(audio_filepath: str, language_code: str):
-    if not audio_filepath:
-        return ""
+    if not audio_filepath: return ""
     print(f"Loading audio file: {audio_filepath}")
     try:
         audio = AudioSegment.from_file(audio_filepath)
@@ -96,7 +94,7 @@ def transcribe_audio(audio_filepath: str, language_code: str):
     finally:
         if audio_filepath and os.path.exists(audio_filepath):
             try: os.remove(audio_filepath)
-            except OSError: pass
+            except OSError: pass
 
 def synthesize_speech(text, voice_code):
     print(f"Synthesizing speech with requested code: {voice_code}...")
@@ -123,7 +121,6 @@ def synthesize_speech(text, voice_code):
     voice = texttospeech.VoiceSelectionParams(**voice_params)
     audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
 
-    # Diagnostic check
     if not voice_code.startswith("en"):
         try:
             print(f"--- Listing available voices for language code: {selected_language_code} ---")
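
(For reference, a minimal standalone sketch of the Google Cloud Text-to-Speech call path that `synthesize_speech` wraps, assuming only the `google-cloud-texttospeech` package and application-default credentials. `synthesize_to_mp3` and `out_path` are illustrative names, not identifiers from app.py, and the real function builds `VoiceSelectionParams` from a `voice_params` dict that this hunk does not show:)

    from google.cloud import texttospeech

    def synthesize_to_mp3(text: str, language_code: str, out_path: str = "reply.mp3") -> str:
        # Build the same three objects the diff configures, then write the MP3 bytes.
        client = texttospeech.TextToSpeechClient()
        synthesis_input = texttospeech.SynthesisInput(text=text)
        voice = texttospeech.VoiceSelectionParams(language_code=language_code)
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        )
        response = client.synthesize_speech(
            input=synthesis_input, voice=voice, audio_config=audio_config
        )
        with open(out_path, "wb") as f:
            f.write(response.audio_content)
        return out_path
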
@@ -148,47 +145,54 @@ def synthesize_speech(text, voice_code):
         return None
 
 # ==============================================================================
-# 4. CORE CHAT FUNCTION (AS A GENERATOR) -
+# 4. CORE CHAT FUNCTION (AS A GENERATOR) - *** UPDATED FOR GRADIO 4.x ***
 # ==============================================================================
 def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
+    """
+    Main function for the Gradio app. Handles filepath audio input, uses 'yield',
+    and generates BOTH a translation and a conversational reply.
+    HISTORY is now a list of dictionaries: [{"role": "user", "content": ...}]
+    """
     user_audio_path = audio_input
     if user_audio_path is None:
         yield history, None, None
         return
     print(f"Received audio filepath: {user_audio_path}")
 
+    # ----- STAGE 1: Transcribe User -----
     transcribed_text = transcribe_audio(user_audio_path, input_lang)
     if transcribed_text is None:
         transcribed_text = "[Error: Transcription failed internally]"
 
-
-
+    # --- HISTORY FIX 1 ---
+    # Append the user's transcribed text to the history in the new format
+    history.append({"role": "user", "content": transcribed_text})
+    yield history, None, None  # Update UI with transcribed text
 
     if transcribed_text.startswith("["):
         return
 
+    # ----- STAGE 2: Get N-ATLaS Response (RUN 1: CONVERSATION) -----
     print("Generating N-ATLaS response (Run 1: Conversation)...")
+
     if output_voice.startswith("ha"): lang = "Hausa"
     elif output_voice.startswith("yo"): lang = "Yoruba"
     elif output_voice.startswith("ig"): lang = "Igbo"
     else: lang = "Nigerian English"
 
     system_prompt = f"You are a helpful, friendly assistant. Listen to what the user says and respond naturally. You must respond ONLY in {lang}."
-    messages = []
-    for user_msg, assistant_msg in history:
-        user_content = str(user_msg) if user_msg is not None else "[empty]"
-        messages.append({"role": "user", "content": user_content})
-        if assistant_msg:
-            if "**Conversational Reply:**" in str(assistant_msg):
-                reply_text = str(assistant_msg).split("---")[0].replace("**Conversational Reply:**\n", "").strip()
-                messages.append({"role": "assistant", "content": reply_text})
-            else:
-                messages.append({"role": "assistant", "content": str(assistant_msg)})
 
+    # --- HISTORY FIX 2 ---
+    # The history is already in the correct format. Just make a copy.
+    messages = list(history)
+
+    # Add the final system prompt
     conversation_messages = messages + [{"role": "system", "content": system_prompt}]
     conversation_prompt = tokenizer.apply_chat_template(conversation_messages, tokenize=False, add_generation_prompt=True)
+
     inputs = tokenizer(conversation_prompt, return_tensors="pt").to(model.device)
     input_length = inputs.input_ids.shape[1]
+
     outputs = model.generate(
         **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
         pad_token_id=tokenizer.eos_token_id, do_sample=True, temperature=0.7, top_p=0.9
@@ -196,28 +200,46 @@ def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
     conversational_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
     print(f" -> Conversational Reply: {conversational_text}")
 
+    # ----- STAGE 3: Get N-ATLaS Response (RUN 2: TRANSLATION) -----
     print("Generating N-ATLaS response (Run 2: Translation)...")
+
     translation_system_prompt = f"Translate the following text to {lang}:"
-    translation_messages = [
+    translation_messages = [
+        {"role": "system", "content": translation_system_prompt},
+        {"role": "user", "content": transcribed_text}
+    ]
     translation_prompt = tokenizer.apply_chat_template(translation_messages, tokenize=False, add_generation_prompt=True)
+
     inputs = tokenizer(translation_prompt, return_tensors="pt").to(model.device)
     input_length = inputs.input_ids.shape[1]
+
     outputs = model.generate(
         **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
         pad_token_id=tokenizer.eos_token_id, do_sample=False, temperature=0.1, top_p=0.9
     )
-    translation_text = tokenizer.decode(outputs[0][input_length:],
+    translation_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
     print(f" -> Direct Translation: {translation_text}")
 
+    # ----- STAGE 4: Synthesize and Format Response -----
     bot_audio_path = synthesize_speech(conversational_text, output_voice)
-
-
-
+
+    bot_response_string = f"""
+**Conversational Reply:**
+{conversational_text}
 
+---
+**Direct Translation:**
+{translation_text}
+"""
+    # --- HISTORY FIX 3 ---
+    # Append the bot's complete response to the history
+    history.append({"role": "assistant", "content": bot_response_string})
+
+    # Yield the final history, the bot's audio, and clear the mic input
     yield history, bot_audio_path, None
 
 # ==============================================================================
-# 5. GRADIO UI (using Blocks) -
+# 5. GRADIO UI (using Blocks) - *** UPDATED FOR GRADIO 4.x ***
 # ==============================================================================
 with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     gr.Markdown("# 🇳🇬 N-ATLaS Multilingual Voice Test")
@@ -249,11 +271,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
         value="en-NG"
     )
 
-    # --- FIX
+    # --- UI FIX 1 ---
+    # Set type="messages" for the Chatbot component
     chatbot = gr.Chatbot(label="Conversation", height=400, type="messages")
 
     mic_input = gr.Audio(
-        sources=["microphone"],
+        sources=["microphone"],  # Use 'sources' (plural) for Gradio 4.x
        type="filepath",
         label="3. Press record and speak"
     )
@@ -263,12 +286,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     )
     submit_btn = gr.Button("Submit Audio")
 
-    # --- FIX
-    #
-    # This requires modifying the chat function to handle this new format.
-
-    # Let's revert to the simpler gr.State() for now to avoid a full rewrite,
-    # as the warning is not critical. But we'll fix the launch.
+    # --- UI FIX 2 ---
+    # Initialize history as an empty list (Gradio 4.x handles this)
     chat_history = gr.State([])
 
     submit_btn.click(
@@ -278,5 +297,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     )
 
 print("Launching Gradio interface...")
-#
-iface.launch(
+# No share=True needed on Spaces, and queue is enabled by default
+iface.launch()
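
The thread running through these hunks is the migration from Gradio's legacy tuple-based chat history to the `type="messages"` format, where history is a flat list of `{"role": ..., "content": ...}` dicts, the same shape `tokenizer.apply_chat_template` consumes; that is why the old tuple-unpacking loop could be deleted outright. A minimal self-contained sketch of the pattern, with a toy echo reply standing in for the STT/model/TTS pipeline (component and function names here are illustrative, not from app.py):

    import gradio as gr

    def chat_step(user_text, history):
        # history is a flat list of {"role": ..., "content": ...} dicts,
        # the same structure tokenizer.apply_chat_template accepts.
        history = list(history)
        history.append({"role": "user", "content": user_text})
        yield history, history  # first yield: show the user turn immediately
        reply = f"You said: {user_text}"  # stand-in for STT -> model -> TTS
        history.append({"role": "assistant", "content": reply})
        yield history, history  # second yield: show the assistant turn

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(label="Conversation", height=400, type="messages")
        box = gr.Textbox(label="Type a message")
        state = gr.State([])
        box.submit(chat_step, inputs=[box, state], outputs=[chatbot, state])

    demo.launch()

Because the handler is a generator, each `yield` pushes an intermediate update to the UI, which is how the diff shows the transcribed user turn before the model has replied. Gradio 4.x queues requests by default, so no explicit `.queue()` call is needed, which is also why the file can end with a bare `iface.launch()`.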
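
Finally, the two `model.generate` calls in `speech_to_speech_chat` (a sampled conversational run, then a greedy translation run) share all of their boilerplate. A hedged sketch of that shared pass, assuming the usual Hugging Face `model`/`tokenizer` pair loaded earlier in app.py; `run_chat` is an illustrative helper, not a function from the file. Note the diff passes `temperature`/`top_p` even when `do_sample=False`, which transformers ignores with a warning; the sketch drops them on the greedy path:

    import torch

    def run_chat(model, tokenizer, messages, sample=True):
        # One chat-templated generation pass over role/content messages.
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        input_length = inputs.input_ids.shape[1]
        gen_kwargs = dict(
            max_new_tokens=256,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        if sample:  # Run 1 (conversation) samples; Run 2 (translation) is greedy
            gen_kwargs.update(do_sample=True, temperature=0.7, top_p=0.9)
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
        # Decode only the newly generated tokens, as the diff does.
        return tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()

    # Mirroring the diff: a sampled conversational reply (system prompt appended
    # after the history, as the file does), then a greedy translation of the
    # user's transcribed text.
    # reply = run_chat(model, tokenizer,
    #                  history + [{"role": "system", "content": system_prompt}])
    # translation = run_chat(model, tokenizer, [
    #     {"role": "system", "content": f"Translate the following text to {lang}:"},
    #     {"role": "user", "content": transcribed_text},
    # ], sample=False)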