Upload app.py

app.py CHANGED
@@ -60,15 +60,13 @@ try:
     print("✅ Google Cloud STT/TTS clients initialized.")
 except Exception as e:
     print(f"🔥 CRITICAL: Could not initialize Google Cloud clients. {e}")
-    # This will fail if the secret was not set correctly.
 
 # ==============================================================================
 # 4. HELPER FUNCTIONS (STT AND TTS)
 # ==============================================================================
 
 def transcribe_audio(audio_filepath: str, language_code: str):
-    if not audio_filepath:
-        return ""
+    if not audio_filepath: return ""
     print(f"Loading audio file: {audio_filepath}")
     try:
         audio = AudioSegment.from_file(audio_filepath)
@@ -96,7 +94,7 @@ def transcribe_audio(audio_filepath: str, language_code: str):
     finally:
         if audio_filepath and os.path.exists(audio_filepath):
             try: os.remove(audio_filepath)
-            except OSError: pass
+            except OSError: pass
 
 def synthesize_speech(text, voice_code):
     print(f"Synthesizing speech with requested code: {voice_code}...")
@@ -123,7 +121,6 @@ def synthesize_speech(text, voice_code):
     voice = texttospeech.VoiceSelectionParams(**voice_params)
     audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
 
-    # Diagnostic check
     if not voice_code.startswith("en"):
         try:
             print(f"--- Listing available voices for language code: {selected_language_code} ---")
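
(For reference, a minimal standalone sketch of the Google Cloud Text-to-Speech call path that `synthesize_speech` wraps, assuming only the `google-cloud-texttospeech` package and application-default credentials. `synthesize_to_mp3` and `out_path` are illustrative names, not identifiers from app.py, and the real function builds `VoiceSelectionParams` from a `voice_params` dict that this hunk does not show:)

    from google.cloud import texttospeech

    def synthesize_to_mp3(text: str, language_code: str, out_path: str = "reply.mp3") -> str:
        # Build the same three objects the diff configures, then write the MP3 bytes.
        client = texttospeech.TextToSpeechClient()
        synthesis_input = texttospeech.SynthesisInput(text=text)
        voice = texttospeech.VoiceSelectionParams(language_code=language_code)
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        )
        response = client.synthesize_speech(
            input=synthesis_input, voice=voice, audio_config=audio_config
        )
        with open(out_path, "wb") as f:
            f.write(response.audio_content)
        return out_path
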
@@ -148,47 +145,54 @@ def synthesize_speech(text, voice_code):
         return None
 
 # ==============================================================================
-# 4. CORE CHAT FUNCTION (AS A GENERATOR) -
+# 4. CORE CHAT FUNCTION (AS A GENERATOR) - *** UPDATED FOR GRADIO 4.x ***
 # ==============================================================================
 def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
+    """
+    Main function for the Gradio app. Handles filepath audio input, uses 'yield',
+    and generates BOTH a translation and a conversational reply.
+    HISTORY is now a list of dictionaries: [{"role": "user", "content": ...}]
+    """
     user_audio_path = audio_input
     if user_audio_path is None:
         yield history, None, None
         return
     print(f"Received audio filepath: {user_audio_path}")
 
+    # ----- STAGE 1: Transcribe User -----
     transcribed_text = transcribe_audio(user_audio_path, input_lang)
     if transcribed_text is None:
         transcribed_text = "[Error: Transcription failed internally]"
 
-
-
+    # --- HISTORY FIX 1 ---
+    # Append the user's transcribed text to the history in the new format
+    history.append({"role": "user", "content": transcribed_text})
+    yield history, None, None  # Update UI with transcribed text
 
     if transcribed_text.startswith("["):
         return
 
+    # ----- STAGE 2: Get N-ATLaS Response (RUN 1: CONVERSATION) -----
     print("Generating N-ATLaS response (Run 1: Conversation)...")
+
     if output_voice.startswith("ha"): lang = "Hausa"
     elif output_voice.startswith("yo"): lang = "Yoruba"
     elif output_voice.startswith("ig"): lang = "Igbo"
     else: lang = "Nigerian English"
 
     system_prompt = f"You are a helpful, friendly assistant. Listen to what the user says and respond naturally. You must respond ONLY in {lang}."
-    messages = []
-    for user_msg, assistant_msg in history:
-        user_content = str(user_msg) if user_msg is not None else "[empty]"
-        messages.append({"role": "user", "content": user_content})
-        if assistant_msg:
-            if "**Conversational Reply:**" in str(assistant_msg):
-                reply_text = str(assistant_msg).split("---")[0].replace("**Conversational Reply:**\n", "").strip()
-                messages.append({"role": "assistant", "content": reply_text})
-            else:
-                messages.append({"role": "assistant", "content": str(assistant_msg)})
 
+    # --- HISTORY FIX 2 ---
+    # The history is already in the correct format. Just make a copy.
+    messages = list(history)
+
+    # Add the final system prompt
     conversation_messages = messages + [{"role": "system", "content": system_prompt}]
     conversation_prompt = tokenizer.apply_chat_template(conversation_messages, tokenize=False, add_generation_prompt=True)
+
     inputs = tokenizer(conversation_prompt, return_tensors="pt").to(model.device)
     input_length = inputs.input_ids.shape[1]
+
     outputs = model.generate(
         **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
         pad_token_id=tokenizer.eos_token_id, do_sample=True, temperature=0.7, top_p=0.9
@@ -196,28 +200,46 @@ def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
     conversational_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
     print(f" -> Conversational Reply: {conversational_text}")
 
+    # ----- STAGE 3: Get N-ATLaS Response (RUN 2: TRANSLATION) -----
     print("Generating N-ATLaS response (Run 2: Translation)...")
+
     translation_system_prompt = f"Translate the following text to {lang}:"
-    translation_messages = [
+    translation_messages = [
+        {"role": "system", "content": translation_system_prompt},
+        {"role": "user", "content": transcribed_text}
+    ]
     translation_prompt = tokenizer.apply_chat_template(translation_messages, tokenize=False, add_generation_prompt=True)
+
     inputs = tokenizer(translation_prompt, return_tensors="pt").to(model.device)
     input_length = inputs.input_ids.shape[1]
+
     outputs = model.generate(
         **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
         pad_token_id=tokenizer.eos_token_id, do_sample=False, temperature=0.1, top_p=0.9
     )
-    translation_text = tokenizer.decode(outputs[0][input_length:],
+    translation_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
     print(f" -> Direct Translation: {translation_text}")
 
+    # ----- STAGE 4: Synthesize and Format Response -----
     bot_audio_path = synthesize_speech(conversational_text, output_voice)
-
-
-
+
+    bot_response_string = f"""
+**Conversational Reply:**
+{conversational_text}
 
+---
+**Direct Translation:**
+{translation_text}
+"""
+    # --- HISTORY FIX 3 ---
+    # Append the bot's complete response to the history
+    history.append({"role": "assistant", "content": bot_response_string})
+
+    # Yield the final history, the bot's audio, and clear the mic input
     yield history, bot_audio_path, None
 
 # ==============================================================================
-# 5. GRADIO UI (using Blocks) -
+# 5. GRADIO UI (using Blocks) - *** UPDATED FOR GRADIO 4.x ***
 # ==============================================================================
 with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     gr.Markdown("# 🇳🇬 N-ATLaS Multilingual Voice Test")
@@ -249,11 +271,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
         value="en-NG"
     )
 
-    # --- FIX
+    # --- UI FIX 1 ---
+    # Set type="messages" for the Chatbot component
     chatbot = gr.Chatbot(label="Conversation", height=400, type="messages")
 
     mic_input = gr.Audio(
-        sources=["microphone"],
+        sources=["microphone"],  # Use 'sources' (plural) for Gradio 4.x
        type="filepath",
         label="3. Press record and speak"
     )
@@ -263,12 +286,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     )
     submit_btn = gr.Button("Submit Audio")
 
-    # --- FIX
-    #
-    # This requires modifying the chat function to handle this new format.
-
-    # Let's revert to the simpler gr.State() for now to avoid a full rewrite,
-    # as the warning is not critical. But we'll fix the launch.
+    # --- UI FIX 2 ---
+    # Initialize history as an empty list (Gradio 4.x handles this)
     chat_history = gr.State([])
 
     submit_btn.click(
@@ -278,5 +297,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
     )
 
 print("Launching Gradio interface...")
-#
-iface.launch(
+# No share=True needed on Spaces, and queue is enabled by default
+iface.launch()
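
The thread running through these hunks is the migration from Gradio's legacy tuple-based chat history to the `type="messages"` format, where history is a flat list of `{"role": ..., "content": ...}` dicts, the same shape `tokenizer.apply_chat_template` consumes; that is why the old tuple-unpacking loop could be deleted outright. A minimal self-contained sketch of the pattern, with a toy echo reply standing in for the STT/model/TTS pipeline (component and function names here are illustrative, not from app.py):

    import gradio as gr

    def chat_step(user_text, history):
        # history is a flat list of {"role": ..., "content": ...} dicts,
        # the same structure tokenizer.apply_chat_template accepts.
        history = list(history)
        history.append({"role": "user", "content": user_text})
        yield history, history  # first yield: show the user turn immediately
        reply = f"You said: {user_text}"  # stand-in for STT -> model -> TTS
        history.append({"role": "assistant", "content": reply})
        yield history, history  # second yield: show the assistant turn

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(label="Conversation", height=400, type="messages")
        box = gr.Textbox(label="Type a message")
        state = gr.State([])
        box.submit(chat_step, inputs=[box, state], outputs=[chatbot, state])

    demo.launch()

Because the handler is a generator, each `yield` pushes an intermediate update to the UI, which is how the diff shows the transcribed user turn before the model has replied. Gradio 4.x queues requests by default, so no explicit `.queue()` call is needed, which is also why the file can end with a bare `iface.launch()`.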
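
Finally, the two `model.generate` calls in `speech_to_speech_chat` (a sampled conversational run, then a greedy translation run) share all of their boilerplate. A hedged sketch of that shared pass, assuming the usual Hugging Face `model`/`tokenizer` pair loaded earlier in app.py; `run_chat` is an illustrative helper, not a function from the file. Note the diff passes `temperature`/`top_p` even when `do_sample=False`, which transformers ignores with a warning; the sketch drops them on the greedy path:

    import torch

    def run_chat(model, tokenizer, messages, sample=True):
        # One chat-templated generation pass over role/content messages.
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        input_length = inputs.input_ids.shape[1]
        gen_kwargs = dict(
            max_new_tokens=256,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        if sample:  # Run 1 (conversation) samples; Run 2 (translation) is greedy
            gen_kwargs.update(do_sample=True, temperature=0.7, top_p=0.9)
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
        # Decode only the newly generated tokens, as the diff does.
        return tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()

    # Mirroring the diff: a sampled conversational reply (system prompt appended
    # after the history, as the file does), then a greedy translation of the
    # user's transcribed text.
    # reply = run_chat(model, tokenizer,
    #                  history + [{"role": "system", "content": system_prompt}])
    # translation = run_chat(model, tokenizer, [
    #     {"role": "system", "content": f"Translate the following text to {lang}:"},
    #     {"role": "user", "content": transcribed_text},
    # ], sample=False)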