import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.cloud import speech, texttospeech
import os
import tempfile
import time
from pydub import AudioSegment 

# ==============================================================================
# 1. HANDLE AUTHENTICATION FROM HUGGING FACE SECRETS
# ==============================================================================

hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    print("WARNING: HF_TOKEN secret not set. Download may fail.")

gcp_key_json_string = os.environ.get("GCP_SERVICE_ACCOUNT_KEY")
if not gcp_key_json_string:
    print("🛑 CRITICAL: GCP_SERVICE_ACCOUNT_KEY secret not set. STT/TTS will fail.")
else:
    try:
        # We must write the secret string to a temporary file for Google's clients
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json") as f:
            f.write(gcp_key_json_string)
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f.name
            print(f"✅ Google credentials written to temporary file: {f.name}")
    except Exception as e:
        print(f"🛑 CRITICAL: Failed to write GCP key to temp file. {e}")

# ==============================================================================
# 2. CONFIGURE AND LOAD N-ATLaS MODEL (FOR T4 GPU)
# ==============================================================================

MODEL_ID = "NCAIR1/N-ATLaS"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print(f"Loading model: {MODEL_ID} with 4-bit quantization...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token
)
print("✅ N-ATLaS Model loaded.")

# ==============================================================================
# 3. INITIALIZE GOOGLE CLOUD CLIENTS
# ==============================================================================
try:
    speech_client = speech.SpeechClient()
    tts_client = texttospeech.TextToSpeechClient()
    print("✅ Google Cloud STT/TTS clients initialized.")
except Exception as e:
    print(f"🛑 CRITICAL: Could not initialize Google Cloud clients. {e}")

# ==============================================================================
# 4. HELPER FUNCTIONS (STT AND TTS)
# ==============================================================================

def transcribe_audio(audio_filepath: str, language_code: str):
    if not audio_filepath: return ""
    print(f"Loading audio file: {audio_filepath}")
    try:
        audio = AudioSegment.from_file(audio_filepath)
        print("   -> AudioSegment loaded successfully.")
        target_sample_rate = 16000
        target_channels = 1 
        audio = audio.set_frame_rate(target_sample_rate).set_channels(target_channels)
        wav_data = audio.raw_data
        print(f"Transcribing {len(wav_data)} bytes with language: {language_code} at {target_sample_rate} Hz...")
        recognition_audio = speech.RecognitionAudio(content=wav_data)
        recognition_config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=target_sample_rate,
            language_code=language_code,
            audio_channel_count=target_channels
        )
        response = speech_client.recognize(config=recognition_config, audio=recognition_audio)
        if not response.results: return "[Could not understand audio]"
        transcribed_text = response.results[0].alternatives[0].transcript
        print(f"   -> Transcribed: {transcribed_text}")
        return transcribed_text
    except Exception as e:
        print(f"   -> 🛑 ERROR during audio processing or transcription: {e}")
        return f"[Error processing audio: {e}]"
    finally:
        if audio_filepath and os.path.exists(audio_filepath):
             try: os.remove(audio_filepath)
             except OSError: pass

def synthesize_speech(text, voice_code):
    print(f"Synthesizing speech with requested code: {voice_code}...")
    synthesis_input = texttospeech.SynthesisInput(text=text)

    selected_voice_name = None 
    selected_ssml_gender = None 

    if voice_code.startswith("en"):
        selected_language_code = "en-US" 
        selected_voice_name = "en-US-Wavenet-A"
        print(f"   -> Using high-quality English voice: {selected_voice_name}")
    else:
        selected_language_code = voice_code.split('-')[0] # Use 'ha', 'ig', 'yo'
        selected_ssml_gender = texttospeech.SsmlVoiceGender.FEMALE 
        print(f"   -> Requesting default FEMALE voice for language: {selected_language_code}")

    voice_params = {"language_code": selected_language_code}
    if selected_voice_name:
        voice_params["name"] = selected_voice_name
    elif selected_ssml_gender:
         voice_params["ssml_gender"] = selected_ssml_gender

    voice = texttospeech.VoiceSelectionParams(**voice_params)
    audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)

    if not voice_code.startswith("en"):
        try:
            print(f"--- Listing available voices for language code: {selected_language_code} ---")
            list_voices_response = tts_client.list_voices(language_code=selected_language_code)
            available_voices = [v.name for v in list_voices_response.voices]
            if available_voices:
                print(f"Available voices found: {available_voices}")
            else:
                print("No voices found for this language code.")
        except Exception as list_err:
            print(f"   -> ERROR trying to list voices: {list_err}")
    
    try:
        response = tts_client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
            fp.write(response.audio_content)
            temp_audio_path = fp.name
        print(f"   -> Audio saved to: {temp_audio_path}")
        return temp_audio_path
    except Exception as e:
        print(f"   -> 🛑 ERROR during speech synthesis: {e}")
        return None

# ==============================================================================
# 4. CORE CHAT FUNCTION (AS A GENERATOR) - *** UPDATED FOR GRADIO 4.x ***
# ==============================================================================
def speech_to_speech_chat(audio_input, history, input_lang, output_voice):
    """
    Main function for the Gradio app. Handles filepath audio input, uses 'yield',
    and generates BOTH a translation and a conversational reply.
    HISTORY is now a list of dictionaries: [{"role": "user", "content": ...}]
    """
    user_audio_path = audio_input 
    if user_audio_path is None:
         yield history, None, None
         return 
    print(f"Received audio filepath: {user_audio_path}")

    # ----- STAGE 1: Transcribe User -----
    transcribed_text = transcribe_audio(user_audio_path, input_lang) 
    if transcribed_text is None:
        transcribed_text = "[Error: Transcription failed internally]"

    # --- HISTORY FIX 1 ---
    # Append the user's transcribed text to the history in the new format
    history.append({"role": "user", "content": transcribed_text})
    yield history, None, None # Update UI with transcribed text

    if transcribed_text.startswith("["):
        return 

    # ----- STAGE 2: Get N-ATLaS Response (RUN 1: CONVERSATION) -----
    print("Generating N-ATLaS response (Run 1: Conversation)...")

    if output_voice.startswith("ha"): lang = "Hausa"
    elif output_voice.startswith("yo"): lang = "Yoruba"
    elif output_voice.startswith("ig"): lang = "Igbo"
    else: lang = "Nigerian English"

    system_prompt = f"You are a helpful, friendly assistant. Listen to what the user says and respond naturally. You must respond ONLY in {lang}."
    
    # --- HISTORY FIX 2 ---
    # The history is already in the correct format. Just make a copy.
    messages = list(history)
    
    # Add the final system prompt
    conversation_messages = messages + [{"role": "system", "content": system_prompt}]
    conversation_prompt = tokenizer.apply_chat_template(conversation_messages, tokenize=False, add_generation_prompt=True)
    
    inputs = tokenizer(conversation_prompt, return_tensors="pt").to(model.device)
    input_length = inputs.input_ids.shape[1]

    outputs = model.generate(
        **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, do_sample=True, temperature=0.7, top_p=0.9
    )
    conversational_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
    print(f"   -> Conversational Reply: {conversational_text}")

    # ----- STAGE 3: Get N-ATLaS Response (RUN 2: TRANSLATION) -----
    print("Generating N-ATLaS response (Run 2: Translation)...")
    
    translation_system_prompt = f"Translate the following text to {lang}:"
    translation_messages = [
        {"role": "system", "content": translation_system_prompt},
        {"role": "user", "content": transcribed_text} 
    ]
    translation_prompt = tokenizer.apply_chat_template(translation_messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(translation_prompt, return_tensors="pt").to(model.device)
    input_length = inputs.input_ids.shape[1]

    outputs = model.generate(
        **inputs, max_new_tokens=256, eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, do_sample=False, temperature=0.1, top_p=0.9 
    )
    translation_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
    print(f"   -> Direct Translation: {translation_text}")
    
    # ----- STAGE 4: Synthesize and Format Response -----
    bot_audio_path = synthesize_speech(conversational_text, output_voice)
    
    bot_response_string = f"""
**Conversational Reply:**
{conversational_text}

---
**Direct Translation:**
{translation_text}
"""
    # --- HISTORY FIX 3 ---
    # Append the bot's complete response to the history
    history.append({"role": "assistant", "content": bot_response_string})

    # Yield the final history, the bot's audio, and clear the mic input
    yield history, bot_audio_path, None

# ==============================================================================
# 5. GRADIO UI (using Blocks) - *** UPDATED FOR GRADIO 4.x ***
# ==============================================================================
with gr.Blocks(theme=gr.themes.Soft(), title="N-ATLaS Voice Test") as iface:
    gr.Markdown("# 🇳🇬 N-ATLaS Multilingual Voice Test")
    gr.Markdown(
        "**Instructions:** Select your spoken language and desired response voice. "
        "Speak into the microphone, then press 'Submit'.\n"
        "**This app is running on a T4 GPU. Responses should be fast.**"
    )
    with gr.Row():
        input_lang = gr.Dropdown(
            label="1. Language I am Speaking",
            choices=[
                ("American English", "en-US"),
                ("Nigerian Pidgin / English", "en-NG"),
                ("Hausa", "ha-NG"),
                ("Igbo", "ig-NG"),
                ("Yoruba", "yo-NG")
            ],
            value="en-US" 
        )
        output_voice = gr.Dropdown(
            label="2. Language for Bot to Speak",
            choices=[
                ("Nigerian English", "en-NG"),
                ("Hausa", "ha-NG"),
                ("Igbo", "ig-NG"),
                ("Yoruba", "yo-NG")
            ],
            value="en-NG"
        )
    
    # --- UI FIX 1 ---
    # Set type="messages" for the Chatbot component
    chatbot = gr.Chatbot(label="Conversation", height=400, type="messages")
    
    mic_input = gr.Audio(
        sources=["microphone"], # Use 'sources' (plural) for Gradio 4.x
        type="filepath",     
        label="3. Press record and speak"
    )
    bot_audio_output = gr.Audio(
        label="Bot's Spoken Response",
        autoplay=True
    )
    submit_btn = gr.Button("Submit Audio")
    
    # --- UI FIX 2 ---
    # Initialize history as an empty list (Gradio 4.x handles this)
    chat_history = gr.State([]) 
    
    submit_btn.click(
        fn=speech_to_speech_chat,
        inputs=[mic_input, chat_history, input_lang, output_voice],
        outputs=[chatbot, bot_audio_output, mic_input]
    )

print("Launching Gradio interface...")
# No share=True needed on Spaces, and queue is enabled by default
iface.launch()