| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| import numpy as np | |
# Pick the device and matching precision with a single CUDA probe:
# "cuda:0" with float16 when CUDA is available, otherwise CPU with float32.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the checkpoint in the selected precision, then move it to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Shared speech-recognition pipeline used by both transcription callbacks.
_pipeline_options = {
    "model": model,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
    "torch_dtype": torch_dtype,
    "device": device,
    "chunk_length_s": 30,
    "batch_size": 8,
}
pipe = pipeline("automatic-speech-recognition", **_pipeline_options)
def transcribe_audio(audio_file, task="transcribe", language="auto", return_timestamps=False):
    """Transcribe or translate an audio file with the module-level ``pipe``.

    Args:
        audio_file: Audio input handed straight to the pipeline (the upload
            widget supplies a file path), or ``None`` when nothing was given.
        task: ``"transcribe"`` or ``"translate"``; forwarded to generation.
        language: Language code, or ``"auto"`` to forward ``None`` so no
            language is forced.
        return_timestamps: When true, request timestamps and return one
            ``[start - end] text`` line per chunk.

    Returns:
        str: The transcription text, the timestamped lines, or a readable
        message when no input was given or processing failed.
    """
    if audio_file is None:
        return "No audio file provided."

    def format_seconds(value):
        # A chunk boundary can be None (e.g. no closing timestamp was
        # predicted for the final chunk); formatting None with ":.2f" would
        # raise and discard the whole transcription.
        return "?" if value is None else f"{value:.2f}s"

    try:
        result = pipe(
            audio_file,
            return_timestamps=return_timestamps,
            generate_kwargs={
                "task": task,
                "language": None if language == "auto" else language,
            },
        )

        if return_timestamps and "chunks" in result:
            lines = []
            for chunk in result["chunks"]:
                start, end = chunk["timestamp"]
                lines.append(
                    f"[{format_seconds(start)} - {format_seconds(end)}] {chunk['text']}"
                )
            return "\n".join(lines)
        return result["text"]
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing audio: {e}"
def transcribe_microphone(audio_data, task="transcribe", language="auto", return_timestamps=False):
    """Transcribe or translate a microphone recording with the module-level ``pipe``.

    Args:
        audio_data: ``(sample_rate, samples)`` tuple from the recording
            widget, or ``None`` when nothing was recorded.
        task: ``"transcribe"`` or ``"translate"``; forwarded to generation.
        language: Language code, or ``"auto"`` to forward ``None`` so no
            language is forced.
        return_timestamps: When true, request timestamps and return one
            ``[start - end] text`` line per chunk.

    Returns:
        str: The transcription text, the timestamped lines, or a readable
        message when nothing was recorded or processing failed.
    """
    if audio_data is None:
        return "No audio recorded."

    def format_seconds(value):
        # A chunk boundary can be None (e.g. no closing timestamp was
        # predicted for the final chunk); formatting None with ":.2f" would
        # raise and discard the whole transcription.
        return "?" if value is None else f"{value:.2f}s"

    try:
        sample_rate, audio_array = audio_data
        audio_array = np.asarray(audio_array, dtype=np.float32)

        # An empty buffer has no peak to normalise by; treat it as "nothing
        # recorded" rather than surfacing a numpy reduction error.
        if audio_array.size == 0:
            return "No audio recorded."

        # Down-mix multi-channel input to mono.
        # NOTE(review): assumes a (samples, channels) layout - confirm
        # against the recording widget's documented output.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Peak-normalise, but leave pure silence untouched: dividing by a
        # zero peak would fill the signal with NaN.
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            audio_array = audio_array / peak

        result = pipe(
            {"array": audio_array, "sampling_rate": sample_rate},
            return_timestamps=return_timestamps,
            generate_kwargs={
                "task": task,
                "language": None if language == "auto" else language,
            },
        )

        if return_timestamps and "chunks" in result:
            lines = []
            for chunk in result["chunks"]:
                start, end = chunk["timestamp"]
                lines.append(
                    f"[{format_seconds(start)} - {format_seconds(end)}] {chunk['text']}"
                )
            return "\n".join(lines)
        return result["text"]
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing audio: {e}"
# Display label -> language code, in the order shown in the dropdowns.
# "auto" is the sentinel the transcription callbacks turn into ``None``.
_LANGUAGE_CODES = {
    "Auto Detect": "auto",
    "English": "en",
    "Chinese": "zh",
    "German": "de",
    "Spanish": "es",
    "Russian": "ru",
    "Korean": "ko",
    "French": "fr",
    "Japanese": "ja",
    "Portuguese": "pt",
    "Turkish": "tr",
    "Polish": "pl",
    "Catalan": "ca",
    "Dutch": "nl",
    "Arabic": "ar",
    "Swedish": "sv",
    "Italian": "it",
    "Indonesian": "id",
    "Hindi": "hi",
    "Finnish": "fi",
    "Vietnamese": "vi",
    "Hebrew": "he",
    "Ukrainian": "uk",
    "Greek": "el",
    "Malay": "ms",
    "Czech": "cs",
    "Romanian": "ro",
    "Danish": "da",
    "Hungarian": "hu",
    "Tamil": "ta",
    "Norwegian": "no",
    "Thai": "th",
    "Urdu": "ur",
    "Croatian": "hr",
    "Bulgarian": "bg",
    "Lithuanian": "lt",
    "Latin": "la",
}

# List of (label, code) tuples consumed by the "Source Language" dropdowns.
languages = list(_LANGUAGE_CODES.items())
# Two-tab UI: one tab for uploaded files, one for microphone recordings.
# Each tab has its own task/language/timestamp controls and output box.
with gr.Blocks(title="Whisper Large V3 Turbo - Speech to Text") as demo:
    gr.Markdown("# 🎤 Whisper Large V3 Turbo - Speech to Text")
    gr.Markdown("Upload an audio file or record directly to get high-quality transcription using OpenAI's Whisper Large V3 Turbo model.")

    with gr.Tab("Upload Audio File"):
        with gr.Row():
            with gr.Column():
                # "filepath" hands transcribe_audio a path on disk.
                audio_file = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                )
                task_file = gr.Radio(
                    choices=[("Transcribe", "transcribe"), ("Translate to English", "translate")],
                    value="transcribe",
                    label="Task",
                )
                language_file = gr.Dropdown(
                    choices=languages,
                    value="auto",
                    label="Source Language",
                )
                timestamps_file = gr.Checkbox(
                    label="Return Timestamps",
                    value=False,
                )
                submit_file = gr.Button("Transcribe Audio File", variant="primary")
            with gr.Column():
                output_file = gr.Textbox(
                    label="Transcription Result",
                    lines=10,
                    max_lines=20,
                )

    with gr.Tab("Record Audio"):
        with gr.Row():
            with gr.Column():
                # transcribe_microphone unpacks a (sample_rate, array) tuple,
                # so request the numpy format explicitly instead of relying
                # on the component default.
                audio_mic = gr.Audio(
                    label="Record Audio",
                    sources=["microphone"],
                    type="numpy",
                )
                task_mic = gr.Radio(
                    choices=[("Transcribe", "transcribe"), ("Translate to English", "translate")],
                    value="transcribe",
                    label="Task",
                )
                language_mic = gr.Dropdown(
                    choices=languages,
                    value="auto",
                    label="Source Language",
                )
                timestamps_mic = gr.Checkbox(
                    label="Return Timestamps",
                    value=False,
                )
                submit_mic = gr.Button("Transcribe Recording", variant="primary")
            with gr.Column():
                output_mic = gr.Textbox(
                    label="Transcription Result",
                    lines=10,
                    max_lines=20,
                )

    # Wire each button to its callback; input order matches the callback
    # signature (audio, task, language, return_timestamps).
    submit_file.click(
        transcribe_audio,
        inputs=[audio_file, task_file, language_file, timestamps_file],
        outputs=output_file,
    )
    submit_mic.click(
        transcribe_microphone,
        inputs=[audio_mic, task_mic, language_mic, timestamps_mic],
        outputs=output_mic,
    )

    gr.Markdown("### Features:")
    gr.Markdown("- **High Accuracy**: Powered by Whisper Large V3 Turbo model")
    gr.Markdown("- **CPU Optimized**: Optimized for 2-core CPU with 16GB RAM")
    gr.Markdown("- **Multi-language**: Supports 99+ languages")
    gr.Markdown("- **Translation**: Can translate speech to English")
    # The callbacks only pass a boolean return_timestamps, so the output is
    # per-segment; word-level timestamps are not offered by this app.
    gr.Markdown("- **Timestamps**: Optional segment-level timestamps")
    gr.Markdown("- **Memory Efficient**: Uses chunked processing for better performance")
# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    # Bind to 0.0.0.0 on port 7860 with show_error enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)