Spaces:
Configuration error
Configuration error
| import torch | |
| import spaces | |
| import gradio as gr | |
| import os | |
| from pyannote.audio import Pipeline | |
# Load the pretrained diarization pipeline once, at import time.  Any failure
# (missing HUGGINGFACE_READ_TOKEN, download/auth error, device error, ...) is
# reported on stdout and leaves ``pipeline`` set to None so that the request
# handlers below can answer with an error message instead of crashing.
try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ["HUGGINGFACE_READ_TOKEN"],
    )
    # Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    pipeline.to(device)
except Exception as e:
    print(f"Error initializing pipeline: {e}")
    pipeline = None
def save_audio(audio):
    """Copy the uploaded audio file to a private temporary file.

    Each call gets its own uniquely named file, so simultaneous requests
    cannot overwrite each other's audio (a single shared ``temp.wav`` could).
    The original file extension is kept; ``.wav`` is used when there is none.

    Args:
        audio: Path of the uploaded audio file.

    Returns:
        The path of the temporary copy, or the string
        ``"Error: Pipeline not initialized"`` when the global ``pipeline``
        is None.

    Raises:
        OSError: If the upload cannot be read or the copy cannot be written.
    """
    if pipeline is None:
        return "Error: Pipeline not initialized"

    # Local imports keep this block self-contained.
    import shutil
    import tempfile

    suffix = os.path.splitext(audio)[1] or ".wav"
    fd, temp_path = tempfile.mkstemp(prefix="diarize_", suffix=suffix)
    # mkstemp hands back an open descriptor; only the reserved name is needed.
    os.close(fd)
    try:
        # Stream the bytes across instead of loading the whole file in memory.
        shutil.copyfile(audio, temp_path)
    except Exception:
        # Do not leave an empty or partial file behind when the copy fails.
        os.remove(temp_path)
        raise
    return temp_path
def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
    """Run the diarization pipeline on ``temp_file`` and return its text form.

    Args:
        temp_file: Path of the audio file to process.  The file is deleted
            once processing has finished, whether it succeeded or failed.
        num_speakers: Exact number of speakers; 0 or None means "not set".
        min_speakers: Lower bound on the number of speakers; 0/None = not set.
        max_speakers: Upper bound on the number of speakers; 0/None = not set.

    Returns:
        ``str(diarization)`` on success, otherwise a message that starts with
        ``"Error"``.
    """
    # NOTE(review): `spaces` is imported by this file but never used; if this
    # app is meant to run on ZeroGPU hardware, this function presumably needs
    # the `@spaces.GPU` decorator -- confirm against the deployment target.
    if pipeline is None:
        return "Error: Pipeline not initialized"

    try:
        requested = {
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
        }
        params = {}
        for name, value in requested.items():
            # The values come from number inputs and may be None (field left
            # blank) or floats; normalise them to ints before forwarding.
            count = int(value or 0)
            if count > 0:
                params[name] = count
        diarization = pipeline(temp_file, **params)
    except Exception as e:
        return f"Error processing audio: {e}"
    finally:
        # Always drop the temporary file, including when the pipeline failed;
        # cleanup is best-effort and must not mask the real result.
        try:
            os.remove(temp_file)
        except (OSError, TypeError):
            pass

    # Return the diarization output
    return str(diarization)
def timestamp_to_seconds(timestamp):
    """Convert an ``H:M:S`` timestamp string into a number of seconds.

    Args:
        timestamp: Text made of exactly three colon-separated numeric fields
            (hours, minutes, seconds); each field may be fractional.

    Returns:
        The total number of seconds as a float, or None (after printing a
        diagnostic) when the text does not have three numeric fields.
    """
    fields = (float(field) for field in timestamp.split(':'))
    try:
        hours, minutes, seconds = fields
    except ValueError as e:
        print(f"Error converting timestamp to seconds: '{timestamp}'. Error: {e}")
        return None
    return 3600 * hours + 60 * minutes + seconds
def generate_labels_from_diarization(diarization_output):
    """Write a tab-separated label file from diarization text.

    Each input line is expected to look like
    ``[ <start> --> <end>] ... <label>``.  For every line that parses, one
    row ``<start seconds>\\t<end seconds>\\t<label>`` is written to
    ``labels.txt``.  Lines that cannot be parsed -- including lines whose
    timestamps cannot be converted -- are reported on stdout and skipped, so
    the file never contains ``None`` in place of a time.

    Args:
        diarization_output: The diarization result as text, one segment per
            line.

    Returns:
        The path of the label file, or None when no line could be processed
        or the file could not be written.
    """
    successful_lines = 0  # Counter for successfully processed lines
    labels_path = 'labels.txt'
    try:
        with open(labels_path, 'w') as outfile:
            for raw_line in diarization_output.strip().split('\n'):
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no segment; nothing to report.
                    continue
                try:
                    # Drop the leading '[' (and the last character, which is
                    # irrelevant here), then split the two timestamps apart.
                    parts = line[1:-1].split(' --> ')
                    start_time = parts[0].strip()
                    end_time = parts[1].split(']')[0].strip()
                    label = line.split()[-1]  # The last word is the label
                    start_seconds = timestamp_to_seconds(start_time)
                    end_seconds = timestamp_to_seconds(end_time)
                    if start_seconds is None or end_seconds is None:
                        # timestamp_to_seconds signals failure with None.
                        raise ValueError("could not convert timestamps")
                    outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
                    successful_lines += 1
                except (IndexError, ValueError) as e:
                    # Only parse problems are per-line; I/O errors fall
                    # through to the outer handler below.
                    print(f"Error processing line: '{line}'. Error: {e}")
        print(f"Processed {successful_lines} lines successfully.")
        return labels_path if successful_lines > 0 else None
    except Exception as e:
        print(f"Cannot write to file '{labels_path}'. Error: {e}")
        return None
def process_audio(audio, num_speakers, min_speakers, max_speakers):
    """Diarize an uploaded audio file and build its label file.

    Args:
        audio: Path of the uploaded audio file.
        num_speakers: Exact number of speakers (0 = not set).
        min_speakers: Minimum number of speakers (0 = not set).
        max_speakers: Maximum number of speakers (0 = not set).

    Returns:
        A ``(text, label_file)`` pair.  ``text`` is the diarization output or
        an error message starting with "Error"; ``label_file`` is the path of
        the generated label file, or None when diarization failed or no
        labels could be produced.
    """
    saved_path = save_audio(audio)
    outcome = diarize_audio(saved_path, num_speakers, min_speakers, max_speakers)
    if not outcome.startswith("Error"):
        return outcome, generate_labels_from_diarization(outcome)
    # No label file is offered when diarization reported an error.
    return outcome, None
# Introductory text shown at the top of the page.
_INTRO_MARKDOWN = """
# 🗣️Pyannote Speaker Diarization 3.1🗣️
This model takes an audio file as input and outputs the diarization of the speakers in the audio.
Please upload an audio file and adjust the parameters as needed.
The maximum length of the audio file that can be processed depends based on the hardware it's running on. If you are on the ZeroGPU HuggingFace Space, it's around **35-40 minutes**.
If you find this space helpful, please ❤ it.
Join my server for support and open source AI discussion: https://discord.gg/osai
IF YOU LEAVE ALL THE PARAMETERS BELOW TO 0, IT WILL BE ON AUTO MODE, AUTOMATICALLY DETECTING THE SPEAKERS, ELSE USE THE ONES BELOW FOR MORE COSTUMIZATION & BETTER RESULTS
"""

# Build the web UI.  Components are created in the order they appear on the
# page: audio upload, speaker-count inputs, the button, then the two outputs.
with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
    num_speakers_input = gr.Number(
        label="Number of Speakers",
        info="Use it only if you know the number of speakers in advance, else leave it to 0 and use the parameters below",
        value=0,
    )
    gr.Markdown("Use the following parameters only if you don't know the number of speakers, you can set lower and/or upper bounds on the number of speakers, if instead you know it, leave the following parameters to 0 and use the one above")
    min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
    max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
    process_button = gr.Button("Process")
    diarization_output = gr.Textbox(label="Diarization Output")
    label_file_link = gr.File(label="Download DAW Labels")

    # Clicking the button sends the four inputs to process_audio and shows
    # its two return values in the text box and the download slot.
    click_inputs = [
        audio_input,
        num_speakers_input,
        min_speakers_input,
        max_speakers_input,
    ]
    click_outputs = [diarization_output, label_file_link]
    process_button.click(
        fn=process_audio,
        inputs=click_inputs,
        outputs=click_outputs,
    )

demo.launch(share=False)