TuKoResearch
/

WavTokenizer

"""
WavTokenizer configuration for Hugging Face Transformers.

This configuration class defines all the hyperparameters for WavTokenizer,
an acoustic discrete codec tokenizer for audio language modeling.
"""
from typing import Optional

from transformers import PretrainedConfig
class WavTokenizerConfig(PretrainedConfig):
    """
    Configuration class for the WavTokenizer model.

    WavTokenizer is a discrete acoustic codec model that compresses audio
    into discrete tokens (40 or 75 tokens per second) while maintaining high
    reconstruction quality.

    This class only stores the hyperparameters it is given; it performs no
    validation and derives no value from another, except that `latent_dim`
    falls back to `feature_dim` when it is not supplied.

    Args:
        sample_rate (`int`, *optional*, defaults to 24000):
            The sample rate of input audio.
        n_fft (`int`, *optional*, defaults to 1280):
            FFT size for STFT.
        hop_length (`int`, *optional*, defaults to 320):
            Hop length for STFT (determines frame rate: 24000/320 = 75 fps).
        n_mels (`int`, *optional*, defaults to 128):
            Number of mel filterbank channels.
        padding (`str`, *optional*, defaults to "center"):
            Padding mode for STFT ("center" or "same").
        feature_dim (`int`, *optional*, defaults to 512):
            Dimension of the feature backbone.
        encoder_dim (`int`, *optional*, defaults to 64):
            Dimension of encoder output.
        encoder_rates (`list[int]`, *optional*, defaults to [8, 5, 4, 2]):
            Downsampling rates for the encoder. The given sequence is copied
            into a new list, so later changes to the caller's object do not
            affect the configuration.
        latent_dim (`int`, *optional*):
            Dimension of the latent space. When `None`, the value of
            `feature_dim` is used.
        codebook_size (`int`, *optional*, defaults to 4096):
            Size of the VQ codebook.
        codebook_dim (`int`, *optional*, defaults to 8):
            Dimension of codebook vectors.
        num_quantizers (`int`, *optional*, defaults to 1):
            Number of residual vector quantizers.
        backbone_type (`str`, *optional*, defaults to "vocos"):
            Type of decoder backbone ("vocos").
        backbone_dim (`int`, *optional*, defaults to 512):
            Dimension of the decoder backbone.
        backbone_num_blocks (`int`, *optional*, defaults to 8):
            Number of ConvNeXt blocks in the backbone.
        backbone_intermediate_dim (`int`, *optional*, defaults to 1536):
            Intermediate dimension in ConvNeXt blocks.
        backbone_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for depthwise convolutions.
        backbone_layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.
        head_type (`str`, *optional*, defaults to "istft"):
            Type of waveform synthesis head ("istft").
        head_dim (`int`, *optional*, defaults to 1025):
            Output dimension for the head. The value is stored exactly as
            given and is NOT computed from `n_fft`. Note that the default of
            1025 equals `n_fft // 2 + 1` for `n_fft = 2048`, whereas the
            default `n_fft = 1280` would give 641; pass `head_dim` explicitly
            if the two must agree.
        use_attention (`bool`, *optional*, defaults to True):
            Whether to use attention in the decoder.
        attention_dim (`int`, *optional*, defaults to 512):
            Dimension for attention layers.
        attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        attention_layers (`int`, *optional*, defaults to 1):
            Number of attention layers.
        **kwargs:
            Forwarded unchanged to `PretrainedConfig.__init__`.
    """

    model_type = "wavtokenizer"

    def __init__(
        self,
        # Audio parameters
        sample_rate: int = 24000,
        n_fft: int = 1280,
        hop_length: int = 320,
        n_mels: int = 128,
        padding: str = "center",
        # Feature dimensions
        feature_dim: int = 512,
        encoder_dim: int = 64,
        encoder_rates: Optional[list] = None,
        latent_dim: Optional[int] = None,
        # Quantizer parameters
        codebook_size: int = 4096,
        codebook_dim: int = 8,
        num_quantizers: int = 1,
        # Backbone parameters
        backbone_type: str = "vocos",
        backbone_dim: int = 512,
        backbone_num_blocks: int = 8,
        backbone_intermediate_dim: int = 1536,
        backbone_kernel_size: int = 7,
        backbone_layer_scale_init_value: float = 1e-6,
        # Head parameters
        head_type: str = "istft",
        head_dim: int = 1025,
        # Attention parameters
        use_attention: bool = True,
        attention_dim: int = 512,
        attention_heads: int = 8,
        attention_layers: int = 1,
        **kwargs
    ) -> None:
        super().__init__(**kwargs)

        # Audio
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.padding = padding

        # Feature dimensions
        self.feature_dim = feature_dim
        self.encoder_dim = encoder_dim
        # Build a fresh list in both branches: the default is never shared
        # between instances, and a caller-supplied sequence is copied so that
        # mutating it afterwards cannot silently alter this configuration
        # (a tuple is also normalised to the list form JSON round-trips to).
        self.encoder_rates = (
            [8, 5, 4, 2] if encoder_rates is None else list(encoder_rates)
        )
        # An unspecified latent size mirrors the feature size.
        self.latent_dim = feature_dim if latent_dim is None else latent_dim

        # Quantizer
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # Backbone
        self.backbone_type = backbone_type
        self.backbone_dim = backbone_dim
        self.backbone_num_blocks = backbone_num_blocks
        self.backbone_intermediate_dim = backbone_intermediate_dim
        self.backbone_kernel_size = backbone_kernel_size
        self.backbone_layer_scale_init_value = backbone_layer_scale_init_value

        # Head
        self.head_type = head_type
        self.head_dim = head_dim

        # Attention
        self.use_attention = use_attention
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.attention_layers = attention_layers

    @property
    def vocab_size(self) -> int:
        """Return the vocabulary size, which is the codebook size."""
        return self.codebook_size

    @property
    def frame_rate(self) -> float:
        """
        Return the frame rate (tokens per second).

        Computed as `sample_rate / hop_length` using true division, so the
        result is a float (75.0 with the defaults). Raises
        `ZeroDivisionError` if `hop_length` is 0.
        """
        return self.sample_rate / self.hop_length