Upload model
Browse files- modeling_bestrq_conformer.py +112 -10
modeling_bestrq_conformer.py
CHANGED
|
@@ -706,6 +706,46 @@ class MeralionBestRqModel(PreTrainedModel):
|
|
| 706 |
return_dict: Optional[bool] = None,
|
| 707 |
ctc_decoder: Optional[nn.Module] = None,
|
| 708 |
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 710 |
output_hidden_states = (
|
| 711 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
@@ -800,11 +840,42 @@ class MeralionBestRqModelForCTC(PreTrainedModel):
|
|
| 800 |
labels: Optional[torch.Tensor] = None,
|
| 801 |
) -> Union[Tuple, CausalLMOutput]:
|
| 802 |
r"""
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
"""
|
| 809 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 810 |
|
|
@@ -959,11 +1030,42 @@ class MeralionBestRqModelForLSTMCTC(PreTrainedModel):
|
|
| 959 |
labels: Optional[torch.Tensor] = None,
|
| 960 |
) -> Union[Tuple, CausalLMOutput]:
|
| 961 |
r"""
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
|
| 966 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 967 |
"""
|
| 968 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 969 |
|
|
|
|
| 706 |
return_dict: Optional[bool] = None,
|
| 707 |
ctc_decoder: Optional[nn.Module] = None,
|
| 708 |
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
|
| 709 |
+
r"""
|
| 710 |
+
Performs the forward pass of the BEST-RQ Conformer model.
|
| 711 |
+
|
| 712 |
+
Args:
|
| 713 |
+
input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
|
| 714 |
+
Float values of mel features extracted from the raw speech signal.
|
| 715 |
+
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 716 |
+
Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
|
| 717 |
+
tokens that are not masked and 0 indicates tokens that are masked.
|
| 718 |
+
mask_time_indices (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 719 |
+
Currently unused.
|
| 720 |
+
output_attentions (`bool`, *optional*):
|
| 721 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 722 |
+
returned tensors for more detail.
|
| 723 |
+
output_hidden_states (`bool`, *optional*):
|
| 724 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| 725 |
+
for more detail.
|
| 726 |
+
return_dict (`bool`, *optional*):
|
| 727 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 728 |
+
ctc_decoder (`nn.Module`, *optional*):
|
| 729 |
+
A CTC decoder module that can be used for self-conditioning. If provided, the model will apply this
|
| 730 |
+
decoder at intermediate layers and use the output to condition the subsequent layers.
|
| 731 |
+
|
| 732 |
+
Returns:
|
| 733 |
+
[`Wav2Vec2BaseModelOutput`] or `tuple`:
|
| 734 |
+
A [`Wav2Vec2BaseModelOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
|
| 735 |
+
comprising the following elements:
|
| 736 |
+
- **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 737 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
| 738 |
+
- **extract_features** (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim)`):
|
| 739 |
+
Sequence of robustly extracted features from the CNN feature extractor.
|
| 740 |
+
- **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
| 741 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
| 742 |
+
of shape `(batch_size, sequence_length, hidden_size)`.
|
| 743 |
+
- **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
|
| 744 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape
|
| 745 |
+
`(batch_size, num_heads, sequence_length, sequence_length)`.
|
| 746 |
+
- **output_lengths** (`torch.LongTensor` of shape `(batch_size,)`):
|
| 747 |
+
The length of each sequence after the convolutional subsampling.
|
| 748 |
+
"""
|
| 749 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 750 |
output_hidden_states = (
|
| 751 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
|
|
| 840 |
labels: Optional[torch.Tensor] = None,
|
| 841 |
) -> Union[Tuple, CausalLMOutput]:
|
| 842 |
r"""
|
| 843 |
+
Performs the forward pass of the BEST-RQ Conformer model with a CTC head.
|
| 844 |
+
|
| 845 |
+
Args:
|
| 846 |
+
input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
|
| 847 |
+
Float values of mel features extracted from the raw speech signal.
|
| 848 |
+
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 849 |
+
Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
|
| 850 |
+
tokens that are not masked and 0 indicates tokens that are masked.
|
| 851 |
+
output_attentions (`bool`, *optional*):
|
| 852 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 853 |
+
returned tensors for more detail.
|
| 854 |
+
output_hidden_states (`bool`, *optional*):
|
| 855 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| 856 |
+
for more detail.
|
| 857 |
+
return_dict (`bool`, *optional*):
|
| 858 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 859 |
+
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
|
| 860 |
+
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal
|
| 861 |
+
to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
|
| 862 |
+
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
|
| 863 |
+
config.vocab_size - 1]`.
|
| 864 |
+
|
| 865 |
+
Returns:
|
| 866 |
+
[`CausalLMOutput`] or `tuple`:
|
| 867 |
+
A [`CausalLMOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
|
| 868 |
+
comprising the following elements:
|
| 869 |
+
- **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
| 870 |
+
CTC loss.
|
| 871 |
+
- **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
| 872 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 873 |
+
- **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
| 874 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
| 875 |
+
of shape `(batch_size, sequence_length, hidden_size)`.
|
| 876 |
+
- **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
|
| 877 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape
|
| 878 |
+
`(batch_size, num_heads, sequence_length, sequence_length)`.
|
| 879 |
"""
|
| 880 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 881 |
|
|
|
|
| 1030 |
labels: Optional[torch.Tensor] = None,
|
| 1031 |
) -> Union[Tuple, CausalLMOutput]:
|
| 1032 |
r"""
|
| 1033 |
+
Performs the forward pass of the BEST-RQ Conformer model with an LSTM-CTC head.
|
| 1034 |
+
|
| 1035 |
+
Args:
|
| 1036 |
+
input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
|
| 1037 |
+
Float values of mel features extracted from the raw speech signal.
|
| 1038 |
+
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 1039 |
+
Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
|
| 1040 |
+
tokens that are not masked and 0 indicates tokens that are masked.
|
| 1041 |
+
output_attentions (`bool`, *optional*):
|
| 1042 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 1043 |
+
returned tensors for more detail.
|
| 1044 |
+
output_hidden_states (`bool`, *optional*, defaults to `True`):
|
| 1045 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| 1046 |
+
for more detail.
|
| 1047 |
+
return_dict (`bool`, *optional*):
|
| 1048 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 1049 |
+
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
|
| 1050 |
+
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal
|
| 1051 |
+
to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
|
| 1052 |
+
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
|
| 1053 |
+
config.vocab_size - 1]`.
|
| 1054 |
+
|
| 1055 |
+
Returns:
|
| 1056 |
+
[`CausalLMOutput`] or `tuple`:
|
| 1057 |
+
A [`CausalLMOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
|
| 1058 |
+
comprising the following elements:
|
| 1059 |
+
- **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
| 1060 |
+
CTC loss.
|
| 1061 |
+
- **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
| 1062 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 1063 |
+
- **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
| 1064 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
| 1065 |
+
of shape `(batch_size, sequence_length, hidden_size)`.
|
| 1066 |
+
- **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
|
| 1067 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape
|
| 1068 |
+
`(batch_size, num_heads, sequence_length, sequence_length)`.
|
| 1069 |
"""
|
| 1070 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1071 |
|