huzy0 commited on
Commit
06285a6
·
verified ·
1 Parent(s): a2a39af

Upload model

Browse files
Files changed (1) hide show
  1. modeling_bestrq_conformer.py +112 -10
modeling_bestrq_conformer.py CHANGED
@@ -706,6 +706,46 @@ class MeralionBestRqModel(PreTrainedModel):
706
  return_dict: Optional[bool] = None,
707
  ctc_decoder: Optional[nn.Module] = None,
708
  ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
710
  output_hidden_states = (
711
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -800,11 +840,42 @@ class MeralionBestRqModelForCTC(PreTrainedModel):
800
  labels: Optional[torch.Tensor] = None,
801
  ) -> Union[Tuple, CausalLMOutput]:
802
  r"""
803
- labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
804
- Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
805
- the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
806
- All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
807
- config.vocab_size - 1]`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808
  """
809
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
810
 
@@ -959,11 +1030,42 @@ class MeralionBestRqModelForLSTMCTC(PreTrainedModel):
959
  labels: Optional[torch.Tensor] = None,
960
  ) -> Union[Tuple, CausalLMOutput]:
961
  r"""
962
- labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
963
- Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
964
- the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
965
- All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
966
- config.vocab_size - 1]`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
967
  """
968
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
969
 
 
706
  return_dict: Optional[bool] = None,
707
  ctc_decoder: Optional[nn.Module] = None,
708
  ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
709
+ r"""
710
+ Performs the forward pass of the BEST-RQ Conformer model.
711
+
712
+ Args:
713
+ input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
714
+ Float values of mel features extracted from the raw speech signal.
715
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
716
+ Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
717
+ tokens that are not masked and 0 indicates tokens that are masked.
718
+ mask_time_indices (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
719
+ Currently unused.
720
+ output_attentions (`bool`, *optional*):
721
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
722
+ returned tensors for more detail.
723
+ output_hidden_states (`bool`, *optional*):
724
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
725
+ for more detail.
726
+ return_dict (`bool`, *optional*):
727
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
728
+ ctc_decoder (`nn.Module`, *optional*):
729
+ A CTC decoder module that can be used for self-conditioning. If provided, the model will apply this
730
+ decoder at intermediate layers and use the output to condition the subsequent layers.
731
+
732
+ Returns:
733
+ [`Wav2Vec2BaseModelOutput`] or `tuple`:
734
+ A [`Wav2Vec2BaseModelOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
735
+ comprising the following elements:
736
+ - **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
737
+ Sequence of hidden-states at the output of the last layer of the model.
738
+ - **extract_features** (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim)`):
739
+ Sequence of robustly extracted features from the CNN feature extractor.
740
+ - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
741
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
742
+ of shape `(batch_size, sequence_length, hidden_size)`.
743
+ - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
744
+ Tuple of `torch.FloatTensor` (one for each layer) of shape
745
+ `(batch_size, num_heads, sequence_length, sequence_length)`.
746
+ - **output_lengths** (`torch.LongTensor` of shape `(batch_size,)`):
747
+ The length of each sequence after the convolutional subsampling.
748
+ """
749
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
750
  output_hidden_states = (
751
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
840
  labels: Optional[torch.Tensor] = None,
841
  ) -> Union[Tuple, CausalLMOutput]:
842
  r"""
843
+ Performs the forward pass of the BEST-RQ Conformer model with a CTC head.
844
+
845
+ Args:
846
+ input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
847
+ Float values of mel features extracted from the raw speech signal.
848
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
849
+ Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
850
+ tokens that are not masked and 0 indicates tokens that are masked.
851
+ output_attentions (`bool`, *optional*):
852
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
853
+ returned tensors for more detail.
854
+ output_hidden_states (`bool`, *optional*):
855
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
856
+ for more detail.
857
+ return_dict (`bool`, *optional*):
858
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
859
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
860
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal
861
+ to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
862
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
863
+ config.vocab_size - 1]`.
864
+
865
+ Returns:
866
+ [`CausalLMOutput`] or `tuple`:
867
+ A [`CausalLMOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
868
+ comprising the following elements:
869
+ - **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
870
+ CTC loss.
871
+ - **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
872
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
873
+ - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
874
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
875
+ of shape `(batch_size, sequence_length, hidden_size)`.
876
+ - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
877
+ Tuple of `torch.FloatTensor` (one for each layer) of shape
878
+ `(batch_size, num_heads, sequence_length, sequence_length)`.
879
  """
880
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
881
 
 
1030
  labels: Optional[torch.Tensor] = None,
1031
  ) -> Union[Tuple, CausalLMOutput]:
1032
  r"""
1033
+ Performs the forward pass of the BEST-RQ Conformer model with an LSTM-CTC head.
1034
+
1035
+ Args:
1036
+ input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
1037
+ Float values of mel features extracted from the raw speech signal.
1038
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1039
+ Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
1040
+ tokens that are not masked and 0 indicates tokens that are masked.
1041
+ output_attentions (`bool`, *optional*):
1042
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1043
+ returned tensors for more detail.
1044
+ output_hidden_states (`bool`, *optional*, defaults to `True`):
1045
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1046
+ for more detail.
1047
+ return_dict (`bool`, *optional*):
1048
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1049
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
1050
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal
1051
+ to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
1052
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
1053
+ config.vocab_size - 1]`.
1054
+
1055
+ Returns:
1056
+ [`CausalLMOutput`] or `tuple`:
1057
+ A [`CausalLMOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
1058
+ comprising the following elements:
1059
+ - **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
1060
+ CTC loss.
1061
+ - **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
1062
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
1063
+ - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
1064
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1065
+ of shape `(batch_size, sequence_length, hidden_size)`.
1066
+ - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
1067
+ Tuple of `torch.FloatTensor` (one for each layer) of shape
1068
+ `(batch_size, num_heads, sequence_length, sequence_length)`.
1069
  """
1070
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1071