Upload model
Browse files- modeling_bestrq_conformer.py +112 -10
modeling_bestrq_conformer.py
CHANGED
|
@@ -706,6 +706,46 @@ class MeralionBestRqModel(PreTrainedModel):
|
|
| 706 |
return_dict: Optional[bool] = None,
|
| 707 |
ctc_decoder: Optional[nn.Module] = None,
|
| 708 |
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 710 |
output_hidden_states = (
|
| 711 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
@@ -800,11 +840,42 @@ class MeralionBestRqModelForCTC(PreTrainedModel):
|
|
| 800 |
labels: Optional[torch.Tensor] = None,
|
| 801 |
) -> Union[Tuple, CausalLMOutput]:
|
| 802 |
r"""
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
"""
|
| 809 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 810 |
|
|
@@ -959,11 +1030,42 @@ class MeralionBestRqModelForLSTMCTC(PreTrainedModel):
|
|
| 959 |
labels: Optional[torch.Tensor] = None,
|
| 960 |
) -> Union[Tuple, CausalLMOutput]:
|
| 961 |
r"""
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
|
| 966 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 967 |
"""
|
| 968 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 969 |
|
|
|
|
| 706 |
return_dict: Optional[bool] = None,
|
| 707 |
ctc_decoder: Optional[nn.Module] = None,
|
| 708 |
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
|
| 709 |
+
r"""
|
| 710 |
+
Performs the forward pass of the BEST-RQ Conformer model.
|
| 711 |
+
|
| 712 |
+
Args:
|
| 713 |
+
input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
|
| 714 |
+
Float values of mel features extracted from the raw speech signal.
|
| 715 |
+
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 716 |
+
Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
|
| 717 |
+
tokens that are not masked and 0 indicates tokens that are masked.
|
| 718 |
+
mask_time_indices (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 719 |
+
Currently unused.
|
| 720 |
+
output_attentions (`bool`, *optional*):
|
| 721 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 722 |
+
returned tensors for more detail.
|
| 723 |
+
output_hidden_states (`bool`, *optional*):
|
| 724 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| 725 |
+
for more detail.
|
| 726 |
+
return_dict (`bool`, *optional*):
|
| 727 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 728 |
+
ctc_decoder (`nn.Module`, *optional*):
|
| 729 |
+
A CTC decoder module that can be used for self-conditioning. If provided, the model will apply this
|
| 730 |
+
decoder at intermediate layers and use the output to condition the subsequent layers.
|
| 731 |
+
|
| 732 |
+
Returns:
|
| 733 |
+
[`Wav2Vec2BaseModelOutput`] or `tuple`:
|
| 734 |
+
A [`Wav2Vec2BaseModelOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
|
| 735 |
+
comprising the following elements:
|
| 736 |
+
- **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 737 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
| 738 |
+
- **extract_features** (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim)`):
|
| 739 |
+
Sequence of robustly extracted features from the CNN feature extractor.
|
| 740 |
+
- **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
| 741 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
| 742 |
+
of shape `(batch_size, sequence_length, hidden_size)`.
|
| 743 |
+
- **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
|
| 744 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape
|
| 745 |
+
`(batch_size, num_heads, sequence_length, sequence_length)`.
|
| 746 |
+
- **output_lengths** (`torch.LongTensor` of shape `(batch_size,)`):
|
| 747 |
+
The length of each sequence after the convolutional subsampling.
|
| 748 |
+
"""
|
| 749 |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 750 |
output_hidden_states = (
|
| 751 |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
|
|
| 840 |
labels: Optional[torch.Tensor] = None,
|
| 841 |
) -> Union[Tuple, CausalLMOutput]:
|
| 842 |
r"""
|
| 843 |
+
Performs the forward pass of the BEST-RQ Conformer model with a CTC head.
|
| 844 |
+
|
| 845 |
+
Args:
|
| 846 |
+
input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
|
| 847 |
+
Float values of mel features extracted from the raw speech signal.
|
| 848 |
+
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 849 |
+
Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
|
| 850 |
+
tokens that are not masked and 0 indicates tokens that are masked.
|
| 851 |
+
output_attentions (`bool`, *optional*):
|
| 852 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 853 |
+
returned tensors for more detail.
|
| 854 |
+
output_hidden_states (`bool`, *optional*):
|
| 855 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| 856 |
+
for more detail.
|
| 857 |
+
return_dict (`bool`, *optional*):
|
| 858 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 859 |
+
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
|
| 860 |
+
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal
|
| 861 |
+
to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
|
| 862 |
+
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
|
| 863 |
+
config.vocab_size - 1]`.
|
| 864 |
+
|
| 865 |
+
Returns:
|
| 866 |
+
[`CausalLMOutput`] or `tuple`:
|
| 867 |
+
A [`CausalLMOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
|
| 868 |
+
comprising the following elements:
|
| 869 |
+
- **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
| 870 |
+
CTC loss.
|
| 871 |
+
- **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
| 872 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 873 |
+
- **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
| 874 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
| 875 |
+
of shape `(batch_size, sequence_length, hidden_size)`.
|
| 876 |
+
- **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
|
| 877 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape
|
| 878 |
+
`(batch_size, num_heads, sequence_length, sequence_length)`.
|
| 879 |
"""
|
| 880 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 881 |
|
|
|
|
| 1030 |
labels: Optional[torch.Tensor] = None,
|
| 1031 |
) -> Union[Tuple, CausalLMOutput]:
|
| 1032 |
r"""
|
| 1033 |
+
Performs the forward pass of the BEST-RQ Conformer model with an LSTM-CTC head.
|
| 1034 |
+
|
| 1035 |
+
Args:
|
| 1036 |
+
input_values (`torch.FloatTensor` of shape `(batch_size, num_features, sequence_length)`):
|
| 1037 |
+
Float values of mel features extracted from the raw speech signal.
|
| 1038 |
+
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 1039 |
+
Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, where 1 indicates
|
| 1040 |
+
tokens that are not masked and 0 indicates tokens that are masked.
|
| 1041 |
+
output_attentions (`bool`, *optional*):
|
| 1042 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 1043 |
+
returned tensors for more detail.
|
| 1044 |
+
output_hidden_states (`bool`, *optional*, defaults to `True`):
|
| 1045 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| 1046 |
+
for more detail.
|
| 1047 |
+
return_dict (`bool`, *optional*):
|
| 1048 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 1049 |
+
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
|
| 1050 |
+
Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal
|
| 1051 |
+
to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
|
| 1052 |
+
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
|
| 1053 |
+
config.vocab_size - 1]`.
|
| 1054 |
+
|
| 1055 |
+
Returns:
|
| 1056 |
+
[`CausalLMOutput`] or `tuple`:
|
| 1057 |
+
A [`CausalLMOutput`] (if `return_dict=True`) or a tuple of tensors (if `return_dict=False`)
|
| 1058 |
+
comprising the following elements:
|
| 1059 |
+
- **loss** (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
| 1060 |
+
CTC loss.
|
| 1061 |
+
- **logits** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
| 1062 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 1063 |
+
- **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
| 1064 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
| 1065 |
+
of shape `(batch_size, sequence_length, hidden_size)`.
|
| 1066 |
+
- **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
|
| 1067 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape
|
| 1068 |
+
`(batch_size, num_heads, sequence_length, sequence_length)`.
|
| 1069 |
"""
|
| 1070 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1071 |
|