Model Card for billingsmoore/garchen-stt
Speech-to-Text model for Garchen Rinpoche.
Usage
The wav2vec2 model and its associated KenLM language model can be loaded like this:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder
from huggingface_hub import hf_hub_download
import json
import torch
# Load the processor (feature extractor + tokenizer) and the CTC acoustic model.
processor = Wav2Vec2Processor.from_pretrained("billingsmoore/garchen-stt")
model = Wav2Vec2ForCTC.from_pretrained("billingsmoore/garchen-stt")

# Download the KenLM model and the decoder config from the Hugging Face Hub.
kenlm_path = hf_hub_download(repo_id="billingsmoore/garchen-stt", filename="kenlm_5gram.bin")
decoder_config_path = hf_hub_download(repo_id="billingsmoore/garchen-stt", filename="decoder_config.json")

# Load the decoder config. The encoding is given explicitly so that reading
# the JSON file does not depend on the platform's default locale.
with open(decoder_config_path, "r", encoding="utf-8") as f:
    decoder_config = json.load(f)

# Build the decoder. Tokens are sorted by their id so that the label list
# lines up with the columns of the model's output logits.
vocab = processor.tokenizer.get_vocab()
labels = [token for token, _ in sorted(vocab.items(), key=lambda item: item[1])]
decoder = build_ctcdecoder(
    labels=labels,
    kenlm_model_path=kenlm_path,
    alpha=decoder_config["alpha"],
    beta=decoder_config["beta"],
)
Then, assuming each example in your dataset has batch['audio'] = {"array": waveform, "sampling_rate": sr} (with audio sampled at 16 kHz), predictions can be generated like this:
def transcribe_with_lm(audio, processor=processor, model=model, decoder=decoder,
                       decoder_config=decoder_config, sampling_rate=16000):
    """Transcribe a single waveform using the CTC model and the KenLM decoder.

    Args:
        audio: Waveform samples for one utterance, e.g. ``batch["audio"]["array"]``.
        processor: Processor used to turn the waveform into model inputs.
        model: CTC model that produces the logits.
        decoder: pyctcdecode decoder built with the KenLM language model.
        decoder_config: Mapping providing ``beam_width``, ``beam_prune_logp``
            and ``token_min_logp`` for beam search.
        sampling_rate: Sampling rate of ``audio`` in Hz. Defaults to 16000,
            the value this function previously hard-coded.

    Returns:
        The transcription produced by ``decoder.decode``.
    """
    # Extract features. Passing the real sampling rate (rather than always
    # claiming 16 kHz) gives the processor a chance to flag mismatched audio.
    inputs = processor(audio, sampling_rate=sampling_rate, return_tensors="pt")

    # Get model predictions. The inputs are moved to the model's device so the
    # function also works when the model has been moved to a GPU.
    with torch.no_grad():
        input_values = inputs.input_values.to(model.device)
        # [0] selects the only item in the batch: one utterance per call.
        logits = model(input_values).logits.cpu().numpy()[0]

    # Decode with LM (hyperparameters from grid search)
    transcription = decoder.decode(
        logits,
        beam_width=decoder_config["beam_width"],
        beam_prune_logp=decoder_config["beam_prune_logp"],
        token_min_logp=decoder_config["token_min_logp"],
    )
    return transcription
# Transcribe every example in the dataset, one utterance at a time.
transcriptions = [
    transcribe_with_lm(batch["audio"]["array"])
    for batch in dataset
]
- Downloads last month
- 37
Model tree for billingsmoore/garchen-stt
Base model
facebook/wav2vec2-xls-r-300m