Status in the result tab is now more informative
- frontend/app.js +12 -5
- src/asr.py +6 -5
- src/diarization.py +22 -5
- src/server/services/asr_service.py +40 -9
frontend/app.js
CHANGED

@@ -228,7 +228,6 @@ async function handleTranscription() {
     const reader = response.body.getReader();
     const decoder = new TextDecoder();
     let buffer = '';
-    setStatus('Processing audio...', 'info');

     while (true) {
         const { done, value } = await reader.read();

@@ -265,12 +264,20 @@ function handleTranscriptionEvent(event) {
                 elements.audioPlayer.currentTime = 0;
             }
             break;
-        case '
-
-
-
+        case 'status':
+            setStatus(event.message, 'info');
+            break;
+        case 'progress':
+            if (event.stage === 'diarization') {
+                setStatus(`Performing speaker diarization... (${event.progress}%)`, 'info');
             }
             break;
+        case 'utterance':
+            state.utterances.push(event.utterance);
+            const progress = event.progress || 0;
+            setStatus(`Transcribing audio... (${state.utterances.length} utterances, ${progress}%)`, 'info');
+            renderTranscript();
+            break;
         case 'complete':
             if (event.diarization) {
                 state.diarizedUtterances = event.diarization.utterances || [];
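For reference, the new switch cases consume newline-delimited JSON events streamed by the backend. A minimal Python sketch of that stream follows; the field names mirror what the handler reads (event.type, event.message, event.stage, event.progress, event.utterance), while the send() helper and the utterance shape are illustrative assumptions, not code from this repo.

import json

def send(event: dict) -> str:
    # One JSON object per line, matching the reader/decoder/buffer loop in handleTranscription().
    return json.dumps(event) + "\n"

# Events the new cases in handleTranscriptionEvent() handle:
print(send({"type": "status", "message": "Transcribing audio..."}), end="")
print(send({"type": "progress", "stage": "diarization", "progress": 42.5}), end="")
print(send({"type": "utterance", "index": 0, "progress": 12.5,
            "utterance": {"start": 0.0, "end": 2.4, "text": "hello"}}), end="")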
src/asr.py
CHANGED

@@ -42,7 +42,7 @@ def transcribe_file(
     backend: str = "moonshine",
     language: str = "auto",
     textnorm: str = "withitn",
-) -> Iterable[Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]]:
+) -> Iterable[Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]], float]]:
     """
     Transcribe audio file using specified backend.

@@ -124,7 +124,8 @@ def transcribe_file(

             if text:
                 utterances.append((segment_start, segment_end, cleaned_text))
-                yield utterances[-1], utterances.copy()
+                progress = min(100, (i / len(wav)) * 100)
+                yield utterances[-1], utterances.copy(), progress

             # Reset for next segment
             speech_chunks = []

@@ -154,10 +155,10 @@ def transcribe_file(

     if text:
         utterances.append((segment_start, segment_end, cleaned_text))
-        yield utterances[-1], utterances.copy()
+        yield utterances[-1], utterances.copy(), 100.0

     # Final yield with all utterances
     if utterances:
-        yield None, utterances
+        yield None, utterances, 100.0
     else:
-        yield None, [(-1, -1, "No speech detected")]
+        yield None, [(-1, -1, "No speech detected")], 100.0
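A minimal sketch of driving the updated generator, assuming transcribe_file takes the audio path as its first argument and is importable as shown below; only the third element of each yielded tuple (the progress percentage) is new in this commit.

from src.asr import transcribe_file  # import path assumed

for current, all_so_far, progress in transcribe_file("sample.wav", backend="moonshine"):
    if current is None:
        # Final yield: `all_so_far` is the complete utterance list, progress is 100.0.
        print(f"done: {len(all_so_far)} utterances")
    else:
        start, end, text = current
        print(f"[{progress:5.1f}%] {start:.2f}-{end:.2f}s: {text}")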
src/diarization.py
CHANGED

@@ -211,8 +211,8 @@ def perform_speaker_diarization_on_utterances(
     batch_size = max(1, total_utterances // 20)  # Process in batches for progress updates

     for i, (start, end, text) in enumerate(utterances):
-        if
-
+        if i % batch_size == 0:
+            yield i / total_utterances * 0.8

         # Extract audio segment
         start_sample = int(start * sample_rate)

@@ -279,6 +279,7 @@ def perform_speaker_diarization_on_utterances(

     if progress_callback:
         progress_callback(0.9)  # 90% for clustering
+    yield 0.9

     # Run enhanced diarization
     try:

@@ -316,6 +317,7 @@ def perform_speaker_diarization_on_utterances(

     if progress_callback:
         progress_callback(1.0)  # 100% complete
+    yield 1.0

     print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
     logger.info(f"🎭 Enhanced clustering completed! Detected {n_speakers} speakers with {confidence} confidence")

@@ -332,8 +334,14 @@ def perform_speaker_diarization_on_utterances(
         print("⚠️ Using fallback clustering")

         # >>> NOUVEAU : clustering FAISS si disponible, sinon ancien code
-        diarization_result = faiss_clustering(embeddings_array, valid_utterances,
+        gen = faiss_clustering(embeddings_array, valid_utterances,
                                config_dict, progress_callback)
+        try:
+            while True:
+                p = next(gen)
+                yield p
+        except StopIteration as e:
+            diarization_result = e.value

     return diarization_result

@@ -509,7 +517,7 @@ def get_diarization_stats(
 def faiss_clustering(embeddings: np.ndarray,
                      utterances: list,
                      config_dict: dict,
-                     progress_callback=None)
+                     progress_callback=None):
     """
     Clustering via FAISS (K-means) ultra-rapide CPU.
     Retourne la liste (start, end, speaker_id) compatible avec l'ancien code.

@@ -518,7 +526,13 @@ def faiss_clustering(embeddings: np.ndarray,
         import faiss
     except ImportError:
         # FAISS absent → on retombe sur AgglomerativeClustering d'origine
-        return sklearn_fallback_clustering(embeddings, utterances, config_dict, progress_callback)
+        gen = sklearn_fallback_clustering(embeddings, utterances, config_dict, progress_callback)
+        try:
+            while True:
+                p = next(gen)
+                yield p
+        except StopIteration as e:
+            return e.value

     n_samples, dim = embeddings.shape
     n_clusters = config_dict['num_speakers']

@@ -543,6 +557,7 @@ def faiss_clustering(embeddings: np.ndarray,

     if progress_callback:
         progress_callback(1.0)
+    yield 1.0

     num_speakers = len(set(labels))
     print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")

@@ -578,8 +593,10 @@ def sklearn_fallback_clustering(embeddings, utterances, config_dict, progress_ca

     if progress_callback:
         progress_callback(0.9)
+    yield 0.9
     labels = clustering.fit_predict(distance_matrix)
     if progress_callback:
         progress_callback(1.0)
+    yield 1.0

     return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
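The diarization helpers now follow a generator-with-return-value pattern: progress floats are yielded along the way and the final segment list comes back through StopIteration.value, which is how faiss_clustering delegates to sklearn_fallback_clustering and how the caller above collects diarization_result. A self-contained sketch of the pattern, with illustrative names only:

def clustering_task():
    # Yields progress, then returns the final result (delivered as StopIteration.value).
    yield 0.5
    yield 1.0
    return ["segments"]

def drive(gen):
    try:
        while True:
            progress = next(gen)
            print(f"progress: {progress:.0%}")
    except StopIteration as stop:
        return stop.value  # the generator's return value

print(drive(clustering_task()))  # ['segments']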
src/server/services/asr_service.py
CHANGED

@@ -65,22 +65,38 @@ def iter_transcription_events(
         "model": model_name,
     }

+    yield {
+        "type": "status",
+        "message": "Transcribing audio...",
+    }
+
     final_utterances: List[Tuple[float, float, str]] = []

-    for current_utterance, all_utterances in generator:
+    for current_utterance, all_utterances, progress in generator:
         if current_utterance:
             start, end, text = current_utterance
             yield {
                 "type": "utterance",
                 "utterance": _serialize_utterance((start, end, text)),
                 "index": len(all_utterances) - 1,
+                "progress": round(progress, 1),
             }
         final_utterances = list(all_utterances)

     # Final event with transcript and optional diarization
     diarization_payload = None
     if options.diarization.enable:
-        diarization_payload = _run_diarization(audio_path, final_utterances, options.diarization)
+        yield {
+            "type": "status",
+            "message": "Performing speaker diarization...",
+        }
+        diarization_gen = _run_diarization(audio_path, final_utterances, options.diarization)
+        for event in diarization_gen:
+            if event["type"] == "progress":
+                yield event
+            elif event["type"] == "result":
+                diarization_payload = event["payload"]
+                break

     transcript_text = "\n".join([utt[2] for utt in final_utterances])

@@ -99,16 +115,18 @@ def _run_diarization(
     audio_path: Path,
     utterances: List[Tuple[float, float, str]],
     options: DiarizationOptions,
-)
+):
     if not utterances:
-        return None
+        yield {"type": "result", "payload": None}
+        return

     extractor_result = init_speaker_embedding_extractor(
         cluster_threshold=options.cluster_threshold,
         num_speakers=options.num_speakers,
     )
     if not extractor_result:
-        return None
+        yield {"type": "result", "payload": None}
+        return

     embedding_extractor, config_dict = extractor_result

@@ -124,7 +142,7 @@ def _run_diarization(
     audio = resample(audio, target_num_samples)
     sample_rate = 16000

-    diarization_segments = perform_speaker_diarization_on_utterances(
+    diarization_gen = perform_speaker_diarization_on_utterances(
         audio=audio,
         sample_rate=sample_rate,
         utterances=utterances,

@@ -133,17 +151,30 @@ def _run_diarization(
         progress_callback=None,
     )

+    diarization_segments = None
+    try:
+        while True:
+            item = next(diarization_gen)
+            if isinstance(item, float):
+                yield {"type": "progress", "stage": "diarization", "progress": round(item * 100, 1)}
+            else:
+                diarization_segments = item
+                break
+    except StopIteration as e:
+        diarization_segments = e.value
+
     if not diarization_segments:
-        return None
+        yield {"type": "result", "payload": None}
+        return

     merged = merge_transcription_with_diarization(utterances, diarization_segments)
     merged = merge_consecutive_utterances(merged, max_gap=1.0)
     stats = get_diarization_stats(merged)

-    return {
+    yield {"type": "result", "payload": {
         "utterances": [
             _serialize_utterance((start, end, text), speaker)
             for start, end, text, speaker in merged
         ],
         "stats": stats,
-    }
+    }}
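For reference, the protocol between _run_diarization and iter_transcription_events is: the inner generator yields {"type": "progress", ...} dicts and ends with exactly one {"type": "result", "payload": ...}; the outer loop forwards the progress events to the client and keeps only the payload. A self-contained sketch with a stubbed inner generator (the stub stands in for _run_diarization and is not code from this repo):

def fake_run_diarization():
    # Stub standing in for _run_diarization(); event shapes match the diff above.
    yield {"type": "progress", "stage": "diarization", "progress": 50.0}
    yield {"type": "progress", "stage": "diarization", "progress": 100.0}
    yield {"type": "result", "payload": {"utterances": [], "stats": {}}}

diarization_payload = None
for event in fake_run_diarization():
    if event["type"] == "progress":
        print("forward to client:", event)
    elif event["type"] == "result":
        diarization_payload = event["payload"]
        break

print("payload:", diarization_payload)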