Luigi committed
Commit 030e33b · 1 Parent(s): 07dbe5e

Status in the result tab is now more informative

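With this change, the streamed response carries richer progress information: the backend now emits 'status' and 'progress' events in addition to 'utterance' and 'complete', and each 'utterance' event includes a completion percentage. A sketch of the event shapes as they appear in the diffs below (the "utterance" payload is whatever _serialize_utterance returns; the values shown here are illustrative only):

    # Sketch of the event shapes emitted by iter_transcription_events after this commit
    # (field names taken from the diff; example values are made up).
    status_event = {"type": "status", "message": "Transcribing audio..."}

    progress_event = {"type": "progress", "stage": "diarization", "progress": 42.5}

    utterance_event = {
        "type": "utterance",
        "utterance": {"start": 1.2, "end": 3.4, "text": "hello"},  # illustrative payload
        "index": 3,
        "progress": 37.5,  # percent of the audio transcribed so far
    }
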
frontend/app.js CHANGED
@@ -228,7 +228,6 @@ async function handleTranscription() {
   const reader = response.body.getReader();
   const decoder = new TextDecoder();
   let buffer = '';
-  setStatus('Processing audio...', 'info');
 
   while (true) {
     const { done, value } = await reader.read();
@@ -265,12 +264,20 @@ function handleTranscriptionEvent(event) {
         elements.audioPlayer.currentTime = 0;
       }
       break;
-    case 'utterance':
-      if (event.utterance) {
-        state.utterances.push(event.utterance);
-        renderTranscript();
+    case 'status':
+      setStatus(event.message, 'info');
+      break;
+    case 'progress':
+      if (event.stage === 'diarization') {
+        setStatus(`Performing speaker diarization... (${event.progress}%)`, 'info');
       }
       break;
+    case 'utterance':
+      state.utterances.push(event.utterance);
+      const progress = event.progress || 0;
+      setStatus(`Transcribing audio... (${state.utterances.length} utterances, ${progress}%)`, 'info');
+      renderTranscript();
+      break;
     case 'complete':
       if (event.diarization) {
         state.diarizedUtterances = event.diarization.utterances || [];
src/asr.py CHANGED
@@ -42,7 +42,7 @@ def transcribe_file(
     backend: str = "moonshine",
     language: str = "auto",
     textnorm: str = "withitn",
-) -> Iterable[Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]]]]:
+) -> Iterable[Tuple[Optional[Tuple[float, float, str]], List[Tuple[float, float, str]], float]]:
     """
     Transcribe audio file using specified backend.
 
@@ -124,7 +124,8 @@ def transcribe_file(
 
             if text:
                 utterances.append((segment_start, segment_end, cleaned_text))
-                yield utterances[-1], utterances.copy()
+                progress = min(100, (i / len(wav)) * 100)
+                yield utterances[-1], utterances.copy(), progress
 
                 # Reset for next segment
                 speech_chunks = []
@@ -154,10 +155,10 @@ def transcribe_file(
 
     if text:
         utterances.append((segment_start, segment_end, cleaned_text))
-        yield utterances[-1], utterances.copy()
+        yield utterances[-1], utterances.copy(), 100.0
 
     # Final yield with all utterances
     if utterances:
-        yield None, utterances
+        yield None, utterances, 100.0
    else:
-        yield None, [(-1, -1, "No speech detected")]
+        yield None, [(-1, -1, "No speech detected")], 100.0
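
transcribe_file now yields three-element tuples, with a progress percentage as the last item. A minimal sketch of a caller adapted to the new shape (assuming the first positional argument is the audio path; "meeting.wav" is just a stand-in file name):

    # Hypothetical caller, for illustration only: the generator now yields
    # (current_utterance, all_utterances, progress) instead of a 2-tuple.
    from src.asr import transcribe_file

    for current, utterances_so_far, progress in transcribe_file("meeting.wav"):
        if current is not None:
            start, end, text = current
            print(f"[{progress:5.1f}%] {start:.2f}-{end:.2f}s: {text}")
        else:
            # Final yield: utterances_so_far holds the complete list.
            print(f"Done: {len(utterances_so_far)} utterances")
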
src/diarization.py CHANGED
@@ -211,8 +211,8 @@ def perform_speaker_diarization_on_utterances(
     batch_size = max(1, total_utterances // 20)  # Process in batches for progress updates
 
     for i, (start, end, text) in enumerate(utterances):
-        if progress_callback and i % batch_size == 0:
-            progress_callback(i / total_utterances * 0.8)  # 80% for embedding extraction
+        if i % batch_size == 0:
+            yield i / total_utterances * 0.8
 
         # Extract audio segment
         start_sample = int(start * sample_rate)
@@ -279,6 +279,7 @@ def perform_speaker_diarization_on_utterances(
 
     if progress_callback:
         progress_callback(0.9)  # 90% for clustering
+    yield 0.9
 
     # Run enhanced diarization
     try:
@@ -316,6 +317,7 @@ def perform_speaker_diarization_on_utterances(
 
     if progress_callback:
         progress_callback(1.0)  # 100% complete
+    yield 1.0
 
     print(f"✅ DEBUG: Enhanced result - {n_speakers} speakers, {len(diarization_result)} segments")
     logger.info(f"🎭 Enhanced clustering completed! Detected {n_speakers} speakers with {confidence} confidence")
@@ -332,8 +334,14 @@ def perform_speaker_diarization_on_utterances(
     print("⚠️ Using fallback clustering")
 
     # >>> NEW: FAISS clustering if available, otherwise the original code
-    diarization_result = faiss_clustering(embeddings_array, valid_utterances,
+    gen = faiss_clustering(embeddings_array, valid_utterances,
                            config_dict, progress_callback)
+    try:
+        while True:
+            p = next(gen)
+            yield p
+    except StopIteration as e:
+        diarization_result = e.value
 
     return diarization_result
 
@@ -509,7 +517,7 @@ def get_diarization_stats(
 def faiss_clustering(embeddings: np.ndarray,
                      utterances: list,
                      config_dict: dict,
-                     progress_callback=None) -> list:
+                     progress_callback=None):
     """
     Ultra-fast CPU clustering via FAISS (K-means).
     Returns the (start, end, speaker_id) list, compatible with the old code.
@@ -518,7 +526,13 @@ def faiss_clustering(embeddings: np.ndarray,
         import faiss
     except ImportError:
         # FAISS missing → fall back to the original AgglomerativeClustering
-        return sklearn_fallback_clustering(embeddings, utterances, config_dict, progress_callback)
+        gen = sklearn_fallback_clustering(embeddings, utterances, config_dict, progress_callback)
+        try:
+            while True:
+                p = next(gen)
+                yield p
+        except StopIteration as e:
+            return e.value
 
     n_samples, dim = embeddings.shape
     n_clusters = config_dict['num_speakers']
@@ -543,6 +557,7 @@ def faiss_clustering(embeddings: np.ndarray,
 
     if progress_callback:
         progress_callback(1.0)
+    yield 1.0
 
     num_speakers = len(set(labels))
     print(f"✅ DEBUG: FAISS clustering — {num_speakers} speakers, {len(utterances)} segments")
@@ -578,8 +593,10 @@ def sklearn_fallback_clustering(embeddings, utterances, config_dict, progress_callback
 
     if progress_callback:
         progress_callback(0.9)
+    yield 0.9
     labels = clustering.fit_predict(distance_matrix)
     if progress_callback:
         progress_callback(1.0)
+    yield 1.0
 
     return [(start, end, int(lbl)) for (start, end, _), lbl in zip(utterances, labels)]
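
Both perform_speaker_diarization_on_utterances and faiss_clustering are now generators that yield progress fractions and hand their final result back through the generator's return value (recovered from StopIteration.value). The diff drives the sub-generators by hand with next(); Python's yield from does the same forwarding and return-value capture in one line. A self-contained sketch of the mechanism, with generic names rather than the project's:

    # Generic illustration of the delegation pattern used in this diff:
    # a generator yields progress values and *returns* its final result.
    def worker():
        for step in (0.33, 0.66, 1.0):
            yield step                      # progress fraction
        return ["segment-a", "segment-b"]   # surfaces as StopIteration.value

    def manual_driver():
        gen = worker()
        try:
            while True:
                yield next(gen)             # forward each progress value
        except StopIteration as exc:
            result = exc.value              # the worker's return value
        yield result

    def yield_from_driver():
        result = yield from worker()        # forwards progress and captures the return value
        yield result

    assert list(manual_driver()) == list(yield_from_driver())
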
src/server/services/asr_service.py CHANGED
@@ -65,22 +65,38 @@ def iter_transcription_events(
         "model": model_name,
     }
 
+    yield {
+        "type": "status",
+        "message": "Transcribing audio...",
+    }
+
     final_utterances: List[Tuple[float, float, str]] = []
 
-    for current_utterance, all_utterances in generator:
+    for current_utterance, all_utterances, progress in generator:
         if current_utterance:
             start, end, text = current_utterance
             yield {
                 "type": "utterance",
                 "utterance": _serialize_utterance((start, end, text)),
                 "index": len(all_utterances) - 1,
+                "progress": round(progress, 1),
             }
             final_utterances = list(all_utterances)
 
     # Final event with transcript and optional diarization
     diarization_payload = None
     if options.diarization.enable:
-        diarization_payload = _run_diarization(audio_path, final_utterances, options.diarization)
+        yield {
+            "type": "status",
+            "message": "Performing speaker diarization...",
+        }
+        diarization_gen = _run_diarization(audio_path, final_utterances, options.diarization)
+        for event in diarization_gen:
+            if event["type"] == "progress":
+                yield event
+            elif event["type"] == "result":
+                diarization_payload = event["payload"]
+                break
 
     transcript_text = "\n".join([utt[2] for utt in final_utterances])
 
@@ -99,16 +115,18 @@ def _run_diarization(
     audio_path: Path,
     utterances: List[Tuple[float, float, str]],
     options: DiarizationOptions,
-) -> Optional[Dict[str, object]]:
+):
     if not utterances:
-        return None
+        yield {"type": "result", "payload": None}
+        return
 
     extractor_result = init_speaker_embedding_extractor(
         cluster_threshold=options.cluster_threshold,
        num_speakers=options.num_speakers,
     )
     if not extractor_result:
-        return None
+        yield {"type": "result", "payload": None}
+        return
 
     embedding_extractor, config_dict = extractor_result
 
@@ -124,7 +142,7 @@ def _run_diarization(
         audio = resample(audio, target_num_samples)
         sample_rate = 16000
 
-    diarization_segments = perform_speaker_diarization_on_utterances(
+    diarization_gen = perform_speaker_diarization_on_utterances(
         audio=audio,
         sample_rate=sample_rate,
         utterances=utterances,
@@ -133,17 +151,30 @@ def _run_diarization(
         progress_callback=None,
     )
 
+    diarization_segments = None
+    try:
+        while True:
+            item = next(diarization_gen)
+            if isinstance(item, float):
+                yield {"type": "progress", "stage": "diarization", "progress": round(item * 100, 1)}
+            else:
+                diarization_segments = item
+                break
+    except StopIteration as e:
+        diarization_segments = e.value
+
     if not diarization_segments:
-        return None
+        yield {"type": "result", "payload": None}
+        return
 
     merged = merge_transcription_with_diarization(utterances, diarization_segments)
     merged = merge_consecutive_utterances(merged, max_gap=1.0)
     stats = get_diarization_stats(merged)
 
-    return {
+    yield {"type": "result", "payload": {
         "utterances": [
             _serialize_utterance((start, end, text), speaker)
             for start, end, text, speaker in merged
         ],
         "stats": stats,
-    }
+    }}
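
The service layer only yields plain dicts; the HTTP route that streams them to the browser is not part of this commit. As a hedged sketch, if the endpoint streams newline-delimited JSON (which would match the frontend's reader/decoder/buffer loop), the events could be serialized like this (to_ndjson is an assumed helper, not code from this repo):

    import json
    from typing import Dict, Iterable, Iterator

    def to_ndjson(events: Iterable[Dict[str, object]]) -> Iterator[bytes]:
        """Assumed transport helper: one JSON object per line, as the frontend split-on-newline loop expects."""
        for event in events:
            yield (json.dumps(event) + "\n").encode("utf-8")

    # Usage sketch: a streaming route would pass iter_transcription_events(...) here
    # and write the resulting bytes to the response body.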