|
|
""" |
|
|
Export utilities for transcripts and summaries |
|
|
Supports various formats depending on speaker diarization state |
|
|
""" |
|
|
|
|
|
import json |
|
|
from typing import List, Tuple, Dict, Any |
|
|
from datetime import timedelta |
|
|
import re |
|
|
|
|
|
def format_timestamp(seconds: float, format_type: str = "srt") -> str:
    """Format a time offset (in seconds) as a subtitle timestamp.

    Args:
        seconds: Offset from the start of the media, in seconds.
        format_type: One of ``"srt"``, ``"vtt"``, ``"ass"``; any other value
            yields a generic ``HH:MM:SS.s`` form.

    Returns:
        The formatted timestamp:
          - srt:     ``HH:MM:SS,mmm`` (comma as decimal separator)
          - vtt:     ``HH:MM:SS.mmm``
          - ass:     ``H:MM:SS.cc`` (centiseconds, single hour digit)
          - default: ``HH:MM:SS.s``
    """
    # Derive the fields directly; the previous timedelta round-trip added an
    # object construction only to read total_seconds() back out three times.
    total = float(seconds)
    hours = int(total // 3600)
    minutes = int((total % 3600) // 60)
    secs = total % 60

    if format_type == "srt":
        return f"{hours:02d}:{minutes:02d}:{secs:06.3f}".replace(".", ",")
    elif format_type == "vtt":
        return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
    elif format_type == "ass":
        return f"{hours:01d}:{minutes:02d}:{secs:05.2f}"
    else:
        return f"{hours:02d}:{minutes:02d}:{secs:04.1f}"
|
|
|
|
|
def export_to_srt(utterances: List[Tuple[float, float, str]], utterances_with_speakers=None) -> str:
    """Export to SubRip (.srt) format.

    When diarized data is supplied, each cue text is prefixed with a
    "Speaker N: " label; otherwise the plain utterances are used as-is.
    """
    labeled = bool(utterances_with_speakers)
    if labeled:
        entries = utterances_with_speakers
    else:
        entries = [(s, e, t, 0) for s, e, t in utterances]

    lines = []
    for index, (start, end, text, speaker_id) in enumerate(entries, start=1):
        prefix = f"Speaker {speaker_id + 1}: " if labeled else ""
        lines.extend([
            f"{index}",
            f"{format_timestamp(start, 'srt')} --> {format_timestamp(end, 'srt')}",
            f"{prefix}{text}",
            "",
        ])

    return "\n".join(lines)
|
|
|
|
|
def export_to_vtt(utterances: List[Tuple[float, float, str]], utterances_with_speakers=None) -> str:
    """Export to WebVTT (.vtt) format.

    Output starts with the mandatory "WEBVTT" header line; speaker labels
    are emitted only when diarized data is provided.
    """
    has_speakers = bool(utterances_with_speakers)
    cues = (
        utterances_with_speakers
        if has_speakers
        else [(s, e, t, 0) for s, e, t in utterances]
    )

    out = ["WEBVTT", ""]
    for start, end, text, spk in cues:
        label = f"Speaker {spk + 1}: " if has_speakers else ""
        out.append(f"{format_timestamp(start, 'vtt')} --> {format_timestamp(end, 'vtt')}")
        out.append(f"{label}{text}")
        out.append("")

    return "\n".join(out)
|
|
|
|
|
def export_to_ass(utterances: List[Tuple[float, float, str]], utterances_with_speakers=None) -> str:
    """Export to Advanced SubStation Alpha (.ass) format.

    Args:
        utterances: (start_sec, end_sec, text) tuples; used when no
            diarized data is given.
        utterances_with_speakers: Optional (start, end, text, speaker_id)
            tuples; when present, dialogue lines get a "Speaker N: " prefix.

    Returns:
        The complete .ass document as a string.
    """
    # Standard ASS header: Script Info, the V4+ style table, and the Events
    # section opener.  Previously the literal carried doubled blank lines
    # inside each section, which strict parsers may reject; the sections are
    # now contiguous, separated by a single blank line.
    header = """[Script Info]
Title: VoxSum Transcript
ScriptType: v4.00+

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,0,2,10,10,10,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

    data_source = utterances_with_speakers if utterances_with_speakers else [(start, end, text, 0) for start, end, text in utterances]

    events = []
    for start, end, text, speaker_id in data_source:
        speaker_prefix = f"Speaker {speaker_id + 1}: " if utterances_with_speakers else ""
        events.append(
            f"Dialogue: 0,{format_timestamp(start, 'ass')},{format_timestamp(end, 'ass')},"
            f"Default,,0,0,0,,{speaker_prefix}{text}"
        )

    return header + "\n".join(events)
|
|
|
|
|
def export_to_transcript_json(utterances: List[Tuple[float, float, str]], utterances_with_speakers=None, metadata=None) -> str:
    """Export the transcript as a pretty-printed JSON document.

    The document carries a "metadata" object (caller-supplied, or a default
    with source / format_version / speakers_detected) and a list of
    utterance records with timing, text, and speaker labels.
    """
    if utterances_with_speakers:
        records = utterances_with_speakers
        speaker_count = len({spk for _, _, _, spk in records})
    else:
        records = [(s, e, t, 0) for s, e, t in utterances]
        speaker_count = 1

    default_meta = {
        "source": "VoxSum",
        "format_version": "1.0",
        "speakers_detected": speaker_count,
    }

    payload = {
        "metadata": metadata if metadata else default_meta,
        "utterances": [
            {
                "start": s,
                "end": e,
                "duration": e - s,
                "text": txt,
                "speaker_id": spk,
                "speaker_label": f"Speaker {spk + 1}",
            }
            for s, e, txt, spk in records
        ],
    }

    return json.dumps(payload, indent=2, ensure_ascii=False)
|
|
|
|
|
def export_to_elan_eaf(utterances: List[Tuple[float, float, str]], utterances_with_speakers=None) -> str:
    """Export to ELAN (.eaf) XML format for linguistic analysis.

    Produces one time-aligned tier per detected speaker.  Each utterance
    contributes two time slots (start/end, in milliseconds) to the shared
    TIME_ORDER; annotation IDs are unique across the whole document, as the
    EAF schema requires.

    Args:
        utterances: (start_sec, end_sec, text) tuples; used when no
            diarized data is given.
        utterances_with_speakers: Optional (start, end, text, speaker_id)
            tuples from speaker diarization.

    Returns:
        The complete EAF document as a string.
    """
    import datetime
    from xml.sax.saxutils import escape

    data_source = utterances_with_speakers if utterances_with_speakers else [(start, end, text, 0) for start, end, text in utterances]

    speakers = sorted(set(speaker for _, _, _, speaker in data_source))

    current_date = datetime.datetime.now().isoformat()
    parts = [f"""<?xml version="1.0" encoding="UTF-8"?>
<ANNOTATION_DOCUMENT AUTHOR="VoxSum" DATE="{current_date}" FORMAT="3.0" VERSION="3.0">
    <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
        <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:voxsum-transcript</PROPERTY>
        <PROPERTY NAME="lastUsedAnnotationId">{len(data_source)}</PROPERTY>
    </HEADER>
    <TIME_ORDER>
"""]

    # Two time slots per utterance, in utterance order: utterance i owns
    # slots ts{2i+1} (start) and ts{2i+2} (end).
    slot_id = 1
    for start, end, _, _ in data_source:
        parts.append(f'        <TIME_SLOT TIME_SLOT_ID="ts{slot_id}" TIME_VALUE="{int(start * 1000)}"/>\n')
        parts.append(f'        <TIME_SLOT TIME_SLOT_ID="ts{slot_id + 1}" TIME_VALUE="{int(end * 1000)}"/>\n')
        slot_id += 2

    parts.append("    </TIME_ORDER>\n")

    # One tier per speaker.  Bug fixes vs. the previous version:
    #  - annotation_id is now document-global; restarting it per tier created
    #    duplicate ANNOTATION_IDs whenever more than one speaker was present.
    #  - time-slot references are derived from each utterance's position in
    #    data_source, so they stay aligned with the TIME_ORDER above even
    #    when utterances of different speakers interleave.
    #  - annotation text is XML-escaped so '&', '<', '>' cannot corrupt the
    #    document.
    annotation_id = 1
    for speaker_id in speakers:
        parts.append(f'    <TIER LINGUISTIC_TYPE_REF="default-lt" TIER_ID="Speaker_{speaker_id + 1}">\n')
        for idx, (start, end, text, spk_id) in enumerate(data_source):
            if spk_id != speaker_id:
                continue
            ts_start = 2 * idx + 1
            parts.append('        <ANNOTATION>\n')
            parts.append(f'            <ALIGNABLE_ANNOTATION ANNOTATION_ID="a{annotation_id}" TIME_SLOT_REF1="ts{ts_start}" TIME_SLOT_REF2="ts{ts_start + 1}">\n')
            parts.append(f'                <ANNOTATION_VALUE>{escape(text)}</ANNOTATION_VALUE>\n')
            parts.append('            </ALIGNABLE_ANNOTATION>\n')
            parts.append('        </ANNOTATION>\n')
            annotation_id += 1
        parts.append("    </TIER>\n")

    parts.append("""    <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
    <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
    <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
    <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
    <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
</ANNOTATION_DOCUMENT>""")

    return "".join(parts)
|
|
|
|
|
def export_plain_text(utterances: List[Tuple[float, float, str]], utterances_with_speakers=None, include_timestamps=True) -> str:
    """Export to plain text.

    With diarized data, lines are grouped under "Speaker N:" headings (a
    blank line separates consecutive speaker blocks).  Optionally each line
    is prefixed with its start timestamp in brackets.
    """
    has_speakers = bool(utterances_with_speakers)
    rows = (
        utterances_with_speakers
        if has_speakers
        else [(s, e, t, 0) for s, e, t in utterances]
    )

    output = []
    last_speaker = None
    for start, _end, text, speaker in rows:
        # Emit a heading whenever the speaker changes (diarized mode only).
        if has_speakers and speaker != last_speaker:
            if output:
                output.append("")
            output.append(f"Speaker {speaker + 1}:")
            last_speaker = speaker

        prefix = f"[{format_timestamp(start, 'default')}] " if include_timestamps else ""
        output.append(prefix + text)

    return "\n".join(output)
|
|
|
|
|
def export_summary_markdown(summary: str, metadata=None) -> str:
    """Export a summary as Markdown.

    When metadata is supplied, a "# Summary" preamble with bolded
    Title/Duration/Speakers/Date fields precedes the content; otherwise
    the summary text is returned unadorned.
    """
    parts = []

    if metadata:
        parts.extend(["# Summary", ""])
        # Emit only the fields that are present and non-empty, in fixed order.
        field_order = (
            ("title", "Title"),
            ("duration", "Duration"),
            ("speakers", "Speakers"),
            ("date", "Date"),
        )
        for key, label in field_order:
            value = metadata.get(key)
            if value:
                parts.append(f"**{label}:** {value}")
        parts.extend(["", "## Content", ""])

    parts.append(summary)
    return "\n".join(parts)
|
|
|
|
|
def export_summary_plain_text(summary: str, metadata=None) -> str:
    """Export a summary as plain text.

    When metadata is supplied, a "SUMMARY" banner with Title/Duration/
    Speakers/Date fields and a "CONTENT" divider precede the content;
    otherwise the summary text is returned unadorned.
    """
    parts = []

    if metadata:
        parts.extend(["SUMMARY", "=" * 50, ""])
        # Emit only the fields that are present and non-empty, in fixed order.
        field_order = (
            ("title", "Title"),
            ("duration", "Duration"),
            ("speakers", "Speakers"),
            ("date", "Date"),
        )
        for key, label in field_order:
            value = metadata.get(key)
            if value:
                parts.append(f"{label}: {value}")
        parts.extend(["", "CONTENT", "-" * 50, ""])

    parts.append(summary)
    return "\n".join(parts)
|
|
|
|
|
|
|
|
# Registries mapping a human-readable format name to its file extension,
# MIME type, and the export function that renders it.  Intended for UI /
# download code elsewhere in the project to enumerate available formats
# and dispatch to the right exporter.

# Time-cued subtitle/caption formats (speaker labels included when
# diarized data is passed to the export function).
SUBTITLE_FORMATS = {
    "SRT (SubRip)": {
        "extension": ".srt",
        "mime_type": "text/plain",
        "function": export_to_srt
    },
    "VTT (WebVTT)": {
        "extension": ".vtt",
        "mime_type": "text/vtt",
        "function": export_to_vtt
    },
    "ASS (Advanced SubStation Alpha)": {
        "extension": ".ass",
        "mime_type": "text/plain",
        "function": export_to_ass
    }
}

# Full-transcript formats.  NOTE(review): the export functions here do not
# all share one signature (export_plain_text takes include_timestamps,
# export_to_transcript_json takes metadata) — callers presumably pass only
# the common positional arguments; verify at the call site.
TRANSCRIPT_FORMATS = {
    "Plain Text": {
        "extension": ".txt",
        "mime_type": "text/plain",
        "function": export_plain_text
    },
    "JSON": {
        "extension": ".json",
        "mime_type": "application/json",
        "function": export_to_transcript_json
    },
    "ELAN (EAF)": {
        "extension": ".eaf",
        "mime_type": "application/xml",
        "function": export_to_elan_eaf
    }
}

# Summary-document formats; these functions take (summary, metadata=None).
SUMMARY_FORMATS = {
    "Markdown": {
        "extension": ".md",
        "mime_type": "text/markdown",
        "function": export_summary_markdown
    },
    "Plain Text": {
        "extension": ".txt",
        "mime_type": "text/plain",
        "function": export_summary_plain_text
    }
}