car-voice 0.15.1

//! Transcript parsing utilities.
//!
//! ElevenLabs Scribe (and most modern STT engines) annotate
//! transcripts with parenthetical sound-effect tags alongside the
//! actual spoken words:
//!
//! ```text
//! "What time is it? (chimes) (smacks lips)"
//! ```
//!
//! The parenthetical content isn't noise to filter — it's structured
//! audio context the LLM can use to decide its response. A request
//! that arrives with `(audience laughing)` annotated is meaningfully
//! different from one with `(crying)` annotated.
//!
//! [`parse_transcript`] splits a Scribe transcript into:
//!
//! - `speech` — the user's actual words, with `(...)` and `[...]`
//!   blocks removed
//! - `audio_context` — the contents of those blocks as separate
//!   strings, in source order
//!
//! Channels (Tokhn's intent dispatcher, the CLI, future agents)
//! consume both and decide what to do with them. The LLM can be
//! shown the audio context as supplementary metadata.

use serde::{Deserialize, Serialize};

/// Result of running a Scribe transcript through [`parse_transcript`].
///
/// Separates what was said (`speech`) from the bracketed audio
/// annotations that accompanied it (`audio_context`). The guarantees
/// described on the fields are those of values produced by
/// [`parse_transcript`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedTranscript {
    /// The user's actual spoken words, i.e. the text found outside any
    /// `(...)` / `[...]` block. Runs of whitespace are collapsed to a
    /// single space, and leading and trailing whitespace is trimmed.
    pub speech: String,
    /// Audio context tags from the original transcript. Each tag is
    /// the inner content of a `(...)` or `[...]` block, with leading
    /// and trailing whitespace trimmed and the wrapping brackets
    /// removed. Blocks that are blank after trimming produce no tag.
    /// Order preserved from the source.
    pub audio_context: Vec<String>,
}

impl ParsedTranscript {
    /// Reports whether the transcript carries no spoken words at all.
    ///
    /// This is the case for annotation-only input such as
    /// `"(chimes) (footsteps)"`, which parses to an empty `speech`
    /// alongside a non-empty `audio_context`. Only `speech` is
    /// inspected; `audio_context` never affects the result.
    pub fn is_speech_only_empty(&self) -> bool {
        let Self { speech, .. } = self;
        speech.is_empty()
    }
}

/// Split a transcript into spoken words + audio context tags.
///
/// Text outside brackets becomes `speech` (whitespace collapsed and
/// trimmed). The content of each outermost `(...)` or `[...]` block
/// becomes one entry in `audio_context`, trimmed, in source order.
///
/// Parsing is deliberately lenient and never fails:
///
/// - Nested brackets are flattened into the parent (one tag per
///   outermost block); the inner bracket characters are dropped.
/// - Bracket kinds are not matched against each other: any closer
///   ends the innermost open block, so `(music]` yields `music`.
/// - Mismatched closes are tolerated — extra `)` / `]` characters are
///   dropped.
/// - A block still open at end of input is treated as closed there,
///   so its content is kept as a tag rather than silently discarded.
/// - Blocks that are blank after trimming produce no tag.
pub fn parse_transcript(raw: &str) -> ParsedTranscript {
    // Moves the pending tag text into `tags` (unless it is blank) and
    // resets the buffer for the next block.
    fn flush_tag(pending: &mut String, tags: &mut Vec<String>) {
        let tag = pending.trim();
        if !tag.is_empty() {
            tags.push(tag.to_string());
        }
        pending.clear();
    }

    let mut speech = String::with_capacity(raw.len());
    let mut audio_context: Vec<String> = Vec::new();
    let mut current_tag = String::new();
    // Number of currently open brackets; 0 means we are in spoken text.
    // Bounded by the input length, so it cannot overflow a `usize`.
    let mut depth: usize = 0;

    for c in raw.chars() {
        match c {
            // `current_tag` is always empty when depth is 0, so there is
            // nothing to reset when a new outermost block opens.
            '(' | '[' => depth += 1,
            ')' | ']' => {
                // A closer with nothing open is a stray character: drop it.
                if depth > 0 {
                    depth -= 1;
                    if depth == 0 {
                        flush_tag(&mut current_tag, &mut audio_context);
                    }
                }
            }
            _ if depth > 0 => current_tag.push(c),
            _ => speech.push(c),
        }
    }

    // Unterminated block (e.g. a truncated transcript): close it
    // implicitly so the text collected so far is not lost.
    if depth > 0 {
        flush_tag(&mut current_tag, &mut audio_context);
    }

    ParsedTranscript {
        speech: collapse_whitespace(&speech),
        audio_context,
    }
}

/// Normalise whitespace: every run of whitespace becomes a single
/// ASCII space, and leading/trailing whitespace disappears. Applied to
/// the speech portion once bracketed blocks have been cut out, since
/// removing a block typically leaves two spaces side by side.
fn collapse_whitespace(s: &str) -> String {
    // `split_whitespace` yields the non-empty, whitespace-free pieces,
    // so joining them with one space both collapses and trims.
    let words: Vec<&str> = s.split_whitespace().collect();
    words.join(" ")
}

#[cfg(test)]
mod tests {
    use super::*;

    // --- Basic extraction -------------------------------------------------

    #[test]
    fn pure_speech_has_no_context() {
        let p = parse_transcript("Can you list the files in the directory?");
        assert_eq!(p.speech, "Can you list the files in the directory?");
        assert!(p.audio_context.is_empty());
    }

    #[test]
    fn trailing_paren_extracts_context() {
        let p = parse_transcript("What time is it? (chimes) (smacks lips)");
        assert_eq!(p.speech, "What time is it?");
        assert_eq!(p.audio_context, vec!["chimes", "smacks lips"]);
    }

    #[test]
    fn leading_paren_still_extracts() {
        let p = parse_transcript("(chimes) Hello there");
        assert_eq!(p.speech, "Hello there");
        assert_eq!(p.audio_context, vec!["chimes"]);
    }

    #[test]
    fn pure_paren_has_empty_speech() {
        let p = parse_transcript("(footsteps thudding) (rustling)");
        assert!(p.speech.is_empty());
        assert!(p.is_speech_only_empty());
        assert_eq!(p.audio_context, vec!["footsteps thudding", "rustling"]);
    }

    #[test]
    fn square_brackets_also_work() {
        let p = parse_transcript("Sounds good [music]");
        assert_eq!(p.speech, "Sounds good");
        assert_eq!(p.audio_context, vec!["music"]);
    }

    // --- Whitespace handling ----------------------------------------------

    #[test]
    fn collapses_double_spaces_after_strip() {
        let p = parse_transcript("Hello (sigh) world");
        assert_eq!(p.speech, "Hello world");
    }

    #[test]
    fn collapses_tabs_and_newlines_in_speech() {
        // Any whitespace run, not just spaces, becomes a single space.
        let p = parse_transcript("Hello\t\n(sigh)  world");
        assert_eq!(p.speech, "Hello world");
        assert_eq!(p.audio_context, vec!["sigh"]);
    }

    #[test]
    fn whitespace_inside_tag_is_trimmed() {
        let p = parse_transcript("(  loud bang  )");
        assert_eq!(p.audio_context, vec!["loud bang"]);
    }

    // --- Edge cases ---------------------------------------------------------

    #[test]
    fn empty_input_round_trips() {
        let p = parse_transcript("");
        assert!(p.speech.is_empty());
        assert!(p.audio_context.is_empty());
    }

    #[test]
    fn blank_blocks_produce_no_tag() {
        // Blocks that are empty after trimming are skipped entirely.
        let p = parse_transcript("Hello () world (   )");
        assert_eq!(p.speech, "Hello world");
        assert!(p.audio_context.is_empty());
    }

    #[test]
    fn nested_brackets_flatten_into_one_tag() {
        // One tag per outermost block; inner bracket characters vanish.
        let p = parse_transcript("Okay (crowd [cheering] loudly) thanks");
        assert_eq!(p.speech, "Okay thanks");
        assert_eq!(p.audio_context, vec!["crowd cheering loudly"]);
    }

    #[test]
    fn unbalanced_close_paren_is_tolerated() {
        // Stray `)` shouldn't panic or eat content.
        let p = parse_transcript("hello) world");
        assert_eq!(p.speech, "hello world");
        assert!(p.audio_context.is_empty());
    }

    #[test]
    fn unbalanced_close_bracket_is_tolerated() {
        // Same leniency applies to a stray `]`.
        let p = parse_transcript("hello] world");
        assert_eq!(p.speech, "hello world");
        assert!(p.audio_context.is_empty());
    }
}