use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use nab::analyze::{AudioExtractor, TranscribeOptions, default_backend};
use rust_mcp_sdk::McpServer;
use rust_mcp_sdk::macros::{JsonSchema, mcp_tool};
use rust_mcp_sdk::schema::{CallToolResult, TextContent, schema_utils::CallToolError};
use serde::{Deserialize, Serialize};
use crate::hebb_client::HebbClient;
// MCP tool input schema for the `analyze` transcription tool.
// NOTE: the #[mcp_tool] description string and the derives are behavior-bearing
// (they feed the advertised tool schema), so only `//` comments are added here.
#[mcp_tool(
    name = "analyze",
    description = "Transcribe audio or video file with multilingual SOTA ASR.
Returns JSON with text, segments, word-level timestamps, and optional speaker
diarization.
Supported inputs:
- Audio: .wav, .mp3, .flac, .m4a, .aac, .ogg (passed directly to ASR)
- Video: .mp4, .mkv, .mov, .avi, .webm (audio extracted via ffmpeg first)
Language support (Parakeet TDT v3, macOS Apple Silicon):
- English, German, French, Spanish, Italian, Portuguese, Dutch, Polish, Russian,
Ukrainian, Czech, Slovak, Romanian, Hungarian, Finnish, Swedish, Danish,
Norwegian, Greek, Turkish, Arabic, Hebrew, Hindi, Japanese, Chinese
Backend:
- macOS Apple Silicon: FluidAudio (CoreML, Neural Engine, ~143× realtime)
- Other platforms: returns backend unavailability error
Active reading (active_reading=true):
- Identifies papers, people, tools, and claims in the transcript via MCP sampling
- Fetches and summarises each reference
- Inlines numbered footnotes into the transcript segments
- Requires the MCP client to support sampling/createMessage
Returns: JSON-serialized TranscriptionResult with segments, language, RTFx,
processing time, optional speaker diarization, and optional footnotes.",
    read_only_hint = true,
    open_world_hint = false
)]
#[derive(Debug, Deserialize, Serialize, JsonSchema)]
pub struct AnalyzeTool {
    // Filesystem path to the input audio or video file; existence is checked in run().
    pub input: String,
    // Optional language hint forwarded verbatim to TranscribeOptions::language.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    // When true, request speaker diarization from the ASR backend.
    #[serde(default)]
    pub diarize: bool,
    // NOTE(review): accepted but never consulted by run(), which always uses
    // default_backend() — confirm whether backend selection was intended.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub backend: Option<String>,
    // When true, run the active-reading footnote pipeline after transcription
    // (requires an MCP client that supports sampling/createMessage).
    #[serde(default)]
    pub active_reading: bool,
    // When true, request per-speaker voice embeddings; together with `diarize`
    // this enables Hebb voice matching in run().
    #[serde(default)]
    pub include_embeddings: bool,
}
impl AnalyzeTool {
    /// Execute the analyze tool end to end: validate the input path, extract
    /// audio from video if needed, transcribe, then apply optional
    /// post-processing (Hebb speaker naming, active reading).
    ///
    /// Returns the `TranscriptionResult` serialized as pretty JSON text plus
    /// the same value as MCP structured content. Any extracted temp WAV is
    /// deleted on both success and transcription failure.
    pub async fn run(&self, runtime: &Arc<dyn McpServer>) -> Result<CallToolResult, CallToolError> {
        let input_path = PathBuf::from(&self.input);
        tracing::info!(
            input = %self.input,
            language = ?self.language,
            diarize = self.diarize,
            active_reading = self.active_reading,
            "analyze start"
        );
        if !input_path.exists() {
            return Err(CallToolError::from_message(format!(
                "File not found: {}",
                self.input
            )));
        }
        // Probe backend availability BEFORE extracting audio: previously an
        // unavailable backend errored out after extraction and leaked the
        // temp WAV.
        let backend = default_backend();
        tracing::info!(backend = %backend.name(), "using ASR backend");
        if !backend.is_available() {
            return Err(CallToolError::from_message(format!(
                "ASR backend '{}' is not available on this platform. \
                Install fluidaudiocli with `nab models fetch fluidaudio` or build from \
                https://github.com/FluidInference/FluidAudio",
                backend.name()
            )));
        }
        let audio_path = extract_audio_if_needed(&input_path).await?;
        // True when `audio_path` is a temp file we created and must delete.
        let owns_temp = audio_path != input_path;
        let opts = TranscribeOptions {
            language: self.language.clone(),
            word_timestamps: true,
            diarize: self.diarize,
            max_duration_seconds: None,
            include_embeddings: self.include_embeddings,
        };
        let mut result = match backend.transcribe(&audio_path, opts).await {
            Ok(r) => r,
            Err(e) => {
                // Previously this path early-returned via `?` and leaked the
                // extracted temp file; clean up before propagating the error.
                if owns_temp {
                    let _ = tokio::fs::remove_file(&audio_path).await;
                }
                return Err(CallToolError::from_message(format!(
                    "transcription failed: {e}"
                )));
            }
        };
        tracing::info!(
            segments = result.segments.len(),
            rtfx = result.rtfx,
            backend = %result.backend,
            "analyze complete"
        );
        // Speaker naming needs embeddings, which are only produced when both
        // diarization and embedding extraction were requested.
        if self.diarize
            && self.include_embeddings
            && let Some(ref mut speakers) = result.speakers
        {
            let speaker_map = match match_speakers_with_hebb(speakers).await {
                Ok(m) => m,
                Err(e) => {
                    // Best-effort: a Hebb failure never fails the transcription.
                    tracing::warn!("hebb voice match skipped: {e}");
                    HashMap::new()
                }
            };
            if !speaker_map.is_empty() {
                apply_speaker_names(&mut result.segments, &speaker_map);
            }
        }
        if self.active_reading {
            apply_active_reading(&mut result, runtime).await;
        }
        // Success-path cleanup of the extracted temp WAV (best-effort).
        if owns_temp {
            let _ = tokio::fs::remove_file(&audio_path).await;
        }
        let json = serde_json::to_string_pretty(&result)
            .map_err(|e| CallToolError::from_message(format!("serialization failed: {e}")))?;
        let structured = serde_json::to_value(&result)
            .ok()
            .and_then(|v| v.as_object().cloned());
        let mut call_result = CallToolResult::text_content(vec![TextContent::from(json)]);
        call_result.structured_content = structured;
        Ok(call_result)
    }
}
/// Run the active-reading pipeline (reference identification, fetching,
/// footnote generation) over a finished transcript, mutating `result`.
///
/// Best-effort: if the MCP client lacks sampling support, the HTTP client
/// cannot be built, or the pipeline itself fails, a warning is logged and
/// the passive transcript is left untouched.
async fn apply_active_reading(
    result: &mut nab::analyze::TranscriptionResult,
    runtime: &Arc<dyn McpServer>,
) {
    use crate::active_reading_mcp::{McpLlmSampler, NabUrlFetcher};
    use nab::analyze::{ActiveReader, ActiveReadingConfig};

    // Active reading drives an LLM through MCP sampling/createMessage;
    // bail out early when the connected client cannot serve those requests.
    if !crate::sampling::is_supported(runtime) {
        tracing::warn!(
            "active reading requested but the MCP client does not support sampling; \
            falling back to passive transcription"
        );
        return;
    }

    let sampler = McpLlmSampler::new(runtime.clone());
    let http_client = match nab::AcceleratedClient::new() {
        Ok(c) => Arc::new(c),
        Err(e) => {
            tracing::warn!("active reading: could not create HTTP client: {e}");
            return;
        }
    };
    let fetcher = NabUrlFetcher::new(http_client);
    let mut reader = ActiveReader::new(&sampler, &fetcher, ActiveReadingConfig::default());

    match reader.process(result).await {
        Err(e) => {
            tracing::warn!("active reading failed: {e}; returning passive transcript");
        }
        Ok(output) => {
            tracing::info!(
                footnotes = output.footnotes.len(),
                tokens_spent = output.metadata.tokens_spent,
                "active reading complete"
            );
            result.footnotes = Some(output.footnotes);
            result.active_reading = Some(output.metadata);
        }
    }
}
/// Minimum cosine similarity for a Hebb voice match to be accepted.
const VOICE_MATCH_THRESHOLD: f32 = 0.7;
/// Maximum number of candidate matches requested per embedding.
const VOICE_MATCH_LIMIT: u32 = 3;

/// Map diarized speaker labels (e.g. "SPEAKER_00") to known names by matching
/// their voice embeddings against the Hebb database.
///
/// Returns an empty map when Hebb is unavailable. Individual `voice_match`
/// failures are logged and skipped rather than failing the whole lookup.
async fn match_speakers_with_hebb(
    speakers: &[nab::analyze::AsrSpeakerSegment],
) -> anyhow::Result<HashMap<String, String>> {
    if !HebbClient::is_available() {
        return Ok(HashMap::new());
    }
    let client_arc = HebbClient::global().await?;
    let mut map = HashMap::new();
    for seg in speakers {
        // A speaker usually spans many segments. One resolved name per label
        // is enough — previously every segment of an already-matched label
        // triggered another voice_match round-trip (and later results
        // silently overwrote earlier ones). First successful match wins.
        if map.contains_key(&seg.speaker) {
            continue;
        }
        let embedding = match &seg.embedding {
            Some(e) if !e.is_empty() => e,
            _ => continue,
        };
        // Lock scope is one call; the guard drops at the end of each iteration.
        let mut client = client_arc.lock().await;
        match client
            .voice_match(embedding, VOICE_MATCH_THRESHOLD, VOICE_MATCH_LIMIT)
            .await
        {
            Ok(matches) => {
                if let Some(best) = matches
                    .into_iter()
                    .find(|m| m.similarity >= VOICE_MATCH_THRESHOLD)
                    && let Some(name) = best.name
                {
                    map.insert(seg.speaker.clone(), name);
                }
            }
            Err(e) => {
                tracing::warn!(speaker = %seg.speaker, "voice_match failed: {e}");
            }
        }
    }
    Ok(map)
}
fn apply_speaker_names(
segments: &mut [nab::analyze::AsrTranscriptSegment],
speaker_map: &HashMap<String, String>,
) {
for seg in segments {
if let Some(label) = &seg.speaker
&& let Some(name) = speaker_map.get(label)
{
seg.speaker = Some(name.clone());
}
}
}
/// Extensions treated as directly-ingestible audio (no ffmpeg extraction step).
const AUDIO_EXTENSIONS: &[&str] = &["wav", "mp3", "flac", "m4a", "aac", "ogg", "opus"];

/// Returns true when `path` has a recognised audio extension, compared
/// case-insensitively. Paths with no extension (or a non-UTF-8 one) return
/// false.
fn is_audio_file(path: &std::path::Path) -> bool {
    path.extension()
        .and_then(|e| e.to_str())
        // eq_ignore_ascii_case avoids the per-call String allocation the
        // previous to_ascii_lowercase() incurred; extensions are ASCII.
        .is_some_and(|ext| AUDIO_EXTENSIONS.iter().any(|a| a.eq_ignore_ascii_case(ext)))
}
/// Return `input` unchanged for audio files; for anything else (video),
/// extract the audio track to a uniquely named temp WAV via ffmpeg and
/// return that path. Callers own — and must delete — the returned temp file
/// whenever it differs from `input`.
async fn extract_audio_if_needed(input: &std::path::Path) -> Result<PathBuf, CallToolError> {
    if is_audio_file(input) {
        return Ok(input.to_path_buf());
    }
    // The process id alone is NOT unique within one server process: two
    // concurrent analyze requests would clobber each other's temp WAV.
    // Append a monotonically increasing per-process sequence number.
    static EXTRACT_SEQ: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
    let seq = EXTRACT_SEQ.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let tmp_path = std::env::temp_dir().join(format!(
        "nab_analyze_{}_{seq}.wav",
        std::process::id()
    ));
    tracing::info!(
        video = %input.display(),
        output = %tmp_path.display(),
        "extracting audio from video"
    );
    AudioExtractor::new()
        .extract(input, &tmp_path)
        .await
        .map_err(|e| CallToolError::from_message(format!("audio extraction failed: {e}")))?;
    Ok(tmp_path)
}
#[cfg(test)]
mod tests {
    use nab::analyze::{AsrSpeakerSegment, AsrTranscriptSegment};
    use super::*;

    /// Build a one-second "hello" segment carrying the given diarization label.
    fn make_segment(speaker: Option<&str>) -> AsrTranscriptSegment {
        AsrTranscriptSegment {
            text: String::from("hello"),
            start: 0.0,
            end: 1.0,
            confidence: 1.0,
            language: None,
            words: None,
            speaker: speaker.map(str::to_owned),
        }
    }

    /// Build a one-second speaker segment with an optional voice embedding.
    fn make_speaker(label: &str, embedding: Option<Vec<f32>>) -> AsrSpeakerSegment {
        AsrSpeakerSegment {
            speaker: String::from(label),
            start: 0.0,
            end: 1.0,
            embedding,
        }
    }

    #[test]
    fn apply_speaker_names_replaces_matched_label() {
        let map = HashMap::from([("SPEAKER_00".to_string(), "Alice".to_string())]);
        let mut segments = vec![make_segment(Some("SPEAKER_00"))];
        apply_speaker_names(&mut segments, &map);
        assert_eq!(segments[0].speaker.as_deref(), Some("Alice"));
    }

    #[test]
    fn apply_speaker_names_leaves_unmatched_label_unchanged() {
        let map = HashMap::from([("SPEAKER_00".to_string(), "Alice".to_string())]);
        let mut segments = vec![make_segment(Some("SPEAKER_01"))];
        apply_speaker_names(&mut segments, &map);
        // No entry for SPEAKER_01 — the label must survive untouched.
        assert_eq!(segments[0].speaker.as_deref(), Some("SPEAKER_01"));
    }

    #[test]
    fn apply_speaker_names_skips_segments_without_speaker() {
        let map: HashMap<String, String> = HashMap::new();
        let mut segments = vec![make_segment(None)];
        apply_speaker_names(&mut segments, &map);
        assert!(segments[0].speaker.is_none());
    }

    #[tokio::test]
    async fn match_speakers_with_hebb_returns_empty_when_hebb_unavailable() {
        let speakers = vec![make_speaker("SPEAKER_00", Some(vec![0.1; 256]))];
        // Only meaningful when Hebb is absent; skip otherwise.
        if HebbClient::is_available() {
            return;
        }
        let result = match_speakers_with_hebb(&speakers).await;
        assert!(result.is_ok(), "expected Ok, got: {result:?}");
        assert!(result.unwrap().is_empty());
    }
}