// mcpkill 0.1.0 - Docs.rs

/// Length threshold used by `merge_short`: a chunk below it is merged with
/// the chunk that follows.
const MIN_CHUNK_LEN: usize = 80;

/// Split an MCP response text into meaningful chunks.
///
/// Strategy (in order of detection):
///   1. Markdown with headers → split by h1/h2/h3
///   2. JSON array            → one chunk per element
///   3. JSON object           → one chunk per top-level key
///   4. Plain text            → split by blank lines (paragraphs)
///
/// Text counts as Markdown only when at least one line *starts* with an
/// h1/h2/h3 marker (`# `, `## `, `### `). A `# ` that merely appears inside a
/// line (for example in a JSON string value or in prose) does not trigger the
/// Markdown strategy, so such input still reaches the JSON / paragraph
/// strategies.
///
/// Chunks shorter than `MIN_CHUNK_LEN` are merged with the next one.
pub fn chunk(text: &str) -> Vec<String> {
    // Same header rule the Markdown splitter applies per line, so the detector
    // never selects a strategy that would then find nothing to split on.
    let has_header = text.lines().any(|line| {
        line.starts_with("# ") || line.starts_with("## ") || line.starts_with("### ")
    });

    let chunks = if has_header {
        chunk_markdown(text)
    } else if let Ok(value) = serde_json::from_str::<serde_json::Value>(text) {
        chunk_json(&value)
    } else {
        chunk_paragraphs(text)
    };

    merge_short(chunks)
}

/// Split Markdown text into one chunk per h1/h2/h3 section.
///
/// A new chunk begins at every line that starts with `# `, `## ` or `### `.
/// Text preceding the first header becomes its own chunk. Every chunk is
/// trimmed, and chunks that contain only whitespace are dropped.
///
/// Lines inside fenced code blocks (delimited by ``` or `~~~`) are never
/// treated as headers, so `# comment` lines in shell or Python snippets do not
/// cut a snippet in half. An unclosed fence extends to the end of the text.
fn chunk_markdown(text: &str) -> Vec<String> {
    let mut chunks: Vec<String> = Vec::new();
    let mut current = String::new();
    // Marker of the fence we are currently inside, if any. Remembering the
    // marker lets a `~~~` line inside a ``` block be treated as content.
    let mut fence: Option<&'static str> = None;

    for line in text.lines() {
        let trimmed = line.trim_start();

        let is_header = match fence {
            Some(marker) => {
                if trimmed.starts_with(marker) {
                    fence = None;
                }
                false
            }
            None if trimmed.starts_with("```") => {
                fence = Some("```");
                false
            }
            None if trimmed.starts_with("~~~") => {
                fence = Some("~~~");
                false
            }
            None => {
                line.starts_with("# ") || line.starts_with("## ") || line.starts_with("### ")
            }
        };

        if is_header {
            // Close the section collected so far before starting the new one.
            let section = current.trim();
            if !section.is_empty() {
                chunks.push(section.to_string());
            }
            current.clear();
        }

        current.push_str(line);
        current.push('\n');
    }

    let tail = current.trim();
    if !tail.is_empty() {
        chunks.push(tail.to_string());
    }

    chunks
}

/// Turn a parsed JSON value into chunks.
///
/// * Array  → one pretty-printed chunk per element (empty renderings dropped).
/// * Object → one `key: pretty-printed value` chunk per top-level entry.
/// * Any other value → a single chunk holding its compact JSON text.
fn chunk_json(value: &serde_json::Value) -> Vec<String> {
    // Pretty-print a value; a serialization failure degrades to an empty string.
    let render = |v: &serde_json::Value| serde_json::to_string_pretty(v).unwrap_or_default();

    match value {
        serde_json::Value::Array(items) => {
            let mut out = Vec::new();
            for item in items.iter() {
                let rendered = render(item);
                if !rendered.is_empty() {
                    out.push(rendered);
                }
            }
            out
        }
        serde_json::Value::Object(fields) => {
            let mut out = Vec::new();
            for (key, field) in fields.iter() {
                out.push(format!("{}: {}", key, render(field)));
            }
            out
        }
        scalar => vec![scalar.to_string()],
    }
}

/// Split plain text into paragraphs separated by blank lines.
///
/// A line counts as blank when it is empty or contains only whitespace, so
/// CRLF input (`\r\n\r\n`) and separator lines holding stray spaces or tabs
/// split the same way as a bare `\n\n`. Lines within a paragraph are joined
/// with `\n`, each paragraph is trimmed, and empty paragraphs are never
/// produced.
fn chunk_paragraphs(text: &str) -> Vec<String> {
    let mut paragraphs: Vec<String> = Vec::new();
    let mut current = String::new();

    // `lines()` strips both `\n` and `\r\n`, which is what makes CRLF input work.
    for line in text.lines() {
        if line.trim().is_empty() {
            // Blank line: close the paragraph in progress, if there is one.
            if !current.is_empty() {
                paragraphs.push(current.trim().to_string());
                current.clear();
            }
        } else {
            if !current.is_empty() {
                current.push('\n');
            }
            current.push_str(line);
        }
    }

    if !current.is_empty() {
        paragraphs.push(current.trim().to_string());
    }

    paragraphs
}

/// Merge chunks that are too short into the next one.
///
/// Chunks are accumulated left to right: while the accumulated text has fewer
/// than `MIN_CHUNK_LEN` characters, the following chunk is appended to it,
/// separated by a blank line (`"\n\n"`). Length is measured in Unicode scalar
/// values (`char`s), not bytes, so non-ASCII text is held to the same
/// threshold as ASCII text.
///
/// A short chunk at the very end has no successor to merge into and is
/// emitted as-is. Empty input yields an empty vector.
pub fn merge_short(chunks: Vec<String>) -> Vec<String> {
    let limit = MIN_CHUNK_LEN;
    let mut result: Vec<String> = Vec::new();
    let mut carry = String::new();

    for chunk in chunks {
        if carry.is_empty() {
            carry = chunk;
        } else if carry.chars().take(limit).count() < limit {
            // `take(limit)` bounds the scan, so the check stays cheap however
            // long the accumulated text has become.
            carry.push_str("\n\n");
            carry.push_str(&chunk);
        } else {
            result.push(carry);
            carry = chunk;
        }
    }

    if !carry.is_empty() {
        result.push(carry);
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Helper: appends `MIN_CHUNK_LEN` filler characters to `s`, so the result
    /// is guaranteed to be above `MIN_CHUNK_LEN` and is never merged.
    fn long(s: &str) -> String {
        format!("{s} {}", "x".repeat(MIN_CHUNK_LEN))
    }

    /// Two header-led sections, each long enough to survive merging.
    #[test]
    fn markdown_splits_on_headers() {
        let text = format!(
            "## Section A\n{}\n\n## Section B\n{}",
            long("Content about alpha topics."),
            long("Content about beta topics.")
        );
        let chunks = chunk(&text);
        assert!(
            chunks.len() >= 2,
            "expected ≥2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.contains("Section A")));
        assert!(chunks.iter().any(|c| c.contains("Section B")));
    }

    /// Every array element must come back as its own chunk, in order.
    #[test]
    fn json_array_splits_per_element() {
        // Array of objects with enough content to stay as separate chunks.
        let items: Vec<String> = (0..3)
            .map(|i| format!(r#"{{"key_{i}": "{}"}}"#, "v".repeat(MIN_CHUNK_LEN)))
            .collect();
        let text = format!("[{}]", items.join(","));
        let chunks = chunk(&text);
        // Checking only for non-emptiness would also pass if the array were
        // returned as one blob, so pin the count and the per-chunk content.
        assert_eq!(chunks.len(), 3, "expected one chunk per array element");
        for (i, c) in chunks.iter().enumerate() {
            assert!(
                c.contains(&format!("key_{i}")),
                "chunk {i} does not hold element {i}: {c}"
            );
        }
    }

    /// Two long paragraphs separated by a blank line stay separate.
    #[test]
    fn plain_text_splits_on_blank_lines() {
        let text = format!(
            "{}\n\n{}",
            long("First paragraph with substantial content."),
            long("Second paragraph with substantial content.")
        );
        let chunks = chunk(&text);
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn short_chunks_are_merged() {
        // All parts are shorter than MIN_CHUNK_LEN → merge into one
        let short = "Hi.\n\nBye.\n\nOk.";
        let chunks = chunk(short);
        assert_eq!(chunks.len(), 1);
    }

    /// Empty and whitespace-only input produce no chunks at all.
    #[test]
    fn empty_input_returns_empty() {
        assert!(chunk("").is_empty());
        assert!(chunk("   \n\n   ").is_empty());
    }
}