ai_tokenopt 0.5.10

//! Prose-to-YAML system prompt conversion.
//!
//! Converts freeform natural-language system prompts into a compact
//! YAML-structured format that uses fewer tokens while preserving
//! semantic content. Also strips common filler phrases.

/// Convert a prose system prompt into a compact YAML-like format.
///
/// Splits the prompt into labeled sections where possible, strips
/// filler phrases, and re-formats as a structured YAML block.
///
/// # How it works
///
/// 1. Normalize Windows line endings and strip filler phrases
///    (e.g. "Please note that", "It is important to")
/// 2. Split into paragraphs on blank lines
/// 3. Label paragraphs using heading/keyword detection, falling back to
///    `instruction_N` auto-numbering
/// 4. Make labels unique by appending `_2`, `_3`, ... to repeats
/// 5. Output each paragraph as a `label: |` block with its lines joined
///    by single spaces
///
/// If the trimmed input is already short (< 100 bytes) or looks like YAML,
/// it is returned (trimmed) as-is.
///
/// # Examples
///
/// ```
/// use ai_tokenopt::prompt::structured::prose_to_yaml;
///
/// let short = "You are a helpful assistant.";
/// // Short text is returned as-is:
/// assert_eq!(prose_to_yaml(short), short);
/// ```
#[must_use]
pub fn prose_to_yaml(text: &str) -> String {
    let text = text.trim();

    // Skip if already YAML-ish or very short (threshold is in bytes).
    if text.len() < 100 || looks_like_yaml(text) {
        return text.to_string();
    }

    // Normalize CRLF first: otherwise "\r\n\r\n" never matches the "\n\n"
    // paragraph separator and stray '\r' characters end up in the output.
    let cleaned = strip_filler(&text.replace("\r\n", "\n"));
    let paragraphs: Vec<&str> = cleaned
        .split("\n\n")
        .map(str::trim)
        .filter(|p| !p.is_empty())
        .collect();

    if paragraphs.is_empty() {
        return text.to_string();
    }

    let mut out = String::with_capacity(text.len());
    // Labels already emitted; a mapping must not repeat a key.
    let mut used_labels = std::collections::HashSet::new();

    for (idx, para) in paragraphs.iter().enumerate() {
        // An empty detected label would produce a key-less ": |" entry,
        // so treat it the same as "no label detected".
        let base = detect_label(para)
            .filter(|label| !label.is_empty())
            .unwrap_or_else(|| format!("instruction_{}", idx + 1));

        // First use keeps the plain label; repeats get the first free
        // numeric suffix (`role`, `role_2`, `role_3`, ...).
        let mut label = base.clone();
        let mut suffix = 1;
        while !used_labels.insert(label.clone()) {
            suffix += 1;
            label = format!("{}_{}", base, suffix);
        }

        let content = para.replace('\n', " ");
        out.push_str(&label);
        out.push_str(": |\n  ");
        out.push_str(content.trim());
        out.push('\n');
    }

    out
}

/// Strip common filler phrases that consume tokens but add no meaning.
///
/// Removes phrases like:
/// - "Please note that"
/// - "It is important to note that"
/// - "Keep in mind that"
/// - "Make sure to"
/// - "You should always"
/// - "Remember that"
///
/// Matching is ASCII-case-insensitive. A phrase is only removed when it
/// starts a sentence: at the very start of the text, right after `". "`,
/// or right after a newline. Every such occurrence is removed; occurrences
/// in the middle of a sentence are left alone. Patterns are applied one
/// after another in list order, each in a single left-to-right pass.
///
/// # Examples
///
/// ```
/// use ai_tokenopt::prompt::structured::strip_filler;
///
/// let text = "Please note that you should respond concisely.";
/// let result = strip_filler(text);
/// assert_eq!(result, "you should respond concisely.");
/// ```
#[must_use]
pub fn strip_filler(text: &str) -> String {
    // Longer variants come before the shorter phrases they contain so the
    // longest filler is removed in one go.
    const FILLER_PATTERNS: &[&str] = &[
        "please note that ",
        "it is important to note that ",
        "it's important to note that ",
        "keep in mind that ",
        "make sure to always ",
        "make sure that you ",
        "make sure to ",
        "you should always remember to ",
        "you should always ",
        "always remember to ",
        "remember that ",
        "be sure to ",
        "it is essential that ",
        "it's essential that ",
        "i want you to ",
        "i would like you to ",
        "i need you to ",
    ];

    let mut result = text.to_string();

    for &pattern in FILLER_PATTERNS {
        // ASCII lowercasing never changes byte lengths, so every offset found
        // in `lower` is also a valid offset (and char boundary) in `result`.
        // Full Unicode lowercasing does not have that property.
        let mut lower = result.to_ascii_lowercase();
        let mut search_from = 0;

        while let Some(offset) = lower[search_from..].find(pattern) {
            let start = search_from + offset;
            let end = start + pattern.len();

            // Only strip at a sentence boundary.
            let before = &result[..start];
            let at_boundary = start == 0 || before.ends_with(". ") || before.ends_with('\n');

            if at_boundary {
                // Keep both strings in lock-step so offsets stay aligned.
                result.replace_range(start..end, "");
                lower.replace_range(start..end, "");
                // Re-check the same position: the text that slid into place
                // is itself at a sentence boundary.
                search_from = start;
            } else {
                search_from = end;
            }
        }
    }

    result
}

/// Heuristic: does the text already appear to be YAML-structured?
///
/// Inspects at most the first three lines and reports `true` as soon as one
/// of them contains a `": "` separator and does not begin with a space.
fn looks_like_yaml(text: &str) -> bool {
    for candidate in text.lines().take(3) {
        let is_indented = candidate.starts_with(' ');
        if !is_indented && candidate.contains(": ") {
            return true;
        }
    }
    false
}

/// Try to detect a section label from a paragraph's content.
///
/// Looks for markdown-style headings or common prompt section keywords.
fn detect_label(para: &str) -> Option<String> {
    let trimmed = para.trim();

    // Markdown heading: "## Section Name" or "# Section Name"
    if let Some(heading) = trimmed.strip_prefix("###") {
        return Some(slugify(heading.trim()));
    }
    if let Some(heading) = trimmed.strip_prefix("##") {
        return Some(slugify(heading.trim()));
    }
    if let Some(heading) = trimmed.strip_prefix('#') {
        return Some(slugify(heading.trim()));
    }

    // Detect common section keywords in the first sentence
    let first_sentence = trimmed.split('.').next().unwrap_or(trimmed).to_lowercase();

    let keywords = [
        ("role", "role"),
        ("persona", "role"),
        ("you are", "role"),
        ("respond", "response_format"),
        ("format", "response_format"),
        ("output", "response_format"),
        ("constraint", "constraints"),
        ("rule", "constraints"),
        ("guideline", "constraints"),
        ("limitation", "constraints"),
        ("never", "constraints"),
        ("do not", "constraints"),
        ("don't", "constraints"),
        ("context", "context"),
        ("background", "context"),
        ("knowledge", "context"),
        ("tool", "tools"),
        ("function", "tools"),
        ("capability", "capabilities"),
        ("tone", "tone"),
        ("style", "tone"),
    ];

    for (keyword, label) in &keywords {
        if first_sentence.contains(keyword) {
            return Some((*label).to_string());
        }
    }

    None
}

/// Convert a string to a snake_case slug.
///
/// Every alphanumeric character (Unicode-aware) is kept and lowercased;
/// each run of any other characters becomes a single `_`. Separators at the
/// start or end of the input are dropped, so the result never begins or
/// ends with `_`. Input without any alphanumeric character yields an empty
/// string.
fn slugify(text: &str) -> String {
    let mut slug = String::with_capacity(text.len());
    // Set when a separator run has been seen since the last kept character.
    let mut pending_separator = false;

    for c in text.chars() {
        if c.is_alphanumeric() {
            // Emit the separator lazily so leading/trailing runs vanish.
            if pending_separator && !slug.is_empty() {
                slug.push('_');
            }
            pending_separator = false;
            // Unicode lowercasing, matching the Unicode-aware test above;
            // a single character may lowercase to several.
            slug.extend(c.to_lowercase());
        } else {
            pending_separator = true;
        }
    }

    slug
}

#[cfg(test)]
mod tests {
    use super::*;

    // ---- prose_to_yaml ----

    /// Input below the length threshold is passed through untouched.
    #[test]
    fn short_text_returned_as_is() {
        let input = "You are a helpful assistant.";
        let output = prose_to_yaml(input);
        assert_eq!(output, input);
    }

    /// Input that already has `key: value` lines is passed through untouched.
    #[test]
    fn yaml_like_text_returned_as_is() {
        let input = "role: assistant\nconstraints: be concise\nformat: plain text responses only please respond in English always";
        let output = prose_to_yaml(input);
        assert_eq!(output, input);
    }

    /// A long multi-paragraph prompt is rewritten into `label: |` blocks.
    #[test]
    fn long_prose_converted() {
        let input = concat!(
            "You are a helpful AI assistant that helps users with their questions.\n\n",
            "You should respond concisely and accurately. Format your responses in clean ",
            "markdown when appropriate. Never include harmful content.\n\n",
            "You have access to various tools including search and calendar. Use them wisely.",
        );
        let converted = prose_to_yaml(input);
        assert!(converted.contains(": |"));
    }

    // ---- strip_filler ----

    #[test]
    fn filler_phrases_stripped() {
        let stripped = strip_filler("Please note that responses should be concise.");
        assert_eq!(stripped, "responses should be concise.");
    }

    #[test]
    fn filler_keep_in_mind() {
        let stripped = strip_filler("Keep in mind that accuracy matters.");
        assert_eq!(stripped, "accuracy matters.");
    }

    #[test]
    fn filler_make_sure() {
        let stripped = strip_filler("Make sure to respond in English.");
        assert_eq!(stripped, "respond in English.");
    }

    /// Text without any filler phrase comes back unchanged.
    #[test]
    fn no_filler_unchanged() {
        let input = "Be concise.";
        assert_eq!(strip_filler(input), input);
    }

    // ---- detect_label ----

    #[test]
    fn markdown_heading_detected() {
        assert_eq!(
            detect_label("## Response Format"),
            Some(String::from("response_format"))
        );
    }

    #[test]
    fn role_keyword_detected() {
        assert_eq!(
            detect_label("You are a helpful assistant"),
            Some(String::from("role"))
        );
    }

    #[test]
    fn constraint_keyword_detected() {
        assert_eq!(
            detect_label("Never include harmful content in answers"),
            Some(String::from("constraints"))
        );
    }

    #[test]
    fn tool_keyword_detected() {
        assert_eq!(
            detect_label("You have access to various tools"),
            Some(String::from("tools"))
        );
    }

    /// A paragraph with no heading and no known keyword gets no label.
    #[test]
    fn no_label_returns_none() {
        assert_eq!(detect_label("Lorem ipsum dolor sit amet"), None);
    }

    // ---- slugify ----

    #[test]
    fn slugify_basic() {
        let slug = slugify("Hello World");
        assert_eq!(slug, "hello_world");
    }

    #[test]
    fn slugify_special_chars() {
        let slug = slugify("Response Format!!!");
        assert_eq!(slug, "response_format");
    }
}