/// Converts free-form prose into a flat, YAML-style list of labelled block scalars.
///
/// The input is trimmed first. If the trimmed text is shorter than 100 bytes, or
/// `looks_like_yaml` accepts it, it is returned (trimmed) without further changes.
/// Otherwise filler phrases are removed with [`strip_filler`], the text is split into
/// paragraphs on blank lines (`"\n\n"`), and every non-empty paragraph becomes one
/// entry of the form `label: |` followed by the paragraph on a single line indented
/// by one space.
///
/// Labels come from `detect_label`; paragraphs without a usable label are named
/// `instruction_N`, where `N` is the 1-based paragraph index. When a label was already
/// used by an earlier paragraph, a numeric suffix (`_2`, `_3`, ...) is appended so the
/// emitted mapping never contains duplicate keys.
///
/// When a paragraph starts with a `#` heading line and has further lines, only the
/// heading line is used for the label and only the remaining lines become the content.
#[must_use]
pub fn prose_to_yaml(text: &str) -> String {
    let text = text.trim();
    if text.len() < 100 || looks_like_yaml(text) {
        return text.to_string();
    }

    let cleaned = strip_filler(text);
    let paragraphs: Vec<&str> = cleaned
        .split("\n\n")
        .map(str::trim)
        .filter(|p| !p.is_empty())
        .collect();
    if paragraphs.is_empty() {
        return text.to_string();
    }

    // Keys already written to `out`; YAML mappings must not repeat a key.
    let mut used_keys = std::collections::HashSet::new();
    let mut out = String::with_capacity(text.len());

    for (idx, &para) in paragraphs.iter().enumerate() {
        // A leading heading line names the entry; it should not be repeated in the body.
        // `para` is trimmed, so anything after the first newline is non-empty.
        let heading_end = if para.starts_with('#') {
            para.find('\n')
        } else {
            None
        };
        let (label_source, body) = match heading_end {
            Some(newline) => (&para[..newline], &para[newline + 1..]),
            None => (para, para),
        };

        // An empty label would produce an entry with no key, so treat it as "no label".
        let base = detect_label(label_source)
            .filter(|label| !label.is_empty())
            .unwrap_or_else(|| format!("instruction_{}", idx + 1));

        // Disambiguate repeated labels: `constraints`, `constraints_2`, ...
        let mut key = base.clone();
        let mut suffix = 2;
        while !used_keys.insert(key.clone()) {
            key = format!("{}_{}", base, suffix);
            suffix += 1;
        }

        let content = body.replace('\n', " ").trim().to_string();
        out.push_str(&key);
        out.push_str(": |\n ");
        out.push_str(&content);
        out.push('\n');
    }
    out
}
/// Removes common filler lead-in phrases (for example "Please note that ") from `text`.
///
/// Matching is ASCII case-insensitive. A phrase is removed only when it begins the
/// text, directly follows `". "`, or directly follows a newline; occurrences in the
/// middle of a sentence are left alone. Every qualifying occurrence is removed, not
/// just the first one. The words after a removed phrase are kept exactly as written
/// (they are not re-capitalised).
#[must_use]
pub fn strip_filler(text: &str) -> String {
    // Order matters: longer phrases come before the shorter phrases they contain
    // (e.g. "make sure to always " before "make sure to ").
    const FILLER_PATTERNS: &[&str] = &[
        "please note that ",
        "it is important to note that ",
        "it's important to note that ",
        "keep in mind that ",
        "make sure to always ",
        "make sure that you ",
        "make sure to ",
        "you should always remember to ",
        "you should always ",
        "always remember to ",
        "remember that ",
        "be sure to ",
        "it is essential that ",
        "it's essential that ",
        "i want you to ",
        "i would like you to ",
        "i need you to ",
    ];

    let mut result = text.to_string();
    // ASCII lowercasing never changes byte lengths, so an offset found in `lower` is
    // valid in `result`. Full Unicode lowercasing can grow or shrink characters
    // and would make the offsets disagree.
    let mut lower = text.to_ascii_lowercase();

    for pattern in FILLER_PATTERNS {
        let mut search_from = 0;
        while let Some(offset) = lower[search_from..].find(*pattern) {
            let start = search_from + offset;
            let end = start + pattern.len();
            let preceding = &result[..start];
            let at_boundary =
                start == 0 || preceding.ends_with(". ") || preceding.ends_with('\n');
            if at_boundary {
                // Keep both strings in lockstep so later offsets stay aligned.
                result.replace_range(start..end, "");
                lower.replace_range(start..end, "");
                search_from = start;
            } else {
                // Mid-sentence use: keep it and look for a later occurrence.
                search_from = end;
            }
        }
    }
    result
}
/// Heuristic: reports whether `text` already resembles a YAML mapping.
///
/// Only the first three lines are inspected. The text qualifies as soon as one of
/// them contains `": "` and does not begin with a space character.
fn looks_like_yaml(text: &str) -> bool {
    for line in text.lines().take(3) {
        let indented = line.starts_with(' ');
        if !indented && line.contains(": ") {
            return true;
        }
    }
    false
}
fn detect_label(para: &str) -> Option<String> {
let trimmed = para.trim();
if let Some(heading) = trimmed.strip_prefix("###") {
return Some(slugify(heading.trim()));
}
if let Some(heading) = trimmed.strip_prefix("##") {
return Some(slugify(heading.trim()));
}
if let Some(heading) = trimmed.strip_prefix('#') {
return Some(slugify(heading.trim()));
}
let first_sentence = trimmed.split('.').next().unwrap_or(trimmed).to_lowercase();
let keywords = [
("role", "role"),
("persona", "role"),
("you are", "role"),
("respond", "response_format"),
("format", "response_format"),
("output", "response_format"),
("constraint", "constraints"),
("rule", "constraints"),
("guideline", "constraints"),
("limitation", "constraints"),
("never", "constraints"),
("do not", "constraints"),
("don't", "constraints"),
("context", "context"),
("background", "context"),
("knowledge", "context"),
("tool", "tools"),
("function", "tools"),
("capability", "capabilities"),
("tone", "tone"),
("style", "tone"),
];
for (keyword, label) in &keywords {
if first_sentence.contains(keyword) {
return Some((*label).to_string());
}
}
None
}
/// Turns arbitrary text into a `snake_case`-style identifier.
///
/// Runs of alphanumeric characters are kept, with ASCII letters lowercased, and
/// joined with single underscores. Every other character acts as a separator, so
/// leading, trailing, and repeated separators never produce empty segments.
fn slugify(text: &str) -> String {
    let words: Vec<String> = text
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| !word.is_empty())
        .map(str::to_ascii_lowercase)
        .collect();
    words.join("_")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Input below the length threshold is handed back untouched.
    #[test]
    fn short_text_returned_as_is() {
        let input = "You are a helpful assistant.";
        let output = prose_to_yaml(input);
        assert_eq!(output, input);
    }

    /// Input that already resembles YAML is not converted again.
    #[test]
    fn yaml_like_text_returned_as_is() {
        let input = "role: assistant\nconstraints: be concise\nformat: plain text responses only please respond in English always";
        let output = prose_to_yaml(input);
        assert_eq!(output, input);
    }

    /// Multi-paragraph prose is rewritten into block-scalar entries.
    #[test]
    fn long_prose_converted() {
        let input = "You are a helpful AI assistant that helps users with their questions.\n\n\
            You should respond concisely and accurately. Format your responses in clean \
            markdown when appropriate. Never include harmful content.\n\n\
            You have access to various tools including search and calendar. Use them wisely.";
        let output = prose_to_yaml(input);
        assert!(output.contains(": |"));
    }

    /// "Please note that" at the start of the text is dropped.
    #[test]
    fn filler_phrases_stripped() {
        let stripped = strip_filler("Please note that responses should be concise.");
        assert_eq!(stripped, "responses should be concise.");
    }

    /// "Keep in mind that" at the start of the text is dropped.
    #[test]
    fn filler_keep_in_mind() {
        let stripped = strip_filler("Keep in mind that accuracy matters.");
        assert_eq!(stripped, "accuracy matters.");
    }

    /// "Make sure to" at the start of the text is dropped.
    #[test]
    fn filler_make_sure() {
        let stripped = strip_filler("Make sure to respond in English.");
        assert_eq!(stripped, "respond in English.");
    }

    /// Text without any filler phrase comes back unchanged.
    #[test]
    fn no_filler_unchanged() {
        let stripped = strip_filler("Be concise.");
        assert_eq!(stripped, "Be concise.");
    }

    /// A markdown heading is slugified into the label.
    #[test]
    fn markdown_heading_detected() {
        assert_eq!(
            detect_label("## Response Format"),
            Some("response_format".to_string())
        );
    }

    /// "You are" maps to the `role` label.
    #[test]
    fn role_keyword_detected() {
        assert_eq!(
            detect_label("You are a helpful assistant"),
            Some("role".to_string())
        );
    }

    /// "Never" maps to the `constraints` label.
    #[test]
    fn constraint_keyword_detected() {
        assert_eq!(
            detect_label("Never include harmful content in answers"),
            Some("constraints".to_string())
        );
    }

    /// "tools" maps to the `tools` label.
    #[test]
    fn tool_keyword_detected() {
        assert_eq!(
            detect_label("You have access to various tools"),
            Some("tools".to_string())
        );
    }

    /// Words are lowercased and joined with underscores.
    #[test]
    fn slugify_basic() {
        let slug = slugify("Hello World");
        assert_eq!(slug, "hello_world");
    }

    /// Trailing punctuation does not leave stray underscores.
    #[test]
    fn slugify_special_chars() {
        let slug = slugify("Response Format!!!");
        assert_eq!(slug, "response_format");
    }

    /// Text without a heading or known keyword yields no label.
    #[test]
    fn no_label_returns_none() {
        assert_eq!(detect_label("Lorem ipsum dolor sit amet"), None);
    }
}