use anyhow::{Context, Result};
use clap::Args;
use std::path::PathBuf;
#[derive(Debug, Args)]
pub struct GenerateConfigArgs {
#[arg(short = 'l', long, value_name = "CODE", required = true)]
pub language_code: String,
#[arg(short, long, value_name = "FILE", required = true)]
pub output: PathBuf,
}
impl GenerateConfigArgs {
pub fn execute(&self) -> Result<()> {
use std::fs;
println!("Generating language configuration template...");
println!(" Language code: {}", self.language_code);
println!(" Output file: {}", self.output.display());
let template = self.generate_template();
fs::write(&self.output, template)
.with_context(|| format!("Failed to write to {}", self.output.display()))?;
println!("✓ Configuration template generated successfully!");
println!();
println!("Next steps:");
println!("1. Edit the configuration file to customize language rules");
println!("2. Validate your configuration:");
println!(
" sakurs validate --language-config {}",
self.output.display()
);
println!("3. Use it for processing:");
println!(
" sakurs process -i input.txt --language-config {}",
self.output.display()
);
Ok(())
}
fn generate_template(&self) -> String {
format!(
r#"# Language configuration for {}
[metadata]
code = "{}"
name = "Custom Language"
# Sentence terminator characters
[terminators]
# Basic sentence-ending punctuation
chars = [".", "!", "?"]
# Multi-character terminator patterns (optional)
patterns = [
# Example: {{ pattern = "!?", name = "surprised_question" }}
]
# Ellipsis handling
[ellipsis]
# Whether ellipsis should be treated as sentence boundary by default
treat_as_boundary = true
# Ellipsis patterns to recognize
patterns = ["...", "…"]
# Context-based rules for ellipsis
context_rules = [
{{ condition = "followed_by_capital", boundary = true }},
{{ condition = "followed_by_lowercase", boundary = false }}
]
# Exception patterns (regex) - patterns where ellipsis should NOT be a boundary
exceptions = [
# Example: {{ regex = "\\b(um|uh|er)\\.\\.\\.", boundary = false }}
]
# Enclosure pairs (quotes, parentheses, etc.)
[enclosures]
pairs = [
{{ open = "(", close = ")" }},
{{ open = "[", close = "]" }},
{{ open = "{{", close = "}}" }},
{{ open = '"', close = '"', symmetric = true }},
{{ open = "'", close = "'", symmetric = true }}
]
# Suppression rules for preventing false boundaries
[suppression]
# Fast pattern matching for enclosure suppression
fast_patterns = [
# Apostrophes in contractions
{{ char = "'", before = "alpha", after = "alpha" }},
# List items at line start: 1) item
{{ char = ")", line_start = true, before = "alnum" }}
]
# Regex patterns for more complex suppression (optional)
regex_patterns = [
# Example: {{ pattern = "\\d+'\\d+\"", description = "Feet and inches like 5'9\"" }}
]
# Abbreviations organized by category
[abbreviations]
# Add your abbreviations here, organized by category
# Category names are arbitrary - choose what makes sense for your language
titles = ["Dr", "Mr", "Mrs", "Ms", "Prof"]
academic = ["Ph.D", "M.D", "B.A", "M.A"]
business = ["Inc", "Corp", "Ltd", "LLC", "Co"]
common = ["etc", "vs", "e.g", "i.e"]
# Add more categories as needed:
# geographic = ["St", "Ave", "Blvd"]
# measurement = ["oz", "lb", "kg", "km"]
# month = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Sentence starters configuration
[sentence_starters]
# Whether to require whitespace after the sentence starter (default: true)
# When true, "The patient" matches but "Theater" does not
require_following_space = true
# Minimum word length to consider (default: 1)
min_word_length = 1
# Categories of sentence starter words
# Words should be written exactly as they should be matched
pronouns = ["I", "You", "He", "She", "It", "We", "They"]
articles = ["The", "A", "An"]
demonstratives = ["This", "That", "These", "Those"]
conjunctions = ["However", "But", "And", "So", "Therefore"]
interrogatives = ["What", "When", "Where", "Why", "How", "Who"]
# Add more categories as needed
"#,
self.language_code, self.language_code
)
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
#[test]
fn test_generate_config_args_debug() {
let args = GenerateConfigArgs {
language_code: "fr".to_string(),
output: PathBuf::from("french.toml"),
};
let debug_str = format!("{:?}", args);
assert!(debug_str.contains("GenerateConfigArgs"));
assert!(debug_str.contains("fr"));
assert!(debug_str.contains("french.toml"));
}
#[test]
fn test_generate_template() {
let args = GenerateConfigArgs {
language_code: "test".to_string(),
output: PathBuf::from("test.toml"),
};
let template = args.generate_template();
assert!(template.contains("code = \"test\""));
assert!(template.contains("[metadata]"));
assert!(template.contains("[terminators]"));
assert!(template.contains("[abbreviations]"));
assert!(template.contains("[sentence_starters]"));
}
#[test]
fn test_execute_success() {
let temp_dir = TempDir::new().unwrap();
let output_path = temp_dir.path().join("test_config.toml");
let args = GenerateConfigArgs {
language_code: "test".to_string(),
output: output_path.clone(),
};
assert!(args.execute().is_ok());
assert!(output_path.exists());
let content = std::fs::read_to_string(&output_path).unwrap();
assert!(content.contains("code = \"test\""));
}
}