use std::collections::HashMap;
use std::fmt::Write as _;
use std::io::{self, Read};
use std::path::PathBuf;
use mecab_ko::Tokenizer;
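
/// Runtime configuration assembled from the command-line flags.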
struct Config {
top_n: usize,
min_length: usize,
pos_tags: Vec<String>,
dict_path: Option<PathBuf>,
text: Option<String>,
}

impl Default for Config {
fn default() -> Self {
Self {
top_n: 5,
min_length: 2,
pos_tags: vec!["NNG".to_string(), "NNP".to_string()],
dict_path: None,
text: None,
}
}
}
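
/// A keyword candidate: its surface form, raw count, and relative-frequency score.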
#[derive(Debug, Clone)]
struct Keyword {
term: String,
frequency: usize,
score: f64,
}
impl Keyword {
    fn new(term: String, frequency: usize, total_terms: usize) -> Self {
        // The score is the term's relative frequency among the kept terms;
        // guard against an empty corpus to avoid dividing by zero.
        let score = if total_terms > 0 {
            frequency as f64 / total_terms as f64
        } else {
            0.0
        };
Self {
term,
frequency,
score,
}
}
}
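
/// Parses command-line arguments by hand. Missing or invalid values and
/// unknown flags produce a warning on stderr and are otherwise skipped
/// rather than aborting the run.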
fn parse_args() -> Config {
let mut config = Config::default();
let args: Vec<String> = std::env::args().skip(1).collect();
let mut i = 0;
while i < args.len() {
match args[i].as_str() {
"--top" | "-n" => {
if i + 1 < args.len() {
if let Ok(n) = args[i + 1].parse::<usize>() {
config.top_n = n;
} else {
eprintln!("Invalid top-N value: {}", args[i + 1]);
}
i += 2;
} else {
eprintln!("Missing top-N value.");
i += 1;
}
}
"--min-length" | "-m" => {
if i + 1 < args.len() {
if let Ok(len) = args[i + 1].parse::<usize>() {
config.min_length = len;
} else {
eprintln!("Invalid min-length value: {}", args[i + 1]);
}
i += 2;
} else {
eprintln!("Missing min-length value.");
i += 1;
}
}
"--pos" | "-p" => {
if i + 1 < args.len() {
config.pos_tags = args[i + 1]
.split(',')
.map(|s| s.trim().to_string())
.collect();
i += 2;
} else {
eprintln!("Missing POS tags value.");
i += 1;
}
}
"--dict-path" | "-d" => {
if i + 1 < args.len() {
config.dict_path = Some(PathBuf::from(&args[i + 1]));
i += 2;
} else {
eprintln!("Missing dictionary path.");
i += 1;
}
}
"--text" | "-t" => {
if i + 1 < args.len() {
config.text = Some(args[i + 1].clone());
i += 2;
} else {
eprintln!("Missing text value.");
i += 1;
}
}
"--help" | "-h" => {
print_help();
std::process::exit(0);
}
_ => {
eprintln!("Unknown argument: {}", args[i]);
i += 1;
}
}
}
config
}

fn print_help() {
    println!("Keyword Extractor - Extract keywords from Korean text");
    println!();
    println!("USAGE:");
    println!("    keyword_extractor [OPTIONS]");
    println!();
    println!("OPTIONS:");
    println!("    -n, --top <N>           Number of keywords to extract (default: 5)");
    println!("    -m, --min-length <LEN>  Minimum keyword length (default: 2)");
    println!("    -p, --pos <TAGS>        POS tags to include (comma-separated, default: NNG,NNP)");
    println!("    -d, --dict-path <PATH>  Custom dictionary path");
    println!("    -t, --text <TEXT>       Text to analyze (reads from stdin if not provided)");
    println!("    -h, --help              Print help information");
    println!();
    println!("EXAMPLES:");
    println!("    # From stdin");
    println!("    echo \"텍스트\" | keyword_extractor");
    println!();
    println!("    # From command line");
    println!("    keyword_extractor --text \"인공지능은 미래 기술입니다\"");
    println!();
    println!("    # Custom settings");
    println!("    keyword_extractor --top 10 --min-length 3 --pos NNG,NNP,VV");
}
fn extract_keywords(tokenizer: &mut Tokenizer, text: &str, config: &Config) -> Vec<Keyword> {
    let tokens = tokenizer.tokenize(text);
    let mut term_counts: HashMap<String, usize> = HashMap::new();
    // Count only tokens that survive the filters; scores are therefore
    // relative frequencies among the kept terms, not among all tokens.
    let mut total_terms = 0;
    for token in tokens {
        // Prefix match, so that e.g. "NN" also covers NNG, NNP, NNB, ...
        if config.pos_tags.iter().any(|pos| token.pos.starts_with(pos)) {
            let surface = token.surface.trim();
            // Compare character counts, not byte lengths, so each multi-byte
            // Hangul syllable counts as a single character.
            if surface.chars().count() >= config.min_length {
                *term_counts.entry(surface.to_string()).or_insert(0) += 1;
                total_terms += 1;
            }
        }
    }
    let mut keywords: Vec<Keyword> = term_counts
        .into_iter()
        .map(|(term, freq)| Keyword::new(term, freq, total_terms))
        .collect();
    // Descending frequency with an alphabetical tie-break keeps the output
    // deterministic across runs.
    keywords.sort_by(|a, b| {
        b.frequency
            .cmp(&a.frequency)
            .then_with(|| a.term.cmp(&b.term))
    });
    keywords.into_iter().take(config.top_n).collect()
}
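
/// Renders the keywords as a box-drawing table. Column widths are padded by
/// character count, so double-width Hangul glyphs may not align perfectly in
/// every terminal.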
fn format_keywords(keywords: &[Keyword]) -> String {
if keywords.is_empty() {
return "No keywords found.\n".to_string();
}
let mut output = String::new();
output.push_str("┌─────┬──────────────────┬───────────┬─────────┐\n");
output.push_str("│ Rank│ Keyword │ Frequency │ Score │\n");
output.push_str("├─────┼──────────────────┼───────────┼─────────┤\n");
for (i, keyword) in keywords.iter().enumerate() {
let _ = writeln!(
output,
"│ {:3} │ {:16} │ {:9} │ {:.5} │",
i + 1,
truncate(&keyword.term, 16),
keyword.frequency,
keyword.score
);
}
output.push_str("└─────┴──────────────────┴───────────┴─────────┘\n");
output
}
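
/// Truncates `s` to at most `max_len` characters, marking shortened terms
/// with a trailing ellipsis.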
fn truncate(s: &str, max_len: usize) -> String {
    if s.chars().count() > max_len {
        // Keep max_len - 1 characters and append an ellipsis so the result
        // still fits in max_len characters; saturate so max_len == 0 cannot
        // underflow.
        let truncated: String = s.chars().take(max_len.saturating_sub(1)).collect();
        format!("{truncated}…")
    } else {
        s.to_string()
    }
}
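
/// Reads all of standard input into a single string.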
fn read_stdin() -> io::Result<String> {
let mut buffer = String::new();
io::stdin().read_to_string(&mut buffer)?;
Ok(buffer)
}

fn main() {
    let config = parse_args();

    // Load the user-supplied dictionary when one was given; otherwise fall
    // back to the tokenizer's default dictionary location.
    let mut tokenizer = match &config.dict_path {
        Some(dict_path) => match Tokenizer::with_dict(dict_path) {
            Ok(t) => t,
            Err(e) => {
                eprintln!(
                    "Failed to load dictionary from {}: {e}",
                    dict_path.display()
                );
                std::process::exit(1);
            }
        },
        None => match Tokenizer::new() {
            Ok(t) => t,
            Err(e) => {
                eprintln!("Failed to initialize tokenizer: {e}");
                eprintln!("Make sure the dictionary is available at the default location.");
                std::process::exit(1);
            }
        },
    };

    // Prefer --text when it was supplied; otherwise read everything from stdin.
    let text = match &config.text {
        Some(text) => text.clone(),
        None => match read_stdin() {
            Ok(text) => text,
            Err(e) => {
                eprintln!("Failed to read from stdin: {e}");
                std::process::exit(1);
            }
        },
    };
if text.trim().is_empty() {
eprintln!("No input text provided.");
std::process::exit(1);
}
let keywords = extract_keywords(&mut tokenizer, &text, &config);
println!("Keyword Extraction Results");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
println!("Text length: {} characters", text.chars().count());
println!("POS tags: {}", config.pos_tags.join(", "));
println!("Min length: {} characters", config.min_length);
println!("Top-N: {}", config.top_n);
println!();
println!("{}", format_keywords(&keywords));
}
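
// A minimal test sketch for the pure helpers above. It deliberately avoids
// constructing a Tokenizer, so the tests run without a MeCab-ko dictionary
// installed; every expected value follows directly from the code in this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn score_is_relative_frequency() {
        let kw = Keyword::new("기술".to_string(), 2, 8);
        assert_eq!(kw.frequency, 2);
        assert_eq!(kw.score, 0.25);
        // An empty corpus must not divide by zero.
        assert_eq!(Keyword::new("x".to_string(), 1, 0).score, 0.0);
    }

    #[test]
    fn truncate_counts_chars_not_bytes() {
        // Hangul syllables are multi-byte in UTF-8 but count as one char each.
        assert_eq!(truncate("인공지능", 16), "인공지능");
        assert_eq!(truncate("abcdef", 4), "abc…");
    }

    #[test]
    fn empty_keyword_list_gets_a_placeholder() {
        assert_eq!(format_keywords(&[]), "No keywords found.\n");
    }
}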