use crate::types::{Opt, OptName};
use bstr::ByteSlice;
use ecow::{EcoString, EcoVec};
use memchr::memchr;
use regex::Regex;
use std::collections::HashSet;
pub struct Parser;
impl Parser {
pub fn parse_line(s: &str) -> EcoVec<Opt> {
let pairs = Self::preprocess(s);
let mut opts = EcoVec::new();
let mut seen: HashSet<Opt, foldhash::fast::RandomState> =
HashSet::with_capacity_and_hasher(pairs.len(), foldhash::fast::RandomState::default());
for (opt_str, desc_str) in pairs.iter() {
for opt in Self::parse_with_opt_part(opt_str, desc_str).iter() {
if seen.insert(opt.clone()) {
opts.push(opt.clone());
}
}
}
opts
}
pub fn preprocess(s: &str) -> EcoVec<(EcoString, EcoString)> {
let bytes = s.as_bytes();
let lines: Vec<&str> = bytes
.lines()
.filter_map(|line| std::str::from_utf8(line).ok())
.collect();
let mut result = EcoVec::new();
let mut i = 0;
while i < lines.len() {
let line = lines[i];
let trimmed = line.trim_start();
let trimmed_bytes = trimmed.as_bytes();
if trimmed_bytes.is_empty() || trimmed_bytes[0] != b'-' {
i += 1;
continue;
}
let mut opt_end = 0;
let mut part_count = 0;
for (idx, part) in trimmed.split_whitespace().enumerate() {
part_count += 1;
let part_bytes = part.as_bytes();
if part_bytes.first() == Some(&b'-') || idx == 0 {
opt_end = idx + 1;
} else if memchr(b'=', part_bytes).is_some() || part_bytes.first() != Some(&b'-') {
opt_end = idx + 1;
} else {
break;
}
}
if opt_end > 0 && opt_end < part_count {
let mut opt_str = EcoString::new();
let mut desc_str = EcoString::new();
for (idx, part) in trimmed.split_whitespace().enumerate() {
if idx < opt_end {
if !opt_str.is_empty() {
opt_str.push(' ');
}
opt_str.push_str(part);
} else {
if !desc_str.is_empty() {
desc_str.push(' ');
}
desc_str.push_str(part);
}
}
result.push((opt_str, desc_str));
i += 1;
} else if opt_end > 0 {
let opt_str = EcoString::from(trimmed);
let desc_str = if i + 1 < lines.len() {
let next_trimmed = lines[i + 1].trim_start();
let next_bytes = next_trimmed.as_bytes();
if !next_bytes.is_empty() && next_bytes[0] != b'-' {
EcoString::from(lines[i + 1].trim())
} else {
EcoString::new()
}
} else {
EcoString::new()
};
if !desc_str.is_empty() {
result.push((opt_str, desc_str));
i += 2;
} else {
result.push((opt_str, EcoString::new()));
i += 1;
}
} else {
i += 1;
}
}
result
}
pub fn parse_with_opt_part(opt_str: &str, desc_str: &str) -> EcoVec<Opt> {
let names = Self::parse_opt_names(opt_str);
let arg = Self::parse_opt_arg(opt_str);
if names.is_empty() {
return EcoVec::new();
}
let mut result = EcoVec::new();
result.push(Opt {
names,
argument: arg,
description: EcoString::from(desc_str),
});
result
}
fn parse_opt_names(s: &str) -> EcoVec<OptName> {
let mut names = EcoVec::new();
let mut seen: HashSet<EcoString, foldhash::fast::RandomState> =
HashSet::with_hasher(foldhash::fast::RandomState::default());
for part in s.split([',', '/', '|']) {
let trimmed = part.trim();
if trimmed.is_empty() {
continue;
}
for word in trimmed.split_whitespace() {
if word.starts_with('-')
&& let Some(name) = OptName::from_text(word)
{
if seen.insert(name.raw.clone()) {
let pos = names.iter().position(|n| n > &name).unwrap_or(names.len());
names.insert(pos, name);
}
}
}
}
names
}
fn parse_opt_arg(s: &str) -> EcoString {
for part in s.split([',', '/', '|']) {
let trimmed = part.trim();
if let Some(arg) = Self::extract_arg_from_part(trimmed)
&& !arg.is_empty()
{
return arg;
}
}
EcoString::new()
}
fn extract_arg_from_part(s: &str) -> Option<EcoString> {
let mut words = s.split_whitespace();
words.next()?;
let mut arg = EcoString::new();
for word in words {
if !arg.is_empty() {
arg.push(' ');
}
arg.push_str(word);
}
if arg.is_empty() || arg == "." {
return None;
}
Some(arg)
}
pub fn parse_usage_header(keywords: &[&str], block: &str) -> Option<EcoString> {
if keywords.is_empty() || block.is_empty() {
return None;
}
let header_line = block.lines().next()?.to_lowercase();
for keyword in keywords {
let pattern = format!(r"^\s*{}\s*:?\s*$", regex::escape(keyword));
if let Ok(re) = Regex::new(&pattern)
&& re.is_match(&header_line)
{
return Some(EcoString::from(header_line));
}
}
None
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_preprocess_same_and_next_line_descriptions() {
let input = " -a, --all show all\n -b\n show b";
let pairs = Parser::preprocess(input);
assert_eq!(pairs.len(), 2);
assert_eq!(pairs[0].0.as_str(), "-a, --all show all");
assert_eq!(pairs[0].1.as_str(), "");
assert_eq!(pairs[1].0.as_str(), "-b");
assert_eq!(pairs[1].1.as_str(), "show b");
}
#[test]
fn test_parse_usage_header_matches_keywords() {
let block = "Usage:\n cmd [OPTIONS]\n";
let header = Parser::parse_usage_header(&["usage"], block).unwrap();
assert!(header.contains("usage"));
}
#[test]
fn test_parse_opt_names() {
let names = Parser::parse_opt_names("-v, --verbose");
assert_eq!(names.len(), 2);
assert!(names.iter().any(|n| n.raw.as_str() == "-v"));
assert!(names.iter().any(|n| n.raw.as_str() == "--verbose"));
}
#[test]
fn test_parse_with_opt_part() {
let opts = Parser::parse_with_opt_part("-v, --verbose", "Enable verbose mode");
assert_eq!(opts.len(), 1);
assert_eq!(opts[0].names.len(), 2);
assert_eq!(opts[0].description.as_str(), "Enable verbose mode");
}
#[test]
fn test_parse_line_deduplicates_options() {
let input = " -v, --verbose verbose\n -v, --verbose verbose";
let opts = Parser::parse_line(input);
assert_eq!(opts.len(), 1);
assert_eq!(opts[0].names.len(), 2);
}
#[test]
fn test_parse_line_bioinformatics_style_help() {
let input = " -i, --input FILE Input FASTA/FASTQ file\n -o, --output FILE Output BAM file\n --min-mapq INT Minimum mapping quality (default: 30)";
let opts = Parser::parse_line(input);
assert_eq!(opts.len(), 3);
let all_names: Vec<String> = opts
.iter()
.flat_map(|o| o.names.iter().map(|n| n.raw.to_string()))
.collect();
assert!(all_names.contains(&"-i".to_string()));
assert!(all_names.contains(&"--input".to_string()));
assert!(all_names.contains(&"-o".to_string()));
assert!(all_names.contains(&"--output".to_string()));
assert!(all_names.contains(&"--min-mapq".to_string()));
}
}