use crate::error::FormatParseError;
use crate::types::definitions::{FieldSpec, FieldType};
use regex;
use std::collections::HashMap;
/// Upper bound on how deeply format patterns may be nested inside one another.
/// `parse_pattern` and `parse_field` return a `PatternError` when a nested field
/// is encountered while `nesting_depth` is already at (or above) this value.
pub const MAX_NESTED_FORMAT_DEPTH: usize = 10;
/// Upper bound on the number of unescaped `{` that may be open at the same time
/// inside a single field's format specification; enforced by
/// `collect_balanced_format_spec`.
const MAX_BRACE_DEPTH_IN_FORMAT_SPEC: i32 = 10;
/// Tuple returned by `parse_pattern`.
///
/// The slots are positional; the per-slot notes below follow the order in which
/// `parse_pattern` builds its return value.
pub type ParsedPatternParts = (
// Regex source wrapped as `^...$`.
String,
// The same regex source without the `^`/`$` anchors.
String,
// One `FieldSpec` per parsed field, in pattern order.
Vec<FieldSpec>,
// Original field name per field; `None` for unnamed or all-digit names.
Vec<Option<String>>,
// Normalized field name per field; `None` for unnamed or all-digit names.
Vec<Option<String>>,
// Map from normalized field name to the original field name.
HashMap<String, String>,
// `true` only if no literal text was emitted and every field was a default
// unconstrained string (see `allows_empty_default_string_match` in `parse_pattern`).
bool,
);
/// Returns `true` when `s` contains at least one non-whitespace character,
/// i.e. when the literal carries visible text that can act as a delimiter.
fn literal_delimits_empty_field(s: &str) -> bool {
    s.chars().any(|c| !c.is_whitespace())
}
/// Collects the raw format specification of a field up to, but not including,
/// the `}` that closes the field.
///
/// Unescaped `{`/`}` pairs inside the specification are tracked so that a `}`
/// only terminates the field when no inner brace is open. A doubled `{{` is
/// copied through verbatim and does not open a level.
///
/// # Errors
///
/// Returns `FormatParseError::PatternError` when the input ends before the
/// closing `}`, when more than `MAX_BRACE_DEPTH_IN_FORMAT_SPEC` braces are open
/// at once, or when a `}` would drive the open-brace count negative.
fn collect_balanced_format_spec(
    chars: &mut std::iter::Peekable<std::str::Chars>,
) -> Result<String, FormatParseError> {
    let mut collected = String::new();
    let mut open_braces = 0i32;
    while let Some(&upcoming) = chars.peek() {
        // The field's own closing brace is left in the iterator for the caller.
        if upcoming == '}' && open_braces == 0 {
            return Ok(collected);
        }
        chars.next();
        match upcoming {
            '{' if chars.peek() == Some(&'{') => {
                chars.next();
                collected.push_str("{{");
            }
            '{' => {
                open_braces += 1;
                if open_braces > MAX_BRACE_DEPTH_IN_FORMAT_SPEC {
                    return Err(FormatParseError::PatternError(
                        "Format specification has too many nested '{' (max 10)".to_string(),
                    ));
                }
                collected.push('{');
            }
            '}' => {
                open_braces -= 1;
                if open_braces < 0 {
                    return Err(FormatParseError::PatternError(
                        "Unexpected '}' in format specification".to_string(),
                    ));
                }
                collected.push('}');
            }
            other => collected.push(other),
        }
    }
    Err(FormatParseError::PatternError(
        "Unclosed '{' in pattern: expected '}' to close the field".to_string(),
    ))
}
/// Checks that the unescaped braces in `s` are balanced.
///
/// A doubled `{{` is skipped and does not open a level. Every other `{` opens
/// one and every `}` closes one. Returns `false` as soon as a `}` appears with
/// nothing open, and otherwise `true` only if nothing is left open at the end.
fn brace_balance_valid_for_nested_candidate(s: &str) -> bool {
    let mut open = 0i32;
    let mut cursor = s.chars().peekable();
    loop {
        match cursor.next() {
            None => return open == 0,
            Some('{') => {
                // `{{` is consumed as a pair; a lone `{` opens a level.
                if cursor.next_if_eq(&'{').is_none() {
                    open += 1;
                }
            }
            Some('}') => {
                if open == 0 {
                    return false;
                }
                open -= 1;
            }
            Some(_) => {}
        }
    }
}
/// Decides whether an already-trimmed format specification should be treated
/// as a nested pattern.
///
/// It qualifies when it is at least two bytes long, starts with a single `{`
/// (not `{{`), ends with `}`, and its unescaped braces are balanced.
fn is_nested_format_spec_candidate(trimmed: &str) -> bool {
    let opens_with_single_brace = trimmed.starts_with('{') && !trimmed.starts_with("{{");
    trimmed.len() >= 2
        && opens_with_single_brace
        && trimmed.ends_with('}')
        && brace_balance_valid_for_nested_candidate(trimmed)
}
/// Removes at most one leading `^` and at most one trailing `$` from `anchored`
/// and returns the remainder as an owned string.
fn strip_regex_anchors(anchored: &str) -> String {
    let without_start = match anchored.strip_prefix('^') {
        Some(rest) => rest,
        None => anchored,
    };
    match without_start.strip_suffix('$') {
        Some(body) => body.to_owned(),
        None => without_start.to_owned(),
    }
}
fn has_trailing_literal_before_next_field(mut chars: std::iter::Peekable<std::str::Chars>) -> bool {
while chars.peek().is_some_and(|c| c.is_whitespace()) {
chars.next();
}
if chars.next() != Some('}') {
return false;
}
while chars.peek().is_some_and(|c| c.is_whitespace()) {
chars.next();
}
let mut literal = String::new();
loop {
match chars.next() {
None => return literal_delimits_empty_field(&literal),
Some('{') => {
if chars.peek() == Some(&'{') {
chars.next();
literal.push('{');
} else {
return literal_delimits_empty_field(&literal);
}
}
Some('}') => {
if chars.peek() == Some(&'}') {
chars.next();
literal.push('}');
} else {
literal.push('}');
}
}
Some(c) => literal.push(c),
}
}
}
/// Compiles a format `pattern` into regex source plus per-field metadata.
///
/// The pattern is scanned character by character. Literal text is
/// regex-escaped, `{{` / `}}` produce literal braces, and every `{...}` field
/// is parsed with `parse_field` and turned into a capture group.
///
/// # Parameters
///
/// * `pattern` - the format pattern to compile.
/// * `custom_patterns` - forwarded to `FieldSpec::to_regex_pattern` and to
///   recursive calls for nested fields.
/// * `allow_empty_delimited_default_string` - when `true`, a default
///   unconstrained string field that has visible literal text before or after
///   it is flagged as "allow empty" when its regex is generated.
/// * `nesting_depth` - current recursion depth; nested fields recurse with
///   `nesting_depth + 1`.
///
/// # Returns
///
/// A `ParsedPatternParts` tuple: anchored regex (`^...$`), unanchored regex,
/// field specs, original field names, normalized field names, the
/// normalized-to-original name map, and a flag that is `true` only when no
/// literal text was emitted and every field was a default unconstrained string.
///
/// # Errors
///
/// * `PatternError` when nesting exceeds `MAX_NESTED_FORMAT_DEPTH`, when a
///   `:brace` field is unnamed or numbered, or when a field is not closed by `}`.
/// * `RepeatedNameError` when a field name is reused with a different field type.
/// * Any error propagated from `parse_field` or a recursive `parse_pattern` call.
pub fn parse_pattern(
    pattern: &str,
    custom_patterns: &HashMap<String, String>,
    allow_empty_delimited_default_string: bool,
    nesting_depth: usize,
) -> Result<ParsedPatternParts, FormatParseError> {
    // Each field needs one '{', so this count bounds the number of fields; it is
    // used only to preallocate the collections below.
    let estimated_fields = pattern.matches('{').count();
    let mut regex_parts = Vec::with_capacity(estimated_fields * 2);
    let mut field_specs = Vec::with_capacity(estimated_fields);
    let mut field_names = Vec::with_capacity(estimated_fields);
    let mut normalized_names = Vec::with_capacity(estimated_fields);
    let mut name_mapping = HashMap::with_capacity(estimated_fields);
    // Remembers the type first seen for each field name to detect conflicting reuse.
    let mut field_name_types = HashMap::with_capacity(estimated_fields);
    let mut chars: std::iter::Peekable<std::str::Chars> = pattern.chars().peekable();
    // Literal text accumulated since the previous field.
    let mut literal = String::new();
    let mut allows_empty_default_string_match = true;
    while let Some(ch) = chars.next() {
        match ch {
            '{' => {
                // `{{` is an escaped literal brace, not a field.
                if chars.peek() == Some(&'{') {
                    chars.next();
                    literal.push('{');
                    continue;
                }
                let had_leading_literal = !literal.trim().is_empty();
                if !literal.is_empty() {
                    allows_empty_default_string_match = false;
                    // Trailing whitespace in a literal that precedes a field is
                    // matched as one-or-more whitespace characters.
                    let escaped = if literal.trim_end() != literal {
                        let trimmed = literal.trim_end();
                        let mut escaped_str = String::with_capacity(trimmed.len() + 4);
                        escaped_str.push_str(&regex::escape(trimmed));
                        escaped_str.push_str("\\s+");
                        escaped_str
                    } else {
                        regex::escape(&literal)
                    };
                    regex_parts.push(escaped);
                    literal.clear();
                }
                let (mut spec, name) = parse_field(&mut chars, nesting_depth)?;
                if matches!(spec.field_type, FieldType::Nested) {
                    if nesting_depth >= MAX_NESTED_FORMAT_DEPTH {
                        return Err(FormatParseError::PatternError(
                            "Nested format patterns exceed max depth (10)".to_string(),
                        ));
                    }
                    let inner = spec.nested_subpattern.as_ref().ok_or_else(|| {
                        FormatParseError::PatternError(
                            "Internal error: nested field missing subpattern".to_string(),
                        )
                    })?;
                    // Compile the nested pattern recursively and keep only its
                    // regex body, without the `^`/`$` anchors.
                    let (inner_anchored, _, _, _, _, _, _) = parse_pattern(
                        inner,
                        custom_patterns,
                        allow_empty_delimited_default_string,
                        nesting_depth + 1,
                    )?;
                    spec.nested_regex_body = Some(strip_regex_anchors(&inner_anchored));
                }
                if !spec.is_default_unconstrained_string() {
                    allows_empty_default_string_match = false;
                }
                let has_trailing_literal = has_trailing_literal_before_next_field(chars.clone());
                // Look ahead on a clone, without consuming input, to classify the
                // field that directly follows this one:
                //   None        - this field's closing `}` was not found, or no
                //                 field starts right after it (whitespace aside);
                //   Some(false) - the next field is `{}` or its spec has no `.`
                //                 after the `:`;
                //   Some(true)  - the next field's spec has a `.` after the `:`.
                let mut peek_chars = chars.clone();
                let next_field_is_greedy = loop {
                    let mut found_closing = false;
                    while let Some(&ch) = peek_chars.peek() {
                        if ch.is_whitespace() {
                            peek_chars.next();
                        } else if ch == '}' {
                            peek_chars.next();
                            found_closing = true;
                            break;
                        } else {
                            break;
                        }
                    }
                    if !found_closing {
                        break None;
                    }
                    while let Some(&ch) = peek_chars.peek() {
                        if ch.is_whitespace() {
                            peek_chars.next();
                        } else {
                            break;
                        }
                    }
                    if peek_chars.peek() == Some(&'{') {
                        peek_chars.next();
                        if peek_chars.peek() == Some(&'{') {
                            // Escaped `{{`: restart the look-ahead loop after it.
                            peek_chars.next();
                            continue;
                        }
                        if peek_chars.peek() == Some(&'}') {
                            break Some(false);
                        } else {
                            let mut field_chars = peek_chars.clone();
                            let mut has_precision = false;
                            while let Some(&ch) = field_chars.peek() {
                                if ch == '}' {
                                    break;
                                }
                                if ch == ':' {
                                    field_chars.next();
                                    while let Some(&next_ch) = field_chars.peek() {
                                        if next_ch == '}' {
                                            break;
                                        }
                                        if next_ch == '.' {
                                            has_precision = true;
                                            break;
                                        }
                                        field_chars.next();
                                    }
                                    break;
                                }
                                field_chars.next();
                            }
                            break Some(has_precision);
                        }
                    } else {
                        break None;
                    }
                };
                let allow_empty_delimited = allow_empty_delimited_default_string
                    && spec.is_default_unconstrained_string()
                    && (had_leading_literal || has_trailing_literal);
                let pattern = spec.to_regex_pattern(
                    custom_patterns,
                    next_field_is_greedy,
                    allow_empty_delimited,
                );
                let la_raw = spec.regex_lookahead.as_deref().unwrap_or("");
                let (lb_prefix, body, la_emit) =
                    crate::rewrite_field_fragments_for_engine_anchor(&pattern, la_raw);
                // A name may repeat only when every occurrence has the same field type.
                if let Some(ref original_name) = name {
                    if let Some(existing_type) = field_name_types.get(original_name) {
                        if !field_types_match(existing_type, &spec.field_type) {
                            return Err(FormatParseError::RepeatedNameError(original_name.clone()));
                        }
                    } else {
                        field_name_types.insert(original_name.clone(), spec.field_type.clone());
                    }
                }
                let group_pattern = if matches!(spec.field_type, FieldType::BracedContent) {
                    let Some(ref original_name) = name else {
                        return Err(FormatParseError::PatternError(
                            "The :brace format requires a named field (e.g. {content:brace})"
                                .to_string(),
                        ));
                    };
                    if original_name.chars().all(|c| c.is_ascii_digit()) {
                        return Err(FormatParseError::PatternError(
                            "The :brace format cannot be used with numbered fields".to_string(),
                        ));
                    }
                    let normalized =
                        normalize_field_name(original_name, &mut name_mapping, &normalized_names);
                    format!("\\{{(?P<{}>.*?)\\}}", normalized)
                } else if let Some(ref original_name) = name {
                    // All-digit names are positional: they get an unnamed group.
                    let is_numeric = original_name.chars().all(|c| c.is_ascii_digit());
                    if is_numeric {
                        format!("{}{}({}){}", lb_prefix, "", body, la_emit)
                    } else {
                        let normalized = normalize_field_name(
                            original_name,
                            &mut name_mapping,
                            &normalized_names,
                        );
                        format!("{}{}(?P<{}>{}){}", lb_prefix, "", normalized, body, la_emit)
                    }
                } else {
                    format!("{}{}({}){}", lb_prefix, "", body, la_emit)
                };
                regex_parts.push(group_pattern);
                if let Some(ref original_name) = name {
                    let is_numeric = original_name.chars().all(|c| c.is_ascii_digit());
                    if is_numeric {
                        field_names.push(None);
                        normalized_names.push(None);
                    } else {
                        // `normalized_names` has not changed since the group was built
                        // above, so this yields the same normalized name.
                        let normalized = normalize_field_name(
                            original_name,
                            &mut name_mapping,
                            &normalized_names,
                        );
                        field_names.push(Some(original_name.clone()));
                        normalized_names.push(Some(normalized.clone()));
                        name_mapping.insert(normalized, original_name.clone());
                    }
                } else {
                    field_names.push(None);
                    normalized_names.push(None);
                }
                field_specs.push(spec);
                // `parse_field` leaves the field's closing brace for us to consume.
                if chars.next() != Some('}') {
                    return Err(FormatParseError::PatternError(
                        "Expected '}' after field specification".to_string(),
                    ));
                }
            }
            '}' => {
                // `}}` and a stray `}` both add one literal brace.
                if chars.peek() == Some(&'}') {
                    chars.next();
                    literal.push('}');
                    continue;
                }
                literal.push('}');
            }
            _ => {
                literal.push(ch);
            }
        }
    }
    if !literal.is_empty() {
        allows_empty_default_string_match = false;
        // Trailing whitespace at the very end of the pattern is matched as
        // zero-or-more whitespace characters.
        let escaped = if literal.trim_end() != literal {
            let trimmed = literal.trim_end();
            format!("{}\\s*", regex::escape(trimmed))
        } else {
            regex::escape(&literal)
        };
        regex_parts.push(escaped);
    }
    let regex_str = regex_parts.join("");
    let regex_str_with_anchors = format!("^{}$", regex_str);
    Ok((
        regex_str_with_anchors,
        regex_str,
        field_specs,
        field_names,
        normalized_names,
        name_mapping,
        allows_empty_default_string_match,
    ))
}
/// Turns a field name into an identifier-style name and makes it unique
/// among `existing_normalized`.
///
/// `-`, `.` and `[` each become `_`, `]` is dropped, and every other character
/// is kept. If the result is already present in `existing_normalized`, extra
/// underscores are added until it is not: at the first underscore when the
/// base name has one (`a_b` -> `a__b` -> `a___b`), otherwise at the end
/// (`x` -> `x_` -> `x__`).
///
/// `_name_mapping` is accepted for interface compatibility and is not used.
pub fn normalize_field_name(
    name: &str,
    _name_mapping: &mut HashMap<String, String>,
    existing_normalized: &[Option<String>],
) -> String {
    let base: String = name
        .chars()
        .filter_map(|c| match c {
            '-' | '.' | '[' => Some('_'),
            ']' => None,
            other => Some(other),
        })
        .collect();
    let first_underscore = base.find('_');
    let is_taken = |candidate: &str| {
        existing_normalized
            .iter()
            .any(|slot| slot.as_deref() == Some(candidate))
    };

    let mut candidate = base.clone();
    let mut attempts = 0usize;
    while is_taken(&candidate) {
        attempts += 1;
        candidate = match first_underscore {
            // Widen the first underscore run: one original plus one per attempt.
            Some(pos) => format!(
                "{}{}{}",
                &base[..pos],
                "_".repeat(attempts + 1),
                &base[pos + 1..]
            ),
            None => format!("{}{}", base, "_".repeat(attempts)),
        };
    }
    candidate
}
/// Rejects format options that the multiline field types do not support.
///
/// Only `FieldType::Multiline` and `FieldType::IndentBlock` specs are checked;
/// every other field type passes unconditionally.
///
/// # Errors
///
/// Returns `FormatParseError::PatternError` when a multiline spec carries a
/// sign or zero-padding, or (checked second) uses `=` alignment.
pub fn validate_multiline_mvp(spec: &FieldSpec) -> Result<(), FormatParseError> {
    let is_multiline_kind = matches!(
        spec.field_type,
        FieldType::Multiline | FieldType::IndentBlock
    );
    let complaint = if !is_multiline_kind {
        None
    } else if spec.sign.is_some() || spec.zero_pad {
        Some("Multiline types :ml and :blk do not support sign or zero-padding")
    } else if spec.alignment == Some('=') {
        Some("Multiline types :ml and :blk do not support '=' alignment")
    } else {
        None
    };
    match complaint {
        Some(message) => Err(FormatParseError::PatternError(message.to_string())),
        None => Ok(()),
    }
}
/// Returns `true` when both field types are the same enum variant.
///
/// Only the variant is compared (via `std::mem::discriminant`); any data
/// carried inside the variants is ignored.
pub fn field_types_match(t1: &FieldType, t2: &FieldType) -> bool {
    let lhs_variant = std::mem::discriminant(t1);
    let rhs_variant = std::mem::discriminant(t2);
    lhs_variant == rhs_variant
}
/// Splits a bracketed field name such as `a[b][c]` into its path segments.
///
/// A `[` always ends the current segment and opens a bracket. A `]` ends the
/// current segment only while a bracket is open; otherwise it is kept as an
/// ordinary character. Empty segments are never emitted. No other character
/// (including `.`) acts as a separator.
pub fn parse_field_path(field_name: &str) -> Vec<String> {
    let mut segments: Vec<String> = Vec::new();
    let mut pending = String::new();
    let mut bracket_open = false;

    for symbol in field_name.chars() {
        let opens = symbol == '[';
        let closes = symbol == ']' && bracket_open;
        if !(opens || closes) {
            pending.push(symbol);
            continue;
        }
        // A bracket boundary: flush whatever has been gathered so far.
        if !pending.is_empty() {
            segments.push(std::mem::take(&mut pending));
        }
        bracket_open = opens;
    }

    if !pending.is_empty() {
        segments.push(pending);
    }
    segments
}
/// Parses the inside of one `{...}` field, starting just after the opening `{`.
///
/// First the field name is scanned, then (if the name scan ended on something
/// other than `}` or end of input) the format specification is collected and
/// parsed. The field's closing `}` is NOT consumed: the name scan breaks on `}`
/// without advancing and `collect_balanced_format_spec` stops in front of it.
///
/// # Returns
///
/// The populated `FieldSpec` and the field name, or `None` for the name when no
/// name characters were read.
///
/// # Errors
///
/// * `NotImplementedError` when a quote character appears inside `[...]` in the name.
/// * `PatternError` when the spec is a nested pattern and `nesting_depth` has
///   reached `MAX_NESTED_FORMAT_DEPTH`.
/// * Any error from `collect_balanced_format_spec`, `parse_format_spec` or
///   `validate_multiline_mvp`.
pub fn parse_field(
chars: &mut std::iter::Peekable<std::str::Chars>,
nesting_depth: usize,
) -> Result<(FieldSpec, Option<String>), FormatParseError> {
let mut spec = FieldSpec::new();
let mut field_name = String::new();
// Stays `true` only if the scan below ends on `}` or end of input without
// seeing `:`, `!`, a quote, or another non-name character.
let mut in_name = true;
// Tracks whether the most recent bracket seen in the name was `[`.
let mut in_brackets = false;
while let Some(&ch) = chars.peek() {
match ch {
// `:` separates the name from the format specification.
':' => {
chars.next();
in_name = false;
break;
}
// `!` and the single character after it are consumed and discarded; the
// scan then continues (no break), so later name characters are still
// appended to `field_name`.
// NOTE(review): presumably a conversion flag such as `!r` — confirm.
'!' => {
chars.next();
if chars.peek().is_some() {
chars.next();
}
in_name = false;
}
'}' => {
break;
}
// Brackets are kept as part of the name.
'[' => {
in_brackets = true;
field_name.push(ch);
chars.next();
}
']' => {
in_brackets = false;
field_name.push(ch);
chars.next();
}
// A quote inside brackets is rejected; elsewhere it ends the name without
// being consumed, so it becomes the first character of the format spec.
'\'' | '"' => {
if in_brackets {
return Err(FormatParseError::NotImplementedError(
"Quoted keys in field names".to_string(),
));
}
in_name = false;
break;
}
_ => {
// Name characters: alphanumerics plus `_`, `-` and `.`. Anything else ends
// the name and is left in the iterator for the format spec.
if ch.is_alphanumeric() || ch == '_' || ch == '-' || ch == '.' {
field_name.push(ch);
chars.next();
} else {
in_name = false;
break;
}
}
}
}
if !in_name {
let format_spec = collect_balanced_format_spec(chars)?;
let trimmed = format_spec.trim();
// A spec that is itself a balanced `{...}` is a nested pattern; it is stored
// (trimmed) for the caller instead of being parsed as a format spec.
if is_nested_format_spec_candidate(trimmed) {
if nesting_depth >= MAX_NESTED_FORMAT_DEPTH {
return Err(FormatParseError::PatternError(
"Nested format patterns exceed max depth (10)".to_string(),
));
}
spec.field_type = FieldType::Nested;
spec.nested_subpattern = Some(trimmed.to_string());
} else {
// Note: the untrimmed spec is parsed here.
parse_format_spec(&format_spec, &mut spec)?;
}
validate_multiline_mvp(&spec)?;
}
let name = if field_name.is_empty() {
None
} else {
Some(field_name)
};
Ok((spec, name))
}
/// Parses a field's format specification string into `spec`.
///
/// The string is read positionally, in this order:
/// `[[fill]align][sign][#][0][width][,][.precision][type...]`.
/// `#` and `,` are consumed but not recorded. Whatever remains after the
/// optional precision is the type string, which selects `spec.field_type`.
///
/// # Errors
///
/// Returns `FormatParseError::PatternError` when a strftime-style type is
/// rejected by `crate::reject_lookaround_in_strftime`, when lookaround
/// assertions appear without a preceding type, when
/// `crate::parse_lookaround_tail` fails, or when lookaround assertions are
/// attached to a type other than integer or float.
pub fn parse_format_spec(format_spec: &str, spec: &mut FieldSpec) -> Result<(), FormatParseError> {
let mut chars = format_spec.chars().peekable();
// Alignment: either the first character is an alignment character, or the
// second one is, in which case the first character is the fill.
if let Some(&ch) = chars.peek() {
if ch == '<' || ch == '>' || ch == '^' || ch == '=' {
spec.alignment = Some(ch);
chars.next();
} else {
// Inspect the second character on a clone so nothing is consumed unless
// it really is a fill/alignment pair.
let mut peek_iter = chars.clone();
peek_iter.next(); if let Some(next_ch) = peek_iter.next() {
if next_ch == '<' || next_ch == '>' || next_ch == '^' || next_ch == '=' {
spec.fill = Some(ch);
chars.next(); spec.alignment = Some(next_ch);
chars.next(); }
}
}
}
// Optional sign: `+`, `-` or a space.
if let Some(&ch) = chars.peek() {
if ch == '+' || ch == '-' || ch == ' ' {
spec.sign = Some(ch);
chars.next();
}
}
// `#` is accepted and skipped; nothing is stored for it.
if chars.peek() == Some(&'#') {
chars.next();
}
// A `0` directly before the width requests zero-padding.
if chars.peek() == Some(&'0') {
spec.zero_pad = true;
chars.next();
}
// Width: a run of ASCII digits.
let mut width_str = String::new();
while let Some(&ch) = chars.peek() {
if ch.is_ascii_digit() {
width_str.push(ch);
chars.next();
} else {
break;
}
}
if !width_str.is_empty() {
// A value that does not parse as `usize` silently becomes `None`.
spec.width = width_str.parse::<usize>().ok();
}
// `,` is accepted and skipped; nothing is stored for it.
if chars.peek() == Some(&',') {
chars.next();
}
// Precision: `.` followed by a run of ASCII digits (the digits may be absent).
if chars.peek() == Some(&'.') {
chars.next();
let mut precision_str = String::new();
while let Some(&ch) = chars.peek() {
if ch.is_ascii_digit() {
precision_str.push(ch);
chars.next();
} else {
break;
}
}
if !precision_str.is_empty() {
// A value that does not parse as `usize` silently becomes `None`.
spec.precision = precision_str.parse::<usize>().ok();
}
}
// Everything left over is the type string.
let mut type_str = String::new();
for ch in chars {
type_str.push(ch);
}
// A lone `%` is the percentage type.
if type_str == "%" {
spec.field_type = FieldType::Percentage;
return Ok(());
}
// Any other string starting with `%` is stored verbatim as a strftime format.
if type_str.starts_with('%') {
crate::reject_lookaround_in_strftime(&type_str).map_err(FormatParseError::PatternError)?;
spec.field_type = FieldType::DateTimeStrftime;
spec.strftime_format = Some(type_str.clone());
return Ok(());
}
let (type_base, lookaround_tail) = crate::split_type_base_and_lookaround_tail(&type_str);
if type_base.is_empty() && !lookaround_tail.is_empty() {
return Err(FormatParseError::PatternError(
"Type specification must precede lookaround assertions".to_string(),
));
}
// Only the alphabetic characters of the type base take part in type selection.
let type_name: String = type_base.chars().filter(|c| c.is_alphabetic()).collect();
spec.field_type = if type_name.is_empty() {
FieldType::String
} else if type_name == "ti" {
FieldType::DateTimeISO
} else if type_name == "te" {
FieldType::DateTimeRFC2822
} else if type_name == "tg" {
FieldType::DateTimeGlobal
} else if type_name == "ta" {
FieldType::DateTimeUS
} else if type_name == "tc" {
FieldType::DateTimeCtime
} else if type_name == "th" {
FieldType::DateTimeHTTP
} else if type_name == "tt" {
FieldType::DateTimeTime
} else if type_name == "ts" {
FieldType::DateTimeSystem
} else if type_name == "brace" {
FieldType::BracedContent
} else if type_name == "ml" {
FieldType::Multiline
} else if type_name == "blk" {
FieldType::IndentBlock
} else if type_name.len() > 1 {
// Any other name longer than one byte is treated as a custom type name.
FieldType::Custom(type_name)
} else {
// `type_name` is non-empty here (the empty case is the first branch), so
// `next()` yields a character and the `unwrap` cannot fail.
let type_char = type_name.chars().next().unwrap();
spec.original_type_char = Some(type_char);
match type_char {
's' => FieldType::String,
'd' | 'i' => FieldType::Integer,
'b' | 'o' | 'x' | 'X' => FieldType::Integer,
'n' => FieldType::NumberWithThousands,
'f' | 'F' => FieldType::Float,
'e' | 'E' => FieldType::Scientific,
'g' | 'G' => FieldType::GeneralNumber,
'l' => FieldType::Letters,
'w' => FieldType::Word,
'W' => FieldType::NonLetters,
'S' => FieldType::NonWhitespace,
'D' => FieldType::NonDigits,
c => FieldType::Custom(c.to_string()),
}
};
// Lookaround assertions are stored only for integer and float fields; an
// empty lookbehind/lookahead is stored as `None`.
if !lookaround_tail.is_empty() {
let (lb, la) = crate::parse_lookaround_tail(lookaround_tail)
.map_err(FormatParseError::PatternError)?;
match &spec.field_type {
FieldType::Integer | FieldType::Float => {
spec.regex_lookbehind = if lb.is_empty() { None } else { Some(lb) };
spec.regex_lookahead = if la.is_empty() { None } else { Some(la) };
}
_ => {
return Err(FormatParseError::PatternError("Lookaround assertions are only supported for integer and float format types (d, i, b, o, x, X, f, F)".to_string()));
}
}
}
Ok(())
}
#[cfg(test)]
mod normalize_field_name_tests {
    use super::normalize_field_name;
    use std::collections::HashMap;

    /// Dict-style `name[key]` segments are joined with underscores.
    #[test]
    fn dict_style_brackets_map_to_underscores() {
        let mut mapping = HashMap::new();
        let taken: Vec<Option<String>> = Vec::new();
        let cases = [
            ("hello[world]", "hello_world"),
            ("hello[foo][baz]", "hello_foo_baz"),
        ];
        for (raw, expected) in cases {
            assert_eq!(normalize_field_name(raw, &mut mapping, &taken), expected);
        }
    }

    /// Brackets nested inside brackets flatten to a single underscore per level.
    #[test]
    fn deep_nested_brackets_normalize() {
        let mut mapping = HashMap::new();
        let normalized = normalize_field_name("a[b[c[d]]]", &mut mapping, &[]);
        assert_eq!(normalized, "a_b_c_d");
    }
}