use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::acg::prompt_ir::{PromptBlock, SpanId};
/// Category attached to a [`VariablePattern`] and copied onto every
/// [`ExtractedVariable`] that pattern produces.
///
/// Serialized through serde with snake_case variant names.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VariableCategory {
/// Category of the built-in ISO 8601, plain-date and Unix-timestamp patterns.
Timestamp,
/// Category of the built-in UUID and prefixed request-id patterns.
RequestId,
/// Not used by any built-in pattern in this file.
// NOTE(review): presumably intended for caller-supplied session-id patterns — confirm.
SessionId,
/// Not used by any built-in pattern in this file.
// NOTE(review): presumably intended for locale tags — confirm.
Locale,
/// Category carrying a free-form string payload.
// NOTE(review): the payload's meaning is not established in this file — confirm with callers.
Custom(String),
}
/// A named regular expression used to locate variable values inside span content.
///
/// `Debug` is implemented by hand for this type and prints the regex as its
/// source string.
pub struct VariablePattern {
/// Pattern name; written into templates as `{{name}}` and copied into
/// `ExtractedVariable::pattern_name`.
pub name: String,
/// Compiled expression that is searched over the content.
pub regex: Regex,
/// Category copied onto each variable this pattern extracts.
pub category: VariableCategory,
}
/// Debug output for [`VariablePattern`]: a struct rendering with the fields
/// `name`, `regex` and `category`, where `regex` is shown as the pattern's
/// source text.
impl std::fmt::Debug for VariablePattern {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Show the expression as the string it was compiled from.
        let regex_source: &str = self.regex.as_str();

        let mut rendering = f.debug_struct("VariablePattern");
        rendering.field("name", &self.name);
        rendering.field("regex", &regex_source);
        rendering.field("category", &self.category);
        rendering.finish()
    }
}
/// One value that was lifted out of span content and replaced by a placeholder.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractedVariable {
/// Name of the pattern that matched (copy of `VariablePattern::name`).
pub pattern_name: String,
/// Exact text that was matched in the original content.
pub original_value: String,
/// Start of the match, as a byte offset into the original (untemplated) content.
pub byte_offset: usize,
/// Length of the match in bytes (match end minus match start).
pub byte_length: usize,
/// Category of the pattern that matched.
pub category: VariableCategory,
}
/// Outcome of running variable extraction over the content of a single span.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractionResult {
/// Clone of the span id the extraction was run for.
pub span_id: SpanId,
/// The content with every selected match replaced by `{{pattern_name}}`.
pub template_content: String,
/// The replaced values, ordered by their position in the original content.
pub variables: Vec<ExtractedVariable>,
}
pub fn default_variable_patterns() -> Vec<VariablePattern> {
vec![
VariablePattern {
name: "iso8601_timestamp".to_string(),
regex: Regex::new(
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})",
)
.expect("iso8601_timestamp regex is valid"),
category: VariableCategory::Timestamp,
},
VariablePattern {
name: "uuid".to_string(),
regex: Regex::new(
r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
)
.expect("uuid regex is valid"),
category: VariableCategory::RequestId,
},
VariablePattern {
name: "request_id".to_string(),
regex: Regex::new(r"(?:req|trace|span|run|call|session)[-_][a-zA-Z0-9]{8,}")
.expect("request_id regex is valid"),
category: VariableCategory::RequestId,
},
VariablePattern {
name: "date_string".to_string(),
regex: Regex::new(r"\d{4}-\d{2}-\d{2}").expect("date_string regex is valid"),
category: VariableCategory::Timestamp,
},
VariablePattern {
name: "unix_timestamp".to_string(),
regex: Regex::new(r"\b1[0-9]{9,12}\b").expect("unix_timestamp regex is valid"),
category: VariableCategory::Timestamp,
},
]
}
/// Replaces every selected pattern match in `content` with a
/// `{{pattern_name}}` placeholder and records what was replaced.
///
/// Every pattern is searched over the whole of `content`. Where matches
/// overlap, the one that starts first wins; among matches starting at the same
/// byte the longest wins, and matches covering the identical range go to the
/// pattern listed first in `patterns`. A zero-length match is kept only when it
/// does not start before the end of the previously selected match.
///
/// # Returns
///
/// `None` when no pattern matches anywhere in `content`. Otherwise an
/// [`ExtractionResult`] holding a clone of `span_id`, the templated text, and
/// one [`ExtractedVariable`] per replaced match in order of appearance, with
/// byte offsets relative to the original `content`.
pub fn extract_variables(
    content: &str,
    span_id: &SpanId,
    patterns: &[VariablePattern],
) -> Option<ExtractionResult> {
    // Gather (start, end, pattern) for every match of every pattern.
    let mut candidates: Vec<(usize, usize, &VariablePattern)> = Vec::new();
    for pattern in patterns {
        candidates.extend(
            pattern
                .regex
                .find_iter(content)
                .map(|found| (found.start(), found.end(), pattern)),
        );
    }
    if candidates.is_empty() {
        return None;
    }

    // Earliest start first; for equal starts the larger end (longer match)
    // first. The sort is stable, so identical ranges keep pattern order.
    candidates.sort_by(|left, right| left.0.cmp(&right.0).then_with(|| right.1.cmp(&left.1)));

    let mut template = String::with_capacity(content.len());
    let mut variables: Vec<ExtractedVariable> = Vec::new();
    let mut last_end = 0;
    for (start, end, pattern) in candidates {
        // Candidates arrive ordered by start and accepted matches never
        // overlap, so checking against the end of the last accepted match is
        // enough. This also rejects a zero-length match sitting at the start of
        // an accepted match, which would otherwise make the slice below begin
        // past its end and panic.
        if start < last_end {
            continue;
        }
        template.push_str(&content[last_end..start]);
        template.push_str("{{");
        template.push_str(&pattern.name);
        template.push_str("}}");
        variables.push(ExtractedVariable {
            pattern_name: pattern.name.clone(),
            original_value: content[start..end].to_string(),
            byte_offset: start,
            byte_length: end - start,
            category: pattern.category.clone(),
        });
        last_end = end;
    }
    // Copy whatever follows the final replaced match.
    template.push_str(&content[last_end..]);

    Some(ExtractionResult {
        span_id: span_id.clone(),
        template_content: template,
        variables,
    })
}
/// Runs [`extract_variables`] over each block's `content` and `span_id`.
///
/// Blocks in which no pattern matches contribute nothing, so the returned
/// vector may be shorter than `blocks`. Results keep the order of `blocks`.
pub fn extract_variables_from_blocks(
    blocks: &[PromptBlock],
    patterns: &[VariablePattern],
) -> Vec<ExtractionResult> {
    let mut results = Vec::new();
    for block in blocks {
        if let Some(extraction) = extract_variables(&block.content, &block.span_id, patterns) {
            results.push(extraction);
        }
    }
    results
}
// Unit tests for this module are kept in a separate file (see the `#[path]`
// attribute) and are compiled only in test builds.
#[cfg(test)]
#[path = "../../tests/unit/acg/variable_extractor_tests.rs"]
mod tests;