1use regex::Regex;
7use serde::{Deserialize, Serialize};
8
9use crate::acg::prompt_ir::{PromptBlock, SpanId};
10
/// Classification attached to a [`VariablePattern`] and copied onto every
/// [`ExtractedVariable`] that pattern produces.
///
/// Serialized through serde with `snake_case` variant names.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VariableCategory {
    /// Date/time values. Assigned by the built-in `iso8601_timestamp`,
    /// `date_string`, and `unix_timestamp` patterns.
    Timestamp,
    /// Request-style identifiers. Assigned by the built-in `uuid` and
    /// `request_id` patterns.
    RequestId,
    /// Session identifiers. No built-in pattern in this file assigns this
    /// category; presumably intended for caller-supplied patterns.
    SessionId,
    /// Locale values. No built-in pattern in this file assigns this category;
    /// presumably intended for caller-supplied patterns.
    Locale,
    /// Caller-defined category carrying a free-form string
    /// (presumably the category's label — confirm with callers).
    Custom(String),
}
26
/// A named regular expression that recognises one kind of variable value.
///
/// `Debug` is implemented by hand below rather than derived.
pub struct VariablePattern {
    /// Identifier of the pattern; `extract_variables` uses it as the
    /// placeholder text (`{{name}}`) and as `ExtractedVariable::pattern_name`.
    pub name: String,
    /// Compiled expression searched for in the content.
    pub regex: Regex,
    /// Category copied onto every variable extracted by this pattern.
    pub category: VariableCategory,
}
36
37impl std::fmt::Debug for VariablePattern {
38 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39 f.debug_struct("VariablePattern")
40 .field("name", &self.name)
41 .field("regex", &self.regex.as_str())
42 .field("category", &self.category)
43 .finish()
44 }
45}
46
/// One value that `extract_variables` replaced with a placeholder.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractedVariable {
    /// `name` of the [`VariablePattern`] that matched.
    pub pattern_name: String,
    /// The exact matched text, copied from the original content.
    pub original_value: String,
    /// Byte index in the original content where the match starts.
    pub byte_offset: usize,
    /// Length of the match in bytes (match end minus match start).
    pub byte_length: usize,
    /// Category of the pattern that matched.
    pub category: VariableCategory,
}
61
/// Output of `extract_variables` for a single piece of content.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ExtractionResult {
    /// Clone of the span id that was passed in with the content.
    pub span_id: SpanId,
    /// The content with every selected match replaced by `{{pattern_name}}`.
    pub template_content: String,
    /// The replaced values, ordered by their position in the original content.
    pub variables: Vec<ExtractedVariable>,
}
72
73pub fn default_variable_patterns() -> Vec<VariablePattern> {
79 vec![
80 VariablePattern {
81 name: "iso8601_timestamp".to_string(),
82 regex: Regex::new(
83 r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})",
84 )
85 .expect("iso8601_timestamp regex is valid"),
86 category: VariableCategory::Timestamp,
87 },
88 VariablePattern {
89 name: "uuid".to_string(),
90 regex: Regex::new(
91 r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
92 )
93 .expect("uuid regex is valid"),
94 category: VariableCategory::RequestId,
95 },
96 VariablePattern {
97 name: "request_id".to_string(),
98 regex: Regex::new(r"(?:req|trace|span|run|call|session)[-_][a-zA-Z0-9]{8,}")
99 .expect("request_id regex is valid"),
100 category: VariableCategory::RequestId,
101 },
102 VariablePattern {
103 name: "date_string".to_string(),
104 regex: Regex::new(r"\d{4}-\d{2}-\d{2}").expect("date_string regex is valid"),
105 category: VariableCategory::Timestamp,
106 },
107 VariablePattern {
108 name: "unix_timestamp".to_string(),
109 regex: Regex::new(r"\b1[0-9]{9,12}\b").expect("unix_timestamp regex is valid"),
110 category: VariableCategory::Timestamp,
111 },
112 ]
113}
114
115pub fn extract_variables(
129 content: &str,
130 span_id: &SpanId,
131 patterns: &[VariablePattern],
132) -> Option<ExtractionResult> {
133 let mut all_matches: Vec<(usize, usize, &VariablePattern)> = Vec::new();
134 for pattern in patterns {
135 for matched in pattern.regex.find_iter(content) {
136 all_matches.push((matched.start(), matched.end(), pattern));
137 }
138 }
139
140 if all_matches.is_empty() {
141 return None;
142 }
143
144 all_matches.sort_by(|left, right| {
145 left.0
146 .cmp(&right.0)
147 .then_with(|| (right.1 - right.0).cmp(&(left.1 - left.0)))
148 });
149
150 let mut selected: Vec<(usize, usize, &VariablePattern)> = Vec::new();
151 for candidate in all_matches {
152 let overlaps = selected
153 .iter()
154 .any(|selected_match| candidate.0 < selected_match.1 && candidate.1 > selected_match.0);
155 if !overlaps {
156 selected.push(candidate);
157 }
158 }
159
160 selected.sort_by_key(|selected_match| selected_match.0);
161
162 let mut template = String::with_capacity(content.len());
163 let mut variables: Vec<ExtractedVariable> = Vec::new();
164 let mut last_end = 0;
165
166 for (start, end, pattern) in &selected {
167 template.push_str(&content[last_end..*start]);
168 template.push_str(&format!("{{{{{}}}}}", pattern.name));
169 variables.push(ExtractedVariable {
170 pattern_name: pattern.name.clone(),
171 original_value: content[*start..*end].to_string(),
172 byte_offset: *start,
173 byte_length: end - start,
174 category: pattern.category.clone(),
175 });
176 last_end = *end;
177 }
178
179 template.push_str(&content[last_end..]);
180
181 Some(ExtractionResult {
182 span_id: span_id.clone(),
183 template_content: template,
184 variables,
185 })
186}
187
188pub fn extract_variables_from_blocks(
197 blocks: &[PromptBlock],
198 patterns: &[VariablePattern],
199) -> Vec<ExtractionResult> {
200 blocks
201 .iter()
202 .filter_map(|block| extract_variables(&block.content, &block.span_id, patterns))
203 .collect()
204}
205
// Unit tests live in a separate file that is attached here via `#[path]`;
// the module is only compiled for test builds.
#[cfg(test)]
#[path = "../../tests/unit/acg/variable_extractor_tests.rs"]
mod tests;