Skip to main content

nemo_flow_adaptive/acg/
variable_extractor.rs

1// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Variable content detection and extraction from prompt blocks.
5
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8
9use crate::acg::prompt_ir::{PromptBlock, SpanId};
10
11/// Category assigned to an extracted variable.
12#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum VariableCategory {
15    /// Timestamp-like content such as ISO 8601 strings.
16    Timestamp,
17    /// Request or trace identifier content.
18    RequestId,
19    /// Session identifier content.
20    SessionId,
21    /// Locale identifier content.
22    Locale,
23    /// Caller-defined variable category.
24    Custom(String),
25}
26
27/// Regex-based pattern used to detect variable content.
28pub struct VariablePattern {
29    /// Stable placeholder name inserted into the template.
30    pub name: String,
31    /// Regex used to detect matching content.
32    pub regex: Regex,
33    /// Semantic category assigned to matches from this pattern.
34    pub category: VariableCategory,
35}
36
37impl std::fmt::Debug for VariablePattern {
38    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39        f.debug_struct("VariablePattern")
40            .field("name", &self.name)
41            .field("regex", &self.regex.as_str())
42            .field("category", &self.category)
43            .finish()
44    }
45}
46
47/// One extracted variable occurrence within a prompt block.
48#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
49pub struct ExtractedVariable {
50    /// Name of the pattern that matched this value.
51    pub pattern_name: String,
52    /// Original matched value before replacement.
53    pub original_value: String,
54    /// Byte offset of the match in the original content.
55    pub byte_offset: usize,
56    /// Byte length of the original match.
57    pub byte_length: usize,
58    /// Semantic category assigned to the variable.
59    pub category: VariableCategory,
60}
61
62/// Variable extraction result for one prompt block.
63#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
64pub struct ExtractionResult {
65    /// Span identifier of the analyzed block.
66    pub span_id: SpanId,
67    /// Block content with extracted values replaced by placeholders.
68    pub template_content: String,
69    /// Variables extracted from the block.
70    pub variables: Vec<ExtractedVariable>,
71}
72
73/// Return the default regex patterns used for variable extraction.
74///
75/// # Returns
76/// A vector of built-in [`VariablePattern`] values covering common timestamps
77/// and request identifiers.
78pub fn default_variable_patterns() -> Vec<VariablePattern> {
79    vec![
80        VariablePattern {
81            name: "iso8601_timestamp".to_string(),
82            regex: Regex::new(
83                r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})",
84            )
85            .expect("iso8601_timestamp regex is valid"),
86            category: VariableCategory::Timestamp,
87        },
88        VariablePattern {
89            name: "uuid".to_string(),
90            regex: Regex::new(
91                r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
92            )
93            .expect("uuid regex is valid"),
94            category: VariableCategory::RequestId,
95        },
96        VariablePattern {
97            name: "request_id".to_string(),
98            regex: Regex::new(r"(?:req|trace|span|run|call|session)[-_][a-zA-Z0-9]{8,}")
99                .expect("request_id regex is valid"),
100            category: VariableCategory::RequestId,
101        },
102        VariablePattern {
103            name: "date_string".to_string(),
104            regex: Regex::new(r"\d{4}-\d{2}-\d{2}").expect("date_string regex is valid"),
105            category: VariableCategory::Timestamp,
106        },
107        VariablePattern {
108            name: "unix_timestamp".to_string(),
109            regex: Regex::new(r"\b1[0-9]{9,12}\b").expect("unix_timestamp regex is valid"),
110            category: VariableCategory::Timestamp,
111        },
112    ]
113}
114
115/// Extract variables from one content string.
116///
117/// Matching patterns are applied greedily by start position, preferring longer
118/// matches when multiple patterns overlap.
119///
120/// # Parameters
121/// - `content`: Block content to analyze.
122/// - `span_id`: Span identifier associated with the content.
123/// - `patterns`: Variable patterns to evaluate.
124///
125/// # Returns
126/// `Some(ExtractionResult)` when at least one variable is found and `None`
127/// otherwise.
128pub fn extract_variables(
129    content: &str,
130    span_id: &SpanId,
131    patterns: &[VariablePattern],
132) -> Option<ExtractionResult> {
133    let mut all_matches: Vec<(usize, usize, &VariablePattern)> = Vec::new();
134    for pattern in patterns {
135        for matched in pattern.regex.find_iter(content) {
136            all_matches.push((matched.start(), matched.end(), pattern));
137        }
138    }
139
140    if all_matches.is_empty() {
141        return None;
142    }
143
144    all_matches.sort_by(|left, right| {
145        left.0
146            .cmp(&right.0)
147            .then_with(|| (right.1 - right.0).cmp(&(left.1 - left.0)))
148    });
149
150    let mut selected: Vec<(usize, usize, &VariablePattern)> = Vec::new();
151    for candidate in all_matches {
152        let overlaps = selected
153            .iter()
154            .any(|selected_match| candidate.0 < selected_match.1 && candidate.1 > selected_match.0);
155        if !overlaps {
156            selected.push(candidate);
157        }
158    }
159
160    selected.sort_by_key(|selected_match| selected_match.0);
161
162    let mut template = String::with_capacity(content.len());
163    let mut variables: Vec<ExtractedVariable> = Vec::new();
164    let mut last_end = 0;
165
166    for (start, end, pattern) in &selected {
167        template.push_str(&content[last_end..*start]);
168        template.push_str(&format!("{{{{{}}}}}", pattern.name));
169        variables.push(ExtractedVariable {
170            pattern_name: pattern.name.clone(),
171            original_value: content[*start..*end].to_string(),
172            byte_offset: *start,
173            byte_length: end - start,
174            category: pattern.category.clone(),
175        });
176        last_end = *end;
177    }
178
179    template.push_str(&content[last_end..]);
180
181    Some(ExtractionResult {
182        span_id: span_id.clone(),
183        template_content: template,
184        variables,
185    })
186}
187
188/// Extract variables from a set of prompt blocks.
189///
190/// # Parameters
191/// - `blocks`: Prompt blocks to analyze.
192/// - `patterns`: Variable patterns to evaluate.
193///
194/// # Returns
195/// One [`ExtractionResult`] per block that contained at least one variable.
196pub fn extract_variables_from_blocks(
197    blocks: &[PromptBlock],
198    patterns: &[VariablePattern],
199) -> Vec<ExtractionResult> {
200    blocks
201        .iter()
202        .filter_map(|block| extract_variables(&block.content, &block.span_id, patterns))
203        .collect()
204}
205
206#[cfg(test)]
207#[path = "../../tests/unit/acg/variable_extractor_tests.rs"]
208mod tests;