Skip to main content

oxidize_pdf/templates/
parser.rs

1use regex::Regex;
2use std::collections::HashSet;
3
4use super::error::{TemplateError, TemplateResult};
5
/// A placeholder found in template text.
///
/// `start` and `end` are byte offsets into the template the placeholder was
/// found in, forming the half-open range `start..end`, so
/// `&template[start..end]` yields `full_text` again.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Placeholder {
    /// The full placeholder text including delimiters (e.g., "{{name}}")
    pub full_text: String,
    /// The variable name without delimiters or surrounding whitespace (e.g., "name")
    pub variable_name: String,
    /// Byte offset of the first character of the placeholder in the original text
    pub start: usize,
    /// Byte offset one past the last character of the placeholder in the original text
    pub end: usize,
}

impl Placeholder {
    /// Create a new placeholder.
    ///
    /// # Arguments
    /// * `full_text` - the placeholder exactly as written, delimiters included
    /// * `variable_name` - the name between the delimiters
    /// * `start` - byte offset where the placeholder begins
    /// * `end` - byte offset one past where the placeholder ends
    ///
    /// No validation is performed; the values are stored as given.
    pub fn new(full_text: String, variable_name: String, start: usize, end: usize) -> Self {
        Self {
            full_text,
            variable_name,
            start,
            end,
        }
    }
}
30
31/// Template parser for finding and validating placeholders
32pub struct TemplateParser {
33    /// Regex for matching placeholders
34    placeholder_regex: Regex,
35}
36
37impl TemplateParser {
38    /// Create a new template parser
39    ///
40    /// # Returns
41    /// Always returns a valid parser. The regex pattern is hardcoded and validated by tests.
42    pub fn new() -> Self {
43        // Regex to match {{variable_name}} patterns
44        // Supports alphanumeric, underscore, and dot notation
45        //
46        // SAFETY: This hardcoded regex pattern is compile-time validated via unit tests.
47        // The pattern has been verified to always compile successfully.
48        //
49        // If compilation somehow fails (impossible in practice), we fall back to
50        // a simpler pattern that matches anything between {{ }}.
51        let placeholder_regex = Regex::new(r"\{\{\s*([a-zA-Z_][a-zA-Z0-9_.]*)\s*\}\}")
52            .or_else(|_| Regex::new(r"\{\{([^}]+)\}\}"))
53            .or_else(|_| Regex::new(r"[^\x00-\x7F]+")) // Fallback: match non-ASCII (will match nothing in templates)
54            .unwrap_or_else(|_| {
55                // This branch is unreachable - at least one of the patterns above will compile
56                // If we reach here, the regex engine is fundamentally broken
57                unreachable!("All regex patterns failed to compile - regex engine is broken")
58            });
59
60        Self { placeholder_regex }
61    }
62
63    /// Parse a template string and extract all placeholders
64    pub fn parse(&self, template: &str) -> TemplateResult<Vec<Placeholder>> {
65        let mut placeholders = Vec::new();
66
67        for captures in self.placeholder_regex.captures_iter(template) {
68            // Group 0 (full match) and group 1 (captured group) are guaranteed by successful regex match
69            let full_match = &captures[0];
70            let variable_name_str = &captures[1];
71
72            let full_text = full_match.to_string();
73            let variable_name = variable_name_str.to_string();
74            let start = captures.get(0).map(|m| m.start()).unwrap_or(0);
75            let end = captures.get(0).map(|m| m.end()).unwrap_or(0);
76
77            // Validate variable name
78            self.validate_variable_name(&variable_name)?;
79
80            placeholders.push(Placeholder::new(full_text, variable_name, start, end));
81        }
82
83        // Check for invalid placeholder patterns first (empty, malformed, etc.)
84        self.check_for_invalid_patterns(template)?;
85
86        // Then check for invalid variable names in remaining double brace patterns
87        self.check_for_invalid_variable_names_in_braces(template)?;
88
89        Ok(placeholders)
90    }
91
92    /// Get all unique variable names from a template
93    pub fn get_variable_names(&self, template: &str) -> TemplateResult<Vec<String>> {
94        let placeholders = self.parse(template)?;
95        let mut names: HashSet<String> = HashSet::new();
96
97        for placeholder in placeholders {
98            names.insert(placeholder.variable_name);
99        }
100
101        let mut result: Vec<String> = names.into_iter().collect();
102        result.sort();
103        Ok(result)
104    }
105
106    /// Validate a variable name
107    fn validate_variable_name(&self, name: &str) -> TemplateResult<()> {
108        if name.is_empty() {
109            return Err(TemplateError::InvalidVariableName(name.to_string()));
110        }
111
112        // Check if it starts with a letter or underscore
113        if let Some(first_char) = name.chars().next() {
114            if !first_char.is_alphabetic() && first_char != '_' {
115                return Err(TemplateError::InvalidVariableName(name.to_string()));
116            }
117        } else {
118            // Empty name (should have been caught earlier, but be defensive)
119            return Err(TemplateError::InvalidVariableName(name.to_string()));
120        }
121
122        // Check if all characters are valid
123        for ch in name.chars() {
124            if !ch.is_alphanumeric() && ch != '_' && ch != '.' {
125                return Err(TemplateError::InvalidVariableName(name.to_string()));
126            }
127        }
128
129        Ok(())
130    }
131
132    /// Check for invalid placeholder patterns that might confuse users
133    fn check_for_invalid_patterns(&self, template: &str) -> TemplateResult<()> {
134        // Check for empty placeholders FIRST
135        let empty_placeholder_regex = Regex::new(r"\{\{\s*\}\}")?;
136        if let Some(empty_match) = empty_placeholder_regex.find(template) {
137            return Err(TemplateError::InvalidPlaceholder(format!(
138                "Empty placeholder found at position {}: '{}'",
139                empty_match.start(),
140                empty_match.as_str()
141            )));
142        }
143
144        // Check for malformed placeholders with too many braces
145        let malformed_regex = Regex::new(r"\{\{\{+|\}\}\}+")?;
146        if let Some(malformed_match) = malformed_regex.find(template) {
147            return Err(TemplateError::InvalidPlaceholder(format!(
148                "Malformed placeholder at position {}: '{}' - use exactly two braces",
149                malformed_match.start(),
150                malformed_match.as_str()
151            )));
152        }
153
154        // Check for single braces that might indicate user error
155        // Remove all double brace patterns (valid and invalid), then look for remaining single braces
156        let all_double_braces_regex = Regex::new(r"\{\{\s*[^}]*\s*\}\}")?;
157        let cleaned = all_double_braces_regex.replace_all(template, "");
158
159        // Now check for any remaining single braces
160        let single_brace_regex = Regex::new(r"[{}]")?;
161        if let Some(invalid_match) = single_brace_regex.find(&cleaned) {
162            // Find the position in the original string
163            let position = self.find_original_position(&cleaned, invalid_match.start(), template);
164            return Err(TemplateError::InvalidPlaceholder(format!(
165                "Found single brace near position {}: '{}' - did you mean to use double braces {{{{}}}}?",
166                position,
167                invalid_match.as_str()
168            )));
169        }
170
171        Ok(())
172    }
173
174    /// Check if a template has any placeholders
175    pub fn has_placeholders(&self, template: &str) -> bool {
176        self.placeholder_regex.is_match(template)
177    }
178
179    /// Count the number of placeholders in a template
180    pub fn count_placeholders(&self, template: &str) -> usize {
181        self.placeholder_regex.find_iter(template).count()
182    }
183
184    /// Helper method to find position in original string after replacements
185    fn find_original_position(&self, _cleaned: &str, cleaned_pos: usize, original: &str) -> usize {
186        // This is a simplified approximation - for better accuracy we'd need more complex tracking
187        // But for error reporting purposes, this should be sufficient
188        cleaned_pos.min(original.len().saturating_sub(1))
189    }
190
191    /// Check for double brace patterns with invalid variable names
192    fn check_for_invalid_variable_names_in_braces(&self, template: &str) -> TemplateResult<()> {
193        // Find all potential double brace patterns (regardless of variable name validity)
194        let all_double_braces_regex = Regex::new(r"\{\{\s*([^}]*)\s*\}\}")?;
195
196        for captures in all_double_braces_regex.captures_iter(template) {
197            // Group 1 is guaranteed by successful regex match (pattern has one capture group)
198            let variable_name = captures[1].trim();
199
200            // If this variable name is invalid, report it
201            if self.validate_variable_name(variable_name).is_err() {
202                return Err(TemplateError::InvalidVariableName(
203                    variable_name.to_string(),
204                ));
205            }
206        }
207
208        Ok(())
209    }
210}
211
212impl Default for TemplateParser {
213    fn default() -> Self {
214        Self::new()
215    }
216}
217
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `template` with a fresh parser, panicking if parsing fails.
    fn parse_ok(template: &str) -> Vec<Placeholder> {
        TemplateParser::new().parse(template).unwrap()
    }

    /// Collects the variable names of `placeholders` in order of appearance.
    fn names_of(placeholders: &[Placeholder]) -> Vec<&str> {
        placeholders
            .iter()
            .map(|placeholder| placeholder.variable_name.as_str())
            .collect()
    }

    #[test]
    fn test_basic_placeholder_parsing() {
        let found = parse_ok("Hello {{name}}, your total is {{total}}.");
        assert_eq!(found.len(), 2);

        let first = &found[0];
        assert_eq!(first.variable_name, "name");
        assert_eq!(first.full_text, "{{name}}");
        assert_eq!(first.start, 6);

        let second = &found[1];
        assert_eq!(second.variable_name, "total");
        assert_eq!(second.full_text, "{{total}}");
    }

    #[test]
    fn test_dot_notation_variables() {
        let found = parse_ok("User: {{user.name}} ({{user.age}} years old)");
        assert_eq!(found.len(), 2);
        assert_eq!(names_of(&found), vec!["user.name", "user.age"]);
    }

    #[test]
    fn test_whitespace_handling() {
        // Whitespace inside the braces is not part of the variable name.
        let found = parse_ok("{{ name }} and {{  total  }}");
        assert_eq!(found.len(), 2);
        assert_eq!(names_of(&found), vec!["name", "total"]);
    }

    #[test]
    fn test_get_variable_names() {
        // Duplicates collapse and the result comes back sorted.
        let names = TemplateParser::new()
            .get_variable_names("{{name}} {{total}} {{name}} {{user.age}}")
            .unwrap();
        assert_eq!(names, vec!["name", "total", "user.age"]);
    }

    #[test]
    fn test_invalid_variable_names() {
        // A name must not start with a digit.
        let outcome = TemplateParser::new().parse("{{123invalid}}");
        assert!(matches!(
            outcome,
            Err(TemplateError::InvalidVariableName(_))
        ));
    }

    #[test]
    fn test_invalid_placeholder_patterns() {
        let parser = TemplateParser::new();

        // In order: single braces, empty placeholder, too many braces.
        for template in ["Hello {name}", "Hello {{}}", "Hello {{{name}}}"] {
            let outcome = parser.parse(template);
            assert!(
                matches!(outcome, Err(TemplateError::InvalidPlaceholder(_))),
                "expected InvalidPlaceholder for {:?}",
                template
            );
        }
    }

    #[test]
    fn test_has_placeholders() {
        let parser = TemplateParser::new();
        assert!(parser.has_placeholders("Hello {{name}}"));
        assert!(!parser.has_placeholders("Hello world"));
    }

    #[test]
    fn test_count_placeholders() {
        let parser = TemplateParser::new();
        let cases = [
            ("{{a}} {{b}} {{c}}", 3),
            ("No placeholders here", 0),
            ("{{duplicate}} {{duplicate}}", 2),
        ];
        for (template, expected) in cases {
            assert_eq!(parser.count_placeholders(template), expected);
        }
    }

    #[test]
    fn test_placeholder_positions() {
        let template = "Start {{var1}} middle {{var2}} end";
        let found = parse_ok(template);

        // Each placeholder is 8 bytes long: 6..14 and 22..30.
        let spans: Vec<(usize, usize)> = found
            .iter()
            .map(|placeholder| (placeholder.start, placeholder.end))
            .collect();
        assert_eq!(spans[0], (6, 14));
        assert_eq!(spans[1], (22, 30));

        // Slicing the template with the reported offsets gives back the placeholder text.
        assert_eq!(&template[found[0].start..found[0].end], "{{var1}}");
        assert_eq!(&template[found[1].start..found[1].end], "{{var2}}");
    }
}
335}