Skip to main content

stygian_plugin/domain/
transformation.rs

1//! Transformation pipeline for extracted data
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6/// A transformation to apply to extracted values
7///
8/// Transformations are chained in order to clean, normalize, and validate extracted data.
9#[derive(Debug, Clone, Serialize, Deserialize)]
10#[serde(tag = "type", content = "params")]
11pub enum Transformation {
12    /// Trim whitespace from start and end
13    Trim,
14
15    /// Convert to lowercase
16    Lowercase,
17
18    /// Convert to uppercase
19    Uppercase,
20
21    /// Remove all whitespace
22    RemoveWhitespace,
23
24    /// Replace using a regex pattern
25    Regex {
26        pattern: String,
27        replacement: String,
28    },
29
30    /// Extract using a regex capture group
31    RegexExtract { pattern: String, group: usize },
32
33    /// Coerce to a specific type
34    Coerce {
35        target_type: String, // "string", "number", "boolean", "date"
36    },
37
38    /// Keep only if matches a regex pattern
39    Filter { pattern: String },
40
41    /// Replace multiple consecutive whitespace with single space
42    NormalizeWhitespace,
43
44    /// Remove HTML tags
45    StripHtml,
46
47    /// Decode HTML entities
48    DecodeHtml,
49
50    /// Parse JSON string into object
51    ParseJson,
52
53    /// Custom JavaScript transformation (extension point)
54    #[cfg(feature = "javascript")]
55    JavaScript { code: String },
56}
57
58impl Transformation {
59    fn apply_regex(value: &str, pattern: &str, replacement: &str) -> crate::Result<String> {
60        let re = regex::Regex::new(pattern).map_err(|e| {
61            crate::error::PluginError::InvalidTransformation(format!("Invalid regex: {e}"))
62        })?;
63        Ok(re.replace_all(value, replacement).to_string())
64    }
65
66    fn apply_regex_extract(value: &str, pattern: &str, group: usize) -> crate::Result<String> {
67        let re = regex::Regex::new(pattern).map_err(|e| {
68            crate::error::PluginError::InvalidTransformation(format!("Invalid regex: {e}"))
69        })?;
70        let caps = re.captures(value).ok_or_else(|| {
71            crate::error::PluginError::ExtractionError(format!("No match for pattern: {pattern}"))
72        })?;
73        caps.get(group)
74            .ok_or_else(|| {
75                crate::error::PluginError::ExtractionError(format!(
76                    "Capture group {group} not found"
77                ))
78            })
79            .map(|m| m.as_str().to_string())
80    }
81
82    fn apply_coerce(value: &str, target_type: &str) -> crate::Result<String> {
83        match target_type {
84            "string" => Ok(value.to_string()),
85            "number" => {
86                value.parse::<f64>().map_err(|_| {
87                    crate::error::PluginError::InvalidTransformation(format!(
88                        "Cannot coerce '{value}' to number"
89                    ))
90                })?;
91                Ok(value.to_string())
92            }
93            "boolean" => {
94                let b = matches!(value.to_lowercase().as_str(), "true" | "yes" | "1");
95                Ok(b.to_string())
96            }
97            "date" => {
98                chrono::DateTime::parse_from_rfc3339(value).map_err(|_| {
99                    crate::error::PluginError::InvalidTransformation(format!(
100                        "Invalid date: {value}"
101                    ))
102                })?;
103                Ok(value.to_string())
104            }
105            _ => Err(crate::error::PluginError::InvalidTransformation(format!(
106                "Unknown type: {target_type}"
107            ))),
108        }
109    }
110
111    fn apply_filter(value: &str, pattern: &str) -> crate::Result<String> {
112        let re = regex::Regex::new(pattern).map_err(|e| {
113            crate::error::PluginError::InvalidTransformation(format!("Invalid regex: {e}"))
114        })?;
115        if re.is_match(value) {
116            Ok(value.to_string())
117        } else {
118            Err(crate::error::PluginError::ExtractionError(
119                "Value did not match filter pattern".to_string(),
120            ))
121        }
122    }
123
124    /// Apply this transformation to a value
125    pub fn apply(&self, value: &str) -> crate::Result<String> {
126        match self {
127            Self::Trim => Ok(value.trim().to_string()),
128            Self::Lowercase => Ok(value.to_lowercase()),
129            Self::Uppercase => Ok(value.to_uppercase()),
130            Self::RemoveWhitespace => Ok(value.chars().filter(|c| !c.is_whitespace()).collect()),
131            Self::Regex {
132                pattern,
133                replacement,
134            } => Self::apply_regex(value, pattern, replacement),
135            Self::RegexExtract { pattern, group } => {
136                Self::apply_regex_extract(value, pattern, *group)
137            }
138            Self::Coerce { target_type } => Self::apply_coerce(value, target_type),
139            Self::Filter { pattern } => Self::apply_filter(value, pattern),
140            Self::NormalizeWhitespace => Ok(value.split_whitespace().collect::<Vec<_>>().join(" ")),
141            Self::StripHtml => {
142                static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
143                    #[expect(clippy::expect_used, reason = "hardcoded regex pattern is valid")]
144                    regex::Regex::new(r"<[^>]+>").expect("valid hardcoded HTML tag pattern")
145                });
146                Ok(RE.replace_all(value, "").to_string())
147            }
148            Self::DecodeHtml => Ok(value
149                .replace("&lt;", "<")
150                .replace("&gt;", ">")
151                .replace("&amp;", "&")
152                .replace("&quot;", "\"")
153                .replace("&#39;", "'")),
154            Self::ParseJson => serde_json::from_str::<Value>(value)
155                .map(|v| v.to_string())
156                .map_err(|e| crate::error::PluginError::InvalidTransformation(e.to_string())),
157            #[cfg(feature = "javascript")]
158            Self::JavaScript { code: _ } => Err(crate::error::PluginError::InvalidTransformation(
159                "JavaScript transformations not yet implemented".to_string(),
160            )),
161        }
162    }
163
164    /// Apply a chain of transformations to a value
165    pub fn apply_chain(transformations: &[Self], mut value: String) -> crate::Result<String> {
166        for transformation in transformations {
167            value = transformation.apply(&value)?;
168        }
169        Ok(value)
170    }
171}
172
173impl std::fmt::Display for Transformation {
174    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
175        match self {
176            Self::Trim => write!(f, "Trim"),
177            Self::Lowercase => write!(f, "Lowercase"),
178            Self::Uppercase => write!(f, "Uppercase"),
179            Self::RemoveWhitespace => write!(f, "RemoveWhitespace"),
180            Self::Regex { pattern, .. } => write!(f, "Regex({pattern})"),
181            Self::RegexExtract { pattern, group } => {
182                write!(f, "RegexExtract({pattern}, group {group})")
183            }
184            Self::Coerce { target_type } => write!(f, "Coerce({target_type})"),
185            Self::Filter { pattern } => write!(f, "Filter({pattern})"),
186            Self::NormalizeWhitespace => write!(f, "NormalizeWhitespace"),
187            Self::StripHtml => write!(f, "StripHtml"),
188            Self::DecodeHtml => write!(f, "DecodeHtml"),
189            Self::ParseJson => write!(f, "ParseJson"),
190            #[cfg(feature = "javascript")]
191            Self::JavaScript { .. } => write!(f, "JavaScript"),
192        }
193    }
194}
195
#[cfg(test)]
mod tests {
    use super::*;

    /// Result type for tests that propagate transformation errors with `?`.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    #[test]
    fn test_trim() -> TestResult {
        let t = Transformation::Trim;
        assert_eq!(t.apply("  hello  ")?, "hello");
        Ok(())
    }

    #[test]
    fn test_lowercase() -> TestResult {
        let t = Transformation::Lowercase;
        assert_eq!(t.apply("HELLO")?, "hello");
        Ok(())
    }

    #[test]
    fn test_uppercase() -> TestResult {
        let t = Transformation::Uppercase;
        assert_eq!(t.apply("hello")?, "HELLO");
        Ok(())
    }

    #[test]
    fn test_remove_whitespace() -> TestResult {
        let t = Transformation::RemoveWhitespace;
        assert_eq!(t.apply(" a b\tc\n")?, "abc");
        Ok(())
    }

    #[test]
    fn test_regex_replace() -> TestResult {
        let t = Transformation::Regex {
            pattern: r"(\d{3})-(\d{4})".to_string(),
            replacement: "($1) $2".to_string(),
        };
        assert_eq!(t.apply("123-4567")?, "(123) 4567");
        Ok(())
    }

    #[test]
    fn test_invalid_regex_is_rejected() {
        // An unclosed group is not a valid pattern for any regex-backed variant.
        let replace = Transformation::Regex {
            pattern: "(".to_string(),
            replacement: String::new(),
        };
        let filter = Transformation::Filter {
            pattern: "(".to_string(),
        };
        assert!(replace.apply("anything").is_err());
        assert!(filter.apply("anything").is_err());
    }

    #[test]
    fn test_regex_extract() -> TestResult {
        let t = Transformation::RegexExtract {
            pattern: r"Price: \$(\d+\.\d{2})".to_string(),
            group: 1,
        };
        assert_eq!(t.apply("Price: $19.99")?, "19.99");
        Ok(())
    }

    #[test]
    fn test_regex_extract_failures() {
        let no_match = Transformation::RegexExtract {
            pattern: r"(\d+)".to_string(),
            group: 1,
        };
        assert!(no_match.apply("no digits here").is_err());

        // The pattern matches, but it only defines one capture group.
        let missing_group = Transformation::RegexExtract {
            pattern: "(a)".to_string(),
            group: 2,
        };
        assert!(missing_group.apply("a").is_err());
    }

    #[test]
    fn test_coerce_number() {
        let t = Transformation::Coerce {
            target_type: "number".to_string(),
        };
        assert!(t.apply("123.45").is_ok());
        assert!(t.apply("not a number").is_err());
    }

    #[test]
    fn test_coerce_boolean() -> TestResult {
        let t = Transformation::Coerce {
            target_type: "boolean".to_string(),
        };
        assert_eq!(t.apply("Yes")?, "true");
        assert_eq!(t.apply("1")?, "true");
        assert_eq!(t.apply("no")?, "false");
        Ok(())
    }

    #[test]
    fn test_coerce_date() {
        let t = Transformation::Coerce {
            target_type: "date".to_string(),
        };
        assert!(t.apply("2024-01-15T10:30:00Z").is_ok());
        assert!(t.apply("not a date").is_err());
    }

    #[test]
    fn test_coerce_unknown_type() {
        let t = Transformation::Coerce {
            target_type: "uuid".to_string(),
        };
        assert!(t.apply("anything").is_err());
    }

    #[test]
    fn test_filter() -> TestResult {
        let t = Transformation::Filter {
            pattern: r"^\d+$".to_string(),
        };
        assert_eq!(t.apply("123")?, "123");
        assert!(t.apply("abc").is_err());
        Ok(())
    }

    #[test]
    fn test_normalize_whitespace() -> TestResult {
        let t = Transformation::NormalizeWhitespace;
        assert_eq!(t.apply("hello   world   foo")?, "hello world foo");
        Ok(())
    }

    #[test]
    fn test_strip_html() -> TestResult {
        let t = Transformation::StripHtml;
        assert_eq!(t.apply("<p>Hello <b>world</b></p>")?, "Hello world");
        Ok(())
    }

    #[test]
    fn test_decode_html() -> TestResult {
        let t = Transformation::DecodeHtml;
        assert_eq!(
            t.apply("&lt;b&gt; &amp; &quot;x&quot; &#39;y&#39;")?,
            "<b> & \"x\" 'y'"
        );
        Ok(())
    }

    #[test]
    fn test_parse_json() -> TestResult {
        let t = Transformation::ParseJson;
        assert_eq!(t.apply(r#"{ "a" : 1 }"#)?, r#"{"a":1}"#);
        assert!(t.apply("{not json").is_err());
        Ok(())
    }

    #[test]
    fn test_transformation_chain() -> TestResult {
        let transforms = vec![
            Transformation::StripHtml,
            Transformation::Trim,
            Transformation::NormalizeWhitespace,
        ];
        let result =
            Transformation::apply_chain(&transforms, "  <p>hello   world</p>  ".to_string())?;
        assert_eq!(result, "hello world");
        Ok(())
    }

    #[test]
    fn test_empty_chain_returns_input() -> TestResult {
        let result = Transformation::apply_chain(&[], "unchanged".to_string())?;
        assert_eq!(result, "unchanged");
        Ok(())
    }

    #[test]
    fn test_chain_stops_on_error() {
        let transforms = [
            Transformation::Trim,
            Transformation::Filter {
                pattern: r"^\d+$".to_string(),
            },
            Transformation::Uppercase,
        ];
        assert!(Transformation::apply_chain(&transforms, " abc ".to_string()).is_err());
    }

    #[test]
    fn test_display() {
        assert_eq!(Transformation::Trim.to_string(), "Trim");
        let extract = Transformation::RegexExtract {
            pattern: "x".to_string(),
            group: 1,
        };
        assert_eq!(extract.to_string(), "RegexExtract(x, group 1)");
    }
}