Skip to main content

stygian_plugin/domain/
transformation.rs

1//! Transformation pipeline for extracted data
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
6/// A transformation to apply to extracted values
7///
8/// Transformations are chained in order to clean, normalize, and validate extracted data.
9#[derive(Debug, Clone, Serialize, Deserialize)]
10#[serde(tag = "type", content = "params")]
11pub enum Transformation {
12    /// Trim whitespace from start and end
13    Trim,
14
15    /// Convert to lowercase
16    Lowercase,
17
18    /// Convert to uppercase
19    Uppercase,
20
21    /// Remove all whitespace
22    RemoveWhitespace,
23
24    /// Replace using a regex pattern
25    Regex {
26        pattern: String,
27        replacement: String,
28    },
29
30    /// Extract using a regex capture group
31    RegexExtract { pattern: String, group: usize },
32
33    /// Coerce to a specific type
34    Coerce {
35        target_type: String, // "string", "number", "boolean", "date"
36    },
37
38    /// Keep only if matches a regex pattern
39    Filter { pattern: String },
40
41    /// Replace multiple consecutive whitespace with single space
42    NormalizeWhitespace,
43
44    /// Remove HTML tags
45    StripHtml,
46
47    /// Decode HTML entities
48    DecodeHtml,
49
50    /// Parse JSON string into object
51    ParseJson,
52
53    /// Custom JavaScript transformation (extension point)
54    #[cfg(feature = "javascript")]
55    JavaScript { code: String },
56}
57
58impl Transformation {
59    fn apply_regex(value: &str, pattern: &str, replacement: &str) -> crate::Result<String> {
60        let re = regex::Regex::new(pattern).map_err(|e| {
61            crate::error::PluginError::InvalidTransformation(format!("Invalid regex: {e}"))
62        })?;
63        Ok(re.replace_all(value, replacement).to_string())
64    }
65
66    fn apply_regex_extract(value: &str, pattern: &str, group: usize) -> crate::Result<String> {
67        let re = regex::Regex::new(pattern).map_err(|e| {
68            crate::error::PluginError::InvalidTransformation(format!("Invalid regex: {e}"))
69        })?;
70        let caps = re.captures(value).ok_or_else(|| {
71            crate::error::PluginError::ExtractionError(format!("No match for pattern: {pattern}"))
72        })?;
73        caps.get(group)
74            .ok_or_else(|| {
75                crate::error::PluginError::ExtractionError(format!(
76                    "Capture group {group} not found"
77                ))
78            })
79            .map(|m| m.as_str().to_string())
80    }
81
82    fn apply_coerce(value: &str, target_type: &str) -> crate::Result<String> {
83        match target_type {
84            "string" => Ok(value.to_string()),
85            "number" => {
86                value.parse::<f64>().map_err(|_| {
87                    crate::error::PluginError::InvalidTransformation(format!(
88                        "Cannot coerce '{value}' to number"
89                    ))
90                })?;
91                Ok(value.to_string())
92            }
93            "boolean" => {
94                let b = matches!(value.to_lowercase().as_str(), "true" | "yes" | "1");
95                Ok(b.to_string())
96            }
97            "date" => {
98                chrono::DateTime::parse_from_rfc3339(value).map_err(|_| {
99                    crate::error::PluginError::InvalidTransformation(format!(
100                        "Invalid date: {value}"
101                    ))
102                })?;
103                Ok(value.to_string())
104            }
105            _ => Err(crate::error::PluginError::InvalidTransformation(format!(
106                "Unknown type: {target_type}"
107            ))),
108        }
109    }
110
111    fn apply_filter(value: &str, pattern: &str) -> crate::Result<String> {
112        let re = regex::Regex::new(pattern).map_err(|e| {
113            crate::error::PluginError::InvalidTransformation(format!("Invalid regex: {e}"))
114        })?;
115        if re.is_match(value) {
116            Ok(value.to_string())
117        } else {
118            Err(crate::error::PluginError::ExtractionError(
119                "Value did not match filter pattern".to_string(),
120            ))
121        }
122    }
123
124    /// Apply this transformation to a value
125    ///
126    /// # Errors
127    ///
128    /// Returns [`crate::error::PluginError::InvalidTransformation`] when the
129    /// transformation has an invalid regex pattern, an unknown `Coerce`
130    /// target type, or an unparseable input (e.g. numeric coercion on a
131    /// non-numeric value). Returns
132    /// [`crate::error::PluginError::ExtractionError`] when a `Filter`
133    /// rejects the value or a `RegexExtract` pattern does not match.
134    pub fn apply(&self, value: &str) -> crate::Result<String> {
135        match self {
136            Self::Trim => Ok(value.trim().to_string()),
137            Self::Lowercase => Ok(value.to_lowercase()),
138            Self::Uppercase => Ok(value.to_uppercase()),
139            Self::RemoveWhitespace => Ok(value.chars().filter(|c| !c.is_whitespace()).collect()),
140            Self::Regex {
141                pattern,
142                replacement,
143            } => Self::apply_regex(value, pattern, replacement),
144            Self::RegexExtract { pattern, group } => {
145                Self::apply_regex_extract(value, pattern, *group)
146            }
147            Self::Coerce { target_type } => Self::apply_coerce(value, target_type),
148            Self::Filter { pattern } => Self::apply_filter(value, pattern),
149            Self::NormalizeWhitespace => Ok(value.split_whitespace().collect::<Vec<_>>().join(" ")),
150            Self::StripHtml => {
151                static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
152                    #[expect(clippy::expect_used, reason = "hardcoded regex pattern is valid")]
153                    regex::Regex::new(r"<[^>]+>").expect("valid hardcoded HTML tag pattern")
154                });
155                Ok(RE.replace_all(value, "").to_string())
156            }
157            Self::DecodeHtml => Ok(value
158                .replace("&lt;", "<")
159                .replace("&gt;", ">")
160                .replace("&amp;", "&")
161                .replace("&quot;", "\"")
162                .replace("&#39;", "'")),
163            Self::ParseJson => serde_json::from_str::<Value>(value)
164                .map(|v| v.to_string())
165                .map_err(|e| crate::error::PluginError::InvalidTransformation(e.to_string())),
166            #[cfg(feature = "javascript")]
167            Self::JavaScript { code: _ } => Err(crate::error::PluginError::InvalidTransformation(
168                "JavaScript transformations not yet implemented".to_string(),
169            )),
170        }
171    }
172
173    /// Apply a chain of transformations to a value
174    ///
175    /// # Errors
176    ///
177    /// Propagates any [`crate::error::PluginError`] raised by the first
178    /// failing transformation in the chain — see [`Transformation::apply`]
179    /// for the per-variant failure modes.
180    pub fn apply_chain(transformations: &[Self], mut value: String) -> crate::Result<String> {
181        for transformation in transformations {
182            value = transformation.apply(&value)?;
183        }
184        Ok(value)
185    }
186}
187
188impl std::fmt::Display for Transformation {
189    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190        match self {
191            Self::Trim => write!(f, "Trim"),
192            Self::Lowercase => write!(f, "Lowercase"),
193            Self::Uppercase => write!(f, "Uppercase"),
194            Self::RemoveWhitespace => write!(f, "RemoveWhitespace"),
195            Self::Regex { pattern, .. } => write!(f, "Regex({pattern})"),
196            Self::RegexExtract { pattern, group } => {
197                write!(f, "RegexExtract({pattern}, group {group})")
198            }
199            Self::Coerce { target_type } => write!(f, "Coerce({target_type})"),
200            Self::Filter { pattern } => write!(f, "Filter({pattern})"),
201            Self::NormalizeWhitespace => write!(f, "NormalizeWhitespace"),
202            Self::StripHtml => write!(f, "StripHtml"),
203            Self::DecodeHtml => write!(f, "DecodeHtml"),
204            Self::ParseJson => write!(f, "ParseJson"),
205            #[cfg(feature = "javascript")]
206            Self::JavaScript { .. } => write!(f, "JavaScript"),
207        }
208    }
209}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// Result type shared by the tests that propagate failures with `?`.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    /// Leading and trailing whitespace is removed.
    #[test]
    fn test_trim() -> TestResult {
        let trimmed = Transformation::Trim.apply("  hello  ")?;
        assert_eq!(trimmed, "hello");
        Ok(())
    }

    /// Upper-case input is folded to lower case.
    #[test]
    fn test_lowercase() -> TestResult {
        let lowered = Transformation::Lowercase.apply("HELLO")?;
        assert_eq!(lowered, "hello");
        Ok(())
    }

    /// Capture groups can be referenced from the replacement text.
    #[test]
    fn test_regex_replace() -> TestResult {
        let reformat = Transformation::Regex {
            pattern: String::from(r"(\d{3})-(\d{4})"),
            replacement: String::from("($1) $2"),
        };
        let output = reformat.apply("123-4567")?;
        assert_eq!(output, "(123) 4567");
        Ok(())
    }

    /// Only the requested capture group is returned.
    #[test]
    fn test_regex_extract() -> TestResult {
        let price = Transformation::RegexExtract {
            pattern: String::from(r"Price: \$(\d+\.\d{2})"),
            group: 1,
        };
        let output = price.apply("Price: $19.99")?;
        assert_eq!(output, "19.99");
        Ok(())
    }

    /// Numeric coercion accepts numbers and rejects everything else.
    #[test]
    fn test_coerce_number() {
        let to_number = Transformation::Coerce {
            target_type: String::from("number"),
        };
        assert!(to_number.apply("123.45").is_ok());
        assert!(to_number.apply("not a number").is_err());
    }

    /// Runs of whitespace collapse to a single space.
    #[test]
    fn test_normalize_whitespace() -> TestResult {
        let normalized = Transformation::NormalizeWhitespace.apply("hello   world   foo")?;
        assert_eq!(normalized, "hello world foo");
        Ok(())
    }

    /// Tags are dropped while the text between them is kept.
    #[test]
    fn test_strip_html() -> TestResult {
        let plain = Transformation::StripHtml.apply("<p>Hello <b>world</b></p>")?;
        assert_eq!(plain, "Hello world");
        Ok(())
    }

    /// Steps run in order, each consuming the previous step's output.
    #[test]
    fn test_transformation_chain() -> TestResult {
        let pipeline = [
            Transformation::StripHtml,
            Transformation::Trim,
            Transformation::NormalizeWhitespace,
        ];
        let input = String::from("  <p>hello   world</p>  ");
        let output = Transformation::apply_chain(&pipeline, input)?;
        assert_eq!(output, "hello world");
        Ok(())
    }
}