dom_finder/
pipeline.rs

1use regex::Regex;
2
3use crate::errors::PipelineError;
4
5use super::errors::ParseError;
6
// Procedure names as they are written in a raw pipeline definition.
// `Proc::new` matches the first element of every raw entry against these.

// Regex-based procedures.
/// Name of the procedure that concatenates the capture groups of the first match.
const REGEX_PROC: &str = "regex";
/// Name of the procedure that returns the first entire regex match.
const REGEX_FIND_PROC: &str = "regex_find";

// Text rewriting and extraction procedures.
/// Name of the procedure that replaces one substring with another.
const REPLACE_PROC: &str = "replace";
/// Name of the procedure that extracts a value from a JSON string by path.
const EXTRACT_JSON: &str = "extract_json";
/// Name of the procedure that decodes HTML entities.
const HTML_UNESCAPE: &str = "html_unescape";

// Whitespace and trimming procedures.
/// Name of the procedure that trims whitespace from both ends.
const TRIM_SPACE: &str = "trim_space";
/// Name of the procedure that trims a caller-supplied set of characters from both ends.
const TRIM: &str = "trim";
/// Name of the procedure that collapses runs of whitespace into single spaces.
const NORMALIZE_SPACES: &str = "normalize_spaces";
16
17/// Represents a pipeline of processing procedures.
18#[derive(Debug)]
19pub struct Pipeline {
20    procs: Vec<Proc>,
21}
22
23impl Pipeline {
24    /// Creates a new `Pipeline` instance based on the provided raw pipelines.
25    ///
26    /// # Arguments
27    ///
28    /// * `raw_pipelines` - A reference to a vector of vectors of strings representing the raw pipeline elements.
29    ///
30    /// # Returns
31    ///
32    /// Returns a new `Result<Pipeline, ParseError>` instance. Because regex can fail to compile and user can provide an invalid procedure.
33    pub fn new(raw_pipelines: &Vec<Vec<String>>) -> Result<Pipeline, ParseError> {
34        let mut procs = vec![];
35        for proc_args in raw_pipelines {
36            if let Some((proc_name, args)) = proc_args.split_first() {
37                let proc = Proc::new(proc_name, args)?;
38                procs.push(proc);
39            }
40        }
41        Ok(Pipeline { procs })
42    }
43
44    /// Handles the given value by applying all the processing procedures in the pipeline.
45    ///
46    /// # Arguments
47    ///
48    /// * `value` - The input value to be processed.
49    ///
50    /// # Returns
51    ///
52    /// Returns the processed value as a string.
53    pub fn handle(&self, value: String) -> String {
54        let mut res: String = value;
55        for command in self.procs.iter() {
56            res = command.handle(&res)
57        }
58        res
59    }
60}
61
62/// Represents a procedure in the pipeline.
63#[derive(Debug)]
64pub enum Proc {
65    /// finds all captured groups from the first matching.
66    /// It returns concatenated string from all captured groups.
67    /// If you need a full match, please use `RegexFind` instead.
68    /// `Regex.captures` is applied under the hood.  It requires one argument - the `Regex`.
69    Regex(Regex),
70    /// it returns the first entire match of the regex in the given value (haystack).
71    /// `Regex.find` is applied It requires one argument - the `Regex`.
72    RegexFind(Regex),
73    /// requires two arguments - the old and the new string.
74    Replace(Box<str>, Box<str>),
75    /// requires one argument - the path to the json value, if the string represents a json.
76    ExtractJson(Box<str>),
77    /// requires no arguments. It trims spaces at the start and the end of the string.
78    TrimSpace,
79    /// requires one argument - it trims characters from the (start and end of) string with the cut set.
80    Trim(Vec<char>),
81    /// requires no arguments. It normalizes spaces in the string. Includes tabulations and new lines.
82    NormalizeSpaces,
83    /// unescape html entities, requires no arguments.
84    HtmlUnescape,
85}
86
87impl Proc {
88    /// Creates a new `Proc` instance based on the provided `proc_args`.
89    ///
90    /// # Arguments
91    ///
92    /// * `proc_args` - A slice of strings representing the arguments for the `Proc`.
93    ///
94    /// # Returns
95    ///
96    /// Returns a new `Result<Option<Proc>, ParseError>` instance. Because:
97    /// * regex can fail to compile
98    /// * user can provide an invalid procedure
99    /// * user can provide an invalid number of arguments for a procedures
100    fn new<'b>(proc_name: &'b str, args: &'b [String]) -> Result<Self, PipelineError> {
101        let proc_opt = match proc_name {
102            REGEX_PROC => {
103                validate_args_len(proc_name, args.len(), 1)?;
104                Proc::Regex(Regex::new(&args[0])?)
105            }
106            REGEX_FIND_PROC => {
107                validate_args_len(proc_name, args.len(), 1)?;
108                Proc::RegexFind(Regex::new(&args[0])?)
109            }
110            EXTRACT_JSON => {
111                validate_args_len(proc_name, args.len(), 1)?;
112                Proc::ExtractJson(args[0].clone().into())
113            }
114            REPLACE_PROC => {
115                validate_args_len(proc_name, args.len(), 2)?;
116                Proc::Replace(args[0].clone().into(), args[1].clone().into())
117            }
118            TRIM_SPACE => Proc::TrimSpace,
119            TRIM => {
120                validate_args_len(proc_name, args.len(), 1)?;
121                let cut_set: Vec<char> = args[0].chars().collect();
122                Proc::Trim(cut_set)
123            }
124            NORMALIZE_SPACES => Proc::NormalizeSpaces,
125            HTML_UNESCAPE => Proc::HtmlUnescape,
126            _ => return Err(PipelineError::ProcDoesNotExist(proc_name.to_string())),
127        };
128        Ok(proc_opt)
129    }
130
131    /// Handles the given value by applying the processing procedure.
132    ///
133    /// # Arguments
134    ///
135    /// * `value` - The input value to be processed.
136    ///
137    /// # Returns
138    ///
139    /// Returns the processed value as a string.
140    fn handle(&self, value: &str) -> String {
141        match self {
142            Proc::Regex(re) => re_extract_matches(re, value),
143            Proc::RegexFind(re) => re
144                .find(value)
145                .map(|m| m.as_str())
146                .unwrap_or_default()
147                .to_string(),
148            Proc::Replace(old, new) => value.replace(old.as_ref(), new),
149            Proc::ExtractJson(path) => gjson::get(value, path).to_string(),
150            Proc::TrimSpace => value.trim().to_string(),
151            Proc::Trim(pat) => value.trim_matches(pat.as_slice()).to_string(),
152            Proc::NormalizeSpaces => normalize_spaces(value),
153            Proc::HtmlUnescape => html_escape::decode_html_entities(value).to_string(),
154        }
155    }
156}
157
158fn validate_args_len(proc_name: &str, args_len: usize, len: usize) -> Result<(), PipelineError> {
159    if args_len < len {
160        return Err(PipelineError::ProcNotEnoughArguments(
161            proc_name.to_string(),
162            args_len,
163            len,
164        ));
165    }
166    Ok(())
167}
168
169fn re_extract_matches(re: &Regex, haystack: &str) -> String {
170    let cap_groups = re.captures_len();
171    match re.captures(haystack) {
172        Some(m) => (1..cap_groups)
173            .filter_map(|i| m.get(i))
174            .map(|cap| cap.as_str())
175            .collect(),
176        None => "".to_string(),
177    }
178}
179
/// Collapses every run of whitespace in `text` into a single space.
///
/// Whitespace at the start and at the end is removed entirely, so an input made of
/// whitespace only yields an empty string.
fn normalize_spaces(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means at least one word precedes.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
183
#[cfg(test)]
mod tests {
    use super::*;

    // --- `Proc::handle`, one test per procedure ---

    #[test]
    fn regex_proc_matching_group() {
        // The non-capturing scheme group must not show up in the result.
        let re = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let proc = Proc::Regex(re);
        let res = proc.handle("http://www.example.com/p1/?q=2");
        assert_eq!(res, "www.example.com");
    }

    #[test]
    fn regex_proc_only_capture_groups() {
        // Both groups capture, so both are concatenated; the trailing `/` is not in a group.
        let re = Regex::new(r"(https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let proc = Proc::Regex(re);
        let res = proc.handle("http://www.example.com/p1/?q=2");
        assert_eq!(res, "http://www.example.com");
    }

    #[test]
    fn regex_find_proc() {
        let re = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let proc = Proc::RegexFind(re);
        let res = proc.handle("http://www.example.com/p1/?q=2");
        assert_eq!(res, "http://www.example.com/");
    }

    #[test]
    fn regex_procs_return_empty_string_without_match() {
        let proc = Proc::Regex(Regex::new(r"(\d+)").unwrap());
        assert_eq!(proc.handle("no digits here"), "");

        let proc = Proc::RegexFind(Regex::new(r"\d+").unwrap());
        assert_eq!(proc.handle("no digits here"), "");
    }

    #[test]
    fn extract_json() {
        let proc = Proc::ExtractJson("a.b.c".into());
        let res = proc.handle(r#"{"a":{"b":{"c":"d"}}}"#);
        assert_eq!(res, "d");
    }

    #[test]
    fn trim() {
        let proc = Proc::Trim(vec![' ', '-', '=']);
        let res = proc.handle(" -=1=- ");
        assert_eq!(res, "1");
    }

    #[test]
    fn trim_space_proc() {
        let proc = Proc::TrimSpace;
        let res = proc.handle(" \t padded value\n");
        assert_eq!(res, "padded value");
    }

    #[test]
    fn replace() {
        let proc = Proc::Replace("%20".into(), "+".into());
        let res = proc.handle("search/?q=mob%20100");
        assert_eq!(res, "search/?q=mob+100");
    }

    #[test]
    fn normalize_spaces() {
        let proc = Proc::NormalizeSpaces;
        let res = proc.handle("<div>\n    Some\t</span>green</span>  text\n</div>\n");
        assert_eq!(res, "<div> Some </span>green</span> text </div>");
    }

    #[test]
    fn html_unescape_proc() {
        let proc = Proc::HtmlUnescape;
        let res = proc.handle("Tom &amp; Jerry &lt;3");
        assert_eq!(res, "Tom & Jerry <3");
    }

    // --- `Proc::new`, building procedures from raw arguments ---

    #[test]
    fn parse_replace_proc_from_args() {
        // Replace via the factory/parse method, not the enum constructor
        let proc = Proc::new("replace", &["%20".into(), "+".into()])
            .expect("should build `Proc::Replace` proc");
        let result = proc.handle("search/?q=mob%20100");
        assert_eq!(result, "search/?q=mob+100");
    }

    #[test]
    fn unknown_proc_is_rejected() {
        let res = Proc::new("does_not_exist", &[]);
        assert!(matches!(
            res,
            Err(PipelineError::ProcDoesNotExist(name)) if name == "does_not_exist"
        ));
    }

    #[test]
    fn missing_arguments_are_rejected() {
        // `replace` needs two arguments; the error reports supplied and required counts.
        let res = Proc::new("replace", &["%20".into()]);
        assert!(matches!(
            res,
            Err(PipelineError::ProcNotEnoughArguments(name, 1, 2)) if name == "replace"
        ));
    }

    // --- `Pipeline`, chaining procedures ---

    #[test]
    fn pipeline_applies_procs_in_order_and_skips_empty_entries() {
        let raw: Vec<Vec<String>> = vec![
            vec!["replace".into(), "%20".into(), " ".into()],
            vec![],
            vec!["normalize_spaces".into()],
        ];
        // `match` instead of `expect`: nothing here shows that `ParseError` is `Debug`.
        let pipeline = match Pipeline::new(&raw) {
            Ok(pipeline) => pipeline,
            Err(_) => panic!("pipeline should be built from valid procedures"),
        };
        assert_eq!(pipeline.handle(" mob%20%20100 ".to_string()), "mob 100");
    }

    #[test]
    fn pipeline_rejects_unknown_proc() {
        let raw: Vec<Vec<String>> = vec![vec!["does_not_exist".into()]];
        assert!(Pipeline::new(&raw).is_err());
    }
}