// dom_finder 0.6.0
//
// HTML parsing with CSS selectors.
// Documentation
use regex::Regex;

use crate::errors::PipelineError;

use super::errors::ParseError;

/// Names of the processing procedures that a pipeline can contain.
///
/// Each variant is the parsed form of a procedure name string; the `FromStr`
/// implementation for this type maps the snake_case name (for example
/// `"regex_find"`) to the matching variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcName {
    /// Parsed from `"regex"`.
    Regex,
    /// Parsed from `"regex_find"`.
    RegexFind,
    /// Parsed from `"replace"`.
    Replace,
    /// Parsed from `"extract_json"`.
    ExtractJson,
    /// Parsed from `"trim_space"`.
    TrimSpace,
    /// Parsed from `"trim"`.
    Trim,
    /// Parsed from `"normalize_spaces"`.
    NormalizeSpaces,
    /// Parsed from `"html_unescape"`.
    HtmlUnescape,
}

impl std::str::FromStr for ProcName {
    type Err = PipelineError;

    /// Parses a procedure name such as `"regex"` or `"trim_space"`.
    ///
    /// # Errors
    ///
    /// Returns `PipelineError::ProcDoesNotExist` carrying the given string when
    /// it is not one of the known procedure names. Matching is exact and
    /// case-sensitive.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let name = match s {
            "regex" => Self::Regex,
            "regex_find" => Self::RegexFind,
            "replace" => Self::Replace,
            "extract_json" => Self::ExtractJson,
            "trim_space" => Self::TrimSpace,
            "trim" => Self::Trim,
            "normalize_spaces" => Self::NormalizeSpaces,
            "html_unescape" => Self::HtmlUnescape,
            unknown => return Err(PipelineError::ProcDoesNotExist(unknown.to_string())),
        };
        Ok(name)
    }
}

/// An ordered sequence of processing procedures.
///
/// Built with `Pipeline::new`; `Pipeline::handle` runs a value through every
/// procedure in order, feeding each result into the next procedure.
#[derive(Debug)]
pub struct Pipeline {
    /// Procedures in the order they are applied.
    procs: Vec<Proc>,
}

impl Pipeline {
    /// Builds a `Pipeline` from raw procedure descriptions.
    ///
    /// # Arguments
    ///
    /// * `raw_pipelines` - A slice of raw entries. In each entry the first element is
    ///   the procedure name and the remaining elements are that procedure's arguments.
    ///   Entries that contain no elements are skipped.
    ///
    /// # Errors
    ///
    /// Stops at the first entry that cannot be turned into a procedure and returns
    /// the construction error converted into a `ParseError`.
    pub fn new(raw_pipelines: &[Vec<String>]) -> Result<Pipeline, ParseError> {
        let procs = raw_pipelines
            .iter()
            // `split_first` yields `None` for an empty entry, which drops it.
            .filter_map(|raw| raw.split_first())
            .map(|(name, args)| Proc::new(name, args))
            .collect::<Result<Vec<_>, _>>()?;
        Ok(Pipeline { procs })
    }

    /// Runs `value` through every procedure of the pipeline, in order.
    ///
    /// # Arguments
    ///
    /// * `value` - The input string; it is consumed and used as the initial state.
    ///
    /// # Returns
    ///
    /// The output of the last procedure, or `value` itself when the pipeline is empty.
    pub fn handle(&self, value: String) -> String {
        self.procs
            .iter()
            .fold(value, |current, proc| proc.handle(&current))
    }
}

/// A single, ready-to-run procedure of a pipeline.
///
/// Each variant carries the already validated (and, for regexes, compiled)
/// arguments it needs to process a string.
#[derive(Debug)]
pub enum Proc {
    /// Applies `Regex::captures` to the value and returns the concatenation of
    /// all capture groups (group 0, the whole match, is excluded) that
    /// participated in the first match. Returns an empty string when nothing matches.
    /// If you need the full match, use `RegexFind` instead.
    /// Requires one argument - the regular expression.
    Regex(Regex),
    /// Applies `Regex::find` to the value (haystack) and returns the first
    /// entire match, or an empty string when nothing matches.
    /// Requires one argument - the regular expression.
    RegexFind(Regex),
    /// Replaces every occurrence of the old string with the new string.
    /// Requires two arguments - the old and the new string, in that order.
    Replace(Box<str>, Box<str>),
    /// Treats the value as JSON and extracts the element at the given path
    /// (looked up with `gjson::get`), rendered as a string.
    /// Requires one argument - the path to the JSON value.
    ExtractJson(Box<str>),
    /// Trims whitespace at the start and the end of the string.
    /// Requires no arguments.
    TrimSpace,
    /// Trims, from both the start and the end of the string, every character
    /// that belongs to the cut set.
    /// Requires one argument - the cut set, given as a string of characters.
    Trim(Vec<char>),
    /// Collapses every run of whitespace (including tabs and new lines) into a
    /// single space and removes leading and trailing whitespace.
    /// Requires no arguments.
    NormalizeSpaces,
    /// Decodes HTML entities in the string.
    /// Requires no arguments.
    HtmlUnescape,
}

impl Proc {
    /// Creates a new `Proc` instance based on the provided `proc_args`.
    ///
    /// # Arguments
    ///
    /// * `proc_args` - A slice of strings representing the arguments for the `Proc`.
    ///
    /// # Returns
    ///
    /// Returns a new `Result<Option<Proc>, ParseError>` instance. Because:
    /// * regex can fail to compile
    /// * user can provide an invalid procedure
    /// * user can provide an invalid number of arguments for a procedures
    fn new<'b>(proc_name: &'b str, args: &'b [String]) -> Result<Self, PipelineError> {
        let proc_enum: ProcName = proc_name.parse()?;

        let proc_opt = match proc_enum {
            ProcName::Regex => {
                validate_args_len(proc_name, args.len(), 1)?;
                Proc::Regex(Regex::new(&args[0])?)
            }
            ProcName::RegexFind => {
                validate_args_len(proc_name, args.len(), 1)?;
                Proc::RegexFind(Regex::new(&args[0])?)
            }
            ProcName::ExtractJson => {
                validate_args_len(proc_name, args.len(), 1)?;
                Proc::ExtractJson(args[0].clone().into())
            }
            ProcName::Replace => {
                validate_args_len(proc_name, args.len(), 2)?;
                Proc::Replace(args[0].clone().into(), args[1].clone().into())
            }
            ProcName::TrimSpace => Proc::TrimSpace,
            ProcName::Trim => {
                validate_args_len(proc_name, args.len(), 1)?;
                let cut_set: Vec<char> = args[0].chars().collect();
                Proc::Trim(cut_set)
            }
            ProcName::NormalizeSpaces => {
                validate_args_len(proc_name, args.len(), 0)?;
                Proc::NormalizeSpaces
            }
            ProcName::HtmlUnescape => {
                validate_args_len(proc_name, args.len(), 0)?;
                Proc::HtmlUnescape
            }
        };
        Ok(proc_opt)
    }

    /// Handles the given value by applying the processing procedure.
    ///
    /// # Arguments
    ///
    /// * `value` - The input value to be processed.
    ///
    /// # Returns
    ///
    /// Returns the processed value as a string.
    fn handle(&self, value: &str) -> String {
        match self {
            Proc::Regex(re) => re_extract_matches(re, value),
            Proc::RegexFind(re) => re
                .find(value)
                .map(|m| m.as_str())
                .unwrap_or_default()
                .to_string(),
            Proc::Replace(old, new) => value.replace(old.as_ref(), new),
            Proc::ExtractJson(path) => gjson::get(value, path).to_string(),
            Proc::TrimSpace => value.trim().to_string(),
            Proc::Trim(pat) => value.trim_matches(pat.as_slice()).to_string(),
            Proc::NormalizeSpaces => normalize_spaces(value),
            Proc::HtmlUnescape => html_escape::decode_html_entities(value).to_string(),
        }
    }
}

/// Checks that a procedure received exactly the number of arguments it requires.
///
/// # Errors
///
/// Returns `PipelineError::ProcWrongNumberArguments` holding the procedure name,
/// the expected count and the actual count (in that order) when they differ.
fn validate_args_len(proc_name: &str, actual: usize, expected: usize) -> Result<(), PipelineError> {
    if actual == expected {
        Ok(())
    } else {
        Err(PipelineError::ProcWrongNumberArguments(
            proc_name.to_string(),
            expected,
            actual,
        ))
    }
}

/// Concatenates the capture groups of the first match of `re` in `haystack`.
///
/// Group 0 (the whole match) is skipped, and groups that did not participate in
/// the match contribute nothing. Returns an empty string when `re` does not match.
fn re_extract_matches(re: &Regex, haystack: &str) -> String {
    re.captures(haystack)
        .map(|caps| {
            caps.iter()
                // The first item is always the overall match; only sub-groups are wanted.
                .skip(1)
                .flatten()
                .map(|group| group.as_str())
                .collect::<String>()
        })
        .unwrap_or_default()
}

/// Collapses every run of whitespace in `text` into a single space.
///
/// Leading and trailing whitespace is dropped, so the result never starts or
/// ends with a space. An empty or whitespace-only input yields an empty string.
fn normalize_spaces(text: &str) -> String {
    // The result can never be longer than the input.
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}

#[cfg(test)]
mod tests {
    use super::*;

    /// URL used as the haystack by the regex-based tests.
    const SAMPLE_URL: &str = "http://www.example.com/p1/?q=2";

    // A non-capturing scheme group must not show up in the output.
    #[test]
    fn regex_proc_matching_group() {
        let pattern = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::Regex(pattern).handle(SAMPLE_URL);
        assert_eq!(output, "www.example.com");
    }

    // Both capture groups are concatenated; the trailing slash is outside any group.
    #[test]
    fn regex_proc_only_capture_groups() {
        let pattern = Regex::new(r"(https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::Regex(pattern).handle(SAMPLE_URL);
        assert_eq!(output, "http://www.example.com");
    }

    // `RegexFind` returns the entire match regardless of grouping.
    #[test]
    fn regex_find_proc() {
        let pattern = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::RegexFind(pattern).handle(SAMPLE_URL);
        assert_eq!(output, "http://www.example.com/");
    }

    #[test]
    fn extract_json() {
        let step = Proc::ExtractJson("a.b.c".into());
        let output = step.handle(r#"{"a":{"b":{"c":"d"}}}"#);
        assert_eq!(output, "d");
    }

    #[test]
    fn trim() {
        let cut_set = vec![' ', '-', '='];
        let output = Proc::Trim(cut_set).handle(" -=1=- ");
        assert_eq!(output, "1");
    }

    #[test]
    fn replace() {
        let step = Proc::Replace("%20".into(), "+".into());
        let output = step.handle("search/?q=mob%20100");
        assert_eq!(output, "search/?q=mob+100");
    }

    #[test]
    fn normalize_spaces() {
        let html = "<div>\n    Some\t</span>green</span>  text\n</div>\n";
        let output = Proc::NormalizeSpaces.handle(html);
        assert_eq!(output, "<div> Some </span>green</span> text </div>");
    }

    // Builds the procedure through `Proc::new` rather than the enum constructor.
    #[test]
    fn parse_replace_proc_from_args() {
        let args: [String; 2] = ["%20".into(), "+".into()];
        let step = Proc::new("replace", &args).expect("should build `Proc::Replace` proc");
        let output = step.handle("search/?q=mob%20100");
        assert_eq!(output, "search/?q=mob+100");
    }
}