use regex::Regex;
use crate::errors::PipelineError;
use super::errors::ParseError;
/// Names of the processing steps a pipeline can be assembled from.
///
/// Each variant is produced by this type's `FromStr` implementation from the
/// snake_case spelling noted on the variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcName {
/// Parsed from `"regex"`.
Regex,
/// Parsed from `"regex_find"`.
RegexFind,
/// Parsed from `"replace"`.
Replace,
/// Parsed from `"extract_json"`.
ExtractJson,
/// Parsed from `"trim_space"`.
TrimSpace,
/// Parsed from `"trim"`.
Trim,
/// Parsed from `"normalize_spaces"`.
NormalizeSpaces,
/// Parsed from `"html_unescape"`.
HtmlUnescape,
}
impl std::str::FromStr for ProcName {
    type Err = PipelineError;

    /// Maps the textual name of a processing step to its [`ProcName`] variant.
    ///
    /// The comparison is exact (case-sensitive, no trimming).
    ///
    /// # Errors
    ///
    /// Returns [`PipelineError::ProcDoesNotExist`] carrying the offending name
    /// when `s` is not one of the known step names.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let name = match s {
            "regex" => Self::Regex,
            "regex_find" => Self::RegexFind,
            "replace" => Self::Replace,
            "extract_json" => Self::ExtractJson,
            "trim_space" => Self::TrimSpace,
            "trim" => Self::Trim,
            "normalize_spaces" => Self::NormalizeSpaces,
            "html_unescape" => Self::HtmlUnescape,
            unknown => return Err(PipelineError::ProcDoesNotExist(unknown.to_string())),
        };
        Ok(name)
    }
}
/// A sequence of processing steps that is applied to a string value.
#[derive(Debug)]
pub struct Pipeline {
/// Steps in the order in which `Pipeline::handle` runs them.
procs: Vec<Proc>,
}
impl Pipeline {
    /// Builds a pipeline from raw step definitions.
    ///
    /// Every inner list is one step: its first element is the step name and
    /// the remaining elements are that step's arguments. Empty inner lists
    /// are skipped.
    ///
    /// # Errors
    ///
    /// Stops at the first step that `Proc::new` rejects and returns that
    /// error converted into a [`ParseError`].
    pub fn new(raw_pipelines: &[Vec<String>]) -> Result<Pipeline, ParseError> {
        let procs = raw_pipelines
            .iter()
            // `split_first` yields `None` for an empty definition, which drops it.
            .filter_map(|definition| definition.split_first())
            .map(|(name, arguments)| Proc::new(name, arguments))
            .collect::<Result<Vec<_>, _>>()?;
        Ok(Pipeline { procs })
    }

    /// Runs `value` through every step in order, feeding each step the output
    /// of the previous one, and returns the final result.
    pub fn handle(&self, value: String) -> String {
        self.procs
            .iter()
            .fold(value, |current, step| step.handle(&current))
    }
}
/// A ready-to-run processing step together with its parsed arguments.
///
/// The behaviour noted on each variant is what `Proc::handle` does for it.
#[derive(Debug)]
pub enum Proc {
/// Concatenates the text of the capture groups (group 0 excluded) of the
/// first match; yields an empty string when nothing matches.
Regex(Regex),
/// Yields the text of the first whole match, or an empty string when
/// nothing matches.
RegexFind(Regex),
/// Replaces occurrences of the first string with the second one.
Replace(Box<str>, Box<str>),
/// Looks up the stored path in the input via `gjson::get` and renders the
/// result as a string.
ExtractJson(Box<str>),
/// Trims the input with `str::trim`.
TrimSpace,
/// Strips any of the stored characters from both ends of the input.
Trim(Vec<char>),
/// Collapses every whitespace run into a single space and drops leading
/// and trailing whitespace.
NormalizeSpaces,
/// Decodes HTML entities with `html_escape::decode_html_entities`.
HtmlUnescape,
}
impl Proc {
    /// Builds a processing step from its textual name and its arguments.
    ///
    /// `proc_name` is parsed into a [`ProcName`]. `args` must contain exactly
    /// as many entries as the step takes:
    ///
    /// * `replace` — two (text to search for, replacement),
    /// * `regex`, `regex_find`, `extract_json`, `trim` — one,
    /// * `trim_space`, `normalize_spaces`, `html_unescape` — none.
    ///
    /// # Errors
    ///
    /// Returns a [`PipelineError`] when the name is unknown, when the number
    /// of arguments does not match the step's arity, or when a regex pattern
    /// fails to compile.
    fn new(proc_name: &str, args: &[String]) -> Result<Self, PipelineError> {
        let name: ProcName = proc_name.parse()?;

        // The arity is decided in one exhaustive match so that no step can
        // silently accept stray arguments (previously `trim_space` did).
        let expected_args = match name {
            ProcName::Replace => 2,
            ProcName::Regex | ProcName::RegexFind | ProcName::ExtractJson | ProcName::Trim => 1,
            ProcName::TrimSpace | ProcName::NormalizeSpaces | ProcName::HtmlUnescape => 0,
        };
        validate_args_len(proc_name, args.len(), expected_args)?;

        // Indexing below is in bounds: the length was validated just above.
        let proc = match name {
            ProcName::Regex => Proc::Regex(Regex::new(&args[0])?),
            ProcName::RegexFind => Proc::RegexFind(Regex::new(&args[0])?),
            ProcName::ExtractJson => Proc::ExtractJson(args[0].as_str().into()),
            ProcName::Replace => Proc::Replace(args[0].as_str().into(), args[1].as_str().into()),
            ProcName::TrimSpace => Proc::TrimSpace,
            ProcName::Trim => Proc::Trim(args[0].chars().collect()),
            ProcName::NormalizeSpaces => Proc::NormalizeSpaces,
            ProcName::HtmlUnescape => Proc::HtmlUnescape,
        };
        Ok(proc)
    }

    /// Applies this step to `value` and returns the transformed text.
    ///
    /// Steps that look something up (`Regex`, `RegexFind`) return an empty
    /// string when nothing matches.
    fn handle(&self, value: &str) -> String {
        match self {
            Proc::Regex(re) => re_extract_matches(re, value),
            Proc::RegexFind(re) => re
                .find(value)
                .map(|m| m.as_str())
                .unwrap_or_default()
                .to_string(),
            Proc::Replace(old, new) => value.replace(old.as_ref(), new),
            Proc::ExtractJson(path) => gjson::get(value, path).to_string(),
            Proc::TrimSpace => value.trim().to_string(),
            // A slice of chars is a pattern matching any one of those chars.
            Proc::Trim(pat) => value.trim_matches(pat.as_slice()).to_string(),
            Proc::NormalizeSpaces => normalize_spaces(value),
            Proc::HtmlUnescape => html_escape::decode_html_entities(value).to_string(),
        }
    }
}
/// Checks that a step received exactly the number of arguments it takes.
///
/// # Errors
///
/// Returns [`PipelineError::ProcWrongNumberArguments`] carrying the step
/// name, the expected count and the actual count (in that order) when the
/// two counts differ.
fn validate_args_len(proc_name: &str, actual: usize, expected: usize) -> Result<(), PipelineError> {
    if actual == expected {
        Ok(())
    } else {
        Err(PipelineError::ProcWrongNumberArguments(
            proc_name.to_string(),
            expected,
            actual,
        ))
    }
}
/// Concatenates the text captured by every group of the first match of `re`
/// in `haystack`.
///
/// Group 0 (the whole match) is left out, and groups that did not take part
/// in the match contribute nothing. Returns an empty string when `re` does
/// not match at all.
fn re_extract_matches(re: &Regex, haystack: &str) -> String {
    let mut joined = String::new();
    if let Some(caps) = re.captures(haystack) {
        // `skip(1)` drops the implicit whole-match group; `flatten` drops
        // groups that did not participate in the match.
        for group in caps.iter().skip(1).flatten() {
            joined.push_str(group.as_str());
        }
    }
    joined
}
/// Collapses every run of whitespace in `text` into a single space and
/// removes leading and trailing whitespace.
fn normalize_spaces(text: &str) -> String {
    let words: Vec<&str> = text.split_whitespace().collect();
    words.join(" ")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// URL fed to every regex-based step below.
    const SAMPLE_URL: &str = "http://www.example.com/p1/?q=2";

    /// A non-capturing scheme group leaves only the named group in the output.
    #[test]
    fn regex_proc_matching_group() {
        let pattern = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::Regex(pattern).handle(SAMPLE_URL);
        assert_eq!(output, "www.example.com");
    }

    /// All capture groups are concatenated; text outside the groups is dropped.
    #[test]
    fn regex_proc_only_capture_groups() {
        let pattern = Regex::new(r"(https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::Regex(pattern).handle(SAMPLE_URL);
        assert_eq!(output, "http://www.example.com");
    }

    /// `RegexFind` returns the whole match regardless of grouping.
    #[test]
    fn regex_find_proc() {
        let pattern = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::RegexFind(pattern).handle(SAMPLE_URL);
        assert_eq!(output, "http://www.example.com/");
    }

    #[test]
    fn extract_json() {
        let step = Proc::ExtractJson("a.b.c".into());
        assert_eq!(step.handle(r#"{"a":{"b":{"c":"d"}}}"#), "d");
    }

    #[test]
    fn trim() {
        let step = Proc::Trim(vec![' ', '-', '=']);
        assert_eq!(step.handle(" -=1=- "), "1");
    }

    #[test]
    fn replace() {
        let step = Proc::Replace("%20".into(), "+".into());
        assert_eq!(step.handle("search/?q=mob%20100"), "search/?q=mob+100");
    }

    #[test]
    fn normalize_spaces() {
        let output =
            Proc::NormalizeSpaces.handle("<div>\n Some\t</span>green</span> text\n</div>\n");
        assert_eq!(output, "<div> Some </span>green</span> text </div>");
    }

    /// The constructor path yields a working `Replace` step.
    #[test]
    fn parse_replace_proc_from_args() {
        let arguments = ["%20".to_string(), "+".to_string()];
        let step =
            Proc::new("replace", &arguments).expect("should build `Proc::Replace` proc");
        assert_eq!(step.handle("search/?q=mob%20100"), "search/?q=mob+100");
    }
}