1use regex::Regex;
2
3use crate::errors::PipelineError;
4
5use super::errors::ParseError;
6
// Processor names as they appear as the first element of a raw pipeline
// entry. `Proc::new` matches on these strings to pick the variant to build.

/// Selects `Proc::Regex`; at least one argument (the pattern) is required.
const REGEX_PROC: &str = "regex";
/// Selects `Proc::RegexFind`; at least one argument (the pattern) is required.
const REGEX_FIND_PROC: &str = "regex_find";
/// Selects `Proc::Replace`; at least two arguments (old, new) are required.
const REPLACE_PROC: &str = "replace";
/// Selects `Proc::ExtractJson`; at least one argument (the path) is required.
const EXTRACT_JSON: &str = "extract_json";
/// Selects `Proc::TrimSpace`; no arguments are checked.
const TRIM_SPACE: &str = "trim_space";
/// Selects `Proc::Trim`; at least one argument (the characters to strip) is required.
const TRIM: &str = "trim";
/// Selects `Proc::NormalizeSpaces`; no arguments are checked.
const NORMALIZE_SPACES: &str = "normalize_spaces";
/// Selects `Proc::HtmlUnescape`; no arguments are checked.
const HTML_UNESCAPE: &str = "html_unescape";
16
/// An ordered chain of [`Proc`] steps applied to a string value.
///
/// Built with [`Pipeline::new`] and run with [`Pipeline::handle`], which feeds
/// the output of each step into the next one.
#[derive(Debug)]
pub struct Pipeline {
    /// Steps in the order in which they are applied.
    procs: Vec<Proc>,
}
22
23impl Pipeline {
24 pub fn new(raw_pipelines: &Vec<Vec<String>>) -> Result<Pipeline, ParseError> {
34 let mut procs = vec![];
35 for proc_args in raw_pipelines {
36 if let Some((proc_name, args)) = proc_args.split_first() {
37 let proc = Proc::new(proc_name, args)?;
38 procs.push(proc);
39 }
40 }
41 Ok(Pipeline { procs })
42 }
43
44 pub fn handle(&self, value: String) -> String {
54 let mut res: String = value;
55 for command in self.procs.iter() {
56 res = command.handle(&res)
57 }
58 res
59 }
60}
61
/// A single string transformation step of a [`Pipeline`].
#[derive(Debug)]
pub enum Proc {
    /// Matches the regex once and concatenates the text of every capture group
    /// that participated (group 0, the whole match, is excluded). Produces an
    /// empty string when the regex does not match.
    Regex(Regex),
    /// Returns the text of the first whole match of the regex, or an empty
    /// string when there is no match.
    RegexFind(Regex),
    /// Replaces every occurrence of the first string with the second one.
    Replace(Box<str>, Box<str>),
    /// Looks the stored path up in the input with `gjson::get` and returns the
    /// result rendered as a string (path syntax and rendering are defined by
    /// the `gjson` crate).
    ExtractJson(Box<str>),
    /// Strips leading and trailing whitespace.
    TrimSpace,
    /// Strips leading and trailing characters that belong to the given set.
    Trim(Vec<char>),
    /// Collapses every run of whitespace into a single space and drops leading
    /// and trailing whitespace.
    NormalizeSpaces,
    /// Decodes HTML entities using `html_escape::decode_html_entities`.
    HtmlUnescape,
}
86
87impl Proc {
88 fn new<'b>(proc_name: &'b str, args: &'b [String]) -> Result<Self, PipelineError> {
101 let proc_opt = match proc_name {
102 REGEX_PROC => {
103 validate_args_len(proc_name, args.len(), 1)?;
104 Proc::Regex(Regex::new(&args[0])?)
105 }
106 REGEX_FIND_PROC => {
107 validate_args_len(proc_name, args.len(), 1)?;
108 Proc::RegexFind(Regex::new(&args[0])?)
109 }
110 EXTRACT_JSON => {
111 validate_args_len(proc_name, args.len(), 1)?;
112 Proc::ExtractJson(args[0].clone().into())
113 }
114 REPLACE_PROC => {
115 validate_args_len(proc_name, args.len(), 2)?;
116 Proc::Replace(args[0].clone().into(), args[1].clone().into())
117 }
118 TRIM_SPACE => Proc::TrimSpace,
119 TRIM => {
120 validate_args_len(proc_name, args.len(), 1)?;
121 let cut_set: Vec<char> = args[0].chars().collect();
122 Proc::Trim(cut_set)
123 }
124 NORMALIZE_SPACES => Proc::NormalizeSpaces,
125 HTML_UNESCAPE => Proc::HtmlUnescape,
126 _ => return Err(PipelineError::ProcDoesNotExist(proc_name.to_string())),
127 };
128 Ok(proc_opt)
129 }
130
131 fn handle(&self, value: &str) -> String {
141 match self {
142 Proc::Regex(re) => re_extract_matches(re, value),
143 Proc::RegexFind(re) => re
144 .find(value)
145 .map(|m| m.as_str())
146 .unwrap_or_default()
147 .to_string(),
148 Proc::Replace(old, new) => value.replace(old.as_ref(), new),
149 Proc::ExtractJson(path) => gjson::get(value, path).to_string(),
150 Proc::TrimSpace => value.trim().to_string(),
151 Proc::Trim(pat) => value.trim_matches(pat.as_slice()).to_string(),
152 Proc::NormalizeSpaces => normalize_spaces(value),
153 Proc::HtmlUnescape => html_escape::decode_html_entities(value).to_string(),
154 }
155 }
156}
157
158fn validate_args_len(proc_name: &str, args_len: usize, len: usize) -> Result<(), PipelineError> {
159 if args_len < len {
160 return Err(PipelineError::ProcNotEnoughArguments(
161 proc_name.to_string(),
162 args_len,
163 len,
164 ));
165 }
166 Ok(())
167}
168
169fn re_extract_matches(re: &Regex, haystack: &str) -> String {
170 let cap_groups = re.captures_len();
171 match re.captures(haystack) {
172 Some(m) => (1..cap_groups)
173 .filter_map(|i| m.get(i))
174 .map(|cap| cap.as_str())
175 .collect(),
176 None => "".to_string(),
177 }
178}
179
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
fn normalize_spaces(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word precedes this one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// URL used as input by the regex-based tests.
    const URL: &str = "http://www.example.com/p1/?q=2";

    #[test]
    fn regex_proc_matching_group() {
        // The scheme sits in a non-capturing group, so only `domain` is returned.
        let pattern = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::Regex(pattern).handle(URL);
        assert_eq!(output, "www.example.com");
    }

    #[test]
    fn regex_proc_only_capture_groups() {
        // Both groups capture, so their texts are concatenated; the trailing
        // `/` is outside any group and is dropped.
        let pattern = Regex::new(r"(https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::Regex(pattern).handle(URL);
        assert_eq!(output, "http://www.example.com");
    }

    #[test]
    fn regex_find_proc() {
        // `RegexFind` returns the whole match regardless of grouping.
        let pattern = Regex::new(r"(?:https?://)(?<domain>[a-zA-Z0-9.-]+)/").unwrap();
        let output = Proc::RegexFind(pattern).handle(URL);
        assert_eq!(output, "http://www.example.com/");
    }

    #[test]
    fn extract_json() {
        let output = Proc::ExtractJson("a.b.c".into()).handle(r#"{"a":{"b":{"c":"d"}}}"#);
        assert_eq!(output, "d");
    }

    #[test]
    fn trim() {
        let cut_set = vec![' ', '-', '='];
        let output = Proc::Trim(cut_set).handle(" -=1=- ");
        assert_eq!(output, "1");
    }

    #[test]
    fn replace() {
        let output = Proc::Replace("%20".into(), "+".into()).handle("search/?q=mob%20100");
        assert_eq!(output, "search/?q=mob+100");
    }

    #[test]
    fn normalize_spaces() {
        let output =
            Proc::NormalizeSpaces.handle("<div>\n Some\t</span>green</span> text\n</div>\n");
        assert_eq!(output, "<div> Some </span>green</span> text </div>");
    }

    #[test]
    fn parse_replace_proc_from_args() {
        // Goes through `Proc::new` to cover name dispatch and argument parsing.
        let raw_args: [String; 2] = ["%20".into(), "+".into()];
        let built = Proc::new("replace", &raw_args).expect("should build `Proc::Replace` proc");
        let output = built.handle("search/?q=mob%20100");
        assert_eq!(output, "search/?q=mob+100");
    }
}