Skip to main content

jpx_core/extensions/
multi_match.rs

1//! Multi-pattern matching functions.
2
3use std::collections::HashSet;
4
5use aho_corasick::AhoCorasick;
6use serde_json::{Number, Value};
7
8use crate::functions::Function;
9use crate::interpreter::SearchResult;
10use crate::registry::register_if_enabled;
11use crate::{Context, Runtime, arg, defn};
12
13// match_any(string, patterns) -> boolean
14// Returns true if any of the patterns match the string
15defn!(MatchAnyFn, vec![arg!(string), arg!(array)], None);
16
17impl Function for MatchAnyFn {
18    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
19        self.signature.validate(args, ctx)?;
20
21        let text = args[0].as_str().unwrap();
22        let patterns_arr = args[1].as_array().unwrap();
23
24        let patterns: Vec<&str> = patterns_arr.iter().filter_map(|p| p.as_str()).collect();
25
26        if patterns.is_empty() {
27            return Ok(Value::Bool(false));
28        }
29
30        let ac = AhoCorasick::new(&patterns).unwrap();
31        let has_match = ac.find(text).is_some();
32
33        Ok(Value::Bool(has_match))
34    }
35}
36
37// match_all(string, patterns) -> boolean
38// Returns true if all patterns match the string
39defn!(MatchAllFn, vec![arg!(string), arg!(array)], None);
40
41impl Function for MatchAllFn {
42    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
43        self.signature.validate(args, ctx)?;
44
45        let text = args[0].as_str().unwrap();
46        let patterns_arr = args[1].as_array().unwrap();
47
48        let patterns: Vec<&str> = patterns_arr.iter().filter_map(|p| p.as_str()).collect();
49
50        if patterns.is_empty() {
51            return Ok(Value::Bool(true));
52        }
53
54        let ac = AhoCorasick::new(&patterns).unwrap();
55
56        let mut found = vec![false; patterns.len()];
57
58        for mat in ac.find_iter(text) {
59            found[mat.pattern().as_usize()] = true;
60        }
61
62        let all_found = found.iter().all(|&f| f);
63        Ok(Value::Bool(all_found))
64    }
65}
66
67// match_which(string, patterns) -> array
68// Returns array of patterns that matched
69defn!(MatchWhichFn, vec![arg!(string), arg!(array)], None);
70
71impl Function for MatchWhichFn {
72    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
73        self.signature.validate(args, ctx)?;
74
75        let text = args[0].as_str().unwrap();
76        let patterns_arr = args[1].as_array().unwrap();
77
78        let patterns: Vec<&str> = patterns_arr.iter().filter_map(|p| p.as_str()).collect();
79
80        if patterns.is_empty() {
81            return Ok(Value::Array(vec![]));
82        }
83
84        let ac = AhoCorasick::new(&patterns).unwrap();
85
86        let mut found = vec![false; patterns.len()];
87
88        for mat in ac.find_iter(text) {
89            found[mat.pattern().as_usize()] = true;
90        }
91
92        let matched: Vec<Value> = patterns
93            .iter()
94            .enumerate()
95            .filter(|(i, _)| found[*i])
96            .map(|(_, p)| Value::String((*p).to_string()))
97            .collect();
98
99        Ok(Value::Array(matched))
100    }
101}
102
103// match_count(string, patterns) -> number
104// Count total number of matches (non-overlapping) across all patterns
105defn!(MatchCountFn, vec![arg!(string), arg!(array)], None);
106
107impl Function for MatchCountFn {
108    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
109        self.signature.validate(args, ctx)?;
110
111        let text = args[0].as_str().unwrap();
112        let patterns_arr = args[1].as_array().unwrap();
113
114        let patterns: Vec<&str> = patterns_arr.iter().filter_map(|p| p.as_str()).collect();
115
116        if patterns.is_empty() {
117            return Ok(Value::Number(Number::from(0)));
118        }
119
120        let ac = AhoCorasick::new(&patterns).unwrap();
121        let count = ac.find_iter(text).count();
122
123        Ok(Value::Number(Number::from(count)))
124    }
125}
126
127// replace_many(string, replacements) -> string
128// Replace multiple patterns at once. replacements is an object {pattern: replacement, ...}
129defn!(ReplaceManyFn, vec![arg!(string), arg!(object)], None);
130
131impl Function for ReplaceManyFn {
132    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
133        self.signature.validate(args, ctx)?;
134
135        let text = args[0].as_str().unwrap();
136        let replacements_obj = args[1].as_object().unwrap();
137
138        if replacements_obj.is_empty() {
139            return Ok(Value::String(text.to_string()));
140        }
141
142        let mut patterns: Vec<&str> = Vec::new();
143        let mut replacements: Vec<String> = Vec::new();
144
145        for (pattern, replacement) in replacements_obj.iter() {
146            patterns.push(pattern);
147            if let Some(s) = replacement.as_str() {
148                replacements.push(s.to_string());
149            } else {
150                replacements.push(replacement.to_string());
151            }
152        }
153
154        let ac = AhoCorasick::new(&patterns).unwrap();
155        let result = ac.replace_all(text, &replacements);
156
157        Ok(Value::String(result))
158    }
159}
160
161// extract_all(string, patterns) -> array of matches with pattern info
162defn!(ExtractAllFn, vec![arg!(string), arg!(array)], None);
163
164impl Function for ExtractAllFn {
165    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
166        self.signature.validate(args, ctx)?;
167
168        let text = args[0].as_str().unwrap();
169        let patterns_arr = args[1].as_array().unwrap();
170
171        let patterns: Vec<&str> = patterns_arr.iter().filter_map(|p| p.as_str()).collect();
172
173        if patterns.is_empty() {
174            return Ok(Value::Array(vec![]));
175        }
176
177        let ac = AhoCorasick::new(&patterns).unwrap();
178        let mut results: Vec<Value> = Vec::new();
179
180        for mat in ac.find_iter(text) {
181            let mut obj = serde_json::Map::new();
182            obj.insert(
183                "pattern".to_string(),
184                Value::String(patterns[mat.pattern().as_usize()].to_string()),
185            );
186            obj.insert(
187                "match".to_string(),
188                Value::String(text[mat.start()..mat.end()].to_string()),
189            );
190            obj.insert(
191                "start".to_string(),
192                Value::Number(Number::from(mat.start())),
193            );
194            obj.insert("end".to_string(), Value::Number(Number::from(mat.end())));
195            results.push(Value::Object(obj));
196        }
197
198        Ok(Value::Array(results))
199    }
200}
201
202// match_positions(string, patterns) -> array of positions
203defn!(MatchPositionsFn, vec![arg!(string), arg!(array)], None);
204
205impl Function for MatchPositionsFn {
206    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
207        self.signature.validate(args, ctx)?;
208
209        let text = args[0].as_str().unwrap();
210        let patterns_arr = args[1].as_array().unwrap();
211
212        let patterns: Vec<&str> = patterns_arr.iter().filter_map(|p| p.as_str()).collect();
213
214        if patterns.is_empty() {
215            return Ok(Value::Array(vec![]));
216        }
217
218        let ac = AhoCorasick::new(&patterns).unwrap();
219        let mut results: Vec<Value> = Vec::new();
220
221        for mat in ac.find_iter(text) {
222            let mut obj = serde_json::Map::new();
223            obj.insert(
224                "pattern".to_string(),
225                Value::String(patterns[mat.pattern().as_usize()].to_string()),
226            );
227            obj.insert(
228                "start".to_string(),
229                Value::Number(Number::from(mat.start())),
230            );
231            obj.insert("end".to_string(), Value::Number(Number::from(mat.end())));
232            results.push(Value::Object(obj));
233        }
234
235        Ok(Value::Array(results))
236    }
237}
238
239// mm_tokenize(string, options?) -> array of tokens
240// Smart word tokenization with optional configuration
241defn!(MmTokenizeFn, vec![arg!(string)], Some(arg!(any)));
242
243impl Function for MmTokenizeFn {
244    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
245        self.signature.validate(args, ctx)?;
246
247        let text = args[0].as_str().unwrap();
248
249        let lowercase = args
250            .get(1)
251            .and_then(|v| v.as_object())
252            .and_then(|obj| obj.get("lowercase"))
253            .and_then(|v| v.as_bool())
254            .unwrap_or(false);
255
256        let min_length = args
257            .get(1)
258            .and_then(|v| v.as_object())
259            .and_then(|obj| obj.get("min_length"))
260            .and_then(|v| v.as_f64())
261            .map(|n| n as usize)
262            .unwrap_or(1);
263
264        let tokens: Vec<Value> = text
265            .split(|c: char| !c.is_alphanumeric())
266            .filter(|s| !s.is_empty() && s.len() >= min_length)
267            .map(|s| {
268                let token = if lowercase {
269                    s.to_lowercase()
270                } else {
271                    s.to_string()
272                };
273                Value::String(token)
274            })
275            .collect();
276
277        Ok(Value::Array(tokens))
278    }
279}
280
281// extract_between(string, start, end) -> string or null
282defn!(
283    ExtractBetweenFn,
284    vec![arg!(string), arg!(string), arg!(string)],
285    None
286);
287
288impl Function for ExtractBetweenFn {
289    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
290        self.signature.validate(args, ctx)?;
291
292        let text = args[0].as_str().unwrap();
293        let start_delim = args[1].as_str().unwrap();
294        let end_delim = args[2].as_str().unwrap();
295
296        if let Some(start_pos) = text.find(start_delim) {
297            let after_start = start_pos + start_delim.len();
298            if let Some(end_pos) = text[after_start..].find(end_delim) {
299                let extracted = &text[after_start..after_start + end_pos];
300                return Ok(Value::String(extracted.to_string()));
301            }
302        }
303
304        Ok(Value::Null)
305    }
306}
307
308// split_keep(string, delimiter) -> array keeping delimiters
309defn!(SplitKeepFn, vec![arg!(string), arg!(string)], None);
310
311impl Function for SplitKeepFn {
312    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
313        self.signature.validate(args, ctx)?;
314
315        let text = args[0].as_str().unwrap();
316        let delimiter = args[1].as_str().unwrap();
317
318        if delimiter.is_empty() {
319            return Ok(Value::Array(vec![Value::String(text.to_string())]));
320        }
321
322        let mut result: Vec<Value> = Vec::new();
323        let mut last_end = 0;
324
325        for (start, part) in text.match_indices(delimiter) {
326            if start > last_end {
327                result.push(Value::String(text[last_end..start].to_string()));
328            }
329            result.push(Value::String(part.to_string()));
330            last_end = start + part.len();
331        }
332
333        if last_end < text.len() {
334            result.push(Value::String(text[last_end..].to_string()));
335        }
336
337        Ok(Value::Array(result))
338    }
339}
340
341/// Register multi-match functions filtered by the enabled set.
342pub fn register_filtered(runtime: &mut Runtime, enabled: &HashSet<&str>) {
343    register_if_enabled(runtime, "match_any", enabled, Box::new(MatchAnyFn::new()));
344    register_if_enabled(runtime, "match_all", enabled, Box::new(MatchAllFn::new()));
345    register_if_enabled(
346        runtime,
347        "match_which",
348        enabled,
349        Box::new(MatchWhichFn::new()),
350    );
351    register_if_enabled(
352        runtime,
353        "match_count",
354        enabled,
355        Box::new(MatchCountFn::new()),
356    );
357    register_if_enabled(
358        runtime,
359        "replace_many",
360        enabled,
361        Box::new(ReplaceManyFn::new()),
362    );
363    register_if_enabled(
364        runtime,
365        "extract_all",
366        enabled,
367        Box::new(ExtractAllFn::new()),
368    );
369    register_if_enabled(
370        runtime,
371        "match_positions",
372        enabled,
373        Box::new(MatchPositionsFn::new()),
374    );
375    register_if_enabled(
376        runtime,
377        "mm_tokenize",
378        enabled,
379        Box::new(MmTokenizeFn::new()),
380    );
381    register_if_enabled(
382        runtime,
383        "extract_between",
384        enabled,
385        Box::new(ExtractBetweenFn::new()),
386    );
387    register_if_enabled(runtime, "split_keep", enabled, Box::new(SplitKeepFn::new()));
388}