// jpx_core/extensions/text.rs

1//! Text analysis functions.
2
3use std::collections::BTreeMap;
4use std::collections::HashSet;
5
6use serde_json::{Number, Value};
7
8use crate::functions::{Function, number_value};
9use crate::interpreter::SearchResult;
10use crate::registry::register_if_enabled;
11use crate::{Context, Runtime, arg, defn};
12
13// Average reading speed in words per minute
14const WORDS_PER_MINUTE: f64 = 200.0;
15
16// =============================================================================
17// word_count(s) -> number
18// =============================================================================
19
20defn!(WordCountFn, vec![arg!(string)], None);
21
22impl Function for WordCountFn {
23    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
24        self.signature.validate(args, ctx)?;
25        let s = args[0].as_str().unwrap();
26        let count = s.split_whitespace().count();
27        Ok(Value::Number(Number::from(count)))
28    }
29}
30
31// =============================================================================
32// char_count(s) -> number
33// =============================================================================
34
35defn!(CharCountFn, vec![arg!(string)], None);
36
37impl Function for CharCountFn {
38    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
39        self.signature.validate(args, ctx)?;
40        let s = args[0].as_str().unwrap();
41        let count = s.chars().filter(|c| !c.is_whitespace()).count();
42        Ok(Value::Number(Number::from(count)))
43    }
44}
45
46// =============================================================================
47// sentence_count(s) -> number
48// =============================================================================
49
50defn!(SentenceCountFn, vec![arg!(string)], None);
51
52impl Function for SentenceCountFn {
53    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
54        self.signature.validate(args, ctx)?;
55        let s = args[0].as_str().unwrap();
56
57        if s.trim().is_empty() {
58            return Ok(Value::Number(Number::from(0)));
59        }
60
61        // Count sentence-ending punctuation
62        let count = s
63            .chars()
64            .filter(|c| *c == '.' || *c == '!' || *c == '?')
65            .count();
66
67        // If no sentence-ending punctuation but has content, count as 1
68        let count = if count == 0 && !s.trim().is_empty() {
69            1
70        } else {
71            count
72        };
73
74        Ok(Value::Number(Number::from(count)))
75    }
76}
77
78// =============================================================================
79// paragraph_count(s) -> number
80// =============================================================================
81
82defn!(ParagraphCountFn, vec![arg!(string)], None);
83
84impl Function for ParagraphCountFn {
85    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
86        self.signature.validate(args, ctx)?;
87        let s = args[0].as_str().unwrap();
88
89        // Split by double newlines (paragraph separator)
90        let count = s.split("\n\n").filter(|p| !p.trim().is_empty()).count();
91
92        Ok(Value::Number(Number::from(count)))
93    }
94}
95
96// =============================================================================
97// reading_time(s) -> number (minutes)
98// =============================================================================
99
100defn!(ReadingTimeFn, vec![arg!(string)], None);
101
102impl Function for ReadingTimeFn {
103    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
104        self.signature.validate(args, ctx)?;
105        let s = args[0].as_str().unwrap();
106        let word_count = s.split_whitespace().count() as f64;
107        let minutes = (word_count / WORDS_PER_MINUTE).ceil();
108        Ok(number_value(minutes))
109    }
110}
111
112// =============================================================================
113// reading_time_seconds(s) -> number (seconds)
114// =============================================================================
115
116defn!(ReadingTimeSecondsFn, vec![arg!(string)], None);
117
118impl Function for ReadingTimeSecondsFn {
119    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
120        self.signature.validate(args, ctx)?;
121        let s = args[0].as_str().unwrap();
122        let word_count = s.split_whitespace().count() as f64;
123        let seconds = (word_count / WORDS_PER_MINUTE) * 60.0;
124        Ok(number_value(seconds.ceil()))
125    }
126}
127
128// =============================================================================
129// char_frequencies(s) -> object
130// =============================================================================
131
132defn!(CharFrequenciesFn, vec![arg!(string)], None);
133
134impl Function for CharFrequenciesFn {
135    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
136        self.signature.validate(args, ctx)?;
137        let s = args[0].as_str().unwrap();
138
139        let mut freq: BTreeMap<char, usize> = BTreeMap::new();
140        for c in s.chars() {
141            if !c.is_whitespace() {
142                *freq.entry(c).or_insert(0) += 1;
143            }
144        }
145
146        let obj: serde_json::Map<String, Value> = freq
147            .into_iter()
148            .map(|(k, v)| (k.to_string(), Value::Number(Number::from(v))))
149            .collect();
150
151        Ok(Value::Object(obj))
152    }
153}
154
155// =============================================================================
156// word_frequencies(s) -> object
157// =============================================================================
158
159defn!(WordFrequenciesFn, vec![arg!(string)], None);
160
161impl Function for WordFrequenciesFn {
162    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
163        self.signature.validate(args, ctx)?;
164        let s = args[0].as_str().unwrap();
165
166        let mut freq: BTreeMap<String, usize> = BTreeMap::new();
167        for word in s.split_whitespace() {
168            // Normalize: lowercase and remove punctuation
169            let normalized: String = word
170                .chars()
171                .filter(|c| c.is_alphanumeric())
172                .collect::<String>()
173                .to_lowercase();
174
175            if !normalized.is_empty() {
176                *freq.entry(normalized).or_insert(0) += 1;
177            }
178        }
179
180        let obj: serde_json::Map<String, Value> = freq
181            .into_iter()
182            .map(|(k, v)| (k, Value::Number(Number::from(v))))
183            .collect();
184
185        Ok(Value::Object(obj))
186    }
187}
188
189// =============================================================================
190// ngrams(s, n, type?) -> array
191// Generate n-grams from text. Type can be "word" (default) or "char".
192// =============================================================================
193
194defn!(
195    NgramsFn,
196    vec![arg!(string), arg!(number)],
197    Some(arg!(string))
198);
199
200impl Function for NgramsFn {
201    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
202        self.signature.validate(args, ctx)?;
203        let s = args[0].as_str().unwrap();
204        let n = args[1].as_f64().unwrap() as usize;
205
206        // Default to "word" if not specified
207        let ngram_type = if args.len() > 2 {
208            args[2].as_str().unwrap_or("word")
209        } else {
210            "word"
211        };
212
213        if n == 0 {
214            return Ok(Value::Array(vec![]));
215        }
216
217        let result = match ngram_type {
218            "char" => {
219                // Character n-grams
220                let chars: Vec<char> = s.chars().collect();
221                if chars.len() < n {
222                    vec![]
223                } else {
224                    chars
225                        .windows(n)
226                        .map(|w| Value::String(w.iter().collect()))
227                        .collect()
228                }
229            }
230            _ => {
231                // Word n-grams (default)
232                let words: Vec<&str> = s.split_whitespace().collect();
233                if words.len() < n {
234                    vec![]
235                } else {
236                    words
237                        .windows(n)
238                        .map(|w| {
239                            let arr: Vec<Value> = w
240                                .iter()
241                                .map(|word| Value::String(word.to_string()))
242                                .collect();
243                            Value::Array(arr)
244                        })
245                        .collect()
246                }
247            }
248        };
249
250        Ok(Value::Array(result))
251    }
252}
253
254// =============================================================================
255// bigrams(s) -> array
256// Convenience function for word bigrams (2-grams).
257// =============================================================================
258
259defn!(BigramsFn, vec![arg!(string)], None);
260
261impl Function for BigramsFn {
262    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
263        self.signature.validate(args, ctx)?;
264        let s = args[0].as_str().unwrap();
265
266        let words: Vec<&str> = s.split_whitespace().collect();
267        if words.len() < 2 {
268            return Ok(Value::Array(vec![]));
269        }
270
271        let result: Vec<Value> = words
272            .windows(2)
273            .map(|w| {
274                let arr: Vec<Value> = w
275                    .iter()
276                    .map(|word| Value::String(word.to_string()))
277                    .collect();
278                Value::Array(arr)
279            })
280            .collect();
281
282        Ok(Value::Array(result))
283    }
284}
285
286// =============================================================================
287// trigrams(s) -> array
288// Convenience function for word trigrams (3-grams).
289// =============================================================================
290
291defn!(TrigramsFn, vec![arg!(string)], None);
292
293impl Function for TrigramsFn {
294    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
295        self.signature.validate(args, ctx)?;
296        let s = args[0].as_str().unwrap();
297
298        let words: Vec<&str> = s.split_whitespace().collect();
299        if words.len() < 3 {
300            return Ok(Value::Array(vec![]));
301        }
302
303        let result: Vec<Value> = words
304            .windows(3)
305            .map(|w| {
306                let arr: Vec<Value> = w
307                    .iter()
308                    .map(|word| Value::String(word.to_string()))
309                    .collect();
310                Value::Array(arr)
311            })
312            .collect();
313
314        Ok(Value::Array(result))
315    }
316}
317
318// =============================================================================
319// tokens(s) -> array
320// Simple word tokenization: lowercase, strip punctuation
321// =============================================================================
322
323defn!(TokensFn, vec![arg!(string)], None);
324
325impl Function for TokensFn {
326    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
327        self.signature.validate(args, ctx)?;
328        let s = args[0].as_str().unwrap();
329
330        let tokens: Vec<Value> = s
331            .split_whitespace()
332            .filter_map(|word| {
333                let normalized: String = word
334                    .chars()
335                    .filter(|c| c.is_alphanumeric())
336                    .collect::<String>()
337                    .to_lowercase();
338
339                if normalized.is_empty() {
340                    None
341                } else {
342                    Some(Value::String(normalized))
343                }
344            })
345            .collect();
346
347        Ok(Value::Array(tokens))
348    }
349}
350
351// =============================================================================
352// tokenize(s, options?) -> array
353// Configurable tokenization with options:
354//   - case: "lower" (default), "upper", "preserve"
355//   - punctuation: "strip" (default), "keep"
356// =============================================================================
357
358defn!(TokenizeFn, vec![arg!(string)], Some(arg!(object)));
359
360impl Function for TokenizeFn {
361    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
362        self.signature.validate(args, ctx)?;
363        let s = args[0].as_str().unwrap();
364
365        // Parse options
366        let (case_mode, strip_punctuation) = if args.len() > 1 {
367            if let Some(opts) = args[1].as_object() {
368                let case_mode = opts.get("case").and_then(|v| v.as_str()).unwrap_or("lower");
369
370                let punctuation = opts
371                    .get("punctuation")
372                    .and_then(|v| v.as_str())
373                    .unwrap_or("strip");
374
375                (case_mode.to_string(), punctuation != "keep")
376            } else {
377                ("lower".to_string(), true)
378            }
379        } else {
380            ("lower".to_string(), true)
381        };
382
383        let tokens: Vec<Value> = s
384            .split_whitespace()
385            .filter_map(|word| {
386                let processed: String = if strip_punctuation {
387                    word.chars().filter(|c| c.is_alphanumeric()).collect()
388                } else {
389                    word.to_string()
390                };
391
392                if processed.is_empty() {
393                    return None;
394                }
395
396                let final_token = match case_mode.as_str() {
397                    "upper" => processed.to_uppercase(),
398                    "preserve" => processed,
399                    _ => processed.to_lowercase(), // "lower" or default
400                };
401
402                Some(Value::String(final_token))
403            })
404            .collect();
405
406        Ok(Value::Array(tokens))
407    }
408}
409
410// =============================================================================
411// stem(word, lang?) -> string
412// Stem a single word using Porter/Snowball stemmer
413// =============================================================================
414
415defn!(StemFn, vec![arg!(string)], Some(arg!(string)));
416
417impl Function for StemFn {
418    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
419        use rust_stemmers::{Algorithm, Stemmer};
420
421        self.signature.validate(args, ctx)?;
422        let word = args[0].as_str().unwrap();
423
424        let lang = if args.len() > 1 {
425            args[1].as_str().map(|s| s.to_string())
426        } else {
427            None
428        };
429
430        let algorithm = match lang.as_deref() {
431            Some("ar" | "arabic") => Algorithm::Arabic,
432            Some("da" | "danish") => Algorithm::Danish,
433            Some("nl" | "dutch") => Algorithm::Dutch,
434            Some("fi" | "finnish") => Algorithm::Finnish,
435            Some("fr" | "french") => Algorithm::French,
436            Some("de" | "german") => Algorithm::German,
437            Some("el" | "greek") => Algorithm::Greek,
438            Some("hu" | "hungarian") => Algorithm::Hungarian,
439            Some("it" | "italian") => Algorithm::Italian,
440            Some("no" | "norwegian") => Algorithm::Norwegian,
441            Some("pt" | "portuguese") => Algorithm::Portuguese,
442            Some("ro" | "romanian") => Algorithm::Romanian,
443            Some("ru" | "russian") => Algorithm::Russian,
444            Some("es" | "spanish") => Algorithm::Spanish,
445            Some("sv" | "swedish") => Algorithm::Swedish,
446            Some("ta" | "tamil") => Algorithm::Tamil,
447            Some("tr" | "turkish") => Algorithm::Turkish,
448            _ => Algorithm::English, // Default to English
449        };
450
451        let stemmer = Stemmer::create(algorithm);
452        let stemmed = stemmer.stem(word).to_string();
453
454        Ok(Value::String(stemmed))
455    }
456}
457
458// =============================================================================
459// stems(tokens, lang?) -> array
460// Stem an array of tokens
461// =============================================================================
462
463defn!(StemsFn, vec![arg!(array)], Some(arg!(string)));
464
465impl Function for StemsFn {
466    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
467        use rust_stemmers::{Algorithm, Stemmer};
468
469        self.signature.validate(args, ctx)?;
470        let tokens = args[0].as_array().unwrap();
471
472        let lang = if args.len() > 1 {
473            args[1].as_str().map(|s| s.to_string())
474        } else {
475            None
476        };
477
478        let algorithm = match lang.as_deref() {
479            Some("ar" | "arabic") => Algorithm::Arabic,
480            Some("da" | "danish") => Algorithm::Danish,
481            Some("nl" | "dutch") => Algorithm::Dutch,
482            Some("fi" | "finnish") => Algorithm::Finnish,
483            Some("fr" | "french") => Algorithm::French,
484            Some("de" | "german") => Algorithm::German,
485            Some("el" | "greek") => Algorithm::Greek,
486            Some("hu" | "hungarian") => Algorithm::Hungarian,
487            Some("it" | "italian") => Algorithm::Italian,
488            Some("no" | "norwegian") => Algorithm::Norwegian,
489            Some("pt" | "portuguese") => Algorithm::Portuguese,
490            Some("ro" | "romanian") => Algorithm::Romanian,
491            Some("ru" | "russian") => Algorithm::Russian,
492            Some("es" | "spanish") => Algorithm::Spanish,
493            Some("sv" | "swedish") => Algorithm::Swedish,
494            Some("ta" | "tamil") => Algorithm::Tamil,
495            Some("tr" | "turkish") => Algorithm::Turkish,
496            _ => Algorithm::English,
497        };
498
499        let stemmer = Stemmer::create(algorithm);
500
501        let result: Vec<Value> = tokens
502            .iter()
503            .filter_map(|t| {
504                t.as_str()
505                    .map(|s| Value::String(stemmer.stem(s).to_string()))
506            })
507            .collect();
508
509        Ok(Value::Array(result))
510    }
511}
512
513// =============================================================================
514// stopwords(lang?) -> array
515// Get stopwords list for a language
516// =============================================================================
517
518defn!(StopwordsFn, vec![], Some(arg!(string)));
519
520impl Function for StopwordsFn {
521    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
522        use stop_words::{LANGUAGE, get};
523
524        self.signature.validate(args, ctx)?;
525
526        let lang = if !args.is_empty() {
527            args[0].as_str().map(|s| s.to_string())
528        } else {
529            None
530        };
531
532        let language = match lang.as_deref() {
533            Some("ar" | "arabic") => LANGUAGE::Arabic,
534            Some("bg" | "bulgarian") => LANGUAGE::Bulgarian,
535            Some("ca" | "catalan") => LANGUAGE::Catalan,
536            Some("cs" | "czech") => LANGUAGE::Czech,
537            Some("da" | "danish") => LANGUAGE::Danish,
538            Some("nl" | "dutch") => LANGUAGE::Dutch,
539            Some("fi" | "finnish") => LANGUAGE::Finnish,
540            Some("fr" | "french") => LANGUAGE::French,
541            Some("de" | "german") => LANGUAGE::German,
542            Some("he" | "hebrew") => LANGUAGE::Hebrew,
543            Some("hi" | "hindi") => LANGUAGE::Hindi,
544            Some("hu" | "hungarian") => LANGUAGE::Hungarian,
545            Some("id" | "indonesian") => LANGUAGE::Indonesian,
546            Some("it" | "italian") => LANGUAGE::Italian,
547            Some("ja" | "japanese") => LANGUAGE::Japanese,
548            Some("ko" | "korean") => LANGUAGE::Korean,
549            Some("lv" | "latvian") => LANGUAGE::Latvian,
550            Some("no" | "norwegian") => LANGUAGE::Norwegian,
551            Some("fa" | "persian") => LANGUAGE::Persian,
552            Some("pl" | "polish") => LANGUAGE::Polish,
553            Some("pt" | "portuguese") => LANGUAGE::Portuguese,
554            Some("ro" | "romanian") => LANGUAGE::Romanian,
555            Some("ru" | "russian") => LANGUAGE::Russian,
556            Some("sk" | "slovak") => LANGUAGE::Slovak,
557            Some("es" | "spanish") => LANGUAGE::Spanish,
558            Some("sv" | "swedish") => LANGUAGE::Swedish,
559            Some("th" | "thai") => LANGUAGE::Thai,
560            Some("tr" | "turkish") => LANGUAGE::Turkish,
561            Some("uk" | "ukrainian") => LANGUAGE::Ukrainian,
562            Some("vi" | "vietnamese") => LANGUAGE::Vietnamese,
563            Some("zh" | "chinese") => LANGUAGE::Chinese,
564            _ => LANGUAGE::English,
565        };
566
567        let words = get(language);
568        let result: Vec<Value> = words.iter().map(|w| Value::String(w.to_string())).collect();
569
570        Ok(Value::Array(result))
571    }
572}
573
574// =============================================================================
575// remove_stopwords(tokens, lang?) -> array
576// Remove stopwords from token array
577// =============================================================================
578
579defn!(RemoveStopwordsFn, vec![arg!(array)], Some(arg!(string)));
580
581impl Function for RemoveStopwordsFn {
582    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
583        use stop_words::{LANGUAGE, get};
584
585        self.signature.validate(args, ctx)?;
586        let tokens = args[0].as_array().unwrap();
587
588        let lang = if args.len() > 1 {
589            args[1].as_str().map(|s| s.to_string())
590        } else {
591            None
592        };
593
594        let language = match lang.as_deref() {
595            Some("ar" | "arabic") => LANGUAGE::Arabic,
596            Some("bg" | "bulgarian") => LANGUAGE::Bulgarian,
597            Some("ca" | "catalan") => LANGUAGE::Catalan,
598            Some("cs" | "czech") => LANGUAGE::Czech,
599            Some("da" | "danish") => LANGUAGE::Danish,
600            Some("nl" | "dutch") => LANGUAGE::Dutch,
601            Some("fi" | "finnish") => LANGUAGE::Finnish,
602            Some("fr" | "french") => LANGUAGE::French,
603            Some("de" | "german") => LANGUAGE::German,
604            Some("he" | "hebrew") => LANGUAGE::Hebrew,
605            Some("hi" | "hindi") => LANGUAGE::Hindi,
606            Some("hu" | "hungarian") => LANGUAGE::Hungarian,
607            Some("id" | "indonesian") => LANGUAGE::Indonesian,
608            Some("it" | "italian") => LANGUAGE::Italian,
609            Some("ja" | "japanese") => LANGUAGE::Japanese,
610            Some("ko" | "korean") => LANGUAGE::Korean,
611            Some("lv" | "latvian") => LANGUAGE::Latvian,
612            Some("no" | "norwegian") => LANGUAGE::Norwegian,
613            Some("fa" | "persian") => LANGUAGE::Persian,
614            Some("pl" | "polish") => LANGUAGE::Polish,
615            Some("pt" | "portuguese") => LANGUAGE::Portuguese,
616            Some("ro" | "romanian") => LANGUAGE::Romanian,
617            Some("ru" | "russian") => LANGUAGE::Russian,
618            Some("sk" | "slovak") => LANGUAGE::Slovak,
619            Some("es" | "spanish") => LANGUAGE::Spanish,
620            Some("sv" | "swedish") => LANGUAGE::Swedish,
621            Some("th" | "thai") => LANGUAGE::Thai,
622            Some("tr" | "turkish") => LANGUAGE::Turkish,
623            Some("uk" | "ukrainian") => LANGUAGE::Ukrainian,
624            Some("vi" | "vietnamese") => LANGUAGE::Vietnamese,
625            Some("zh" | "chinese") => LANGUAGE::Chinese,
626            _ => LANGUAGE::English,
627        };
628
629        let stopwords = get(language);
630        let stopwords_set: HashSet<String> = stopwords.iter().map(|s| s.to_string()).collect();
631
632        let result: Vec<Value> = tokens
633            .iter()
634            .filter_map(|t| {
635                t.as_str().and_then(|s| {
636                    if stopwords_set.contains(&s.to_lowercase()) {
637                        None
638                    } else {
639                        Some(Value::String(s.to_string()))
640                    }
641                })
642            })
643            .collect();
644
645        Ok(Value::Array(result))
646    }
647}
648
649// =============================================================================
650// is_stopword(word, lang?) -> boolean
651// Check if word is a stopword
652// =============================================================================
653
654defn!(IsStopwordFn, vec![arg!(string)], Some(arg!(string)));
655
656impl Function for IsStopwordFn {
657    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
658        use stop_words::{LANGUAGE, get};
659
660        self.signature.validate(args, ctx)?;
661        let word = args[0].as_str().unwrap();
662
663        let lang = if args.len() > 1 {
664            args[1].as_str().map(|s| s.to_string())
665        } else {
666            None
667        };
668
669        let language = match lang.as_deref() {
670            Some("ar" | "arabic") => LANGUAGE::Arabic,
671            Some("bg" | "bulgarian") => LANGUAGE::Bulgarian,
672            Some("ca" | "catalan") => LANGUAGE::Catalan,
673            Some("cs" | "czech") => LANGUAGE::Czech,
674            Some("da" | "danish") => LANGUAGE::Danish,
675            Some("nl" | "dutch") => LANGUAGE::Dutch,
676            Some("fi" | "finnish") => LANGUAGE::Finnish,
677            Some("fr" | "french") => LANGUAGE::French,
678            Some("de" | "german") => LANGUAGE::German,
679            Some("he" | "hebrew") => LANGUAGE::Hebrew,
680            Some("hi" | "hindi") => LANGUAGE::Hindi,
681            Some("hu" | "hungarian") => LANGUAGE::Hungarian,
682            Some("id" | "indonesian") => LANGUAGE::Indonesian,
683            Some("it" | "italian") => LANGUAGE::Italian,
684            Some("ja" | "japanese") => LANGUAGE::Japanese,
685            Some("ko" | "korean") => LANGUAGE::Korean,
686            Some("lv" | "latvian") => LANGUAGE::Latvian,
687            Some("no" | "norwegian") => LANGUAGE::Norwegian,
688            Some("fa" | "persian") => LANGUAGE::Persian,
689            Some("pl" | "polish") => LANGUAGE::Polish,
690            Some("pt" | "portuguese") => LANGUAGE::Portuguese,
691            Some("ro" | "romanian") => LANGUAGE::Romanian,
692            Some("ru" | "russian") => LANGUAGE::Russian,
693            Some("sk" | "slovak") => LANGUAGE::Slovak,
694            Some("es" | "spanish") => LANGUAGE::Spanish,
695            Some("sv" | "swedish") => LANGUAGE::Swedish,
696            Some("th" | "thai") => LANGUAGE::Thai,
697            Some("tr" | "turkish") => LANGUAGE::Turkish,
698            Some("uk" | "ukrainian") => LANGUAGE::Ukrainian,
699            Some("vi" | "vietnamese") => LANGUAGE::Vietnamese,
700            Some("zh" | "chinese") => LANGUAGE::Chinese,
701            _ => LANGUAGE::English,
702        };
703
704        let stopwords = get(language);
705        let is_stop = stopwords.iter().any(|sw| sw.eq_ignore_ascii_case(word));
706
707        Ok(Value::Bool(is_stop))
708    }
709}
710
711// =============================================================================
712// normalize_unicode(s, form?) -> string
713// Unicode normalization (NFC, NFD, NFKC, NFKD)
714// =============================================================================
715
716defn!(NormalizeUnicodeFn, vec![arg!(string)], Some(arg!(string)));
717
718impl Function for NormalizeUnicodeFn {
719    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
720        use unicode_normalization::UnicodeNormalization;
721
722        self.signature.validate(args, ctx)?;
723        let s = args[0].as_str().unwrap();
724
725        let form = if args.len() > 1 {
726            args[1].as_str().map(|s| s.to_uppercase())
727        } else {
728            None
729        };
730
731        let normalized = match form.as_deref() {
732            Some("NFD") => s.nfd().collect::<String>(),
733            Some("NFKC") => s.nfkc().collect::<String>(),
734            Some("NFKD") => s.nfkd().collect::<String>(),
735            _ => s.nfc().collect::<String>(), // Default to NFC
736        };
737
738        Ok(Value::String(normalized))
739    }
740}
741
742// =============================================================================
743// remove_accents(s) -> string
744// Strip diacritics/accents from text
745// =============================================================================
746
747defn!(RemoveAccentsFn, vec![arg!(string)], None);
748
749impl Function for RemoveAccentsFn {
750    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
751        use unicode_normalization::UnicodeNormalization;
752
753        self.signature.validate(args, ctx)?;
754        let s = args[0].as_str().unwrap();
755
756        // NFD normalize then filter out combining marks (diacritics)
757        let result: String = s
758            .nfd()
759            .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
760            .collect();
761
762        Ok(Value::String(result))
763    }
764}
765
766// =============================================================================
767// collapse_whitespace(s) -> string
768// Normalize whitespace (multiple spaces -> single, trim)
769// =============================================================================
770
771defn!(CollapseWhitespaceFn, vec![arg!(string)], None);
772
773impl Function for CollapseWhitespaceFn {
774    fn evaluate(&self, args: &[Value], ctx: &mut Context<'_>) -> SearchResult {
775        self.signature.validate(args, ctx)?;
776        let s = args[0].as_str().unwrap();
777
778        let result: String = s.split_whitespace().collect::<Vec<_>>().join(" ");
779
780        Ok(Value::String(result))
781    }
782}
783
784/// Register text functions filtered by the enabled set.
785pub fn register_filtered(runtime: &mut Runtime, enabled: &HashSet<&str>) {
786    register_if_enabled(runtime, "word_count", enabled, Box::new(WordCountFn::new()));
787    register_if_enabled(runtime, "char_count", enabled, Box::new(CharCountFn::new()));
788    register_if_enabled(
789        runtime,
790        "sentence_count",
791        enabled,
792        Box::new(SentenceCountFn::new()),
793    );
794    register_if_enabled(
795        runtime,
796        "paragraph_count",
797        enabled,
798        Box::new(ParagraphCountFn::new()),
799    );
800    register_if_enabled(
801        runtime,
802        "reading_time",
803        enabled,
804        Box::new(ReadingTimeFn::new()),
805    );
806    register_if_enabled(
807        runtime,
808        "reading_time_seconds",
809        enabled,
810        Box::new(ReadingTimeSecondsFn::new()),
811    );
812    register_if_enabled(
813        runtime,
814        "char_frequencies",
815        enabled,
816        Box::new(CharFrequenciesFn::new()),
817    );
818    register_if_enabled(
819        runtime,
820        "word_frequencies",
821        enabled,
822        Box::new(WordFrequenciesFn::new()),
823    );
824    register_if_enabled(runtime, "ngrams", enabled, Box::new(NgramsFn::new()));
825    register_if_enabled(runtime, "bigrams", enabled, Box::new(BigramsFn::new()));
826    register_if_enabled(runtime, "trigrams", enabled, Box::new(TrigramsFn::new()));
827    register_if_enabled(runtime, "tokens", enabled, Box::new(TokensFn::new()));
828    register_if_enabled(runtime, "tokenize", enabled, Box::new(TokenizeFn::new()));
829    register_if_enabled(runtime, "stem", enabled, Box::new(StemFn::new()));
830    register_if_enabled(runtime, "stems", enabled, Box::new(StemsFn::new()));
831    register_if_enabled(runtime, "stopwords", enabled, Box::new(StopwordsFn::new()));
832    register_if_enabled(
833        runtime,
834        "remove_stopwords",
835        enabled,
836        Box::new(RemoveStopwordsFn::new()),
837    );
838    register_if_enabled(
839        runtime,
840        "is_stopword",
841        enabled,
842        Box::new(IsStopwordFn::new()),
843    );
844    register_if_enabled(
845        runtime,
846        "normalize_unicode",
847        enabled,
848        Box::new(NormalizeUnicodeFn::new()),
849    );
850    register_if_enabled(
851        runtime,
852        "remove_accents",
853        enabled,
854        Box::new(RemoveAccentsFn::new()),
855    );
856    register_if_enabled(
857        runtime,
858        "collapse_whitespace",
859        enabled,
860        Box::new(CollapseWhitespaceFn::new()),
861    );
862}
863
#[cfg(test)]
mod tests {
    use crate::Runtime;
    use serde_json::{Value, json};

    /// Build a runtime with the standard functions and every extension enabled.
    fn setup_runtime() -> Runtime {
        Runtime::builder()
            .with_standard()
            .with_all_extensions()
            .build()
    }

    /// Compile `expr` on a fresh runtime and evaluate it against `data`,
    /// panicking on any compile or search error.
    fn eval(expr: &str, data: &Value) -> Value {
        let runtime = setup_runtime();
        let compiled = runtime.compile(expr).unwrap();
        compiled.search(data).unwrap()
    }

    /// Evaluate `expr` and unwrap the result as an f64.
    fn eval_num(expr: &str, data: &Value) -> f64 {
        eval(expr, data).as_f64().unwrap()
    }

    /// Collect a JSON array's string elements into owned Rust strings.
    fn strings(value: &Value) -> Vec<String> {
        value
            .as_array()
            .unwrap()
            .iter()
            .filter_map(|v| v.as_str().map(str::to_string))
            .collect()
    }

    #[test]
    fn test_word_count() {
        let data = json!("Hello world, this is a test.");
        assert_eq!(eval_num("word_count(@)", &data), 6.0);
    }

    #[test]
    fn test_word_count_empty() {
        assert_eq!(eval_num("word_count(@)", &json!("")), 0.0);
    }

    #[test]
    fn test_char_count() {
        // "Hello world" without the space = 10 characters.
        assert_eq!(eval_num("char_count(@)", &json!("Hello world")), 10.0);
    }

    #[test]
    fn test_sentence_count() {
        let data = json!("Hello world. How are you? I am fine!");
        assert_eq!(eval_num("sentence_count(@)", &data), 3.0);
    }

    #[test]
    fn test_sentence_count_no_punctuation() {
        assert_eq!(eval_num("sentence_count(@)", &json!("Hello world")), 1.0);
    }

    #[test]
    fn test_paragraph_count() {
        let data = json!("First paragraph.\n\nSecond paragraph.\n\nThird.");
        assert_eq!(eval_num("paragraph_count(@)", &data), 3.0);
    }

    #[test]
    fn test_reading_time() {
        // 200 words = 1 minute at 200 wpm.
        let text = vec!["word"; 200].join(" ");
        assert_eq!(eval_num("reading_time(@)", &json!(text)), 1.0);
    }

    #[test]
    fn test_reading_time_short() {
        // Anything shorter than a minute rounds up to 1 minute.
        assert_eq!(eval_num("reading_time(@)", &json!("Quick read")), 1.0);
    }

    #[test]
    fn test_reading_time_seconds() {
        // 100 words = 30 seconds at 200 wpm.
        let text = vec!["word"; 100].join(" ");
        assert_eq!(eval_num("reading_time_seconds(@)", &json!(text)), 30.0);
    }

    #[test]
    fn test_char_frequencies() {
        let result = eval("char_frequencies(@)", &json!("aab"));
        assert_eq!(result["a"].as_f64().unwrap(), 2.0);
        assert_eq!(result["b"].as_f64().unwrap(), 1.0);
    }

    #[test]
    fn test_word_frequencies() {
        let result = eval("word_frequencies(@)", &json!("hello world hello"));
        assert_eq!(result["hello"].as_f64().unwrap(), 2.0);
        assert_eq!(result["world"].as_f64().unwrap(), 1.0);
    }

    #[test]
    fn test_word_frequencies_normalized() {
        // Case and punctuation are normalized away, so all three count as "hello".
        let result = eval("word_frequencies(@)", &json!("Hello, HELLO hello!"));
        assert_eq!(result["hello"].as_f64().unwrap(), 3.0);
    }

    #[test]
    fn test_ngrams_char() {
        let result = eval("ngrams(@, `3`, 'char')", &json!("hello"));
        assert_eq!(strings(&result), vec!["hel", "ell", "llo"]);
    }

    #[test]
    fn test_ngrams_word() {
        let result = eval("ngrams(@, `2`)", &json!("the quick brown fox"));
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 3);
        // Each word-mode n-gram is an array of words.
        assert_eq!(strings(&arr[0]), vec!["the", "quick"]);
    }

    #[test]
    fn test_ngrams_empty() {
        // Asking for 3-grams from a 2-char string yields nothing.
        let result = eval("ngrams(@, `3`, 'char')", &json!("hi"));
        assert!(result.as_array().unwrap().is_empty());
    }

    #[test]
    fn test_bigrams() {
        let result = eval("bigrams(@)", &json!("a b c d"));
        let arr = result.as_array().unwrap();
        // [["a", "b"], ["b", "c"], ["c", "d"]]
        assert_eq!(arr.len(), 3);
        assert_eq!(strings(&arr[0]), vec!["a", "b"]);
        assert_eq!(strings(&arr[2]), vec!["c", "d"]);
    }

    #[test]
    fn test_bigrams_single_word() {
        let result = eval("bigrams(@)", &json!("hello"));
        assert!(result.as_array().unwrap().is_empty());
    }

    #[test]
    fn test_trigrams() {
        let result = eval("trigrams(@)", &json!("a b c d e"));
        let arr = result.as_array().unwrap();
        // [["a", "b", "c"], ["b", "c", "d"], ["c", "d", "e"]]
        assert_eq!(arr.len(), 3);
        assert_eq!(strings(&arr[0]), vec!["a", "b", "c"]);
    }

    #[test]
    fn test_trigrams_too_short() {
        let result = eval("trigrams(@)", &json!("a b"));
        assert!(result.as_array().unwrap().is_empty());
    }

    // =========================================================================
    // tokens tests
    // =========================================================================

    #[test]
    fn test_tokens_basic() {
        let result = eval("tokens(@)", &json!("Hello, World!"));
        assert_eq!(strings(&result), vec!["hello", "world"]);
    }

    #[test]
    fn test_tokens_punctuation_only() {
        let result = eval("tokens(@)", &json!("... --- !!!"));
        assert!(result.as_array().unwrap().is_empty());
    }

    #[test]
    fn test_tokens_mixed() {
        let result = eval("tokens(@)", &json!("The quick, brown fox!"));
        assert_eq!(strings(&result), vec!["the", "quick", "brown", "fox"]);
    }

    #[test]
    fn test_tokens_empty() {
        let result = eval("tokens(@)", &json!(""));
        assert!(result.as_array().unwrap().is_empty());
    }

    // =========================================================================
    // tokenize tests
    // =========================================================================

    #[test]
    fn test_tokenize_default() {
        let result = eval("tokenize(@)", &json!("Hello, World!"));
        assert_eq!(strings(&result), vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_preserve_case() {
        let result = eval(
            r#"tokenize(@, `{"case": "preserve"}`)"#,
            &json!("Hello, World!"),
        );
        assert_eq!(strings(&result), vec!["Hello", "World"]);
    }

    #[test]
    fn test_tokenize_upper_case() {
        let result = eval(
            r#"tokenize(@, `{"case": "upper"}`)"#,
            &json!("Hello, World!"),
        );
        assert_eq!(strings(&result), vec!["HELLO", "WORLD"]);
    }

    #[test]
    fn test_tokenize_keep_punctuation() {
        let result = eval(
            r#"tokenize(@, `{"punctuation": "keep"}`)"#,
            &json!("Hello, World!"),
        );
        assert_eq!(strings(&result), vec!["hello,", "world!"]);
    }

    #[test]
    fn test_tokenize_preserve_case_keep_punctuation() {
        let result = eval(
            r#"tokenize(@, `{"case": "preserve", "punctuation": "keep"}`)"#,
            &json!("Hello, World!"),
        );
        assert_eq!(strings(&result), vec!["Hello,", "World!"]);
    }

    // =========================================================================
    // stem tests
    // =========================================================================

    #[test]
    fn test_stem_basic() {
        let result = eval("stem(@)", &json!("running"));
        assert_eq!(result.as_str().unwrap(), "run");
    }

    #[test]
    fn test_stem_plural() {
        let result = eval("stem(@)", &json!("cats"));
        assert_eq!(result.as_str().unwrap(), "cat");
    }

    #[test]
    fn test_stems_array() {
        let data = json!(["running", "cats", "quickly"]);
        assert_eq!(strings(&eval("stems(@)", &data)), vec!["run", "cat", "quick"]);
    }

    // =========================================================================
    // stopwords tests
    // =========================================================================

    #[test]
    fn test_stopwords_english() {
        // English stopwords should include common words.
        let words = strings(&eval("stopwords()", &json!(null)));
        for expected in ["the", "is", "a"] {
            assert!(words.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_remove_stopwords() {
        let data = json!(["the", "quick", "brown", "fox"]);
        let words = strings(&eval("remove_stopwords(@)", &data));
        // "the" should be removed; content words survive.
        assert!(!words.contains(&"the".to_string()));
        for kept in ["quick", "brown", "fox"] {
            assert!(words.contains(&kept.to_string()));
        }
    }

    #[test]
    fn test_is_stopword() {
        assert!(eval("is_stopword(@)", &json!("the")).as_bool().unwrap());
        assert!(!eval("is_stopword(@)", &json!("elephant")).as_bool().unwrap());
    }

    // =========================================================================
    // text normalization tests
    // =========================================================================

    #[test]
    fn test_normalize_unicode_default() {
        // "cafe" with combining acute accent (e + combining accent);
        // NFC should compose it into a single e-acute codepoint.
        let result = eval("normalize_unicode(@)", &json!("cafe\u{0301}"));
        assert_eq!(result.as_str().unwrap(), "caf\u{00e9}");
    }

    #[test]
    fn test_remove_accents() {
        let data = json!("caf\u{00e9} na\u{00ef}ve r\u{00e9}sum\u{00e9}");
        let result = eval("remove_accents(@)", &data);
        assert_eq!(result.as_str().unwrap(), "cafe naive resume");
    }

    #[test]
    fn test_collapse_whitespace() {
        let result = eval("collapse_whitespace(@)", &json!("  hello   world  "));
        assert_eq!(result.as_str().unwrap(), "hello world");
    }

    #[test]
    fn test_collapse_whitespace_tabs_newlines() {
        let result = eval("collapse_whitespace(@)", &json!("hello\t\n\nworld"));
        assert_eq!(result.as_str().unwrap(), "hello world");
    }
}