Skip to main content

oxirs_arq/
string_functions_ext.rs

1//! Enhanced String Functions for SPARQL 1.1+
2//!
3//! This module implements extended string manipulation and literal construction
4//! functions as specified in SPARQL 1.1 and RDF 1.2.
5//!
6//! Based on Apache Jena ARQ implementation.
7
8use crate::extensions::{CustomFunction, ExecutionContext, Value, ValueType};
9use anyhow::{bail, Result};
10
11// ============================================================================
12// STRBEFORE Function - Substring before separator
13// ============================================================================
14
15/// STRBEFORE(str, separator) - Returns the substring before the first occurrence of separator
16///
17/// Returns the part of the string before the first occurrence of the separator.
18/// If separator is not found, returns empty string "".
19/// If separator is empty string, returns empty string "".
20///
21/// # Examples
22/// ```sparql
23/// STRBEFORE("abc@example.org", "@") → "abc"
24/// STRBEFORE("foobar", "bar") → "foo"
25/// STRBEFORE("foobar", "xyz") → ""
26/// ```
27#[derive(Debug, Clone)]
28pub struct StrBeforeFunction;
29
30impl CustomFunction for StrBeforeFunction {
31    fn name(&self) -> &str {
32        "http://www.w3.org/2001/XMLSchema#strBefore"
33    }
34
35    fn arity(&self) -> Option<usize> {
36        Some(2)
37    }
38
39    fn parameter_types(&self) -> Vec<ValueType> {
40        vec![ValueType::String, ValueType::String]
41    }
42
43    fn return_type(&self) -> ValueType {
44        ValueType::String
45    }
46
47    fn documentation(&self) -> &str {
48        "Returns the substring before the first occurrence of separator"
49    }
50
51    fn clone_function(&self) -> Box<dyn CustomFunction> {
52        Box::new(self.clone())
53    }
54
55    fn execute(&self, args: &[Value], _context: &ExecutionContext) -> Result<Value> {
56        if args.len() != 2 {
57            bail!("STRBEFORE() requires exactly 2 arguments");
58        }
59
60        let (string, lang, _dt) = extract_string_value(&args[0])?;
61        let (separator, sep_lang, _sep_dt) = extract_string_value(&args[1])?;
62
63        // Language tags must be compatible
64        if !compatible_languages(&lang, &sep_lang) {
65            bail!("STRBEFORE: incompatible language tags");
66        }
67
68        if separator.is_empty() {
69            return Ok(create_string_value("", lang));
70        }
71
72        let result = if let Some(pos) = string.find(&separator) {
73            &string[..pos]
74        } else {
75            ""
76        };
77
78        Ok(create_string_value(result, lang))
79    }
80}
81
82// ============================================================================
83// STRAFTER Function - Substring after separator
84// ============================================================================
85
86/// STRAFTER(str, separator) - Returns the substring after the first occurrence of separator
87///
88/// Returns the part of the string after the first occurrence of the separator.
89/// If separator is not found, returns empty string "".
90/// If separator is empty string, returns the original string.
91///
92/// # Examples
93/// ```sparql
94/// STRAFTER("abc@example.org", "@") → "example.org"
95/// STRAFTER("foobar", "foo") → "bar"
96/// STRAFTER("foobar", "xyz") → ""
97/// ```
98#[derive(Debug, Clone)]
99pub struct StrAfterFunction;
100
101impl CustomFunction for StrAfterFunction {
102    fn name(&self) -> &str {
103        "http://www.w3.org/2001/XMLSchema#strAfter"
104    }
105
106    fn arity(&self) -> Option<usize> {
107        Some(2)
108    }
109
110    fn parameter_types(&self) -> Vec<ValueType> {
111        vec![ValueType::String, ValueType::String]
112    }
113
114    fn return_type(&self) -> ValueType {
115        ValueType::String
116    }
117
118    fn documentation(&self) -> &str {
119        "Returns the substring after the first occurrence of separator"
120    }
121
122    fn clone_function(&self) -> Box<dyn CustomFunction> {
123        Box::new(self.clone())
124    }
125
126    fn execute(&self, args: &[Value], _context: &ExecutionContext) -> Result<Value> {
127        if args.len() != 2 {
128            bail!("STRAFTER() requires exactly 2 arguments");
129        }
130
131        let (string, lang, _dt) = extract_string_value(&args[0])?;
132        let (separator, sep_lang, _sep_dt) = extract_string_value(&args[1])?;
133
134        // Language tags must be compatible
135        if !compatible_languages(&lang, &sep_lang) {
136            bail!("STRAFTER: incompatible language tags");
137        }
138
139        if separator.is_empty() {
140            return Ok(create_string_value(&string, lang));
141        }
142
143        let result = if let Some(pos) = string.find(&separator) {
144            &string[pos + separator.len()..]
145        } else {
146            ""
147        };
148
149        Ok(create_string_value(result, lang))
150    }
151}
152
153// ============================================================================
154// STRLANG Function - Create language-tagged literal
155// ============================================================================
156
157/// STRLANG(str, lang) - Creates a language-tagged literal
158///
159/// Creates a new literal with the specified language tag.
160/// The language tag must be a valid BCP47 language tag (lowercased).
161///
162/// # Examples
163/// ```sparql
164/// STRLANG("chat", "fr") → "chat"@fr
165/// STRLANG("Hello", "en") → "Hello"@en
166/// ```
167#[derive(Debug, Clone)]
168pub struct StrLangFunction;
169
170impl CustomFunction for StrLangFunction {
171    fn name(&self) -> &str {
172        "http://www.w3.org/2001/XMLSchema#strLang"
173    }
174
175    fn arity(&self) -> Option<usize> {
176        Some(2)
177    }
178
179    fn parameter_types(&self) -> Vec<ValueType> {
180        vec![ValueType::String, ValueType::String]
181    }
182
183    fn return_type(&self) -> ValueType {
184        ValueType::Literal
185    }
186
187    fn documentation(&self) -> &str {
188        "Creates a language-tagged literal from a string and language tag"
189    }
190
191    fn clone_function(&self) -> Box<dyn CustomFunction> {
192        Box::new(self.clone())
193    }
194
195    fn execute(&self, args: &[Value], _context: &ExecutionContext) -> Result<Value> {
196        if args.len() != 2 {
197            bail!("STRLANG() requires exactly 2 arguments");
198        }
199
200        // First argument must be a simple literal or string
201        let lexical_form = match &args[0] {
202            Value::String(s) => s.clone(),
203            Value::Literal {
204                value,
205                language: None,
206                datatype: None,
207            } => value.clone(),
208            _ => bail!("STRLANG: first argument must be a simple literal or string"),
209        };
210
211        // Second argument must be a simple literal or string (the language tag)
212        let lang_tag = match &args[1] {
213            Value::String(s) => s.clone(),
214            Value::Literal {
215                value,
216                language: None,
217                datatype: None,
218            } => value.clone(),
219            _ => bail!("STRLANG: second argument must be a simple literal or string"),
220        };
221
222        // Validate and normalize language tag (should be lowercase)
223        let lang_tag = validate_language_tag(&lang_tag)?;
224
225        Ok(Value::Literal {
226            value: lexical_form,
227            language: Some(lang_tag),
228            datatype: None,
229        })
230    }
231}
232
233// ============================================================================
234// STRLANGDIR Function - Create language and direction-tagged literal (RDF 1.2)
235// ============================================================================
236
237/// STRLANGDIR(str, lang, dir) - Creates a language and direction-tagged literal
238///
239/// Creates a new literal with specified language tag and text direction.
240/// Part of RDF 1.2 specification for bidirectional text support.
241///
242/// # Examples
243/// ```sparql
244/// STRLANGDIR("Hello", "en", "ltr") → "Hello"@en--ltr
245/// STRLANGDIR("مرحبا", "ar", "rtl") → "مرحبا"@ar--rtl
246/// ```
247#[derive(Debug, Clone)]
248pub struct StrLangDirFunction;
249
250impl CustomFunction for StrLangDirFunction {
251    fn name(&self) -> &str {
252        "http://www.w3.org/ns/rdf#langString"
253    }
254
255    fn arity(&self) -> Option<usize> {
256        Some(3)
257    }
258
259    fn parameter_types(&self) -> Vec<ValueType> {
260        vec![ValueType::String, ValueType::String, ValueType::String]
261    }
262
263    fn return_type(&self) -> ValueType {
264        ValueType::Literal
265    }
266
267    fn documentation(&self) -> &str {
268        "Creates a language and direction-tagged literal (RDF 1.2)"
269    }
270
271    fn clone_function(&self) -> Box<dyn CustomFunction> {
272        Box::new(self.clone())
273    }
274
275    fn execute(&self, args: &[Value], _context: &ExecutionContext) -> Result<Value> {
276        if args.len() != 3 {
277            bail!("STRLANGDIR() requires exactly 3 arguments");
278        }
279
280        // First argument: lexical form
281        let lexical_form = match &args[0] {
282            Value::String(s) => s.clone(),
283            Value::Literal {
284                value,
285                language: None,
286                datatype: None,
287            } => value.clone(),
288            _ => bail!("STRLANGDIR: first argument must be a simple literal or string"),
289        };
290
291        // Second argument: language tag
292        let lang_tag = match &args[1] {
293            Value::String(s) => s.clone(),
294            Value::Literal {
295                value,
296                language: None,
297                datatype: None,
298            } => value.clone(),
299            _ => bail!("STRLANGDIR: second argument must be a simple literal or string"),
300        };
301
302        // Third argument: direction (ltr or rtl)
303        let direction = match &args[2] {
304            Value::String(s) => s.clone(),
305            Value::Literal {
306                value,
307                language: None,
308                datatype: None,
309            } => value.clone(),
310            _ => bail!("STRLANGDIR: third argument must be a simple literal or string"),
311        };
312
313        // Validate direction
314        if direction != "ltr" && direction != "rtl" {
315            bail!("STRLANGDIR: direction must be 'ltr' or 'rtl'");
316        }
317
318        // Validate and normalize language tag
319        let lang_tag = validate_language_tag(&lang_tag)?;
320
321        // Combine language and direction using RDF 1.2 format: lang--dir
322        let lang_with_dir = format!("{}--{}", lang_tag, direction);
323
324        Ok(Value::Literal {
325            value: lexical_form,
326            language: Some(lang_with_dir),
327            datatype: None,
328        })
329    }
330}
331
332// ============================================================================
333// STRDT Function - Create datatyped literal
334// ============================================================================
335
336/// STRDT(str, datatype) - Creates a typed literal
337///
338/// Creates a new literal with the specified datatype IRI.
339///
340/// # Examples
341/// ```sparql
342/// STRDT("123", xsd:integer) → "123"^^xsd:integer
343/// STRDT("true", xsd:boolean) → "true"^^xsd:boolean
344/// ```
345#[derive(Debug, Clone)]
346pub struct StrDtFunction;
347
348impl CustomFunction for StrDtFunction {
349    fn name(&self) -> &str {
350        "http://www.w3.org/2001/XMLSchema#strDt"
351    }
352
353    fn arity(&self) -> Option<usize> {
354        Some(2)
355    }
356
357    fn parameter_types(&self) -> Vec<ValueType> {
358        vec![ValueType::String, ValueType::Iri]
359    }
360
361    fn return_type(&self) -> ValueType {
362        ValueType::Literal
363    }
364
365    fn documentation(&self) -> &str {
366        "Creates a typed literal from a string and datatype IRI"
367    }
368
369    fn clone_function(&self) -> Box<dyn CustomFunction> {
370        Box::new(self.clone())
371    }
372
373    fn execute(&self, args: &[Value], _context: &ExecutionContext) -> Result<Value> {
374        if args.len() != 2 {
375            bail!("STRDT() requires exactly 2 arguments");
376        }
377
378        // First argument: lexical form (must be simple literal or string)
379        let lexical_form = match &args[0] {
380            Value::String(s) => s.clone(),
381            Value::Literal {
382                value,
383                language: None,
384                datatype: None,
385            } => value.clone(),
386            _ => bail!("STRDT: first argument must be a simple literal or string"),
387        };
388
389        // Second argument: datatype IRI
390        let datatype = match &args[1] {
391            Value::Iri(iri) => iri.clone(),
392            _ => bail!("STRDT: second argument must be an IRI"),
393        };
394
395        Ok(Value::Literal {
396            value: lexical_form,
397            language: None,
398            datatype: Some(datatype),
399        })
400    }
401}
402
403// ============================================================================
404// Helper Functions
405// ============================================================================
406
407/// Extract string value and metadata from a Value
408fn extract_string_value(value: &Value) -> Result<(String, Option<String>, Option<String>)> {
409    match value {
410        Value::String(s) => Ok((s.clone(), None, None)),
411        Value::Literal {
412            value,
413            language,
414            datatype,
415        } => Ok((value.clone(), language.clone(), datatype.clone())),
416        Value::Iri(iri) => Ok((iri.clone(), None, None)),
417        _ => bail!("Expected string or literal value"),
418    }
419}
420
/// Check whether the language tags of two string arguments are compatible.
///
/// `lang1` is the language tag of the first argument and `lang2` that of the
/// second; `None` means the argument carries no language tag.
///
/// Following the SPARQL 1.1 argument-compatibility rules, the pair is
/// compatible when:
/// * neither argument has a language tag,
/// * both arguments have the same language tag (compared ASCII
///   case-insensitively, since BCP 47 tags are case-insensitive), or
/// * the first argument has a language tag and the second has none
///   (e.g. `STRBEFORE("abc"@en, "b")`).
///
/// An untagged first argument combined with a tagged second argument is
/// incompatible, as are two different tags.
fn compatible_languages(lang1: &Option<String>, lang2: &Option<String>) -> bool {
    match (lang1.as_deref(), lang2.as_deref()) {
        // An untagged second argument is compatible with any first argument.
        (_, None) => true,
        (Some(first), Some(second)) => first.eq_ignore_ascii_case(second),
        (None, Some(_)) => false,
    }
}
429
430/// Create a string value with optional language tag
431fn create_string_value(s: &str, lang: Option<String>) -> Value {
432    if let Some(lang_tag) = lang {
433        Value::Literal {
434            value: s.to_string(),
435            language: Some(lang_tag),
436            datatype: None,
437        }
438    } else {
439        Value::String(s.to_string())
440    }
441}
442
443/// Validate and normalize a language tag (BCP47)
444fn validate_language_tag(tag: &str) -> Result<String> {
445    // Basic validation: must not be empty, contains only ASCII alphanumeric and hyphens
446    if tag.is_empty() {
447        bail!("Language tag cannot be empty");
448    }
449
450    // Language tags should be ASCII
451    if !tag.is_ascii() {
452        bail!("Language tag must be ASCII");
453    }
454
455    // Normalize to lowercase (BCP47 recommends lowercase)
456    let normalized = tag.to_lowercase();
457
458    // Basic format check: starts with letter
459    if !normalized
460        .chars()
461        .next()
462        .expect("normalized string validated to be non-empty")
463        .is_ascii_alphabetic()
464    {
465        bail!("Language tag must start with a letter");
466    }
467
468    Ok(normalized)
469}
470
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Execution context with no bindings, no namespaces and no limits.
    fn empty_context() -> ExecutionContext {
        ExecutionContext {
            variables: HashMap::new(),
            namespaces: HashMap::new(),
            base_iri: None,
            dataset_context: None,
            query_time: chrono::Utc::now(),
            optimization_level: crate::extensions::OptimizationLevel::None,
            memory_limit: None,
            time_limit: None,
        }
    }

    /// Plain string argument.
    fn plain(text: &str) -> Value {
        Value::String(text.to_string())
    }

    /// Language-tagged literal argument without datatype.
    fn tagged(text: &str, lang: &str) -> Value {
        Value::Literal {
            value: text.to_string(),
            language: Some(lang.to_string()),
            datatype: None,
        }
    }

    /// Language field of a literal result; panics for any other value.
    fn language_of(result: Value) -> Option<String> {
        if let Value::Literal { language, .. } = result {
            language
        } else {
            panic!("Expected Literal")
        }
    }

    #[test]
    fn test_strbefore_basic() {
        let args = [plain("abc@example.org"), plain("@")];
        let outcome = StrBeforeFunction
            .execute(&args, &empty_context())
            .unwrap();
        assert_eq!(outcome, plain("abc"));
    }

    #[test]
    fn test_strbefore_not_found() {
        let args = [plain("foobar"), plain("xyz")];
        let outcome = StrBeforeFunction
            .execute(&args, &empty_context())
            .unwrap();
        assert_eq!(outcome, plain(""));
    }

    #[test]
    fn test_strafter_basic() {
        let args = [plain("abc@example.org"), plain("@")];
        let outcome = StrAfterFunction
            .execute(&args, &empty_context())
            .unwrap();
        assert_eq!(outcome, plain("example.org"));
    }

    #[test]
    fn test_strafter_not_found() {
        let args = [plain("foobar"), plain("xyz")];
        let outcome = StrAfterFunction
            .execute(&args, &empty_context())
            .unwrap();
        assert_eq!(outcome, plain(""));
    }

    #[test]
    fn test_strlang() {
        let args = [plain("chat"), plain("fr")];
        let outcome = StrLangFunction.execute(&args, &empty_context()).unwrap();

        if let Value::Literal {
            value,
            language,
            datatype,
        } = outcome
        {
            assert_eq!(value, "chat");
            assert_eq!(language, Some("fr".to_string()));
            assert_eq!(datatype, None);
        } else {
            panic!("Expected Literal");
        }
    }

    #[test]
    fn test_strlang_uppercase_normalized() {
        let args = [plain("Hello"), plain("EN")];
        let outcome = StrLangFunction.execute(&args, &empty_context()).unwrap();
        assert_eq!(language_of(outcome), Some("en".to_string()));
    }

    #[test]
    fn test_strlangdir() {
        let args = [plain("Hello"), plain("en"), plain("ltr")];
        let outcome = StrLangDirFunction
            .execute(&args, &empty_context())
            .unwrap();
        assert_eq!(language_of(outcome), Some("en--ltr".to_string()));
    }

    #[test]
    fn test_strlangdir_rtl() {
        let args = [plain("مرحبا"), plain("ar"), plain("rtl")];
        let outcome = StrLangDirFunction
            .execute(&args, &empty_context())
            .unwrap();
        assert_eq!(language_of(outcome), Some("ar--rtl".to_string()));
    }

    #[test]
    fn test_strlangdir_invalid_direction() {
        let args = [plain("Hello"), plain("en"), plain("invalid")];
        let outcome = StrLangDirFunction.execute(&args, &empty_context());
        assert!(outcome.is_err());
    }

    #[test]
    fn test_strdt() {
        let integer_iri = "http://www.w3.org/2001/XMLSchema#integer";
        let args = [plain("123"), Value::Iri(integer_iri.to_string())];
        let outcome = StrDtFunction.execute(&args, &empty_context()).unwrap();

        if let Value::Literal {
            value,
            language,
            datatype,
        } = outcome
        {
            assert_eq!(value, "123");
            assert_eq!(language, None);
            assert_eq!(
                datatype,
                Some("http://www.w3.org/2001/XMLSchema#integer".to_string())
            );
        } else {
            panic!("Expected Literal");
        }
    }

    #[test]
    fn test_strbefore_with_language() {
        let args = [tagged("abc@def", "en"), tagged("@", "en")];
        let outcome = StrBeforeFunction
            .execute(&args, &empty_context())
            .unwrap();

        if let Value::Literal {
            value, language, ..
        } = outcome
        {
            assert_eq!(value, "abc");
            assert_eq!(language, Some("en".to_string()));
        } else {
            panic!("Expected Literal with language");
        }
    }

    #[test]
    fn test_function_arities() {
        let two_argument_functions = [
            StrBeforeFunction.arity(),
            StrAfterFunction.arity(),
            StrLangFunction.arity(),
            StrDtFunction.arity(),
        ];
        for arity in two_argument_functions {
            assert_eq!(arity, Some(2));
        }
        assert_eq!(StrLangDirFunction.arity(), Some(3));
    }
}