Skip to main content

solid_pod_rs/wac/
parser.rs

1//! Turtle ACL parser (subset sufficient for WAC documents).
2//!
3//! Accepts the subset used by real-world Solid ACL files: `@prefix`
4//! directives, `a` shorthand, and `;`-separated predicate-object pairs
5//! terminated with `.`.
6//!
7//! Non-recognised tokens are skipped — the parser is deliberately
8//! forgiving so that odd whitespace or extra comments do not break it.
9
10use std::collections::HashMap;
11
12use crate::error::PodError;
13use crate::wac::client::ClientConditionBody;
14use crate::wac::conditions::Condition;
15use crate::wac::document::{ids_of, AclAuthorization, AclDocument, IdOrIds, IdRef};
16use crate::wac::issuer::IssuerConditionBody;
17use crate::wac::MAX_ACL_BYTES;
18
19/// Parse a Turtle ACL document into the same `AclDocument` shape that
20/// the JSON-LD deserialiser produces.
21///
22/// Enforces a byte cap (`JSS_MAX_ACL_BYTES`, default 1 MiB) so an
23/// attacker cannot feed a multi-gigabyte document and DoS the process.
24/// To supply an explicit limit, use [`parse_turtle_acl_with_limit`].
25pub fn parse_turtle_acl(input: &str) -> Result<AclDocument, PodError> {
26    let limit = std::env::var("JSS_MAX_ACL_BYTES")
27        .ok()
28        .and_then(|v| v.parse().ok())
29        .unwrap_or(MAX_ACL_BYTES);
30    parse_turtle_acl_with_limit(input, limit)
31}
32
33/// Parse a Turtle ACL document with a caller-supplied byte limit.
34///
35/// Equivalent to [`parse_turtle_acl`] but accepts the size cap as a
36/// parameter instead of reading from the `JSS_MAX_ACL_BYTES` environment
37/// variable. Returns `PodError::PayloadTooLarge` (HTTP 413 equivalent)
38/// when `input.len() > max_bytes`.
39pub fn parse_turtle_acl_with_limit(
40    input: &str,
41    max_bytes: usize,
42) -> Result<AclDocument, PodError> {
43    if input.len() > max_bytes {
44        return Err(PodError::PayloadTooLarge(format!(
45            "ACL body exceeds {max_bytes} bytes"
46        )));
47    }
48
49    let mut prefixes: HashMap<String, String> = HashMap::new();
50    prefixes.insert("acl".into(), "http://www.w3.org/ns/auth/acl#".into());
51    prefixes.insert("foaf".into(), "http://xmlns.com/foaf/0.1/".into());
52    prefixes.insert("vcard".into(), "http://www.w3.org/2006/vcard/ns#".into());
53
54    // Strip comments (lines beginning with # outside IRIs).
55    let cleaned = strip_turtle_comments(input);
56
57    // Pull out @prefix directives.
58    let mut body = String::new();
59    for line in cleaned.lines() {
60        let trimmed = line.trim();
61        if let Some(rest) = trimmed.strip_prefix("@prefix") {
62            let rest = rest.trim();
63            if let Some((name, iri_part)) = rest.split_once(':') {
64                let name = name.trim().to_string();
65                let iri_part = iri_part.trim().trim_end_matches('.').trim();
66                let iri = iri_part.trim_start_matches('<').trim_end_matches('>').trim();
67                prefixes.insert(name, iri.to_string());
68            }
69        } else {
70            body.push_str(line);
71            body.push('\n');
72        }
73    }
74
75    let statements = split_turtle_statements(&body);
76    let mut graph: Vec<AclAuthorization> = Vec::new();
77    for stmt in statements {
78        if stmt.trim().is_empty() {
79            continue;
80        }
81        if let Some(auth) = parse_turtle_authorization(&stmt, &prefixes) {
82            graph.push(auth);
83        }
84    }
85    Ok(AclDocument {
86        context: None,
87        graph: if graph.is_empty() { None } else { Some(graph) },
88    })
89}
90
/// Removes Turtle `#` comments from `input`, line by line.
///
/// A `#` starts a comment only when it appears outside an IRI
/// (`<...>`) and outside a double-quoted string literal; everything from
/// that `#` to the end of the line is dropped. Backslash escapes inside a
/// string literal are honoured, so `\"` does not end the literal.
///
/// IRI and string state are reset at the start of every line, so
/// multi-line (`"""`) literals are not tracked. Each input line yields
/// exactly one output line terminated by `\n`.
fn strip_turtle_comments(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    for line in input.lines() {
        let mut in_iri = false;
        let mut in_str = false;
        let mut escaped = false;
        for c in line.chars() {
            if in_str {
                // Inside a literal nothing is structural except the
                // closing quote; an escaped character is copied blindly.
                if escaped {
                    escaped = false;
                } else if c == '\\' {
                    escaped = true;
                } else if c == '"' {
                    in_str = false;
                }
                out.push(c);
                continue;
            }
            match c {
                '"' if !in_iri => in_str = true,
                '<' => in_iri = true,
                '>' => in_iri = false,
                // Comment start: discard the remainder of the line.
                '#' if !in_iri => break,
                _ => {}
            }
            out.push(c);
        }
        out.push('\n');
    }
    out
}
115
/// Splits comment-free Turtle text into statements at each terminating
/// `.`.
///
/// A `.` ends a statement only when it is outside an IRI (`<...>`) and
/// outside a double-quoted string literal. Backslash escapes inside a
/// literal are honoured, so an escaped quote (`\"`) does not close the
/// literal and a following `.` stays part of it. The terminating `.` is
/// not included in the returned pieces. Trailing text without a `.` is
/// returned as a final piece unless it is only whitespace.
fn split_turtle_statements(input: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut cur = String::new();
    let mut depth_iri = 0i32;
    let mut in_str = false;
    let mut escaped = false;
    for c in input.chars() {
        if in_str {
            // Only an unescaped quote leaves string state.
            if escaped {
                escaped = false;
            } else if c == '\\' {
                escaped = true;
            } else if c == '"' {
                in_str = false;
            }
            cur.push(c);
            continue;
        }
        match c {
            '"' => {
                in_str = true;
                cur.push(c);
            }
            '<' => {
                depth_iri += 1;
                cur.push(c);
            }
            '>' => {
                // Clamp so a stray '>' cannot push the depth negative.
                depth_iri = (depth_iri - 1).max(0);
                cur.push(c);
            }
            // Hand the finished statement over without copying it.
            '.' if depth_iri == 0 => out.push(std::mem::take(&mut cur)),
            _ => cur.push(c),
        }
    }
    if !cur.trim().is_empty() {
        out.push(cur);
    }
    out
}
147
148fn parse_turtle_authorization(
149    stmt: &str,
150    prefixes: &HashMap<String, String>,
151) -> Option<AclAuthorization> {
152    let trimmed = stmt.trim();
153    if trimmed.is_empty() {
154        return None;
155    }
156    let (_subject, body) = turtle_pop_term(trimmed)?;
157    let mut auth = AclAuthorization {
158        id: None,
159        r#type: None,
160        agent: None,
161        agent_class: None,
162        agent_group: None,
163        origin: None,
164        access_to: None,
165        default: None,
166        mode: None,
167        condition: None,
168    };
169    let mut any_authz = false;
170    // Split the predicate list honouring `[...]` balance so a blank
171    // node body (e.g. `acl:condition [ a acl:ClientCondition; ... ]`)
172    // is not torn apart by its inner `;` separators.
173    for pair in split_predicate_list(&body) {
174        let pair = pair.trim();
175        if pair.is_empty() {
176            continue;
177        }
178        let (pred, rest) = turtle_pop_term(pair)?;
179        let pred_expanded = expand_curie_or_iri(&pred, prefixes);
180        let objects = parse_object_list(rest.trim(), prefixes);
181
182        match pred_expanded.as_str() {
183            "a" | "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" | "rdf:type"
184                if objects.iter().any(|o| {
185                    o == "http://www.w3.org/ns/auth/acl#Authorization"
186                        || o == "acl:Authorization"
187                }) =>
188            {
189                any_authz = true;
190            }
191            "http://www.w3.org/ns/auth/acl#agent" | "acl:agent" => {
192                auth.agent = Some(ids_of(objects));
193            }
194            "http://www.w3.org/ns/auth/acl#agentClass" | "acl:agentClass" => {
195                auth.agent_class = Some(ids_of(objects));
196            }
197            "http://www.w3.org/ns/auth/acl#agentGroup" | "acl:agentGroup" => {
198                auth.agent_group = Some(ids_of(objects));
199            }
200            "http://www.w3.org/ns/auth/acl#origin" | "acl:origin" => {
201                auth.origin = Some(ids_of(objects));
202            }
203            "http://www.w3.org/ns/auth/acl#accessTo" | "acl:accessTo" => {
204                auth.access_to = Some(ids_of(objects));
205            }
206            "http://www.w3.org/ns/auth/acl#default" | "acl:default" => {
207                auth.default = Some(ids_of(objects));
208            }
209            "http://www.w3.org/ns/auth/acl#mode" | "acl:mode" => {
210                auth.mode = Some(ids_of(objects));
211            }
212            "http://www.w3.org/ns/auth/acl#condition" | "acl:condition" => {
213                // Conditions are usually authored as a blank-node
214                // body `[ a acl:ClientCondition; acl:client <...> ]`.
215                // The object side of the predicate contains one or
216                // more such bodies. Parse each; on failure we
217                // *preserve* the condition as `Unknown` so the
218                // authorisation fails closed at evaluation time.
219                let parsed = parse_turtle_condition_objects(rest.trim(), prefixes);
220                let bucket = auth.condition.get_or_insert_with(Vec::new);
221                bucket.extend(parsed);
222            }
223            _ => {}
224        }
225    }
226    if any_authz {
227        Some(auth)
228    } else {
229        None
230    }
231}
232
// ---------------------------------------------------------------------------
// Predicate-list splitter that respects `[...]` blank-node bodies.
//
// Turtle's top-level predicate-object pairs are terminated by `;`, but
// a blank-node body embedded as an object value also uses `;` to
// separate its internal pairs. Simple `body.split(';')` would tear the
// blank node apart; we track bracket depth instead.
//
// Double-quoted string literals are opaque: brackets and `;` inside them
// are ignored, and backslash escapes are honoured so that `\"` does not
// end the literal.
// ---------------------------------------------------------------------------
fn split_predicate_list(input: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut cur = String::new();
    let mut depth: i32 = 0;
    let mut in_str = false;
    let mut escaped = false;
    for c in input.chars() {
        if in_str {
            // Only an unescaped quote leaves string state.
            if escaped {
                escaped = false;
            } else if c == '\\' {
                escaped = true;
            } else if c == '"' {
                in_str = false;
            }
            cur.push(c);
            continue;
        }
        match c {
            '"' => {
                in_str = true;
                cur.push(c);
            }
            '[' => {
                depth += 1;
                cur.push(c);
            }
            ']' => {
                // Clamp so a stray ']' cannot push the depth negative.
                depth = (depth - 1).max(0);
                cur.push(c);
            }
            // Top-level separator: emit the pair without copying it.
            ';' if depth == 0 => out.push(std::mem::take(&mut cur)),
            _ => cur.push(c),
        }
    }
    if !cur.trim().is_empty() {
        out.push(cur);
    }
    out
}
272
273// ---------------------------------------------------------------------------
274// Condition-object parser.
275//
276// Accepts a comma-separated list of condition objects. Each object is
277// either:
278//
279//   * a blank-node body `[ a <cond-type>; <pred> <obj> ; ... ]`, or
280//   * an IRI reference (rare — usually the condition type is named
281//     inline as a blank node).
282//
283// Unknown `@type` values are preserved verbatim so
284// `validate_acl_document` can report the offending IRI in a 422.
285// ---------------------------------------------------------------------------
286fn parse_turtle_condition_objects(
287    input: &str,
288    prefixes: &HashMap<String, String>,
289) -> Vec<Condition> {
290    let mut out = Vec::new();
291    let mut remaining = input.trim().to_string();
292    loop {
293        let r = remaining.trim_start();
294        if r.is_empty() {
295            break;
296        }
297        if let Some(after_open) = r.strip_prefix('[') {
298            // Find the matching ']' honouring nesting + string content.
299            let mut depth: i32 = 1;
300            let mut idx = 0usize;
301            let mut in_str = false;
302            for (i, c) in after_open.char_indices() {
303                match c {
304                    '"' => in_str = !in_str,
305                    '[' if !in_str => depth += 1,
306                    ']' if !in_str => {
307                        depth -= 1;
308                        if depth == 0 {
309                            idx = i;
310                            break;
311                        }
312                    }
313                    _ => {}
314                }
315            }
316            if depth != 0 {
317                // Unbalanced — bail out on this object.
318                break;
319            }
320            let body = &after_open[..idx];
321            let rest = &after_open[idx + 1..];
322            if let Some(cond) = parse_turtle_condition_body(body, prefixes) {
323                out.push(cond);
324            }
325            remaining = rest.trim_start().to_string();
326        } else {
327            // IRI reference form — try to pop a term and treat it as an
328            // Unknown condition (we cannot resolve arbitrary IRIs to
329            // condition types without a registry lookup, so preserve).
330            let (tok, rest) = match turtle_pop_term(r) {
331                Some(v) => v,
332                None => break,
333            };
334            let iri = expand_curie_or_iri(&tok, prefixes);
335            out.push(Condition::Unknown { type_iri: iri });
336            remaining = rest.to_string();
337        }
338        let r = remaining.trim_start();
339        if let Some(after_comma) = r.strip_prefix(',') {
340            remaining = after_comma.to_string();
341        } else {
342            break;
343        }
344    }
345    out
346}
347
348fn parse_turtle_condition_body(
349    body: &str,
350    prefixes: &HashMap<String, String>,
351) -> Option<Condition> {
352    let mut type_iri: Option<String> = None;
353    let mut clients: Vec<String> = Vec::new();
354    let mut client_groups: Vec<String> = Vec::new();
355    let mut client_classes: Vec<String> = Vec::new();
356    let mut issuers: Vec<String> = Vec::new();
357    let mut issuer_groups: Vec<String> = Vec::new();
358    let mut issuer_classes: Vec<String> = Vec::new();
359
360    for pair in split_predicate_list(body) {
361        let pair = pair.trim();
362        if pair.is_empty() {
363            continue;
364        }
365        let (pred, rest) = match turtle_pop_term(pair) {
366            Some(v) => v,
367            None => continue,
368        };
369        let pred_expanded = expand_curie_or_iri(&pred, prefixes);
370        let objects = parse_object_list(rest.trim(), prefixes);
371        match pred_expanded.as_str() {
372            "a"
373            | "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
374            | "rdf:type" => {
375                if let Some(first) = objects.first() {
376                    type_iri = Some(normalise_condition_type(first));
377                }
378            }
379            "http://www.w3.org/ns/auth/acl#client" | "acl:client" => {
380                clients.extend(objects);
381            }
382            "http://www.w3.org/ns/auth/acl#clientGroup" | "acl:clientGroup" => {
383                client_groups.extend(objects);
384            }
385            "http://www.w3.org/ns/auth/acl#clientClass" | "acl:clientClass" => {
386                client_classes.extend(objects);
387            }
388            "http://www.w3.org/ns/auth/acl#issuer" | "acl:issuer" => {
389                issuers.extend(objects);
390            }
391            "http://www.w3.org/ns/auth/acl#issuerGroup" | "acl:issuerGroup" => {
392                issuer_groups.extend(objects);
393            }
394            "http://www.w3.org/ns/auth/acl#issuerClass" | "acl:issuerClass" => {
395                issuer_classes.extend(objects);
396            }
397            _ => {}
398        }
399    }
400
401    let t = type_iri?;
402    match t.as_str() {
403        "acl:ClientCondition" => Some(Condition::Client(ClientConditionBody {
404            client: strs_to_ids(clients),
405            client_group: strs_to_ids(client_groups),
406            client_class: strs_to_ids(client_classes),
407        })),
408        "acl:IssuerCondition" => Some(Condition::Issuer(IssuerConditionBody {
409            issuer: strs_to_ids(issuers),
410            issuer_group: strs_to_ids(issuer_groups),
411            issuer_class: strs_to_ids(issuer_classes),
412        })),
413        other => Some(Condition::Unknown {
414            type_iri: other.to_string(),
415        }),
416    }
417}
418
419fn strs_to_ids(items: Vec<String>) -> Option<IdOrIds> {
420    if items.is_empty() {
421        None
422    } else if items.len() == 1 {
423        Some(IdOrIds::Single(IdRef {
424            id: items.into_iter().next().unwrap(),
425        }))
426    } else {
427        Some(IdOrIds::Multiple(
428            items.into_iter().map(|id| IdRef { id }).collect(),
429        ))
430    }
431}
432
/// Folds the full-IRI spellings of the two known condition types (under
/// either the `http` or `https` ACL namespace) to their `acl:` curie, so
/// callers can branch on a single string. Any other value is returned
/// unchanged.
fn normalise_condition_type(raw: &str) -> String {
    const ACL_NAMESPACES: [&str; 2] = [
        "http://www.w3.org/ns/auth/acl#",
        "https://www.w3.org/ns/auth/acl#",
    ];
    let local = ACL_NAMESPACES
        .iter()
        .find_map(|ns| raw.strip_prefix(*ns));
    match local {
        Some("ClientCondition") => String::from("acl:ClientCondition"),
        Some("IssuerCondition") => String::from("acl:IssuerCondition"),
        _ => raw.to_owned(),
    }
}
444
/// Pops the first Turtle term off `input`, returning `(term, remainder)`.
///
/// Leading whitespace is skipped. Three shapes are distinguished by the
/// first character:
///
/// * `<` — an IRI reference; the term is the text between `<` and the
///   next `>` (brackets removed). `None` if there is no closing `>`.
/// * `"` — a string literal; not supported, always `None`.
/// * anything else — a bare token that ends at the first whitespace or
///   Turtle punctuation (`,`, `;`, `]`, `)`). The delimiter stays in the
///   remainder, so `acl:Write, acl:Control` yields `acl:Write` and leaves
///   the comma for the object-list parser. Empty input yields an empty
///   term.
fn turtle_pop_term(input: &str) -> Option<(String, String)> {
    let text = input.trim_start();
    match text.chars().next() {
        Some('<') => {
            let (iri, tail) = text[1..].split_once('>')?;
            Some((iri.to_owned(), tail.to_owned()))
        }
        Some('"') => None,
        _ => {
            let is_boundary =
                |c: char| c.is_whitespace() || matches!(c, ',' | ';' | ']' | ')');
            let (token, tail) = match text.find(is_boundary) {
                Some(cut) => text.split_at(cut),
                None => (text, ""),
            };
            Some((token.to_owned(), tail.to_owned()))
        }
    }
}
465
466fn parse_object_list(input: &str, prefixes: &HashMap<String, String>) -> Vec<String> {
467    let mut out = Vec::new();
468    let mut remaining = input.trim().to_string();
469    loop {
470        let r = remaining.trim_start();
471        if r.is_empty() {
472            break;
473        }
474        let (tok, rest) = match turtle_pop_term(r) {
475            Some(v) => v,
476            None => break,
477        };
478        out.push(expand_curie_or_iri(&tok, prefixes));
479        let r = rest.trim_start();
480        if let Some(after_comma) = r.strip_prefix(',') {
481            remaining = after_comma.to_string();
482        } else {
483            break;
484        }
485    }
486    out
487}
488
/// Expands a prefixed name (`prefix:local`) to a full IRI using
/// `prefixes`; any other token is returned trimmed but otherwise
/// unchanged.
///
/// * The keyword `a` is returned as-is.
/// * A token whose part before the first `:` is a declared prefix is
///   rewritten to `<namespace><local>`.
/// * A token whose part after the first `:` starts with `//` is treated
///   as an absolute IRI (`scheme://...`) and is never expanded. Bracketed
///   IRIs may reach this function with their `<>` already removed, so
///   without this check a document that declares a prefix named like a
///   scheme (e.g. `http`) would have every such IRI rewritten. An
///   unescaped `/` is not valid in a Turtle local name, so no legitimate
///   prefixed name is affected.
fn expand_curie_or_iri(tok: &str, prefixes: &HashMap<String, String>) -> String {
    let tok = tok.trim();
    if tok == "a" {
        return "a".to_string();
    }
    if let Some((p, local)) = tok.split_once(':') {
        let is_absolute_iri = local.starts_with("//");
        if !p.starts_with('<') && !is_absolute_iri {
            if let Some(base) = prefixes.get(p) {
                return format!("{base}{local}");
            }
        }
    }
    tok.to_string()
}
503
// ---------------------------------------------------------------------------
// Unit tests — byte cap enforced before parsing.
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Smallest useful ACL: a single public-read authorization.
    const TINY_ACL: &str = r#"
        @prefix acl: <http://www.w3.org/ns/auth/acl#> .
        @prefix foaf: <http://xmlns.com/foaf/0.1/> .

        <#public> a acl:Authorization ;
            acl:agentClass foaf:Agent ;
            acl:accessTo </> ;
            acl:mode acl:Read .
    "#;

    /// A document well under the cap parses and yields a graph.
    #[test]
    fn parse_turtle_acl_with_limit_accepts_small_doc() {
        let parsed = parse_turtle_acl_with_limit(TINY_ACL, 1_048_576);
        let doc = parsed.unwrap();
        assert!(doc.graph.is_some());
    }

    /// A 10-byte cap is far below the fixture size, so parsing must fail
    /// with a size-related error.
    #[test]
    fn parse_turtle_acl_with_limit_rejects_oversized_doc() {
        let msg = parse_turtle_acl_with_limit(TINY_ACL, 10)
            .unwrap_err()
            .to_string();
        let mentions_size = msg.contains("payload too large") || msg.contains("exceeds");
        assert!(mentions_size, "error should mention size: {msg}");
    }

    /// The cap is inclusive: `len == limit` passes, `len == limit + 1`
    /// is rejected.
    #[test]
    fn parse_turtle_acl_with_limit_boundary() {
        // Not valid Turtle, but the forgiving parser still returns `Ok`
        // once the size check has passed.
        let at_limit = "a".repeat(100);
        assert!(
            parse_turtle_acl_with_limit(&at_limit, 100).is_ok(),
            "exactly at limit should not reject"
        );

        let over_limit = "a".repeat(101);
        assert!(parse_turtle_acl_with_limit(&over_limit, 100).is_err());
    }

    /// Pins the compiled-in default cap to 1 MiB.
    #[test]
    fn default_limit_is_one_mib() {
        assert_eq!(MAX_ACL_BYTES, 1_048_576);
    }
}
559}