graph_rdfa_processor/
lib.rs

1use std::{borrow::Cow, collections::HashMap, error::Error, sync::Arc};
2
3mod constants;
4mod rdfa_elt;
5mod structs;
6#[cfg(test)]
7mod tests;
8
9use constants::{
10    COMMON_PREFIXES, IS_SPECIAL_NODE_FN, NODE_NS_TYPE, NODE_RDF_FIRST, NODE_RDF_NIL,
11    NODE_RDF_PLAIN_LITERAL, NODE_RDF_REST, NODE_RDFA_PATTERN_TYPE, NODE_RDFA_USES_VOCABULARY,
12    RESERVED_KEYWORDS, get_uuid,
13};
14use log::{debug, error};
15use rdfa_elt::RdfaElement;
16use scraper::{ElementRef, Selector};
17use url::{Origin, Url};
18
19use structs::{Context, DataTypeFromPattern, Literal, Node, Statement};
20
21pub use structs::RdfaGraph;
22
/// Bundle of everything `handle_children` needs once an element's own
/// attributes have been processed by `traverse_element`.
struct NodeContext<'a, 'b> {
    /// Element whose children are about to be visited.
    element_ref: &'b ElementRef<'a>,
    /// Evaluation context of the element; it is handed to each child as its parent context.
    ctx: Context<'a>,
    /// Output buffer for regular statements.
    stmts: &'b mut Vec<Statement<'a>>,
    /// Node chosen for this element; stored as `ctx.current_node` before the children run.
    current_node: Node<'a>,
    /// Pending `rel` predicates, exposed to the children through `ctx.in_rel`.
    rels: Option<Vec<Node<'a>>>,
    /// Pending `rev` predicates, exposed to the children through `ctx.in_rev`.
    revs: Option<Vec<Node<'a>>>,
    /// Buffer for statements that belong to lists under construction.
    in_list_stmts: &'b mut Vec<Statement<'a>>,
    /// Types still to be emitted as type statements on `current_node`.
    type_ofs: Option<Vec<Node<'a>>>,
    /// `rel` predicates inherited from the parent, linking the parent subject to `current_node`.
    parent_in_rel: Option<Vec<Node<'a>>>,
    /// `rev` predicates inherited from the parent, linking `current_node` to the parent subject.
    parent_in_rev: Option<Vec<Node<'a>>>,
    /// Context of the parent element, if any; used to look up the parent subject.
    parent: &'b Option<&'b Context<'a>>,
}
36
37impl<'a> RdfaGraph<'a> {
38    pub fn parse(
39        input: &ElementRef<'a>,
40        initial_context: Context<'a>,
41    ) -> Result<RdfaGraph<'a>, Box<dyn Error>> {
42        let mut triples = vec![];
43        let mut inlist_triples = vec![];
44        let well_known_prefix = initial_context.well_known_prefix;
45        if initial_context.empty_ref_node_substitute.is_empty() {
46            return Err(
47                "if you provide a context, you most provide an empty_ref_node_substitute property."
48                    .into(),
49            );
50        }
51        traverse_element(
52            input,
53            None,
54            initial_context,
55            &mut triples,
56            &mut inlist_triples,
57        )?;
58
59        // fixes examples/other/example0002.html
60        // when base ends with "/", inlist_triples is not append
61        // todo find a better fix
62        if !inlist_triples.is_empty() {
63            triples.append(&mut inlist_triples);
64        }
65
66        triples = copy_pattern(triples)?;
67
68        Ok(RdfaGraph {
69            statements: triples.into_iter().collect(),
70            well_known_prefix,
71        })
72    }
73
74    pub fn parse_str(
75        html: &'a str,
76        base: &'a str,
77        well_known_prefix: Option<&'a str>,
78    ) -> Result<String, Box<dyn Error>> {
79        let document = scraper::Html::parse_document(html);
80        let empty_ref_node_substitue = get_uuid();
81        let root = document.root_element();
82
83        let root_ctx = Context {
84            base,
85            empty_ref_node_substitute: &empty_ref_node_substitue,
86            well_known_prefix: well_known_prefix.filter(|f| !f.is_empty()),
87            ..Default::default()
88        };
89        RdfaGraph::parse(&root, root_ctx).map(|g| g.to_string())
90    }
91}
/// Processes a single element and then recurses into its children.
///
/// The function reads the RDFa attributes of `element_ref`, completes `ctx`
/// with values inherited from `parent` (vocab, prefixes, lang), picks exactly
/// one of the attribute-driven branches below to emit statements, and finally
/// delegates to `handle_children`.
///
/// * `parent` - context of the enclosing element, `None` for the root.
/// * `ctx` - context being built for this element.
/// * `stmts` - receives regular statements.
/// * `in_list_stmts` - receives statements that build lists.
///
/// Returns whatever `handle_children` returns, except for the typed in-list
/// branch, which returns the parent subject directly.
///
/// # Errors
///
/// Fails when the element cannot be wrapped in an `RdfaElement`, when a
/// required URI or literal cannot be resolved, or when a parent subject is
/// needed but missing.
fn traverse_element<'a, 'b>(
    element_ref: &'b ElementRef<'a>,
    parent: Option<&'b Context<'a>>,
    mut ctx: Context<'a>,
    stmts: &'b mut Vec<Statement<'a>>,
    in_list_stmts: &mut Vec<Statement<'a>>,
) -> Result<Option<Node<'a>>, Box<dyn Error>> {
    let mut elt = RdfaElement::new(element_ref)?;

    // The element's own vocab wins; otherwise inherit the parent's.
    ctx.vocab = elt.vocab.or_else(|| parent.as_ref().and_then(|p| p.vocab));

    ctx.base = elt.base.unwrap_or(ctx.base);

    let base = resolve_uri(ctx.base, &ctx, true)?;

    // A non-empty vocab is recorded as a statement on the base; an empty one
    // resets the vocab for this element.
    if let Some(vocab) = ctx.vocab.filter(|v| !v.is_empty()) {
        stmts.push(Statement {
            subject: base.clone(),
            predicate: NODE_RDFA_USES_VOCABULARY.clone(),
            object: resolve_uri(vocab, &ctx, false)?,
        })
    } else {
        ctx.vocab = None;
    }
    // Prefixes declared on the element replace (not extend) the parent's.
    ctx.prefixes = elt
        .prefix
        .map(parse_prefixes)
        .or_else(|| parent.map(|p| p.prefixes.clone()))
        .unwrap_or(ctx.prefixes);

    // True only for a value that is exactly "[]" once surrounding whitespace
    // is ignored.
    let is_empty_curie = |s: &str| {
        let mut s = s.trim();
        if s.starts_with('[') {
            s = &s[1..];
        } else {
            return false;
        }
        if s.ends_with(']') {
            s = &s[0..s.len() - 1];
        } else {
            return false;
        }
        s.is_empty()
    };

    // An empty-CURIE resource is ignored; an empty resource means the base.
    let resource = elt
        .resource
        .filter(|r| !is_empty_curie(r))
        .map(|c| if c.is_empty() { ctx.base } else { c });

    ctx.lang = elt
        .lang
        .or_else(|| parent.and_then(|p| p.lang))
        .or(ctx.lang);

    // Resolution failures for `about`, `src`/`href` are silently dropped.
    let mut about = elt.about.and_then(|a| resolve_uri(a, &ctx, true).ok());

    let mut rels = elt.rel.map(|r| parse_property_or_type_of(r, &ctx, true));
    let mut revs = elt.rev.map(|r| parse_property_or_type_of(r, &ctx, true));

    let mut parent_in_rel = parent.and_then(|c| c.in_rel.clone());
    let mut parent_in_rev = parent.and_then(|c| c.in_rev.clone());
    let mut parent_in_list = parent.and_then(|c| c.in_list.clone());

    let mut src_or_href = elt
        .src_or_href()
        .and_then(|v| resolve_uri(v, &ctx, true).ok());

    let mut type_ofs = elt.type_of.and_then(|t| {
        if t.trim().is_empty() {
            // use vocab
            resolve_uri(ctx.vocab.unwrap_or(ctx.base), &ctx, true)
                .ok()
                .map(|v| vec![v])
        } else {
            Some(parse_property_or_type_of(t, &ctx, true))
        }
    });

    let datatype = elt
        .datatype
        .and_then(|dt| match resolve_uri(dt, &ctx, false) {
            Ok(d) => Some(Box::new(d)),
            Err(e) => {
                debug!("could not parse {dt}. error {e}");
                None
            }
        });

    // Blank nodes are rejected as property predicates (`allow_b_node` = false).
    let mut predicates = elt
        .property
        .map(|p| parse_property_or_type_of(p, &ctx, false));

    // by default, current node set as the base unless it's a special node
    // check other/example0006 for special node
    let mut current_node = if !IS_SPECIAL_NODE_FN(&datatype) {
        base.clone()
    } else {
        make_bnode()
    };

    // if parent is inlist
    // The element contributes one list item per inherited list predicate.
    if let Some(parent_in_list) = parent_in_list.take() {
        let subject = get_parent_subject(&parent, &ctx)?;
        let obj = if let Some(resource) = resource
            .and_then(|r| resolve_uri(r, &ctx, true).ok())
            .map(|n| Node::Ref(Arc::new(n)))
            .or_else(|| src_or_href.clone())
        {
            resource
        } else {
            Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
        };
        for rel in parent_in_list {
            push_triples_inlist(in_list_stmts, &subject, rel, &obj);
        }
        current_node = subject;
    }
    // if current elt is inlist
    else if elt.is_inlist() {
        let mut in_rel = false;

        let subject = get_parent_subject(&parent, &ctx)?;

        if rels.is_some()
            && src_or_href.is_none()
            && predicates.is_none()
            && resource.is_none()
            && about.is_none()
        // empty list
        {
            if element_ref.children().count() != 0 {
                // example0013 && example0014
                if type_ofs.is_some() {
                    let Some(rels) = rels.take() else {
                        unreachable!()
                    };
                    // The typed element becomes a fresh blank node; its
                    // children are processed first, then the node (or the
                    // object already at the end of the list) is appended.
                    current_node = make_bnode();
                    handle_children(NodeContext {
                        element_ref,
                        ctx: ctx.clone(),
                        stmts,
                        current_node: current_node.clone(),
                        rels: None,
                        revs: revs.take(),
                        in_list_stmts,
                        type_ofs: type_ofs.take(),
                        parent_in_rel: parent_in_rel.take(),
                        parent_in_rev: parent_in_rev.take(),
                        parent: &parent,
                    })?;
                    for rel in rels {
                        let mut existing_rel_in_list = None;
                        if let Some(node) =
                            find_pos_last_node_in_inlist(in_list_stmts, &subject, &rel)
                                .and_then(|s| in_list_stmts.get_mut(s))
                                .filter(|p| p.object != *NODE_RDF_NIL)
                        {
                            existing_rel_in_list = Some(node.object.clone());
                        }

                        if let Some(existing_rel_in_list) = existing_rel_in_list {
                            push_triples_inlist(
                                in_list_stmts,
                                &subject,
                                rel,
                                &existing_rel_in_list,
                            );
                        } else {
                            push_triples_inlist(in_list_stmts, &subject, rel, &current_node);
                        }
                    }
                    return Ok(Some(subject));
                } else {
                    // Children will add the list items themselves.
                    ctx.in_list = rels.take();
                }
            } else {
                // No children at all: the predicates point straight at nil.
                push_triples(in_list_stmts, &subject, &rels.take(), &NODE_RDF_NIL);
            }
        } else if let Some(rels) = rels.take().filter(|r| !r.is_empty()) {
            in_rel = true;

            let obj = if let Some(resource) = resource
                .and_then(|r| resolve_uri(r, &ctx, true).ok())
                .map(|n| Node::Ref(Arc::new(n)))
                .or_else(|| src_or_href.clone())
            {
                resource
            } else {
                Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
            };
            for rel in rels {
                push_triples_inlist(in_list_stmts, &subject, rel, &obj);
            }
        }
        // The resource is used for the property items only if it was not
        // already consumed by the rel branch above.
        let obj = if let (Some(resource), false) = (resource, in_rel) {
            Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?))
        } else {
            Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
        };
        if let Some(predicates) = predicates.take() {
            for predicate in predicates {
                push_triples_inlist(in_list_stmts, &subject, predicate, &obj);
            }
        }

        current_node = subject;
    }
    // if there is a resource attr
    else if let Some(resource) = resource {
        let resource = Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?));

        if !elt.has_content_or_datatype() {
            // Under an inherited rel/rev, `about` takes precedence over the
            // resource as the object.
            let object = about
                .as_ref()
                .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
                .map(|a| Node::Ref(Arc::new(a.clone())))
                .unwrap_or(resource);
            current_node = object;
            let subject = about
                .take()
                .map(|a| Ok(Node::Ref(Arc::new(a))))
                .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;

            push_triples(stmts, &subject, &predicates, &current_node);

            if predicates.is_some() && type_ofs.is_none() {
                current_node = subject;
            } else {
                push_triples(stmts, &subject, &rels.take(), &current_node);
                push_triples(stmts, &current_node, &revs.take(), &subject);
            }
        } else {
            //example0020 && example0021
            let resource = about
                .as_ref()
                .map(|a| Node::Ref(Arc::new(a.clone())))
                .unwrap_or(resource);
            push_triples(
                stmts,
                &resource,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
            current_node = resource;
        }
    }
    // if there is no resource but about
    else if let Some(about) = about {
        // handle about case. set the context.
        // if property is present, children become objects of current.
        // `is_empty` is true when the raw `about` attribute is the empty CURIE.
        let is_empty = elt
            .about
            .filter(|a| !a.trim().is_empty() && is_empty_curie(a))
            .is_some();
        current_node = if !is_empty {
            Node::Ref(Arc::new(about))
        } else {
            current_node
        };

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
        );

        if let Some(src_or_href) = src_or_href.take() {
            push_triples(stmts, &current_node, &rels, &src_or_href);
            push_triples(stmts, &src_or_href, &revs, &current_node);
        }
        // Children of an empty-CURIE `about` hang off a fresh blank node.
        if is_empty {
            current_node = make_bnode();
        }
    }
    // now the interesting bits
    // src/href together with content or datatype: the link is the subject.
    else if src_or_href.is_some() && elt.has_content_or_datatype() {
        current_node = src_or_href.take().ok_or("no src")?;

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &extract_literal(&elt, &datatype, &ctx)?,
        );
    }
    // test 0303
    else if src_or_href.is_some() && (rels.is_some() || revs.is_some()) {
        let src_or_href = src_or_href.take().ok_or("no src")?;
        current_node = get_parent_subject(&parent, &ctx)
            .ok()
            .unwrap_or_else(make_bnode);

        // When a property is also present, rels that resolved to a term IRI
        // are dropped; the flags remember whether any were dropped
        // (`has_term`) and whether any survived (`emit_triple`).
        let mut has_term = false;
        let mut emit_triple = false;
        if elt.has_property() {
            rels = rels.take().map(|rs| {
                rs.into_iter()
                    .filter(|r| {
                        let m = matches!(r, Node::Ref(r) if matches!(r.as_ref(), Node::TermIri(_)));
                        if m {
                            has_term = true;
                        } else {
                            emit_triple = true;
                        }
                        !m
                    })
                    .collect()
            });
        }

        push_triples(stmts, &current_node, &rels, &src_or_href);
        push_triples(stmts, &src_or_href, &revs, &current_node);

        if has_term {
            // Clearing src/href before `extract_literal` changes what it sees
            // as the element's link attributes.
            if emit_triple {
                elt.src.take();
                elt.href.take();
            }

            push_triples(
                stmts,
                &current_node,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
        }
        // example0017
        if rels.is_some() && type_ofs.is_some() {
            if let Some(type_ofs) = type_ofs.take() {
                let pred = Some(vec![NODE_NS_TYPE.clone()]);

                for to in type_ofs {
                    push_triples(stmts, &src_or_href, &pred, &to);
                }
            }
            //example0018
            current_node = src_or_href.clone();
            rels.take();
        }
        // example0012
        if revs.is_some() {
            if predicates.is_some() {
                elt.src.take();
                elt.href.take();
                push_triples(
                    stmts,
                    &current_node,
                    &predicates,
                    &extract_literal(&elt, &datatype, &ctx)?,
                );
            }
            if let Some(type_ofs) = type_ofs.take() {
                let pred = Some(vec![NODE_NS_TYPE.clone()]);

                for to in type_ofs {
                    push_triples(stmts, &src_or_href, &pred, &to);
                }
            }
        }
    }
    // another case
    // typeof without resource/about and without a usable src/href + rel/rev.
    else if type_ofs.is_some() {
        if elt.has_property()
            && !elt.has_content_or_datatype()
            && (parent_in_rel.is_some() || parent_in_rev.is_some())
        {
            current_node = make_bnode();
            let node = src_or_href.take().unwrap_or_else(make_bnode);
            for to in type_ofs.take().iter().flatten() {
                push_triples(stmts, &node, &Some(vec![NODE_NS_TYPE.clone()]), to);
            }
            push_triples(stmts, &current_node, &predicates, &node);
        } else if rels.is_some() {
            current_node = make_bnode();

            for to in type_ofs.take().into_iter().flatten() {
                stmts.push(Statement {
                    subject: current_node.clone(),
                    predicate: NODE_NS_TYPE.clone(),
                    object: to,
                })
            }
            push_triples(stmts, &base, &rels.take(), &current_node);
        } else if !IS_SPECIAL_NODE_FN(&datatype) {
            // property shouldn't be in the list
            // fixme
            // True when no descendant matching the selector remains after
            // discarding those whose datatype is a special node.
            let child_with_rdfa_tag = element_ref
                .select(&Selector::parse(
                    "[href], [src], [resource], [property], [about]",
                )?)
                .filter(|e| {
                    RdfaElement::new(e)
                        .ok()
                        .and_then(|e2| e2.datatype)
                        .and_then(|dt| match resolve_uri(dt, &ctx, false).ok().map(Box::new) {
                            v @ Some(_) if IS_SPECIAL_NODE_FN(&v) => v,
                            _ => None,
                        })
                        .is_none()
                })
                .count()
                == 0;
            current_node = if let Some(src_or_href) = src_or_href.take() {
                src_or_href
            // not sure about this rule
            } else if elt.name == "body"
                || elt.name == "head"
                || child_with_rdfa_tag
                || parent.is_none()
            {
                base.clone()
            } else {
                make_bnode()
            };

            let subject = get_parent_subject(&parent, &ctx)
                .ok()
                .unwrap_or_else(make_bnode);

            push_triples(stmts, &subject, &predicates, &current_node);
        } else {
            // test examples/other/example0006.html
            push_triples(
                stmts,
                &current_node,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
        }
    }
    // another general case
    // The link is the subject only under an inherited rel/rev; otherwise the
    // parent subject is used.
    else {
        current_node = src_or_href
            .take()
            .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
            .map(Ok)
            .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
        );
    }

    // Whatever was not consumed by the branch above (rels, revs, type_ofs,
    // inherited rel/rev) is handed over for the children pass.
    handle_children(NodeContext {
        element_ref,
        ctx,
        stmts,
        current_node,
        rels,
        revs,
        in_list_stmts,
        type_ofs,
        parent_in_rel,
        parent_in_rev,
        parent: &parent,
    })
}
/// Finishes the processing of an element and visits its children.
///
/// In order, the function:
/// 1. emits one type statement on `current_node` per entry of `type_ofs`;
/// 2. links `current_node` to the parent subject through the inherited
///    rel/rev predicates, when there are any;
/// 3. stores `current_node`, `rels` and `revs` in the context and calls
///    `traverse_element` on every element child.
///
/// Returns the `current_node` held by the context after the last child.
///
/// # Errors
///
/// Fails with "in_rel: no parent node" when inherited rel/rev predicates
/// exist but no parent subject can be determined, and propagates errors from
/// `get_children` and `traverse_element`.
fn handle_children<'a>(
    NodeContext {
        element_ref,
        mut ctx,
        stmts,
        current_node,
        rels,
        revs,
        in_list_stmts,
        type_ofs,
        mut parent_in_rel,
        mut parent_in_rev,
        parent,
    }: NodeContext<'a, '_>,
) -> Result<Option<Node<'a>>, Box<dyn Error>> {
    if let Some(type_ofs) = type_ofs {
        for type_of in type_ofs {
            stmts.push(Statement {
                subject: current_node.clone(),
                predicate: NODE_NS_TYPE.clone(),
                object: type_of,
            })
        }
    }

    // Complete the rel/rev statements left open by the parent element.
    if parent_in_rel.is_some() || parent_in_rev.is_some() {
        let parent = get_parent_subject(parent, &ctx)
            .ok()
            .ok_or("in_rel: no parent node")?;
        push_triples(stmts, &parent, &parent_in_rel.take(), &current_node);
        push_triples(stmts, &current_node, &parent_in_rev.take(), &parent);
    }
    // From here on `ctx` is the parent context seen by the children.
    ctx.current_node = Some(current_node.clone());
    ctx.in_rel = rels.clone();
    ctx.in_rev = revs.clone();
    for child in get_children(element_ref)? {
        // Only element nodes are traversed; `wrap` yields `None` otherwise.
        if let Some(c) = ElementRef::wrap(child) {
            // Triples are also 'completed' if any one of @property, @rel or @rev are present.
            let triples_completed = (ctx.in_rel.is_some() || ctx.in_rev.is_some())
                && (c.attr("property").is_some()
                    || c.attr("rel").is_some()
                    || c.attr("rev").is_some())
                && (c.attr("about").is_none() && c.attr("typeof").is_none());

            if triples_completed {
                // The pending rel/rev are consumed (`take`) and attached to a
                // fresh blank node, which becomes the subject for this child
                // and for the following siblings until it is reset below.
                let b_node = make_bnode();
                push_triples(stmts, &current_node, &ctx.in_rel.take(), &b_node);
                push_triples(stmts, &b_node, &ctx.in_rev.take(), &current_node);

                ctx.current_node = Some(b_node);
            }
            // However, unlike the situation when @about or @typeof are present, all predicates are attached to one bnode
            // A child with @about or @typeof starts over from the element's
            // own node and its original rel/rev.
            if c.attr("about").is_some() || c.attr("typeof").is_some() {
                ctx.in_rel = rels.clone();
                ctx.in_rev = revs.clone();
                ctx.current_node = Some(current_node.clone());
            }
            // The child starts from defaults except for base, lang and the
            // blank-node substitute; the rest is derived from `ctx`, which is
            // passed as its parent.
            let child_ctx = Context {
                base: ctx.base,
                lang: ctx.lang,
                empty_ref_node_substitute: ctx.empty_ref_node_substitute,
                ..Default::default()
            };

            let node = traverse_element(&c, Some(&ctx), child_ctx, stmts, in_list_stmts)?;
            // Flush buffered list statements when the child resolved to a
            // node other than the current one.
            if node != ctx.current_node {
                stmts.append(in_list_stmts);
            }
        }
    }
    Ok(ctx.current_node.clone())
}
627fn extract_literal<'a>(
628    rdfa_el: &RdfaElement<'a, '_>,
629    datatype: &Option<Box<Node<'a>>>,
630    ctx: &Context<'a>,
631) -> Result<Node<'a>, &'static str> {
632    let plain_datatype = datatype
633        .as_ref()
634        .filter(|dt| dt.as_ref() == &*NODE_RDF_PLAIN_LITERAL)
635        .is_some();
636
637    let lang = ctx.lang.filter(|s| datatype.is_none() && !s.is_empty());
638    if let Some(value) = rdfa_el.src_or_href().filter(|_| {
639        !rdfa_el.has_about() && !rdfa_el.has_property() || !rdfa_el.has_content_or_datatype()
640    }) {
641        resolve_uri(value, ctx, true)
642    } else if let Some(content) = rdfa_el.content {
643        Ok(Node::Literal(Literal {
644            datatype: datatype.clone(),
645            value: Cow::Borrowed(content),
646            lang,
647        }))
648    } else if !plain_datatype && IS_SPECIAL_NODE_FN(datatype) {
649        Ok(Node::Literal(Literal {
650            value: Cow::Owned(rdfa_el.inner_html()),
651            datatype: datatype.clone(),
652            lang: None,
653        }))
654    } else if let Some(content) = rdfa_el.get_time() {
655        Ok(Node::Literal(Literal {
656            datatype: datatype
657                .clone()
658                .or_else(|| DataTypeFromPattern::date_time_from_pattern(content).map(Box::new)),
659            value: Cow::Borrowed(content),
660            lang: None,
661        }))
662    } else {
663        let datatype = if plain_datatype {
664            None
665        } else {
666            datatype.clone()
667        };
668        let lang = if plain_datatype { ctx.lang } else { lang };
669        let texts = rdfa_el.texts();
670        let text = if texts.is_empty() {
671            Cow::Borrowed("")
672        } else {
673            let text = texts
674                .iter()
675                .map(|t| t.to_string())
676                .collect::<Vec<_>>()
677                .join("");
678            Cow::Owned(text)
679        };
680        Ok(Node::Literal(Literal {
681            datatype,
682            value: text,
683            lang,
684        }))
685    }
686}
687fn get_parent_subject<'a>(
688    parent: &Option<&Context<'a>>,
689    ctx: &Context<'a>,
690) -> Result<Node<'a>, Box<dyn Error>> {
691    parent
692        .and_then(|p| p.current_node.clone())
693        .or_else(|| {
694            if parent.is_none() {
695                resolve_uri(ctx.base, ctx, true).ok()
696            } else {
697                None
698            }
699        })
700        .ok_or("no parent".into())
701}
702
703fn resolve_uri<'a>(
704    uri: &'a str,
705    ctx: &Context<'a>,
706    is_resource: bool,
707) -> Result<Node<'a>, &'static str> {
708    let uri = uri.trim();
709
710    // special case, see bug#
711    if let Ok(ref origin) = Url::parse(ctx.base).map(|u| u.origin())
712        && let Origin::Tuple(_, host, _) = origin
713    {
714        let host = &host.to_string();
715
716        if uri.starts_with(host) {
717            return Ok(Node::TermIri(Cow::Owned(
718                uri.replace(host, &origin.unicode_serialization()),
719            )));
720        }
721    };
722
723    let iri = Url::parse(uri);
724    let trailing_white_space = if ctx.base.ends_with('/')
725        || ctx.base.ends_with('#')
726        || uri.starts_with('/')
727        || uri.starts_with('#')
728    {
729        ""
730    } else {
731        "/"
732    };
733    match iri {
734        Ok(iri) if !iri.cannot_be_a_base() || iri.is_special() => {
735            // special case pct encoded, see other/example0004
736            if uri.contains(|c: char| c.is_whitespace() || c.is_control()) {
737                let mut new_uri = String::with_capacity(uri.len() * 125 / 100);
738                for c in uri.chars() {
739                    match c {
740                        '\n' => new_uri.push_str("%0A"),
741                        '\0' => new_uri.push_str("%00"),
742                        '\t' => new_uri.push_str("%09"),
743                        '\r' => new_uri.push_str("%0D"),
744                        ' ' => new_uri.push_str("%20"),
745                        c => new_uri.push(c),
746                    }
747                }
748                Ok(Node::Iri(Cow::Owned(new_uri)))
749            } else {
750                Ok(Node::Iri(Cow::Borrowed(uri)))
751            }
752        }
753
754        // Curie
755        Ok(iri) => {
756            if uri.starts_with("mail:") || uri.starts_with("tel:") {
757                Ok(Node::Iri(Cow::Borrowed(uri)))
758            } else if let Some((prefix, value)) = ctx
759                .prefixes
760                .iter()
761                .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
762            {
763                let iri = format!(
764                    "{value}{}",
765                    &uri.replacen(':', "", 1).trim()[prefix.len()..]
766                );
767                Ok(Node::Iri(Cow::Owned(iri)))
768            } else if let Some((prefix, value)) = COMMON_PREFIXES
769                .iter()
770                .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
771            {
772                let iri = format!(
773                    "{value}{}",
774                    &uri.replacen(':', "", 1).trim()[prefix.len()..]
775                );
776                Ok(Node::Iri(Cow::Owned(iri)))
777            } else {
778                Ok(Node::Iri(Cow::Owned(uri.to_string())))
779            }
780        }
781        Err(url::ParseError::RelativeUrlWithoutBase) => {
782            if let Ok((prefix, reference)) = parse_safe_curie(uri) {
783                let reference = reference.trim();
784                let prefix = prefix.trim();
785                if prefix == "_" {
786                    let id = if reference.is_empty() {
787                        ctx.empty_ref_node_substitute
788                    } else {
789                        reference
790                    };
791                    return Ok(Node::RefBlank(id));
792                } else if prefix.is_empty() && !reference.is_empty() {
793                    return Ok(Node::TermIri(Cow::Owned(
794                        [COMMON_PREFIXES[""], reference].join(""),
795                    )));
796                } else if let Some(prefix) = ctx
797                    .prefixes
798                    .get(prefix)
799                    .or_else(|| COMMON_PREFIXES.get(prefix))
800                {
801                    let reference = if reference.trim().is_empty() {
802                        reference.trim()
803                    } else {
804                        reference
805                    };
806                    return Ok(Node::Iri(Cow::Owned([prefix, reference].join(""))));
807                }
808            }
809            if is_resource || uri.starts_with('#') || uri.starts_with('/') {
810                let uri = if uri.starts_with("/") && ctx.base.ends_with("/") {
811                    &uri[1..]
812                } else {
813                    uri
814                };
815                Ok(Node::TermIri(Cow::Owned(
816                    [ctx.base, trailing_white_space, uri].join(""),
817                )))
818            } else if let Some(vocab) = ctx.vocab {
819                Ok(Node::TermIri(Cow::Owned([vocab, uri].join(""))))
820            } else if RESERVED_KEYWORDS
821                .iter()
822                .any(|w| uri.eq_ignore_ascii_case(w))
823            {
824                Ok(Node::TermIri(Cow::Borrowed(
825                    COMMON_PREFIXES[uri.to_lowercase().as_str()],
826                )))
827            } else {
828                debug!("could not determine base/vocab {:?}", ctx);
829                // Ok(Node::Iri(Cow::Borrowed(uri)))
830                Err("could not determine uri")
831            }
832        }
833        Err(e) => {
834            eprintln!("invalid uri {uri}. error: {e}");
835            Err("could not resolve uri")
836        }
837    }
838}
839
/// Splits a CURIE, optionally wrapped in square brackets, into the parts
/// before and after its first `:`.
///
/// Surrounding whitespace is ignored. Brackets are only removed when the value
/// starts with `[`.
///
/// # Errors
///
/// Returns "invalid SafeCurie" when an opening `[` has no closing `]`, and
/// "not a curie" when there is no `:` to split on.
fn parse_safe_curie(s: &str) -> Result<(&str, &str), &'static str> {
    let trimmed = s.trim();
    let curie = match trimmed.strip_prefix('[') {
        Some(rest) => rest.strip_suffix(']').ok_or("invalid SafeCurie")?,
        None => trimmed,
    };
    curie.split_once(':').ok_or("not a curie")
}
850
851fn parse_prefixes(s: &str) -> HashMap<&str, &str> {
852    s.split_whitespace()
853        .map(|s| s.trim())
854        .collect::<Vec<_>>()
855        .chunks_exact(2)
856        .map(|c| (c[0], c[1]))
857        .filter_map(|(s, p)| {
858            if let Ok((s, _)) = parse_safe_curie(s) {
859                Some((s, p))
860            } else {
861                error!("fixme! couldn't parse curie for {s}, {p}");
862                None
863            }
864        })
865        .collect()
866}
867
868fn parse_property_or_type_of<'a>(
869    s: &'a str,
870    ctx: &Context<'a>,
871    allow_b_node: bool,
872) -> Vec<Node<'a>> {
873    s.split_whitespace()
874        .filter_map(|uri| resolve_uri(uri, ctx, false).ok())
875        .filter(|node| allow_b_node || !matches!(node, Node::Blank(_) | Node::RefBlank(_)))
876        .map(|n| Node::Ref(Arc::new(n)))
877        .collect()
878}
879
880fn push_triples_inlist<'a>(
881    stmts: &mut Vec<Statement<'a>>,
882    subject: &Node<'a>,
883    predicate: Node<'a>,
884    obj: &Node<'a>,
885) {
886    let b_node = make_bnode();
887    stmts.push(Statement {
888        subject: b_node.clone(),
889        predicate: NODE_RDF_FIRST.clone(),
890        object: obj.clone(),
891    });
892
893    if let Some(node) =
894        find_pos_last_node_in_inlist(stmts, subject, &predicate).and_then(|pos| stmts.get_mut(pos))
895    {
896        node.object = b_node.clone();
897    } else {
898        // push the root of the list
899        stmts.push(Statement {
900            subject: subject.clone(),
901            predicate,
902            object: b_node.clone(),
903        });
904    }
905    stmts.push(Statement {
906        subject: b_node,
907        predicate: NODE_RDF_REST.clone(),
908        object: NODE_RDF_NIL.clone(),
909    });
910}
911fn find_pos_last_node_in_inlist<'a>(
912    stmts: &Vec<Statement<'a>>,
913    root_subject: &Node<'a>,
914    predicate: &Node<'a>,
915) -> Option<usize> {
916    fn find_res_nil<'a>(stmts: &Vec<Statement<'a>>, subject: &Node<'a>) -> Option<usize> {
917        let node = stmts
918            .iter()
919            .enumerate()
920            .find(|(_, stmt)| &stmt.subject == subject && stmt.predicate == *NODE_RDF_REST);
921
922        if let Some((pos, stmt)) = node {
923            if stmt.object == *NODE_RDF_NIL {
924                Some(pos)
925            } else {
926                find_res_nil(stmts, &stmt.object)
927            }
928        } else {
929            None
930        }
931    }
932    let root = stmts
933        .iter()
934        .find(|stmt| &stmt.subject == root_subject && &stmt.predicate == predicate);
935    if let Some(Statement { object, .. }) = root {
936        find_res_nil(stmts, object)
937    } else {
938        None
939    }
940}
941
942// skip when there are no rdfa attributes, see e.g examples/earl_html5/example0084.html
943#[inline]
944fn get_children<'a>(
945    element_ref: &ElementRef<'a>,
946) -> Result<Vec<ego_tree::NodeRef<'a, scraper::Node>>, &'static str> {
947    let mut res = vec![];
948    for c in element_ref.children() {
949        if c.value()
950            .as_element()
951            .filter(|e| e.attrs().count() == 0)
952            .is_some()
953        {
954            let child_ref = ElementRef::wrap(c).ok_or("not an element ref")?;
955            res.append(&mut get_children(&child_ref)?);
956        } else {
957            res.push(c);
958        }
959    }
960
961    Ok(res)
962}
963
964#[inline]
965fn make_bnode<'a>() -> Node<'a> {
966    Node::Blank(get_uuid())
967}
968
969#[inline]
970fn copy_pattern(triples: Vec<Statement<'_>>) -> Result<Vec<Statement<'_>>, Box<dyn Error>> {
971    let (pattern_type, pattern): (Vec<Statement>, Vec<Statement>) = triples
972        .into_iter()
973        .partition(|stmt| stmt.object == *NODE_RDFA_PATTERN_TYPE);
974
975    let (pattern_predicate, pattern): (Vec<Statement>, Vec<Statement>) = pattern
976        .into_iter()
977        .partition(|stmt| pattern_type.iter().any(|s| s.subject == stmt.subject));
978
979    let (pattern_subject, mut triples): (Vec<Statement>, Vec<Statement>) = pattern
980        .into_iter()
981        .partition(|stmt| pattern_predicate.iter().any(|s| s.subject == stmt.object));
982
983    // remove only if pattern referenced
984    let (mut unreferenced_pattern_predicate, pattern_predicate): (Vec<Statement>, Vec<Statement>) =
985        pattern_predicate
986            .into_iter()
987            .partition(|stmt| pattern_subject.iter().all(|s| s.object != stmt.subject));
988
989    let (mut unreferenced_pattern_type, _): (Vec<Statement>, Vec<Statement>) =
990        pattern_type.into_iter().partition(|stmt| {
991            unreferenced_pattern_predicate
992                .iter()
993                .any(|s| s.subject == stmt.subject)
994        });
995    triples.append(&mut unreferenced_pattern_predicate);
996    triples.append(&mut unreferenced_pattern_type);
997
998    for Statement {
999        subject, object, ..
1000    } in pattern_subject
1001    {
1002        for Statement {
1003            predicate,
1004            object: obj,
1005            ..
1006        } in pattern_predicate
1007            .iter()
1008            .filter(|stmt| object == stmt.subject)
1009        {
1010            triples.push(Statement {
1011                subject: subject.clone(),
1012                predicate: predicate.clone(),
1013                object: obj.clone(),
1014            })
1015        }
1016    }
1017
1018    Ok(triples)
1019}
1020
1021#[inline]
1022fn push_triples<'a>(
1023    stmts: &mut Vec<Statement<'a>>,
1024    subject: &Node<'a>,
1025    predicates: &Option<Vec<Node<'a>>>,
1026    object: &Node<'a>,
1027) {
1028    if let Some(predicate) = predicates {
1029        for predicate in predicate {
1030            stmts.push(Statement {
1031                subject: subject.clone(),
1032                predicate: predicate.clone(),
1033                object: object.clone(),
1034            });
1035        }
1036    }
1037}