graph_rdfa_processor/
lib.rs

1use std::{borrow::Cow, collections::HashMap, error::Error, sync::Arc};
2
3mod constants;
4mod rdfa_elt;
5mod structs;
6#[cfg(test)]
7mod tests;
8
9use constants::{
10    COMMON_PREFIXES, IS_SPECIAL_NODE_FN, NODE_NS_TYPE, NODE_RDF_FIRST, NODE_RDF_NIL,
11    NODE_RDF_PLAIN_LITERAL, NODE_RDF_REST, NODE_RDFA_PATTERN_TYPE, NODE_RDFA_USES_VOCABULARY,
12    RESERVED_KEYWORDS, get_uuid,
13};
14use log::{debug, error};
15use rdfa_elt::RdfaElement;
16use scraper::{ElementRef, Selector};
17use url::Url;
18
19use structs::{Context, DataTypeFromPattern, Literal, Node, Statement};
20
21pub use structs::RdfaGraph;
22
/// Bundle of everything `handle_children` needs once `traverse_element` has
/// finished processing an element's own attributes.
struct NodeContext<'a, 'b> {
    /// Element whose children are about to be visited.
    element_ref: &'b ElementRef<'a>,
    /// Evaluation context of the element; it is handed to each child as its parent context.
    ctx: Context<'a>,
    /// Output buffer for regular triples.
    stmts: &'b mut Vec<Statement<'a>>,
    /// Node used as subject of the `typeof` triples and stored as `ctx.current_node` for the children.
    current_node: Node<'a>,
    /// Still-pending `rel` predicates; copied into `ctx.in_rel` for the children.
    rels: Option<Vec<Node<'a>>>,
    /// Still-pending `rev` predicates; copied into `ctx.in_rev` for the children.
    revs: Option<Vec<Node<'a>>>,
    /// Output buffer for list (`rdf:first` / `rdf:rest`) triples.
    in_list_stmts: &'b mut Vec<Statement<'a>>,
    /// Types to emit for `current_node`.
    type_ofs: Option<Vec<Node<'a>>>,
    /// `rel` predicates inherited from the parent; linked as parent subject -> `current_node`.
    parent_in_rel: Option<Vec<Node<'a>>>,
    /// `rev` predicates inherited from the parent; linked as `current_node` -> parent subject.
    parent_in_rev: Option<Vec<Node<'a>>>,
    /// Context of the parent element, if there is one.
    parent: &'b Option<&'b Context<'a>>,
}
36
37impl<'a> RdfaGraph<'a> {
38    pub fn parse(
39        input: &ElementRef<'a>,
40        initial_context: Context<'a>,
41    ) -> Result<RdfaGraph<'a>, Box<dyn Error>> {
42        let mut triples = vec![];
43        let mut inlist_triples = vec![];
44        let well_known_prefix = initial_context.well_known_prefix;
45        if initial_context.empty_ref_node_substitute.is_empty() {
46            return Err(
47                "if you provide a context, you most provide an empty_ref_node_substitute property."
48                    .into(),
49            );
50        }
51        traverse_element(
52            input,
53            None,
54            initial_context,
55            &mut triples,
56            &mut inlist_triples,
57        )?;
58
59        // fixes examples/other/example0002.html
60        // when base ends with "/", inlist_triples is not append
61        // todo find a better fix
62        if !inlist_triples.is_empty() {
63            triples.append(&mut inlist_triples);
64        }
65
66        triples = copy_pattern(triples)?;
67
68        Ok(RdfaGraph {
69            statements: triples.into_iter().collect(),
70            well_known_prefix,
71        })
72    }
73
74    pub fn parse_str(
75        html: &'a str,
76        base: &'a str,
77        well_known_prefix: Option<&'a str>,
78    ) -> Result<String, Box<dyn Error>> {
79        let document = scraper::Html::parse_document(html);
80        let empty_ref_node_substitue = get_uuid();
81        let root = document.root_element();
82
83        let root_ctx = Context {
84            base,
85            empty_ref_node_substitute: &empty_ref_node_substitue,
86            well_known_prefix: well_known_prefix.filter(|f| !f.is_empty()),
87            ..Default::default()
88        };
89        RdfaGraph::parse(&root, root_ctx).map(|g| g.to_string())
90    }
91}
/// Processes one element and, through `handle_children`, its whole subtree.
///
/// The function first derives the element's evaluation context (vocab, base,
/// prefixes, language) from its own attributes and from `parent`, then picks
/// exactly one branch of a long `if / else if` chain depending on which RDFa
/// attributes are present. Each branch pushes triples into `stmts` (or list
/// triples into `in_list_stmts`) and decides what `current_node` is for the
/// children.
///
/// * `element_ref` - element to process.
/// * `parent` - context of the parent element, `None` for the root.
/// * `ctx` - context pre-filled by the caller; it is completed here.
/// * `stmts` - output buffer for regular triples.
/// * `in_list_stmts` - output buffer for list triples.
///
/// Returns the value produced by `handle_children`, except for the typed
/// `inlist` + `rel` case, which returns the parent subject directly.
///
/// # Errors
///
/// Fails when the element cannot be wrapped in an `RdfaElement`, when the base
/// or a required URI cannot be resolved, or when a parent subject is required
/// but missing.
fn traverse_element<'a, 'b>(
    element_ref: &'b ElementRef<'a>,
    parent: Option<&'b Context<'a>>,
    mut ctx: Context<'a>,
    stmts: &'b mut Vec<Statement<'a>>,
    in_list_stmts: &mut Vec<Statement<'a>>,
) -> Result<Option<Node<'a>>, Box<dyn Error>> {
    let mut elt = RdfaElement::new(element_ref)?;

    // the element's own vocab wins, otherwise the parent's vocab is inherited
    ctx.vocab = elt.vocab.or_else(|| parent.as_ref().and_then(|p| p.vocab));

    // an explicit base on the element overrides the one received through ctx
    ctx.base = elt.base.unwrap_or(ctx.base);

    let base = resolve_uri(ctx.base, &ctx, true)?;

    // a non-empty vocab is recorded as a NODE_RDFA_USES_VOCABULARY triple on the base;
    // an empty vocab is treated as "no vocab"
    if let Some(vocab) = ctx.vocab.filter(|v| !v.is_empty()) {
        stmts.push(Statement {
            subject: base.clone(),
            predicate: NODE_RDFA_USES_VOCABULARY.clone(),
            object: resolve_uri(vocab, &ctx, false)?,
        })
    } else {
        ctx.vocab = None;
    }
    // prefixes: the element's own declaration if present, else a copy of the parent's,
    // else whatever ctx already holds.
    // NOTE(review): an element-level declaration replaces the parent's prefixes
    // rather than being merged with them - confirm this is intended.
    ctx.prefixes = elt
        .prefix
        .map(parse_prefixes)
        .or_else(|| parent.map(|p| p.prefixes.clone()))
        .unwrap_or(ctx.prefixes);

    // true only when the trimmed value is exactly "[]"
    let is_empty_curie = |s: &str| {
        let mut s = s.trim();
        if s.starts_with('[') {
            s = &s[1..];
        } else {
            return false;
        }
        if s.ends_with(']') {
            s = &s[0..s.len() - 1];
        } else {
            return false;
        }
        s.is_empty()
    };

    // a "[]" resource is ignored; an empty resource stands for the base
    let resource = elt
        .resource
        .filter(|r| !is_empty_curie(r))
        .map(|c| if c.is_empty() { ctx.base } else { c });

    // language: element, then parent, then the value already in ctx
    ctx.lang = elt
        .lang
        .or_else(|| parent.and_then(|p| p.lang))
        .or(ctx.lang);

    // resolution failures below are swallowed on purpose (`.ok()`): the attribute
    // is then handled as if it were absent
    let mut about = elt.about.and_then(|a| resolve_uri(a, &ctx, true).ok());

    let mut rels = elt.rel.map(|r| parse_property_or_type_of(r, &ctx, true));
    let mut revs = elt.rev.map(|r| parse_property_or_type_of(r, &ctx, true));

    let mut parent_in_rel = parent.and_then(|c| c.in_rel.clone());
    let mut parent_in_rev = parent.and_then(|c| c.in_rev.clone());
    let mut parent_in_list = parent.and_then(|c| c.in_list.clone());

    let mut src_or_href = elt
        .src_or_href()
        .and_then(|v| resolve_uri(v, &ctx, true).ok());

    let mut type_ofs = elt.type_of.and_then(|t| {
        if t.trim().is_empty() {
            // use vocab
            resolve_uri(ctx.vocab.unwrap_or(ctx.base), &ctx, true)
                .ok()
                .map(|v| vec![v])
        } else {
            Some(parse_property_or_type_of(t, &ctx, true))
        }
    });

    // an unresolvable datatype is logged at debug level and dropped
    let datatype = elt
        .datatype
        .and_then(|dt| match resolve_uri(dt, &ctx, false) {
            Ok(d) => Some(Box::new(d)),
            Err(e) => {
                debug!("could not parse {dt}. error {e}");
                None
            }
        });

    // predicates come from @property; blank nodes are filtered out (allow_b_node = false)
    let mut predicates = elt
        .property
        .map(|p| parse_property_or_type_of(p, &ctx, false));

    // by default, current node set as the base unless it's a special node
    // check other/example0006 for special node
    let mut current_node = if !IS_SPECIAL_NODE_FN(&datatype) {
        base.clone()
    } else {
        make_bnode()
    };

    // if parent is inlist
    if let Some(parent_in_list) = parent_in_list.take() {
        let subject = get_parent_subject(&parent, &ctx)?;
        // list item: resolved resource, else src/href, else the literal value
        let obj = if let Some(resource) = resource
            .and_then(|r| resolve_uri(r, &ctx, true).ok())
            .map(|n| Node::Ref(Arc::new(n)))
            .or_else(|| src_or_href.clone())
        {
            resource
        } else {
            Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
        };
        for rel in parent_in_list {
            push_triples_inlist(in_list_stmts, &subject, rel, &obj);
        }
        current_node = subject;
    }
    // if current elt is inlist
    else if elt.is_inlist() {
        let mut in_rel = false;

        let subject = get_parent_subject(&parent, &ctx)?;

        if rels.is_some()
            && src_or_href.is_none()
            && predicates.is_none()
            && resource.is_none()
            && about.is_none()
        // empty list
        {
            if element_ref.children().count() != 0 {
                // example0013 && example0014
                if type_ofs.is_some() {
                    let Some(rels) = rels.take() else {
                        unreachable!()
                    };
                    current_node = make_bnode();
                    // children are processed first, against a fresh blank node,
                    // before the list entries for `rels` are written
                    handle_children(NodeContext {
                        element_ref,
                        ctx: ctx.clone(),
                        stmts,
                        current_node: current_node.clone(),
                        rels: None,
                        revs: revs.take(),
                        in_list_stmts,
                        type_ofs: type_ofs.take(),
                        parent_in_rel: parent_in_rel.take(),
                        parent_in_rev: parent_in_rev.take(),
                        parent: &parent,
                    })?;
                    for rel in rels {
                        // if a list already exists for (subject, rel), its last
                        // rdf:rest statement is looked up; its object is reused
                        // only when it is not rdf:nil
                        let mut existing_rel_in_list = None;
                        if let Some(node) =
                            find_pos_last_node_in_inlist(in_list_stmts, &subject, &rel)
                                .and_then(|s| in_list_stmts.get_mut(s))
                                .filter(|p| p.object != *NODE_RDF_NIL)
                        {
                            existing_rel_in_list = Some(node.object.clone());
                        }

                        if let Some(existing_rel_in_list) = existing_rel_in_list {
                            push_triples_inlist(
                                in_list_stmts,
                                &subject,
                                rel,
                                &existing_rel_in_list,
                            );
                        } else {
                            push_triples_inlist(in_list_stmts, &subject, rel, &current_node);
                        }
                    }
                    // children were already handled above, so return right away
                    return Ok(Some(subject));
                } else {
                    // let the children fill the list
                    ctx.in_list = rels.take();
                }
            } else {
                // no children at all: the list is rdf:nil
                push_triples(in_list_stmts, &subject, &rels.take(), &NODE_RDF_NIL);
            }
        } else if let Some(rels) = rels.take().filter(|r| !r.is_empty()) {
            in_rel = true;

            let obj = if let Some(resource) = resource
                .and_then(|r| resolve_uri(r, &ctx, true).ok())
                .map(|n| Node::Ref(Arc::new(n)))
                .or_else(|| src_or_href.clone())
            {
                resource
            } else {
                Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
            };
            for rel in rels {
                push_triples_inlist(in_list_stmts, &subject, rel, &obj);
            }
        }
        // object for @property entries: the resource unless it was already
        // consumed by @rel above, otherwise the literal value
        let obj = if let (Some(resource), false) = (resource, in_rel) {
            Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?))
        } else {
            Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
        };
        if let Some(predicates) = predicates.take() {
            for predicate in predicates {
                push_triples_inlist(in_list_stmts, &subject, predicate, &obj);
            }
        }

        current_node = subject;
    }
    // if there is a resource attr
    else if let Some(resource) = resource {
        let resource = Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?));

        if !elt.has_content_or_datatype() {
            // when the parent has a pending rel/rev and @about is present,
            // @about (not @resource) becomes the object
            let object = about
                .as_ref()
                .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
                .map(|a| Node::Ref(Arc::new(a.clone())))
                .unwrap_or(resource);
            current_node = object;
            // subject: @about if present, otherwise the parent's subject
            let subject = about
                .take()
                .map(|a| Ok(Node::Ref(Arc::new(a))))
                .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;

            push_triples(stmts, &subject, &predicates, &current_node);

            if predicates.is_some() && type_ofs.is_none() {
                current_node = subject;
            } else {
                push_triples(stmts, &subject, &rels.take(), &current_node);
                push_triples(stmts, &current_node, &revs.take(), &subject);
            }
        } else {
            //example0020 && example0021
            let resource = about
                .as_ref()
                .map(|a| Node::Ref(Arc::new(a.clone())))
                .unwrap_or(resource);
            push_triples(
                stmts,
                &resource,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
            current_node = resource;
        }
    }
    // if there is no resource but about
    else if let Some(about) = about {
        // handle about case. set the context.
        // if property is present, children become objects of current.
        // `is_empty` is true when the raw @about is the empty safe CURIE "[]"
        let is_empty = elt
            .about
            .filter(|a| !a.trim().is_empty() && is_empty_curie(a))
            .is_some();
        current_node = if !is_empty {
            Node::Ref(Arc::new(about))
        } else {
            current_node
        };

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
        );

        if let Some(src_or_href) = src_or_href.take() {
            push_triples(stmts, &current_node, &rels, &src_or_href);
            push_triples(stmts, &src_or_href, &revs, &current_node);
        }
        // with about="[]" the children hang off a fresh blank node
        if is_empty {
            current_node = make_bnode();
        }
    }
    // now the interesting bits
    else if src_or_href.is_some() && elt.has_content_or_datatype() {
        current_node = src_or_href.take().ok_or("no src")?;

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &extract_literal(&elt, &datatype, &ctx)?,
        );
    }
    // test 0303
    else if src_or_href.is_some() && (rels.is_some() || revs.is_some()) {
        let src_or_href = src_or_href.take().ok_or("no src")?;
        current_node = get_parent_subject(&parent, &ctx)
            .ok()
            .unwrap_or_else(make_bnode);

        // when @property is also present, rel values that resolved to a term
        // (Node::TermIri) are removed from `rels`; `has_term` records that at
        // least one was removed, `emit_triple` that at least one was kept
        let mut has_term = false;
        let mut emit_triple = false;
        if elt.has_property() {
            rels = rels.take().map(|rs| {
                rs.into_iter()
                    .filter(|r| {
                        let m = matches!(r, Node::Ref(r) if matches!(r.as_ref(), Node::TermIri(_)));
                        if m {
                            has_term = true;
                        } else {
                            emit_triple = true;
                        }
                        !m
                    })
                    .collect()
            });
        }

        push_triples(stmts, &current_node, &rels, &src_or_href);
        push_triples(stmts, &src_or_href, &revs, &current_node);

        if has_term {
            if emit_triple {
                // clear src/href before extract_literal reads the element again
                elt.src.take();
                elt.href.take();
            }

            push_triples(
                stmts,
                &current_node,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
        }
        // example0017
        if rels.is_some() && type_ofs.is_some() {
            if let Some(type_ofs) = type_ofs.take() {
                let pred = Some(vec![NODE_NS_TYPE.clone()]);

                for to in type_ofs {
                    push_triples(stmts, &src_or_href, &pred, &to);
                }
            }
            //example0018
            current_node = src_or_href.clone();
            rels.take();
        }
        // example0012
        if revs.is_some() {
            if predicates.is_some() {
                elt.src.take();
                elt.href.take();
                push_triples(
                    stmts,
                    &current_node,
                    &predicates,
                    &extract_literal(&elt, &datatype, &ctx)?,
                );
            }
            if let Some(type_ofs) = type_ofs.take() {
                let pred = Some(vec![NODE_NS_TYPE.clone()]);

                for to in type_ofs {
                    push_triples(stmts, &src_or_href, &pred, &to);
                }
            }
        }
    }
    // another case
    else if type_ofs.is_some() {
        if elt.has_property()
            && !elt.has_content_or_datatype()
            && (parent_in_rel.is_some() || parent_in_rev.is_some())
        {
            // the typed node is src/href (or a new blank node) and becomes the
            // object of @property on another fresh blank node
            current_node = make_bnode();
            let node = src_or_href.take().unwrap_or_else(make_bnode);
            for to in type_ofs.take().iter().flatten() {
                push_triples(stmts, &node, &Some(vec![NODE_NS_TYPE.clone()]), to);
            }
            push_triples(stmts, &current_node, &predicates, &node);
        } else if rels.is_some() {
            current_node = make_bnode();

            for to in type_ofs.take().into_iter().flatten() {
                stmts.push(Statement {
                    subject: current_node.clone(),
                    predicate: NODE_NS_TYPE.clone(),
                    object: to,
                })
            }
            push_triples(stmts, &base, &rels.take(), &current_node);
        } else if !IS_SPECIAL_NODE_FN(&datatype) {
            // property shouldn't be in the list
            // fixme
            // true when no descendant matching the selector remains after
            // discarding those whose datatype resolves to a special node
            let child_with_rdfa_tag = element_ref
                .select(&Selector::parse(
                    "[href], [src], [resource], [property], [about]",
                )?)
                .filter(|e| {
                    RdfaElement::new(e)
                        .ok()
                        .and_then(|e2| e2.datatype)
                        .and_then(|dt| match resolve_uri(dt, &ctx, false).ok().map(Box::new) {
                            v @ Some(_) if IS_SPECIAL_NODE_FN(&v) => v,
                            _ => None,
                        })
                        .is_none()
                })
                .count()
                == 0;
            current_node = if let Some(src_or_href) = src_or_href.take() {
                src_or_href
            // not sure about this rule
            } else if elt.name == "body"
                || elt.name == "head"
                || child_with_rdfa_tag
                || parent.is_none()
            {
                base.clone()
            } else {
                make_bnode()
            };

            let subject = get_parent_subject(&parent, &ctx)
                .ok()
                .unwrap_or_else(make_bnode);

            push_triples(stmts, &subject, &predicates, &current_node);
        } else {
            // test examples/other/example0006.html
            push_triples(
                stmts,
                &current_node,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
        }
    }
    // another general case
    else {
        // src/href is the subject only when the parent has a pending rel/rev;
        // otherwise the parent's subject is used
        current_node = src_or_href
            .take()
            .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
            .map(Ok)
            .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
        );
    }

    // whatever was not consumed (`take()`n) by the branch above is forwarded
    handle_children(NodeContext {
        element_ref,
        ctx,
        stmts,
        current_node,
        rels,
        revs,
        in_list_stmts,
        type_ofs,
        parent_in_rel,
        parent_in_rev,
        parent: &parent,
    })
}
554fn handle_children<'a>(
555    NodeContext {
556        element_ref,
557        mut ctx,
558        stmts,
559        current_node,
560        rels,
561        revs,
562        in_list_stmts,
563        type_ofs,
564        mut parent_in_rel,
565        mut parent_in_rev,
566        parent,
567    }: NodeContext<'a, '_>,
568) -> Result<Option<Node<'a>>, Box<dyn Error>> {
569    if let Some(type_ofs) = type_ofs {
570        for type_of in type_ofs {
571            stmts.push(Statement {
572                subject: current_node.clone(),
573                predicate: NODE_NS_TYPE.clone(),
574                object: type_of,
575            })
576        }
577    }
578
579    if parent_in_rel.is_some() || parent_in_rev.is_some() {
580        let parent = get_parent_subject(parent, &ctx)
581            .ok()
582            .ok_or("in_rel: no parent node")?;
583        push_triples(stmts, &parent, &parent_in_rel.take(), &current_node);
584        push_triples(stmts, &current_node, &parent_in_rev.take(), &parent);
585    }
586    ctx.current_node = Some(current_node.clone());
587    ctx.in_rel = rels.clone();
588    ctx.in_rev = revs.clone();
589    for child in get_children(element_ref)? {
590        if let Some(c) = ElementRef::wrap(child) {
591            // Triples are also 'completed' if any one of @property, @rel or @rev are present.
592            let triples_completed = (ctx.in_rel.is_some() || ctx.in_rev.is_some())
593                && (c.attr("property").is_some()
594                    || c.attr("rel").is_some()
595                    || c.attr("rev").is_some())
596                && (c.attr("about").is_none() && c.attr("typeof").is_none());
597
598            if triples_completed {
599                // Triples are also 'completed' if any one of @property, @rel or @rev are present.
600                let b_node = make_bnode();
601                push_triples(stmts, &current_node, &ctx.in_rel.take(), &b_node);
602                push_triples(stmts, &b_node, &ctx.in_rev.take(), &current_node);
603
604                ctx.current_node = Some(b_node);
605            }
606            // However, unlike the situation when @about or @typeof are present, all predicates are attached to one bnode
607            if c.attr("about").is_some() || c.attr("typeof").is_some() {
608                ctx.in_rel = rels.clone();
609                ctx.in_rev = revs.clone();
610                ctx.current_node = Some(current_node.clone());
611            }
612            let child_ctx = Context {
613                base: ctx.base,
614                lang: ctx.lang,
615                empty_ref_node_substitute: ctx.empty_ref_node_substitute,
616                ..Default::default()
617            };
618
619            let node = traverse_element(&c, Some(&ctx), child_ctx, stmts, in_list_stmts)?;
620            if node != ctx.current_node {
621                stmts.append(in_list_stmts);
622            }
623        }
624    }
625    Ok(ctx.current_node.clone())
626}
627fn extract_literal<'a>(
628    rdfa_el: &RdfaElement<'a, '_>,
629    datatype: &Option<Box<Node<'a>>>,
630    ctx: &Context<'a>,
631) -> Result<Node<'a>, &'static str> {
632    let plain_datatype = datatype
633        .as_ref()
634        .filter(|dt| dt.as_ref() == &*NODE_RDF_PLAIN_LITERAL)
635        .is_some();
636
637    let lang = ctx.lang.filter(|s| datatype.is_none() && !s.is_empty());
638    if let Some(value) = rdfa_el.src_or_href().filter(|_| {
639        !rdfa_el.has_about() && !rdfa_el.has_property() || !rdfa_el.has_content_or_datatype()
640    }) {
641        resolve_uri(value, ctx, true)
642    } else if let Some(content) = rdfa_el.content {
643        Ok(Node::Literal(Literal {
644            datatype: datatype.clone(),
645            value: Cow::Borrowed(content),
646            lang,
647        }))
648    } else if !plain_datatype && IS_SPECIAL_NODE_FN(datatype) {
649        Ok(Node::Literal(Literal {
650            value: Cow::Owned(rdfa_el.inner_html()),
651            datatype: datatype.clone(),
652            lang: None,
653        }))
654    } else if let Some(content) = rdfa_el.get_time() {
655        Ok(Node::Literal(Literal {
656            datatype: datatype
657                .clone()
658                .or_else(|| DataTypeFromPattern::date_time_from_pattern(content).map(Box::new)),
659            value: Cow::Borrowed(content),
660            lang: None,
661        }))
662    } else {
663        let datatype = if plain_datatype {
664            None
665        } else {
666            datatype.clone()
667        };
668        let lang = if plain_datatype { ctx.lang } else { lang };
669        let texts = rdfa_el.texts();
670        let text = if texts.is_empty() {
671            Cow::Borrowed("")
672        } else {
673            let text = texts
674                .iter()
675                .map(|t| t.to_string())
676                .collect::<Vec<_>>()
677                .join("");
678            Cow::Owned(text)
679        };
680        Ok(Node::Literal(Literal {
681            datatype,
682            value: text,
683            lang,
684        }))
685    }
686}
687fn get_parent_subject<'a>(
688    parent: &Option<&Context<'a>>,
689    ctx: &Context<'a>,
690) -> Result<Node<'a>, Box<dyn Error>> {
691    parent
692        .and_then(|p| p.current_node.clone())
693        .or_else(|| {
694            if parent.is_none() {
695                resolve_uri(ctx.base, ctx, true).ok()
696            } else {
697                None
698            }
699        })
700        .ok_or("no parent".into())
701}
702fn resolve_uri<'a>(
703    uri: &'a str,
704    ctx: &Context<'a>,
705    is_resource: bool,
706) -> Result<Node<'a>, &'static str> {
707    let uri = uri.trim();
708
709    let iri = Url::parse(uri);
710    let trailing_white_space = if ctx.base.ends_with('/')
711        || ctx.base.ends_with('#')
712        || uri.starts_with('/')
713        || uri.starts_with('#')
714    {
715        ""
716    } else {
717        "/"
718    };
719    match iri {
720        Ok(iri) if !iri.cannot_be_a_base() || iri.is_special() => {
721            // special case pct encoded, see other/example0004
722            if uri.contains(|c: char| c.is_whitespace() || c.is_control()) {
723                let mut new_uri = String::with_capacity(uri.len() * 125 / 100);
724                for c in uri.chars() {
725                    match c {
726                        '\n' => new_uri.push_str("%0A"),
727                        '\0' => new_uri.push_str("%00"),
728                        '\t' => new_uri.push_str("%09"),
729                        '\r' => new_uri.push_str("%0D"),
730                        ' ' => new_uri.push_str("%20"),
731                        c => new_uri.push(c),
732                    }
733                }
734                Ok(Node::Iri(Cow::Owned(new_uri)))
735            } else {
736                Ok(Node::Iri(Cow::Borrowed(uri)))
737            }
738        }
739
740        // Curie
741        Ok(iri) => {
742            if uri.starts_with("mail:") || uri.starts_with("tel:") {
743                Ok(Node::Iri(Cow::Borrowed(uri)))
744            } else if let Some((prefix, value)) = ctx
745                .prefixes
746                .iter()
747                .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
748            {
749                let iri = format!(
750                    "{value}{}",
751                    &uri.replacen(':', "", 1).trim()[prefix.len()..]
752                );
753                Ok(Node::Iri(Cow::Owned(iri)))
754            } else if let Some((prefix, value)) = COMMON_PREFIXES
755                .iter()
756                .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
757            {
758                let iri = format!(
759                    "{value}{}",
760                    &uri.replacen(':', "", 1).trim()[prefix.len()..]
761                );
762                Ok(Node::Iri(Cow::Owned(iri)))
763            } else {
764                Ok(Node::Iri(Cow::Owned(uri.to_string())))
765            }
766        }
767        Err(url::ParseError::RelativeUrlWithoutBase) => {
768            if let Ok((prefix, reference)) = parse_safe_curie(uri) {
769                let reference = reference.trim();
770                let prefix = prefix.trim();
771                if prefix == "_" {
772                    let id = if reference.is_empty() {
773                        ctx.empty_ref_node_substitute
774                    } else {
775                        reference
776                    };
777                    return Ok(Node::RefBlank(id));
778                } else if prefix.is_empty() && !reference.is_empty() {
779                    return Ok(Node::TermIri(Cow::Owned(
780                        [COMMON_PREFIXES[""], reference].join(""),
781                    )));
782                } else if let Some(prefix) = ctx
783                    .prefixes
784                    .get(prefix)
785                    .or_else(|| COMMON_PREFIXES.get(prefix))
786                {
787                    let reference = if reference.trim().is_empty() {
788                        reference.trim()
789                    } else {
790                        reference
791                    };
792                    return Ok(Node::Iri(Cow::Owned([prefix, reference].join(""))));
793                }
794            }
795            if is_resource || uri.starts_with('#') || uri.starts_with('/') {
796                let uri = if uri.starts_with("/") && ctx.base.ends_with("/") {
797                    &uri[1..]
798                } else {
799                    uri
800                };
801                Ok(Node::TermIri(Cow::Owned(
802                    [ctx.base, trailing_white_space, uri].join(""),
803                )))
804            } else if let Some(vocab) = ctx.vocab {
805                Ok(Node::TermIri(Cow::Owned([vocab, uri].join(""))))
806            } else if RESERVED_KEYWORDS
807                .iter()
808                .any(|w| uri.eq_ignore_ascii_case(w))
809            {
810                Ok(Node::TermIri(Cow::Borrowed(
811                    COMMON_PREFIXES[uri.to_lowercase().as_str()],
812                )))
813            } else {
814                debug!("could not determine base/vocab {:?}", ctx);
815                // Ok(Node::Iri(Cow::Borrowed(uri)))
816                Err("could not determine uri")
817            }
818        }
819        Err(e) => {
820            eprintln!("invalid uri {uri}. error: {e}");
821            Err("could not resolve uri")
822        }
823    }
824}
825
/// Splits a CURIE or safe CURIE (`[prefix:reference]`) at its first `:`.
///
/// Surrounding whitespace is ignored. Returns `(prefix, reference)`.
///
/// # Errors
///
/// * `"invalid SafeCurie"` - the value opens with `[` but does not close with `]`.
/// * `"not a curie"` - the value contains no `:`.
fn parse_safe_curie(s: &str) -> Result<(&str, &str), &'static str> {
    let trimmed = s.trim();
    let curie = match trimmed.strip_prefix('[') {
        Some(inner) => inner.strip_suffix(']').ok_or("invalid SafeCurie")?,
        None => trimmed,
    };
    curie.split_once(':').ok_or("not a curie")
}
836
837fn parse_prefixes(s: &str) -> HashMap<&str, &str> {
838    s.split_whitespace()
839        .map(|s| s.trim())
840        .collect::<Vec<_>>()
841        .chunks_exact(2)
842        .map(|c| (c[0], c[1]))
843        .filter_map(|(s, p)| {
844            if let Ok((s, _)) = parse_safe_curie(s) {
845                Some((s, p))
846            } else {
847                error!("fixme! couldn't parse curie for {s}, {p}");
848                None
849            }
850        })
851        .collect()
852}
853
854fn parse_property_or_type_of<'a>(
855    s: &'a str,
856    ctx: &Context<'a>,
857    allow_b_node: bool,
858) -> Vec<Node<'a>> {
859    s.split_whitespace()
860        .filter_map(|uri| resolve_uri(uri, ctx, false).ok())
861        .filter(|node| allow_b_node || !matches!(node, Node::Blank(_) | Node::RefBlank(_)))
862        .map(|n| Node::Ref(Arc::new(n)))
863        .collect()
864}
865
866fn push_triples_inlist<'a>(
867    stmts: &mut Vec<Statement<'a>>,
868    subject: &Node<'a>,
869    predicate: Node<'a>,
870    obj: &Node<'a>,
871) {
872    let b_node = make_bnode();
873    stmts.push(Statement {
874        subject: b_node.clone(),
875        predicate: NODE_RDF_FIRST.clone(),
876        object: obj.clone(),
877    });
878
879    if let Some(node) =
880        find_pos_last_node_in_inlist(stmts, subject, &predicate).and_then(|pos| stmts.get_mut(pos))
881    {
882        node.object = b_node.clone();
883    } else {
884        // push the root of the list
885        stmts.push(Statement {
886            subject: subject.clone(),
887            predicate,
888            object: b_node.clone(),
889        });
890    }
891    stmts.push(Statement {
892        subject: b_node,
893        predicate: NODE_RDF_REST.clone(),
894        object: NODE_RDF_NIL.clone(),
895    });
896}
897fn find_pos_last_node_in_inlist<'a>(
898    stmts: &Vec<Statement<'a>>,
899    root_subject: &Node<'a>,
900    predicate: &Node<'a>,
901) -> Option<usize> {
902    fn find_res_nil<'a>(stmts: &Vec<Statement<'a>>, subject: &Node<'a>) -> Option<usize> {
903        let node = stmts
904            .iter()
905            .enumerate()
906            .find(|(_, stmt)| &stmt.subject == subject && stmt.predicate == *NODE_RDF_REST);
907
908        if let Some((pos, stmt)) = node {
909            if stmt.object == *NODE_RDF_NIL {
910                Some(pos)
911            } else {
912                find_res_nil(stmts, &stmt.object)
913            }
914        } else {
915            None
916        }
917    }
918    let root = stmts
919        .iter()
920        .find(|stmt| &stmt.subject == root_subject && &stmt.predicate == predicate);
921    if let Some(Statement { object, .. }) = root {
922        find_res_nil(stmts, object)
923    } else {
924        None
925    }
926}
927
928// skip when there are no rdfa attributes, see e.g examples/earl_html5/example0084.html
929#[inline]
930fn get_children<'a>(
931    element_ref: &ElementRef<'a>,
932) -> Result<Vec<ego_tree::NodeRef<'a, scraper::Node>>, &'static str> {
933    let mut res = vec![];
934    for c in element_ref.children() {
935        if c.value()
936            .as_element()
937            .filter(|e| e.attrs().count() == 0)
938            .is_some()
939        {
940            let child_ref = ElementRef::wrap(c).ok_or("not an element ref")?;
941            res.append(&mut get_children(&child_ref)?);
942        } else {
943            res.push(c);
944        }
945    }
946
947    Ok(res)
948}
949
950#[inline]
951fn make_bnode<'a>() -> Node<'a> {
952    Node::Blank(get_uuid())
953}
954
955#[inline]
956fn copy_pattern(triples: Vec<Statement<'_>>) -> Result<Vec<Statement<'_>>, Box<dyn Error>> {
957    let (pattern_type, pattern): (Vec<Statement>, Vec<Statement>) = triples
958        .into_iter()
959        .partition(|stmt| stmt.object == *NODE_RDFA_PATTERN_TYPE);
960
961    let (pattern_predicate, pattern): (Vec<Statement>, Vec<Statement>) = pattern
962        .into_iter()
963        .partition(|stmt| pattern_type.iter().any(|s| s.subject == stmt.subject));
964
965    let (pattern_subject, mut triples): (Vec<Statement>, Vec<Statement>) = pattern
966        .into_iter()
967        .partition(|stmt| pattern_predicate.iter().any(|s| s.subject == stmt.object));
968
969    // remove only if pattern referenced
970    let (mut unreferenced_pattern_predicate, pattern_predicate): (Vec<Statement>, Vec<Statement>) =
971        pattern_predicate
972            .into_iter()
973            .partition(|stmt| pattern_subject.iter().all(|s| s.object != stmt.subject));
974
975    let (mut unreferenced_pattern_type, _): (Vec<Statement>, Vec<Statement>) =
976        pattern_type.into_iter().partition(|stmt| {
977            unreferenced_pattern_predicate
978                .iter()
979                .any(|s| s.subject == stmt.subject)
980        });
981    triples.append(&mut unreferenced_pattern_predicate);
982    triples.append(&mut unreferenced_pattern_type);
983
984    for Statement {
985        subject, object, ..
986    } in pattern_subject
987    {
988        for Statement {
989            predicate,
990            object: obj,
991            ..
992        } in pattern_predicate
993            .iter()
994            .filter(|stmt| object == stmt.subject)
995        {
996            triples.push(Statement {
997                subject: subject.clone(),
998                predicate: predicate.clone(),
999                object: obj.clone(),
1000            })
1001        }
1002    }
1003
1004    Ok(triples)
1005}
1006
1007#[inline]
1008fn push_triples<'a>(
1009    stmts: &mut Vec<Statement<'a>>,
1010    subject: &Node<'a>,
1011    predicates: &Option<Vec<Node<'a>>>,
1012    object: &Node<'a>,
1013) {
1014    if let Some(predicate) = predicates {
1015        for predicate in predicate {
1016            stmts.push(Statement {
1017                subject: subject.clone(),
1018                predicate: predicate.clone(),
1019                object: object.clone(),
1020            });
1021        }
1022    }
1023}