1use std::{borrow::Cow, collections::HashMap, error::Error, sync::Arc};
2
3mod constants;
4mod rdfa_elt;
5mod structs;
6#[cfg(test)]
7mod tests;
8
9use constants::{
10 COMMON_PREFIXES, IS_SPECIAL_NODE_FN, NODE_NS_TYPE, NODE_RDF_FIRST, NODE_RDF_NIL,
11 NODE_RDF_PLAIN_LITERAL, NODE_RDF_REST, NODE_RDFA_PATTERN_TYPE, NODE_RDFA_USES_VOCABULARY,
12 RESERVED_KEYWORDS, get_uuid,
13};
14use log::{debug, error};
15use rdfa_elt::RdfaElement;
16use scraper::{ElementRef, Selector};
17use url::{Origin, Url};
18
19use structs::{Context, DataTypeFromPattern, Literal, Node, Statement};
20
21pub use structs::RdfaGraph;
22
/// Bundle of everything `handle_children` needs in order to finish processing
/// an element and descend into its children.
///
/// It is built by `traverse_element` once the subject (`current_node`) of the
/// element has been determined.
struct NodeContext<'a, 'b> {
    /// Element whose children are about to be visited.
    element_ref: &'b ElementRef<'a>,
    /// Evaluation context of the element; `handle_children` updates its
    /// `current_node`, `in_rel` and `in_rev` before passing it to the children.
    ctx: Context<'a>,
    /// Output buffer for regular statements.
    stmts: &'b mut Vec<Statement<'a>>,
    /// Node established for the element; used as the subject of the `typeof`
    /// statements and stored as `ctx.current_node` for the children.
    current_node: Node<'a>,
    /// Predicates from the element's `rel` that are still pending
    /// (copied into `ctx.in_rel`).
    rels: Option<Vec<Node<'a>>>,
    /// Predicates from the element's `rev` that are still pending
    /// (copied into `ctx.in_rev`).
    revs: Option<Vec<Node<'a>>>,
    /// Output buffer for list (`rdf:first` / `rdf:rest`) statements.
    in_list_stmts: &'b mut Vec<Statement<'a>>,
    /// Types that still have to be asserted on `current_node`.
    type_ofs: Option<Vec<Node<'a>>>,
    /// Pending `rel` predicates inherited from the parent context.
    parent_in_rel: Option<Vec<Node<'a>>>,
    /// Pending `rev` predicates inherited from the parent context.
    parent_in_rev: Option<Vec<Node<'a>>>,
    /// Context of the parent element, `None` for the root element.
    parent: &'b Option<&'b Context<'a>>,
}
36
37impl<'a> RdfaGraph<'a> {
38 pub fn parse(
39 input: &ElementRef<'a>,
40 initial_context: Context<'a>,
41 ) -> Result<RdfaGraph<'a>, Box<dyn Error>> {
42 let mut triples = vec![];
43 let mut inlist_triples = vec![];
44 let well_known_prefix = initial_context.well_known_prefix;
45 if initial_context.empty_ref_node_substitute.is_empty() {
46 return Err(
47 "if you provide a context, you most provide an empty_ref_node_substitute property."
48 .into(),
49 );
50 }
51 traverse_element(
52 input,
53 None,
54 initial_context,
55 &mut triples,
56 &mut inlist_triples,
57 )?;
58
59 if !inlist_triples.is_empty() {
63 triples.append(&mut inlist_triples);
64 }
65
66 triples = copy_pattern(triples)?;
67
68 Ok(RdfaGraph {
69 statements: triples.into_iter().collect(),
70 well_known_prefix,
71 })
72 }
73
74 pub fn parse_str(
75 html: &'a str,
76 base: &'a str,
77 well_known_prefix: Option<&'a str>,
78 ) -> Result<String, Box<dyn Error>> {
79 let document = scraper::Html::parse_document(html);
80 let empty_ref_node_substitue = get_uuid();
81 let root = document.root_element();
82
83 let root_ctx = Context {
84 base,
85 empty_ref_node_substitute: &empty_ref_node_substitue,
86 well_known_prefix: well_known_prefix.filter(|f| !f.is_empty()),
87 ..Default::default()
88 };
89 RdfaGraph::parse(&root, root_ctx).map(|g| g.to_string())
90 }
91}
92fn traverse_element<'a, 'b>(
93 element_ref: &'b ElementRef<'a>,
94 parent: Option<&'b Context<'a>>,
95 mut ctx: Context<'a>,
96 stmts: &'b mut Vec<Statement<'a>>,
97 in_list_stmts: &mut Vec<Statement<'a>>,
98) -> Result<Option<Node<'a>>, Box<dyn Error>> {
99 let mut elt = RdfaElement::new(element_ref)?;
100
101 ctx.vocab = elt.vocab.or_else(|| parent.as_ref().and_then(|p| p.vocab));
102
103 ctx.base = elt.base.unwrap_or(ctx.base);
104
105 let base = resolve_uri(ctx.base, &ctx, true)?;
106
107 if let Some(vocab) = ctx.vocab.filter(|v| !v.is_empty()) {
108 stmts.push(Statement {
109 subject: base.clone(),
110 predicate: NODE_RDFA_USES_VOCABULARY.clone(),
111 object: resolve_uri(vocab, &ctx, false)?,
112 })
113 } else {
114 ctx.vocab = None;
115 }
116 ctx.prefixes = elt
117 .prefix
118 .map(parse_prefixes)
119 .or_else(|| parent.map(|p| p.prefixes.clone()))
120 .unwrap_or(ctx.prefixes);
121
122 let is_empty_curie = |s: &str| {
123 let mut s = s.trim();
124 if s.starts_with('[') {
125 s = &s[1..];
126 } else {
127 return false;
128 }
129 if s.ends_with(']') {
130 s = &s[0..s.len() - 1];
131 } else {
132 return false;
133 }
134 s.is_empty()
135 };
136
137 let resource = elt
138 .resource
139 .filter(|r| !is_empty_curie(r))
140 .map(|c| if c.is_empty() { ctx.base } else { c });
141
142 ctx.lang = elt
143 .lang
144 .or_else(|| parent.and_then(|p| p.lang))
145 .or(ctx.lang);
146
147 let mut about = elt.about.and_then(|a| resolve_uri(a, &ctx, true).ok());
148
149 let mut rels = elt.rel.map(|r| parse_property_or_type_of(r, &ctx, true));
150 let mut revs = elt.rev.map(|r| parse_property_or_type_of(r, &ctx, true));
151
152 let mut parent_in_rel = parent.and_then(|c| c.in_rel.clone());
153 let mut parent_in_rev = parent.and_then(|c| c.in_rev.clone());
154 let mut parent_in_list = parent.and_then(|c| c.in_list.clone());
155
156 let mut src_or_href = elt
157 .src_or_href()
158 .and_then(|v| resolve_uri(v, &ctx, true).ok());
159
160 let mut type_ofs = elt.type_of.and_then(|t| {
161 if t.trim().is_empty() {
162 resolve_uri(ctx.vocab.unwrap_or(ctx.base), &ctx, true)
164 .ok()
165 .map(|v| vec![v])
166 } else {
167 Some(parse_property_or_type_of(t, &ctx, true))
168 }
169 });
170
171 let datatype = elt
172 .datatype
173 .and_then(|dt| match resolve_uri(dt, &ctx, false) {
174 Ok(d) => Some(Box::new(d)),
175 Err(e) => {
176 debug!("could not parse {dt}. error {e}");
177 None
178 }
179 });
180
181 let mut predicates = elt
182 .property
183 .map(|p| parse_property_or_type_of(p, &ctx, false));
184
185 let mut current_node = if !IS_SPECIAL_NODE_FN(&datatype) {
188 base.clone()
189 } else {
190 make_bnode()
191 };
192
193 if let Some(parent_in_list) = parent_in_list.take() {
195 let subject = get_parent_subject(&parent, &ctx)?;
196 let obj = if let Some(resource) = resource
197 .and_then(|r| resolve_uri(r, &ctx, true).ok())
198 .map(|n| Node::Ref(Arc::new(n)))
199 .or_else(|| src_or_href.clone())
200 {
201 resource
202 } else {
203 Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
204 };
205 for rel in parent_in_list {
206 push_triples_inlist(in_list_stmts, &subject, rel, &obj);
207 }
208 current_node = subject;
209 }
210 else if elt.is_inlist() {
212 let mut in_rel = false;
213
214 let subject = get_parent_subject(&parent, &ctx)?;
215
216 if rels.is_some()
217 && src_or_href.is_none()
218 && predicates.is_none()
219 && resource.is_none()
220 && about.is_none()
221 {
223 if element_ref.children().count() != 0 {
224 if type_ofs.is_some() {
226 let Some(rels) = rels.take() else {
227 unreachable!()
228 };
229 current_node = make_bnode();
230 handle_children(NodeContext {
231 element_ref,
232 ctx: ctx.clone(),
233 stmts,
234 current_node: current_node.clone(),
235 rels: None,
236 revs: revs.take(),
237 in_list_stmts,
238 type_ofs: type_ofs.take(),
239 parent_in_rel: parent_in_rel.take(),
240 parent_in_rev: parent_in_rev.take(),
241 parent: &parent,
242 })?;
243 for rel in rels {
244 let mut existing_rel_in_list = None;
245 if let Some(node) =
246 find_pos_last_node_in_inlist(in_list_stmts, &subject, &rel)
247 .and_then(|s| in_list_stmts.get_mut(s))
248 .filter(|p| p.object != *NODE_RDF_NIL)
249 {
250 existing_rel_in_list = Some(node.object.clone());
251 }
252
253 if let Some(existing_rel_in_list) = existing_rel_in_list {
254 push_triples_inlist(
255 in_list_stmts,
256 &subject,
257 rel,
258 &existing_rel_in_list,
259 );
260 } else {
261 push_triples_inlist(in_list_stmts, &subject, rel, ¤t_node);
262 }
263 }
264 return Ok(Some(subject));
265 } else {
266 ctx.in_list = rels.take();
267 }
268 } else {
269 push_triples(in_list_stmts, &subject, &rels.take(), &NODE_RDF_NIL);
270 }
271 } else if let Some(rels) = rels.take().filter(|r| !r.is_empty()) {
272 in_rel = true;
273
274 let obj = if let Some(resource) = resource
275 .and_then(|r| resolve_uri(r, &ctx, true).ok())
276 .map(|n| Node::Ref(Arc::new(n)))
277 .or_else(|| src_or_href.clone())
278 {
279 resource
280 } else {
281 Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
282 };
283 for rel in rels {
284 push_triples_inlist(in_list_stmts, &subject, rel, &obj);
285 }
286 }
287 let obj = if let (Some(resource), false) = (resource, in_rel) {
288 Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?))
289 } else {
290 Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
291 };
292 if let Some(predicates) = predicates.take() {
293 for predicate in predicates {
294 push_triples_inlist(in_list_stmts, &subject, predicate, &obj);
295 }
296 }
297
298 current_node = subject;
299 }
300 else if let Some(resource) = resource {
302 let resource = Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?));
303
304 if !elt.has_content_or_datatype() {
305 let object = about
306 .as_ref()
307 .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
308 .map(|a| Node::Ref(Arc::new(a.clone())))
309 .unwrap_or(resource);
310 current_node = object;
311 let subject = about
312 .take()
313 .map(|a| Ok(Node::Ref(Arc::new(a))))
314 .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;
315
316 push_triples(stmts, &subject, &predicates, ¤t_node);
317
318 if predicates.is_some() && type_ofs.is_none() {
319 current_node = subject;
320 } else {
321 push_triples(stmts, &subject, &rels.take(), ¤t_node);
322 push_triples(stmts, ¤t_node, &revs.take(), &subject);
323 }
324 } else {
325 let resource = about
327 .as_ref()
328 .map(|a| Node::Ref(Arc::new(a.clone())))
329 .unwrap_or(resource);
330 push_triples(
331 stmts,
332 &resource,
333 &predicates,
334 &extract_literal(&elt, &datatype, &ctx)?,
335 );
336 current_node = resource;
337 }
338 }
339 else if let Some(about) = about {
341 let is_empty = elt
344 .about
345 .filter(|a| !a.trim().is_empty() && is_empty_curie(a))
346 .is_some();
347 current_node = if !is_empty {
348 Node::Ref(Arc::new(about))
349 } else {
350 current_node
351 };
352
353 push_triples(
354 stmts,
355 ¤t_node,
356 &predicates,
357 &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
358 );
359
360 if let Some(src_or_href) = src_or_href.take() {
361 push_triples(stmts, ¤t_node, &rels, &src_or_href);
362 push_triples(stmts, &src_or_href, &revs, ¤t_node);
363 }
364 if is_empty {
365 current_node = make_bnode();
366 }
367 }
368 else if src_or_href.is_some() && elt.has_content_or_datatype() {
370 current_node = src_or_href.take().ok_or("no src")?;
371
372 push_triples(
373 stmts,
374 ¤t_node,
375 &predicates,
376 &extract_literal(&elt, &datatype, &ctx)?,
377 );
378 }
379 else if src_or_href.is_some() && (rels.is_some() || revs.is_some()) {
381 let src_or_href = src_or_href.take().ok_or("no src")?;
382 current_node = get_parent_subject(&parent, &ctx)
383 .ok()
384 .unwrap_or_else(make_bnode);
385
386 let mut has_term = false;
387 let mut emit_triple = false;
388 if elt.has_property() {
389 rels = rels.take().map(|rs| {
390 rs.into_iter()
391 .filter(|r| {
392 let m = matches!(r, Node::Ref(r) if matches!(r.as_ref(), Node::TermIri(_)));
393 if m {
394 has_term = true;
395 } else {
396 emit_triple = true;
397 }
398 !m
399 })
400 .collect()
401 });
402 }
403
404 push_triples(stmts, ¤t_node, &rels, &src_or_href);
405 push_triples(stmts, &src_or_href, &revs, ¤t_node);
406
407 if has_term {
408 if emit_triple {
409 elt.src.take();
410 elt.href.take();
411 }
412
413 push_triples(
414 stmts,
415 ¤t_node,
416 &predicates,
417 &extract_literal(&elt, &datatype, &ctx)?,
418 );
419 }
420 if rels.is_some() && type_ofs.is_some() {
422 if let Some(type_ofs) = type_ofs.take() {
423 let pred = Some(vec![NODE_NS_TYPE.clone()]);
424
425 for to in type_ofs {
426 push_triples(stmts, &src_or_href, &pred, &to);
427 }
428 }
429 current_node = src_or_href.clone();
431 rels.take();
432 }
433 if revs.is_some() {
435 if predicates.is_some() {
436 elt.src.take();
437 elt.href.take();
438 push_triples(
439 stmts,
440 ¤t_node,
441 &predicates,
442 &extract_literal(&elt, &datatype, &ctx)?,
443 );
444 }
445 if let Some(type_ofs) = type_ofs.take() {
446 let pred = Some(vec![NODE_NS_TYPE.clone()]);
447
448 for to in type_ofs {
449 push_triples(stmts, &src_or_href, &pred, &to);
450 }
451 }
452 }
453 }
454 else if type_ofs.is_some() {
456 if elt.has_property()
457 && !elt.has_content_or_datatype()
458 && (parent_in_rel.is_some() || parent_in_rev.is_some())
459 {
460 current_node = make_bnode();
461 let node = src_or_href.take().unwrap_or_else(make_bnode);
462 for to in type_ofs.take().iter().flatten() {
463 push_triples(stmts, &node, &Some(vec![NODE_NS_TYPE.clone()]), to);
464 }
465 push_triples(stmts, ¤t_node, &predicates, &node);
466 } else if rels.is_some() {
467 current_node = make_bnode();
468
469 for to in type_ofs.take().into_iter().flatten() {
470 stmts.push(Statement {
471 subject: current_node.clone(),
472 predicate: NODE_NS_TYPE.clone(),
473 object: to,
474 })
475 }
476 push_triples(stmts, &base, &rels.take(), ¤t_node);
477 } else if !IS_SPECIAL_NODE_FN(&datatype) {
478 let child_with_rdfa_tag = element_ref
481 .select(&Selector::parse(
482 "[href], [src], [resource], [property], [about]",
483 )?)
484 .filter(|e| {
485 RdfaElement::new(e)
486 .ok()
487 .and_then(|e2| e2.datatype)
488 .and_then(|dt| match resolve_uri(dt, &ctx, false).ok().map(Box::new) {
489 v @ Some(_) if IS_SPECIAL_NODE_FN(&v) => v,
490 _ => None,
491 })
492 .is_none()
493 })
494 .count()
495 == 0;
496 current_node = if let Some(src_or_href) = src_or_href.take() {
497 src_or_href
498 } else if elt.name == "body"
500 || elt.name == "head"
501 || child_with_rdfa_tag
502 || parent.is_none()
503 {
504 base.clone()
505 } else {
506 make_bnode()
507 };
508
509 let subject = get_parent_subject(&parent, &ctx)
510 .ok()
511 .unwrap_or_else(make_bnode);
512
513 push_triples(stmts, &subject, &predicates, ¤t_node);
514 } else {
515 push_triples(
517 stmts,
518 ¤t_node,
519 &predicates,
520 &extract_literal(&elt, &datatype, &ctx)?,
521 );
522 }
523 }
524 else {
526 current_node = src_or_href
527 .take()
528 .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
529 .map(Ok)
530 .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;
531
532 push_triples(
533 stmts,
534 ¤t_node,
535 &predicates,
536 &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
537 );
538 }
539
540 handle_children(NodeContext {
541 element_ref,
542 ctx,
543 stmts,
544 current_node,
545 rels,
546 revs,
547 in_list_stmts,
548 type_ofs,
549 parent_in_rel,
550 parent_in_rev,
551 parent: &parent,
552 })
553}
554fn handle_children<'a>(
555 NodeContext {
556 element_ref,
557 mut ctx,
558 stmts,
559 current_node,
560 rels,
561 revs,
562 in_list_stmts,
563 type_ofs,
564 mut parent_in_rel,
565 mut parent_in_rev,
566 parent,
567 }: NodeContext<'a, '_>,
568) -> Result<Option<Node<'a>>, Box<dyn Error>> {
569 if let Some(type_ofs) = type_ofs {
570 for type_of in type_ofs {
571 stmts.push(Statement {
572 subject: current_node.clone(),
573 predicate: NODE_NS_TYPE.clone(),
574 object: type_of,
575 })
576 }
577 }
578
579 if parent_in_rel.is_some() || parent_in_rev.is_some() {
580 let parent = get_parent_subject(parent, &ctx)
581 .ok()
582 .ok_or("in_rel: no parent node")?;
583 push_triples(stmts, &parent, &parent_in_rel.take(), ¤t_node);
584 push_triples(stmts, ¤t_node, &parent_in_rev.take(), &parent);
585 }
586 ctx.current_node = Some(current_node.clone());
587 ctx.in_rel = rels.clone();
588 ctx.in_rev = revs.clone();
589 for child in get_children(element_ref)? {
590 if let Some(c) = ElementRef::wrap(child) {
591 let triples_completed = (ctx.in_rel.is_some() || ctx.in_rev.is_some())
593 && (c.attr("property").is_some()
594 || c.attr("rel").is_some()
595 || c.attr("rev").is_some())
596 && (c.attr("about").is_none() && c.attr("typeof").is_none());
597
598 if triples_completed {
599 let b_node = make_bnode();
601 push_triples(stmts, ¤t_node, &ctx.in_rel.take(), &b_node);
602 push_triples(stmts, &b_node, &ctx.in_rev.take(), ¤t_node);
603
604 ctx.current_node = Some(b_node);
605 }
606 if c.attr("about").is_some() || c.attr("typeof").is_some() {
608 ctx.in_rel = rels.clone();
609 ctx.in_rev = revs.clone();
610 ctx.current_node = Some(current_node.clone());
611 }
612 let child_ctx = Context {
613 base: ctx.base,
614 lang: ctx.lang,
615 empty_ref_node_substitute: ctx.empty_ref_node_substitute,
616 ..Default::default()
617 };
618
619 let node = traverse_element(&c, Some(&ctx), child_ctx, stmts, in_list_stmts)?;
620 if node != ctx.current_node {
621 stmts.append(in_list_stmts);
622 }
623 }
624 }
625 Ok(ctx.current_node.clone())
626}
627fn extract_literal<'a>(
628 rdfa_el: &RdfaElement<'a, '_>,
629 datatype: &Option<Box<Node<'a>>>,
630 ctx: &Context<'a>,
631) -> Result<Node<'a>, &'static str> {
632 let plain_datatype = datatype
633 .as_ref()
634 .filter(|dt| dt.as_ref() == &*NODE_RDF_PLAIN_LITERAL)
635 .is_some();
636
637 let lang = ctx.lang.filter(|s| datatype.is_none() && !s.is_empty());
638 if let Some(value) = rdfa_el.src_or_href().filter(|_| {
639 !rdfa_el.has_about() && !rdfa_el.has_property() || !rdfa_el.has_content_or_datatype()
640 }) {
641 resolve_uri(value, ctx, true)
642 } else if let Some(content) = rdfa_el.content {
643 Ok(Node::Literal(Literal {
644 datatype: datatype.clone(),
645 value: Cow::Borrowed(content),
646 lang,
647 }))
648 } else if !plain_datatype && IS_SPECIAL_NODE_FN(datatype) {
649 Ok(Node::Literal(Literal {
650 value: Cow::Owned(rdfa_el.inner_html()),
651 datatype: datatype.clone(),
652 lang: None,
653 }))
654 } else if let Some(content) = rdfa_el.get_time() {
655 Ok(Node::Literal(Literal {
656 datatype: datatype
657 .clone()
658 .or_else(|| DataTypeFromPattern::date_time_from_pattern(content).map(Box::new)),
659 value: Cow::Borrowed(content),
660 lang: None,
661 }))
662 } else {
663 let datatype = if plain_datatype {
664 None
665 } else {
666 datatype.clone()
667 };
668 let lang = if plain_datatype { ctx.lang } else { lang };
669 let texts = rdfa_el.texts();
670 let text = if texts.is_empty() {
671 Cow::Borrowed("")
672 } else {
673 let text = texts
674 .iter()
675 .map(|t| t.to_string())
676 .collect::<Vec<_>>()
677 .join("");
678 Cow::Owned(text)
679 };
680 Ok(Node::Literal(Literal {
681 datatype,
682 value: text,
683 lang,
684 }))
685 }
686}
687fn get_parent_subject<'a>(
688 parent: &Option<&Context<'a>>,
689 ctx: &Context<'a>,
690) -> Result<Node<'a>, Box<dyn Error>> {
691 parent
692 .and_then(|p| p.current_node.clone())
693 .or_else(|| {
694 if parent.is_none() {
695 resolve_uri(ctx.base, ctx, true).ok()
696 } else {
697 None
698 }
699 })
700 .ok_or("no parent".into())
701}
702
703fn resolve_uri<'a>(
704 uri: &'a str,
705 ctx: &Context<'a>,
706 is_resource: bool,
707) -> Result<Node<'a>, &'static str> {
708 let uri = uri.trim();
709
710 if let Ok(ref origin) = Url::parse(ctx.base).map(|u| u.origin())
712 && let Origin::Tuple(_, host, _) = origin
713 {
714 let host = &host.to_string();
715
716 if uri.starts_with(host) {
717 return Ok(Node::TermIri(Cow::Owned(
718 uri.replace(host, &origin.unicode_serialization()),
719 )));
720 }
721 };
722
723 let iri = Url::parse(uri);
724 let trailing_white_space = if ctx.base.ends_with('/')
725 || ctx.base.ends_with('#')
726 || uri.starts_with('/')
727 || uri.starts_with('#')
728 {
729 ""
730 } else {
731 "/"
732 };
733 match iri {
734 Ok(iri) if !iri.cannot_be_a_base() || iri.is_special() => {
735 if uri.contains(|c: char| c.is_whitespace() || c.is_control()) {
737 let mut new_uri = String::with_capacity(uri.len() * 125 / 100);
738 for c in uri.chars() {
739 match c {
740 '\n' => new_uri.push_str("%0A"),
741 '\0' => new_uri.push_str("%00"),
742 '\t' => new_uri.push_str("%09"),
743 '\r' => new_uri.push_str("%0D"),
744 ' ' => new_uri.push_str("%20"),
745 c => new_uri.push(c),
746 }
747 }
748 Ok(Node::Iri(Cow::Owned(new_uri)))
749 } else {
750 Ok(Node::Iri(Cow::Borrowed(uri)))
751 }
752 }
753
754 Ok(iri) => {
756 if uri.starts_with("mail:") || uri.starts_with("tel:") {
757 Ok(Node::Iri(Cow::Borrowed(uri)))
758 } else if let Some((prefix, value)) = ctx
759 .prefixes
760 .iter()
761 .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
762 {
763 let iri = format!(
764 "{value}{}",
765 &uri.replacen(':', "", 1).trim()[prefix.len()..]
766 );
767 Ok(Node::Iri(Cow::Owned(iri)))
768 } else if let Some((prefix, value)) = COMMON_PREFIXES
769 .iter()
770 .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
771 {
772 let iri = format!(
773 "{value}{}",
774 &uri.replacen(':', "", 1).trim()[prefix.len()..]
775 );
776 Ok(Node::Iri(Cow::Owned(iri)))
777 } else {
778 Ok(Node::Iri(Cow::Owned(uri.to_string())))
779 }
780 }
781 Err(url::ParseError::RelativeUrlWithoutBase) => {
782 if let Ok((prefix, reference)) = parse_safe_curie(uri) {
783 let reference = reference.trim();
784 let prefix = prefix.trim();
785 if prefix == "_" {
786 let id = if reference.is_empty() {
787 ctx.empty_ref_node_substitute
788 } else {
789 reference
790 };
791 return Ok(Node::RefBlank(id));
792 } else if prefix.is_empty() && !reference.is_empty() {
793 return Ok(Node::TermIri(Cow::Owned(
794 [COMMON_PREFIXES[""], reference].join(""),
795 )));
796 } else if let Some(prefix) = ctx
797 .prefixes
798 .get(prefix)
799 .or_else(|| COMMON_PREFIXES.get(prefix))
800 {
801 let reference = if reference.trim().is_empty() {
802 reference.trim()
803 } else {
804 reference
805 };
806 return Ok(Node::Iri(Cow::Owned([prefix, reference].join(""))));
807 }
808 }
809 if is_resource || uri.starts_with('#') || uri.starts_with('/') {
810 let uri = if uri.starts_with("/") && ctx.base.ends_with("/") {
811 &uri[1..]
812 } else {
813 uri
814 };
815 Ok(Node::TermIri(Cow::Owned(
816 [ctx.base, trailing_white_space, uri].join(""),
817 )))
818 } else if let Some(vocab) = ctx.vocab {
819 Ok(Node::TermIri(Cow::Owned([vocab, uri].join(""))))
820 } else if RESERVED_KEYWORDS
821 .iter()
822 .any(|w| uri.eq_ignore_ascii_case(w))
823 {
824 Ok(Node::TermIri(Cow::Borrowed(
825 COMMON_PREFIXES[uri.to_lowercase().as_str()],
826 )))
827 } else {
828 debug!("could not determine base/vocab {:?}", ctx);
829 Err("could not determine uri")
831 }
832 }
833 Err(e) => {
834 eprintln!("invalid uri {uri}. error: {e}");
835 Err("could not resolve uri")
836 }
837 }
838}
839
/// Splits a CURIE or safe CURIE (`[prefix:reference]`) into
/// `(prefix, reference)` at the first `:`.
///
/// Surrounding whitespace is ignored; whitespace inside the brackets is kept.
///
/// # Errors
///
/// * `"invalid SafeCurie"` - the value opens with `[` but does not close
///   with `]`.
/// * `"not a curie"` - the value contains no `:`.
fn parse_safe_curie(s: &str) -> Result<(&str, &str), &'static str> {
    let trimmed = s.trim();
    let curie = match trimmed.strip_prefix('[') {
        Some(unopened) => unopened.strip_suffix(']').ok_or("invalid SafeCurie")?,
        None => trimmed,
    };
    curie.split_once(':').ok_or("not a curie")
}
850
851fn parse_prefixes(s: &str) -> HashMap<&str, &str> {
852 s.split_whitespace()
853 .map(|s| s.trim())
854 .collect::<Vec<_>>()
855 .chunks_exact(2)
856 .map(|c| (c[0], c[1]))
857 .filter_map(|(s, p)| {
858 if let Ok((s, _)) = parse_safe_curie(s) {
859 Some((s, p))
860 } else {
861 error!("fixme! couldn't parse curie for {s}, {p}");
862 None
863 }
864 })
865 .collect()
866}
867
868fn parse_property_or_type_of<'a>(
869 s: &'a str,
870 ctx: &Context<'a>,
871 allow_b_node: bool,
872) -> Vec<Node<'a>> {
873 s.split_whitespace()
874 .filter_map(|uri| resolve_uri(uri, ctx, false).ok())
875 .filter(|node| allow_b_node || !matches!(node, Node::Blank(_) | Node::RefBlank(_)))
876 .map(|n| Node::Ref(Arc::new(n)))
877 .collect()
878}
879
880fn push_triples_inlist<'a>(
881 stmts: &mut Vec<Statement<'a>>,
882 subject: &Node<'a>,
883 predicate: Node<'a>,
884 obj: &Node<'a>,
885) {
886 let b_node = make_bnode();
887 stmts.push(Statement {
888 subject: b_node.clone(),
889 predicate: NODE_RDF_FIRST.clone(),
890 object: obj.clone(),
891 });
892
893 if let Some(node) =
894 find_pos_last_node_in_inlist(stmts, subject, &predicate).and_then(|pos| stmts.get_mut(pos))
895 {
896 node.object = b_node.clone();
897 } else {
898 stmts.push(Statement {
900 subject: subject.clone(),
901 predicate,
902 object: b_node.clone(),
903 });
904 }
905 stmts.push(Statement {
906 subject: b_node,
907 predicate: NODE_RDF_REST.clone(),
908 object: NODE_RDF_NIL.clone(),
909 });
910}
911fn find_pos_last_node_in_inlist<'a>(
912 stmts: &Vec<Statement<'a>>,
913 root_subject: &Node<'a>,
914 predicate: &Node<'a>,
915) -> Option<usize> {
916 fn find_res_nil<'a>(stmts: &Vec<Statement<'a>>, subject: &Node<'a>) -> Option<usize> {
917 let node = stmts
918 .iter()
919 .enumerate()
920 .find(|(_, stmt)| &stmt.subject == subject && stmt.predicate == *NODE_RDF_REST);
921
922 if let Some((pos, stmt)) = node {
923 if stmt.object == *NODE_RDF_NIL {
924 Some(pos)
925 } else {
926 find_res_nil(stmts, &stmt.object)
927 }
928 } else {
929 None
930 }
931 }
932 let root = stmts
933 .iter()
934 .find(|stmt| &stmt.subject == root_subject && &stmt.predicate == predicate);
935 if let Some(Statement { object, .. }) = root {
936 find_res_nil(stmts, object)
937 } else {
938 None
939 }
940}
941
942#[inline]
944fn get_children<'a>(
945 element_ref: &ElementRef<'a>,
946) -> Result<Vec<ego_tree::NodeRef<'a, scraper::Node>>, &'static str> {
947 let mut res = vec![];
948 for c in element_ref.children() {
949 if c.value()
950 .as_element()
951 .filter(|e| e.attrs().count() == 0)
952 .is_some()
953 {
954 let child_ref = ElementRef::wrap(c).ok_or("not an element ref")?;
955 res.append(&mut get_children(&child_ref)?);
956 } else {
957 res.push(c);
958 }
959 }
960
961 Ok(res)
962}
963
964#[inline]
965fn make_bnode<'a>() -> Node<'a> {
966 Node::Blank(get_uuid())
967}
968
969#[inline]
970fn copy_pattern(triples: Vec<Statement<'_>>) -> Result<Vec<Statement<'_>>, Box<dyn Error>> {
971 let (pattern_type, pattern): (Vec<Statement>, Vec<Statement>) = triples
972 .into_iter()
973 .partition(|stmt| stmt.object == *NODE_RDFA_PATTERN_TYPE);
974
975 let (pattern_predicate, pattern): (Vec<Statement>, Vec<Statement>) = pattern
976 .into_iter()
977 .partition(|stmt| pattern_type.iter().any(|s| s.subject == stmt.subject));
978
979 let (pattern_subject, mut triples): (Vec<Statement>, Vec<Statement>) = pattern
980 .into_iter()
981 .partition(|stmt| pattern_predicate.iter().any(|s| s.subject == stmt.object));
982
983 let (mut unreferenced_pattern_predicate, pattern_predicate): (Vec<Statement>, Vec<Statement>) =
985 pattern_predicate
986 .into_iter()
987 .partition(|stmt| pattern_subject.iter().all(|s| s.object != stmt.subject));
988
989 let (mut unreferenced_pattern_type, _): (Vec<Statement>, Vec<Statement>) =
990 pattern_type.into_iter().partition(|stmt| {
991 unreferenced_pattern_predicate
992 .iter()
993 .any(|s| s.subject == stmt.subject)
994 });
995 triples.append(&mut unreferenced_pattern_predicate);
996 triples.append(&mut unreferenced_pattern_type);
997
998 for Statement {
999 subject, object, ..
1000 } in pattern_subject
1001 {
1002 for Statement {
1003 predicate,
1004 object: obj,
1005 ..
1006 } in pattern_predicate
1007 .iter()
1008 .filter(|stmt| object == stmt.subject)
1009 {
1010 triples.push(Statement {
1011 subject: subject.clone(),
1012 predicate: predicate.clone(),
1013 object: obj.clone(),
1014 })
1015 }
1016 }
1017
1018 Ok(triples)
1019}
1020
1021#[inline]
1022fn push_triples<'a>(
1023 stmts: &mut Vec<Statement<'a>>,
1024 subject: &Node<'a>,
1025 predicates: &Option<Vec<Node<'a>>>,
1026 object: &Node<'a>,
1027) {
1028 if let Some(predicate) = predicates {
1029 for predicate in predicate {
1030 stmts.push(Statement {
1031 subject: subject.clone(),
1032 predicate: predicate.clone(),
1033 object: object.clone(),
1034 });
1035 }
1036 }
1037}