1use std::{borrow::Cow, collections::HashMap, error::Error, sync::Arc};
2
3mod constants;
4mod rdfa_elt;
5mod structs;
6#[cfg(test)]
7mod tests;
8
9use constants::{
10 COMMON_PREFIXES, IS_SPECIAL_NODE_FN, NODE_NS_TYPE, NODE_RDF_FIRST, NODE_RDF_NIL,
11 NODE_RDF_PLAIN_LITERAL, NODE_RDF_REST, NODE_RDFA_PATTERN_TYPE, NODE_RDFA_USES_VOCABULARY,
12 RESERVED_KEYWORDS, get_uuid,
13};
14use log::{debug, error};
15use rdfa_elt::RdfaElement;
16use scraper::{ElementRef, Selector};
17use url::Url;
18
19use structs::{Context, DataTypeFromPattern, Literal, Node, Statement};
20
21pub use structs::RdfaGraph;
22
/// Bundle of everything `handle_children` needs to finish processing one
/// element and descend into its children.
///
/// `'a` is the lifetime of the parsed document data referenced by nodes and
/// statements; `'b` is the lifetime of the borrows held for one traversal step.
struct NodeContext<'a, 'b> {
    /// Element whose children are going to be traversed.
    element_ref: &'b ElementRef<'a>,
    /// Context of the current element; handed to each child as its parent context.
    ctx: Context<'a>,
    /// Output buffer receiving the generated statements.
    stmts: &'b mut Vec<Statement<'a>>,
    /// Node used as the subject of the `type_ofs` triples and stored as
    /// `ctx.current_node` before the children are visited.
    current_node: Node<'a>,
    /// Predicates copied into `ctx.in_rel` for the children.
    rels: Option<Vec<Node<'a>>>,
    /// Predicates copied into `ctx.in_rev` for the children.
    revs: Option<Vec<Node<'a>>>,
    /// Buffer of list statements; moved into `stmts` when a child returns a
    /// node different from `ctx.current_node`.
    in_list_stmts: &'b mut Vec<Statement<'a>>,
    /// Types emitted as `NODE_NS_TYPE` triples on `current_node`.
    type_ofs: Option<Vec<Node<'a>>>,
    /// Predicates used to link the parent subject to `current_node`.
    parent_in_rel: Option<Vec<Node<'a>>>,
    /// Predicates used to link `current_node` back to the parent subject.
    parent_in_rev: Option<Vec<Node<'a>>>,
    /// Parent context, used to look up the parent subject.
    parent: &'b Option<&'b Context<'a>>,
}
36
37impl<'a> RdfaGraph<'a> {
38 pub fn parse(
39 input: &ElementRef<'a>,
40 initial_context: Context<'a>,
41 ) -> Result<RdfaGraph<'a>, Box<dyn Error>> {
42 let mut triples = vec![];
43 let mut inlist_triples = vec![];
44 let well_known_prefix = initial_context.well_known_prefix;
45 if initial_context.empty_ref_node_substitute.is_empty() {
46 return Err(
47 "if you provide a context, you most provide an empty_ref_node_substitute property."
48 .into(),
49 );
50 }
51 traverse_element(
52 input,
53 None,
54 initial_context,
55 &mut triples,
56 &mut inlist_triples,
57 )?;
58
59 if !inlist_triples.is_empty() {
63 triples.append(&mut inlist_triples);
64 }
65
66 triples = copy_pattern(triples)?;
67
68 Ok(RdfaGraph {
69 statements: triples.into_iter().collect(),
70 well_known_prefix,
71 })
72 }
73
74 pub fn parse_str(
75 html: &'a str,
76 base: &'a str,
77 well_known_prefix: Option<&'a str>,
78 ) -> Result<String, Box<dyn Error>> {
79 let document = scraper::Html::parse_document(html);
80 let empty_ref_node_substitue = get_uuid();
81 let root = document.root_element();
82
83 let root_ctx = Context {
84 base,
85 empty_ref_node_substitute: &empty_ref_node_substitue,
86 well_known_prefix: well_known_prefix.filter(|f| !f.is_empty()),
87 ..Default::default()
88 };
89 RdfaGraph::parse(&root, root_ctx).map(|g| g.to_string())
90 }
91}
/// Processes a single element and then its children.
///
/// The element's RDFa attributes are read through `RdfaElement::new`. The
/// context (`vocab`, `base`, `prefixes`, `lang`) is updated from the element or
/// inherited from `parent`, statements are pushed into `stmts` (or into
/// `in_list_stmts` for list handling) depending on which attributes are
/// present, and finally `handle_children` is called with the node chosen as
/// the current node.
///
/// # Parameters
///
/// * `element_ref` - element being processed.
/// * `parent` - context of the parent element, `None` for the first call.
/// * `ctx` - context for this element; mutated and handed on to the children.
/// * `stmts` - output buffer for regular statements.
/// * `in_list_stmts` - buffer for list statements.
///
/// # Returns
///
/// The value returned by `handle_children`, except in one list case where the
/// function returns `Some(subject)` early.
///
/// # Errors
///
/// Propagates errors from `RdfaElement::new`, `resolve_uri`, `extract_literal`,
/// `get_parent_subject`, `Selector::parse` and `handle_children`.
fn traverse_element<'a, 'b>(
    element_ref: &'b ElementRef<'a>,
    parent: Option<&'b Context<'a>>,
    mut ctx: Context<'a>,
    stmts: &'b mut Vec<Statement<'a>>,
    in_list_stmts: &mut Vec<Statement<'a>>,
) -> Result<Option<Node<'a>>, Box<dyn Error>> {
    let mut elt = RdfaElement::new(element_ref)?;

    // The element's own vocab wins, otherwise the parent's one is inherited.
    ctx.vocab = elt.vocab.or_else(|| parent.as_ref().and_then(|p| p.vocab));

    ctx.base = elt.base.unwrap_or(ctx.base);

    let base = resolve_uri(ctx.base, &ctx, true)?;

    // A non-empty vocab is recorded as a statement on the base; an empty one
    // resets the vocab.
    if let Some(vocab) = ctx.vocab.filter(|v| !v.is_empty()) {
        stmts.push(Statement {
            subject: base.clone(),
            predicate: NODE_RDFA_USES_VOCABULARY.clone(),
            object: resolve_uri(vocab, &ctx, false)?,
        })
    } else {
        ctx.vocab = None;
    }
    // Prefixes: the element's own declaration, else the parent's, else the
    // ones already present in `ctx`.
    ctx.prefixes = elt
        .prefix
        .map(parse_prefixes)
        .or_else(|| parent.map(|p| p.prefixes.clone()))
        .unwrap_or(ctx.prefixes);

    // True only for a value that is exactly `[]` once trimmed.
    let is_empty_curie = |s: &str| {
        let mut s = s.trim();
        if s.starts_with('[') {
            s = &s[1..];
        } else {
            return false;
        }
        if s.ends_with(']') {
            s = &s[0..s.len() - 1];
        } else {
            return false;
        }
        s.is_empty()
    };

    // `[]` is ignored entirely; an empty resource stands for the base.
    let resource = elt
        .resource
        .filter(|r| !is_empty_curie(r))
        .map(|c| if c.is_empty() { ctx.base } else { c });

    ctx.lang = elt
        .lang
        .or_else(|| parent.and_then(|p| p.lang))
        .or(ctx.lang);

    // Resolution failures are silently turned into `None` here.
    let mut about = elt.about.and_then(|a| resolve_uri(a, &ctx, true).ok());

    let mut rels = elt.rel.map(|r| parse_property_or_type_of(r, &ctx, true));
    let mut revs = elt.rev.map(|r| parse_property_or_type_of(r, &ctx, true));

    // State left by the parent for its children.
    let mut parent_in_rel = parent.and_then(|c| c.in_rel.clone());
    let mut parent_in_rev = parent.and_then(|c| c.in_rev.clone());
    let mut parent_in_list = parent.and_then(|c| c.in_list.clone());

    let mut src_or_href = elt
        .src_or_href()
        .and_then(|v| resolve_uri(v, &ctx, true).ok());

    // A blank `typeof` resolves to the vocab (or the base when there is no vocab).
    let mut type_ofs = elt.type_of.and_then(|t| {
        if t.trim().is_empty() {
            resolve_uri(ctx.vocab.unwrap_or(ctx.base), &ctx, true)
                .ok()
                .map(|v| vec![v])
        } else {
            Some(parse_property_or_type_of(t, &ctx, true))
        }
    });

    // An unresolvable datatype is logged at debug level and dropped.
    let datatype = elt
        .datatype
        .and_then(|dt| match resolve_uri(dt, &ctx, false) {
            Ok(d) => Some(Box::new(d)),
            Err(e) => {
                debug!("could not parse {dt}. error {e}");
                None
            }
        });

    // Blank nodes are not accepted as `property` predicates.
    let mut predicates = elt
        .property
        .map(|p| parse_property_or_type_of(p, &ctx, false));

    // Default current node: the base, unless `IS_SPECIAL_NODE_FN` flags the
    // datatype, in which case a fresh blank node is used.
    let mut current_node = if !IS_SPECIAL_NODE_FN(&datatype) {
        base.clone()
    } else {
        make_bnode()
    };

    // Case 1: the parent left list predicates in `in_list`; this element
    // contributes one item to each of those lists.
    if let Some(parent_in_list) = parent_in_list.take() {
        let subject = get_parent_subject(&parent, &ctx)?;
        // Item: resolved resource, else src/href, else the extracted literal.
        let obj = if let Some(resource) = resource
            .and_then(|r| resolve_uri(r, &ctx, true).ok())
            .map(|n| Node::Ref(Arc::new(n)))
            .or_else(|| src_or_href.clone())
        {
            resource
        } else {
            Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
        };
        for rel in parent_in_list {
            push_triples_inlist(in_list_stmts, &subject, rel, &obj);
        }
        current_node = subject;
    }
    // Case 2: the element itself reports `is_inlist()`.
    else if elt.is_inlist() {
        let mut in_rel = false;

        let subject = get_parent_subject(&parent, &ctx)?;

        // `rel` alone, with no other attribute providing an object.
        if rels.is_some()
            && src_or_href.is_none()
            && predicates.is_none()
            && resource.is_none()
            && about.is_none()
        {
            if element_ref.children().count() != 0 {
                if type_ofs.is_some() {
                    // `rels.is_some()` was checked just above.
                    let Some(rels) = rels.take() else {
                        unreachable!()
                    };
                    // The typed element becomes a blank node; its children are
                    // processed before the list entries are written.
                    current_node = make_bnode();
                    handle_children(NodeContext {
                        element_ref,
                        ctx: ctx.clone(),
                        stmts,
                        current_node: current_node.clone(),
                        rels: None,
                        revs: revs.take(),
                        in_list_stmts,
                        type_ofs: type_ofs.take(),
                        parent_in_rel: parent_in_rel.take(),
                        parent_in_rev: parent_in_rev.take(),
                        parent: &parent,
                    })?;
                    for rel in rels {
                        // Object of the statement at the position returned by
                        // `find_pos_last_node_in_inlist`, unless it is rdf:nil.
                        let mut existing_rel_in_list = None;
                        if let Some(node) =
                            find_pos_last_node_in_inlist(in_list_stmts, &subject, &rel)
                                .and_then(|s| in_list_stmts.get_mut(s))
                                .filter(|p| p.object != *NODE_RDF_NIL)
                        {
                            existing_rel_in_list = Some(node.object.clone());
                        }

                        if let Some(existing_rel_in_list) = existing_rel_in_list {
                            push_triples_inlist(
                                in_list_stmts,
                                &subject,
                                rel,
                                &existing_rel_in_list,
                            );
                        } else {
                            push_triples_inlist(in_list_stmts, &subject, rel, &current_node);
                        }
                    }
                    // Children were already handled above.
                    return Ok(Some(subject));
                } else {
                    // Let the children add the list items.
                    ctx.in_list = rels.take();
                }
            } else {
                // No children: the predicates point directly at rdf:nil.
                push_triples(in_list_stmts, &subject, &rels.take(), &NODE_RDF_NIL);
            }
        } else if let Some(rels) = rels.take().filter(|r| !r.is_empty()) {
            in_rel = true;

            let obj = if let Some(resource) = resource
                .and_then(|r| resolve_uri(r, &ctx, true).ok())
                .map(|n| Node::Ref(Arc::new(n)))
                .or_else(|| src_or_href.clone())
            {
                resource
            } else {
                Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
            };
            for rel in rels {
                push_triples_inlist(in_list_stmts, &subject, rel, &obj);
            }
        }
        // `property` values: the resource is the object only when it was not
        // already consumed by `rel` above; otherwise the literal is used.
        let obj = if let (Some(resource), false) = (resource, in_rel) {
            Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?))
        } else {
            Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
        };
        if let Some(predicates) = predicates.take() {
            for predicate in predicates {
                push_triples_inlist(in_list_stmts, &subject, predicate, &obj);
            }
        }

        current_node = subject;
    }
    // Case 3: a (non-`[]`) `resource` attribute is present.
    else if let Some(resource) = resource {
        let resource = Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?));

        if !elt.has_content_or_datatype() {
            // When the parent has a pending rel/rev and `about` is set, `about`
            // is used as the object instead of the resource.
            let object = about
                .as_ref()
                .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
                .map(|a| Node::Ref(Arc::new(a.clone())))
                .unwrap_or(resource);
            current_node = object;
            // Subject: `about` when present, otherwise the parent subject.
            let subject = about
                .take()
                .map(|a| Ok(Node::Ref(Arc::new(a))))
                .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;

            push_triples(stmts, &subject, &predicates, &current_node);

            if predicates.is_some() && type_ofs.is_none() {
                current_node = subject;
            } else {
                push_triples(stmts, &subject, &rels.take(), &current_node);
                push_triples(stmts, &current_node, &revs.take(), &subject);
            }
        } else {
            // With content/datatype the predicates carry a literal whose
            // subject is `about` when present, else the resource.
            let resource = about
                .as_ref()
                .map(|a| Node::Ref(Arc::new(a.clone())))
                .unwrap_or(resource);
            push_triples(
                stmts,
                &resource,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
            current_node = resource;
        }
    }
    // Case 4: `about` resolved successfully (and no usable `resource`).
    else if let Some(about) = about {
        // True when the raw `about` attribute is the empty CURIE `[]`.
        let is_empty = elt
            .about
            .filter(|a| !a.trim().is_empty() && is_empty_curie(a))
            .is_some();
        current_node = if !is_empty {
            Node::Ref(Arc::new(about))
        } else {
            current_node
        };

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
        );

        if let Some(src_or_href) = src_or_href.take() {
            push_triples(stmts, &current_node, &rels, &src_or_href);
            push_triples(stmts, &src_or_href, &revs, &current_node);
        }
        // For `about="[]"` the children get a fresh blank node as current node.
        if is_empty {
            current_node = make_bnode();
        }
    }
    // Case 5: src/href together with content or datatype; the link is the
    // subject of the literal.
    else if src_or_href.is_some() && elt.has_content_or_datatype() {
        current_node = src_or_href.take().ok_or("no src")?;

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &extract_literal(&elt, &datatype, &ctx)?,
        );
    }
    // Case 6: src/href together with rel and/or rev.
    else if src_or_href.is_some() && (rels.is_some() || revs.is_some()) {
        let src_or_href = src_or_href.take().ok_or("no src")?;
        // Subject is the parent subject, or a blank node when there is none.
        current_node = get_parent_subject(&parent, &ctx)
            .ok()
            .unwrap_or_else(make_bnode);

        let mut has_term = false;
        let mut emit_triple = false;
        // When `property` is also present, rel values that resolved to a
        // `Node::TermIri` are removed from `rels`; the flags record whether any
        // were removed (`has_term`) and whether any were kept (`emit_triple`).
        if elt.has_property() {
            rels = rels.take().map(|rs| {
                rs.into_iter()
                    .filter(|r| {
                        let m = matches!(r, Node::Ref(r) if matches!(r.as_ref(), Node::TermIri(_)));
                        if m {
                            has_term = true;
                        } else {
                            emit_triple = true;
                        }
                        !m
                    })
                    .collect()
            });
        }

        push_triples(stmts, &current_node, &rels, &src_or_href);
        push_triples(stmts, &src_or_href, &revs, &current_node);

        if has_term {
            // Clearing src/href changes what `extract_literal` sees below.
            if emit_triple {
                elt.src.take();
                elt.href.take();
            }

            push_triples(
                stmts,
                &current_node,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
        }
        if rels.is_some() && type_ofs.is_some() {
            // The types apply to the link target, which becomes the current node.
            if let Some(type_ofs) = type_ofs.take() {
                let pred = Some(vec![NODE_NS_TYPE.clone()]);

                for to in type_ofs {
                    push_triples(stmts, &src_or_href, &pred, &to);
                }
            }
            current_node = src_or_href.clone();
            // Already emitted above; do not pass them on to the children.
            rels.take();
        }
        if revs.is_some() {
            if predicates.is_some() {
                elt.src.take();
                elt.href.take();
                push_triples(
                    stmts,
                    &current_node,
                    &predicates,
                    &extract_literal(&elt, &datatype, &ctx)?,
                );
            }
            if let Some(type_ofs) = type_ofs.take() {
                let pred = Some(vec![NODE_NS_TYPE.clone()]);

                for to in type_ofs {
                    push_triples(stmts, &src_or_href, &pred, &to);
                }
            }
        }
    }
    // Case 7: `typeof` without resource/about (and without the src/href
    // combinations handled above).
    else if type_ofs.is_some() {
        if elt.has_property()
            && !elt.has_content_or_datatype()
            && (parent_in_rel.is_some() || parent_in_rev.is_some())
        {
            // The typed node (src/href or a new blank node) is the object of the
            // predicates; a separate blank node becomes the current node.
            current_node = make_bnode();
            let node = src_or_href.take().unwrap_or_else(make_bnode);
            for to in type_ofs.take().iter().flatten() {
                push_triples(stmts, &node, &Some(vec![NODE_NS_TYPE.clone()]), to);
            }
            push_triples(stmts, &current_node, &predicates, &node);
        } else if rels.is_some() {
            // A typed blank node linked from the base through `rels`.
            current_node = make_bnode();

            for to in type_ofs.take().into_iter().flatten() {
                stmts.push(Statement {
                    subject: current_node.clone(),
                    predicate: NODE_NS_TYPE.clone(),
                    object: to,
                })
            }
            push_triples(stmts, &base, &rels.take(), &current_node);
        } else if !IS_SPECIAL_NODE_FN(&datatype) {
            // True when the selector matches no descendant other than those
            // whose datatype is flagged by `IS_SPECIAL_NODE_FN`.
            // NOTE(review): the name suggests the opposite of what is computed
            // (count == 0) - confirm the intended meaning.
            let child_with_rdfa_tag = element_ref
                .select(&Selector::parse(
                    "[href], [src], [resource], [property], [about]",
                )?)
                .filter(|e| {
                    RdfaElement::new(e)
                        .ok()
                        .and_then(|e2| e2.datatype)
                        .and_then(|dt| match resolve_uri(dt, &ctx, false).ok().map(Box::new) {
                            v @ Some(_) if IS_SPECIAL_NODE_FN(&v) => v,
                            _ => None,
                        })
                        .is_none()
                })
                .count()
                == 0;
            // Typed node: src/href if any, else the base for body/head, the
            // first call, or when the check above holds; otherwise a blank node.
            current_node = if let Some(src_or_href) = src_or_href.take() {
                src_or_href
            } else if elt.name == "body"
                || elt.name == "head"
                || child_with_rdfa_tag
                || parent.is_none()
            {
                base.clone()
            } else {
                make_bnode()
            };

            let subject = get_parent_subject(&parent, &ctx)
                .ok()
                .unwrap_or_else(make_bnode);

            push_triples(stmts, &subject, &predicates, &current_node);
        } else {
            push_triples(
                stmts,
                &current_node,
                &predicates,
                &extract_literal(&elt, &datatype, &ctx)?,
            );
        }
    }
    // Case 8: none of the above.
    else {
        // src/href is the current node only when the parent has a pending
        // rel/rev; otherwise the parent subject is used.
        current_node = src_or_href
            .take()
            .filter(|_| parent_in_rel.is_some() || parent_in_rev.is_some())
            .map(Ok)
            .unwrap_or_else(|| get_parent_subject(&parent, &ctx))?;

        push_triples(
            stmts,
            &current_node,
            &predicates,
            &Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
        );
    }

    // Whatever was not consumed above (`take()`n) is forwarded to the children.
    handle_children(NodeContext {
        element_ref,
        ctx,
        stmts,
        current_node,
        rels,
        revs,
        in_list_stmts,
        type_ofs,
        parent_in_rel,
        parent_in_rev,
        parent: &parent,
    })
}
554fn handle_children<'a>(
555 NodeContext {
556 element_ref,
557 mut ctx,
558 stmts,
559 current_node,
560 rels,
561 revs,
562 in_list_stmts,
563 type_ofs,
564 mut parent_in_rel,
565 mut parent_in_rev,
566 parent,
567 }: NodeContext<'a, '_>,
568) -> Result<Option<Node<'a>>, Box<dyn Error>> {
569 if let Some(type_ofs) = type_ofs {
570 for type_of in type_ofs {
571 stmts.push(Statement {
572 subject: current_node.clone(),
573 predicate: NODE_NS_TYPE.clone(),
574 object: type_of,
575 })
576 }
577 }
578
579 if parent_in_rel.is_some() || parent_in_rev.is_some() {
580 let parent = get_parent_subject(parent, &ctx)
581 .ok()
582 .ok_or("in_rel: no parent node")?;
583 push_triples(stmts, &parent, &parent_in_rel.take(), ¤t_node);
584 push_triples(stmts, ¤t_node, &parent_in_rev.take(), &parent);
585 }
586 ctx.current_node = Some(current_node.clone());
587 ctx.in_rel = rels.clone();
588 ctx.in_rev = revs.clone();
589 for child in get_children(element_ref)? {
590 if let Some(c) = ElementRef::wrap(child) {
591 let triples_completed = (ctx.in_rel.is_some() || ctx.in_rev.is_some())
593 && (c.attr("property").is_some()
594 || c.attr("rel").is_some()
595 || c.attr("rev").is_some())
596 && (c.attr("about").is_none() && c.attr("typeof").is_none());
597
598 if triples_completed {
599 let b_node = make_bnode();
601 push_triples(stmts, ¤t_node, &ctx.in_rel.take(), &b_node);
602 push_triples(stmts, &b_node, &ctx.in_rev.take(), ¤t_node);
603
604 ctx.current_node = Some(b_node);
605 }
606 if c.attr("about").is_some() || c.attr("typeof").is_some() {
608 ctx.in_rel = rels.clone();
609 ctx.in_rev = revs.clone();
610 ctx.current_node = Some(current_node.clone());
611 }
612 let child_ctx = Context {
613 base: ctx.base,
614 lang: ctx.lang,
615 empty_ref_node_substitute: ctx.empty_ref_node_substitute,
616 ..Default::default()
617 };
618
619 let node = traverse_element(&c, Some(&ctx), child_ctx, stmts, in_list_stmts)?;
620 if node != ctx.current_node {
621 stmts.append(in_list_stmts);
622 }
623 }
624 }
625 Ok(ctx.current_node.clone())
626}
627fn extract_literal<'a>(
628 rdfa_el: &RdfaElement<'a, '_>,
629 datatype: &Option<Box<Node<'a>>>,
630 ctx: &Context<'a>,
631) -> Result<Node<'a>, &'static str> {
632 let plain_datatype = datatype
633 .as_ref()
634 .filter(|dt| dt.as_ref() == &*NODE_RDF_PLAIN_LITERAL)
635 .is_some();
636
637 let lang = ctx.lang.filter(|s| datatype.is_none() && !s.is_empty());
638 if let Some(value) = rdfa_el.src_or_href().filter(|_| {
639 !rdfa_el.has_about() && !rdfa_el.has_property() || !rdfa_el.has_content_or_datatype()
640 }) {
641 resolve_uri(value, ctx, true)
642 } else if let Some(content) = rdfa_el.content {
643 Ok(Node::Literal(Literal {
644 datatype: datatype.clone(),
645 value: Cow::Borrowed(content),
646 lang,
647 }))
648 } else if !plain_datatype && IS_SPECIAL_NODE_FN(datatype) {
649 Ok(Node::Literal(Literal {
650 value: Cow::Owned(rdfa_el.inner_html()),
651 datatype: datatype.clone(),
652 lang: None,
653 }))
654 } else if let Some(content) = rdfa_el.get_time() {
655 Ok(Node::Literal(Literal {
656 datatype: datatype
657 .clone()
658 .or_else(|| DataTypeFromPattern::date_time_from_pattern(content).map(Box::new)),
659 value: Cow::Borrowed(content),
660 lang: None,
661 }))
662 } else {
663 let datatype = if plain_datatype {
664 None
665 } else {
666 datatype.clone()
667 };
668 let lang = if plain_datatype { ctx.lang } else { lang };
669 let texts = rdfa_el.texts();
670 let text = if texts.is_empty() {
671 Cow::Borrowed("")
672 } else {
673 let text = texts
674 .iter()
675 .map(|t| t.to_string())
676 .collect::<Vec<_>>()
677 .join("");
678 Cow::Owned(text)
679 };
680 Ok(Node::Literal(Literal {
681 datatype,
682 value: text,
683 lang,
684 }))
685 }
686}
687fn get_parent_subject<'a>(
688 parent: &Option<&Context<'a>>,
689 ctx: &Context<'a>,
690) -> Result<Node<'a>, Box<dyn Error>> {
691 parent
692 .and_then(|p| p.current_node.clone())
693 .or_else(|| {
694 if parent.is_none() {
695 resolve_uri(ctx.base, ctx, true).ok()
696 } else {
697 None
698 }
699 })
700 .ok_or("no parent".into())
701}
702fn resolve_uri<'a>(
703 uri: &'a str,
704 ctx: &Context<'a>,
705 is_resource: bool,
706) -> Result<Node<'a>, &'static str> {
707 let uri = uri.trim();
708
709 let iri = Url::parse(uri);
710 let trailing_white_space = if ctx.base.ends_with('/')
711 || ctx.base.ends_with('#')
712 || uri.starts_with('/')
713 || uri.starts_with('#')
714 {
715 ""
716 } else {
717 "/"
718 };
719 match iri {
720 Ok(iri) if !iri.cannot_be_a_base() || iri.is_special() => {
721 if uri.contains(|c: char| c.is_whitespace() || c.is_control()) {
723 let mut new_uri = String::with_capacity(uri.len() * 125 / 100);
724 for c in uri.chars() {
725 match c {
726 '\n' => new_uri.push_str("%0A"),
727 '\0' => new_uri.push_str("%00"),
728 '\t' => new_uri.push_str("%09"),
729 '\r' => new_uri.push_str("%0D"),
730 ' ' => new_uri.push_str("%20"),
731 c => new_uri.push(c),
732 }
733 }
734 Ok(Node::Iri(Cow::Owned(new_uri)))
735 } else {
736 Ok(Node::Iri(Cow::Borrowed(uri)))
737 }
738 }
739
740 Ok(iri) => {
742 if uri.starts_with("mail:") || uri.starts_with("tel:") {
743 Ok(Node::Iri(Cow::Borrowed(uri)))
744 } else if let Some((prefix, value)) = ctx
745 .prefixes
746 .iter()
747 .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
748 {
749 let iri = format!(
750 "{value}{}",
751 &uri.replacen(':', "", 1).trim()[prefix.len()..]
752 );
753 Ok(Node::Iri(Cow::Owned(iri)))
754 } else if let Some((prefix, value)) = COMMON_PREFIXES
755 .iter()
756 .find(|(k, _)| k.eq_ignore_ascii_case(iri.scheme()))
757 {
758 let iri = format!(
759 "{value}{}",
760 &uri.replacen(':', "", 1).trim()[prefix.len()..]
761 );
762 Ok(Node::Iri(Cow::Owned(iri)))
763 } else {
764 Ok(Node::Iri(Cow::Owned(uri.to_string())))
765 }
766 }
767 Err(url::ParseError::RelativeUrlWithoutBase) => {
768 if let Ok((prefix, reference)) = parse_safe_curie(uri) {
769 let reference = reference.trim();
770 let prefix = prefix.trim();
771 if prefix == "_" {
772 let id = if reference.is_empty() {
773 ctx.empty_ref_node_substitute
774 } else {
775 reference
776 };
777 return Ok(Node::RefBlank(id));
778 } else if prefix.is_empty() && !reference.is_empty() {
779 return Ok(Node::TermIri(Cow::Owned(
780 [COMMON_PREFIXES[""], reference].join(""),
781 )));
782 } else if let Some(prefix) = ctx
783 .prefixes
784 .get(prefix)
785 .or_else(|| COMMON_PREFIXES.get(prefix))
786 {
787 let reference = if reference.trim().is_empty() {
788 reference.trim()
789 } else {
790 reference
791 };
792 return Ok(Node::Iri(Cow::Owned([prefix, reference].join(""))));
793 }
794 }
795 if is_resource || uri.starts_with('#') || uri.starts_with('/') {
796 let uri = if uri.starts_with("/") && ctx.base.ends_with("/") {
797 &uri[1..]
798 } else {
799 uri
800 };
801 Ok(Node::TermIri(Cow::Owned(
802 [ctx.base, trailing_white_space, uri].join(""),
803 )))
804 } else if let Some(vocab) = ctx.vocab {
805 Ok(Node::TermIri(Cow::Owned([vocab, uri].join(""))))
806 } else if RESERVED_KEYWORDS
807 .iter()
808 .any(|w| uri.eq_ignore_ascii_case(w))
809 {
810 Ok(Node::TermIri(Cow::Borrowed(
811 COMMON_PREFIXES[uri.to_lowercase().as_str()],
812 )))
813 } else {
814 debug!("could not determine base/vocab {:?}", ctx);
815 Err("could not determine uri")
817 }
818 }
819 Err(e) => {
820 eprintln!("invalid uri {uri}. error: {e}");
821 Err("could not resolve uri")
822 }
823 }
824}
825
/// Splits a CURIE or safe CURIE (`[prefix:reference]`) into
/// `(prefix, reference)` at the first `:`.
///
/// The input is trimmed first. Brackets are only removed when the value
/// starts with `[`.
///
/// # Errors
///
/// * `"invalid SafeCurie"` when the value starts with `[` but does not end
///   with `]`.
/// * `"not a curie"` when there is no `:` to split on.
fn parse_safe_curie(s: &str) -> Result<(&str, &str), &'static str> {
    let trimmed = s.trim();
    let curie = match trimmed.strip_prefix('[') {
        Some(inner) => inner.strip_suffix(']').ok_or("invalid SafeCurie")?,
        None => trimmed,
    };
    curie.split_once(':').ok_or("not a curie")
}
836
837fn parse_prefixes(s: &str) -> HashMap<&str, &str> {
838 s.split_whitespace()
839 .map(|s| s.trim())
840 .collect::<Vec<_>>()
841 .chunks_exact(2)
842 .map(|c| (c[0], c[1]))
843 .filter_map(|(s, p)| {
844 if let Ok((s, _)) = parse_safe_curie(s) {
845 Some((s, p))
846 } else {
847 error!("fixme! couldn't parse curie for {s}, {p}");
848 None
849 }
850 })
851 .collect()
852}
853
854fn parse_property_or_type_of<'a>(
855 s: &'a str,
856 ctx: &Context<'a>,
857 allow_b_node: bool,
858) -> Vec<Node<'a>> {
859 s.split_whitespace()
860 .filter_map(|uri| resolve_uri(uri, ctx, false).ok())
861 .filter(|node| allow_b_node || !matches!(node, Node::Blank(_) | Node::RefBlank(_)))
862 .map(|n| Node::Ref(Arc::new(n)))
863 .collect()
864}
865
866fn push_triples_inlist<'a>(
867 stmts: &mut Vec<Statement<'a>>,
868 subject: &Node<'a>,
869 predicate: Node<'a>,
870 obj: &Node<'a>,
871) {
872 let b_node = make_bnode();
873 stmts.push(Statement {
874 subject: b_node.clone(),
875 predicate: NODE_RDF_FIRST.clone(),
876 object: obj.clone(),
877 });
878
879 if let Some(node) =
880 find_pos_last_node_in_inlist(stmts, subject, &predicate).and_then(|pos| stmts.get_mut(pos))
881 {
882 node.object = b_node.clone();
883 } else {
884 stmts.push(Statement {
886 subject: subject.clone(),
887 predicate,
888 object: b_node.clone(),
889 });
890 }
891 stmts.push(Statement {
892 subject: b_node,
893 predicate: NODE_RDF_REST.clone(),
894 object: NODE_RDF_NIL.clone(),
895 });
896}
897fn find_pos_last_node_in_inlist<'a>(
898 stmts: &Vec<Statement<'a>>,
899 root_subject: &Node<'a>,
900 predicate: &Node<'a>,
901) -> Option<usize> {
902 fn find_res_nil<'a>(stmts: &Vec<Statement<'a>>, subject: &Node<'a>) -> Option<usize> {
903 let node = stmts
904 .iter()
905 .enumerate()
906 .find(|(_, stmt)| &stmt.subject == subject && stmt.predicate == *NODE_RDF_REST);
907
908 if let Some((pos, stmt)) = node {
909 if stmt.object == *NODE_RDF_NIL {
910 Some(pos)
911 } else {
912 find_res_nil(stmts, &stmt.object)
913 }
914 } else {
915 None
916 }
917 }
918 let root = stmts
919 .iter()
920 .find(|stmt| &stmt.subject == root_subject && &stmt.predicate == predicate);
921 if let Some(Statement { object, .. }) = root {
922 find_res_nil(stmts, object)
923 } else {
924 None
925 }
926}
927
928#[inline]
930fn get_children<'a>(
931 element_ref: &ElementRef<'a>,
932) -> Result<Vec<ego_tree::NodeRef<'a, scraper::Node>>, &'static str> {
933 let mut res = vec![];
934 for c in element_ref.children() {
935 if c.value()
936 .as_element()
937 .filter(|e| e.attrs().count() == 0)
938 .is_some()
939 {
940 let child_ref = ElementRef::wrap(c).ok_or("not an element ref")?;
941 res.append(&mut get_children(&child_ref)?);
942 } else {
943 res.push(c);
944 }
945 }
946
947 Ok(res)
948}
949
/// Creates a new `Node::Blank` whose identifier is the value returned by
/// `get_uuid`.
#[inline]
fn make_bnode<'a>() -> Node<'a> {
    Node::Blank(get_uuid())
}
954
955#[inline]
956fn copy_pattern(triples: Vec<Statement<'_>>) -> Result<Vec<Statement<'_>>, Box<dyn Error>> {
957 let (pattern_type, pattern): (Vec<Statement>, Vec<Statement>) = triples
958 .into_iter()
959 .partition(|stmt| stmt.object == *NODE_RDFA_PATTERN_TYPE);
960
961 let (pattern_predicate, pattern): (Vec<Statement>, Vec<Statement>) = pattern
962 .into_iter()
963 .partition(|stmt| pattern_type.iter().any(|s| s.subject == stmt.subject));
964
965 let (pattern_subject, mut triples): (Vec<Statement>, Vec<Statement>) = pattern
966 .into_iter()
967 .partition(|stmt| pattern_predicate.iter().any(|s| s.subject == stmt.object));
968
969 let (mut unreferenced_pattern_predicate, pattern_predicate): (Vec<Statement>, Vec<Statement>) =
971 pattern_predicate
972 .into_iter()
973 .partition(|stmt| pattern_subject.iter().all(|s| s.object != stmt.subject));
974
975 let (mut unreferenced_pattern_type, _): (Vec<Statement>, Vec<Statement>) =
976 pattern_type.into_iter().partition(|stmt| {
977 unreferenced_pattern_predicate
978 .iter()
979 .any(|s| s.subject == stmt.subject)
980 });
981 triples.append(&mut unreferenced_pattern_predicate);
982 triples.append(&mut unreferenced_pattern_type);
983
984 for Statement {
985 subject, object, ..
986 } in pattern_subject
987 {
988 for Statement {
989 predicate,
990 object: obj,
991 ..
992 } in pattern_predicate
993 .iter()
994 .filter(|stmt| object == stmt.subject)
995 {
996 triples.push(Statement {
997 subject: subject.clone(),
998 predicate: predicate.clone(),
999 object: obj.clone(),
1000 })
1001 }
1002 }
1003
1004 Ok(triples)
1005}
1006
1007#[inline]
1008fn push_triples<'a>(
1009 stmts: &mut Vec<Statement<'a>>,
1010 subject: &Node<'a>,
1011 predicates: &Option<Vec<Node<'a>>>,
1012 object: &Node<'a>,
1013) {
1014 if let Some(predicate) = predicates {
1015 for predicate in predicate {
1016 stmts.push(Statement {
1017 subject: subject.clone(),
1018 predicate: predicate.clone(),
1019 object: object.clone(),
1020 });
1021 }
1022 }
1023}