1use std::collections::{BTreeMap, HashMap};
32
33use super::{
34 hash::sha256_hex,
35 nquads::{quad_to_nquad, quad_to_nquad_with_placeholders},
36 types::RdfQuad,
37};
38
39#[derive(Debug, Clone)]
47pub struct IdentifierIssuer {
48 prefix: String,
49 counter: usize,
50 issued: Vec<(String, String)>,
53 issued_map: HashMap<String, String>,
55}
56
57impl IdentifierIssuer {
58 pub fn new(prefix: impl Into<String>) -> Self {
60 IdentifierIssuer {
61 prefix: prefix.into(),
62 counter: 0,
63 issued: Vec::new(),
64 issued_map: HashMap::new(),
65 }
66 }
67
68 pub fn issue(&mut self, original: &str) -> String {
72 if let Some(existing) = self.issued_map.get(original) {
73 return existing.clone();
74 }
75 let canonical = format!("{}{}", self.prefix, self.counter);
76 self.counter += 1;
77 self.issued_map
78 .insert(original.to_string(), canonical.clone());
79 self.issued.push((original.to_string(), canonical.clone()));
80 canonical
81 }
82
83 pub fn get(&self, original: &str) -> Option<&str> {
86 self.issued_map.get(original).map(String::as_str)
87 }
88
89 pub fn has_issued(&self, original: &str) -> bool {
92 self.issued_map.contains_key(original)
93 }
94
95 pub fn issued_pairs(&self) -> impl Iterator<Item = (&str, &str)> {
97 self.issued.iter().map(|(o, c)| (o.as_str(), c.as_str()))
98 }
99
100 pub fn as_map(&self) -> &HashMap<String, String> {
102 &self.issued_map
103 }
104}
105
106pub struct Canonicalizer {
115 issuer: IdentifierIssuer,
117 bnode_to_quads: HashMap<String, Vec<usize>>,
119 quads: Vec<RdfQuad>,
121 hash_to_bnodes: BTreeMap<String, Vec<String>>,
123}
124
125impl Canonicalizer {
126 pub fn new() -> Self {
128 Canonicalizer {
129 issuer: IdentifierIssuer::new("_:c14n"),
130 bnode_to_quads: HashMap::new(),
131 quads: Vec::new(),
132 hash_to_bnodes: BTreeMap::new(),
133 }
134 }
135
136 fn collect_blank_nodes(&mut self) {
140 for (idx, quad) in self.quads.iter().enumerate() {
141 for bnode_id in quad.blank_nodes() {
142 self.bnode_to_quads
143 .entry(bnode_id.to_string())
144 .or_default()
145 .push(idx);
146 }
147 }
148 }
149
150 fn hash_first_degree_quads(&self, bnode: &str, issued: &IdentifierIssuer) -> String {
157 let quad_indices = match self.bnode_to_quads.get(bnode) {
158 Some(v) => v,
159 None => return sha256_hex(""),
160 };
161
162 let mut nquad_lines: Vec<String> = quad_indices
164 .iter()
165 .map(|&idx| quad_to_nquad_with_placeholders(&self.quads[idx], bnode, issued.as_map()))
166 .collect();
167
168 nquad_lines.sort();
169 sha256_hex(&nquad_lines.join(""))
170 }
171
172 fn issue_simple_canonical_ids(&mut self) {
177 let mut simple: Vec<(String, String)> = self
179 .hash_to_bnodes
180 .iter()
181 .filter(|(_, bnodes)| bnodes.len() == 1)
182 .map(|(h, bnodes)| (h.clone(), bnodes[0].clone()))
183 .collect();
184
185 simple.sort_by(|a, b| a.0.cmp(&b.0));
188
189 for (_, bnode) in simple {
190 self.issuer.issue(&bnode);
191 }
192 }
193
194 fn hash_n_degree_quads(
209 &self,
210 bnode: &str,
211 issuer: &IdentifierIssuer,
212 depth: usize,
213 ) -> (String, IdentifierIssuer) {
214 const MAX_DEPTH: usize = 64;
216
217 let mut hash_to_related: BTreeMap<String, Vec<String>> = BTreeMap::new();
220
221 let quad_indices = match self.bnode_to_quads.get(bnode) {
222 Some(v) => v.clone(),
223 None => return (sha256_hex(""), issuer.clone()),
224 };
225
226 for quad_idx in &quad_indices {
227 let quad = &self.quads[*quad_idx];
228 for related_id in quad.blank_nodes() {
230 if related_id == bnode {
231 continue;
232 }
233 let related_hash = self.hash_first_degree_quads(related_id, issuer);
235 hash_to_related
236 .entry(related_hash)
237 .or_default()
238 .push(related_id.to_string());
239 }
240 }
241
242 let mut combined = String::new();
244
245 {
247 let mut nquad_lines: Vec<String> = quad_indices
248 .iter()
249 .map(|&idx| quad_to_nquad(&self.quads[idx], issuer.as_map()))
250 .collect();
251 nquad_lines.sort();
252 combined.push_str(&nquad_lines.join(""));
253 }
254
255 let mut chosen_issuer = issuer.clone();
256
257 for (group_hash, related_group) in &hash_to_related {
260 combined.push_str(group_hash);
261
262 let mut best_path: Option<(String, IdentifierIssuer)> = None;
263
264 let permutations = permutations_of(related_group);
266
267 for perm in permutations {
268 let mut path_issuer = issuer.clone();
269 let mut path = String::new();
270
271 for related_bnode in &perm {
272 if let Some(canonical) = issuer.get(related_bnode) {
274 path.push_str(canonical);
275 } else {
276 if depth < MAX_DEPTH {
278 let temp_id = path_issuer.issue(related_bnode);
279 path.push_str(&temp_id);
280 let (recursive_hash, _) =
281 self.hash_n_degree_quads(related_bnode, &path_issuer, depth + 1);
282 path.push_str(&recursive_hash);
283 } else {
284 let fd_hash = self.hash_first_degree_quads(related_bnode, &path_issuer);
286 path.push_str(&fd_hash);
287 }
288 }
289 }
290
291 let candidate_hash = sha256_hex(&path);
292
293 if let Some((ref best_hash, _)) = best_path {
294 if candidate_hash < *best_hash {
295 best_path = Some((candidate_hash, path_issuer));
296 }
297 } else {
298 best_path = Some((candidate_hash, path_issuer));
299 }
300 }
301
302 if let Some((best_hash, best_issuer)) = best_path {
303 combined.push_str(&best_hash);
304 chosen_issuer = best_issuer;
305 }
306 }
307
308 (sha256_hex(&combined), chosen_issuer)
309 }
310
311 fn issue_ndegree_canonical_ids(&mut self) {
314 let unresolved_groups: Vec<(String, Vec<String>)> = self
317 .hash_to_bnodes
318 .iter()
319 .filter(|(_, bnodes)| bnodes.len() > 1)
320 .map(|(h, bnodes)| (h.clone(), bnodes.clone()))
321 .collect();
322
323 for (_, group) in unresolved_groups {
324 let mut nd_hashes: Vec<(String, String, IdentifierIssuer)> = group
326 .iter()
327 .map(|bnode| {
328 let (nd_hash, temp_issuer) =
329 self.hash_n_degree_quads(bnode, &self.issuer.clone(), 0);
330 (nd_hash, bnode.clone(), temp_issuer)
331 })
332 .collect();
333
334 nd_hashes.sort_by(|a, b| a.0.cmp(&b.0));
336
337 for (_, bnode, _temp_issuer) in &nd_hashes {
338 if !self.issuer.has_issued(bnode) {
339 self.issuer.issue(bnode);
340 }
341 }
342 }
343 }
344
345 fn emit_canonical_nquads(&self) -> String {
350 let mapping = self.issuer.as_map();
351 let mut lines: Vec<String> = self
352 .quads
353 .iter()
354 .map(|q| quad_to_nquad(q, mapping))
355 .collect();
356 lines.sort();
357 lines.join("\n")
358 }
359
360 pub fn canonicalize(quads: &[RdfQuad]) -> String {
367 let mut c = Canonicalizer::new();
368 c.quads = quads.to_vec();
369
370 if c.quads.is_empty() {
371 return String::new();
372 }
373
374 c.collect_blank_nodes();
376
377 let all_bnodes: Vec<String> = c.bnode_to_quads.keys().cloned().collect();
379 let temp_issuer = IdentifierIssuer::new("_:c14n");
380 for bnode in &all_bnodes {
381 let hash = c.hash_first_degree_quads(bnode, &temp_issuer);
382 c.hash_to_bnodes
383 .entry(hash)
384 .or_default()
385 .push(bnode.clone());
386 }
387
388 for group in c.hash_to_bnodes.values_mut() {
390 group.sort();
391 }
392
393 c.issue_simple_canonical_ids();
395
396 c.issue_ndegree_canonical_ids();
398
399 c.emit_canonical_nquads()
401 }
402}
403
404impl Default for Canonicalizer {
405 fn default() -> Self {
406 Self::new()
407 }
408}
409
410fn permutations_of(items: &[String]) -> Vec<Vec<String>> {
418 let n = items.len();
419 if n == 0 {
420 return vec![vec![]];
421 }
422 if n == 1 {
423 return vec![items.to_vec()];
424 }
425
426 let mut result = Vec::new();
427 let mut arr = items.to_vec();
428 heap_permute(&mut arr, n, &mut result);
429 result
430}
431
432fn heap_permute(arr: &mut Vec<String>, k: usize, out: &mut Vec<Vec<String>>) {
433 if k == 1 {
434 out.push(arr.clone());
435 return;
436 }
437 for i in 0..k {
438 heap_permute(arr, k - 1, out);
439 if k % 2 == 0 {
440 arr.swap(i, k - 1);
441 } else {
442 arr.swap(0, k - 1);
443 }
444 }
445}
446
447pub fn canonicalize(quads: &[RdfQuad]) -> String {
466 Canonicalizer::canonicalize(quads)
467}
468
469#[cfg(test)]
472mod tests {
473 use super::*;
474 use crate::canon::types::{QuadTerm, RdfQuad};
475 use std::collections::HashSet;
476
477 fn iri_quad(s: &str, p: &str, o: &str) -> RdfQuad {
480 RdfQuad::new(QuadTerm::iri(s), QuadTerm::iri(p), QuadTerm::iri(o))
481 }
482
483 fn blank_iri_quad(bnode: &str, p: &str, o: &str) -> RdfQuad {
484 RdfQuad::new(QuadTerm::blank(bnode), QuadTerm::iri(p), QuadTerm::iri(o))
485 }
486
487 #[test]
490 fn test_empty_graph_yields_empty_output() {
491 let result = canonicalize(&[]);
492 assert!(result.is_empty(), "expected empty string for empty input");
493 }
494
495 #[test]
498 fn test_single_iri_quad() {
499 let quads = vec![iri_quad(
500 "http://example.org/s",
501 "http://example.org/p",
502 "http://example.org/o",
503 )];
504 let result = canonicalize(&quads);
505 assert_eq!(
506 result,
507 "<http://example.org/s> <http://example.org/p> <http://example.org/o> ."
508 );
509 }
510
511 #[test]
514 fn test_literal_with_datatype() {
515 let quads = vec![RdfQuad::new(
516 QuadTerm::iri("http://example.org/s"),
517 QuadTerm::iri("http://example.org/p"),
518 QuadTerm::typed_literal("42", "http://www.w3.org/2001/XMLSchema#integer"),
519 )];
520 let result = canonicalize(&quads);
521 assert!(
522 result.contains("\"42\"^^<http://www.w3.org/2001/XMLSchema#integer>"),
523 "expected typed literal; got: {result}"
524 );
525 }
526
527 #[test]
530 fn test_literal_with_language_tag() {
531 let quads = vec![RdfQuad::new(
532 QuadTerm::iri("http://example.org/s"),
533 QuadTerm::iri("http://example.org/p"),
534 QuadTerm::lang_literal("Hello", "en"),
535 )];
536 let result = canonicalize(&quads);
537 assert!(
538 result.contains("\"Hello\"@en"),
539 "expected language-tagged literal; got: {result}"
540 );
541 }
542
543 #[test]
546 fn test_single_blank_node_gets_c14n0() {
547 let quads = vec![blank_iri_quad(
548 "b0",
549 "http://example.org/p",
550 "http://example.org/o",
551 )];
552 let result = canonicalize(&quads);
553 assert!(
554 result.contains("_:c14n0"),
555 "expected _:c14n0; got: {result}"
556 );
557 assert!(
558 !result.contains("_:b0"),
559 "original blank node id must not appear in output"
560 );
561 }
562
563 #[test]
566 fn test_two_blank_nodes_get_sequential_ids() {
567 let quads = vec![
568 blank_iri_quad("b0", "http://example.org/type", "http://example.org/A"),
569 blank_iri_quad("b1", "http://example.org/type", "http://example.org/B"),
570 ];
571 let result = canonicalize(&quads);
572 assert!(
573 result.contains("_:c14n0"),
574 "expected _:c14n0; got: {result}"
575 );
576 assert!(
577 result.contains("_:c14n1"),
578 "expected _:c14n1; got: {result}"
579 );
580 }
581
582 #[test]
585 fn test_same_blank_node_across_quads() {
586 let quads = vec![
587 blank_iri_quad(
588 "alice",
589 "http://example.org/name",
590 "http://example.org/Alice",
591 ),
592 blank_iri_quad(
593 "alice",
594 "http://example.org/knows",
595 "http://example.org/Bob",
596 ),
597 ];
598 let result = canonicalize(&quads);
599 let c14n_count = result.matches("_:c14n0").count();
601 assert_eq!(
602 c14n_count, 2,
603 "_:c14n0 should appear once per quad; got {c14n_count} occurrences:\n{result}"
604 );
605 assert!(
607 !result.contains("_:c14n1"),
608 "only one blank node exists; _:c14n1 must not appear; got:\n{result}"
609 );
610 }
611
612 #[test]
615 fn test_named_graph_quad() {
616 let quads = vec![RdfQuad::new_in_graph(
617 QuadTerm::iri("http://example.org/s"),
618 QuadTerm::iri("http://example.org/p"),
619 QuadTerm::iri("http://example.org/o"),
620 QuadTerm::iri("http://example.org/graph1"),
621 )];
622 let result = canonicalize(&quads);
623 assert!(
624 result.contains("<http://example.org/graph1>"),
625 "named graph must appear in output; got: {result}"
626 );
627 assert!(
628 result.ends_with('.') || result.ends_with(". "),
629 "N-Quads line must end with period; got: {result}"
630 );
631 }
632
633 #[test]
636 fn test_output_is_sorted_lexicographically() {
637 let quads = vec![
638 iri_quad(
639 "http://z.example.org/s",
640 "http://example.org/p",
641 "http://example.org/o",
642 ),
643 iri_quad(
644 "http://a.example.org/s",
645 "http://example.org/p",
646 "http://example.org/o",
647 ),
648 iri_quad(
649 "http://m.example.org/s",
650 "http://example.org/p",
651 "http://example.org/o",
652 ),
653 ];
654 let result = canonicalize(&quads);
655 let lines: Vec<&str> = result.lines().collect();
656 assert_eq!(lines.len(), 3);
657 assert!(lines[0] < lines[1], "line 0 must be < line 1");
659 assert!(lines[1] < lines[2], "line 1 must be < line 2");
660 assert!(
661 lines[0].contains("/a.example.org"),
662 "first line must be /a; got: {}",
663 lines[0]
664 );
665 assert!(
666 lines[2].contains("/z.example.org"),
667 "last line must be /z; got: {}",
668 lines[2]
669 );
670 }
671
672 #[test]
675 fn test_canonicalization_is_deterministic() {
676 let quads = vec![
677 blank_iri_quad("x", "http://example.org/p", "http://example.org/o"),
678 blank_iri_quad("y", "http://example.org/q", "http://example.org/o"),
679 RdfQuad::new(
680 QuadTerm::blank("x"),
681 QuadTerm::iri("http://example.org/knows"),
682 QuadTerm::blank("y"),
683 ),
684 ];
685 let first = canonicalize(&quads);
686 let second = canonicalize(&quads);
687 assert_eq!(
688 first, second,
689 "canonicalization must be deterministic across calls"
690 );
691 }
692
693 #[test]
696 fn test_blank_node_renamed_from_b0() {
697 let quads = vec![blank_iri_quad(
698 "b0",
699 "http://example.org/type",
700 "http://example.org/Thing",
701 )];
702 let result = canonicalize(&quads);
703 assert!(
704 result.contains("_:c14n0"),
705 "blank node must be renamed to _:c14n0; got: {result}"
706 );
707 assert!(
708 !result.contains("_:b0"),
709 "original _:b0 must NOT appear in output; got: {result}"
710 );
711 }
712
713 #[test]
716 fn test_three_blank_nodes_unique_ids() {
717 let quads = vec![
719 RdfQuad::new(
720 QuadTerm::blank("x"),
721 QuadTerm::iri("http://example.org/next"),
722 QuadTerm::blank("y"),
723 ),
724 RdfQuad::new(
725 QuadTerm::blank("y"),
726 QuadTerm::iri("http://example.org/next"),
727 QuadTerm::blank("z"),
728 ),
729 blank_iri_quad("z", "http://example.org/type", "http://example.org/End"),
730 ];
731 let result = canonicalize(&quads);
732
733 let mut ids: HashSet<String> = HashSet::new();
734 for i in 0..3 {
735 let id = format!("_:c14n{}", i);
736 assert!(
737 result.contains(&id),
738 "expected {id} in output; got:\n{result}"
739 );
740 ids.insert(id);
741 }
742 assert_eq!(ids.len(), 3, "all three canonical IDs must be distinct");
743 }
744
745 #[test]
748 fn test_blank_node_in_object_position() {
749 let quads = vec![RdfQuad::new(
750 QuadTerm::iri("http://example.org/s"),
751 QuadTerm::iri("http://example.org/p"),
752 QuadTerm::blank("obj_node"),
753 )];
754 let result = canonicalize(&quads);
755 assert!(
756 result.contains("_:c14n0"),
757 "blank object must be canonicalized; got: {result}"
758 );
759 }
760
761 #[test]
764 fn test_blank_node_as_named_graph() {
765 let quads = vec![RdfQuad::new_in_graph(
766 QuadTerm::iri("http://example.org/s"),
767 QuadTerm::iri("http://example.org/p"),
768 QuadTerm::iri("http://example.org/o"),
769 QuadTerm::blank("g"),
770 )];
771 let result = canonicalize(&quads);
772 assert!(
773 result.contains("_:c14n0"),
774 "blank graph name must be canonicalized; got: {result}"
775 );
776 }
777
778 #[test]
781 fn test_identifier_issuer_sequential() {
782 let mut issuer = IdentifierIssuer::new("_:c14n");
783 let id0 = issuer.issue("b0");
784 let id1 = issuer.issue("b1");
785 let id0_again = issuer.issue("b0");
786 assert_eq!(id0, "_:c14n0");
787 assert_eq!(id1, "_:c14n1");
788 assert_eq!(id0_again, "_:c14n0", "same node must get same canonical id");
789 }
790
791 #[test]
794 fn test_two_calls_are_independent() {
795 let quads_a = vec![blank_iri_quad(
796 "x",
797 "http://example.org/p",
798 "http://example.org/A",
799 )];
800 let quads_b = vec![blank_iri_quad(
801 "x",
802 "http://example.org/p",
803 "http://example.org/A",
804 )];
805 assert_eq!(
806 canonicalize(&quads_a),
807 canonicalize(&quads_b),
808 "identical inputs must produce identical outputs"
809 );
810 }
811}