use std::collections::{BTreeMap, HashMap};
use super::{
hash::sha256_hex,
nquads::{quad_to_nquad, quad_to_nquad_with_placeholders},
types::RdfQuad,
};
#[derive(Debug, Clone)]
pub struct IdentifierIssuer {
prefix: String,
counter: usize,
issued: Vec<(String, String)>,
issued_map: HashMap<String, String>,
}
impl IdentifierIssuer {
pub fn new(prefix: impl Into<String>) -> Self {
IdentifierIssuer {
prefix: prefix.into(),
counter: 0,
issued: Vec::new(),
issued_map: HashMap::new(),
}
}
pub fn issue(&mut self, original: &str) -> String {
if let Some(existing) = self.issued_map.get(original) {
return existing.clone();
}
let canonical = format!("{}{}", self.prefix, self.counter);
self.counter += 1;
self.issued_map
.insert(original.to_string(), canonical.clone());
self.issued.push((original.to_string(), canonical.clone()));
canonical
}
pub fn get(&self, original: &str) -> Option<&str> {
self.issued_map.get(original).map(String::as_str)
}
pub fn has_issued(&self, original: &str) -> bool {
self.issued_map.contains_key(original)
}
pub fn issued_pairs(&self) -> impl Iterator<Item = (&str, &str)> {
self.issued.iter().map(|(o, c)| (o.as_str(), c.as_str()))
}
pub fn as_map(&self) -> &HashMap<String, String> {
&self.issued_map
}
}
pub struct Canonicalizer {
issuer: IdentifierIssuer,
bnode_to_quads: HashMap<String, Vec<usize>>,
quads: Vec<RdfQuad>,
hash_to_bnodes: BTreeMap<String, Vec<String>>,
}
impl Canonicalizer {
pub fn new() -> Self {
Canonicalizer {
issuer: IdentifierIssuer::new("_:c14n"),
bnode_to_quads: HashMap::new(),
quads: Vec::new(),
hash_to_bnodes: BTreeMap::new(),
}
}
fn collect_blank_nodes(&mut self) {
for (idx, quad) in self.quads.iter().enumerate() {
for bnode_id in quad.blank_nodes() {
self.bnode_to_quads
.entry(bnode_id.to_string())
.or_default()
.push(idx);
}
}
}
fn hash_first_degree_quads(&self, bnode: &str, issued: &IdentifierIssuer) -> String {
let quad_indices = match self.bnode_to_quads.get(bnode) {
Some(v) => v,
None => return sha256_hex(""),
};
let mut nquad_lines: Vec<String> = quad_indices
.iter()
.map(|&idx| quad_to_nquad_with_placeholders(&self.quads[idx], bnode, issued.as_map()))
.collect();
nquad_lines.sort();
sha256_hex(&nquad_lines.join(""))
}
fn issue_simple_canonical_ids(&mut self) {
let mut simple: Vec<(String, String)> = self
.hash_to_bnodes
.iter()
.filter(|(_, bnodes)| bnodes.len() == 1)
.map(|(h, bnodes)| (h.clone(), bnodes[0].clone()))
.collect();
simple.sort_by(|a, b| a.0.cmp(&b.0));
for (_, bnode) in simple {
self.issuer.issue(&bnode);
}
}
fn hash_n_degree_quads(
&self,
bnode: &str,
issuer: &IdentifierIssuer,
depth: usize,
) -> (String, IdentifierIssuer) {
const MAX_DEPTH: usize = 64;
let mut hash_to_related: BTreeMap<String, Vec<String>> = BTreeMap::new();
let quad_indices = match self.bnode_to_quads.get(bnode) {
Some(v) => v.clone(),
None => return (sha256_hex(""), issuer.clone()),
};
for quad_idx in &quad_indices {
let quad = &self.quads[*quad_idx];
for related_id in quad.blank_nodes() {
if related_id == bnode {
continue;
}
let related_hash = self.hash_first_degree_quads(related_id, issuer);
hash_to_related
.entry(related_hash)
.or_default()
.push(related_id.to_string());
}
}
let mut combined = String::new();
{
let mut nquad_lines: Vec<String> = quad_indices
.iter()
.map(|&idx| quad_to_nquad(&self.quads[idx], issuer.as_map()))
.collect();
nquad_lines.sort();
combined.push_str(&nquad_lines.join(""));
}
let mut chosen_issuer = issuer.clone();
for (group_hash, related_group) in &hash_to_related {
combined.push_str(group_hash);
let mut best_path: Option<(String, IdentifierIssuer)> = None;
let permutations = permutations_of(related_group);
for perm in permutations {
let mut path_issuer = issuer.clone();
let mut path = String::new();
for related_bnode in &perm {
if let Some(canonical) = issuer.get(related_bnode) {
path.push_str(canonical);
} else {
if depth < MAX_DEPTH {
let temp_id = path_issuer.issue(related_bnode);
path.push_str(&temp_id);
let (recursive_hash, _) =
self.hash_n_degree_quads(related_bnode, &path_issuer, depth + 1);
path.push_str(&recursive_hash);
} else {
let fd_hash = self.hash_first_degree_quads(related_bnode, &path_issuer);
path.push_str(&fd_hash);
}
}
}
let candidate_hash = sha256_hex(&path);
if let Some((ref best_hash, _)) = best_path {
if candidate_hash < *best_hash {
best_path = Some((candidate_hash, path_issuer));
}
} else {
best_path = Some((candidate_hash, path_issuer));
}
}
if let Some((best_hash, best_issuer)) = best_path {
combined.push_str(&best_hash);
chosen_issuer = best_issuer;
}
}
(sha256_hex(&combined), chosen_issuer)
}
fn issue_ndegree_canonical_ids(&mut self) {
let unresolved_groups: Vec<(String, Vec<String>)> = self
.hash_to_bnodes
.iter()
.filter(|(_, bnodes)| bnodes.len() > 1)
.map(|(h, bnodes)| (h.clone(), bnodes.clone()))
.collect();
for (_, group) in unresolved_groups {
let mut nd_hashes: Vec<(String, String, IdentifierIssuer)> = group
.iter()
.map(|bnode| {
let (nd_hash, temp_issuer) =
self.hash_n_degree_quads(bnode, &self.issuer.clone(), 0);
(nd_hash, bnode.clone(), temp_issuer)
})
.collect();
nd_hashes.sort_by(|a, b| a.0.cmp(&b.0));
for (_, bnode, _temp_issuer) in &nd_hashes {
if !self.issuer.has_issued(bnode) {
self.issuer.issue(bnode);
}
}
}
}
fn emit_canonical_nquads(&self) -> String {
let mapping = self.issuer.as_map();
let mut lines: Vec<String> = self
.quads
.iter()
.map(|q| quad_to_nquad(q, mapping))
.collect();
lines.sort();
lines.join("\n")
}
pub fn canonicalize(quads: &[RdfQuad]) -> String {
let mut c = Canonicalizer::new();
c.quads = quads.to_vec();
if c.quads.is_empty() {
return String::new();
}
c.collect_blank_nodes();
let all_bnodes: Vec<String> = c.bnode_to_quads.keys().cloned().collect();
let temp_issuer = IdentifierIssuer::new("_:c14n");
for bnode in &all_bnodes {
let hash = c.hash_first_degree_quads(bnode, &temp_issuer);
c.hash_to_bnodes
.entry(hash)
.or_default()
.push(bnode.clone());
}
for group in c.hash_to_bnodes.values_mut() {
group.sort();
}
c.issue_simple_canonical_ids();
c.issue_ndegree_canonical_ids();
c.emit_canonical_nquads()
}
}
impl Default for Canonicalizer {
fn default() -> Self {
Self::new()
}
}
fn permutations_of(items: &[String]) -> Vec<Vec<String>> {
let n = items.len();
if n == 0 {
return vec![vec![]];
}
if n == 1 {
return vec![items.to_vec()];
}
let mut result = Vec::new();
let mut arr = items.to_vec();
heap_permute(&mut arr, n, &mut result);
result
}
fn heap_permute(arr: &mut Vec<String>, k: usize, out: &mut Vec<Vec<String>>) {
if k == 1 {
out.push(arr.clone());
return;
}
for i in 0..k {
heap_permute(arr, k - 1, out);
if k % 2 == 0 {
arr.swap(i, k - 1);
} else {
arr.swap(0, k - 1);
}
}
}
pub fn canonicalize(quads: &[RdfQuad]) -> String {
Canonicalizer::canonicalize(quads)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::canon::types::{QuadTerm, RdfQuad};
use std::collections::HashSet;
fn iri_quad(s: &str, p: &str, o: &str) -> RdfQuad {
RdfQuad::new(QuadTerm::iri(s), QuadTerm::iri(p), QuadTerm::iri(o))
}
fn blank_iri_quad(bnode: &str, p: &str, o: &str) -> RdfQuad {
RdfQuad::new(QuadTerm::blank(bnode), QuadTerm::iri(p), QuadTerm::iri(o))
}
#[test]
fn test_empty_graph_yields_empty_output() {
let result = canonicalize(&[]);
assert!(result.is_empty(), "expected empty string for empty input");
}
#[test]
fn test_single_iri_quad() {
let quads = vec![iri_quad(
"http://example.org/s",
"http://example.org/p",
"http://example.org/o",
)];
let result = canonicalize(&quads);
assert_eq!(
result,
"<http://example.org/s> <http://example.org/p> <http://example.org/o> ."
);
}
#[test]
fn test_literal_with_datatype() {
let quads = vec![RdfQuad::new(
QuadTerm::iri("http://example.org/s"),
QuadTerm::iri("http://example.org/p"),
QuadTerm::typed_literal("42", "http://www.w3.org/2001/XMLSchema#integer"),
)];
let result = canonicalize(&quads);
assert!(
result.contains("\"42\"^^<http://www.w3.org/2001/XMLSchema#integer>"),
"expected typed literal; got: {result}"
);
}
#[test]
fn test_literal_with_language_tag() {
let quads = vec![RdfQuad::new(
QuadTerm::iri("http://example.org/s"),
QuadTerm::iri("http://example.org/p"),
QuadTerm::lang_literal("Hello", "en"),
)];
let result = canonicalize(&quads);
assert!(
result.contains("\"Hello\"@en"),
"expected language-tagged literal; got: {result}"
);
}
#[test]
fn test_single_blank_node_gets_c14n0() {
let quads = vec![blank_iri_quad(
"b0",
"http://example.org/p",
"http://example.org/o",
)];
let result = canonicalize(&quads);
assert!(
result.contains("_:c14n0"),
"expected _:c14n0; got: {result}"
);
assert!(
!result.contains("_:b0"),
"original blank node id must not appear in output"
);
}
#[test]
fn test_two_blank_nodes_get_sequential_ids() {
let quads = vec![
blank_iri_quad("b0", "http://example.org/type", "http://example.org/A"),
blank_iri_quad("b1", "http://example.org/type", "http://example.org/B"),
];
let result = canonicalize(&quads);
assert!(
result.contains("_:c14n0"),
"expected _:c14n0; got: {result}"
);
assert!(
result.contains("_:c14n1"),
"expected _:c14n1; got: {result}"
);
}
#[test]
fn test_same_blank_node_across_quads() {
let quads = vec![
blank_iri_quad(
"alice",
"http://example.org/name",
"http://example.org/Alice",
),
blank_iri_quad(
"alice",
"http://example.org/knows",
"http://example.org/Bob",
),
];
let result = canonicalize(&quads);
let c14n_count = result.matches("_:c14n0").count();
assert_eq!(
c14n_count, 2,
"_:c14n0 should appear once per quad; got {c14n_count} occurrences:\n{result}"
);
assert!(
!result.contains("_:c14n1"),
"only one blank node exists; _:c14n1 must not appear; got:\n{result}"
);
}
#[test]
fn test_named_graph_quad() {
let quads = vec![RdfQuad::new_in_graph(
QuadTerm::iri("http://example.org/s"),
QuadTerm::iri("http://example.org/p"),
QuadTerm::iri("http://example.org/o"),
QuadTerm::iri("http://example.org/graph1"),
)];
let result = canonicalize(&quads);
assert!(
result.contains("<http://example.org/graph1>"),
"named graph must appear in output; got: {result}"
);
assert!(
result.ends_with('.') || result.ends_with(". "),
"N-Quads line must end with period; got: {result}"
);
}
#[test]
fn test_output_is_sorted_lexicographically() {
let quads = vec![
iri_quad(
"http://z.example.org/s",
"http://example.org/p",
"http://example.org/o",
),
iri_quad(
"http://a.example.org/s",
"http://example.org/p",
"http://example.org/o",
),
iri_quad(
"http://m.example.org/s",
"http://example.org/p",
"http://example.org/o",
),
];
let result = canonicalize(&quads);
let lines: Vec<&str> = result.lines().collect();
assert_eq!(lines.len(), 3);
assert!(lines[0] < lines[1], "line 0 must be < line 1");
assert!(lines[1] < lines[2], "line 1 must be < line 2");
assert!(
lines[0].contains("/a.example.org"),
"first line must be /a; got: {}",
lines[0]
);
assert!(
lines[2].contains("/z.example.org"),
"last line must be /z; got: {}",
lines[2]
);
}
#[test]
fn test_canonicalization_is_deterministic() {
let quads = vec![
blank_iri_quad("x", "http://example.org/p", "http://example.org/o"),
blank_iri_quad("y", "http://example.org/q", "http://example.org/o"),
RdfQuad::new(
QuadTerm::blank("x"),
QuadTerm::iri("http://example.org/knows"),
QuadTerm::blank("y"),
),
];
let first = canonicalize(&quads);
let second = canonicalize(&quads);
assert_eq!(
first, second,
"canonicalization must be deterministic across calls"
);
}
#[test]
fn test_blank_node_renamed_from_b0() {
let quads = vec![blank_iri_quad(
"b0",
"http://example.org/type",
"http://example.org/Thing",
)];
let result = canonicalize(&quads);
assert!(
result.contains("_:c14n0"),
"blank node must be renamed to _:c14n0; got: {result}"
);
assert!(
!result.contains("_:b0"),
"original _:b0 must NOT appear in output; got: {result}"
);
}
#[test]
fn test_three_blank_nodes_unique_ids() {
let quads = vec![
RdfQuad::new(
QuadTerm::blank("x"),
QuadTerm::iri("http://example.org/next"),
QuadTerm::blank("y"),
),
RdfQuad::new(
QuadTerm::blank("y"),
QuadTerm::iri("http://example.org/next"),
QuadTerm::blank("z"),
),
blank_iri_quad("z", "http://example.org/type", "http://example.org/End"),
];
let result = canonicalize(&quads);
let mut ids: HashSet<String> = HashSet::new();
for i in 0..3 {
let id = format!("_:c14n{}", i);
assert!(
result.contains(&id),
"expected {id} in output; got:\n{result}"
);
ids.insert(id);
}
assert_eq!(ids.len(), 3, "all three canonical IDs must be distinct");
}
#[test]
fn test_blank_node_in_object_position() {
let quads = vec![RdfQuad::new(
QuadTerm::iri("http://example.org/s"),
QuadTerm::iri("http://example.org/p"),
QuadTerm::blank("obj_node"),
)];
let result = canonicalize(&quads);
assert!(
result.contains("_:c14n0"),
"blank object must be canonicalized; got: {result}"
);
}
#[test]
fn test_blank_node_as_named_graph() {
let quads = vec![RdfQuad::new_in_graph(
QuadTerm::iri("http://example.org/s"),
QuadTerm::iri("http://example.org/p"),
QuadTerm::iri("http://example.org/o"),
QuadTerm::blank("g"),
)];
let result = canonicalize(&quads);
assert!(
result.contains("_:c14n0"),
"blank graph name must be canonicalized; got: {result}"
);
}
#[test]
fn test_identifier_issuer_sequential() {
let mut issuer = IdentifierIssuer::new("_:c14n");
let id0 = issuer.issue("b0");
let id1 = issuer.issue("b1");
let id0_again = issuer.issue("b0");
assert_eq!(id0, "_:c14n0");
assert_eq!(id1, "_:c14n1");
assert_eq!(id0_again, "_:c14n0", "same node must get same canonical id");
}
#[test]
fn test_two_calls_are_independent() {
let quads_a = vec![blank_iri_quad(
"x",
"http://example.org/p",
"http://example.org/A",
)];
let quads_b = vec![blank_iri_quad(
"x",
"http://example.org/p",
"http://example.org/A",
)];
assert_eq!(
canonicalize(&quads_a),
canonicalize(&quads_b),
"identical inputs must produce identical outputs"
);
}
}