use std::collections::BTreeMap as Map;
use std::collections::HashSet;
use std::fmt;
/// Error returned by [`hash_n_degree_quads`] when no candidate issuer could be
/// selected for a group of related blank nodes.
///
/// Implements [`std::error::Error`] so callers can propagate it with `?` into
/// boxed or wrapped error types.
#[derive(Debug)]
pub struct MissingChosenIssuer;

impl fmt::Display for MissingChosenIssuer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("missing chosen issuer")
    }
}

impl std::error::Error for MissingChosenIssuer {}
use rdf_types::BlankId;
use rdf_types::QuadRef;
use rdf_types::{BlankIdBuf, Quad};
use ssi_crypto::hashes::sha256::sha256;
use crate::rdf::IntoNQuads;
use crate::rdf::NQuadsStatement;
/// Position of a blank node identifier inside a quad.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum BlankIdPosition {
    /// The blank node is the subject of the quad.
    Subject,
    /// The blank node is the object of the quad.
    Object,
    /// The blank node is the graph label of the quad.
    Graph,
}

impl BlankIdPosition {
    /// Returns the single-letter code of this position:
    /// `'s'` for subject, `'o'` for object and `'g'` for graph.
    pub fn into_char(self) -> char {
        match self {
            BlankIdPosition::Subject => 's',
            BlankIdPosition::Object => 'o',
            BlankIdPosition::Graph => 'g',
        }
    }
}
impl From<BlankIdPosition> for char {
fn from(p: BlankIdPosition) -> Self {
p.into_char()
}
}
/// Formats the position as its single-letter code, honoring the formatter's
/// flags exactly as formatting the `char` itself would.
impl fmt::Display for BlankIdPosition {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let code = self.into_char();
        fmt::Display::fmt(&code, f)
    }
}
/// Read access to the blank node identifiers occurring in a value.
///
/// The lifetime `'a` is the lifetime of the returned identifier references.
pub trait BlankNodeComponents<'a> {
/// Returns the blank node identifiers found in `self`.
fn blank_node_components(&self) -> Vec<&'a BlankId>;
/// Returns the blank node identifiers found in `self`, each paired with the
/// [`BlankIdPosition`] at which it occurs.
fn blank_node_components_with_position(&self) -> Vec<(&'a BlankId, BlankIdPosition)>;
}
/// Mutable access to the blank node identifiers occurring in a value.
pub trait BlankNodeComponentsMut {
/// Returns mutable references to the blank node identifiers found in `self`,
/// allowing them to be relabeled in place.
fn blank_node_components_mut(&mut self) -> Vec<&mut BlankIdBuf>;
}
impl<'a> BlankNodeComponents<'a> for QuadRef<'a> {
    /// Returns the blank node identifiers of the quad, in subject, object,
    /// graph order, without their positions.
    fn blank_node_components(&self) -> Vec<&'a BlankId> {
        let mut labels = Vec::new();
        for (label, _position) in self.blank_node_components_with_position() {
            labels.push(label);
        }
        labels
    }

    /// Returns the blank node identifiers of the quad together with their
    /// position. Components are inspected in subject, object, graph order;
    /// the predicate is never reported.
    fn blank_node_components_with_position(&self) -> Vec<(&'a BlankId, BlankIdPosition)> {
        let subject = match self.0 {
            rdf_types::Subject::Blank(label) => Some((label, BlankIdPosition::Subject)),
            _ => None,
        };
        let object = match self.2 {
            rdf_types::Object::Blank(label) => Some((label, BlankIdPosition::Object)),
            _ => None,
        };
        let graph = match self.3 {
            Some(rdf_types::GraphLabel::Blank(label)) => Some((label, BlankIdPosition::Graph)),
            _ => None,
        };
        subject.into_iter().chain(object).chain(graph).collect()
    }
}
impl BlankNodeComponentsMut for Quad {
    /// Returns mutable references to the blank node identifiers of the quad,
    /// in subject, object, graph order.
    fn blank_node_components_mut(&mut self) -> Vec<&mut BlankIdBuf> {
        // The three fields are borrowed separately, so the mutable borrows do
        // not overlap.
        let subject = match &mut self.0 {
            rdf_types::Subject::Blank(label) => Some(label),
            _ => None,
        };
        let object = match &mut self.2 {
            rdf_types::Object::Blank(label) => Some(label),
            _ => None,
        };
        let graph = match &mut self.3 {
            Some(rdf_types::GraphLabel::Blank(label)) => Some(label),
            _ => None,
        };
        subject.into_iter().chain(object).chain(graph).collect()
    }
}
/// Working state shared by the normalization routines in this module.
#[derive(Debug, Clone)]
pub struct NormalizationState<'a> {
/// For each blank node identifier, the quads in which it occurs.
pub blank_node_to_quads: Map<&'a BlankId, Vec<QuadRef<'a>>>,
/// For each first-degree hash, the blank node identifiers that produced it.
pub hash_to_blank_nodes: Map<String, Vec<&'a BlankId>>,
/// Issuer of the canonical identifiers (`normalize` creates it with the
/// `_:c14n` prefix).
pub canonical_issuer: IdentifierIssuer,
}
/// Issues fresh blank node identifiers and remembers which existing
/// identifier each one was issued for.
#[derive(Debug, Clone)]
pub struct IdentifierIssuer {
/// Prefix of every identifier issued by this issuer.
pub identifier_prefix: String,
/// Number appended to the prefix for the next issued identifier.
pub identifier_counter: u64,
/// Issued identifiers in issuance order, stored as
/// `(issued identifier, existing identifier)` pairs.
pub issued_identifiers_list: Vec<(BlankIdBuf, BlankIdBuf)>,
}
impl IdentifierIssuer {
pub fn new(prefix: String) -> Self {
Self {
identifier_prefix: prefix,
identifier_counter: 0,
issued_identifiers_list: Vec::new(),
}
}
pub fn find_issued_identifier(&self, existing_identifier: &BlankId) -> Option<&BlankId> {
self.issued_identifiers_list
.iter()
.find(|(_, existing_id)| existing_id == existing_identifier)
.map(|(issued_identifier, _)| issued_identifier.as_ref())
}
}
/// Result of [`hash_n_degree_quads`].
#[derive(Debug, Clone)]
pub struct HashNDegreeQuadsOutput {
/// Lowercase hexadecimal SHA-256 digest computed for the blank node.
pub hash: String,
/// Issuer state after the hash was computed.
pub issuer: IdentifierIssuer,
}
/// Renders `digest` as lowercase hexadecimal, two characters per byte.
fn digest_to_lowerhex(digest: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut hex = String::with_capacity(digest.len() * 2);
    for &byte in digest {
        // High nibble first, then low nibble.
        hex.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}
/// Computes the first-degree hash of a blank node.
///
/// Every quad mentioning `reference_blank_node_identifier` is copied, the
/// reference node is relabeled `a` and every other blank node `z`, and the
/// quad is serialized as an N-Quads statement. The serialized statements are
/// sorted, concatenated and hashed with SHA-256.
///
/// Returns the digest as lowercase hexadecimal. A node with no recorded quads
/// yields the digest of the empty string.
pub fn hash_first_degree_quads(
    normalization_state: &mut NormalizationState,
    reference_blank_node_identifier: &BlankId,
) -> String {
    let mut serialized: Vec<String> = match normalization_state
        .blank_node_to_quads
        .get(reference_blank_node_identifier)
    {
        Some(quads) => quads
            .iter()
            .map(|quad_ref| {
                let mut owned: Quad = quad_ref.into_owned();
                for label in owned.blank_node_components_mut() {
                    let suffix = if label == reference_blank_node_identifier {
                        "a"
                    } else {
                        "z"
                    };
                    *label = BlankIdBuf::from_suffix(suffix).unwrap();
                }
                NQuadsStatement(&owned).to_string()
            })
            .collect(),
        None => Vec::new(),
    };
    // Sorting makes the digest independent of the order of the quads.
    serialized.sort();
    let digest = sha256(serialized.concat().as_bytes());
    digest_to_lowerhex(&digest)
}
/// Normalizes the blank node identifiers of `quads`.
///
/// Canonical identifiers (prefix `_:c14n`) are assigned to every blank node.
/// The returned [`NormalizedQuads`] iterates over the input quads with their
/// blank node identifiers replaced by the canonical ones.
///
/// The input iterator must be `Clone` because it is walked once here to index
/// the blank nodes and once more by the returned iterator.
///
/// # Panics
///
/// Panics if [`hash_n_degree_quads`] fails for one of the blank nodes.
pub fn normalize<'a, Q: IntoIterator<Item = QuadRef<'a>>>(
    quads: Q,
) -> NormalizedQuads<'a, Q::IntoIter>
where
    Q::IntoIter: Clone,
{
    let mut normalization_state = NormalizationState {
        blank_node_to_quads: Map::new(),
        hash_to_blank_nodes: Map::new(),
        canonical_issuer: IdentifierIssuer::new("_:c14n".to_string()),
    };

    // Index every quad under each blank node it mentions.
    let quads = quads.into_iter();
    for quad in quads.clone() {
        for blank_node_identifier in quad.blank_node_components() {
            normalization_state
                .blank_node_to_quads
                .entry(blank_node_identifier)
                .or_insert_with(Vec::new)
                .push(quad);
        }
    }

    let mut non_normalized_identifiers: HashSet<&BlankId> = normalization_state
        .blank_node_to_quads
        .keys()
        .cloned()
        .collect();

    // Issue canonical identifiers to blank nodes whose first-degree hash is
    // unique, repeating until a pass issues nothing new.
    let mut simple = true;
    while simple {
        simple = false;
        normalization_state.hash_to_blank_nodes.clear();
        for identifier in non_normalized_identifiers.iter() {
            let hash = hash_first_degree_quads(&mut normalization_state, identifier);
            normalization_state
                .hash_to_blank_nodes
                .entry(hash)
                .or_insert_with(Vec::new)
                .push(identifier);
        }
        // Entries cannot be removed from the map while it is being iterated,
        // so the hashes to drop are collected first.
        let mut hashes_to_remove = Vec::new();
        for (hash, identifier_list) in normalization_state.hash_to_blank_nodes.iter() {
            if identifier_list.len() > 1 {
                continue;
            }
            let identifier = match identifier_list.first() {
                Some(id) => id,
                None => continue,
            };
            issue_identifier(&mut normalization_state.canonical_issuer, identifier);
            non_normalized_identifiers.remove(identifier);
            hashes_to_remove.push(hash.clone());
            simple = true;
        }
        for hash in hashes_to_remove {
            normalization_state.hash_to_blank_nodes.remove(&hash);
        }
    }

    // Only groups of blank nodes sharing a first-degree hash remain. They are
    // disambiguated with the N-degree hash, in ascending hash order. This runs
    // once, after the loop above has settled, rather than on every iteration.
    for (_hash, identifier_list) in normalization_state.hash_to_blank_nodes.clone() {
        let mut hash_path_list: Vec<HashNDegreeQuadsOutput> = Vec::new();
        for identifier in identifier_list {
            if normalization_state
                .canonical_issuer
                .find_issued_identifier(identifier)
                .is_some()
            {
                continue;
            }
            let mut temporary_issuer = IdentifierIssuer::new("_:b".to_string());
            issue_identifier(&mut temporary_issuer, identifier);
            hash_path_list.push(
                hash_n_degree_quads(
                    &mut normalization_state,
                    identifier,
                    &mut temporary_issuer,
                )
                .unwrap(),
            );
        }
        hash_path_list.sort_by(|a, b| a.hash.cmp(&b.hash));
        for result in hash_path_list {
            let identifier_issuer = result.issuer;
            // Canonical identifiers are issued in the order the temporary
            // issuer discovered the nodes.
            for (_, existing_identifier) in identifier_issuer.issued_identifiers_list {
                issue_identifier(
                    &mut normalization_state.canonical_issuer,
                    &existing_identifier,
                );
            }
        }
    }

    NormalizedQuads {
        quads,
        normalization_state,
    }
}
/// Iterator returned by [`normalize`], yielding the input quads with their
/// blank node identifiers replaced by canonical ones.
pub struct NormalizedQuads<'a, Q> {
// Input quads that have not been yielded yet.
quads: Q,
// State holding the canonical issuer used to relabel blank nodes.
normalization_state: NormalizationState<'a>,
}
impl<'a, Q: Iterator<Item = QuadRef<'a>>> NormalizedQuads<'a, Q> {
pub fn into_nquads(self) -> String {
IntoNQuads::into_nquads(self)
}
}
impl<'a, Q: Iterator<Item = QuadRef<'a>>> Iterator for NormalizedQuads<'a, Q> {
    type Item = Quad;

    /// Yields the next input quad as an owned [`Quad`] whose blank node
    /// identifiers have been replaced by their canonical identifiers.
    ///
    /// # Panics
    ///
    /// Panics if a blank node of the quad has no canonical identifier.
    fn next(&mut self) -> Option<Self::Item> {
        let mut relabeled = self.quads.next()?.into_owned();
        let canonical_issuer = &self.normalization_state.canonical_issuer;
        for label in relabeled.blank_node_components_mut() {
            let canonical_identifier = canonical_issuer.find_issued_identifier(label).unwrap();
            *label = canonical_identifier.to_owned();
        }
        Some(relabeled)
    }
}
/// Returns the identifier issued by `identifier_issuer` for
/// `existing_identifier`, issuing a new one if none exists yet.
///
/// A new identifier is the issuer's prefix followed by its current counter.
/// It is recorded in the issuance list and the counter is then incremented.
///
/// # Panics
///
/// Panics if prefix and counter do not form a valid blank node identifier.
pub fn issue_identifier(
    identifier_issuer: &mut IdentifierIssuer,
    existing_identifier: &BlankId,
) -> BlankIdBuf {
    if let Some(already_issued) = identifier_issuer.find_issued_identifier(existing_identifier) {
        return already_issued.to_owned();
    }
    let label = format!(
        "{}{}",
        identifier_issuer.identifier_prefix, identifier_issuer.identifier_counter
    );
    let fresh = BlankIdBuf::new(label).unwrap();
    let record = (fresh.clone(), existing_identifier.to_owned());
    identifier_issuer.issued_identifiers_list.push(record);
    identifier_issuer.identifier_counter += 1;
    fresh
}
/// Computes the N-degree hash of the blank node `identifier`.
///
/// Blank nodes sharing a quad with `identifier` are grouped by their related
/// hash (see [`hash_related_blank_node`]). For each group, in ascending hash
/// order, every permutation of the group is turned into a path string and the
/// lexicographically smallest path is kept with the issuer that produced it.
/// The group hashes and chosen paths are concatenated and hashed with SHA-256.
///
/// `issuer` is read but left unmodified; the issuer state reached after the
/// last group is returned in the output.
///
/// # Errors
///
/// Returns [`MissingChosenIssuer`] if no permutation was selected for a group.
pub fn hash_n_degree_quads(
    normalization_state: &mut NormalizationState,
    identifier: &BlankId,
    issuer: &mut IdentifierIssuer,
) -> Result<HashNDegreeQuadsOutput, MissingChosenIssuer> {
    // Group the blank nodes related to `identifier` by their related hash.
    let mut hash_to_related_blank_nodes: Map<String, Vec<&BlankId>> = Map::new();
    if let Some(quads) = normalization_state
        .blank_node_to_quads
        .get(identifier)
        .cloned()
    {
        for quad in quads {
            for (component, position) in quad.blank_node_components_with_position() {
                if component != identifier {
                    let hash = hash_related_blank_node(
                        normalization_state,
                        component,
                        quad,
                        issuer,
                        position,
                    );
                    hash_to_related_blank_nodes
                        .entry(hash)
                        .or_insert_with(Vec::new)
                        .push(component);
                }
            }
        }
    }

    // Issuer carried from one group to the next; starts as a copy of the
    // caller's issuer and is replaced by the chosen issuer of each group.
    let mut current_issuer = issuer.clone();
    let mut data_to_hash = String::new();
    for (related_hash, blank_node_list) in hash_to_related_blank_nodes {
        data_to_hash.push_str(&related_hash);
        let mut chosen_path = String::new();
        let mut chosen_issuer: Option<IdentifierIssuer> = None;
        'permutations: for permutation in combination::permutate::from_vec(&blank_node_list) {
            let mut issuer_copy = current_issuer.clone();
            let mut path = String::new();
            let mut recursion_list: Vec<BlankIdBuf> = Vec::new();
            for related in permutation {
                if let Some(canonical_identifier) = normalization_state
                    .canonical_issuer
                    .find_issued_identifier(related)
                {
                    // A node that already has a canonical identifier adds that
                    // identifier to the path and is never recursed into.
                    path.push_str(canonical_identifier);
                } else {
                    // Recurse only into nodes this issuer has not seen yet.
                    if issuer_copy.find_issued_identifier(related).is_none() {
                        recursion_list.push(related.to_owned());
                    }
                    path += &issue_identifier(&mut issuer_copy, related);
                }
                // This path can no longer beat the chosen one: appending more
                // text keeps it greater, so skip to the next permutation.
                if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path
                {
                    continue 'permutations;
                }
            }
            for related in recursion_list {
                let result = hash_n_degree_quads(normalization_state, &related, &mut issuer_copy)?;
                path.push_str(&issue_identifier(&mut issuer_copy, &related));
                path.push('<');
                path.push_str(&result.hash);
                path.push('>');
                issuer_copy = result.issuer;
                if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path
                {
                    continue 'permutations;
                }
            }
            if chosen_path.is_empty() || path < chosen_path {
                chosen_path = path;
                chosen_issuer = Some(issuer_copy);
            }
        }
        data_to_hash.push_str(&chosen_path);
        current_issuer = chosen_issuer.ok_or(MissingChosenIssuer)?;
    }

    let digest = sha256(data_to_hash.as_bytes());
    let hash = digest_to_lowerhex(&digest);
    Ok(HashNDegreeQuadsOutput {
        hash,
        issuer: current_issuer,
    })
}
/// Computes the hash of the blank node `related` as seen from `quad`.
///
/// The identifier used for `related` is, in order of preference: its
/// canonical identifier, the identifier issued by `issuer`, or its
/// first-degree hash. The hashed input is the position code, then the
/// predicate wrapped in `<`/`>` (omitted for the graph position), then that
/// identifier.
///
/// Returns the SHA-256 digest as lowercase hexadecimal.
pub fn hash_related_blank_node(
    normalization_state: &mut NormalizationState,
    related: &BlankId,
    quad: QuadRef,
    issuer: &mut IdentifierIssuer,
    position: BlankIdPosition,
) -> String {
    let identifier = if let Some(canonical) = normalization_state
        .canonical_issuer
        .find_issued_identifier(related)
    {
        canonical.to_string()
    } else if let Some(temporary) = issuer.find_issued_identifier(related) {
        temporary.to_string()
    } else {
        hash_first_degree_quads(normalization_state, related)
    };

    let mut input = String::new();
    input.push(position.into_char());
    if position != BlankIdPosition::Graph {
        input.push('<');
        input.push_str(quad.predicate().as_str());
        input.push('>');
    }
    input.push_str(&identifier);

    digest_to_lowerhex(&sha256(input.as_bytes()))
}
#[cfg(test)]
mod tests {
    use locspan::Meta;
    use nquads_syntax::Parse;

    use super::*;

    /// Runs every `test*-urdna2015.nq` case found in
    /// `../json-ld-normalization/tests` against [`normalize`].
    ///
    /// For each expected-output file, the matching `-in.nq` input is parsed
    /// and normalized, and the result is compared with the expected text. If
    /// a second command-line argument is given, only the case whose
    /// seven-character name equals it is run. The test fails when no case ran
    /// or when any case did not match.
    #[test]
    fn normalization_test_suite() {
        use std::fs::{self};
        use std::path::PathBuf;

        let selected_case = std::env::args().nth(2);
        let mut total = 0;
        let mut passed = 0;
        for dir_entry in fs::read_dir("../json-ld-normalization/tests").unwrap() {
            let dir_entry = dir_entry.unwrap();
            let filename = dir_entry.file_name().into_string().unwrap();
            let is_expected_output =
                filename.starts_with("test") && filename.ends_with("-urdna2015.nq");
            if !is_expected_output {
                continue;
            }
            // The case name is the first seven characters of the file name.
            let num = filename[0..7].to_string();
            if let Some(wanted) = selected_case.as_ref() {
                if wanted != &num {
                    continue;
                }
            }
            total += 1;

            let mut path = dir_entry.path();
            let expected_str = fs::read_to_string(&path).unwrap();
            path.set_file_name(PathBuf::from(format!("{}-in.nq", num)));
            let in_str = fs::read_to_string(&path).unwrap();

            let dataset = nquads_syntax::Document::parse_str(&in_str, |span| span).unwrap();
            let stripped_dataset: Vec<_> = dataset
                .into_value()
                .into_iter()
                .map(Meta::into_value)
                .map(Quad::strip_all_but_predicate)
                .collect();
            let normalized =
                normalize(stripped_dataset.iter().map(Quad::as_quad_ref)).into_nquads();

            if normalized == expected_str {
                passed += 1;
            } else {
                let changes = difference::Changeset::new(&normalized, &expected_str, "\n");
                eprintln!("test {}: failed. diff:\n{}", num, changes);
            }
        }
        assert!(total > 0);
        assert_eq!(passed, total);
    }
}