use std::collections::BTreeMap;
use std::io;
use std::path::Path;
use serde_json::{Value, json};
use sha2::{Digest, Sha256};
// Schema documents embedded into the binary at compile time;
// `write_archival_extras` writes them into the asset's `schemas/` directory.
const BLOCKS_SCHEMA_TXT: &str = include_str!("schemas/blocks_schema.txt");
const RAT_SCHEMA_TXT: &str = include_str!("schemas/rat_schema.txt");
const BLOCK_INDEX_SCHEMA_JSON: &str = include_str!("schemas/block_index.schema.json");
// Helper scripts embedded the same way; `write_archival_extras` writes them
// into the asset's `tools/` directory.
const DECODER_PY: &str = include_str!("extras/decode.py");
const VERIFY_SHA256_PY: &str = include_str!("extras/verify_sha256.py");
const COUNT_BY_LENGTH_PY: &str = include_str!("extras/count_by_length.py");
const VERIFY_COUNTS_PY: &str = include_str!("extras/verify_counts.py");
/// Which build route produced an asset. It selects the command recipe emitted
/// by `reproduce_commands` and the wording used in the generated README and
/// `reproduce.sh`.
#[derive(Debug, Clone, Copy)]
pub enum ProducedVia {
/// Single-step build (`--mode dafsa-blocks`).
InMemory,
/// Three-step build (`--mode stream`, then `--mode merge`, then `--mode build`).
StreamingPipeline,
}
/// Parameters of one generated asset, used to fill in the RO-Crate metadata,
/// the README and the reproduction commands.
#[derive(Debug, Clone)]
pub struct AssetParams<'a> {
/// Ring order as passed to `--ring`; the "parent" ring in the generated text.
pub ring: u8,
/// Maximum perimeter, emitted as `-n` and as "perimeter <= n".
pub max_steps: usize,
/// Direction step, emitted as `--step` unless it is 1; together with `ring`
/// it determines the effective ring (see `effective_ring`).
pub step: i8,
/// `true` selects the "free" canonicalization (`--free`), `false` "onesided".
pub free: bool,
/// Value emitted as `--target-block-bytes` in the reproduction commands.
pub target_block_bytes: u32,
/// Number of sequences reported in the description and the `nSequences` property.
pub n_sequences: u64,
/// Optional OEIS A-number; adds a keyword, a `subjectOf` link and a CLI flag.
pub oeis_a_number: Option<&'a str>,
/// Build route that produced the asset.
pub produced_via: ProducedVia,
}
/// Compile-time package metadata (see the `PKG` constant for where each value
/// comes from).
struct PkgMeta {
// Crate name; also used as the `#<name>` software entity id.
name: &'static str,
// Crate version string.
version: &'static str,
// Source repository URL.
repository: &'static str,
// Crate description.
description: &'static str,
// Raw authors string, parsed by `parse_authors` / `format_authors`.
authors: &'static str,
// Commit identifier recorded as `softwareVersion` and used in download URLs.
commit: &'static str,
}
/// Known author e-mail addresses paired with their ORCID iD URLs.
const AUTHOR_ORCIDS: &[(&str, &str)] = &[(
    "apirogov@users.noreply.github.com",
    "https://orcid.org/0000-0002-5077-7497",
)];

/// Returns the ORCID iD URL registered for `email` in [`AUTHOR_ORCIDS`].
///
/// Yields `None` when no e-mail is given or when the address is not listed.
/// The comparison is an exact, case-sensitive string match.
fn orcid_for(email: Option<&str>) -> Option<&'static str> {
    let wanted = email?;
    for &(known, orcid) in AUTHOR_ORCIDS {
        if known == wanted {
            return Some(orcid);
        }
    }
    None
}
/// Package metadata captured at compile time via `env!`.
///
/// The `CARGO_PKG_*` variables are provided by Cargo. `TILEZZ_GIT_COMMIT` must
/// also be set at compile time (presumably exported by the crate's build
/// script -- confirm), otherwise `env!` fails the build.
const PKG: PkgMeta = PkgMeta {
name: env!("CARGO_PKG_NAME"),
version: env!("CARGO_PKG_VERSION"),
repository: env!("CARGO_PKG_REPOSITORY"),
description: env!("CARGO_PKG_DESCRIPTION"),
authors: env!("CARGO_PKG_AUTHORS"),
commit: env!("TILEZZ_GIT_COMMIT"),
};
/// Returns the build timestamp as an ISO-8601 UTC string.
///
/// When `SOURCE_DATE_EPOCH` is set and parses as an integer, that value is
/// used as the Unix time. Otherwise the current system time is used, falling
/// back to 0 if the clock reads earlier than the Unix epoch.
fn build_endtime() -> String {
    let pinned = std::env::var("SOURCE_DATE_EPOCH")
        .ok()
        .and_then(|raw| raw.parse::<i64>().ok());
    let secs = pinned.unwrap_or_else(|| {
        match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
            Ok(elapsed) => elapsed.as_secs() as i64,
            Err(_) => 0,
        }
    });
    format_iso_utc(secs)
}
/// Returns the first ten bytes of [`build_endtime`], i.e. its `YYYY-MM-DD` part.
fn build_date() -> String {
    let stamp = build_endtime();
    String::from(&stamp[..10])
}
/// Formats a Unix timestamp (seconds, may be negative) as
/// `YYYY-MM-DDTHH:MM:SSZ` in UTC, using integer arithmetic only.
fn format_iso_utc(secs: i64) -> String {
    const SECS_PER_DAY: i64 = 86_400;
    const DAYS_PER_ERA: i64 = 146_097; // one 400-year cycle

    // Euclidean split so the time of day is non-negative before the epoch too.
    let (days, tod) = (
        secs.div_euclid(SECS_PER_DAY),
        secs.rem_euclid(SECS_PER_DAY),
    );
    let (hour, minute, second) = (
        (tod / 3600) as u32,
        ((tod % 3600) / 60) as u32,
        (tod % 60) as u32,
    );

    // Shift the day count so it starts on 0000-03-01 (years begin in March).
    let shifted = days + 719_468;
    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA) as u64;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);

    // `mp` is the month index counted from March (0 = March .. 11 = February).
    let mp = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };

    // January and February belong to the following calendar year.
    let mut year = (year_of_era as i64) + era * 400;
    if month <= 2 {
        year += 1;
    }
    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}
/// Computes the SHA-256 digest of the file at `path`, returned as a hex
/// string produced by `super::hex_lower`.
///
/// The file is streamed through a fixed 64 KiB buffer, so memory use does not
/// depend on the file size.
///
/// # Errors
/// Returns any I/O error from opening or reading the file. Reads that fail
/// with `ErrorKind::Interrupted` are retried instead of aborting the hash.
fn sha256_hex(path: &Path) -> io::Result<String> {
    let mut file = std::fs::File::open(path)?;
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 64 * 1024];
    loop {
        let n = match io::Read::read(&mut file, &mut buf) {
            // A zero-length read marks end of file.
            Ok(0) => break,
            Ok(n) => n,
            // An interrupted read is transient; just try again.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        hasher.update(&buf[..n]);
    }
    Ok(super::hex_lower(&hasher.finalize()))
}
/// One file recorded by `snapshot_dir`.
struct FileMeta {
// Path relative to the snapshot root, with `\` replaced by `/`.
rel_path: String,
// File length in bytes as reported by the directory entry's metadata.
size: u64,
// SHA-256 digest string returned by `sha256_hex`.
sha256: String,
}
/// Walks `dir` recursively and records path, size and SHA-256 of every file.
///
/// Files named `ro-crate-metadata.json`, `.DS_Store` or `.gitkeep` are left
/// out at any depth. The result is sorted by relative path.
///
/// # Errors
/// Fails on any directory-listing, metadata or read error, and on relative
/// paths that are not valid UTF-8.
fn snapshot_dir(dir: &Path) -> io::Result<Vec<FileMeta>> {
    const SKIPPED: [&str; 3] = ["ro-crate-metadata.json", ".DS_Store", ".gitkeep"];

    let mut found = Vec::<FileMeta>::new();
    // Explicit work list of directories still to be listed.
    let mut pending = vec![dir.to_path_buf()];
    while let Some(current) = pending.pop() {
        for entry in std::fs::read_dir(&current)? {
            let entry = entry?;
            let full = entry.path();
            if entry.file_type()?.is_dir() {
                pending.push(full);
                continue;
            }
            let rel = match full.strip_prefix(dir) {
                Ok(r) => r,
                Err(e) => return Err(io::Error::other(e.to_string())),
            };
            let base = rel.file_name().and_then(|s| s.to_str()).unwrap_or("");
            if SKIPPED.contains(&base) {
                continue;
            }
            let size = entry.metadata()?.len();
            let sha256 = sha256_hex(&full)?;
            // Normalise separators so recorded ids use `/` regardless of platform.
            let rel_path = rel
                .to_str()
                .ok_or_else(|| io::Error::other("non-UTF-8 path"))?
                .replace('\\', "/");
            found.push(FileMeta {
                rel_path,
                size,
                sha256,
            });
        }
    }
    found.sort_by(|lhs, rhs| lhs.rel_path.cmp(&rhs.rel_path));
    Ok(found)
}
/// Maps a file path to the media type recorded as `encodingFormat`, based on
/// its suffix; unknown suffixes map to `application/octet-stream`.
fn encoding_format_of(rel_path: &str) -> &'static str {
    // Checked in order; the first matching suffix wins.
    const BY_SUFFIX: [(&str, &str); 6] = [
        (".json", "application/json"),
        (".bin", "application/gzip"),
        (".txt", "text/plain"),
        (".md", "text/markdown"),
        (".py", "text/x-python"),
        (".sh", "application/x-sh"),
    ];
    BY_SUFFIX
        .iter()
        .find(|(suffix, _)| rel_path.ends_with(*suffix))
        .map_or("application/octet-stream", |&(_, mime)| mime)
}
/// Lists the schema entities a file conforms to, as `{"@id": ...}` references.
///
/// `block_index.json` points at both the JSON Schema and the prose block
/// schema; `blocks/*.bin` files point at the prose block schema; everything
/// else yields an empty list.
fn conforms_to_of(rel_path: &str) -> Vec<Value> {
    let is_block_file = rel_path.starts_with("blocks/") && rel_path.ends_with(".bin");
    let schema_ids: &[&str] = if rel_path == "block_index.json" {
        &[
            "schemas/block_index.schema.json",
            "schemas/blocks_schema.txt",
        ]
    } else if is_block_file {
        &["schemas/blocks_schema.txt"]
    } else {
        &[]
    };
    schema_ids.iter().map(|id| json!({"@id": *id})).collect()
}
/// Returns the `(name, description)` pair shown for a file in the crate
/// metadata.
///
/// Known paths have a dedicated entry; any other path under `blocks/` gets
/// the generic block-file label; everything else gets two empty strings.
fn human_label_for(rel_path: &str) -> (&'static str, &'static str) {
    // (relative path, name, description)
    const LABELS: &[(&str, &str, &str)] = &[
        (
            "block_index.json",
            "block_index.json",
            "Manifest for the tilezz-rat-dafsa-blocks asset (state / edge counts, root state record, content-addressed block index).",
        ),
        (
            "schemas/block_index.schema.json",
            "block_index.schema.json (JSON Schema)",
            "Formal JSON Schema (draft 2020-12) for block_index.json. Machine-validatable; covers the manifest JSON only -- see blocks_schema.txt for the binary block file format.",
        ),
        (
            "schemas/blocks_schema.txt",
            "blocks_schema.txt (prose)",
            "Prose specification of the tilezz-rat-dafsa-blocks wire format: manifest fields, block file binary layout, the read algorithm.",
        ),
        (
            "schemas/rat_schema.txt",
            "rat_schema.txt (prose)",
            "Length-prefix convention used inside the DAFSA's accepted sequences: stored_sequence = [len, rat...].",
        ),
        (
            "tools/decode.py",
            "decode.py (Python 3 decoder)",
            "Standalone, dependency-free Python 3 script that walks the blocked DAFSA in this directory and prints every rat as a line of space-separated signed integers. Run as `python3 tools/decode.py > rats.txt`.",
        ),
        (
            "tools/verify_sha256.py",
            "verify_sha256.py (Python 3 hash verifier)",
            "Standalone, dependency-free Python 3 script that checks every sha256 recorded in ro-crate-metadata.json against the on-disk file bytes. Run as `python3 tools/verify_sha256.py`; exits 0 on full match, 1 on mismatch.",
        ),
        (
            "tools/count_by_length.py",
            "count_by_length.py (Python 3 per-length counter)",
            "Standalone Python 3 script (stdlib + sibling decode.py) that prints the number of stored sequences per exact perimeter length, one `<length> <count>` line each -- the OEIS-style terms this asset realises. Reads the counts off the DAFSA's rank index without decoding. Run as `python3 tools/count_by_length.py`.",
        ),
        (
            "tools/verify_counts.py",
            "verify_counts.py (Python 3 sub-family verifier)",
            "Standalone Python 3 script (stdlib + sibling decode.py) that re-derives the per-perimeter sub-family sequences (free, oneSided, achiral, rotationSymmetric, symmetric, subring, coset) from the DAFSA and checks them against the `variableMeasured` block in ro-crate-metadata.json. Run as `python3 tools/verify_counts.py`; exits 0 if all match, 1 on mismatch.",
        ),
        (
            "README.md",
            "README.md",
            "Human-readable entry point: dataset overview, author and copyright notice, CC-BY-SA 4.0 license summary, contents map, reproduction recipe (repo URL, exact commit, CLI invocation), and a SHA-256 verification snippet.",
        ),
        (
            "tools/reproduce.sh",
            "reproduce.sh (executable rebuild script)",
            "Shell script that re-runs the exact pipeline that produced this dataset: clones the source repo at the recorded commit, builds rat_enum, and runs the original CLI invocation(s). Run as `bash tools/reproduce.sh`. Honours REPO, COMMIT, SRC_DIR env vars for offline / pre-cloned scenarios; honours SOURCE_DATE_EPOCH for bit-identical metadata.",
        ),
    ];

    if let Some(&(_, name, description)) = LABELS.iter().find(|entry| entry.0 == rel_path) {
        return (name, description);
    }
    if rel_path.starts_with("blocks/") {
        (
            "DAFSA block file",
            "One block of the rat-DAFSA in tilezz-rat-block binary format, gzip-compressed.",
        )
    } else {
        ("", "")
    }
}
/// Builds the human-readable dataset title from the effective ring, the
/// maximum perimeter and the canonicalization mode.
fn dataset_name(p: &AssetParams) -> String {
    let ring = effective_ring(p);
    let n = p.max_steps;
    let canon = match p.free {
        true => "free",
        false => "rotation-canonical",
    };
    format!("tilezz simple matchstick polygons on Z[zeta_{ring}], perimeter <= {n}, {canon}")
}
/// Builds the dataset slug `tilezz-rat-zz<ring>-n<n>-<free|onesided>` using
/// the effective ring.
fn dataset_identifier(p: &AssetParams) -> String {
    let ring = effective_ring(p);
    let n = p.max_steps;
    let canon = match p.free {
        true => "free",
        false => "onesided",
    };
    format!("tilezz-rat-zz{ring}-n{n}-{canon}")
}
/// Greatest common divisor of two `u8` values (Euclid's algorithm).
/// `gcd_u8(x, 0)` is `x`, so `gcd_u8(0, 0)` is 0.
fn gcd_u8(mut a: u8, mut b: u8) -> u8 {
    loop {
        if b == 0 {
            return a;
        }
        let remainder = a % b;
        a = b;
        b = remainder;
    }
}
/// Ring order the asset effectively describes: `ring / gcd(|step|, ring)`.
///
/// A `step` of 0 is treated as 1, and the divisor is clamped to at least 1 so
/// the division cannot panic (a `ring` of 0 yields 0).
fn effective_ring(p: &AssetParams) -> u8 {
    let stride = p.step.unsigned_abs().max(1);
    let divisor = gcd_u8(stride, p.ring).max(1);
    p.ring / divisor
}
/// One-sentence note about turn-angle units for the dataset description.
///
/// Returns an empty string when the effective ring equals `p.ring`; otherwise
/// a sentence (with a leading space) explaining that stored turns are in
/// parent-ring units and must be divided by `step`.
fn turn_units_note_short(p: &AssetParams) -> String {
    let eff = effective_ring(p);
    let parent = p.ring;
    if eff == parent {
        return String::new();
    }
    let step = p.step;
    format!(
        " It is the order-{eff} sub-ring of Z[zeta_{parent}] (Z[zeta_{eff}] has no native lattice here, \
         so it is enumerated as the directions that are multiples of {step}, via --step {step}); each \
         stored turn is a Z[zeta_{parent}] unit (always even) equal to {step} times the Z[zeta_{eff}] turn \
         -- divide stored values by {step} to read Z[zeta_{eff}] turns. See README.md (Turn-angle units)."
    )
}
/// README section about turn-angle units.
///
/// Returns an empty string when the effective ring equals `p.ring`; otherwise
/// a `## Turn-angle units` section (preceded by a blank line) that includes a
/// worked example for the regular polygon with `effective_ring` sides.
fn turn_units_note_readme(p: &AssetParams) -> String {
    let eff = effective_ring(p);
    let parent = p.ring;
    if eff == parent {
        return String::new();
    }
    let step = p.step;
    let sides = eff as usize;
    // Example rows: the stored form repeats `step`, the converted form repeats 1.
    let stored = std::iter::repeat(step.to_string())
        .take(sides)
        .collect::<Vec<_>>()
        .join(" ");
    let halved = ["1"].repeat(sides).join(" ");
    format!(
        "\n\n## Turn-angle units\n\n\
         Z[zeta_{eff}] has no native lattice in tilezz, so this dataset is the order-{eff} \
         sub-ring of Z[zeta_{parent}] -- the turn directions that are multiples of {step} -- \
         enumerated with `--step {step}`. Each stored turn is therefore a Z[zeta_{parent}] turn \
         (an integer multiple of `2*pi/{parent}`, always even) and is `{step}` times the \
         corresponding Z[zeta_{eff}] turn (a multiple of `2*pi/{eff}`).\n\n\
         To read the sequences as Z[zeta_{eff}] turns, divide every stored value by `{step}`. \
         For example the regular {eff}-gon is stored as `{stored}` and is `{halved}` in \
         Z[zeta_{eff}]. The web explorer shows the halved Z[zeta_{eff}] form; `tools/decode.py` \
         prints the raw stored Z[zeta_{parent}] values."
    )
}
fn dataset_additional_properties(p: &AssetParams, max_indexed_length: usize) -> Value {
let effective_ring = effective_ring(p);
json!([
{"@type": "PropertyValue", "name": "ring", "value": p.ring},
{"@type": "PropertyValue", "name": "step", "value": p.step},
{"@type": "PropertyValue", "name": "effectiveRing", "value": effective_ring},
{"@type": "PropertyValue", "name": "maxPerimeter", "value": p.max_steps},
{
"@type": "PropertyValue",
"name": "canonicalization",
"value": if p.free { "free" } else { "onesided" },
},
{"@type": "PropertyValue", "name": "nSequences", "value": p.n_sequences},
{"@type": "PropertyValue", "name": "maxIndexedLength", "value": max_indexed_length},
])
}
/// Per-length tallies of the sequences fed to `SequenceCounts::from_rats`.
/// Every map is keyed by sequence length; the value is how many sequences of
/// that length fall into the category.
#[derive(Default)]
pub struct SequenceCounts {
// Every sequence seen.
free: BTreeMap<usize, u64>,
// Sequences whose entries are all even.
subring: BTreeMap<usize, u64>,
// Sequences whose entries are all odd.
coset: BTreeMap<usize, u64>,
// Sequences that equal their own reversal up to cyclic rotation.
achiral: BTreeMap<usize, u64>,
// Sequences that equal one of their own nontrivial cyclic rotations.
rotation_symmetric: BTreeMap<usize, u64>,
// Sequences that are achiral or rotation-symmetric (or both).
symmetric: BTreeMap<usize, u64>,
}
/// Returns the lexicographically smallest cyclic rotation of `t`
/// (an empty vector for empty input).
fn rot_min(t: &[i8]) -> Vec<i8> {
    let mut best: Option<Vec<i8>> = None;
    for split in 0..t.len() {
        // Rotation that starts at index `split`.
        let (head, tail) = t.split_at(split);
        let candidate = [tail, head].concat();
        let improves = match &best {
            Some(current) => candidate < *current,
            None => true,
        };
        if improves {
            best = Some(candidate);
        }
    }
    best.unwrap_or_default()
}
impl SequenceCounts {
pub fn from_rats<I: IntoIterator<Item = Vec<i8>>>(rats: I) -> Self {
let mut c = Self::default();
for t in rats {
let l = t.len();
*c.free.entry(l).or_default() += 1;
if t.iter().all(|a| a % 2 == 0) {
*c.subring.entry(l).or_default() += 1;
}
if t.iter().all(|a| a % 2 != 0) {
*c.coset.entry(l).or_default() += 1;
}
let mut rev = t.clone();
rev.reverse();
let achiral = rot_min(&t) == rot_min(&rev);
let rot_sym = (1..l).any(|d| t[d..].iter().chain(&t[..d]).eq(t.iter()));
if achiral {
*c.achiral.entry(l).or_default() += 1;
}
if rot_sym {
*c.rotation_symmetric.entry(l).or_default() += 1;
}
if achiral || rot_sym {
*c.symmetric.entry(l).or_default() += 1;
}
}
c
}
fn series(map: &BTreeMap<usize, u64>, max_perim: usize) -> String {
(1..=max_perim)
.map(|n| map.get(&n).copied().unwrap_or(0).to_string())
.collect::<Vec<_>>()
.join(",")
}
fn variable_measured(&self) -> Value {
let max_perim = self.free.keys().next_back().copied().unwrap_or(0);
let one_sided: BTreeMap<usize, u64> = (1..=max_perim)
.filter_map(|n| {
let f = self.free.get(&n).copied().unwrap_or(0);
if f == 0 {
return None;
}
let a = self.achiral.get(&n).copied().unwrap_or(0);
Some((n, 2 * f - a))
})
.collect();
let pv = |name: &str, map: &BTreeMap<usize, u64>, desc: &str| {
json!({
"@type": "PropertyValue",
"name": name,
"unitText": "polygons",
"value": Self::series(map, max_perim),
"description": desc,
})
};
json!([
pv(
"free",
&self.free,
"Self-avoiding polygons by exact perimeter (index = perimeter, from 1); the base count, up to rotation and reflection."
),
pv(
"oneSided",
&one_sided,
"Up to rotation only; mirror images counted as distinct (= 2*free - achiral)."
),
pv("achiral", &self.achiral, "Equal to its own mirror image."),
pv(
"rotationSymmetric",
&self.rotation_symmetric,
"Repetition factor > 1 (nontrivial rotational symmetry)."
),
pv(
"symmetric",
&self.symmetric,
"Has a nontrivial symmetry: achiral OR rotationSymmetric."
),
pv(
"subring",
&self.subring,
"All turns even -- the polygons of the order-n/2 sub-ring."
),
pv(
"coset",
&self.coset,
"All turns odd -- no straight segments; even perimeter only."
),
])
}
}
/// Computes the hosting-dependent fields of the root dataset.
///
/// Returns `(identifier, url, distribution, same_as)`:
/// * `identifier` -- a DOI `PropertyValue` when `doi` is given, else the base
///   URL string when `base_url` is given, else a `tilezz-dataset-id`
///   `PropertyValue` carrying `slug`;
/// * `url` -- the base URL with trailing slashes removed, if any;
/// * `distribution` -- a `DataDownload` for `block_index.json`, absolute when
///   a base URL is known and relative otherwise;
/// * `same_as` -- the `https://doi.org/...` URL when `doi` is given.
fn host_coordinate_fields(
    base_url: Option<&str>,
    doi: Option<&str>,
    slug: &str,
    manifest_size: u64,
) -> (Value, Option<Value>, Value, Option<Value>) {
    let base = base_url.map(|b| b.trim_end_matches('/'));

    let identifier = match (doi, base) {
        (Some(doi), _) => json!({
            "@type": "PropertyValue",
            "propertyID": "DOI",
            "value": doi,
            "url": format!("https://doi.org/{doi}"),
        }),
        (None, Some(base)) => json!(base),
        (None, None) => json!({
            "@type": "PropertyValue",
            "propertyID": "tilezz-dataset-id",
            "value": slug,
        }),
    };

    let content_url = base.map_or_else(
        || "block_index.json".to_string(),
        |b| format!("{b}/block_index.json"),
    );
    let distribution = json!({
        "@type": "DataDownload",
        "contentUrl": content_url,
        "encodingFormat": "application/json",
        "contentSize": manifest_size.to_string(),
    });

    let url = base.map(|b| json!(b));
    let same_as = doi.map(|d| json!(format!("https://doi.org/{d}")));
    (identifier, url, distribution, same_as)
}
/// Writes `ro-crate-metadata.json` for the dataset stored in `dir`.
///
/// The crate graph contains the metadata descriptor, the root `Dataset`, the
/// license, one `Person` per package author, the producing software, a
/// `CreateAction` describing the build, an optional OEIS entity, and one
/// `File` entity (size + SHA-256) for every file found by `snapshot_dir`.
///
/// * `p` -- asset parameters used for names, descriptions and properties.
/// * `counts` -- per-length tallies rendered as `variableMeasured`.
/// * `base_url`, `doi` -- optional hosting coordinates, see
///   `host_coordinate_fields`.
///
/// # Errors
/// Returns any I/O error from scanning `dir` or from creating, writing or
/// flushing the metadata file.
pub fn write_ro_crate(
    dir: &Path,
    p: &AssetParams,
    counts: &SequenceCounts,
    base_url: Option<&str>,
    doi: Option<&str>,
) -> io::Result<()> {
    let files = snapshot_dir(dir)?;
    // Read the clock once so `datePublished` and the CreateAction `endTime`
    // cannot disagree when the wall clock ticks while the graph is built.
    let built_at = build_endtime();
    let has_part: Vec<Value> = files.iter().map(|f| json!({"@id": f.rel_path})).collect();
    // Schemas are inputs to the crate, not results of the build action.
    let result_ids: Vec<Value> = files
        .iter()
        .filter(|f| !f.rel_path.starts_with("schemas/"))
        .map(|f| json!({"@id": f.rel_path}))
        .collect();
    let mut graph: Vec<Value> = Vec::new();
    graph.push(json!({
        "@type": "CreativeWork",
        "@id": "ro-crate-metadata.json",
        "conformsTo": {"@id": "https://w3id.org/ro/crate/1.2"},
        "about": {"@id": "./"},
    }));

    // --- Root dataset entity ---
    let mut root: BTreeMap<&str, Value> = BTreeMap::new();
    root.insert("@id", json!("./"));
    root.insert("@type", json!("Dataset"));
    root.insert("name", json!(dataset_name(p)));
    // Best effort: a missing manifest is reported with size 0 rather than failing.
    let index_size = std::fs::metadata(dir.join("block_index.json"))
        .map(|m| m.len())
        .unwrap_or(0);
    let (identifier, url, distribution, same_as) =
        host_coordinate_fields(base_url, doi, &dataset_identifier(p), index_size);
    root.insert("identifier", identifier);
    if let Some(url) = url {
        root.insert("url", url);
    }
    root.insert(
        "description",
        json!(format!(
            "Simple matchstick polygons (closed self-avoiding unit-edge polygons) on the cyclotomic ring Z[zeta_{ring}], with perimeter <= {n}, canonicalized by {canon} symmetry. Contains {count} sequences.{units} Self-describing tilezz-rat-dafsa-blocks asset (schemas alongside).",
            ring = effective_ring(p),
            n = p.max_steps,
            canon = if p.free { "free (full dihedral)" } else { "rotation only (one-sided)" },
            count = p.n_sequences,
            units = turn_units_note_short(p),
        )),
    );
    root.insert("datePublished", json!(built_at));
    root.insert("version", json!(build_date()));
    root.insert(
        "license",
        json!({"@id": "https://creativecommons.org/licenses/by-sa/4.0/"}),
    );
    let mut keywords = vec![
        json!("combinatorial enumeration"),
        json!("self-avoiding polygon"),
        json!("cyclotomic lattice"),
        json!("matchstick polygon"),
        json!(format!("Z[zeta_{}]", effective_ring(p))),
    ];
    if let Some(oeis) = p.oeis_a_number {
        keywords.push(json!(oeis));
    }
    root.insert("keywords", Value::Array(keywords));
    root.insert("mainEntity", json!({"@id": "block_index.json"}));
    root.insert("distribution", distribution);
    root.insert("hasPart", json!(has_part));
    let max_indexed_length = counts.free.keys().next_back().copied().unwrap_or(0);
    root.insert(
        "additionalProperty",
        dataset_additional_properties(p, max_indexed_length),
    );
    root.insert("variableMeasured", counts.variable_measured());
    root.insert(
        "creator",
        Value::Array(
            authors_as_ids()
                .into_iter()
                .map(|id| json!({"@id": id}))
                .collect(),
        ),
    );
    if let Some(oeis) = p.oeis_a_number {
        let oeis_url = format!("https://oeis.org/{oeis}");
        root.insert("subjectOf", json!({"@id": oeis_url}));
    }
    if let Some(same_as) = same_as {
        root.insert("sameAs", same_as);
    }
    graph.push(serde_json::Value::Object(
        root.into_iter().map(|(k, v)| (k.to_string(), v)).collect(),
    ));

    // --- License entity ---
    graph.push(json!({
        "@id": "https://creativecommons.org/licenses/by-sa/4.0/",
        "@type": "CreativeWork",
        "name": "Creative Commons Attribution-ShareAlike 4.0 International",
        "identifier": "CC-BY-SA-4.0",
    }));

    // --- One Person entity per package author ---
    for (id, name, email) in parse_authors(PKG.authors) {
        let mut p_obj: BTreeMap<&str, Value> = BTreeMap::new();
        // Only ORCID-based ids double as a formal identifier.
        if id.starts_with("https://orcid.org/") {
            p_obj.insert("identifier", json!(id));
        }
        p_obj.insert("@id", json!(id));
        p_obj.insert("@type", json!("Person"));
        p_obj.insert("name", json!(name));
        if let Some(e) = email {
            p_obj.insert("email", json!(e));
        }
        graph.push(serde_json::Value::Object(
            p_obj.into_iter().map(|(k, v)| (k.to_string(), v)).collect(),
        ));
    }

    // --- Producing software and the build action ---
    graph.push(json!({
        "@id": format!("#{}", PKG.name),
        "@type": "SoftwareApplication",
        "name": PKG.name,
        "version": PKG.version,
        "description": PKG.description,
        "url": PKG.repository,
        "codeRepository": PKG.repository,
        "softwareVersion": PKG.commit,
        "softwareRequirements": "Rust toolchain (stable, 2024-12+); standard Cargo build prerequisites (git, C linker, pkg-config).",
        "downloadUrl": format!("{}/archive/{}.tar.gz", PKG.repository, PKG.commit),
    }));
    graph.push(json!({
        "@id": "#build",
        "@type": "CreateAction",
        "name": format!("Generate {}", dataset_identifier(p)),
        "description": format!(
            "Produced by: {}\nReproduce: bash tools/reproduce.sh (in this directory). See README.md for context.",
            reproduce_one_liner(p),
        ),
        "endTime": built_at,
        "instrument": {"@id": format!("#{}", PKG.name)},
        "result": result_ids,
    }));

    // --- Optional OEIS cross-reference entity ---
    if let Some(oeis) = p.oeis_a_number {
        let oeis_url = format!("https://oeis.org/{oeis}");
        graph.push(json!({
            "@id": oeis_url,
            "@type": "CreativeWork",
            "name": format!("OEIS {oeis}"),
            "identifier": oeis,
            "url": oeis_url.clone(),
        }));
    }

    // --- One File entity per snapshot entry ---
    for f in &files {
        let (name, description) = human_label_for(&f.rel_path);
        let mut obj: BTreeMap<&str, Value> = BTreeMap::new();
        let is_schema = f.rel_path.starts_with("schemas/");
        obj.insert(
            "@type",
            if is_schema {
                json!(["File", "CreativeWork"])
            } else {
                json!("File")
            },
        );
        obj.insert("@id", json!(f.rel_path));
        if !name.is_empty() {
            obj.insert("name", json!(name));
            obj.insert("description", json!(description));
        }
        obj.insert("encodingFormat", json!(encoding_format_of(&f.rel_path)));
        obj.insert("contentSize", json!(f.size.to_string()));
        obj.insert("sha256", json!(f.sha256));
        let conforms = conforms_to_of(&f.rel_path);
        if !conforms.is_empty() {
            // A single reference is emitted as a bare object, several as an array.
            obj.insert(
                "conformsTo",
                if conforms.len() == 1 {
                    conforms.into_iter().next().unwrap()
                } else {
                    Value::Array(conforms)
                },
            );
        }
        graph.push(serde_json::Value::Object(
            obj.into_iter().map(|(k, v)| (k.to_string(), v)).collect(),
        ));
    }

    let root_obj = json!({
        "@context": "https://w3id.org/ro/crate/1.2/context",
        "@graph": graph,
    });
    let path = dir.join("ro-crate-metadata.json");
    let mut writer = std::io::BufWriter::new(std::fs::File::create(&path)?);
    serde_json::to_writer_pretty(&mut writer, &root_obj)
        .map_err(|e| io::Error::other(format!("write ro-crate-metadata.json: {e}")))?;
    io::Write::write_all(&mut writer, b"\n")?;
    // Flush explicitly: dropping a BufWriter discards any error from the
    // final write, which would leave a truncated file unreported.
    io::Write::flush(&mut writer)?;
    Ok(())
}
fn recover_slug(graph: &[Value]) -> String {
for e in graph {
if e.get("@id").and_then(|v| v.as_str()) == Some("#build")
&& let Some(slug) = e
.get("name")
.and_then(|v| v.as_str())
.and_then(|n| n.strip_prefix("Generate "))
{
return slug.to_string();
}
}
"tilezz-rat-dataset".to_string()
}
/// Rewrites the hosting-dependent fields of an existing crate in `dir`.
///
/// Loads `ro-crate-metadata.json`, recomputes `identifier`, `distribution`,
/// `url` and `sameAs` on the root dataset from `base_url` / `doi` (see
/// `host_coordinate_fields`), and writes the file back. `url` and `sameAs`
/// are removed when the new coordinates do not provide them; everything else
/// in the crate is left untouched.
///
/// # Errors
/// * `NotFound` when `dir` has no `ro-crate-metadata.json`.
/// * Other I/O errors when the file is not valid JSON, `block_index.json`
///   cannot be inspected, the crate has no `@graph` array or no entity with
///   `@id` `"./"`, or writing/flushing the result fails.
pub fn rehost_ro_crate(dir: &Path, base_url: Option<&str>, doi: Option<&str>) -> io::Result<()> {
    let path = dir.join("ro-crate-metadata.json");
    if !path.is_file() {
        return Err(io::Error::new(
            io::ErrorKind::NotFound,
            format!(
                "{} is not an RO-Crate: no ro-crate-metadata.json found",
                dir.display()
            ),
        ));
    }
    let bytes = std::fs::read(&path)?;
    let mut crate_json: Value = serde_json::from_slice(&bytes)
        .map_err(|e| io::Error::other(format!("{} is not valid JSON: {e}", path.display())))?;
    // The manifest size goes into the regenerated `distribution` entry.
    let manifest_size = std::fs::metadata(dir.join("block_index.json"))
        .map_err(|e| {
            io::Error::other(format!(
                "cannot stat {}/block_index.json (re-host needs the dataset payload present): {e}",
                dir.display()
            ))
        })?
        .len();
    // Recover the slug before taking the mutable borrow of the graph below.
    let slug = crate_json
        .get("@graph")
        .and_then(|g| g.as_array())
        .map(|g| recover_slug(g))
        .unwrap_or_else(|| "tilezz-rat-dataset".to_string());
    let graph = crate_json
        .get_mut("@graph")
        .and_then(|g| g.as_array_mut())
        .ok_or_else(|| io::Error::other(format!("{} has no @graph array", path.display())))?;
    // Non-object graph entries are skipped while looking for the root.
    let root = graph
        .iter_mut()
        .find_map(|e| {
            let obj = e.as_object_mut()?;
            (obj.get("@id").and_then(|v| v.as_str()) == Some("./")).then_some(obj)
        })
        .ok_or_else(|| {
            io::Error::other(format!(
                "{} has no Dataset entity at @id \"./\"",
                path.display()
            ))
        })?;
    let (identifier, url, distribution, same_as) =
        host_coordinate_fields(base_url, doi, &slug, manifest_size);
    root.insert("identifier".to_string(), identifier);
    root.insert("distribution".to_string(), distribution);
    match url {
        Some(v) => {
            root.insert("url".to_string(), v);
        }
        None => {
            root.remove("url");
        }
    }
    match same_as {
        Some(v) => {
            root.insert("sameAs".to_string(), v);
        }
        None => {
            root.remove("sameAs");
        }
    }
    let mut writer = std::io::BufWriter::new(std::fs::File::create(&path)?);
    serde_json::to_writer_pretty(&mut writer, &crate_json)
        .map_err(|e| io::Error::other(format!("write {}: {e}", path.display())))?;
    io::Write::write_all(&mut writer, b"\n")?;
    // Flush explicitly: dropping a BufWriter discards any error from the
    // final write, which would leave a truncated file unreported.
    io::Write::flush(&mut writer)?;
    Ok(())
}
/// Writes the self-describing extras into `dir`: the embedded schema files
/// under `schemas/`, the helper scripts under `tools/`, a generated
/// `README.md`, and a generated `tools/reproduce.sh` (mode `0o755` on Unix).
///
/// # Errors
/// Returns the first I/O error from creating a directory, writing a file or
/// changing the script's permissions.
pub fn write_archival_extras(dir: &Path, params: &AssetParams) -> io::Result<()> {
    let schemas = dir.join("schemas");
    std::fs::create_dir_all(&schemas)?;
    for (file_name, contents) in [
        ("blocks_schema.txt", BLOCKS_SCHEMA_TXT),
        ("rat_schema.txt", RAT_SCHEMA_TXT),
        ("block_index.schema.json", BLOCK_INDEX_SCHEMA_JSON),
    ] {
        std::fs::write(schemas.join(file_name), contents)?;
    }

    let tools = dir.join("tools");
    std::fs::create_dir_all(&tools)?;
    for (file_name, contents) in [
        ("decode.py", DECODER_PY),
        ("verify_sha256.py", VERIFY_SHA256_PY),
        ("count_by_length.py", COUNT_BY_LENGTH_PY),
        ("verify_counts.py", VERIFY_COUNTS_PY),
    ] {
        std::fs::write(tools.join(file_name), contents)?;
    }

    std::fs::write(dir.join("README.md"), readme_md(params))?;

    let script = tools.join("reproduce.sh");
    std::fs::write(&script, reproduce_sh(params))?;
    // Mark the rebuild script executable where the platform has mode bits.
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let mut mode = std::fs::metadata(&script)?.permissions();
        mode.set_mode(0o755);
        std::fs::set_permissions(&script, mode)?;
    }
    Ok(())
}
/// Writes the top-level collection crate `web_dir/ro-crate-metadata.json`.
///
/// Every sub-directory of `web_dir/data` that has its own
/// `ro-crate-metadata.json` becomes a `Dataset` stub in the collection graph.
/// The stub copies a fixed set of descriptive keys from the child's root
/// dataset and links to the child manifest. Children whose manifest cannot be
/// read, is not valid JSON or has no root dataset are skipped with a warning
/// on stderr.
///
/// `page_url` is recorded as the collection's `identifier`/`url` and as the
/// web application's `url`.
///
/// # Errors
/// Returns any I/O error from listing `web_dir/data` or from creating,
/// writing or flushing the output file.
pub fn write_collection_ro_crate(web_dir: &Path, page_url: &str) -> io::Result<()> {
    let mut has_part_ids: Vec<Value> = Vec::new();
    let mut child_stubs: Vec<Value> = Vec::new();
    let data_dir = web_dir.join("data");
    let mut child_dirs: Vec<std::path::PathBuf> = Vec::new();
    if data_dir.is_dir() {
        for entry in std::fs::read_dir(&data_dir)? {
            let entry = entry?;
            let path = entry.path();
            if path.is_dir() && path.join("ro-crate-metadata.json").is_file() {
                child_dirs.push(path);
            }
        }
    }
    // Sort so the output does not depend on directory iteration order.
    child_dirs.sort();
    for path in &child_dirs {
        let dir_name = match path.file_name().and_then(|s| s.to_str()) {
            Some(n) => n.to_string(),
            None => continue,
        };
        let stub_id = format!("./data/{dir_name}/");
        let manifest_id = format!("./data/{dir_name}/ro-crate-metadata.json");
        let crate_path = path.join("ro-crate-metadata.json");
        let bytes = match std::fs::read(&crate_path) {
            Ok(b) => b,
            Err(e) => {
                eprintln!("warn: cannot read {}: {e}", crate_path.display());
                continue;
            }
        };
        let crate_json: Value = match serde_json::from_slice(&bytes) {
            Ok(v) => v,
            Err(e) => {
                eprintln!("warn: {} is not valid JSON: {e}", crate_path.display());
                continue;
            }
        };
        let root = match find_root_dataset(&crate_json) {
            Some(r) => r,
            None => {
                eprintln!(
                    "warn: {} has no Dataset entity at @id \"./\"; skipping",
                    crate_path.display()
                );
                continue;
            }
        };
        let mut stub = serde_json::Map::new();
        stub.insert("@id".into(), json!(stub_id));
        stub.insert("@type".into(), json!("Dataset"));
        // Copy only the descriptive keys; the child manifest stays the
        // authority for the full file inventory.
        for key in [
            "identifier",
            "name",
            "description",
            "license",
            "datePublished",
            "version",
            "keywords",
            "creator",
            "subjectOf",
            "additionalProperty",
        ] {
            if let Some(v) = root.get(key) {
                stub.insert(key.to_string(), v.clone());
            }
        }
        stub.insert("encodingFormat".into(), json!("tilezz-rat-dafsa-blocks"));
        stub.insert(
            "distribution".into(),
            json!({
                "@type": "DataDownload",
                "encodingFormat": "application/ld+json",
                "contentUrl": manifest_id.clone(),
                "description": "RO-Crate 1.2 manifest for this dataset (canonical, self-describing).",
            }),
        );
        has_part_ids.push(json!({"@id": stub_id}));
        child_stubs.push(Value::Object(stub));
    }

    let mut graph: Vec<Value> = Vec::new();
    graph.push(json!({
        "@type": "CreativeWork",
        "@id": "ro-crate-metadata.json",
        "conformsTo": {"@id": "https://w3id.org/ro/crate/1.2"},
        "about": {"@id": "./"},
    }));

    // --- Root dataset entity of the collection ---
    let mut root: BTreeMap<&str, Value> = BTreeMap::new();
    root.insert("@id", json!("./"));
    root.insert("@type", json!("Dataset"));
    root.insert("name", json!("tilezz Rat Explorer"));
    root.insert(
        "description",
        json!(
            "Interactive WebAssembly explorer for simple matchstick polygons on cyclotomic \
             lattices Z[zeta_n]. Hosts a queryable database of canonical polygons; each \
             dataset under data/ is a self-describing RO-Crate sub-dataset (follow its \
             ro-crate-metadata.json for the full file inventory)."
        ),
    );
    root.insert("identifier", json!(page_url));
    root.insert("url", json!(page_url));
    root.insert("datePublished", json!(build_endtime()));
    root.insert("version", json!(build_date()));
    root.insert(
        "license",
        json!({"@id": "https://creativecommons.org/licenses/by-sa/4.0/"}),
    );
    root.insert("isBasedOn", json!({"@id": format!("#{}", PKG.name)}));
    root.insert(
        "creator",
        Value::Array(
            authors_as_ids()
                .into_iter()
                .map(|id| json!({"@id": id}))
                .collect(),
        ),
    );
    root.insert("mainEntity", json!({"@id": "#rat-explorer"}));
    root.insert("hasPart", json!(has_part_ids));
    graph.push(serde_json::Value::Object(
        root.into_iter().map(|(k, v)| (k.to_string(), v)).collect(),
    ));

    // --- The web application itself ---
    graph.push(json!({
        "@id": "#rat-explorer",
        "@type": "WebApplication",
        "name": "tilezz Rat Explorer",
        "description": "Interactive explorer for simple matchstick polygons on cyclotomic lattices. Builds polygons from angle sequences and looks them up in a packaged RO-Crate database of canonical forms.",
        "url": page_url,
        "applicationCategory": "BrowserApplication",
        "operatingSystem": "Any (browser with WebAssembly)",
        "isBasedOn": {"@id": format!("#{}", PKG.name)},
        "author": authors_as_ids().into_iter().map(|id| json!({"@id": id})).collect::<Vec<_>>(),
        "license": {"@id": "https://opensource.org/license/mit"},
    }));

    // --- One Person entity per package author ---
    for (id, name, email) in parse_authors(PKG.authors) {
        let mut p_obj: BTreeMap<&str, Value> = BTreeMap::new();
        // Only ORCID-based ids double as a formal identifier.
        if id.starts_with("https://orcid.org/") {
            p_obj.insert("identifier", json!(id));
        }
        p_obj.insert("@id", json!(id));
        p_obj.insert("@type", json!("Person"));
        p_obj.insert("name", json!(name));
        if let Some(e) = email {
            p_obj.insert("email", json!(e));
        }
        graph.push(serde_json::Value::Object(
            p_obj.into_iter().map(|(k, v)| (k.to_string(), v)).collect(),
        ));
    }

    // --- Source code entity ---
    graph.push(json!({
        "@id": format!("#{}", PKG.name),
        "@type": "SoftwareSourceCode",
        "name": PKG.name,
        "description": PKG.description,
        "codeRepository": PKG.repository,
        "programmingLanguage": "Rust",
        "version": PKG.version,
        "softwareVersion": PKG.commit,
        "license": {"@id": "https://opensource.org/license/mit"},
    }));

    // --- License entity, added only if no entity with that id exists yet ---
    let cc_id = "https://creativecommons.org/licenses/by-sa/4.0/";
    let already_present = graph
        .iter()
        .any(|e| e.get("@id").and_then(|v| v.as_str()) == Some(cc_id));
    if !already_present {
        graph.push(json!({
            "@id": cc_id,
            "@type": "CreativeWork",
            "name": "Creative Commons Attribution-ShareAlike 4.0 International",
            "identifier": "CC-BY-SA-4.0",
        }));
    }
    graph.extend(child_stubs);

    let root_obj = json!({
        "@context": "https://w3id.org/ro/crate/1.2/context",
        "@graph": graph,
    });
    let out_path = web_dir.join("ro-crate-metadata.json");
    let mut writer = std::io::BufWriter::new(std::fs::File::create(&out_path)?);
    serde_json::to_writer_pretty(&mut writer, &root_obj)
        .map_err(|e| io::Error::other(format!("write {}: {e}", out_path.display())))?;
    io::Write::write_all(&mut writer, b"\n")?;
    // Flush explicitly: dropping a BufWriter discards any error from the
    // final write, which would leave a truncated file unreported.
    io::Write::flush(&mut writer)?;
    Ok(())
}
fn find_root_dataset(crate_json: &Value) -> Option<&serde_json::Map<String, Value>> {
let graph = crate_json.get("@graph")?.as_array()?;
for entity in graph {
let obj = entity.as_object()?;
if obj.get("@id").and_then(|v| v.as_str()) == Some("./") {
return Some(obj);
}
}
None
}
/// Builds the CLI flags common to every reproduction command:
/// `--ring R -n N`, then ` --free` when `p.free`, ` --step S` when the step
/// is not 1, and ` --oeis-a-number A` when an OEIS number is set.
fn shared_flags(p: &AssetParams) -> String {
    let mut flags = format!("--ring {} -n {}", p.ring, p.max_steps);
    if p.free {
        flags.push_str(" --free");
    }
    if p.step != 1 {
        flags.push_str(&format!(" --step {}", p.step));
    }
    if let Some(a) = p.oeis_a_number {
        flags.push_str(&format!(" --oeis-a-number {a}"));
    }
    flags
}
/// Returns the shell commands that rebuild the asset, in execution order.
///
/// `InMemory` yields a single `--mode dafsa-blocks` invocation.
/// `StreamingPipeline` yields the `stream`, `merge` and `build` invocations
/// followed by an `mv` that moves the result to the dataset identifier.
fn reproduce_commands(p: &AssetParams) -> Vec<String> {
    let ident = dataset_identifier(p);
    let tbb = p.target_block_bytes;
    // Every rat_enum call starts with the same binary and shared flags.
    let prefix = format!("./target/release/rat_enum {}", shared_flags(p));
    match p.produced_via {
        ProducedVia::InMemory => vec![format!(
            "{prefix} --mode dafsa-blocks --target-block-bytes {tbb} --threads 0 -o {ident}"
        )],
        ProducedVia::StreamingPipeline => {
            let work_dir = format!("{ident}-pipeline");
            vec![
                format!("{prefix} --mode stream --threads 16 -o {work_dir}"),
                format!("{prefix} --mode merge -o {work_dir}"),
                format!("{prefix} --mode build --target-block-bytes {tbb} -o {work_dir}"),
                format!("mv {work_dir}/dafsa {ident}"),
            ]
        }
    }
}
/// Chains the reproduction commands into a single shell line with ` && `.
fn reproduce_one_liner(p: &AssetParams) -> String {
    let commands = reproduce_commands(p);
    commands.join(" && ")
}
/// Renders the contents of `tools/reproduce.sh`: a POSIX shell script that
/// clones the source repository, checks out the recorded commit, builds
/// `rat_enum` and runs the reproduction command(s) for this asset.
///
/// The template lines are part of the emitted script, so they start at
/// column 0 and must not be re-indented.
fn reproduce_sh(p: &AssetParams) -> String {
    let (repo, commit, name) = (PKG.repository, PKG.commit, PKG.name);
    let ident = dataset_identifier(p);
    let pipeline_label = match p.produced_via {
        ProducedVia::StreamingPipeline => "streaming pipeline (--mode stream + merge + build)",
        ProducedVia::InMemory => "in-memory (single-step --mode dafsa-blocks)",
    };
    // One command per line, each terminated by a newline.
    let steps: String = reproduce_commands(p)
        .iter()
        .map(|cmd| format!("{cmd}\n"))
        .collect();
    format!(
        "#!/bin/sh
#
# Rebuild this dataset ({ident}) from source.
# Reproduction strategy: {pipeline_label}.
#
# Tested with the same tilezz commit that produced the original.
# Re-running this script in a clean directory should yield a
# directory whose contents match the recorded sha256s in
# ro-crate-metadata.json (see the verification snippet in
# README.md).
#
# For bit-identical ro-crate-metadata.json, set
# SOURCE_DATE_EPOCH to a fixed value before running -- otherwise
# CreateAction.endTime will drift, but the block files and
# block_index.json are unaffected.
set -eu
REPO=\"${{REPO:-{repo}}}\"
COMMIT=\"${{COMMIT:-{commit}}}\"
SRC_DIR=\"${{SRC_DIR:-{name}-{commit}}}\"
if [ ! -d \"$SRC_DIR/.git\" ]; then
git clone \"$REPO\" \"$SRC_DIR\"
fi
( cd \"$SRC_DIR\" && git fetch && git checkout \"$COMMIT\" && \\
cargo build --release --bin rat_enum --features cli )
# Run the reproduction step(s) relative to the source tree so the
# `./target/release/rat_enum` path resolves.
cd \"$SRC_DIR\"
{steps}\
echo
echo \"Reproduced asset: $(pwd)/{ident}\"
echo \"Compare against the recorded sha256s via the snippet in README.md.\"
",
    )
}
/// Renders the `README.md` text for the asset described by `p`.
///
/// Package coordinates (name, version, repository, commit, authors) come
/// from `PKG`; the dataset title and identifier, the effective ring, the
/// perimeter bound, the sequence count, the optional OEIS cross-reference
/// and the turn-unit note are derived from `p`. The copyright year is
/// whatever `current_year` reports.
fn readme_md(p: &AssetParams) -> String {
    let (name, version, repo, commit) = (PKG.name, PKG.version, PKG.repository, PKG.commit);
    let ident = dataset_identifier(p);
    let name_h = dataset_name(p);
    let authors = format_authors(PKG.authors);
    let year = current_year();
    let units = turn_units_note_readme(p);
    let pipeline_blurb = match p.produced_via {
        ProducedVia::StreamingPipeline => {
            "three-stage streaming pipeline (`--mode stream` -> `--mode merge` -> `--mode build`)"
        }
        ProducedVia::InMemory => "single-step in-memory build (`--mode dafsa-blocks`)",
    };
    let canon = match p.free {
        true => "free (full dihedral symmetry reduction)",
        false => "rotation-canonical (one-sided)",
    };
    // Empty when no OEIS A-number was supplied, so the template needs no
    // conditional of its own.
    let oeis_note = p.oeis_a_number.map_or_else(String::new, |a| {
        format!(
            "\n\nUpstream cross-reference: this dataset realises OEIS sequence \
[`{a}`](https://oeis.org/{a}).",
        )
    });
    format!(
        "# {name_h}
Simple matchstick polygons -- closed self-avoiding polygonal walks \
with unit-length edges and turn angles in integer multiples of \
`2*pi/{ring}` -- on the cyclotomic ring `Z[zeta_{ring}]`, with perimeter \
up to {n}, canonicalised under {canon} symmetry. {count} sequences.\
{oeis_note}{units}
## Copyright
Copyright (c) {year} {authors}. All rights reserved subject to the \
license below.
## License
This dataset is distributed under the [Creative Commons \
Attribution-ShareAlike 4.0 International \
License](https://creativecommons.org/licenses/by-sa/4.0/) \
(CC-BY-SA-4.0). In short:
- **Attribution** -- credit the original author(s) above and \
link back to the source repository ({repo}).
- **ShareAlike** -- if you remix, transform, or build upon this \
dataset, distribute your contributions under the same license.
The full license text is at <https://creativecommons.org/licenses/by-sa/4.0/legalcode>.
## Contents
This directory is a [RO-Crate 1.2](https://www.researchobject.org/ro-crate/specification/1.2/) \
asset. The entry point is `ro-crate-metadata.json`; every file \
listed below is also recorded there with a `sha256`, an \
`encodingFormat`, and (where applicable) a `conformsTo` pointer \
to its schema.
```
{ident}/
README.md this file
ro-crate-metadata.json RO-Crate 1.2 manifest (start here for tooling)
block_index.json DAFSA wire manifest (counts, root state, sha256 block index)
schemas/
block_index.schema.json formal JSON Schema (draft 2020-12)
blocks_schema.txt prose spec covering JSON + .bin formats
rat_schema.txt length-prefix convention inside the DAFSA
blocks/
<sha256>.bin one gzipped DAFSA block each; filename = SHA-256 of file
tools/
decode.py standalone Python 3 decoder (no deps)
verify_sha256.py SHA-256 verifier (no deps; exits 0 on full match)
count_by_length.py per-exact-length counts (OEIS-style terms)
verify_counts.py re-derive + verify the variableMeasured sub-family sequences
reproduce.sh executable rebuild script (clones + builds + runs)
```
## Extracting sequences (no Rust toolchain needed)
`tools/decode.py` walks the blocked DAFSA and prints every \
sequence as a line of space-separated signed integers:
```sh
python3 tools/decode.py > rats.txt
```
The line count of `rats.txt` must equal `n_sequences` from \
`block_index.json`.
## Verifying SHA-256s
Every File entity in `ro-crate-metadata.json` carries a `sha256` \
that matches the on-disk bytes. `tools/verify_sha256.py` checks \
the whole set:
```sh
python3 tools/verify_sha256.py
```
Exits 0 on full match, 1 on any mismatch.
## Reproducing from source
Produced by [`{name}`]({repo}) v{version}, commit `{commit}`, via \
{pipeline_blurb}.
The `reproduce.sh` script in this directory is a self-contained \
recipe: it clones the repo at the recorded commit, builds \
`rat_enum`, and runs the exact sequence of commands that \
produced the dataset.
Prerequisites: a recent Rust toolchain (stable, 2024-12 or \
later); `git`, a C linker, and `pkg-config` (the standard Cargo \
build deps).
```sh
bash tools/reproduce.sh
```
(The script honours `REPO`, `COMMIT`, and `SRC_DIR` environment \
variables for mirroring / vendoring / pre-cloned checkouts; see \
the script header.)
The reproduced directory must match the existing one \
file-for-file. If not, the most likely cause is a different Rust \
version producing a different state ordering inside the DAFSA -- \
pin to the toolchain version this commit's `Cargo.lock` \
specifies. For bit-identical `ro-crate-metadata.json` across \
reruns, set `SOURCE_DATE_EPOCH` to a fixed POSIX-seconds value \
before running:
```sh
SOURCE_DATE_EPOCH=1780000000 bash tools/reproduce.sh
```
Otherwise the `CreateAction.endTime` will reflect the current \
wall-clock time and the metadata hash will drift; the block \
files and `block_index.json` are unaffected.
",
        ring = effective_ring(p),
        n = p.max_steps,
        count = p.n_sequences,
    )
}
/// Splits a colon-separated author list into `(id, name, email)` triples.
///
/// Each non-empty, trimmed entry is expected to look like `Name <email>`.
/// When the entry has a `<` that precedes its last `>`, the text before `<`
/// becomes the name and the text between the brackets the email; otherwise
/// the whole entry is the name and the email is `None`.
///
/// The id is the ORCID that `orcid_for` returns for the email, or
/// `#author-{i}` where `i` is the entry's position among the non-empty
/// entries.
fn parse_authors(authors: &str) -> Vec<(String, String, Option<String>)> {
    let entries = authors
        .split(':')
        .map(str::trim)
        .filter(|entry| !entry.is_empty());
    let mut parsed = Vec::new();
    for (idx, entry) in entries.enumerate() {
        // Only a well-ordered `<...>` pair counts as an email bracket.
        let brackets = entry
            .find('<')
            .zip(entry.rfind('>'))
            .filter(|(open, close)| open < close);
        let (name, email) = match brackets {
            Some((open, close)) => (
                entry[..open].trim().to_string(),
                Some(entry[open + 1..close].trim().to_string()),
            ),
            None => (entry.to_string(), None),
        };
        let id = orcid_for(email.as_deref())
            .map_or_else(|| format!("#author-{idx}"), |orcid| orcid.to_string());
        parsed.push((id, name, email));
    }
    parsed
}
/// Returns just the id component of every author parsed from `PKG.authors`,
/// in the order `parse_authors` yields them.
fn authors_as_ids() -> Vec<String> {
    let mut ids = Vec::new();
    for (id, _name, _email) in parse_authors(PKG.authors) {
        ids.push(id);
    }
    ids
}
/// Turns a colon-separated author list into a comma-separated list of bare
/// names.
///
/// Entries are trimmed and empty ones dropped. For each remaining entry,
/// everything from the first `<` onwards is discarded and the rest is
/// trimmed again; entries without `<` are used as-is.
fn format_authors(authors: &str) -> String {
    let mut names: Vec<&str> = Vec::new();
    for entry in authors.split(':') {
        let entry = entry.trim();
        if entry.is_empty() {
            continue;
        }
        let name = match entry.split_once('<') {
            Some((before_bracket, _)) => before_bracket.trim(),
            None => entry,
        };
        names.push(name);
    }
    names.join(", ")
}
/// Returns the proleptic-Gregorian (UTC) calendar year of the build
/// timestamp.
///
/// The timestamp is `SOURCE_DATE_EPOCH` when that variable is set and parses
/// as an `i64` number of seconds; otherwise it is the current system time,
/// or 0 if the system clock reads earlier than the Unix epoch.
fn current_year() -> i64 {
    let secs: i64 = std::env::var("SOURCE_DATE_EPOCH")
        .ok()
        .and_then(|raw| raw.parse().ok())
        .unwrap_or_else(|| {
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map_or(0, |elapsed| elapsed.as_secs() as i64)
        });
    // Days-to-civil conversion on a calendar whose year starts on 1 March,
    // organised in 400-year eras of 146_097 days each.
    let shifted_days = secs.div_euclid(86_400) + 719_468;
    let era = shifted_days.div_euclid(146_097);
    let day_of_era = shifted_days.rem_euclid(146_097) as u64;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    // 0 = March, ..., 9 = December, 10 = January, 11 = February.
    let march_based_month = (5 * day_of_year + 2) / 153;
    let march_based_year = (year_of_era as i64) + era * 400;
    // January and February belong to the next civil year.
    if march_based_month >= 10 {
        march_based_year + 1
    } else {
        march_based_year
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// `format_iso_utc` renders known epoch offsets as exact ISO-8601 UTC
/// timestamps.
#[test]
fn iso_utc_examples() {
    assert_eq!(format_iso_utc(0), "1970-01-01T00:00:00Z");
    // 1_780_000_000 s = 20_601 whole days + 73_600 s past the epoch, i.e.
    // 2026-05-28 20:26:40 UTC. Pin the full string rather than just the
    // year prefix so month/day/time regressions are caught too.
    let secs_2026_05_28 = 1_780_000_000;
    assert_eq!(format_iso_utc(secs_2026_05_28), "2026-05-28T20:26:40Z");
}
/// `encoding_format_of` maps each known path shape to its media type and
/// falls back to `application/octet-stream`.
#[test]
fn encoding_format_table() {
    let cases = [
        ("block_index.json", "application/json"),
        ("blocks/abcdef0123456789.bin", "application/gzip"),
        ("schemas/blocks_schema.txt", "text/plain"),
        ("README.md", "text/markdown"),
        ("xyzzy", "application/octet-stream"),
    ];
    for (path, expected) in cases {
        assert_eq!(encoding_format_of(path), expected);
    }
}
/// `dataset_identifier` reflects the ring/step combination, the perimeter
/// bound and the free/one-sided flag.
#[test]
fn dataset_identifier_shape() {
    let free_zz12 = AssetParams {
        ring: 12,
        max_steps: 10,
        step: 1,
        free: true,
        target_block_bytes: 512,
        n_sequences: 16_751,
        oeis_a_number: Some("A316192"),
        produced_via: ProducedVia::InMemory,
    };
    let cases = [
        (free_zz12.clone(), "tilezz-rat-zz12-n10-free"),
        (
            AssetParams {
                free: false,
                ..free_zz12.clone()
            },
            "tilezz-rat-zz12-n10-onesided",
        ),
        (
            AssetParams {
                free: false,
                step: 3,
                ..free_zz12.clone()
            },
            "tilezz-rat-zz4-n10-onesided",
        ),
        (
            AssetParams {
                ring: 14,
                step: 2,
                ..free_zz12
            },
            "tilezz-rat-zz7-n10-free",
        ),
    ];
    for (params, expected) in &cases {
        assert_eq!(dataset_identifier(params), *expected);
    }
}
/// The in-memory variant reproduces with one command; the streaming variant
/// needs at least stream, merge and build, in that order.
#[test]
fn reproduce_commands_per_pipeline() {
    let in_memory = AssetParams {
        ring: 12,
        max_steps: 10,
        step: 1,
        free: true,
        target_block_bytes: 512,
        n_sequences: 16_751,
        oeis_a_number: None,
        produced_via: ProducedVia::InMemory,
    };
    let streaming = AssetParams {
        produced_via: ProducedVia::StreamingPipeline,
        ..in_memory.clone()
    };

    let in_mem = reproduce_commands(&in_memory);
    assert_eq!(in_mem.len(), 1, "in-memory should be one command");
    assert!(in_mem[0].contains("--mode dafsa-blocks"), "{:?}", in_mem);

    let streamed = reproduce_commands(&streaming);
    assert!(
        streamed.len() >= 3,
        "streaming pipeline needs stream + merge + build, got {:?}",
        streamed
    );
    let expected_modes = ["--mode stream", "--mode merge", "--mode build"];
    for (command, mode) in streamed.iter().zip(expected_modes) {
        assert!(command.contains(mode));
    }
}
/// Builds a small five-sequence asset in a fresh temporary directory and
/// returns its path.
///
/// The directory gets DAFSA blocks (`write_blocks`), the archival extras and
/// an RO-Crate manifest written without a base URL or DOI. Callers are
/// responsible for removing the directory.
fn build_test_asset() -> std::path::PathBuf {
    use crate::stringmatch::dafsa::RatDafsa;
    use std::sync::atomic::{AtomicU64, Ordering};

    // Tests run in parallel inside one process, so pid + clock alone can
    // collide on platforms with a coarse clock; a process-wide counter makes
    // every call's directory name distinct.
    static NEXT_ASSET: AtomicU64 = AtomicU64::new(0);

    let rats: Vec<Vec<i8>> = vec![
        vec![1, 2, 3],
        vec![1, 2, 4],
        vec![1, 3, 1],
        vec![2, 1, 5],
        vec![3, 0, -1, 2],
    ];
    let dafsa = RatDafsa::from_rats(rats.iter().map(|r| r.as_slice()));
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |d| d.subsec_nanos());
    let dir = std::env::temp_dir().join(format!(
        "tilezz_rocrate_test_{}_{}_{}",
        std::process::id(),
        nanos,
        NEXT_ASSET.fetch_add(1, Ordering::Relaxed),
    ));
    // Best-effort: clear anything a previous run left under the same name.
    let _ = std::fs::remove_dir_all(&dir);
    std::fs::create_dir_all(&dir).unwrap();
    dafsa.write_blocks(&dir, 2).expect("write_blocks");
    let params = AssetParams {
        ring: 12,
        max_steps: 4,
        step: 1,
        free: true,
        target_block_bytes: 2,
        n_sequences: rats.len() as u64,
        oeis_a_number: Some("A316192"),
        produced_via: ProducedVia::InMemory,
    };
    let counts = SequenceCounts::from_rats(dafsa.iter());
    write_archival_extras(&dir, &params).expect("write_archival_extras");
    write_ro_crate(&dir, &params, &counts, None, None).expect("write_ro_crate");
    dir
}
/// Every `sha256` recorded in the manifest equals the hash of the file its
/// `@id` names on disk.
#[test]
fn sha256_matches_actual_files() {
    let dir = build_test_asset();
    let raw = std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap();
    let meta: Value = serde_json::from_str(&raw).unwrap();
    let graph = meta["@graph"].as_array().expect("@graph array");
    // Only entities carrying both a string `@id` and a string `sha256` are
    // checked; everything else is skipped.
    let hashed = graph
        .iter()
        .filter_map(|entity| Some((entity["@id"].as_str()?, entity["sha256"].as_str()?)));
    let mut checked = 0;
    for (id, expected) in hashed {
        let on_disk = sha256_hex(&dir.join(id)).expect("hash file");
        assert_eq!(
            expected, on_disk,
            "ro-crate sha256 for {id} does not match on-disk",
        );
        checked += 1;
    }
    assert!(checked >= 5, "expected >= 5 hashed files, got {checked}");
    let _ = std::fs::remove_dir_all(&dir);
}
/// With `SOURCE_DATE_EPOCH` set, two independent builds both record the
/// epoch-derived timestamp in `ro-crate-metadata.json`.
#[test]
fn source_date_epoch_controls_end_time() {
    const VAR: &str = "SOURCE_DATE_EPOCH";

    /// Puts `SOURCE_DATE_EPOCH` back to its prior state when dropped, so a
    /// panic anywhere in this test cannot leave the override in place for
    /// the rest of the test process.
    struct RestoreVar(Option<std::ffi::OsString>);
    impl Drop for RestoreVar {
        fn drop(&mut self) {
            // SAFETY: same caveat as the `set_var` call in the test body.
            unsafe {
                match self.0.take() {
                    Some(previous) => std::env::set_var(VAR, previous),
                    None => std::env::remove_var(VAR),
                }
            }
        }
    }

    let restore = RestoreVar(std::env::var_os(VAR));
    // SAFETY: the process environment is global state. This relies on no
    // non-Rust code reading the environment concurrently.
    // NOTE(review): other tests that build assets read this variable too and
    // may observe the override while it is set -- consider serialising them.
    unsafe { std::env::set_var(VAR, "1780000000") };
    let dir1 = build_test_asset();
    let dir2 = build_test_asset();
    let m1 = std::fs::read_to_string(dir1.join("ro-crate-metadata.json"));
    let m2 = std::fs::read_to_string(dir2.join("ro-crate-metadata.json"));
    // Undo all side effects before asserting so a failure leaves nothing
    // behind.
    drop(restore);
    let _ = std::fs::remove_dir_all(&dir1);
    let _ = std::fs::remove_dir_all(&dir2);
    let (m1, m2) = (m1.unwrap(), m2.unwrap());
    let want = "2026-05-28T20:26:40Z";
    assert!(m1.contains(want), "epoch-derived endTime missing in run 1");
    assert!(m2.contains(want), "epoch-derived endTime missing in run 2");
}
/// Structural checks on the generated `ro-crate-metadata.json`: context,
/// metadata descriptor, root Dataset properties, creator/Person linkage,
/// contextual entities, resolvable local `conformsTo` targets, and
/// `hasPart` entries that exist both in the graph and on disk.
#[test]
fn ro_crate_structure_is_valid() {
    let dir = build_test_asset();
    let raw = std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap();
    let meta: Value = serde_json::from_str(&raw).unwrap();
    assert_eq!(
        meta["@context"], "https://w3id.org/ro/crate/1.2/context",
        "wrong @context"
    );
    let graph = meta["@graph"].as_array().expect("@graph array");
    // Index every entity that has a string `@id`.
    let mut by_id: std::collections::HashMap<&str, &Value> = std::collections::HashMap::new();
    for entity in graph {
        if let Some(id) = entity["@id"].as_str() {
            by_id.insert(id, entity);
        }
    }

    // Metadata descriptor.
    let descriptor = by_id
        .get("ro-crate-metadata.json")
        .expect("metadata descriptor entity present");
    assert_eq!(
        descriptor["conformsTo"]["@id"], "https://w3id.org/ro/crate/1.2",
        "descriptor conformsTo wrong"
    );
    assert_eq!(descriptor["about"]["@id"], "./");

    // Root Dataset.
    let root = by_id.get("./").expect("root Dataset present");
    assert_eq!(root["@type"], "Dataset");
    assert!(root["name"].as_str().is_some(), "Dataset.name missing");
    assert!(root["datePublished"].as_str().is_some());
    assert!(root["hasPart"].as_array().is_some());
    assert!(root["mainEntity"]["@id"].as_str().is_some());
    assert_eq!(
        root["license"]["@id"], "https://creativecommons.org/licenses/by-sa/4.0/",
        "license wrong"
    );
    let dist = &root["distribution"];
    assert_eq!(dist["@type"], "DataDownload", "distribution @type wrong");
    assert_eq!(
        dist["contentUrl"], "block_index.json",
        "distribution.contentUrl wrong for relative (no base_url) build"
    );

    // Creator -> Person linkage through the ORCID.
    let orcid = "https://orcid.org/0000-0002-5077-7497";
    let creators = root["creator"].as_array().expect("creator array");
    let references_orcid = creators.iter().any(|c| c["@id"] == orcid);
    assert!(
        references_orcid,
        "creator does not reference the author ORCID {orcid}"
    );
    let person = by_id.get(orcid).expect("Person entity at ORCID @id");
    assert_eq!(person["@type"], "Person");
    assert_eq!(person["identifier"], orcid, "Person.identifier wrong");

    // Contextual entities.
    assert!(by_id.contains_key("https://creativecommons.org/licenses/by-sa/4.0/"));
    assert!(by_id.contains_key("#tilezz"));
    let build = by_id.get("#build").expect("CreateAction present");
    assert_eq!(build["@type"], "CreateAction");
    assert!(build["endTime"].as_str().is_some());
    assert!(by_id.contains_key("https://oeis.org/A316192"));

    // Every non-http(s) `conformsTo` reference must resolve inside the graph.
    for entity in graph {
        let Some(c) = entity.get("conformsTo") else {
            continue;
        };
        let refs: Vec<&str> = match c.as_array() {
            Some(arr) => arr.iter().filter_map(|v| v["@id"].as_str()).collect(),
            None => c["@id"].as_str().into_iter().collect(),
        };
        let local_refs = refs
            .into_iter()
            .filter(|r| !(r.starts_with("http://") || r.starts_with("https://")));
        for r in local_refs {
            assert!(
                by_id.contains_key(r),
                "conformsTo points at unresolved @id {r}",
            );
        }
    }

    // Every `hasPart` entry is described in the graph and present on disk.
    let parts = root["hasPart"].as_array().unwrap();
    assert!(parts.len() >= 5, "hasPart has too few entries");
    for part in parts {
        let pid = part["@id"].as_str().expect("hasPart entry has @id");
        assert!(
            by_id.contains_key(pid),
            "hasPart referent {pid} not in @graph"
        );
        assert!(
            dir.join(pid).exists(),
            "hasPart referent {pid} missing on disk"
        );
    }
    let _ = std::fs::remove_dir_all(&dir);
}
/// `tools/count_by_length.py` is emitted, described with a non-empty `name`
/// in the manifest, and (when `python3` is available) prints the expected
/// per-length counts for the test asset.
#[test]
fn count_by_length_prints_oeis_terms() {
    let dir = build_test_asset();
    let script = dir.join("tools/count_by_length.py");
    assert!(script.exists(), "tools/count_by_length.py not emitted");

    let raw = std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap();
    let meta: Value = serde_json::from_str(&raw).unwrap();
    let entity = meta["@graph"]
        .as_array()
        .expect("@graph array")
        .iter()
        .find(|e| e["@id"] == "tools/count_by_length.py")
        .expect("File entity for tools/count_by_length.py");
    let label = entity["name"].as_str();
    assert!(
        label.is_some_and(|s| !s.is_empty()),
        "count_by_length.py entity lacks a human label"
    );

    // The end-to-end half needs an interpreter; skip it quietly otherwise.
    let python_available = matches!(
        std::process::Command::new("python3").arg("--version").output(),
        Ok(probe) if probe.status.success()
    );
    if python_available {
        let output = std::process::Command::new("python3")
            .arg(&script)
            .arg(&dir)
            .output()
            .expect("run count_by_length.py");
        assert!(
            output.status.success(),
            "count_by_length.py failed:\nstdout: {}\nstderr: {}",
            String::from_utf8_lossy(&output.stdout),
            String::from_utf8_lossy(&output.stderr),
        );
        let stdout = String::from_utf8_lossy(&output.stdout);
        // Lines starting with `#` are not data rows.
        let lines: Vec<&str> = stdout.lines().filter(|l| !l.starts_with('#')).collect();
        assert_eq!(lines, vec!["3 4", "4 1"], "unexpected terms:\n{stdout}");
    } else {
        eprintln!("skipping end-to-end half: python3 not on PATH");
    }
    let _ = std::fs::remove_dir_all(&dir);
}
/// The root Dataset lists every expected `variableMeasured` series, the
/// `free` series has the expected value for the test asset, and (when
/// `python3` is available) `tools/verify_counts.py` accepts the asset.
#[test]
fn variable_measured_emitted_and_verified() {
    let dir = build_test_asset();
    let raw = std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap();
    let meta: Value = serde_json::from_str(&raw).unwrap();
    let root = meta["@graph"]
        .as_array()
        .expect("@graph")
        .iter()
        .find(|e| e["@id"] == "./")
        .expect("root Dataset");
    let vm = root["variableMeasured"]
        .as_array()
        .expect("variableMeasured array");

    let mut names: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new();
    names.extend(vm.iter().filter_map(|p| p["name"].as_str()));
    let expected_names = [
        "free",
        "oneSided",
        "achiral",
        "rotationSymmetric",
        "symmetric",
        "subring",
        "coset",
    ];
    for want in expected_names {
        assert!(names.contains(want), "variableMeasured missing `{want}`");
    }

    let free_entry = vm.iter().find(|p| p["name"] == "free");
    let free = free_entry
        .and_then(|p| p["value"].as_str())
        .expect("free series");
    assert_eq!(free, "0,0,4,1", "free series wrong: {free}");

    let python_available = matches!(
        std::process::Command::new("python3").arg("--version").output(),
        Ok(probe) if probe.status.success()
    );
    if !python_available {
        eprintln!("skipping verify_counts.py half: python3 not on PATH");
    } else {
        let out = std::process::Command::new("python3")
            .arg(dir.join("tools/verify_counts.py"))
            .arg(&dir)
            .output()
            .expect("run verify_counts.py");
        assert!(
            out.status.success(),
            "verify_counts.py failed:\nstdout: {}\nstderr: {}",
            String::from_utf8_lossy(&out.stdout),
            String::from_utf8_lossy(&out.stderr),
        );
    }
    let _ = std::fs::remove_dir_all(&dir);
}
/// When `max_steps` (7) exceeds the longest sequence actually present (4),
/// the `free` series and `maxIndexedLength` follow the data rather than the
/// requested bound, and `tools/verify_counts.py` still accepts the asset
/// when `python3` is available.
#[test]
fn variable_measured_handles_max_steps_above_deepest_perimeter() {
    use crate::stringmatch::dafsa::RatDafsa;
    let rats: Vec<Vec<i8>> = vec![vec![1, 2, 3], vec![1, 2, 4], vec![3, 0, -1, 2]];
    let dafsa = RatDafsa::from_rats(rats.iter().map(|r| r.as_slice()));
    // A clock before the epoch only costs name freshness, so fall back to 0
    // instead of panicking.
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |d| d.subsec_nanos()) as u64;
    let dir = std::env::temp_dir().join(format!(
        "tilezz_rocrate_gap_{}",
        std::process::id() as u64 * 1000 + nanos
    ));
    // Best-effort: clear anything a previous run left under the same name.
    let _ = std::fs::remove_dir_all(&dir);
    std::fs::create_dir_all(&dir).unwrap();
    dafsa.write_blocks(&dir, 2).expect("write_blocks");
    let params = AssetParams {
        ring: 12,
        max_steps: 7,
        step: 1,
        free: true,
        target_block_bytes: 2,
        n_sequences: rats.len() as u64,
        oeis_a_number: Some("A316192"),
        produced_via: ProducedVia::InMemory,
    };
    let counts = SequenceCounts::from_rats(dafsa.iter());
    write_archival_extras(&dir, &params).expect("write_archival_extras");
    write_ro_crate(&dir, &params, &counts, None, None).expect("write_ro_crate");

    let meta: Value = serde_json::from_str(
        &std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap(),
    )
    .unwrap();
    let graph = meta["@graph"].as_array().expect("@graph");
    let root = graph
        .iter()
        .find(|e| e["@id"] == "./")
        .expect("root Dataset");
    let free = root["variableMeasured"]
        .as_array()
        .unwrap()
        .iter()
        .find(|p| p["name"] == "free")
        .and_then(|p| p["value"].as_str())
        .expect("free series");
    assert_eq!(
        free, "0,0,2,1",
        "free series should stop at perimeter 4: {free}"
    );
    let max_indexed = root["additionalProperty"]
        .as_array()
        .unwrap()
        .iter()
        .find(|p| p["name"] == "maxIndexedLength")
        .and_then(|p| p["value"].as_u64())
        .expect("maxIndexedLength");
    assert_eq!(max_indexed, 4, "maxIndexedLength should track the data");

    // The end-to-end half needs an interpreter; skip it quietly otherwise.
    let have_python = std::process::Command::new("python3")
        .arg("--version")
        .output()
        .is_ok_and(|o| o.status.success());
    if have_python {
        let out = std::process::Command::new("python3")
            .arg(dir.join("tools/verify_counts.py"))
            .arg(&dir)
            .output()
            .expect("run verify_counts.py");
        assert!(
            out.status.success(),
            "verify_counts.py failed on the gap asset:\nstdout: {}\nstderr: {}",
            String::from_utf8_lossy(&out.stdout),
            String::from_utf8_lossy(&out.stderr),
        );
    } else {
        eprintln!("skipping verify_counts.py half: python3 not on PATH");
    }
    let _ = std::fs::remove_dir_all(&dir);
}
/// The emitted `block_index.json` validates against the emitted
/// `schemas/block_index.schema.json`.
#[test]
fn block_index_json_validates_against_schema() {
    let dir = build_test_asset();
    let schema_text =
        std::fs::read_to_string(dir.join("schemas/block_index.schema.json")).unwrap();
    let schema: Value = serde_json::from_str(&schema_text).unwrap();
    let instance_text = std::fs::read_to_string(dir.join("block_index.json")).unwrap();
    let instance: Value = serde_json::from_str(&instance_text).unwrap();

    let validator = jsonschema::validator_for(&schema).expect("build validator");
    // Collect every violation so a failure reports all of them at once.
    let mut errors: Vec<String> = Vec::new();
    for e in validator.iter_errors(&instance) {
        errors.push(format!("{} at {}", e, e.instance_path()));
    }
    assert!(
        errors.is_empty(),
        "block_index.json failed validation:\n {}",
        errors.join("\n ")
    );
    let _ = std::fs::remove_dir_all(&dir);
}
/// Returns the sorted `(@id, sha256)` pairs of every `@graph` entity that
/// carries both as strings.
///
/// Panics if `crate_json["@graph"]` is not an array.
fn file_sha_pairs(crate_json: &Value) -> Vec<(String, String)> {
    let graph = crate_json["@graph"].as_array().unwrap();
    let mut pairs: Vec<(String, String)> = Vec::new();
    for entity in graph {
        if let (Some(id), Some(sha)) = (entity["@id"].as_str(), entity["sha256"].as_str()) {
            pairs.push((id.to_string(), sha.to_string()));
        }
    }
    pairs.sort();
    pairs
}
/// Re-hosting an asset rewrites only the host-dependent coordinates
/// (identifier, distribution URL, `sameAs`, `url`) and leaves everything
/// else -- other root properties, the `#build` action, the tool version,
/// file hashes and block bytes -- untouched; re-hosting back to the
/// relative form restores the original identifier and removes the
/// host-only properties.
#[test]
fn rehost_rewrites_only_host_coordinates() {
let dir = build_test_asset();
// Snapshot the manifest, the root Dataset, the file hashes and one block
// file before any re-hosting happens.
let before: Value = serde_json::from_str(
&std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap(),
)
.unwrap();
let before_root = before["@graph"]
.as_array()
.unwrap()
.iter()
.find(|e| e["@id"] == "./")
.unwrap()
.clone();
let before_files = file_sha_pairs(&before);
let blocks_dir = dir.join("blocks");
let block_path = std::fs::read_dir(&blocks_dir)
.unwrap()
.filter_map(|e| e.ok())
.map(|e| e.path())
.find(|p| p.extension().and_then(|s| s.to_str()) == Some("bin"))
.expect("at least one block file");
let block_bytes_before = std::fs::read(&block_path).unwrap();
// Step 1: re-host to an absolute base URL with a DOI.
rehost_ro_crate(
&dir,
Some("https://zenodo.org/records/123/files"),
Some("10.5281/zenodo.123"),
)
.expect("rehost to zenodo");
let after: Value = serde_json::from_str(
&std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap(),
)
.unwrap();
let after_root = after["@graph"]
.as_array()
.unwrap()
.iter()
.find(|e| e["@id"] == "./")
.unwrap()
.clone();
// Host coordinates now carry the DOI and the absolute URLs.
assert_eq!(after_root["identifier"]["@type"], "PropertyValue");
assert_eq!(after_root["identifier"]["propertyID"], "DOI");
assert_eq!(after_root["identifier"]["value"], "10.5281/zenodo.123");
assert_eq!(
after_root["distribution"]["contentUrl"],
"https://zenodo.org/records/123/files/block_index.json"
);
assert_eq!(after_root["sameAs"], "https://doi.org/10.5281/zenodo.123");
assert_eq!(after_root["url"], "https://zenodo.org/records/123/files");
// Every host-independent root property must be unchanged (a key absent
// before must still be absent after, since `get` is compared as Option).
for key in [
"datePublished",
"version",
"variableMeasured",
"additionalProperty",
"name",
"description",
"keywords",
"creator",
"license",
"hasPart",
"mainEntity",
"subjectOf",
] {
assert_eq!(
after_root.get(key),
before_root.get(key),
"re-host changed `{key}` on the root Dataset",
);
}
// Looks up a graph entity by `@id`, panicking when it is missing.
let find = |c: &Value, id: &str| -> Value {
c["@graph"]
.as_array()
.unwrap()
.iter()
.find(|e| e["@id"] == id)
.unwrap()
.clone()
};
// The endTime check is subsumed by the whole-entity comparison below but
// gives a more specific failure message.
assert_eq!(
find(&after, "#build")["endTime"],
find(&before, "#build")["endTime"],
"re-host changed the CreateAction endTime"
);
assert_eq!(
find(&after, "#build"),
find(&before, "#build"),
"re-host changed the #build CreateAction"
);
assert_eq!(
find(&after, "#tilezz")["softwareVersion"],
find(&before, "#tilezz")["softwareVersion"],
"re-host changed the tool softwareVersion"
);
assert_eq!(
file_sha_pairs(&after),
before_files,
"re-host changed a File entity's sha256"
);
assert_eq!(
std::fs::read(&block_path).unwrap(),
block_bytes_before,
"re-host changed block file bytes on disk"
);
// Step 2: re-host back to the relative (no base URL, no DOI) form.
rehost_ro_crate(&dir, None, None).expect("rehost to relative");
let reverted: Value = serde_json::from_str(
&std::fs::read_to_string(dir.join("ro-crate-metadata.json")).unwrap(),
)
.unwrap();
let reverted_root = reverted["@graph"]
.as_array()
.unwrap()
.iter()
.find(|e| e["@id"] == "./")
.unwrap()
.clone();
assert_eq!(
reverted_root["identifier"]["propertyID"],
"tilezz-dataset-id"
);
assert_eq!(
reverted_root["identifier"]["value"], before_root["identifier"]["value"],
"reverted identifier slug must match the original, not the DOI"
);
assert_eq!(
reverted_root["distribution"]["contentUrl"],
"block_index.json"
);
assert!(
reverted_root.get("url").is_none(),
"reverting to relative form must remove url"
);
assert!(
reverted_root.get("sameAs").is_none(),
"reverting to relative form must remove sameAs"
);
assert_eq!(
find(&reverted, "#build")["endTime"],
find(&before, "#build")["endTime"],
"round-trip changed the CreateAction endTime"
);
// Best-effort cleanup of the temporary asset directory.
let _ = std::fs::remove_dir_all(&dir);
}
}