dbmd_core/ulid.rs
1//! Lowercase ULID minting and validation (SPEC § The `id` field, format v0.4).
2//!
3//! A [ULID](https://github.com/ulid/spec) is a 128-bit identifier — a 48-bit
4//! millisecond timestamp followed by 80 random bits — rendered as 26
5//! characters of Crockford base32. db.md's canonical form is **lowercase**
6//! (YAML-clean, shell-friendly, reads like the rest of the frontmatter).
7//! `dbmd write` mints one for every new content file that carries no `id`;
8//! the leading timestamp makes freshly minted ids time-sortable, and the 80
9//! random bits make offline minting coordination-free.
10//!
11//! **Std-only by design.** The toolkit's dependency discipline (zero AI deps,
12//! minimal tree) rules out pulling a `ulid`/`rand` stack for one mint call.
13//! Randomness comes from [`RandomState`] — std's hasher keys are seeded from
14//! OS entropy — mixed with the wall clock, the PID, and a process-global
15//! counter. That is not cryptographic randomness and does not need to be:
16//! the id's contract is store-scoped uniqueness (`DUP_ID` is the backstop),
17//! not unguessability.
18
19use std::collections::hash_map::RandomState;
20use std::hash::BuildHasher;
21use std::sync::atomic::{AtomicU64, Ordering};
22use std::sync::OnceLock;
23use std::time::{SystemTime, UNIX_EPOCH};
24
25/// Crockford base32, lowercase: `0-9` then `a-z` minus `i`, `l`, `o`, `u`.
26const CROCKFORD_LOWER: &[u8; 32] = b"0123456789abcdefghjkmnpqrstvwxyz";
27
28/// Per-process mint counter: guarantees two mints in the same nanosecond
29/// still hash distinct inputs.
30static COUNTER: AtomicU64 = AtomicU64::new(0);
31
32/// Two OS-entropy-seeded hash keys, fixed for the process. Hashing the
33/// (time, pid, counter) tuple under two independent keys yields 128 bits of
34/// well-distributed output per mint; the mint takes 80 of them.
35fn hash_states() -> &'static (RandomState, RandomState) {
36 static STATES: OnceLock<(RandomState, RandomState)> = OnceLock::new();
37 STATES.get_or_init(|| (RandomState::new(), RandomState::new()))
38}
39
40/// 80 bits of per-mint randomness (in the low bits of the returned value).
41fn entropy80() -> u128 {
42 let nanos = SystemTime::now()
43 .duration_since(UNIX_EPOCH)
44 .map(|d| d.as_nanos())
45 .unwrap_or_default();
46 let count = COUNTER.fetch_add(1, Ordering::Relaxed);
47 let pid = std::process::id();
48
49 let (a, b) = hash_states();
50 let hi = a.hash_one((nanos, count, pid));
51 let lo = b.hash_one((count, pid, nanos));
52
53 (((hi as u128) << 64) | (lo as u128)) & ((1u128 << 80) - 1)
54}
55
56/// Mint a fresh lowercase ULID: 48-bit Unix-millisecond timestamp + 80
57/// random bits, encoded as 26 chars of lowercase Crockford base32.
58pub fn mint() -> String {
59 let ms = SystemTime::now()
60 .duration_since(UNIX_EPOCH)
61 .map(|d| d.as_millis() as u64)
62 .unwrap_or_default()
63 & 0xFFFF_FFFF_FFFF; // 48 bits
64 encode(((ms as u128) << 80) | entropy80())
65}
66
67/// Encode a 128-bit value as 26 lowercase Crockford base32 chars. The first
68/// char carries only the top 3 bits (the 128-bit value viewed as 130 bits
69/// with two leading zeros), so it is always `0`–`7`.
70fn encode(value: u128) -> String {
71 let mut out = String::with_capacity(26);
72 for i in 0..26 {
73 let shift = 125 - 5 * i;
74 out.push(CROCKFORD_LOWER[((value >> shift) & 0x1F) as usize] as char);
75 }
76 out
77}
78
79/// True when `s` is a well-formed **lowercase** ULID: exactly 26 chars, all
80/// lowercase Crockford base32, first char `0`–`7` (so the value fits 128
81/// bits). Uppercase or mixed-case forms are not the db.md canonical form and
82/// return false — as does any other opaque id, which stays *legal* in a
83/// store (SPEC: the ULID form is recommended, never a validation gate).
84pub fn is_ulid(s: &str) -> bool {
85 let bytes = s.as_bytes();
86 bytes.len() == 26
87 && matches!(bytes[0], b'0'..=b'7')
88 && bytes.iter().all(|b| CROCKFORD_LOWER.contains(b))
89}
90
91#[cfg(test)]
92mod tests {
93 use super::*;
94 use std::collections::HashSet;
95
96 #[test]
97 fn mint_is_wellformed_lowercase_ulid() {
98 for _ in 0..100 {
99 let id = mint();
100 assert!(is_ulid(&id), "minted id {id:?} is not a lowercase ULID");
101 assert_eq!(id.len(), 26);
102 assert_eq!(id, id.to_lowercase());
103 }
104 }
105
106 #[test]
107 fn mint_is_unique_across_a_burst() {
108 // 10k mints in a tight loop (same millisecond for most): all distinct.
109 let ids: HashSet<String> = (0..10_000).map(|_| mint()).collect();
110 assert_eq!(ids.len(), 10_000, "duplicate ULIDs in a same-ms burst");
111 }
112
113 #[test]
114 fn mint_is_time_sortable_across_ms_boundaries() {
115 // Two mints separated by >1ms sort in mint order (the 48-bit ms
116 // timestamp is the most significant component).
117 let a = mint();
118 std::thread::sleep(std::time::Duration::from_millis(3));
119 let b = mint();
120 assert!(a < b, "ULIDs did not time-sort: {a} !< {b}");
121 }
122
123 #[test]
124 fn timestamp_prefix_decodes_to_now() {
125 // Decode the 10-char timestamp prefix back to ms and compare to the
126 // wall clock — proves the encoding puts the timestamp in the right
127 // bits (not just that output "looks like" a ULID).
128 let before_ms = SystemTime::now()
129 .duration_since(UNIX_EPOCH)
130 .unwrap()
131 .as_millis() as u64;
132 let id = mint();
133 let after_ms = SystemTime::now()
134 .duration_since(UNIX_EPOCH)
135 .unwrap()
136 .as_millis() as u64;
137 let mut ts: u64 = 0;
138 for b in id.as_bytes().iter().take(10) {
139 let digit = CROCKFORD_LOWER.iter().position(|c| c == b).unwrap() as u64;
140 ts = (ts << 5) | digit;
141 }
142 assert!(
143 (before_ms..=after_ms).contains(&ts),
144 "decoded ts {ts} outside [{before_ms}, {after_ms}]"
145 );
146 }
147
148 #[test]
149 fn is_ulid_accepts_only_canonical_lowercase() {
150 assert!(is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7d"));
151 assert!(is_ulid("00000000000000000000000000"));
152 assert!(is_ulid("7zzzzzzzzzzzzzzzzzzzzzzzzz")); // max 128-bit value
153 // Wrong length.
154 assert!(!is_ulid(""));
155 assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7"));
156 assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7dd"));
157 // Uppercase (the upstream ULID spelling) is not db.md-canonical.
158 assert!(!is_ulid("01J5QC3V9K4YM8RWBN2TQE6F7D"));
159 // Excluded Crockford letters and non-base32 bytes.
160 assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6fil"));
161 assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7-"));
162 // First char beyond `7` overflows 128 bits.
163 assert!(!is_ulid("8zzzzzzzzzzzzzzzzzzzzzzzzz"));
164 // A plain slug is not a ULID (it stays LEGAL as an id; just not this form).
165 assert!(!is_ulid("sarah-chen"));
166 }
167}