Skip to main content

dbmd_core/
ulid.rs

1//! Lowercase ULID minting and validation (SPEC § The `id` field, format v0.4).
2//!
3//! A [ULID](https://github.com/ulid/spec) is a 128-bit identifier — a 48-bit
4//! millisecond timestamp followed by 80 random bits — rendered as 26
5//! characters of Crockford base32. db.md's canonical form is **lowercase**
6//! (YAML-clean, shell-friendly, reads like the rest of the frontmatter).
7//! `dbmd write` mints one for every new content file that carries no `id`;
8//! the leading timestamp makes freshly minted ids time-sortable, and the 80
9//! random bits make offline minting coordination-free.
10//!
11//! **Std-only by design.** The toolkit's dependency discipline (zero AI deps,
12//! minimal tree) rules out pulling a `ulid`/`rand` stack for one mint call.
13//! Randomness comes from [`RandomState`] — std's hasher keys are seeded from
14//! OS entropy — mixed with the wall clock, the PID, and a process-global
15//! counter. That is not cryptographic randomness and does not need to be:
16//! the id's contract is store-scoped uniqueness (`DUP_ID` is the backstop),
17//! not unguessability.
18
19use std::collections::hash_map::RandomState;
20use std::hash::BuildHasher;
21use std::sync::atomic::{AtomicU64, Ordering};
22use std::sync::OnceLock;
23use std::time::{SystemTime, UNIX_EPOCH};
24
25/// Crockford base32, lowercase: `0-9` then `a-z` minus `i`, `l`, `o`, `u`.
26const CROCKFORD_LOWER: &[u8; 32] = b"0123456789abcdefghjkmnpqrstvwxyz";
27
28/// Per-process mint counter: guarantees two mints in the same nanosecond
29/// still hash distinct inputs.
30static COUNTER: AtomicU64 = AtomicU64::new(0);
31
32/// Two OS-entropy-seeded hash keys, fixed for the process. Hashing the
33/// (time, pid, counter) tuple under two independent keys yields 128 bits of
34/// well-distributed output per mint; the mint takes 80 of them.
35fn hash_states() -> &'static (RandomState, RandomState) {
36    static STATES: OnceLock<(RandomState, RandomState)> = OnceLock::new();
37    STATES.get_or_init(|| (RandomState::new(), RandomState::new()))
38}
39
40/// 80 bits of per-mint randomness (in the low bits of the returned value).
41fn entropy80() -> u128 {
42    let nanos = SystemTime::now()
43        .duration_since(UNIX_EPOCH)
44        .map(|d| d.as_nanos())
45        .unwrap_or_default();
46    let count = COUNTER.fetch_add(1, Ordering::Relaxed);
47    let pid = std::process::id();
48
49    let (a, b) = hash_states();
50    let hi = a.hash_one((nanos, count, pid));
51    let lo = b.hash_one((count, pid, nanos));
52
53    (((hi as u128) << 64) | (lo as u128)) & ((1u128 << 80) - 1)
54}
55
56/// Mint a fresh lowercase ULID: 48-bit Unix-millisecond timestamp + 80
57/// random bits, encoded as 26 chars of lowercase Crockford base32.
58pub fn mint() -> String {
59    let ms = SystemTime::now()
60        .duration_since(UNIX_EPOCH)
61        .map(|d| d.as_millis() as u64)
62        .unwrap_or_default()
63        & 0xFFFF_FFFF_FFFF; // 48 bits
64    encode(((ms as u128) << 80) | entropy80())
65}
66
67/// Encode a 128-bit value as 26 lowercase Crockford base32 chars. The first
68/// char carries only the top 3 bits (the 128-bit value viewed as 130 bits
69/// with two leading zeros), so it is always `0`–`7`.
70fn encode(value: u128) -> String {
71    let mut out = String::with_capacity(26);
72    for i in 0..26 {
73        let shift = 125 - 5 * i;
74        out.push(CROCKFORD_LOWER[((value >> shift) & 0x1F) as usize] as char);
75    }
76    out
77}
78
79/// True when `s` is a well-formed **lowercase** ULID: exactly 26 chars, all
80/// lowercase Crockford base32, first char `0`–`7` (so the value fits 128
81/// bits). Uppercase or mixed-case forms are not the db.md canonical form and
82/// return false — as does any other opaque id, which stays *legal* in a
83/// store (SPEC: the ULID form is recommended, never a validation gate).
84pub fn is_ulid(s: &str) -> bool {
85    let bytes = s.as_bytes();
86    bytes.len() == 26
87        && matches!(bytes[0], b'0'..=b'7')
88        && bytes.iter().all(|b| CROCKFORD_LOWER.contains(b))
89}
90
91#[cfg(test)]
92mod tests {
93    use super::*;
94    use std::collections::HashSet;
95
96    #[test]
97    fn mint_is_wellformed_lowercase_ulid() {
98        for _ in 0..100 {
99            let id = mint();
100            assert!(is_ulid(&id), "minted id {id:?} is not a lowercase ULID");
101            assert_eq!(id.len(), 26);
102            assert_eq!(id, id.to_lowercase());
103        }
104    }
105
106    #[test]
107    fn mint_is_unique_across_a_burst() {
108        // 10k mints in a tight loop (same millisecond for most): all distinct.
109        let ids: HashSet<String> = (0..10_000).map(|_| mint()).collect();
110        assert_eq!(ids.len(), 10_000, "duplicate ULIDs in a same-ms burst");
111    }
112
113    #[test]
114    fn mint_is_time_sortable_across_ms_boundaries() {
115        // Two mints separated by >1ms sort in mint order (the 48-bit ms
116        // timestamp is the most significant component).
117        let a = mint();
118        std::thread::sleep(std::time::Duration::from_millis(3));
119        let b = mint();
120        assert!(a < b, "ULIDs did not time-sort: {a} !< {b}");
121    }
122
123    #[test]
124    fn timestamp_prefix_decodes_to_now() {
125        // Decode the 10-char timestamp prefix back to ms and compare to the
126        // wall clock — proves the encoding puts the timestamp in the right
127        // bits (not just that output "looks like" a ULID).
128        let before_ms = SystemTime::now()
129            .duration_since(UNIX_EPOCH)
130            .unwrap()
131            .as_millis() as u64;
132        let id = mint();
133        let after_ms = SystemTime::now()
134            .duration_since(UNIX_EPOCH)
135            .unwrap()
136            .as_millis() as u64;
137        let mut ts: u64 = 0;
138        for b in id.as_bytes().iter().take(10) {
139            let digit = CROCKFORD_LOWER.iter().position(|c| c == b).unwrap() as u64;
140            ts = (ts << 5) | digit;
141        }
142        assert!(
143            (before_ms..=after_ms).contains(&ts),
144            "decoded ts {ts} outside [{before_ms}, {after_ms}]"
145        );
146    }
147
148    #[test]
149    fn is_ulid_accepts_only_canonical_lowercase() {
150        assert!(is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7d"));
151        assert!(is_ulid("00000000000000000000000000"));
152        assert!(is_ulid("7zzzzzzzzzzzzzzzzzzzzzzzzz")); // max 128-bit value
153                                                        // Wrong length.
154        assert!(!is_ulid(""));
155        assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7"));
156        assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7dd"));
157        // Uppercase (the upstream ULID spelling) is not db.md-canonical.
158        assert!(!is_ulid("01J5QC3V9K4YM8RWBN2TQE6F7D"));
159        // Excluded Crockford letters and non-base32 bytes.
160        assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6fil"));
161        assert!(!is_ulid("01j5qc3v9k4ym8rwbn2tqe6f7-"));
162        // First char beyond `7` overflows 128 bits.
163        assert!(!is_ulid("8zzzzzzzzzzzzzzzzzzzzzzzzz"));
164        // A plain slug is not a ULID (it stays LEGAL as an id; just not this form).
165        assert!(!is_ulid("sarah-chen"));
166    }
167}