spacetimedb_schema/
identifier.rs

1use crate::error::IdentifierError;
2use hashbrown::Equivalent;
3use spacetimedb_data_structures::map::HashSet;
4use spacetimedb_sats::{de, ser};
5use std::fmt::{self, Debug, Display};
6use std::ops::Deref;
7use unicode_ident::{is_xid_continue, is_xid_start};
8use unicode_normalization::UnicodeNormalization;
9
10lazy_static::lazy_static! {
11    /// TODO(1.0): Pull in the rest of the reserved identifiers from the Identifier Proposal once that's merged.
12    static ref RESERVED_IDENTIFIERS: HashSet<&'static str> = include_str!("reserved_identifiers.txt").lines().collect();
13}
14
/// A valid SpacetimeDB Identifier.
///
/// Identifiers must be normalized according to [Unicode Standard Annex 15](https://www.unicode.org/reports/tr15/), normalization form C
/// (Canonical Decomposition followed by Canonical Composition).
/// Following Rust, we use the identifier rules defined by [Unicode Standard Annex 31](https://www.unicode.org/reports/tr31/tr31-37.html) to validate identifiers.
/// We allow underscores as well as any XID_Start character to start an identifier.
/// Every character after the first must be an XID_Continue character.
///
/// In addition, we forbid the use of any identifier reserved by [PostgreSQL](https://www.postgresql.org/docs/current/sql-keywords-appendix.html).
/// Any string that is converted into a reserved word by the Rust function
/// [`String::to_uppercase`](https://doc.rust-lang.org/std/string/struct.String.html#method.to_uppercase) will be rejected.
///
/// The list of reserved words can be found in the file `SpacetimeDB/crates/sats/db/reserved_identifiers.txt`.
/// NOTE(review): this module embeds the list with `include_str!("reserved_identifiers.txt")`,
/// i.e. from a file located next to this source file; confirm the path above is still current.
///
/// NOTE(review): `de::Deserialize` is derived; whether deserialized values go through the same
/// validation as [`Identifier::new`] is not visible from this file and should be confirmed.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, de::Deserialize, ser::Serialize)]
#[sats(crate = spacetimedb_sats)]
pub struct Identifier {
    /// The identifier text; `Identifier::new` stores it only after all validation checks pass.
    id: Box<str>,
}
32
33impl Identifier {
34    /// Validates that the input string is a valid identifier.
35    ///
36    /// Currently, this rejects non-canonicalized identifiers.
37    /// Eventually, it will be changed to canonicalize the input string.
38    pub fn new(name: Box<str>) -> Result<Self, IdentifierError> {
39        if name.is_empty() {
40            return Err(IdentifierError::Empty {});
41        }
42
43        // Convert to Unicode Normalization Form C (canonical decomposition followed by composition).
44        if name.nfc().zip(name.chars()).any(|(a, b)| a != b) {
45            return Err(IdentifierError::NotCanonicalized { name });
46        }
47
48        let mut chars = name.chars();
49
50        let start = chars.next().ok_or(IdentifierError::Empty {})?;
51        if !is_xid_start(start) && start != '_' {
52            return Err(IdentifierError::InvalidStart {
53                name,
54                invalid_start: start,
55            });
56        }
57
58        for char_ in chars {
59            if !is_xid_continue(char_) {
60                return Err(IdentifierError::InvalidContinue {
61                    name,
62                    invalid_continue: char_,
63                });
64            }
65        }
66
67        if Identifier::is_reserved(&name) {
68            return Err(IdentifierError::Reserved { name });
69        }
70
71        Ok(Identifier { id: name })
72    }
73
74    /// Check if a string is a reserved identifier.
75    pub fn is_reserved(name: &str) -> bool {
76        RESERVED_IDENTIFIERS.contains(&*name.to_uppercase())
77    }
78}
79
80impl Debug for Identifier {
81    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
82        Debug::fmt(&self.id, f)
83    }
84}
85
86impl Display for Identifier {
87    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
88        Display::fmt(&self.id, f)
89    }
90}
91
92impl Deref for Identifier {
93    type Target = str;
94
95    fn deref(&self) -> &str {
96        &self.id
97    }
98}
99
100impl From<Identifier> for Box<str> {
101    fn from(value: Identifier) -> Self {
102        value.id
103    }
104}
105
106impl Equivalent<Identifier> for str {
107    fn equivalent(&self, other: &Identifier) -> bool {
108        self == &other.id[..]
109    }
110}
111
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    /// Inputs that `Identifier::new` must accept.
    const ACCEPTED: &[&str] = &[
        "friends",
        "Oysters",
        "_hello",
        "bananas_there_",
        "Москва",
        "東京",
        "bees123",
    ];

    /// Inputs that `Identifier::new` must reject.
    const REJECTED: &[&str] = &[
        "",
        "123bees",
        "\u{200B}hello", // zero-width space
        " hello",
        "hello ",
        "🍌", // ;-; the unicode committee is no fun
    ];

    #[test]
    fn test_a_bunch_of_identifiers() {
        for &name in ACCEPTED {
            assert!(
                Identifier::new(name.into()).is_ok(),
                "{:?} should be accepted as an identifier",
                name
            );
        }

        for &name in REJECTED {
            assert!(
                Identifier::new(name.into()).is_err(),
                "{:?} should be rejected as an identifier",
                name
            );
        }
    }

    #[test]
    fn test_canonicalization() {
        // `A` followed by a combining ring above: decomposed, hence not in NFC.
        assert!(
            Identifier::new("_\u{0041}\u{030A}".into()).is_err(),
            "decomposed input should be rejected"
        );
        // canonicalized version of the above.
        assert!(
            Identifier::new("_\u{00C5}".into()).is_ok(),
            "precomposed input should be accepted"
        );
    }

    proptest! {
        #[test]
        fn test_standard_ascii_identifiers(s in "[a-zA-Z_][a-zA-Z0-9_]*") {
            // Ha! Proptest will reliably find these.
            prop_assume!(!Identifier::is_reserved(&s));

            prop_assert!(Identifier::new(s.into()).is_ok());
        }
    }
}