// nwnrs_nwscript/lexer.rs

use std::{error::Error, fmt};

use crate::{
    CompilerErrorCode, Keyword, MAX_TOKEN_LENGTH, Token, TokenKind, nwscript_string_hash_bytes,
    source::{SourceFile, SourceId, Span},
};
7
8/// A lexical error returned while scanning `NWScript` source text.
9#[derive(#[automatically_derived]
impl ::core::fmt::Debug for LexerError {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field3_finish(f, "LexerError",
            "code", &self.code, "span", &self.span, "message", &&self.message)
    }
}Debug, #[automatically_derived]
impl ::core::clone::Clone for LexerError {
    #[inline]
    fn clone(&self) -> LexerError {
        LexerError {
            code: ::core::clone::Clone::clone(&self.code),
            span: ::core::clone::Clone::clone(&self.span),
            message: ::core::clone::Clone::clone(&self.message),
        }
    }
}Clone, #[automatically_derived]
impl ::core::cmp::PartialEq for LexerError {
    #[inline]
    fn eq(&self, other: &LexerError) -> bool {
        self.code == other.code && self.span == other.span &&
            self.message == other.message
    }
}PartialEq, #[automatically_derived]
impl ::core::cmp::Eq for LexerError {
    #[inline]
    #[doc(hidden)]
    #[coverage(off)]
    fn assert_fields_are_eq(&self) {
        let _: ::core::cmp::AssertParamIsEq<CompilerErrorCode>;
        let _: ::core::cmp::AssertParamIsEq<Span>;
        let _: ::core::cmp::AssertParamIsEq<String>;
    }
}Eq)]
10pub struct LexerError {
11    /// Stable upstream-aligned compiler error code.
12    pub code:    CompilerErrorCode,
13    /// Source span where lexing failed.
14    pub span:    Span,
15    /// Human-readable error message.
16    pub message: String,
17}
18
19impl LexerError {
20    fn new(
21        code: CompilerErrorCode,
22        source_id: SourceId,
23        start: usize,
24        end: usize,
25        message: impl Into<String>,
26    ) -> Self {
27        Self {
28            code,
29            span: Span::new(source_id, start, end),
30            message: message.into(),
31        }
32    }
33}
34
35impl fmt::Display for LexerError {
36    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37        f.write_fmt(format_args!("{0} ({1})", self.message, self.code.code()))write!(f, "{} ({})", self.message, self.code.code())
38    }
39}
40
41impl Error for LexerError {}
42
43/// Lexes `NWScript` source using the upstream compiler's token vocabulary.
44#[derive(#[automatically_derived]
impl<'a> ::core::fmt::Debug for Lexer<'a> {
    #[inline]
    fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
        ::core::fmt::Formatter::debug_struct_field3_finish(f, "Lexer",
            "source_id", &self.source_id, "input", &self.input, "position",
            &&self.position)
    }
}Debug, #[automatically_derived]
impl<'a> ::core::clone::Clone for Lexer<'a> {
    #[inline]
    fn clone(&self) -> Lexer<'a> {
        Lexer {
            source_id: ::core::clone::Clone::clone(&self.source_id),
            input: ::core::clone::Clone::clone(&self.input),
            position: ::core::clone::Clone::clone(&self.position),
        }
    }
}Clone)]
45pub struct Lexer<'a> {
46    source_id: SourceId,
47    input:     &'a [u8],
48    position:  usize,
49}
50
51impl<'a> Lexer<'a> {
52    /// Creates a lexer for one source file's contents.
53    #[must_use]
54    pub fn new(source_id: SourceId, input: &'a [u8]) -> Self {
55        Self {
56            source_id,
57            input,
58            position: 0,
59        }
60    }
61
62    /// Lexes the entire input into a token vector ending with `Eof`.
63    ///
64    /// # Errors
65    ///
66    /// Returns [`LexerError`] if an unrecognized token is encountered.
67    pub fn lex_all(&mut self) -> Result<Vec<Token>, LexerError> {
68        let mut tokens = Vec::new();
69        loop {
70            self.skip_trivia();
71            let token = self.next_token()?;
72            let is_eof = token.kind == TokenKind::Eof;
73            tokens.push(token);
74            if is_eof {
75                break;
76            }
77        }
78        Ok(tokens)
79    }
80
81    fn next_token(&mut self) -> Result<Token, LexerError> {
82        if self.position >= self.input.len() {
83            return Ok(Token::new(
84                TokenKind::Eof,
85                Span::new(self.source_id, self.position, self.position),
86                "",
87            ));
88        }
89
90        if self.starts_with_raw_string() {
91            return self.lex_raw_string();
92        }
93        if self.starts_with_hashed_string() {
94            return self.lex_hashed_string();
95        }
96
97        let start = self.position;
98        let current = self.current_byte();
99        match current {
100            Some(b'0'..=b'9') => self.lex_number(),
101            Some(b'.') if self.peek_byte(1).is_some_and(|next| next.is_ascii_digit()) => {
102                self.lex_number()
103            }
104            Some(b'a'..=b'z' | b'A'..=b'Z' | b'_') => self.lex_identifier(),
105            Some(b'#') => self.lex_hash_identifier_or_error(),
106            Some(b'"') => self.lex_string(),
107            Some(_) => self.lex_punctuation(start),
108            None => Ok(Token::new(
109                TokenKind::Eof,
110                Span::new(self.source_id, self.position, self.position),
111                "",
112            )),
113        }
114    }
115
116    fn skip_trivia(&mut self) {
117        loop {
118            if self.position >= self.input.len() {
119                return;
120            }
121
122            match self.current_byte() {
123                Some(b' ' | b'\t' | b'\n' | b'\r') => {
124                    self.position += 1;
125                }
126                Some(b'/') if self.peek_byte(1) == Some(b'/') => {
127                    self.position += 2;
128                    while let Some(byte) = self.current_byte() {
129                        if byte == b'\n' {
130                            break;
131                        }
132                        self.position += 1;
133                    }
134                }
135                Some(b'/') if self.peek_byte(1) == Some(b'*') => {
136                    self.position += 2;
137                    while self.position < self.input.len() {
138                        if self.current_byte() == Some(b'*') && self.peek_byte(1) == Some(b'/') {
139                            self.position += 2;
140                            break;
141                        }
142                        self.position += 1;
143                    }
144                }
145                _ => return,
146            }
147        }
148    }
149
150    fn lex_number(&mut self) -> Result<Token, LexerError> {
151        let start = self.position;
152        let mut text = String::new();
153        let mut kind = TokenKind::Integer;
154
155        if self.current_byte() == Some(b'.') {
156            kind = TokenKind::Float;
157            text.push('0');
158            text.push('.');
159            self.position += 1;
160            self.consume_ascii_digits(&mut text);
161            self.consume_float_suffix_if_present(&mut kind);
162            return self.finish_token(kind, start, self.position, text);
163        }
164
165        self.consume_ascii_digits(&mut text);
166
167        if text == "0" {
168            match self.current_byte() {
169                Some(b'x' | b'X') => {
170                    kind = TokenKind::HexInteger;
171                    if let Some(prefix) = self.bump_byte() {
172                        text.push(char::from(prefix));
173                    }
174                    while let Some(byte) = self.current_byte() {
175                        if byte.is_ascii_hexdigit() {
176                            let lowered = if (b'A'..=b'F').contains(&byte) {
177                                byte + 32
178                            } else {
179                                byte
180                            };
181                            text.push(char::from(lowered));
182                            self.position += 1;
183                        } else {
184                            break;
185                        }
186                    }
187                    return self.finish_token(kind, start, self.position, text);
188                }
189                Some(b'b' | b'B') => {
190                    kind = TokenKind::BinaryInteger;
191                    if let Some(prefix) = self.bump_byte() {
192                        text.push(char::from(prefix));
193                    }
194                    while let Some(byte) = self.current_byte() {
195                        if #[allow(non_exhaustive_omitted_patterns)] match byte {
    b'0' | b'1' => true,
    _ => false,
}matches!(byte, b'0' | b'1') {
196                            text.push(char::from(byte));
197                            self.position += 1;
198                        } else {
199                            break;
200                        }
201                    }
202                    return self.finish_token(kind, start, self.position, text);
203                }
204                Some(b'o' | b'O') => {
205                    kind = TokenKind::OctalInteger;
206                    if let Some(prefix) = self.bump_byte() {
207                        text.push(char::from(prefix));
208                    }
209                    while let Some(byte) = self.current_byte() {
210                        if (b'0'..=b'7').contains(&byte) {
211                            text.push(char::from(byte));
212                            self.position += 1;
213                        } else {
214                            break;
215                        }
216                    }
217                    return self.finish_token(kind, start, self.position, text);
218                }
219                _ => {}
220            }
221        }
222
223        if self.current_byte() == Some(b'.') {
224            kind = TokenKind::Float;
225            text.push('.');
226            self.position += 1;
227            self.consume_ascii_digits(&mut text);
228        }
229
230        self.consume_float_suffix_if_present(&mut kind);
231        self.finish_token(kind, start, self.position, text)
232    }
233
234    fn lex_identifier(&mut self) -> Result<Token, LexerError> {
235        let start = self.position;
236        let mut text = String::new();
237        while let Some(byte) = self.current_byte() {
238            if is_identifier_continue(byte) {
239                text.push(char::from(byte));
240                self.position += 1;
241            } else {
242                break;
243            }
244        }
245        self.finish_identifier_like_token(start, self.position, text)
246    }
247
248    fn lex_hash_identifier_or_error(&mut self) -> Result<Token, LexerError> {
249        let start = self.position;
250        self.position += 1;
251
252        if !self.current_byte().is_some_and(is_identifier_start) {
253            return Err(LexerError::new(
254                CompilerErrorCode::EllipsisInIdentifier,
255                self.source_id,
256                start,
257                self.position,
258                "invalid preprocessor-like identifier",
259            ));
260        }
261
262        let mut text = String::from("#");
263        while let Some(byte) = self.current_byte() {
264            if is_identifier_continue(byte) {
265                text.push(char::from(byte));
266                self.position += 1;
267            } else {
268                break;
269            }
270        }
271        self.finish_identifier_like_token(start, self.position, text)
272    }
273
274    fn finish_identifier_like_token(
275        &self,
276        start: usize,
277        end: usize,
278        text: String,
279    ) -> Result<Token, LexerError> {
280        if let Some(keyword) = Keyword::from_lexeme(&text) {
281            return self.finish_token(TokenKind::Keyword(keyword), start, end, text);
282        }
283        if text.starts_with('#') {
284            return Err(LexerError::new(
285                CompilerErrorCode::EllipsisInIdentifier,
286                self.source_id,
287                start,
288                end,
289                ::alloc::__export::must_use({
        ::alloc::fmt::format(format_args!("unknown preprocessor-like identifier {0:?}",
                text))
    })format!("unknown preprocessor-like identifier {text:?}"),
290            ));
291        }
292        self.finish_token(TokenKind::Identifier, start, end, text)
293    }
294
295    fn lex_string(&mut self) -> Result<Token, LexerError> {
296        let start = self.position;
297        self.position += 1;
298        let mut text = String::new();
299
300        while let Some(byte) = self.current_byte() {
301            match byte {
302                b'\n' => {
303                    return Err(LexerError::new(
304                        CompilerErrorCode::UnterminatedStringConstant,
305                        self.source_id,
306                        start,
307                        self.position,
308                        "unterminated string constant",
309                    ));
310                }
311                b'"' => {
312                    self.position += 1;
313                    return self.finish_token(TokenKind::String, start, self.position, text);
314                }
315                b'\\' => {
316                    let next = self.peek_byte(1);
317                    match next {
318                        Some(b'n') => {
319                            text.push('\n');
320                            self.position += 2;
321                        }
322                        Some(b'\\') => {
323                            text.push('\\');
324                            self.position += 2;
325                        }
326                        Some(b'"') => {
327                            text.push('"');
328                            self.position += 2;
329                        }
330                        Some(b'x') => {
331                            let first = self.peek_byte(2);
332                            let second = self.peek_byte(3);
333                            if first.is_none() || second.is_none() {
334                                return Err(LexerError::new(
335                                    CompilerErrorCode::UnterminatedStringConstant,
336                                    self.source_id,
337                                    start,
338                                    self.input.len(),
339                                    "unterminated hexadecimal string escape",
340                                ));
341                            }
342                            let value = parse_upstream_hex_escape(
343                                first.unwrap_or_default(),
344                                second.unwrap_or_default(),
345                            );
346                            text.push(char::from(value));
347                            self.position += 4;
348                        }
349                        Some(_) => {
350                            self.position += 1;
351                        }
352                        None => {
353                            return Err(LexerError::new(
354                                CompilerErrorCode::UnterminatedStringConstant,
355                                self.source_id,
356                                start,
357                                self.input.len(),
358                                "unterminated string constant",
359                            ));
360                        }
361                    }
362                }
363                _ => {
364                    text.push(byte_to_text_char(byte));
365                    self.position += 1;
366                }
367            }
368        }
369
370        Err(LexerError::new(
371            CompilerErrorCode::UnterminatedStringConstant,
372            self.source_id,
373            start,
374            self.input.len(),
375            "unterminated string constant",
376        ))
377    }
378
379    fn lex_raw_string(&mut self) -> Result<Token, LexerError> {
380        let start = self.position;
381        self.position += 2;
382        let mut text = String::new();
383
384        while let Some(byte) = self.current_byte() {
385            if byte == b'"' {
386                if self.peek_byte(1) == Some(b'"') {
387                    text.push('"');
388                    self.position += 2;
389                    continue;
390                }
391
392                self.position += 1;
393                return self.finish_token(TokenKind::String, start, self.position, text);
394            }
395
396            text.push(byte_to_text_char(byte));
397            self.position += 1;
398        }
399
400        Err(LexerError::new(
401            CompilerErrorCode::UnterminatedStringConstant,
402            self.source_id,
403            start,
404            self.input.len(),
405            "unterminated raw string constant",
406        ))
407    }
408
409    fn lex_hashed_string(&mut self) -> Result<Token, LexerError> {
410        let start = self.position;
411        self.position += 2;
412        let mut cooked_bytes = Vec::new();
413
414        while let Some(byte) = self.current_byte() {
415            match byte {
416                b'\n' => {
417                    return Err(LexerError::new(
418                        CompilerErrorCode::UnterminatedStringConstant,
419                        self.source_id,
420                        start,
421                        self.position,
422                        "unterminated hashed string constant",
423                    ));
424                }
425                b'"' => {
426                    self.position += 1;
427                    let lowered = ::alloc::__export::must_use({
        ::alloc::fmt::format(format_args!("0x{0:x}",
                nwscript_string_hash_bytes(&cooked_bytes).cast_unsigned()))
    })format!(
428                        "0x{:x}",
429                        nwscript_string_hash_bytes(&cooked_bytes).cast_unsigned()
430                    );
431                    return self.finish_token(TokenKind::HexInteger, start, self.position, lowered);
432                }
433                b'\\' => {
434                    let next = self.peek_byte(1);
435                    match next {
436                        Some(b'n') => {
437                            cooked_bytes.push(b'\n');
438                            self.position += 2;
439                        }
440                        Some(b'\\') => {
441                            cooked_bytes.push(b'\\');
442                            self.position += 2;
443                        }
444                        Some(b'"') => {
445                            cooked_bytes.push(b'"');
446                            self.position += 2;
447                        }
448                        Some(b'x') => {
449                            let first = self.peek_byte(2);
450                            let second = self.peek_byte(3);
451                            if first.is_none() || second.is_none() {
452                                return Err(LexerError::new(
453                                    CompilerErrorCode::UnterminatedStringConstant,
454                                    self.source_id,
455                                    start,
456                                    self.input.len(),
457                                    "unterminated hexadecimal hashed-string escape",
458                                ));
459                            }
460                            let value = parse_upstream_hex_escape(
461                                first.unwrap_or_default(),
462                                second.unwrap_or_default(),
463                            );
464                            cooked_bytes.push(value);
465                            self.position += 4;
466                        }
467                        Some(_) => {
468                            self.position += 1;
469                        }
470                        None => {
471                            return Err(LexerError::new(
472                                CompilerErrorCode::UnterminatedStringConstant,
473                                self.source_id,
474                                start,
475                                self.input.len(),
476                                "unterminated hashed string constant",
477                            ));
478                        }
479                    }
480                }
481                _ => {
482                    cooked_bytes.push(byte);
483                    self.position += 1;
484                }
485            }
486        }
487
488        Err(LexerError::new(
489            CompilerErrorCode::UnterminatedStringConstant,
490            self.source_id,
491            start,
492            self.input.len(),
493            "unterminated hashed string constant",
494        ))
495    }
496
497    fn lex_punctuation(&mut self, start: usize) -> Result<Token, LexerError> {
498        if self.slice_eq(start, start + 4, b">>>=") {
499            self.position += 4;
500            return self.finish_token(
501                TokenKind::AssignUnsignedShiftRight,
502                start,
503                self.position,
504                ">>>=".to_string(),
505            );
506        }
507
508        if let Some((kind, text)) = [
509            (TokenKind::UnsignedShiftRight, ">>>"),
510            (TokenKind::AssignShiftRight, ">>="),
511            (TokenKind::AssignShiftLeft, "<<="),
512        ]
513        .into_iter()
514        .find(|(_, text)| self.slice_eq(start, start + text.len(), text.as_bytes()))
515        {
516            let width = text.len();
517            self.position += width;
518            return self.finish_token(kind, start, self.position, text.to_string());
519        }
520
521        if let Some((kind, text)) = [
522            (TokenKind::LogicalAnd, "&&"),
523            (TokenKind::LogicalOr, "||"),
524            (TokenKind::GreaterEqual, ">="),
525            (TokenKind::LessEqual, "<="),
526            (TokenKind::NotEqual, "!="),
527            (TokenKind::EqualEqual, "=="),
528            (TokenKind::ShiftLeft, "<<"),
529            (TokenKind::ShiftRight, ">>"),
530            (TokenKind::Increment, "++"),
531            (TokenKind::Decrement, "--"),
532            (TokenKind::AssignMinus, "-="),
533            (TokenKind::AssignPlus, "+="),
534            (TokenKind::AssignMultiply, "*="),
535            (TokenKind::AssignDivide, "/="),
536            (TokenKind::AssignModulus, "%="),
537            (TokenKind::AssignAnd, "&="),
538            (TokenKind::AssignXor, "^="),
539            (TokenKind::AssignOr, "|="),
540        ]
541        .into_iter()
542        .find(|(_, text)| self.slice_eq(start, start + text.len(), text.as_bytes()))
543        {
544            let width = text.len();
545            self.position += width;
546            return self.finish_token(kind, start, self.position, text.to_string());
547        }
548
549        if let Some((kind, ch)) = self.current_byte().and_then(|byte| {
550            let kind = match byte {
551                b'/' => TokenKind::Divide,
552                b'*' => TokenKind::Multiply,
553                b'&' => TokenKind::BooleanAnd,
554                b'|' => TokenKind::InclusiveOr,
555                b'-' => TokenKind::Minus,
556                b'{' => TokenKind::LeftBrace,
557                b'}' => TokenKind::RightBrace,
558                b'(' => TokenKind::LeftParen,
559                b')' => TokenKind::RightParen,
560                b'[' => TokenKind::LeftSquareBracket,
561                b']' => TokenKind::RightSquareBracket,
562                b'<' => TokenKind::LessThan,
563                b'>' => TokenKind::GreaterThan,
564                b'!' => TokenKind::BooleanNot,
565                b'=' => TokenKind::Assign,
566                b'+' => TokenKind::Plus,
567                b'%' => TokenKind::Modulus,
568                b';' => TokenKind::Semicolon,
569                b',' => TokenKind::Comma,
570                b'^' => TokenKind::ExclusiveOr,
571                b'~' => TokenKind::Tilde,
572                b'.' => TokenKind::StructurePartSpecify,
573                b'?' => TokenKind::QuestionMark,
574                b':' => TokenKind::Colon,
575                _ => return None,
576            };
577            Some((kind, char::from(byte)))
578        }) {
579            self.position += 1;
580            return self.finish_token(kind, start, self.position, ch.to_string());
581        }
582
583        Err(LexerError::new(
584            CompilerErrorCode::UnexpectedCharacter,
585            self.source_id,
586            start,
587            start.saturating_add(1),
588            ::alloc::__export::must_use({
        ::alloc::fmt::format(format_args!("unexpected character {0:?}",
                self.current_byte().map_or('\0', char::from)))
    })format!(
589                "unexpected character {:?}",
590                self.current_byte().map_or('\0', char::from)
591            ),
592        ))
593    }
594
595    fn finish_token(
596        &self,
597        kind: TokenKind,
598        start: usize,
599        end: usize,
600        text: String,
601    ) -> Result<Token, LexerError> {
602        if text.len() > MAX_TOKEN_LENGTH {
603            return Err(LexerError::new(
604                CompilerErrorCode::TokenTooLong,
605                self.source_id,
606                start,
607                end,
608                ::alloc::__export::must_use({
        ::alloc::fmt::format(format_args!("token exceeds maximum length of {0} bytes",
                MAX_TOKEN_LENGTH))
    })format!("token exceeds maximum length of {MAX_TOKEN_LENGTH} bytes"),
609            ));
610        }
611        Ok(Token::new(
612            kind,
613            Span::new(self.source_id, start, end),
614            text,
615        ))
616    }
617
618    fn starts_with_raw_string(&self) -> bool {
619        #[allow(non_exhaustive_omitted_patterns)] match (self.current_byte(),
        self.peek_byte(1)) {
    (Some(b'r' | b'R'), Some(b'"')) => true,
    _ => false,
}matches!(
620            (self.current_byte(), self.peek_byte(1)),
621            (Some(b'r' | b'R'), Some(b'"'))
622        )
623    }
624
625    fn starts_with_hashed_string(&self) -> bool {
626        #[allow(non_exhaustive_omitted_patterns)] match (self.current_byte(),
        self.peek_byte(1)) {
    (Some(b'h' | b'H'), Some(b'"')) => true,
    _ => false,
}matches!(
627            (self.current_byte(), self.peek_byte(1)),
628            (Some(b'h' | b'H'), Some(b'"'))
629        )
630    }
631
632    fn consume_ascii_digits(&mut self, output: &mut String) {
633        while let Some(byte) = self.current_byte() {
634            if byte.is_ascii_digit() {
635                output.push(char::from(byte));
636                self.position += 1;
637            } else {
638                break;
639            }
640        }
641    }
642
643    fn consume_float_suffix_if_present(&mut self, kind: &mut TokenKind) {
644        if self.current_byte() == Some(b'f') {
645            *kind = TokenKind::Float;
646            self.position += 1;
647        }
648    }
649
650    fn current_byte(&self) -> Option<u8> {
651        self.input.get(self.position).copied()
652    }
653
654    fn peek_byte(&self, ahead: usize) -> Option<u8> {
655        self.input.get(self.position.saturating_add(ahead)).copied()
656    }
657
658    fn bump_byte(&mut self) -> Option<u8> {
659        let byte = self.current_byte()?;
660        self.position += 1;
661        Some(byte)
662    }
663
664    fn slice_eq(&self, start: usize, end: usize, expected: &[u8]) -> bool {
665        self.input.get(start..end) == Some(expected)
666    }
667}
668
669/// Lexes the contents of one source file.
670///
671/// # Errors
672///
673/// Returns [`LexerError`] if an unrecognized token is encountered.
674pub fn lex_source(source: &SourceFile) -> Result<Vec<Token>, LexerError> {
675    Lexer::new(source.id, source.bytes()).lex_all()
676}
677
678/// Lexes a byte buffer associated with `source_id`.
679///
680/// # Errors
681///
682/// Returns [`LexerError`] if an unrecognized token is encountered.
683pub fn lex_bytes(source_id: SourceId, input: &[u8]) -> Result<Vec<Token>, LexerError> {
684    Lexer::new(source_id, input).lex_all()
685}
686
687/// Lexes a string slice associated with `source_id`.
688///
689/// # Errors
690///
691/// Returns [`LexerError`] if an unrecognized token is encountered.
692pub fn lex_text(source_id: SourceId, input: &str) -> Result<Vec<Token>, LexerError> {
693    lex_bytes(source_id, input.as_bytes())
694}
695
/// Reports whether `byte` may begin an identifier: an ASCII letter or `_`.
fn is_identifier_start(byte: u8) -> bool {
    byte.is_ascii_alphabetic() || byte == b'_'
}
699
/// Reports whether `byte` may continue an identifier: an ASCII letter, an
/// ASCII digit, or `_`.
fn is_identifier_continue(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || byte == b'_'
}
703
704fn parse_upstream_hex_escape(first: u8, second: u8) -> u8 {
705    let first = hex_nibble(first);
706    let second = hex_nibble(second);
707    match (first, second) {
708        (Some(high), Some(low)) => (high << 4) | low,
709        (Some(value), None) => value,
710        (None, _) => 0,
711    }
712}
713
/// Returns the value (0-15) of an ASCII hexadecimal digit in either case, or
/// `None` when `byte` is not a hexadecimal digit.
fn hex_nibble(byte: u8) -> Option<u8> {
    match byte {
        b'0'..=b'9' => Some(byte - b'0'),
        b'a'..=b'f' => Some((byte - b'a') + 10),
        b'A'..=b'F' => Some((byte - b'A') + 10),
        _ => None,
    }
}
722
/// Maps a raw source byte to the `char` with the same code point, so bytes
/// that are not valid UTF-8 can still be stored in token text.
///
/// Every `u8` value is a valid Unicode scalar value, so the conversion cannot
/// fail.
fn byte_to_text_char(byte: u8) -> char {
    char::from(byte)
}
726
#[cfg(test)]
mod tests {
    use crate::{
        Keyword, SourceFile, SourceId, TokenKind, lex_bytes, lex_source, lex_text,
        nwscript_string_hash_bytes,
    };

    /// Keywords, `#`-prefixed directives and built-in constants all lex as
    /// `Keyword` tokens.
    #[test]
    fn lexes_upstream_keyword_table_entries() {
        let source = SourceFile::new(
            SourceId::new(1),
            "keywords.nss",
            "if #include #define OBJECT_SELF JSON_TRUE __FILE__ ENGINE_STRUCTURE_0",
        );

        let tokens = lex_source(&source);
        let kinds = tokens.ok().map(|items| {
            items
                .into_iter()
                .map(|token| token.kind)
                .collect::<Vec<_>>()
        });

        assert_eq!(
            kinds,
            Some(vec![
                TokenKind::Keyword(Keyword::If),
                TokenKind::Keyword(Keyword::Include),
                TokenKind::Keyword(Keyword::Define),
                TokenKind::Keyword(Keyword::ObjectSelf),
                TokenKind::Keyword(Keyword::JsonTrue),
                TokenKind::Keyword(Keyword::FileMacro),
                TokenKind::Keyword(Keyword::EngineStructureDefinition),
                TokenKind::Eof,
            ])
        );
    }

    /// Comments are skipped, numeric text is normalized, and the longest
    /// operator match wins.
    #[test]
    fn lexes_comments_numbers_and_operators() {
        let tokens = lex_text(
            SourceId::new(2),
            "// header\n0xAB 0b10 0o77 .42 5.f 6f >>= >>>= && ||",
        );

        let pairs = tokens.ok().map(|items| {
            items
                .into_iter()
                .map(|token| (token.kind, token.text))
                .collect::<Vec<_>>()
        });

        assert_eq!(
            pairs,
            Some(vec![
                (TokenKind::HexInteger, "0xab".to_string()),
                (TokenKind::BinaryInteger, "0b10".to_string()),
                (TokenKind::OctalInteger, "0o77".to_string()),
                (TokenKind::Float, "0.42".to_string()),
                (TokenKind::Float, "5.".to_string()),
                (TokenKind::Float, "6".to_string()),
                (TokenKind::AssignShiftRight, ">>=".to_string()),
                (TokenKind::AssignUnsignedShiftRight, ">>>=".to_string()),
                (TokenKind::LogicalAnd, "&&".to_string()),
                (TokenKind::LogicalOr, "||".to_string()),
                (TokenKind::Eof, "".to_string()),
            ])
        );
    }

    /// Escapes are decoded in ordinary and hashed strings; raw strings only
    /// collapse doubled quotes.
    #[test]
    fn lexes_strings_raw_strings_and_hashed_strings() {
        let tokens = lex_text(
            SourceId::new(3),
            "\"a\\n\\\"\\\\\\x41\" r\"alpha\"\"beta\" h\"tag\\x3f\"",
        );

        let pairs = tokens.ok().map(|items| {
            items
                .into_iter()
                .map(|token| (token.kind, token.text))
                .collect::<Vec<_>>()
        });

        assert_eq!(
            pairs,
            Some(vec![
                (TokenKind::String, "a\n\"\\A".to_string()),
                (TokenKind::String, "alpha\"beta".to_string()),
                (
                    TokenKind::HexInteger,
                    format!("0x{:x}", nwscript_string_hash_bytes(b"tag?") as u32),
                ),
                (TokenKind::Eof, "".to_string()),
            ])
        );
    }

    /// Hashed strings lower to the hex text of the hash of their decoded
    /// bytes.
    #[test]
    fn lowers_hashed_strings_to_exact_upstream_hex_integers() {
        let tokens = lex_text(
            SourceId::new(5),
            "h\"hello\" H\"\" h\"\\\"\\n\\\\\\xFF\\x80\"",
        );
        let pairs = tokens.ok().map(|items| {
            items
                .into_iter()
                .map(|token| (token.kind, token.text))
                .collect::<Vec<_>>()
        });

        assert_eq!(
            pairs,
            Some(vec![
                (TokenKind::HexInteger, "0xf9cc2afc".to_string()),
                (TokenKind::HexInteger, "0x0".to_string()),
                (
                    TokenKind::HexInteger,
                    format!(
                        "0x{:x}",
                        nwscript_string_hash_bytes(&[b'"', b'\n', b'\\', 0xff, 0x80]) as u32
                    ),
                ),
                (TokenKind::Eof, "".to_string()),
            ])
        );
    }

    /// An unknown `#` word is rejected with `EllipsisInIdentifier`.
    #[test]
    fn rejects_unknown_hash_prefixed_identifier_like_upstream() {
        let error = lex_text(SourceId::new(4), "#pragma").err();

        assert_eq!(
            error.map(|item| item.code),
            Some(crate::CompilerErrorCode::EllipsisInIdentifier)
        );
    }

    /// Bytes that are not valid UTF-8 survive as chars with the same code
    /// point.
    #[test]
    fn lexes_non_utf8_string_bytes_without_rejecting_source() {
        let tokens = lex_bytes(SourceId::new(6), b"\"a\x93\xff\"");
        let string_token = tokens.ok().and_then(|items| {
            items
                .into_iter()
                .find(|token| token.kind == TokenKind::String)
        });

        let codepoints =
            string_token.map(|token| token.text.chars().map(|ch| ch as u32).collect::<Vec<_>>());

        assert_eq!(codepoints, Some(vec![0x61, 0x93, 0xff]));
    }
}
879}