Skip to main content

ptx_parser/
lexer.rs

1use crate::parser::Span;
2use logos::Logos;
3
4/// PTX specification token types for lexical analysis.
5///
6/// This enum represents all token types that can appear in PTX assembly code,
7/// including keywords, operators, literals, identifiers, and special markers.
8#[derive(Logos, Debug, Clone, PartialEq, Eq)]
9#[logos(error = LexError)]
10#[logos(skip r"[ \t\r\n]+")]
11pub enum PtxToken {
12    #[regex(r"//[^\n]*", logos::skip)]
13    #[regex(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/", logos::skip)]
14    #[token("::")]
15    DoubleColon,
16    #[token(".")]
17    Dot,
18    #[token(",")]
19    Comma,
20    #[token(";")]
21    Semicolon,
22    #[token(":")]
23    Colon,
24    #[token("(")]
25    LParen,
26    #[token(")")]
27    RParen,
28    #[token("[")]
29    LBracket,
30    #[token("]")]
31    RBracket,
32    #[token("{")]
33    LBrace,
34    #[token("}")]
35    RBrace,
36    #[token("+")]
37    Plus,
38    #[token("-")]
39    Minus,
40    #[token("*")]
41    Star,
42    #[token("/")]
43    Slash,
44    #[token("<")]
45    LAngle,
46    #[token(">")]
47    RAngle,
48    #[token("=")]
49    Equals,
50    #[token("%")]
51    Percent,
52    #[token("!")]
53    Exclaim,
54    #[token("|")]
55    Pipe,
56    #[token("&")]
57    Ampersand,
58    #[token("^")]
59    Caret,
60    #[token("~")]
61    Tilde,
62    #[token("@")]
63    At,
64    // Single-precision hex float: 0f12345678
65    #[regex(r"0[fF][0-9a-fA-F]{8}", |lex| lex.slice().to_string())]
66    HexFloatSingle(String),
67    // Double-precision hex float: 0d1234567890abcdef
68    #[regex(r"0[dD][0-9a-fA-F]{16}", |lex| lex.slice().to_string())]
69    HexFloatDouble(String),
70    #[regex(r"0[xX][0-9a-fA-F]+U?", |lex| lex.slice().to_string())]
71    HexInteger(String),
72    #[regex(r"0[bB][01]+U?", |lex| lex.slice().to_string())]
73    BinaryInteger(String),
74    #[regex(r"0[0-7]+U?", |lex| lex.slice().to_string())]
75    OctalInteger(String),
76    #[regex(r"[0-9]+\.[0-9]+[eE][+-]?[0-9]+", |lex| lex.slice().to_string())]
77    #[regex(r"[0-9]+[eE][+-]?[0-9]+", |lex| lex.slice().to_string())]
78    FloatExponent(String),
79    #[regex(r"[0-9]+\.[0-9]+", |lex| lex.slice().to_string())]
80    Float(String),
81    #[regex(r"[1-9][0-9]*U?", priority = 2, callback = |lex| lex.slice().to_string())]
82    #[regex(r"0U?", |lex| lex.slice().to_string())]
83    DecimalInteger(String),
84    #[regex(r"%[a-zA-Z_][a-zA-Z0-9_]*", priority = 2, callback = |lex| lex.slice().to_string())]
85    Register(String),
86    #[regex(r"[a-zA-Z][a-zA-Z0-9_$]*", |lex| lex.slice().to_string())]
87    #[regex(r"[_$%][a-zA-Z0-9_$]*", priority = 1, callback = |lex| lex.slice().to_string())]
88    Identifier(String),
89    /// Whitespace tokens emitted by the unparser only (never produced by the lexer).
90    #[token("\u{0000}")]
91    Space,
92    #[token("\u{0001}")]
93    Newline,
94    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
95        let slice = lex.slice();
96        slice[1..slice.len() - 1].to_string()
97    })]
98    StringLiteral(String),
99}
100
101impl PtxToken {
102    /// Extract the string value from a token, if it has one
103    pub fn as_str(&self) -> &str {
104        match self {
105            PtxToken::Identifier(s)
106            | PtxToken::DecimalInteger(s)
107            | PtxToken::HexInteger(s)
108            | PtxToken::BinaryInteger(s)
109            | PtxToken::OctalInteger(s)
110            | PtxToken::Float(s)
111            | PtxToken::FloatExponent(s)
112            | PtxToken::HexFloatSingle(s)
113            | PtxToken::HexFloatDouble(s)
114            | PtxToken::Register(s)
115            | PtxToken::StringLiteral(s) => s.as_str(),
116            PtxToken::DoubleColon => "::",
117            PtxToken::Dot => ".",
118            PtxToken::Comma => ",",
119            PtxToken::Semicolon => ";",
120            PtxToken::Colon => ":",
121            PtxToken::LParen => "(",
122            PtxToken::RParen => ")",
123            PtxToken::LBracket => "[",
124            PtxToken::RBracket => "]",
125            PtxToken::LBrace => "{",
126            PtxToken::RBrace => "}",
127            PtxToken::Plus => "+",
128            PtxToken::Minus => "-",
129            PtxToken::Star => "*",
130            PtxToken::Slash => "/",
131            PtxToken::LAngle => "<",
132            PtxToken::RAngle => ">",
133            PtxToken::Equals => "=",
134            PtxToken::Percent => "%",
135            PtxToken::Exclaim => "!",
136            PtxToken::Pipe => "|",
137            PtxToken::Ampersand => "&",
138            PtxToken::Caret => "^",
139            PtxToken::Tilde => "~",
140            PtxToken::At => "@",
141            PtxToken::Space => " ",
142            PtxToken::Newline => "\n",
143        }
144    }
145
146    pub fn len(&self) -> usize {
147        match self {
148            PtxToken::Identifier(s)
149            | PtxToken::DecimalInteger(s)
150            | PtxToken::HexInteger(s)
151            | PtxToken::BinaryInteger(s)
152            | PtxToken::OctalInteger(s)
153            | PtxToken::Float(s)
154            | PtxToken::FloatExponent(s)
155            | PtxToken::HexFloatSingle(s)
156            | PtxToken::HexFloatDouble(s)
157            | PtxToken::Register(s)
158            | PtxToken::StringLiteral(s) => s.len(),
159            PtxToken::DoubleColon => 2,
160            PtxToken::Dot
161            | PtxToken::Comma
162            | PtxToken::Semicolon
163            | PtxToken::Colon
164            | PtxToken::LParen
165            | PtxToken::RParen
166            | PtxToken::LBracket
167            | PtxToken::RBracket
168            | PtxToken::LBrace
169            | PtxToken::RBrace
170            | PtxToken::Plus
171            | PtxToken::Minus
172            | PtxToken::Star
173            | PtxToken::Slash
174            | PtxToken::LAngle
175            | PtxToken::RAngle
176            | PtxToken::Equals
177            | PtxToken::Percent
178            | PtxToken::Exclaim
179            | PtxToken::Pipe
180            | PtxToken::Ampersand
181            | PtxToken::Caret
182            | PtxToken::Tilde
183            | PtxToken::At
184            | PtxToken::Space
185            | PtxToken::Newline => 1,
186        }
187    }
188}
189
190/// Lexical analysis error type.
191#[derive(Debug, Clone, PartialEq, Default)]
192pub struct LexError {
193    /// The span in the source code where the error occurred
194    pub span: Span,
195}
196
197impl From<Span> for LexError {
198    fn from(span: Span) -> Self {
199        LexError { span }
200    }
201}
202
203/// Tokenize a PTX source string into a sequence of tokens with their spans.
204///
205/// This is the main entry point for lexical analysis. It converts raw PTX
206/// source code into a vector of tokens that can be parsed.
207///
208/// # Arguments
209///
210/// * `source` - The PTX source code as a string slice
211///
212/// # Returns
213///
214/// Returns a vector of tuples containing each token and its span in the source,
215/// or a `LexError` if tokenization fails.
216///
217/// # Example
218///
219/// ```no_run
220/// use ptx_parser::tokenize;
221///
222/// let source = ".version 8.5\n.target sm_90";
223/// let tokens = tokenize(source).expect("Failed to tokenize");
224/// ```
225pub fn tokenize(source: &str) -> Result<Vec<(PtxToken, Span)>, LexError> {
226    let mut lexer = PtxToken::lexer(source);
227    let mut tokens = Vec::new();
228
229    while let Some(item) = lexer.next() {
230        match item {
231            Ok(token) => tokens.push((token, Span::from(lexer.span()))),
232            Err(_) => {
233                return Err(LexError {
234                    span: Span::from(lexer.span()),
235                });
236            }
237        }
238    }
239
240    Ok(tokens)
241}