mod error;
mod quoted_strings;
mod wrapped_parsers;
use std::{num::NonZeroU8, str::FromStr};
use lexical::{
FromLexicalWithOptions, NumberFormatBuilder, ParseFloatOptions, ParseIntegerOptions,
};
use nom::{
bytes::complete::{is_a, take_till, take_while, take_while1},
character::complete::one_of,
combinator::{all_consuming, cut, map, peek, recognize, value},
multi::many0,
sequence::{pair, preceded, terminated, tuple},
Finish, IResult, Slice as _,
};
use nom_locate::LocatedSpan;
use wrapped_parsers::{alt, tag, tag_no_case};
pub use super::token::{KeywordToken, Token, TokenWithLocation};
use crate::parser::lexer::wrapped_parsers::expecting;
use crate::parser::token::token_with_location;
pub(crate) use error::InternalLexError;
pub use error::{LexError, LexErrorKind};
/// The instruction keywords recognized by the lexer.
///
/// `strum` derives `Display` and `FromStr` using `SCREAMING-KEBAB-CASE`, so
/// [`Command::JumpWhen`] is spelled `JUMP-WHEN`. The `DEF*` variants override
/// that spelling so no hyphen is inserted (`DEFCAL`, not `DEF-CAL`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
pub enum Command {
Add,
And,
Ashr,
Call,
Capture,
Convert,
Declare,
#[strum(to_string = "DEFCAL")]
DefCal,
#[strum(to_string = "DEFCIRCUIT")]
DefCircuit,
#[strum(to_string = "DEFFRAME")]
DefFrame,
#[strum(to_string = "DEFGATE")]
DefGate,
#[strum(to_string = "DEFWAVEFORM")]
DefWaveform,
Delay,
Div,
Eq,
Exchange,
Fence,
GE,
GT,
Halt,
Include,
Ior,
Jump,
JumpUnless,
JumpWhen,
Label,
LE,
Load,
LT,
Measure,
Move,
Mul,
Neg,
Nop,
Not,
Pragma,
Pulse,
RawCapture,
Reset,
SetFrequency,
SetPhase,
SetScale,
ShiftFrequency,
ShiftPhase,
Shl,
Shr,
Store,
Sub,
SwapPhases,
Wait,
Xor,
}
/// Data type keywords, spelled in upper case (`BIT`, `OCTET`, `REAL`, `INTEGER`).
///
/// Parsing is case-sensitive and exact: `BIT` is a data type, `BITS` is not.
#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum DataType {
Bit,
Octet,
Real,
Integer,
}
/// Modifier keywords, spelled in upper case (`CONTROLLED`, `DAGGER`, `FORKED`).
#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum Modifier {
Controlled,
Dagger,
Forked, }
/// Single-character operator symbols.
///
/// Only `Display` is derived here; each variant prints as the symbol named in
/// its `strum` attribute. Lexing the symbols is done by `lex_operator`.
#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
pub enum Operator {
#[strum(serialize = "^")]
Caret,
#[strum(serialize = "-")]
Minus,
#[strum(serialize = "+")]
Plus,
#[strum(serialize = "/")]
Slash,
#[strum(serialize = "*")]
Star,
}
/// Input type consumed by every lexer function: the source text wrapped in a [`LocatedSpan`].
pub type LexInput<'a> = LocatedSpan<&'a str>;
/// Result type of the lexer's internal parsers. By default it yields a [`Token`] and fails
/// with an [`InternalLexError`].
pub(crate) type InternalLexResult<'a, T = Token, E = InternalLexError<'a>> =
IResult<LexInput<'a>, T, E>;
/// Public counterpart of `InternalLexResult`, failing with a [`LexError`] by default.
pub type LexResult<'a, T = Token, E = LexError> = IResult<LexInput<'a>, T, E>;
/// Lexes the whole of `input` into a list of located tokens.
///
/// # Errors
///
/// Returns a [`LexError`] if any part of the input cannot be lexed; the entire
/// input must be consumed for the call to succeed.
pub(crate) fn lex(input: LexInput) -> Result<Vec<TokenWithLocation>, LexError> {
    match all_consuming(_lex)(input).finish() {
        Ok((_remaining, tokens)) => Ok(tokens),
        Err(internal) => Err(LexError::from(internal)),
    }
}
/// Lexes zero or more tokens, then discards any trailing newlines, tabs and spaces.
///
/// At each position indentation is tried first; otherwise any run of separator
/// spaces is skipped and a single token is lexed.
fn _lex(input: LexInput) -> InternalLexResult<Vec<TokenWithLocation>> {
    let spaced_token = preceded(many0(tag(" ")), lex_token);
    let tokens = many0(alt(
        "indentation or a token preceded by whitespace",
        (lex_indent, spaced_token),
    ));
    let trailing_whitespace = many0(one_of("\n\t "));
    terminated(tokens, trailing_whitespace)(input)
}
/// Lexes one level of indentation: exactly four spaces or a single tab.
///
/// `_lex` tries this parser before it skips separator spaces, so it must not
/// match a lone space; otherwise every space between two tokens would be
/// reported as [`Token::Indentation`].
fn lex_indent(input: LexInput) -> InternalLexResult<TokenWithLocation> {
    alt(
        "indentation",
        (
            token_with_location(value(Token::Indentation, tag("    "))),
            token_with_location(value(Token::Indentation, tag("\t"))),
        ),
    )(input)
}
/// Lexes a single token and attaches its source location.
///
/// The candidates are tried in the order listed and the first match wins, so
/// the order is significant.
fn lex_token(input: LexInput) -> InternalLexResult<TokenWithLocation> {
    let candidates = (
        token_with_location(lex_comment),
        token_with_location(lex_punctuation),
        token_with_location(lex_target),
        token_with_location(lex_string),
        token_with_location(lex_operator),
        token_with_location(lex_variable),
        token_with_location(lex_keyword_or_identifier),
        token_with_location(lex_number),
    );
    alt("a token", candidates)(input)
}
fn lex_comment(input: LexInput) -> InternalLexResult {
let (input, _) = tag("#")(input)?;
let (input, content) = take_till(|c| c == '\n')(input)?;
Ok((input, Token::Comment(content.to_string())))
}
fn keyword_or_identifier(identifier: String) -> Token {
fn parse<T: FromStr>(token: impl Fn(T) -> Token, identifier: &str) -> Result<Token, T::Err> {
T::from_str(identifier).map(token)
}
parse(KeywordToken::into, &identifier)
.or_else(|_| parse(Token::Command, &identifier))
.or_else(|_| parse(Token::DataType, &identifier))
.or_else(|_| parse(Token::Modifier, &identifier))
.unwrap_or(Token::Identifier(identifier))
}
/// Returns `true` if `chr` may start an identifier: an ASCII letter or `_`.
fn is_valid_identifier_leading_character(chr: char) -> bool {
    matches!(chr, 'a'..='z' | 'A'..='Z' | '_')
}
/// Returns `true` if `chr` may appear after the first character of an
/// identifier: an ASCII digit, or anything allowed as a leading character.
fn is_valid_identifier_end_character(chr: char) -> bool {
    chr.is_ascii_digit() || is_valid_identifier_leading_character(chr)
}
/// Returns `true` only for the ASCII hyphen-minus character `-`.
fn is_dash(chr: char) -> bool {
    matches!(chr, '-')
}
/// Lexes the text of an identifier without classifying it.
///
/// An identifier is one or more leading characters, then any number of
/// identifier characters, then any number of groups made of one or more dashes
/// followed by one or more identifier characters. A trailing dash is therefore
/// never part of the identifier.
fn lex_identifier_raw(input: LexInput) -> InternalLexResult<String> {
    expecting(
        "a valid identifier",
        map(
            // The three pieces are adjacent, so recognizing the whole sequence
            // yields the same text as concatenating them.
            recognize(tuple::<_, _, InternalLexError, _>((
                take_while1(is_valid_identifier_leading_character),
                take_while(is_valid_identifier_end_character),
                many0(pair(
                    take_while1(is_dash),
                    take_while1(is_valid_identifier_end_character),
                )),
            ))),
            |identifier: LexInput| identifier.to_string(),
        ),
    )(input)
}
fn lex_keyword_or_identifier(input: LexInput) -> InternalLexResult {
let (input, identifier) = lex_identifier_raw(input)?;
let token = keyword_or_identifier(identifier);
Ok((input, token))
}
fn lex_target(input: LexInput) -> InternalLexResult {
let (input, _) = tag("@")(input)?;
let (input, label) = lex_identifier_raw(input)?;
Ok((input, Token::Target(label)))
}
/// Builds the packed `lexical` number format shared by the numeric lexers.
///
/// * `radix` - radix of the mantissa digits; also used as the exponent base.
/// * `prefix` - optional base-prefix character (such as the `x` of `0x`);
///   `None` means the format has no prefix.
///
/// The builder calls are kept in their original order on purpose:
/// `digit_separator_flags(true)` is followed by calls that switch individual
/// separator positions back off.
const fn number_format(radix: u8, prefix: Option<NonZeroU8>) -> u128 {
    let exponent_base = NonZeroU8::new(radix);
    // Exponent digits are always written in decimal, whatever the mantissa radix.
    let exponent_radix = NonZeroU8::new(10);
    let separator = NonZeroU8::new(b'_');

    NumberFormatBuilder::new()
        .mantissa_radix(radix)
        .exponent_base(exponent_base)
        .exponent_radix(exponent_radix)
        .base_prefix(prefix)
        .digit_separator(separator)
        // Which digit groups must be present.
        .required_integer_digits(false)
        .required_fraction_digits(false)
        .required_exponent_digits(true)
        .required_mantissa_digits(true)
        // Mantissa sign handling.
        .no_positive_mantissa_sign(true)
        .required_mantissa_sign(false)
        // Exponent notation and exponent sign handling.
        .no_exponent_notation(false)
        .no_positive_exponent_sign(false)
        .required_exponent_sign(false)
        .no_exponent_without_fraction(false)
        // Special values are not parsed as numbers; leading zeros are accepted.
        .no_special(true)
        .no_integer_leading_zeros(false)
        .no_float_leading_zeros(false)
        .required_exponent_notation(false)
        // The exponent marker and the base prefix are matched case-insensitively.
        .case_sensitive_exponent(false)
        .case_sensitive_base_prefix(false)
        // Turn on every digit-separator flag, then switch selected ones back off.
        .digit_separator_flags(true)
        .integer_leading_digit_separator(false)
        .fraction_leading_digit_separator(false)
        .special_digit_separator(false)
        .build_strict()
}
/// Default `lexical` options used when parsing integers.
const INTEGER_OPTIONS: ParseIntegerOptions = ParseIntegerOptions::new();
/// `lexical` options used when parsing floats: `e` marks the exponent and `.` is the decimal
/// point.
const FLOAT_OPTIONS: ParseFloatOptions = ParseFloatOptions::builder()
.exponent(b'e')
.decimal_point(b'.')
.build_strict();
/// Builds a parser that reads a number of type `N`, in the `lexical` format `FORMAT`, from the
/// start of the input.
///
/// `options` are the `lexical` parse options for `N`. On success the returned parser yields the
/// parsed value together with the input that follows the consumed bytes.
///
/// Overflow and underflow are reported as `nom::Err::Failure`; every other `lexical` error is
/// reported as `nom::Err::Error`. All errors are anchored at the start of the number.
fn lex_and_parse_number<N: FromLexicalWithOptions, const FORMAT: u128>(
options: &'static N::Options,
) -> impl FnMut(LexInput) -> InternalLexResult<N> {
// Returns the parsed value and the number of bytes it occupies in `input`.
#[inline(always)]
fn parse<N: FromLexicalWithOptions, const FORMAT: u128>(
input: LexInput,
options: &'static N::Options,
) -> lexical::Result<(N, usize)> {
// Parse as much of the input as forms a number; `len` is the number of bytes consumed.
let result @ (_, len) =
lexical::parse_partial_with_options::<N, _, FORMAT>(input, options)?;
// For a format with a base prefix, consuming exactly two bytes is rejected as an empty
// integer (presumably only the prefix itself, e.g. `0x`, was read).
if const {
NumberFormatBuilder::rebuild(FORMAT)
.get_base_prefix()
.is_some()
} && len == 2
{
return Err(lexical::Error::EmptyInteger(2));
}
// If the consumed text contains `._`, keep only the part up to and including the dot and
// re-parse just that, so the underscore and what follows it are left unconsumed.
if let Some(dot) = input.slice(..len).find("._") {
let include_dot = dot + 1;
let number =
lexical::parse_with_options::<N, _, FORMAT>(input.slice(..include_dot), options)?;
return Ok((number, include_dot));
}
Ok(result)
}
move |input| {
let (num, len) = parse::<N, FORMAT>(input, options).map_err(|lex_err| {
let error = InternalLexError::from_kind(input, lex_err.into());
match lex_err {
// A number that is well-formed but out of range is a hard failure.
lexical::Error::Overflow(_) | lexical::Error::Underflow(_) => {
nom::Err::Failure(error)
}
_ => nom::Err::Error(error),
}
})?;
Ok((input.slice(len..), num))
}
}
/// Lexes a `u64` integer in the `lexical` format `FORMAT`.
///
/// `PREFIX` is the ASCII base-prefix character that follows a leading `0`, or
/// `0` when the format has no prefix. For a prefixed format the input must
/// start with `0` plus the prefix (in either case); once it does, any parse
/// error is promoted to a failure by `cut`.
fn raw_lex_integer<const PREFIX: u8, const FORMAT: u128>(
    input: LexInput,
) -> InternalLexResult<u64> {
    const {
        assert!(PREFIX <= 127, "PREFIX must be an ASCII character");
    }
    if PREFIX != 0 {
        // Only peek at the prefix: the number parser consumes it itself.
        let _prefix = peek(tag_no_case(std::str::from_utf8(&[b'0', PREFIX]).unwrap()))(input)?;
        return cut(lex_and_parse_number::<u64, FORMAT>(&INTEGER_OPTIONS))(input);
    }
    lex_and_parse_number::<u64, FORMAT>(&INTEGER_OPTIONS)(input)
}
// Defines `lex_<name>_integer`, a `u64` integer lexer for the given radix.
//
// The two-argument form passes `0` as the prefix, which `raw_lex_integer` treats as "no base
// prefix". The three-argument form takes the prefix character that follows the leading `0`.
macro_rules! def_radix {
($name:ident, $radix:literal $(,)?) => {
def_radix!($name, $radix, 0);
};
($name:ident, $radix:literal, $prefix:literal $(,)?) => {
paste::paste! {
#[inline]
fn [< lex_ $name _integer >](input: LexInput) -> InternalLexResult<u64> {
raw_lex_integer::<$prefix, { number_format($radix, NonZeroU8::new($prefix)) }>(
input
)
}
}
};
}
// Generates `lex_binary_integer`, `lex_decimal_integer`, `lex_octal_integer` and
// `lex_hexadecimal_integer`.
def_radix!(binary, 2, b'b');
def_radix!(decimal, 10);
def_radix!(octal, 8, b'o');
def_radix!(hexadecimal, 16, b'x');
/// Lexes a decimal number as either a [`Token::Integer`] or a [`Token::Float`].
///
/// Input starting with `.` is parsed as a float. Otherwise a decimal integer
/// is parsed first; if it is directly followed by `.`, `e` or `E`, the text is
/// re-parsed from the start as a float, else the integer is returned.
///
/// # Errors
///
/// A float that parses to a non-finite value is reported as an overflow
/// failure anchored at the start of the literal, matching where every other
/// numeric error is reported.
fn lex_decimal_number(input: LexInput) -> InternalLexResult {
    let parse_float = |input| {
        let (rest, float) = cut(lex_and_parse_number::<f64, { number_format(10, None) }>(
            &FLOAT_OPTIONS,
        ))(input)?;
        if !float.is_finite() {
            // Anchor the error at the literal itself (`input`), not at the
            // text that follows it (`rest`).
            return Err(nom::Err::Failure(InternalLexError::from_kind(
                input,
                lexical::Error::Overflow(0).into(),
            )));
        }
        Ok((rest, Token::Float(float)))
    };

    if input.as_bytes().first() == Some(&b'.') {
        return parse_float(input);
    }

    // NOTE(review): an integer part too large for `u64` fails here, before the
    // float path is tried - confirm that this is intended.
    let (input_if_int, int) = lex_decimal_integer(input)?;
    let continues_as_float = input_if_int
        .as_bytes()
        .first()
        .is_some_and(|next| b".eE".contains(next));
    if continues_as_float {
        parse_float(input)
    } else {
        Ok((input_if_int, Token::Integer(int)))
    }
}
/// Lexes a numeric literal.
///
/// The prefixed radixes (binary, octal, hexadecimal) are tried first and
/// always produce integers; anything else is handled as a decimal number.
fn lex_number(input: LexInput) -> InternalLexResult {
    let binary = map(lex_binary_integer, Token::Integer);
    let octal = map(lex_octal_integer, Token::Integer);
    let hexadecimal = map(lex_hexadecimal_integer, Token::Integer);
    alt("number", (binary, octal, hexadecimal, lex_decimal_number))(input)
}
fn lex_operator(input: LexInput) -> InternalLexResult {
use Operator::*;
map(
alt(
"an operator",
(
value(Caret, tag("^")),
value(Minus, tag("-")),
value(Plus, tag("+")),
value(Slash, tag("/")),
value(Star, tag("*")),
),
),
Token::Operator,
)(input)
}
/// Recognizes a run of line-break characters and returns the matched span.
///
/// A run made only of `\n` is tried first; failing that, a run made of any mix
/// of `\r` and `\n` is accepted.
fn recognize_newlines(input: LexInput) -> InternalLexResult<LexInput> {
    let line_feeds = is_a::<_, _, InternalLexError>("\n");
    let carriage_returns_or_line_feeds = is_a::<_, _, InternalLexError>("\r\n");
    alt(
        "one or more newlines",
        (line_feeds, carriage_returns_or_line_feeds),
    )(input)
}
/// Lexes a punctuation token: `!`, `:`, `,`, indentation, brackets,
/// parentheses, newlines or `;`.
///
/// Indentation here is four spaces or a tab, exactly as the alternative's
/// label states and as `lex_indent` defines it.
fn lex_punctuation(input: LexInput) -> InternalLexResult {
    use Token::*;
    alt(
        "punctuation",
        (
            value(Bang, tag("!")),
            value(Colon, tag(":")),
            value(Comma, tag(",")),
            value(
                Indentation,
                alt("four spaces or a tab character", (tag("    "), tag("\t"))),
            ),
            value(LBracket, tag("[")),
            value(LParenthesis, tag("(")),
            value(NewLine, recognize_newlines),
            value(RBracket, tag("]")),
            value(RParenthesis, tag(")")),
            value(Semicolon, tag(";")),
        ),
    )(input)
}
/// Lexes a quoted string and wraps its unescaped contents in [`Token::String`].
fn lex_string(input: LexInput) -> InternalLexResult {
    let (rest, contents) = quoted_strings::unescaped_quoted_string(input)?;
    Ok((rest, Token::String(contents)))
}
fn lex_variable(input: LexInput) -> InternalLexResult {
map(preceded(tag("%"), lex_identifier_raw), |ident| {
Token::Variable(ident)
})(input)
}
#[cfg(test)]
mod tests {
use nom_locate::LocatedSpan;
use rstest::*;
use crate::parser::{common::tests::KITCHEN_SINK_QUIL, DataType};
use super::{lex, Command, Operator, Token};
/// Comments keep their text verbatim, may be empty, and end at the newline.
#[test]
fn comment() {
    let expected = vec![
        Token::Comment(" hello".to_owned()),
        Token::NewLine,
        Token::Comment("world".to_owned()),
        Token::NewLine,
        Token::Comment("".to_owned()),
        Token::NewLine,
        Token::Comment("".to_owned()),
    ];
    let tokens = lex(LocatedSpan::new("# hello\n#world\n#\n#")).unwrap();
    assert_eq!(tokens, expected);
}
/// Keywords and commands are case-sensitive and must match a whole word.
#[test]
fn keywords() {
    let expected = vec![
        Token::Command(Command::DefGate),
        Token::Command(Command::DefCircuit),
        Token::Command(Command::JumpWhen),
        Token::Matrix,
        Token::Command(Command::Load),
        Token::Identifier(String::from("load")),
        Token::Identifier(String::from("LOAD-MEMORY")),
    ];
    let source = "DEFGATE DEFCIRCUIT JUMP-WHEN MATRIX LOAD load LOAD-MEMORY";
    let tokens = lex(LocatedSpan::new(source)).unwrap();
    assert_eq!(tokens, expected);
}
/// Lexes `input` and asserts that it produces exactly the `expected` tokens.
///
/// The cases below cover numbers that are directly followed by other text. The function is
/// also called directly by the tests generated through `number_test_builder!`.
#[rstest]
#[case::bin_dot("0b10.1", [Token::Integer(0b10), Token::Float(0.1)])]
#[case::oct_dot("0o777.7", [Token::Integer(0o777), Token::Float(0.7)])]
#[case::hex_dot("0x3.4", [Token::Integer(0x3), Token::Float(0.4)])]
#[case::imaginary("1i", [Token::Integer(1), Token::Identifier("i".to_owned())])]
#[case::complex(
"1 + 2i",
[
Token::Integer(1),
Token::Operator(Operator::Plus),
Token::Integer(2),
Token::Identifier("i".to_owned()),
],
)]
#[case::bin_imaginary("0b10i", [Token::Integer(0b10), Token::Identifier("i".to_owned())])]
#[case::oct_imaginary("0o10i", [Token::Integer(0o10), Token::Identifier("i".to_owned())])]
#[case::hex_imaginary("0x10i", [Token::Integer(0x10), Token::Identifier("i".to_owned())])]
#[case::zero_dot_underscore_one("0._1", [Token::Float(0.0), Token::Identifier("_1".to_owned())])]
#[case::zero_dot_underscore("0._", [Token::Float(0.0), Token::Identifier("_".to_owned())])]
fn tokenization<const N: usize>(#[case] input: &str, #[case] expected: [Token; N]) {
let tokens = lex(LocatedSpan::new(input)).expect("lexing error");
// The custom message prints the tokens without their location information.
assert_eq!(
tokens,
expected,
"lexing {input:?}:\n\
- got: {plain_tokens:?},\n\
- expected: {expected:?}",
plain_tokens = tokens
.iter()
.map(|located| located.as_token())
.collect::<Vec<_>>(),
);
}
/// Every input listed here must be rejected by the lexer.
///
/// The `expect_err` message is only shown when lexing unexpectedly succeeds,
/// so it describes that situation.
#[rstest]
#[case::bin_prefix_only("0b")]
#[case::oct_prefix_only("0o")]
#[case::hex_prefix_only("0x")]
#[case::bin_prefix_dot("0b.")]
#[case::oct_prefix_dot("0o.")]
#[case::hex_prefix_dot("0x.")]
#[case::bin_prefix_dot_one("0b.1")]
#[case::oct_prefix_dot_one("0o.1")]
#[case::hex_prefix_dot_one("0x.1")]
#[case::int_too_big("0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF")]
#[case::float_too_big("1e1_000_000")]
#[case::dot_underscore_one("._1")]
#[case::dot_underscore("._")]
fn bad_token(#[case] input: &str) {
    let _ = lex(LocatedSpan::new(input)).expect_err("input should have failed to lex");
}
// Builds one `rstest` function with a `#[case]` per listed number.
//
// The first braced group is an accumulator: it starts as `{ test_name, Token variant, type }`
// and gains one `#[case::…]` attribute per processed entry. The first three arms rewrite the
// shorthand entry forms into the full `{ name, input, expected }` record, the fourth arm turns
// a record into a case, and the last arm emits the test function once no entries remain.
macro_rules! number_test_builder {
// `<literal>,` - the literal provides the case name, the input text and the expected value.
{
{ $($cases:tt)* }
$(#[$meta:meta])*
$number:literal,
$($rest:tt)*
} => {
number_test_builder! {
{ $($cases)* }
$(#[$meta])*
{ name: ($number), input: stringify!($number), expected: $number },
$($rest)*
}
};
// `lit(<pieces>) => <literal>,` - the input text is the concatenation of the stringified
// pieces, while the expected value is given separately.
{
{ $($cases:tt)* }
$(#[$meta:meta])*
lit($($piece:tt)+) => $number:literal,
$($rest:tt)*
} => {
number_test_builder! {
{ $($cases)* }
$(#[$meta])*
{
name: ($($piece)+),
input: concat!($(stringify!($piece)),+),
expected: $number,
},
$($rest)*
}
};
// `name(<pieces>) => <literal>,` - the pieces only provide the case name; the literal is
// both the input text and the expected value.
{
{ $($cases:tt)* }
$(#[$meta:meta])*
name($($piece:tt)+) => $number:literal,
$($rest:tt)*
} => {
number_test_builder! {
{ $($cases)* }
$(#[$meta])*
{
name: ($($piece)+),
input: stringify!($number),
expected: $number,
},
$($rest)*
}
};
// Full record - append `#[case::lex_<name fragments>](input, expected)` to the accumulator
// and continue with the remaining entries.
(
{ $($cases:tt)* }
$(#[$meta:meta])*
{
name: ($($name_fragment:tt)+),
input: $input:expr,
expected: $number:expr$(,)?
},
$($rest:tt)*
) => {
paste::paste! {
number_test_builder! {
{
$($cases)*
#[allow(
non_snake_case,
clippy::inconsistent_digit_grouping,
clippy::unusual_byte_groupings,
clippy::mixed_case_hex_literals,
clippy::zero_prefixed_literal,
)]
$(#[$meta])*
#[case::[<lex$(_$name_fragment)*>]($input, $number)]
}
$($rest)*
}
}
};
// No entries left - emit the test function, which expects each input to lex to exactly one
// token of the configured variant.
(
{
{ $test_name:ident, $token:ident, $ty:ty }
$($cases:tt)*
}
) => {
#[rstest]
$($cases)*
fn $test_name(#[case] input: &str, #[case] expected: $ty) {
tokenization(input, [Token::$token(expected)])
}
}
}
// Generates the `integer` test: every entry must lex to a single `Token::Integer` (`u64`).
macro_rules! integer_tests {
($($input:tt)*) => {
// Compile-time check that `stringify!` keeps a literal's digit separators and letter case
// unchanged, since the generated cases use `stringify!` to produce their input text.
const _: () = assert!(matches!(
stringify!(0x_12__34_aBCd__).as_bytes(),
b"0x_12__34_aBCd__"
));
number_test_builder! { { {integer, Integer, u64} } $($input)* }
}
}
// Generates the `float` test: every entry must lex to a single `Token::Float` (`f64`).
macro_rules! float_tests {
($($input:tt)*) => {
number_test_builder! { { {float, Float, f64} } $($input)* }
}
}
// Integer cases: decimal, hexadecimal, octal and binary literals, with digit separators,
// leading zeros and upper-case prefixes.
integer_tests! {
0,
1,
2_,
3__,
4_5,
6__7,
8__9__10,
0000042,
0x0,
0x7F,
lit(0X7f) => 0x7f,
0x__CaFe__B0bA__1234__,
0o0,
0o10,
lit(0O10) => 0o10,
0o__777__555__,
0b0,
0b101010,
lit(0B101010) => 0b101010,
0b__1111__0000__,
}
// Float cases: optional integer or fraction digits, digit separators, and exponents with and
// without a sign. Inputs that are not valid Rust literals are spelled out as strings.
float_tests! {
name(0dot) => 0.,
{ name: (dot0), input: ".0", expected: 0.0 },
name(0dot0) => 0.0,
name(1dot) => 1.,
{ name: (dot1), input: ".1", expected: 0.1 },
name(1dot1) => 1.1,
name(1__2__dot3__4__eplus__1__5__) => 1__2__.3__4__e+__1__5__,
name(1__2__dot3__4__eminus__1__5__) => 1__2__.3__4__e-__1__5__,
{
name: (1__2__dot3__4__e__1__5__),
input: "1__2__.3__4__e__1__5__",
expected: 1__2__.3__4__e+__1__5__
},
1e5,
{ name: (1dote5), input: "1.e5", expected: 1e5 },
{ name: (dot1e5), input: ".1e5", expected: 0.1e5 },
name(1dot1e5) => 1.1e5,
}
/// Integers, floats and `i`-suffixed numbers mixed on one line.
#[test]
fn a_bunch_of_numbers() {
    let expected = vec![
        Token::Integer(2),
        Token::Integer(2),
        Token::Identifier("i".to_owned()),
        Token::Float(2.0),
        Token::Float(2000f64),
        Token::Float(2000f64),
        Token::LParenthesis,
        Token::Integer(1),
        Token::Operator(Operator::Plus),
        Token::Integer(2),
        Token::Identifier("i".to_owned()),
        Token::RParenthesis,
    ];
    let tokens = lex(LocatedSpan::new("2 2i 2.0 2e3 2.0e3 (1+2i)")).unwrap();
    assert_eq!(tokens, expected);
}
/// Quoted strings lex to their contents, without the quotes.
#[test]
fn string() {
    let expected = vec![
        Token::String("hello".to_owned()),
        Token::NewLine,
        Token::String("world".to_owned()),
    ];
    let tokens = lex(LocatedSpan::new("\"hello\"\n\"world\"")).unwrap();
    assert_eq!(tokens, expected);
}
/// Space-separated tokens on a line produce no whitespace tokens between them.
#[test]
fn gate_operation() {
    let expected = vec![
        Token::Identifier("I".to_owned()),
        Token::Integer(0),
        Token::Semicolon,
        Token::Identifier("RX".to_owned()),
        Token::Integer(1),
        Token::NewLine,
        Token::Identifier("CZ".to_owned()),
        Token::Integer(0),
        Token::Integer(1),
    ];
    let tokens = lex(LocatedSpan::new("I 0; RX 1\nCZ 0 1")).unwrap();
    assert_eq!(tokens, expected);
}
/// `@name` lexes to a target holding just the name.
#[test]
fn label() {
    let expected = vec![
        Token::Target("hello".to_owned()),
        Token::NewLine,
        Token::Target("world".to_owned()),
    ];
    let tokens = lex(LocatedSpan::new("@hello\n@world")).unwrap();
    assert_eq!(tokens, expected);
}
/// Four spaces form exactly one indentation token.
#[test]
fn indentation() {
    let input = LocatedSpan::new("    ");
    let tokens = lex(input).unwrap();
    assert_eq!(tokens, vec![Token::Indentation])
}
/// A block body may be indented with a tab or with four spaces; both yield
/// one indentation token per line.
#[test]
fn indented_block() {
    let input = LocatedSpan::new("DEFGATE Name AS PERMUTATION:\n\t1,0\n    0,1");
    let tokens = lex(input).unwrap();
    assert_eq!(
        tokens,
        vec![
            Token::Command(Command::DefGate),
            Token::Identifier("Name".to_owned()),
            Token::As,
            Token::Permutation,
            Token::Colon,
            Token::NewLine,
            Token::Indentation,
            Token::Integer(1),
            Token::Comma,
            Token::Integer(0),
            Token::NewLine,
            Token::Indentation,
            Token::Integer(0),
            Token::Comma,
            Token::Integer(1),
        ]
    )
}
/// Leading newlines and a whitespace-only line are kept as tokens; the
/// whitespace-only line is four spaces, i.e. one indentation token.
#[test]
fn surrounding_whitespace() {
    let input = LocatedSpan::new("\nI 0\n    \n");
    let tokens = lex(input).unwrap();
    assert_eq!(
        tokens,
        vec![
            Token::NewLine,
            Token::Identifier("I".to_owned()),
            Token::Integer(0),
            Token::NewLine,
            Token::Indentation,
            Token::NewLine
        ]
    )
}
/// Identifier lexing: dashes are allowed inside an identifier but not at its end, reserved
/// words only match as whole words, and the spellings of special float values (`NaN`, `inf`,
/// `Infinity`, with or without a leading `-`) lex as identifiers rather than numbers.
#[rstest(input, expected,
case("_", vec![Token::Identifier("_".to_string())]),
case("a", vec![Token::Identifier("a".to_string())]),
case("_a-2_b-2_", vec![Token::Identifier("_a-2_b-2_".to_string())]),
case("a-2-%var", vec![
Token::Identifier("a-2".to_string()),
Token::Operator(Operator::Minus),
Token::Variable("var".to_string())
]),
case("BIT", vec![Token::DataType(DataType::Bit)]),
case("BITS", vec![Token::Identifier("BITS".to_string())]),
case("NaN", vec![Token::Identifier("NaN".to_string())]),
case("nan", vec![Token::Identifier("nan".to_string())]),
case("NaNa", vec![Token::Identifier("NaNa".to_string())]),
case("nana", vec![Token::Identifier("nana".to_string())]),
case("INF", vec![Token::Identifier("INF".to_string())]),
case("Infinity", vec![Token::Identifier("Infinity".to_string())]),
case("Inferior", vec![Token::Identifier("Inferior".to_string())]),
case("-NaN", vec![Token::Operator(Operator::Minus), Token::Identifier("NaN".to_string())]),
case("-inf", vec![Token::Operator(Operator::Minus), Token::Identifier("inf".to_string())]),
case("-Infinity", vec![
Token::Operator(Operator::Minus),
Token::Identifier("Infinity".to_string())
]),
case("-inferior", vec![
Token::Operator(Operator::Minus),
Token::Identifier("inferior".to_string())
]),
)]
fn it_lexes_identifier(input: &str, expected: Vec<Token>) {
let input = LocatedSpan::new(input);
let tokens = lex(input).unwrap();
assert_eq!(tokens, expected);
}
/// Text that must not lex as one identifier spanning the whole input.
///
/// Each `not_expected` value is the input itself wrapped in a single
/// identifier token, so the inequality check is meaningful. Inputs that fail
/// to lex at all also satisfy the test.
#[rstest(input, not_expected,
    case("a-", vec![Token::Identifier("a-".to_string())]),
    case("-a", vec![Token::Identifier("-a".to_string())]),
    case("a\\", vec![Token::Identifier("a\\".to_string())]),
)]
fn it_fails_to_lex_identifier(input: &str, not_expected: Vec<Token>) {
    let input = LocatedSpan::new(input);
    if let Ok(tokens) = lex(input) {
        assert_ne!(tokens, not_expected);
    }
}
/// The shared kitchen-sink program must lex without error.
#[test]
fn kitchen_sink() {
    let _tokens = lex(LocatedSpan::new(KITCHEN_SINK_QUIL)).unwrap();
}
}