//! relex: a library for building a regex-based lexer.

use crate::*;
use regex::RegexSet;
use std::rc::Rc;

// RecognizerBuilder {{{
/// A convenient builder-pattern struct that assembles `Rule`s into a
/// `Recognizer`
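///
/// # Example
///
/// A minimal sketch, assuming `TokenKind` is implemented for `&str` exactly as
/// this module's tests assume:
///
/// ```ignore
/// let recognizer = RecognizerBuilder::<&str>::new()
///   .token(Rule::new("ID", "[A-Za-z]+").unwrap())
///   .token(Rule::new("WS", "\\s+").unwrap().skip(true))
///   .build();
/// ```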
#[derive(Debug)]
pub struct RecognizerBuilder<K: TokenKind> {
  rules: Vec<Rule<K>>,
}
impl<K: TokenKind> Default for RecognizerBuilder<K> {
  fn default() -> Self {
    Self::new()
  }
}
impl<K: TokenKind> RecognizerBuilder<K> {
  pub fn new() -> Self {
    RecognizerBuilder { rules: vec![] }
  }

  pub fn token(mut self, rule: Rule<K>) -> Self {
    self.rules.push(rule);
    self
  }

  pub fn build(self) -> Recognizer<K> {
    // Every rule's pattern has already compiled on its own, so compiling the
    // combined set is expected to succeed.
    let re_set = RegexSet::new(self.rules.iter().map(|r| r.re.as_str())).unwrap();
    Recognizer {
      re_set,
      rules: self.rules,
    }
  }
}
// }}}

/// A stateless recognizer that matches a set of `Rule`s against source text
#[derive(Debug, Clone)]
pub struct Recognizer<K: TokenKind> {
  re_set: RegexSet,
  rules: Vec<Rule<K>>,
}
impl<K: TokenKind> Recognizer<K> {
  /// Given the source and a starting position, reads one of:
  /// * the next token, OR
  /// * EOF (if at the end of the source), OR
  /// * an `unrecognized` token covering the unexpected sequence found before
  ///   the next recognizable token
  ///
  /// Note: this method purposefully does NOT skip over skippable tokens
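  ///
  /// # Example
  ///
  /// A sketch, assuming a recognizer built with an `ID` rule and a skippable
  /// `WS` rule (as in the tests below):
  ///
  /// ```ignore
  /// // `read_at` does not skip the skippable whitespace token:
  /// assert_eq!(
  ///   recognizer.read_at(" hello", 0),
  ///   Token::from_text("WS", " ", 0).skip(true)
  /// );
  /// ```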
  pub fn read_at<'a>(&self, source: &'a str, position: usize) -> Token<'a, K> {
    if position >= source.len() {
      return Token::eof(source);
    }

    let matches = self.re_set.matches(&source[position..]);

    if !matches.matched_any() {
      // There were no matches. Read ahead until one is found, and
      // construct an appropriate `unrecognized` token
      // Advance one character at a time; stepping a single byte could land
      // inside a multi-byte UTF-8 code point and panic when slicing.
      let mut next_position = position + 1;
      while !source.is_char_boundary(next_position) {
        next_position += 1;
      }
      let next = self.read_at(source, next_position);
      let (sequence, next) = if next.kind.is_eof() {
        (&source[position..], next)
      } else if next.kind.is_unrecognized() {
        (&source[position..next.end], *next.next.unwrap())
      } else {
        (&source[position..next.start], next)
      };

      return Token::new(
        source,
        K::unrecognized(),
        position,
        position + sequence.len(),
      )
      .next(Some(Box::new(next)));
    }

    // We returned early when nothing matched, so there is at least one match
    // to unwrap here; `iter()` yields pattern indices in ascending order, so
    // the first is the earliest-registered (highest-priority) rule.
    let first_idx = matches.iter().next().unwrap();
    let rule = &self.rules[first_idx];
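    // A `RegexSet` reports *which* patterns matched, but not *where*; re-run
    // the winning rule's own regex to recover the match bounds (and the
    // captures, when the rule asks for them).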
    let (re_match, maybe_captures) = if rule.capture {
      let cap = rule.re.captures(&source[position..]).unwrap();
      (cap.get(0).unwrap(), Some(cap))
    } else {
      let re_match = rule.re.find(&source[position..]).unwrap();
      (re_match, None)
    };

    let start = position + re_match.start();
    let end = position + re_match.end();

    Token::new(source, rule.kind.clone(), start, end)
      .skip(rule.skip)
      .captures(maybe_captures.map(|captures| {
        captures
          .iter()
          .map(|maybe_capture| maybe_capture.map(|capture| (capture.start(), capture.end())))
          .collect()
      }))
  }

  /// Like `read_at`, but skips over skippable tokens, recording them on the
  /// token it returns
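  ///
  /// # Example
  ///
  /// A sketch, assuming a recognizer built with a `NUM` rule and a skippable
  /// `WS` rule (as in the tests below):
  ///
  /// ```ignore
  /// let token = recognizer.scan("1 2 3", 1);
  /// assert_eq!(
  ///   token,
  ///   Token::from_text("NUM", "2", 2)
  ///     .skipped(vec![Token::from_text("WS", " ", 1).skip(true)])
  /// );
  /// ```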
  pub fn scan<'a>(&self, source: &'a str, mut position: usize) -> Token<'a, K> {
    let mut skipped = vec![];
    let token = loop {
      let token = self.read_at(source, position);
      position = token.end;
      if !token.skip {
        break token;
      }
      skipped.push(token);
    };
    token.skipped(skipped)
  }

  /// Returns a lexer over the given source, starting at the given position
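  ///
  /// # Example
  ///
  /// A sketch (same `NUM`/`WS` recognizer as above); the lexer implements
  /// `Iterator`, so it can be collected:
  ///
  /// ```ignore
  /// let tokens: Vec<_> = recognizer.into_lexer("1 2 3", 0).collect();
  /// assert_eq!(tokens.len(), 3); // three NUM tokens; the whitespace is skipped
  /// ```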
  pub fn into_lexer<'a>(self, source: &'a str, position: usize) -> Lexer<'a, K> {
    Lexer {
      source,
      position,
      recognizer: Rc::new(self),
    }
  }
}

/// A `Lexer` is a stateful wrapper around a `Recognizer`, i.e. one that keeps
/// track of its current position in the source.
#[derive(Clone, Debug)]
pub struct Lexer<'a, K: TokenKind> {
  pub source: &'a str,
  pub position: usize,
  pub recognizer: Rc<Recognizer<K>>,
}
impl<'a, K: TokenKind> Lexer<'a, K> {
  /// Returns the next token as `Ok(token)` if it matches the predicate,
  /// otherwise returns it as `Err(token)`; the position only advances on `Ok`
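  ///
  /// # Example
  ///
  /// A sketch (same `NUM`/`WS` recognizer as above):
  ///
  /// ```ignore
  /// let mut lexer = recognizer.into_lexer("1 2 3", 0);
  /// assert!(lexer.next_of(|t| t.kind == "NUM").is_ok()); // consumes "1"
  /// assert!(lexer.next_of(|t| t.kind == "ID").is_err()); // "2" stays put
  /// ```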
  pub fn next_of<F>(&mut self, f: F) -> Result<Token<'a, K>, Token<'a, K>>
  where
    F: Fn(&Token<'_, K>) -> bool,
  {
    let token = self.recognizer.scan(self.source, self.position);
    if f(&token) {
      self.position = token.end;
      Ok(token)
    } else {
      Err(token)
    }
  }

  /// Returns the next token as `Ok(token)` if its kind is in the given list,
  /// otherwise returns it as `Err(token)`
  pub fn next_of_kinds<S>(&mut self, kinds: S) -> Result<Token<'a, K>, Token<'a, K>>
  where
    S: AsRef<[K]>,
  {
    let kinds = kinds.as_ref();
    self.next_of(|t| kinds.contains(&t.kind))
  }

  /// Returns the next token as `Ok(token)` if it matches the given kind,
  /// otherwise returns it as `Err(token)`
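  ///
  /// # Example
  ///
  /// A sketch (same `NUM`/`WS` recognizer as above):
  ///
  /// ```ignore
  /// let mut lexer = recognizer.into_lexer("1 2 3", 0);
  /// assert!(lexer.next_of_kind("ID").is_err()); // "1" is a NUM; not consumed
  /// assert!(lexer.next_of_kind("NUM").is_ok()); // still available
  /// ```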
  pub fn next_of_kind(&mut self, kind: K) -> Result<Token<'a, K>, Token<'a, K>> {
    self.next_of_kinds(&[kind])
  }
}
impl<'a, K: TokenKind> std::iter::Iterator for Lexer<'a, K> {
  type Item = Token<'a, K>;
  fn next(&mut self) -> Option<Self::Item> {
    let token = self.recognizer.scan(self.source, self.position);
    self.position = token.end;

    if token.kind.is_eof() {
      None
    } else {
      Some(token)
    }
  }
}

// tests {{{
#[cfg(test)]
mod tests {
  use super::*;

  #[test]
  fn test_read_at() {
    let recognizer = RecognizerBuilder::<&str>::new()
      .token(Rule::new("ID", "[A-Za-z]+").unwrap())
      .token(Rule::new("WS", "\\s+").unwrap().skip(true))
      .build();

    let token = recognizer.read_at(" hello", 0);
    assert_eq!(token, Token::from_text("WS", " ", 0).skip(true));

    let token = recognizer.read_at(" hello", 1);
    assert_eq!(token, Token::from_text("ID", "hello", 1));

    let token = recognizer.read_at(" 123 hello", 1);
    assert_eq!(
      token,
      Token::from_text("UNRECOGNIZED", "123", 1)
        .next(Some(Box::new(Token::from_text("WS", " ", 4).skip(true))))
    );

    let token = recognizer.read_at(" 123", 1);
    assert_eq!(
      token,
      Token::from_text("UNRECOGNIZED", "123", 1)
        .next(Some(Box::new(Token::from_text("EOF", "", 4))))
    );
  }

  #[test]
  fn test_lexer_iter() {
    let recognizer = RecognizerBuilder::<&str>::new()
      .token(Rule::new("NUM", "[0-9]+").unwrap())
      .token(Rule::new("WS", "\\s+").unwrap().skip(true))
      .build();

    let tokens: Vec<_> = recognizer.into_lexer("1 2 3", 0).collect();
    assert_eq!(
      tokens,
      vec![
        Token::from_text("NUM", "1", 0),
        Token::from_text("NUM", "2", 2).skipped(vec![Token::from_text("WS", " ", 1).skip(true)]),
        Token::from_text("NUM", "3", 4).skipped(vec![Token::from_text("WS", " ", 3).skip(true)]),
      ]
    )
  }

  #[test]
  fn test_capture() {
    let recognizer = RecognizerBuilder::new()
      .token(
        Rule::new("FLOAT", r"(\d+)(?:\.(\d+))?")
          .unwrap()
          .capture(true),
      )
      .build();

    let mut lexer = recognizer.into_lexer("1.234", 0);
    let t = lexer.next().unwrap();
    assert_eq!(
      t.get_capture(0).unwrap(),
      TokenCapture {
        text: "1.234",
        start: t.start,
        end: t.end
      }
    );
    assert_eq!(
      t.get_capture(1).unwrap(),
      TokenCapture {
        text: "1",
        start: 0,
        end: 1,
      }
    );
    assert_eq!(
      t.get_capture(2).unwrap(),
      TokenCapture {
        text: "234",
        start: 2,
        end: 5
      }
    );
  }
}
// }}}