use crate::parsing::{ExtractedToken, Token};
use crate::text::FullText;
/// The output of running the lexer over an input string.
///
/// Bundles the extracted tokens with a `FullText` handle to the original
/// source, so downstream passes can slice spans back out of the input.
/// The `'t` lifetime ties both to the borrowed source text.
#[derive(Debug, Clone)]
pub struct Tokenization<'t> {
// Tokens produced by the lexer, in source order.
tokens: Vec<ExtractedToken<'t>>,
// Handle to the complete input text the tokens were extracted from.
full_text: FullText<'t>,
}
impl<'t> Tokenization<'t> {
    /// Borrows the token list produced by the lexer, in source order.
    #[inline]
    pub fn tokens<'r>(&'r self) -> &'r [ExtractedToken<'t>] {
        self.tokens.as_slice()
    }

    /// Returns the handle to the full input text (cheap copy).
    #[inline]
    pub(crate) fn full_text(&self) -> FullText<'t> {
        self.full_text
    }
}
impl<'t> From<Tokenization<'t>> for Vec<ExtractedToken<'t>> {
#[inline]
fn from(tokenization: Tokenization<'t>) -> Vec<ExtractedToken<'t>> {
tokenization.tokens
}
}
/// Runs the lexer over `text` and packages the result.
///
/// Extracts all tokens from the input and pairs them with a `FullText`
/// handle to the original string, borrowed for the lifetime of `text`.
pub fn tokenize(text: &str) -> Tokenization<'_> {
    info!(
        "Running lexer on text ({} bytes) to produce tokens",
        text.len(),
    );

    Tokenization {
        tokens: Token::extract_all(text),
        full_text: FullText::new(text),
    }
}
#[cfg(test)]
mod test {
use super::*;
use proptest::prelude::*;

// Property test: tokenizing arbitrary input must never panic.
// Runs 4096 generated cases; marked #[ignore] because it is slow,
// so it only runs when explicitly requested (e.g. `cargo test -- --ignored`).
proptest! {
#![proptest_config(ProptestConfig::with_cases(4096))]
#[test]
#[ignore = "slow test"]
fn tokenizer_prop(s in ".*") {
// Result is discarded: we only care that tokenize() completes.
let _ = tokenize(&s);
}
}
}