pub mod base_tokenization;
pub mod common;
pub mod line_classification;
pub mod line_grouping;
pub mod transformations;

pub use base_tokenization::tokenize;
pub use common::{LexError, Lexer, LexerOutput};
pub use crate::lex::token::{LineContainer, LineToken, LineType, Token};
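
/// Returns `source` with a trailing newline appended if it is non-empty and
/// does not already end with one, so downstream stages can assume every
/// logical line is newline-terminated.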
pub fn ensure_source_ends_with_newline(source: &str) -> String {
    if !source.is_empty() && !source.ends_with('\n') {
        format!("{source}\n")
    } else {
        source.to_string()
    }
}
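
/// Applies the semantic-indentation transformation to a raw token stream,
/// replacing leading-whitespace runs with `Indent`/`Dedent` tokens and
/// wrapping any transformation failure in `LexError::Transformation`.
///
/// A sketch of the intended call order (marked `ignore` because it assumes
/// crate-internal context not available to doctests):
///
/// ```ignore
/// let source = ensure_source_ends_with_newline("1. Title\n    body");
/// let raw = base_tokenization::tokenize(&source);
/// let tokens = lex(raw)?;
/// ```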
pub fn lex(
    tokens: Vec<(Token, std::ops::Range<usize>)>,
) -> Result<Vec<(Token, std::ops::Range<usize>)>, LexError> {
    use crate::lex::lexing::transformations::semantic_indentation::SemanticIndentationMapper;
    let mut mapper = SemanticIndentationMapper::new();
    mapper
        .map(tokens)
        .map_err(|e| LexError::Transformation(e.to_string()))
}
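
// The tests below drive the full pipeline: normalize the trailing newline,
// run `base_tokenization::tokenize`, then apply `lex`, asserting on the
// exact `(Token, Range<usize>)` output.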
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lex::testing::factories::mk_tokens;

    fn lex_helper(source: &str) -> Vec<(Token, std::ops::Range<usize>)> {
        let source_with_newline = ensure_source_ends_with_newline(source);
        let token_stream = base_tokenization::tokenize(&source_with_newline);
        lex(token_stream).expect("lex failed")
    }
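
    // Expected values are written as `(Token, start, end)` byte offsets;
    // `mk_tokens` converts them into the `(Token, Range<usize>)` pairs that
    // `lex` returns.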
    #[test]
    fn test_paragraph_pattern() {
        let input = "This is a paragraph.\nIt has multiple lines.";
        let tokens = lex_helper(input);
        assert_eq!(
            tokens,
            mk_tokens(&[
                (Token::Text("This".to_string()), 0, 4),
                (Token::Whitespace(1), 4, 5),
                (Token::Text("is".to_string()), 5, 7),
                (Token::Whitespace(1), 7, 8),
                (Token::Text("a".to_string()), 8, 9),
                (Token::Whitespace(1), 9, 10),
                (Token::Text("paragraph".to_string()), 10, 19),
                (Token::Period, 19, 20),
                (Token::BlankLine(Some("\n".to_string())), 20, 21),
                (Token::Text("It".to_string()), 21, 23),
                (Token::Whitespace(1), 23, 24),
                (Token::Text("has".to_string()), 24, 27),
                (Token::Whitespace(1), 27, 28),
                (Token::Text("multiple".to_string()), 28, 36),
                (Token::Whitespace(1), 36, 37),
                (Token::Text("lines".to_string()), 37, 42),
                (Token::Period, 42, 43),
                (Token::BlankLine(Some("\n".to_string())), 43, 44),
            ])
        );
    }

    #[test]
    fn test_list_pattern() {
        let input = "- First item\n- Second item";
        let tokens = lex_helper(input);
        assert_eq!(
            tokens,
            mk_tokens(&[
                (Token::Dash, 0, 1),
                (Token::Whitespace(1), 1, 2),
                (Token::Text("First".to_string()), 2, 7),
                (Token::Whitespace(1), 7, 8),
                (Token::Text("item".to_string()), 8, 12),
                (Token::BlankLine(Some("\n".to_string())), 12, 13),
                (Token::Dash, 13, 14),
                (Token::Whitespace(1), 14, 15),
                (Token::Text("Second".to_string()), 15, 21),
                (Token::Whitespace(1), 21, 22),
                (Token::Text("item".to_string()), 22, 26),
                (Token::BlankLine(Some("\n".to_string())), 26, 27),
            ])
        );
    }
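
    // `Indent`/`Dedent` are synthesized by the semantic-indentation pass and
    // carry a zero-width `(0, 0)` span; the whitespace they consume lives in
    // the nested `Indentation` token's range.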
    #[test]
    fn test_session_pattern() {
        let input = "1. Session Title\n    Content here";
        let tokens = lex_helper(input);
        assert_eq!(
            tokens,
            mk_tokens(&[
                (Token::Number("1".to_string()), 0, 1),
                (Token::Period, 1, 2),
                (Token::Whitespace(1), 2, 3),
                (Token::Text("Session".to_string()), 3, 10),
                (Token::Whitespace(1), 10, 11),
                (Token::Text("Title".to_string()), 11, 16),
                (Token::BlankLine(Some("\n".to_string())), 16, 17),
                (Token::Indent(vec![(Token::Indentation, 17..21)]), 0, 0),
                (Token::Text("Content".to_string()), 21, 28),
                (Token::Whitespace(1), 28, 29),
                (Token::Text("here".to_string()), 29, 33),
                (Token::BlankLine(Some("\n".to_string())), 33, 34),
                (Token::Dedent(vec![]), 0, 0),
            ])
        );
    }

    #[test]
    fn test_lex_marker_pattern() {
        let input = "Some text :: marker";
        let tokens = lex_helper(input);
        assert_eq!(
            tokens,
            mk_tokens(&[
                (Token::Text("Some".to_string()), 0, 4),
                (Token::Whitespace(1), 4, 5),
                (Token::Text("text".to_string()), 5, 9),
                (Token::Whitespace(1), 9, 10),
                (Token::LexMarker, 10, 12),
                (Token::Whitespace(1), 12, 13),
                (Token::Text("marker".to_string()), 13, 19),
                (Token::BlankLine(Some("\n".to_string())), 19, 20),
            ])
        );
    }
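
    // Smoke test: prints the lexed token kinds for manual inspection and
    // relies on `lex_helper`'s `expect` to fail the test on lex errors.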
    #[test]
    fn test_lex_indented_marker() {
        let input = " ::";
        let tokens = lex_helper(input);
        let token_kinds: Vec<Token> = tokens.iter().map(|(t, _)| t.clone()).collect();
        println!("LEXING Tokens: {token_kinds:?}");
    }

    #[test]
    fn test_mixed_content_pattern() {
        let input = "1. Session\n    - Item 1\n    - Item 2\n\nParagraph after.";
        let tokens = lex_helper(input);
        assert_eq!(
            tokens,
            mk_tokens(&[
                (Token::Number("1".to_string()), 0, 1),
                (Token::Period, 1, 2),
                (Token::Whitespace(1), 2, 3),
                (Token::Text("Session".to_string()), 3, 10),
                (Token::BlankLine(Some("\n".to_string())), 10, 11),
                (Token::Indent(vec![(Token::Indentation, 11..15)]), 0, 0),
                (Token::Dash, 15, 16),
                (Token::Whitespace(1), 16, 17),
                (Token::Text("Item".to_string()), 17, 21),
                (Token::Whitespace(1), 21, 22),
                (Token::Number("1".to_string()), 22, 23),
                (Token::BlankLine(Some("\n".to_string())), 23, 24),
                (Token::Dash, 28, 29),
                (Token::Whitespace(1), 29, 30),
                (Token::Text("Item".to_string()), 30, 34),
                (Token::Whitespace(1), 34, 35),
                (Token::Number("2".to_string()), 35, 36),
                (Token::BlankLine(Some("\n".to_string())), 36, 37),
                (Token::BlankLine(Some("\n".to_string())), 37, 38),
                (Token::Dedent(vec![]), 0, 0),
                (Token::Text("Paragraph".to_string()), 38, 47),
                (Token::Whitespace(1), 47, 48),
                (Token::Text("after".to_string()), 48, 53),
                (Token::Period, 53, 54),
                (Token::BlankLine(Some("\n".to_string())), 54, 55),
            ])
        );
    }

    #[test]
    fn test_consecutive_blank_lines() {
        let input = "First\n\n\nSecond";
        let tokens = lex_helper(input);
        assert_eq!(
            tokens,
            mk_tokens(&[
                (Token::Text("First".to_string()), 0, 5),
                (Token::BlankLine(Some("\n".to_string())), 5, 6),
                (Token::BlankLine(Some("\n".to_string())), 6, 7),
                (Token::BlankLine(Some("\n".to_string())), 7, 8),
                (Token::Text("Second".to_string()), 8, 14),
                (Token::BlankLine(Some("\n".to_string())), 14, 15),
            ])
        );
    }
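
    // Round-trip property: detokenizing the lexed stream must reproduce the
    // newline-normalized input exactly, including runs of blank lines.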
    #[test]
    fn test_blank_line_round_trip() {
        use crate::lex::formats::detokenizer::detokenize;
        let inputs = vec![
            "First\nSecond",
            "First\n\nSecond",
            "First\n\n\nSecond",
            "First\n\n\n\nSecond",
        ];
        for input in inputs {
            let tokens_with_spans = lex_helper(input);
            let tokens: Vec<Token> = tokens_with_spans.into_iter().map(|(t, _)| t).collect();
            let detokenized = detokenize(&tokens);
            let expected = ensure_source_ends_with_newline(input);
            assert_eq!(
                detokenized, expected,
                "Round-trip failed for input: {input:?}"
            );
        }
    }
}