use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write as _;
use crate::grammar::{CharClassItem, GrammarExpr, GrammarRule, RuleKind};
use crate::source::ByteRange;
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum CharCategory {
Letter,
Digit,
Whitespace,
Delimiter,
Punctuation,
Other,
}
#[must_use]
pub fn categorise(value: char) -> CharCategory {
if value.is_whitespace() {
CharCategory::Whitespace
} else if is_delimiter(value) {
CharCategory::Delimiter
} else if value.is_alphabetic() {
CharCategory::Letter
} else if value.is_numeric() {
CharCategory::Digit
} else if value.is_ascii_punctuation() {
CharCategory::Punctuation
} else {
CharCategory::Other
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Token {
pub text: String,
pub category: CharCategory,
pub span: ByteRange,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LexicalConfig {
pub max_closed_forms: usize,
}
impl Default for LexicalConfig {
fn default() -> Self {
Self {
max_closed_forms: 12,
}
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct LexicalModel {
pub alphabet: Vec<char>,
pub classes: Vec<GrammarRule>,
pub config: LexicalConfig,
}
impl LexicalModel {
#[must_use]
pub fn tokenize(&self, text: &str) -> Vec<Token> {
tokenize_text(text, self.config)
}
}
#[must_use]
pub fn infer_lexical_classes(corpus: &[&str], config: &LexicalConfig) -> LexicalModel {
let mut alphabet = BTreeSet::new();
let mut forms_by_category = BTreeMap::<CharCategory, BTreeMap<String, usize>>::new();
for text in corpus {
alphabet.extend(text.chars());
for token in tokenize_text(text, *config) {
*forms_by_category
.entry(token.category)
.or_default()
.entry(token.text)
.or_default() += 1;
}
}
LexicalModel {
alphabet: alphabet.into_iter().collect(),
classes: infer_classes(&forms_by_category, *config),
config: *config,
}
}
fn tokenize_text(text: &str, _config: LexicalConfig) -> Vec<Token> {
let mut tokens = Vec::new();
let mut token_start = 0;
let mut token_end = 0;
let mut token_category = None;
for (index, value) in text.char_indices() {
let category = categorise(value);
let value_end = index + value.len_utf8();
let Some(current_category) = token_category else {
token_start = index;
token_end = value_end;
token_category = Some(category);
continue;
};
if continues_token(current_category, category) {
token_end = value_end;
} else {
tokens.push(Token {
text: text[token_start..token_end].to_string(),
category: current_category,
span: ByteRange::new(token_start, token_end),
});
token_start = index;
token_end = value_end;
token_category = Some(category);
}
}
if let Some(category) = token_category {
tokens.push(Token {
text: text[token_start..token_end].to_string(),
category,
span: ByteRange::new(token_start, token_end),
});
}
tokens
}
fn continues_token(current: CharCategory, next: CharCategory) -> bool {
if is_atomic(current) || is_atomic(next) {
return false;
}
current == next || (current == CharCategory::Letter && next == CharCategory::Digit)
}
const fn is_atomic(category: CharCategory) -> bool {
matches!(
category,
CharCategory::Delimiter | CharCategory::Punctuation
)
}
const fn is_delimiter(value: char) -> bool {
matches!(value, '(' | ')' | '[' | ']' | '{' | '}' | '\'' | '"' | '`')
}
fn infer_classes(
forms_by_category: &BTreeMap<CharCategory, BTreeMap<String, usize>>,
config: LexicalConfig,
) -> Vec<GrammarRule> {
let mut classes = Vec::new();
for (category, forms) in forms_by_category {
match category {
CharCategory::Delimiter | CharCategory::Punctuation => {
classes.extend(forms.keys().map(|form| literal_rule(form)));
}
CharCategory::Digit | CharCategory::Whitespace => {
classes.push(open_rule(*category, forms.keys().map(String::as_str)));
}
CharCategory::Letter | CharCategory::Other => {
classes.extend(infer_mixed_category(*category, forms, config));
}
}
}
classes
}
fn infer_mixed_category(
category: CharCategory,
forms: &BTreeMap<String, usize>,
config: LexicalConfig,
) -> Vec<GrammarRule> {
let has_repeated_forms = forms.values().any(|count| *count > 1);
let has_singleton_forms = forms.values().any(|count| *count == 1);
if forms.len() <= config.max_closed_forms && !(has_repeated_forms && has_singleton_forms) {
return forms.keys().map(|form| literal_rule(form)).collect();
}
let mut rules = Vec::new();
let mut open_forms = Vec::new();
let mut closed_forms = 0;
for (form, count) in forms {
if *count > 1 && closed_forms < config.max_closed_forms {
rules.push(literal_rule(form));
closed_forms += 1;
} else {
open_forms.push(form.as_str());
}
}
if !open_forms.is_empty() {
rules.push(open_rule(category, open_forms));
}
rules
}
fn literal_rule(form: &str) -> GrammarRule {
GrammarRule::new(literal_rule_name(form), GrammarExpr::terminal(form))
.with_kind(RuleKind::Token)
}
fn literal_rule_name(form: &str) -> String {
let mut name = String::from("literal");
for value in form.chars() {
name.push('_');
if value.is_ascii_alphanumeric() {
name.push(value.to_ascii_lowercase());
} else {
name.push('u');
write!(name, "{:04x}", u32::from(value)).expect("writing to a String cannot fail");
}
}
name
}
fn open_rule<'a>(category: CharCategory, forms: impl IntoIterator<Item = &'a str>) -> GrammarRule {
GrammarRule::new(open_rule_name(category), open_expr(category, forms))
.with_kind(RuleKind::Token)
}
const fn open_rule_name(category: CharCategory) -> &'static str {
match category {
CharCategory::Letter => "identifier",
CharCategory::Digit => "integer",
CharCategory::Whitespace => "whitespace",
CharCategory::Delimiter => "delimiter",
CharCategory::Punctuation => "punctuation",
CharCategory::Other => "other",
}
}
fn open_expr<'a>(category: CharCategory, forms: impl IntoIterator<Item = &'a str>) -> GrammarExpr {
let forms = forms.into_iter().collect::<Vec<_>>();
match category {
CharCategory::Letter => identifier_expr(&forms),
CharCategory::Digit if all_ascii_digits(&forms) => {
GrammarExpr::one_or_more(GrammarExpr::char_range('0', '9'))
}
CharCategory::Digit | CharCategory::Whitespace | CharCategory::Other => {
GrammarExpr::one_or_more(char_set_expr(&all_chars(&forms)))
}
CharCategory::Delimiter | CharCategory::Punctuation => {
GrammarExpr::choice(false, forms.into_iter().map(GrammarExpr::terminal))
}
}
}
fn identifier_expr(forms: &[&str]) -> GrammarExpr {
let mut first_chars = BTreeSet::new();
let mut rest_chars = BTreeSet::new();
for form in forms {
let mut chars = form.chars();
if let Some(first) = chars.next() {
first_chars.insert(first);
rest_chars.extend(chars);
}
}
let first = char_set_expr(&first_chars);
if rest_chars.is_empty() {
first
} else {
GrammarExpr::sequence([first, GrammarExpr::zero_or_more(char_set_expr(&rest_chars))])
}
}
fn all_ascii_digits(forms: &[&str]) -> bool {
forms
.iter()
.flat_map(|form| form.chars())
.all(|value| value.is_ascii_digit())
}
fn all_chars(forms: &[&str]) -> BTreeSet<char> {
forms.iter().flat_map(|form| form.chars()).collect()
}
fn char_set_expr(chars: &BTreeSet<char>) -> GrammarExpr {
GrammarExpr::char_class(false, char_class_items(chars))
}
fn char_class_items(chars: &BTreeSet<char>) -> Vec<CharClassItem> {
let mut items = Vec::new();
let mut chars = chars.iter().copied();
let Some(mut range_start) = chars.next() else {
return items;
};
let mut range_end = range_start;
for value in chars {
if u32::from(value) != u32::from(range_end).saturating_add(1) {
push_char_class_item(&mut items, range_start, range_end);
range_start = value;
}
range_end = value;
}
push_char_class_item(&mut items, range_start, range_end);
items
}
fn push_char_class_item(items: &mut Vec<CharClassItem>, start: char, end: char) {
if start == end {
items.push(CharClassItem::char(start));
} else {
items.push(CharClassItem::range(start, end));
}
}