1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
#![cfg_attr(feature = "strict_docs", allow(missing_docs))]
//! Parse table compression using Tree-sitter's encoding scheme.
use crate::{Result, TableGenError};
use adze_glr_core::{Action, ParseTable};
use adze_ir::{StateId, SymbolId};
use std::collections::{BTreeMap, HashMap};
/// Compressed parse table representation.
///
/// At present this only records the dimensions of a parse table; it holds no
/// action or goto data.
#[derive(Debug, Clone)]
pub struct CompressedParseTable {
    /// Number of symbols reported by the source parse table.
    symbol_count: usize,
    /// Number of parser states reported by the source parse table.
    state_count: usize,
}
impl CompressedParseTable {
    /// Build a table from explicitly supplied dimensions (intended for tests).
    pub fn new_for_testing(symbol_count: usize, state_count: usize) -> Self {
        Self {
            symbol_count,
            state_count,
        }
    }

    /// Number of symbols recorded for this table.
    pub fn symbol_count(&self) -> usize {
        self.symbol_count
    }

    /// Number of parser states recorded for this table.
    pub fn state_count(&self) -> usize {
        self.state_count
    }

    /// Capture the symbol and state counts of `parse_table`.
    ///
    /// Only the two counts are copied; no table contents are read.
    pub fn from_parse_table(parse_table: &ParseTable) -> Self {
        let symbol_count = parse_table.symbol_count;
        let state_count = parse_table.state_count;
        Self {
            symbol_count,
            state_count,
        }
    }
}
// Removed: This From implementation was returning dummy empty tables.
// Compression is now handled by TableCompressor::compress() method directly.
/// Complete compressed tables for Tree-sitter
///
/// Bundles the compressed ACTION and GOTO tables together with the threshold
/// of the compressor that produced them.
#[derive(Debug, Clone)]
pub struct CompressedTables {
    /// Compressed ACTION table.
    pub action_table: CompressedActionTable,
    /// Compressed GOTO table.
    pub goto_table: CompressedGotoTable,
    /// Small-table threshold the producing compressor was configured with.
    pub small_table_threshold: usize,
}
impl CompressedTables {
/// Validate compressed tables against original parse table
///
/// NOTE: this is currently a stub. It performs no checks, ignores
/// `_parse_table`, and always returns `Ok(())`, so a successful result says
/// nothing about whether the tables are consistent.
#[must_use = "validation result must be checked"]
pub fn validate(&self, _parse_table: &ParseTable) -> Result<()> {
// TODO: Implement validation logic
// For now, just return Ok to make tests compile
Ok(())
}
}
/// Compressed action table
///
/// As produced by `TableCompressor::compress_action_table_small`, the rows
/// (one per parser state) are stored back to back in `data`, and the entries
/// of state `s` are `data[row_offsets[s]..row_offsets[s + 1]]`.
#[derive(Debug, Clone)]
pub struct CompressedActionTable {
/// Explicit (column, action) entries of all states, concatenated in state order.
pub data: Vec<CompressedActionEntry>,
/// Start offset into `data` for each state, followed by one final end offset.
pub row_offsets: Vec<u16>,
/// One default action per state.
pub default_actions: Vec<Action>,
}
/// Entry in the action table
///
/// NOTE(review): this type is not constructed anywhere in this file; it has
/// the same fields as `CompressedActionEntry`.
#[derive(Debug, Clone)]
pub struct ActionEntry {
/// Symbol the action applies to (presumably a column index; confirm against callers).
pub symbol: u16,
/// Parser action for `symbol`.
pub action: Action,
}
/// Compressed action entry
///
/// One explicit action of one cell in the compressed ACTION table.
#[derive(Debug, Clone)]
pub struct CompressedActionEntry {
/// Column of the cell; `compress_action_table_small` stores the column
/// index here, not the original symbol id.
pub symbol: u16,
/// Parser action stored in that cell.
pub action: Action,
}
impl CompressedActionEntry {
/// Create a new compressed action entry
///
/// * `symbol` - column the action belongs to.
/// * `action` - parser action to store.
pub fn new(symbol: u16, action: Action) -> Self {
Self { symbol, action }
}
}
/// Compressed goto table
///
/// As produced by `TableCompressor::compress_goto_table_small`, each row of
/// the goto table is run-length encoded and the encoded rows are stored back
/// to back in `data`.
#[derive(Debug, Clone)]
pub struct CompressedGotoTable {
/// Encoded entries of all rows, concatenated in row order.
pub data: Vec<CompressedGotoEntry>,
/// Start offset into `data` for each row, followed by one final end offset.
pub row_offsets: Vec<u16>,
}
/// Entry in the goto table
///
/// NOTE(review): this type is not constructed anywhere in this file.
pub struct GotoEntry {
/// Symbol the transition is taken on.
pub symbol: SymbolId,
/// Target state of the transition (presumably a raw state id; confirm against callers).
pub state: u16,
}
/// Compressed goto entry with run-length encoding
///
/// `compress_goto_table_small` emits `RunLength` for runs of more than two
/// equal states and one `Single` per element for shorter runs.
#[derive(Debug, Clone)]
pub enum CompressedGotoEntry {
/// One occurrence of a target state.
Single(u16),
/// `count` consecutive occurrences of the same target `state`.
RunLength { state: u16, count: u16 },
}
/// Lossless compressor for LR(1) parse tables produced by `glr-core`.
///
/// The compressor packs the ACTION/GOTO matrices into compact columnar
/// representations while preserving all transitions.
#[derive(Debug, Clone)]
pub struct TableCompressor {
    /// Tree-sitter's magic constant for compression: parse tables with fewer
    /// states than this take the "small table" path.
    small_table_threshold: usize,
}
/// `Default` yields the same compressor as `TableCompressor::new`.
impl Default for TableCompressor {
/// Delegates to `TableCompressor::new`.
fn default() -> Self {
Self::new()
}
}
impl TableCompressor {
/// Create a new compressor with default thresholds.
///
/// The small-table threshold is set to 32768.
#[must_use]
pub fn new() -> Self {
Self {
small_table_threshold: 32768, // Tree-sitter's threshold
}
}
/// Encode an action for small tables
#[must_use = "encoding result must be checked"]
pub fn encode_action_small(&self, action: &Action) -> Result<u16> {
match action {
Action::Shift(state) => {
if state.0 >= 0x8000 {
return Err(TableGenError::Compression(format!(
"Shift state {} too large for small table encoding",
state.0
)));
}
Ok(state.0)
}
Action::Reduce(rule) => {
if rule.0 >= 0x4000 {
return Err(TableGenError::Compression(format!(
"Reduce rule {} too large for small table encoding",
rule.0
)));
}
// Reduce actions are encoded with high bit set
// bit 15: 1 (indicates reduce)
// bits 14-0: rule_id (1-based)
// Tree-sitter uses 1-based production IDs
Ok(0x8000 | (rule.0 + 1))
}
Action::Accept => Ok(0xFFFF),
Action::Error => Ok(0xFFFE),
Action::Recover => Ok(0xFFFD), // Use distinct value for Recover
Action::Fork(_) => {
// GLR fork points need special handling
// For now, treat as error
Ok(0xFFFE)
}
_ => {
// Unknown action type // Expected: V for Recover
crate::util::unexpected_action(action, "encode_action_as_u16");
Ok(0xFFFE)
}
}
}
/// Compress a parse table into compact ACTION/GOTO forms.
///
/// * `parse_table` — LR(1) automaton from `glr-core`.
/// * `token_indices` — column indices including EOF from `collect_token_indices`.
/// * `start_can_be_empty` — whether the start symbol is nullable.
///
/// Returns compressed tables suitable for embedding.
///
/// ```ignore
/// # use adze_ir::builder::GrammarBuilder;
/// # use adze_glr_core::{FirstFollowSets, build_lr1_automaton};
/// # use adze_tablegen::{TableCompressor, helpers::{collect_token_indices, eof_accepts_or_reduces}};
/// # let g = GrammarBuilder::new("demo").start("module").build();
/// # let ff = FirstFollowSets::compute(&g);
/// # let pt = build_lr1_automaton(&g, &ff).unwrap();
/// # let token_ix = collect_token_indices(&g, &pt);
/// let compressed = TableCompressor::new()
/// .compress(&pt, &token_ix, eof_accepts_or_reduces(&pt))
/// .unwrap();
/// # let _ = compressed;
/// ```
///
/// # Breaking Change Note
/// This function signature changed to include `token_indices` and `start_can_be_empty` parameters
/// to properly handle nullable start symbols and GLR multi-action cells.
pub fn compress(
&self,
parse_table: &ParseTable,
token_indices: &[usize],
start_can_be_empty: bool,
) -> Result<CompressedTables> {
// Convert token_indices to FxHashSet for O(1) membership checks with better performance
use rustc_hash::FxHashSet;
// Debug assertions to verify invariants (zero cost in release builds)
debug_assert!(
token_indices.windows(2).all(|w| w[0] < w[1]),
"token_indices must be strictly increasing (sorted and deduped)"
);
// Only assert EOF presence if the parse table actually exposes an EOF mapping
// Don't assume EOF is at column 0 - derive it from symbol_to_index using the actual eof_symbol
if let Some(&eof_idx) = parse_table.symbol_to_index.get(&parse_table.eof_symbol) {
debug_assert!(
token_indices.contains(&eof_idx),
"token_indices must contain EOF column (derived from symbol_to_index)"
);
}
let token_set: FxHashSet<usize> = token_indices.iter().copied().collect();
// Fetch EOF column index once and reuse it everywhere
// Use parse_table.eof_symbol instead of hardcoded SymbolId(0) since EOF symbol
// is computed as max_symbol + 1 in build_lr1_automaton
let eof_idx = *parse_table
.symbol_to_index
.get(&parse_table.eof_symbol)
.ok_or_else(|| TableGenError::InvalidTable(
format!("EOF (symbol {}) not found in symbol_to_index map - this is a critical invariant violation", parse_table.eof_symbol.0)
))?;
// Validation: Ensure state 0 has at least one token shift action
// This catches the "state 0 bug" where no tokens can be shifted from the initial state
if let Some(state0_actions) = parse_table.action_table.first() {
// Check if any token column has a shift action
let has_token_shift = token_indices.iter().any(|&idx| {
state0_actions
.get(idx)
.is_some_and(|cell| cell.iter().any(|a| matches!(a, Action::Shift(_))))
});
// If no token shifts, and start is nullable, allow ACCEPT/REDUCE on EOF column
let eof_ok = !has_token_shift
&& start_can_be_empty
&& state0_actions.get(eof_idx).is_some_and(|cell| {
cell.iter()
.any(|a| matches!(a, Action::Accept | Action::Reduce(_)))
});
if !has_token_shift && !eof_ok {
// Provide detailed debugging info
let mut debug_info = String::new();
// Show expected token columns
debug_info.push_str(&format!(
"Expected token columns (first 12): {:?}\n",
token_indices.iter().take(12).collect::<Vec<_>>()
));
debug_info.push_str(&format!("Start can be empty: {}\n", start_can_be_empty));
// Show the actual state-0 actions
debug_info.push_str("State 0 actions (first 12 columns):\n");
#[allow(clippy::needless_range_loop)]
for idx in 0..state0_actions.len().min(12) {
let cell = &state0_actions[idx];
// Prefer labeling by EOF column equality rather than symbol id
let symbol_info = if idx == eof_idx {
"EOF".to_string()
} else {
parse_table
.symbol_to_index
.iter()
.find(|(_, i)| **i == idx)
.map(|(sym_id, _)| format!("sym_{}", sym_id.0))
.unwrap_or_else(|| "unmapped".to_string())
};
let type_str = if idx == eof_idx || token_set.contains(&idx) {
"TOKEN"
} else {
"NT"
};
let action_str = if cell.is_empty() {
"[]".to_string()
} else {
format!("{:?}", cell)
};
debug_info.push_str(&format!(
" Col {:2} ({:8} {:5}): {}\n",
idx, symbol_info, type_str, action_str
));
}
// Provide actionable guidance
debug_info.push_str("\nPossible causes:\n");
debug_info.push_str("1. Pattern wrappers not desugared to unit rules\n");
debug_info
.push_str("2. Token symbols not properly registered in symbol_to_index\n");
debug_info.push_str("3. Grammar start symbol issues\n");
return Err(TableGenError::Compression(format!(
"State 0 validation failed: No valid token shift actions found.\n{}",
debug_info
)));
}
}
// Additional sanity guards
if parse_table.action_table.is_empty() {
return Err(TableGenError::Compression(
"Empty action table - grammar has no parse states".to_string(),
));
}
if parse_table.state_count == 0 {
return Err(TableGenError::Compression(
"State count is 0 - invalid parse table".to_string(),
));
}
// Determine if we should use small table optimization
let use_small_table = parse_table.state_count < self.small_table_threshold;
if use_small_table {
self.compress_small_table(parse_table)
} else {
self.compress_large_table(parse_table)
}
}
/// Compress using Tree-sitter's "small table" optimization
///
/// Compresses the ACTION table first, then the GOTO table, and bundles both
/// with this compressor's threshold. The first failing step's error is
/// returned unchanged.
fn compress_small_table(&self, parse_table: &ParseTable) -> Result<CompressedTables> {
    let action_table = self
        .compress_action_table_small(&parse_table.action_table, &parse_table.symbol_to_index)?;
    let goto_table = self.compress_goto_table_small(&parse_table.goto_table)?;
    let small_table_threshold = self.small_table_threshold;
    Ok(CompressedTables {
        action_table,
        goto_table,
        small_table_threshold,
    })
}
/// Compress using large table optimization
///
/// No dedicated large-table encoding exists yet: this forwards to
/// `compress_small_table` and returns its result unchanged.
fn compress_large_table(&self, parse_table: &ParseTable) -> Result<CompressedTables> {
// For now, use the same as small table
self.compress_small_table(parse_table)
}
/// Compress action table using Tree-sitter's small table format
pub fn compress_action_table_small(
&self,
action_table: &[Vec<Vec<Action>>],
symbol_to_index: &BTreeMap<SymbolId, usize>,
) -> Result<CompressedActionTable> {
let mut entries = Vec::new();
let mut row_offsets = Vec::new();
let mut default_actions = Vec::new();
// Create inverse mapping from index to symbol ID
let mut index_to_symbol = HashMap::new();
for (&symbol_id, &index) in symbol_to_index {
index_to_symbol.insert(index, symbol_id);
}
for action_row in action_table.iter() {
// Find the most common action across all cells
let mut action_counts: HashMap<Action, usize> = HashMap::new();
let mut _has_shift = false;
let mut _has_accept = false;
// Collect all actions from all cells in this row
for action_cell in action_row {
for action in action_cell {
*action_counts.entry(action.clone()).or_insert(0) += 1;
match action {
Action::Shift(_) => _has_shift = true,
Action::Accept => _has_accept = true,
_ => {}
}
}
}
let _most_common = action_counts
.iter()
.max_by_key(|(_, count)| *count)
.map(|(action, _)| action.clone())
.unwrap_or(Action::Error);
// Default action optimization is currently disabled by design.
// The runtime does not use the default_actions array, so we encode all actions explicitly
// and populate default_actions with Action::Error as a placeholder.
// This ensures no information is lost during compression and all actions are available at runtime.
// Future: Could optimize by implementing default action support in the runtime decoder.
let default_action = Action::Error;
default_actions.push(default_action.clone());
row_offsets.push(entries.len() as u16);
for (index, action_cell) in action_row.iter().enumerate() {
// Process each action in the cell
for action in action_cell {
if action == &Action::Error {
// Still skip explicit Error actions to save space
continue;
}
// Use the mapped index directly, not the original symbol ID
// This ensures terminals (index < token_count) are correctly identified
let symbol_id = index as u16;
entries.push(CompressedActionEntry {
symbol: symbol_id,
action: action.clone(),
});
}
}
}
row_offsets.push(entries.len() as u16);
// Validate row_offsets are strictly increasing
for i in 1..row_offsets.len() {
if row_offsets[i] < row_offsets[i - 1] {
return Err(TableGenError::Compression(format!(
"Row offsets not strictly increasing at index {}: {} < {}",
i,
row_offsets[i],
row_offsets[i - 1]
)));
}
}
// Validate map length matches state count
if row_offsets.len() != action_table.len() + 1 {
return Err(TableGenError::Compression(format!(
"Row offsets length {} doesn't match state count {} + 1",
row_offsets.len(),
action_table.len()
)));
}
Ok(CompressedActionTable {
data: entries,
row_offsets,
default_actions,
})
}
/// Compress goto table
pub fn compress_goto_table_small(
&self,
goto_table: &[Vec<StateId>],
) -> Result<CompressedGotoTable> {
let mut entries = Vec::new();
let mut row_offsets = Vec::new();
for row in goto_table {
row_offsets.push(entries.len() as u16);
let mut last_state = None;
let mut run_length = 0;
for &state_id in row {
if last_state == Some(state_id.0) {
run_length += 1;
} else {
if run_length > 0 {
// SAFETY: run_length > 0 implies last_state was set
let state = last_state.expect("run_length > 0 implies last_state is set");
// Emit previous run
if run_length > 2 {
entries.push(CompressedGotoEntry::RunLength {
state,
count: run_length,
});
} else {
// For short runs, individual entries are more efficient
for _ in 0..run_length {
entries.push(CompressedGotoEntry::Single(state));
}
}
}
last_state = Some(state_id.0);
run_length = 1;
}
}
if run_length > 0 {
let state = last_state.expect("run_length > 0 implies last_state is set");
if run_length > 2 {
entries.push(CompressedGotoEntry::RunLength {
state,
count: run_length,
});
} else {
for _ in 0..run_length {
entries.push(CompressedGotoEntry::Single(state));
}
}
}
}
row_offsets.push(entries.len() as u16);
Ok(CompressedGotoTable {
data: entries,
row_offsets,
})
}
// Removed in 0.8.0 - use compress(parse_table, token_indices, start_can_be_empty)
// See MIGRATING.md for migration guide
}
#[cfg(test)]
mod tests {
    use super::*;
    use adze_glr_core::Action;
    use adze_ir::{RuleId, StateId};

    /// The testing constructor stores both counts verbatim.
    #[test]
    fn test_compressed_parse_table_creation() {
        let table = CompressedParseTable::new_for_testing(10, 20);
        assert_eq!((table.symbol_count(), table.state_count()), (10, 20));
    }

    /// `from_parse_table` copies the symbol and state counts of its input.
    #[test]
    fn test_compressed_parse_table_from_parse_table() {
        let parse_table = crate::test_helpers::test::make_minimal_table(
            vec![vec![vec![]; 5]; 10], // 10 states, 5 symbols
            vec![vec![crate::test_helpers::test::INVALID; 5]; 10],
            vec![],
            SymbolId(2), // start_symbol
            SymbolId(1), // eof_symbol (must be > 0)
            0,           // external_token_count
        );

        let compressed = CompressedParseTable::from_parse_table(&parse_table);

        assert_eq!(compressed.symbol_count(), 5);
        assert_eq!(compressed.state_count(), 10);
    }

    /// `CompressedActionEntry::new` keeps the symbol and the action it is given.
    #[test]
    fn test_compressed_action_entry() {
        let entry = CompressedActionEntry::new(42, Action::Shift(StateId(5)));

        assert_eq!(entry.symbol, 42);
        assert!(
            matches!(entry.action, Action::Shift(StateId(5))),
            "Expected shift action"
        );
    }

    /// A freshly created compressor has a non-zero small-table threshold.
    #[test]
    fn test_table_compressor_creation() {
        let compressor = TableCompressor::new();
        assert_ne!(compressor.small_table_threshold, 0);
    }

    /// States without any cells produce offsets and defaults but no entries.
    #[test]
    fn test_compress_empty_action_table() {
        let compressor = TableCompressor::new();
        let action_table = vec![vec![]; 5]; // 5 empty states
        let symbol_to_index = BTreeMap::new();

        let compressed = compressor
            .compress_action_table_small(&action_table, &symbol_to_index)
            .expect("compressing empty states should succeed");

        assert_eq!(compressed.row_offsets.len(), 6); // n_states + 1
        assert_eq!(compressed.default_actions.len(), 5);
        assert!(compressed.data.is_empty());
    }

    /// A row of identical reduces is still encoded entry by entry.
    #[test]
    fn test_compress_action_table_with_default_reduce() {
        let compressor = TableCompressor::new();
        let reduce_action = Action::Reduce(RuleId(1));
        // All same reduce action in ActionCells
        let action_table = vec![vec![vec![reduce_action.clone()]; 10]];
        let symbol_to_index = BTreeMap::new();

        let compressed = compressor
            .compress_action_table_small(&action_table, &symbol_to_index)
            .expect("compressing a uniform reduce row should succeed");

        // Default action optimization is disabled, so default should be Error
        assert_eq!(
            compressed.default_actions[0],
            Action::Error,
            "Default action optimization disabled"
        );
        // All 10 reduce actions should be explicitly encoded
        assert_eq!(
            compressed.data.len(),
            10,
            "All reduce actions should be explicitly encoded"
        );
    }

    /// Three equal goto targets in a row collapse into one run-length entry.
    #[test]
    fn test_compress_goto_table_with_runs() {
        let compressor = TableCompressor::new();
        let row: Vec<StateId> = [1u16, 1, 1, 2, 2].iter().copied().map(StateId).collect();
        let goto_table = vec![row];

        let compressed = compressor
            .compress_goto_table_small(&goto_table)
            .expect("compressing a goto row should succeed");

        assert!(!compressed.data.is_empty());
        // Should have a run length entry for the three 1s
        let run_of_three_ones = compressed
            .data
            .iter()
            .filter(|entry| matches!(entry, CompressedGotoEntry::RunLength { state: 1, count: 3 }))
            .count();
        assert!(run_of_three_ones > 0);
    }

    /// `validate` accepts empty tables against a minimal parse table.
    #[test]
    fn test_compressed_tables_validation() {
        let action_table = CompressedActionTable {
            data: vec![],
            row_offsets: vec![],
            default_actions: vec![],
        };
        let goto_table = CompressedGotoTable {
            data: vec![],
            row_offsets: vec![],
        };
        let tables = CompressedTables {
            action_table,
            goto_table,
            small_table_threshold: 32768,
        };
        let parse_table = crate::test_helpers::test::make_minimal_table(
            vec![vec![vec![]]], // 1 state, 1 symbol (minimum required)
            vec![vec![crate::test_helpers::test::INVALID]], // 1 state, 1 symbol
            vec![],             // 0 rules
            SymbolId(1),        // start_symbol
            SymbolId(1),        // eof_symbol (must be >= 1)
            0,                  // external_token_count
        );

        assert!(tables.validate(&parse_table).is_ok());
    }
}