libgrammstein 0.1.0

Hybrid language model (N-gram + Embeddings) for WFST text correction
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
//! Domain events for Google Books import progress.
//!
//! These events are TUI-agnostic and describe what happened in domain terms.
//! Any UI (TUI, progress bar, logging) can subscribe and render appropriately.
//!
//! ## Architecture
//!
//! The import logic emits domain events via a broadcast channel. Multiple
//! subscribers can consume these events:
//! - TUI: Renders progress dashboard with ratatui
//! - Logging: Writes to file or console
//! - Metrics: Collects statistics for later analysis
//!
//! Commands flow in the opposite direction, from UI to the importer:
//! - Pause/Resume: Gracefully pause workers
//! - Cancel: Stop import with checkpoint
//! - ForceQuit: Stop import without saving a checkpoint
//! - SetParallelism: Adjust worker count at runtime

use std::time::Duration;

/// Domain events emitted during import.
///
/// These events describe what happened without any UI-specific details.
/// The TUI (or any other subscriber) maps these to its own state.
///
/// Events compare by value via `PartialEq`, so subscribers and tests can
/// assert on exact payloads. `Eq` is intentionally not derived because some
/// variants carry floating-point fields.
#[derive(Clone, Debug, PartialEq)]
pub enum ImportEvent {
    /// Import started for a specific n-gram order.
    OrderStarted {
        /// N-gram order being imported.
        order: u8,
        /// Total number of files scheduled for the order.
        total_files: u64,
    },

    /// An order completed successfully.
    OrderCompleted {
        /// N-gram order that completed.
        order: u8,
        /// Number of n-grams processed for the order.
        ngram_count: u64,
        /// Wall-clock duration for the order.
        duration: Duration,
    },

    /// Worker began downloading a prefix file.
    WorkerStarted {
        /// Worker ID that started the job.
        worker_id: usize,
        /// N-gram order being processed (1-5).
        order: u8,
        /// Prefix assigned to the worker.
        prefix: String,
    },

    /// Worker download progress (bytes received).
    WorkerProgress {
        /// Worker ID reporting progress.
        worker_id: usize,
        /// Bytes downloaded so far.
        bytes_downloaded: u64,
        /// Total compressed bytes, or `None` if the server did not report a
        /// content length.
        total_bytes: Option<u64>,
    },

    /// Worker n-gram processing progress (periodic update).
    WorkerNgramProgress {
        /// Worker ID reporting progress.
        worker_id: usize,
        /// Number of n-grams processed by the worker.
        ngram_count: u64,
    },

    /// Worker finished processing a file.
    WorkerFinished {
        /// Worker ID that completed the job.
        worker_id: usize,
        /// N-gram order that was processed (1-5).
        order: u8,
        /// Prefix that was processed.
        prefix: String,
        /// Number of n-grams processed.
        ngram_count: u64,
        /// Wall-clock duration for the job.
        duration: Duration,
    },

    /// Per-order progress update for TUI multi-order display.
    ///
    /// Emitted periodically to update the TUI with per-order progress,
    /// enabling display of multiple concurrent orders.
    OrderProgress {
        /// N-gram order (1-5).
        order: u8,
        /// Files completed for this order (success + skipped).
        files_completed: u64,
        /// Total files for this order.
        total_files: u64,
        /// N-grams processed for this order.
        ngrams_processed: u64,
        /// Whether this order is fully complete.
        is_complete: bool,
        /// Files successfully completed (for green progress bar segment).
        files_succeeded: u64,
        /// Files skipped/failed (for yellow progress bar segment, will retry next session).
        files_skipped: u64,
    },

    /// Worker is retrying after transient error.
    WorkerRetrying {
        /// Worker ID retrying the job.
        worker_id: usize,
        /// Prefix being retried.
        prefix: String,
        /// Current retry attempt.
        attempt: u32,
        /// Maximum retry attempts before the prefix is marked failed.
        max_attempts: u32,
        /// Human-readable retry reason.
        error: String,
    },

    /// Worker exited (shutdown signal received or queue empty).
    ///
    /// Emitted when a worker task exits, either because it received a shutdown
    /// signal (parallelism decreased) or because the job queue is empty.
    WorkerExited {
        /// Worker ID that exited.
        worker_id: usize,
    },

    /// Periodic statistics update.
    StatsSnapshot {
        /// Number of files completed across active orders.
        files_completed: u64,
        /// Total number of files scheduled.
        total_files: u64,
        /// Total n-grams processed.
        total_ngrams: u64,
        /// Unique n-grams seen so far.
        unique_ngrams: u64,
        /// Current processing throughput.
        ngrams_per_second: f64,
        /// Elapsed import duration.
        elapsed: Duration,
    },

    /// Checkpoint saved.
    CheckpointSaved {
        /// Prefix whose checkpoint was persisted.
        prefix: String,
    },

    /// Checkpoint progress update.
    ///
    /// Emitted during async checkpoint finish phase to track progress
    /// of persisting individual shards.
    CheckpointProgress {
        /// Number of shards that have been checkpointed.
        shards_processed: usize,
        /// Total number of shards to checkpoint.
        total_shards: usize,
        /// Completion percentage (0.0 to 100.0).
        percent_complete: f32,
    },

    /// Import completed (all orders).
    ImportCompleted {
        /// Total n-grams processed across all orders.
        total_ngrams: u64,
        /// Total wall-clock duration for the import.
        duration: Duration,
    },

    /// Import cancelled by user.
    ImportCancelled,

    /// Import paused.
    ImportPaused,

    /// Import resumed.
    ImportResumed,

    /// Fatal error occurred during import.
    ///
    /// This event is sent when an unrecoverable error occurs (e.g., network
    /// failure after retries exhausted, disk I/O error). The TUI should
    /// display this prominently and allow the user to quit gracefully.
    Error {
        /// Human-readable error message.
        message: String,
    },

    /// Log message (for debugging/info).
    Log {
        /// Severity level for the log message.
        level: LogLevel,
        /// Human-readable log text.
        message: String,
    },

    /// A prefix file failed after exhausting all retries.
    ///
    /// This event indicates that a prefix could not be processed and will
    /// be skipped for the current run. The prefix is marked as failed in
    /// the checkpoint and will be retried on subsequent runs.
    PrefixFailed {
        /// N-gram order (1-5).
        order: u8,
        /// The prefix that failed.
        prefix: String,
        /// Human-readable error message.
        error: String,
        /// Number of retry attempts made.
        attempts: u32,
    },

    /// Previously failed prefixes are being retried.
    ///
    /// Emitted at the start of an order when there are failed prefixes
    /// from a previous run that will be retried.
    RetryingFailedPrefixes {
        /// N-gram order (1-5).
        order: u8,
        /// Number of failed prefixes being retried.
        count: usize,
        /// List of prefix strings being retried.
        prefixes: Vec<String>,
    },

    /// In-progress prefixes detected on resume (partial data).
    ///
    /// Emitted when resuming and detecting prefixes that were being
    /// processed when the previous run crashed. These prefixes have
    /// potentially partial data that will be cleared before retrying.
    RecoveringInProgressPrefixes {
        /// N-gram order (1-5).
        order: u8,
        /// Number of in-progress prefixes being recovered.
        count: usize,
        /// List of prefix strings with partial data.
        prefixes: Vec<String>,
    },

    /// Job requeued with exponential backoff (current session retry).
    ///
    /// Emitted when a job encounters a retryable error and is requeued
    /// with a delay (exponential backoff). These jobs will be retried
    /// within the current session after the backoff delay expires.
    DeferredRetry {
        /// The prefix being retried.
        prefix: String,
        /// Current attempt number (1-indexed).
        attempt: u32,
        /// N-gram order (1-5).
        order: u8,
    },

    /// Deferred retry job started processing (decrement backoff queue).
    ///
    /// Emitted when a job that was previously deferred (requeued with backoff)
    /// starts processing again. This allows the TUI to decrement the backoff
    /// queue counter.
    DeferredRetryStarted {
        /// The prefix starting its retry.
        prefix: String,
        /// N-gram order (1-5).
        order: u8,
    },

    /// Merge phase started (sharded storage only).
    ///
    /// Emitted when the shard merge phase begins after n-gram collection completes.
    MergeStarted {
        /// Number of shards to merge.
        shard_count: usize,
        /// Estimated total n-grams across all shards.
        estimated_ngrams: u64,
    },

    /// Merge progress update.
    ///
    /// Emitted periodically during the merge phase to update progress display.
    MergeProgress {
        /// Number of shards processed so far.
        shards_processed: usize,
        /// Total number of shards.
        total_shards: usize,
        /// N-grams merged so far.
        ngrams_merged: u64,
        /// Completion percentage (0.0 to 100.0).
        percent_complete: f32,
    },

    /// Merge completed successfully.
    ///
    /// Emitted when all shards have been merged into the final output.
    MergeCompleted {
        /// Total n-grams in merged output.
        total_ngrams: u64,
        /// Bytes written to output file.
        bytes_written: u64,
        /// Duration of merge phase.
        duration: Duration,
    },

    /// Merge failed with error.
    ///
    /// Emitted when merge encounters an unrecoverable error.
    MergeFailed {
        /// Human-readable error message.
        error: String,
    },

    /// Shard cleanup started.
    ///
    /// Emitted when temporary shard files are being deleted after merge.
    ShardCleanupStarted {
        /// Number of shard files to delete.
        shard_count: usize,
    },

    /// Shard cleanup completed.
    ///
    /// Emitted when all temporary shard files have been deleted.
    ShardCleanupCompleted {
        /// Number of shard files deleted.
        shards_deleted: usize,
        /// Total bytes freed by deletion.
        bytes_freed: u64,
    },

    /// All work completed - triggers completion dialog.
    ///
    /// Emitted after import, merge (if applicable), and cleanup are all done.
    /// The TUI should display a completion dialog requiring user acknowledgment.
    AllWorkCompleted {
        /// Total n-grams in final output.
        total_ngrams: u64,
        /// Total duration of entire import process.
        total_duration: Duration,
        /// Whether shard files were kept (--keep-shards flag).
        shards_kept: bool,
    },

    /// MKN statistics computation started.
    ///
    /// Emitted when Modified Kneser-Ney statistics computation begins.
    MknStarted {
        /// Source of n-gram data for MKN computation.
        /// Either "shards" (parallel computation) or "single_trie" (sequential).
        source: String,
        /// Estimated total n-grams to process.
        estimated_ngrams: u64,
    },

    /// MKN computation progress update.
    ///
    /// Emitted periodically during MKN statistics computation.
    MknProgress {
        /// Current phase (1 = collecting pairs, 2 = writing stats).
        phase: u8,
        /// Total phases.
        total_phases: u8,
        /// Items processed in current phase.
        items_processed: u64,
        /// Estimated total items in current phase.
        total_items: u64,
        /// Completion percentage (0.0 to 100.0).
        percent_complete: f32,
    },

    /// MKN computation completed successfully.
    ///
    /// Emitted when MKN statistics have been computed and written.
    MknCompleted {
        /// Number of continuation count entries written.
        continuation_entries: u64,
        /// Number of frequency count entries written.
        frequency_entries: u64,
        /// Duration of MKN computation.
        duration: Duration,
    },

    /// MKN computation failed.
    ///
    /// Emitted when MKN computation encounters an unrecoverable error.
    MknFailed {
        /// Human-readable error message.
        error: String,
    },

    /// Import phase changed.
    ///
    /// Emitted when the import pipeline transitions to a new phase.
    /// Enables TUI to display current phase and track progress.
    PhaseChanged {
        /// Name of the new phase (e.g., "Downloading", "Computing MKN Statistics").
        phase: String,
    },
}

/// Log level for log events.
///
/// Variants are declared in ascending severity, so the derived ordering is
/// `Debug < Info < Warn < Error`. This allows threshold filtering such as
/// `level >= LogLevel::Warn`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum LogLevel {
    /// Debug-level diagnostic message.
    Debug,
    /// Informational message.
    Info,
    /// Warning message.
    Warn,
    /// Error message.
    Error,
}

/// Commands sent to control the import.
///
/// Commands flow from the UI (or any controller) to the importer.
/// The importer processes these asynchronously and emits corresponding events.
///
/// Commands compare by value (`PartialEq`/`Eq`), so a receiver or a test can
/// check exactly which command was issued.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ImportCommand {
    /// Pause all workers (graceful, waits for current n-gram).
    Pause,

    /// Resume paused workers.
    Resume,

    /// Cancel import (save checkpoint first).
    Cancel,

    /// Force quit without saving checkpoint.
    ForceQuit,

    /// Adjust parallelism at runtime.
    ///
    /// The payload is the requested worker count.
    SetParallelism(usize),
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Cloning an event must preserve both its variant and its payload.
    #[test]
    fn event_is_clone() {
        let event = ImportEvent::WorkerStarted {
            worker_id: 0,
            order: 2,
            prefix: "th".to_string(),
        };
        let cloned = event.clone();

        // Compare through `Debug` output so the check needs only the
        // `Clone` and `Debug` derives.
        assert_eq!(format!("{cloned:?}"), format!("{event:?}"));
        assert!(matches!(
            cloned,
            ImportEvent::WorkerStarted {
                worker_id: 0,
                order: 2,
                ref prefix,
            } if prefix == "th"
        ));
    }

    /// Cloning a command must yield the same variant.
    #[test]
    fn command_is_clone() {
        let cmd = ImportCommand::Pause;
        let cloned = cmd.clone();
        assert!(matches!(cloned, ImportCommand::Pause));

        let resized = ImportCommand::SetParallelism(4);
        assert!(matches!(resized.clone(), ImportCommand::SetParallelism(4)));
    }

    /// `OrderProgress` carries every counter through a clone unchanged.
    #[test]
    fn order_progress_event() {
        let event = ImportEvent::OrderProgress {
            order: 2,
            files_completed: 50,
            total_files: 676,
            ngrams_processed: 1_000_000,
            is_complete: false,
            files_succeeded: 48,
            files_skipped: 2,
        };
        let cloned = event.clone();

        assert_eq!(format!("{cloned:?}"), format!("{event:?}"));
        assert!(matches!(
            cloned,
            ImportEvent::OrderProgress {
                order: 2,
                files_completed: 50,
                total_files: 676,
                ngrams_processed: 1_000_000,
                is_complete: false,
                files_succeeded: 48,
                files_skipped: 2,
            }
        ));
    }
}