lling-llang 0.1.0

WFST framework for text normalization and grammar correction
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
//! # lling-llang
//!
//! A Weighted Finite State Transducer (WFST) framework for text normalization
//! and grammar correction.
//!
//! ## Overview
//!
//! lling-llang provides:
//! - **Semirings**: Algebraic weight structures (Tropical, Log, Probability, Boolean, Product, String, Expectation)
//! - **WFSTs**: Weighted finite state transducers with composition operators
//! - **Lattices**: Weighted DAGs for representing correction alternatives
//! - **CFG Parsing**: Earley parser modified for lattice input
//! - **Extensible Layers**: Plugin architecture for correction pipelines
//!
//! ## Feature Flags
//!
//! - `default`: Standalone WFST framework with no external dependencies
//! - `levenshtein`: Integration with liblevenshtein for lexical correction
//! - `pos-tagging`: POS tagging layer support
//! - `lm-rerank`: Language model reranking layer support
//! - `f1r3fly`: Full F1R3FLY.io integration (PathMap, MORK, MeTTaTron, MeTTaIL)
//! - `sexpr`: S-expression path format for MORK compatibility
//! - `serde`: Serialization support
//!
//! ## Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────────┐
//! │                        Correction Layer Stack                           │
//! ├─────────────────────────────────────────────────────────────────────────┤
//! │  Layer N: [User-Defined]           ← Implement CorrectionLayer trait    │
//! │     ↑                                                                   │
//! │  Layer 3: CFG Grammar              ← Syntactic filtering                │
//! │     ↑                                                                   │
//! │  Layer 1: Lexical Correction       ← Levenshtein + phonetic candidates  │
//! │     ↑                                                                   │
//! │  [Input Lattice]                                                        │
//! └─────────────────────────────────────────────────────────────────────────┘
//! ```
//!
//! ## Example
//!
//! ```rust,ignore
//! use lling_llang::prelude::*;
//!
//! // Build a correction lattice
//! let mut builder = LatticeBuilder::<TropicalWeight, _>::new(backend);
//! builder.add_correction(0, 1, "the", TropicalWeight(1.0), EdgeMetadata::default());
//! builder.add_correction(0, 1, "teh", TropicalWeight(0.0), EdgeMetadata::default());
//! let lattice = builder.build(2);
//!
//! // Extract best path
//! let best = lattice.best_path();
//! ```

#![warn(missing_docs)]
#![warn(rustdoc::missing_crate_level_docs)]
// === Clippy policy allows ===
// Generic-semiring code uses explicit `.clone()` to document intent and to
// stay correct when a future Semiring impl is Clone-but-not-Copy. Concrete
// semirings (LogWeight, TropicalWeight, ...) all happen to be Copy today.
#![allow(clippy::clone_on_copy)]
// `&*x` and `&x` patterns are common in this crate's iterator-heavy code
// where adding/removing borrows would require touching many call sites.
#![allow(clippy::needless_borrow)]
// Range-indexed loops are preferred over `iter().enumerate()` in numeric
// algorithm code where the index is the primary value (alpha[s], distances[i]).
#![allow(clippy::needless_range_loop)]
// Very-complex types appear in lazy-WFST plumbing where the type alias would
// obscure rather than clarify.
#![allow(clippy::type_complexity)]
// `if let Some(x) = ...` nested in `match` arms is often the clearest way to
// express weighted-transition decisions in algorithm code; collapsing them
// makes the algebra harder to read.
#![allow(clippy::collapsible_if, clippy::collapsible_match)]
// `(x + n - 1) / n` and `x % n == 0` are the textbook idioms in this crate's
// algorithmic code, often appearing inside comments referencing the formula.
#![allow(clippy::manual_div_ceil, clippy::manual_is_multiple_of)]
// `s[1..]` / `&s[..s.len()-1]` patterns appear in tight tokenization loops
// where the manual form matches the surrounding indexing arithmetic.
#![allow(clippy::manual_strip)]
// `x as u32` on values already typed as `u32` survives generic refactors
// (e.g. `StateId` aliasing) and documents the intent at the call site.
#![allow(clippy::unnecessary_cast)]
// `Foo { ..Default::default() }` is more readable than full struct init for
// many config types in this crate.
#![allow(clippy::field_reassign_with_default)]
// `0..=255u8` and similar appear as `b'\0'..=b'\xFF'` deliberately to spell
// out the full byte range.
#![allow(clippy::almost_complete_range)]
// `iter().enumerate().map(|(_, x)| ...)` survives index-related refactors.
#![allow(clippy::unused_enumerate_index)]
// `or_insert_with(Vec::new)` is identical to `or_default()` but reads as
// "insert an empty Vec", which matches the surrounding code in this crate.
#![allow(clippy::unwrap_or_default)]
// `>= n + 1` patterns appear in inequality chains where keeping the symmetric
// form aids legibility.
#![allow(clippy::int_plus_one)]
// Boolean-comparison-to-true patterns occur in proptest predicates where the
// explicit form documents that the value is a bool, not e.g. an Option<bool>.
#![allow(clippy::bool_comparison, clippy::nonminimal_bool)]
// `.contains_key + .insert` is structurally a `.entry().or_insert()` pair but
// the explicit form makes the absence path observable to the reader.
#![allow(clippy::map_entry)]
// `.expect(format!(...))` in tests is legible; the lazy_format dance is noise.
#![allow(clippy::expect_fun_call)]
// `manual RangeInclusive::contains` matches the surrounding comparison style.
#![allow(clippy::manual_range_contains)]
// `Default::default` redundant closures are fine; explicit constructor calls
// document the type being defaulted.
#![allow(clippy::redundant_closure)]
// `from_str` on inherent impls is a deliberate API choice (some types support
// fallible parsing via Result and a separate non-FromStr signature).
#![allow(clippy::should_implement_trait)]
// Remaining stylistic lints that are deliberate codebase patterns:
#![allow(
    // `for k in map.iter()` over `for k in map.keys()` documents that the value is intentionally ignored.
    clippy::for_kv_map,
    // `.iter().any(|x| x == &needle)` documents the comparator; `contains` hides it for non-Copy types.
    clippy::manual_contains,
    // `* 1.0` and similar appear in benchmark fixtures as load-bearing scaffolding.
    clippy::no_effect,
    // `map_or` chains stay because the call sites compose with other map_or chains.
    clippy::unnecessary_map_or,
    // Wide-arity builder/decoder functions are part of public API; renaming/grouping would break callers.
    clippy::too_many_arguments,
    // `vec![x].clone()` in tests reads more naturally than `std::slice::from_ref`.
    clippy::single_element_loop, clippy::redundant_slicing,
    // Reference-of-both-operands patterns appear in proptest predicates where keeping both sides borrowed avoids move warnings.
    clippy::op_ref,
    // Internal `module/module.rs` layouts (e.g. lattice/lattice.rs) are intentional for the public type sharing the module name.
    clippy::module_inception,
    // `match Option { Some(x) => x, None => Default::default() }` documents intent better than `unwrap_or_default`.
    clippy::manual_unwrap_or_default,
    // `(x as char) == ...` casts appear in tokenization for documentation purposes.
    clippy::single_char_pattern,
    // sort_by closure pattern is fine when the key extraction has side-effect-free arithmetic.
    clippy::unnecessary_sort_by,
    // `.max(lo).min(hi)` vs `.clamp(lo, hi)` is a wash; both are legible.
    clippy::manual_clamp,
    // `text.len() == 1` reads more naturally than `text.chars().count() == 1` for ASCII inputs.
    clippy::comparison_to_empty,
)]
// Doc-format lints triggered by intentional README-style markdown in module docs.
#![allow(
    rustdoc::redundant_explicit_links,
    clippy::doc_overindented_list_items,
    clippy::doc_lazy_continuation,
    clippy::empty_line_after_doc_comments
)]

pub mod acoustic;
pub mod algorithms;
pub mod asr;
pub mod backend;
pub mod cfg;
pub mod composition;
pub mod ctc;
pub mod differentiable;
pub mod error_models;
pub mod gpu;
pub mod lattice;
#[cfg(feature = "lattice")]
pub mod lattice_bridge;
pub mod layers;
pub mod llm;
pub mod multilingual;
pub mod multitape;
pub mod optimization;
pub mod path;
pub mod programming;
pub mod pushdown;
pub mod semiring;
pub mod simd;
pub mod subsequential;
pub mod text_processing;
pub mod training;
pub mod transducer;
pub mod tree_transducers;
pub mod wfst;

/// Test utilities for property-based testing and assertions.
///
/// This module provides `proptest` strategies, custom assertions, and
/// common fixtures for testing WFSTs and semirings.
#[cfg(any(test, feature = "test-utils"))]
pub mod test_utils;

// #[cfg(feature = "error-grammar")]
// pub mod error_grammar;

// #[cfg(feature = "sexpr")]
// pub mod sexpr;

// #[cfg(feature = "f1r3fly")]
// pub mod storage;

#[cfg(feature = "levenshtein")]
pub mod integration;

/// Prelude for convenient imports.
pub mod prelude {
    pub use crate::acoustic::{
        // Score fusion
        AcousticLanguageModel,
        // Core trait
        AcousticModel,
        // Posteriors
        FramePosterior,
        FusionConfig,
        HmmStateId,
        PosteriorSequence,
        TransitionLogProb,
        // HMM topology
        TransitionMatrix,
        UnitId,
    };
    pub use crate::algorithms::{
        all_pairs_shortest_distance, single_source_shortest_distance,
        single_source_shortest_distance_with_queue, AutoQueue, FifoQueue, QueueType,
        ShortestDistanceConfig, ShortestDistanceQueue, ShortestFirstQueue, TopologicalQueue,
    };
    pub use crate::asr::{
        chain_factor,
        rescore_lattice,
        AsrCascade,
        AuxiliarySymbol,
        BackoffState,
        CascadeBuilder,
        CascadeConfig,
        Chain,
        ChainFactorConfig,
        ChainFactorResult,
        ChainId,
        ContextDependencyBuilder,
        ContextDependencyConfig,
        ContextState,
        DysfluencyConfig,
        DysfluencyDetector,
        // Dysfluency detection
        DysfluencyPattern,
        DysfluencySpan,
        LatticeGrammar,
        LexiconEntry,
        NgramBuilder,
        NgramConfig,
        NgramOrder,
        NgramTransducer,
        NgramWeight,
        PhoneId,
        RescoreConfig,
        RescorePass,
        RescoreResult,
        SyllableRepetitionBuilder,
        TetraploneBuilder,
        TriphoneBuilder,
        WordRepetitionBuilder,
    };
    pub use crate::backend::{HashMapBackend, LatticeBackend, VocabId};
    pub use crate::cfg::{
        EarleyChart, EarleyParser, EarleyState, ForestNode, ForestNodeId, Grammar, GrammarBuilder,
        GrammarError, NonTerminal, ParseError, ParseForest, ParseTree, Production, RuleId, Symbol,
        SymbolKind, Terminal,
    };
    pub use crate::composition::{
        compose, ComposedPath, CompositionStats, EpsilonFilter, EpsilonFilterType, FilterState,
        FilteredLattice, LazyCfgComposition, LazyComposition, ParseState, ValidPathIterator,
    };
    pub use crate::ctc::{
        compact_ctc,
        correct_ctc,
        minimal_ctc,
        selfless_compact_ctc,
        selfless_correct_ctc,
        // CTC decoding
        CtcDecoder,
        CtcDecoderConfig,
        CtcLabel,
        CtcTopology,
        CtcTopologyInfo,
        DecodingError,
        DecodingResult,
        DecodingStats,
        ObservationFst,
        StreamingCtcDecoder,
        BLANK,
    };
    pub use crate::differentiable::{
        backward, forward_score, log_sum_exp_paths, viterbi_path_with_grad, viterbi_score,
        ArcGradient, GradientAccumulator, GradientWfst, ViterbiGradResult,
    };
    pub use crate::gpu::{
        csr_from_vector_wfst,
        csr_memory_size,
        pack_cost_arc,
        reduce_with_k_vectors,
        unpack_cost_arc,
        AdaptiveBeam,
        BatchedDecoder,
        // Channels/Lanes
        Channel,
        ChannelState,
        CsrArc,
        CsrBuilder,
        CsrState,
        // CSR representation
        CsrWfst,
        DecoderConfig,
        // K-vector reduction
        KVector,
        KVectorConfig,
        KVectorStats,
        Lane,
        LaneState,
        LoadBalancer,
        // Token recombination
        PackedToken,
        RecombinationBuffer,
        SoftPruneBuffer,
        SoftPruneConfig,
        SoftPruneManager,
        SoftPruneStats,
        // Soft pruning
        SoftToken,
        TokenPacker,
        WorkDispatcher,
        // Load balancing
        WorkGroup,
        WorkItem,
        WorkQueue,
    };
    pub use crate::lattice::{
        Edge, EdgeId, EdgeMetadata, Lattice, LatticeBuilder, LatticePath, Node, NodeId,
        PathIterator,
    };
    pub use crate::layers::{
        CfgFilterLayer,
        // Confusion layer
        ConfusionLayer,
        ConfusionLayerConfig,
        ConfusionMatrix,
        CorrectionLayer,
        LayerError,
        LayerPipeline,
        LayerPipelineBuilder,
        LayerResult,
        LayerStats,
    };
    pub use crate::multilingual::{
        CodeSwitchBuilder,
        CodeSwitchConfig,
        CodeSwitchPath,
        // Code-switching transducer
        CodeSwitchTransducer,
        DetectionResult,
        LanguageConfig,
        LanguageDetector,
        // Language types
        LanguageId,
        LanguageModel,
        LanguageSpan,
        SimpleLanguageModel,
        SwitchPoint,
        WordProbability,
    };
    pub use crate::multitape::{
        // Labels and transitions
        MultiTapeLabel,
        MultiTapeState,
        MultiTapeTransition,
        // Traits
        MultiTapeWfst,
        // Builder
        MultiTapeWfstBuilder,
        // Projection and synchronization
        ProjectedWfst,
        SyncConfig,
        SynchronizedMultiTape,
        TapeDelay,
        // Implementations
        VectorMultiTapeWfst,
    };
    pub use crate::optimization::{
        apply_log_push, build_lookahead_table, compute_log_potentials, prepare_for_beam_search,
        BeamSearchPrepResult, LogPushConfig, LookaheadConfig, LookaheadTable,
    };
    pub use crate::path::{
        beam_search, nbest, viterbi, BeamSearchConfig, NBestIterator, ViterbiResult,
    };
    pub use crate::programming::{
        ApiMigrationBuilder,
        ApiMigrationRule,
        ApiMigrationTransducer,
        MigrationResult,
        MigrationStats,
        MigrationType,
        NodeKind,
        ParseResult,
        // Parser backend traits
        ParserBackend,
        ParserError,
        PatternMatcher,
        Position,
        Range,
        RepairAction,
        RepairCandidate,
        ReplacementAction,
        SyntaxNode,
        SyntaxNodeRef,
        SyntaxRepairBuilder,
        SyntaxRepairCosts,
        // Syntax repair
        SyntaxRepairRule,
        SyntaxRepairTransducer,
        // Token patterns
        Token,
        TokenKind,
        TokenPattern,
        TokenPredicate,
        TokenReplacement,
        // API migration
        Version,
        VersionRange,
    };
    pub use crate::pushdown::{
        PdaAcceptMode,
        // Builder
        PdaBuilder,
        PdaConfiguration,
        PdaState,
        // Transitions
        PdaTransition,
        StackAction,
        // Stack operations
        StackSymbol,
        // Implementations
        VectorPda,
        // Traits
        WeightedPda,
    };
    pub use crate::semiring::{
        BoolWeight, DivisibleSemiring, ExpectationWeight, FallibleStarSemiring, GodelWeight,
        LeftStringWeight, LogWeight, ProbabilityWeight, ProductWeight, RightStringWeight, Semiring,
        SignedTropicalWeight, StarDivergenceError, StarSemiring, TropicalWeight,
    };
    pub use crate::subsequential::{
        DecompositionStats,
        PiecewiseBuilder,
        PiecewiseSubsequential,
        // Subsequential transducers
        SubsequentialTransducer,
    };
    pub use crate::tree_transducers::{
        // Ranked alphabet
        RankedAlphabet,
        SimpleAlphabet,
        Symbol as RankedSymbol,
        TransducerState,
        // Tree data structure
        Tree,
        TreeChild,
        TreeNode,
        TreePattern,
        // Rules and patterns
        TreeRule,
        // Builder
        TreeTransducerBuilder,
        TreeTransducerOps,
        VectorTreeTransducer,
        // Transducer trait and implementations
        WeightedTreeTransducer,
    };
    pub use crate::wfst::{
        closure,
        closure_plus,
        compute_max_delay,
        concat,
        has_bounded_delay,
        invert,
        project_input,
        project_output,
        reverse,
        synchronize,
        synchronize_bounded,
        union,
        CachePolicy,
        ClosureSource,
        ClosureWfst,
        ConcatSource,
        ConcatWfst,
        // Unary operations
        InvertSource,
        InvertWfst,
        LazyState,
        LazyWfst,
        LazyWfstWrapper,
        MutableSyncSource,
        MutableWfst,
        ProjectInputWfst,
        ProjectOutputWfst,
        ProjectSource,
        StateId,
        StateSource,
        // Synchronization
        StringDelay,
        SyncSource,
        SyncState,
        SyncWfst,
        // Rational operations
        UnionSource,
        UnionWfst,
        VectorWfst,
        VectorWfstBuilder,
        WeightedTransition,
        Wfst,
        WfstState,
    };
}