1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
//! LaTeX-specific statistical and neural scoring for correction pipelines.
//!
//! This module provides specialized language models and scoring functions
//! for LaTeX documents, designed to integrate with the lling-llang WFST
//! correction framework.
//!
//! # Overview
//!
//! The LaTeX module consists of several components:
//!
//! - **Tokenizer** (`tokenizer`): LaTeX-aware tokenization that distinguishes between
//!   commands, environments, math mode, and text content.
//!
//! - **N-gram Model** (`ngram`): Mode-aware n-gram models trained separately on
//!   command sequences, mathematical expressions, and natural text.
//!
//! - **Embeddings** (`embedding`): LaTeX command and equation embeddings for
//!   semantic similarity scoring.
//!
//! - **Rescorer** (`rescorer`): Neural rescoring using fine-tuned ModernBERT
//!   for mathematical text.
//!
//! - **RAG** (`rag`): Equation retrieval for finding similar correct equations
//!   in a reference corpus.
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────────┐
//! │                         LaTeX Scoring Pipeline                          │
//! ├─────────────────────────────────────────────────────────────────────────┤
//! │                                                                         │
//! │     Input: LaTeX Token Stream                                           │
//! │                 │                                                       │
//! │                 ▼                                                       │
//! │       ┌───────────────────┐                                             │
//! │       │   Mode Detector   │ ← Identifies command/math/text regions      │
//! │       └─────────┬─────────┘                                             │
//! │                 │                                                       │
//! │       ┌─────────┴─────────┬───────────────────┬───────────────────┐     │
//! │       ▼                   ▼                   ▼                   ▼     │
//! │ Command N-gram       Math N-gram         Text N-gram          Equation  │
//! │     Model               Model               Model            Embeddings │
//! │       │                   │                   │                   │     │
//! │       └───────────────────┴─────────┬─────────┴───────────────────┘     │
//! │                                     │                                   │
//! │                                     ▼                                   │
//! │                         Combined Score (weighted)                       │
//! │                                     │                                   │
//! │                                     ▼                                   │
//! │                         Neural Rescore (optional)                       │
//! │                                     │                                   │
//! │                                     ▼                                   │
//! │                                Final Score                              │
//! └─────────────────────────────────────────────────────────────────────────┘
//! ```
//!
//! # Example
//!
//! ```ignore
//! use libgrammstein::latex::{LaTeXTokenizer, LaTeXNgramModel, LaTeXScorer};
//!
//! // Tokenize LaTeX input
//! let tokenizer = LaTeXTokenizer::new();
//! let tokens = tokenizer.tokenize(r"\begin{equation} x^2 + y^2 = z^2 \end{equation}");
//!
//! // Score with mode-aware n-gram model
//! let model = LaTeXNgramModel::load("latex_model.bin")?;
//! let score = model.score(&tokens);
//!
//! // Or use the combined scorer
//! let scorer = LaTeXScorer::builder()
//!     .with_ngram_model(model)
//!     .with_equation_embeddings(embeddings)
//!     .build();
//!
//! let final_score = scorer.score(&tokens);
//! ```
//!
//! # Training
//!
//! Models can be trained on arXiv LaTeX source files:
//!
//! ```ignore
//! use libgrammstein::latex::{LaTeXCorpusReader, LaTeXTrainer};
//!
//! let corpus = LaTeXCorpusReader::from_arxiv_bulk("/path/to/arxiv")?;
//! let trainer = LaTeXTrainer::new()
//!     .ngram_order(5)
//!     .math_weight(2.0)
//!     .command_weight(1.5);
//!
//! let model = trainer.train(corpus)?;
//! model.save("latex_model.bin")?;
//! ```
// Re-export main types
pub use ;
pub use ;
pub use ;
pub use ;
pub use ;
pub use ;