#![allow(clippy::result_large_err)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::type_complexity)]
#![allow(clippy::excessive_nesting)]
#![allow(clippy::should_implement_trait)]
#![allow(clippy::wrong_self_convention)]
#![allow(clippy::field_reassign_with_default)]
#![allow(clippy::len_zero)]
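
//! Tokenizer implementations for the `trustformers` ecosystem.
//!
//! The crate bundles classic subword tokenizers (BPE, WordPiece, Unigram,
//! SentencePiece, tiktoken), character-level tokenizers (char, CANINE),
//! language-specific tokenizers (Arabic, Chinese, Japanese, Korean, Thai),
//! domain tokenizers (code, math, music, chemical, bio, multimodal), and
//! supporting infrastructure for alignment, streaming, coverage analysis,
//! debugging, training, and serialization. Framework integrations (`gpu`,
//! `jax`, `onnx`, `pytorch`, `tensorflow`, `python`) are compiled only when
//! the matching Cargo feature is enabled.
//!
//! A minimal usage sketch. The constructor and encoding method shown here are
//! assumptions, not the confirmed API; consult the individual modules for the
//! actual signatures:
//!
//! ```ignore
//! // Hypothetical API: `new` and `encode` are placeholders for whatever the
//! // `bpe` module actually exposes.
//! let tokenizer = BPETokenizer::new();
//! let tokens = tokenizer.encode("Hello, world!");
//! ```
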
pub mod advanced_vocab_intelligence;
pub mod alignment;
pub mod arabic;
pub mod async_tokenizer;
pub mod benchmark_utils;
pub mod binary_format;
pub mod bio;
pub mod bpe;
pub mod canine;
pub mod char;
pub mod chemical;
pub mod chinese;
pub mod code_tokenizer;
pub mod compressed_vocab;
pub mod coverage;
pub mod custom;
pub mod custom_format;
pub mod fairseq;
#[cfg(feature = "gpu")]
pub mod gpu_tokenization;
pub mod japanese;
#[cfg(feature = "jax")]
pub mod jax;
pub mod korean;
pub mod math_tokenizer;
pub mod messagepack_serialization;
pub mod minimal_perfect_hash;
pub mod mmap_vocab;
pub mod multimodal;
pub mod music;
pub mod normalizer;
#[cfg(feature = "onnx")]
pub mod onnx;
pub mod parallel;
pub mod performance_profiler;
pub mod protobuf_serialization;
#[cfg(feature = "pytorch")]
pub mod pytorch;
pub mod regex_tokenizer;
pub mod sentencepiece;
pub mod sequence_packing;
pub mod shared_vocab_pool;
pub mod simd;
pub mod special_tokens;
pub mod streaming;
pub mod subword_regularization;
#[cfg(feature = "tensorflow")]
pub mod tensorflow;
pub mod test_infrastructure;
pub mod thai;
pub mod tiktoken;
pub mod tokenization_debugger;
pub mod tokenizer;
pub mod training;
pub mod unigram;
pub mod visualization;
pub mod vocab;
pub mod vocab_analyzer;
pub mod wordpiece;
pub mod zero_copy;

#[cfg(feature = "python")]
pub mod python;
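
// The re-exports below flatten the public API at the crate root. Items that
// share a name across modules (e.g. `BenchmarkResult`, `PaddingStrategy`,
// `MemoryStats`, `IssueSeverity`) are renamed with `as` to keep the root
// namespace unambiguous. Feature-gated re-exports stay behind the same Cargo
// features as their modules; a consuming crate opts in via its Cargo.toml,
// e.g. (crate name and version are assumptions):
//
//     [dependencies]
//     trustformers-tokenizers = { version = "*", features = ["onnx", "pytorch"] }
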
pub use advanced_vocab_intelligence::{
    ActionableRecommendation, CompressionAnalysis, CompressionOpportunity,
    CompressionOpportunityType, CrossLingualAnalysis, DeclineToken, DomainAnalysis,
    DomainDistribution, EvolutionAnalysis, EvolutionPrediction, ImplementationDifficulty,
    LanguageCoverage, MergeRisk, MultilingualOpportunity, RecommendationCategory,
    RecommendationPriority, RedundantTokenGroup, RiskAssessment, RiskLevel, SemanticAnalysis,
    SemanticCluster, TrendingToken, VocabIntelligenceAnalyzer, VocabIntelligenceConfig,
    VocabIntelligenceResult,
};
pub use alignment::{
    AlignedSpan, AlignmentConfig, AlignmentEngine, AlignmentStats, TokenAlignment, Word,
};
pub use arabic::{
    ArabicMode, ArabicTokenizer, ArabicTokenizerConfig, MorphologicalAnalysis, TokenizationStats,
};
pub use async_tokenizer::{
    AsyncTokenizer, AsyncTokenizerConfig, AsyncTokenizerWrapper, ConfigurableAsyncTokenizer,
};
pub use benchmark_utils::{
    BenchmarkConfig, BenchmarkResult as TokenizerBenchmarkResult, TokenizerBenchmark,
};
pub use binary_format::{
    BinaryConfig, BinaryHeader, BinarySerializer, BinaryTokenizer, BinaryUtils,
    NormalizationRule as BinaryNormalizationRule, PreTokenizationRule as BinaryPreTokenizationRule,
    TokenizerConverter,
};
pub use bio::{
    BioAnalysis, BioToken, BioTokenMetadata, BioTokenType, BioTokenizer, BioTokenizerConfig,
};
pub use bpe::BPETokenizer;
pub use canine::CanineTokenizer;
pub use char::CharTokenizer;
pub use chemical::{
    ChemicalAnalysis, ChemicalToken, ChemicalTokenMetadata, ChemicalTokenType, ChemicalTokenizer,
    ChemicalTokenizerConfig,
};
pub use chinese::{ChineseTokenizer, ChineseTokenizerConfig};
pub use code_tokenizer::{
    CodeToken, CodeTokenType, CodeTokenizer, CodeTokenizerConfig, CommentPatterns, Language,
    LiteralType, TokenPosition,
};
pub use compressed_vocab::{CompressedVocab, CompressedVocabStats, PrefixTrie};
pub use coverage::{
    CharacterCoverage, CoverageAnalyzer, CoverageConfig, CoverageExample, CoverageReport,
    CoverageReportExporter, CoverageThresholds, CoverageWarning, PerformanceMetrics,
    QualityMetrics, ReportFormat, TokenDistribution, VocabularyCoverage,
};
pub use custom::{CustomVocabTokenizer, CustomVocabTokenizerBuilder};
pub use custom_format::{
    CustomFormatConverter, CustomFormatTokenizer, CustomSpecialToken, CustomToken,
    CustomTokenizerFormat, CustomVocabulary, NormalizationRule as CustomNormalizationRule,
    NormalizationType, PostProcessingRule, PostProcessingType,
    PreTokenizationRule as CustomPreTokenizationRule, PreTokenizationType, SpecialTokenType,
    VocabularyType,
};
pub use fairseq::{FairseqDictionaryBuilder, FairseqTokenizer};
#[cfg(feature = "gpu")]
pub use gpu_tokenization::{
    BatchProcessingConfig, BenchmarkResult as GpuBenchmarkResult, GpuTokenizationBenchmark,
    GpuTokenizationResult, GpuTokenizationStats, GpuTokenizer, GpuTokenizerConfig,
    GpuTokenizerError, KernelOptimization, MemoryOptimization,
    PaddingStrategy as GpuPaddingStrategy,
};
pub use japanese::{JapaneseMode, JapaneseTokenizer, JapaneseTokenizerConfig};
#[cfg(feature = "jax")]
pub use jax::{
    JaxArray, JaxBatch, JaxCompiledTokenizer, JaxConfig, JaxDType, JaxDataIterator, JaxDataset,
    JaxDevice, JaxMesh, JaxPaddingStrategy, JaxSharding, JaxTokenizer, JaxTruncationStrategy,
    JaxUtils,
};
pub use korean::{KoreanMode, KoreanTokenizer, KoreanTokenizerConfig};
pub use math_tokenizer::{
    MathAnalysis, MathToken, MathTokenType, MathTokenizer, MathTokenizerConfig,
};
pub use messagepack_serialization::{
    MessagePackConfig, MessagePackMergeRule, MessagePackNormalizationRule, MessagePackSerializer,
    MessagePackTokenizedInput, MessagePackTokenizerConfig, MessagePackTokenizerMetadata,
    MessagePackUtils, MessagePackVocabEntry,
};
pub use minimal_perfect_hash::{
    EfficiencyComparison, MemoryUsage, MinimalPerfectHash, MinimalPerfectHashConfig,
    MinimalPerfectHashVocab,
};
pub use mmap_vocab::{MemoryStats, MmapVocab, TokenIterator};
pub use multimodal::{
    AudioFrame, FusionStrategy, GraphData, ImagePatch, ModalityType, MultimodalConfig,
    MultimodalInput, MultimodalToken, MultimodalTokenMetadata, MultimodalTokenizedInput,
    MultimodalTokenizer, MultimodalUtils, TableData, VideoFrame,
};
pub use music::{
    MusicAnalysis, MusicToken, MusicTokenMetadata, MusicTokenType, MusicTokenizer,
    MusicTokenizerConfig,
};
#[cfg(feature = "onnx")]
pub use onnx::{
    OnnxAttribute, OnnxDataType, OnnxExportConfig, OnnxModel, OnnxModelMetadata, OnnxNode,
    OnnxOptimizationLevel, OnnxSessionOptions, OnnxTensorData, OnnxTensorInfo,
    OnnxTokenizerExporter, OnnxTokenizerRuntime, OnnxUtils,
};
pub use parallel::{BatchTokenizer, BatchedTokenizedInput, ParallelTokenizer};
pub use performance_profiler::{
    BenchmarkResult, ExportFormat, MemoryStats as ProfilerMemoryStats, PerformanceProfiler,
    ProfilerConfig, ProfilingReport, ProfilingSummary, ThroughputStats, TimingStats,
    TokenizerComparison as ProfilerComparison,
};
pub use protobuf_serialization::{
    ProtobufConvertible, ProtobufExportConfig, ProtobufExporter, ProtobufFormat, ProtobufMergeRule,
    ProtobufNormalizationRule, ProtobufSerializer, ProtobufTokenizedInput,
    ProtobufTokenizerMetadata, ProtobufTokenizerModel, ProtobufVocabEntry,
};
#[cfg(feature = "pytorch")]
pub use pytorch::{
    BatchIterator, PaddingStrategy as PyTorchPaddingStrategy, PyTorchBatch, PyTorchConfig,
    PyTorchDataset, PyTorchTensor, PyTorchTokenizer, PyTorchUtils, TensorDType, TruncationStrategy,
};
pub use sentencepiece::SentencePieceTokenizer;
pub use sequence_packing::{
    AdvancedSequencePacker, PackedSequence, PackingConfig, PackingInfo, PackingStats,
    PackingStrategy, SequencePacker,
};
pub use shared_vocab_pool::{PooledVocab, SharedVocabPool, VocabPoolConfig, VocabPoolStats};
pub use simd::SimdTokenizer;
pub use special_tokens::{
    AdvancedTemplate, ConversationMessage, PlaceholderProcessor, PlaceholderToken, PlaceholderType,
    SpecialTokenConfig, SpecialTokenManager,
};
pub use streaming::{BatchedStreamingTokenizer, StreamingTokenizer, TextFileIterator};
pub use subword_regularization::{
    SubwordRegularizationConfig, SubwordRegularizer, UnigramSubwordRegularizer,
};
#[cfg(feature = "tensorflow")]
pub use tensorflow::{
    RaggedTensor, TensorFlowBatch, TensorFlowConfig, TensorFlowDataset, TensorFlowTensor,
    TensorFlowTokenizer, TensorFlowUtils, TensorOrRagged, TfDType, TfDataIterator,
    TfPaddingStrategy, TfTruncationStrategy,
};
pub use test_infrastructure::{
    BenchmarkResults, CrossValidationResults, CrossValidationRunner, FuzzingResults,
    InconsistencySeverity, RegressionResults, TestCaseGenerator, TestConfig, TestReportUtils,
    TestResult, TestRunner, TestSuiteResult,
};
pub use thai::{ThaiMode, ThaiTokenizer, ThaiTokenizerConfig};
pub use tiktoken::TiktokenTokenizer;
pub use tokenization_debugger::{
    CharacterAnalysis, CompressionStats, DebugAnalysis, DebugSession, DebuggerConfig,
    DetectedIssue, IssueSeverity, IssueType, PatternAnalysis, PerformanceStats,
    TokenizationDebugger, TokenizationResult,
};
pub use tokenizer::{
    TokenizedInputWithAlignment, TokenizedInputWithOffsets, TokenizerImpl, TokenizerWrapper,
};
pub use training::{
    CoverageAnalysis, DistributedTrainingCoordinator, LanguageDetectionResult, LanguageDetector,
    StreamingTrainer, TokenDistributionAnalyzer, TokenDistributionResult, TrainingCheckpoint,
};
pub use unigram::UnigramTokenizer;
pub use visualization::{
    ComparisonStats, TokenInfo, TokenVisualization, TokenVisualizer,
    TokenizationStats as VisualizationTokenizationStats, TokenizerComparison, VisualizationConfig,
};
pub use vocab::{FlexibleVocab, LazyVocab, MergeStrategy, Vocab};
pub use vocab_analyzer::{
    CharacterPattern, CoverageAnalysis as VocabCoverageAnalysis, FrequencyAnalysis,
    IssueSeverity as VocabIssueSeverity, LanguageDistribution, SubwordPattern, VocabAnalysisConfig,
    VocabAnalysisResult, VocabAnalyzer, VocabBasicStats, VocabDebugUtils, VocabIssue,
    VocabIssueType,
};
pub use wordpiece::WordPieceTokenizer;
pub use zero_copy::{
    ZeroCopyBuilder, ZeroCopyHeader, ZeroCopyMemoryStats, ZeroCopyTokenizer, ZeroCopyUtils,
    ZeroCopyVocabEntry,
};

use trustformers_core::traits::{TokenizedInput, Tokenizer};

#[cfg(test)]
mod tests {
    // Smoke test: confirms the crate compiles and the test harness runs.
    #[test]
    fn it_works() {
        assert_eq!(2 + 2, 4);
    }
}