1pub use crate::{Result, TextError};
8
9pub use crate::tokenization::{
11 BPETokenizer, CharTokenizer, SubwordTokenizer, Tokenizer, WhitespaceTokenizer,
12};
13
14pub use crate::tokenization::advanced::FastTokenizer;
16
17pub use crate::tokenization::unified::{
19 EfficientUnifiedTokenizer, TokenizerConfig, TokenizerFactory, UnifiedTokenizer,
20};
21
22pub use crate::vocab::{SpecialTokens, Vocabulary};
24
25pub use crate::utils::{
27 BatchProcessor, CustomStep, OptimizedBatchOps, PreprocessingStep, StreamingBatchProcessor,
28 TextAugmenter, TextCleaner, TextNormalizer, TextPreprocessingPipeline,
29};
30
31pub use crate::embeddings::{
33 CombinedEmbeddings, EmbeddingUtils, PositionalEncoding, WordEmbedding,
34};
35
36pub use crate::datasets::{
38 AgNewsDataset, ClassificationDataset, ConsolidatedDataset, Dataset, DatasetConfig,
39 DatasetDownloader, DatasetUtils, ImdbDataset, LanguageModelingDataset, Multi30kDataset,
40 SequenceLabelingDataset, TranslationDataset, UnifiedDatasetLoader, WikiTextDataset,
41};
42
43pub use crate::generation::{
45 BeamHypothesis, BeamSearchDecoder, GenerationConfig, NGramRepetitionFilter, RepetitionPenalty,
46 TextGenerator, TextSampler,
47};
48
49pub use crate::analysis::{NgramExtractor, TextSimilarity, TextStatistics, TfIdfCalculator};
51
52pub use crate::models::{
66 GenerationConfig as ModelGenerationConfig, ModelRegistry, TextDecoder, TextEncoder, TextModel,
67};
68
69pub use crate::models::registry::{create_model, get_config, get_global_registry, list_configs};
71
72pub use crate::scirs2_ops::SciRS2TextOps;
74
75pub use crate::scirs2_ops::string_ops::*;
77
78pub use crate::scirs2_ops::advanced_analytics::{
80 compute_advanced_stats, AdvancedTextSampler, AdvancedTextStats, ComplexityAnalyzer,
81 ComplexityMetrics,
82};
83
84pub use crate::scirs2_ops::performance::{PerformanceMetrics, PerformanceMonitor};
86
87pub use crate::scirs2_ops::vectorized_ops::*;
89
90pub use crate::scirs2_ops::indexing::*;
92
93pub use crate::scirs2_ops::memory::*;
95
96pub use crate::convenience::{
98 BatchTextProcessor, ComprehensiveTextReport, EnhancedTextAnalyzer, LanguageDetector,
99 QuickTextProcessor, TextQualityAssessor,
100};
101
102pub use torsh_core::{DType, Device, Shape};
104pub use torsh_tensor::Tensor;
105
/// Builds a [`TextPreprocessingPipeline`] from flag-style arguments.
///
/// `normalize:` and `clean:` groups are mandatory; any number of trailing
/// `custom:` closures may follow and are appended as custom steps in the
/// order given.
#[macro_export]
macro_rules! preprocessing_pipeline {
    (
        normalize: (unicode: $unicode:expr, accents: $accents:expr, punctuation: $punct:expr),
        clean: (urls: $urls:expr, emails: $emails:expr, html: $html:expr)
        $(, custom: $custom:expr)*
    ) => {{
        // Normalization stage, configured from the caller's flags.
        let norm = $crate::utils::TextNormalizer::default()
            .normalize_unicode($unicode)
            .remove_accents($accents)
            .remove_punctuation($punct);

        // Cleaning stage, configured from the caller's flags.
        let clean = $crate::utils::TextCleaner::default()
            .remove_urls($urls)
            .remove_emails($emails)
            .remove_html($html);

        // Assemble the pipeline; optional custom steps are appended in order.
        let mut built = $crate::utils::TextPreprocessingPipeline::new()
            .with_normalization(norm)
            .with_cleaning(clean);
        $(
            built = built.add_custom_step(Box::new(
                $crate::utils::CustomStep::new($custom, "custom".to_string()),
            ));
        )*
        built
    }};
}
145
/// Builds a [`Vocabulary`] with caller-supplied special tokens.
///
/// Recognized token names are `pad`, `unk`, `bos`, `eos`, `sep`, `cls`
/// and `mask`; any other name is silently ignored.
#[macro_export]
macro_rules! vocabulary {
    (
        special_tokens: {
            $($name:ident: $token:expr),* $(,)?
        }
        $(, min_freq: $min_freq:expr)?
    ) => {{
        // Start from the default special-token set and override whichever
        // fields the caller names.
        let mut tokens = $crate::vocab::SpecialTokens::default();
        $(
            match stringify!($name) {
                "pad" => tokens.pad = $token.to_string(),
                "unk" => tokens.unk = $token.to_string(),
                "bos" => tokens.bos = $token.to_string(),
                "eos" => tokens.eos = $token.to_string(),
                "sep" => tokens.sep = $token.to_string(),
                "cls" => tokens.cls = $token.to_string(),
                "mask" => tokens.mask = $token.to_string(),
                _ => {}
            }
        )*
        // NOTE(review): the optional `min_freq:` argument is matched but never
        // used in the expansion, so callers passing it get no effect. Confirm
        // whether `Vocabulary` exposes a minimum-frequency setting and wire
        // this capture through to it.
        $crate::vocab::Vocabulary::new(Some(tokens))
    }};
}
189
/// One-shot text preprocessing: builds a throwaway pipeline from optional
/// boolean flags and runs it over `$text`, returning the pipeline's result.
///
/// Flags: `normalize` (default normalizer), `clean_urls` / `clean_emails` /
/// `clean_html` (cleaning options), `lowercase` (custom lowercasing step).
#[macro_export]
macro_rules! quick_process {
    (
        $text:expr
        $(, normalize: $normalize:expr)?
        $(, clean_urls: $clean_urls:expr)?
        $(, clean_emails: $clean_emails:expr)?
        $(, clean_html: $clean_html:expr)?
        $(, lowercase: $lowercase:expr)?
    ) => {{
        let mut pipeline = $crate::utils::TextPreprocessingPipeline::new();

        $(
            if $normalize {
                pipeline = pipeline.with_normalization($crate::utils::TextNormalizer::default());
            }
        )?

        // FIX: accumulate every requested cleaning flag into ONE cleaner and
        // attach it once. The previous expansion built a fresh cleaner (with
        // the other two flags forced off) and called `with_cleaning` per flag,
        // so with several flags set the later call replaced the earlier
        // cleaner and only the last requested option took effect.
        #[allow(unused_mut)]
        let mut cleaner = $crate::utils::TextCleaner::default()
            .remove_urls(false)
            .remove_emails(false)
            .remove_html(false);
        #[allow(unused_mut)]
        let mut needs_cleaning = false;
        $(
            if $clean_urls {
                cleaner = cleaner.remove_urls(true);
                needs_cleaning = true;
            }
        )?
        $(
            if $clean_emails {
                cleaner = cleaner.remove_emails(true);
                needs_cleaning = true;
            }
        )?
        $(
            if $clean_html {
                cleaner = cleaner.remove_html(true);
                needs_cleaning = true;
            }
        )?
        // Only attach a cleaning stage when at least one flag asked for it,
        // matching the original behavior of skipping `with_cleaning` entirely
        // when no cleaning flag is true.
        if needs_cleaning {
            pipeline = pipeline.with_cleaning(cleaner);
        }

        $(
            if $lowercase {
                pipeline = pipeline.add_custom_step(Box::new($crate::utils::CustomStep::new(
                    |text: &str| text.to_lowercase(),
                    "lowercase".to_string(),
                )));
            }
        )?

        pipeline.process_text($text)
    }};
}
253
254pub use preprocessing_pipeline;
256pub use quick_process;
257pub use vocabulary;