Skip to main content

torsh_text/
prelude.rs

1//! Prelude module for torsh-text
2//!
3//! This module re-exports the most commonly used types and traits from torsh-text,
4//! allowing users to import everything they need with a single `use torsh_text::prelude::*;`
5
6// Core types and traits
7pub use crate::{Result, TextError};
8
9// Tokenization essentials
10pub use crate::tokenization::{
11    BPETokenizer, CharTokenizer, SubwordTokenizer, Tokenizer, WhitespaceTokenizer,
12};
13
14// Advanced tokenization
15pub use crate::tokenization::advanced::FastTokenizer;
16
17// Unified tokenization
18pub use crate::tokenization::unified::{
19    EfficientUnifiedTokenizer, TokenizerConfig, TokenizerFactory, UnifiedTokenizer,
20};
21
22// Vocabulary management
23pub use crate::vocab::{SpecialTokens, Vocabulary};
24
25// Text preprocessing
26pub use crate::utils::{
27    BatchProcessor, CustomStep, OptimizedBatchOps, PreprocessingStep, StreamingBatchProcessor,
28    TextAugmenter, TextCleaner, TextNormalizer, TextPreprocessingPipeline,
29};
30
31// Embeddings
32pub use crate::embeddings::{
33    CombinedEmbeddings, EmbeddingUtils, PositionalEncoding, WordEmbedding,
34};
35
36// Datasets
37pub use crate::datasets::{
38    AgNewsDataset, ClassificationDataset, ConsolidatedDataset, Dataset, DatasetConfig,
39    DatasetDownloader, DatasetUtils, ImdbDataset, LanguageModelingDataset, Multi30kDataset,
40    SequenceLabelingDataset, TranslationDataset, UnifiedDatasetLoader, WikiTextDataset,
41};
42
43// Text generation
44pub use crate::generation::{
45    BeamHypothesis, BeamSearchDecoder, GenerationConfig, NGramRepetitionFilter, RepetitionPenalty,
46    TextGenerator, TextSampler,
47};
48
49// Analysis tools
50pub use crate::analysis::{NgramExtractor, TextSimilarity, TextStatistics, TfIdfCalculator};
51
52// Metrics - Temporarily disabled due to module issues
53// pub use crate::metrics::{
54//     BertScore, BertScoreResult, BleuScore, EditDistance, PerplexityCalculator, RougeMetrics,
55//     RougeScore, RougeType, SemanticSimilarity,
56// };
57
58// Custom metrics - Temporarily disabled due to module issues
59// pub use crate::metrics::custom::{
60//     CompositeMetric, CustomMetric, EvaluationFramework, FluencyMetric, MetricRegistry,
61//     SemanticCoherenceMetric, WordOverlapMetric,
62// };
63
64// Models
65pub use crate::models::{
66    GenerationConfig as ModelGenerationConfig, ModelRegistry, TextDecoder, TextEncoder, TextModel,
67};
68
69// Model registry
70pub use crate::models::registry::{create_model, get_config, get_global_registry, list_configs};
71
72// SciRS2 operations
73pub use crate::scirs2_ops::SciRS2TextOps;
74
75// SciRS2 string operations
76pub use crate::scirs2_ops::string_ops::*;
77
78// Advanced analytics
79pub use crate::scirs2_ops::advanced_analytics::{
80    compute_advanced_stats, AdvancedTextSampler, AdvancedTextStats, ComplexityAnalyzer,
81    ComplexityMetrics,
82};
83
84// Performance monitoring
85pub use crate::scirs2_ops::performance::{PerformanceMetrics, PerformanceMonitor};
86
87// SciRS2 vectorized operations
88pub use crate::scirs2_ops::vectorized_ops::*;
89
90// SciRS2 indexing
91pub use crate::scirs2_ops::indexing::*;
92
93// SciRS2 memory optimization
94pub use crate::scirs2_ops::memory::*;
95
96// Convenience utilities
97pub use crate::convenience::{
98    BatchTextProcessor, ComprehensiveTextReport, EnhancedTextAnalyzer, LanguageDetector,
99    QuickTextProcessor, TextQualityAssessor,
100};
101
102// Re-export commonly used external types
103pub use torsh_core::{DType, Device, Shape};
104pub use torsh_tensor::Tensor;
105
/// Builds a [`TextPreprocessingPipeline`](crate::utils::TextPreprocessingPipeline)
/// from a declarative description.
///
/// The invocation has two mandatory sections followed by any number of
/// `custom:` entries, in this fixed order:
///
/// * `normalize: (unicode: .., accents: .., punctuation: ..)` — each value is
///   forwarded to `TextNormalizer::default()` through `normalize_unicode`,
///   `remove_accents` and `remove_punctuation` respectively.
/// * `clean: (urls: .., emails: .., html: ..)` — each value is forwarded to
///   `TextCleaner::default()` through `remove_urls`, `remove_emails` and
///   `remove_html` respectively.
/// * `custom: <expr>` (zero or more) — each expression is wrapped in
///   `CustomStep::new(<expr>, "custom".to_string())`, boxed, and appended with
///   `add_custom_step` in the order written. Every custom step is registered
///   under the same name, `"custom"`.
///
/// A trailing comma after the last section is accepted.
///
/// The macro evaluates to the configured pipeline value.
///
/// # Examples
///
/// ```rust
/// use torsh_text::prelude::*;
///
/// let pipeline = preprocessing_pipeline! {
///     normalize: (unicode: true, accents: true, punctuation: false),
///     clean: (urls: true, emails: true, html: true),
///     custom: |text| text.to_lowercase()
/// };
/// ```
#[macro_export]
macro_rules! preprocessing_pipeline {
    (
        normalize: (unicode: $unicode:expr, accents: $accents:expr, punctuation: $punct:expr),
        clean: (urls: $urls:expr, emails: $emails:expr, html: $html:expr)
        $(, custom: $custom:expr)*
        $(,)?
    ) => {{
        let normalizer = $crate::utils::TextNormalizer::default()
            .normalize_unicode($unicode)
            .remove_accents($accents)
            .remove_punctuation($punct);
        let cleaner = $crate::utils::TextCleaner::default()
            .remove_urls($urls)
            .remove_emails($emails)
            .remove_html($html);

        // A single builder chain: no `let mut` is needed, so an invocation
        // without any `custom:` entry does not expand to an unused `mut`.
        $crate::utils::TextPreprocessingPipeline::new()
            .with_normalization(normalizer)
            .with_cleaning(cleaner)
            $(
                .add_custom_step(Box::new($crate::utils::CustomStep::new(
                    $custom,
                    "custom".to_string(),
                )))
            )*
    }};
}
145
/// Builds a [`Vocabulary`](crate::vocab::Vocabulary) whose special tokens are
/// overridden by name.
///
/// The macro starts from `SpecialTokens::default()`, assigns
/// `<token>.to_string()` to the field named by each `name: token` entry, and
/// passes the result to `Vocabulary::new(Some(..))`.
///
/// Recognised slot names are the `SpecialTokens` fields `pad`, `unk`, `bos`,
/// `eos`, `sep`, `cls` and `mask`. Slots that are not listed keep their default
/// value. Because each entry expands to a direct field assignment, a
/// misspelled slot name is rejected at compile time instead of being silently
/// ignored.
///
/// Trailing commas are accepted both inside the `special_tokens` list and
/// after the last section.
///
/// NOTE(review): the optional `min_freq: <expr>` section is parsed so existing
/// invocations keep compiling, but the expression is never evaluated or
/// forwarded to the vocabulary. Wire it up once the intended `Vocabulary` API
/// for a minimum frequency is confirmed.
///
/// # Examples
///
/// ```rust
/// use torsh_text::prelude::*;
///
/// let vocab = vocabulary! {
///     special_tokens: {
///         pad: "<pad>",
///         unk: "<unk>",
///         bos: "<s>",
///         eos: "</s>"
///     },
///     min_freq: 5
/// };
/// ```
#[macro_export]
macro_rules! vocabulary {
    (
        special_tokens: {
            $($name:ident: $token:expr),* $(,)?
        }
        $(, min_freq: $min_freq:expr)?
        $(,)?
    ) => {{
        // `mut` is unused when the special-token list is empty.
        #[allow(unused_mut)]
        let mut special_tokens = $crate::vocab::SpecialTokens::default();
        $(
            // Direct field access: an unknown slot name fails to compile.
            special_tokens.$name = $token.to_string();
        )*

        $crate::vocab::Vocabulary::new(Some(special_tokens))
    }};
}
189
/// One-shot text processing macro for common use cases.
///
/// Builds a temporary
/// [`TextPreprocessingPipeline`](crate::utils::TextPreprocessingPipeline) from
/// boolean options and evaluates to whatever `process_text(<text>)` returns on
/// it.
///
/// The first argument is the text. It may be followed by any subset of these
/// options, **in this order**, each taking a `bool` expression:
///
/// * `normalize` — when true, installs `TextNormalizer::default()`.
/// * `clean_urls`, `clean_emails`, `clean_html` — omitted flags count as
///   `false`. When at least one is true, a single `TextCleaner` configured
///   with all three flags is installed. When none is true, no cleaner is
///   installed.
/// * `lowercase` — when true, appends a custom step named `"lowercase"` that
///   applies `str::to_lowercase`.
///
/// A trailing comma after the last argument is accepted.
///
/// # Examples
///
/// ```rust
/// use torsh_text::prelude::*;
///
/// let processed = quick_process!(
///     "Hello, world! Visit https://example.com",
///     normalize: true,
///     clean_urls: true,
///     lowercase: true
/// );
/// ```
#[macro_export]
macro_rules! quick_process {
    (
        $text:expr
        $(, normalize: $normalize:expr)?
        $(, clean_urls: $clean_urls:expr)?
        $(, clean_emails: $clean_emails:expr)?
        $(, clean_html: $clean_html:expr)?
        $(, lowercase: $lowercase:expr)?
        $(,)?
    ) => {{
        let mut pipeline = $crate::utils::TextPreprocessingPipeline::new();

        $(
            if $normalize {
                pipeline = pipeline
                    .with_normalization($crate::utils::TextNormalizer::default());
            }
        )?

        // Collapse the three optional cleaning flags into plain booleans; an
        // omitted option contributes nothing and therefore stays `false`.
        let strip_urls: bool = false $(|| $clean_urls)?;
        let strip_emails: bool = false $(|| $clean_emails)?;
        let strip_html: bool = false $(|| $clean_html)?;

        // Install ONE cleaner carrying every requested flag.
        // NOTE(review): previously each flag called `with_cleaning` with its
        // own cleaner that had the other two flags forced to `false`;
        // presumably `with_cleaning` stores a single cleaner, so combining
        // flags kept only the last one — confirm against the pipeline type.
        if strip_urls || strip_emails || strip_html {
            let cleaner = $crate::utils::TextCleaner::default()
                .remove_urls(strip_urls)
                .remove_emails(strip_emails)
                .remove_html(strip_html);
            pipeline = pipeline.with_cleaning(cleaner);
        }

        $(
            if $lowercase {
                pipeline = pipeline.add_custom_step(Box::new($crate::utils::CustomStep::new(
                    |text: &str| text.to_lowercase(),
                    "lowercase".to_string(),
                )));
            }
        )?

        pipeline.process_text($text)
    }};
}
253
254// Re-export macros
255pub use preprocessing_pipeline;
256pub use quick_process;
257pub use vocabulary;