#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
#![allow(dead_code)]
//! # SciRS2 Text - Natural Language Processing
//!
//! **scirs2-text** provides comprehensive text processing and NLP capabilities,
//! offering tokenization, TF-IDF vectorization, word embeddings, sentiment analysis,
//! topic modeling, and text classification with SIMD acceleration and parallel processing.
//!
//! ## 🎯 Key Features
//!
//! - **Tokenization**: Word, sentence, N-gram, BPE, regex tokenizers
//! - **Vectorization**: TF-IDF, count vectorizers, word embeddings
//! - **Text Processing**: Stemming, lemmatization, normalization, stopword removal
//! - **Embeddings**: Word2Vec (Skip-gram, CBOW), GloVe loading
//! - **Similarity**: Cosine, Jaccard, Levenshtein, phonetic algorithms
//! - **NLP**: Sentiment analysis, topic modeling (LDA), text classification
//! - **Performance**: SIMD operations, parallel processing, sparse matrices
//!
//! ## 📦 Module Overview
//!
//! | SciRS2 Module | Python Equivalent | Description |
//! |---------------|-------------------|-------------|
//! | `tokenize` | `nltk.tokenize` | Text tokenization utilities |
//! | `vectorize` | `sklearn.feature_extraction.text.TfidfVectorizer` | TF-IDF and count vectorization |
//! | `embeddings` | `gensim.models.Word2Vec` | Word embeddings (Word2Vec) |
//! | `sentiment` | `nltk.sentiment` | Sentiment analysis |
//! | `topic_modeling` | `sklearn.decomposition.LatentDirichletAllocation` | Topic modeling (LDA) |
//! | `stemming` | `nltk.stem` | Stemming and lemmatization |
//!
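//! For example, the `stemming` module from the table above can be used like this
//! (a minimal sketch: the `PorterStemmer` constructor and the exact `Stemmer::stem`
//! signature are assumptions, so the example is marked `ignore`):
//!
//! ```rust,ignore
//! use scirs2_text::{PorterStemmer, Stemmer};
//!
//! // Reduce inflected forms to a common stem (assumed `stem(&str) -> Result<String>`).
//! let stemmer = PorterStemmer::default();
//! for word in ["running", "runs", "ran"] {
//!     println!("{word} -> {}", stemmer.stem(word).unwrap());
//! }
//! ```
//!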
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-text = "0.1.2"
//! ```
//!
//! ```rust,no_run
//! use scirs2_text::{tokenize::WordTokenizer, vectorize::TfidfVectorizer, Tokenizer, Vectorizer};
//!
//! // Tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world!").unwrap();
//!
//! // TF-IDF vectorization
//! let docs = vec!["Hello world", "Good morning world"];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&docs).unwrap();
//! ```
//!
//! ## 🔒 Version: 0.1.2 (January 15, 2026)
//!
//! ## Detailed Example
//!
//! ```rust
//! use scirs2_text::{
//!     tokenize::WordTokenizer,
//!     vectorize::TfidfVectorizer,
//!     sentiment::LexiconSentimentAnalyzer,
//!     Tokenizer, Vectorizer
//! };
//!
//! // Basic tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world! This is a test.").unwrap();
//!
//! // TF-IDF vectorization
//! let documents = vec![
//!     "The quick brown fox jumps over the lazy dog",
//!     "A quick brown dog outpaces a quick fox",
//!     "The lazy dog sleeps all day"
//! ];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&documents).unwrap();
//!
//! // Sentiment analysis
//! let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
//! let sentiment = analyzer.analyze("I love this library!").unwrap();
//! println!("Sentiment: {:?}", sentiment.sentiment);
//! ```
//!
//! ## Architecture
//!
//! The module is organized into focused sub-modules:
//!
//! - [`tokenize`]: Text tokenization utilities
//! - [`vectorize`]: Document vectorization and TF-IDF
//! - [`embeddings`]: Word embedding training and utilities
//! - [`sentiment`]: Sentiment analysis tools
//! - [`topic_modeling`]: Topic modeling with LDA
//! - [`string_metrics`]: String similarity and distance metrics
//! - [`preprocess`]: Text cleaning and normalization
//! - [`stemming`]: Stemming and lemmatization
//! - [`parallel`]: Parallel processing utilities
//! - [`simd_ops`]: SIMD-accelerated operations
//!
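//! A minimal sketch of the string-similarity helpers re-exported from [`distance`]
//! (the exact `levenshtein_distance` and `jaccard_similarity` signatures are
//! assumptions here, so the example is marked `ignore`):
//!
//! ```rust,ignore
//! use scirs2_text::{jaccard_similarity, levenshtein_distance};
//!
//! // Edit distance between two strings (assumed to take `&str` arguments).
//! let edits = levenshtein_distance("kitten", "sitting");
//!
//! // Set-overlap similarity in [0, 1] (assumed to return a float).
//! let overlap = jaccard_similarity("night", "nacht");
//!
//! println!("edits = {edits:?}, overlap = {overlap:?}");
//! ```
//!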
//! ## Performance
//!
//! SciRS2 Text is designed for high performance:
//!
//! - SIMD acceleration for string operations
//! - Parallel processing for large document collections
//! - Memory-efficient sparse matrix representations
//! - Zero-copy string processing where possible
//! - Optimized algorithms with complexity guarantees
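//!
//! As a hypothetical sketch of the sparse path (assuming `SparseTfidfVectorizer`
//! mirrors the dense `TfidfVectorizer::fit_transform` API and that
//! `sparse_cosine_similarity` compares two sparse vectors; marked `ignore`
//! because these signatures are assumptions):
//!
//! ```rust,ignore
//! use scirs2_text::{sparse_cosine_similarity, SparseTfidfVectorizer};
//!
//! let docs = vec!["the quick brown fox", "the lazy dog", "quick quick dog"];
//!
//! // Only non-zero TF-IDF weights are stored, keeping memory bounded
//! // even for large vocabularies.
//! let mut vectorizer = SparseTfidfVectorizer::default();
//! let vectors = vectorizer.fit_transform(&docs).unwrap();
//!
//! // Compare two documents without densifying their vectors.
//! let sim = sparse_cosine_similarity(&vectors[0], &vectors[1]);
//! println!("similarity = {sim:?}");
//! ```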

#![warn(missing_docs)]

pub mod classification;
pub mod cleansing;
pub mod distance;
pub mod domain_processors;
pub mod embeddings;
pub mod enhanced_vectorize;
pub mod error;
pub mod huggingface_compat;
pub mod information_extraction;
pub mod ml_integration;
pub mod ml_sentiment;
pub mod model_registry;
pub mod multilingual;
pub mod neural_architectures;
pub mod parallel;
pub mod performance;
pub mod pos_tagging;
pub mod preprocess;
pub mod semantic_similarity;
pub mod sentiment;
pub mod simd_ops;
pub mod sparse;
pub mod sparse_vectorize;
pub mod spelling;
pub mod stemming;
pub mod streaming;
pub mod string_metrics;
pub mod summarization;
pub mod text_coordinator;
pub mod text_statistics;
pub mod token_filter;
pub mod tokenize;
pub mod topic_coherence;
pub mod topic_modeling;
pub mod transformer;
pub mod utils;
pub mod vectorize;
pub mod visualization;
pub mod vocabulary;
pub mod weighted_distance;

// Re-export commonly used items
pub use classification::{
    TextClassificationMetrics, TextClassificationPipeline, TextDataset, TextFeatureSelector,
};
pub use cleansing::{
    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
    replace_urls, strip_html_tags, AdvancedTextCleaner,
};
pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
pub use domain_processors::{
    Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
    MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
    ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
};
pub use embeddings::{Word2Vec, Word2VecAlgorithm, Word2VecConfig};
pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
pub use error::{Result, TextError};
pub use huggingface_compat::{
    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
};
pub use information_extraction::{
    AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
    CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
    EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
    InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
    MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
    StructuredDocumentInformation, TemporalExtractor, Topic,
};
pub use ml_integration::{
    BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
};
pub use ml_sentiment::{
    ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
};
pub use model_registry::{
    ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
    SerializableModelData,
};
pub use multilingual::{
    Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
    StopWords,
};
pub use neural_architectures::{
    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
    ResidualBlock1D, SelfAttention, TextCNN,
};
pub use parallel::{
    ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
};
pub use performance::{
    AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
    PerformanceSummary, PerformanceThresholds,
};
pub use pos_tagging::{
    PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
};
pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
pub use semantic_similarity::{
    LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
    WordMoversDistance,
};
pub use sentiment::{
    LexiconSentimentAnalyzer, RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon,
    SentimentResult, SentimentRules, SentimentWordCounts,
};
pub use simd_ops::{
    AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
    TextProcessingResult,
};
pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
pub use sparse_vectorize::{
    sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
};
pub use spelling::{
    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
};
pub use stemming::{
    LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
    RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
};
pub use streaming::{
    AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
    ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
};
pub use string_metrics::{
    AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
    PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
};
pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
pub use text_coordinator::{
    AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
    AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
};
pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
pub use token_filter::{
    CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
    TokenFilter,
};
pub use tokenize::{
    bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
    CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
    WhitespaceTokenizer, WordTokenizer,
};
pub use topic_coherence::{TopicCoherence, TopicDiversity};
pub use topic_modeling::{
    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
};
pub use transformer::{
    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
    TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
    TransformerEncoderLayer, TransformerModel,
};
pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
pub use visualization::{
    AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
    TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
};
pub use vocabulary::Vocabulary;
pub use weighted_distance::{
    DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
    WeightedStringMetric,
};