#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
#![allow(dead_code)]
//! # SciRS2 Text - Natural Language Processing
//!
//! **scirs2-text** provides comprehensive text processing and NLP capabilities,
//! offering tokenization, TF-IDF vectorization, word embeddings, sentiment analysis,
//! topic modeling, and text classification with SIMD acceleration and parallel processing.
//!
//! ## 🎯 Key Features
//!
//! - **Tokenization**: Word, sentence, N-gram, BPE, regex tokenizers
//! - **Vectorization**: TF-IDF, count vectorizers, word embeddings
//! - **Text Processing**: Stemming, lemmatization, normalization, stopword removal
//! - **Embeddings**: Word2Vec (Skip-gram, CBOW), GloVe loading
//! - **Similarity**: Cosine, Jaccard, Levenshtein, phonetic algorithms
//! - **NLP**: Sentiment analysis, topic modeling (LDA), text classification
//! - **Performance**: SIMD operations, parallel processing, sparse matrices
//!
//! ## 📦 Module Overview
//!
//! | SciRS2 Module | Python Equivalent | Description |
//! |---------------|-------------------|-------------|
//! | `tokenize` | `nltk.tokenize` | Text tokenization utilities |
//! | `vectorize` | `sklearn.feature_extraction.text.TfidfVectorizer` | TF-IDF and count vectorization |
//! | `embeddings` | `gensim.models.Word2Vec` | Word embeddings (Word2Vec) |
//! | `sentiment` | `nltk.sentiment` | Sentiment analysis |
//! | `topic_modeling` | `sklearn.decomposition.LatentDirichletAllocation` | Topic modeling (LDA) |
//! | `stemming` | `nltk.stem` | Stemming and lemmatization |
//!
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-text = "0.1.5"
//! ```
//!
//! ```rust,no_run
//! use scirs2_text::{tokenize::WordTokenizer, vectorize::TfidfVectorizer, Tokenizer, Vectorizer};
//!
//! // Tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world!").unwrap();
//!
//! // TF-IDF vectorization
//! let docs = vec!["Hello world", "Good morning world"];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&docs).unwrap();
//! ```
//!
//! ## 🔒 Version: 0.1.5 (January 15, 2026)
//!
//! ## 📖 Extended Example
//!
//! ```rust
//! use scirs2_text::{
//!     tokenize::WordTokenizer,
//!     vectorize::TfidfVectorizer,
//!     sentiment::LexiconSentimentAnalyzer,
//!     Tokenizer, Vectorizer
//! };
//!
//! // Basic tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world! This is a test.").unwrap();
//!
//! // TF-IDF vectorization
//! let documents = vec![
//!     "The quick brown fox jumps over the lazy dog",
//!     "A quick brown dog outpaces a quick fox",
//!     "The lazy dog sleeps all day"
//! ];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&documents).unwrap();
//!
//! // Sentiment analysis
//! let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
//! let sentiment = analyzer.analyze("I love this library!").unwrap();
//! println!("Sentiment: {:?}", sentiment.sentiment);
//! ```
//!
//! ## Architecture
//!
//! The module is organized into focused sub-modules:
//!
//! - [`tokenize`]: Text tokenization utilities
//! - [`vectorize`]: Document vectorization and TF-IDF
//! - [`embeddings`]: Word embedding training and utilities
//! - [`sentiment`]: Sentiment analysis tools
//! - [`topic_modeling`]: Topic modeling with LDA
//! - [`string_metrics`]: String similarity and distance metrics
//! - [`preprocess`]: Text cleaning and normalization
//! - [`stemming`]: Stemming and lemmatization
//! - [`parallel`]: Parallel processing utilities
//! - [`simd_ops`]: SIMD-accelerated operations
//!
//! ## Performance
//!
//! SciRS2 Text is designed for high performance:
//!
//! - SIMD acceleration for string operations
//! - Parallel processing for large document collections
//! - Memory-efficient sparse matrix representations
//! - Zero-copy string processing where possible
//! - Optimized algorithms with complexity guarantees

#![warn(missing_docs)]
111pub mod classification;
112pub mod cleansing;
113pub mod distance;
114pub mod domain_processors;
115pub mod embeddings;
116pub mod enhanced_vectorize;
117pub mod error;
118pub mod huggingface_compat;
119pub mod information_extraction;
120pub mod language_model;
121pub mod ml_integration;
122pub mod ml_sentiment;
123pub mod model_registry;
124pub mod multilingual;
125pub mod neural_architectures;
126pub mod parallel;
127pub mod paraphrasing;
128pub mod performance;
129pub mod pos_tagging;
130pub mod preprocess;
131pub mod semantic_similarity;
132pub mod sentiment;
133pub mod simd_ops;
134pub mod sparse;
135pub mod sparse_vectorize;
136pub mod spelling;
137pub mod stemming;
138pub mod streaming;
139pub mod string_metrics;
140pub mod summarization;
141pub mod text_coordinator;
142pub mod text_statistics;
143pub mod token_filter;
144pub mod tokenize;
145pub mod topic_coherence;
146pub mod topic_modeling;
147pub mod transformer;
148pub mod utils;
149pub mod vectorize;
150pub mod visualization;
151pub mod vocabulary;
152pub mod weighted_distance;
153
154// Re-export commonly used items
155pub use classification::{
156    TextClassificationMetrics, TextClassificationPipeline, TextDataset, TextFeatureSelector,
157};
158pub use cleansing::{
159    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
160    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
161    replace_urls, strip_html_tags, AdvancedTextCleaner,
162};
163pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
164pub use domain_processors::{
165    Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
166    MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
167    ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
168};
169pub use embeddings::{
170    fasttext::{FastText, FastTextConfig},
171    glove::GloVe,
172    Word2Vec, Word2VecAlgorithm, Word2VecConfig,
173};
174pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
175pub use error::{Result, TextError};
176pub use huggingface_compat::{
177    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
178    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
179    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
180    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
181};
182pub use information_extraction::{
183    AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
184    CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
185    EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
186    InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
187    MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
188    StructuredDocumentInformation, TemporalExtractor, Topic,
189};
190pub use language_model::{NgramModel, SmoothingMethod};
191pub use ml_integration::{
192    BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
193};
194pub use ml_sentiment::{
195    ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
196};
197pub use model_registry::{
198    ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
199    SerializableModelData,
200};
201pub use multilingual::{
202    Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
203    StopWords,
204};
205pub use neural_architectures::{
206    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
207    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
208    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
209    ResidualBlock1D, SelfAttention, TextCNN,
210};
211pub use parallel::{
212    ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
213};
214pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
215pub use performance::{
216    AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
217    PerformanceSummary, PerformanceThresholds,
218};
219pub use pos_tagging::{
220    PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
221};
222pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
223pub use semantic_similarity::{
224    LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
225    WordMoversDistance,
226};
227pub use sentiment::{
228    LexiconSentimentAnalyzer, RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon,
229    SentimentResult, SentimentRules, SentimentWordCounts,
230};
231pub use simd_ops::{
232    AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
233    TextProcessingResult,
234};
235pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
236pub use sparse_vectorize::{
237    sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
238};
239pub use spelling::{
240    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
241    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
242};
243pub use stemming::{
244    LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
245    RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
246};
247pub use streaming::{
248    AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
249    ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
250};
251pub use string_metrics::{
252    AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
253    PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
254};
255pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
256pub use text_coordinator::{
257    AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
258    AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
259};
260pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
261pub use token_filter::{
262    CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
263    TokenFilter,
264};
265pub use tokenize::{
266    bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
267    CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
268    WhitespaceTokenizer, WordTokenizer,
269};
270pub use topic_coherence::{TopicCoherence, TopicDiversity};
271pub use topic_modeling::{
272    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
273};
274pub use transformer::{
275    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
276    TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
277    TransformerEncoderLayer, TransformerModel,
278};
279pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
280pub use visualization::{
281    AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
282    TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
283};
284pub use vocabulary::Vocabulary;
285pub use weighted_distance::{
286    DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
287    WeightedStringMetric,
288};