#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
#![allow(dead_code)]
//! # SciRS2 Text - Natural Language Processing
//!
//! **scirs2-text** provides comprehensive text processing and NLP capabilities,
//! offering tokenization, TF-IDF vectorization, word embeddings, sentiment analysis,
//! topic modeling, and text classification with SIMD acceleration and parallel processing.
//!
//! ## 🎯 Key Features
//!
//! - **Tokenization**: Word, sentence, N-gram, BPE, regex tokenizers
//! - **Vectorization**: TF-IDF, count vectorizers, word embeddings
//! - **Text Processing**: Stemming, lemmatization, normalization, stopword removal
//! - **Embeddings**: Word2Vec (Skip-gram, CBOW), GloVe loading
//! - **Similarity**: Cosine, Jaccard, Levenshtein, phonetic algorithms
//! - **NLP**: Sentiment analysis, topic modeling (LDA), text classification
//! - **Performance**: SIMD operations, parallel processing, sparse matrices
//!
//! ## 📦 Module Overview
//!
//! | SciRS2 Module | Python Equivalent | Description |
//! |---------------|-------------------|-------------|
//! | `tokenize` | `nltk.tokenize` | Text tokenization utilities |
//! | `vectorize` | `sklearn.feature_extraction.text.TfidfVectorizer` | TF-IDF and count vectorization |
//! | `embeddings` | `gensim.models.Word2Vec` | Word embeddings (Word2Vec) |
//! | `sentiment` | `nltk.sentiment` | Sentiment analysis |
//! | `topic_modeling` | `sklearn.decomposition.LatentDirichletAllocation` | Topic modeling (LDA) |
//! | `stemming` | `nltk.stem` | Stemming and lemmatization |
//!
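//! For instance, the `stemming` module plays the role of `nltk.stem`. A minimal
//! sketch (marked `ignore` because the `PorterStemmer::new()` constructor and the
//! exact `Stemmer::stem` signature assumed here are illustrative, not confirmed):
//!
//! ```rust,ignore
//! use scirs2_text::{PorterStemmer, Stemmer};
//!
//! // Assumes a no-argument constructor and a trait method `stem(&self, word: &str)`.
//! let stemmer = PorterStemmer::new();
//! let stem = stemmer.stem("running"); // expected to reduce toward "run"
//! ```
//!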
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-text = "0.1.5"
//! ```
//!
//! ```rust
//! use scirs2_text::{
//!     tokenize::WordTokenizer,
//!     vectorize::TfidfVectorizer,
//!     sentiment::LexiconSentimentAnalyzer,
//!     Tokenizer, Vectorizer
//! };
//!
//! // Basic tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world! This is a test.").unwrap();
//!
//! // TF-IDF vectorization
//! let documents = vec![
//!     "The quick brown fox jumps over the lazy dog",
//!     "A quick brown dog outpaces a quick fox",
//!     "The lazy dog sleeps all day"
//! ];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&documents).unwrap();
//!
//! // Sentiment analysis
//! let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
//! let sentiment = analyzer.analyze("I love this library!").unwrap();
//! println!("Sentiment: {:?}", sentiment.sentiment);
//! ```
//!
//! ## 🔒 Version: 0.1.5 (January 15, 2026)
//!
//! ## Architecture
//!
//! The module is organized into focused sub-modules:
//!
//! - [`tokenize`]: Text tokenization utilities
//! - [`vectorize`]: Document vectorization and TF-IDF
//! - [`embeddings`]: Word embedding training and utilities
//! - [`sentiment`]: Sentiment analysis tools
//! - [`topic_modeling`]: Topic modeling with LDA
//! - [`string_metrics`]: String similarity and distance metrics
//! - [`preprocess`]: Text cleaning and normalization
//! - [`stemming`]: Stemming and lemmatization
//! - [`parallel`]: Parallel processing utilities
//! - [`simd_ops`]: SIMD-accelerated operations
//!
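//! For example, the distance utilities are plain functions re-exported at the
//! crate root. A minimal sketch (marked `ignore` because the exact argument and
//! return types assumed here are illustrative):
//!
//! ```rust,ignore
//! use scirs2_text::{jaccard_similarity, levenshtein_distance};
//!
//! // Assumed signature: (&str, &str) -> edit count.
//! let edits = levenshtein_distance("kitten", "sitting");
//!
//! // Assumed signature: (&str, &str) -> similarity score in [0, 1].
//! let sim = jaccard_similarity("the quick fox", "the lazy fox");
//! ```
//!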
//! ## Performance
//!
//! SciRS2 Text is designed for high performance:
//!
//! - SIMD acceleration for string operations
//! - Parallel processing for large document collections
//! - Memory-efficient sparse matrix representations
//! - Zero-copy string processing where possible
//! - Optimized algorithms with complexity guarantees
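//!
//! For large corpora, the sparse vectorizers are intended as memory-efficient
//! counterparts to the dense ones. A minimal sketch, assuming
//! `SparseTfidfVectorizer` follows the same fit/transform pattern as the dense
//! `TfidfVectorizer` above (the constructor and `fit_transform` return type used
//! here are assumptions, hence `ignore`):
//!
//! ```rust,ignore
//! use scirs2_text::SparseTfidfVectorizer;
//!
//! let docs = vec!["a small corpus", "a large corpus"];
//! // Assumed: a default constructor and a fit_transform returning a sparse matrix.
//! let mut vectorizer = SparseTfidfVectorizer::default();
//! let sparse_matrix = vectorizer.fit_transform(&docs).unwrap();
//! ```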

#![warn(missing_docs)]

pub mod classification;
pub mod cleansing;
pub mod distance;
pub mod domain_processors;
pub mod embeddings;
pub mod enhanced_vectorize;
pub mod error;
pub mod huggingface_compat;
pub mod information_extraction;
pub mod language_model;
pub mod lemmatization;
pub mod ml_integration;
pub mod ml_sentiment;
pub mod model_registry;
pub mod multilingual;
pub mod neural_architectures;
pub mod parallel;
pub mod paraphrasing;
pub mod performance;
pub mod pipeline;
pub mod pos_tagging;
pub mod preprocess;
pub mod semantic_similarity;
pub mod sentiment;
pub mod simd_ops;
pub mod sparse;
pub mod sparse_vectorize;
pub mod spelling;
pub mod stemming;
pub mod streaming;
pub mod string_metrics;
pub mod summarization;
pub mod text_coordinator;
pub mod text_statistics;
pub mod token_filter;
pub mod tokenize;
pub mod tokenizer;
pub mod topic_coherence;
pub mod topic_modeling;
pub mod transformer;
pub mod utils;
pub mod vectorize;
pub mod visualization;
pub mod vocabulary;
pub mod weighted_distance;

// New text processing modules
pub mod keyword_extraction;
pub mod language_detection;
pub mod named_entity_recognition;
pub mod text_similarity;
pub mod text_summarization;

// Re-export commonly used items
pub use classification::{
    cross_validate_nb, BernoulliNaiveBayes, CrossValidationResult, FeatureHasher, FoldResult,
    MultiLabelClassifier, MultiLabelPrediction, MultinomialNaiveBayes, TextClassificationMetrics,
    TextClassificationPipeline, TextDataset, TextFeatureSelector, TfidfCosineClassifier,
};
pub use cleansing::{
    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
    replace_urls, strip_html_tags, AdvancedTextCleaner,
};
pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
pub use domain_processors::{
    Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
    MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
    ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
};
pub use embeddings::{
    embedding_cosine_similarity,
    fasttext::{FastText, FastTextConfig},
    glove::{CooccurrenceMatrix, GloVe, GloVeTrainer, GloVeTrainerConfig},
    pairwise_similarity, Word2Vec, Word2VecAlgorithm, Word2VecConfig, WordEmbedding,
};
pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
pub use error::{Result, TextError};
pub use huggingface_compat::{
    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
};
pub use information_extraction::{
    AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
    CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
    EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
    InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
    MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
    StructuredDocumentInformation, TemporalExtractor, Topic,
};
pub use language_model::{NgramModel, SmoothingMethod};
pub use lemmatization::{Lemmatizer, RuleBasedLemmatizer, WordNetLemmatizer};
pub use ml_integration::{
    BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
};
pub use ml_sentiment::{
    ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
};
pub use model_registry::{
    ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
    SerializableModelData,
};
pub use multilingual::{
    Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
    StopWords,
};
pub use neural_architectures::{
    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
    ResidualBlock1D, SelfAttention, TextCNN,
};
pub use parallel::{
    ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
};
pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
pub use performance::{
    AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
    PerformanceSummary, PerformanceThresholds,
};
pub use pipeline::{
    basic_pipeline, lemmatization_pipeline, ngram_pipeline, stemming_pipeline, BatchProcessor,
    NlpPipeline, PipelineBuilder, PipelineStep,
};
pub use pos_tagging::{
    PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
};
pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
pub use semantic_similarity::{
    LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
    WordMoversDistance,
};
pub use sentiment::{
    aggregate_sentiment, analyze_and_aggregate, AggregatedSentiment, AspectSentiment,
    AspectSentimentAnalyzer, LexiconSentimentAnalyzer, NaiveBayesSentiment,
    RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon, SentimentResult, SentimentRules,
    SentimentWordCounts, VaderResult, VaderSentimentAnalyzer,
};
pub use simd_ops::{
    AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
    TextProcessingResult,
};
pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
pub use sparse_vectorize::{
    sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
};
pub use spelling::{
    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
};
pub use stemming::{
    LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
    RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
};
pub use streaming::{
    AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
    ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
};
pub use string_metrics::{
    AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
    PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
};
pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
pub use text_coordinator::{
    AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
    AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
};
pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
pub use token_filter::{
    CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
    TokenFilter,
};
pub use tokenize::{
    bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
    CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
    WhitespaceTokenizer, WordTokenizer,
};
pub use tokenizer::{
    BPETokenizer, SimpleCharTokenizer, SimpleWhitespaceTokenizer, TransformerTokenizer,
    WordPieceTokenizer,
};
pub use topic_coherence::{TopicCoherence, TopicDiversity};
pub use topic_modeling::{
    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
};
pub use transformer::{
    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
    TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
    TransformerEncoderLayer, TransformerModel,
};
pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
pub use visualization::{
    AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
    TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
};
pub use vocabulary::Vocabulary;
pub use weighted_distance::{
    DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
    WeightedStringMetric,
};

// Re-exports for new modules
pub use keyword_extraction::{
    extract_keywords, Keyword, KeywordMethod, RakeKeywordExtractor, TextRankKeywordExtractor,
    TfIdfKeywordExtractor,
};
pub use language_detection::{
    detect_language, detect_language_with_strategy, DetectedLanguage, DetectionStrategy,
    LanguageDetectionOutput,
};
pub use named_entity_recognition::{extract_entities, NerEntity, NerEntityType, NerPatternConfig};
pub use text_similarity::{
    bm25_score, char_ngram_jaccard_similarity, edit_distance_similarity, jaccard_token_similarity,
    text_similarity, tfidf_cosine_similarity, Bm25Config, Bm25Scorer, SimilarityMethod,
    SimilarityResult, TfIdfCosineSimilarity,
};
pub use text_summarization::{
    score_position, score_textrank, score_tfidf, summarize, ScoredSentence, SummarizationMethod,
};