// Crate-wide lint configuration. These are inner attributes, so they apply to
// every module declared in this file.
//
// The four clippy lints below are silenced for the whole crate:
//   - `manual_strip`: hand-written prefix/suffix stripping
//   - `needless_range_loop`: index-based `for i in 0..n` loops
//   - `if_same_then_else`: `if`/`else` branches with identical bodies
//   - `cloned_ref_to_slice_refs`: `&[x.clone()]`-style one-element slices
1#![allow(clippy::manual_strip)]
2#![allow(clippy::needless_range_loop)]
3#![allow(clippy::if_same_then_else)]
4#![allow(clippy::cloned_ref_to_slice_refs)]
// Unused items do not produce warnings anywhere in the crate.
5#![allow(dead_code)]
// Public items without documentation produce a warning (not a hard error).
6#![warn(missing_docs)]
110
// Public submodules of the crate, declared in alphabetical order. Every module
// here is `pub`, so its path is part of the crate's public API in addition to
// the flattened re-exports further down in this file.
//
// NOTE(review): none of these declarations carries a `///` doc comment here,
// while `missing_docs` is set to warn at the top of the file — presumably each
// module documents itself with inner `//!` docs; confirm in the module files.
111pub mod classification;
112pub mod cleansing;
113pub mod distance;
114pub mod domain_processors;
115pub mod embeddings;
116pub mod enhanced_vectorize;
117pub mod error;
118pub mod huggingface_compat;
119pub mod information_extraction;
120pub mod language_model;
121pub mod lemmatization;
122pub mod ml_integration;
123pub mod ml_sentiment;
124pub mod model_registry;
125pub mod multilingual;
126pub mod neural_architectures;
127pub mod parallel;
128pub mod paraphrasing;
129pub mod performance;
130pub mod pipeline;
131pub mod pos_tagging;
132pub mod preprocess;
133pub mod semantic_similarity;
134pub mod sentiment;
135pub mod simd_ops;
136pub mod sparse;
137pub mod sparse_vectorize;
138pub mod spelling;
139pub mod stemming;
140pub mod streaming;
141pub mod string_metrics;
142pub mod summarization;
143pub mod text_coordinator;
144pub mod text_statistics;
// `tokenize` and `tokenizer` are two distinct modules; both are re-exported
// from further down in this file.
145pub mod token_filter;
146pub mod tokenize;
147pub mod tokenizer;
148pub mod topic_coherence;
149pub mod topic_modeling;
150pub mod transformer;
151pub mod utils;
152pub mod vectorize;
153pub mod visualization;
154pub mod vocabulary;
155pub mod weighted_distance;
156
// Second group of public submodules, kept apart from the alphabetical list
// above and sorted alphabetically within itself. Several names are close to
// modules in the first group (`text_summarization` vs `summarization`,
// `language_detection` vs `multilingual`'s `LanguageDetector` re-export), so
// take care to import from the intended one.
157pub mod keyword_extraction;
159pub mod language_detection;
160pub mod named_entity_recognition;
161pub mod text_similarity;
162pub mod text_summarization;
163
// Crate-root re-exports. Selected items from each submodule are lifted to the
// top level so callers can name them directly from the crate root instead of
// through the module path. The `pub use` statements follow the same order as
// the `pub mod` declarations.
164pub use classification::{
166 cross_validate_nb, BernoulliNaiveBayes, CrossValidationResult, FeatureHasher, FoldResult,
167 MultiLabelClassifier, MultiLabelPrediction, MultinomialNaiveBayes, TextClassificationMetrics,
168 TextClassificationPipeline, TextDataset, TextFeatureSelector, TfidfCosineClassifier,
169};
170pub use cleansing::{
171 expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
172 normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
173 replace_urls, strip_html_tags, AdvancedTextCleaner,
174};
175pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
176pub use domain_processors::{
177 Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
178 MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
179 ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
180};
// Besides items defined directly in `embeddings`, this also flattens items from
// its nested `fasttext` and `glove` submodules into the crate root.
181pub use embeddings::{
182 embedding_cosine_similarity,
183 fasttext::{FastText, FastTextConfig},
184 glove::{CooccurrenceMatrix, GloVe, GloVeTrainer, GloVeTrainerConfig},
185 pairwise_similarity, Word2Vec, Word2VecAlgorithm, Word2VecConfig, WordEmbedding,
186};
187pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
// NOTE(review): the re-exported `Result` has the same name as the prelude's
// `Result`; presumably it is a crate-specific alias built on `TextError` —
// confirm in the `error` module.
188pub use error::{Result, TextError};
// `huggingface_compat::TextClassificationPipeline` is re-exported under the
// alias `HfTextClassificationPipeline`: the `classification` re-exports in this
// file already place a `TextClassificationPipeline` at the crate root, so an
// unaliased re-export here would collide with it.
189pub use huggingface_compat::{
190 ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
191 FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
192 HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
193 TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
194};
// Crate-root re-exports for `information_extraction` through `multilingual`.
//
// `Topic` is re-exported from `information_extraction` under its own name; the
// `topic_modeling` re-export later in this file uses the alias `LdaTopic` for
// its own `Topic`, so the two do not clash.
195pub use information_extraction::{
196 AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
197 CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
198 EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
199 InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
200 MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
201 StructuredDocumentInformation, TemporalExtractor, Topic,
202};
// `NgramModel` (lowercase `g`) is a different identifier from the `NGramModel`
// that the `spelling` re-exports bring to the crate root.
203pub use language_model::{NgramModel, SmoothingMethod};
204pub use lemmatization::{Lemmatizer, RuleBasedLemmatizer, WordNetLemmatizer};
205pub use ml_integration::{
206 BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
207};
208pub use ml_sentiment::{
209 ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
210};
211pub use model_registry::{
212 ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
213 SerializableModelData,
214};
215pub use multilingual::{
216 Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
217 StopWords,
218};
// `LayerNorm` and `MultiHeadAttention` from `neural_architectures` are
// re-exported as `NeuralLayerNorm` and `NeuralMultiHeadAttention`. The
// `transformer` re-exports later in this file expose items with those same two
// names unaliased, so the aliases keep both sets usable from the crate root.
219pub use neural_architectures::{
220 ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
221 GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
222 MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
223 ResidualBlock1D, SelfAttention, TextCNN,
224};
// Crate-root re-exports for `parallel` through `topic_coherence`, in the same
// order as the corresponding `pub mod` declarations.
225pub use parallel::{
226 ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
227};
228pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
229pub use performance::{
230 AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
231 PerformanceSummary, PerformanceThresholds,
232};
233pub use pipeline::{
234 basic_pipeline, lemmatization_pipeline, ngram_pipeline, stemming_pipeline, BatchProcessor,
235 NlpPipeline, PipelineBuilder, PipelineStep,
236};
237pub use pos_tagging::{
238 PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
239};
240pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
241pub use semantic_similarity::{
242 LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
243 WordMoversDistance,
244};
245pub use sentiment::{
246 aggregate_sentiment, analyze_and_aggregate, AggregatedSentiment, AspectSentiment,
247 AspectSentimentAnalyzer, LexiconSentimentAnalyzer, NaiveBayesSentiment,
248 RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon, SentimentResult, SentimentRules,
249 SentimentWordCounts, VaderResult, VaderSentimentAnalyzer,
250};
251pub use simd_ops::{
252 AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
253 TextProcessingResult,
254};
255pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
256pub use sparse_vectorize::{
257 sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
258};
// `NGramModel` (capital `G`) is distinct from `language_model::NgramModel`,
// which is also re-exported at the crate root.
259pub use spelling::{
260 DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
261 SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
262};
// Note that the lemmatizer-related names (`LemmatizerConfig`, `RuleLemmatizer`,
// `RuleLemmatizerBuilder`, `SimpleLemmatizer`) and `PosTag` come from
// `stemming`, not from the `lemmatization` or `pos_tagging` modules.
263pub use stemming::{
264 LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
265 RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
266};
267pub use streaming::{
268 AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
269 ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
270};
271pub use string_metrics::{
272 AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
273 PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
274};
275pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
276pub use text_coordinator::{
277 AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
278 AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
279};
280pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
281pub use token_filter::{
282 CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
283 TokenFilter,
284};
// Two separate tokenizer modules are flattened here. `tokenize` (including its
// nested `bpe` submodule) contributes `BpeTokenizer`, while `tokenizer`
// contributes `BPETokenizer`; the names differ only in letter case, so both
// coexist at the crate root.
285pub use tokenize::{
286 bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
287 CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
288 WhitespaceTokenizer, WordTokenizer,
289};
290pub use tokenizer::{
291 BPETokenizer, SimpleCharTokenizer, SimpleWhitespaceTokenizer, TransformerTokenizer,
292 WordPieceTokenizer,
293};
294pub use topic_coherence::{TopicCoherence, TopicDiversity};
// `topic_modeling::Topic` is re-exported as `LdaTopic` because the
// `information_extraction` re-exports in this file already place a `Topic` at
// the crate root.
295pub use topic_modeling::{
296 LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
297};
// Crate-root re-exports for `transformer` through `weighted_distance`.
//
// `LayerNorm` and `MultiHeadAttention` are exported here under their plain
// names; the same-named items from `neural_architectures` reach the crate root
// only under their `Neural*` aliases.
298pub use transformer::{
299 FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
300 TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
301 TransformerEncoderLayer, TransformerModel,
302};
303pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
304pub use visualization::{
305 AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
306 TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
307};
308pub use vocabulary::Vocabulary;
309pub use weighted_distance::{
310 DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
311 WeightedStringMetric,
312};
313
// Crate-root re-exports for the second module group (`keyword_extraction`
// through `text_summarization`), listed in the same order as their `pub mod`
// declarations.
314pub use keyword_extraction::{
316 extract_keywords, Keyword, KeywordMethod, RakeKeywordExtractor, TextRankKeywordExtractor,
317 TfIdfKeywordExtractor,
318};
319pub use language_detection::{
320 detect_language, detect_language_with_strategy, DetectedLanguage, DetectionStrategy,
321 LanguageDetectionOutput,
322};
323pub use named_entity_recognition::{extract_entities, NerEntity, NerEntityType, NerPatternConfig};
// The item `text_similarity` re-exported below shares its name with the module
// it comes from.
// NOTE(review): presumably it is a free function, which would live in a
// different namespace from the module and so not conflict — confirm in
// `text_similarity`.
324pub use text_similarity::{
325 bm25_score, char_ngram_jaccard_similarity, edit_distance_similarity, jaccard_token_similarity,
326 text_similarity, tfidf_cosine_similarity, Bm25Config, Bm25Scorer, SimilarityMethod,
327 SimilarityResult, TfIdfCosineSimilarity,
328};
329pub use text_summarization::{
330 score_position, score_textrank, score_tfidf, summarize, ScoredSentence, SummarizationMethod,
331};