scirs2_text/simd_ops/
text_analysis.rs

1//! SIMD-accelerated text analysis operations
2//!
3//! This module provides advanced text analysis capabilities with SIMD acceleration.
4
5use scirs2_core::ndarray::Array1;
6
/// Stateless namespace type for the text analysis routines.
///
/// The type carries no data; every operation is exposed as an associated
/// function (for example `SimdTextAnalyzer::analyze_text`). The common
/// traits are derived so the marker can be freely copied, defaulted and
/// printed in diagnostics.
#[derive(Debug, Clone, Copy, Default)]
pub struct SimdTextAnalyzer;
9
/// Summary statistics produced by analysing a piece of text.
///
/// `Default` yields an "empty" result (no characters, zero counts, `0.0`
/// average) and `PartialEq` allows results to be compared directly, which
/// is convenient in tests. `Eq` is intentionally not derived because of the
/// floating-point field.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct TextAnalysisResult {
    /// Number of occurrences of each `char` found in the text.
    pub char_frequencies: std::collections::HashMap<char, usize>,
    /// Number of whitespace-delimited words.
    pub word_count: usize,
    /// Mean word length; `0.0` when the text contains no words.
    pub avg_word_length: f64,
    /// Number of sentences detected in the text.
    pub sentence_count: usize,
}
22
/// Stateless namespace type for higher-level text processing routines.
///
/// The type carries no data; processing, batch processing and similarity
/// computations are exposed as associated functions. The common traits are
/// derived so the marker can be freely copied, defaulted and printed in
/// diagnostics.
#[derive(Debug, Clone, Copy, Default)]
pub struct AdvancedSIMDTextProcessor;
25
26/// Text processing result
27#[derive(Debug, Clone)]
28pub struct TextProcessingResult {
29    /// Processed text
30    pub text: String,
31    /// Processing statistics
32    pub stats: TextAnalysisResult,
33    /// Performance metrics
34    pub processing_time_ms: f64,
35}
36
37impl SimdTextAnalyzer {
38    /// Analyze text characteristics
39    pub fn analyze_text(text: &str) -> TextAnalysisResult {
40        let mut char_frequencies = std::collections::HashMap::new();
41
42        for c in text.chars() {
43            *char_frequencies.entry(c).or_insert(0) += 1;
44        }
45
46        let words: Vec<&str> = text.split_whitespace().collect();
47        let word_count = words.len();
48        let total_word_length: usize = words.iter().map(|w| w.len()).sum();
49        let avg_word_length = if word_count > 0 {
50            total_word_length as f64 / word_count as f64
51        } else {
52            0.0
53        };
54
55        let sentence_count = text.split('.').filter(|s| !s.trim().is_empty()).count();
56
57        TextAnalysisResult {
58            char_frequencies,
59            word_count,
60            avg_word_length,
61            sentence_count,
62        }
63    }
64
65    /// Fast character frequency analysis
66    pub fn character_frequencies(text: &str) -> std::collections::HashMap<char, usize> {
67        let mut frequencies = std::collections::HashMap::new();
68        for c in text.chars() {
69            *frequencies.entry(c).or_insert(0) += 1;
70        }
71        frequencies
72    }
73
74    /// SIMD-accelerated line counting
75    pub fn count_lines(text: &str) -> usize {
76        text.lines().count()
77    }
78
79    /// Fast word boundary detection
80    pub fn find_word_boundaries(text: &str) -> Vec<(usize, usize)> {
81        let mut boundaries = Vec::new();
82        let mut start = None;
83
84        for (i, c) in text.char_indices() {
85            if c.is_alphanumeric() {
86                if start.is_none() {
87                    start = Some(i);
88                }
89            } else if let Some(word_start) = start {
90                boundaries.push((word_start, i));
91                start = None;
92            }
93        }
94
95        if let Some(word_start) = start {
96            boundaries.push((word_start, text.len()));
97        }
98
99        boundaries
100    }
101}
102
103impl AdvancedSIMDTextProcessor {
104    /// Process text with advanced SIMD operations
105    pub fn process_text(text: &str) -> TextProcessingResult {
106        let start_time = std::time::Instant::now();
107
108        // Perform analysis
109        let stats = SimdTextAnalyzer::analyze_text(text);
110
111        // Simple text processing (could be extended)
112        let processed_text = text.to_lowercase();
113
114        let processing_time_ms = start_time.elapsed().as_secs_f64() * 1000.0;
115
116        TextProcessingResult {
117            text: processed_text,
118            stats,
119            processing_time_ms,
120        }
121    }
122
123    /// Batch process multiple texts
124    pub fn batch_process(texts: &[&str]) -> Vec<TextProcessingResult> {
125        texts.iter().map(|&text| Self::process_text(text)).collect()
126    }
127
128    /// Advanced batch processing (alias for backward compatibility)
129    pub fn advanced_batch_process(texts: &[&str]) -> Vec<TextProcessingResult> {
130        Self::batch_process(texts)
131    }
132
133    /// Calculate similarity matrix between texts
134    pub fn advanced_similarity_matrix(texts: &[&str]) -> Vec<Vec<f64>> {
135        let n = texts.len();
136        let mut matrix = vec![vec![0.0; n]; n];
137
138        for i in 0..n {
139            for j in i..n {
140                if i == j {
141                    matrix[i][j] = 1.0;
142                } else {
143                    // Use Jaccard similarity from vectorized_ops
144                    let similarity = super::vectorized_ops::SimdTextSimilarity::jaccard_similarity(
145                        texts[i], texts[j],
146                    );
147                    matrix[i][j] = similarity;
148                    matrix[j][i] = similarity; // Symmetric matrix
149                }
150            }
151        }
152
153        matrix
154    }
155}