// scirs2_text/simd_ops/text_analysis.rs
use scirs2_core::ndarray::Array1;
6
/// Namespace type for the text-analysis routines in this file.
///
/// The type holds no data; its operations are associated functions invoked
/// as `SimdTextAnalyzer::name(..)`.
///
/// NOTE(review): despite the `Simd` prefix, the implementations visible in
/// this file are plain scalar code — confirm whether a vectorized path is
/// expected elsewhere.
#[derive(Debug, Clone, Copy, Default)]
pub struct SimdTextAnalyzer;
9
/// Summary statistics computed for a single piece of text.
///
/// `Default` yields an empty frequency table with every counter at zero,
/// and `PartialEq` allows results to be compared directly.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct TextAnalysisResult {
    /// Number of occurrences of each character found in the text.
    pub char_frequencies: std::collections::HashMap<char, usize>,
    /// Number of whitespace-delimited words.
    pub word_count: usize,
    /// Mean word length; `0.0` when the text contains no words.
    pub avg_word_length: f64,
    /// Number of sentences detected in the text.
    pub sentence_count: usize,
}
22
/// Namespace type for the whole-text processing helpers in this file.
///
/// The type holds no data; its operations are associated functions invoked
/// as `AdvancedSIMDTextProcessor::name(..)`.
#[derive(Debug, Clone, Copy, Default)]
pub struct AdvancedSIMDTextProcessor;
25
26#[derive(Debug, Clone)]
28pub struct TextProcessingResult {
29 pub text: String,
31 pub stats: TextAnalysisResult,
33 pub processing_time_ms: f64,
35}
36
37impl SimdTextAnalyzer {
38 pub fn analyze_text(text: &str) -> TextAnalysisResult {
40 let mut char_frequencies = std::collections::HashMap::new();
41
42 for c in text.chars() {
43 *char_frequencies.entry(c).or_insert(0) += 1;
44 }
45
46 let words: Vec<&str> = text.split_whitespace().collect();
47 let word_count = words.len();
48 let total_word_length: usize = words.iter().map(|w| w.len()).sum();
49 let avg_word_length = if word_count > 0 {
50 total_word_length as f64 / word_count as f64
51 } else {
52 0.0
53 };
54
55 let sentence_count = text.split('.').filter(|s| !s.trim().is_empty()).count();
56
57 TextAnalysisResult {
58 char_frequencies,
59 word_count,
60 avg_word_length,
61 sentence_count,
62 }
63 }
64
65 pub fn character_frequencies(text: &str) -> std::collections::HashMap<char, usize> {
67 let mut frequencies = std::collections::HashMap::new();
68 for c in text.chars() {
69 *frequencies.entry(c).or_insert(0) += 1;
70 }
71 frequencies
72 }
73
74 pub fn count_lines(text: &str) -> usize {
76 text.lines().count()
77 }
78
79 pub fn find_word_boundaries(text: &str) -> Vec<(usize, usize)> {
81 let mut boundaries = Vec::new();
82 let mut start = None;
83
84 for (i, c) in text.char_indices() {
85 if c.is_alphanumeric() {
86 if start.is_none() {
87 start = Some(i);
88 }
89 } else if let Some(word_start) = start {
90 boundaries.push((word_start, i));
91 start = None;
92 }
93 }
94
95 if let Some(word_start) = start {
96 boundaries.push((word_start, text.len()));
97 }
98
99 boundaries
100 }
101}
102
103impl AdvancedSIMDTextProcessor {
104 pub fn process_text(text: &str) -> TextProcessingResult {
106 let start_time = std::time::Instant::now();
107
108 let stats = SimdTextAnalyzer::analyze_text(text);
110
111 let processed_text = text.to_lowercase();
113
114 let processing_time_ms = start_time.elapsed().as_secs_f64() * 1000.0;
115
116 TextProcessingResult {
117 text: processed_text,
118 stats,
119 processing_time_ms,
120 }
121 }
122
123 pub fn batch_process(texts: &[&str]) -> Vec<TextProcessingResult> {
125 texts.iter().map(|&text| Self::process_text(text)).collect()
126 }
127
128 pub fn advanced_batch_process(texts: &[&str]) -> Vec<TextProcessingResult> {
130 Self::batch_process(texts)
131 }
132
133 pub fn advanced_similarity_matrix(texts: &[&str]) -> Vec<Vec<f64>> {
135 let n = texts.len();
136 let mut matrix = vec![vec![0.0; n]; n];
137
138 for i in 0..n {
139 for j in i..n {
140 if i == j {
141 matrix[i][j] = 1.0;
142 } else {
143 let similarity = super::vectorized_ops::SimdTextSimilarity::jaccard_similarity(
145 texts[i], texts[j],
146 );
147 matrix[i][j] = similarity;
148 matrix[j][i] = similarity; }
150 }
151 }
152
153 matrix
154 }
155}