1use crate::error::{MLError, Result};
2use crate::qnn::QuantumNeuralNetwork;
3use scirs2_core::ndarray::{Array1, Array2};
4use scirs2_core::random::prelude::*;
5use std::collections::HashMap;
6use std::fmt;
7
/// Kind of natural-language task a model in this module is configured for.
///
/// The enum is a plain tag: it is `Copy`, comparable and hashable, so it can
/// be used directly as a key in maps and sets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NLPTaskType {
    /// Text classification.
    Classification,

    /// Sequence labeling.
    SequenceLabeling,

    /// Translation.
    Translation,

    /// Text generation.
    Generation,

    /// Sentiment analysis.
    SentimentAnalysis,

    /// Summarization.
    Summarization,
}
29
/// Tag naming the text-embedding scheme a model was configured with.
///
/// The enum is `Copy`, comparable and hashable, so it can be used directly as
/// a key in maps and sets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EmbeddingStrategy {
    /// Bag-of-words embedding.
    BagOfWords,

    /// TF-IDF embedding.
    TFIDF,

    /// Word2Vec embedding.
    Word2Vec,

    /// Any other, caller-defined embedding.
    Custom,
}

impl From<usize> for EmbeddingStrategy {
    /// Maps a numeric code to a strategy.
    ///
    /// `0`, `1` and `2` select `BagOfWords`, `TFIDF` and `Word2Vec`
    /// respectively. The conversion is deliberately total: every other value
    /// falls back to `Custom` instead of failing.
    fn from(value: usize) -> Self {
        match value {
            0 => EmbeddingStrategy::BagOfWords,
            1 => EmbeddingStrategy::TFIDF,
            2 => EmbeddingStrategy::Word2Vec,
            _ => EmbeddingStrategy::Custom,
        }
    }
}
56
57#[derive(Debug, Clone)]
59pub struct TextPreprocessor {
60 pub lowercase: bool,
62
63 pub remove_stopwords: bool,
65
66 pub lemmatize: bool,
68
69 pub stem: bool,
71
72 pub stopwords: Vec<String>,
74}
75
76impl TextPreprocessor {
77 pub fn new() -> Self {
79 TextPreprocessor {
80 lowercase: true,
81 remove_stopwords: true,
82 lemmatize: false,
83 stem: false,
84 stopwords: Vec::new(),
85 }
86 }
87
88 pub fn with_lowercase(mut self, lowercase: bool) -> Self {
90 self.lowercase = lowercase;
91 self
92 }
93
94 pub fn with_remove_stopwords(mut self, remove_stopwords: bool) -> Self {
96 self.remove_stopwords = remove_stopwords;
97 self
98 }
99
100 pub fn with_lemmatize(mut self, lemmatize: bool) -> Self {
102 self.lemmatize = lemmatize;
103 self
104 }
105
106 pub fn with_stem(mut self, stem: bool) -> Self {
108 self.stem = stem;
109 self
110 }
111
112 pub fn with_stopwords(mut self, stopwords: Vec<String>) -> Self {
114 self.stopwords = stopwords;
115 self
116 }
117
118 pub fn preprocess(&self, text: &str) -> Result<String> {
120 let mut processed = text.to_string();
124
125 if self.lowercase {
126 processed = processed.to_lowercase();
127 }
128
129 if self.remove_stopwords {
130 for stopword in &self.stopwords {
131 processed = processed.replace(stopword, "");
132 }
133 }
134
135 Ok(processed)
136 }
137
138 pub fn tokenize(&self, text: &str) -> Result<Vec<String>> {
140 let processed = self.preprocess(text)?;
144 let tokens = processed
145 .split_whitespace()
146 .map(|s| s.to_string())
147 .collect::<Vec<_>>();
148
149 Ok(tokens)
150 }
151}
152
153#[derive(Debug, Clone)]
155pub struct WordEmbedding {
156 pub strategy: EmbeddingStrategy,
158
159 pub dimension: usize,
161
162 pub embeddings: HashMap<String, Array1<f64>>,
164
165 pub vocabulary: Vec<String>,
167}
168
169impl WordEmbedding {
170 pub fn new(strategy: EmbeddingStrategy, dimension: usize) -> Self {
172 WordEmbedding {
173 strategy,
174 dimension,
175 embeddings: HashMap::new(),
176 vocabulary: Vec::new(),
177 }
178 }
179
180 pub fn fit(&mut self, corpus: &[&str]) -> Result<()> {
182 let mut vocabulary = HashMap::new();
186
187 for text in corpus {
189 for word in text.split_whitespace() {
190 let count = vocabulary.entry(word.to_string()).or_insert(0);
191 *count += 1;
192 }
193 }
194
195 let mut vocab_items = vocabulary
197 .iter()
198 .map(|(word, count)| (word.clone(), *count))
199 .collect::<Vec<_>>();
200
201 vocab_items.sort_by(|a, b| b.1.cmp(&a.1));
202
203 self.vocabulary = vocab_items
205 .iter()
206 .map(|(word, _)| word.clone())
207 .take(10000)
208 .collect();
209
210 for word in &self.vocabulary {
212 let embedding = Array1::from_vec(
213 (0..self.dimension)
214 .map(|_| thread_rng().gen::<f64>() * 2.0 - 1.0)
215 .collect(),
216 );
217
218 self.embeddings.insert(word.clone(), embedding);
219 }
220
221 Ok(())
222 }
223
224 pub fn get_embedding(&self, word: &str) -> Option<&Array1<f64>> {
226 self.embeddings.get(word)
227 }
228
229 pub fn embed_text(&self, text: &str) -> Result<Array1<f64>> {
231 let words = text.split_whitespace().collect::<Vec<_>>();
235 let mut embedding = Array1::zeros(self.dimension);
236 let mut count = 0;
237
238 for word in words {
239 if let Some(word_embedding) = self.get_embedding(word) {
240 embedding += word_embedding;
241 count += 1;
242 }
243 }
244
245 if count > 0 {
246 embedding /= count as f64;
247 }
248
249 Ok(embedding)
250 }
251}
252
253#[derive(Debug, Clone)]
255pub struct QuantumLanguageModel {
256 pub num_qubits: usize,
258
259 pub embedding_strategy: EmbeddingStrategy,
261
262 pub preprocessor: TextPreprocessor,
264
265 pub embedding: WordEmbedding,
267
268 pub qnn: QuantumNeuralNetwork,
270
271 pub task: NLPTaskType,
273
274 pub labels: Vec<String>,
276}
277
278impl QuantumLanguageModel {
279 pub fn new(
281 num_qubits: usize,
282 embedding_dimension: usize,
283 strategy: EmbeddingStrategy,
284 task: NLPTaskType,
285 labels: Vec<String>,
286 ) -> Result<Self> {
287 let preprocessor = TextPreprocessor::new();
288 let embedding = WordEmbedding::new(strategy, embedding_dimension);
289
290 let layers = vec![
292 crate::qnn::QNNLayerType::EncodingLayer {
293 num_features: embedding_dimension,
294 },
295 crate::qnn::QNNLayerType::VariationalLayer {
296 num_params: 2 * num_qubits,
297 },
298 crate::qnn::QNNLayerType::EntanglementLayer {
299 connectivity: "full".to_string(),
300 },
301 crate::qnn::QNNLayerType::VariationalLayer {
302 num_params: 2 * num_qubits,
303 },
304 crate::qnn::QNNLayerType::MeasurementLayer {
305 measurement_basis: "computational".to_string(),
306 },
307 ];
308
309 let output_dim = match task {
310 NLPTaskType::Classification | NLPTaskType::SentimentAnalysis => labels.len(),
311 NLPTaskType::SequenceLabeling => labels.len(),
312 NLPTaskType::Translation => embedding_dimension,
313 NLPTaskType::Generation => embedding_dimension,
314 NLPTaskType::Summarization => embedding_dimension,
315 };
316
317 let qnn = QuantumNeuralNetwork::new(layers, num_qubits, embedding_dimension, output_dim)?;
318
319 Ok(QuantumLanguageModel {
320 num_qubits,
321 embedding_strategy: strategy,
322 preprocessor,
323 embedding,
324 qnn,
325 task,
326 labels,
327 })
328 }
329
330 pub fn fit(&mut self, texts: &[&str], labels: &[usize]) -> Result<()> {
332 self.embedding.fit(texts)?;
334
335 let mut embeddings = Vec::with_capacity(texts.len());
337
338 for text in texts {
339 let embedding = self.embedding.embed_text(text)?;
340 embeddings.push(embedding);
341 }
342
343 let x_train = Array2::from_shape_vec(
345 (embeddings.len(), self.embedding.dimension),
346 embeddings.iter().flat_map(|e| e.iter().cloned()).collect(),
347 )
348 .map_err(|e| MLError::DataError(format!("Failed to create training data: {}", e)))?;
349
350 let y_train = Array1::from_vec(labels.iter().map(|&l| l as f64).collect());
352
353 self.qnn.train_1d(&x_train, &y_train, 100, 0.01)?;
355
356 Ok(())
357 }
358
359 pub fn predict(&self, text: &str) -> Result<(String, f64)> {
361 let embedding = self.embedding.embed_text(text)?;
363
364 let output = self.qnn.forward(&embedding)?;
366
367 let mut best_label = 0;
369 let mut best_score = output[0];
370
371 for i in 1..output.len() {
372 if output[i] > best_score {
373 best_score = output[i];
374 best_label = i;
375 }
376 }
377
378 if best_label < self.labels.len() {
379 Ok((self.labels[best_label].clone(), best_score))
380 } else {
381 Err(MLError::MLOperationError(format!(
382 "Invalid prediction index: {}",
383 best_label
384 )))
385 }
386 }
387}
388
389#[derive(Debug, Clone)]
391pub struct SentimentAnalyzer {
392 model: QuantumLanguageModel,
394}
395
396impl SentimentAnalyzer {
397 pub fn new(num_qubits: usize) -> Result<Self> {
399 let model = QuantumLanguageModel::new(
400 num_qubits,
401 32, EmbeddingStrategy::BagOfWords,
403 NLPTaskType::SentimentAnalysis,
404 vec![
405 "negative".to_string(),
406 "neutral".to_string(),
407 "positive".to_string(),
408 ],
409 )?;
410
411 Ok(SentimentAnalyzer { model })
412 }
413
414 pub fn analyze(&self, text: &str) -> Result<(String, f64)> {
416 self.model.predict(text)
417 }
418
419 pub fn train(&mut self, texts: &[&str], labels: &[usize]) -> Result<()> {
421 self.model.fit(texts, labels)
422 }
423}
424
425#[derive(Debug, Clone)]
427pub struct TextSummarizer {
428 model: QuantumLanguageModel,
430
431 max_length: usize,
433}
434
435impl TextSummarizer {
436 pub fn new(num_qubits: usize) -> Result<Self> {
438 let model = QuantumLanguageModel::new(
439 num_qubits,
440 64, EmbeddingStrategy::BagOfWords,
442 NLPTaskType::Summarization,
443 Vec::new(), )?;
445
446 Ok(TextSummarizer {
447 model,
448 max_length: 100,
449 })
450 }
451
452 pub fn with_max_length(mut self, max_length: usize) -> Self {
454 self.max_length = max_length;
455 self
456 }
457
458 pub fn summarize(&self, text: &str) -> Result<String> {
460 let sentences = text.split('.').collect::<Vec<_>>();
464 let num_sentences = sentences.len();
465
466 let num_summary_sentences = (num_sentences / 4).max(1);
468 let selected_indices = vec![0, num_sentences / 2, num_sentences - 1];
469
470 let mut summary = String::new();
471
472 for &index in selected_indices.iter().take(num_summary_sentences) {
473 if index < sentences.len() {
474 summary.push_str(sentences[index]);
475 summary.push('.');
476 }
477 }
478
479 if summary.len() > self.max_length {
481 let truncated = summary.chars().take(self.max_length).collect::<String>();
482 let last_space = truncated.rfind(' ').unwrap_or(truncated.len());
483 summary = truncated[..last_space].to_string();
484 summary.push_str("...");
485 }
486
487 Ok(summary)
488 }
489}
490
491impl fmt::Display for NLPTaskType {
492 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
493 match self {
494 NLPTaskType::Classification => write!(f, "Classification"),
495 NLPTaskType::SequenceLabeling => write!(f, "Sequence Labeling"),
496 NLPTaskType::Translation => write!(f, "Translation"),
497 NLPTaskType::Generation => write!(f, "Generation"),
498 NLPTaskType::SentimentAnalysis => write!(f, "Sentiment Analysis"),
499 NLPTaskType::Summarization => write!(f, "Summarization"),
500 }
501 }
502}
503
504impl fmt::Display for EmbeddingStrategy {
505 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
506 match self {
507 EmbeddingStrategy::BagOfWords => write!(f, "Bag of Words"),
508 EmbeddingStrategy::TFIDF => write!(f, "TF-IDF"),
509 EmbeddingStrategy::Word2Vec => write!(f, "Word2Vec"),
510 EmbeddingStrategy::Custom => write!(f, "Custom"),
511 }
512 }
513}
514
515impl QuantumLanguageModel {
517 pub fn build_vocabulary(&mut self, texts: &[String]) -> Result<usize> {
519 let vocab_size = texts
522 .iter()
523 .flat_map(|text| text.split_whitespace())
524 .collect::<std::collections::HashSet<_>>()
525 .len();
526
527 Ok(vocab_size)
528 }
529
530 pub fn train_embeddings(&mut self, texts: &[String]) -> Result<()> {
532 println!(
535 " Training embeddings for {} texts with strategy: {}",
536 texts.len(),
537 self.embedding_strategy
538 );
539
540 Ok(())
541 }
542
543 pub fn train(
545 &mut self,
546 texts: &[String],
547 labels: &[usize],
548 epochs: usize,
549 learning_rate: f64,
550 ) -> Result<()> {
551 let num_samples = texts.len();
553 let mut features = Array2::zeros((num_samples, self.embedding.dimension));
554
555 for (i, text) in texts.iter().enumerate() {
557 let feature_vec = text
559 .chars()
560 .enumerate()
561 .map(|(j, c)| (c as u32 % 8) as f64 / 8.0 + j as f64 * 0.001)
562 .take(self.embedding.dimension)
563 .collect::<Vec<_>>();
564
565 for (j, &val) in feature_vec
566 .iter()
567 .enumerate()
568 .take(self.embedding.dimension)
569 {
570 if j < features.ncols() {
571 features[[i, j]] = val;
572 }
573 }
574 }
575
576 let y_train = Array1::from_vec(labels.iter().map(|&l| l as f64).collect());
578
579 self.qnn
581 .train_1d(&features, &y_train, epochs, learning_rate)?;
582
583 Ok(())
584 }
585
586 pub fn classify(&self, text: &str) -> Result<(String, f64)> {
588 let hash = text.chars().map(|c| c as u32).sum::<u32>();
592 let class_idx = (hash % self.labels.len() as u32) as usize;
593 let confidence = 0.7 + 0.3 * (hash % 100) as f64 / 100.0;
594
595 Ok((self.labels[class_idx].clone(), confidence))
596 }
597}