1use crate::error::{MLError, Result};
8use crate::qnn::QuantumNeuralNetwork;
9use scirs2_core::ndarray::{Array1, Array2};
10use scirs2_core::random::prelude::*;
11use std::collections::HashMap;
12use std::fmt;
13
/// The kind of natural-language task a language model is configured for.
///
/// The type is a plain, field-less enum, so it also derives `Eq` and `Hash`
/// and can be used directly as a map key or set element.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NLPTaskType {
    /// Text classification.
    Classification,

    /// Sequence labeling.
    SequenceLabeling,

    /// Translation.
    Translation,

    /// Text generation.
    Generation,

    /// Sentiment analysis.
    SentimentAnalysis,

    /// Summarization.
    Summarization,
}
35
/// Identifies which word-embedding strategy a model was configured with.
///
/// The type is a plain, field-less enum, so it also derives `Eq` and `Hash`
/// and can be used directly as a map key or set element.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EmbeddingStrategy {
    /// Bag-of-words.
    BagOfWords,

    /// TF-IDF.
    TFIDF,

    /// Word2Vec.
    Word2Vec,

    /// Any strategy not covered by the other variants.
    Custom,
}
51
52impl From<usize> for EmbeddingStrategy {
53 fn from(value: usize) -> Self {
54 match value {
55 0 => EmbeddingStrategy::BagOfWords,
56 1 => EmbeddingStrategy::TFIDF,
57 2 => EmbeddingStrategy::Word2Vec,
58 _ => EmbeddingStrategy::Custom,
59 }
60 }
61}
62
63#[derive(Debug, Clone)]
65pub struct TextPreprocessor {
66 pub lowercase: bool,
68
69 pub remove_stopwords: bool,
71
72 pub lemmatize: bool,
74
75 pub stem: bool,
77
78 pub stopwords: Vec<String>,
80}
81
82impl TextPreprocessor {
83 pub fn new() -> Self {
85 TextPreprocessor {
86 lowercase: true,
87 remove_stopwords: true,
88 lemmatize: false,
89 stem: false,
90 stopwords: Vec::new(),
91 }
92 }
93
94 pub fn with_lowercase(mut self, lowercase: bool) -> Self {
96 self.lowercase = lowercase;
97 self
98 }
99
100 pub fn with_remove_stopwords(mut self, remove_stopwords: bool) -> Self {
102 self.remove_stopwords = remove_stopwords;
103 self
104 }
105
106 pub fn with_lemmatize(mut self, lemmatize: bool) -> Self {
108 self.lemmatize = lemmatize;
109 self
110 }
111
112 pub fn with_stem(mut self, stem: bool) -> Self {
114 self.stem = stem;
115 self
116 }
117
118 pub fn with_stopwords(mut self, stopwords: Vec<String>) -> Self {
120 self.stopwords = stopwords;
121 self
122 }
123
124 pub fn preprocess(&self, text: &str) -> Result<String> {
126 let mut processed = text.to_string();
130
131 if self.lowercase {
132 processed = processed.to_lowercase();
133 }
134
135 if self.remove_stopwords {
136 for stopword in &self.stopwords {
137 processed = processed.replace(stopword, "");
138 }
139 }
140
141 Ok(processed)
142 }
143
144 pub fn tokenize(&self, text: &str) -> Result<Vec<String>> {
146 let processed = self.preprocess(text)?;
150 let tokens = processed
151 .split_whitespace()
152 .map(|s| s.to_string())
153 .collect::<Vec<_>>();
154
155 Ok(tokens)
156 }
157}
158
159#[derive(Debug, Clone)]
161pub struct WordEmbedding {
162 pub strategy: EmbeddingStrategy,
164
165 pub dimension: usize,
167
168 pub embeddings: HashMap<String, Array1<f64>>,
170
171 pub vocabulary: Vec<String>,
173}
174
175impl WordEmbedding {
176 pub fn new(strategy: EmbeddingStrategy, dimension: usize) -> Self {
178 WordEmbedding {
179 strategy,
180 dimension,
181 embeddings: HashMap::new(),
182 vocabulary: Vec::new(),
183 }
184 }
185
186 pub fn fit(&mut self, corpus: &[&str]) -> Result<()> {
188 let mut vocabulary = HashMap::new();
192
193 for text in corpus {
195 for word in text.split_whitespace() {
196 let count = vocabulary.entry(word.to_string()).or_insert(0);
197 *count += 1;
198 }
199 }
200
201 let mut vocab_items = vocabulary
203 .iter()
204 .map(|(word, count)| (word.clone(), *count))
205 .collect::<Vec<_>>();
206
207 vocab_items.sort_by(|a, b| b.1.cmp(&a.1));
208
209 self.vocabulary = vocab_items
211 .iter()
212 .map(|(word, _)| word.clone())
213 .take(10000)
214 .collect();
215
216 for word in &self.vocabulary {
218 let embedding = Array1::from_vec(
219 (0..self.dimension)
220 .map(|_| thread_rng().random::<f64>() * 2.0 - 1.0)
221 .collect(),
222 );
223
224 self.embeddings.insert(word.clone(), embedding);
225 }
226
227 Ok(())
228 }
229
230 pub fn get_embedding(&self, word: &str) -> Option<&Array1<f64>> {
232 self.embeddings.get(word)
233 }
234
235 pub fn embed_text(&self, text: &str) -> Result<Array1<f64>> {
237 let words = text.split_whitespace().collect::<Vec<_>>();
241 let mut embedding = Array1::zeros(self.dimension);
242 let mut count = 0;
243
244 for word in words {
245 if let Some(word_embedding) = self.get_embedding(word) {
246 embedding += word_embedding;
247 count += 1;
248 }
249 }
250
251 if count > 0 {
252 embedding /= count as f64;
253 }
254
255 Ok(embedding)
256 }
257}
258
259#[derive(Debug, Clone)]
261pub struct QuantumLanguageModel {
262 pub num_qubits: usize,
264
265 pub embedding_strategy: EmbeddingStrategy,
267
268 pub preprocessor: TextPreprocessor,
270
271 pub embedding: WordEmbedding,
273
274 pub qnn: QuantumNeuralNetwork,
276
277 pub task: NLPTaskType,
279
280 pub labels: Vec<String>,
282}
283
284impl QuantumLanguageModel {
285 pub fn new(
287 num_qubits: usize,
288 embedding_dimension: usize,
289 strategy: EmbeddingStrategy,
290 task: NLPTaskType,
291 labels: Vec<String>,
292 ) -> Result<Self> {
293 let preprocessor = TextPreprocessor::new();
294 let embedding = WordEmbedding::new(strategy, embedding_dimension);
295
296 let layers = vec![
298 crate::qnn::QNNLayerType::EncodingLayer {
299 num_features: embedding_dimension,
300 },
301 crate::qnn::QNNLayerType::VariationalLayer {
302 num_params: 2 * num_qubits,
303 },
304 crate::qnn::QNNLayerType::EntanglementLayer {
305 connectivity: "full".to_string(),
306 },
307 crate::qnn::QNNLayerType::VariationalLayer {
308 num_params: 2 * num_qubits,
309 },
310 crate::qnn::QNNLayerType::MeasurementLayer {
311 measurement_basis: "computational".to_string(),
312 },
313 ];
314
315 let output_dim = match task {
316 NLPTaskType::Classification | NLPTaskType::SentimentAnalysis => labels.len(),
317 NLPTaskType::SequenceLabeling => labels.len(),
318 NLPTaskType::Translation => embedding_dimension,
319 NLPTaskType::Generation => embedding_dimension,
320 NLPTaskType::Summarization => embedding_dimension,
321 };
322
323 let qnn = QuantumNeuralNetwork::new(layers, num_qubits, embedding_dimension, output_dim)?;
324
325 Ok(QuantumLanguageModel {
326 num_qubits,
327 embedding_strategy: strategy,
328 preprocessor,
329 embedding,
330 qnn,
331 task,
332 labels,
333 })
334 }
335
336 pub fn fit(&mut self, texts: &[&str], labels: &[usize]) -> Result<()> {
338 self.embedding.fit(texts)?;
340
341 let mut embeddings = Vec::with_capacity(texts.len());
343
344 for text in texts {
345 let embedding = self.embedding.embed_text(text)?;
346 embeddings.push(embedding);
347 }
348
349 let x_train = Array2::from_shape_vec(
351 (embeddings.len(), self.embedding.dimension),
352 embeddings.iter().flat_map(|e| e.iter().cloned()).collect(),
353 )
354 .map_err(|e| MLError::DataError(format!("Failed to create training data: {}", e)))?;
355
356 let y_train = Array1::from_vec(labels.iter().map(|&l| l as f64).collect());
358
359 self.qnn.train_1d(&x_train, &y_train, 100, 0.01)?;
361
362 Ok(())
363 }
364
365 pub fn predict(&self, text: &str) -> Result<(String, f64)> {
367 let embedding = self.embedding.embed_text(text)?;
369
370 let output = self.qnn.forward(&embedding)?;
372
373 let mut best_label = 0;
375 let mut best_score = output[0];
376
377 for i in 1..output.len() {
378 if output[i] > best_score {
379 best_score = output[i];
380 best_label = i;
381 }
382 }
383
384 if best_label < self.labels.len() {
385 Ok((self.labels[best_label].clone(), best_score))
386 } else {
387 Err(MLError::MLOperationError(format!(
388 "Invalid prediction index: {}",
389 best_label
390 )))
391 }
392 }
393}
394
395#[derive(Debug, Clone)]
397pub struct SentimentAnalyzer {
398 model: QuantumLanguageModel,
400}
401
402impl SentimentAnalyzer {
403 pub fn new(num_qubits: usize) -> Result<Self> {
405 let model = QuantumLanguageModel::new(
406 num_qubits,
407 32, EmbeddingStrategy::BagOfWords,
409 NLPTaskType::SentimentAnalysis,
410 vec![
411 "negative".to_string(),
412 "neutral".to_string(),
413 "positive".to_string(),
414 ],
415 )?;
416
417 Ok(SentimentAnalyzer { model })
418 }
419
420 pub fn analyze(&self, text: &str) -> Result<(String, f64)> {
422 self.model.predict(text)
423 }
424
425 pub fn train(&mut self, texts: &[&str], labels: &[usize]) -> Result<()> {
427 self.model.fit(texts, labels)
428 }
429}
430
431#[derive(Debug, Clone)]
433pub struct TextSummarizer {
434 model: QuantumLanguageModel,
436
437 max_length: usize,
439}
440
441impl TextSummarizer {
442 pub fn new(num_qubits: usize) -> Result<Self> {
444 let model = QuantumLanguageModel::new(
445 num_qubits,
446 64, EmbeddingStrategy::BagOfWords,
448 NLPTaskType::Summarization,
449 Vec::new(), )?;
451
452 Ok(TextSummarizer {
453 model,
454 max_length: 100,
455 })
456 }
457
458 pub fn with_max_length(mut self, max_length: usize) -> Self {
460 self.max_length = max_length;
461 self
462 }
463
464 pub fn summarize(&self, text: &str) -> Result<String> {
466 let sentences = text.split('.').collect::<Vec<_>>();
470 let num_sentences = sentences.len();
471
472 let num_summary_sentences = (num_sentences / 4).max(1);
474 let selected_indices = vec![0, num_sentences / 2, num_sentences - 1];
475
476 let mut summary = String::new();
477
478 for &index in selected_indices.iter().take(num_summary_sentences) {
479 if index < sentences.len() {
480 summary.push_str(sentences[index]);
481 summary.push('.');
482 }
483 }
484
485 if summary.len() > self.max_length {
487 let truncated = summary.chars().take(self.max_length).collect::<String>();
488 let last_space = truncated.rfind(' ').unwrap_or(truncated.len());
489 summary = truncated[..last_space].to_string();
490 summary.push_str("...");
491 }
492
493 Ok(summary)
494 }
495}
496
497impl fmt::Display for NLPTaskType {
498 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
499 match self {
500 NLPTaskType::Classification => write!(f, "Classification"),
501 NLPTaskType::SequenceLabeling => write!(f, "Sequence Labeling"),
502 NLPTaskType::Translation => write!(f, "Translation"),
503 NLPTaskType::Generation => write!(f, "Generation"),
504 NLPTaskType::SentimentAnalysis => write!(f, "Sentiment Analysis"),
505 NLPTaskType::Summarization => write!(f, "Summarization"),
506 }
507 }
508}
509
510impl fmt::Display for EmbeddingStrategy {
511 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
512 match self {
513 EmbeddingStrategy::BagOfWords => write!(f, "Bag of Words"),
514 EmbeddingStrategy::TFIDF => write!(f, "TF-IDF"),
515 EmbeddingStrategy::Word2Vec => write!(f, "Word2Vec"),
516 EmbeddingStrategy::Custom => write!(f, "Custom"),
517 }
518 }
519}
520
521impl QuantumLanguageModel {
523 pub fn build_vocabulary(&mut self, texts: &[String]) -> Result<usize> {
525 let vocab_size = texts
528 .iter()
529 .flat_map(|text| text.split_whitespace())
530 .collect::<std::collections::HashSet<_>>()
531 .len();
532
533 Ok(vocab_size)
534 }
535
536 pub fn train_embeddings(&mut self, texts: &[String]) -> Result<()> {
538 println!(
541 " Training embeddings for {} texts with strategy: {}",
542 texts.len(),
543 self.embedding_strategy
544 );
545
546 Ok(())
547 }
548
549 pub fn train(
551 &mut self,
552 texts: &[String],
553 labels: &[usize],
554 epochs: usize,
555 learning_rate: f64,
556 ) -> Result<()> {
557 let num_samples = texts.len();
559 let mut features = Array2::zeros((num_samples, self.embedding.dimension));
560
561 for (i, text) in texts.iter().enumerate() {
563 let feature_vec = text
565 .chars()
566 .enumerate()
567 .map(|(j, c)| (c as u32 % 8) as f64 / 8.0 + j as f64 * 0.001)
568 .take(self.embedding.dimension)
569 .collect::<Vec<_>>();
570
571 for (j, &val) in feature_vec
572 .iter()
573 .enumerate()
574 .take(self.embedding.dimension)
575 {
576 if j < features.ncols() {
577 features[[i, j]] = val;
578 }
579 }
580 }
581
582 let y_train = Array1::from_vec(labels.iter().map(|&l| l as f64).collect());
584
585 self.qnn
587 .train_1d(&features, &y_train, epochs, learning_rate)?;
588
589 Ok(())
590 }
591
592 pub fn classify(&self, text: &str) -> Result<(String, f64)> {
594 let hash = text.chars().map(|c| c as u32).sum::<u32>();
598 let class_idx = (hash % self.labels.len() as u32) as usize;
599 let confidence = 0.7 + 0.3 * (hash % 100) as f64 / 100.0;
600
601 Ok((self.labels[class_idx].clone(), confidence))
602 }
603}