use crate::error::{MLError, Result};
use crate::qnn::QuantumNeuralNetwork;
use ndarray::{Array1, Array2};
use std::collections::HashMap;
use std::fmt;

/// The NLP task a quantum language model is configured for.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum NLPTaskType {
    /// Text classification
    Classification,

    /// Sequence labeling (e.g. part-of-speech tagging)
    SequenceLabeling,

    /// Machine translation
    Translation,

    /// Text generation
    Generation,

    /// Sentiment analysis
    SentimentAnalysis,

    /// Text summarization
    Summarization,
}

/// Strategy used to embed text into numerical vectors.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum EmbeddingStrategy {
    /// Bag-of-words counts
    BagOfWords,

    /// Term frequency-inverse document frequency
    TFIDF,

    /// Word2Vec-style dense embeddings
    Word2Vec,

    /// User-supplied embedding
    Custom,
}

impl From<usize> for EmbeddingStrategy {
    fn from(value: usize) -> Self {
        match value {
            0 => EmbeddingStrategy::BagOfWords,
            1 => EmbeddingStrategy::TFIDF,
            2 => EmbeddingStrategy::Word2Vec,
            _ => EmbeddingStrategy::Custom,
        }
    }
}
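
// A minimal check of the `usize` -> `EmbeddingStrategy` mapping above;
// per the `From` impl, values outside 0..=2 fall back to `Custom`.
#[cfg(test)]
mod embedding_strategy_tests {
    use super::*;

    #[test]
    fn from_usize_maps_known_and_fallback_values() {
        assert_eq!(EmbeddingStrategy::from(0), EmbeddingStrategy::BagOfWords);
        assert_eq!(EmbeddingStrategy::from(1), EmbeddingStrategy::TFIDF);
        assert_eq!(EmbeddingStrategy::from(2), EmbeddingStrategy::Word2Vec);
        assert_eq!(EmbeddingStrategy::from(99), EmbeddingStrategy::Custom);
    }
}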

/// Configurable text preprocessing pipeline.
#[derive(Debug, Clone)]
pub struct TextPreprocessor {
    /// Convert text to lowercase
    pub lowercase: bool,

    /// Remove stopwords from the text
    pub remove_stopwords: bool,

    /// Lemmatize tokens (not yet applied in `preprocess`)
    pub lemmatize: bool,

    /// Stem tokens (not yet applied in `preprocess`)
    pub stem: bool,

    /// Stopword list used when `remove_stopwords` is set
    pub stopwords: Vec<String>,
}

impl TextPreprocessor {
    /// Create a preprocessor with default settings.
    pub fn new() -> Self {
        TextPreprocessor {
            lowercase: true,
            remove_stopwords: true,
            lemmatize: false,
            stem: false,
            stopwords: Vec::new(),
        }
    }

    /// Set whether to lowercase the input.
    pub fn with_lowercase(mut self, lowercase: bool) -> Self {
        self.lowercase = lowercase;
        self
    }

    /// Set whether to remove stopwords.
    pub fn with_remove_stopwords(mut self, remove_stopwords: bool) -> Self {
        self.remove_stopwords = remove_stopwords;
        self
    }

    /// Set whether to lemmatize tokens.
    pub fn with_lemmatize(mut self, lemmatize: bool) -> Self {
        self.lemmatize = lemmatize;
        self
    }

    /// Set whether to stem tokens.
    pub fn with_stem(mut self, stem: bool) -> Self {
        self.stem = stem;
        self
    }

    /// Set the stopword list.
    pub fn with_stopwords(mut self, stopwords: Vec<String>) -> Self {
        self.stopwords = stopwords;
        self
    }

    /// Apply the configured preprocessing steps to the input text.
    pub fn preprocess(&self, text: &str) -> Result<String> {
        let mut processed = text.to_string();

        if self.lowercase {
            processed = processed.to_lowercase();
        }

        if self.remove_stopwords {
            // Filter whole tokens rather than substrings, so a stopword such
            // as "a" does not mangle words that merely contain it.
            processed = processed
                .split_whitespace()
                .filter(|token| !self.stopwords.iter().any(|s| s == token))
                .collect::<Vec<_>>()
                .join(" ");
        }

        Ok(processed)
    }

    /// Preprocess the text and split it into whitespace-delimited tokens.
    pub fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        let processed = self.preprocess(text)?;
        let tokens = processed
            .split_whitespace()
            .map(|s| s.to_string())
            .collect::<Vec<_>>();

        Ok(tokens)
    }
}
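
// A small sketch exercising the preprocessor; the stopword list here is
// illustrative, not a built-in default.
#[cfg(test)]
mod preprocessor_tests {
    use super::*;

    #[test]
    fn lowercases_and_drops_whole_stopwords() {
        let pre = TextPreprocessor::new()
            .with_stopwords(vec!["the".to_string(), "a".to_string()]);

        let tokens = pre.tokenize("The cat ate a mango").unwrap();
        // "the" and "a" are removed as whole tokens; the rest survive intact.
        assert_eq!(tokens, vec!["cat", "ate", "mango"]);
    }
}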

/// Word-level embedding table built from a corpus.
#[derive(Debug, Clone)]
pub struct WordEmbedding {
    /// Embedding strategy in use
    pub strategy: EmbeddingStrategy,

    /// Dimensionality of each word vector
    pub dimension: usize,

    /// Map from word to its embedding vector
    pub embeddings: HashMap<String, Array1<f64>>,

    /// Vocabulary, ordered by descending corpus frequency
    pub vocabulary: Vec<String>,
}

impl WordEmbedding {
    /// Create an empty embedding table.
    pub fn new(strategy: EmbeddingStrategy, dimension: usize) -> Self {
        WordEmbedding {
            strategy,
            dimension,
            embeddings: HashMap::new(),
            vocabulary: Vec::new(),
        }
    }

    /// Build the vocabulary from a corpus and initialize word vectors.
    pub fn fit(&mut self, corpus: &[&str]) -> Result<()> {
        // Count word frequencies across the corpus.
        let mut vocabulary: HashMap<String, usize> = HashMap::new();

        for text in corpus {
            for word in text.split_whitespace() {
                *vocabulary.entry(word.to_string()).or_insert(0) += 1;
            }
        }

        // Keep the most frequent words, capped at 10,000 entries.
        let mut vocab_items: Vec<(String, usize)> = vocabulary.into_iter().collect();
        vocab_items.sort_by(|a, b| b.1.cmp(&a.1));

        self.vocabulary = vocab_items
            .into_iter()
            .map(|(word, _)| word)
            .take(10_000)
            .collect();

        // Placeholder initialization: vectors are drawn uniformly from
        // [-1, 1) regardless of the configured strategy.
        for word in &self.vocabulary {
            let embedding = Array1::from_vec(
                (0..self.dimension)
                    .map(|_| rand::random::<f64>() * 2.0 - 1.0)
                    .collect(),
            );

            self.embeddings.insert(word.clone(), embedding);
        }

        Ok(())
    }

    /// Look up the embedding for a single word, if it is in the vocabulary.
    pub fn get_embedding(&self, word: &str) -> Option<&Array1<f64>> {
        self.embeddings.get(word)
    }

    /// Embed a text as the mean of its in-vocabulary word vectors.
    pub fn embed_text(&self, text: &str) -> Result<Array1<f64>> {
        let mut embedding = Array1::zeros(self.dimension);
        let mut count = 0;

        for word in text.split_whitespace() {
            if let Some(word_embedding) = self.get_embedding(word) {
                embedding += word_embedding;
                count += 1;
            }
        }

        if count > 0 {
            embedding /= count as f64;
        }

        Ok(embedding)
    }
}
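
// A minimal sketch of the fit/embed cycle: after fitting, every corpus word
// should have a vector of the configured dimension, and text made only of
// out-of-vocabulary words should embed to the zero vector.
#[cfg(test)]
mod word_embedding_tests {
    use super::*;

    #[test]
    fn fit_builds_vocabulary_and_embeds_text() {
        let mut emb = WordEmbedding::new(EmbeddingStrategy::BagOfWords, 8);
        emb.fit(&["quantum circuits", "quantum kernels"]).unwrap();

        assert_eq!(emb.vocabulary.len(), 3); // "quantum", "circuits", "kernels"
        assert_eq!(emb.get_embedding("quantum").unwrap().len(), 8);

        // Unknown words contribute nothing, so the mean stays at zero.
        let oov = emb.embed_text("unseen words").unwrap();
        assert!(oov.iter().all(|&x| x == 0.0));
    }
}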

/// A quantum language model: classical embeddings fed into a QNN.
#[derive(Debug, Clone)]
pub struct QuantumLanguageModel {
    /// Number of qubits in the underlying circuit
    pub num_qubits: usize,

    /// Embedding strategy in use
    pub embedding_strategy: EmbeddingStrategy,

    /// Text preprocessing pipeline
    pub preprocessor: TextPreprocessor,

    /// Word embedding table
    pub embedding: WordEmbedding,

    /// Quantum neural network backing the model
    pub qnn: QuantumNeuralNetwork,

    /// Task the model is configured for
    pub task: NLPTaskType,

    /// Output labels (for classification-style tasks)
    pub labels: Vec<String>,
}

impl QuantumLanguageModel {
    /// Construct a model for the given task, wiring up a default QNN
    /// architecture: an encoding layer, two variational layers with full
    /// entanglement between them, and a computational-basis measurement.
    pub fn new(
        num_qubits: usize,
        embedding_dimension: usize,
        strategy: EmbeddingStrategy,
        task: NLPTaskType,
        labels: Vec<String>,
    ) -> Result<Self> {
        let preprocessor = TextPreprocessor::new();
        let embedding = WordEmbedding::new(strategy, embedding_dimension);

        let layers = vec![
            crate::qnn::QNNLayerType::EncodingLayer {
                num_features: embedding_dimension,
            },
            crate::qnn::QNNLayerType::VariationalLayer {
                num_params: 2 * num_qubits,
            },
            crate::qnn::QNNLayerType::EntanglementLayer {
                connectivity: "full".to_string(),
            },
            crate::qnn::QNNLayerType::VariationalLayer {
                num_params: 2 * num_qubits,
            },
            crate::qnn::QNNLayerType::MeasurementLayer {
                measurement_basis: "computational".to_string(),
            },
        ];

        // Label-producing tasks output one score per label; sequence-to-sequence
        // style tasks output a vector in the embedding space.
        let output_dim = match task {
            NLPTaskType::Classification
            | NLPTaskType::SentimentAnalysis
            | NLPTaskType::SequenceLabeling => labels.len(),
            NLPTaskType::Translation | NLPTaskType::Generation | NLPTaskType::Summarization => {
                embedding_dimension
            }
        };

        let qnn = QuantumNeuralNetwork::new(layers, num_qubits, embedding_dimension, output_dim)?;

        Ok(QuantumLanguageModel {
            num_qubits,
            embedding_strategy: strategy,
            preprocessor,
            embedding,
            qnn,
            task,
            labels,
        })
    }

    /// Fit the embedding table and train the QNN on labeled texts.
    pub fn fit(&mut self, texts: &[&str], labels: &[usize]) -> Result<()> {
        self.embedding.fit(texts)?;

        let mut embeddings = Vec::with_capacity(texts.len());

        for text in texts {
            let embedding = self.embedding.embed_text(text)?;
            embeddings.push(embedding);
        }

        // Stack the per-text embeddings into an (n_samples, dimension) matrix.
        let x_train = Array2::from_shape_vec(
            (embeddings.len(), self.embedding.dimension),
            embeddings.iter().flat_map(|e| e.iter().cloned()).collect(),
        )
        .map_err(|e| MLError::DataError(format!("Failed to create training data: {}", e)))?;

        let y_train = Array1::from_vec(labels.iter().map(|&l| l as f64).collect());

        self.qnn.train_1d(&x_train, &y_train, 100, 0.01)?;

        Ok(())
    }

    /// Predict the most likely label for a text, returning the label and its score.
    pub fn predict(&self, text: &str) -> Result<(String, f64)> {
        let embedding = self.embedding.embed_text(text)?;

        let output = self.qnn.forward(&embedding)?;

        // Argmax over the output scores.
        let mut best_label = 0;
        let mut best_score = output[0];

        for i in 1..output.len() {
            if output[i] > best_score {
                best_score = output[i];
                best_label = i;
            }
        }

        if best_label < self.labels.len() {
            Ok((self.labels[best_label].clone(), best_score))
        } else {
            Err(MLError::MLOperationError(format!(
                "Invalid prediction index: {}",
                best_label
            )))
        }
    }
}

/// Three-class sentiment analyzer built on a quantum language model.
#[derive(Debug, Clone)]
pub struct SentimentAnalyzer {
    model: QuantumLanguageModel,
}

impl SentimentAnalyzer {
    /// Create a sentiment analyzer with negative/neutral/positive labels.
    pub fn new(num_qubits: usize) -> Result<Self> {
        let model = QuantumLanguageModel::new(
            num_qubits,
            32, // embedding dimension
            EmbeddingStrategy::BagOfWords,
            NLPTaskType::SentimentAnalysis,
            vec![
                "negative".to_string(),
                "neutral".to_string(),
                "positive".to_string(),
            ],
        )?;

        Ok(SentimentAnalyzer { model })
    }

    /// Predict the sentiment label and score for a text.
    pub fn analyze(&self, text: &str) -> Result<(String, f64)> {
        self.model.predict(text)
    }

    /// Train the underlying model on labeled texts.
    pub fn train(&mut self, texts: &[&str], labels: &[usize]) -> Result<()> {
        self.model.fit(texts, labels)
    }
}

/// Extractive text summarizer built on a quantum language model.
#[derive(Debug, Clone)]
pub struct TextSummarizer {
    model: QuantumLanguageModel,

    /// Maximum summary length in characters
    max_length: usize,
}

impl TextSummarizer {
    /// Create a summarizer with a default maximum length of 100 characters.
    pub fn new(num_qubits: usize) -> Result<Self> {
        let model = QuantumLanguageModel::new(
            num_qubits,
            64, // embedding dimension
            EmbeddingStrategy::BagOfWords,
            NLPTaskType::Summarization,
            Vec::new(), // summarization produces text, not labels
        )?;

        Ok(TextSummarizer {
            model,
            max_length: 100,
        })
    }

    /// Set the maximum summary length in characters.
    pub fn with_max_length(mut self, max_length: usize) -> Self {
        self.max_length = max_length;
        self
    }

    /// Produce an extractive summary by selecting the first, middle, and
    /// last sentences, then truncating to `max_length` characters.
    pub fn summarize(&self, text: &str) -> Result<String> {
        let sentences = text.split('.').collect::<Vec<_>>();
        let num_sentences = sentences.len();

        // Keep roughly a quarter of the sentences, and at least one.
        let num_summary_sentences = (num_sentences / 4).max(1);
        let selected_indices = vec![0, num_sentences / 2, num_sentences - 1];

        let mut summary = String::new();

        for &index in selected_indices.iter().take(num_summary_sentences) {
            if index < sentences.len() {
                summary.push_str(sentences[index]);
                summary.push('.');
            }
        }

        // Truncate at a word boundary, counting characters rather than bytes
        // so multi-byte input cannot be split mid-character.
        if summary.chars().count() > self.max_length {
            let truncated = summary.chars().take(self.max_length).collect::<String>();
            let last_space = truncated.rfind(' ').unwrap_or(truncated.len());
            summary = truncated[..last_space].to_string();
            summary.push_str("...");
        }

        Ok(summary)
    }
}

impl fmt::Display for NLPTaskType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            NLPTaskType::Classification => write!(f, "Classification"),
            NLPTaskType::SequenceLabeling => write!(f, "Sequence Labeling"),
            NLPTaskType::Translation => write!(f, "Translation"),
            NLPTaskType::Generation => write!(f, "Generation"),
            NLPTaskType::SentimentAnalysis => write!(f, "Sentiment Analysis"),
            NLPTaskType::Summarization => write!(f, "Summarization"),
        }
    }
}

impl fmt::Display for EmbeddingStrategy {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            EmbeddingStrategy::BagOfWords => write!(f, "Bag of Words"),
            EmbeddingStrategy::TFIDF => write!(f, "TF-IDF"),
            EmbeddingStrategy::Word2Vec => write!(f, "Word2Vec"),
            EmbeddingStrategy::Custom => write!(f, "Custom"),
        }
    }
}

impl QuantumLanguageModel {
    /// Count the unique whitespace-delimited tokens across the given texts.
    /// The count is returned but not stored on the model.
    pub fn build_vocabulary(&mut self, texts: &[String]) -> Result<usize> {
        let vocab_size = texts
            .iter()
            .flat_map(|text| text.split_whitespace())
            .collect::<std::collections::HashSet<_>>()
            .len();

        Ok(vocab_size)
    }

    /// Train the embedding table on the given texts.
    /// Currently a placeholder that only logs its inputs.
    pub fn train_embeddings(&mut self, texts: &[String]) -> Result<()> {
        println!(
            " Training embeddings for {} texts with strategy: {}",
            texts.len(),
            self.embedding_strategy
        );

        Ok(())
    }

    /// Train the QNN on the given texts and labels.
    pub fn train(
        &mut self,
        texts: &[String],
        labels: &[usize],
        epochs: usize,
        learning_rate: f64,
    ) -> Result<()> {
        let num_samples = texts.len();
        let mut features = Array2::zeros((num_samples, self.embedding.dimension));

        for (i, text) in texts.iter().enumerate() {
            // Placeholder featurization: values are derived from character
            // codes and positions rather than from the trained embeddings.
            let feature_vec = text
                .chars()
                .enumerate()
                .map(|(j, c)| (c as u32 % 8) as f64 / 8.0 + j as f64 * 0.001)
                .take(self.embedding.dimension)
                .collect::<Vec<_>>();

            for (j, &val) in feature_vec.iter().enumerate() {
                features[[i, j]] = val;
            }
        }

        let y_train = Array1::from_vec(labels.iter().map(|&l| l as f64).collect());

        self.qnn
            .train_1d(&features, &y_train, epochs, learning_rate)?;

        Ok(())
    }

    /// Classify a text into one of the configured labels.
    /// Currently a deterministic placeholder based on a character-sum hash,
    /// not an actual QNN inference.
    pub fn classify(&self, text: &str) -> Result<(String, f64)> {
        // Guard against modulo-by-zero when no labels are configured
        // (e.g. a model built for summarization).
        if self.labels.is_empty() {
            return Err(MLError::MLOperationError(
                "No labels configured for classification".to_string(),
            ));
        }

        let hash = text.chars().map(|c| c as u32).sum::<u32>();
        let class_idx = (hash % self.labels.len() as u32) as usize;
        let confidence = 0.7 + 0.3 * (hash % 100) as f64 / 100.0;

        Ok((self.labels[class_idx].clone(), confidence))
    }
}
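
// An end-to-end usage sketch. Training runs through the QNN backend in
// `crate::qnn`, so this test is `#[ignore]`d by default; run it manually
// once that backend is available in the test environment. The corpus and
// labels here are illustrative.
#[cfg(test)]
mod sentiment_tests {
    use super::*;

    #[test]
    #[ignore]
    fn train_and_analyze_roundtrip() {
        let mut analyzer = SentimentAnalyzer::new(4).unwrap();

        let texts = [
            "terrible awful bad",
            "fine okay average",
            "great superb wonderful",
        ];
        let labels = [0, 1, 2]; // negative, neutral, positive

        analyzer.train(&texts, &labels).unwrap();

        // The prediction must be one of the three configured labels,
        // with a finite score.
        let (label, score) = analyzer.analyze("superb wonderful").unwrap();
        assert!(["negative", "neutral", "positive"].contains(&label.as_str()));
        assert!(score.is_finite());
    }
}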