trueno_rag/
preprocess.rs

//! Query preprocessing strategies for improved retrieval.
//!
//! This module provides preprocessing techniques that make queries more effective:
//! - HyDE (Hypothetical Document Embeddings): generate a hypothetical answer and retrieve with it
//! - Multi-query expansion: expand a single query into multiple related queries
//! - Chaining: combine the output of several preprocessors
//! - Query analysis: detect the intent of a query and extract its keywords
6
7use crate::Result;
8
9/// A query preprocessor that transforms or expands queries before retrieval.
10pub trait QueryPreprocessor: Send + Sync {
11    /// Preprocess a query, potentially returning multiple expanded queries.
12    fn preprocess(&self, query: &str) -> Result<Vec<String>>;
13
14    /// Get the name of this preprocessor for debugging.
15    fn name(&self) -> &str;
16}
17
18/// No-op preprocessor that returns the query unchanged.
19#[derive(Debug, Clone, Default)]
20pub struct PassthroughPreprocessor;
21
22impl QueryPreprocessor for PassthroughPreprocessor {
23    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
24        Ok(vec![query.to_string()])
25    }
26
27    fn name(&self) -> &str {
28        "passthrough"
29    }
30}
31
32/// HyDE (Hypothetical Document Embeddings) preprocessor.
33///
34/// Instead of searching with the query directly, HyDE generates a hypothetical
35/// document that would answer the query, then uses that for retrieval.
36/// This can improve semantic matching by generating content in the same
37/// "language" as the documents being searched.
38#[derive(Debug, Clone)]
39pub struct HydePreprocessor<G: HypotheticalGenerator> {
40    generator: G,
41    include_original: bool,
42}
43
44/// Trait for generating hypothetical documents from queries.
45pub trait HypotheticalGenerator: Send + Sync {
46    /// Generate a hypothetical document that would answer the query.
47    fn generate(&self, query: &str) -> Result<String>;
48}
49
50impl<G: HypotheticalGenerator> HydePreprocessor<G> {
51    /// Create a new HyDE preprocessor with the given generator.
52    pub fn new(generator: G) -> Self {
53        Self { generator, include_original: false }
54    }
55
56    /// Include the original query alongside the hypothetical document.
57    #[must_use]
58    pub fn with_original_query(mut self, include: bool) -> Self {
59        self.include_original = include;
60        self
61    }
62}
63
64impl<G: HypotheticalGenerator> QueryPreprocessor for HydePreprocessor<G> {
65    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
66        let hypothetical = self.generator.generate(query)?;
67        if self.include_original {
68            Ok(vec![query.to_string(), hypothetical])
69        } else {
70            Ok(vec![hypothetical])
71        }
72    }
73
74    fn name(&self) -> &str {
75        "hyde"
76    }
77}
78
/// HyDE generator backed by Anthropic's Claude API.
///
/// Hypothetical documents are produced by prompting Claude to write a passage
/// that would answer the given query. Requires the `ANTHROPIC_API_KEY`
/// environment variable and the `eval` feature flag (the eval Anthropic client
/// is reused).
#[cfg(feature = "eval")]
pub struct AnthropicHypotheticalGenerator {
    /// Client used to issue completion requests.
    client: crate::eval::AnthropicClient,
    /// Runtime on which the completion call is driven.
    runtime: std::sync::Arc<tokio::runtime::Runtime>,
    /// Model identifier passed with each request.
    model: String,
    /// Token limit passed with each request.
    max_tokens: u32,
}
91
#[cfg(feature = "eval")]
impl std::fmt::Debug for AnthropicHypotheticalGenerator {
    /// Formats the generator, printing fixed placeholders for the client and
    /// the runtime instead of their contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("AnthropicHypotheticalGenerator");
        out.field("client", &"AnthropicClient{..}");
        out.field("runtime", &"Runtime{..}");
        out.field("model", &self.model);
        out.field("max_tokens", &self.max_tokens);
        out.finish()
    }
}
103
#[cfg(feature = "eval")]
impl AnthropicHypotheticalGenerator {
    /// Builds a generator from the environment.
    ///
    /// The client is obtained from `AnthropicClient::from_env` and a new tokio
    /// runtime is created for driving requests. The model starts as
    /// `claude-haiku-4-5-20251001` with a limit of 256 tokens; both can be
    /// changed with the `with_*` builders.
    ///
    /// # Errors
    ///
    /// Returns a message when the client cannot be built from the environment
    /// or when the tokio runtime cannot be created.
    pub fn from_env() -> std::result::Result<Self, String> {
        let client = crate::eval::AnthropicClient::from_env()?;
        let runtime = match tokio::runtime::Runtime::new() {
            Ok(runtime) => std::sync::Arc::new(runtime),
            Err(e) => return Err(format!("Failed to create tokio runtime: {e}")),
        };

        let generator = Self {
            client,
            runtime,
            model: String::from("claude-haiku-4-5-20251001"),
            max_tokens: 256,
        };
        Ok(generator)
    }

    /// Replaces the model identifier used for generation.
    #[must_use]
    pub fn with_model(mut self, model: impl Into<String>) -> Self {
        let model: String = model.into();
        self.model = model;
        self
    }

    /// Replaces the token limit used for the hypothetical document.
    #[must_use]
    pub fn with_max_tokens(mut self, max_tokens: u32) -> Self {
        self.max_tokens = max_tokens;
        self
    }
}
133
#[cfg(feature = "eval")]
impl HypotheticalGenerator for AnthropicHypotheticalGenerator {
    /// Asks the configured model for a short passage answering `query`.
    ///
    /// The request is driven to completion with `block_on` on the generator's
    /// own runtime, so this call does not return until the request finishes.
    ///
    /// NOTE(review): tokio documents `Runtime::block_on` as panicking when it
    /// is invoked from inside an asynchronous context — confirm that callers
    /// are synchronous.
    ///
    /// # Errors
    ///
    /// A failed completion is reported as `Error::InvalidConfig` with the
    /// message `HyDE generation failed: <cause>`.
    fn generate(&self, query: &str) -> Result<String> {
        let system = "You are a technical content generator. Given a user query, write a short \
            passage (2-4 sentences) that directly answers the query as if it were an excerpt from \
            a lecture transcript or technical document. Output ONLY the passage text, no preamble \
            or formatting.";

        let request = self.client.complete(&self.model, Some(system), query, self.max_tokens);

        self.runtime
            .block_on(request)
            .map(|completion| completion.text)
            .map_err(|e| crate::Error::InvalidConfig(format!("HyDE generation failed: {e}")))
    }
}
155
/// Mock HyDE generator for testing that creates a simple hypothetical answer.
///
/// The "document" is the configured prefix followed by the query, which makes
/// the output fully predictable in tests.
#[derive(Debug, Clone)]
pub struct MockHypotheticalGenerator {
    /// Text placed in front of the query in every generated document.
    prefix: String,
}

impl MockHypotheticalGenerator {
    /// Prefix used until [`with_prefix`](Self::with_prefix) overrides it.
    const DEFAULT_PREFIX: &'static str = "The answer is:";

    /// Create a new mock generator using the default prefix `"The answer is:"`.
    pub fn new() -> Self {
        Self { prefix: Self::DEFAULT_PREFIX.to_string() }
    }

    /// Set a custom prefix for generated hypotheticals.
    #[must_use]
    pub fn with_prefix(mut self, prefix: impl Into<String>) -> Self {
        self.prefix = prefix.into();
        self
    }
}

// `Default` is written by hand instead of derived: a derived impl would start
// with an empty prefix and therefore disagree with `new()`.
impl Default for MockHypotheticalGenerator {
    /// Equivalent to [`MockHypotheticalGenerator::new`].
    fn default() -> Self {
        Self::new()
    }
}
175
176impl HypotheticalGenerator for MockHypotheticalGenerator {
177    fn generate(&self, query: &str) -> Result<String> {
178        Ok(format!("{} {}", self.prefix, query))
179    }
180}
181
182/// Multi-query expansion preprocessor.
183///
184/// Expands a single query into multiple related queries using different
185/// strategies. This can help retrieve documents that match different
186/// phrasings or aspects of the original query.
187#[derive(Debug, Clone)]
188pub struct MultiQueryPreprocessor<E: QueryExpander> {
189    expander: E,
190    max_queries: usize,
191    include_original: bool,
192}
193
194/// Trait for expanding queries into multiple related queries.
195pub trait QueryExpander: Send + Sync {
196    /// Expand a query into multiple related queries.
197    fn expand(&self, query: &str) -> Result<Vec<String>>;
198}
199
200impl<E: QueryExpander> MultiQueryPreprocessor<E> {
201    /// Create a new multi-query preprocessor.
202    pub fn new(expander: E) -> Self {
203        Self { expander, max_queries: 5, include_original: true }
204    }
205
206    /// Set the maximum number of expanded queries.
207    #[must_use]
208    pub fn with_max_queries(mut self, max: usize) -> Self {
209        self.max_queries = max;
210        self
211    }
212
213    /// Whether to include the original query in results.
214    #[must_use]
215    pub fn with_original_query(mut self, include: bool) -> Self {
216        self.include_original = include;
217        self
218    }
219}
220
221impl<E: QueryExpander> QueryPreprocessor for MultiQueryPreprocessor<E> {
222    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
223        let mut queries = if self.include_original { vec![query.to_string()] } else { vec![] };
224
225        let expanded = self.expander.expand(query)?;
226        for q in expanded {
227            if queries.len() >= self.max_queries {
228                break;
229            }
230            // Skip duplicates and, when original is excluded, skip expansions
231            // that are identical to the original query
232            if queries.contains(&q) || (!self.include_original && q == query) {
233                continue;
234            }
235            queries.push(q);
236        }
237
238        Ok(queries)
239    }
240
241    fn name(&self) -> &str {
242        "multi-query"
243    }
244}
245
246/// Keyword-based query expander.
247///
248/// Expands queries by extracting key terms and creating variations.
249#[derive(Debug, Clone, Default)]
250pub struct KeywordExpander {
251    stopwords: std::collections::HashSet<String>,
252}
253
254impl KeywordExpander {
255    /// Create a new keyword expander with default stopwords.
256    pub fn new() -> Self {
257        let stopwords: std::collections::HashSet<String> = [
258            "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "have", "has",
259            "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "must",
260            "shall", "can", "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
261            "into", "through", "during", "before", "after", "above", "below", "between", "under",
262            "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
263            "all", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not",
264            "only", "own", "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
265            "because", "until", "while", "what", "which", "who", "this", "that", "these", "those",
266            "i", "me", "my", "myself", "we", "our", "you", "your", "he", "him", "she", "her", "it",
267            "its", "they", "them", "their",
268        ]
269        .iter()
270        .map(|s| (*s).to_string())
271        .collect();
272
273        Self { stopwords }
274    }
275
276    /// Extract keywords from text.
277    fn extract_keywords(&self, text: &str) -> Vec<String> {
278        text.to_lowercase()
279            .split(|c: char| !c.is_alphanumeric())
280            .filter(|w| !w.is_empty() && w.len() > 2 && !self.stopwords.contains(*w))
281            .map(String::from)
282            .collect()
283    }
284}
285
286impl QueryExpander for KeywordExpander {
287    fn expand(&self, query: &str) -> Result<Vec<String>> {
288        let keywords = self.extract_keywords(query);
289        let mut expansions = Vec::new();
290
291        // Create query from just keywords
292        if keywords.len() > 1 {
293            expansions.push(keywords.join(" "));
294        }
295
296        // Create queries with individual important keywords emphasized
297        for keyword in keywords.iter().take(3) {
298            expansions.push(format!("{query} {keyword}"));
299        }
300
301        Ok(expansions)
302    }
303}
304
/// Query expander that swaps words for their synonyms.
#[derive(Clone, Debug)]
pub struct SynonymExpander {
    /// Maps a word to its replacement candidates.
    synonyms: std::collections::HashMap<String, Vec<String>>,
}

impl SynonymExpander {
    /// Builds an expander from a caller-supplied synonym map.
    pub fn new(synonyms: std::collections::HashMap<String, Vec<String>>) -> Self {
        Self { synonyms }
    }

    /// Builds an expander preloaded with a small set of technical synonyms.
    pub fn with_technical_synonyms() -> Self {
        const TECHNICAL: &[(&str, &[&str])] = &[
            ("error", &["exception", "failure", "bug"]),
            ("function", &["method", "procedure"]),
            ("create", &["make", "build", "generate"]),
            ("delete", &["remove", "destroy"]),
            ("update", &["modify", "change", "edit"]),
            ("find", &["search", "lookup", "locate"]),
            ("fast", &["quick", "rapid", "speedy"]),
            ("slow", &["sluggish", "delayed"]),
        ];

        let synonyms = TECHNICAL
            .iter()
            .map(|&(word, alternatives)| {
                let alternatives: Vec<String> =
                    alternatives.iter().map(|alt| (*alt).to_string()).collect();
                (word.to_string(), alternatives)
            })
            .collect();

        Self { synonyms }
    }
}

impl Default for SynonymExpander {
    /// Same as [`SynonymExpander::with_technical_synonyms`].
    fn default() -> Self {
        Self::with_technical_synonyms()
    }
}
355
356impl QueryExpander for SynonymExpander {
357    fn expand(&self, query: &str) -> Result<Vec<String>> {
358        let mut expansions = Vec::new();
359        let words: Vec<&str> = query.split_whitespace().collect();
360
361        for (i, word) in words.iter().enumerate() {
362            let lower = word.to_lowercase();
363            if let Some(syns) = self.synonyms.get(&lower) {
364                for syn in syns.iter().take(2) {
365                    let mut new_words = words.clone();
366                    new_words[i] = syn;
367                    expansions.push(new_words.join(" "));
368                }
369            }
370        }
371
372        Ok(expansions)
373    }
374}
375
376/// Chained preprocessor that applies multiple preprocessors in sequence.
377#[allow(missing_debug_implementations)]
378pub struct ChainedPreprocessor {
379    preprocessors: Vec<Box<dyn QueryPreprocessor>>,
380    deduplicate: bool,
381    max_total: usize,
382}
383
384impl ChainedPreprocessor {
385    /// Create a new chained preprocessor.
386    pub fn new() -> Self {
387        Self { preprocessors: Vec::new(), deduplicate: true, max_total: 10 }
388    }
389
390    /// Add a preprocessor to the chain.
391    pub fn add<P: QueryPreprocessor + 'static>(mut self, preprocessor: P) -> Self {
392        self.preprocessors.push(Box::new(preprocessor));
393        self
394    }
395
396    /// Set maximum total queries to return.
397    #[must_use]
398    pub fn with_max_total(mut self, max: usize) -> Self {
399        self.max_total = max;
400        self
401    }
402
403    /// Whether to deduplicate queries.
404    #[must_use]
405    pub fn with_deduplicate(mut self, dedup: bool) -> Self {
406        self.deduplicate = dedup;
407        self
408    }
409}
410
411impl Default for ChainedPreprocessor {
412    fn default() -> Self {
413        Self::new()
414    }
415}
416
417impl QueryPreprocessor for ChainedPreprocessor {
418    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
419        if self.preprocessors.is_empty() {
420            return Ok(vec![query.to_string()]);
421        }
422
423        let mut all_queries = Vec::new();
424
425        for preprocessor in &self.preprocessors {
426            let queries = preprocessor.preprocess(query)?;
427            for q in queries {
428                if all_queries.len() >= self.max_total {
429                    break;
430                }
431                if !self.deduplicate || !all_queries.contains(&q) {
432                    all_queries.push(q);
433                }
434            }
435        }
436
437        Ok(all_queries)
438    }
439
440    fn name(&self) -> &str {
441        "chained"
442    }
443}
444
445/// Query analyzer that extracts structured information from queries.
446#[derive(Debug, Clone)]
447pub struct QueryAnalyzer {
448    intent_keywords: std::collections::HashMap<QueryIntent, Vec<String>>,
449}
450
451/// Detected intent of a query.
452#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
453pub enum QueryIntent {
454    /// Looking for information or explanation.
455    Informational,
456    /// Looking for how to do something.
457    HowTo,
458    /// Looking for a definition.
459    Definition,
460    /// Looking for troubleshooting help.
461    Troubleshooting,
462    /// Looking to compare options.
463    Comparison,
464    /// Unknown intent.
465    Unknown,
466}
467
468/// Analysis result for a query.
469#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
470pub struct QueryAnalysis {
471    /// Original query text.
472    pub original: String,
473    /// Detected intent.
474    pub intent: QueryIntent,
475    /// Extracted keywords.
476    pub keywords: Vec<String>,
477    /// Confidence score (0.0 - 1.0).
478    pub confidence: f32,
479}
480
481impl QueryAnalyzer {
482    /// Create a new query analyzer with default intent patterns.
483    pub fn new() -> Self {
484        let mut intent_keywords = std::collections::HashMap::new();
485
486        intent_keywords.insert(
487            QueryIntent::HowTo,
488            vec![
489                "how".to_string(),
490                "tutorial".to_string(),
491                "guide".to_string(),
492                "steps".to_string(),
493                "way".to_string(),
494            ],
495        );
496
497        intent_keywords.insert(
498            QueryIntent::Definition,
499            vec![
500                "what".to_string(),
501                "define".to_string(),
502                "meaning".to_string(),
503                "definition".to_string(),
504            ],
505        );
506
507        intent_keywords.insert(
508            QueryIntent::Troubleshooting,
509            vec![
510                "error".to_string(),
511                "fix".to_string(),
512                "problem".to_string(),
513                "issue".to_string(),
514                "not working".to_string(),
515                "failed".to_string(),
516                "broken".to_string(),
517            ],
518        );
519
520        intent_keywords.insert(
521            QueryIntent::Comparison,
522            vec![
523                "vs".to_string(),
524                "versus".to_string(),
525                "compare".to_string(),
526                "difference".to_string(),
527                "better".to_string(),
528            ],
529        );
530
531        Self { intent_keywords }
532    }
533
534    /// Analyze a query and return structured information.
535    pub fn analyze(&self, query: &str) -> QueryAnalysis {
536        let lower = query.to_lowercase();
537        let mut best_intent = QueryIntent::Informational;
538        let mut best_score = 0;
539
540        for (intent, keywords) in &self.intent_keywords {
541            let score = keywords.iter().filter(|kw| lower.contains(kw.as_str())).count();
542            if score > best_score {
543                best_score = score;
544                best_intent = *intent;
545            }
546        }
547
548        // Extract keywords
549        let keywords: Vec<String> = lower
550            .split(|c: char| !c.is_alphanumeric())
551            .filter(|w| !w.is_empty() && w.len() > 2)
552            .map(String::from)
553            .collect();
554
555        let confidence =
556            if best_score == 0 { 0.3 } else { (0.5 + 0.1 * best_score as f32).min(1.0) };
557
558        QueryAnalysis {
559            original: query.to_string(),
560            intent: if best_score == 0 { QueryIntent::Unknown } else { best_intent },
561            keywords,
562            confidence,
563        }
564    }
565}
566
567impl Default for QueryAnalyzer {
568    fn default() -> Self {
569        Self::new()
570    }
571}
572
#[cfg(test)]
mod tests {
    use super::*;

    // ---- PassthroughPreprocessor ----

    #[test]
    fn test_passthrough_returns_original() {
        let queries = PassthroughPreprocessor.preprocess("test query").unwrap();
        assert_eq!(queries, vec!["test query"]);
    }

    #[test]
    fn test_passthrough_name() {
        assert_eq!(PassthroughPreprocessor.name(), "passthrough");
    }

    // ---- HydePreprocessor ----

    #[test]
    fn test_hyde_generates_hypothetical() {
        let hyde = HydePreprocessor::new(MockHypotheticalGenerator::new());
        let queries = hyde.preprocess("what is rust").unwrap();
        assert_eq!(queries.len(), 1);

        let hypothetical = &queries[0];
        assert!(hypothetical.contains("The answer is:"));
        assert!(hypothetical.contains("what is rust"));
    }

    #[test]
    fn test_hyde_with_original() {
        let mock = MockHypotheticalGenerator::new();
        let hyde = HydePreprocessor::new(mock).with_original_query(true);
        let queries = hyde.preprocess("test query").unwrap();
        assert_eq!(queries.len(), 2);
        assert_eq!(queries[0], "test query");
        assert!(queries[1].contains("The answer is:"));
    }

    #[test]
    fn test_hyde_custom_prefix() {
        let mock = MockHypotheticalGenerator::new().with_prefix("Answer:");
        let queries = HydePreprocessor::new(mock).preprocess("query").unwrap();
        assert!(queries[0].starts_with("Answer:"));
    }

    #[test]
    fn test_hyde_name() {
        let hyde = HydePreprocessor::new(MockHypotheticalGenerator::new());
        assert_eq!(hyde.name(), "hyde");
    }

    // ---- MultiQueryPreprocessor ----

    #[test]
    fn test_multi_query_with_keyword_expander() {
        let multi = MultiQueryPreprocessor::new(KeywordExpander::new());
        let queries = multi.preprocess("rust programming language").unwrap();
        assert!(!queries.is_empty());
        // The original query comes first.
        assert_eq!(queries[0], "rust programming language");
    }

    #[test]
    fn test_multi_query_max_queries() {
        let multi = MultiQueryPreprocessor::new(KeywordExpander::new()).with_max_queries(2);
        let queries = multi.preprocess("rust programming language").unwrap();
        assert!(queries.len() <= 2);
    }

    #[test]
    fn test_multi_query_without_original() {
        let multi =
            MultiQueryPreprocessor::new(KeywordExpander::new()).with_original_query(false);
        let queries = multi.preprocess("rust programming language").unwrap();
        assert!(queries.iter().all(|q| q.as_str() != "rust programming language"));
    }

    #[test]
    fn test_multi_query_name() {
        let multi = MultiQueryPreprocessor::new(KeywordExpander::new());
        assert_eq!(multi.name(), "multi-query");
    }

    // ---- KeywordExpander ----

    #[test]
    fn test_keyword_expander_extracts_keywords() {
        let keywords = KeywordExpander::new().extract_keywords("the quick brown fox jumps");
        for expected in ["quick", "brown", "jumps"].iter() {
            assert!(keywords.contains(&expected.to_string()));
        }
        // "the" is a stopword.
        assert!(!keywords.contains(&"the".to_string()));
    }

    #[test]
    fn test_keyword_expander_filters_short_words() {
        let keywords = KeywordExpander::new().extract_keywords("a go at it");
        assert!(keywords.iter().all(|w| w.len() > 2));
    }

    #[test]
    fn test_keyword_expander_expand() {
        let expansions = KeywordExpander::new().expand("rust memory safety").unwrap();
        assert!(!expansions.is_empty());
    }

    // ---- SynonymExpander ----

    #[test]
    fn test_synonym_expander_basic() {
        let expansions =
            SynonymExpander::with_technical_synonyms().expand("create a function").unwrap();
        assert!(!expansions.is_empty());
        // "create" should have been replaced by "make" or "build" in some variant.
        assert!(expansions.iter().any(|q| q.contains("make") || q.contains("build")));
    }

    #[test]
    fn test_synonym_expander_no_synonyms() {
        let expansions = SynonymExpander::with_technical_synonyms().expand("xyz abc def").unwrap();
        // None of these words has a synonym.
        assert!(expansions.is_empty());
    }

    #[test]
    fn test_synonym_expander_custom_synonyms() {
        let mut table = std::collections::HashMap::new();
        table.insert("test".to_string(), vec!["check".to_string()]);
        let expansions = SynonymExpander::new(table).expand("test code").unwrap();
        assert!(expansions.iter().any(|q| q.contains("check")));
    }

    // ---- ChainedPreprocessor ----

    #[test]
    fn test_chained_empty() {
        let queries = ChainedPreprocessor::new().preprocess("query").unwrap();
        assert_eq!(queries, vec!["query"]);
    }

    #[test]
    fn test_chained_single() {
        let chain = ChainedPreprocessor::new().add(PassthroughPreprocessor);
        let queries = chain.preprocess("query").unwrap();
        assert_eq!(queries, vec!["query"]);
    }

    #[test]
    fn test_chained_multiple() {
        let hyde = HydePreprocessor::new(MockHypotheticalGenerator::new());
        let chain = ChainedPreprocessor::new().add(PassthroughPreprocessor).add(hyde);
        let queries = chain.preprocess("query").unwrap();
        assert!(queries.len() >= 2);
        assert!(queries.contains(&"query".to_string()));
    }

    #[test]
    fn test_chained_deduplicates() {
        let chain = ChainedPreprocessor::new()
            .add(PassthroughPreprocessor)
            .add(PassthroughPreprocessor)
            .with_deduplicate(true);
        let queries = chain.preprocess("query").unwrap();
        // The duplicate is removed.
        assert_eq!(queries.len(), 1);
    }

    #[test]
    fn test_chained_max_total() {
        let multi = MultiQueryPreprocessor::new(KeywordExpander::new()).with_max_queries(10);
        let chain = ChainedPreprocessor::new().add(multi).with_max_total(3);
        let queries = chain.preprocess("rust programming language tutorial").unwrap();
        assert!(queries.len() <= 3);
    }

    #[test]
    fn test_chained_name() {
        assert_eq!(ChainedPreprocessor::new().name(), "chained");
    }

    // ---- QueryAnalyzer ----

    #[test]
    fn test_analyzer_how_to() {
        let analysis = QueryAnalyzer::new().analyze("how to write tests in rust");
        assert_eq!(analysis.intent, QueryIntent::HowTo);
        assert!(analysis.confidence > 0.5);
    }

    #[test]
    fn test_analyzer_definition() {
        let analysis = QueryAnalyzer::new().analyze("what is a monad");
        assert_eq!(analysis.intent, QueryIntent::Definition);
    }

    #[test]
    fn test_analyzer_troubleshooting() {
        let analysis = QueryAnalyzer::new().analyze("error compiling code fix");
        assert_eq!(analysis.intent, QueryIntent::Troubleshooting);
    }

    #[test]
    fn test_analyzer_comparison() {
        let analysis = QueryAnalyzer::new().analyze("rust vs go comparison");
        assert_eq!(analysis.intent, QueryIntent::Comparison);
    }

    #[test]
    fn test_analyzer_unknown() {
        let analysis = QueryAnalyzer::new().analyze("random words xyz");
        assert_eq!(analysis.intent, QueryIntent::Unknown);
        assert!(analysis.confidence < 0.5);
    }

    #[test]
    fn test_analyzer_extracts_keywords() {
        let analysis = QueryAnalyzer::new().analyze("rust programming language");
        for expected in ["rust", "programming", "language"].iter() {
            assert!(analysis.keywords.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_query_analysis_serialization() {
        let analysis = QueryAnalysis {
            original: "test".to_string(),
            intent: QueryIntent::HowTo,
            keywords: vec!["test".to_string()],
            confidence: 0.8,
        };

        let json = serde_json::to_string(&analysis).unwrap();
        let round_tripped: QueryAnalysis = serde_json::from_str(&json).unwrap();

        assert_eq!(round_tripped.original, "test");
        assert_eq!(round_tripped.intent, QueryIntent::HowTo);
    }

    // ---- Property-based tests ----
    use proptest::prelude::*;

    proptest! {
        #[test]
        fn prop_passthrough_preserves_input(query in "\\PC{1,100}") {
            let queries = PassthroughPreprocessor.preprocess(&query).unwrap();
            prop_assert_eq!(queries.len(), 1);
            prop_assert_eq!(&queries[0], &query);
        }

        #[test]
        fn prop_hyde_always_returns_something(query in "\\w{1,50}") {
            let hyde = HydePreprocessor::new(MockHypotheticalGenerator::new());
            let queries = hyde.preprocess(&query).unwrap();
            prop_assert!(!queries.is_empty());
        }

        #[test]
        fn prop_chained_respects_max_total(query in "\\w{1,50}", max in 1usize..20) {
            let chain = ChainedPreprocessor::new()
                .add(MultiQueryPreprocessor::new(KeywordExpander::new()))
                .add(HydePreprocessor::new(MockHypotheticalGenerator::new()))
                .with_max_total(max);
            let queries = chain.preprocess(&query).unwrap();
            prop_assert!(queries.len() <= max);
        }

        #[test]
        fn prop_analyzer_always_returns_analysis(query in "\\w{1,100}") {
            let analysis = QueryAnalyzer::new().analyze(&query);
            prop_assert_eq!(analysis.original, query);
            prop_assert!((0.0..=1.0).contains(&analysis.confidence));
        }

        #[test]
        fn prop_keyword_expander_no_empty_results(
            w1 in "[a-z]{4,10}",
            w2 in "[a-z]{4,10}",
            w3 in "[a-z]{4,10}"
        ) {
            let query = format!("{w1} {w2} {w3}");
            let expansions = KeywordExpander::new().expand(&query).unwrap();
            // Every expansion must be a non-empty string.
            for expansion in &expansions {
                prop_assert!(!expansion.is_empty());
            }
        }
    }
}
trueno_rag/preprocess.rs

trueno_rag/
preprocess.rs