trueno_rag/
preprocess.rs

1//! Query preprocessing strategies for improved retrieval.
2//!
3//! This module provides preprocessing techniques to enhance query effectiveness:
4//! - HyDE (Hypothetical Document Embeddings): Generate hypothetical answers for better matching
5//! - Multi-query expansion: Expand a single query into multiple related queries
6
7use crate::Result;
8
9/// A query preprocessor that transforms or expands queries before retrieval.
10pub trait QueryPreprocessor: Send + Sync {
11    /// Preprocess a query, potentially returning multiple expanded queries.
12    fn preprocess(&self, query: &str) -> Result<Vec<String>>;
13
14    /// Get the name of this preprocessor for debugging.
15    fn name(&self) -> &str;
16}
17
18/// No-op preprocessor that returns the query unchanged.
19#[derive(Debug, Clone, Default)]
20pub struct PassthroughPreprocessor;
21
22impl QueryPreprocessor for PassthroughPreprocessor {
23    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
24        Ok(vec![query.to_string()])
25    }
26
27    fn name(&self) -> &str {
28        "passthrough"
29    }
30}
31
32/// HyDE (Hypothetical Document Embeddings) preprocessor.
33///
34/// Instead of searching with the query directly, HyDE generates a hypothetical
35/// document that would answer the query, then uses that for retrieval.
36/// This can improve semantic matching by generating content in the same
37/// "language" as the documents being searched.
38#[derive(Debug, Clone)]
39pub struct HydePreprocessor<G: HypotheticalGenerator> {
40    generator: G,
41    include_original: bool,
42}
43
44/// Trait for generating hypothetical documents from queries.
45pub trait HypotheticalGenerator: Send + Sync {
46    /// Generate a hypothetical document that would answer the query.
47    fn generate(&self, query: &str) -> Result<String>;
48}
49
50impl<G: HypotheticalGenerator> HydePreprocessor<G> {
51    /// Create a new HyDE preprocessor with the given generator.
52    pub fn new(generator: G) -> Self {
53        Self { generator, include_original: false }
54    }
55
56    /// Include the original query alongside the hypothetical document.
57    #[must_use]
58    pub fn with_original_query(mut self, include: bool) -> Self {
59        self.include_original = include;
60        self
61    }
62}
63
64impl<G: HypotheticalGenerator> QueryPreprocessor for HydePreprocessor<G> {
65    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
66        let hypothetical = self.generator.generate(query)?;
67        if self.include_original {
68            Ok(vec![query.to_string(), hypothetical])
69        } else {
70            Ok(vec![hypothetical])
71        }
72    }
73
74    fn name(&self) -> &str {
75        "hyde"
76    }
77}
78
/// HyDE generator backed by the Anthropic (Claude) API.
///
/// Hypothetical documents are produced by prompting Claude to write a short
/// passage that would answer the query. Requires the `ANTHROPIC_API_KEY`
/// environment variable and the `eval` feature flag, since it reuses the
/// Anthropic client from the `eval` module.
#[cfg(feature = "eval")]
pub struct AnthropicHypotheticalGenerator {
    /// Client used to request completions.
    client: crate::eval::AnthropicClient,
    /// Runtime on which the client's async call is driven to completion.
    runtime: std::sync::Arc<tokio::runtime::Runtime>,
    /// Model identifier sent with each request.
    model: String,
    /// Token limit passed to each completion request.
    max_tokens: u32,
}
91
#[cfg(feature = "eval")]
impl std::fmt::Debug for AnthropicHypotheticalGenerator {
    /// Formats the generator; `client` and `runtime` are printed as fixed
    /// placeholder strings instead of their actual contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("AnthropicHypotheticalGenerator");
        out.field("client", &"AnthropicClient{..}");
        out.field("runtime", &"Runtime{..}");
        out.field("model", &self.model);
        out.field("max_tokens", &self.max_tokens);
        out.finish()
    }
}
103
#[cfg(feature = "eval")]
impl AnthropicHypotheticalGenerator {
    /// Builds a generator whose client is configured from the environment.
    ///
    /// Starts with model `claude-haiku-4-5-20251001` and a 256-token limit;
    /// both can be changed with the `with_*` builders.
    ///
    /// # Errors
    ///
    /// Returns a message when the client cannot be created from the
    /// environment or when the tokio runtime fails to start.
    pub fn from_env() -> std::result::Result<Self, String> {
        let client = crate::eval::AnthropicClient::from_env()?;
        let runtime = match tokio::runtime::Runtime::new() {
            Ok(rt) => std::sync::Arc::new(rt),
            Err(e) => return Err(format!("Failed to create tokio runtime: {e}")),
        };

        Ok(Self {
            client,
            runtime,
            model: String::from("claude-haiku-4-5-20251001"),
            max_tokens: 256,
        })
    }

    /// Overrides the model used for generation.
    #[must_use]
    pub fn with_model(mut self, model: impl Into<String>) -> Self {
        let chosen: String = model.into();
        self.model = chosen;
        self
    }

    /// Overrides the token limit for each hypothetical document.
    #[must_use]
    pub fn with_max_tokens(mut self, max_tokens: u32) -> Self {
        self.max_tokens = max_tokens;
        self
    }
}
133
#[cfg(feature = "eval")]
impl HypotheticalGenerator for AnthropicHypotheticalGenerator {
    /// Asks the configured model for a short passage answering `query`.
    ///
    /// The async client call is driven to completion on the generator's own
    /// runtime, so this method blocks the calling thread.
    ///
    /// NOTE(review): tokio's `Runtime::block_on` panics when invoked from
    /// inside an async execution context — confirm callers are synchronous.
    ///
    /// # Errors
    ///
    /// Any client failure is reported as `Error::InvalidConfig` with the
    /// message `HyDE generation failed: <cause>`.
    fn generate(&self, query: &str) -> Result<String> {
        const SYSTEM_PROMPT: &str = "You are a technical content generator. Given a user query, write a short \
            passage (2-4 sentences) that directly answers the query as if it were an excerpt from \
            a lecture transcript or technical document. Output ONLY the passage text, no preamble \
            or formatting.";

        let request =
            self.client.complete(&self.model, Some(SYSTEM_PROMPT), query, self.max_tokens);

        self.runtime
            .block_on(request)
            .map(|completion| completion.text)
            .map_err(|e| crate::Error::InvalidConfig(format!("HyDE generation failed: {e}")))
    }
}
155
156/// Mock HyDE generator for testing that creates a simple hypothetical answer.
157#[derive(Debug, Clone, Default)]
158pub struct MockHypotheticalGenerator {
159    prefix: String,
160}
161
162impl MockHypotheticalGenerator {
163    /// Create a new mock generator.
164    pub fn new() -> Self {
165        Self { prefix: "The answer is:".to_string() }
166    }
167
168    /// Set a custom prefix for generated hypotheticals.
169    #[must_use]
170    pub fn with_prefix(mut self, prefix: impl Into<String>) -> Self {
171        self.prefix = prefix.into();
172        self
173    }
174}
175
176impl HypotheticalGenerator for MockHypotheticalGenerator {
177    fn generate(&self, query: &str) -> Result<String> {
178        Ok(format!("{} {}", self.prefix, query))
179    }
180}
181
182/// Multi-query expansion preprocessor.
183///
184/// Expands a single query into multiple related queries using different
185/// strategies. This can help retrieve documents that match different
186/// phrasings or aspects of the original query.
187#[derive(Debug, Clone)]
188pub struct MultiQueryPreprocessor<E: QueryExpander> {
189    expander: E,
190    max_queries: usize,
191    include_original: bool,
192}
193
194/// Trait for expanding queries into multiple related queries.
195pub trait QueryExpander: Send + Sync {
196    /// Expand a query into multiple related queries.
197    fn expand(&self, query: &str) -> Result<Vec<String>>;
198}
199
200impl<E: QueryExpander> MultiQueryPreprocessor<E> {
201    /// Create a new multi-query preprocessor.
202    pub fn new(expander: E) -> Self {
203        Self { expander, max_queries: 5, include_original: true }
204    }
205
206    /// Set the maximum number of expanded queries.
207    #[must_use]
208    pub fn with_max_queries(mut self, max: usize) -> Self {
209        self.max_queries = max;
210        self
211    }
212
213    /// Whether to include the original query in results.
214    #[must_use]
215    pub fn with_original_query(mut self, include: bool) -> Self {
216        self.include_original = include;
217        self
218    }
219}
220
221impl<E: QueryExpander> QueryPreprocessor for MultiQueryPreprocessor<E> {
222    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
223        let mut queries = if self.include_original { vec![query.to_string()] } else { vec![] };
224
225        let expanded = self.expander.expand(query)?;
226        for q in expanded {
227            if queries.len() >= self.max_queries {
228                break;
229            }
230            // Skip duplicates and, when original is excluded, skip expansions
231            // that are identical to the original query
232            if queries.contains(&q) || (!self.include_original && q == query) {
233                continue;
234            }
235            queries.push(q);
236        }
237
238        Ok(queries)
239    }
240
241    fn name(&self) -> &str {
242        "multi-query"
243    }
244}
245
246/// Keyword-based query expander.
247///
248/// Expands queries by extracting key terms and creating variations.
249#[derive(Debug, Clone, Default)]
250pub struct KeywordExpander {
251    stopwords: std::collections::HashSet<String>,
252}
253
254impl KeywordExpander {
255    /// Create a new keyword expander with default stopwords.
256    pub fn new() -> Self {
257        let stopwords: std::collections::HashSet<String> = [
258            "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "have", "has",
259            "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "must",
260            "shall", "can", "to", "of", "in", "for", "on", "with", "at", "by", "from", "as",
261            "into", "through", "during", "before", "after", "above", "below", "between", "under",
262            "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
263            "all", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not",
264            "only", "own", "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
265            "because", "until", "while", "what", "which", "who", "this", "that", "these", "those",
266            "i", "me", "my", "myself", "we", "our", "you", "your", "he", "him", "she", "her", "it",
267            "its", "they", "them", "their",
268        ]
269        .iter()
270        .map(|s| (*s).to_string())
271        .collect();
272
273        Self { stopwords }
274    }
275
276    /// Extract keywords from text.
277    fn extract_keywords(&self, text: &str) -> Vec<String> {
278        text.to_lowercase()
279            .split(|c: char| !c.is_alphanumeric())
280            .filter(|w| !w.is_empty() && w.len() > 2 && !self.stopwords.contains(*w))
281            .map(String::from)
282            .collect()
283    }
284}
285
286impl QueryExpander for KeywordExpander {
287    fn expand(&self, query: &str) -> Result<Vec<String>> {
288        let keywords = self.extract_keywords(query);
289        let mut expansions = Vec::new();
290
291        // Create query from just keywords
292        if keywords.len() > 1 {
293            expansions.push(keywords.join(" "));
294        }
295
296        // Create queries with individual important keywords emphasized
297        for keyword in keywords.iter().take(3) {
298            expansions.push(format!("{query} {keyword}"));
299        }
300
301        Ok(expansions)
302    }
303}
304
305/// Synonym-based query expander.
306///
307/// Expands queries by replacing words with synonyms.
308#[derive(Debug, Clone)]
309pub struct SynonymExpander {
310    synonyms: std::collections::HashMap<String, Vec<String>>,
311}
312
313impl SynonymExpander {
314    /// Create a new synonym expander with the given synonym map.
315    pub fn new(synonyms: std::collections::HashMap<String, Vec<String>>) -> Self {
316        Self { synonyms }
317    }
318
319    /// Create an expander with default technical synonyms.
320    pub fn with_technical_synonyms() -> Self {
321        let mut synonyms = std::collections::HashMap::new();
322        synonyms.insert(
323            "error".to_string(),
324            vec!["exception".to_string(), "failure".to_string(), "bug".to_string()],
325        );
326        synonyms
327            .insert("function".to_string(), vec!["method".to_string(), "procedure".to_string()]);
328        synonyms.insert(
329            "create".to_string(),
330            vec!["make".to_string(), "build".to_string(), "generate".to_string()],
331        );
332        synonyms.insert("delete".to_string(), vec!["remove".to_string(), "destroy".to_string()]);
333        synonyms.insert(
334            "update".to_string(),
335            vec!["modify".to_string(), "change".to_string(), "edit".to_string()],
336        );
337        synonyms.insert(
338            "find".to_string(),
339            vec!["search".to_string(), "lookup".to_string(), "locate".to_string()],
340        );
341        synonyms.insert(
342            "fast".to_string(),
343            vec!["quick".to_string(), "rapid".to_string(), "speedy".to_string()],
344        );
345        synonyms.insert("slow".to_string(), vec!["sluggish".to_string(), "delayed".to_string()]);
346        Self { synonyms }
347    }
348}
349
350impl Default for SynonymExpander {
351    fn default() -> Self {
352        Self::with_technical_synonyms()
353    }
354}
355
356impl QueryExpander for SynonymExpander {
357    fn expand(&self, query: &str) -> Result<Vec<String>> {
358        let mut expansions = Vec::new();
359        let words: Vec<&str> = query.split_whitespace().collect();
360
361        for (i, word) in words.iter().enumerate() {
362            let lower = word.to_lowercase();
363            if let Some(syns) = self.synonyms.get(&lower) {
364                for syn in syns.iter().take(2) {
365                    let mut new_words = words.clone();
366                    new_words[i] = syn;
367                    expansions.push(new_words.join(" "));
368                }
369            }
370        }
371
372        Ok(expansions)
373    }
374}
375
376/// Chained preprocessor that applies multiple preprocessors in sequence.
377#[allow(missing_debug_implementations)]
378pub struct ChainedPreprocessor {
379    preprocessors: Vec<Box<dyn QueryPreprocessor>>,
380    deduplicate: bool,
381    max_total: usize,
382}
383
384impl ChainedPreprocessor {
385    /// Create a new chained preprocessor.
386    pub fn new() -> Self {
387        Self { preprocessors: Vec::new(), deduplicate: true, max_total: 10 }
388    }
389
390    /// Add a preprocessor to the chain.
391    #[must_use]
392    #[allow(clippy::should_implement_trait)]
393    pub fn add<P: QueryPreprocessor + 'static>(mut self, preprocessor: P) -> Self {
394        self.preprocessors.push(Box::new(preprocessor));
395        self
396    }
397
398    /// Set maximum total queries to return.
399    #[must_use]
400    pub fn with_max_total(mut self, max: usize) -> Self {
401        self.max_total = max;
402        self
403    }
404
405    /// Whether to deduplicate queries.
406    #[must_use]
407    pub fn with_deduplicate(mut self, dedup: bool) -> Self {
408        self.deduplicate = dedup;
409        self
410    }
411}
412
413impl Default for ChainedPreprocessor {
414    fn default() -> Self {
415        Self::new()
416    }
417}
418
419impl QueryPreprocessor for ChainedPreprocessor {
420    fn preprocess(&self, query: &str) -> Result<Vec<String>> {
421        if self.preprocessors.is_empty() {
422            return Ok(vec![query.to_string()]);
423        }
424
425        let mut all_queries = Vec::new();
426
427        for preprocessor in &self.preprocessors {
428            let queries = preprocessor.preprocess(query)?;
429            for q in queries {
430                if all_queries.len() >= self.max_total {
431                    break;
432                }
433                if !self.deduplicate || !all_queries.contains(&q) {
434                    all_queries.push(q);
435                }
436            }
437        }
438
439        Ok(all_queries)
440    }
441
442    fn name(&self) -> &str {
443        "chained"
444    }
445}
446
447/// Query analyzer that extracts structured information from queries.
448#[derive(Debug, Clone)]
449pub struct QueryAnalyzer {
450    intent_keywords: std::collections::HashMap<QueryIntent, Vec<String>>,
451}
452
453/// Detected intent of a query.
454#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
455pub enum QueryIntent {
456    /// Looking for information or explanation.
457    Informational,
458    /// Looking for how to do something.
459    HowTo,
460    /// Looking for a definition.
461    Definition,
462    /// Looking for troubleshooting help.
463    Troubleshooting,
464    /// Looking to compare options.
465    Comparison,
466    /// Unknown intent.
467    Unknown,
468}
469
470/// Analysis result for a query.
471#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
472pub struct QueryAnalysis {
473    /// Original query text.
474    pub original: String,
475    /// Detected intent.
476    pub intent: QueryIntent,
477    /// Extracted keywords.
478    pub keywords: Vec<String>,
479    /// Confidence score (0.0 - 1.0).
480    pub confidence: f32,
481}
482
483impl QueryAnalyzer {
484    /// Create a new query analyzer with default intent patterns.
485    pub fn new() -> Self {
486        let mut intent_keywords = std::collections::HashMap::new();
487
488        intent_keywords.insert(
489            QueryIntent::HowTo,
490            vec![
491                "how".to_string(),
492                "tutorial".to_string(),
493                "guide".to_string(),
494                "steps".to_string(),
495                "way".to_string(),
496            ],
497        );
498
499        intent_keywords.insert(
500            QueryIntent::Definition,
501            vec![
502                "what".to_string(),
503                "define".to_string(),
504                "meaning".to_string(),
505                "definition".to_string(),
506            ],
507        );
508
509        intent_keywords.insert(
510            QueryIntent::Troubleshooting,
511            vec![
512                "error".to_string(),
513                "fix".to_string(),
514                "problem".to_string(),
515                "issue".to_string(),
516                "not working".to_string(),
517                "failed".to_string(),
518                "broken".to_string(),
519            ],
520        );
521
522        intent_keywords.insert(
523            QueryIntent::Comparison,
524            vec![
525                "vs".to_string(),
526                "versus".to_string(),
527                "compare".to_string(),
528                "difference".to_string(),
529                "better".to_string(),
530            ],
531        );
532
533        Self { intent_keywords }
534    }
535
536    /// Analyze a query and return structured information.
537    pub fn analyze(&self, query: &str) -> QueryAnalysis {
538        let lower = query.to_lowercase();
539        let mut best_intent = QueryIntent::Informational;
540        let mut best_score = 0;
541
542        for (intent, keywords) in &self.intent_keywords {
543            let score = keywords.iter().filter(|kw| lower.contains(kw.as_str())).count();
544            if score > best_score {
545                best_score = score;
546                best_intent = *intent;
547            }
548        }
549
550        // Extract keywords
551        let keywords: Vec<String> = lower
552            .split(|c: char| !c.is_alphanumeric())
553            .filter(|w| !w.is_empty() && w.len() > 2)
554            .map(String::from)
555            .collect();
556
557        let confidence =
558            if best_score == 0 { 0.3 } else { (0.5 + 0.1 * best_score as f32).min(1.0) };
559
560        QueryAnalysis {
561            original: query.to_string(),
562            intent: if best_score == 0 { QueryIntent::Unknown } else { best_intent },
563            keywords,
564            confidence,
565        }
566    }
567}
568
569impl Default for QueryAnalyzer {
570    fn default() -> Self {
571        Self::new()
572    }
573}
574
#[cfg(test)]
mod tests {
    use super::*;

    // ---- PassthroughPreprocessor ----

    #[test]
    fn test_passthrough_returns_original() {
        let out = PassthroughPreprocessor.preprocess("test query").unwrap();
        assert_eq!(out, vec!["test query"]);
    }

    #[test]
    fn test_passthrough_name() {
        assert_eq!(PassthroughPreprocessor.name(), "passthrough");
    }

    // ---- HydePreprocessor ----

    #[test]
    fn test_hyde_generates_hypothetical() {
        let hyde = HydePreprocessor::new(MockHypotheticalGenerator::new());
        let out = hyde.preprocess("what is rust").unwrap();
        assert_eq!(out.len(), 1);
        let document = &out[0];
        assert!(document.contains("The answer is:"));
        assert!(document.contains("what is rust"));
    }

    #[test]
    fn test_hyde_with_original() {
        let hyde =
            HydePreprocessor::new(MockHypotheticalGenerator::new()).with_original_query(true);
        let out = hyde.preprocess("test query").unwrap();
        assert_eq!(out.len(), 2);
        assert_eq!(out[0], "test query");
        assert!(out[1].contains("The answer is:"));
    }

    #[test]
    fn test_hyde_custom_prefix() {
        let mock = MockHypotheticalGenerator::new().with_prefix("Answer:");
        let out = HydePreprocessor::new(mock).preprocess("query").unwrap();
        assert!(out[0].starts_with("Answer:"));
    }

    #[test]
    fn test_hyde_name() {
        let hyde = HydePreprocessor::new(MockHypotheticalGenerator::new());
        assert_eq!(hyde.name(), "hyde");
    }

    // ---- MultiQueryPreprocessor ----

    #[test]
    fn test_multi_query_with_keyword_expander() {
        let multi = MultiQueryPreprocessor::new(KeywordExpander::new());
        let out = multi.preprocess("rust programming language").unwrap();
        assert!(!out.is_empty());
        // The original query always comes first.
        assert_eq!(out[0], "rust programming language");
    }

    #[test]
    fn test_multi_query_max_queries() {
        let multi = MultiQueryPreprocessor::new(KeywordExpander::new()).with_max_queries(2);
        let out = multi.preprocess("rust programming language").unwrap();
        assert!(out.len() <= 2);
    }

    #[test]
    fn test_multi_query_without_original() {
        let multi =
            MultiQueryPreprocessor::new(KeywordExpander::new()).with_original_query(false);
        let out = multi.preprocess("rust programming language").unwrap();
        assert!(out.iter().all(|q| q != "rust programming language"));
    }

    #[test]
    fn test_multi_query_name() {
        let multi = MultiQueryPreprocessor::new(KeywordExpander::new());
        assert_eq!(multi.name(), "multi-query");
    }

    // ---- KeywordExpander ----

    #[test]
    fn test_keyword_expander_extracts_keywords() {
        let found = KeywordExpander::new().extract_keywords("the quick brown fox jumps");
        let has = |word: &str| found.iter().any(|k| k == word);
        assert!(has("quick"));
        assert!(has("brown"));
        assert!(has("jumps"));
        // "the" is a stopword.
        assert!(!has("the"));
    }

    #[test]
    fn test_keyword_expander_filters_short_words() {
        let found = KeywordExpander::new().extract_keywords("a go at it");
        assert!(found.iter().all(|w| w.len() > 2));
    }

    #[test]
    fn test_keyword_expander_expand() {
        let out = KeywordExpander::new().expand("rust memory safety").unwrap();
        assert!(!out.is_empty());
    }

    // ---- SynonymExpander ----

    #[test]
    fn test_synonym_expander_basic() {
        let out = SynonymExpander::with_technical_synonyms().expand("create a function").unwrap();
        assert!(!out.is_empty());
        // "create" should have been swapped for "make" or "build" somewhere.
        assert!(out.iter().any(|q| q.contains("make") || q.contains("build")));
    }

    #[test]
    fn test_synonym_expander_no_synonyms() {
        let out = SynonymExpander::with_technical_synonyms().expand("xyz abc def").unwrap();
        // None of these words has a synonym entry.
        assert!(out.is_empty());
    }

    #[test]
    fn test_synonym_expander_custom_synonyms() {
        let mut table = std::collections::HashMap::new();
        table.insert("test".to_string(), vec!["check".to_string()]);
        let out = SynonymExpander::new(table).expand("test code").unwrap();
        assert!(out.iter().any(|q| q.contains("check")));
    }

    // ---- ChainedPreprocessor ----

    #[test]
    fn test_chained_empty() {
        let out = ChainedPreprocessor::new().preprocess("query").unwrap();
        assert_eq!(out, vec!["query"]);
    }

    #[test]
    fn test_chained_single() {
        let chain = ChainedPreprocessor::new().add(PassthroughPreprocessor);
        assert_eq!(chain.preprocess("query").unwrap(), vec!["query"]);
    }

    #[test]
    fn test_chained_multiple() {
        let chain = ChainedPreprocessor::new()
            .add(PassthroughPreprocessor)
            .add(HydePreprocessor::new(MockHypotheticalGenerator::new()));
        let out = chain.preprocess("query").unwrap();
        assert!(out.len() >= 2);
        assert!(out.iter().any(|q| q == "query"));
    }

    #[test]
    fn test_chained_deduplicates() {
        let chain = ChainedPreprocessor::new()
            .add(PassthroughPreprocessor)
            .add(PassthroughPreprocessor)
            .with_deduplicate(true);
        // Both stages emit the same query; only one copy survives.
        assert_eq!(chain.preprocess("query").unwrap().len(), 1);
    }

    #[test]
    fn test_chained_max_total() {
        let wide = MultiQueryPreprocessor::new(KeywordExpander::new()).with_max_queries(10);
        let chain = ChainedPreprocessor::new().add(wide).with_max_total(3);
        let out = chain.preprocess("rust programming language tutorial").unwrap();
        assert!(out.len() <= 3);
    }

    #[test]
    fn test_chained_name() {
        assert_eq!(ChainedPreprocessor::new().name(), "chained");
    }

    // ---- QueryAnalyzer ----

    #[test]
    fn test_analyzer_how_to() {
        let report = QueryAnalyzer::new().analyze("how to write tests in rust");
        assert_eq!(report.intent, QueryIntent::HowTo);
        assert!(report.confidence > 0.5);
    }

    #[test]
    fn test_analyzer_definition() {
        let report = QueryAnalyzer::new().analyze("what is a monad");
        assert_eq!(report.intent, QueryIntent::Definition);
    }

    #[test]
    fn test_analyzer_troubleshooting() {
        let report = QueryAnalyzer::new().analyze("error compiling code fix");
        assert_eq!(report.intent, QueryIntent::Troubleshooting);
    }

    #[test]
    fn test_analyzer_comparison() {
        let report = QueryAnalyzer::new().analyze("rust vs go comparison");
        assert_eq!(report.intent, QueryIntent::Comparison);
    }

    #[test]
    fn test_analyzer_unknown() {
        let report = QueryAnalyzer::new().analyze("random words xyz");
        assert_eq!(report.intent, QueryIntent::Unknown);
        assert!(report.confidence < 0.5);
    }

    #[test]
    fn test_analyzer_extracts_keywords() {
        let report = QueryAnalyzer::new().analyze("rust programming language");
        for expected in ["rust", "programming", "language"] {
            assert!(report.keywords.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_query_analysis_serialization() {
        let before = QueryAnalysis {
            original: "test".to_string(),
            intent: QueryIntent::HowTo,
            keywords: vec!["test".to_string()],
            confidence: 0.8,
        };
        let encoded = serde_json::to_string(&before).unwrap();
        let after: QueryAnalysis = serde_json::from_str(&encoded).unwrap();
        assert_eq!(after.original, "test");
        assert_eq!(after.intent, QueryIntent::HowTo);
    }

    // ---- Property-based tests ----
    use proptest::prelude::*;

    proptest! {
        #[test]
        fn prop_passthrough_preserves_input(query in "\\PC{1,100}") {
            let out = PassthroughPreprocessor.preprocess(&query).unwrap();
            prop_assert_eq!(out.len(), 1);
            prop_assert_eq!(&out[0], &query);
        }

        #[test]
        fn prop_hyde_always_returns_something(query in "\\w{1,50}") {
            let hyde = HydePreprocessor::new(MockHypotheticalGenerator::new());
            let out = hyde.preprocess(&query).unwrap();
            prop_assert!(!out.is_empty());
        }

        #[test]
        fn prop_chained_respects_max_total(query in "\\w{1,50}", max in 1usize..20) {
            let chain = ChainedPreprocessor::new()
                .add(MultiQueryPreprocessor::new(KeywordExpander::new()))
                .add(HydePreprocessor::new(MockHypotheticalGenerator::new()))
                .with_max_total(max);
            let out = chain.preprocess(&query).unwrap();
            prop_assert!(out.len() <= max);
        }

        #[test]
        fn prop_analyzer_always_returns_analysis(query in "\\w{1,100}") {
            let report = QueryAnalyzer::new().analyze(&query);
            prop_assert_eq!(report.original, query);
            prop_assert!(report.confidence >= 0.0 && report.confidence <= 1.0);
        }

        #[test]
        fn prop_keyword_expander_no_empty_results(
            w1 in "[a-z]{4,10}",
            w2 in "[a-z]{4,10}",
            w3 in "[a-z]{4,10}"
        ) {
            let query = format!("{w1} {w2} {w3}");
            let out = KeywordExpander::new().expand(&query).unwrap();
            // Every expansion must be a non-empty string.
            for expansion in &out {
                prop_assert!(!expansion.is_empty());
            }
        }
    }
}
trueno_rag/preprocess.rs

trueno_rag/
preprocess.rs