// lcpfs 2026.1.102
//
// LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Text tokenization for full-text search

use alloc::string::{String, ToString};
use alloc::vec;
use alloc::vec::Vec;

use super::types::Token;

/// Common English stop words to filter out during indexing.
///
/// **IMPORTANT**: This list MUST stay sorted alphabetically (byte order),
/// because `is_stop_word` looks terms up with `slice::binary_search`.
/// All entries are lowercase; lookups happen after normalization.
const STOP_WORDS: &[&str] = &[
    "a", "after", "all", "also", "an", "and", "are", "as", "at", "be", "been", "before", "but",
    "by", "can", "did", "do", "does", "down", "for", "from", "had", "has", "have", "he", "how",
    "if", "in", "into", "is", "it", "its", "just", "more", "no", "not", "of", "on", "only", "or",
    "other", "out", "over", "so", "some", "such", "than", "that", "the", "their", "then", "there",
    "they", "this", "to", "up", "was", "were", "what", "when", "which", "who", "will", "with",
    "would", "you", "your",
];

/// Tokenizer for text content.
///
/// Configured via the builder-style `with_*` methods; see `Tokenizer::new`
/// for the defaults.
pub struct Tokenizer {
    /// Whether to lowercase tokens before filtering/emitting
    lowercase: bool,
    /// Whether to drop tokens found in `STOP_WORDS`
    remove_stop_words: bool,
    /// Minimum token length, in bytes (compared against `str::len`)
    min_length: usize,
    /// Maximum token length, in bytes (compared against `str::len`)
    max_length: usize,
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer {
    /// Create a new tokenizer with default settings: lowercasing enabled,
    /// stop-word removal enabled, and token lengths restricted to
    /// 2..=64 bytes.
    pub fn new() -> Self {
        Self {
            lowercase: true,
            remove_stop_words: true,
            min_length: 2,
            max_length: 64,
        }
    }

    /// Enable/disable lowercasing
    pub fn with_lowercase(mut self, lowercase: bool) -> Self {
        self.lowercase = lowercase;
        self
    }

    /// Enable/disable stop word removal
    pub fn with_stop_words(mut self, remove: bool) -> Self {
        self.remove_stop_words = remove;
        self
    }

    /// Set minimum token length (in bytes)
    pub fn with_min_length(mut self, min: usize) -> Self {
        self.min_length = min;
        self
    }

    /// Set maximum token length (in bytes).
    ///
    /// Provided for symmetry with `with_min_length`; the default is 64.
    pub fn with_max_length(mut self, max: usize) -> Self {
        self.max_length = max;
        self
    }

    /// Tokenize text content into tokens with positions.
    ///
    /// `content` is decoded as UTF-8; invalid byte sequences are replaced
    /// with U+FFFD rather than failing. A word is a maximal run of
    /// alphanumeric characters; each surviving word becomes a `Token`
    /// whose `position` is its index among *emitted* tokens (filtered
    /// words do not advance it) and whose `byte_offset` is the word's
    /// start offset in the decoded text (identical to the offset in
    /// `content` when the input is valid UTF-8).
    pub fn tokenize(&self, content: &[u8]) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;
        let mut current_word = String::new();
        let mut word_start = 0usize;

        // `from_utf8_lossy` borrows the input when it is already valid
        // UTF-8 and only allocates when replacement characters are needed;
        // the previous `from_utf8(..).map(|s| s.to_string())` path copied
        // the whole buffer unconditionally.
        let text = String::from_utf8_lossy(content);

        for (i, ch) in text.char_indices() {
            if ch.is_alphanumeric() {
                if current_word.is_empty() {
                    word_start = i;
                }
                current_word.push(ch);
            } else if !current_word.is_empty() {
                // End of word: emit it if it survives filtering.
                if let Some(token) = self.process_word(&current_word, position, word_start) {
                    tokens.push(token);
                    position += 1;
                }
                current_word.clear();
            }
        }

        // Handle a trailing word with no terminating separator.
        if !current_word.is_empty() {
            if let Some(token) = self.process_word(&current_word, position, word_start) {
                tokens.push(token);
            }
        }

        tokens
    }

    /// Filter and normalize a single word into a `Token`.
    ///
    /// Returns `None` when the word violates the length bounds (measured
    /// in bytes, before normalization) or is a stop word (checked after
    /// normalization).
    fn process_word(&self, word: &str, position: u32, byte_offset: usize) -> Option<Token> {
        // Length bounds are byte lengths of the raw word.
        if word.len() < self.min_length || word.len() > self.max_length {
            return None;
        }

        // Normalize
        let normalized = if self.lowercase {
            word.to_lowercase()
        } else {
            word.to_string()
        };

        // Stop words are matched against the normalized term. The table
        // is all-lowercase, so with lowercasing disabled, capitalized
        // stop words pass through unfiltered.
        if self.remove_stop_words && is_stop_word(&normalized) {
            return None;
        }

        Some(Token {
            term: normalized,
            position,
            byte_offset,
        })
    }

    /// Tokenize a search query (less aggressive filtering).
    ///
    /// Unlike `tokenize`, stop words are kept so that queries such as
    /// "to be" remain searchable; only the length bounds are applied
    /// (measured in bytes, after normalization).
    pub fn tokenize_query(&self, query: &str) -> Vec<String> {
        let mut terms = Vec::new();
        let mut current_word = String::new();

        for ch in query.chars() {
            if ch.is_alphanumeric() {
                current_word.push(ch);
            } else if !current_word.is_empty() {
                let term = if self.lowercase {
                    current_word.to_lowercase()
                } else {
                    current_word.clone()
                };

                if term.len() >= self.min_length && term.len() <= self.max_length {
                    terms.push(term);
                }
                current_word.clear();
            }
        }

        // Handle a trailing word with no terminating separator.
        if !current_word.is_empty() {
            let term = if self.lowercase {
                current_word.to_lowercase()
            } else {
                current_word
            };

            if term.len() >= self.min_length && term.len() <= self.max_length {
                terms.push(term);
            }
        }

        terms
    }
}

/// Return `true` if `word` appears in the (sorted) `STOP_WORDS` table.
fn is_stop_word(word: &str) -> bool {
    matches!(STOP_WORDS.binary_search(&word), Ok(_))
}

/// Calculate the Levenshtein (edit) distance between two strings.
///
/// Operates on Unicode scalar values (`char`s), not bytes. Uses the
/// classic two-row dynamic-programming formulation, so memory use is
/// O(len2) instead of O(len1 * len2).
pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
    // Decode s2 once up front: the inner loop below walks it once per
    // char of s1, and re-running the UTF-8 decoder on every pass is
    // loop-invariant work.
    let s2_chars: Vec<char> = s2.chars().collect();
    let len1 = s1.chars().count();
    let len2 = s2_chars.len();

    // Distance to/from the empty string is the other string's length.
    if len1 == 0 {
        return len2;
    }
    if len2 == 0 {
        return len1;
    }

    // Two-row optimization for space efficiency.
    let mut prev_row: Vec<usize> = (0..=len2).collect();
    let mut curr_row = vec![0usize; len2 + 1];

    for (i, c1) in s1.chars().enumerate() {
        // Cost of deleting the first i+1 chars of s1.
        curr_row[0] = i + 1;

        for (j, &c2) in s2_chars.iter().enumerate() {
            let cost = if c1 == c2 { 0 } else { 1 };

            curr_row[j + 1] = (prev_row[j + 1] + 1) // deletion
                .min(curr_row[j] + 1) // insertion
                .min(prev_row[j] + cost); // substitution
        }

        core::mem::swap(&mut prev_row, &mut curr_row);
    }

    prev_row[len2]
}

/// Find candidate terms within `max_distance` edits of `term`.
///
/// Returns the matching candidates as owned `String`s, preserving the
/// iterator's order.
pub fn fuzzy_match(
    term: &str,
    candidates: impl Iterator<Item = impl AsRef<str>>,
    max_distance: usize,
) -> Vec<String> {
    // Levenshtein distance is bounded below by the difference in char
    // counts, so candidates whose length differs by more than
    // `max_distance` can be rejected without running the O(n*m) DP.
    let term_len = term.chars().count();
    candidates
        .filter_map(|candidate| {
            let candidate = candidate.as_ref();
            if term_len.abs_diff(candidate.chars().count()) > max_distance {
                return None;
            }
            if levenshtein_distance(term, candidate) <= max_distance {
                Some(candidate.to_string())
            } else {
                None
            }
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // Default tokenizer lowercases and splits on non-alphanumerics.
    #[test]
    fn test_tokenize_simple() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize(b"Hello World");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].term, "hello");
        assert_eq!(tokens[1].term, "world");
    }

    // Stop words are dropped from the token stream entirely.
    #[test]
    fn test_stop_words() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize(b"the quick brown fox");
        // "the" is a stop word
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].term, "quick");
    }

    // Edit distance: identity, substitution, multiple edits, empty input.
    #[test]
    fn test_levenshtein() {
        assert_eq!(levenshtein_distance("hello", "hello"), 0);
        assert_eq!(levenshtein_distance("hello", "hallo"), 1);
        assert_eq!(levenshtein_distance("hello", "world"), 4);
        assert_eq!(levenshtein_distance("", "hello"), 5);
    }
}