lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Character filters transform raw text before tokenization.
//!
//! Each filter receives the full input string and returns the transformed
//! string plus an offset correction map so that token offsets can be mapped
//! back to positions in the original text.
//!
//! See [[analyzers#Character Filters]].

/// A correction entry mapping a position in the filtered text back to the
/// original text. Used to fix token offsets after character filtering.
///
/// Entries are plain data; `PartialEq`/`Eq` are derived so that correction
/// lists can be compared directly (for example in tests).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct OffsetCorrection {
    /// Byte offset in the filtered text from which this correction applies.
    pub filtered: usize,
    /// Net number of bytes the filter has removed up to this point: positive
    /// when the filtered text is shorter than the original, negative when the
    /// filter added bytes. To recover the original offset:
    /// `original_offset = filtered_offset + cumulative_diff`
    pub cumulative_diff: isize,
}

/// Transforms raw text before tokenization.
///
/// Implementations must be thread-safe (`Send + Sync`) so that analyzers can
/// be shared across indexing threads.
pub trait CharFilter: Send + Sync {
    /// Transform the input text.
    ///
    /// Returns the transformed text and a list of offset corrections for
    /// mapping filtered-text positions back to original-text positions.
    /// If the filter doesn't change text length (e.g., simple char mapping),
    /// the corrections list may be empty.
    ///
    /// `correct_offset` walks the list from the back and uses the first entry
    /// it meets whose `filtered` position is `<=` the queried offset, so
    /// implementations should append corrections in ascending `filtered`
    /// order, each carrying the running total in `cumulative_diff`.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>);
}

/// Strips HTML tags and decodes common HTML entities.
///
/// Matches ES `html_strip` character filter.
///
/// A `<` is only treated as the start of a tag when it is immediately
/// followed by an ASCII letter, `/`, `!` or `?` and a closing `>` exists
/// later in the text. Any other `<` (as in `a < b`, or an unterminated
/// `<b`) is kept as literal text instead of swallowing what follows it.
///
/// See [[analyzers#Character Filters]].
pub struct HtmlStripCharFilter;

impl CharFilter for HtmlStripCharFilter {
    /// Removes tags and decodes entities in `text`.
    ///
    /// Returns the stripped text plus one correction per removed tag and per
    /// decoded entity. Each correction is recorded at the current length of
    /// the output and carries the running number of bytes removed so far.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
        let bytes = text.as_bytes();
        let mut result = String::with_capacity(text.len());
        let mut corrections = Vec::new();
        let mut cumulative_diff: isize = 0;
        // Index of the last '>' in the input (0 when there is none). A '<' at
        // or after that index can never be closed, so it is literal text.
        // Checking against it also keeps the scan linear: every search for a
        // closing '>' below is guaranteed to succeed and consume what it read.
        let last_gt = text.rfind('>').unwrap_or(0);
        let mut i = 0;

        while i < bytes.len() {
            match bytes[i] {
                // `i < last_gt` guarantees that `bytes[i + 1]` is in bounds.
                b'<' if i < last_gt && opens_html_tag(bytes[i + 1]) => {
                    let tag_start = i;
                    // Terminates: a '>' exists at `last_gt`, which is > `tag_start`.
                    while bytes[i] != b'>' {
                        i += 1;
                    }
                    i += 1; // step past '>'
                    cumulative_diff += (i - tag_start) as isize;
                    corrections.push(OffsetCorrection {
                        filtered: result.len(),
                        cumulative_diff,
                    });
                }
                b'&' => match decode_entity(&text[i..]) {
                    Some((decoded, consumed)) => {
                        result.push_str(&decoded);
                        cumulative_diff += consumed as isize - decoded.len() as isize;
                        corrections.push(OffsetCorrection {
                            filtered: result.len(),
                            cumulative_diff,
                        });
                        i += consumed;
                    }
                    None => {
                        // Not a recognized entity: keep the ampersand as text.
                        result.push('&');
                        i += 1;
                    }
                },
                _ => {
                    // Plain text (including a literal '<'): copy everything up
                    // to the next '<' or '&' in one go. Both delimiters are
                    // ASCII, so the slice always ends on a char boundary even
                    // when the run contains multi-byte UTF-8.
                    let run_len = bytes[i + 1..]
                        .iter()
                        .position(|&b| b == b'<' || b == b'&')
                        .map_or(bytes.len() - i, |rel| rel + 1);
                    result.push_str(&text[i..i + run_len]);
                    i += run_len;
                }
            }
        }

        (result, corrections)
    }
}

/// Returns true if `next`, the byte right after a `<`, can begin an HTML tag,
/// closing tag, comment/doctype or processing instruction.
fn opens_html_tag(next: u8) -> bool {
    next.is_ascii_alphabetic() || matches!(next, b'/' | b'!' | b'?')
}

/// Decode a single HTML entity at the start of `s` (which starts with '&').
///
/// Recognizes the named entities `amp`, `lt`, `gt`, `quot`, `apos` and
/// `nbsp`, plus decimal (`&#65;`) and hexadecimal (`&#x41;` / `&#X41;`)
/// numeric character references. The terminating `;` must appear within the
/// first 11 bytes of `s`.
///
/// Returns `(decoded_string, bytes_consumed)`, where `bytes_consumed` spans
/// the `&` through the `;`, or `None` if `s` does not start with a supported
/// entity.
fn decode_entity(s: &str) -> Option<(String, usize)> {
    // Largest byte index at which the terminating ';' is still accepted.
    const MAX_SEMICOLON_INDEX: usize = 10;

    if !s.starts_with('&') {
        return None;
    }

    // Only inspect the bytes where a ';' could be accepted. Searching the
    // whole remainder would make text with many '&' and no ';' quadratic.
    // Byte-wise search is safe: ';' is ASCII and never part of a multi-byte
    // UTF-8 sequence, so `end` is always a char boundary.
    let window_len = s.len().min(MAX_SEMICOLON_INDEX + 1);
    let end = s.as_bytes()[..window_len].iter().position(|&b| b == b';')?;
    let entity = &s[1..end]; // between & and ;

    let decoded = match entity {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => '\u{00A0}',
        _ => {
            let reference = entity.strip_prefix('#')?;
            let (digits, radix) = match reference.strip_prefix(|c: char| matches!(c, 'x' | 'X')) {
                Some(hex) => (hex, 16),
                None => (reference, 10),
            };
            // `from_str_radix` tolerates a leading '+', which is not valid in
            // a character reference, so require pure digits first.
            if !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            // Empty digit strings and overflow fail here; surrogates and
            // out-of-range values fail in `char::from_u32`.
            char::from_u32(u32::from_str_radix(digits, radix).ok()?)?
        }
    };

    Some((decoded.to_string(), end + 1)) // + 1 to include the ';'
}

/// Replaces characters/strings using a mapping table.
///
/// Matches ES `mapping` character filter: the text is scanned once from left
/// to right, at each position the longest matching key wins, and replacement
/// output is never scanned again.
///
/// See [[analyzers#Character Filters]].
pub struct MappingCharFilter {
    /// `(from, to)` pairs ordered longest `from` first; `new` drops pairs
    /// with an empty `from`.
    mappings: Vec<(String, String)>,
}

impl MappingCharFilter {
    /// Builds a filter from `(from, to)` pairs.
    ///
    /// Pairs with an empty `from` are discarded because an empty key matches
    /// everywhere without consuming input. When two pairs share the same key
    /// the one listed first wins.
    pub fn new(mappings: Vec<(String, String)>) -> Self {
        let mut mappings = mappings;
        mappings.retain(|(from, _)| !from.is_empty());
        // Sort by key length descending so longer patterns match first. The
        // sort is stable, so equal-length keys keep their given order.
        mappings.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
        Self { mappings }
    }
}

impl CharFilter for MappingCharFilter {
    /// Applies the mapping table to `text` in a single pass.
    ///
    /// Returns the rewritten text plus one correction per replacement. Each
    /// correction is recorded at the output length right after the
    /// replacement, in ascending order, with the running byte difference.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
        if self.mappings.is_empty() {
            return (text.to_string(), Vec::new());
        }

        // Bytes that begin at least one key. A key is valid UTF-8, so its
        // first byte is never a continuation byte; matches can therefore only
        // start on char boundaries of `text`.
        let mut can_start = [false; 256];
        for (from, _) in &self.mappings {
            if let Some(&first) = from.as_bytes().first() {
                can_start[usize::from(first)] = true;
            }
        }

        let bytes = text.as_bytes();
        let mut result = String::with_capacity(text.len());
        let mut corrections = Vec::new();
        let mut cumulative_diff: isize = 0;
        let mut copied_to = 0; // start of the pending unmodified run
        let mut i = 0;

        while i < bytes.len() {
            if can_start[usize::from(bytes[i])] {
                // Keys are ordered longest first, so the first hit is the
                // longest match at this position.
                let hit = self
                    .mappings
                    .iter()
                    .find(|(from, _)| !from.is_empty() && bytes[i..].starts_with(from.as_bytes()));
                if let Some((from, to)) = hit {
                    result.push_str(&text[copied_to..i]);
                    result.push_str(to);

                    cumulative_diff += from.len() as isize - to.len() as isize;
                    corrections.push(OffsetCorrection {
                        filtered: result.len(),
                        cumulative_diff,
                    });

                    i += from.len();
                    copied_to = i;
                    continue;
                }
            }
            i += 1;
        }

        result.push_str(&text[copied_to..]);
        (result, corrections)
    }
}

/// Regex-driven character filter: each match of the pattern is swapped for a
/// fixed replacement string.
///
/// Matches ES `pattern_replace` character filter.
///
/// See [[analyzers#Character Filters]].
pub struct PatternReplaceCharFilter {
    /// Compiled pattern whose matches are replaced.
    pattern: regex::Regex,
    /// Text pushed into the output in place of each match, exactly as given.
    replacement: String,
}

impl PatternReplaceCharFilter {
    /// Compiles `pattern` and pairs it with `replacement`.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` reported when `pattern` fails to compile.
    pub fn new(pattern: &str, replacement: &str) -> Result<Self, regex::Error> {
        let pattern = regex::Regex::new(pattern)?;
        let replacement = replacement.to_string();
        Ok(Self {
            pattern,
            replacement,
        })
    }
}

impl CharFilter for PatternReplaceCharFilter {
    /// Rewrites `text`, substituting the replacement for every pattern match.
    ///
    /// One correction is emitted per match, positioned at the output length
    /// right after the replacement and carrying the running byte difference.
    fn filter(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
        let substitute = self.replacement.as_str();
        let mut output = String::with_capacity(text.len());
        let mut fixes = Vec::new();
        let mut drift: isize = 0;
        let mut cursor = 0;

        for hit in self.pattern.find_iter(text) {
            let (from, to) = (hit.start(), hit.end());

            // Untouched text between the previous match and this one.
            output.push_str(&text[cursor..from]);
            output.push_str(substitute);

            drift += (to - from) as isize - substitute.len() as isize;
            fixes.push(OffsetCorrection {
                filtered: output.len(),
                cumulative_diff: drift,
            });

            cursor = to;
        }

        // Tail after the final match (or the whole input when nothing matched).
        output.push_str(&text[cursor..]);
        (output, fixes)
    }
}

/// Map a byte offset in filtered text back to the original text.
///
/// Scans `corrections` from the back and applies the `cumulative_diff` of the
/// first entry found whose `filtered` position is at or before
/// `filtered_offset`. When no entry qualifies, the offset is returned
/// unchanged.
///
/// The result saturates at `0` and `usize::MAX` rather than wrapping, so a
/// malformed correction can never turn a small offset into a huge one.
pub fn correct_offset(filtered_offset: usize, corrections: &[OffsetCorrection]) -> usize {
    let diff = corrections
        .iter()
        .rev()
        .find(|c| c.filtered <= filtered_offset)
        .map_or(0, |c| c.cumulative_diff);

    if diff >= 0 {
        filtered_offset.saturating_add(diff.unsigned_abs())
    } else {
        filtered_offset.saturating_sub(diff.unsigned_abs())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `filter` over `input` and keeps only the filtered text.
    fn text_of(filter: &dyn CharFilter, input: &str) -> String {
        filter.filter(input).0
    }

    /// Digit-run pattern filter shared by the `pattern_replace_*` tests.
    fn digits_to_hash() -> PatternReplaceCharFilter {
        PatternReplaceCharFilter::new(r"\d+", "#").unwrap()
    }

    // --- HtmlStripCharFilter ---

    #[test]
    fn html_strip_basic() {
        let stripped = text_of(&HtmlStripCharFilter, "<p>Hello <b>World</b></p>");
        assert_eq!(stripped, "Hello World");
    }

    #[test]
    fn html_strip_entities() {
        let stripped = text_of(&HtmlStripCharFilter, "foo &amp; bar &lt; baz");
        assert_eq!(stripped, "foo & bar < baz");
    }

    #[test]
    fn html_strip_numeric_entity() {
        // One decimal and one hexadecimal character reference.
        let stripped = text_of(&HtmlStripCharFilter, "&#65;&#x42;");
        assert_eq!(stripped, "AB");
    }

    #[test]
    fn html_strip_no_html() {
        let (stripped, fixes) = HtmlStripCharFilter.filter("plain text");
        assert_eq!(stripped, "plain text");
        assert!(fixes.is_empty());
    }

    #[test]
    fn html_strip_empty() {
        assert_eq!(text_of(&HtmlStripCharFilter, ""), "");
    }

    #[test]
    fn html_strip_offset_correction() {
        // The leading "<b>" removes 3 bytes, so "Hello" begins at filtered
        // offset 0 but at offset 3 in the original input.
        let (stripped, fixes) = HtmlStripCharFilter.filter("<b>Hello</b>");
        assert_eq!(stripped, "Hello");
        assert_eq!(correct_offset(0, &fixes), 3);
    }

    // --- MappingCharFilter ---

    #[test]
    fn mapping_basic() {
        let table = vec![
            (":)".to_string(), "_happy_".to_string()),
            (":(".to_string(), "_sad_".to_string()),
        ];
        let emoticons = MappingCharFilter::new(table);
        assert_eq!(
            text_of(&emoticons, "I am :) and not :("),
            "I am _happy_ and not _sad_"
        );
    }

    #[test]
    fn mapping_empty() {
        let identity = MappingCharFilter::new(Vec::new());
        let (mapped, fixes) = identity.filter("no change");
        assert_eq!(mapped, "no change");
        assert!(fixes.is_empty());
    }

    #[test]
    fn mapping_no_match() {
        let unused = MappingCharFilter::new(vec![("xyz".to_string(), "abc".to_string())]);
        assert_eq!(text_of(&unused, "hello world"), "hello world");
    }

    // --- PatternReplaceCharFilter ---

    #[test]
    fn pattern_replace_basic() {
        assert_eq!(text_of(&digits_to_hash(), "abc123def456"), "abc#def#");
    }

    #[test]
    fn pattern_replace_no_match() {
        let (replaced, fixes) = digits_to_hash().filter("no digits");
        assert_eq!(replaced, "no digits");
        assert!(fixes.is_empty());
    }

    #[test]
    fn pattern_replace_empty() {
        assert_eq!(text_of(&digits_to_hash(), ""), "");
    }

    // --- correct_offset ---

    #[test]
    fn correct_offset_no_corrections() {
        let none: &[OffsetCorrection] = &[];
        assert_eq!(correct_offset(5, none), 5);
    }

    #[test]
    fn correct_offset_single_removal() {
        // Three bytes removed at filtered position 0 (e.g. a "<b>" tag).
        let after_tag = vec![OffsetCorrection {
            filtered: 0,
            cumulative_diff: 3,
        }];
        // (filtered offset, expected original offset)
        for &(filtered, original) in &[(0usize, 3usize), (5, 8)] {
            assert_eq!(correct_offset(filtered, &after_tag), original);
        }
    }
}