mod confusables;
mod homoglyph;
mod unicode;
use crate::error::{NlResult, PreprocessError};
use crate::types::PreprocessResult;
/// Upper bound on accepted input size.
///
/// `preprocess_input` compares this against `input.len()`, i.e. the UTF-8
/// byte length of the input, not its character count.
pub const MAX_INPUT_LENGTH: usize = 4096;
/// Sanitizes raw input text and reports what was changed.
///
/// Processing order:
/// 1. Reject input whose byte length exceeds [`MAX_INPUT_LENGTH`].
/// 2. Lift quoted spans out of the text, leaving a placeholder per span, so
///    that span contents are not passed through the steps below.
/// 3. Run `unicode::normalize_nfkc`, `unicode::strip_zero_width` and
///    `homoglyph::replace_confusables` on the remaining text, then trim it.
/// 4. Splice the quoted spans back in place of their placeholders.
///
/// The `normalized` field of the result is `true` when the NFKC step changed
/// the unquoted text.
///
/// # Errors
///
/// * `PreprocessError::InputTooLong` when `input.len()` exceeds
///   [`MAX_INPUT_LENGTH`].
/// * `PreprocessError::EmptyInput` when nothing is left after trimming and no
///   quoted span was found.
pub fn preprocess_input(input: &str) -> NlResult<PreprocessResult> {
    if input.len() > MAX_INPUT_LENGTH {
        return Err(PreprocessError::InputTooLong {
            len: input.len(),
            max: MAX_INPUT_LENGTH,
        }
        .into());
    }

    let (text_without_quotes, quoted_spans, quote_positions) =
        extract_quoted_spans_with_positions(input);

    let normalized = unicode::normalize_nfkc(&text_without_quotes);
    let stripped = unicode::strip_zero_width(&normalized);
    let (dehomoglyphed, homoglyphs_replaced) = homoglyph::replace_confusables(&stripped);

    let trimmed = dehomoglyphed.trim();
    if trimmed.is_empty() && quoted_spans.is_empty() {
        return Err(PreprocessError::EmptyInput.into());
    }

    // Borrow the span list and positions; `quoted_spans` is moved into the
    // result below, so it must not be consumed here.
    let final_text = reconstruct_with_quotes(trimmed, &quoted_spans, &quote_positions);

    Ok(PreprocessResult {
        text: final_text,
        quoted_spans,
        normalized: normalized != text_without_quotes,
        homoglyphs_replaced,
    })
}
/// Splits `input` into unquoted text and the contents of its quoted spans.
///
/// A span starts at `"` or `'` and ends at the next occurrence of the same
/// quote character; the other quote character is treated as ordinary content
/// inside the span. Each non-empty span is replaced in the returned text by a
/// single `'\x00'` placeholder. Empty spans (`""`) are removed without leaving
/// a placeholder. A span that is still open at the end of the input is
/// treated as if it had been closed there.
///
/// Literal `'\x00'` characters found outside quotes are dropped so that they
/// cannot be mistaken for placeholders when the spans are spliced back in.
///
/// Returns `(text, spans, positions)` where `positions[i]` is the character
/// index in `text` of the placeholder for `spans[i]`.
fn extract_quoted_spans_with_positions(input: &str) -> (String, Vec<String>, Vec<usize>) {
    const PLACEHOLDER: char = '\x00';

    let mut result = String::with_capacity(input.len());
    let mut quoted_spans = Vec::new();
    let mut positions = Vec::new();
    // Running character count of `result`, so the text is not rescanned for
    // every span.
    let mut result_chars = 0usize;
    // `Some(q)` while inside a span opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    let mut current_span = String::new();

    for c in input.chars() {
        match open_quote {
            None if c == '"' || c == '\'' => open_quote = Some(c),
            // A raw NUL would be indistinguishable from a placeholder.
            None if c == PLACEHOLDER => {}
            None => {
                result.push(c);
                result_chars += 1;
            }
            Some(q) if c == q => {
                open_quote = None;
                if !current_span.is_empty() {
                    positions.push(result_chars);
                    // `take` leaves `current_span` empty for the next span.
                    quoted_spans.push(std::mem::take(&mut current_span));
                    result.push(PLACEHOLDER);
                    result_chars += 1;
                }
            }
            Some(_) => current_span.push(c),
        }
    }

    // Unterminated quote: keep what was collected as a span.
    if open_quote.is_some() && !current_span.is_empty() {
        positions.push(result_chars);
        quoted_spans.push(current_span);
        result.push(PLACEHOLDER);
    }

    (result, quoted_spans, positions)
}
/// Rebuilds text by substituting each `'\x00'` placeholder in `text` with the
/// next entry of `quotes`, wrapped in double quotes.
///
/// When `quotes` is empty, `text` is returned unchanged. Placeholders beyond
/// the number of available quotes are removed; quotes without a matching
/// placeholder are not emitted. `_positions` is accepted but not consulted.
fn reconstruct_with_quotes(text: &str, quotes: &[String], _positions: &[usize]) -> String {
    if quotes.is_empty() {
        return text.to_string();
    }

    let quoted_len: usize = quotes.iter().map(|s| s.len() + 2).sum();
    let mut rebuilt = String::with_capacity(text.len() + quoted_len);

    let mut pending = quotes.iter();
    let mut segments = text.split('\x00');

    // `split` always yields at least one (possibly empty) segment.
    if let Some(head) = segments.next() {
        rebuilt.push_str(head);
    }

    // Every further segment was preceded by exactly one placeholder.
    for segment in segments {
        if let Some(quoted) = pending.next() {
            rebuilt.push('"');
            rebuilt.push_str(quoted);
            rebuilt.push('"');
        }
        rebuilt.push_str(segment);
    }

    rebuilt
}
/// Collects the quoted spans of `input` and returns the text with every
/// non-empty span re-emitted inside double quotes.
///
/// A span starts at `"` or `'` and ends at the next occurrence of the same
/// quote character. Empty spans are removed from the text. If the input ends
/// while a span is still open, the collected content is recorded as a span
/// and appended to the text without surrounding quotes.
///
/// Returns `(text, spans)`.
// Not called by `preprocess_input`; the unit tests in this file exercise it.
#[allow(dead_code)]
fn extract_quoted_spans(input: &str) -> (String, Vec<String>) {
    let mut result = String::with_capacity(input.len());
    let mut quoted_spans = Vec::new();
    // `Some(q)` while inside a span opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    let mut current_span = String::new();

    for c in input.chars() {
        match open_quote {
            None if c == '"' || c == '\'' => open_quote = Some(c),
            None => result.push(c),
            Some(q) if c == q => {
                open_quote = None;
                if !current_span.is_empty() {
                    result.push('"');
                    result.push_str(&current_span);
                    result.push('"');
                    // `take` leaves `current_span` empty for the next span.
                    quoted_spans.push(std::mem::take(&mut current_span));
                }
            }
            Some(_) => current_span.push(c),
        }
    }

    // Unterminated quote: the content is kept, but without quote characters.
    if open_quote.is_some() && !current_span.is_empty() {
        result.push_str(&current_span);
        quoted_spans.push(current_span);
    }

    (result, quoted_spans)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::NlError;

    /// Unquoted input is expected to come back as-is with no spans recorded.
    #[test]
    fn test_preprocess_basic() {
        let processed = preprocess_input("find authentication").unwrap();
        assert_eq!(processed.text, "find authentication");
        assert!(processed.quoted_spans.is_empty());
    }

    /// A double-quoted span is recorded and remains quoted in the output text.
    #[test]
    fn test_preprocess_with_quotes() {
        let processed = preprocess_input("find \"UserAuth::login\"").unwrap();
        let expected_span = "UserAuth::login".to_string();
        assert!(processed.quoted_spans.contains(&expected_span));
        assert!(processed.text.contains("\"UserAuth::login\""));
    }

    /// One byte over the limit is rejected with `InputTooLong`.
    #[test]
    fn test_preprocess_too_long() {
        let oversized = "x".repeat(MAX_INPUT_LENGTH + 1);
        let outcome = preprocess_input(&oversized);
        let rejected = matches!(
            outcome,
            Err(NlError::Preprocess(PreprocessError::InputTooLong { .. }))
        );
        assert!(rejected);
    }

    /// Whitespace-only input is rejected with `EmptyInput`.
    #[test]
    fn test_preprocess_empty() {
        let outcome = preprocess_input(" ");
        let rejected = matches!(
            outcome,
            Err(NlError::Preprocess(PreprocessError::EmptyInput))
        );
        assert!(rejected);
    }

    /// Both spans are collected in order and stay quoted in the text.
    #[test]
    fn test_extract_quoted_spans() {
        let (rebuilt, found) = extract_quoted_spans("find \"foo\" and \"bar\"");
        assert_eq!(found, vec!["foo", "bar"]);
        for needle in ["\"foo\"", "\"bar\""] {
            assert!(rebuilt.contains(needle));
        }
    }
}