use crate::error::Result;
use crate::text::CharacterInfo;
/// Switches that control which text patterns are shielded from splitting.
///
/// The value is consumed by `PatternDetector::mark_pattern_contexts`, which reads
/// `preserve_patterns` first and only then the per-pattern flags.
#[derive(Debug, Clone)]
pub struct PatternPreservationConfig {
    /// Master switch; when `false`, `mark_pattern_contexts` returns without marking anything.
    pub preserve_patterns: bool,
    /// Enables the email marking pass.
    pub detect_emails: bool,
    /// Enables the URL marking pass.
    pub detect_urls: bool,
}
impl Default for PatternPreservationConfig {
fn default() -> Self {
Self {
preserve_patterns: true,
detect_emails: true,
detect_urls: true,
}
}
}
/// Detector for email-like and URL-like runs of characters.
///
/// The type is a unit struct and carries no data; its detection and marking
/// routines are associated functions that take the characters as an argument.
#[derive(Debug)]
pub struct PatternDetector;
impl PatternDetector {
    /// Code point of `@`, which separates an email's local part from its domain.
    const AT_SIGN: u32 = 0x40;
    /// Code point of `.`, which must appear inside an email domain.
    const DOT: u32 = 0x2E;
    /// Lowercase ASCII prefixes that mark the start of a URL.
    const URL_SCHEMES: [&'static str; 4] = ["http://", "https://", "ftp://", "mailto:"];

    /// Creates a detector.
    ///
    /// The detector is stateless: `_config` is accepted but not stored. Pass the
    /// configuration to `mark_pattern_contexts` to control what gets marked.
    pub fn new(_config: PatternPreservationConfig) -> Self {
        Self
    }

    /// Creates a detector without requiring a configuration value.
    pub fn default_config() -> Self {
        Self
    }

    /// Returns `true` when `characters` contains at least one email-like run.
    ///
    /// Uses the same per-`@` validation as the marking pass, so this function
    /// reports `true` exactly when `mark_pattern_contexts` (with email detection
    /// enabled) would protect something because of an `@`.
    pub fn has_email_pattern(characters: &[CharacterInfo]) -> bool {
        characters.iter().enumerate().any(|(idx, ch)| {
            ch.code == Self::AT_SIGN && Self::is_email_at_position(characters, idx)
        })
    }

    /// Returns `true` when a known URL scheme (`http://`, `https://`, `ftp://`,
    /// `mailto:`) starts at any position of `characters`.
    ///
    /// Matching is ASCII case-insensitive and is performed directly on the code
    /// points, so invalid code points are never skipped over and no intermediate
    /// string is allocated.
    pub fn has_url_pattern(characters: &[CharacterInfo]) -> bool {
        (0..characters.len()).any(|start| Self::scheme_len_at(characters, start).is_some())
    }

    /// Sets `protected_from_split` on every character that belongs to an email or
    /// URL, according to `config`.
    ///
    /// Nothing is marked when `config.preserve_patterns` is `false`; otherwise the
    /// email and URL passes run independently, each gated by its own flag.
    ///
    /// # Errors
    ///
    /// The current implementation has no failure path and always returns `Ok(())`.
    pub fn mark_pattern_contexts(
        characters: &mut [CharacterInfo],
        config: &PatternPreservationConfig,
    ) -> Result<()> {
        if !config.preserve_patterns {
            return Ok(());
        }
        if config.detect_emails {
            Self::mark_email_contexts(characters);
        }
        if config.detect_urls {
            Self::mark_url_contexts(characters);
        }
        Ok(())
    }

    /// Protects the full run of email characters around every `@` that passes
    /// `is_email_at_position`.
    fn mark_email_contexts(characters: &mut [CharacterInfo]) {
        // Marking only touches `protected_from_split`, never `code`, so scanning by
        // index while mutating is safe and avoids collecting the `@` positions first.
        for at_idx in 0..characters.len() {
            if characters[at_idx].code != Self::AT_SIGN
                || !Self::is_email_at_position(characters, at_idx)
            {
                continue;
            }
            let start_idx = Self::find_email_start(characters, at_idx);
            let end_idx = Self::find_email_end(characters, at_idx);
            for ch in &mut characters[start_idx..=end_idx] {
                ch.protected_from_split = true;
            }
        }
    }

    /// Decides whether the `@` at `at_idx` belongs to an email-like run.
    ///
    /// Requirements: something precedes the `@`, and the contiguous run of email
    /// characters directly after it contains a `.` that is neither the first nor
    /// the last character of that run. A dot elsewhere in the text (for example the
    /// full stop ending the sentence) does not count.
    fn is_email_at_position(characters: &[CharacterInfo], at_idx: usize) -> bool {
        if at_idx == 0 || at_idx >= characters.len() {
            return false;
        }
        let domain_end = Self::find_email_end(characters, at_idx);
        // `domain_end >= at_idx`, so this range is empty (not inverted) when no
        // email character follows the `@`.
        let domain = &characters[at_idx + 1..domain_end + 1];
        domain.len() >= 3
            && domain[1..domain.len() - 1]
                .iter()
                .any(|ch| ch.code == Self::DOT)
    }

    /// Returns the index of the first character of the email run that ends at the
    /// `@` located at `at_idx` (returns `at_idx` itself when nothing qualifies).
    fn find_email_start(characters: &[CharacterInfo], at_idx: usize) -> usize {
        let local_len = characters.get(..at_idx).map_or(0, |before| {
            before
                .iter()
                .rev()
                .take_while(|ch| Self::is_email_char(ch.code))
                .count()
        });
        at_idx - local_len
    }

    /// Returns the inclusive index of the last character of the email run that
    /// starts at the `@` located at `at_idx` (returns `at_idx` itself when nothing
    /// qualifies). Safe for empty input and for an `@` in the last position.
    fn find_email_end(characters: &[CharacterInfo], at_idx: usize) -> usize {
        let domain_len = characters.get(at_idx + 1..).map_or(0, |after| {
            after
                .iter()
                .take_while(|ch| Self::is_email_char(ch.code))
                .count()
        });
        at_idx + domain_len
    }

    /// Returns `true` for code points accepted inside an email run: ASCII letters
    /// and digits plus `-`, `.`, `_`, `+` and `@`.
    fn is_email_char(code: u32) -> bool {
        matches!(
            code,
            0x30..=0x39 // 0-9
                | 0x41..=0x5A // A-Z
                | 0x61..=0x7A // a-z
                | 0x2D // -
                | 0x2E // .
                | 0x5F // _
                | 0x2B // +
                | 0x40 // @
        )
    }

    /// Protects every URL: the scheme itself plus the run of URL characters that
    /// follows it.
    ///
    /// A scheme with nothing after it (end of text, or followed by a non-URL
    /// character) protects only the scheme characters.
    fn mark_url_contexts(characters: &mut [CharacterInfo]) {
        for start in 0..characters.len() {
            if let Some(scheme_len) = Self::scheme_len_at(characters, start) {
                // Exclusive end: never reaches past the slice and never includes
                // the first character that is not part of the URL.
                let end = Self::find_url_end(characters, start + scheme_len);
                for ch in &mut characters[start..end] {
                    ch.protected_from_split = true;
                }
            }
        }
    }

    /// Returns the length of the known scheme that starts at `start`, if any.
    fn scheme_len_at(characters: &[CharacterInfo], start: usize) -> Option<usize> {
        Self::URL_SCHEMES.iter().find_map(|scheme| {
            // `get` yields `None` when the scheme would run past the end.
            let window = characters.get(start..start + scheme.len())?;
            if Self::matches_scheme(window, scheme) {
                Some(scheme.len())
            } else {
                None
            }
        })
    }

    /// Returns `true` when `chars` spells `scheme`, ignoring ASCII case.
    ///
    /// `scheme` must be ASCII. Only ASCII case folding is applied, so non-ASCII
    /// look-alikes whose Unicode lowercase form begins with an ASCII letter do not
    /// match.
    fn matches_scheme(chars: &[CharacterInfo], scheme: &str) -> bool {
        chars.len() == scheme.len()
            && chars.iter().zip(scheme.chars()).all(|(ch, expected)| {
                char::from_u32(ch.code).map_or(false, |actual| actual.eq_ignore_ascii_case(&expected))
            })
    }

    /// Returns the exclusive end index of the run of URL characters that begins at
    /// `start_idx`.
    ///
    /// The result equals `start_idx` when no URL character is found there, and is
    /// never greater than `characters.len()`.
    fn find_url_end(characters: &[CharacterInfo], start_idx: usize) -> usize {
        let start = start_idx.min(characters.len());
        let body_len = characters[start..]
            .iter()
            .take_while(|ch| Self::is_url_char(ch.code))
            .count();
        start + body_len
    }

    /// Returns `true` for code points accepted inside a URL: ASCII letters and
    /// digits plus the punctuation listed below.
    fn is_url_char(code: u32) -> bool {
        matches!(
            code,
            0x30..=0x39 // 0-9
                | 0x41..=0x5A // A-Z
                | 0x61..=0x7A // a-z
                | 0x2D // -
                | 0x2E // .
                | 0x5F // _
                | 0x7E // ~
                | 0x3A // :
                | 0x2F // /
                | 0x3F // ?
                | 0x23 // #
                | 0x5B // [
                | 0x5D // ]
                | 0x40 // @
                | 0x21 // !
                | 0x24 // $
                | 0x26 // &
                | 0x27 // '
                | 0x28 // (
                | 0x29 // )
                | 0x2A // *
                | 0x2B // +
                | 0x2C // ,
                | 0x3B // ;
                | 0x3D // =
                | 0x25 // %
        )
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an unprotected, non-ligature `CharacterInfo` for `code` with fixed
    /// placeholder metrics.
    fn create_char_info(code: u32) -> CharacterInfo {
        CharacterInfo {
            code,
            glyph_id: Some(1),
            width: 0.5,
            x_position: 0.0,
            tj_offset: None,
            font_size: 12.0,
            is_ligature: false,
            original_ligature: None,
            protected_from_split: false,
        }
    }

    /// Converts every `char` of `s` into a `CharacterInfo`.
    fn string_to_chars(s: &str) -> Vec<CharacterInfo> {
        s.chars().map(|ch| create_char_info(ch as u32)).collect()
    }

    #[test]
    fn test_email_pattern_detection() {
        let chars = string_to_chars("user@example.com");
        assert!(PatternDetector::has_email_pattern(&chars), "Should detect email pattern");
    }

    #[test]
    fn test_email_pattern_no_domain() {
        let chars = string_to_chars("user@example");
        assert!(
            !PatternDetector::has_email_pattern(&chars),
            "Should not detect email without dot in domain"
        );
    }

    #[test]
    fn test_url_pattern_http() {
        let chars = string_to_chars("http://example.com");
        assert!(PatternDetector::has_url_pattern(&chars), "Should detect http:// URL");
    }

    #[test]
    fn test_url_pattern_https() {
        let chars = string_to_chars("https://example.com");
        assert!(PatternDetector::has_url_pattern(&chars), "Should detect https:// URL");
    }

    #[test]
    fn test_email_protection() {
        let mut chars = string_to_chars("user@example.com");
        let config = PatternPreservationConfig::default();
        PatternDetector::mark_pattern_contexts(&mut chars, &config).unwrap();
        for (i, ch) in chars.iter().enumerate() {
            assert!(ch.protected_from_split, "Character {} should be protected", i);
        }
    }

    #[test]
    fn test_url_protection() {
        let mut chars = string_to_chars("http://example.com");
        let config = PatternPreservationConfig::default();
        PatternDetector::mark_pattern_contexts(&mut chars, &config).unwrap();
        for (i, ch) in chars.iter().enumerate() {
            assert!(ch.protected_from_split, "Character {} should be protected", i);
        }
    }

    #[test]
    fn test_pattern_detection_disabled() {
        let mut chars = string_to_chars("user@example.com");
        let config = PatternPreservationConfig {
            preserve_patterns: false,
            detect_emails: true,
            detect_urls: true,
        };
        PatternDetector::mark_pattern_contexts(&mut chars, &config).unwrap();
        for ch in &chars {
            assert!(!ch.protected_from_split, "Characters should not be protected when disabled");
        }
    }

    /// The per-pattern email switch must be honoured while preservation is on.
    #[test]
    fn test_email_detection_flag_disabled() {
        let mut chars = string_to_chars("user@example.com");
        let config = PatternPreservationConfig {
            preserve_patterns: true,
            detect_emails: false,
            detect_urls: true,
        };
        PatternDetector::mark_pattern_contexts(&mut chars, &config).unwrap();
        for (i, ch) in chars.iter().enumerate() {
            assert!(
                !ch.protected_from_split,
                "Character {} should not be protected when email detection is off",
                i
            );
        }
    }

    /// The per-pattern URL switch must be honoured while preservation is on.
    #[test]
    fn test_url_detection_flag_disabled() {
        let mut chars = string_to_chars("http://example.com");
        let config = PatternPreservationConfig {
            preserve_patterns: true,
            detect_emails: true,
            detect_urls: false,
        };
        PatternDetector::mark_pattern_contexts(&mut chars, &config).unwrap();
        for (i, ch) in chars.iter().enumerate() {
            assert!(
                !ch.protected_from_split,
                "Character {} should not be protected when URL detection is off",
                i
            );
        }
    }

    #[test]
    fn test_mixed_content() {
        let mut chars = string_to_chars("Contact user@example.com for more info");
        let config = PatternPreservationConfig::default();
        PatternDetector::mark_pattern_contexts(&mut chars, &config).unwrap();
        let email_start = "Contact ".len();
        let email_end = email_start + "user@example.com".len();
        // Check the text on BOTH sides of the email, not only the leading part.
        for (i, ch) in chars.iter().enumerate() {
            if (email_start..email_end).contains(&i) {
                assert!(ch.protected_from_split, "Email character {} should be protected", i);
            } else {
                assert!(
                    !ch.protected_from_split,
                    "Non-email character {} should not be protected",
                    i
                );
            }
        }
    }
}