markdown_ai_cite_remove/
remover.rs

1use crate::config::RemoverConfig;
2use crate::patterns::Patterns;
3
4/// Main citation remover
5pub struct CitationRemover {
6    config: RemoverConfig,
7    patterns: &'static Patterns,
8}
9
10impl CitationRemover {
11    /// Create new remover with default configuration
12    pub fn new() -> Self {
13        Self {
14            config: RemoverConfig::default(),
15            patterns: Patterns::get(),
16        }
17    }
18
19    /// Create remover with custom configuration
20    pub fn with_config(config: RemoverConfig) -> Self {
21        Self {
22            config,
23            patterns: Patterns::get(),
24        }
25    }
26
27    /// Remove citations from markdown string
28    pub fn remove(&self, markdown: &str) -> String {
29        let mut result = markdown.to_string();
30
31        // Step 1: Remove reference sections FIRST (before inline citations)
32        // This is important because inline citation removal would break reference link patterns
33        if self.config.remove_reference_links
34            || self.config.remove_reference_entries
35            || self.config.remove_reference_headers
36        {
37            result = self.remove_reference_sections(&result);
38        }
39
40        // Step 2: Remove inline citations
41        if self.config.remove_inline_citations {
42            result = self.remove_inline_citations(&result);
43        }
44
45        // Step 3: Cleanup whitespace
46        if self.config.normalize_whitespace {
47            result = self.normalize_whitespace(&result);
48        }
49
50        // Step 4: Remove excessive blank lines
51        if self.config.remove_blank_lines {
52            result = self.remove_excessive_blank_lines(&result);
53        }
54
55        // Step 5: Trim lines
56        if self.config.trim_lines {
57            result = self.trim_all_lines(&result);
58        }
59
60        result
61    }
62
63    /// Remove ALL inline citations using comprehensive pattern matching
64    /// Handles: `[1]`, `[^1]`, `[^1_1]`, `[source:1]`, `[@smith2004]`, `@citation`
65    fn remove_inline_citations(&self, text: &str) -> String {
66        // Use the unified comprehensive pattern that matches ALL citation formats
67        self.patterns
68            .inline_citations
69            .replace_all(text, "")
70            .to_string()
71    }
72
73    /// Remove reference sections at end of document
74    /// Handles: `[1]: url`, `[^1]: text`, `[^1_1]: url`, `[1](url)`, `[^1_1](url)`
75    fn remove_reference_sections(&self, text: &str) -> String {
76        let lines: Vec<&str> = text.lines().collect();
77        let mut references_start = None;
78
79        // Scan for reference section start (find the FIRST occurrence)
80        for (i, line) in lines.iter().enumerate() {
81            // Skip if we already found the start
82            if references_start.is_some() {
83                break;
84            }
85
86            // Check for reference header
87            if self.config.remove_reference_headers && self.patterns.reference_header.is_match(line)
88            {
89                references_start = Some(i);
90                break;
91            }
92
93            // Check for ANY reference definition format using comprehensive pattern
94            if self.patterns.reference_definitions.is_match(line)
95                || self.patterns.reference_entry.is_match(line)
96            {
97                references_start = Some(i);
98                break;
99            }
100        }
101
102        // Remove everything from references onward
103        if let Some(start) = references_start {
104            lines[..start].join("\n")
105        } else {
106            text.to_string()
107        }
108    }
109
110    /// Normalize multiple spaces to single space
111    fn normalize_whitespace(&self, text: &str) -> String {
112        self.patterns
113            .multiple_whitespace
114            .replace_all(text, " ")
115            .to_string()
116    }
117
118    /// Remove excessive blank lines (3+ consecutive newlines → 2)
119    fn remove_excessive_blank_lines(&self, text: &str) -> String {
120        self.patterns
121            .excessive_newlines
122            .replace_all(text, "\n\n")
123            .to_string()
124    }
125
126    /// Trim whitespace from all lines
127    fn trim_all_lines(&self, text: &str) -> String {
128        text.lines()
129            .map(|line| line.trim_end())
130            .collect::<Vec<_>>()
131            .join("\n")
132    }
133}
134
135impl Default for CitationRemover {
136    fn default() -> Self {
137        Self::new()
138    }
139}
140
#[cfg(test)]
mod tests {
    use super::*;

    /// Numeric markers such as `[1]` disappear and the surrounding text stays intact.
    #[test]
    fn test_remove_inline_numeric() {
        let cleaned =
            CitationRemover::new().remove_inline_citations("Text[1] with[2] citations[3].");
        assert_eq!(cleaned, "Text with citations.");
    }

    /// Named markers such as `[source:1]` and `[ref:2]` are removed as well.
    #[test]
    fn test_remove_inline_named() {
        let cleaned =
            CitationRemover::new().remove_inline_citations("Text[source:1] with[ref:2] citations.");
        assert_eq!(cleaned, "Text with citations.");
    }

    /// Runs of spaces collapse into a single space.
    #[test]
    fn test_normalize_whitespace() {
        let cleaned =
            CitationRemover::new().normalize_whitespace("Text  with    multiple     spaces.");
        assert_eq!(cleaned, "Text with multiple spaces.");
    }

    /// Five consecutive newlines are reduced to two.
    #[test]
    fn test_remove_excessive_blank_lines() {
        let cleaned =
            CitationRemover::new().remove_excessive_blank_lines("Line 1\n\n\n\n\nLine 2");
        assert_eq!(cleaned, "Line 1\n\nLine 2");
    }

    /// Trailing spaces are stripped from each line.
    #[test]
    fn test_trim_all_lines() {
        let cleaned = CitationRemover::new().trim_all_lines("Line 1   \nLine 2  \nLine 3 ");
        assert_eq!(cleaned, "Line 1\nLine 2\nLine 3");
    }

    /// A `## References` header and everything below it are dropped.
    #[test]
    fn test_remove_reference_sections_with_header() {
        let document = "Content here.\n\n## References\n[1]: https://example.com";
        let cleaned = CitationRemover::new().remove_reference_sections(document);
        assert_eq!(cleaned.trim(), "Content here.");
    }

    /// Bare reference definitions start the section even without a header.
    #[test]
    fn test_remove_reference_sections_without_header() {
        let document = "Content here.\n\n[1]: https://example.com\n[2]: https://test.com";
        let cleaned = CitationRemover::new().remove_reference_sections(document);
        assert_eq!(cleaned.trim(), "Content here.");
    }

    /// With only inline removal enabled, the inline marker goes and the URL stays.
    #[test]
    fn test_custom_config() {
        let inline_only = RemoverConfig {
            remove_inline_citations: true,
            remove_reference_links: false,
            remove_reference_headers: false,
            remove_reference_entries: false,
            normalize_whitespace: false,
            remove_blank_lines: false,
            trim_lines: false,
        };
        let output = CitationRemover::with_config(inline_only)
            .remove("Text[1].\n\n[1]: https://example.com");
        assert!(!output.contains("[1]"));
        assert!(output.contains("https://example.com"));
    }

    /// The default pipeline removes citations, references, double spaces and extra blank lines.
    #[test]
    fn test_full_pipeline() {
        let document = "Text[1]  with   spaces.\n\n\n\n## References\n[1]: https://example.com";
        let output = CitationRemover::new().remove(document);
        for unwanted in ["[1]", "https://example.com", "  ", "\n\n\n"] {
            assert!(!output.contains(unwanted));
        }
    }
}