rtf_to_html/
lib.rs

1//! RTF to HTML conversion library
2//!
//! Converts RTF documents to clean, semantic HTML. Built on top of the `rtf-parser-tt` crate.
4//!
5//! # Example
6//!
7//! ```
8//! use rtf_to_html::rtf_to_html;
9//!
10//! let rtf = br#"{\rtf1\ansi\deff0 Hello World}"#;
11//! let html = rtf_to_html(rtf).unwrap();
12//! assert!(html.contains("Hello World"));
13//! ```
14//!
15//! # Features
16//!
17//! - Converts RTF to semantic HTML (`<strong>`, `<em>`, `<u>`)
18//! - Paragraph structure preserved with `<p>` tags
19//! - XSS-safe output (HTML entities escaped)
20//! - Plain text extraction (strips all formatting)
21
22use rtf_parser_tt::RtfDocument;
23use thiserror::Error;
24
25/// Errors that can occur during RTF conversion
26#[derive(Error, Debug)]
27pub enum RtfError {
28    /// Failed to parse RTF document
29    #[error("Failed to parse RTF: {0}")]
30    ParseError(String),
31}
32
33/// Convert RTF content to HTML
34///
35/// Takes raw RTF bytes and returns semantic HTML.
36/// Preserves basic formatting: bold, italic, underline, and paragraph structure.
37///
38/// # Arguments
39///
40/// * `rtf_content` - RTF document as bytes
41///
42/// # Returns
43///
44/// HTML string with semantic markup
45///
46/// # Example
47///
48/// ```
49/// use rtf_to_html::rtf_to_html;
50///
51/// let rtf = br#"{\rtf1\ansi\deff0 {\b Bold} and {\i italic}}"#;
52/// let html = rtf_to_html(rtf).unwrap();
53/// ```
54pub fn rtf_to_html(rtf_content: &[u8]) -> Result<String, RtfError> {
55    let rtf_string = String::from_utf8_lossy(rtf_content);
56
57    let doc = RtfDocument::try_from(rtf_string.as_ref())
58        .map_err(|e| RtfError::ParseError(e.to_string()))?;
59
60    Ok(convert_document_to_html(&doc))
61}
62
63/// Extract plain text from RTF (strips all formatting)
64///
65/// # Arguments
66///
67/// * `rtf_content` - RTF document as bytes
68///
69/// # Returns
70///
71/// Plain text with formatting stripped
72///
73/// # Example
74///
75/// ```
76/// use rtf_to_html::rtf_to_plain_text;
77///
78/// let rtf = br#"{\rtf1\ansi\deff0 Hello World}"#;
79/// let text = rtf_to_plain_text(rtf).unwrap();
80/// assert!(text.contains("Hello"));
81/// ```
82pub fn rtf_to_plain_text(rtf_content: &[u8]) -> Result<String, RtfError> {
83    let rtf_string = String::from_utf8_lossy(rtf_content);
84
85    let doc = RtfDocument::try_from(rtf_string.as_ref())
86        .map_err(|e| RtfError::ParseError(e.to_string()))?;
87
88    let text: String = doc.body.iter().map(|block| block.text.as_str()).collect();
89    Ok(text)
90}
91
92/// Convert a parsed RTF document to HTML
93fn convert_document_to_html(doc: &RtfDocument) -> String {
94    let mut html = String::new();
95    let mut current_paragraph: Vec<String> = Vec::new();
96    let mut in_paragraph = false;
97
98    for block in &doc.body {
99        let text = &block.text;
100
101        // Skip empty blocks
102        if text.is_empty() {
103            continue;
104        }
105
106        // Check if this is a paragraph break
107        if text == "\n" || text.contains('\n') {
108            // Flush current paragraph
109            if !current_paragraph.is_empty() {
110                html.push_str("<p>");
111                html.push_str(&current_paragraph.join(""));
112                html.push_str("</p>");
113                current_paragraph.clear();
114            }
115
116            // Handle multiple newlines (which might be paragraph separators)
117            let newline_count = text.chars().filter(|c| *c == '\n').count();
118            if newline_count > 1 && !in_paragraph {
119                // Add empty paragraphs for extra line breaks
120                for _ in 1..newline_count {
121                    html.push_str("<p></p>");
122                }
123            }
124
125            in_paragraph = false;
126            continue;
127        }
128
129        // Apply formatting
130        let formatted_text = apply_formatting(text, &block.painter);
131        current_paragraph.push(formatted_text);
132        in_paragraph = true;
133    }
134
135    // Flush any remaining content
136    if !current_paragraph.is_empty() {
137        html.push_str("<p>");
138        html.push_str(&current_paragraph.join(""));
139        html.push_str("</p>");
140    }
141
142    // If no HTML was generated but we have text, wrap it in a paragraph
143    if html.is_empty() && !doc.body.is_empty() {
144        let plain_text = doc
145            .body
146            .iter()
147            .map(|b| escape_html(&b.text))
148            .collect::<Vec<_>>()
149            .join("");
150        if !plain_text.is_empty() {
151            html = format!("<p>{}</p>", plain_text);
152        }
153    }
154
155    html
156}
157
158/// Apply formatting tags based on the painter style
159fn apply_formatting(text: &str, painter: &rtf_parser_tt::Painter) -> String {
160    let escaped = escape_html(text);
161
162    if escaped.is_empty() {
163        return escaped;
164    }
165
166    let mut result = escaped;
167
168    // Apply formatting in order: bold, italic, underline
169    if painter.bold {
170        result = format!("<strong>{}</strong>", result);
171    }
172
173    if painter.italic {
174        result = format!("<em>{}</em>", result);
175    }
176
177    if painter.underline {
178        result = format!("<u>{}</u>", result);
179    }
180
181    result
182}
183
/// Escape HTML special characters
///
/// Replaces `&`, `<`, `>`, `"` and `'` with their entity forms so the result
/// can be embedded in element content as well as in single- or double-quoted
/// attribute values. All other characters are copied unchanged.
///
/// The input is processed in a single pass, so an `&` that is already part of
/// an entity in the input is escaped again (`&amp;` becomes `&amp;amp;`).
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            // Needed for safety inside single-quoted attribute values.
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }

    escaped
}
191
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal document holding a single run of unformatted text.
    const HELLO_WORLD: &[u8] = br#"{\rtf1\ansi\deff0 Hello World}"#;

    /// Convert `rtf` and fail the test if the conversion reports an error.
    fn html_of(rtf: &[u8]) -> String {
        rtf_to_html(rtf).expect("conversion of well-formed RTF should succeed")
    }

    #[test]
    fn test_simple_rtf_to_html() {
        let html = html_of(HELLO_WORLD);
        assert!(html.contains("Hello World"));
    }

    #[test]
    fn test_rtf_with_paragraphs() {
        let html = html_of(br#"{\rtf1\ansi\deff0 First paragraph\par Second paragraph}"#);
        assert!(html.contains("<p>"));
    }

    #[test]
    fn test_escape_html() {
        let cases = [
            ("<script>", "&lt;script&gt;"),
            ("a & b", "a &amp; b"),
            ("\"quoted\"", "&quot;quoted&quot;"),
        ];
        for (input, expected) in cases {
            assert_eq!(escape_html(input), expected);
        }
    }

    #[test]
    fn test_plain_text_extraction() {
        let text = rtf_to_plain_text(HELLO_WORLD)
            .expect("plain text extraction of well-formed RTF should succeed");
        assert!(text.contains("Hello"));
    }

    #[test]
    fn test_rtf_with_bold_formatting() {
        let html = html_of(br#"{\rtf1\ansi\deff0 {\b Name:} John Doe}"#);
        let has_expected_content = html.contains("<strong>") || html.contains("Name");
        assert!(has_expected_content, "Expected formatted or plain content");
    }

    #[test]
    fn test_rtf_with_italic_formatting() {
        let html = html_of(br#"{\rtf1\ansi\deff0 Normal {\i italic text} more normal}"#);
        let has_expected_content = html.contains("<em>") || html.contains("italic");
        assert!(has_expected_content, "Expected formatted or plain content");
    }

    #[test]
    fn test_invalid_rtf() {
        assert!(rtf_to_html(b"not valid rtf content").is_err());
    }

    #[test]
    fn test_empty_rtf() {
        assert!(rtf_to_html(br#"{\rtf1\ansi\deff0 }"#).is_ok());
    }
}