Skip to main content

deformat/
lib.rs

1//! De-format: extract plain text from HTML, PDF, and other document formats.
2//!
3//! NER engines, LLM pipelines, and search indexers need plain text.
//! `deformat` sits upstream: it takes formatted documents and returns clean
//! text. The core API performs no I/O -- it operates on `&str` and `&[u8]`
//! inputs (only the optional `pdf` feature reads from file paths).
6//!
7//! # Quick start
8//!
9//! ```
10//! use deformat::{extract, Format};
11//!
12//! // Auto-detect format and extract text
13//! let result = extract("<p>Hello <b>world</b>!</p>");
14//! assert_eq!(result.text, "Hello world!");
15//! assert_eq!(result.format, Format::Html);
16//!
17//! // Plain text passes through unchanged
18//! let result = extract("Just plain text.");
19//! assert_eq!(result.text, "Just plain text.");
20//! assert_eq!(result.format, Format::PlainText);
21//! ```
22//!
23//! # Feature flags
24//!
//! All features are opt-in. The default build depends only on `once_cell`
//! and `regex` (used for HTML entity decoding and boilerplate removal).
28//!
29//! | Feature | Crate | What it adds |
30//! |---------|-------|-------------|
31//! | `readability` | [`dom_smoothie`] | Mozilla Readability article extraction |
32//! | `html2text` | [`html2text`] | DOM-based HTML-to-text with layout awareness |
33//! | `pdf` | [`pdf-extract`] | PDF text extraction from file paths |
34
35pub mod detect;
36pub mod error;
37pub mod html;
38
39#[cfg(feature = "pdf")]
40pub mod pdf;
41
42use std::collections::HashMap;
43
44pub use detect::Format;
45pub use error::Error;
46
47/// Extracted text with metadata about the source document.
48#[derive(Debug, Clone)]
49pub struct Extracted {
50    /// The extracted plain text content.
51    pub text: String,
52    /// The detected (or specified) source format.
53    pub format: Format,
54    /// Metadata from extraction (title, extractor used, language, etc.).
55    pub metadata: HashMap<String, String>,
56}
57
58/// Extract plain text from content, auto-detecting the format.
59///
60/// Examines the content to determine whether it is HTML or plain text,
61/// then applies the appropriate extraction strategy. Plain text and
62/// markdown pass through unchanged.
63///
64/// For PDF extraction, use [`extract_bytes`] or the [`pdf`] module
65/// (requires the `pdf` feature).
66///
67/// # Examples
68///
69/// ```
70/// let result = deformat::extract("<html><body><p>Hello</p></body></html>");
71/// assert!(result.text.contains("Hello"));
72/// assert_eq!(result.format, deformat::Format::Html);
73/// ```
74pub fn extract(content: &str) -> Extracted {
75    let format = detect::detect_str(content);
76    extract_as(content, format)
77}
78
79/// Extract plain text with an explicit format override.
80///
81/// Skips format detection and applies the specified extraction strategy
82/// directly.
83pub fn extract_as(content: &str, format: Format) -> Extracted {
84    match format {
85        Format::Html => {
86            let text = html::strip_to_text(content);
87            let mut metadata = HashMap::new();
88            metadata.insert("extractor".into(), "strip".into());
89            Extracted {
90                text,
91                format,
92                metadata,
93            }
94        }
95        Format::PlainText | Format::Markdown | Format::Unknown => Extracted {
96            text: content.to_string(),
97            format,
98            metadata: HashMap::new(),
99        },
100        Format::Pdf => Extracted {
101            text: String::new(),
102            format,
103            metadata: HashMap::from([(
104                "error".into(),
105                "PDF requires file path; use deformat::pdf::extract_file()".into(),
106            )]),
107        },
108    }
109}
110
/// Extract the main article content from HTML via readability analysis.
///
/// Mozilla Readability extraction (content-focused, removes boilerplate)
/// is tried first. If it yields nothing -- it fails or produces
/// insufficient content (< 50 chars) -- the HTML is tag-stripped instead,
/// and the metadata records `"readability_fallback" = "true"`.
///
/// On success the metadata holds `"extractor" = "readability"` plus
/// `"title"` and `"excerpt"` when they are available.
///
/// Requires the `readability` feature.
///
/// # Arguments
///
/// * `html` - HTML content to extract from.
/// * `url` - Optional source URL (improves link resolution and metadata).
///   When `None`, an empty string is passed to the extractor.
#[cfg(feature = "readability")]
pub fn extract_readable(html: &str, url: Option<&str>) -> Extracted {
    let source_url = url.unwrap_or_default();
    let mut metadata: HashMap<String, String> = HashMap::new();

    let text = if let Some((article, title, excerpt)) =
        html::extract_with_readability(html, source_url)
    {
        metadata.insert(String::from("extractor"), String::from("readability"));
        // Title and excerpt are optional; record only the ones present.
        metadata.extend(title.map(|t| (String::from("title"), t)));
        metadata.extend(excerpt.map(|e| (String::from("excerpt"), e)));
        article
    } else {
        // Readability gave no usable result: fall back to plain tag stripping.
        metadata.insert(String::from("extractor"), String::from("strip"));
        metadata.insert(String::from("readability_fallback"), String::from("true"));
        html::strip_to_text(html)
    };

    Extracted {
        text,
        format: Format::Html,
        metadata,
    }
}
154
/// Convert HTML to text with the DOM-based, layout-aware `html2text` crate.
///
/// The output respects block structure, tables, and link footnotes. If
/// `html2text` reports an error, the HTML is tag-stripped instead and the
/// metadata records `"html2text_fallback" = "true"`.
///
/// Requires the `html2text` feature.
///
/// # Arguments
///
/// * `html` - HTML content to convert.
/// * `width` - Target line width for wrapping (e.g., 80, 120, or 10000
///   for effectively no wrapping).
#[cfg(feature = "html2text")]
pub fn extract_html2text(html: &str, width: usize) -> Extracted {
    // `::html2text` is the external crate; `html::` is this crate's module.
    let (text, metadata) = match ::html2text::from_read(html.as_bytes(), width) {
        Ok(rendered) => (
            rendered,
            HashMap::from([(String::from("extractor"), String::from("html2text"))]),
        ),
        Err(_) => (
            html::strip_to_text(html),
            HashMap::from([
                (String::from("extractor"), String::from("strip")),
                (String::from("html2text_fallback"), String::from("true")),
            ]),
        ),
    };

    Extracted {
        text,
        format: Format::Html,
        metadata,
    }
}
192
#[cfg(test)]
mod tests {
    use super::*;

    /// Look up a metadata entry as a `&str` for concise assertions.
    fn meta<'a>(extracted: &'a Extracted, key: &str) -> Option<&'a str> {
        extracted.metadata.get(key).map(String::as_str)
    }

    // Auto-detection recognises an HTML fragment and strips its tags.
    #[test]
    fn extract_html_auto() {
        let out = extract("<p>Hello <b>world</b>!</p>");
        assert_eq!(out.text, "Hello world!");
        assert_eq!(out.format, Format::Html);
    }

    // A complete HTML document is detected and its markup removed.
    #[test]
    fn extract_full_html_doc() {
        let doc = "<!DOCTYPE html><html><head><title>T</title></head>\
                     <body><p>Content here.</p></body></html>";
        let out = extract(doc);
        assert!(out.text.contains("Content here"));
        assert!(!out.text.contains("<title>"), "tags should be stripped");
        assert_eq!(out.format, Format::Html);
    }

    // Text without markup passes through untouched.
    #[test]
    fn extract_plain_text() {
        let input = "Just plain text, no markup.";
        let out = extract(input);
        assert_eq!(out.text, "Just plain text, no markup.");
        assert_eq!(out.format, Format::PlainText);
    }

    // An explicit HTML override strips tags.
    #[test]
    fn extract_as_html() {
        let out = extract_as("<b>bold</b> text", Format::Html);
        assert_eq!(out.text, "bold text");
    }

    // An explicit plain-text override leaves markup in place.
    #[test]
    fn extract_as_plain() {
        let out = extract_as("<b>not html</b>", Format::PlainText);
        assert_eq!(out.text, "<b>not html</b>");
    }

    // HTML extraction records which extractor produced the text.
    #[test]
    fn extract_metadata_has_extractor() {
        let out = extract("<p>Hello</p>");
        assert_eq!(meta(&out, "extractor"), Some("strip"));
    }

    // Empty input is treated as plain text and yields empty output.
    #[test]
    fn extract_empty_string() {
        let out = extract("");
        assert_eq!(out.text, "");
        assert_eq!(out.format, Format::PlainText);
    }

    // A realistic article page is handled by the readability extractor.
    #[cfg(feature = "readability")]
    #[test]
    fn extract_readable_with_article() {
        let page = r#"<!DOCTYPE html>
        <html><head><title>Test Article</title></head>
        <body>
            <nav><a href="/">Home</a></nav>
            <article>
                <h1>Test Article</h1>
                <p>A team of researchers at the University of Cambridge has announced
                   the discovery of a previously unknown species. The discovery was
                   published in the journal Nature. The finding represents one of the
                   most significant discoveries in recent years and has drawn attention
                   from conservation organizations worldwide.</p>
                <p>Lead researcher Dr. Sarah Chen said the species was found during
                   an expedition in January. Chen and her team spent three weeks
                   collecting specimens and documenting the habitat conditions where
                   the species was found along tributary streams.</p>
                <p>Conservation groups including the World Wildlife Fund have called
                   for increased protection of the region. Local communities have long
                   known about the species but it had never been formally described.</p>
                <p>The research was funded by a grant from the European Research Council.
                   Additional specimens will be housed at the Natural History Museum in
                   London. Future expeditions are planned to search for related species
                   in neighboring regions.</p>
            </article>
            <footer>Copyright 2026</footer>
        </body></html>"#;
        let out = extract_readable(page, Some("https://example.com/article"));
        assert!(out.text.contains("Dr. Sarah Chen"));
        assert_eq!(meta(&out, "extractor"), Some("readability"));
    }

    // Content too short for readability triggers the tag-stripping fallback.
    #[cfg(feature = "readability")]
    #[test]
    fn extract_readable_fallback_on_short() {
        let out = extract_readable("<p>Short</p>", None);
        assert_eq!(meta(&out, "readability_fallback"), Some("true"));
    }

    // The html2text path keeps the visible words and tags its extractor.
    #[cfg(feature = "html2text")]
    #[test]
    fn extract_html2text_basic() {
        let out = extract_html2text("<p>Hello <b>world</b>!</p>", 80);
        assert!(out.text.contains("Hello"));
        assert!(out.text.contains("world"));
        assert_eq!(meta(&out, "extractor"), Some("html2text"));
    }
}