1pub mod detect;
36pub mod error;
37pub mod html;
38
39#[cfg(feature = "pdf")]
40pub mod pdf;
41
42use std::collections::HashMap;
43
44pub use detect::Format;
45pub use error::Error;
46
#[derive(Debug, Clone)]
/// Result of an extraction call: the produced text together with the format
/// it was handled as and any side information recorded along the way.
pub struct Extracted {
    /// The extracted text. Empty when nothing could be extracted (for
    /// example, the PDF branch of `extract_as` returns an empty string).
    pub text: String,
    /// The format the input was treated as when producing `text`.
    pub format: Format,
    /// String key/value annotations about the extraction. The functions in
    /// this file populate keys such as `"extractor"`, `"error"`, `"title"`,
    /// `"excerpt"`, `"readability_fallback"` and `"html2text_fallback"`;
    /// the map may also be empty.
    pub metadata: HashMap<String, String>,
}
57
58pub fn extract(content: &str) -> Extracted {
75 let format = detect::detect_str(content);
76 extract_as(content, format)
77}
78
79pub fn extract_as(content: &str, format: Format) -> Extracted {
84 match format {
85 Format::Html => {
86 let text = html::strip_to_text(content);
87 let mut metadata = HashMap::new();
88 metadata.insert("extractor".into(), "strip".into());
89 Extracted {
90 text,
91 format,
92 metadata,
93 }
94 }
95 Format::PlainText | Format::Markdown | Format::Unknown => Extracted {
96 text: content.to_string(),
97 format,
98 metadata: HashMap::new(),
99 },
100 Format::Pdf => Extracted {
101 text: String::new(),
102 format,
103 metadata: HashMap::from([(
104 "error".into(),
105 "PDF requires file path; use deformat::pdf::extract_file()".into(),
106 )]),
107 },
108 }
109}
110
/// Extract the main readable content of an HTML document.
///
/// Calls `html::extract_with_readability` with `html` and `url` (an empty
/// string is passed when `url` is `None`). On success the metadata contains
/// `"extractor" = "readability"` plus `"title"` and `"excerpt"` entries when
/// those values are present.
///
/// When the readability step yields nothing, the function falls back to
/// `html::strip_to_text` and marks the result with `"extractor" = "strip"`
/// and `"readability_fallback" = "true"`.
///
/// The returned format is always `Format::Html`.
#[cfg(feature = "readability")]
pub fn extract_readable(html: &str, url: Option<&str>) -> Extracted {
    let base_url = url.unwrap_or("");

    let (text, metadata) = match html::extract_with_readability(html, base_url) {
        // Readability produced nothing usable: fall back to plain tag stripping.
        None => (
            html::strip_to_text(html),
            HashMap::from([
                ("extractor".to_string(), "strip".to_string()),
                ("readability_fallback".to_string(), "true".to_string()),
            ]),
        ),
        Some((body, title, excerpt)) => {
            let mut meta =
                HashMap::from([("extractor".to_string(), "readability".to_string())]);
            // `Option` iterates over zero or one item, so absent values add nothing.
            meta.extend(title.map(|t| ("title".to_string(), t)));
            meta.extend(excerpt.map(|e| ("excerpt".to_string(), e)));
            (body, meta)
        }
    };

    Extracted {
        text,
        format: Format::Html,
        metadata,
    }
}
154
/// Convert HTML to text using the `html2text` crate.
///
/// `html` is fed to `html2text::from_read` as bytes together with `width`
/// (presumably the target line width — see the `html2text` documentation).
/// On success the metadata records `"extractor" = "html2text"`.
///
/// If `html2text` returns an error, the error is discarded and the function
/// falls back to `html::strip_to_text`, recording `"extractor" = "strip"` and
/// `"html2text_fallback" = "true"`.
///
/// The returned format is always `Format::Html`.
#[cfg(feature = "html2text")]
pub fn extract_html2text(html: &str, width: usize) -> Extracted {
    // Decide the text, which extractor produced it, and whether we fell back.
    let (text, extractor, fell_back) = match ::html2text::from_read(html.as_bytes(), width) {
        Ok(rendered) => (rendered, "html2text", false),
        Err(_) => (html::strip_to_text(html), "strip", true),
    };

    let mut metadata = HashMap::from([("extractor".to_string(), extractor.to_string())]);
    if fell_back {
        metadata.insert("html2text_fallback".to_string(), "true".to_string());
    }

    Extracted {
        text,
        format: Format::Html,
        metadata,
    }
}
192
#[cfg(test)]
mod tests {
    // Unit tests for the top-level extraction entry points defined in this
    // file. Tests for feature-gated functions carry the same `cfg` gate as
    // the function they exercise.
    use super::*;

    // An HTML fragment passed to `extract` is detected as HTML and its tags
    // are removed.
    #[test]
    fn extract_html_auto() {
        let result = extract("<p>Hello <b>world</b>!</p>");
        assert_eq!(result.text, "Hello world!");
        assert_eq!(result.format, Format::Html);
    }

    // A complete document (doctype, head, body) is detected as HTML; the body
    // text survives and no literal `<title>` tag remains in the output.
    #[test]
    fn extract_full_html_doc() {
        let html = "<!DOCTYPE html><html><head><title>T</title></head>\
                    <body><p>Content here.</p></body></html>";
        let result = extract(html);
        assert!(result.text.contains("Content here"));
        assert!(!result.text.contains("<title>"), "tags should be stripped");
        assert_eq!(result.format, Format::Html);
    }

    // Input without markup is returned unchanged and reported as plain text.
    #[test]
    fn extract_plain_text() {
        let result = extract("Just plain text, no markup.");
        assert_eq!(result.text, "Just plain text, no markup.");
        assert_eq!(result.format, Format::PlainText);
    }

    // Forcing `Format::Html` strips tags without running detection.
    #[test]
    fn extract_as_html() {
        let result = extract_as("<b>bold</b> text", Format::Html);
        assert_eq!(result.text, "bold text");
    }

    // Forcing `Format::PlainText` leaves markup-looking input untouched.
    #[test]
    fn extract_as_plain() {
        let result = extract_as("<b>not html</b>", Format::PlainText);
        assert_eq!(result.text, "<b>not html</b>");
    }

    // The HTML path records which extractor produced the text.
    #[test]
    fn extract_metadata_has_extractor() {
        let result = extract("<p>Hello</p>");
        assert_eq!(result.metadata.get("extractor").unwrap(), "strip");
    }

    // Empty input yields empty text and is classified as plain text.
    #[test]
    fn extract_empty_string() {
        let result = extract("");
        assert_eq!(result.text, "");
        assert_eq!(result.format, Format::PlainText);
    }

    // A multi-paragraph article wrapped in nav/footer chrome: the readability
    // path is expected to succeed (extractor = "readability") and keep the
    // article body text.
    #[cfg(feature = "readability")]
    #[test]
    fn extract_readable_with_article() {
        let html = r#"<!DOCTYPE html>
        <html><head><title>Test Article</title></head>
        <body>
        <nav><a href="/">Home</a></nav>
        <article>
        <h1>Test Article</h1>
        <p>A team of researchers at the University of Cambridge has announced
        the discovery of a previously unknown species. The discovery was
        published in the journal Nature. The finding represents one of the
        most significant discoveries in recent years and has drawn attention
        from conservation organizations worldwide.</p>
        <p>Lead researcher Dr. Sarah Chen said the species was found during
        an expedition in January. Chen and her team spent three weeks
        collecting specimens and documenting the habitat conditions where
        the species was found along tributary streams.</p>
        <p>Conservation groups including the World Wildlife Fund have called
        for increased protection of the region. Local communities have long
        known about the species but it had never been formally described.</p>
        <p>The research was funded by a grant from the European Research Council.
        Additional specimens will be housed at the Natural History Museum in
        London. Future expeditions are planned to search for related species
        in neighboring regions.</p>
        </article>
        <footer>Copyright 2026</footer>
        </body></html>"#;
        let result = extract_readable(html, Some("https://example.com/article"));
        assert!(result.text.contains("Dr. Sarah Chen"));
        assert_eq!(result.metadata.get("extractor").unwrap(), "readability");
    }

    // A tiny fragment with no URL: the result is expected to be marked as
    // having used the strip-based fallback.
    #[cfg(feature = "readability")]
    #[test]
    fn extract_readable_fallback_on_short() {
        let result = extract_readable("<p>Short</p>", None);
        assert_eq!(result.metadata.get("readability_fallback").unwrap(), "true");
    }

    // The html2text path keeps the visible words and tags the result with
    // extractor = "html2text". `contains` is used rather than equality, so
    // the exact rendering of bold text is not pinned here.
    #[cfg(feature = "html2text")]
    #[test]
    fn extract_html2text_basic() {
        let result = extract_html2text("<p>Hello <b>world</b>!</p>", 80);
        assert!(result.text.contains("Hello"));
        assert!(result.text.contains("world"));
        assert_eq!(result.metadata.get("extractor").unwrap(), "html2text");
    }
}