pub mod dom;
pub mod error;
pub(crate) mod extraction;
pub mod metadata;
pub mod options;
pub mod result;
pub(crate) mod selector;
pub(crate) mod settings;
pub mod utils;
pub use error::TrafilaturaError;
pub use options::{Config, ExtractionFocus, FallbackCandidates, HtmlDateMode, Options};
pub use result::{ExtractResult, Metadata};
use crate::dom::Document;
use crate::extraction::{
baseline::baseline,
external::compare_external_extraction,
html_processing::{convert_tags, doc_cleaning, post_cleaning, prune_unwanted_nodes},
{extract_comments, extract_content},
};
use crate::settings::FORMAT_TAG_CATALOG;
use crate::utils::{
language::{check_html_language, language_classifier},
lru::LruCache,
text::duplicate_test,
};
/// Extracts the main content, comments and metadata from an HTML string.
///
/// Convenience entry point: `html` is parsed into a [`Document`] and handed
/// straight to [`extract_document`], whose result (or error) is returned
/// unchanged.
///
/// # Errors
///
/// Returns whatever [`TrafilaturaError`] `extract_document` reports for the
/// parsed document.
pub fn extract(html: &str, opts: &Options) -> Result<ExtractResult, TrafilaturaError> {
    extract_document(Document::parse(html), opts)
}
/// Runs the full extraction pipeline on an already parsed [`Document`].
///
/// The steps, in order:
/// 1. optional language pre-check on the HTML via `check_html_language`,
/// 2. metadata extraction (with an optional "essential metadata" gate),
/// 3. optional pruning of nodes matching `opts.prune_selector`,
/// 4. document cleaning and tag conversion,
/// 5. comment extraction (unless `opts.exclude_comments`),
/// 6. main content extraction, optionally compared against the external
///    fallback and/or replaced by the baseline extractor,
/// 7. size, duplicate and language checks on the extracted text,
/// 8. post-cleaning and serialization of content and comments to HTML.
///
/// `opts` is cloned internally, so the caller's options are never modified.
///
/// # Errors
///
/// * [`TrafilaturaError::LanguageMismatch`] - a target language is set and
///   either the HTML-level check fails (reported with an empty `got`) or the
///   classifier's result on the extracted text differs from the target.
/// * [`TrafilaturaError::MissingMetadata`] - `opts.has_essential_metadata` is
///   set and the title, URL or date is missing (checked in that order).
/// * [`TrafilaturaError::TreeTooLarge`] - `opts.max_tree_size` is set and the
///   content body still has more direct children than allowed after format
///   tags were stripped.
/// * [`TrafilaturaError::InsufficientContent`] - both the extracted text and
///   the extracted comments are shorter than their configured minimums.
/// * [`TrafilaturaError::DuplicateContent`] - `opts.deduplicate` is set and
///   `duplicate_test` flags the content.
pub fn extract_document(doc: Document, opts: &Options) -> Result<ExtractResult, TrafilaturaError> {
// Work on a private copy: `original_url` may be filled in from metadata below.
let mut opts = opts.clone();
// Cache handed to comment extraction, content extraction and the duplicate test.
let mut cache = LruCache::new(opts.config.cache_size);
// Early rejection based on the document itself, before any extraction work.
// No detected language is available at this point, hence the empty `got`.
if opts.target_language.is_some() && !check_html_language(&doc, &opts, false) {
return Err(TrafilaturaError::LanguageMismatch {
expected: opts.target_language.clone().unwrap_or_default(),
got: String::new(),
});
}
let mut meta = metadata::extract_metadata(&doc, &opts);
// When essential metadata is required, fail on the first missing field.
if opts.has_essential_metadata {
if meta.title.is_empty() {
return Err(TrafilaturaError::MissingMetadata("title".into()));
}
if meta.url.is_empty() {
return Err(TrafilaturaError::MissingMetadata("url".into()));
}
if meta.date.is_none() {
return Err(TrafilaturaError::MissingMetadata("date".into()));
}
}
// If the caller gave no URL, adopt the one found in the metadata, but only
// when it parses and uses the http or https scheme.
if opts.original_url.is_none() && !meta.url.is_empty() {
if let Ok(parsed) = url::Url::parse(&meta.url) {
if matches!(parsed.scheme(), "http" | "https") {
opts.original_url = Some(parsed);
}
}
}
let mut doc = doc;
// Remove every node matched by the user-supplied prune selector.
// Matches are removed in reverse order of the query result.
// NOTE(review): presumably so nested matches go before their ancestors - confirm.
// NOTE(review): the meaning of the `false` flag to `remove` is not visible here.
if let Some(sel) = &opts.prune_selector {
let root = doc.root();
let to_remove = doc.query_selector_all(root, sel);
for id in to_remove.into_iter().rev() {
doc.remove(id, false);
}
}
// Snapshots taken after pruning but before cleaning mutates `doc`:
// `doc_backup1` feeds the external-extraction comparison, `doc_backup2`
// feeds the baseline extractor.
let doc_backup1 = doc.clone_document();
let mut doc_backup2 = doc.clone_document();
doc_cleaning(&mut doc, &opts);
convert_tags(&mut doc, &opts);
// Comments are either extracted from `doc`, or skipped entirely. In the
// skipped case with precision focus, nodes matching `REMOVED_COMMENTS` are
// pruned and `doc` is replaced by the returned document.
let (comments_doc, tmp_comments) = if !opts.exclude_comments {
extract_comments(&mut doc, &mut cache, &opts)
} else {
if opts.focus == ExtractionFocus::FavorPrecision {
doc = prune_unwanted_nodes(&doc, selector::discard::REMOVED_COMMENTS, false);
}
(None, String::new())
};
// Lengths are measured in `char`s, not bytes.
let len_comments = tmp_comments.chars().count();
let (mut content_doc, mut tmp_body_text) = extract_content(&doc, &mut cache, &opts);
// Optionally let the external-extraction comparison replace the result,
// working from the uncleaned snapshot.
if opts.enable_fallback {
(content_doc, tmp_body_text) =
compare_external_extraction(&doc_backup1, content_doc, &opts);
}
// Too little text and not in precision mode: replace the result with the
// baseline extractor's output.
let len_text = tmp_body_text.chars().count();
if len_text < opts.config.min_extracted_size && opts.focus != ExtractionFocus::FavorPrecision {
(content_doc, tmp_body_text) = baseline(&mut doc_backup2);
}
// Tree-size guard: counts the direct children of the content body (or of
// the root when there is no body). On overflow, format tags are stripped
// once and the count is taken again before giving up.
if let Some(max_tree) = opts.max_tree_size {
let content_body = content_doc.body().unwrap_or_else(|| content_doc.root());
if content_doc.children(content_body).len() > max_tree {
let fmt_tags: Vec<&str> = FORMAT_TAG_CATALOG.iter().copied().collect();
content_doc.strip_tags(content_body, &fmt_tags);
let n_children = content_doc.children(content_body).len();
if n_children > max_tree {
return Err(TrafilaturaError::TreeTooLarge(n_children));
}
}
}
// Recompute the text length: the baseline step may have replaced the text.
let len_text = tmp_body_text.chars().count();
// Fail only when BOTH text and comments are below their minimum sizes.
if len_text < opts.config.min_output_size && len_comments < opts.config.min_output_comment_size
{
return Err(TrafilaturaError::InsufficientContent {
text_len: len_text,
comment_len: len_comments,
min_output_size: opts.config.min_output_size,
min_output_comment_size: opts.config.min_output_comment_size,
});
}
if opts.deduplicate {
let content_body = content_doc.body().unwrap_or_else(|| content_doc.root());
if duplicate_test(&content_doc, content_body, &mut cache, &opts) {
return Err(TrafilaturaError::DuplicateContent);
}
}
// Classify the language from the extracted text and comments, and enforce
// the target language when one is set.
let lang = language_classifier(&tmp_body_text, &tmp_comments);
if let Some(ref target) = opts.target_language {
if &lang != target {
return Err(TrafilaturaError::LanguageMismatch {
expected: target.clone(),
got: lang.clone(),
});
}
}
// A non-empty classifier result overrides the language found in the metadata.
if !lang.is_empty() {
meta.language = lang;
}
post_cleaning(&mut content_doc);
let mut comments_doc = comments_doc;
if let Some(ref mut cd) = comments_doc {
post_cleaning(cd);
}
// Serialize the inner HTML of the body, or of the root when there is no body.
let content_body = content_doc.body().unwrap_or_else(|| content_doc.root());
let content_html = content_doc.inner_html(content_body);
let comments_html = if let Some(ref cd) = comments_doc {
let comments_body = cd.body().unwrap_or_else(|| cd.root());
cd.inner_html(comments_body)
} else {
String::new()
};
Ok(ExtractResult {
content_text: tmp_body_text,
comments_text: tmp_comments,
content_html,
comments_html,
metadata: meta,
})
}
pub fn create_readable_document(result: &ExtractResult) -> String {
let m = &result.metadata;
let escape = |s: &str| {
s.replace('&', "&")
.replace('"', """)
.replace('<', "<")
.replace('>', ">")
};
let date_str = match m.date {
Some(d) => d.format("%Y-%m-%d").to_string(),
None => String::new(),
};
let categories = m.categories.join(", ");
let tags = m.tags.join("; ");
let mut html = String::with_capacity(1024);
html.push_str("<html><head>");
for (name, value) in &[
("title", m.title.as_str()),
("author", m.author.as_str()),
("url", m.url.as_str()),
("hostname", m.hostname.as_str()),
("description", m.description.as_str()),
("sitename", m.sitename.as_str()),
("date", date_str.as_str()),
("categories", categories.as_str()),
("tags", tags.as_str()),
("license", m.license.as_str()),
] {
html.push_str(r#"<meta name=""#);
html.push_str(name);
html.push_str(r#"" content=""#);
html.push_str(&escape(value));
html.push_str(r#""/>"#);
}
html.push_str("</head><body>");
if !result.content_html.is_empty() {
html.push_str(r#"<div id="content-body">"#);
html.push_str(&result.content_html);
html.push_str("</div>");
}
if !result.comments_html.is_empty() {
html.push_str(r#"<div id="comments-body">"#);
html.push_str(&result.comments_html);
html.push_str("</div>");
}
html.push_str("</body></html>");
html
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `body` in a minimal HTML page whose `<title>` is "Test".
    fn simple_article(body: &str) -> String {
        format!("<html><head><title>Test</title></head><body>{body}</body></html>")
    }

    /// A plain `<article>` with enough text yields non-empty content text.
    #[test]
    fn test_extract_basic_article() {
        let html = simple_article(
            "<article><p>This is the main content of the article. It has enough text to pass \
             the minimum size threshold for extraction and should appear in the result.</p></article>",
        );
        let result = extract(&html, &Options::default()).unwrap();
        assert!(
            !result.content_text.is_empty(),
            "should extract content text"
        );
        assert!(
            result.content_text.contains("main content"),
            "content should contain article text"
        );
    }

    /// Text from `<nav>` and `<script>` must not leak into the content.
    #[test]
    fn test_extract_strips_scripts_and_nav() {
        let html = simple_article(
            "<nav>Navigation</nav>\
             <script>alert('x')</script>\
             <article><p>Real content here that is long enough to be extracted without \
             any issues from the minimum size requirements.</p></article>",
        );
        let result = extract(&html, &Options::default()).unwrap();
        assert!(
            !result.content_text.contains("Navigation"),
            "nav should be stripped"
        );
        assert!(
            !result.content_text.contains("alert"),
            "script should be stripped"
        );
    }

    /// Empty input cannot satisfy the extraction checks and must be an error.
    #[test]
    fn test_extract_empty_html_returns_error() {
        let result = extract("", &Options::default());
        assert!(result.is_err(), "empty HTML should return an error");
    }

    /// With `exclude_comments`, no comment text is returned.
    #[test]
    fn test_extract_exclude_comments() {
        let html = simple_article(
            "<article><p>Article content that is long enough to pass the threshold for \
             minimum extracted size in the extractor pipeline.</p></article>\
             <div id=\"comments\"><p>User comment here</p></div>",
        );
        let opts = Options::default().with_exclude_comments(true);
        let result = extract(&html, &opts).unwrap();
        assert!(
            result.comments_text.is_empty(),
            "comments should be excluded"
        );
    }

    /// Essential metadata required, but the page has no title.
    #[test]
    fn test_extract_missing_essential_metadata_title() {
        let html = "<html><body><p>Content that is long enough to pass the minimum size \
                    threshold for the extraction algorithm to work properly.</p></body></html>";
        let opts = Options::default().with_essential_metadata(true);
        let result = extract(html, &opts);
        assert!(
            matches!(result, Err(TrafilaturaError::MissingMetadata(_))),
            "should fail with missing metadata"
        );
    }

    /// Smoke test: recall mode must run to completion without panicking.
    /// The outcome (`Ok` or `Err`) is deliberately not asserted.
    #[test]
    fn test_extract_favor_recall_option() {
        let html = simple_article(
            "<div class='content'><p>Some content in a div that recall mode should pick up \
             even without a standard article tag structure.</p></div>",
        );
        let opts = Options::default().with_focus(ExtractionFocus::FavorRecall);
        let _ = extract(&html, &opts);
    }

    /// The page title ends up in the returned metadata.
    #[test]
    fn test_extract_document_returns_metadata() {
        let html = r#"<html>
<head>
<title>My Article Title</title>
<meta name="author" content="Jane Doe" />
</head>
<body>
<article>
<p>Article content that is long enough to pass the minimum size threshold
for the extraction algorithm to return a valid result without errors.</p>
</article>
</body>
</html>"#;
        let result = extract(html, &Options::default()).unwrap();
        assert!(!result.metadata.title.is_empty(), "should extract title");
    }

    /// Successful extraction also fills in the serialized content HTML.
    #[test]
    fn test_extract_content_html_populated() {
        let html = simple_article(
            "<article><p>Content text that is long enough to pass all minimum size checks \
             and produce a non-empty HTML output in the result struct.</p></article>",
        );
        let result = extract(&html, &Options::default()).unwrap();
        assert!(
            !result.content_html.is_empty(),
            "content_html should be populated"
        );
    }

    /// Essential metadata required: a title is present but no URL.
    #[test]
    fn test_extract_missing_essential_metadata_url() {
        let html = "<html><head><title>My Title</title></head>\
                    <body><article><p>Content that is long enough to pass the minimum \
                    size threshold for the extraction algorithm.</p></article></body></html>";
        let opts = Options::default().with_essential_metadata(true);
        let result = extract(html, &opts);
        assert!(
            matches!(result, Err(TrafilaturaError::MissingMetadata(_))),
            "should fail: no URL in metadata"
        );
    }

    /// Essential metadata required: title and canonical URL present, no date.
    #[test]
    fn test_extract_missing_essential_metadata_date() {
        let html = r#"<html>
<head>
<title>My Title</title>
<link rel="canonical" href="https://example.com/article" />
</head>
<body><article><p>Content that is long enough to pass the minimum size
threshold for the extraction algorithm to work correctly.</p></article></body>
</html>"#;
        let opts = Options::default().with_essential_metadata(true);
        let result = extract(html, &opts);
        assert!(
            matches!(result, Err(TrafilaturaError::MissingMetadata(_))),
            "should fail: no date in metadata"
        );
    }

    /// Nodes matching the prune selector are removed before extraction.
    #[test]
    fn test_extract_prune_selector() {
        let html = simple_article(
            "<article><p>Keep this content that is definitely long enough to \
             pass the minimum size threshold.</p></article>\
             <div class=\"sidebar\"><p>Remove this sidebar text.</p></div>",
        );
        let opts = Options::default().with_prune_selector(".sidebar");
        let result = extract(&html, &opts).unwrap();
        assert!(
            !result.content_text.contains("Remove this sidebar"),
            "pruned element should not appear in output"
        );
        assert!(
            result.content_text.contains("Keep this content"),
            "non-pruned content should survive"
        );
    }

    /// 200 paragraphs against a limit of 10 children must be rejected.
    #[test]
    fn test_extract_max_tree_size_error() {
        let many_ps: String = (0..200)
            .map(|i| format!("<p>Paragraph number {i} with enough text.</p>"))
            .collect();
        let html = simple_article(&many_ps);
        let opts = Options::default().with_max_tree_size(10);
        let result = extract(&html, &opts);
        assert!(
            matches!(result, Err(TrafilaturaError::TreeTooLarge(_))),
            "should return TreeTooLarge when tree exceeds max_tree_size"
        );
    }

    /// English text with target language "zh" must be a language mismatch.
    #[test]
    fn test_extract_target_language_rejects_unknown() {
        let html = simple_article(
            "<article><p>Short text that is just barely long enough to pass the minimum \
             size threshold but may not be long enough to detect a language reliably.</p></article>",
        );
        let opts = Options::default().with_target_language("zh");
        let result = extract(&html, &opts);
        assert!(
            matches!(result, Err(TrafilaturaError::LanguageMismatch { .. })),
            "should reject content when detected language != target language"
        );
    }

    /// Full metadata round-trip: every meta tag, the content wrapper, and no
    /// comments wrapper when `comments_html` is empty.
    #[test]
    fn test_create_readable_document_structure() {
        let result = ExtractResult {
            content_text: "Hello world".into(),
            comments_text: String::new(),
            content_html: "<p>Hello world</p>".into(),
            comments_html: String::new(),
            metadata: crate::result::Metadata {
                title: "My Title".into(),
                author: "Jane Doe".into(),
                url: "https://example.com/article".into(),
                hostname: "example.com".into(),
                description: "A description".into(),
                sitename: "Example".into(),
                date: chrono::NaiveDate::from_ymd_opt(2023, 4, 5),
                categories: vec!["Tech".into(), "News".into()],
                tags: vec!["rust".into(), "web".into()],
                license: "CC BY 4.0".into(),
                ..Default::default()
            },
        };
        let html = create_readable_document(&result);
        assert!(
            html.starts_with("<html><head>"),
            "should start with html/head"
        );
        assert!(
            html.ends_with("</body></html>"),
            "should end with /body/html"
        );
        assert!(html.contains(r#"name="title" content="My Title""#));
        assert!(html.contains(r#"name="author" content="Jane Doe""#));
        assert!(html.contains(r#"name="url" content="https://example.com/article""#));
        assert!(html.contains(r#"name="hostname" content="example.com""#));
        assert!(html.contains(r#"name="description" content="A description""#));
        assert!(html.contains(r#"name="sitename" content="Example""#));
        assert!(html.contains(r#"name="date" content="2023-04-05""#));
        assert!(html.contains(r#"name="categories" content="Tech, News""#));
        assert!(html.contains(r#"name="tags" content="rust; web""#));
        assert!(html.contains(r#"name="license" content="CC BY 4.0""#));
        assert!(html.contains(r#"<div id="content-body">"#));
        assert!(html.contains("<p>Hello world</p>"));
        assert!(!html.contains(r#"id="comments-body""#));
    }

    /// Non-empty `comments_html` is wrapped in the comments `<div>`.
    #[test]
    fn test_create_readable_document_with_comments() {
        let result = ExtractResult {
            content_html: "<p>Article</p>".into(),
            comments_html: "<p>A comment</p>".into(),
            ..Default::default()
        };
        let html = create_readable_document(&result);
        assert!(html.contains(r#"<div id="comments-body">"#));
        assert!(html.contains("<p>A comment</p>"));
    }

    /// A missing date still produces the meta tag, with empty content.
    #[test]
    fn test_create_readable_document_no_date() {
        let result = ExtractResult {
            content_html: "<p>Content</p>".into(),
            ..Default::default()
        };
        let html = create_readable_document(&result);
        assert!(html.contains(r#"name="date" content="""#));
    }

    /// `"`, `&`, `<` and `>` in metadata are emitted as HTML entities, and the
    /// raw markup never reaches the output.
    #[test]
    fn test_create_readable_document_escapes_special_chars() {
        let result = ExtractResult {
            content_html: "<p>Content</p>".into(),
            metadata: crate::result::Metadata {
                title: r#"Title with "quotes" & <tags>"#.into(),
                ..Default::default()
            },
            ..Default::default()
        };
        let html = create_readable_document(&result);
        assert!(html.contains("&quot;quotes&quot;"));
        assert!(html.contains("&amp;"));
        assert!(html.contains("&lt;tags&gt;"));
        assert!(
            !html.contains("<tags>"),
            "raw markup from metadata must not reach the output"
        );
    }
}