use dom_query::Document as DomDocument;
use crate::error::Result;
use crate::options::{Options, ReaderableOptions};
use crate::readability::{Article, Readability};
use crate::readerable::is_probably_readerable_doc;
/// An HTML document held both as a DOM and as its original source text.
///
/// Created with [`Document::new`]. The lifetime `'a` is the lifetime of the
/// HTML string handed to the constructor: the struct only borrows that text,
/// so the string must outlive the `Document`.
pub struct Document<'a> {
/// DOM produced from `html` via `DomDocument::from` in [`Document::new`].
doc: DomDocument,
/// Borrow of the HTML source the DOM was built from; passed on to
/// `Readability::from_document` together with the DOM by [`Document::parse`].
html: &'a str,
}
impl<'a> Document<'a> {
    /// Builds a [`Document`] from an HTML string.
    ///
    /// The DOM is constructed immediately with `DomDocument::from`. The borrow
    /// of `html` is stored next to it so that [`Document::parse`] can forward
    /// the original source text as well as the DOM.
    pub fn new(html: &'a str) -> Self {
        Self {
            doc: DomDocument::from(html),
            html,
        }
    }

    /// Runs the readerable check on this document's DOM.
    ///
    /// Thin wrapper around [`is_probably_readerable_doc`]: the DOM and
    /// `options` are passed through unchanged and its verdict is returned
    /// as-is. Only `&self` is required, so the document can still be parsed
    /// afterwards.
    pub fn is_probably_readerable(&self, options: Option<ReaderableOptions>) -> bool {
        let dom = &self.doc;
        is_probably_readerable_doc(dom, options)
    }

    /// Consumes the document and runs the readability parser over it.
    ///
    /// The DOM, the original HTML text, `url` and `options` are handed to
    /// `Readability::from_document`, and the result of calling `parse()` on
    /// the value it returns is returned unchanged.
    ///
    /// # Errors
    ///
    /// Returns whatever error the underlying `Readability` parse reports.
    pub fn parse(self, url: Option<&str>, options: Option<Options>) -> Result<Article> {
        // Take the two fields apart up front; `self` is consumed either way.
        let Self { doc, html } = self;
        // Kept as a single expression so `parse()` is invoked on the temporary,
        // exactly as before, regardless of the receiver type it declares.
        Readability::from_document(doc, html, url, options).parse()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: the multi-line HTML fixtures below live inside raw string literals,
    // so their lines are deliberately left flush-left. Indenting them would
    // change the text that is fed to the parser.

    /// A single paragraph of 600 characters should pass the readerable check.
    #[test]
    fn test_readerable_check() {
        let markup = format!("<html><body><p>{}</p></body></html>", "a".repeat(600));
        let document = Document::new(&markup);
        let readerable = document.is_probably_readerable(None);
        assert!(readerable);
    }

    /// A page whose only paragraph is the word "Short" should not be readerable.
    #[test]
    fn test_not_readerable() {
        let document = Document::new("<html><body><p>Short</p></body></html>");
        let readerable = document.is_probably_readerable(None);
        assert!(!readerable);
    }

    /// Parsing a small article with a URL yields a non-empty title and content.
    #[test]
    fn test_parse() {
        let markup = r#"
<html>
<head><title>Test Article</title></head>
<body>
<article>
<h1>Test Article</h1>
<p>This is the main content of the article. It contains several
paragraphs of text that make up the body of the article.</p>
<p>More content here to ensure we have enough text for the
readability algorithm to work with properly.</p>
</article>
</body>
</html>
"#;
        let parsed = Document::new(markup)
            .parse(Some("https://example.com"), None)
            .unwrap();
        assert!(!parsed.title.is_empty());
        assert!(!parsed.content.is_empty());
    }

    /// The readerable check only borrows the document, so the same value can
    /// be parsed afterwards (here without a URL).
    #[test]
    fn test_readerable_then_parse() {
        let filler = "a ".repeat(300);
        let markup = format!(
            r#"
<html>
<head><title>Test Article</title></head>
<body>
<article>
<h1>Test Article</h1>
<p>{}</p>
</article>
</body>
</html>
"#,
            filler
        );
        let document = Document::new(&markup);
        assert!(document.is_probably_readerable(None));

        let parsed = document.parse(None, None).unwrap();
        assert!(!parsed.content.is_empty());
    }

    /// Input that has a `<head>` but no `<body>` is expected to fail to parse.
    #[test]
    fn test_parse_no_body() {
        let outcome =
            Document::new("<html><head><title>No Body</title></head></html>").parse(None, None);
        assert!(outcome.is_err());
    }

    /// A string that is not a valid URL is expected to make `parse` fail.
    #[test]
    fn test_parse_invalid_url() {
        let outcome = Document::new("<html><body><p>Content</p></body></html>")
            .parse(Some("not a valid url ://"), None);
        assert!(outcome.is_err());
    }
}