[−][src]Trait extrablatt::extract::Extractor
Used to retrieve all valuable information from a `select::document::Document`.
Provided methods
fn title<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>
Extract the article title.
Assumptions:
- `og:title` usually contains the plain title, but shortened compared to `<h1>`.
- The `<title>` tag is the most reliable, but often also contains the newspaper name, like: "Some title - The New York Times".
- `<h1>`, if properly detected, is the best, since this is also displayed to users.
Matching strategy:
1. `<h1>` takes precedence over `og:title`.
2. `og:title` takes precedence over `<title>`.
fn authors<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>
Extract all the listed authors for the article.
fn publishing_date(
&self,
doc: &Document,
base_url: Option<&Url>
) -> Option<ArticleDate>
&self,
doc: &Document,
base_url: Option<&Url>
) -> Option<ArticleDate>
When the article was published (and last updated).
fn favicon(&self, doc: &Document, base_url: &Url) -> Option<Url>
Extract the favicon from a website.
fn base_url(&self, doc: &Document) -> Option<Url>
Finds the `href` in the `<base>` tag.
fn meta_language(&self, doc: &Document) -> Option<Language>
Extract content language from meta tag.
fn meta_data<'a>(&self, doc: &'a Document) -> Vec<MetaNode<'a>>
Finds all `<meta>` nodes in the document.
fn meta_content<'a, 'b>(
&self,
doc: &'a Document,
attr: Attr<&'b str, &'b str>
) -> Option<Cow<'a, str>>
&self,
doc: &'a Document,
attr: Attr<&'b str, &'b str>
) -> Option<Cow<'a, str>>
Extract the content of a given meta tag from the document.
fn meta_thumbnail_url(
&self,
doc: &Document,
base_url: Option<&Url>
) -> Option<Url>
&self,
doc: &Document,
base_url: Option<&Url>
) -> Option<Url>
Extract the thumbnail for the article.
fn meta_img_url(&self, doc: &Document, base_url: Option<&Url>) -> Option<Url>
Extract the 'top img' as specified by the website.
fn meta_type<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>
Returns the meta type of the article (Open Graph protocol).
fn meta_site_name<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>
Returns site name of article, open graph protocol.
fn meta_description<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>
If the article has a meta description set in the source, use that.
fn meta_keywords<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>
If the article has meta keywords set in the source, use that.
fn text<'a>(&self, doc: &'a Document, lang: Language) -> Option<Cow<'a, str>>
Get the full text of the article.
fn text_with_cleaner<'a, T: DocumentCleaner>(
&self,
doc: &'a Document,
lang: Language,
cleaner: T
) -> Option<Cow<'a, str>>
&self,
doc: &'a Document,
lang: Language,
cleaner: T
) -> Option<Cow<'a, str>>
Get the full text of the article with a designated `DocumentCleaner`.
fn article_node<'a>(
&self,
doc: &'a Document,
lang: Language
) -> Option<ArticleTextNode<'a>>
&self,
doc: &'a Document,
lang: Language
) -> Option<ArticleTextNode<'a>>
Detect the `select::node::Node` that contains the article's text.
If the doc's body contains a node that matches the `crate::text::ARTICLE_BODY_ATTR` attribute selectors, this node will be selected. Otherwise the article node will be calculated by analysing and scoring the textual content of text nodes.
fn all_urls<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>
Extract the `href` attribute for all `<a>` tags of the document.
fn article_urls(
&self,
doc: &Document,
base_url: Option<&Url>
) -> Vec<ArticleUrl>
&self,
doc: &Document,
base_url: Option<&Url>
) -> Vec<ArticleUrl>
Finds all urls from the document that might point to an article.
fn image_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<Url>
Extract all of the images of the document.
fn is_article(article: &ArticleUrl, base_url: &Url) -> bool
First, perform basic format and domain checks, such as making sure the url is well-formed.
We also filter out articles with a subdomain or first-degree path that matches a registered bad keyword.
fn is_category(category: &Category, base_url: &Url) -> bool
fn categories(&self, doc: &Document, base_url: &Url) -> Vec<Category>
Finds all of the top level urls, assuming that these are the category urls.
fn article_content<'a>(
&self,
doc: &'a Document,
base_url: Option<&Url>,
lang: Option<Language>
) -> ArticleContent<'a>
&self,
doc: &'a Document,
base_url: Option<&Url>,
lang: Option<Language>
) -> ArticleContent<'a>
Gathers all items for an article from the document.
fn canonical_link(&self, doc: &Document) -> Option<Url>
Return the article's canonical URL.
Gets the first available value of:
- The rel=canonical tag
- The og:url tag
fn videos<'a>(
&self,
doc: &'a Document,
lang: Option<Language>
) -> Vec<VideoNode<'a>>
&self,
doc: &'a Document,
lang: Option<Language>
) -> Vec<VideoNode<'a>>
All video content in the article.