[−][src]Trait extrablatt::extract::Extractor

pub trait Extractor {
    fn title<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
    fn authors<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> { ... }
    fn publishing_date(
        &self, 
        doc: &Document, 
        base_url: Option<&Url>
    ) -> Option<ArticleDate> { ... }
    fn favicon(&self, doc: &Document, base_url: &Url) -> Option<Url> { ... }
    fn base_url(&self, doc: &Document) -> Option<Url> { ... }
    fn meta_language(&self, doc: &Document) -> Option<Language> { ... }
    fn meta_data<'a>(&self, doc: &'a Document) -> Vec<MetaNode<'a>> { ... }
    fn meta_content<'a, 'b>(
        &self, 
        doc: &'a Document, 
        attr: Attr<&'b str, &'b str>
    ) -> Option<Cow<'a, str>> { ... }
    fn meta_thumbnail_url(
        &self, 
        doc: &Document, 
        base_url: Option<&Url>
    ) -> Option<Url> { ... }
    fn meta_img_url(
        &self, 
        doc: &Document, 
        base_url: Option<&Url>
    ) -> Option<Url> { ... }
    fn meta_type<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
    fn meta_site_name<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
    fn meta_description<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
    fn meta_keywords<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> { ... }
    fn text<'a>(
        &self, 
        doc: &'a Document, 
        lang: Language
    ) -> Option<Cow<'a, str>> { ... }
    fn text_with_cleaner<'a, T: DocumentCleaner>(
        &self, 
        doc: &'a Document, 
        lang: Language, 
        cleaner: T
    ) -> Option<Cow<'a, str>> { ... }
    fn article_node<'a>(
        &self, 
        doc: &'a Document, 
        lang: Language
    ) -> Option<ArticleTextNode<'a>> { ... }
    fn all_urls<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> { ... }
    fn article_urls(
        &self, 
        doc: &Document, 
        base_url: Option<&Url>
    ) -> Vec<ArticleUrl> { ... }
    fn image_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<Url> { ... }
    fn is_article(article: &ArticleUrl, base_url: &Url) -> bool { ... }
    fn is_category(category: &Category, base_url: &Url) -> bool { ... }
    fn categories(&self, doc: &Document, base_url: &Url) -> Vec<Category> { ... }
    fn article_content<'a>(
        &self, 
        doc: &'a Document, 
        base_url: Option<&Url>, 
        lang: Option<Language>
    ) -> ArticleContent<'a> { ... }
    fn canonical_link(&self, doc: &Document) -> Option<Url> { ... }
    fn videos<'a>(
        &self, 
        doc: &'a Document, 
        lang: Option<Language>
    ) -> Vec<VideoNode<'a>> { ... }
}

Used to retrieve all valuable information from a select::document::Document.

Provided methods

`fn title<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>`

Extract the article title.

Assumptions:

og:title usually contains the plain title, but shortened compared to <h1>
<title> tag is the most reliable, but often also contains the newspaper name, like: "Some title - The New York Times"
<h1>, if properly detected, is the best since this is also displayed to users

Matching strategy:

<h1> takes precedence over og:title
og:title takes precedence over <title>

`fn authors<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>`

Extract all the listed authors for the article.

`fn publishing_date( &self, doc: &Document, base_url: Option<&Url> ) -> Option<ArticleDate>`

When the article was published (and last updated).

`fn favicon(&self, doc: &Document, base_url: &Url) -> Option<Url>`

Extract the favicon from a website.

`fn base_url(&self, doc: &Document) -> Option<Url>`

Finds the href in the <base> tag.

`fn meta_language(&self, doc: &Document) -> Option<Language>`

Extract content language from meta tag.

`fn meta_data<'a>(&self, doc: &'a Document) -> Vec<MetaNode<'a>>`

Finds all <meta> nodes in the document.

`fn meta_content<'a, 'b>( &self, doc: &'a Document, attr: Attr<&'b str, &'b str> ) -> Option<Cow<'a, str>>`

Extract the content of a given meta tag from the document.

`fn meta_thumbnail_url( &self, doc: &Document, base_url: Option<&Url> ) -> Option<Url>`

Extract the thumbnail for the article.

`fn meta_img_url(&self, doc: &Document, base_url: Option<&Url>) -> Option<Url>`

Extract the 'top img' as specified by the website.

`fn meta_type<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>`

Returns the meta type of the article, as given by the Open Graph protocol.

`fn meta_site_name<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>`

Returns the site name of the article, as given by the Open Graph protocol.

`fn meta_description<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>`

If the article has a meta description set in the source, use that.

`fn meta_keywords<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>`

If the article has meta keywords set in the source, use that.

`fn text<'a>(&self, doc: &'a Document, lang: Language) -> Option<Cow<'a, str>>`

Get the full text of the article.

`fn text_with_cleaner<'a, T: DocumentCleaner>( &self, doc: &'a Document, lang: Language, cleaner: T ) -> Option<Cow<'a, str>>`

Get the full text of the article with a designated DocumentCleaner.

`fn article_node<'a>( &self, doc: &'a Document, lang: Language ) -> Option<ArticleTextNode<'a>>`

Detect the select::node::Node that contains the article's text.

If the doc's body contains a node that matches the crate::text::ARTICLE_BODY_ATTR attribute selectors, this node will be selected. Otherwise the article node will be calculated by analysing and scoring the textual content of text nodes.

`fn all_urls<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>`

Extract the href attribute for all <a> tags of the document.

`fn article_urls( &self, doc: &Document, base_url: Option<&Url> ) -> Vec<ArticleUrl>`

Finds all urls from the document that might point to an article.

`fn image_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<Url>`

Extract all of the images of the document.

`fn is_article(article: &ArticleUrl, base_url: &Url) -> bool`

First, perform basic format and domain checks, like making sure the format of the url is valid.

We also filter out articles whose subdomain or first-level path segment is a registered bad keyword.

`fn is_category(category: &Category, base_url: &Url) -> bool`

`fn categories(&self, doc: &Document, base_url: &Url) -> Vec<Category>`

Finds all of the top level urls, assuming that these are the category urls.

`fn article_content<'a>( &self, doc: &'a Document, base_url: Option<&Url>, lang: Option<Language> ) -> ArticleContent<'a>`

Gathers all items for an article from the document.

`fn canonical_link(&self, doc: &Document) -> Option<Url>`

Return the article's canonical URL.

Gets the first available value of:

The rel=canonical tag
The og:url tag

`fn videos<'a>( &self, doc: &'a Document, lang: Option<Language> ) -> Vec<VideoNode<'a>>`

All video content in the article.

Loading content...

Implementors

`impl Extractor for DefaultExtractor`[src]

Loading content...

[−][src]Trait extrablatt::extract::Extractor

Provided methods

fn title<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

fn authors<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>

fn publishing_date( &self, doc: &Document, base_url: Option<&Url>) -> Option<ArticleDate>

fn favicon(&self, doc: &Document, base_url: &Url) -> Option<Url>

fn base_url(&self, doc: &Document) -> Option<Url>

fn meta_language(&self, doc: &Document) -> Option<Language>

fn meta_data<'a>(&self, doc: &'a Document) -> Vec<MetaNode<'a>>

fn meta_content<'a, 'b>( &self, doc: &'a Document, attr: Attr<&'b str, &'b str>) -> Option<Cow<'a, str>>

fn meta_thumbnail_url( &self, doc: &Document, base_url: Option<&Url>) -> Option<Url>

fn meta_img_url(&self, doc: &Document, base_url: Option<&Url>) -> Option<Url>

fn meta_type<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

fn meta_site_name<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

fn meta_description<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

fn meta_keywords<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>

fn text<'a>(&self, doc: &'a Document, lang: Language) -> Option<Cow<'a, str>>

fn text_with_cleaner<'a, T: DocumentCleaner>( &self, doc: &'a Document, lang: Language, cleaner: T) -> Option<Cow<'a, str>>

fn article_node<'a>( &self, doc: &'a Document, lang: Language) -> Option<ArticleTextNode<'a>>

fn all_urls<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>

fn article_urls( &self, doc: &Document, base_url: Option<&Url>) -> Vec<ArticleUrl>

fn image_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<Url>

fn is_article(article: &ArticleUrl, base_url: &Url) -> bool

fn is_category(category: &Category, base_url: &Url) -> bool

fn categories(&self, doc: &Document, base_url: &Url) -> Vec<Category>

fn article_content<'a>( &self, doc: &'a Document, base_url: Option<&Url>, lang: Option<Language>) -> ArticleContent<'a>

fn canonical_link(&self, doc: &Document) -> Option<Url>

fn videos<'a>( &self, doc: &'a Document, lang: Option<Language>) -> Vec<VideoNode<'a>>