[][src]Trait extrablatt::extract::Extractor

pub trait Extractor {
    fn title<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
fn authors<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> { ... }
fn publishing_date(
        &self,
        doc: &Document,
        base_url: Option<&Url>
    ) -> Option<ArticleDate> { ... }
fn favicon(&self, doc: &Document, base_url: &Url) -> Option<Url> { ... }
fn base_url(&self, doc: &Document) -> Option<Url> { ... }
fn meta_language(&self, doc: &Document) -> Option<Language> { ... }
fn meta_data<'a>(&self, doc: &'a Document) -> Vec<MetaNode<'a>> { ... }
fn meta_content<'a, 'b>(
        &self,
        doc: &'a Document,
        attr: Attr<&'b str, &'b str>
    ) -> Option<Cow<'a, str>> { ... }
fn meta_thumbnail_url(
        &self,
        doc: &Document,
        base_url: Option<&Url>
    ) -> Option<Url> { ... }
fn meta_img_url(
        &self,
        doc: &Document,
        base_url: Option<&Url>
    ) -> Option<Url> { ... }
fn meta_type<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
fn meta_site_name<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
fn meta_description<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> { ... }
fn meta_keywords<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> { ... }
fn text<'a>(
        &self,
        doc: &'a Document,
        lang: Language
    ) -> Option<Cow<'a, str>> { ... }
fn text_with_cleaner<'a, T: DocumentCleaner>(
        &self,
        doc: &'a Document,
        lang: Language,
        cleaner: T
    ) -> Option<Cow<'a, str>> { ... }
fn article_node<'a>(
        &self,
        doc: &'a Document,
        lang: Language
    ) -> Option<ArticleTextNode<'a>> { ... }
fn all_urls<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> { ... }
fn article_urls(
        &self,
        doc: &Document,
        base_url: Option<&Url>
    ) -> Vec<ArticleUrl> { ... }
fn image_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<Url> { ... }
fn is_article(article: &ArticleUrl, base_url: &Url) -> bool { ... }
fn is_category(category: &Category, base_url: &Url) -> bool { ... }
fn categories(&self, doc: &Document, base_url: &Url) -> Vec<Category> { ... }
fn article_content<'a>(
        &self,
        doc: &'a Document,
        base_url: Option<&Url>,
        lang: Option<Language>
    ) -> ArticleContent<'a> { ... }
fn canonical_link(&self, doc: &Document) -> Option<Url> { ... }
fn videos<'a>(
        &self,
        doc: &'a Document,
        lang: Option<Language>
    ) -> Vec<VideoNode<'a>> { ... } }

Used to retrieve all valuable information from a select::document::Document.

Provided methods

fn title<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

Extract the article title.

Assumptions:

  • og:title usually contains the plain title, but shortened compared to <h1>
  • <title> tag is the most reliable, but often also contains the newspaper name, like: "Some title - The New York Times"
  • <h1>, if properly detected, is the best, since this is also displayed to users

Matching strategy:

  1. <h1> takes precedence over og:title
  2. og:title takes precedence over <title>

fn authors<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>

Extract all the listed authors for the article.

fn publishing_date(
    &self,
    doc: &Document,
    base_url: Option<&Url>
) -> Option<ArticleDate>

When the article was published (and last updated).

fn favicon(&self, doc: &Document, base_url: &Url) -> Option<Url>

Extract the favicon from a website.

fn base_url(&self, doc: &Document) -> Option<Url>

Finds the href in the <base> tag.

fn meta_language(&self, doc: &Document) -> Option<Language>

Extract content language from meta tag.

fn meta_data<'a>(&self, doc: &'a Document) -> Vec<MetaNode<'a>>

Finds all <meta> nodes in the document.

fn meta_content<'a, 'b>(
    &self,
    doc: &'a Document,
    attr: Attr<&'b str, &'b str>
) -> Option<Cow<'a, str>>

Extract the given meta content from the document.

fn meta_thumbnail_url(
    &self,
    doc: &Document,
    base_url: Option<&Url>
) -> Option<Url>

Extract the thumbnail for the article.

fn meta_img_url(&self, doc: &Document, base_url: Option<&Url>) -> Option<Url>

Extract the 'top img' as specified by the website.

fn meta_type<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

Returns the meta type of the article (Open Graph protocol).

fn meta_site_name<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

Returns the site name of the article (Open Graph protocol).

fn meta_description<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>>

If the article has a meta description set in the source, use that.

fn meta_keywords<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>

If the article has meta keywords set in the source, use that.

fn text<'a>(&self, doc: &'a Document, lang: Language) -> Option<Cow<'a, str>>

Get the full text of the article.

fn text_with_cleaner<'a, T: DocumentCleaner>(
    &self,
    doc: &'a Document,
    lang: Language,
    cleaner: T
) -> Option<Cow<'a, str>>

Get the full text of the article with a designated DocumentCleaner.

fn article_node<'a>(
    &self,
    doc: &'a Document,
    lang: Language
) -> Option<ArticleTextNode<'a>>

Detect the select::node::Node that contains the article's text.

If the doc's body contains a node that matches the crate::text::ARTICLE_BODY_ATTR attribute selectors, this node will be selected. Otherwise the article node will be calculated by analysing and scoring the textual content of text nodes.

fn all_urls<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>>

Extract the href attribute for all <a> tags of the document.

fn article_urls(
    &self,
    doc: &Document,
    base_url: Option<&Url>
) -> Vec<ArticleUrl>

Finds all urls from the document that might point to an article.

fn image_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<Url>

Extract all of the images of the document.

fn is_article(article: &ArticleUrl, base_url: &Url) -> bool

First, perform basic format and domain checks, like making sure the format of the url is valid.

We also filter out articles with a subdomain or first degree path on a registered bad keyword.

fn is_category(category: &Category, base_url: &Url) -> bool

fn categories(&self, doc: &Document, base_url: &Url) -> Vec<Category>

Finds all of the top level urls, assuming that these are the category urls.

fn article_content<'a>(
    &self,
    doc: &'a Document,
    base_url: Option<&Url>,
    lang: Option<Language>
) -> ArticleContent<'a>

Gathers all items for an article from the document.

fn canonical_link(&self, doc: &Document) -> Option<Url>

Return the article's canonical URL.

Gets the first available value of:

  1. The rel=canonical tag
  2. The og:url tag

fn videos<'a>(
    &self,
    doc: &'a Document,
    lang: Option<Language>
) -> Vec<VideoNode<'a>>

All video content in the article.

Loading content...

Implementors

impl Extractor for DefaultExtractor[src]

Loading content...