use std::collections::HashMap;

use chrono::{DateTime, FixedOffset};
use ego_tree::NodeRef;
use regex::Regex;
use scraper::{node::Element, Node};
use tendril::StrTendril;
use unicode_segmentation::UnicodeSegmentation;

pub fn parse_document(doc: &str) -> Document {
    let html = scraper::Html::parse_document(doc);
    let root = html.root_element();

    let mut state = ParseState::new();
    state.parse(&root);
    state.flush();

    let mut pipeline = Document {
        text_blocks: state.text_blocks,
        title: state.title,
        time: state.time,
    };

    pipeline.process();

    pipeline
}

/// Mutable state carried through the DOM walk that turns an HTML tree into
/// a flat list of [`TextBlock`]s.
struct ParseState {
    /// Set when an element boundary asks for the buffered text to be emitted
    /// before any further text is appended.
    flush: bool,
    /// Current nesting depth, counting unknown elements and elements whose
    /// action reports `changes_tag_level()`.
    tag_depth: u64,
    /// `tag_depth` recorded when text is first appended to the current
    /// block; `-1` while no depth has been recorded.
    block_tag_depth: i64,
    /// Number of currently open `body` elements; text flushed while this is
    /// zero is never turned into a block.
    body_depth: u64,
    /// Number of currently open `a` elements.
    anchor_depth: u64,
    /// Tag of the element whose `start_element` call completed most
    /// recently (assigned after that element's children were walked).
    last_start_tag: Option<Tag>,
    /// Whether the word scan in `flush` is currently between anchor markers;
    /// kept across flushes.
    in_anchor_text: bool,
    /// Labels contributed by labelling elements (`li`, `h1`-`h3`), copied
    /// onto text blocks when they are emitted.
    label_stack: Vec<Label>,
    /// Text accumulated for the block currently being built.
    text: StrTendril,
    /// Document title captured from text flushed outside `body` after a
    /// `title` start tag.
    title: Option<String>,
    /// Blocks emitted so far, in document order.
    text_blocks: Vec<TextBlock>,
    /// Timestamp parsed from the `datetime` attribute of a `time` element.
    time: Option<DateTime<FixedOffset>>,
}

/// Marker appended to the text buffer when an anchor element opens.
const ANCHOR_TEXT_START: &str = "$\u{e00a}<";
/// Marker appended to the text buffer when the outermost anchor closes.
const ANCHOR_TEXT_END: &str = ">\u{e00a}$";
/// Line width, in bytes, used to simulate line wrapping when a block's
/// wrapped-line statistics are computed.
const MAX_LINE_LENGTH: usize = 80;

impl ParseState {
    /// Creates the state for a fresh document: all depths at zero, no text
    /// buffered, and `block_tag_depth` at its "not recorded" value of `-1`.
    fn new() -> Self {
        Self {
            flush: false,
            tag_depth: 0,
            block_tag_depth: -1,
            body_depth: 0,
            anchor_depth: 0,
            last_start_tag: None,
            in_anchor_text: false,
            label_stack: vec![],
            text: StrTendril::new(),
            title: None,
            text_blocks: vec![],
            time: None,
        }
    }

    /// Walks the children of `root` in document order.
    ///
    /// Text nodes are appended to the buffer; element nodes are bracketed by
    /// `start_element` / `end_element` (the former recurses into the
    /// element's children). Every other node kind is ignored.
    fn parse(&mut self, root: &NodeRef<Node>) {
        for node in root.children() {
            match node.value() {
                scraper::Node::Text(t) => {
                    self.text(&t.text);
                }

                scraper::Node::Element(el) => {
                    let tag = Tag::from_str(el.name());

                    self.start_element(node, el, &tag);
                    self.end_element(&tag);
                }

                _ => {} // comments, doctype, processing instructions, ...
            }
        }
    }

    /// Appends one text node to the current block.
    ///
    /// A pending flush is honoured first so the new text starts a new block.
    /// Whitespace-only nodes are then skipped: they carry no words, and
    /// letting them record `block_tag_depth` would stamp the following block
    /// with the depth of the inter-element whitespace instead of the depth
    /// of its real text.
    fn text(&mut self, t: &StrTendril) {
        if self.flush {
            self.flush();
            self.flush = false;
        }

        let trimmed = t.trim();
        if trimmed.is_empty() {
            return;
        }

        self.text.push_slice(trimmed);
        // Always terminate with a space so adjacent nodes never fuse words.
        self.text.push_char(' ');

        if self.block_tag_depth == -1 {
            self.block_tag_depth = self.tag_depth as i64;
        }
    }

    /// Handles the opening of an element and walks its children.
    ///
    /// `tag` is `None` for elements without special handling; those count as
    /// a tag level and force a flush. Children are walked for every action
    /// except `Ingore` and `IngoreVoid`.
    fn start_element(&mut self, root: NodeRef<Node>, el: &Element, tag: &Option<Tag>) {
        let action = match tag {
            Some(tag) => {
                let action = tag.action();

                if action.changes_tag_level() {
                    self.tag_depth += 1;
                }

                self.flush = action.should_flush() || self.flush;

                action
            }
            None => {
                self.tag_depth += 1;
                self.flush = true;

                Action::Inline
            }
        };

        match &action {
            Action::Body => {
                self.flush();
                self.body_depth += 1;
            }
            Action::Anchor => {
                self.anchor_depth += 1;
                self.push_anchor_marker(ANCHOR_TEXT_START);
            }
            Action::BlockTagLabel(labels) => {
                // Emit text that precedes this element before its labels
                // become active, so they are not attached to that text.
                self.flush();
                self.label_stack.extend_from_slice(labels);
            }
            Action::Time => {
                // Only a successfully parsed timestamp replaces the stored
                // one; a `time` element without a usable `datetime` must not
                // erase a value found earlier.
                if let Some(time) = el
                    .attr("datetime")
                    .and_then(|dt| DateTime::parse_from_rfc3339(dt).ok())
                {
                    self.time = Some(time);
                }
            }
            _ => {}
        }

        match action {
            Action::Ingore | Action::IngoreVoid => {}
            _ => {
                self.parse(&root);
            }
        }

        // Recorded after the children were walked; `flush` consults it to
        // recognise title text.
        self.last_start_tag = *tag;
    }

    /// Handles the closing of an element; mirrors `start_element`.
    ///
    /// Labels pushed by this element are removed only after its text has
    /// been flushed, and only by the element that pushed them, so closing an
    /// inline child does not strip the labels of an enclosing `li`/`h*`.
    fn end_element(&mut self, tag: &Option<Tag>) {
        let action = match tag {
            Some(tag) => {
                let action = tag.action();

                self.flush = action.should_flush() || self.flush;

                if action.changes_tag_level() {
                    self.tag_depth -= 1;
                }

                action
            }
            None => {
                self.tag_depth -= 1;
                self.flush = true;

                Action::Inline
            }
        };

        match &action {
            Action::Body => {
                self.flush();
                self.body_depth -= 1;
            }
            Action::Anchor => {
                self.anchor_depth -= 1;
                if self.anchor_depth == 0 {
                    self.push_anchor_marker(ANCHOR_TEXT_END);
                }
            }
            _ => {}
        }

        if self.flush {
            self.flush();
        }

        if let Action::BlockTagLabel(labels) = &action {
            let remaining = self.label_stack.len().saturating_sub(labels.len());
            self.label_stack.truncate(remaining);
        }
    }

    /// Appends an anchor marker as its own whitespace-delimited token so
    /// that `flush` can recognise it while scanning the buffer.
    fn push_anchor_marker(&mut self, marker: &str) {
        if !self.text.is_empty() && !self.text.ends_with(' ') {
            self.text.push_char(' ');
        }

        self.text.push_slice(marker);
        self.text.push_char(' ');
    }

    /// Turns the buffered text into a [`TextBlock`].
    ///
    /// Outside `body` nothing is emitted: the buffer is discarded, except
    /// that it is stored as the document title when the last start tag was
    /// `title`. Inside `body` the buffer is scanned once to count words,
    /// linked words (words between anchor markers) and simulated wrapped
    /// lines of at most `MAX_LINE_LENGTH` bytes; the markers are then
    /// removed from the emitted text.
    fn flush(&mut self) {
        if self.body_depth == 0 {
            if let Some(Tag::Title) = self.last_start_tag {
                if self.text.len() > 0 {
                    self.title = Some(self.text.trim().to_string());
                }
            }

            self.text.clear();
            return;
        }

        if self.text.len() <= 1 {
            self.text.clear();
            return;
        }

        let mut num_words = 0;
        let mut num_linked_words = 0;
        let mut num_wrapped_lines = 0;
        let mut num_tokens = 0;
        let mut num_words_current_line = 0;
        let mut current_line_length = 0;

        // The markers contain no alphanumeric characters, so `unicode_words`
        // alone would silently drop them; split on whitespace first and look
        // for them as whole tokens.
        for token in self.text.split_whitespace() {
            match token {
                ANCHOR_TEXT_START => {
                    self.in_anchor_text = true;
                }
                ANCHOR_TEXT_END => {
                    self.in_anchor_text = false;
                }
                _ => {
                    for word in token.unicode_words() {
                        num_tokens += 1;

                        // Only purely alphanumeric tokens count as words.
                        if !word.chars().all(|c| c.is_alphanumeric()) {
                            continue;
                        }

                        num_words += 1;
                        num_words_current_line += 1;

                        if self.in_anchor_text {
                            num_linked_words += 1;
                        }

                        current_line_length += word.len() + 1;

                        if current_line_length > MAX_LINE_LENGTH {
                            num_wrapped_lines += 1;
                            current_line_length = word.len();
                            num_words_current_line = 1;
                        }
                    }
                }
            }
        }

        if num_tokens == 0 {
            // Nothing countable yet; the buffer is kept so the fragment is
            // carried into the next block.
            return;
        }

        let num_words_in_wrapped_lines = if num_wrapped_lines == 0 {
            num_wrapped_lines = 1;

            num_words
        } else {
            // Words on the last, unfinished line do not count.
            num_words - num_words_current_line
        };

        // Markers are always written followed by a single space; remove both
        // so the emitted text keeps its original spacing.
        let start_marker = format!("{} ", ANCHOR_TEXT_START);
        let end_marker = format!("{} ", ANCHOR_TEXT_END);
        let text = {
            let raw: &str = &self.text;
            raw.replace(start_marker.as_str(), "")
                .replace(end_marker.as_str(), "")
                .trim()
                .to_string()
        };

        if !text.is_empty() {
            let mut text_block = TextBlock {
                text: text.into(),
                num_words,
                num_linked_words,
                num_words_in_wrapped_lines,
                num_wrapped_lines,
                offset_block_start: self.text_blocks.len(),
                offset_block_end: self.text_blocks.len(),
                tag_level: self.block_tag_depth as usize,
                is_content: true,
                label_map: HashMap::new(),
            };

            // Labels stay on the stack until their element closes, so every
            // block emitted inside that element receives them.
            text_block.add_labels(&self.label_stack);

            self.text_blocks.push(text_block);
        }

        self.text.clear();
        self.block_tag_depth = -1;
    }
}

/// HTML elements that receive special handling while parsing.
///
/// `Tag::from_str` maps a lowercase element name to a variant and
/// `Tag::action` maps the variant to the [`Action`] applied at the element's
/// start and end. Element names not listed here are represented as `None`
/// by `Tag::from_str`.
#[derive(Debug, Copy, Clone)]
enum Tag {
    Applet,
    Figcaption,
    Figure,
    Noscript,
    Object,
    Option,
    Script,
    Style,
    A,
    Body,
    Abbr,
    B,
    Code,
    Em,
    Font,
    I,
    Span,
    Strike,
    Strong,
    Sub,
    Sup,
    Tt,
    U,
    Var,
    Li,
    H1,
    H2,
    H3,
    Area,
    Base,
    Br,
    Col,
    Embed,
    Hr,
    Img,
    Input,
    Link,
    Menuitem,
    Meta,
    Param,
    Source,
    Track,
    Wbr,
    Time,
    Title,
}

impl Tag {
    fn from_str(s: &str) -> Option<Self> {
        match s {
            "applet" => Some(Self::Applet),
            "figcaption" => Some(Self::Figcaption),
            "figure" => Some(Self::Figure),
            "noscript" => Some(Self::Noscript),
            "object" => Some(Self::Object),
            "option" => Some(Self::Option),
            "script" => Some(Self::Script),
            "style" => Some(Self::Style),
            "a" => Some(Self::A),
            "body" => Some(Self::Body),
            "abbr" => Some(Self::Abbr),
            "b" => Some(Self::B),
            "code" => Some(Self::Code),
            "em" => Some(Self::Em),
            "font" => Some(Self::Font),
            "i" => Some(Self::I),
            "span" => Some(Self::Span),
            "strike" => Some(Self::Strike),
            "strong" => Some(Self::Strong),
            "sub" => Some(Self::Sub),
            "sup" => Some(Self::Sup),
            "tt" => Some(Self::Tt),
            "u" => Some(Self::U),
            "var" => Some(Self::Var),
            "li" => Some(Self::Li),
            "h1" => Some(Self::H1),
            "h2" => Some(Self::H2),
            "h3" => Some(Self::H3),
            "area" => Some(Self::Area),
            "base" => Some(Self::Base),
            "br" => Some(Self::Br),
            "col" => Some(Self::Col),
            "embed" => Some(Self::Embed),
            "hr" => Some(Self::Hr),
            "img" => Some(Self::Img),
            "input" => Some(Self::Input),
            "link" => Some(Self::Link),
            "menuitem" => Some(Self::Menuitem),
            "meta" => Some(Self::Meta),
            "param" => Some(Self::Param),
            "source" => Some(Self::Source),
            "track" => Some(Self::Track),
            "wbr" => Some(Self::Wbr),
            "time" => Some(Self::Time),
            "title" => Some(Self::Title),

            _ => None,
        }
    }

    fn action(&self) -> Action {
        match self {
            Tag::Applet
            | Tag::Figcaption
            | Tag::Figure
            | Tag::Noscript
            | Tag::Object
            | Tag::Option
            | Tag::Script
            | Tag::Style => Action::Ingore,

            Tag::A => Action::Anchor,

            Tag::Body => Action::Body,

            Tag::Abbr
            | Tag::B
            | Tag::Code
            | Tag::Em
            | Tag::Font
            | Tag::I
            | Tag::Span
            | Tag::Strike
            | Tag::Strong
            | Tag::Sub
            | Tag::Sup
            | Tag::Tt
            | Tag::U
            | Tag::Var => Action::Inline,

            Tag::Li => Action::BlockTagLabel(vec![Label::List]),
            Tag::H1 => Action::BlockTagLabel(vec![Label::Heading, Label::Heading1]),
            Tag::H2 => Action::BlockTagLabel(vec![Label::Heading, Label::Heading2]),
            Tag::H3 => Action::BlockTagLabel(vec![Label::Heading, Label::Heading3]),

            Tag::Area
            | Tag::Base
            | Tag::Br
            | Tag::Col
            | Tag::Embed
            | Tag::Hr
            | Tag::Img
            | Tag::Input
            | Tag::Link
            | Tag::Menuitem
            | Tag::Meta
            | Tag::Param
            | Tag::Source
            | Tag::Track
            | Tag::Wbr => Action::IngoreVoid,

            Tag::Time => Action::Time,
            Tag::Title => Action::Title,
        }
    }
}

/// Markers attached to a [`TextBlock`] (counted in its `label_map`) by the
/// parser and by the processing steps of [`Document`].
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
enum Label {
    /// Added by `terminating_blocks` to blocks that look like the start of a
    /// comment section or a similar trailer.
    EndOfText,
    /// Added by `keep_largest_blocks` to every block other than the largest.
    MightBeContent,
    /// Added by `keep_largest_blocks` to the largest content block.
    VeryLikelyContent,
    /// Added by `document_title_match` to the block matching the title.
    Title,
    /// Attached to text inside `li`.
    List,
    /// Attached to text inside `h1`, `h2` or `h3`.
    Heading,
    /// Attached, together with `Heading`, to text inside `h1`.
    Heading1,
    /// Attached, together with `Heading`, to text inside `h2`.
    Heading2,
    /// Attached, together with `Heading`, to text inside `h3`.
    Heading3,
}

/// What the parser does when it meets the start and end of an element.
///
/// NOTE(review): `Ingore` / `IngoreVoid` look like misspellings of
/// `Ignore` / `IgnoreVoid`; renaming them means touching every use site.
enum Action {
    /// Children are not walked; forces a flush and counts as a tag level.
    Ingore,
    /// Tracks anchor nesting and writes the anchor markers into the text.
    Anchor,
    /// Flushes and tracks `body` nesting.
    Body,
    /// No flush and no tag-level change; children are walked.
    Inline,
    /// Pushes the given labels onto the label stack; forces a flush.
    BlockTagLabel(Vec<Label>),
    /// Children are not walked; no flush and no tag-level change.
    IngoreVoid,
    /// Reads the element's `datetime` attribute; forces a flush.
    Time,
    /// Forces a flush without changing the tag level.
    Title,
}

impl Action {
    /// Whether meeting an element with this action ends the current block.
    fn should_flush(&self) -> bool {
        match self {
            Action::Ingore | Action::BlockTagLabel(_) | Action::Time | Action::Title => true,
            Action::Anchor | Action::Body | Action::Inline | Action::IngoreVoid => false,
        }
    }

    /// Whether an element with this action counts towards the tag depth.
    fn changes_tag_level(&self) -> bool {
        match self {
            Action::Inline | Action::IngoreVoid | Action::Title => false,
            Action::Ingore
            | Action::Anchor
            | Action::Body
            | Action::BlockTagLabel(_)
            | Action::Time => true,
        }
    }
}

/// A run of text extracted from the document together with the statistics
/// the classification steps work on.
#[derive(Debug)]
struct TextBlock {
    /// The block's text; merged blocks are joined with `'\n'`.
    text: StrTendril,
    /// Number of words counted in the block.
    num_words: usize,
    /// Number of those words that were counted as anchor text.
    num_linked_words: usize,
    /// Words on completed simulated lines (all words when the text fits on
    /// a single line).
    num_words_in_wrapped_lines: usize,
    /// Number of simulated wrapped lines.
    num_wrapped_lines: usize,
    /// Index of the first original block covered by this block.
    offset_block_start: usize,
    /// Index of the last original block covered by this block.
    offset_block_end: usize,
    /// Tag depth recorded for the block's text.
    tag_level: usize,
    /// Current classification: content (`true`) or boilerplate (`false`).
    is_content: bool,
    /// Labels attached to the block, with the number of times each was added.
    label_map: HashMap<Label, usize>,
}

impl TextBlock {
    /// Records one more occurrence of each label in `labels`.
    fn add_labels(&mut self, labels: &[Label]) {
        for label in labels {
            *self.label_map.entry(*label).or_insert(0) += 1;
        }
    }

    /// Fraction of the block's words that are anchor text; `0.0` for a block
    /// without words.
    fn link_density(&self) -> f64 {
        if self.num_words == 0 {
            0.0
        } else {
            self.num_linked_words as f64 / self.num_words as f64
        }
    }

    /// Average number of words per wrapped line; `0.0` for a block without
    /// wrapped lines (such as the sentinels) instead of the NaN a plain
    /// division would produce.
    fn text_density(&self) -> f64 {
        if self.num_wrapped_lines == 0 {
            0.0
        } else {
            self.num_words_in_wrapped_lines as f64 / self.num_wrapped_lines as f64
        }
    }

    /// Builds an empty, non-content block whose start and end offsets are
    /// both `offset`.
    fn sentinel(offset: usize) -> Self {
        Self {
            text: StrTendril::new(),
            num_words: 0,
            num_linked_words: 0,
            num_words_in_wrapped_lines: 0,
            num_wrapped_lines: 0,
            offset_block_start: offset,
            offset_block_end: offset,
            tag_level: 0,
            is_content: false,
            label_map: HashMap::new(),
        }
    }

    /// Empty block standing in for "before the first block".
    fn empty_start() -> Self {
        Self::sentinel(std::usize::MIN)
    }

    /// Empty block standing in for "after the last block".
    fn empty_end() -> Self {
        Self::sentinel(std::usize::MAX)
    }

    /// Absorbs `other` into `self`.
    ///
    /// Texts are joined with a newline, counters and label counts are
    /// summed, the offset range is widened to cover both blocks, the result
    /// is content if either block was, and the shallower tag level wins.
    fn merge(&mut self, other: &Self) {
        self.text.push_char('\n');
        self.text.push_tendril(&other.text);

        self.offset_block_start = std::cmp::min(self.offset_block_start, other.offset_block_start);
        self.offset_block_end = std::cmp::max(self.offset_block_end, other.offset_block_end);

        self.num_words += other.num_words;
        self.num_linked_words += other.num_linked_words;
        self.num_words_in_wrapped_lines += other.num_words_in_wrapped_lines;
        self.num_wrapped_lines += other.num_wrapped_lines;

        self.is_content |= other.is_content;
        self.tag_level = std::cmp::min(self.tag_level, other.tag_level);

        for (label, count) in other.label_map.iter() {
            *self.label_map.entry(*label).or_insert(0) += count;
        }
    }
}

/// The result of parsing an HTML page: its text blocks plus the metadata
/// picked up along the way.
pub struct Document {
    /// Text blocks in document order; reclassified, merged and pruned by the
    /// processing steps.
    text_blocks: Vec<TextBlock>,
    /// Text captured from the page's `title` element, if any.
    pub title: Option<String>,
    /// Timestamp parsed from a `time` element's `datetime` attribute, if any.
    pub time: Option<DateTime<FixedOffset>>,
}

impl Document {
    fn process(&mut self) -> bool {
        let mut has_changed = self.terminating_blocks();
        has_changed |= self.document_title_match();
        has_changed |= self.num_words_rules_classifier();
        has_changed |= self.ignore_block_after_content();
        has_changed |= self.trailing_headline_to_boilerplate();
        has_changed |= self.block_proximity_fusion(1, false, false);
        has_changed |= self.boilerplate_block();
        has_changed |= self.block_proximity_fusion(1, true, true);
        has_changed |= self.keep_largest_blocks();
        has_changed |= self.expand_title_to_content();
        has_changed |= self.large_block_same_tag_level_to_content();
        has_changed |= self.list_at_end();

        has_changed
    }

    /// Labels short blocks that typically mark the end of the article
    /// (comment-section headers and similar) with `Label::EndOfText`.
    ///
    /// Only blocks with fewer than 15 words are considered. A block is
    /// labelled when its lowercased text is at least 8 bytes long and matches
    /// one of the known phrases, or when it consists entirely of link text
    /// and reads exactly "Comment" or "Комментарии".
    ///
    /// Returns `true` if any block was labelled.
    fn terminating_blocks(&mut self) -> bool {
        const STARTS_WITH: [&str; 4] = [
            "comments",
            "© reuters",
            "please rate this",
            "post a comment",
        ];
        const NUMBER_THEN: [&str; 4] = [
            " comments",
            " users responded in",
            " комментария",
            " комментариев",
        ];
        const CONTAINS: [&str; 8] = [
            "what you think...",
            "add your comment",
            "add comment",
            "reader views",
            "have your say",
            "rätta artikeln",
            "оставьте комментарий",
            "расскажите нам, что вы думаете",
        ];

        let mut has_changed = false;

        for tb in self.text_blocks.iter_mut() {
            if tb.num_words >= 15 {
                continue;
            }

            let matches_phrase = tb.text.len() >= 8 && {
                let lowered = tb.text.to_lowercase();

                STARTS_WITH.iter().any(|p| lowered.starts_with(*p))
                    // The suffixes are lowercase, so they must be compared
                    // against the lowercased text as well.
                    || Self::starts_with_number(&lowered, &NUMBER_THEN)
                    || CONTAINS.iter().any(|p| lowered.contains(*p))
            };

            // Checked independently of the byte length: the Cyrillic word is
            // longer than 8 bytes and would otherwise never be compared.
            let is_comment_link = tb.link_density() == 1.0
                && (&*tb.text == "Comment" || &*tb.text == "Комментарии");

            if matches_phrase || is_comment_link {
                tb.add_labels(&[Label::EndOfText]);
                has_changed = true;
            }
        }

        has_changed
    }

    /// Returns `true` when `text` begins with one or more ASCII digits that
    /// are immediately followed by one of `prefixes`
    /// (e.g. `"12 comments"` with the prefix `" comments"`).
    ///
    /// Text without a leading digit never matches.
    fn starts_with_number(text: &str, prefixes: &[&str]) -> bool {
        let rest = text.trim_start_matches(|c: char| c.is_ascii_digit());

        // Nothing was stripped, so the text does not start with a number.
        if rest.len() == text.len() {
            return false;
        }

        prefixes.iter().any(|p| rest.starts_with(*p))
    }

    /// Finds the text block that repeats the document title and labels it
    /// with `Label::Title`.
    ///
    /// The title is normalised (non-breaking spaces to spaces, apostrophes
    /// removed, trimmed, lowercased) and a set of candidate titles is derived
    /// from it by splitting on common separators. Blocks are normalised the
    /// same way and compared against the candidates, first as they are and
    /// then with `?`, `!`, `.`, `-` and `:` removed. Only the first matching
    /// block is labelled.
    ///
    /// Returns `true` if a block was labelled.
    fn document_title_match(&mut self) -> bool {
        let title = match &self.title {
            Some(t) if !t.is_empty() => t,
            _ => {
                return false;
            }
        };

        let title = title.replace("\u{00a0}", " ");
        let title = title.replace("'", "");
        let title = title.trim().to_lowercase();

        if title.is_empty() {
            return false;
        }

        use std::borrow::Cow;

        let mut potential_titles: HashMap<Cow<str>, bool> = HashMap::new();
        potential_titles.insert(title.as_str().into(), true);

        // Progressively wider separator sets; for each, the longest part of
        // the title is a candidate.
        lazy_static::lazy_static! {
            static ref REGEXES_1: [Regex; 6] = [
                Regex::new("[ ]*[\\|»|-][ ]*").unwrap(),
                Regex::new("[ ]*[\\|»|:][ ]*").unwrap(),
                Regex::new("[ ]*[\\|»|:\\(\\)][ ]*").unwrap(),
                Regex::new("[ ]*[\\|»|:\\(\\)\\-][ ]*").unwrap(),
                Regex::new("[ ]*[\\|»|,|:\\(\\)\\-][ ]*").unwrap(),
                Regex::new("[ ]*[\\|»|,|:\\(\\)\\-\u{00a0}][ ]*").unwrap(),
            ];
        }

        for r in REGEXES_1.iter() {
            let potential_title = Self::get_longest_part(&title, r);
            if !potential_title.is_empty() {
                potential_titles.insert(potential_title.into(), true);
            }
        }

        // Every part of at least four words around " | " or " - ".
        lazy_static::lazy_static! {
            static ref REGEXES_2: [Regex; 2] = [
                Regex::new("[ ]+[\\|][ ]+").unwrap(),
                Regex::new("[ ]+[\\-][ ]+").unwrap(),
            ];
        }

        for r in REGEXES_2.iter() {
            if r.split(&title).count() == 1 {
                // The separator does not occur in the title.
                continue;
            }

            for part in r.split(&title) {
                if part.contains(".com") {
                    continue;
                }

                if part.unicode_words().count() >= 4 {
                    potential_titles.insert(part.into(), true);
                }
            }
        }

        // The title without its last, respectively first, " - " segment.
        lazy_static::lazy_static! {
            static ref REGEXES_3: [Regex; 2] = [
                Regex::new(" - [^\\-]+$").unwrap(),
                Regex::new("^[^\\-]+ - ").unwrap(),
            ];
        };

        for r in REGEXES_3.iter() {
            let potential_title = r.replacen(&title, 1, "");
            potential_titles.insert(potential_title, true);
        }

        // The hyphen is escaped: unescaped, `.-:` is a character range that
        // also covers `/` and the digits.
        lazy_static::lazy_static! {
            static ref REMOVE_RE: Regex = Regex::new(r"[?!.\-:]+").unwrap();
        }

        let mut has_changed = false;

        for tb in self.text_blocks.iter_mut() {
            let text = tb.text.replace("\u{00a0}", " ");
            let text = text.replace("'", "");
            let text = text.trim().to_lowercase();

            let is_title = potential_titles.contains_key(text.as_str()) || {
                // Retry with every run of the punctuation removed.
                let stripped = REMOVE_RE.replace_all(&text, "");
                potential_titles.contains_key(stripped.trim())
            };

            if is_title {
                tb.add_labels(&[Label::Title]);
                has_changed = true;
                break;
            }
        }

        has_changed
    }

    /// Splits `title` with `r` and returns the trimmed "longest" part.
    ///
    /// A part replaces the current best when it has more words or more
    /// bytes. Parts containing `.com` are never selected. Returns `""` when
    /// the separator does not occur in the title or when no part qualifies.
    fn get_longest_part<'s>(title: &'s str, r: &regex::Regex) -> &'s str {
        let mut longest_num_words = 0;
        let mut longest_part = "";
        let mut parts_count = 0;

        for part in r.split(title) {
            // Count every part, including skipped site names; otherwise a
            // title like "headline | site.com" would look unsplit.
            parts_count += 1;

            if part.contains(".com") {
                continue;
            }

            let num_words = part.unicode_words().count();
            if num_words > longest_num_words || part.len() > longest_part.len() {
                longest_num_words = num_words;
                longest_part = part;
            }
        }

        if longest_part.is_empty() || parts_count == 1 {
            return "";
        }

        longest_part.trim()
    }

    fn num_words_rules_classifier(&mut self) -> bool {
        if self.text_blocks.is_empty() {
            return false;
        }

        let mut has_changed = false;

        let empty_start = TextBlock::empty_start();
        let empty_end = TextBlock::empty_end();

        for i in 0..self.text_blocks.len() {
            let window = i
                .checked_sub(1)
                .and_then(|i_1| self.text_blocks.get_mut(i_1..=i + 1));

            let (prev, cur, next): (&TextBlock, &mut TextBlock, &TextBlock) = match window {
                Some([prev, cur, next]) => (&*prev, cur, &*next),
                Some(w) => {
                    debug_assert_eq!(w.len(), 3);
                    continue;
                }
                None => match self.text_blocks.get_mut(i..=i + 1) {
                    Some([cur, next]) => (&empty_start, cur, &*next),
                    Some(w) => {
                        debug_assert_eq!(w.len(), 2);
                        continue;
                    }
                    None => match self.text_blocks.get_mut(i - 1..=i) {
                        Some([prev, cur]) => (&*prev, cur, &empty_end),
                        Some(w) => {
                            debug_assert_eq!(w.len(), 2);
                            continue;
                        }
                        None => match self.text_blocks.get_mut(i) {
                            Some(cur) => (&empty_start, cur, &empty_start),
                            None => {
                                continue;
                            }
                        },
                    },
                },
            };

            let is_content = Self::classify_is_content(prev, cur, next);
            cur.is_content = is_content;
            has_changed |= is_content;
        }

        has_changed
    }

    /// Decision tree over the word counts and link densities of a block and
    /// its neighbours; returns `true` when `cur` is classified as content.
    ///
    /// * A block whose link density exceeds 1/3 is boilerplate.
    /// * After a block with link density of at most 5/9, `cur` is content
    ///   unless it is short (at most 16 words) and both neighbours are short
    ///   too (`next` at most 15 words, `prev` at most 4 words).
    /// * After a link-heavy block, `cur` is content when it has more than 40
    ///   words or is followed by a block of more than 17 words.
    fn classify_is_content(prev: &TextBlock, cur: &TextBlock, next: &TextBlock) -> bool {
        if cur.link_density() > 0.333333 {
            return false;
        }

        if prev.link_density() <= 0.555556 {
            cur.num_words > 16 || next.num_words > 15 || prev.num_words > 4
        } else {
            // A block of more than 40 words is content here too.
            cur.num_words > 40 || next.num_words > 17
        }
    }

    /// Marks everything from the end-of-text marker onwards as boilerplate.
    ///
    /// Full-text words of content blocks are accumulated while walking the
    /// blocks; the first `Label::EndOfText` block reached once at least 60
    /// such words were seen, and every block after it, is set to
    /// non-content.
    ///
    /// Returns `true` if such a marker was found.
    fn ignore_block_after_content(&mut self) -> bool {
        let mut content_words = 0;
        let mut past_end_of_text = false;

        for tb in self.text_blocks.iter_mut() {
            // The block's own words are counted before the threshold check.
            if tb.is_content {
                content_words += Self::get_num_full_text_words(tb);
            }

            if !past_end_of_text {
                past_end_of_text =
                    tb.label_map.contains_key(&Label::EndOfText) && content_words >= 60;
            }

            if past_end_of_text {
                tb.is_content = false;
            }
        }

        past_end_of_text
    }

    /// Returns the block's word count when its text density is at least 9
    /// words per wrapped line, and `0` otherwise.
    fn get_num_full_text_words(tb: &TextBlock) -> usize {
        const MIN_TEXT_DENSITY: f64 = 9.0;

        if tb.text_density() < MIN_TEXT_DENSITY {
            return 0;
        }

        tb.num_words
    }

    /// Demotes headings that trail the content.
    ///
    /// Walking the content blocks from the end of the document, every block
    /// labelled `Label::Heading` is set to non-content until the first
    /// content block without that label is reached.
    ///
    /// Returns `true` if any block was demoted.
    fn trailing_headline_to_boilerplate(&mut self) -> bool {
        let mut has_changed = false;

        let content_from_end = self
            .text_blocks
            .iter_mut()
            .rev()
            .filter(|tb| tb.is_content);

        for tb in content_from_end {
            if !tb.label_map.contains_key(&Label::Heading) {
                break;
            }

            tb.is_content = false;
            has_changed = true;
        }

        has_changed
    }

    /// Merges content blocks into the block that precedes them when the two
    /// are close enough.
    ///
    /// * `max_block_distance` - largest allowed number of original blocks
    ///   lying between the end of the previous block and the start of the
    ///   current one.
    /// * `content_only` - start at the first content block and merge only
    ///   into blocks that are content themselves.
    /// * `same_tag_level_only` - merge only blocks with equal `tag_level`.
    ///
    /// A merged block is removed from the list. A non-content block is never
    /// merged; it becomes the new "previous" block.
    ///
    /// Returns `true` if at least one merge happened.
    fn block_proximity_fusion(
        &mut self,
        max_block_distance: usize,
        content_only: bool,
        same_tag_level_only: bool,
    ) -> bool {
        if self.text_blocks.len() < 2 {
            return false;
        }

        let mut prev_block = if content_only {
            // Index 0 is a perfectly valid first content block, so absence
            // is expressed with `Option` rather than a sentinel index.
            match self.text_blocks.iter().position(|tb| tb.is_content) {
                Some(first_content) => first_content,
                None => return false,
            }
        } else {
            0
        };

        let mut has_changed = false;
        let mut i = prev_block + 1;

        while i < self.text_blocks.len() {
            let merged = {
                let (head, tail) = self.text_blocks.split_at_mut(i);
                let prev = &mut head[prev_block];
                let cur = &tail[0];

                // Number of original blocks between the two candidates.
                let distance = cur
                    .offset_block_start
                    .saturating_sub(prev.offset_block_end)
                    .saturating_sub(1);

                let merge = cur.is_content
                    && distance <= max_block_distance
                    && (!content_only || prev.is_content)
                    && (!same_tag_level_only || prev.tag_level == cur.tag_level);

                if merge {
                    prev.merge(cur);
                }

                merge
            };

            if merged {
                // The next candidate slides into position `i`.
                self.text_blocks.remove(i);
                has_changed = true;
            } else {
                prev_block = i;
                i += 1;
            }
        }

        has_changed
    }

    /// Removes every block that is neither content nor labelled
    /// `Label::Title`.
    ///
    /// Returns `true` if at least one block was removed.
    fn boilerplate_block(&mut self) -> bool {
        let before = self.text_blocks.len();

        self.text_blocks
            .retain(|tb| tb.is_content || tb.label_map.contains_key(&Label::Title));

        self.text_blocks.len() != before
    }

    /// Keeps the largest content block (by word count) and demotes most of
    /// the others.
    ///
    /// The largest content block is marked content and labelled
    /// `Label::VeryLikelyContent`; every other block is labelled
    /// `Label::MightBeContent` and stays content only if
    /// `is_largest_block` accepts it. Afterwards, walking outwards from the
    /// largest block in both directions, blocks on the same tag level with at
    /// least 150 words are promoted back to content; each walk stops at the
    /// first block on a shallower level. If there is no content block at
    /// all, no block is promoted.
    ///
    /// Returns `false` when there are fewer than two blocks and `true`
    /// otherwise.
    fn keep_largest_blocks(&mut self) -> bool {
        let expand_to_same_level_text = true;
        let min_words = 150;

        if self.text_blocks.len() < 2 {
            return false;
        }

        let mut max_num_words = 0;
        let mut largest_block: Option<usize> = None;
        let mut level = 0;

        for (i, tb) in self.text_blocks.iter().enumerate() {
            if tb.is_content && (largest_block.is_none() || tb.num_words > max_num_words) {
                largest_block = Some(i);
                max_num_words = tb.num_words;

                if expand_to_same_level_text {
                    level = tb.tag_level;
                }
            }
        }

        for (i, tb) in self.text_blocks.iter_mut().enumerate() {
            if Some(i) == largest_block {
                tb.is_content = true;
                tb.add_labels(&[Label::VeryLikelyContent]);
            } else {
                tb.is_content = Self::is_largest_block(max_num_words, tb);
                tb.add_labels(&[Label::MightBeContent]);
            }
        }

        if expand_to_same_level_text {
            if let Some(n) = largest_block {
                let (before, from_largest) = self.text_blocks.split_at_mut(n);

                // Backwards from the block just before the largest one.
                for tb in before.iter_mut().rev() {
                    if tb.tag_level < level {
                        break;
                    } else if tb.tag_level == level && tb.num_words >= min_words {
                        tb.is_content = true;
                    }
                }

                // Forwards, starting at the largest block itself.
                for tb in from_largest.iter_mut() {
                    if tb.tag_level < level {
                        break;
                    } else if tb.tag_level == level && tb.num_words >= min_words {
                        tb.is_content = true;
                    }
                }
            }
        }

        true
    }

    /// Decides whether `tb` is large enough, relative to the largest block's
    /// word count `max_num_words`, to stay classified as content.
    ///
    /// Non-content blocks are never accepted. For content blocks the required
    /// size depends on `max_num_words`:
    ///
    /// * `>= 1000` words: at least 25% of `max_num_words` (truncated),
    /// * `>= 500` words: at least 60% of `max_num_words` (truncated),
    /// * otherwise: exactly `max_num_words` words.
    fn is_largest_block(max_num_words: usize, tb: &TextBlock) -> bool {
        if !tb.is_content {
            return false;
        }

        let required_share = if max_num_words >= 1000 {
            0.25
        } else if max_num_words >= 500 {
            0.6
        } else {
            // Small documents: only blocks as large as the largest one pass.
            return tb.num_words == max_num_words;
        };

        let threshold = (required_share * max_num_words as f64).trunc() as usize;
        tb.num_words >= threshold
    }

    fn expand_title_to_content(&mut self) -> bool {
        let mut j = 0;
        let mut title = -1;
        let mut content_start = -1;

        for tb in self.text_blocks.iter() {
            if content_start == -1 && tb.label_map.contains_key(&Label::Title) {
                title = j;
                content_start = -1;
            }

            if content_start == -1 && tb.is_content {
                content_start = j;
            }

            j += 1;
        }

        if content_start <= title || title == -1 {
            return false;
        }

        let mut has_changed = false;

        match self
            .text_blocks
            .get_mut(title as usize..content_start as usize)
        {
            Some(tbs) => {
                for tb in tbs {
                    if tb.label_map.contains_key(&Label::MightBeContent) {
                        has_changed |= !tb.is_content;
                        tb.is_content = true;
                    }
                }
            }
            None => {}
        }

        has_changed
    }

    /// Promotes large non-content blocks that sit on the same tag level as
    /// the main content.
    ///
    /// The reference tag level is taken from the first block that is content
    /// and labelled `Label::VeryLikelyContent`. Every non-content block on
    /// that level with at least 100 words is marked as content.
    ///
    /// Returns `true` if any block was promoted, `false` otherwise (including
    /// when no reference block exists).
    fn large_block_same_tag_level_to_content(&mut self) -> bool {
        let reference_level = self
            .text_blocks
            .iter()
            .find(|tb| tb.is_content && tb.label_map.contains_key(&Label::VeryLikelyContent))
            .map(|tb| tb.tag_level);

        let reference_level = match reference_level {
            Some(level) => level,
            None => return false,
        };

        let mut promoted = false;

        let candidates = self.text_blocks.iter_mut().filter(|tb| {
            !tb.is_content && tb.num_words >= 100 && tb.tag_level == reference_level
        });

        for tb in candidates {
            tb.is_content = true;
            promoted = true;
        }

        promoted
    }

    /// Marks list blocks that directly follow the main content as content.
    ///
    /// A content block labelled `Label::VeryLikelyContent` sets the anchor
    /// tag level. Each following block is promoted to content while it has a
    /// greater tag level than the anchor, carries both
    /// `Label::MightBeContent` and `Label::List`, and has a link density of
    /// exactly `0.0`. The first block failing any of these checks clears the
    /// anchor until the next very-likely-content block is seen.
    ///
    /// Returns `true` if any block passed the checks and was marked.
    fn list_at_end(&mut self) -> bool {
        let mut changed = false;
        // Tag level of the most recent very-likely-content block, if the
        // run of blocks following it has not been interrupted yet.
        let mut anchor_level = None;

        for tb in self.text_blocks.iter_mut() {
            if tb.is_content && tb.label_map.contains_key(&Label::VeryLikelyContent) {
                anchor_level = Some(tb.tag_level);
                continue;
            }

            let below_anchor = match anchor_level {
                Some(level) => tb.tag_level > level,
                None => false,
            };

            if below_anchor
                && tb.label_map.contains_key(&Label::MightBeContent)
                && tb.label_map.contains_key(&Label::List)
                && tb.link_density() == 0.0
            {
                tb.is_content = true;
                changed = true;
            } else {
                anchor_level = None;
            }
        }

        changed
    }

    /// Concatenates the text of the selected blocks in document order.
    ///
    /// Content blocks are included when `include_content` is set,
    /// non-content blocks when `include_non_content` is set.
    fn text(&self, include_content: bool, include_non_content: bool) -> StrTendril {
        let mut joined = StrTendril::new();

        let selected = self.text_blocks.iter().filter(|tb| {
            if tb.is_content {
                include_content
            } else {
                include_non_content
            }
        });

        for tb in selected {
            joined.push_tendril(&tb.text);
        }

        joined
    }

    /// Returns the concatenated text of all blocks classified as content.
    pub fn content(&self) -> StrTendril {
        let include_content = true;
        let include_non_content = false;

        self.text(include_content, include_non_content)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Number of `test-data/<i>.html` / `test-data/<i>.base64` fixture pairs.
    const FIXTURE_COUNT: usize = 9;

    /// Extracts the content of every HTML fixture and compares its base64
    /// encoding with the recorded expectation.
    #[test]
    fn test() {
        for i in 0..FIXTURE_COUNT {
            let html_file = format!("test-data/{}.html", i);
            let b64_file = format!("test-data/{}.base64", i);

            // Name the offending file on failure instead of a bare unwrap.
            let html = std::fs::read_to_string(&html_file)
                .unwrap_or_else(|err| panic!("cannot read {}: {}", html_file, err));
            let expected_base64 = std::fs::read_to_string(&b64_file)
                .unwrap_or_else(|err| panic!("cannot read {}: {}", b64_file, err));

            let doc = parse_document(&html);
            let content = doc.content();
            let b64 = base64::encode(content.as_bytes());

            println!("{:?}", doc.title);
            println!("{:?}", doc.time);

            assert_eq!(
                b64, expected_base64,
                "extracted content mismatch for {}",
                html_file
            );
        }
    }
}