use std::collections::HashSet;
use crate::paragraph::{ClassType, Paragraph};
use crate::Config;
/// Assigns every paragraph its `heading` flag and its `initial_class`.
///
/// For each paragraph the rules are evaluated in order and the first match wins:
///
/// 1. link density above `config.max_link_density`, a copyright marker in the text, or
///    `"select"` appearing in the DOM path => [`ClassType::Bad`];
/// 2. fewer than `config.length_low` characters => [`ClassType::Bad`] when any characters
///    are inside links, otherwise [`ClassType::Short`];
/// 3. stop-word density of at least `config.stopwords_high` => [`ClassType::Good`] when the
///    paragraph is longer than `config.length_high` characters, otherwise
///    [`ClassType::NearGood`];
/// 4. stop-word density of at least `config.stopwords_low` => [`ClassType::NearGood`];
/// 5. anything else => [`ClassType::Bad`].
///
/// Lengths are measured in `char`s (Unicode scalar values), not bytes.
///
/// * `paragraphs` - paragraphs to classify; updated in place.
/// * `stoplist` - stop words handed to `Paragraph::stopwords_density`.
/// * `config` - thresholds, plus `no_headings`, which forces `heading` to `false`.
pub fn classify_paragraphs(
    paragraphs: &mut [Paragraph],
    stoplist: &HashSet<String>,
    config: &Config,
) {
    /// The copyright sign as it appears in decoded text.
    const COPYRIGHT_SIGN: char = '\u{00A9}';
    /// The copyright entity when it reaches the text un-decoded (e.g. double-escaped markup).
    /// No trailing `;`, matching the rule in the reference jusText implementation.
    const COPYRIGHT_ENTITY: &str = "&copy";

    for paragraph in paragraphs.iter_mut() {
        paragraph.heading = !config.no_headings && paragraph.is_heading();

        let length = paragraph.text.chars().count();
        let link_density = paragraph.links_density();
        let stopword_density = paragraph.stopwords_density(stoplist);

        // All unconditional rejections share one branch, so no lint suppression is needed.
        // NOTE(review): the "select" test is a plain substring match on the DOM path, so any
        // path component merely containing "select" is rejected too — confirm this is intended.
        paragraph.initial_class = if link_density > config.max_link_density
            || paragraph.text.contains(COPYRIGHT_SIGN)
            || paragraph.text.contains(COPYRIGHT_ENTITY)
            || paragraph.dom_path.contains("select")
        {
            ClassType::Bad
        } else if length < config.length_low {
            // Too short to judge by stop words; link text in a short paragraph is rejected.
            if paragraph.chars_count_in_links > 0 {
                ClassType::Bad
            } else {
                ClassType::Short
            }
        } else if stopword_density >= config.stopwords_high {
            if length > config.length_high {
                ClassType::Good
            } else {
                ClassType::NearGood
            }
        } else if stopword_density >= config.stopwords_low {
            ClassType::NearGood
        } else {
            ClassType::Bad
        };
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for `classify_paragraphs`: one test per classification rule, plus the
    //! heading flag.

    use super::*;
    use crate::paragraph_maker::make_paragraphs;
    use crate::preprocess::preprocess;

    /// Wraps `text` in a single `<p>` element, runs it through `preprocess` and
    /// `make_paragraphs`, and returns the first resulting paragraph with its
    /// `chars_count_in_links` overridden to `chars_in_links`.
    ///
    /// Panics when no paragraph is produced.
    fn make_paragraph(text: &str, chars_in_links: usize) -> Paragraph {
        let html = format!("<html><body><p>{text}</p></body></html>");
        let doc = preprocess(&html);
        let mut parsed = make_paragraphs(&doc);
        assert!(!parsed.is_empty(), "no paragraphs parsed from: {text}");
        let mut first = parsed.remove(0);
        first.chars_count_in_links = chars_in_links;
        first
    }

    /// A stoplist containing no words at all.
    fn empty_stoplist() -> HashSet<String> {
        HashSet::new()
    }

    /// Builds a stoplist from string literals.
    fn stoplist(words: &[&str]) -> HashSet<String> {
        words.iter().copied().map(String::from).collect()
    }

    /// Checks that `paragraphs[i].initial_class == expected[i]` for every index.
    #[track_caller]
    fn assert_initial_classes(paragraphs: &[Paragraph], expected: &[ClassType]) {
        assert_eq!(paragraphs.len(), expected.len());
        for (paragraph, want) in paragraphs.iter().zip(expected) {
            assert_eq!(&paragraph.initial_class, want);
        }
    }

    /// Exercises the link-density rule around the 0.5 threshold.
    #[test]
    fn test_max_link_density() {
        let twenty_chars = "0123456789".repeat(2);
        let eighty_chars = "0123456789".repeat(8);
        let mut paragraphs = vec![
            make_paragraph(&twenty_chars, 0),
            make_paragraph(&twenty_chars, 20),
            make_paragraph(&eighty_chars, 40),
            make_paragraph(&eighty_chars, 39),
            make_paragraph(&eighty_chars, 41),
        ];
        let config = Config {
            max_link_density: 0.5,
            ..Config::default()
        };

        classify_paragraphs(&mut paragraphs, &empty_stoplist(), &config);

        assert_initial_classes(
            &paragraphs,
            &[
                ClassType::Short,
                ClassType::Bad,
                ClassType::Bad,
                ClassType::Bad,
                ClassType::Bad,
            ],
        );
    }

    /// Below `length_low`, a paragraph is `Short` without link text and `Bad` with it.
    #[test]
    fn test_length_low() {
        let text = "0 1 2 3 4 5 6 7 8 9".repeat(2);
        let mut paragraphs = vec![make_paragraph(&text, 0), make_paragraph(&text, 20)];
        let config = Config {
            max_link_density: 1.0,
            length_low: 1000,
            ..Config::default()
        };

        classify_paragraphs(&mut paragraphs, &empty_stoplist(), &config);

        assert_initial_classes(&paragraphs, &[ClassType::Short, ClassType::Bad]);
    }

    /// With the high stop-word threshold met, `length_high` separates `NearGood` from `Good`.
    #[test]
    fn test_stopwords_high() {
        let single = "0 1 2 3 4 5 6 7 8 9";
        let doubled = single.repeat(2);
        let mut paragraphs = vec![make_paragraph(single, 0), make_paragraph(&doubled, 0)];
        let config = Config {
            max_link_density: 1.0,
            length_low: 0,
            stopwords_high: 0.0,
            length_high: 20,
            ..Config::default()
        };

        classify_paragraphs(&mut paragraphs, &stoplist(&["0"]), &config);

        assert_initial_classes(&paragraphs, &[ClassType::NearGood, ClassType::Good]);
    }

    /// With the high threshold out of reach, `stopwords_low` separates `NearGood` from `Bad`.
    #[test]
    fn test_stopwords_low() {
        let mut paragraphs = vec![
            make_paragraph("0 0 0 0 1 2 3 4 5 6 7 8 9", 0),
            make_paragraph("0 1 2 3 4 5 6 7 8 9", 0),
            make_paragraph("1 2 3 4 5 6 7 8 9", 0),
        ];
        let config = Config {
            max_link_density: 1.0,
            length_low: 0,
            stopwords_high: 1000.0,
            stopwords_low: 0.2,
            ..Config::default()
        };

        classify_paragraphs(&mut paragraphs, &stoplist(&["0", "1"]), &config);

        assert_initial_classes(
            &paragraphs,
            &[ClassType::NearGood, ClassType::NearGood, ClassType::Bad],
        );
    }

    /// A paragraph containing the copyright sign (written as an escape) is `Bad`.
    #[test]
    fn test_copyright_symbol() {
        let mut paragraphs = vec![make_paragraph("Copyright \u{00A9} 2024 Acme", 0)];

        classify_paragraphs(&mut paragraphs, &empty_stoplist(), &Config::default());

        assert_initial_classes(&paragraphs, &[ClassType::Bad]);
    }

    /// A paragraph starting with the copyright sign is `Bad`.
    ///
    /// NOTE(review): despite the name, the input holds the sign itself rather than an
    /// entity such as `&copy;` — confirm which form this test was meant to cover.
    #[test]
    fn test_copyright_entity_literal() {
        let mut paragraphs = vec![make_paragraph("© 2024 Acme Corp", 0)];

        classify_paragraphs(&mut paragraphs, &empty_stoplist(), &Config::default());

        assert_initial_classes(&paragraphs, &[ClassType::Bad]);
    }

    /// Every paragraph whose DOM path mentions `select` must be `Bad`.
    ///
    /// The test is intentionally lenient: if the pipeline yields no paragraphs, or none under
    /// a `select`, nothing is asserted and the test passes.
    #[test]
    fn test_select_in_dom_path() {
        let html = "<html><body><select><option>Choose</option></select></body></html>";
        let doc = preprocess(html);
        let mut paragraphs = make_paragraphs(&doc);

        // Classifying an empty list is a no-op, so no explicit early return is required.
        classify_paragraphs(&mut paragraphs, &empty_stoplist(), &Config::default());

        paragraphs
            .iter()
            .filter(|p| p.dom_path.contains("select"))
            .for_each(|p| assert_eq!(p.initial_class, ClassType::Bad));
    }

    /// With default configuration, an `<h1>` is flagged as a heading and a `<p>` is not.
    #[test]
    fn test_heading_detection() {
        let html = "<html><body><h1>A heading</h1><p>body text here</p></body></html>";
        let doc = preprocess(html);
        let mut paragraphs = make_paragraphs(&doc);

        classify_paragraphs(&mut paragraphs, &empty_stoplist(), &Config::default());

        assert!(
            paragraphs[0].heading,
            "h1 paragraph should be marked as heading"
        );
        assert!(!paragraphs[1].heading, "p paragraph should not be heading");
    }

    /// `no_headings: true` suppresses the heading flag even for an `<h1>`.
    #[test]
    fn test_no_headings_config() {
        let html = "<html><body><h1>A heading</h1></body></html>";
        let doc = preprocess(html);
        let mut paragraphs = make_paragraphs(&doc);
        let config = Config {
            no_headings: true,
            ..Config::default()
        };

        classify_paragraphs(&mut paragraphs, &empty_stoplist(), &config);

        assert!(
            !paragraphs[0].heading,
            "heading should be false when no_headings=true"
        );
    }
}