use std::collections::HashMap;
use crate::paragraph::{ClassType, Paragraph};
/// Revises paragraph classes using the classes of surrounding paragraphs.
///
/// Every paragraph's `class_type` is first reset to its `initial_class`; the
/// following passes then run in order and mutate `class_type` in place:
///
/// 1. A heading classified `Short` becomes `NearGood` when a `Good` paragraph
///    follows it closely.
/// 2. Every paragraph still `Short` is resolved to `Good` or `Bad` from its
///    nearest decided neighbours. All verdicts are computed before any of them
///    is applied, so `Short` paragraphs do not influence one another.
/// 3. Every `NearGood` paragraph becomes `Bad` when both of its nearest decided
///    neighbours are `Bad`, and `Good` otherwise.
/// 4. A heading that ended up `Bad` although its `initial_class` was not `Bad`
///    becomes `Good` when a `Good` paragraph follows it closely.
///
/// "Closely" is measured in characters: the texts of the paragraphs skipped
/// while scanning forward may total at most `max_heading_distance` characters.
pub fn revise_paragraph_classification(paragraphs: &mut [Paragraph], max_heading_distance: usize) {
    // Reset *all* classes before any look-ahead happens. Doing the reset lazily
    // inside the heading loop would let a heading inspect the stale `class_type`
    // of the paragraphs that follow it.
    for paragraph in paragraphs.iter_mut() {
        paragraph.class_type = paragraph.initial_class;
    }

    // Stage 1: short headings close to good content become near-good.
    for i in 0..paragraphs.len() {
        if paragraphs[i].heading
            && paragraphs[i].class_type == ClassType::Short
            && good_paragraph_follows(paragraphs, i, max_heading_distance)
        {
            paragraphs[i].class_type = ClassType::NearGood;
        }
    }

    // Stage 2: decide every short paragraph against the same snapshot, then
    // apply the verdicts in one batch.
    let mut new_classes: HashMap<usize, ClassType> = HashMap::new();
    for i in 0..paragraphs.len() {
        if paragraphs[i].class_type == ClassType::Short {
            new_classes.insert(i, classify_short_paragraph(paragraphs, i));
        }
    }
    for (i, class) in new_classes {
        paragraphs[i].class_type = class;
    }

    // Stage 3: resolve near-good paragraphs. This pass is deliberately
    // sequential: a paragraph resolved earlier is visible to later ones.
    for i in 0..paragraphs.len() {
        if paragraphs[i].class_type != ClassType::NearGood {
            continue;
        }
        let prev = get_neighbour(i, paragraphs, true, Direction::Prev);
        let next = get_neighbour(i, paragraphs, true, Direction::Next);
        paragraphs[i].class_type = if prev == ClassType::Bad && next == ClassType::Bad {
            ClassType::Bad
        } else {
            ClassType::Good
        };
    }

    // Stage 4: rescue headings that were demoted to bad by the passes above.
    for i in 0..paragraphs.len() {
        let demoted_heading = paragraphs[i].heading
            && paragraphs[i].class_type == ClassType::Bad
            && paragraphs[i].initial_class != ClassType::Bad;
        if demoted_heading && good_paragraph_follows(paragraphs, i, max_heading_distance) {
            paragraphs[i].class_type = ClassType::Good;
        }
    }
}

/// Returns `true` when a `Good` paragraph is found after index `start` before
/// the accumulated character count of the skipped paragraphs exceeds
/// `max_distance`.
fn good_paragraph_follows(paragraphs: &[Paragraph], start: usize, max_distance: usize) -> bool {
    let mut distance = 0;
    for paragraph in &paragraphs[start + 1..] {
        if distance > max_distance {
            return false;
        }
        if paragraph.class_type == ClassType::Good {
            return true;
        }
        distance += paragraph.text.chars().count();
    }
    false
}

/// Decides the final class of the `Short` paragraph at index `i` from its
/// nearest neighbours.
fn classify_short_paragraph(paragraphs: &[Paragraph], i: usize) -> ClassType {
    let prev = get_neighbour(i, paragraphs, true, Direction::Prev);
    let next = get_neighbour(i, paragraphs, true, Direction::Next);

    if prev == next {
        // With near-good ignored both sides are `Good` or `Bad`; agreeing
        // neighbours decide directly.
        return if prev == ClassType::Good {
            ClassType::Good
        } else {
            ClassType::Bad
        };
    }

    // Mixed neighbours: keep the paragraph only if a near-good paragraph sits
    // between it and the bad side.
    let shielded_from_prev = prev == ClassType::Bad
        && get_neighbour(i, paragraphs, false, Direction::Prev) == ClassType::NearGood;
    let shielded_from_next = next == ClassType::Bad
        && get_neighbour(i, paragraphs, false, Direction::Next) == ClassType::NearGood;
    if shielded_from_prev || shielded_from_next {
        ClassType::Good
    } else {
        ClassType::Bad
    }
}
/// Scan direction used by `get_neighbour` when walking away from a paragraph.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Direction {
    /// Walk towards lower indices.
    Prev,
    /// Walk towards higher indices.
    Next,
}
fn get_neighbour(
i: usize,
paragraphs: &[Paragraph],
ignore_neargood: bool,
direction: Direction,
) -> ClassType {
let len = paragraphs.len();
let mut idx = i as isize;
loop {
idx = match direction {
Direction::Prev => idx - 1,
Direction::Next => idx + 1,
};
if idx < 0 || idx >= len as isize {
return ClassType::Bad;
}
let c = paragraphs[idx as usize].class_type;
match c {
ClassType::Good | ClassType::Bad => return c,
ClassType::NearGood if !ignore_neargood => return c,
ClassType::Short | ClassType::NearGood => continue, }
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::paragraph::ClassType::*;

    /// Builds a paragraph whose `initial_class` and `class_type` are both `cf`.
    fn build(dom_path: &str, xpath: &str, text: &str, cf: ClassType) -> Paragraph {
        let mut p = Paragraph::new(
            dom_path.to_string(),
            xpath.to_string(),
            text.to_string(),
            0,
            0,
        );
        p.initial_class = cf;
        p.class_type = cf;
        p
    }

    /// Body paragraph with the given class and text.
    fn para_text(cf: ClassType, text: &str) -> Paragraph {
        build("body.p", "/html[1]/body[1]/p[1]", text, cf)
    }

    /// Body paragraph with the given class and a fixed filler text.
    fn para(cf: ClassType) -> Paragraph {
        para_text(cf, "some text here")
    }

    /// Paragraph flagged as a heading with the given class.
    fn para_heading(cf: ClassType) -> Paragraph {
        let mut p = build("body.h1", "/html[1]/body[1]/h1[1]", "heading text", cf);
        p.heading = true;
        p
    }

    #[test]
    fn test_stage1_short_heading_near_good_becomes_neargood_then_good() {
        let mut ps = vec![para_heading(Short), para(Good)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[0].class_type, Good);
    }

    #[test]
    fn test_stage1_short_heading_not_promoted_becomes_bad() {
        // The 201-character filler pushes the good paragraph out of reach.
        let mut ps = vec![
            para_heading(Short),
            para_text(Bad, &"x".repeat(201)),
            para(Good),
        ];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[0].class_type, Bad);
    }

    #[test]
    fn test_stage1_non_heading_short_not_promoted() {
        let mut ps = vec![para(Short), para(Good)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[0].class_type, Bad);
    }

    #[test]
    fn test_stage2_short_between_two_good() {
        let mut ps = vec![para(Good), para(Short), para(Good)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[1].class_type, Good);
    }

    #[test]
    fn test_stage2_short_between_two_bad() {
        let mut ps = vec![para(Bad), para(Short), para(Bad)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[1].class_type, Bad);
    }

    #[test]
    fn test_stage2_short_neargood_proximity_prev() {
        let mut ps = vec![para(Good), para(Short), para(NearGood), para(Bad)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[1].class_type, Good);
    }

    #[test]
    fn test_stage2_short_neargood_proximity_next() {
        let mut ps = vec![para(Bad), para(NearGood), para(Short), para(Good)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[2].class_type, Good);
    }

    #[test]
    fn test_stage2_batching() {
        // Neither short paragraph may see the other's freshly assigned class.
        let mut ps = vec![para(Good), para(Short), para(Short), para(Bad)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[1].class_type, Bad);
        assert_eq!(ps[2].class_type, Bad);
    }

    #[test]
    fn test_stage3_neargood_both_bad_neighbors() {
        let mut ps = vec![para(Bad), para(NearGood), para(Bad)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[1].class_type, Bad);
    }

    #[test]
    fn test_stage3_neargood_one_good_neighbor() {
        let mut ps = vec![para(Good), para(NearGood), para(Bad)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[1].class_type, Good);
    }

    #[test]
    fn test_stage3_neargood_at_document_end() {
        let mut ps = vec![para(Good), para(NearGood)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[1].class_type, Good);
    }

    #[test]
    fn test_stage4_heading_bad_cf_not_bad_near_good() {
        // Scenario 1: a short heading that is demoted to bad by the
        // intermediate stages and rescued by the final heading pass.
        let mut ps = vec![
            para_heading(Short),
            para_text(Bad, &"x".repeat(10)),
            para(Good),
        ];
        // The pre-set class must not matter: revision starts from `initial_class`.
        ps[0].class_type = Bad;
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[0].class_type, Good);

        // Scenario 2: the same rescue for a heading that starts as near-good.
        let mut ps2 = vec![
            {
                let mut p = para_heading(NearGood);
                p.class_type = Bad;
                p
            },
            para_text(Bad, "filler"),
            para(Good),
        ];
        revise_paragraph_classification(&mut ps2, 200);
        assert_eq!(ps2[0].class_type, Good);
    }

    #[test]
    fn test_stage4_heading_cf_bad_not_promoted() {
        let mut ps = vec![para_heading(Bad), para(Good)];
        revise_paragraph_classification(&mut ps, 200);
        assert_eq!(ps[0].class_type, Bad);
    }

    #[test]
    fn test_neighbour_at_start_returns_bad() {
        let ps = vec![para(Short), para(Good)];
        let prev = get_neighbour(0, &ps, true, Direction::Prev);
        assert_eq!(prev, Bad);
    }

    #[test]
    fn test_neighbour_at_end_returns_bad() {
        let ps = vec![para(Good), para(Short)];
        let next = get_neighbour(1, &ps, true, Direction::Next);
        assert_eq!(next, Bad);
    }
}