use serde::{Deserialize, Serialize};
// Relative weights of the four quality signals; they sum to 1.0, so an
// unclamped weighted average stays in [0, 1] when all signals do.
const W_CONTENT_DENSITY: f64 = 0.35;
const W_STRUCTURE: f64 = 0.25;
const W_COMPLETENESS: f64 = 0.25;
const W_ENCODING: f64 = 0.15;
// HTML inputs smaller than this are too small to judge by density ratio.
const MIN_HTML_BYTES: usize = 500;
// Markdown-chars-per-HTML-byte ratio at which content density scores 1.0.
const DENSITY_SATURATION: f64 = 0.30;
// Markdown length (in chars) at which completeness saturates at 1.0.
const COMPLETENESS_SATURATION_CHARS: usize = 500;
// Each bad occurrence (replacement char or mojibake match) deducts
// 1/this from the encoding score.
const ENCODING_BAD_PER_CHARS: f64 = 200.0;
/// Per-signal quality assessment of an HTML-to-Markdown extraction.
///
/// All fields are in `[0.0, 1.0]`; `confidence` is the weighted combination
/// of the other four signals (see `weighted_average`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct QualityScore {
    /// Overall confidence: weighted sum of the four signals, clamped to [0, 1].
    pub confidence: f64,
    /// Ratio of extracted markdown chars to HTML bytes, saturating.
    pub content_density: f64,
    /// Presence of headings, lists, code blocks, and multiple paragraphs.
    pub structure: f64,
    /// Markdown length relative to a saturation threshold.
    pub completeness: f64,
    /// Penalises U+FFFD replacement chars and mojibake sequences.
    pub encoding_quality: f64,
}
/// Scores an extraction by combining four independent signals into a
/// single confidence value.
///
/// Returns a perfect score when the HTML is trivially small *and* no
/// markdown was produced — there was nothing meaningful to extract.
#[must_use]
pub fn score_extraction(html_bytes: &[u8], markdown: &str) -> QualityScore {
    if markdown.is_empty() && html_bytes.len() < MIN_HTML_BYTES {
        return QualityScore::perfect();
    }
    let content_density = score_content_density(html_bytes, markdown);
    let structure = score_structure(markdown);
    let completeness = score_completeness(markdown);
    let encoding_quality = score_encoding(markdown);
    QualityScore {
        confidence: weighted_average(content_density, structure, completeness, encoding_quality),
        content_density,
        structure,
        completeness,
        encoding_quality,
    }
}
impl QualityScore {
    /// A score with every signal at its ceiling — used for trivially-empty
    /// inputs where there is nothing to extract.
    fn perfect() -> Self {
        const TOP: f64 = 1.0;
        Self {
            confidence: TOP,
            content_density: TOP,
            structure: TOP,
            completeness: TOP,
            encoding_quality: TOP,
        }
    }
}
/// Combines the four signals with their module-level weights and clamps
/// the result to [0, 1]. The fold is left-to-right, matching a plain
/// `a*w + b*w + c*w + d*w` expression.
#[inline]
fn weighted_average(density: f64, structure: f64, completeness: f64, encoding: f64) -> f64 {
    let weighted_signals = [
        (density, W_CONTENT_DENSITY),
        (structure, W_STRUCTURE),
        (completeness, W_COMPLETENESS),
        (encoding, W_ENCODING),
    ];
    let total: f64 = weighted_signals
        .iter()
        .map(|(signal, weight)| signal * weight)
        .sum();
    clamp01(total)
}
/// Scores how much markdown text was extracted relative to the HTML size.
///
/// Documents below `MIN_HTML_BYTES` cannot be judged by ratio and score 1.0.
/// Otherwise the chars-per-byte ratio is scaled so that `DENSITY_SATURATION`
/// (or denser) scores 1.0.
fn score_content_density(html_bytes: &[u8], markdown: &str) -> f64 {
    if html_bytes.len() < MIN_HTML_BYTES {
        return 1.0;
    }
    #[allow(clippy::cast_precision_loss)]
    let chars_per_byte = markdown.chars().count() as f64 / html_bytes.len() as f64;
    clamp01(chars_per_byte / DENSITY_SATURATION)
}
/// Scores structural richness on four binary signals — heading, list,
/// code block, and more-than-three paragraphs — each worth one point.
fn score_structure(markdown: &str) -> f64 {
    const MAX_POINTS: f64 = 4.0;
    let mut points: f64 = 0.0;
    let has_heading = markdown.lines().any(|l| l.starts_with('#'));
    let has_list = markdown.lines().any(|l| {
        let t = l.trim_start();
        t.starts_with("- ")
            || t.starts_with("* ")
            || t.starts_with("+ ")
            || starts_with_ordered_list(t)
    });
    // Code is either fenced (```) or an indented code block: a non-blank
    // line *starting* with four spaces or a tab (CommonMark). A bare
    // substring check for spaces would award the code point to almost any
    // prose (e.g. "# Just a heading" contains spaces), which contradicts
    // the heading-only test expecting 0.25.
    let has_code = markdown.contains("```")
        || markdown
            .lines()
            .any(|l| (l.starts_with("    ") || l.starts_with('\t')) && !l.trim().is_empty());
    let paragraph_count = count_paragraphs(markdown);
    if has_heading {
        points += 1.0;
    }
    if has_list {
        points += 1.0;
    }
    if has_code {
        points += 1.0;
    }
    if paragraph_count > 3 {
        points += 1.0;
    }
    clamp01(points / MAX_POINTS)
}
/// Scores markdown length linearly against the saturation threshold:
/// `COMPLETENESS_SATURATION_CHARS` chars (or more) scores 1.0.
fn score_completeness(markdown: &str) -> f64 {
    let char_count = markdown.chars().count();
    #[allow(clippy::cast_precision_loss)]
    let fraction = char_count as f64 / COMPLETENESS_SATURATION_CHARS as f64;
    clamp01(fraction)
}
/// Scores text-encoding health: U+FFFD replacement characters and known
/// mojibake sequences each deduct `1 / ENCODING_BAD_PER_CHARS` from a
/// perfect 1.0. Empty markdown has no encoding evidence and scores 1.0.
fn score_encoding(markdown: &str) -> f64 {
    if markdown.is_empty() {
        return 1.0;
    }
    let replacement_chars = markdown.matches('\u{FFFD}').count();
    let bad_total = replacement_chars + count_mojibake(markdown);
    if bad_total == 0 {
        return 1.0;
    }
    #[allow(clippy::cast_precision_loss)]
    let score = 1.0 - bad_total as f64 / ENCODING_BAD_PER_CHARS;
    clamp01(score)
}
/// Counts blank-line-separated blocks that look like prose paragraphs,
/// excluding empty blocks, headings (`#...`), and fenced code (```` ``` ````).
fn count_paragraphs(markdown: &str) -> usize {
    markdown
        .split("\n\n")
        .map(str::trim)
        .filter(|block| !block.is_empty())
        .filter(|block| !block.starts_with('#') && !block.starts_with("```"))
        .count()
}
/// Returns `true` when `text` begins like a Markdown ordered-list item:
/// one or more ASCII digits, then a literal `". "`.
fn starts_with_ordered_list(text: &str) -> bool {
    // Byte index of the first non-digit char (a char boundary by construction).
    let digits_end = text
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(text.len());
    digits_end > 0 && text[digits_end..].starts_with(". ")
}
/// Counts occurrences of byte sequences typical of mojibake (UTF-8 text
/// mis-decoded as a single-byte encoding and re-encoded).
fn count_mojibake(text: &str) -> usize {
    // NOTE(review): "â€" appears twice in this list, and some entries look
    // like prefixes of others, so a single artifact may be counted more than
    // once. The exact literals may also have been garbled by transcoding of
    // this source — confirm the intended byte sequences against the original.
    const MOJIBAKE_PATTERNS: &[&str] = &[
        "é", "â€", "ä", "ö", "ü", "’", "“", "â€", "à ", "î",
    ];
    MOJIBAKE_PATTERNS
        .iter()
        .map(|pat| text.matches(pat).count())
        .sum()
}
/// Clamps `v` into the closed interval [0, 1].
/// NaN passes through unchanged, matching `f64::clamp` semantics.
#[inline]
fn clamp01(v: f64) -> f64 {
    if v < 0.0 {
        0.0
    } else if v > 1.0 {
        1.0
    } else {
        v
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- content density: markdown chars per HTML byte, saturating ---

    #[test]
    fn content_density_tiny_html_scores_perfect() {
        assert_eq!(score_content_density(b"<p>Hi</p>", "Hi"), 1.0);
    }

    #[test]
    fn content_density_zero_markdown_on_large_html_scores_zero() {
        let html = b"<!DOCTYPE html><html><body>"
            .iter()
            .chain(b"x".repeat(2000).iter())
            .chain(b"</body></html>".iter())
            .copied()
            .collect::<Vec<u8>>();
        assert_eq!(score_content_density(&html, ""), 0.0);
    }

    #[test]
    fn content_density_saturates_at_one_for_dense_output() {
        // 400 / 1000 = 0.40 ratio, above the 0.30 saturation point.
        let html = vec![b'x'; 1000];
        let md = "a".repeat(400);
        assert_eq!(score_content_density(&html, &md), 1.0);
    }

    #[test]
    fn content_density_mid_range_proportional() {
        // 150 / 5000 = 0.03 ratio; 0.03 / 0.30 = 0.10.
        let html = vec![b'x'; 5000];
        let md = "a".repeat(150);
        let score = score_content_density(&html, &md);
        assert!(
            (score - 0.10).abs() < 0.01,
            "expected ~0.10, got {score:.4}"
        );
    }

    // --- structure: heading, list, code, paragraph signals ---

    #[test]
    fn structure_empty_markdown_scores_zero() {
        assert_eq!(score_structure(""), 0.0);
    }

    #[test]
    fn structure_all_elements_present_scores_one() {
        let md = "# Title\n\n- item\n\n```\ncode\n```\n\npara1\n\npara2\n\npara3\n\npara4";
        assert_eq!(score_structure(md), 1.0);
    }

    #[test]
    fn structure_heading_only_scores_quarter() {
        let md = "# Just a heading";
        assert_eq!(score_structure(md), 0.25);
    }

    #[test]
    fn structure_ordered_list_detected() {
        let md = "1. First item\n2. Second item";
        let score = score_structure(md);
        assert!(
            score > 0.0,
            "ordered list should contribute to structure score"
        );
    }

    #[test]
    fn structure_indented_code_detected() {
        let md = "Some intro:\n\n    let x = 1;\n    let y = 2;";
        let score = score_structure(md);
        assert!(
            score > 0.0,
            "indented code should contribute to structure score"
        );
    }

    // --- completeness: char count vs. saturation threshold ---

    #[test]
    fn completeness_empty_markdown_scores_zero() {
        assert_eq!(score_completeness(""), 0.0);
    }

    #[test]
    fn completeness_saturates_at_saturation_chars() {
        let md = "a".repeat(COMPLETENESS_SATURATION_CHARS);
        assert_eq!(score_completeness(&md), 1.0);
    }

    #[test]
    fn completeness_half_saturation_scores_half() {
        let md = "a".repeat(COMPLETENESS_SATURATION_CHARS / 2);
        let score = score_completeness(&md);
        assert!((score - 0.5).abs() < 0.01, "expected 0.5, got {score:.4}");
    }

    // --- encoding: replacement chars and mojibake penalties ---

    #[test]
    fn encoding_clean_utf8_scores_one() {
        assert_eq!(score_encoding("Hello, world! 你好 мир"), 1.0);
    }

    #[test]
    fn encoding_replacement_char_reduces_score() {
        let text = "Hello \u{FFFD} world";
        assert!(
            score_encoding(text) < 1.0,
            "replacement char should reduce encoding score"
        );
    }

    #[test]
    fn encoding_mojibake_reduces_score() {
        // U+00C3 U+00A9 is "é"'s UTF-8 byte pair mis-decoded as Latin-1.
        let text = "caf\u{00C3}\u{00A9} au lait";
        assert!(
            score_encoding(text) < 1.0,
            "mojibake should reduce encoding score"
        );
    }

    #[test]
    fn encoding_empty_scores_one() {
        assert_eq!(score_encoding(""), 1.0);
    }

    // --- end-to-end composite scoring ---

    #[test]
    fn score_extraction_well_structured_article_scores_high() {
        let html = b"<!DOCTYPE html><html><body><article><h1>Title</h1>\
            <p>Paragraph one of the article with useful content.</p>\
            <p>Paragraph two continues the discussion.</p>\
            <ul><li>Point one</li><li>Point two</li></ul>\
            </article></body></html>";
        // Pad past MIN_HTML_BYTES would shrink density; 200 bytes keeps the
        // document under the small-HTML threshold.
        let html_padded = html
            .iter()
            .chain(b" ".repeat(200).iter())
            .copied()
            .collect::<Vec<u8>>();
        let md = "# Title\n\nParagraph one of the article with useful content.\n\n\
            Paragraph two continues the discussion.\n\n- Point one\n- Point two";
        let score = score_extraction(&html_padded, md);
        assert!(
            score.confidence > 0.6,
            "well-structured article should score > 0.6, got {:.3}",
            score.confidence
        );
    }

    #[test]
    fn score_extraction_empty_html_empty_markdown_scores_perfect() {
        let score = score_extraction(b"", "");
        assert_eq!(score.confidence, 1.0);
    }

    #[test]
    fn score_extraction_large_html_no_markdown_scores_low() {
        let html = vec![b'x'; 10_000];
        let score = score_extraction(&html, "");
        assert!(
            score.confidence < 0.4,
            "empty extraction from large HTML should score < 0.4, got {:.3}",
            score.confidence
        );
    }

    #[test]
    fn score_extraction_encoding_issues_penalise_confidence() {
        let html = vec![b'x'; 600];
        let md = "caf\u{00C3}\u{00A9} article ".repeat(50);
        let score = score_extraction(&html, &md);
        assert!(
            score.encoding_quality < 1.0,
            "mojibake should reduce encoding_quality, got {:.3}",
            score.encoding_quality
        );
    }

    #[test]
    fn score_extraction_composite_signals_sum_to_confidence() {
        let html = vec![b'x'; 2000];
        let md = "# Head\n\n- item\n\nsome text\n\nmore text";
        let score = score_extraction(&html, md);
        let expected = (score.content_density * W_CONTENT_DENSITY
            + score.structure * W_STRUCTURE
            + score.completeness * W_COMPLETENESS
            + score.encoding_quality * W_ENCODING)
            .clamp(0.0, 1.0);
        assert!(
            (score.confidence - expected).abs() < 1e-9,
            "confidence {:.6} ≠ weighted sum {:.6}",
            score.confidence,
            expected
        );
    }

    // --- helper units ---

    #[test]
    fn count_paragraphs_counts_non_heading_blocks() {
        let md = "# Heading\n\nFirst para.\n\nSecond para.\n\nThird para.";
        assert_eq!(count_paragraphs(md), 3);
    }

    #[test]
    fn starts_with_ordered_list_true_for_digit_dot_space() {
        assert!(starts_with_ordered_list("1. item"));
        assert!(starts_with_ordered_list("42. item"));
    }

    #[test]
    fn starts_with_ordered_list_false_for_non_matching() {
        assert!(!starts_with_ordered_list("- item"));
        assert!(!starts_with_ordered_list("1.item"));
        assert!(!starts_with_ordered_list("text"));
        assert!(!starts_with_ordered_list(""));
    }

    #[test]
    fn clamp01_clamps_negative_and_above_one() {
        assert_eq!(clamp01(-1.0), 0.0);
        assert_eq!(clamp01(2.0), 1.0);
        assert_eq!(clamp01(0.5), 0.5);
    }
}