use super::patterns::*;
use ahash::AHashMap;
use memchr::memmem;
use regex::Regex;
pub(crate) const OCR_PENALTY_WEIGHT: f64 = 0.3;
pub(crate) const SCRIPT_PENALTY_WEIGHT: f64 = 0.2;
pub(crate) const NAV_PENALTY_WEIGHT: f64 = 0.1;
pub(crate) const STRUCTURE_BONUS_WEIGHT: f64 = 0.2;
pub(crate) const METADATA_BONUS_WEIGHT: f64 = 0.1;
pub(crate) const MIN_TEXT_LENGTH: usize = 10;
pub(crate) const LARGE_TEXT_LENGTH: usize = 1000;
#[inline]
pub(crate) fn sum_match_lengths(text: &str, pattern: &Regex) -> usize {
pattern.find_iter(text).map(|m| m.len()).sum()
}
#[inline]
pub(crate) fn calculate_ocr_penalty(text: &str, total_chars: f64) -> f64 {
if total_chars == 0.0 {
return 0.0;
}
if memmem::find(text.as_bytes(), b" ").is_none() && memmem::find(text.as_bytes(), b"...").is_none() {
return 0.0;
}
let artifact_chars = sum_match_lengths(text, &SCATTERED_CHARS_PATTERN)
+ sum_match_lengths(text, &REPEATED_PUNCT_PATTERN)
+ count_non_table_dash_artifacts(text)
+ sum_match_lengths(text, &ISOLATED_PUNCT_PATTERN)
+ sum_match_lengths(text, &MALFORMED_WORDS_PATTERN)
+ sum_match_lengths(text, &EXCESSIVE_WHITESPACE_PATTERN);
(artifact_chars as f64 / total_chars).min(1.0)
}
#[inline]
pub(crate) fn count_non_table_dash_artifacts(text: &str) -> usize {
let mut artifact_count = 0;
for line in text.lines() {
let trimmed = line.trim();
let is_table_separator = trimmed.starts_with('|')
&& trimmed.ends_with('|')
&& trimmed
.chars()
.all(|c| c == '|' || c == '-' || c.is_whitespace() || c == ':');
if !is_table_separator {
for m in DASH_PATTERN.find_iter(line) {
artifact_count += m.len();
}
}
}
artifact_count
}
#[inline]
pub(crate) fn calculate_script_penalty(text: &str, total_chars: f64) -> f64 {
if total_chars == 0.0 {
return 0.0;
}
if memmem::find(text.as_bytes(), b"function").is_none()
&& memmem::find(text.as_bytes(), b"<script").is_none()
&& memmem::find(text.as_bytes(), b"<style").is_none()
{
return 0.0;
}
let script_chars = sum_match_lengths(text, &JS_FUNCTION_PATTERN)
+ sum_match_lengths(text, &CSS_RULES_PATTERN)
+ sum_match_lengths(text, &SCRIPT_TAG_PATTERN)
+ sum_match_lengths(text, &STYLE_TAG_PATTERN);
(script_chars as f64 / total_chars).min(1.0)
}
#[inline]
pub(crate) fn calculate_navigation_penalty(text: &str, total_chars: f64) -> f64 {
if total_chars == 0.0 {
return 0.0;
}
let nav_chars = sum_match_lengths(text, &NAV_WORDS_PATTERN)
+ sum_match_lengths(text, &BREADCRUMB_PATTERN)
+ sum_match_lengths(text, &PAGINATION_PATTERN);
(nav_chars as f64 / total_chars).min(1.0)
}
#[inline]
pub(crate) fn calculate_metadata_bonus(metadata: &AHashMap<String, String>) -> f64 {
const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
let present_fields = IMPORTANT_FIELDS
.iter()
.filter(|&&field| metadata.contains_key(field))
.count();
present_fields as f64 / IMPORTANT_FIELDS.len() as f64
}
pub fn calculate_quality_score(text: &str, metadata: Option<&AHashMap<String, String>>) -> f64 {
if text.is_empty() || text.trim().is_empty() {
return 0.0;
}
let total_chars = text.len() as f64;
if text.len() < MIN_TEXT_LENGTH {
return 0.1;
}
let mut score = 1.0;
if text.len() > LARGE_TEXT_LENGTH {
let ocr_penalty = calculate_ocr_penalty(text, total_chars);
let script_penalty = calculate_script_penalty(text, total_chars);
let nav_penalty = calculate_navigation_penalty(text, total_chars);
let structure_bonus = super::heuristics::calculate_structure_bonus(text);
score -= ocr_penalty * OCR_PENALTY_WEIGHT;
score -= script_penalty * SCRIPT_PENALTY_WEIGHT;
score -= nav_penalty * NAV_PENALTY_WEIGHT;
score += structure_bonus * STRUCTURE_BONUS_WEIGHT;
} else {
score -= calculate_ocr_penalty(text, total_chars) * OCR_PENALTY_WEIGHT;
score += super::heuristics::calculate_structure_bonus(text) * STRUCTURE_BONUS_WEIGHT;
}
if let Some(metadata) = metadata {
score += calculate_metadata_bonus(metadata) * METADATA_BONUS_WEIGHT;
}
score.clamp(0.0, 1.0)
}