/// Category of a document, used to pick an extraction preset through
/// [`ExtractionProfile::for_document_type`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentType {
    /// Resolves to [`ExtractionProfile::ACADEMIC`].
    Academic,
    /// Resolves to [`ExtractionProfile::POLICY`].
    Policy,
    /// Resolves to [`ExtractionProfile::GOVERNMENT`].
    Government,
    /// Resolves to [`ExtractionProfile::FORM`].
    Form,
    /// Resolves to [`ExtractionProfile::SCANNED_OCR`].
    ScannedOCR,
    /// Resolves to [`ExtractionProfile::BALANCED`]; there is no dedicated
    /// preset for this variant.
    Mixed,
}
/// A named bundle of tuning values and feature switches for text extraction.
///
/// Ready-made presets are exposed as associated constants (for example
/// [`ExtractionProfile::BALANCED`]).
///
/// NOTE(review): the code that consumes these fields is not visible from this
/// definition, so the per-field notes below describe only what the names and
/// preset values show; confirm exact semantics against the consumer.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractionProfile {
    /// Display name of the profile; this is also the exact string that
    /// [`ExtractionProfile::by_name`] matches on.
    pub name: &'static str,
    /// Threshold applied to "TJ" offsets. Every built-in preset uses a
    /// negative value. NOTE(review): presumably refers to PDF `TJ` array
    /// positioning adjustments — confirm.
    pub tj_offset_threshold: f32,
    /// Word-margin ratio. NOTE(review): the quantity it is relative to is not
    /// visible here — confirm.
    pub word_margin_ratio: f32,
    /// Space threshold expressed as a ratio of an em, going by the field
    /// name. NOTE(review): confirm against the consumer.
    pub space_threshold_em_ratio: f32,
    /// Multiplier related to the space character. NOTE(review): presumably
    /// scales the space glyph width — confirm.
    pub space_char_multiplier: f32,
    /// Switch for adaptive thresholding.
    pub use_adaptive_threshold: bool,
    /// Switch for document type detection.
    pub enable_document_type_detection: bool,
    /// Switch for email detection.
    pub enable_email_detection: bool,
    /// Switch for citation detection.
    pub enable_citation_detection: bool,
}
impl ExtractionProfile {
    /// Preset named "Conservative (Default)".
    ///
    /// Uses the most negative `tj_offset_threshold` of the presets (-120.0,
    /// shared with [`Self::FORM`]) and leaves every switch off.
    pub const CONSERVATIVE: Self = Self {
        name: "Conservative (Default)",
        tj_offset_threshold: -120.0,
        word_margin_ratio: 0.1,
        space_threshold_em_ratio: 0.25,
        space_char_multiplier: 0.5,
        use_adaptive_threshold: false,
        enable_document_type_detection: false,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Preset named "Aggressive".
    ///
    /// Uses the least negative `tj_offset_threshold` (-80.0) and the largest
    /// `space_char_multiplier` (0.8) of the presets; every switch is off.
    pub const AGGRESSIVE: Self = Self {
        name: "Aggressive",
        tj_offset_threshold: -80.0,
        word_margin_ratio: 0.2,
        space_threshold_em_ratio: 0.15,
        space_char_multiplier: 0.8,
        use_adaptive_threshold: false,
        enable_document_type_detection: false,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Preset named "Balanced".
    ///
    /// Every switch is off. Also the preset returned for
    /// [`DocumentType::Mixed`].
    pub const BALANCED: Self = Self {
        name: "Balanced",
        tj_offset_threshold: -100.0,
        word_margin_ratio: 0.15,
        space_threshold_em_ratio: 0.2,
        space_char_multiplier: 0.65,
        use_adaptive_threshold: false,
        enable_document_type_detection: false,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Preset named "Academic".
    ///
    /// Turns on adaptive thresholding and is the only preset that enables
    /// email and citation detection.
    pub const ACADEMIC: Self = Self {
        name: "Academic",
        tj_offset_threshold: -105.0,
        word_margin_ratio: 0.12,
        space_threshold_em_ratio: 0.18,
        space_char_multiplier: 0.6,
        use_adaptive_threshold: true,
        enable_document_type_detection: false,
        enable_email_detection: true,
        enable_citation_detection: true,
    };

    /// Preset named "Policy"; adaptive thresholding on, detection switches off.
    pub const POLICY: Self = Self {
        name: "Policy",
        tj_offset_threshold: -110.0,
        word_margin_ratio: 0.18,
        space_threshold_em_ratio: 0.22,
        space_char_multiplier: 0.7,
        use_adaptive_threshold: true,
        enable_document_type_detection: false,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Preset named "Form".
    ///
    /// Uses the smallest `word_margin_ratio` of the presets (0.08); every
    /// switch is off.
    pub const FORM: Self = Self {
        name: "Form",
        tj_offset_threshold: -120.0,
        word_margin_ratio: 0.08,
        space_threshold_em_ratio: 0.2,
        space_char_multiplier: 0.5,
        use_adaptive_threshold: false,
        enable_document_type_detection: false,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Preset named "Government"; adaptive thresholding on, detection
    /// switches off.
    pub const GOVERNMENT: Self = Self {
        name: "Government",
        tj_offset_threshold: -105.0,
        word_margin_ratio: 0.14,
        space_threshold_em_ratio: 0.2,
        space_char_multiplier: 0.65,
        use_adaptive_threshold: true,
        enable_document_type_detection: false,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Preset named "Scanned OCR"; adaptive thresholding on, detection
    /// switches off.
    pub const SCANNED_OCR: Self = Self {
        name: "Scanned OCR",
        tj_offset_threshold: -85.0,
        word_margin_ratio: 0.2,
        space_threshold_em_ratio: 0.15,
        space_char_multiplier: 0.75,
        use_adaptive_threshold: true,
        enable_document_type_detection: false,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Preset named "Adaptive".
    ///
    /// Carries the same numeric tuning as [`Self::BALANCED`], but turns on
    /// adaptive thresholding and is the only preset that enables document
    /// type detection.
    pub const ADAPTIVE: Self = Self {
        name: "Adaptive",
        tj_offset_threshold: -100.0,
        word_margin_ratio: 0.15,
        space_threshold_em_ratio: 0.2,
        space_char_multiplier: 0.65,
        use_adaptive_threshold: true,
        enable_document_type_detection: true,
        enable_email_detection: false,
        enable_citation_detection: false,
    };

    /// Every built-in preset, in the order their names are listed by
    /// [`Self::all_profiles`]. Single lookup table behind the name-based API.
    const PRESETS: [Self; 9] = [
        Self::CONSERVATIVE,
        Self::AGGRESSIVE,
        Self::BALANCED,
        Self::ACADEMIC,
        Self::POLICY,
        Self::FORM,
        Self::GOVERNMENT,
        Self::SCANNED_OCR,
        Self::ADAPTIVE,
    ];

    /// Names of [`Self::PRESETS`], computed at compile time so the list cannot
    /// drift from the table.
    const PRESET_NAMES: [&'static str; 9] = {
        let mut names = [""; 9];
        let mut idx = 0;
        // `while` rather than an iterator: iterator adaptors are not usable
        // in constant evaluation.
        while idx < names.len() {
            names[idx] = Self::PRESETS[idx].name;
            idx += 1;
        }
        names
    };

    /// Returns the preset associated with `doc_type`.
    ///
    /// Each variant has a dedicated preset except [`DocumentType::Mixed`],
    /// which resolves to [`Self::BALANCED`].
    pub fn for_document_type(doc_type: DocumentType) -> Self {
        match doc_type {
            DocumentType::Academic => Self::ACADEMIC,
            DocumentType::Policy => Self::POLICY,
            DocumentType::Government => Self::GOVERNMENT,
            DocumentType::Form => Self::FORM,
            DocumentType::ScannedOCR => Self::SCANNED_OCR,
            DocumentType::Mixed => Self::BALANCED,
        }
    }

    /// Returns the names of all built-in presets.
    ///
    /// Every returned name is accepted by [`Self::by_name`].
    pub fn all_profiles() -> &'static [&'static str] {
        &Self::PRESET_NAMES
    }

    /// Looks up a built-in preset by its exact `name`.
    ///
    /// The comparison is an exact string match against each preset's `name`
    /// field. Returns `None` when no preset carries that name.
    pub fn by_name(name: &str) -> Option<Self> {
        Self::PRESETS
            .iter()
            .find(|preset| preset.name == name)
            .cloned()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Presets expose the display names callers rely on.
    #[test]
    fn test_profile_creation() {
        assert_eq!(ExtractionProfile::CONSERVATIVE.name, "Conservative (Default)");
        assert_eq!(ExtractionProfile::ACADEMIC.name, "Academic");
        assert_eq!(ExtractionProfile::POLICY.name, "Policy");
    }

    /// Every `DocumentType` variant resolves to its expected preset,
    /// including `Mixed`, which falls back to `BALANCED`.
    #[test]
    fn test_profiles_by_document_type() {
        let expected = [
            (DocumentType::Academic, ExtractionProfile::ACADEMIC),
            (DocumentType::Policy, ExtractionProfile::POLICY),
            (DocumentType::Government, ExtractionProfile::GOVERNMENT),
            (DocumentType::Form, ExtractionProfile::FORM),
            (DocumentType::ScannedOCR, ExtractionProfile::SCANNED_OCR),
            (DocumentType::Mixed, ExtractionProfile::BALANCED),
        ];
        for (doc_type, profile) in &expected {
            assert_eq!(ExtractionProfile::for_document_type(*doc_type), *profile);
        }
    }

    /// Known names resolve to the matching preset; unknown names yield `None`.
    #[test]
    fn test_profile_by_name() {
        assert!(ExtractionProfile::by_name("Academic").is_some());
        assert_eq!(
            ExtractionProfile::by_name("Academic"),
            Some(ExtractionProfile::ACADEMIC)
        );
        assert!(ExtractionProfile::by_name("InvalidProfile").is_none());
        assert!(ExtractionProfile::by_name("").is_none());
    }

    /// Every advertised name is unique and round-trips through `by_name`.
    #[test]
    fn test_all_profiles_round_trip() {
        let names = ExtractionProfile::all_profiles();
        assert!(!names.is_empty());
        for (idx, name) in names.iter().enumerate() {
            let profile = ExtractionProfile::by_name(name)
                .expect("every advertised profile name must resolve");
            assert_eq!(profile.name, *name);
            // A duplicate name would make the later preset unreachable by name.
            assert!(!names[..idx].contains(name), "duplicate profile name: {name}");
        }
    }
}