use std::collections::HashMap;
use super::style_mapping::StyleMapping;
use crate::model::{Block, Document, HeadingLevel, Paragraph, Section};
/// Settings that control how `HeadingAnalyzer` classifies paragraphs as headings.
#[derive(Debug, Clone)]
pub struct HeadingConfig {
/// Deepest heading level that may be emitted; deeper levels are capped to this value.
pub max_heading_level: u8,
/// Maximum number of characters (after trimming) a paragraph may have and still be
/// treated as a heading once its explicit style is not simply trusted.
pub max_text_length: usize,
/// Factor applied to the document's base font size to obtain the minimum size at
/// which an all-bold paragraph is inferred to be a heading.
pub size_threshold_ratio: f32,
/// When `true`, a paragraph that already carries a heading level is accepted without
/// running the list-marker and length checks.
pub trust_explicit_styles: bool,
/// When `true`, headings that form a run of consecutively numbered paragraphs are demoted.
pub analyze_sequences: bool,
/// Minimum length of a numbered run before its headings are demoted.
pub min_sequence_count: usize,
/// Optional style-id / style-name lookup that is consulted before any other rule.
pub style_mapping: Option<StyleMapping>,
}
/// Default settings: explicit styles are trusted, sequence analysis is on, and no
/// style mapping is installed.
impl Default for HeadingConfig {
fn default() -> Self {
Self {
// Heading levels deeper than 4 are capped to level 4.
max_heading_level: 4,
// Untrusted paragraphs longer than 80 characters are never headings.
max_text_length: 80,
// Bold text must reach roughly 1.2x the base font size to be inferred as a heading.
size_threshold_ratio: 1.2,
trust_explicit_styles: true,
analyze_sequences: true,
// Two consecutively numbered paragraphs are already treated as a list.
min_sequence_count: 2,
style_mapping: None,
}
}
}
impl HeadingConfig {
    /// Returns the default configuration; identical to [`HeadingConfig::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the deepest heading level to emit. The value is clamped to `1..=6`.
    pub fn with_max_level(self, level: u8) -> Self {
        Self {
            max_heading_level: level.clamp(1, 6),
            ..self
        }
    }

    /// Sets the maximum character count a heading candidate may have.
    pub fn with_max_text_length(self, length: usize) -> Self {
        Self {
            max_text_length: length,
            ..self
        }
    }

    /// Sets the font-size ratio used for heading inference. Values below `1.0`
    /// are raised to `1.0`.
    pub fn with_size_ratio(self, ratio: f32) -> Self {
        Self {
            size_threshold_ratio: ratio.max(1.0),
            ..self
        }
    }

    /// Chooses whether paragraphs that already carry a heading level are accepted as-is.
    pub fn with_trust_explicit(self, trust: bool) -> Self {
        Self {
            trust_explicit_styles: trust,
            ..self
        }
    }

    /// Enables or disables demotion of consecutively numbered headings.
    pub fn with_sequence_analysis(self, analyze: bool) -> Self {
        Self {
            analyze_sequences: analyze,
            ..self
        }
    }

    /// Installs a custom style mapping that is consulted before every other rule.
    pub fn with_style_mapping(self, mapping: StyleMapping) -> Self {
        Self {
            style_mapping: Some(mapping),
            ..self
        }
    }

    /// Installs the mapping returned by `StyleMapping::with_defaults()`.
    pub fn with_default_style_mapping(self) -> Self {
        self.with_style_mapping(StyleMapping::with_defaults())
    }
}
/// Aggregate formatting statistics gathered over a whole document.
#[derive(Debug, Clone, Default)]
pub struct DocumentStats {
    /// Number of occurrences recorded for each font size.
    pub font_sizes: HashMap<u32, usize>,
    /// Most frequently recorded font size, once
    /// [`DocumentStats::calculate_base_font_size`] has run on non-empty data.
    pub base_font_size: Option<u32>,
    /// Number of paragraphs containing at least one bold run.
    pub bold_paragraphs: usize,
    /// Number of paragraphs examined.
    pub total_paragraphs: usize,
    /// Number of paragraphs that already carried a heading level.
    pub explicit_heading_count: usize,
}

impl DocumentStats {
    /// Sets `base_font_size` to the most frequently recorded font size.
    ///
    /// When several sizes share the highest count, the smallest of them is chosen.
    /// `HashMap` iteration order is unspecified, so without an explicit tie-break the
    /// result could differ between runs on the same document. Leaves `base_font_size`
    /// as `None` when no sizes were recorded.
    pub fn calculate_base_font_size(&mut self) {
        self.base_font_size = self
            .font_sizes
            .iter()
            // `Reverse(size)` makes the smaller size win among equal counts.
            .max_by_key(|&(&size, &count)| (count, std::cmp::Reverse(size)))
            .map(|(&size, _)| size);
    }

    /// Returns `true` when `size` is at least `base_font_size * ratio`.
    ///
    /// Always returns `false` while no base font size is known.
    ///
    /// NOTE(review): the threshold is truncated towards zero by the `as u32` cast, so
    /// a size marginally below the exact product can still qualify. Kept as-is to
    /// preserve existing classification results.
    pub fn is_larger_than_base(&self, size: u32, ratio: f32) -> bool {
        match self.base_font_size {
            Some(base) => size >= (base as f32 * ratio) as u32,
            None => false,
        }
    }
}
/// Outcome of classifying a single paragraph.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeadingDecision {
/// Heading level taken from the style mapping or from the paragraph's own heading level.
Explicit(HeadingLevel),
/// Heading level inferred from formatting (bold text with a large font size).
Inferred(HeadingLevel),
/// A heading candidate that was rejected (list marker, over-long text, or part of a
/// numbered sequence).
Demoted,
/// Not a heading.
None,
}
impl HeadingDecision {
    /// Returns `true` for the two variants that carry a heading level.
    pub fn is_heading(&self) -> bool {
        match self {
            HeadingDecision::Explicit(_) | HeadingDecision::Inferred(_) => true,
            HeadingDecision::Demoted | HeadingDecision::None => false,
        }
    }

    /// Returns the heading level carried by this decision, if any.
    pub fn level(&self) -> Option<HeadingLevel> {
        match *self {
            HeadingDecision::Explicit(level) | HeadingDecision::Inferred(level) => Some(level),
            HeadingDecision::Demoted | HeadingDecision::None => None,
        }
    }
}
/// Classifies paragraphs as headings using a `HeadingConfig` together with
/// statistics collected from the document being analysed.
pub struct HeadingAnalyzer {
// Rules and thresholds applied to every paragraph.
config: HeadingConfig,
// Statistics from the most recent `analyze` call; empty until then.
stats: DocumentStats,
}
impl HeadingAnalyzer {
pub fn new(config: HeadingConfig) -> Self {
Self {
config,
stats: DocumentStats::default(),
}
}
pub fn with_defaults() -> Self {
Self::new(HeadingConfig::default())
}
pub fn analyze(&mut self, doc: &Document) -> Vec<Vec<HeadingDecision>> {
self.collect_stats(doc);
self.analyze_sections(&doc.sections)
}
pub fn analyze_sections(&self, sections: &[Section]) -> Vec<Vec<HeadingDecision>> {
sections
.iter()
.map(|section| self.analyze_section(section))
.collect()
}
fn analyze_section(&self, section: &Section) -> Vec<HeadingDecision> {
let paragraphs: Vec<&Paragraph> = section
.content
.iter()
.filter_map(|block| {
if let Block::Paragraph(para) = block {
Some(para)
} else {
None
}
})
.collect();
self.analyze_paragraphs(¶graphs)
}
fn analyze_paragraphs(&self, paragraphs: &[&Paragraph]) -> Vec<HeadingDecision> {
let mut decisions = Vec::with_capacity(paragraphs.len());
for para in paragraphs {
decisions.push(self.decide_heading(para));
}
if self.config.analyze_sequences {
self.apply_sequence_analysis(paragraphs, &mut decisions);
}
decisions
}
fn collect_stats(&mut self, doc: &Document) {
self.stats = DocumentStats::default();
for section in &doc.sections {
for block in §ion.content {
if let Block::Paragraph(para) = block {
self.collect_paragraph_stats(para);
}
}
}
self.stats.calculate_base_font_size();
}
fn collect_paragraph_stats(&mut self, para: &Paragraph) {
self.stats.total_paragraphs += 1;
if para.heading.is_heading() {
self.stats.explicit_heading_count += 1;
}
let mut has_bold = false;
for run in ¶.runs {
if let Some(size) = run.style.size {
*self.stats.font_sizes.entry(size).or_insert(0) += 1;
}
if run.style.bold {
has_bold = true;
}
}
if has_bold {
self.stats.bold_paragraphs += 1;
}
}
fn decide_heading(&self, para: &Paragraph) -> HeadingDecision {
let plain_text = para.plain_text();
let trimmed = plain_text.trim();
if let Some(ref mapping) = self.config.style_mapping {
if let Some(level) = mapping.get(para.style_id.as_deref(), para.style_name.as_deref()) {
let capped = self.cap_heading_level(level);
return HeadingDecision::Explicit(capped);
}
}
if para.heading.is_heading() && self.config.trust_explicit_styles {
let level = self.cap_heading_level(para.heading);
return HeadingDecision::Explicit(level);
}
if self.looks_like_list_item(trimmed) {
return if para.heading.is_heading() {
HeadingDecision::Demoted
} else {
HeadingDecision::None
};
}
if trimmed.chars().count() > self.config.max_text_length {
return if para.heading.is_heading() {
HeadingDecision::Demoted
} else {
HeadingDecision::None
};
}
if let Some(inferred) = self.infer_heading_from_style(para) {
return HeadingDecision::Inferred(inferred);
}
if para.heading.is_heading() {
let level = self.cap_heading_level(para.heading);
return HeadingDecision::Explicit(level);
}
HeadingDecision::None
}
fn looks_like_list_item(&self, text: &str) -> bool {
if text.is_empty() {
return false;
}
const LIST_MARKERS: &[char] = &[
'ㅇ', 'ㆍ', '○', '●', '◎', '■', '□', '▪', '▫', '◆', '◇', '★', '☆', '※', '•', '-', '–',
'—', '→', '▶', '►', '▷', '▹', '◁', '◀', '◃', '◂',
];
text.chars()
.next()
.map(|c| LIST_MARKERS.contains(&c))
.unwrap_or(false)
}
fn infer_heading_from_style(&self, para: &Paragraph) -> Option<HeadingLevel> {
if para.runs.is_empty() || para.plain_text().trim().is_empty() {
return None;
}
let all_bold = para
.runs
.iter()
.filter(|r| !r.text.is_empty())
.all(|r| r.style.bold);
let dominant_size = self.get_dominant_font_size(para);
if !all_bold {
return None;
}
if let Some(size) = dominant_size {
if self
.stats
.is_larger_than_base(size, self.config.size_threshold_ratio)
{
let level = self.infer_level_from_size(size);
return Some(self.cap_heading_level(level));
}
}
None
}
fn get_dominant_font_size(&self, para: &Paragraph) -> Option<u32> {
let mut sizes: HashMap<u32, usize> = HashMap::new();
for run in ¶.runs {
if let Some(size) = run.style.size {
let text_len = run.text.chars().count();
*sizes.entry(size).or_insert(0) += text_len;
}
}
sizes
.into_iter()
.max_by_key(|(_, count)| *count)
.map(|(size, _)| size)
}
fn infer_level_from_size(&self, size: u32) -> HeadingLevel {
let base = self.stats.base_font_size.unwrap_or(24);
let ratio = size as f32 / base as f32;
if ratio >= 2.0 {
HeadingLevel::H1
} else if ratio >= 1.5 {
HeadingLevel::H2
} else if ratio >= 1.2 {
HeadingLevel::H3
} else {
HeadingLevel::H4
}
}
fn cap_heading_level(&self, level: HeadingLevel) -> HeadingLevel {
let current = level.level();
if current > self.config.max_heading_level {
HeadingLevel::from_number(self.config.max_heading_level)
} else {
level
}
}
fn apply_sequence_analysis(
&self,
paragraphs: &[&Paragraph],
decisions: &mut [HeadingDecision],
) {
if paragraphs.len() < self.config.min_sequence_count {
return;
}
let mut i = 0;
while i < paragraphs.len() {
if let Some(seq_len) = self.detect_sequence_at(paragraphs, i) {
if seq_len >= self.config.min_sequence_count {
for decision in decisions.iter_mut().skip(i).take(seq_len) {
if decision.is_heading() {
*decision = HeadingDecision::Demoted;
}
}
i += seq_len;
continue;
}
}
i += 1;
}
}
fn detect_sequence_at(&self, paragraphs: &[&Paragraph], start: usize) -> Option<usize> {
let first_text = paragraphs[start].plain_text();
let first_trimmed = first_text.trim();
let first_marker = self.extract_sequence_marker(first_trimmed)?;
let mut seq_len = 1;
let mut expected_next = self.next_marker(&first_marker)?;
for para in paragraphs.iter().skip(start + 1) {
let text = para.plain_text();
let trimmed = text.trim();
if let Some(marker) = self.extract_sequence_marker(trimmed) {
if marker == expected_next {
seq_len += 1;
if let Some(next) = self.next_marker(&marker) {
expected_next = next;
} else {
break;
}
} else {
break;
}
} else {
break;
}
}
if seq_len >= 2 {
Some(seq_len)
} else {
None
}
}
fn extract_sequence_marker(&self, text: &str) -> Option<String> {
let text = text.trim_start();
if text.is_empty() {
return None;
}
let chars: Vec<char> = text.chars().take(10).collect();
if chars[0] == '(' {
if let Some(close_idx) = chars.iter().position(|&c| c == ')') {
let inner: String = chars[1..close_idx].iter().collect();
if !inner.is_empty()
&& (inner.chars().all(|c| c.is_ascii_digit())
|| inner.chars().count() == 1
&& inner.chars().next().is_some_and(|c| c.is_ascii_lowercase())
|| inner.chars().count() == 1
&& inner.chars().next().is_some_and(is_korean_sequence_char))
{
return Some(inner);
}
}
}
let mut num_end = 0;
for (i, &c) in chars.iter().enumerate() {
if c.is_ascii_digit() {
num_end = i + 1;
} else {
break;
}
}
if num_end > 0 && num_end < chars.len() {
let next = chars[num_end];
if next == '.' || next == ')' {
return Some(chars[..num_end].iter().collect());
}
}
if chars.len() >= 2
&& is_korean_sequence_char(chars[0])
&& (chars[1] == '.' || chars[1] == ')')
{
return Some(chars[0].to_string());
}
if chars.len() >= 2 && chars[0].is_ascii_lowercase() && (chars[1] == '.' || chars[1] == ')')
{
return Some(chars[0].to_string());
}
None
}
fn next_marker(&self, marker: &str) -> Option<String> {
if let Ok(n) = marker.parse::<u32>() {
return Some((n + 1).to_string());
}
if marker.chars().count() == 1 {
let c = marker.chars().next()?;
const KOREAN_SEQ: &[char] = &[
'가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하',
];
if let Some(idx) = KOREAN_SEQ.iter().position(|&x| x == c) {
if idx + 1 < KOREAN_SEQ.len() {
return Some(KOREAN_SEQ[idx + 1].to_string());
}
}
if c.is_ascii_lowercase() && c != 'z' {
return Some(((c as u8) + 1) as char).map(|c| c.to_string());
}
}
None
}
pub fn stats(&self) -> &DocumentStats {
&self.stats
}
pub fn config(&self) -> &HeadingConfig {
&self.config
}
}
/// Returns `true` when `c` is one of the fourteen Korean syllables used as
/// enumeration labels ('가' through '하').
fn is_korean_sequence_char(c: char) -> bool {
    matches!(
        c,
        '가' | '나' | '다' | '라' | '마' | '바' | '사' | '아' | '자' | '차' | '카' | '타' | '파' | '하'
    )
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::{RevisionType, TextRun, TextStyle};

    /// Builds a single-run, unstyled paragraph with the given heading level.
    fn make_paragraph(text: &str, heading: HeadingLevel) -> Paragraph {
        Paragraph {
            runs: vec![TextRun::plain(text)],
            heading,
            ..Default::default()
        }
    }

    /// Builds a single-run bold paragraph with an explicit font size and no heading level.
    fn make_bold_paragraph(text: &str, font_size: u32) -> Paragraph {
        Paragraph {
            runs: vec![TextRun {
                text: text.to_string(),
                style: TextStyle {
                    bold: true,
                    size: Some(font_size),
                    ..Default::default()
                },
                hyperlink: None,
                line_break: false,
                page_break: false,
                revision: RevisionType::None,
            }],
            heading: HeadingLevel::None,
            ..Default::default()
        }
    }

    #[test]
    fn test_explicit_heading_trusted() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("제목", HeadingLevel::H1);
        let decision = analyzer.decide_heading(&para);
        assert!(matches!(
            decision,
            HeadingDecision::Explicit(HeadingLevel::H1)
        ));
    }

    #[test]
    fn test_explicit_heading_with_list_marker_trusted() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("ㅇ 항목 내용", HeadingLevel::H2);
        let decision = analyzer.decide_heading(&para);
        assert!(matches!(
            decision,
            HeadingDecision::Explicit(HeadingLevel::H2)
        ));
    }

    #[test]
    fn test_list_marker_demoted_when_untrusted() {
        let config = HeadingConfig::default().with_trust_explicit(false);
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("ㅇ 항목 내용", HeadingLevel::H2);
        let decision = analyzer.decide_heading(&para);
        assert_eq!(decision, HeadingDecision::Demoted);
    }

    #[test]
    fn test_numbered_heading_preserved_when_standalone() {
        let config = HeadingConfig::default().with_trust_explicit(false);
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("1. 첫 번째 항목", HeadingLevel::H2);
        let decision = analyzer.decide_heading(&para);
        assert!(matches!(
            decision,
            HeadingDecision::Explicit(HeadingLevel::H2)
        ));
    }

    #[test]
    fn test_long_text_demoted_when_untrusted() {
        let config = HeadingConfig::default().with_trust_explicit(false);
        let analyzer = HeadingAnalyzer::new(config);
        let long_text = "이것은 매우 긴 텍스트입니다. ".repeat(5);
        let para = make_paragraph(&long_text, HeadingLevel::H2);
        let decision = analyzer.decide_heading(&para);
        assert_eq!(decision, HeadingDecision::Demoted);
    }

    #[test]
    fn test_list_marker_none_without_heading() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("ㅇ 항목 내용", HeadingLevel::None);
        let decision = analyzer.decide_heading(&para);
        assert_eq!(decision, HeadingDecision::None);
    }

    #[test]
    fn test_inferred_heading() {
        let config = HeadingConfig::default();
        let mut analyzer = HeadingAnalyzer::new(config);
        analyzer.stats.base_font_size = Some(24);
        let para = make_bold_paragraph("추론된 제목", 32);
        let decision = analyzer.decide_heading(&para);
        assert!(matches!(decision, HeadingDecision::Inferred(_)));
    }

    #[test]
    fn test_sequence_detection() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        assert!(analyzer.extract_sequence_marker("1. 항목").is_some());
        assert!(analyzer.extract_sequence_marker("2) 항목").is_some());
        assert!(analyzer.extract_sequence_marker("(3) 항목").is_some());
        assert!(analyzer.extract_sequence_marker("가. 항목").is_some());
        assert!(analyzer.extract_sequence_marker("a. 항목").is_some());
        assert!(analyzer.extract_sequence_marker("일반 텍스트").is_none());
        assert!(analyzer.extract_sequence_marker("제목").is_none());
    }

    #[test]
    fn test_sequence_marker_extraction() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        assert_eq!(
            analyzer.extract_sequence_marker("1. 항목"),
            Some("1".to_string())
        );
        assert_eq!(
            analyzer.extract_sequence_marker("2) 항목"),
            Some("2".to_string())
        );
        assert_eq!(
            analyzer.extract_sequence_marker("(3) 항목"),
            Some("3".to_string())
        );
        assert_eq!(
            analyzer.extract_sequence_marker("가. 항목"),
            Some("가".to_string())
        );
        assert_eq!(
            analyzer.extract_sequence_marker("a. 항목"),
            Some("a".to_string())
        );
    }

    #[test]
    fn test_next_marker() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        assert_eq!(analyzer.next_marker("1"), Some("2".to_string()));
        assert_eq!(analyzer.next_marker("9"), Some("10".to_string()));
        assert_eq!(analyzer.next_marker("가"), Some("나".to_string()));
        assert_eq!(analyzer.next_marker("a"), Some("b".to_string()));
    }

    #[test]
    fn test_korean_sequence_patterns() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        assert_eq!(
            analyzer.extract_sequence_marker("가. 첫째"),
            Some("가".to_string())
        );
        assert_eq!(
            analyzer.extract_sequence_marker("나) 둘째"),
            Some("나".to_string())
        );
        assert_eq!(
            analyzer.extract_sequence_marker("(다) 셋째"),
            Some("다".to_string())
        );
        // A syllable outside the enumeration set must not be taken for a label.
        assert!(analyzer.extract_sequence_marker("각. 항목").is_none());
    }

    #[test]
    fn test_arrow_marker_demoted_when_untrusted() {
        let config = HeadingConfig::default().with_trust_explicit(false);
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("→ 화살표 항목", HeadingLevel::H2);
        let decision = analyzer.decide_heading(&para);
        assert_eq!(decision, HeadingDecision::Demoted);
    }

    #[test]
    fn test_max_heading_level_capped() {
        let config = HeadingConfig::default().with_max_level(2);
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("제목", HeadingLevel::H4);
        let decision = analyzer.decide_heading(&para);
        assert!(matches!(
            decision,
            HeadingDecision::Explicit(HeadingLevel::H2)
        ));
    }

    #[test]
    fn test_sequence_analysis_demotes_consecutive() {
        let config = HeadingConfig::default().with_trust_explicit(false);
        let analyzer = HeadingAnalyzer::new(config);
        let paras = vec![
            make_paragraph("1. 첫째", HeadingLevel::H2),
            make_paragraph("2. 둘째", HeadingLevel::H2),
            make_paragraph("3. 셋째", HeadingLevel::H2),
        ];
        let para_refs: Vec<&Paragraph> = paras.iter().collect();
        let decisions = analyzer.analyze_paragraphs(&para_refs);
        assert!(decisions
            .iter()
            .all(|d| matches!(d, HeadingDecision::Demoted)));
    }

    #[test]
    fn test_standalone_numbered_heading_preserved() {
        let config = HeadingConfig::default().with_trust_explicit(false);
        let analyzer = HeadingAnalyzer::new(config);
        let paras = vec![
            make_paragraph("1. 서론", HeadingLevel::H2),
            make_paragraph("본문 내용입니다.", HeadingLevel::None),
            make_paragraph("2. 본론", HeadingLevel::H2),
        ];
        let para_refs: Vec<&Paragraph> = paras.iter().collect();
        let decisions = analyzer.analyze_paragraphs(&para_refs);
        assert!(
            matches!(decisions[0], HeadingDecision::Explicit(HeadingLevel::H2)),
            "First heading should be preserved: {:?}",
            decisions[0]
        );
        assert!(
            matches!(decisions[2], HeadingDecision::Explicit(HeadingLevel::H2)),
            "Third heading should be preserved: {:?}",
            decisions[2]
        );
    }

    #[test]
    fn test_numbered_heading_without_explicit_style() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("1. 서론", HeadingLevel::None);
        let decision = analyzer.decide_heading(&para);
        assert_eq!(decision, HeadingDecision::None);
    }

    #[test]
    fn test_numbered_heading_with_explicit_style_trusted() {
        let config = HeadingConfig::default();
        let analyzer = HeadingAnalyzer::new(config);
        let para = make_paragraph("1. 서론", HeadingLevel::H1);
        let decision = analyzer.decide_heading(&para);
        assert!(matches!(
            decision,
            HeadingDecision::Explicit(HeadingLevel::H1)
        ));
    }

    #[test]
    fn test_style_mapping_korean() {
        let config = HeadingConfig::default().with_default_style_mapping();
        let analyzer = HeadingAnalyzer::new(config);
        let mut para = make_paragraph("제목 내용입니다", HeadingLevel::None);
        para.style_name = Some("제목 1".to_string());
        let decision = analyzer.decide_heading(&para);
        assert!(
            matches!(decision, HeadingDecision::Explicit(HeadingLevel::H1)),
            "Korean style name should be recognized: {:?}",
            decision
        );
    }

    #[test]
    fn test_style_mapping_english() {
        let config = HeadingConfig::default().with_default_style_mapping();
        let analyzer = HeadingAnalyzer::new(config);
        let mut para = make_paragraph("Some heading text", HeadingLevel::None);
        para.style_name = Some("Heading 2".to_string());
        let decision = analyzer.decide_heading(&para);
        assert!(
            matches!(decision, HeadingDecision::Explicit(HeadingLevel::H2)),
            "English style name should be recognized: {:?}",
            decision
        );
    }

    #[test]
    fn test_style_mapping_takes_priority() {
        let config = HeadingConfig::default().with_default_style_mapping();
        let analyzer = HeadingAnalyzer::new(config);
        let mut para = make_paragraph("Title text", HeadingLevel::H3);
        para.style_name = Some("Title".to_string());
        let decision = analyzer.decide_heading(&para);
        assert!(
            matches!(decision, HeadingDecision::Explicit(HeadingLevel::H1)),
            "Style mapping should take priority: {:?}",
            decision
        );
    }

    #[test]
    fn test_style_id_fallback() {
        let config = HeadingConfig::default().with_default_style_mapping();
        let analyzer = HeadingAnalyzer::new(config);
        let mut para = make_paragraph("Heading text", HeadingLevel::None);
        para.style_id = Some("Heading3".to_string());
        para.style_name = None;
        let decision = analyzer.decide_heading(&para);
        assert!(
            matches!(decision, HeadingDecision::Explicit(HeadingLevel::H3)),
            "Style ID should be recognized: {:?}",
            decision
        );
    }
}