1use crate::normalize::normalize_document;
12use crate::types::{
13 DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17
/// Name this extractor reports via `DocumentExtractor::name` and records in
/// every `ExtractOutput` it produces.
const EXTRACTOR_NAME: &str = "html";
/// Version tag this extractor reports via `DocumentExtractor::version` and
/// records in every `ExtractOutput` it produces.
const EXTRACTOR_VERSION: &str = "v1";
20
/// Field-less `DocumentExtractor` implementation for `html` / `htm` files.
pub struct HtmlExtractor;
22
23impl DocumentExtractor for HtmlExtractor {
24 fn name(&self) -> &'static str { EXTRACTOR_NAME }
25 fn version(&self) -> &'static str { EXTRACTOR_VERSION }
26 fn supported_extensions(&self) -> &'static [&'static str] { &["html", "htm"] }
27
28 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
29 let content = std::fs::read_to_string(&path.canonical)?;
30 let blocks = extract_blocks(&content);
31 let mut segments = Vec::new();
32 let mut total_chars = 0u64;
33 let mut line = 1u32;
34
35 for block in &blocks {
36 let norm = normalize_document(&block.text);
37 if norm.trim().is_empty() { line += 1; continue; }
38 total_chars += norm.len() as u64;
39 segments.push(ExtractedSegment {
40 kind: block.kind,
41 text: norm,
42 line_start: line,
43 line_end: line,
44 heading_path: block.heading.clone(),
45 location_quality: LocationQuality::Approximate,
46 });
47 line += 1;
48 }
49
50 Ok(ExtractOutput {
51 extractor_name: EXTRACTOR_NAME.to_string(),
52 extractor_version: EXTRACTOR_VERSION.to_string(),
53 normalization_version: NORMALIZATION_VERSION.to_string(),
54 segments,
55 char_count: total_chars,
56 })
57 }
58}
59
/// One run of text pulled out of the HTML by `extract_blocks`, before
/// normalization.
struct Block {
    /// Trimmed text content of the block.
    text: String,
    /// Whether the block came from a heading element or is ordinary text.
    kind: SegmentKind,
    /// Heading path (titles joined with `" > "`) in effect for this block, or
    /// `None` when no heading has been seen yet.
    heading: Option<String>,
}
65
/// Tags that terminate the text accumulated so far, so that content on either
/// side of them ends up in separate blocks.
const BLOCK_TAGS: &[&str] = &["p", "div", "section", "article", "aside",
    "h1", "h2", "h3", "h4", "h5", "h6", "li", "dt", "dd",
    "td", "th", "caption", "blockquote", "pre", "br"];
/// Heading tags; the digit after the `h` is used as the heading level.
const HEADING_TAGS: &[&str] = &["h1", "h2", "h3", "h4", "h5", "h6"];
/// Elements whose entire content is excluded from the extracted text.
const SKIP_TAGS: &[&str] = &["script", "style", "head", "noscript", "template"];
73
74fn extract_blocks(html: &str) -> Vec<Block> {
75 let mut blocks = Vec::new();
76 let mut current_text = String::new();
77 let mut current_heading: Option<String> = None;
78 let mut heading_trail: Vec<String> = Vec::new();
79 let mut skip_depth: usize = 0;
80 let mut skip_tag = "";
81 let mut pos = 0;
82 let chars: Vec<char> = html.chars().collect();
83 let n = chars.len();
84
85 let flush = |text: &mut String, heading: &Option<String>, blocks: &mut Vec<Block>| {
86 let t = text.trim().to_string();
87 if !t.is_empty() {
88 blocks.push(Block { text: t, kind: SegmentKind::Paragraph, heading: heading.clone() });
89 }
90 text.clear();
91 };
92
93 while pos < n {
94 if chars[pos] == '<' {
95 let mut tag_end = pos + 1;
97 while tag_end < n && chars[tag_end] != '>' { tag_end += 1; }
98 let tag_str: String = chars[pos..tag_end.min(n)].iter().collect();
99 let is_close = tag_str.starts_with("</");
100 let tag_name = tag_str
101 .trim_start_matches('<')
102 .trim_start_matches('/')
103 .split_whitespace()
104 .next()
105 .unwrap_or("")
106 .to_ascii_lowercase();
107
108 if skip_depth > 0 {
109 if is_close && tag_name == skip_tag { skip_depth -= 1; }
110 else if !is_close && tag_name == skip_tag { skip_depth += 1; }
111 } else if !is_close && SKIP_TAGS.contains(&tag_name.as_str()) {
112 skip_tag = SKIP_TAGS.iter().find(|&&t| t == tag_name.as_str()).copied().unwrap_or("");
113 skip_depth = 1;
114 flush(&mut current_text, ¤t_heading, &mut blocks);
115 } else if is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
116 let heading_text = current_text.trim().to_string();
118 if !heading_text.is_empty() {
119 let level = tag_name[1..].parse::<usize>().unwrap_or(1);
120 heading_trail.truncate(level.saturating_sub(1));
121 heading_trail.push(heading_text.clone());
122 current_heading = Some(heading_trail.join(" > "));
123 blocks.push(Block {
124 text: heading_text,
125 kind: SegmentKind::Heading,
126 heading: current_heading.clone(),
127 });
128 current_text.clear();
129 }
130 } else if BLOCK_TAGS.contains(&tag_name.as_str()) {
131 flush(&mut current_text, ¤t_heading, &mut blocks);
132 if !is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
133 let level = tag_name[1..].parse::<usize>().unwrap_or(1);
134 heading_trail.truncate(level.saturating_sub(1));
135 }
136 }
137 pos = tag_end + 1;
138 } else if skip_depth == 0 {
139 let c = chars[pos];
141 if c == '&' {
142 let semi = chars[pos..].iter().position(|&x| x == ';').map(|p| pos + p);
143 if let Some(end) = semi {
144 let entity: String = chars[pos..=end].iter().collect();
145 match entity.as_str() {
146 "&" => current_text.push('&'),
147 "<" => current_text.push('<'),
148 ">" => current_text.push('>'),
149 """ | "“" | "”" => current_text.push('"'),
150 "'" | "‘" | "’" => current_text.push('\''),
151 " " | " " => current_text.push(' '),
152 _ => current_text.push_str(&entity),
153 }
154 pos = end + 1;
155 continue;
156 }
157 }
158 current_text.push(c);
159 pos += 1;
160 } else {
161 pos += 1;
162 }
163 }
164 flush(&mut current_text, ¤t_heading, &mut blocks);
165 blocks
166}