1use crate::normalize::normalize_document;
12use crate::types::{
13 DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
14};
15use orbok_core::{OrbokResult, versions::NORMALIZATION_VERSION};
16use orbok_fs::ValidatedPath;
17
/// Identifier returned by `HtmlExtractor::name` and copied into
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "html";
/// Version tag returned by `HtmlExtractor::version` and copied into
/// `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
20
/// Stateless `DocumentExtractor` for files with the `html` / `htm` extensions.
pub struct HtmlExtractor;
22
23impl DocumentExtractor for HtmlExtractor {
24 fn name(&self) -> &'static str {
25 EXTRACTOR_NAME
26 }
27 fn version(&self) -> &'static str {
28 EXTRACTOR_VERSION
29 }
30 fn supported_extensions(&self) -> &'static [&'static str] {
31 &["html", "htm"]
32 }
33
34 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
35 let content = std::fs::read_to_string(&path.canonical)?;
36 let blocks = extract_blocks(&content);
37 let mut segments = Vec::new();
38 let mut total_chars = 0u64;
39 let mut line = 1u32;
40
41 for block in &blocks {
42 let norm = normalize_document(&block.text);
43 if norm.trim().is_empty() {
44 line += 1;
45 continue;
46 }
47 total_chars += norm.len() as u64;
48 segments.push(ExtractedSegment {
49 kind: block.kind,
50 text: norm,
51 line_start: line,
52 line_end: line,
53 heading_path: block.heading.clone(),
54 location_quality: LocationQuality::Approximate,
55 });
56 line += 1;
57 }
58
59 Ok(ExtractOutput {
60 extractor_name: EXTRACTOR_NAME.to_string(),
61 extractor_version: EXTRACTOR_VERSION.to_string(),
62 normalization_version: NORMALIZATION_VERSION.to_string(),
63 segments,
64 char_count: total_chars,
65 })
66 }
67}
68
/// One run of text produced by `extract_blocks`, before normalization.
struct Block {
    /// Trimmed text content of the block (never empty when emitted).
    text: String,
    /// `SegmentKind::Heading` for heading elements, `SegmentKind::Paragraph`
    /// for every other block.
    kind: SegmentKind,
    /// Heading trail in effect for this block, joined with `" > "`;
    /// `None` until the first non-empty heading has been seen.
    heading: Option<String>,
}
74
/// Tags that end the pending text run: seeing one of these (opening or
/// closing) flushes the accumulated text as a paragraph block.
const BLOCK_TAGS: &[&str] = &[
    "p",
    "div",
    "section",
    "article",
    "aside",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "li",
    "dt",
    "dd",
    "td",
    "th",
    "caption",
    "blockquote",
    "pre",
    "br",
];
/// Heading tags: the text collected up to the closing tag becomes a heading
/// block and extends the heading trail. The digit in the name is the level.
const HEADING_TAGS: &[&str] = &["h1", "h2", "h3", "h4", "h5", "h6"];
/// Elements whose entire content is discarded (nothing inside them is emitted).
const SKIP_TAGS: &[&str] = &["script", "style", "head", "noscript", "template"];
101
102fn extract_blocks(html: &str) -> Vec<Block> {
103 let mut blocks = Vec::new();
104 let mut current_text = String::new();
105 let mut current_heading: Option<String> = None;
106 let mut heading_trail: Vec<String> = Vec::new();
107 let mut skip_depth: usize = 0;
108 let mut skip_tag = "";
109 let mut pos = 0;
110 let chars: Vec<char> = html.chars().collect();
111 let n = chars.len();
112
113 let flush = |text: &mut String, heading: &Option<String>, blocks: &mut Vec<Block>| {
114 let t = text.trim().to_string();
115 if !t.is_empty() {
116 blocks.push(Block {
117 text: t,
118 kind: SegmentKind::Paragraph,
119 heading: heading.clone(),
120 });
121 }
122 text.clear();
123 };
124
125 while pos < n {
126 if chars[pos] == '<' {
127 let mut tag_end = pos + 1;
129 while tag_end < n && chars[tag_end] != '>' {
130 tag_end += 1;
131 }
132 let tag_str: String = chars[pos..tag_end.min(n)].iter().collect();
133 let is_close = tag_str.starts_with("</");
134 let tag_name = tag_str
135 .trim_start_matches('<')
136 .trim_start_matches('/')
137 .split_whitespace()
138 .next()
139 .unwrap_or("")
140 .to_ascii_lowercase();
141
142 if skip_depth > 0 {
143 if is_close && tag_name == skip_tag {
144 skip_depth -= 1;
145 } else if !is_close && tag_name == skip_tag {
146 skip_depth += 1;
147 }
148 } else if !is_close && SKIP_TAGS.contains(&tag_name.as_str()) {
149 skip_tag = SKIP_TAGS
150 .iter()
151 .find(|&&t| t == tag_name.as_str())
152 .copied()
153 .unwrap_or("");
154 skip_depth = 1;
155 flush(&mut current_text, ¤t_heading, &mut blocks);
156 } else if is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
157 let heading_text = current_text.trim().to_string();
159 if !heading_text.is_empty() {
160 let level = tag_name[1..].parse::<usize>().unwrap_or(1);
161 heading_trail.truncate(level.saturating_sub(1));
162 heading_trail.push(heading_text.clone());
163 current_heading = Some(heading_trail.join(" > "));
164 blocks.push(Block {
165 text: heading_text,
166 kind: SegmentKind::Heading,
167 heading: current_heading.clone(),
168 });
169 current_text.clear();
170 }
171 } else if BLOCK_TAGS.contains(&tag_name.as_str()) {
172 flush(&mut current_text, ¤t_heading, &mut blocks);
173 if !is_close && HEADING_TAGS.contains(&tag_name.as_str()) {
174 let level = tag_name[1..].parse::<usize>().unwrap_or(1);
175 heading_trail.truncate(level.saturating_sub(1));
176 }
177 }
178 pos = tag_end + 1;
179 } else if skip_depth == 0 {
180 let c = chars[pos];
182 if c == '&' {
183 let semi = chars[pos..].iter().position(|&x| x == ';').map(|p| pos + p);
184 if let Some(end) = semi {
185 let entity: String = chars[pos..=end].iter().collect();
186 match entity.as_str() {
187 "&" => current_text.push('&'),
188 "<" => current_text.push('<'),
189 ">" => current_text.push('>'),
190 """ | "“" | "”" => current_text.push('"'),
191 "'" | "‘" | "’" => current_text.push('\''),
192 " " | " " => current_text.push(' '),
193 _ => current_text.push_str(&entity),
194 }
195 pos = end + 1;
196 continue;
197 }
198 }
199 current_text.push(c);
200 pos += 1;
201 } else {
202 pos += 1;
203 }
204 }
205 flush(&mut current_text, ¤t_heading, &mut blocks);
206 blocks
207}