use crate::normalize::normalize_document;
use crate::types::{
DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
read_error_category,
};
use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
use orbok_fs::ValidatedPath;
/// Extractor for UTF-8 plain-text and source-code files.
///
/// This is a stateless unit struct; all behavior comes from its
/// `DocumentExtractor` implementation.
pub struct PlainTextExtractor;
impl DocumentExtractor for PlainTextExtractor {
fn name(&self) -> &'static str {
"plain_text"
}
fn version(&self) -> &'static str {
"text-v1"
}
fn supported_extensions(&self) -> &'static [&'static str] {
&[
"txt", "log", "csv", "rs", "py", "js", "ts", "jsx", "tsx", "java", "c", "h", "cpp",
"hpp", "go", "rb", "php", "sh", "bash", "sql", "toml", "yaml", "yml", "json", "xml",
"css", "html", "htm",
]
}
fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
let bytes = std::fs::read(&path.canonical).map_err(|e| OrbokError::Extraction {
category: read_error_category(&e),
message: e.to_string(),
})?;
let raw = String::from_utf8(bytes).map_err(|_| OrbokError::Extraction {
category: ErrorCategory::EncodingError,
message: "file is not valid UTF-8".into(),
})?;
let normalized = normalize_document(&raw);
let segments = segment_paragraphs(&normalized);
let char_count = normalized.chars().count() as u64;
Ok(ExtractOutput {
extractor_name: self.name().into(),
extractor_version: self.version().into(),
normalization_version: NORMALIZATION_VERSION.into(),
segments,
char_count,
})
}
}
/// Splits normalized text into paragraph segments.
///
/// A paragraph is a maximal run of consecutive lines that are non-empty after
/// trimming whitespace. Blank (or whitespace-only) lines separate paragraphs and
/// are never part of a segment. Each paragraph is handed to `flush`, which emits
/// it together with its 1-based, inclusive line range.
///
/// Returns the segments in document order. Empty or all-blank input yields an
/// empty vector.
pub(crate) fn segment_paragraphs(normalized: &str) -> Vec<ExtractedSegment> {
    let mut segments = Vec::new();
    let mut current: Vec<&str> = Vec::new();
    let mut start_line = 0u32;
    // 1-based number of the line being visited. After the loop it equals the total
    // number of lines, so the text does not have to be scanned a second time to
    // find the last line. Saturating arithmetic replaces the lossy `usize as u32`
    // cast, so pathologically long inputs neither wrap nor panic.
    let mut line_no = 0u32;

    for line in normalized.lines() {
        line_no = line_no.saturating_add(1);
        if line.trim().is_empty() {
            // The paragraph (if any) ended on the previous line. `line_no >= 1`
            // here, so the subtraction cannot underflow.
            flush(&mut segments, &mut current, start_line, line_no - 1);
        } else {
            if current.is_empty() {
                // First line of a new paragraph.
                start_line = line_no;
            }
            current.push(line);
        }
    }

    // A paragraph still open at end of input ends on the final line.
    flush(&mut segments, &mut current, start_line, line_no);
    segments
}
/// Emits the buffered lines as a single paragraph segment and empties the buffer.
///
/// Does nothing when `current` is empty. Otherwise the buffered lines are joined
/// with `\n` and pushed onto `segments` as a `SegmentKind::Paragraph` covering
/// lines `start..=end`, with no heading path and `LocationQuality::Exact`.
fn flush(
    segments: &mut Vec<ExtractedSegment>,
    current: &mut Vec<&str>,
    start: u32,
    end: u32,
) {
    if !current.is_empty() {
        let text = current.join("\n");
        // Reset the buffer so the caller can start collecting the next paragraph.
        current.clear();
        segments.push(ExtractedSegment {
            kind: SegmentKind::Paragraph,
            text,
            line_start: start,
            line_end: end,
            heading_path: None,
            location_quality: LocationQuality::Exact,
        });
    }
}