use std::{
ffi::OsStr,
io::{BufRead, BufReader},
};
use anyhow::{Context, Result};
use crate::{
formats::{self, ContentShape, FORMAT_SPECS, FormatSpec},
input::InputSource,
transform::{self, FormatKind, FormatOptions, TransformStrategy},
};
/// Upper bound, in bytes, on how much of an input is read while sniffing its format (1 MiB).
const SNIFF_BYTES: usize = 1 << 20;
/// Sniffing stops once this many non-empty lines have been examined.
const SNIFF_LINES: usize = 16;
/// Resolved description of how one input is handled: its content kind together with the
/// shape, load plan, and transform strategy that accompany that kind.
///
/// Values are produced from a matching `FormatSpec` (see `FormatSpec::profile`), normally via
/// `TypeProfile::resolve`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct TypeProfile {
/// Format kind selected for the input.
pub(crate) content: FormatKind,
/// Content shape copied from the format's spec.
pub(crate) shape: ContentShape,
/// Load plan copied from the format's spec.
pub(crate) load: crate::load::LoadPlan,
/// Transform strategy copied from the format's spec.
pub(crate) transform: TransformStrategy,
}
impl FormatSpec {
    /// Projects this spec onto the [`TypeProfile`] fields it determines.
    ///
    /// The spec's `kind` becomes the profile's `content`; `shape`, `load`, and `transform`
    /// carry over under the same names.
    fn profile(self) -> TypeProfile {
        let Self {
            kind,
            shape,
            load,
            transform,
            ..
        } = self;
        TypeProfile {
            content: kind,
            shape,
            load,
            transform,
        }
    }
}
impl TypeProfile {
    /// Determines the profile to use for `source`.
    ///
    /// Precedence, highest first:
    /// 1. an explicit (non-`Auto`) kind in `options`;
    /// 2. the kind mapped from the source path's extension;
    /// 3. content sniffing: a sample that looks like a record stream is JSONL, otherwise the
    ///    first non-whitespace byte picks XML (`<`), JSON (`{` or `[`), or plain text.
    ///
    /// # Errors
    ///
    /// Only the sniffing step performs I/O; its failures are returned to the caller.
    pub(crate) fn resolve(source: &InputSource, options: &FormatOptions) -> Result<Self> {
        // Steps 1 and 2 need no I/O, so settle them before opening the source.
        let declared = if options.kind == FormatKind::Auto {
            extension_kind(source)
        } else {
            Some(options.kind)
        };
        if let Some(kind) = declared {
            return Ok(explicit_profile(kind));
        }

        let sample = TypeSample::read(source)?;
        let sniffed = if sample.looks_like_record_stream() {
            FormatKind::Jsonl
        } else {
            match sample.first_non_ws {
                Some(b'<') => FormatKind::Xml,
                Some(b'{' | b'[') => FormatKind::Json,
                _ => FormatKind::Plain,
            }
        };
        Ok(explicit_profile(sniffed))
    }

    /// Builds formatting options that use this profile's content kind and the given `indent`.
    pub(crate) fn format_options(self, indent: usize) -> FormatOptions {
        let kind = self.content;
        FormatOptions { kind, indent }
    }
}
/// Looks up the [`FormatSpec`] registered for `kind` and converts it into a [`TypeProfile`].
///
/// # Panics
///
/// Panics if no entry in `FORMAT_SPECS` has this kind. Callers are expected to have replaced
/// `FormatKind::Auto` with a concrete kind beforehand.
fn explicit_profile(kind: FormatKind) -> TypeProfile {
    let matching = FORMAT_SPECS.iter().find(|spec| spec.kind == kind).copied();
    match matching {
        Some(spec) => spec.profile(),
        None => unreachable!("auto must be resolved before building a type profile"),
    }
}
fn extension_kind(source: &InputSource) -> Option<FormatKind> {
let extension = source
.path()
.extension()
.and_then(OsStr::to_str)
.map(str::to_ascii_lowercase)?;
formats::kind_for_extension(&extension)
}
/// Statistics gathered from the leading portion of an input while sniffing its format.
#[derive(Default)]
struct TypeSample {
/// First byte in the sampled region that is not ASCII whitespace, if one was seen.
first_non_ws: Option<u8>,
/// Number of sampled lines that were still non-empty after trimming.
non_empty_lines: usize,
/// How many of those non-empty lines `transform::parseable_record_line` accepted.
parseable_record_lines: usize,
}
impl TypeSample {
    /// Samples the start of `source`, line by line, until `SNIFF_BYTES` bytes or `SNIFF_LINES`
    /// non-empty lines have been consumed, or the input ends.
    ///
    /// # Errors
    ///
    /// Fails if the source cannot be opened, or if reading fails (the latter is reported with
    /// a "failed to inspect <label>" context).
    fn read(source: &InputSource) -> Result<Self> {
        let mut reader = BufReader::new(source.open()?);
        let mut sample = Self::default();
        let mut buffer = Vec::with_capacity(8192);
        // Byte budget still available; a single over-long line is cut off at this boundary.
        let mut remaining = SNIFF_BYTES;

        while remaining > 0 && sample.non_empty_lines < SNIFF_LINES {
            buffer.clear();
            let consumed = read_line_limited(&mut reader, &mut buffer, remaining)
                .with_context(|| format!("failed to inspect {}", source.label()))?;
            if consumed == 0 {
                break;
            }
            remaining = remaining.saturating_sub(consumed);

            // Only the first non-whitespace byte of the whole sample is remembered.
            sample.first_non_ws = sample.first_non_ws.or_else(|| {
                buffer
                    .iter()
                    .copied()
                    .find(|byte| !byte.is_ascii_whitespace())
            });

            let record = trim_ascii_ws(transform::trim_record_line_end(&buffer));
            if !record.is_empty() {
                sample.non_empty_lines += 1;
                sample.parseable_record_lines +=
                    usize::from(transform::parseable_record_line(record));
            }
        }
        Ok(sample)
    }

    /// Reports whether the sample looks like one record per line: at least two non-empty
    /// lines were seen and every one of them was a parseable record.
    fn looks_like_record_stream(&self) -> bool {
        let lines = self.non_empty_lines;
        lines >= 2 && self.parseable_record_lines == lines
    }
}
/// Appends bytes from `reader` to `line` until a newline has been copied, `limit` bytes have
/// been taken, or the reader is exhausted, whichever comes first.
///
/// When the newline is reached it is included in `line`. Existing contents of `line` are
/// left in place. Returns the number of bytes appended by this call; `0` means end of input
/// (or a `limit` of zero).
///
/// # Errors
///
/// Propagates I/O errors from the reader, except `ErrorKind::Interrupted`, which is retried
/// in the same way the standard `BufRead` line readers do.
fn read_line_limited<R: BufRead>(
    reader: &mut R,
    line: &mut Vec<u8>,
    limit: usize,
) -> Result<usize> {
    let mut total = 0_usize;
    while total < limit {
        let available = match reader.fill_buf() {
            Ok(bytes) => bytes,
            // A signal interrupting the read is transient; try again rather than failing.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err.into()),
        };
        if available.is_empty() {
            break;
        }
        let budget = limit - total;
        // Only look for the newline inside the bytes that may still be taken.
        let newline = available
            .iter()
            .take(budget)
            .position(|&byte| byte == b'\n');
        let (take, hit_newline) = match newline {
            Some(index) => (index + 1, true),
            None => (available.len().min(budget), false),
        };
        line.extend_from_slice(&available[..take]);
        reader.consume(take);
        total += take;
        if hit_newline {
            break;
        }
    }
    Ok(total)
}
/// Returns `bytes` without leading and trailing ASCII whitespace, as defined by
/// `u8::is_ascii_whitespace`. Input that is empty or entirely whitespace yields an empty
/// slice.
fn trim_ascii_ws(bytes: &[u8]) -> &[u8] {
    let is_content = |byte: &u8| !byte.is_ascii_whitespace();
    match bytes.iter().position(is_content) {
        // Nothing but whitespace (or nothing at all): the empty tail of the input.
        None => &bytes[bytes.len()..],
        Some(start) => {
            let end = bytes
                .iter()
                .rposition(is_content)
                .map_or(start, |last| last + 1);
            &bytes[start..end]
        }
    }
}
#[cfg(test)]
mod tests {
    use std::io::Write;
    use crate::load::LoadPlan;
    use tempfile::{Builder as TempFileBuilder, NamedTempFile};
    use super::*;

    /// Writes `contents` into `temp` and opens the file as an [`InputSource`].
    ///
    /// The temp file handle is handed back so callers can keep it alive next to the source.
    fn populate(mut temp: NamedTempFile, contents: &[u8]) -> (NamedTempFile, InputSource) {
        temp.write_all(contents).unwrap();
        temp.flush().unwrap();
        let input = InputSource::from_arg(temp.path().to_str().unwrap(), None).unwrap();
        (temp, input)
    }

    /// Temp-file source with no particular file-name suffix.
    fn source(contents: &[u8]) -> (NamedTempFile, InputSource) {
        populate(NamedTempFile::new().unwrap(), contents)
    }

    /// Temp-file source whose file name ends in `suffix`.
    fn source_with_suffix(contents: &[u8], suffix: &str) -> (NamedTempFile, InputSource) {
        populate(
            TempFileBuilder::new().suffix(suffix).tempfile().unwrap(),
            contents,
        )
    }

    /// Resolves `input` with auto-detection and the indent of 2 used throughout these tests.
    fn resolve_auto(input: &InputSource) -> TypeProfile {
        let options = FormatOptions {
            kind: FormatKind::Auto,
            indent: 2,
        };
        TypeProfile::resolve(input, &options).unwrap()
    }

    /// Checks the content kind, shape, and load plan of `profile`, in that order.
    #[track_caller]
    fn assert_layout(
        profile: TypeProfile,
        content: FormatKind,
        shape: ContentShape,
        load: LoadPlan,
    ) {
        assert_eq!(profile.content, content);
        assert_eq!(profile.shape, shape);
        assert_eq!(profile.load, load);
    }

    /// Checks that `profile` is the line-indexed, eagerly indexed, passthrough profile for
    /// `content`.
    #[track_caller]
    fn assert_line_indexed_passthrough(profile: TypeProfile, content: FormatKind) {
        assert_layout(
            profile,
            content,
            ContentShape::LineIndexed,
            LoadPlan::EagerIndexedSource,
        );
        assert_eq!(profile.transform, TransformStrategy::Passthrough);
    }

    /// Two JSON records, the first of which is by itself longer than the sniff window.
    fn oversized_first_record_stream() -> Vec<u8> {
        let mut data = b"{\"message\":\"".to_vec();
        data.resize(data.len() + SNIFF_BYTES + 1024, b'a');
        data.extend_from_slice(b"\"}\n{\"ok\":true}\n");
        data
    }

    #[test]
    fn resolves_plain_extension_to_passthrough_profile() {
        let (_guard, input) = source_with_suffix(b"plain\n", ".txt");
        assert_line_indexed_passthrough(resolve_auto(&input), FormatKind::Plain);
    }

    #[test]
    fn resolves_jinja_extension_to_template_profile() {
        let (_guard, input) = source_with_suffix(b"<h1>{{ title }}</h1>\n", ".html.j2");
        assert_line_indexed_passthrough(resolve_auto(&input), FormatKind::Jinja);
    }

    #[test]
    fn resolves_toml_extension_to_passthrough_profile() {
        let (_guard, input) =
            source_with_suffix(b"[package]\nname = \"fmtview\"\n", ".toml");
        assert_line_indexed_passthrough(resolve_auto(&input), FormatKind::Toml);
    }

    #[test]
    fn resolves_markdown_extension_to_passthrough_profile() {
        let (_guard, input) = source_with_suffix(b"# fmtview\n\n- fast viewer\n", ".md");
        assert_line_indexed_passthrough(resolve_auto(&input), FormatKind::Markdown);
    }

    #[test]
    fn unknown_textual_content_falls_back_to_plain_profile() {
        let (_guard, input) = source_with_suffix(b"hello world\nnot json\n", ".weird");
        assert_line_indexed_passthrough(resolve_auto(&input), FormatKind::Plain);
    }

    #[test]
    fn resolves_record_stream_to_lazy_jsonl_profile() {
        let (_guard, input) = source_with_suffix(b"{\"a\":1}\n{\"b\":2}\n", ".data");
        let profile = resolve_auto(&input);
        assert_layout(
            profile,
            FormatKind::Jsonl,
            ContentShape::RecordStream,
            LoadPlan::LazyTransformedRecords,
        );
        assert_eq!(profile.transform, TransformStrategy::RecordPrettyPrint);
    }

    #[test]
    fn explicit_format_kinds_choose_profile_without_sniffing() {
        // The content is deliberately broken JSON: an explicit kind must win regardless.
        let (_guard, input) = source(b"{\"broken\":\n");
        let record_stream = (
            ContentShape::RecordStream,
            LoadPlan::LazyTransformedRecords,
            TransformStrategy::RecordPrettyPrint,
        );
        let whole_document = (
            ContentShape::WholeDocument,
            LoadPlan::EagerTransformedDocument,
            TransformStrategy::PrettyPrint,
        );
        let line_indexed = (
            ContentShape::LineIndexed,
            LoadPlan::EagerIndexedSource,
            TransformStrategy::Passthrough,
        );
        let cases = [
            (FormatKind::Jsonl, record_stream),
            (FormatKind::Json, whole_document),
            (FormatKind::Xml, whole_document),
            (FormatKind::Toml, line_indexed),
            (FormatKind::Markdown, line_indexed),
            (FormatKind::Plain, line_indexed),
            (FormatKind::Jinja, line_indexed),
        ];
        for (kind, (shape, load, transform)) in cases {
            let options = FormatOptions { kind, indent: 2 };
            let profile = TypeProfile::resolve(&input, &options).unwrap();
            assert_layout(profile, kind, shape, load);
            assert_eq!(profile.transform, transform);
        }
    }

    #[test]
    fn resolves_jsonl_extension_before_sampling() {
        let (_guard, input) = source_with_suffix(&oversized_first_record_stream(), ".jsonl");
        assert_layout(
            resolve_auto(&input),
            FormatKind::Jsonl,
            ContentShape::RecordStream,
            LoadPlan::LazyTransformedRecords,
        );
    }

    #[test]
    fn keeps_truncated_record_prefix_as_eager_json_document_without_extension() {
        let (_guard, input) = source(&oversized_first_record_stream());
        assert_layout(
            resolve_auto(&input),
            FormatKind::Json,
            ContentShape::WholeDocument,
            LoadPlan::EagerTransformedDocument,
        );
    }

    #[test]
    fn keeps_multiline_documents_eager() {
        let (_json_guard, json_input) =
            source(b"{\n \"items\": [\n {\"a\": 1}\n ]\n}\n");
        let (_xml_guard, xml_input) = source(b"<root>\n <item>one</item>\n</root>\n");
        let json = resolve_auto(&json_input);
        let xml = resolve_auto(&xml_input);
        assert_layout(
            json,
            FormatKind::Json,
            ContentShape::WholeDocument,
            LoadPlan::EagerTransformedDocument,
        );
        assert_layout(
            xml,
            FormatKind::Xml,
            ContentShape::WholeDocument,
            LoadPlan::EagerTransformedDocument,
        );
    }

    #[test]
    fn keeps_single_line_json_document_eager() {
        let (_guard, input) = source(b"{\"items\":[{\"a\":1},{\"b\":2}]}\n");
        assert_layout(
            resolve_auto(&input),
            FormatKind::Json,
            ContentShape::WholeDocument,
            LoadPlan::EagerTransformedDocument,
        );
    }
}