use crate::toolkit::FastScanner;
use std::path::Path;
/// RDF serialization formats this module can identify.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RdfFormat {
    /// Turtle (`ttl`, `text/turtle`).
    Turtle,
    /// N-Triples (`nt`, `application/n-triples`).
    NTriples,
    /// N-Quads (`nq`, `application/n-quads`).
    NQuads,
    /// TriG (`trig`, `application/trig`).
    TriG,
}

impl RdfFormat {
    /// Single lookup table behind the public accessors, laid out as
    /// `(file extension, MIME type, display name)`.
    fn profile(self) -> (&'static str, &'static str, &'static str) {
        match self {
            Self::Turtle => ("ttl", "text/turtle", "Turtle"),
            Self::NTriples => ("nt", "application/n-triples", "N-Triples"),
            Self::NQuads => ("nq", "application/n-quads", "N-Quads"),
            Self::TriG => ("trig", "application/trig", "TriG"),
        }
    }

    /// Returns the file extension for this format, without a leading dot.
    pub fn extension(&self) -> &'static str {
        let (extension, _, _) = self.profile();
        extension
    }

    /// Returns the MIME type string for this format.
    pub fn mime_type(&self) -> &'static str {
        let (_, mime_type, _) = self.profile();
        mime_type
    }

    /// Returns the human-readable name of this format.
    pub fn name(&self) -> &'static str {
        let (_, _, name) = self.profile();
        name
    }
}
/// Outcome of a detection attempt: the format that was chosen, how strongly
/// it is believed, and which strategy produced the answer.
#[derive(Debug, Clone)]
pub struct DetectionResult {
/// The RDF serialization that was identified.
pub format: RdfFormat,
/// Strength of the match. Extension matches report `0.9` and MIME-type
/// matches report `0.95`. Content analysis reports its raw heuristic score,
/// which is not capped and may exceed `1.0`. Combined results are capped
/// at `1.0`.
pub confidence: f64,
/// The strategy that produced this result.
pub method: DetectionMethod,
}
/// Identifies which detection strategy produced a [`DetectionResult`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionMethod {
/// The format was inferred from a file extension.
FileExtension,
/// The format was inferred from a MIME type string.
MimeType,
/// The format was inferred by inspecting a sample of the content bytes.
ContentAnalysis,
/// Several individual results were merged into one. A lone result keeps
/// its own method and is not relabelled as `Combined`.
Combined,
}
/// Detects the RDF serialization of a document from its file extension,
/// its MIME type, a sample of its content, or any combination of these.
#[derive(Debug, Clone)]
pub struct FormatDetector {
// Maximum number of leading content bytes handed to content analysis;
// longer inputs are truncated to this length before scanning.
sample_size: usize,
// Minimum top score content analysis must reach before it reports a
// format; below this, content-based detection yields `None`.
min_confidence: f64,
}
impl Default for FormatDetector {
fn default() -> Self {
Self::new()
}
}
impl FormatDetector {
    /// Number of leading content bytes examined unless overridden.
    const DEFAULT_SAMPLE_SIZE: usize = 4096;
    /// Score content analysis must reach to report a format unless overridden.
    const DEFAULT_MIN_CONFIDENCE: f64 = 0.6;
    /// Maximum number of statement lines examined by structural analysis.
    const MAX_STRUCTURE_LINES: usize = 10;

    /// Creates a detector that samples the first 4096 bytes of content and
    /// requires a content score of at least `0.6`.
    pub fn new() -> Self {
        Self {
            sample_size: Self::DEFAULT_SAMPLE_SIZE,
            min_confidence: Self::DEFAULT_MIN_CONFIDENCE,
        }
    }

    /// Sets how many leading bytes of content are analysed.
    pub fn with_sample_size(mut self, size: usize) -> Self {
        self.sample_size = size;
        self
    }

    /// Sets the minimum score required for content analysis to report a
    /// format. The value is clamped to `0.0..=1.0`.
    pub fn with_min_confidence(mut self, confidence: f64) -> Self {
        self.min_confidence = confidence.clamp(0.0, 1.0);
        self
    }

    /// Detects the format from the extension of `path`.
    ///
    /// Returns `None` when the path has no extension, the extension is not
    /// valid UTF-8, or the extension is not recognised.
    pub fn detect_from_path(&self, path: &Path) -> Option<DetectionResult> {
        let extension = path.extension()?.to_str()?;
        self.detect_from_extension(extension)
    }

    /// Detects the format from a file extension given without the leading
    /// dot. Matching is case-insensitive.
    ///
    /// Recognised extensions report a confidence of `0.9`; anything else
    /// yields `None`.
    pub fn detect_from_extension(&self, extension: &str) -> Option<DetectionResult> {
        let format = match extension.to_lowercase().as_str() {
            "ttl" | "turtle" => RdfFormat::Turtle,
            "nt" | "ntriples" => RdfFormat::NTriples,
            "nq" | "nquads" => RdfFormat::NQuads,
            "trig" => RdfFormat::TriG,
            _ => return None,
        };
        Some(DetectionResult {
            format,
            confidence: 0.9,
            method: DetectionMethod::FileExtension,
        })
    }

    /// Detects the format from a MIME type string.
    ///
    /// Matching is a case-insensitive substring search, so parameters such
    /// as `; charset=utf-8` do not prevent a match. Recognised types report
    /// a confidence of `0.95`; anything else yields `None`.
    pub fn detect_from_mime_type(&self, mime_type: &str) -> Option<DetectionResult> {
        let mime_lower = mime_type.to_lowercase();
        let mentions = |needle: &str| mime_lower.contains(needle);
        // Order matters: the first matching family wins.
        let format = if mentions("turtle") {
            RdfFormat::Turtle
        } else if mentions("n-triples") || mentions("ntriples") {
            RdfFormat::NTriples
        } else if mentions("n-quads") || mentions("nquads") {
            RdfFormat::NQuads
        } else if mentions("trig") {
            RdfFormat::TriG
        } else {
            return None;
        };
        Some(DetectionResult {
            format,
            confidence: 0.95,
            method: DetectionMethod::MimeType,
        })
    }

    /// Detects the format by inspecting at most `sample_size` leading bytes
    /// of `content`.
    ///
    /// Returns `None` when no format reaches the configured minimum score.
    /// The reported confidence is the winning format's heuristic score.
    pub fn detect_from_content(&self, content: &[u8]) -> Option<DetectionResult> {
        let sample = &content[..content.len().min(self.sample_size)];
        let scanner = FastScanner::new(sample);
        let mut scores = FormatScores::default();
        // Directives run first: they are the strongest single signal and the
        // later passes add to (or zero out) what they establish.
        self.analyze_directives(&scanner, &mut scores);
        self.analyze_syntax(&scanner, &mut scores);
        self.analyze_structure(&scanner, &mut scores);
        scores.determine_format(self.min_confidence)
    }

    /// Runs every detection strategy for which an input was supplied and
    /// merges the outcomes.
    ///
    /// A single successful strategy is returned unchanged; several are merged
    /// into one result tagged [`DetectionMethod::Combined`]. Returns `None`
    /// when no strategy produced a result.
    pub fn detect(
        &self,
        path: Option<&Path>,
        mime_type: Option<&str>,
        content: Option<&[u8]>,
    ) -> Option<DetectionResult> {
        let mut results = Vec::with_capacity(3);
        results.extend(path.and_then(|p| self.detect_from_path(p)));
        results.extend(mime_type.and_then(|m| self.detect_from_mime_type(m)));
        results.extend(content.and_then(|c| self.detect_from_content(c)));
        self.combine_results(results)
    }

    /// Returns `true` when `head` starts with `keyword` (compared without
    /// regard to ASCII case) immediately followed by whitespace or `<`.
    ///
    /// Requiring the delimiter keeps ordinary words such as `BASELINE` or a
    /// prefixed name such as `base:thing` from being read as a directive.
    fn starts_with_directive(head: &[u8], keyword: &[u8]) -> bool {
        if head.len() <= keyword.len() {
            return false;
        }
        let (word, rest) = head.split_at(keyword.len());
        word.eq_ignore_ascii_case(keyword) && (rest[0].is_ascii_whitespace() || rest[0] == b'<')
    }

    /// Looks for the first Turtle/TriG directive at the start of a line.
    ///
    /// A line starting with `@` or with a SPARQL-style `PREFIX`/`BASE`
    /// keyword raises the Turtle and TriG scores and zeroes the N-Triples and
    /// N-Quads scores, since those formats have no directives.
    fn analyze_directives(&self, scanner: &FastScanner, scores: &mut FormatScores) {
        let len = scanner.len();
        let mut pos = 0;
        let mut has_prefix_directive = false;
        while pos < len {
            pos = scanner.skip_whitespace_and_comments(pos);
            if pos >= len {
                break;
            }
            if scanner.byte_at(pos) == Some(b'@') {
                scores.turtle += 0.8;
                scores.trig += 0.5;
                has_prefix_directive = true;
                break;
            }
            // Longest keyword ("PREFIX") plus one delimiter byte. The end is
            // clamped so the slice never reaches past the sample.
            let head: &[u8] = scanner.slice(pos, (pos + 7).min(len));
            // Turtle treats the SPARQL-style keywords case-insensitively.
            if Self::starts_with_directive(head, b"PREFIX")
                || Self::starts_with_directive(head, b"BASE")
            {
                scores.turtle += 0.7;
                scores.trig += 0.4;
                has_prefix_directive = true;
                break;
            }
            match scanner.find_line_end(pos) {
                Some(newline) => pos = newline + 1,
                None => break,
            }
        }
        if has_prefix_directive {
            scores.ntriples = 0.0;
            scores.nquads = 0.0;
        }
    }

    /// Scores punctuation that only Turtle/TriG use (`;`, `,`, `[`, `]`) and
    /// braces, which favour TriG and halve the Turtle score.
    ///
    /// Bytes inside `<...>` IRIs, double-quoted literals and `#` comments are
    /// ignored, so punctuation in data (for example JSON inside a literal)
    /// is not mistaken for syntax. Lexical state is reset at every newline so
    /// one malformed line cannot affect the rest of the sample; as a result
    /// the continuation lines of a multi-line literal are scanned as syntax.
    fn analyze_syntax(&self, scanner: &FastScanner, scores: &mut FormatScores) {
        // Lexical context of the byte currently being examined.
        #[derive(Clone, Copy)]
        enum Context {
            Code,
            Iri,
            Literal,
            Escape,
            Comment,
        }

        let content: &[u8] = scanner.slice(0, scanner.len());
        let mut context = Context::Code;
        let mut has_curly_braces = false;
        for &byte in content {
            context = match (context, byte) {
                (_, b'\n') => Context::Code,
                (Context::Code, b'<') => Context::Iri,
                (Context::Code, b'"') => Context::Literal,
                (Context::Code, b'#') => Context::Comment,
                (Context::Code, _) => {
                    match byte {
                        b';' => {
                            scores.turtle += 0.1;
                            scores.trig += 0.1;
                        }
                        b',' => {
                            scores.turtle += 0.05;
                            scores.trig += 0.05;
                        }
                        b'[' | b']' => {
                            scores.turtle += 0.08;
                            scores.trig += 0.08;
                        }
                        b'{' | b'}' => {
                            has_curly_braces = true;
                            scores.trig += 0.5;
                        }
                        _ => {}
                    }
                    Context::Code
                }
                (Context::Iri, b'>') => Context::Code,
                (Context::Literal, b'"') => Context::Code,
                (Context::Literal, b'\\') => Context::Escape,
                // The byte after a backslash is consumed as part of the literal.
                (Context::Escape, _) => Context::Literal,
                (unchanged, _) => unchanged,
            };
        }
        if has_curly_braces {
            scores.turtle *= 0.5;
        }
    }

    /// Examines up to [`Self::MAX_STRUCTURE_LINES`] statement lines.
    ///
    /// Any prefixed name marks the content as Turtle/TriG and rules out the
    /// line-based formats. Otherwise, if at least one line starts with `<`
    /// or `_`, N-Triples is credited when more than 70% of the lines have
    /// three elements and N-Quads when more than 70% have four.
    fn analyze_structure(&self, scanner: &FastScanner, scores: &mut FormatScores) {
        let len = scanner.len();
        let mut pos = 0;
        let mut line_count: usize = 0;
        let mut triple_lines: usize = 0;
        let mut quad_lines: usize = 0;
        let mut has_prefixed_names = false;
        let mut has_full_iris = false;
        while pos < len && line_count < Self::MAX_STRUCTURE_LINES {
            pos = scanner.skip_whitespace_and_comments(pos);
            if pos >= len {
                break;
            }
            let line_end = scanner.find_line_end(pos).unwrap_or(len);
            let line: &[u8] = scanner.slice(pos, line_end);
            if self.has_prefixed_name(line) {
                has_prefixed_names = true;
            }
            // A leading `<` (IRI) or `_` (blank node label) is how subjects
            // start in the line-based formats.
            let first = scanner.skip_whitespace(pos);
            if first < len && matches!(scanner.byte_at(first), Some(b'<') | Some(b'_')) {
                has_full_iris = true;
            }
            match self.count_line_elements(line) {
                3 => triple_lines += 1,
                4 => quad_lines += 1,
                _ => {}
            }
            line_count += 1;
            pos = line_end + 1;
        }
        if has_prefixed_names {
            scores.turtle += 0.4;
            scores.trig += 0.4;
            scores.ntriples = 0.0;
            scores.nquads = 0.0;
            return;
        }
        if line_count > 0 && has_full_iris {
            let triple_ratio = triple_lines as f64 / line_count as f64;
            let quad_ratio = quad_lines as f64 / line_count as f64;
            if triple_ratio > 0.7 {
                scores.ntriples += 0.6;
            }
            if quad_ratio > 0.7 {
                scores.nquads += 0.7;
            }
        }
    }

    /// Reports whether `line` contains a prefixed name such as `ex:thing`:
    /// an ASCII-alphabetic run immediately followed by `:` and a
    /// non-whitespace byte.
    ///
    /// `<...>` IRIs and double-quoted literals are skipped, and scanning
    /// stops at a `#` comment, so a URL mentioned in a trailing comment is
    /// not mistaken for a prefixed name. A namespace declaration (`ex:`
    /// followed by whitespace) does not count.
    fn has_prefixed_name(&self, line: &[u8]) -> bool {
        let mut i = 0;
        while i < line.len() {
            match line[i] {
                // Everything after an unquoted `#` is a comment.
                b'#' => return false,
                b'<' => {
                    while i < line.len() && line[i] != b'>' {
                        i += 1;
                    }
                    i += 1;
                }
                b'"' => {
                    i += 1;
                    while i < line.len() && line[i] != b'"' {
                        // A backslash escapes the following byte.
                        i += if line[i] == b'\\' { 2 } else { 1 };
                    }
                    i += 1;
                }
                byte if byte.is_ascii_alphabetic() => {
                    while i < line.len() && (line[i].is_ascii_alphanumeric() || line[i] == b'_') {
                        i += 1;
                    }
                    if line.get(i) == Some(&b':') {
                        if let Some(next) = line.get(i + 1) {
                            if !next.is_ascii_whitespace() {
                                return true;
                            }
                        }
                    }
                    // Otherwise `i` rests on the byte that ended the run and
                    // the next iteration handles it.
                }
                _ => i += 1,
            }
        }
        false
    }

    /// Counts the whitespace-separated elements on `line`, not counting a
    /// standalone statement terminator (`.`).
    ///
    /// Whitespace inside `<...>` IRIs and double-quoted literals does not
    /// split an element, and a backslash-escaped quote does not end a
    /// literal. A literal counts as an element even when it consists of
    /// nothing but the quoted string. Counting stops at an unquoted `#`
    /// comment, and a carriage return is treated as whitespace so CRLF line
    /// endings do not attach to the final element.
    fn count_line_elements(&self, line: &[u8]) -> usize {
        let mut count = 0;
        let mut token_start: Option<usize> = None;
        let mut in_literal = false;
        let mut escaped = false;
        let mut in_iri = false;
        let mut end = line.len();
        for (i, &byte) in line.iter().enumerate() {
            if in_literal {
                if escaped {
                    escaped = false;
                } else if byte == b'\\' {
                    escaped = true;
                } else if byte == b'"' {
                    in_literal = false;
                }
                continue;
            }
            if in_iri {
                if byte == b'>' {
                    in_iri = false;
                }
                continue;
            }
            match byte {
                b' ' | b'\t' | b'\r' => {
                    if let Some(start) = token_start.take() {
                        if !matches!(&line[start..i], [b'.']) {
                            count += 1;
                        }
                    }
                }
                b'#' => {
                    end = i;
                    break;
                }
                _ => {
                    // Any visible byte opens an element, including the
                    // opening quote or bracket itself.
                    if token_start.is_none() {
                        token_start = Some(i);
                    }
                    match byte {
                        b'"' => in_literal = true,
                        b'<' => in_iri = true,
                        _ => {}
                    }
                }
            }
        }
        if let Some(start) = token_start {
            if !matches!(&line[start..end], [b'.']) {
                count += 1;
            }
        }
        count
    }

    /// Merges the results of individual strategies into one.
    ///
    /// An empty list yields `None` and a single result is returned as-is.
    /// Otherwise each result votes for its format with `confidence * weight`
    /// (MIME type 1.5, combined 1.3, extension 1.2, content 1.0). The format
    /// with the highest total wins and its confidence is the total divided by
    /// the number of results, capped at `1.0`. On a tie, the format that was
    /// voted for first wins, so the outcome is deterministic.
    fn combine_results(&self, results: Vec<DetectionResult>) -> Option<DetectionResult> {
        if results.len() <= 1 {
            return results.into_iter().next();
        }
        let vote_count = results.len() as f64;
        // Totals in first-seen order. There are only a handful of results,
        // so a linear search is simpler than a map and keeps ordering stable.
        let mut totals: Vec<(RdfFormat, f64)> = Vec::with_capacity(results.len());
        for result in &results {
            let weight = match result.method {
                DetectionMethod::MimeType => 1.5,
                DetectionMethod::FileExtension => 1.2,
                DetectionMethod::ContentAnalysis => 1.0,
                DetectionMethod::Combined => 1.3,
            };
            let weighted = result.confidence * weight;
            match totals.iter_mut().find(|entry| entry.0 == result.format) {
                Some(entry) => entry.1 += weighted,
                None => totals.push((result.format, weighted)),
            }
        }
        let mut best = totals[0];
        for &entry in &totals[1..] {
            if entry.1 > best.1 {
                best = entry;
            }
        }
        let (format, score) = best;
        Some(DetectionResult {
            format,
            confidence: (score / vote_count).min(1.0),
            method: DetectionMethod::Combined,
        })
    }
}
/// Per-format evidence totals accumulated while analysing content.
/// All totals start at zero.
#[derive(Debug, Default)]
struct FormatScores {
    turtle: f64,
    ntriples: f64,
    nquads: f64,
    trig: f64,
}

impl FormatScores {
    /// Picks the format with the highest total and reports it if that total
    /// is at least `min_confidence`; otherwise returns `None`.
    ///
    /// When totals are equal, the later candidate in the order Turtle,
    /// N-Triples, N-Quads, TriG is chosen. The reported confidence is the
    /// raw total and is not capped.
    fn determine_format(&self, min_confidence: f64) -> Option<DetectionResult> {
        let candidates = [
            (RdfFormat::Turtle, self.turtle),
            (RdfFormat::NTriples, self.ntriples),
            (RdfFormat::NQuads, self.nquads),
            (RdfFormat::TriG, self.trig),
        ];
        let mut leader = candidates[0];
        for &challenger in &candidates[1..] {
            // The leader is kept only when it is strictly greater;
            // incomparable values are treated as equal.
            let ordering = leader
                .1
                .partial_cmp(&challenger.1)
                .unwrap_or(std::cmp::Ordering::Equal);
            if ordering != std::cmp::Ordering::Greater {
                leader = challenger;
            }
        }
        let (format, score) = leader;
        if score >= min_confidence {
            Some(DetectionResult {
                format,
                confidence: score,
                method: DetectionMethod::ContentAnalysis,
            })
        } else {
            None
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs content-based detection with default settings and unwraps it.
    fn detect_content(bytes: &[u8]) -> DetectionResult {
        FormatDetector::new()
            .detect_from_content(bytes)
            .expect("detection should succeed")
    }

    /// The `ttl` extension maps to Turtle with high confidence.
    #[test]
    fn test_detect_turtle_from_extension() {
        let detected = FormatDetector::new()
            .detect_from_extension("ttl")
            .expect("detection should succeed");
        assert_eq!(detected.format, RdfFormat::Turtle);
        assert!(detected.confidence > 0.8);
    }

    /// The `nt` extension maps to N-Triples.
    #[test]
    fn test_detect_ntriples_from_extension() {
        let detected = FormatDetector::new()
            .detect_from_extension("nt")
            .expect("detection should succeed");
        assert_eq!(detected.format, RdfFormat::NTriples);
    }

    /// A path ending in `.ttl` is detected as Turtle.
    #[test]
    fn test_detect_from_path() {
        let detected = FormatDetector::new()
            .detect_from_path(Path::new("test.ttl"))
            .expect("detection should succeed");
        assert_eq!(detected.format, RdfFormat::Turtle);
    }

    /// A prefix directive followed by prefixed names is Turtle.
    #[test]
    fn test_detect_turtle_from_content() {
        let detected = detect_content(
            b"@prefix ex: <http://example.org/> .\nex:subject ex:predicate ex:object .",
        );
        assert_eq!(detected.format, RdfFormat::Turtle);
    }

    /// A graph block in braces is TriG.
    #[test]
    fn test_detect_trig_from_content() {
        let detected =
            detect_content(b"@prefix ex: <http://example.org/> .\nex:graph { ex:s ex:p ex:o . }");
        assert_eq!(detected.format, RdfFormat::TriG);
    }

    /// Lines of three absolute IRIs are N-Triples.
    #[test]
    fn test_detect_ntriples_from_content() {
        let detected = detect_content(
            b"<http://example.org/s> <http://example.org/p> <http://example.org/o> .\n<http://example.org/s2> <http://example.org/p2> <http://example.org/o2> .",
        );
        assert_eq!(detected.format, RdfFormat::NTriples);
    }

    /// MIME types map to their formats.
    #[test]
    fn test_detect_from_mime_type() {
        let detector = FormatDetector::new();
        let expectations = [
            ("text/turtle", RdfFormat::Turtle),
            ("application/n-triples", RdfFormat::NTriples),
        ];
        for (mime, expected) in expectations.iter() {
            let detected = detector
                .detect_from_mime_type(mime)
                .expect("detection should succeed");
            assert_eq!(detected.format, *expected);
        }
    }

    /// Agreeing extension and content evidence yields a confident Turtle result.
    #[test]
    fn test_combined_detection() {
        let sample: &[u8] = b"@prefix ex: <http://example.org/> .";
        let detected = FormatDetector::new()
            .detect(Some(Path::new("test.ttl")), None, Some(sample))
            .expect("detection should succeed");
        assert_eq!(detected.format, RdfFormat::Turtle);
        assert!(detected.confidence > 0.8);
    }

    /// Metadata accessors return the expected values for Turtle.
    #[test]
    fn test_format_properties() {
        let turtle = RdfFormat::Turtle;
        assert_eq!(turtle.extension(), "ttl");
        assert_eq!(turtle.mime_type(), "text/turtle");
        assert_eq!(turtle.name(), "Turtle");
    }
}