use alloc::collections::BTreeMap;
use alloc::string::String;
use alloc::vec::Vec;
/// Coarse content category assigned to a file by the classifier.
///
/// Every class carries a fixed set of storage hints, exposed through
/// [`DataClass::compression_level`], [`DataClass::replication_factor`] and
/// [`DataClass::recommended_tier`].
///
/// The derived `Ord` follows declaration order, so reordering the variants
/// changes how `BTreeMap<DataClass, _>` keys are sorted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum DataClass {
    /// Fallback class for content that no rule has claimed.
    Unknown,
    /// Document formats (the built-in rules use e.g. `*.txt`, `*.pdf`).
    Document,
    /// Image, audio and video files (e.g. `*.jpg`, `*.mp4`).
    Media,
    /// Source code (e.g. `*.rs`, `*.c`).
    Code,
    /// Database files and SQL dumps (e.g. `*.db`, `*.sql`).
    Database,
    /// Configuration and system files (e.g. `*.conf`, `/etc/`).
    System,
    /// Archive containers (e.g. `*.zip`, `*.tar`).
    Archive,
    /// Scratch data (e.g. `*.tmp`, `/tmp/`).
    Temporary,
    /// Data given the highest compression level and replication factor.
    Critical,
}

impl DataClass {
    /// Returns the variant name as a static string.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::Unknown => "Unknown",
            Self::Document => "Document",
            Self::Media => "Media",
            Self::Code => "Code",
            Self::Database => "Database",
            Self::System => "System",
            Self::Archive => "Archive",
            Self::Temporary => "Temporary",
            Self::Critical => "Critical",
        }
    }

    /// Returns the compression level hint for this class.
    ///
    /// Values used by this table range from 0 (`Media`, `Archive`) to 9
    /// (`Critical`).
    pub fn compression_level(&self) -> u8 {
        match *self {
            Self::Media | Self::Archive => 0,
            Self::Temporary => 1,
            Self::Database => 3,
            Self::Unknown | Self::System => 5,
            Self::Document | Self::Code => 6,
            Self::Critical => 9,
        }
    }

    /// Returns the replication factor hint for this class (1, 2 or 3).
    pub fn replication_factor(&self) -> u8 {
        match *self {
            Self::Unknown | Self::Media | Self::Archive | Self::Temporary => 1,
            Self::Document | Self::Code | Self::System => 2,
            Self::Database | Self::Critical => 3,
        }
    }

    /// Returns the name of the storage tier recommended for this class.
    pub fn recommended_tier(&self) -> &'static str {
        match *self {
            Self::Unknown | Self::Document => "Standard",
            Self::Media => "Archive",
            Self::Code | Self::Database | Self::System | Self::Critical => "Hot",
            Self::Archive => "Cold",
            Self::Temporary => "Temp",
        }
    }
}
/// A single path-matching rule that maps files to a [`DataClass`].
#[derive(Debug, Clone)]
pub struct ClassificationRule {
    /// Identifier of the rule.
    pub name: String,
    /// Pattern text; see [`ClassificationRule::matches`] for the supported forms.
    pub pattern: String,
    /// Class assigned to files that match this rule.
    pub class: DataClass,
    /// Rule priority; [`ClassificationRule::new`] sets it to 50.
    pub priority: u8,
}

impl ClassificationRule {
    /// Creates a rule with the default priority of 50.
    pub fn new(name: String, pattern: String, class: DataClass) -> Self {
        Self {
            name,
            pattern,
            class,
            priority: 50,
        }
    }

    /// Reports whether `path` is covered by this rule's pattern.
    ///
    /// Three pattern forms are recognised, checked in this order:
    ///
    /// * `*.ext` — extension match: `path` must end with `.ext`.
    /// * `/prefix` — prefix match: `path` must start with the pattern.
    /// * anything else — substring match: `path` must contain the pattern.
    ///
    /// Matching is case-sensitive.
    pub fn matches(&self, path: &str) -> bool {
        let pattern = self.pattern.as_str();
        match pattern.strip_prefix('*') {
            // Only the `*` is stripped so the dot stays part of the suffix.
            // Comparing against the bare extension would let `*.c` match
            // `/home/music` and `*.rs` match `users`.
            Some(suffix) if suffix.starts_with('.') => path.ends_with(suffix),
            _ if pattern.starts_with('/') => path.starts_with(pattern),
            _ => path.contains(pattern),
        }
    }
}
/// A key/value label attached to a classified file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Tag {
    /// Tag name.
    pub key: String,
    /// Tag payload.
    pub value: String,
}

impl Tag {
    /// Builds a tag from an owned key and value.
    pub fn new(key: String, value: String) -> Self {
        Tag { key, value }
    }
}
/// Classification record for one file.
#[derive(Debug, Clone)]
pub struct ClassifiedFile {
    /// Identifier of the dataset the file belongs to.
    pub dataset_id: u64,
    /// Offset identifying the file within its dataset.
    pub offset: u64,
    /// Path the file was classified under.
    pub path: String,
    /// File size.
    pub size: u64,
    /// Class assigned to the file; `DataClass::Unknown` until set.
    pub class: DataClass,
    /// Tags attached to the file; at most one entry per key when modified
    /// only through [`ClassifiedFile::add_tag`].
    pub tags: Vec<Tag>,
    /// Timestamp supplied when the record was created.
    pub classified_at: u64,
    /// Initialised to the creation timestamp.
    pub last_access: u64,
    /// Initialised to zero.
    pub access_count: u64,
}

impl ClassifiedFile {
    /// Creates an unclassified record with no tags.
    ///
    /// `timestamp` seeds both `classified_at` and `last_access`.
    pub fn new(dataset_id: u64, offset: u64, path: String, size: u64, timestamp: u64) -> Self {
        ClassifiedFile {
            dataset_id,
            offset,
            path,
            size,
            class: DataClass::Unknown,
            tags: Vec::new(),
            classified_at: timestamp,
            last_access: timestamp,
            access_count: 0,
        }
    }

    /// Sets `key` to `value`, overwriting the value if the key already exists.
    pub fn add_tag(&mut self, key: String, value: String) {
        match self.tags.iter().position(|existing| existing.key == key) {
            Some(index) => self.tags[index].value = value,
            None => self.tags.push(Tag::new(key, value)),
        }
    }

    /// Returns the value stored under `key`, if any.
    pub fn get_tag(&self, key: &str) -> Option<&str> {
        for tag in &self.tags {
            if tag.key == key {
                return Some(tag.value.as_str());
            }
        }
        None
    }

    /// Reports whether a tag with the given key is present.
    pub fn has_tag(&self, key: &str) -> bool {
        self.get_tag(key).is_some()
    }
}
/// Running counters maintained by `DataClassifier`.
#[derive(Debug, Clone, Default)]
pub struct ClassificationStats {
/// Incremented once per `DataClassifier::classify_file` call.
pub files_classified: u64,
/// Per-class count of `classify_file` calls, keyed by the class each call produced.
pub by_class: BTreeMap<DataClass, u64>,
/// Tags written: 4 for each rule match plus 1 for each successful
/// `DataClassifier::add_tag` call.
pub tags_applied: u64,
/// Number of `classify_file` calls in which a rule matched.
pub rules_matched: u64,
}
/// Rule-based file classifier that keeps a record per classified file.
pub struct DataClassifier {
/// Rules, re-sorted by descending priority every time `add_rule` is called.
rules: Vec<ClassificationRule>,
/// Classified files keyed by `(dataset_id, offset)`.
files: BTreeMap<(u64, u64), ClassifiedFile>,
/// Counters updated by `classify_file` and `add_tag`.
stats: ClassificationStats,
}
impl Default for DataClassifier {
fn default() -> Self {
Self::new()
}
}
impl DataClassifier {
pub fn new() -> Self {
let mut classifier = Self {
rules: Vec::new(),
files: BTreeMap::new(),
stats: ClassificationStats::default(),
};
classifier.add_default_rules();
classifier
}
fn add_default_rules(&mut self) {
self.add_rule(ClassificationRule::new(
"text".into(),
"*.txt".into(),
DataClass::Document,
));
self.add_rule(ClassificationRule::new(
"pdf".into(),
"*.pdf".into(),
DataClass::Document,
));
self.add_rule(ClassificationRule::new(
"doc".into(),
"*.doc".into(),
DataClass::Document,
));
self.add_rule(ClassificationRule::new(
"docx".into(),
"*.docx".into(),
DataClass::Document,
));
self.add_rule(ClassificationRule::new(
"jpeg".into(),
"*.jpg".into(),
DataClass::Media,
));
self.add_rule(ClassificationRule::new(
"png".into(),
"*.png".into(),
DataClass::Media,
));
self.add_rule(ClassificationRule::new(
"mp4".into(),
"*.mp4".into(),
DataClass::Media,
));
self.add_rule(ClassificationRule::new(
"mp3".into(),
"*.mp3".into(),
DataClass::Media,
));
self.add_rule(ClassificationRule::new(
"rust".into(),
"*.rs".into(),
DataClass::Code,
));
self.add_rule(ClassificationRule::new(
"c".into(),
"*.c".into(),
DataClass::Code,
));
self.add_rule(ClassificationRule::new(
"cpp".into(),
"*.cpp".into(),
DataClass::Code,
));
self.add_rule(ClassificationRule::new(
"python".into(),
"*.py".into(),
DataClass::Code,
));
self.add_rule(ClassificationRule::new(
"sqlite".into(),
"*.db".into(),
DataClass::Database,
));
self.add_rule(ClassificationRule::new(
"sql".into(),
"*.sql".into(),
DataClass::Database,
));
self.add_rule(ClassificationRule::new(
"config".into(),
"*.conf".into(),
DataClass::System,
));
self.add_rule(ClassificationRule::new(
"ini".into(),
"*.ini".into(),
DataClass::System,
));
self.add_rule(ClassificationRule::new(
"etc".into(),
"/etc/".into(),
DataClass::System,
));
self.add_rule(ClassificationRule::new(
"zip".into(),
"*.zip".into(),
DataClass::Archive,
));
self.add_rule(ClassificationRule::new(
"tar".into(),
"*.tar".into(),
DataClass::Archive,
));
self.add_rule(ClassificationRule::new(
"gz".into(),
"*.gz".into(),
DataClass::Archive,
));
self.add_rule(ClassificationRule::new(
"tmp".into(),
"*.tmp".into(),
DataClass::Temporary,
));
self.add_rule(ClassificationRule::new(
"temp_dir".into(),
"/tmp/".into(),
DataClass::Temporary,
));
self.add_rule(ClassificationRule::new(
"backup".into(),
"*.bak".into(),
DataClass::Critical,
));
self.add_rule(ClassificationRule::new(
"backup_dir".into(),
"/backup/".into(),
DataClass::Critical,
));
}
/// Registers `rule` and re-sorts the rule list by descending priority.
///
/// The sort is stable, so rules with equal priority stay in the order they
/// were added.
pub fn add_rule(&mut self, rule: ClassificationRule) {
    self.rules.push(rule);
    // `Reverse` flips the key comparison: highest priority first.
    self.rules
        .sort_by_key(|candidate| core::cmp::Reverse(candidate.priority));
}
/// Classifies a file by its path and stores the resulting record.
///
/// The first rule whose pattern matches `path` decides the class. On a match
/// the record receives the tags `rule`, `compression`, `replication` and
/// `tier`, and a `[ CLASSIFY ]` line is printed. When no rule matches, the
/// record keeps `DataClass::Unknown` and gets no tags.
///
/// The record is inserted under `(dataset_id, offset)`, replacing any record
/// already stored under that key. The statistics counters are incremented on
/// every call, including such replacements.
///
/// # Errors
///
/// The current implementation always returns `Ok`.
pub fn classify_file(
    &mut self,
    dataset_id: u64,
    offset: u64,
    path: String,
    size: u64,
    timestamp: u64,
) -> Result<DataClass, &'static str> {
    // The record takes ownership of `path`; matching and logging borrow it
    // back from the record, so the string is never duplicated.
    let mut file = ClassifiedFile::new(dataset_id, offset, path, size, timestamp);

    // `rules` is sorted by descending priority, so the first hit wins.
    let matched = self.rules.iter().find(|rule| rule.matches(&file.path));
    if let Some(rule) = matched {
        let class = rule.class;
        file.class = class;
        self.stats.rules_matched += 1;

        file.add_tag("rule".into(), rule.name.clone());
        file.add_tag(
            "compression".into(),
            alloc::format!("{}", class.compression_level()),
        );
        file.add_tag(
            "replication".into(),
            alloc::format!("{}", class.replication_factor()),
        );
        file.add_tag("tier".into(), class.recommended_tier().into());
        self.stats.tags_applied += 4;

        crate::lcpfs_println!(
            "[ CLASSIFY ] {} -> {} (rule: {})",
            file.path.as_str(),
            class.name(),
            rule.name
        );
    }

    let class = file.class;
    self.files.insert((dataset_id, offset), file);
    self.stats.files_classified += 1;
    *self.stats.by_class.entry(class).or_insert(0) += 1;
    Ok(class)
}
/// Sets a tag on an already classified file.
///
/// `tags_applied` is incremented on every successful call, including calls
/// that overwrite an existing key.
///
/// # Errors
///
/// Returns `"File not found"` when no record exists for
/// `(dataset_id, offset)`.
pub fn add_tag(
    &mut self,
    dataset_id: u64,
    offset: u64,
    key: String,
    value: String,
) -> Result<(), &'static str> {
    match self.files.get_mut(&(dataset_id, offset)) {
        Some(record) => {
            record.add_tag(key, value);
            self.stats.tags_applied += 1;
            Ok(())
        }
        None => Err("File not found"),
    }
}
/// Looks up the record stored under `(dataset_id, offset)`.
pub fn get_file(&self, dataset_id: u64, offset: u64) -> Option<&ClassifiedFile> {
    let key = (dataset_id, offset);
    self.files.get(&key)
}
/// Returns every recorded file whose class equals `class`, in key order.
pub fn get_files_by_class(&self, class: DataClass) -> Vec<&ClassifiedFile> {
    let mut matching = Vec::new();
    for record in self.files.values() {
        if record.class == class {
            matching.push(record);
        }
    }
    matching
}
/// Returns every recorded file that carries tag `key` with exactly `value`,
/// in key order.
pub fn get_files_by_tag(&self, key: &str, value: &str) -> Vec<&ClassifiedFile> {
    let mut matching = Vec::new();
    for record in self.files.values() {
        let tagged = record.get_tag(key).map_or(false, |found| found == value);
        if tagged {
            matching.push(record);
        }
    }
    matching
}
/// Returns a snapshot (clone) of the current statistics.
pub fn get_stats(&self) -> ClassificationStats {
    let snapshot = &self.stats;
    snapshot.clone()
}
/// Sums the `size` of every recorded file, grouped by class.
///
/// Classes with no recorded files are absent from the result.
pub fn storage_by_class(&self) -> BTreeMap<DataClass, u64> {
    self.files
        .values()
        .fold(BTreeMap::<DataClass, u64>::new(), |mut totals, record| {
            *totals.entry(record.class).or_insert(0) += record.size;
            totals
        })
}
/// Derives the placement policy for a recorded file from its class.
///
/// Returns `None` when no record exists for `(dataset_id, offset)`.
pub fn get_placement_policy(&self, dataset_id: u64, offset: u64) -> Option<PlacementPolicy> {
    self.files.get(&(dataset_id, offset)).map(|record| {
        let class = record.class;
        PlacementPolicy {
            class,
            tier: class.recommended_tier().into(),
            replication_factor: class.replication_factor(),
            compression_level: class.compression_level(),
        }
    })
}
}
/// Storage hints for one file, as produced by `DataClassifier::get_placement_policy`.
#[derive(Debug, Clone)]
pub struct PlacementPolicy {
/// Value of `DataClass::compression_level` for the file's class.
pub compression_level: u8,
/// Value of `DataClass::replication_factor` for the file's class.
pub replication_factor: u8,
/// Value of `DataClass::recommended_tier` for the file's class, as an owned string.
pub tier: String,
/// Class the policy was derived from.
pub class: DataClass,
}
// Unit tests covering the class tables, rule matching, tagging and the
// classifier's bookkeeping.
#[cfg(test)]
mod tests {
use super::*;
// Spot-checks entries of the compression and replication tables.
#[test]
fn test_data_class_properties() {
assert_eq!(DataClass::Media.compression_level(), 0); assert_eq!(DataClass::Code.compression_level(), 6); assert_eq!(DataClass::Critical.replication_factor(), 3); assert_eq!(DataClass::Temporary.replication_factor(), 1); }
// `*.ext` patterns match on the end of the path, with or without directories.
#[test]
fn test_classification_rule_extension() {
let rule = ClassificationRule::new("rust".into(), "*.rs".into(), DataClass::Code);
assert!(rule.matches("main.rs"));
assert!(rule.matches("/src/lib.rs"));
assert!(!rule.matches("main.c"));
}
// Patterns starting with `/` match as a path prefix.
#[test]
fn test_classification_rule_path() {
let rule = ClassificationRule::new("etc".into(), "/etc/".into(), DataClass::System);
assert!(rule.matches("/etc/config.conf"));
assert!(rule.matches("/etc/ssh/sshd_config"));
assert!(!rule.matches("/home/user/file.txt"));
}
// Adding a tag with an existing key overwrites the value instead of appending.
#[test]
fn test_tag_operations() {
let mut file = ClassifiedFile::new(1, 0x1000, "test.txt".into(), 1024, 1000);
file.add_tag("owner".into(), "alice".into());
file.add_tag("project".into(), "demo".into());
assert_eq!(file.tags.len(), 2);
assert_eq!(file.get_tag("owner"), Some("alice"));
assert_eq!(file.get_tag("project"), Some("demo"));
assert_eq!(file.get_tag("missing"), None);
file.add_tag("owner".into(), "bob".into());
assert_eq!(file.tags.len(), 2);
assert_eq!(file.get_tag("owner"), Some("bob"));
}
// A fresh classifier comes with the built-in rules installed.
#[test]
fn test_classifier_creation() {
let classifier = DataClassifier::new();
assert!(!classifier.rules.is_empty());
}
// A `.rs` path is classified as Code and tagged with the matching rule's name.
#[test]
fn test_classify_rust_file() {
let mut classifier = DataClassifier::new();
let class = classifier
.classify_file(1, 0x1000, "main.rs".into(), 1024, 1000)
.expect("test: operation should succeed");
assert_eq!(class, DataClass::Code);
let file = classifier
.get_file(1, 0x1000)
.expect("test: operation should succeed");
assert_eq!(file.class, DataClass::Code);
assert_eq!(file.get_tag("rule"), Some("rust"));
}
// A `.pdf` path is classified as Document; the compression tag holds the
// class's level rendered as text.
#[test]
fn test_classify_document() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "report.pdf".into(), 50000, 1000)
.expect("test: operation should succeed");
let file = classifier
.get_file(1, 0x1000)
.expect("test: operation should succeed");
assert_eq!(file.class, DataClass::Document);
assert_eq!(file.get_tag("compression"), Some("6"));
}
// A `.mp4` path is classified as Media, whose compression level is 0.
#[test]
fn test_classify_media() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "video.mp4".into(), 1_000_000, 1000)
.expect("test: operation should succeed");
let file = classifier
.get_file(1, 0x1000)
.expect("test: operation should succeed");
assert_eq!(file.class, DataClass::Media);
assert_eq!(file.get_tag("compression"), Some("0")); }
// A path under `/etc/` is classified as System via the prefix rule.
#[test]
fn test_classify_system() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "/etc/fstab".into(), 512, 1000)
.expect("test: operation should succeed");
let file = classifier
.get_file(1, 0x1000)
.expect("test: operation should succeed");
assert_eq!(file.class, DataClass::System);
}
// `/tmp/cache.tmp` satisfies both the `*.tmp` and the `/tmp/` rule; both map
// to Temporary.
#[test]
fn test_classify_temporary() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "/tmp/cache.tmp".into(), 2048, 1000)
.expect("test: operation should succeed");
let file = classifier
.get_file(1, 0x1000)
.expect("test: operation should succeed");
assert_eq!(file.class, DataClass::Temporary);
assert_eq!(file.get_tag("replication"), Some("1"));
}
// Filtering by class returns only the records of that class.
#[test]
fn test_get_files_by_class() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "main.rs".into(), 1024, 1000)
.expect("test: operation should succeed");
classifier
.classify_file(2, 0x2000, "lib.rs".into(), 2048, 1000)
.expect("test: operation should succeed");
classifier
.classify_file(3, 0x3000, "video.mp4".into(), 50000, 1000)
.expect("test: operation should succeed");
let code_files = classifier.get_files_by_class(DataClass::Code);
assert_eq!(code_files.len(), 2);
let media_files = classifier.get_files_by_class(DataClass::Media);
assert_eq!(media_files.len(), 1);
}
// Filtering by tag requires both key and value to match.
#[test]
fn test_get_files_by_tag() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "main.rs".into(), 1024, 1000)
.expect("test: operation should succeed");
classifier
.add_tag(1, 0x1000, "project".into(), "myapp".into())
.expect("test: operation should succeed");
classifier
.classify_file(2, 0x2000, "lib.rs".into(), 2048, 1000)
.expect("test: operation should succeed");
classifier
.add_tag(2, 0x2000, "project".into(), "myapp".into())
.expect("test: operation should succeed");
classifier
.classify_file(3, 0x3000, "test.rs".into(), 512, 1000)
.expect("test: operation should succeed");
classifier
.add_tag(3, 0x3000, "project".into(), "other".into())
.expect("test: operation should succeed");
let myapp_files = classifier.get_files_by_tag("project", "myapp");
assert_eq!(myapp_files.len(), 2);
}
// Per-class storage is the sum of file sizes: 1024 + 2048 = 3072 for Code.
#[test]
fn test_storage_breakdown() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "main.rs".into(), 1024, 1000)
.expect("test: operation should succeed");
classifier
.classify_file(2, 0x2000, "lib.rs".into(), 2048, 1000)
.expect("test: operation should succeed");
classifier
.classify_file(3, 0x3000, "video.mp4".into(), 50000, 1000)
.expect("test: operation should succeed");
let breakdown = classifier.storage_by_class();
assert_eq!(breakdown.get(&DataClass::Code), Some(&3072)); assert_eq!(breakdown.get(&DataClass::Media), Some(&50000));
}
// The placement policy mirrors the Critical class's table entries.
#[test]
fn test_placement_policy() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "backup.bak".into(), 100000, 1000)
.expect("test: operation should succeed");
let policy = classifier
.get_placement_policy(1, 0x1000)
.expect("test: operation should succeed");
assert_eq!(policy.class, DataClass::Critical);
assert_eq!(policy.compression_level, 9);
assert_eq!(policy.replication_factor, 3);
assert_eq!(policy.tier, "Hot");
}
// Counters advance once per classified file and once per matched rule.
#[test]
fn test_statistics() {
let mut classifier = DataClassifier::new();
classifier
.classify_file(1, 0x1000, "main.rs".into(), 1024, 1000)
.expect("test: operation should succeed");
classifier
.classify_file(2, 0x2000, "video.mp4".into(), 50000, 1000)
.expect("test: operation should succeed");
let stats = classifier.get_stats();
assert_eq!(stats.files_classified, 2);
assert_eq!(stats.rules_matched, 2);
assert!(stats.tags_applied > 0);
assert_eq!(stats.by_class.get(&DataClass::Code), Some(&1));
assert_eq!(stats.by_class.get(&DataClass::Media), Some(&1));
}
// A user-supplied rule with priority 100 sorts ahead of the priority-50
// built-in rules and classifies a path under its prefix.
#[test]
fn test_custom_rule() {
let mut classifier = DataClassifier::new();
let mut rule = ClassificationRule::new(
"critical_data".into(),
"/important/".into(),
DataClass::Critical,
);
rule.priority = 100;
classifier.add_rule(rule);
classifier
.classify_file(1, 0x1000, "/important/data.bin".into(), 1024, 1000)
.expect("test: operation should succeed");
let file = classifier
.get_file(1, 0x1000)
.expect("test: operation should succeed");
assert_eq!(file.class, DataClass::Critical);
}
}