Skip to main content

synapse_core/ingest/
extractor.rs

1use anyhow::Result;
2
/// Outcome of running an extractor over a piece of content.
///
/// The common traits are derived so results can be logged (`Debug`),
/// duplicated (`Clone`), compared in tests (`PartialEq`/`Eq`), and created
/// empty (`Default`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ExtractionResult {
    /// Extracted `(subject, predicate, object)` triples, in the order the
    /// extractor emitted them.
    pub triples: Vec<(String, String, String)>,
}
6
7pub trait Extractor {
8    fn extract(&self, content: &str) -> Result<ExtractionResult>;
9}
10
/// Extractor for delimiter-separated tabular text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvExtractor {
    /// Byte that separates fields within a row.
    pub delimiter: u8,
}

impl CsvExtractor {
    /// Creates an extractor that splits fields on a comma (`,`).
    pub fn new() -> Self {
        Self { delimiter: b',' }
    }
}

impl Default for CsvExtractor {
    /// Same as [`CsvExtractor::new`]: a comma-delimited extractor.
    ///
    /// Implemented by hand rather than derived because a derived default
    /// would set `delimiter` to `0`, which is not a useful separator.
    fn default() -> Self {
        Self::new()
    }
}
20
21impl Extractor for CsvExtractor {
22    fn extract(&self, content: &str) -> Result<ExtractionResult> {
23        let mut rdr = csv::ReaderBuilder::new()
24            .delimiter(self.delimiter)
25            .from_reader(content.as_bytes());
26        
27        let headers = rdr.headers()?.clone();
28        let mut triples = Vec::new();
29        
30        for result in rdr.records() {
31            let record = result?;
32            if let Some(subject) = record.get(0) {
33                if subject.trim().is_empty() { continue; }
34                
35                for (i, value) in record.iter().enumerate().skip(1) {
36                    if let Some(predicate) = headers.get(i) {
37                        let val_trimmed = value.trim();
38                        if !val_trimmed.is_empty() {
39                            triples.push((
40                                subject.to_string(),
41                                predicate.to_string(),
42                                val_trimmed.to_string()
43                            ));
44                        }
45                    }
46                }
47            }
48        }
49        
50        Ok(ExtractionResult { triples })
51    }
52}
53
/// Extractor for Markdown-style text: `#` headings, `- `/`* ` bullet items,
/// and `key: value` lines.
///
/// The type carries no state; the common traits are derived so it can be
/// logged, copied, compared, and built with `Default`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct MarkdownExtractor;
55
56impl Extractor for MarkdownExtractor {
57    fn extract(&self, content: &str) -> Result<ExtractionResult> {
58        let mut triples = Vec::new();
59        let mut current_header = String::new();
60        
61        for line in content.lines() {
62            let trimmed = line.trim();
63            if trimmed.is_empty() { continue; }
64            
65            if trimmed.starts_with("#") {
66                current_header = trimmed.trim_start_matches('#').trim().to_string();
67            } else if trimmed.starts_with("- ") || trimmed.starts_with("* ") {
68                if !current_header.is_empty() {
69                    let item = trimmed[2..].trim();
70                    if !item.is_empty() {
71                        triples.push((
72                            current_header.clone(),
73                            "mentions".to_string(),
74                            item.to_string()
75                        ));
76                    }
77                }
78            } else if trimmed.contains(":") {
79                let parts: Vec<&str> = trimmed.splitn(2, ':').collect();
80                if parts.len() == 2 && !current_header.is_empty() {
81                    let predicate = parts[0].trim();
82                    let object = parts[1].trim();
83                    triples.push((
84                        current_header.clone(),
85                        predicate.to_string(),
86                        object.to_string()
87                    ));
88                }
89            }
90        }
91        
92        Ok(ExtractionResult { triples })
93    }
94}