Skip to main content

synapse_core/ingest/
extractor.rs

1use anyhow::Result;
2
/// Output of an [`Extractor`] run.
///
/// Derives the common value-type traits so results can be logged, copied,
/// compared in tests, and created empty via `Default`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ExtractionResult {
    /// Extracted statements, each stored as `(subject, predicate, object)`.
    pub triples: Vec<(String, String, String)>,
}
6
7pub trait Extractor {
8    fn extract(&self, content: &str) -> Result<ExtractionResult>;
9}
10
/// Extractor for delimiter-separated text with a header row.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvExtractor {
    /// Field separator byte handed to the CSV reader (for example `b','` or `b'\t'`).
    pub delimiter: u8,
}
14
15impl Default for CsvExtractor {
16    fn default() -> Self {
17        Self::new()
18    }
19}
20
21impl CsvExtractor {
22    pub fn new() -> Self {
23        Self { delimiter: b',' }
24    }
25}
26
27impl Extractor for CsvExtractor {
28    fn extract(&self, content: &str) -> Result<ExtractionResult> {
29        let mut rdr = csv::ReaderBuilder::new()
30            .delimiter(self.delimiter)
31            .from_reader(content.as_bytes());
32
33        let headers = rdr.headers()?.clone();
34        let mut triples = Vec::new();
35
36        for result in rdr.records() {
37            let record = result?;
38            if let Some(subject) = record.get(0) {
39                if subject.trim().is_empty() {
40                    continue;
41                }
42
43                for (i, value) in record.iter().enumerate().skip(1) {
44                    if let Some(predicate) = headers.get(i) {
45                        let val_trimmed = value.trim();
46                        if !val_trimmed.is_empty() {
47                            triples.push((
48                                subject.to_string(),
49                                predicate.to_string(),
50                                val_trimmed.to_string(),
51                            ));
52                        }
53                    }
54                }
55            }
56        }
57
58        Ok(ExtractionResult { triples })
59    }
60}
61
/// Stateless extractor for Markdown-style text (headers, bullet items and
/// `key: value` lines).
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownExtractor;
63
64impl Extractor for MarkdownExtractor {
65    fn extract(&self, content: &str) -> Result<ExtractionResult> {
66        let mut triples = Vec::new();
67        let mut current_header = String::new();
68
69        for line in content.lines() {
70            let trimmed = line.trim();
71            if trimmed.is_empty() {
72                continue;
73            }
74
75            if trimmed.starts_with("#") {
76                current_header = trimmed.trim_start_matches('#').trim().to_string();
77            } else if trimmed.starts_with("- ") || trimmed.starts_with("* ") {
78                if !current_header.is_empty() {
79                    let item = trimmed[2..].trim();
80                    if !item.is_empty() {
81                        triples.push((
82                            current_header.clone(),
83                            "mentions".to_string(),
84                            item.to_string(),
85                        ));
86                    }
87                }
88            } else if trimmed.contains(":") {
89                let parts: Vec<&str> = trimmed.splitn(2, ':').collect();
90                if parts.len() == 2 && !current_header.is_empty() {
91                    let predicate = parts[0].trim();
92                    let object = parts[1].trim();
93                    triples.push((
94                        current_header.clone(),
95                        predicate.to_string(),
96                        object.to_string(),
97                    ));
98                }
99            }
100        }
101
102        Ok(ExtractionResult { triples })
103    }
104}