synapse_core/ingest/
extractor.rs1use anyhow::Result;
2
/// Output of an extraction pass: the triples pulled out of one input document.
///
/// Each triple is ordered `(subject, predicate, object)`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ExtractionResult {
    /// Extracted `(subject, predicate, object)` triples, in the order they were found.
    pub triples: Vec<(String, String, String)>,
}
6
7pub trait Extractor {
8 fn extract(&self, content: &str) -> Result<ExtractionResult>;
9}
10
/// Extractor for delimiter-separated tabular text.
///
/// `Default` is provided by a separate manual impl, so it is intentionally
/// not derived here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvExtractor {
    /// Byte that separates fields within a record (for example `b','`).
    pub delimiter: u8,
}
14
15impl Default for CsvExtractor {
16 fn default() -> Self {
17 Self::new()
18 }
19}
20
21impl CsvExtractor {
22 pub fn new() -> Self {
23 Self { delimiter: b',' }
24 }
25}
26
27impl Extractor for CsvExtractor {
28 fn extract(&self, content: &str) -> Result<ExtractionResult> {
29 let mut rdr = csv::ReaderBuilder::new()
30 .delimiter(self.delimiter)
31 .from_reader(content.as_bytes());
32
33 let headers = rdr.headers()?.clone();
34 let mut triples = Vec::new();
35
36 for result in rdr.records() {
37 let record = result?;
38 if let Some(subject) = record.get(0) {
39 if subject.trim().is_empty() {
40 continue;
41 }
42
43 for (i, value) in record.iter().enumerate().skip(1) {
44 if let Some(predicate) = headers.get(i) {
45 let val_trimmed = value.trim();
46 if !val_trimmed.is_empty() {
47 triples.push((
48 subject.to_string(),
49 predicate.to_string(),
50 val_trimmed.to_string(),
51 ));
52 }
53 }
54 }
55 }
56 }
57
58 Ok(ExtractionResult { triples })
59 }
60}
61
/// Extractor for Markdown-style text: headings, bullet items and `key: value` lines.
///
/// The type carries no state, so it is freely copyable and default-constructible.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MarkdownExtractor;
63
64impl Extractor for MarkdownExtractor {
65 fn extract(&self, content: &str) -> Result<ExtractionResult> {
66 let mut triples = Vec::new();
67 let mut current_header = String::new();
68
69 for line in content.lines() {
70 let trimmed = line.trim();
71 if trimmed.is_empty() {
72 continue;
73 }
74
75 if trimmed.starts_with("#") {
76 current_header = trimmed.trim_start_matches('#').trim().to_string();
77 } else if trimmed.starts_with("- ") || trimmed.starts_with("* ") {
78 if !current_header.is_empty() {
79 let item = trimmed[2..].trim();
80 if !item.is_empty() {
81 triples.push((
82 current_header.clone(),
83 "mentions".to_string(),
84 item.to_string(),
85 ));
86 }
87 }
88 } else if trimmed.contains(":") {
89 let parts: Vec<&str> = trimmed.splitn(2, ':').collect();
90 if parts.len() == 2 && !current_header.is_empty() {
91 let predicate = parts[0].trim();
92 let object = parts[1].trim();
93 triples.push((
94 current_header.clone(),
95 predicate.to_string(),
96 object.to_string(),
97 ));
98 }
99 }
100 }
101
102 Ok(ExtractionResult { triples })
103 }
104}