// synapse_core/ingest/extractor.rs
use anyhow::Result;

/// Outcome of running an [`Extractor`] over a piece of content.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ExtractionResult {
    /// Extracted `(subject, predicate, object)` triples, in the order the
    /// extractor produced them.
    pub triples: Vec<(String, String, String)>,
}

7pub trait Extractor {
8 fn extract(&self, content: &str) -> Result<ExtractionResult>;
9}
10
/// Extractor for delimiter-separated text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvExtractor {
    /// Single-byte field delimiter handed to the CSV reader.
    pub delimiter: u8,
}

15impl CsvExtractor {
16 pub fn new() -> Self {
17 Self { delimiter: b',' }
18 }
19}
20
21impl Extractor for CsvExtractor {
22 fn extract(&self, content: &str) -> Result<ExtractionResult> {
23 let mut rdr = csv::ReaderBuilder::new()
24 .delimiter(self.delimiter)
25 .from_reader(content.as_bytes());
26
27 let headers = rdr.headers()?.clone();
28 let mut triples = Vec::new();
29
30 for result in rdr.records() {
31 let record = result?;
32 if let Some(subject) = record.get(0) {
33 if subject.trim().is_empty() { continue; }
34
35 for (i, value) in record.iter().enumerate().skip(1) {
36 if let Some(predicate) = headers.get(i) {
37 let val_trimmed = value.trim();
38 if !val_trimmed.is_empty() {
39 triples.push((
40 subject.to_string(),
41 predicate.to_string(),
42 val_trimmed.to_string()
43 ));
44 }
45 }
46 }
47 }
48 }
49
50 Ok(ExtractionResult { triples })
51 }
52}
53
/// Stateless extractor for Markdown-like text: headings name the subject,
/// and the lines beneath them supply predicates and objects.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MarkdownExtractor;

56impl Extractor for MarkdownExtractor {
57 fn extract(&self, content: &str) -> Result<ExtractionResult> {
58 let mut triples = Vec::new();
59 let mut current_header = String::new();
60
61 for line in content.lines() {
62 let trimmed = line.trim();
63 if trimmed.is_empty() { continue; }
64
65 if trimmed.starts_with("#") {
66 current_header = trimmed.trim_start_matches('#').trim().to_string();
67 } else if trimmed.starts_with("- ") || trimmed.starts_with("* ") {
68 if !current_header.is_empty() {
69 let item = trimmed[2..].trim();
70 if !item.is_empty() {
71 triples.push((
72 current_header.clone(),
73 "mentions".to_string(),
74 item.to_string()
75 ));
76 }
77 }
78 } else if trimmed.contains(":") {
79 let parts: Vec<&str> = trimmed.splitn(2, ':').collect();
80 if parts.len() == 2 && !current_header.is_empty() {
81 let predicate = parts[0].trim();
82 let object = parts[1].trim();
83 triples.push((
84 current_header.clone(),
85 predicate.to_string(),
86 object.to_string()
87 ));
88 }
89 }
90 }
91
92 Ok(ExtractionResult { triples })
93 }
94}