Skip to main content

sqlite_knowledge_graph/
migrate.rs

1//! Data migration module for importing external knowledge sources.
2
3use rusqlite::Connection;
4use serde_json::Value;
5use std::collections::HashMap;
6use std::fs;
7use std::path::Path;
8
9use crate::error::{Error, Result};
10use crate::graph::Entity;
11use crate::KnowledgeGraph;
12
13/// Migrate papers from Aerial's knowledge database to the knowledge graph.
14///
15/// This function:
16/// - Creates "paper" entities from the papers table
17/// - Stores arxiv_id and other metadata as properties
18/// - Creates placeholder vectors (can be updated later with real embeddings)
19pub fn migrate_papers(source_db: &str, kg: &KnowledgeGraph) -> Result<i64> {
20    let source_conn = Connection::open(source_db)?;
21
22    let tx = kg.transaction()?;
23    let mut count = 0;
24
25    // Query all papers
26    let mut stmt = source_conn.prepare(
27        r#"
28        SELECT arxiv_id, title, file_path, keywords, utility,
29               skill_created, last_accessed, created_at, notes
30        FROM papers
31        "#,
32    )?;
33
34    let rows = stmt.query_map([], |row| {
35        Ok((
36            row.get::<_, String>(0)?,         // arxiv_id
37            row.get::<_, String>(1)?,         // title
38            row.get::<_, Option<String>>(2)?, // file_path
39            row.get::<_, Option<String>>(3)?, // keywords
40            row.get::<_, Option<f64>>(4)?,    // utility
41            row.get::<_, Option<String>>(5)?, // skill_created
42            row.get::<_, Option<String>>(6)?, // last_accessed
43            row.get::<_, Option<String>>(7)?, // created_at
44            row.get::<_, Option<String>>(8)?, // notes
45        ))
46    })?;
47
48    for row in rows {
49        let (
50            arxiv_id,
51            title,
52            file_path,
53            keywords,
54            utility,
55            skill_created,
56            last_accessed,
57            created_at,
58            notes,
59        ) = row?;
60
61        let mut properties = HashMap::new();
62        properties.insert("arxiv_id".to_string(), Value::String(arxiv_id.clone()));
63
64        if let Some(fp) = file_path {
65            properties.insert("file_path".to_string(), Value::String(fp));
66        }
67
68        if let Some(kw) = keywords {
69            properties.insert("keywords".to_string(), Value::String(kw));
70        }
71
72        if let Some(util) = utility {
73            properties.insert(
74                "utility".to_string(),
75                Value::Number(
76                    serde_json::Number::from_f64(util).unwrap_or(serde_json::Number::from(0)),
77                ),
78            );
79        }
80
81        if let Some(skill) = skill_created {
82            properties.insert("skill_created".to_string(), Value::String(skill));
83        }
84
85        if let Some(la) = last_accessed {
86            properties.insert("last_accessed".to_string(), Value::String(la));
87        }
88
89        if let Some(ca) = created_at {
90            properties.insert("created_at".to_string(), Value::String(ca));
91        }
92
93        if let Some(n) = notes {
94            properties.insert("notes".to_string(), Value::String(n));
95        }
96
97        let entity = Entity::with_properties("paper", title, properties);
98
99        let entity_id = crate::graph::insert_entity(&tx, &entity)?;
100        count += 1;
101
102        // Create placeholder vector (random values, will be replaced later)
103        let placeholder_vector = vec![0.0_f32; 384]; // Common embedding dimension
104        crate::vector::VectorStore::new().insert_vector(&tx, entity_id, placeholder_vector)?;
105    }
106
107    tx.commit()?;
108    Ok(count)
109}
110
111/// Migrate skills from the skills directory to the knowledge graph.
112///
113/// This function:
114/// - Creates "skill" entities from skill directories
115/// - Reads SKILL.md files for content
116/// - Creates placeholder vectors
117pub fn migrate_skills(skills_dir: &str, kg: &KnowledgeGraph) -> Result<i64> {
118    let skills_path = Path::new(skills_dir);
119
120    if !skills_path.exists() {
121        return Err(Error::Other(format!(
122            "Skills directory not found: {}",
123            skills_dir
124        )));
125    }
126
127    let tx = kg.transaction()?;
128    let mut count = 0;
129
130    for entry in fs::read_dir(skills_path)? {
131        let entry = entry?;
132        let skill_dir = entry.path();
133
134        if skill_dir.is_dir() {
135            let skill_name = skill_dir
136                .file_name()
137                .and_then(|n| n.to_str())
138                .ok_or_else(|| Error::Other("Invalid skill directory name".to_string()))?;
139
140            let mut properties = HashMap::new();
141            properties.insert(
142                "skill_name".to_string(),
143                Value::String(skill_name.to_string()),
144            );
145
146            // Try to read SKILL.md
147            let skill_md_path = skill_dir.join("SKILL.md");
148            if skill_md_path.exists() {
149                let content = fs::read_to_string(&skill_md_path).unwrap_or_default();
150
151                // Extract metadata from SKILL.md
152                if let Some(description) = extract_description(&content) {
153                    properties.insert("description".to_string(), Value::String(description));
154                }
155
156                properties.insert("content".to_string(), Value::String(content));
157            }
158
159            let entity = Entity::with_properties("skill", skill_name, properties);
160            let entity_id = crate::graph::insert_entity(&tx, &entity)?;
161            count += 1;
162
163            // Create placeholder vector
164            let placeholder_vector = vec![0.0_f32; 384];
165            crate::vector::VectorStore::new().insert_vector(&tx, entity_id, placeholder_vector)?;
166        }
167    }
168
169    tx.commit()?;
170    Ok(count)
171}
172
173/// Build relationships between entities.
174///
175/// This function:
176/// - Links papers to skills (derived_from)
177/// - Links related papers (related_by_keywords)
178/// - Links similar skills (similar_to)
179pub fn build_relationships(kg: &KnowledgeGraph) -> Result<i64> {
180    let tx = kg.transaction()?;
181    let mut count = 0;
182
183    // Get all papers and skills
184    let papers = kg.list_entities(Some("paper"), None)?;
185    let skills = kg.list_entities(Some("skill"), None)?;
186
187    // Build arxiv_id -> entity_id map for papers
188    let mut paper_map: HashMap<String, i64> = HashMap::new();
189    for paper in &papers {
190        if let Some(arxiv_id) = paper.get_property("arxiv_id").and_then(|v| v.as_str()) {
191            if let Some(id) = paper.id {
192                paper_map.insert(arxiv_id.to_string(), id);
193            }
194        }
195    }
196
197    // Build skill_name -> entity_id map for skills
198    let mut skill_map: HashMap<String, i64> = HashMap::new();
199    for skill in &skills {
200        if let Some(skill_name) = skill.get_property("skill_name").and_then(|v| v.as_str()) {
201            if let Some(id) = skill.id {
202                skill_map.insert(skill_name.to_string(), id);
203            }
204        }
205    }
206
207    // Connect papers to skills (derived_from)
208    for paper in &papers {
209        if let Some(skill_created) = paper.get_property("skill_created").and_then(|v| v.as_str()) {
210            if !skill_created.is_empty() {
211                if let Some(paper_id) = paper.id {
212                    if let Some(skill_id) = skill_map.get(skill_created) {
213                        let relation =
214                            crate::graph::Relation::new(paper_id, *skill_id, "derived_from", 1.0)?;
215                        crate::graph::insert_relation(&tx, &relation)?;
216                        count += 1;
217                    }
218                }
219            }
220        }
221    }
222
223    // Connect related papers (related_by_keywords)
224    for i in 0..papers.len() {
225        for j in (i + 1)..papers.len() {
226            let paper_a = &papers[i];
227            let paper_b = &papers[j];
228
229            if let (Some(id_a), Some(id_b)) = (paper_a.id, paper_b.id) {
230                if let Some(similarity) = compute_keyword_similarity(paper_a, paper_b) {
231                    if similarity > 0.3 {
232                        let relation = crate::graph::Relation::new(
233                            id_a,
234                            id_b,
235                            "related_by_keywords",
236                            similarity,
237                        )?;
238                        crate::graph::insert_relation(&tx, &relation)?;
239                        count += 1;
240                    }
241                }
242            }
243        }
244    }
245
246    // Connect similar skills (similar_to) - based on common keywords in descriptions
247    for i in 0..skills.len() {
248        for j in (i + 1)..skills.len() {
249            let skill_a = &skills[i];
250            let skill_b = &skills[j];
251
252            if let (Some(id_a), Some(id_b)) = (skill_a.id, skill_b.id) {
253                if let Some(similarity) = compute_skill_similarity(skill_a, skill_b) {
254                    if similarity > 0.3 {
255                        let relation =
256                            crate::graph::Relation::new(id_a, id_b, "similar_to", similarity)?;
257                        crate::graph::insert_relation(&tx, &relation)?;
258                        count += 1;
259                    }
260                }
261            }
262        }
263    }
264
265    tx.commit()?;
266    Ok(count)
267}
268
/// Extract a one-line description from SKILL.md content.
///
/// Preference order:
/// 1. the first non-empty, non-heading line that follows a heading titled
///    "Description" (any heading level, case-insensitive) and comes before the next
///    heading;
/// 2. otherwise the first non-empty, non-heading line anywhere in the document.
///
/// Lines are trimmed before inspection, and any line starting with `#` counts as a
/// heading. Returns `None` when the content has no non-empty, non-heading line.
fn extract_description(content: &str) -> Option<String> {
    let mut first_paragraph: Option<&str> = None;
    let mut in_description = false;

    for line in content.lines().map(str::trim) {
        if line.is_empty() {
            continue;
        }

        if line.starts_with('#') {
            // Entering a new section: it is the description section only if its title
            // is exactly "Description".
            let title = line.trim_start_matches('#').trim();
            in_description = title.eq_ignore_ascii_case("description");
            continue;
        }

        if in_description {
            return Some(line.to_string());
        }

        // Remember the fallback, but keep scanning for a Description section.
        if first_paragraph.is_none() {
            first_paragraph = Some(line);
        }
    }

    first_paragraph.map(str::to_string)
}
283
284/// Compute similarity between two papers based on keywords.
285fn compute_keyword_similarity(paper_a: &Entity, paper_b: &Entity) -> Option<f64> {
286    let keywords_a: Vec<String> = paper_a
287        .get_property("keywords")
288        .and_then(|v| v.as_str())
289        .and_then(|s| serde_json::from_str::<Vec<String>>(s).ok())
290        .unwrap_or_default();
291
292    let keywords_b: Vec<String> = paper_b
293        .get_property("keywords")
294        .and_then(|v| v.as_str())
295        .and_then(|s| serde_json::from_str::<Vec<String>>(s).ok())
296        .unwrap_or_default();
297
298    if keywords_a.is_empty() || keywords_b.is_empty() {
299        return None;
300    }
301
302    let set_a: std::collections::HashSet<&String> = keywords_a.iter().collect();
303    let set_b: std::collections::HashSet<&String> = keywords_b.iter().collect();
304
305    let intersection = set_a.intersection(&set_b).count();
306    let union = set_a.union(&set_b).count();
307
308    if union == 0 {
309        return Some(0.0);
310    }
311
312    Some(intersection as f64 / union as f64)
313}
314
315/// Compute similarity between two skills based on description content.
316fn compute_skill_similarity(skill_a: &Entity, skill_b: &Entity) -> Option<f64> {
317    let desc_a = skill_a
318        .get_property("description")
319        .and_then(|v| v.as_str())
320        .unwrap_or("");
321    let desc_b = skill_b
322        .get_property("description")
323        .and_then(|v| v.as_str())
324        .unwrap_or("");
325
326    if desc_a.is_empty() || desc_b.is_empty() {
327        return None;
328    }
329
330    // Simple word overlap similarity
331    let words_a: std::collections::HashSet<&str> = desc_a.split_whitespace().collect();
332    let words_b: std::collections::HashSet<&str> = desc_b.split_whitespace().collect();
333
334    let intersection = words_a.intersection(&words_b).count();
335    let union = words_a.union(&words_b).count();
336
337    if union == 0 {
338        return Some(0.0);
339    }
340
341    Some(intersection as f64 / union as f64)
342}
343
344/// Perform full migration: papers, skills, and relationships.
345pub fn migrate_all(
346    source_db: &str,
347    skills_dir: &str,
348    kg: &KnowledgeGraph,
349) -> Result<MigrationStats> {
350    let papers_count = migrate_papers(source_db, kg)?;
351    let skills_count = migrate_skills(skills_dir, kg)?;
352    let relations_count = build_relationships(kg)?;
353
354    Ok(MigrationStats {
355        papers_count,
356        skills_count,
357        relations_count,
358    })
359}
360
361/// Statistics from migration.
362#[derive(Debug, Clone, serde::Serialize)]
363pub struct MigrationStats {
364    pub papers_count: i64,
365    pub skills_count: i64,
366    pub relations_count: i64,
367}
368
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing similarity scores.
    const TOLERANCE: f64 = 0.01;

    /// Build an entity of the given kind carrying exactly one property.
    fn entity_with(kind: &str, name: &str, key: &str, value: serde_json::Value) -> Entity {
        let mut entity = Entity::new(kind, name);
        entity.set_property(key, value);
        entity
    }

    #[test]
    fn test_extract_description() {
        // The heading is skipped; the first paragraph line is the description.
        let content = "# Test Skill\n\nThis is a test description.\n\nMore content here.";
        assert_eq!(
            extract_description(content).as_deref(),
            Some("This is a test description.")
        );
    }

    #[test]
    fn test_keyword_similarity() {
        // Keywords are stored as a JSON-encoded string, as they come from the database.
        let paper_a = entity_with(
            "paper",
            "Paper A",
            "keywords",
            serde_json::Value::String(r#"["machine", "learning"]"#.to_string()),
        );
        let paper_b = entity_with(
            "paper",
            "Paper B",
            "keywords",
            serde_json::Value::String(r#"["machine", "vision"]"#.to_string()),
        );

        // Shared: {machine}; overall: {machine, learning, vision} => 1/3.
        let expected = 0.333;
        let similarity = compute_keyword_similarity(&paper_a, &paper_b).unwrap();
        assert!((similarity - expected).abs() < TOLERANCE);
    }

    #[test]
    fn test_skill_similarity() {
        let skill_a = entity_with(
            "skill",
            "Skill A",
            "description",
            serde_json::json!("neural network learning"),
        );
        let skill_b = entity_with(
            "skill",
            "Skill B",
            "description",
            serde_json::json!("neural network vision"),
        );

        // Shared: {neural, network}; overall: four distinct words => 2/4.
        let expected = 0.5;
        let similarity = compute_skill_similarity(&skill_a, &skill_b).unwrap();
        assert!((similarity - expected).abs() < TOLERANCE);
    }
}