Skip to main content

phago_runtime/
corpus.rs

1//! Corpus loader — replaces hard-coded documents in POC.
2//!
3//! Provides a standard way to load text documents from a directory
4//! or use a built-in embedded test corpus. Every branch prototype
5//! uses this to ingest documents into the colony.
6
7use crate::colony::Colony;
8use phago_core::types::Position;
9use std::path::Path;
10
11/// A corpus of documents to be ingested into a colony.
12pub struct Corpus {
13    pub documents: Vec<CorpusDocument>,
14    pub name: String,
15}
16
17/// A single document in a corpus.
18#[derive(Debug, Clone)]
19pub struct CorpusDocument {
20    pub title: String,
21    pub content: String,
22    pub category: Option<String>,
23    pub position: Position,
24}
25
26impl Corpus {
27    /// Load all .txt files from a directory.
28    ///
29    /// Files are assigned positions in a grid layout and categories
30    /// are inferred from filename prefixes (e.g., `cell_biology_01.txt`
31    /// gets category "cell_biology").
32    pub fn from_directory(path: &Path) -> std::io::Result<Self> {
33        let mut documents = Vec::new();
34        let mut entries: Vec<_> = std::fs::read_dir(path)?
35            .filter_map(|e| e.ok())
36            .filter(|e| {
37                e.path()
38                    .extension()
39                    .map_or(false, |ext| ext == "txt")
40            })
41            .collect();
42
43        entries.sort_by_key(|e| e.file_name());
44
45        let cols = 5;
46        let spacing = 5.0;
47
48        for (i, entry) in entries.iter().enumerate() {
49            let content = std::fs::read_to_string(entry.path())?;
50            let filename = entry.file_name().to_string_lossy().to_string();
51            let title = filename.trim_end_matches(".txt").to_string();
52
53            // Infer category from filename prefix (everything before last _NN)
54            let category = title
55                .rfind('_')
56                .and_then(|pos| {
57                    let suffix = &title[pos + 1..];
58                    if suffix.chars().all(|c| c.is_ascii_digit()) {
59                        Some(title[..pos].to_string())
60                    } else {
61                        None
62                    }
63                });
64
65            let row = i / cols;
66            let col = i % cols;
67            let position = Position::new(col as f64 * spacing, row as f64 * spacing);
68
69            documents.push(CorpusDocument {
70                title,
71                content,
72                category,
73                position,
74            });
75        }
76
77        let name = path
78            .file_name()
79            .map(|n| n.to_string_lossy().to_string())
80            .unwrap_or_else(|| "corpus".to_string());
81
82        Ok(Corpus { documents, name })
83    }
84
85    /// Load corpus from disk or fall back to inline content.
86    ///
87    /// Tries to load the expanded 100-document corpus from the `poc/data/corpus/`
88    /// directory. Falls back to an inline 20-document corpus if the directory
89    /// is not found (e.g., when running tests from a different working directory).
90    ///
91    /// Topics: cell_biology, molecular_transport, genetics, quantum_computing.
92    /// Ground-truth clusters enable measuring community detection purity.
93    pub fn from_embedded() -> Self {
94        // Try common corpus directory paths
95        let candidate_paths = [
96            Path::new("poc/data/corpus"),
97            Path::new("../../poc/data/corpus"),
98        ];
99        for path in &candidate_paths {
100            if path.exists() {
101                if let Ok(corpus) = Self::from_directory(path) {
102                    if corpus.len() >= 20 {
103                        return corpus;
104                    }
105                }
106            }
107        }
108
109        // Also try relative to CARGO_MANIFEST_DIR at compile time
110        let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR"))
111            .parent()
112            .and_then(|p| p.parent())
113            .map(|p| p.join("poc/data/corpus"));
114        if let Some(path) = manifest_path {
115            if path.exists() {
116                if let Ok(corpus) = Self::from_directory(&path) {
117                    if corpus.len() >= 20 {
118                        return corpus;
119                    }
120                }
121            }
122        }
123
124        // Fallback: inline 20-document corpus
125        Self::inline_corpus()
126    }
127
128    /// Inline fallback corpus with 20 documents across 4 topics.
129    /// Used when the disk corpus directory is not available.
130    pub fn inline_corpus() -> Self {
131        let topics: &[(&str, &[&str])] = &[
132            ("cell_biology", &[
133                "The cell membrane is a phospholipid bilayer that forms the outer boundary of every living cell. Integral membrane proteins span the bilayer and serve as channels receptors and enzymes. The fluid mosaic model describes the dynamic nature of the membrane where lipids and proteins move laterally within the layer.",
134                "The cytoskeleton provides structural support and facilitates intracellular transport. Microtubules are hollow polymers of tubulin that serve as tracks for motor proteins like kinesin and dynein. Actin filaments form a dense network beneath the plasma membrane called the cell cortex.",
135                "Organelles compartmentalize cellular functions within membrane-bound structures. The endoplasmic reticulum synthesizes proteins and lipids. The Golgi apparatus processes and packages proteins for secretion. Lysosomes contain digestive enzymes that break down cellular waste.",
136                "Cell division occurs through mitosis and meiosis. During mitosis the cell duplicates its chromosomes and divides into two identical daughter cells. The mitotic spindle composed of microtubules attaches to kinetochores on chromosomes to ensure proper segregation.",
137                "Apoptosis is programmed cell death essential for development and tissue homeostasis. Intrinsic apoptosis is triggered by mitochondrial outer membrane permeabilization releasing cytochrome c. Caspase enzymes execute the dismantling of cellular components.",
138            ]),
139            ("molecular_transport", &[
140                "Active transport moves molecules against their concentration gradient using ATP hydrolysis. The sodium potassium pump exchanges three sodium ions outward for two potassium ions inward maintaining the electrochemical gradient.",
141                "Passive transport occurs down the concentration gradient without energy expenditure. Simple diffusion allows small nonpolar molecules like oxygen and carbon dioxide to cross the lipid bilayer. Facilitated diffusion uses channel proteins and carrier proteins.",
142                "Vesicular transport moves large molecules between compartments through membrane budding and fusion. Endocytosis internalizes extracellular material by membrane invagination forming vesicles. Exocytosis releases intracellular contents by vesicle fusion with the plasma membrane.",
143                "Mitochondria produce ATP through oxidative phosphorylation in the electron transport chain. NADH and FADH2 donate electrons to protein complexes embedded in the inner mitochondrial membrane. The proton gradient drives ATP synthase.",
144                "Signal transduction pathways relay extracellular signals to intracellular responses. G-protein coupled receptors activate second messenger cascades involving cyclic AMP and calcium ions. Receptor tyrosine kinases trigger phosphorylation cascades.",
145            ]),
146            ("genetics", &[
147                "DNA replication is semiconservative with each strand serving as a template. DNA helicase unwinds the double helix at the replication fork. DNA polymerase synthesizes new strands in the five prime to three prime direction.",
148                "Transcription converts DNA sequence into messenger RNA through RNA polymerase activity. Promoter regions upstream of genes recruit transcription factors. Introns are spliced out by the spliceosome complex leaving exons joined in mature mRNA.",
149                "Translation occurs at ribosomes where messenger RNA codons are decoded into amino acid sequences. Transfer RNA molecules carry specific amino acids and recognize codons through anticodon base pairing. The ribosome catalyzes peptide bond formation.",
150                "Gene regulation controls when and how much protein is produced from each gene. Transcription factors bind to enhancer and silencer regions to activate or repress gene expression. Epigenetic modifications alter chromatin accessibility.",
151                "CRISPR-Cas9 enables precise genome editing by creating targeted double-strand breaks in DNA. Guide RNA directs the Cas9 nuclease to complementary sequences. Homology-directed repair allows insertion of new genetic material at the cut site.",
152            ]),
153            ("quantum_computing", &[
154                "Quantum bits or qubits exploit superposition to exist in multiple states simultaneously. Unlike classical bits a qubit represents a linear combination of both states with complex probability amplitudes. Measurement collapses the superposition.",
155                "Quantum entanglement creates correlations between qubits that have no classical analogue. Bell states are maximally entangled two-qubit states used in quantum teleportation and superdense coding. Entanglement is a resource consumed by quantum algorithms.",
156                "Quantum gates manipulate qubits through unitary transformations. The Hadamard gate creates superposition from basis states. CNOT gate entangles two qubits and forms a universal gate set when combined with single qubit rotations.",
157                "Shor's algorithm factors large integers in polynomial time using quantum Fourier transform. This threatens RSA encryption which relies on the computational difficulty of integer factorization. Grover's algorithm provides quadratic speedup for unstructured search.",
158                "Quantum error correction protects quantum information from decoherence and gate errors. The surface code encodes logical qubits in two-dimensional arrays of physical qubits. Topological quantum computing uses anyonic braiding for fault-tolerant operations.",
159            ]),
160        ];
161
162        let mut documents = Vec::new();
163        let spacing = 5.0;
164
165        for (topic_idx, (topic, docs)) in topics.iter().enumerate() {
166            for (doc_idx, content) in docs.iter().enumerate() {
167                let title = format!("{}_{:02}", topic, doc_idx + 1);
168                let x = doc_idx as f64 * spacing;
169                let y = topic_idx as f64 * spacing;
170
171                documents.push(CorpusDocument {
172                    title,
173                    content: content.to_string(),
174                    category: Some(topic.to_string()),
175                    position: Position::new(x, y),
176                });
177            }
178        }
179
180        Corpus {
181            documents,
182            name: "embedded-20".to_string(),
183        }
184    }
185
186    /// Number of documents in the corpus.
187    pub fn len(&self) -> usize {
188        self.documents.len()
189    }
190
191    /// Whether the corpus is empty.
192    pub fn is_empty(&self) -> bool {
193        self.documents.is_empty()
194    }
195
196    /// Get the ground-truth category labels (for NMI computation).
197    /// Returns a map of document title -> category.
198    pub fn ground_truth(&self) -> std::collections::HashMap<String, String> {
199        self.documents
200            .iter()
201            .filter_map(|d| {
202                d.category
203                    .as_ref()
204                    .map(|c| (d.title.clone(), c.clone()))
205            })
206            .collect()
207    }
208
209    /// Get unique categories in the corpus.
210    pub fn categories(&self) -> Vec<String> {
211        let mut cats: Vec<String> = self
212            .documents
213            .iter()
214            .filter_map(|d| d.category.clone())
215            .collect::<std::collections::HashSet<_>>()
216            .into_iter()
217            .collect();
218        cats.sort();
219        cats
220    }
221
222    /// Limit corpus to at most `max` documents, evenly sampled across categories.
223    pub fn limit(mut self, max: usize) -> Self {
224        if self.documents.len() <= max {
225            return self;
226        }
227        let cats = self.categories();
228        let per_cat = max / cats.len().max(1);
229        let mut limited = Vec::new();
230        for cat in &cats {
231            let cat_docs: Vec<_> = self.documents.iter()
232                .filter(|d| d.category.as_deref() == Some(cat))
233                .cloned()
234                .collect();
235            limited.extend(cat_docs.into_iter().take(per_cat));
236        }
237        self.documents = limited;
238        self
239    }
240
241    /// Ingest all documents into a colony.
242    pub fn ingest_into(&self, colony: &mut Colony) {
243        for doc in &self.documents {
244            colony.ingest_document(&doc.title, &doc.content, doc.position);
245        }
246    }
247}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// Whichever source `from_embedded` ends up using, it must provide at
    /// least 20 documents.
    #[test]
    fn embedded_corpus_has_at_least_20_documents() {
        let count = Corpus::from_embedded().len();
        assert!(count >= 20, "corpus has {} docs, expected >= 20", count);
    }

    /// The embedded corpus exposes exactly four topics, including the two
    /// spot-checked here.
    #[test]
    fn embedded_corpus_has_4_categories() {
        let cats = Corpus::from_embedded().categories();
        assert_eq!(cats.len(), 4);
        for expected in ["cell_biology", "quantum_computing"].iter() {
            assert!(cats.iter().any(|c| c.as_str() == *expected));
        }
    }

    /// The ground-truth map has at least 20 labelled entries.
    #[test]
    fn ground_truth_maps_all_documents() {
        let labelled = Corpus::from_embedded().ground_truth().len();
        assert!(labelled >= 20, "ground truth has {} entries, expected >= 20", labelled);
    }

    /// The inline fallback always contains exactly 20 documents.
    #[test]
    fn inline_corpus_has_20_documents() {
        assert_eq!(Corpus::inline_corpus().len(), 20);
    }

    /// Loads the on-disk corpus when it is present; does nothing when the
    /// directory is missing so the test passes without the data checkout.
    #[test]
    fn from_directory_loads_txt_files() {
        // Two levels above the crate manifest directory.
        let root = Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap();
        let dir = root.join("poc/data/corpus");
        if !dir.exists() {
            return;
        }
        let count = Corpus::from_directory(&dir).unwrap().len();
        assert!(count >= 20, "directory corpus has {} docs", count);
    }
}