1use crate::colony::Colony;
8use phago_core::types::Position;
9use std::path::Path;
10
11pub struct Corpus {
13 pub documents: Vec<CorpusDocument>,
14 pub name: String,
15}
16
17#[derive(Debug, Clone)]
19pub struct CorpusDocument {
20 pub title: String,
21 pub content: String,
22 pub category: Option<String>,
23 pub position: Position,
24}
25
26impl Corpus {
27 pub fn from_directory(path: &Path) -> std::io::Result<Self> {
33 let mut documents = Vec::new();
34 let mut entries: Vec<_> = std::fs::read_dir(path)?
35 .filter_map(|e| e.ok())
36 .filter(|e| {
37 e.path()
38 .extension()
39 .map_or(false, |ext| ext == "txt")
40 })
41 .collect();
42
43 entries.sort_by_key(|e| e.file_name());
44
45 let cols = 5;
46 let spacing = 5.0;
47
48 for (i, entry) in entries.iter().enumerate() {
49 let content = std::fs::read_to_string(entry.path())?;
50 let filename = entry.file_name().to_string_lossy().to_string();
51 let title = filename.trim_end_matches(".txt").to_string();
52
53 let category = title
55 .rfind('_')
56 .and_then(|pos| {
57 let suffix = &title[pos + 1..];
58 if suffix.chars().all(|c| c.is_ascii_digit()) {
59 Some(title[..pos].to_string())
60 } else {
61 None
62 }
63 });
64
65 let row = i / cols;
66 let col = i % cols;
67 let position = Position::new(col as f64 * spacing, row as f64 * spacing);
68
69 documents.push(CorpusDocument {
70 title,
71 content,
72 category,
73 position,
74 });
75 }
76
77 let name = path
78 .file_name()
79 .map(|n| n.to_string_lossy().to_string())
80 .unwrap_or_else(|| "corpus".to_string());
81
82 Ok(Corpus { documents, name })
83 }
84
85 pub fn from_embedded() -> Self {
94 let candidate_paths = [
96 Path::new("poc/data/corpus"),
97 Path::new("../../poc/data/corpus"),
98 ];
99 for path in &candidate_paths {
100 if path.exists() {
101 if let Ok(corpus) = Self::from_directory(path) {
102 if corpus.len() >= 20 {
103 return corpus;
104 }
105 }
106 }
107 }
108
109 let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR"))
111 .parent()
112 .and_then(|p| p.parent())
113 .map(|p| p.join("poc/data/corpus"));
114 if let Some(path) = manifest_path {
115 if path.exists() {
116 if let Ok(corpus) = Self::from_directory(&path) {
117 if corpus.len() >= 20 {
118 return corpus;
119 }
120 }
121 }
122 }
123
124 Self::inline_corpus()
126 }
127
128 pub fn inline_corpus() -> Self {
131 let topics: &[(&str, &[&str])] = &[
132 ("cell_biology", &[
133 "The cell membrane is a phospholipid bilayer that forms the outer boundary of every living cell. Integral membrane proteins span the bilayer and serve as channels receptors and enzymes. The fluid mosaic model describes the dynamic nature of the membrane where lipids and proteins move laterally within the layer.",
134 "The cytoskeleton provides structural support and facilitates intracellular transport. Microtubules are hollow polymers of tubulin that serve as tracks for motor proteins like kinesin and dynein. Actin filaments form a dense network beneath the plasma membrane called the cell cortex.",
135 "Organelles compartmentalize cellular functions within membrane-bound structures. The endoplasmic reticulum synthesizes proteins and lipids. The Golgi apparatus processes and packages proteins for secretion. Lysosomes contain digestive enzymes that break down cellular waste.",
136 "Cell division occurs through mitosis and meiosis. During mitosis the cell duplicates its chromosomes and divides into two identical daughter cells. The mitotic spindle composed of microtubules attaches to kinetochores on chromosomes to ensure proper segregation.",
137 "Apoptosis is programmed cell death essential for development and tissue homeostasis. Intrinsic apoptosis is triggered by mitochondrial outer membrane permeabilization releasing cytochrome c. Caspase enzymes execute the dismantling of cellular components.",
138 ]),
139 ("molecular_transport", &[
140 "Active transport moves molecules against their concentration gradient using ATP hydrolysis. The sodium potassium pump exchanges three sodium ions outward for two potassium ions inward maintaining the electrochemical gradient.",
141 "Passive transport occurs down the concentration gradient without energy expenditure. Simple diffusion allows small nonpolar molecules like oxygen and carbon dioxide to cross the lipid bilayer. Facilitated diffusion uses channel proteins and carrier proteins.",
142 "Vesicular transport moves large molecules between compartments through membrane budding and fusion. Endocytosis internalizes extracellular material by membrane invagination forming vesicles. Exocytosis releases intracellular contents by vesicle fusion with the plasma membrane.",
143 "Mitochondria produce ATP through oxidative phosphorylation in the electron transport chain. NADH and FADH2 donate electrons to protein complexes embedded in the inner mitochondrial membrane. The proton gradient drives ATP synthase.",
144 "Signal transduction pathways relay extracellular signals to intracellular responses. G-protein coupled receptors activate second messenger cascades involving cyclic AMP and calcium ions. Receptor tyrosine kinases trigger phosphorylation cascades.",
145 ]),
146 ("genetics", &[
147 "DNA replication is semiconservative with each strand serving as a template. DNA helicase unwinds the double helix at the replication fork. DNA polymerase synthesizes new strands in the five prime to three prime direction.",
148 "Transcription converts DNA sequence into messenger RNA through RNA polymerase activity. Promoter regions upstream of genes recruit transcription factors. Introns are spliced out by the spliceosome complex leaving exons joined in mature mRNA.",
149 "Translation occurs at ribosomes where messenger RNA codons are decoded into amino acid sequences. Transfer RNA molecules carry specific amino acids and recognize codons through anticodon base pairing. The ribosome catalyzes peptide bond formation.",
150 "Gene regulation controls when and how much protein is produced from each gene. Transcription factors bind to enhancer and silencer regions to activate or repress gene expression. Epigenetic modifications alter chromatin accessibility.",
151 "CRISPR-Cas9 enables precise genome editing by creating targeted double-strand breaks in DNA. Guide RNA directs the Cas9 nuclease to complementary sequences. Homology-directed repair allows insertion of new genetic material at the cut site.",
152 ]),
153 ("quantum_computing", &[
154 "Quantum bits or qubits exploit superposition to exist in multiple states simultaneously. Unlike classical bits a qubit represents a linear combination of both states with complex probability amplitudes. Measurement collapses the superposition.",
155 "Quantum entanglement creates correlations between qubits that have no classical analogue. Bell states are maximally entangled two-qubit states used in quantum teleportation and superdense coding. Entanglement is a resource consumed by quantum algorithms.",
156 "Quantum gates manipulate qubits through unitary transformations. The Hadamard gate creates superposition from basis states. CNOT gate entangles two qubits and forms a universal gate set when combined with single qubit rotations.",
157 "Shor's algorithm factors large integers in polynomial time using quantum Fourier transform. This threatens RSA encryption which relies on the computational difficulty of integer factorization. Grover's algorithm provides quadratic speedup for unstructured search.",
158 "Quantum error correction protects quantum information from decoherence and gate errors. The surface code encodes logical qubits in two-dimensional arrays of physical qubits. Topological quantum computing uses anyonic braiding for fault-tolerant operations.",
159 ]),
160 ];
161
162 let mut documents = Vec::new();
163 let spacing = 5.0;
164
165 for (topic_idx, (topic, docs)) in topics.iter().enumerate() {
166 for (doc_idx, content) in docs.iter().enumerate() {
167 let title = format!("{}_{:02}", topic, doc_idx + 1);
168 let x = doc_idx as f64 * spacing;
169 let y = topic_idx as f64 * spacing;
170
171 documents.push(CorpusDocument {
172 title,
173 content: content.to_string(),
174 category: Some(topic.to_string()),
175 position: Position::new(x, y),
176 });
177 }
178 }
179
180 Corpus {
181 documents,
182 name: "embedded-20".to_string(),
183 }
184 }
185
186 pub fn len(&self) -> usize {
188 self.documents.len()
189 }
190
191 pub fn is_empty(&self) -> bool {
193 self.documents.is_empty()
194 }
195
196 pub fn ground_truth(&self) -> std::collections::HashMap<String, String> {
199 self.documents
200 .iter()
201 .filter_map(|d| {
202 d.category
203 .as_ref()
204 .map(|c| (d.title.clone(), c.clone()))
205 })
206 .collect()
207 }
208
209 pub fn categories(&self) -> Vec<String> {
211 let mut cats: Vec<String> = self
212 .documents
213 .iter()
214 .filter_map(|d| d.category.clone())
215 .collect::<std::collections::HashSet<_>>()
216 .into_iter()
217 .collect();
218 cats.sort();
219 cats
220 }
221
222 pub fn limit(mut self, max: usize) -> Self {
224 if self.documents.len() <= max {
225 return self;
226 }
227 let cats = self.categories();
228 let per_cat = max / cats.len().max(1);
229 let mut limited = Vec::new();
230 for cat in &cats {
231 let cat_docs: Vec<_> = self.documents.iter()
232 .filter(|d| d.category.as_deref() == Some(cat))
233 .cloned()
234 .collect();
235 limited.extend(cat_docs.into_iter().take(per_cat));
236 }
237 self.documents = limited;
238 self
239 }
240
241 pub fn ingest_into(&self, colony: &mut Colony) {
243 for doc in &self.documents {
244 colony.ingest_document(&doc.title, &doc.content, doc.position);
245 }
246 }
247}
248
#[cfg(test)]
mod tests {
    use super::*;

    /// Whatever source `from_embedded` picks, it must yield at least 20 documents.
    #[test]
    fn embedded_corpus_has_at_least_20_documents() {
        let doc_count = Corpus::from_embedded().len();
        assert!(doc_count >= 20, "corpus has {} docs, expected >= 20", doc_count);
    }

    /// The default corpus exposes exactly four categories, including the two
    /// spot-checked here.
    #[test]
    fn embedded_corpus_has_4_categories() {
        let cats = Corpus::from_embedded().categories();
        assert_eq!(cats.len(), 4);
        assert!(cats.iter().any(|c| c == "cell_biology"));
        assert!(cats.iter().any(|c| c == "quantum_computing"));
    }

    /// Ground truth must contain an entry for each of the (at least 20) documents.
    #[test]
    fn ground_truth_maps_all_documents() {
        let entries = Corpus::from_embedded().ground_truth().len();
        assert!(entries >= 20, "ground truth has {} entries, expected >= 20", entries);
    }

    /// The compiled-in fallback always holds exactly 20 documents.
    #[test]
    fn inline_corpus_has_20_documents() {
        assert_eq!(Corpus::inline_corpus().len(), 20);
    }

    /// Loads the workspace corpus from disk when it is present; the test is a
    /// no-op when the directory does not exist.
    #[test]
    fn from_directory_loads_txt_files() {
        // `ancestors().nth(2)` is the manifest directory's grandparent.
        let corpus_dir = Path::new(env!("CARGO_MANIFEST_DIR"))
            .ancestors()
            .nth(2)
            .unwrap()
            .join("poc/data/corpus");
        if !corpus_dir.exists() {
            return;
        }
        let corpus = Corpus::from_directory(&corpus_dir).unwrap();
        assert!(corpus.len() >= 20, "directory corpus has {} docs", corpus.len());
    }
}