1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
//! Dataset trait and batch type for training data pipelines.
use crateResult;
use Runtime;
use Tensor;
/// A batch of training data: input tensors and corresponding targets.
///
/// For language modeling: `inputs` and `targets` are both `[batch_size, seq_len]`
/// tensors of token IDs, where targets are inputs shifted by one position.
///
/// For other tasks, shapes depend on the domain.
/// Trait for indexable datasets that produce batches.
///
/// Datasets must return CPU-resident tensors. GPU transfer (if needed)
/// is handled by the caller after batching — one bulk transfer per batch
/// is far more efficient than per-sample transfers.
///
/// # Example
///
/// ```ignore
/// struct TokenDataset { tokens: Vec<u32>, seq_len: usize }
///
/// impl Dataset<CpuRuntime> for TokenDataset {
///     fn len(&self) -> usize {
///         self.tokens.len().saturating_sub(1) / self.seq_len
///     }
///     fn get(&self, idx: usize, device: &CpuDevice) -> Result<Batch<CpuRuntime>> {
///         let start = idx * self.seq_len;
///         let inputs = Tensor::from_slice(&self.tokens[start..start + self.seq_len], &[self.seq_len], device);
///         let targets = Tensor::from_slice(&self.tokens[start + 1..start + self.seq_len + 1], &[self.seq_len], device);
///         Ok(Batch { inputs, targets })
///     }
/// }
/// ```