# Crate metadata for the `brainwires-datasets` package.
[package]
name = "brainwires-datasets"
version = "0.6.0"
authors = ["Brainwires"]
# Target auto-discovery is disabled for every target kind, so targets have to
# be declared explicitly in this manifest.
autobenches = false
autobins = false
autoexamples = false
autolib = false
autotests = false
# No build script.
build = false
categories = ["development-tools"]
documentation = "https://docs.rs/brainwires-datasets"
edition = "2024"
homepage = "https://github.com/Brainwires/brainwires-framework"
keywords = ["datasets", "training-data", "tokenization", "jsonl", "deduplication"]
license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/Brainwires/brainwires-framework"
resolver = "2"
# Minimum supported Rust toolchain.
rust-version = "1.91"
description = "Training data pipelines for the Brainwires Agent Framework — JSONL I/O, tokenization, deduplication, format conversion"
# Cargo features. Each leaf feature only switches on optional dependencies
# through the `dep:` syntax.
[features]
default = ["hf-tokenizer"]
# Umbrella feature: enables all three leaf features below.
full = ["hf-tokenizer", "tiktoken", "dedup"]
dedup = ["dep:sha2", "dep:rand"]
hf-tokenizer = ["dep:tokenizers"]
tiktoken = ["dep:tiktoken-rs"]
# Library target, declared explicitly because `autolib = false` in [package].
[lib]
path = "src/lib.rs"
name = "brainwires_datasets"
# Runtime dependencies. Entries marked `optional = true` are only compiled
# when a feature in [features] enables them via `dep:<name>`.
[dependencies]
anyhow = "1"
brainwires-core = "0.6.0"
# Enabled by the `dedup` feature.
rand = { version = "0.10", optional = true }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
# Enabled by the `dedup` feature.
sha2 = { version = "0.10", optional = true }
thiserror = "2"
# Enabled by the `tiktoken` feature.
tiktoken-rs = { version = "0.9", optional = true }
# Enabled by the `hf-tokenizer` feature (part of `default`).
tokenizers = { version = "0.22", optional = true }
tracing = "0.1"
uuid = { version = "1", features = ["v4", "serde"] }
# Dependencies used only by tests, examples and benches.
# serde_json is not repeated here: it is already a regular dependency with the
# same version requirement, and regular dependencies are available to tests.
[dev-dependencies]
tempfile = "3"
# tokio's "full" feature already enables "macros" and "rt-multi-thread", so
# they are not listed separately.
tokio = { version = "1.43", features = ["full"] }