# triplets 0.17.4-alpha
#
# Composable data sampling primitives for deterministic multi-source ML/AI training-data orchestration.
# Documentation
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

# Crate identity and registry metadata.
[package]
name = "triplets"
version = "0.17.4-alpha"
authors = ["Jeremy Harris <jeremy.harris@zenosmosis.com>"]
# Target auto-discovery is switched off for every target kind; the lib,
# example, and test targets are each declared explicitly in this manifest.
autobenches = false
autobins = false
autoexamples = false
autolib = false
autotests = false
# Build-script detection is disabled.
build = false
categories = [
    "algorithms",
    "artificial-intelligence",
    "science",
    "text-processing",
]
# Edition 2024 requires Rust 1.85 or newer.
edition = "2024"
keywords = [
    "train-test-split",
    "triplet-mining",
    "dataset-sampling",
    "training-data",
    "bm25",
]
# Dual-licensed: users may choose either license.
license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/jzombie/rust-triplets"
description = "Composable data sampling primitives for deterministic multi-source ML/AI training-data orchestration."

# Opt-in capabilities. Nothing is enabled by default.
[features]
default = []
# Pulls in the optional `bm25` dependency.
bm25-mining = ["dep:bm25"]
# Enables no extra dependencies.
# NOTE(review): presumably gates code via `cfg(feature = "extended-metrics")` — confirm in src.
extended-metrics = []
# Pulls in every optional dependency listed here as one unit.
huggingface = ["dep:serde_json", "dep:hf-hub", "dep:parquet", "dep:reqwest", "dep:tokio"]

# The library target. It is declared explicitly because `autolib = false`
# is set in [package].
[lib]
name = "triplets"
path = "src/lib.rs"

# Example targets. Each one is listed explicitly because `autoexamples = false`
# is set in [package].
[[example]]
name = "estimate_capacity"
path = "examples/estimate_capacity.rs"

# NOTE(review): no `required-features` is declared for the two `hf_*` examples.
# If they depend on the `huggingface` feature, building all examples without
# it may fail — confirm against the example sources.
[[example]]
name = "hf_source_list_demo"
path = "examples/hf_source_list_demo.rs"

[[example]]
name = "hf_text_latency_temp"
path = "examples/hf_text_latency_temp.rs"

[[example]]
name = "multi_source_demo"
path = "examples/multi_source_demo.rs"

# Integration-test targets. Each one is listed explicitly because
# `autotests = false` is set in [package].
[[test]]
name = "anchor_positive_swap"
path = "tests/anchor_positive_swap.rs"

# NOTE(review): no `required-features` is declared here. Presumably the test
# file gates itself on the `huggingface` feature — confirm against the source.
[[test]]
name = "huggingface_integration"
path = "tests/huggingface_integration.rs"

[[test]]
name = "ingestion_buffering"
path = "tests/ingestion_buffering.rs"

[[test]]
name = "invariants"
path = "tests/invariants.rs"

[[test]]
name = "prefetcher"
path = "tests/prefetcher.rs"

[[test]]
name = "shuffled_batch_size"
path = "tests/shuffled_batch_size.rs"

[[test]]
name = "shuffled_persistence"
path = "tests/shuffled_persistence.rs"

[[test]]
name = "split_persistence_file_stream"
path = "tests/split_persistence_file_stream.rs"

[[test]]
name = "triplet_diversity"
path = "tests/triplet_diversity.rs"

[dependencies]
# Always-on dependencies.
bitcode = { version = "0.6.9" }
cache-manager = { version = "0.4.0" }
chrono = { version = "0.4.44", features = ["serde"] }
clap = { version = "4.6.0", features = ["derive"] }
csv = { version = "1.4.0" }
indexmap = { version = "2.13.0" }
line-ending = { version = "1.5.1" }
rand = { version = "0.9.2", features = ["std", "small_rng"] }
rayon = { version = "1.11.0" }
serde = { version = "1.0.228", features = ["derive"] }
simd-r-drive = { version = "0.16.0-alpha" }
tempfile = { version = "3.27.0" }
thiserror = { version = "2.0.18" }
tracing = { version = "0.1.44" }
tracing-subscriber = { version = "0.3.23", features = ["env-filter"] }
walkdir = { version = "2.5.0" }

# Optional; activated by the `bm25-mining` feature.
bm25 = { version = "2.3.2", features = ["parallelism"], optional = true }

# Optional; activated together by the `huggingface` feature.
# `hf-hub` and `reqwest` turn their default features off and select
# `rustls-tls` explicitly.
hf-hub = { version = "0.5.0", default-features = false, features = ["tokio", "rustls-tls"], optional = true }
parquet = { version = "58.1.0", features = ["json"], optional = true }
reqwest = { version = "0.12.28", default-features = false, features = ["rustls-tls", "stream", "json"], optional = true }
serde_json = { version = "1.0.149", optional = true }
tokio = { version = "1.50.0", features = ["rt", "fs", "io-util"], optional = true }