# data_generator 0.1.119
#
# RDF data shapes implementation in Rust
# Documentation
# Ready-to-use configuration for RDF data generation
# Copy this file and modify as needed

[generation]
# Total number of entities to generate.
entity_count = 1_000
# Fixed seed so that results are reproducible.
seed = 42
# Distribute entities equally across shapes.
entity_distribution = "Equal"
# Maximum cardinality yields more triples per entity.
cardinality_strategy = "Maximum"

# Ways to increase the triple count without adding entities:
#   1. Use the "Maximum" cardinality strategy (generates max possible property values).
#   2. Design schemas with higher cardinalities (e.g., :knows @:User {1,10}).
#   3. Add more optional properties with * or + cardinalities.
#   4. Use a weighted distribution to favor shapes with more properties.

[field_generators.default]
# Generated text is English.
locale = "en"
# Medium data quality.
quality = "Medium"

# Configure specific data types
[field_generators.datatypes."http://www.w3.org/2001/XMLSchema#integer"]
generator = "integer"
# Generator parameters, written as dotted keys of the `parameters` sub-table.
parameters.min = 1
parameters.max = 10_000

[field_generators.datatypes."http://www.w3.org/2001/XMLSchema#decimal"]
generator = "decimal"
# Generator parameters, written as dotted keys of the `parameters` sub-table.
parameters.min = 0.0
parameters.max = 1_000.0
# NOTE(review): presumably the number of digits after the decimal point — confirm.
parameters.precision = 2

[field_generators.datatypes."http://www.w3.org/2001/XMLSchema#date"]
generator = "date"
# Year range for generated dates, written as dotted keys of the `parameters` sub-table.
parameters.start_year = 1980
parameters.end_year = 2024

# Configure specific properties
[field_generators.properties."http://example.org/name"]
generator = "string"

# No generator parameters are set for this property (empty table).
[field_generators.properties."http://example.org/name".parameters]

[field_generators.properties."http://example.org/legalName"]
generator = "string"

# No generator parameters are set for this property (empty table).
[field_generators.properties."http://example.org/legalName".parameters]

# Pattern-based field generation - NEW FEATURE!
# Patterns from ShEx/SHACL schemas are now supported via the pattern generator
# Use the "pattern" generator for regex-based string generation

# Phone number with pattern
[field_generators.properties."http://example.org/phone"]
generator = "pattern"

[field_generators.properties."http://example.org/phone".parameters]
# Literal (single-quoted) string, so the regex backslashes need no TOML escaping.
# Example match: 123-456-7890
pattern = '\d{3}-\d{3}-\d{4}'

# Email with pattern
[field_generators.properties."http://example.org/email"]
generator = "pattern"

[field_generators.properties."http://example.org/email".parameters]
# Literal (single-quoted) string, so the regex backslash needs no TOML escaping.
# Shape: <local part>@<domain>.<two or more letters>
pattern = '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

# ID number with pattern
[field_generators.properties."http://example.org/studentId"]
generator = "pattern"

[field_generators.properties."http://example.org/studentId".parameters]
# Literal (single-quoted) string, so the regex backslash needs no TOML escaping.
# Two or three uppercase letters followed by four to six digits,
# e.g. AB1234 or XYZ123456.
pattern = '[A-Z]{2,3}\d{4,6}'

# US phone number with country code
[field_generators.properties."http://example.org/usPhone"]
generator = "pattern"

[field_generators.properties."http://example.org/usPhone".parameters]
# Literal (single-quoted) string, so the regex backslashes need no TOML escaping.
# `\+` matches a literal plus sign. Example match: +1-555-123-4567
pattern = '\+1-\d{3}-\d{3}-\d{4}'

# Date pattern
[field_generators.properties."http://example.org/dateString"]
generator = "pattern"

[field_generators.properties."http://example.org/dateString".parameters]
# Literal (single-quoted) string, so the regex backslashes need no TOML escaping.
# Example match: 2023-12-25. The pattern only fixes the digit layout; it does
# not restrict month or day ranges, so strings such as 2023-99-99 also match.
pattern = '\d{4}-\d{2}-\d{2}'

# URL pattern
[field_generators.properties."http://example.org/website"]
generator = "pattern"

[field_generators.properties."http://example.org/website".parameters]
# Literal (single-quoted) string, so the regex backslash needs no TOML escaping.
# Matches http:// or https:// followed by a host ending in a dot and two or more letters.
pattern = 'https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

# IP address pattern
[field_generators.properties."http://example.org/ipAddress"]
generator = "pattern"

[field_generators.properties."http://example.org/ipAddress".parameters]
# Literal (single-quoted) string, so the regex backslashes need no TOML escaping.
# Four dot-separated groups of one to three digits. Each group can be anything
# from 0 to 999, so matches are IPv4-shaped but not necessarily valid addresses.
pattern = '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'

# Note: The pattern generator also provides heuristic-based generation
# when no pattern is specified but the property name suggests a type:
# - Properties containing "phone" or "tel" → phone number format
# - Properties containing "email" → email format
# - Properties containing "url" or "website" → URL format
# - Properties containing "id" → ID format

[output]
# Output file name.
path = "generated_data.ttl"
# Write the data in Turtle format.
format = "Turtle"
# Leave the output uncompressed.
compress = false
# Include generation statistics.
write_stats = true
# Parallel writing is disabled, so a single file is used.
parallel_writing = false
# Number of parallel files (only applies when parallel writing is enabled).
parallel_file_count = 4

[parallel]
# Number of parallel threads to use.
worker_threads = 4
# Entities are processed in batches of this size.
batch_size = 100
# Process shapes in parallel.
parallel_shapes = true
# Generate fields in parallel.
parallel_fields = true