#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arrow-glib/arrow-glib.h>
#ifndef RYPE_ARROW
#define RYPE_ARROW
#endif
#include "../rype.h"
/* Sample input sequences (strings over A/C/G/T) used by this example. */
static const char *SEQUENCES[] = {
    "AAAAACCCCCAAAAACCCCCAAAAACCCCCAAAAACCCCCAAAAACCCCCAAAAACCCCCAAAAACCCCC",
    "AAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGGCCCC",
};

/* Number of entries in SEQUENCES, computed from the table itself. */
static const int NUM_SEQUENCES = (int)(sizeof(SEQUENCES) / sizeof(SEQUENCES[0]));
/**
 * build_input_batch:
 * @error: return location for a GError; set when any Arrow GLib call fails.
 *
 * Builds a two-column record batch from SEQUENCES:
 *   - "id"       (int64):  1..NUM_SEQUENCES
 *   - "sequence" (binary): the bytes of each string, without the NUL terminator
 *
 * Returns: a new record batch owned by the caller (drop it with
 * g_object_unref()), or NULL on failure.
 */
static GArrowRecordBatch *
build_input_batch(GError **error)
{
    /* Everything starts out NULL so the single cleanup path below can run
     * no matter where a failure happens. */
    GArrowInt64ArrayBuilder *id_builder = NULL;
    GArrowBinaryArrayBuilder *seq_builder = NULL;
    GArrowArray *id_array = NULL;
    GArrowArray *seq_array = NULL;
    GArrowDataType *id_type = NULL;
    GArrowDataType *seq_type = NULL;
    GArrowField *id_field = NULL;
    GArrowField *seq_field = NULL;
    GArrowSchema *schema = NULL;
    GList *fields = NULL;
    GList *columns = NULL;
    GArrowRecordBatch *batch = NULL;

    /* "id" column: 1-based identifiers. */
    id_builder = garrow_int64_array_builder_new();
    for (int i = 0; i < NUM_SEQUENCES; i++) {
        if (!garrow_int64_array_builder_append_value(id_builder, i + 1, error)) {
            goto cleanup;
        }
    }
    id_array = garrow_array_builder_finish(GARROW_ARRAY_BUILDER(id_builder), error);
    if (!id_array) {
        goto cleanup;
    }

    /* "sequence" column: raw sequence bytes. */
    seq_builder = garrow_binary_array_builder_new();
    for (int i = 0; i < NUM_SEQUENCES; i++) {
        gint32 len = (gint32)strlen(SEQUENCES[i]);
        if (!garrow_binary_array_builder_append_value(
                seq_builder, (const guint8 *)SEQUENCES[i], len, error)) {
            goto cleanup;
        }
    }
    seq_array = garrow_array_builder_finish(GARROW_ARRAY_BUILDER(seq_builder), error);
    if (!seq_array) {
        goto cleanup;
    }

    /* Keep our own handles on the data types: the field holds its own
     * reference, so ours must be dropped in cleanup or it leaks. */
    id_type = GARROW_DATA_TYPE(garrow_int64_data_type_new());
    seq_type = GARROW_DATA_TYPE(garrow_binary_data_type_new());
    id_field = garrow_field_new("id", id_type);
    seq_field = garrow_field_new("sequence", seq_type);

    fields = g_list_append(fields, id_field);
    fields = g_list_append(fields, seq_field);
    schema = garrow_schema_new(fields);

    columns = g_list_append(columns, id_array);
    columns = g_list_append(columns, seq_array);
    batch = garrow_record_batch_new(schema, NUM_SEQUENCES, columns, error);

cleanup:
    /* The lists only borrow their elements; g_list_free(NULL) is a no-op. */
    g_list_free(columns);
    g_list_free(fields);
    g_clear_object(&schema);
    g_clear_object(&seq_field);
    g_clear_object(&id_field);
    g_clear_object(&seq_type);
    g_clear_object(&id_type);
    g_clear_object(&seq_array);
    g_clear_object(&id_array);
    g_clear_object(&seq_builder);
    g_clear_object(&id_builder);
    return batch;
}
/**
 * export_batch_as_stream:
 * @batch: record batch to expose as a one-batch stream.
 * @error: return location for a GError.
 *
 * Wraps @batch in a record batch reader that uses the batch's own schema,
 * then exports that reader through garrow_record_batch_reader_export().
 *
 * Returns: the pointer produced by the export call, or NULL if creating the
 * reader or exporting it failed.
 * NOTE(review): Arrow GLib documents the exported struct as caller-owned
 * (release callback, then g_free()) - confirm against the installed version.
 */
static struct ArrowArrayStream *
export_batch_as_stream(GArrowRecordBatch *batch, GError **error)
{
    struct ArrowArrayStream *exported = NULL;
    GArrowSchema *batch_schema = garrow_record_batch_get_schema(batch);
    GList *single_batch = g_list_append(NULL, batch);

    GArrowRecordBatchReader *batch_reader =
        garrow_record_batch_reader_new(single_batch, batch_schema, error);

    /* The list and the schema handle are not used again after this point. */
    g_list_free(single_batch);
    g_object_unref(batch_schema);

    if (batch_reader) {
        exported = (struct ArrowArrayStream *)
            garrow_record_batch_reader_export(batch_reader, error);
        g_object_unref(batch_reader);
    }

    return exported;
}
/**
 * print_uint64_list_column:
 * @batch: record batch containing the column.
 * @col_idx: index of the column to print.
 * @label: text printed next to every row (callers pass the column name).
 *
 * Prints one line per row of a list<uint64> column: the row number, the
 * number of values in that row's list, and at most the first five values
 * in hexadecimal, followed by " ..." when the list is longer.
 *
 * Columns or rows that are not of the expected Arrow type are reported on
 * stderr and skipped instead of being cast blindly.
 */
static void
print_uint64_list_column(GArrowRecordBatch *batch, guint col_idx, const char *label)
{
    enum { MAX_PRINTED_VALUES = 5 };

    GArrowArray *array = garrow_record_batch_get_column_data(batch, col_idx);
    if (!array) {
        g_printerr(" column %u (%s): not available, skipped\n", col_idx, label);
        return;
    }
    if (!GARROW_IS_LIST_ARRAY(array)) {
        g_printerr(" column %u (%s): not a list column, skipped\n", col_idx, label);
        g_object_unref(array);
        return;
    }

    GArrowListArray *list_array = GARROW_LIST_ARRAY(array);
    gint64 n_rows = garrow_array_get_length(array);

    for (gint64 row = 0; row < n_rows; row++) {
        GArrowArray *values = garrow_list_array_get_value(list_array, row);

        /* The type-check macro is also false for NULL. */
        if (!GARROW_IS_UINT64_ARRAY(values)) {
            g_printerr(" row %" G_GINT64_FORMAT " %s: values are not uint64, skipped\n",
                       row, label);
            if (values) {
                g_object_unref(values);
            }
            continue;
        }

        GArrowUInt64Array *u64_values = GARROW_UINT64_ARRAY(values);
        gint64 len = garrow_array_get_length(values);

        /* GLib's 64-bit format macros avoid truncation where long is 32 bits. */
        g_print(" row %" G_GINT64_FORMAT " %s (%" G_GINT64_FORMAT " values):",
                row, label, len);

        gint64 print_count = len < MAX_PRINTED_VALUES ? len : MAX_PRINTED_VALUES;
        for (gint64 i = 0; i < print_count; i++) {
            g_print(" 0x%016" G_GINT64_MODIFIER "x",
                    garrow_uint64_array_get_value(u64_values, i));
        }
        if (len > print_count) {
            g_print(" ...");
        }
        g_print("\n");

        g_object_unref(values);
    }

    g_object_unref(array);
}
static int
read_and_print_results(struct ArrowArrayStream *out_stream, const char *title)
{
GError *error = NULL;
GArrowRecordBatchReader *reader =
garrow_record_batch_reader_import(out_stream, &error);
if (!reader) {
g_printerr("Failed to import output stream: %s\n", error->message);
g_error_free(error);
return -1;
}
GArrowSchema *schema = garrow_record_batch_reader_get_schema(reader);
gchar *schema_str = garrow_schema_to_string(schema);
g_print("\n=== %s ===\n", title);
g_print("Schema:\n%s\n", schema_str);
g_free(schema_str);
g_object_unref(schema);
int total_rows = 0;
while (TRUE) {
GArrowRecordBatch *batch =
garrow_record_batch_reader_read_next(reader, &error);
if (error) {
g_printerr("Failed to read batch: %s\n", error->message);
g_error_free(error);
g_object_unref(reader);
return -1;
}
if (!batch)
break;
gint64 n_rows = garrow_record_batch_get_n_rows(batch);
guint n_cols = garrow_record_batch_get_n_columns(batch);
g_print("\nBatch: %ld rows, %u columns\n", (long)n_rows, n_cols);
for (guint col = 1; col < n_cols; col++) {
const gchar *name = garrow_record_batch_get_column_name(batch, col);
print_uint64_list_column(batch, col, name);
}
total_rows += (int)n_rows;
g_object_unref(batch);
}
g_print("Total rows: %d\n", total_rows);
g_object_unref(reader);
return 0;
}
int
main(void)
{
GError *error = NULL;
g_print("Rype Arrow Extraction Example\n");
g_print("Sequences: %d\n", NUM_SEQUENCES);
for (int i = 0; i < NUM_SEQUENCES; i++) {
g_print(" [%d] len=%zu: %.40s%s\n", i + 1, strlen(SEQUENCES[i]),
SEQUENCES[i], strlen(SEQUENCES[i]) > 40 ? "..." : "");
}
size_t k = 16;
size_t w = 5;
uint64_t salt = 0;
g_print("Parameters: k=%zu, w=%zu, salt=0x%016lx\n", k, w, (unsigned long)salt);
{
GArrowRecordBatch *input_batch = build_input_batch(&error);
if (!input_batch) {
g_printerr("Failed to build input batch: %s\n", error->message);
g_error_free(error);
return EXIT_FAILURE;
}
struct ArrowArrayStream *input_stream =
export_batch_as_stream(input_batch, &error);
g_object_unref(input_batch);
if (!input_stream) {
g_printerr("Failed to export input stream: %s\n", error->message);
g_error_free(error);
return EXIT_FAILURE;
}
struct ArrowArrayStream out_stream;
memset(&out_stream, 0, sizeof(out_stream));
int rc = rype_extract_minimizer_set_arrow(input_stream, k, w, salt, &out_stream);
if (rc != 0) {
g_printerr("rype_extract_minimizer_set_arrow failed: %s\n",
rype_get_last_error());
return EXIT_FAILURE;
}
if (read_and_print_results(&out_stream, "Minimizer Set Extraction") != 0) {
return EXIT_FAILURE;
}
}
{
GArrowRecordBatch *input_batch = build_input_batch(&error);
if (!input_batch) {
g_printerr("Failed to build input batch: %s\n", error->message);
g_error_free(error);
return EXIT_FAILURE;
}
struct ArrowArrayStream *input_stream =
export_batch_as_stream(input_batch, &error);
g_object_unref(input_batch);
if (!input_stream) {
g_printerr("Failed to export input stream: %s\n", error->message);
g_error_free(error);
return EXIT_FAILURE;
}
struct ArrowArrayStream out_stream;
memset(&out_stream, 0, sizeof(out_stream));
int rc = rype_extract_strand_minimizers_arrow(input_stream, k, w, salt, &out_stream);
if (rc != 0) {
g_printerr("rype_extract_strand_minimizers_arrow failed: %s\n",
rype_get_last_error());
return EXIT_FAILURE;
}
if (read_and_print_results(&out_stream, "Strand Minimizers Extraction") != 0) {
return EXIT_FAILURE;
}
}
g_print("\nDone.\n");
return EXIT_SUCCESS;
}