seq_io 0.3.4

Fast FASTA and FASTQ readers
Documentation
library(ggplot2)
library(cowplot)

criterion_dir <- commandArgs(TRUE)[1]

sessionInfo()

#### read data ####

d <- lapply(list.dirs(criterion_dir, F, F), function(dir) {
  read.csv(file.path(criterion_dir, dir, 'new', 'raw.csv'))
})
d <- do.call(rbind, d)

s <- strsplit(as.character(d$group), ' ')
cfg <- as.data.frame(t(as.data.frame(lapply(s, '[', 1:5))))
names(cfg) <- c('format', 'reader', 'seqlen', 'config', 'data_len')
cfg$data_len <- as.numeric(as.character(cfg$data_len))
cfg$seqlen <- as.numeric(as.character(cfg$seqlen))
cfg$cap_test <- grepl('_cap', as.character(cfg$reader))

d <- cbind(d, cfg)

d$gb_per_s <- (as.numeric(d$data_len) / 1e9) / (d$sample_measured_value / 1e9) * d$iteration_count

d <- subset(d, !is.na(data_len))


#### setup output directory ####

outdir <- file.path(dirname(dirname(criterion_dir)), 'bench_results')
dir.create(outdir, F)


#### comparison of readers ####

reader_plot <- function(data, facets) {
  ggplot(data, aes(reader, gb_per_s, fill=reader)) +
    stat_summary(fun=mean, geom='bar', width=1, colour='#222222', linewidth=0.2) +
    stat_summary(fun.data=mean_se, geom = 'errorbar', width=0.2, alpha=0.5) +
    facet_grid(as.formula(paste(paste(facets, collapse= "+"), " ~ .")),
               space='free_y', scale='free_y') +
    scale_fill_grey(start=0.05, end=0.95) +
    coord_flip(expand=F, ylim=c(0, max(data$gb_per_s, na.rm=T))) +
    labs(x='reader', y='GB/s') +
    theme_bw() +
    theme(
      strip.text.y = element_text(angle=0),
      strip.background = element_rect(fill='white', colour='gray50'),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.title = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      panel.spacing = unit(-.5, 'pt')
    )
}

reader_cmp <- subset(d, !cap_test)

full <- lapply(split(reader_cmp, reader_cmp$format), function(data) {
    reader_plot(data, c('seqlen', 'config'))
})

reader_cmp_filter <- subset(reader_cmp, seqlen == 500 & !grepl('(records|seq|iter)', config, perl=T))
simple <- lapply(split(reader_cmp_filter, reader_cmp_filter$format), function(data) {
    reader_plot(data, 'config')
})

no_legend <- theme(legend.position='none')

png(file.path(outdir, 'reader_comparison.png'), width=1400, height=700, res=150)
plot_grid(NULL, NULL,
          full$fasta + no_legend, full$fastq,
          rel_widths=c(10, 13), rel_heights=c(1, 12),
          labels=c('FASTA', 'FASTQ'), label_size=12)
dev.off()

png(file.path(outdir, 'reader_comparison_simple.png'), width=1400, height=250, res=150)
plot_grid(NULL, NULL,
          simple$fasta + no_legend, simple$fastq,
          rel_widths=c(10, 13), rel_heights=c(2, 12),
          labels=c('FASTA', 'FASTQ'), label_size=12)
dev.off()


for (fmt in levels(reader_cmp$format)) {
  data <- subset(reader_cmp, format == fmt & !is.na(data_len))
  png(file.path(outdir, sprintf('bench_%s.png', fmt)), width=1400, height=30*length(unique(data$group))+150, res=200)
  print(reader_plot(data, c('seqlen', 'config')))
  dev.off()

  sub <- subset(data, seqlen == 500 & !grepl('(records|seq|iter)', config, perl=T))
  png(file.path(outdir, sprintf('bench_%s_simple.png', fmt)), width=1400, height=30*length(unique(sub$group))+150, res=200)
  print(reader_plot(sub, c('config')))
  dev.off()
}


#### test different buffer capacities ####

cap_cmp <- subset(d, cap_test)

png(file.path(outdir, 'bench_cap.png'), width=1400, height=1000, res=200)
cap_cmp$bufsize <- as.numeric(gsub('([0-9]+)ki', '\\1', cap_cmp$config))
ggplot(cap_cmp, aes(bufsize, gb_per_s, color=as.factor(seqlen), linetype=format)) +
    stat_summary(fun=mean, geom='point') +
    stat_summary(fun=mean, geom='line') +
    stat_summary(fun.data=mean_se, geom = 'errorbar', width=0.1, alpha=0.5) +
    expand_limits(y=0) +
    scale_x_continuous(trans='log1p', breaks=2^(0:25)) +
    labs(x='Buffer size (KiB)', y='GB/s', color='Sequence length', linetype='Format') +
    scale_color_brewer(palette='Set1') +
    theme_bw()
dev.off()