Skip to content

Commit

Permalink
support gzip format
Browse files Browse the repository at this point in the history
  • Loading branch information
y9c committed Apr 7, 2020
1 parent 117a074 commit 6177ae2
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ build/

# Directory created by dartdoc
doc/api/

# Benchmark test
benchmark/
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.0.4

- support reading gzip-compressed (`.gz`) input files

## 0.0.3

- filter sequence by motif (`--filter-motif`)
Expand Down
14 changes: 14 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#
# Makefile
# Ye Chang, 2020-04-07 21:23
#

# Neither target produces a file with its own name, so mark them phony;
# otherwise a stray file called `all` or `binary` would make them look
# up to date and the build would be skipped.
.PHONY: all binary

# Default target: build the native executable, then report success.
all: binary
	@echo "Done!"

# Compile bin/main.dart into a native executable at build/bio.
# The output directory is created first because build/ is git-ignored and
# may not exist in a fresh checkout. Only stdout is discarded: stderr stays
# visible so a failed compile reports its cause.
binary:
	@echo "Building binary by dart2native..."
	@mkdir -p build
	@dart2native bin/main.dart -o build/bio 1>/dev/null

# vim:ft=make
#
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,9 @@ Functions and command line tools for biological computation written in **Dart**.

## BENCHMARK

vs. biopython
vs. seqkit
| tool | test IO time | test RC time |
| --------------- | -----------: | -----------: |
| biodart | 6.678 | 15.533 |
| seqkit (Golang) | 0.996 | 0.879 |
| seqtk (C) | 0.849 | 0.854 |

5 changes: 4 additions & 1 deletion TODO.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
> seqIO
- [x] support stdin and stdout
- [ ] support gzip format as output
- [ ] improve speed
- [ ] split record name and record description
- [ ] support multi-line fastq?
- [ ] trim low-quality bases
- [ ] population genetic calculator
- [ ] support degenerate bases matching
- [x] support stdin and stdout
- [x] support gzip format as input

> alignIO
Expand Down
6 changes: 6 additions & 0 deletions bin/main.dart
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class SeqCommand extends Command {
outputFile: argResults['output'],
inputFormat: argResults['input-format'],
outputFormat: argResults['output-format'],
inputCompressed: argResults['input-compressed'],
outputCompressed: argResults['output-compressed'],
fastaLineLength: int.parse(argResults['fasta-line-length']),
sample: int.parse(argResults['sample']),
randomSeed: int.tryParse(argResults['sample-seed']),
Expand All @@ -60,8 +62,12 @@ class SeqCommand extends Command {
abbr: 'o', defaultsTo: '-', valueHelp: 'Path of output file')
..addOption('input-format',
abbr: 's', help: 'Format of input file', valueHelp: 'auto')
..addOption('input-compressed',
help: 'Whether input file is in .gz format', valueHelp: 'auto')
..addOption('output-format',
abbr: 't', help: 'Format of output file', valueHelp: 'auto')
..addOption('output-compressed',
help: 'Whether output file is in .gz format', valueHelp: 'auto')
..addOption('fasta-line-length',
abbr: 'l', defaultsTo: '0', help: 'Number of charaters in each line')
..addOption('sample',
Expand Down
31 changes: 26 additions & 5 deletions lib/seq.dart
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import 'dart:io' show File, exit;
import 'dart:io' show File, exit, gzip;
import 'dart:convert' show LineSplitter, utf8;
import 'dart:math' show Random, min;
import 'package:bio/utils.dart' as utils;
Expand Down Expand Up @@ -117,9 +117,18 @@ Stream<Seq> _fqStringStream2SeqStream(Stream<String> lines) async* {
}

/// Read file as Stream<Seq>
Stream<Seq> file2Stream(File file, String format) {
Stream<Seq> file2Stream(File file, String format, {bool isGzip = false}) {
// Read file as Stream<String>
var lines = file.openRead().transform(utf8.decoder).transform(LineSplitter());
Stream<String> lines;
if (isGzip) {
lines = file
.openRead()
.transform(gzip.decoder)
.transform(utf8.decoder)
.transform(LineSplitter());
} else {
lines = file.openRead().transform(utf8.decoder).transform(LineSplitter());
}
// parse Stream<String> into Stream<Seq>
Stream<Seq> s;
if (format == 'fa') {
Expand Down Expand Up @@ -175,8 +184,10 @@ int countFq(File file) {
void seqIO(
{String inputFile,
String outputFile,
bool inputCompressed,
String inputFormat,
String outputFormat,
bool outputCompressed,
int fastaLineLength,
int sample = 0,
int randomSeed,
Expand Down Expand Up @@ -215,13 +226,23 @@ void seqIO(
'fastq': 'fq',
'fq': 'fq'
};
inputFormat ??= inputFile.split('.').last;

bool isGzip;
if (inputFile.split('.').last == 'gz') {
isGzip = true;
var nameSplited = inputFile.split('.');
inputFormat ??= nameSplited[nameSplited.length - 2];
} else {
isGzip = false;
inputFormat ??= inputFile.split('.').last;
}
if (supportedFormats.containsKey(inputFormat)) {
inputFormat = supportedFormats[inputFormat];
} else {
log.warning('${inputFormat} format is not supported!');
exit(1);
}

outputFormat ??= outputFile.split('.').last;
if (supportedFormats.containsKey(outputFormat)) {
outputFormat = supportedFormats[outputFormat];
Expand All @@ -231,7 +252,7 @@ void seqIO(
}

// Read file as Stream<Seq>
var inputStream = file2Stream(infile, inputFormat);
var inputStream = file2Stream(infile, inputFormat, isGzip: isGzip);

// var castStream = inputStream.asBroadcastStream();
// castStream.length.then((value) => print("stream.length: $value"));
Expand Down
2 changes: 1 addition & 1 deletion pubspec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: bio
description: "Some useful command line utils written in Dart. For parsing and manipulating sequence records in various formats..."
version: 0.0.3
version: 0.0.4
homepage: https://github.com/yech1990/biodart
documentation: https://pub.dev/documentation/bio

Expand Down

0 comments on commit 6177ae2

Please sign in to comment.