Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

--with-missing-fields: pad AD & PL to expected vector length #29

Merged
merged 3 commits into from
Oct 2, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 67 additions & 24 deletions src/spVCF.cc
Original file line number Diff line number Diff line change
Expand Up @@ -532,13 +532,20 @@ class DecoderImpl : public TranscoderBase {
const char *ProcessLine(char *input_line) override;

private:
void add_missing_fields(const char *entry, int field_count, string &ans);
void add_missing_fields(const char *entry, int n_alt, string &ans);

// temp buffers used in ProcessLine (to reduce allocations)
vector<string> dense_entries_;
OStringStream buffer_;

bool with_missing_fields_;
int format_field_count_ = 0;
string format_;
vector<string> format_split_;
int iAD_ = -1, iPL_ = -1;
// temp buffers used in add_missing_fields (to reduce allocations)
OStringStream format_buffer_;
string entry_copy_;
vector<char *> entry_fields_;
};

const char *DecoderImpl::ProcessLine(char *input_line) {
Expand All @@ -565,6 +572,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
fail("Invalid project VCF: fewer than 10 columns");
}

int n_alt = -1;
if (with_missing_fields_) {
// count n_alt for use in missing fields with Number={A,G,R}
n_alt = 1;
for (char *alt = tokens[4]; *alt; alt++) {
if (*alt == ',') {
n_alt++;
}
}
if (n_alt != 1) {
// FIXME: handling multiallelic sites will require add_missing_fields() even on sparse
// entries
fail("--with-missing-fields only works with biallelic pVCF for now"
"; try piping output through bcftools instead");
}
}

// Figure out number of dense columns, the number of columns on the first line
uint64_t N = dense_entries_.empty() ? (tokens.size() - 9) : dense_entries_.size();
if (dense_entries_.empty()) {
Expand All @@ -574,7 +598,6 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
assert(dense_entries_.size() == N);

// Pass through first nine columns
int format_field_count = 1;
buffer_.Clear();
buffer_ << tokens[0];
for (int i = 1; i < 9; i++) {
Expand All @@ -592,15 +615,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
continue;
}
} else if (i == 8 && with_missing_fields_) {
// Count FORMAT fields for --with-missing-fields
for (char *FORMAT = tokens[8]; *FORMAT != 0; FORMAT++) {
if (*FORMAT == ':') {
format_field_count++;
if (format_.empty()) {
// one-time initialization
format_ = tokens[8];
string format_copy = format_;
vector<char *> format_split;
split(format_copy, ':', back_inserter(format_split));
for (int j = 0; j < format_split.size(); j++) {
char *s = format_split[j];
if (!strcmp(s, "AD")) {
iAD_ = j;
} else if (!strcmp(s, "PL")) {
iPL_ = j;
}
format_split_.push_back(s);
}
}
if (format_field_count_ <= 0) {
format_field_count_ = format_field_count;
} else if (format_field_count != format_field_count_) {
if (format_ != tokens[8]) {
fail(
"--with-missing-fields is unsuitable when pVCF lines have varying field FORMATs"
"; try piping output through bcftools instead");
Expand All @@ -617,14 +648,12 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
fail("empty cell");
} else if (*t != '"') {
// Dense entry - remember it and copy it to the output
// TODO: Perhaps fill QC fields with missing values (.) if they were squeezed out.
// The VCF spec does however say "Trailing fields can be dropped"
if (dense_cursor >= N) {
fail("Greater-than-expected number of columns implied by sparse encoding");
}
string &dense_entry = dense_entries_[dense_cursor++];
if (with_missing_fields_) {
add_missing_fields(t, format_field_count, dense_entry);
add_missing_fields(t, n_alt, dense_entry);
} else {
dense_entry = t;
}
Expand Down Expand Up @@ -678,19 +707,33 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
return buffer_.Get();
}

// Add trailing missing fields to entry (--with-missing-fields); return tmpstr.c_str() or entry
// itself (if already complete).
void DecoderImpl::add_missing_fields(const char *entry, int field_count, string &ans) {
int entry_field_count = 1;
for (char *cursor = const_cast<char *>(entry); *cursor != 0; cursor++) {
if (*cursor == ':') {
entry_field_count++;
// Add trailing missing fields to entry (--with-missing-fields)
// Most missing fields are just represented by . except for AD and PL, which we pad with . to the
// correct vector length. (In principle we should do that for any Number={A,G,R} field, but this
// suffices for our practical need for this feature.)
void DecoderImpl::add_missing_fields(const char *entry, int n_alt, string &ans) {
format_buffer_.Clear();
entry_copy_ = entry;
entry_fields_.clear();
split(entry_copy_, ':', back_inserter(entry_fields_));
for (int i = 0; i < format_split_.size(); i++) {
bool present = i < entry_fields_.size();
const char *field = present ? entry_fields_[i] : nullptr;
if (i > 0) {
format_buffer_ << ':';
}
if (i == iAD_ && (!present || !strcmp(field, "."))) {
// FIXME: handle multiallelic. Meaning we need to run this on sparse entries too.
// const int nAD = n_alt + 1;
format_buffer_ << ".,.";
} else if (i == iPL_ && (!present || !strcmp(field, "."))) {
// const int nPL = (n_alt + 1) * (n_alt + 2) / 2;
format_buffer_ << ".,.,.";
} else {
format_buffer_ << (present ? field : ".");
}
}
ans = entry;
while (entry_field_count++ < field_count) {
ans += ":.";
}
ans = format_buffer_.Get();
}

unique_ptr<Transcoder> NewDecoder(bool with_missing_fields) {
Expand Down
Loading