Skip to content

Commit

Permalink
Remove region argument
Browse files Browse the repository at this point in the history
  • Loading branch information
jonperdomo committed Oct 9, 2024
1 parent 282ed9e commit 1c9fe23
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 187 deletions.
8 changes: 0 additions & 8 deletions __main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,6 @@ def main():
required=True
)

parser.add_argument(
"-r", "--region",
help="region to analyze (e.g. chr1, chr1:1000-2000). If not provided, the entire genome will be analyzed",
required=False,
)

# Thread count.
parser.add_argument(
"-t", "--threads",
Expand Down Expand Up @@ -207,7 +201,6 @@ def main():
input_data.setRefGenome(args.reference)
input_data.setSNPFilepath(args.snps)
input_data.setEthnicity(args.ethnicity)
input_data.setRegion(args.region)
input_data.setThreadCount(args.threads)
input_data.setMeanChromosomeCoverage(args.chr_cov)
input_data.setAlleleFreqFilepaths(args.pfb)
Expand All @@ -222,7 +215,6 @@ def main():
# Determine the data paths for downstream analysis.
vcf_path = os.path.join(args.output, "output.vcf")
output_dir = args.output
region = args.region
# cnv_data_path = os.path.join(args.output, "cnv_data.tsv")

# Generate python-based CNV plots if SNP-based CNV predictions are enabled
Expand Down
2 changes: 1 addition & 1 deletion include/cnv_caller.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ class CNVCaller {
void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info);

// Save a TSV with B-allele frequencies, log 2 ratios, and copy number predictions
void saveToTSV(SNPData& snp_data, std::string filepath);
void saveToTSV(SNPData& snp_data, std::string filepath, std::string chr);
};

#endif // CNV_CALLER_H
13 changes: 0 additions & 13 deletions include/input_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,6 @@ class InputData {
void setEthnicity(std::string ethnicity);
std::string getEthnicity();

// Set the genomic region to analyze.
void setRegion(std::string region);
std::string getRegion();
std::string getRegionChr();
int getRegionStart();
int getRegionEnd();
bool getRegionSet();

// Set the window size for the log2 ratio calculation.
void setWindowSize(int window_size);
int getWindowSize();
Expand All @@ -94,11 +86,6 @@ class InputData {
void setThreadCount(int thread_count);
int getThreadCount();

// Set the whole genome flag to true if the entire genome is being
// analyzed.
void setWholeGenome(bool whole_genome);
bool getWholeGenome();

// Set the verbose flag to true if verbose output is desired.
void setVerbose(bool verbose);
bool getVerbose();
Expand Down
15 changes: 3 additions & 12 deletions src/cnv_caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ void CNVCaller::runCopyNumberPredictionChunk(std::string chr, std::map<SVCandida
if (end_pos - start_pos > 10000) {
std::string sv_filename = this->input_data->getOutputDir() + "/sv_snps_" + chr + "_" + std::to_string((int)start_pos) + "_" + std::to_string((int)end_pos) + ".tsv";
std::cout << "Saving SV SNP data to " << sv_filename << std::endl;
this->saveToTSV(snp_data, sv_filename);
this->saveToTSV(snp_data, sv_filename, chr);
}
}
}
Expand Down Expand Up @@ -426,15 +426,7 @@ CNVCaller::CNVCaller(InputData &input_data)
void CNVCaller::run(SVData& sv_calls)
{
// Predict copy number states at SNP positions using a hidden Markov model
// Get the region data
bool whole_genome = this->input_data->getWholeGenome();
std::vector<std::string> chromosomes;
if (whole_genome)
{
chromosomes = this->input_data->getRefGenomeChromosomes();
} else {
chromosomes.push_back(this->input_data->getRegionChr());
}
// chromosomes = this->input_data->getRefGenomeChromosomes();

// Read the HMM from file
std::string hmm_filepath = this->input_data->getHMMFilepath();
Expand Down Expand Up @@ -1020,7 +1012,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
}
}

void CNVCaller::saveToTSV(SNPData& snp_data, std::string filepath)
void CNVCaller::saveToTSV(SNPData& snp_data, std::string filepath, std::string chr)
{
// Open the TSV file for writing
std::ofstream tsv_file(filepath);
Expand All @@ -1029,7 +1021,6 @@ void CNVCaller::saveToTSV(SNPData& snp_data, std::string filepath)
tsv_file << "chromosome\tposition\tsnp\tb_allele_freq\tlog2_ratio\tcnv_state\tpopulation_freq" << std::endl;

// Write the data
std::string chr = this->input_data->getRegionChr();
int snp_count = (int) snp_data.pos.size();
for (int i = 0; i < snp_count; i++)
{
Expand Down
121 changes: 0 additions & 121 deletions src/input_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ InputData::InputData()
this->region_set = false;
this->thread_count = 1;
this->hmm_filepath = "data/wgs.hmm";
this->whole_genome = true;
this->verbose = false;
this->save_cnv_data = false;
}
Expand Down Expand Up @@ -124,96 +123,6 @@ void InputData::setOutputDir(std::string dirpath)
system(cmd.c_str());
}

std::string InputData::getRegion()
{
return this->region;
}

void InputData::setRegion(std::string region)
{
this->region = region;

// Check if empty string. This means all chromosomes will be used
if (region == "")
{
std::cout << "No region specified. Using all chromosomes" << std::endl;
this->whole_genome = true;
return;
}

// Parse the region
char *tok = strtok((char *)region.c_str(), ":");
int col = 0;
while (tok != NULL)
{
// Get the chromosome
if (col == 0)
{
this->region_chr = tok;
}

// Get the start and end positions
else if (col == 1)
{
// Check if empty
if (strcmp(tok, "") == 0)
{
break;
}

// Split the start and end positions
char *start_tok = strtok(tok, "-");
char *end_tok = strtok(NULL, "-");

// Get the start position
if (start_tok != NULL)
{
this->region_start = atoi(start_tok);
}

// Get the end position
if (end_tok != NULL)
{
this->region_end = atoi(end_tok);
}
}
tok = strtok(NULL, ":");
col++;
}

// Check if a valid chromosome was parsed
if (this->region_chr == "")
{
std::cerr << "Error: Could not parse region" << std::endl;
exit(1);
}

// Check if only a chromosome was parsed
if (this->region_start == 0 && this->region_end == 0)
{
// Use the entire chromosome as the region
this->region_set = true;

// Set the whole genome flag to false
this->whole_genome = false;
std::cout << "Parsed region = " << this->region_chr << std::endl;
} else {
// Check if a valid chromosome start and end position were parsed
if (this->region_start == 0 || this->region_end == 0 || this->region_start > this->region_end)
{
std::cerr << "Error: Region start and end positions not set" << std::endl;
exit(1);
} else {
// Set the region
this->region_set = true;

// Set the whole genome flag to false
this->whole_genome = false;
std::cout << "Parsed region = " << this->region_chr << ":" << this->region_start << "-" << this->region_end << std::endl;
}
}
}

int InputData::getWindowSize()
{
return this->window_size;
Expand Down Expand Up @@ -244,26 +153,6 @@ void InputData::setEthnicity(std::string ethnicity)
this->ethnicity = ethnicity;
}

std::string InputData::getRegionChr()
{
return this->region_chr;
}

int InputData::getRegionStart()
{
return this->region_start;
}

int InputData::getRegionEnd()
{
return this->region_end;
}

bool InputData::getRegionSet()
{
return this->region_set;
}

void InputData::setMeanChromosomeCoverage(std::string chr_cov)
{
// Update the chromosome coverage map if the string is not empty
Expand Down Expand Up @@ -456,16 +345,6 @@ void InputData::setHMMFilepath(std::string filepath)
}
}

void InputData::setWholeGenome(bool whole_genome)
{
this->whole_genome = whole_genome;
}

bool InputData::getWholeGenome()
{
return this->whole_genome;
}

void InputData::setVerbose(bool verbose)
{
this->verbose = verbose;
Expand Down
48 changes: 16 additions & 32 deletions src/sv_caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,50 +446,34 @@ SVData SVCaller::run()
std::string bam_filepath = this->input_data->getLongReadBam();

// Get the region data
bool whole_genome = this->input_data->getWholeGenome();
std::vector<std::string> regions;
if (whole_genome) {
regions = this->input_data->getRefGenomeChromosomes();
} else {
regions.push_back(this->input_data->getRegion());
}
int num_regions = regions.size();
std::vector<std::string> chromosomes = this->input_data->getRefGenomeChromosomes();
int chr_count = chromosomes.size();

// Loop through each region and detect SVs
std::cout << "Detecting SVs from " << num_regions << " region(s)..." << std::endl;
std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
int region_count = 0;
auto start1 = std::chrono::high_resolution_clock::now();
SVData sv_calls;
PrimaryMap primary_alignments;
SuppMap supplementary_alignments;
int all_region_sv_count = 0;
for (const auto& region : regions) {
// std::cout << "Extracting alignments for region: " << region << std::endl;
for (const auto& chr : chromosomes) {
// Split the region into chunks and process each chunk in a separate
// thread
int num_threads = this->input_data->getThreadCount();
int chr_len = this->input_data->getRefGenomeChromosomeLength(chr);

// If a region range is specified (e.g., chr1:1-1000), use a single
// thread
// Split the region into chunks
std::vector<std::string> region_chunks;
if (region.find(":") != std::string::npos) {
num_threads = 1;
region_chunks.push_back(region);
} else {
// Get the chromosome length
int chr_len = this->input_data->getRefGenomeChromosomeLength(region);

// Split the region into chunks
int chunk_size = chr_len / num_threads;
for (int i = 0; i < num_threads; i++) {
int start = i * chunk_size + 1; // 1-based
int end = start + chunk_size;
if (i == num_threads - 1) {
end = chr_len;
}
std::string chunk = region + ":" + std::to_string(start) + "-" + std::to_string(end);
region_chunks.push_back(chunk);
int chunk_size = chr_len / num_threads;
for (int i = 0; i < num_threads; i++) {
int start = i * chunk_size + 1; // 1-based
int end = start + chunk_size;
if (i == num_threads - 1) {
end = chr_len;
}
std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end);
region_chunks.push_back(chunk);
}

// Loop through the chunks and process each chunk in a separate thread
Expand Down Expand Up @@ -546,15 +530,15 @@ SVData SVCaller::run()

// Increment the region count
region_count++;
std::cout << "Extracted aligments for " << region_count << " of " << num_regions << " regions." << std::endl;
std::cout << "Extracted aligments for " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl;
}

// Run split-read SV detection in a single thread
std::cout << "Detecting SVs from split-read alignments..." << std::endl;
this->detectSVsFromSplitReads(sv_calls, primary_alignments, supplementary_alignments);

auto end1 = std::chrono::high_resolution_clock::now();
std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << num_regions << " region(s). Elapsed time: " << getElapsedTime(start1, end1) << std::endl;
std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << chr_count << " chromosome(s). Elapsed time: " << getElapsedTime(start1, end1) << std::endl;

// Return the SV calls
return sv_calls;
Expand Down

0 comments on commit 1c9fe23

Please sign in to comment.