Skip to content

Commit 042bc4b

Browse files
committed
improved subsampling
1 parent ccd416d commit 042bc4b

File tree

1 file changed

+30
-8
lines changed
  • src/python/ensembl/tools/anno/transcriptomic_annotation

1 file changed

+30
-8
lines changed

src/python/ensembl/tools/anno/transcriptomic_annotation/star.py

100644 → 100755
+30-8
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ def _subsample_paired_fastq_files( # pylint:disable=too-many-branches
418418
else:
419419
raise FileNotFoundError("No fastq file found")
420420

421-
if fastq_file_1.suffix.endswith(".gz$"):
421+
if fastq_file_1.suffix.endswith(".gz"):
422422
compressed = True
423423
num_lines = sum(1 for line in gzip.open(fastq_file_1)) # pylint:disable=consider-using-with
424424
else:
@@ -443,8 +443,8 @@ def _subsample_paired_fastq_files( # pylint:disable=too-many-branches
443443
)
444444
return
445445

446-
rand_list = random.sample(range(0, range_limit - 1), sampling_size)
447-
random_indices = {idx * 4: 1 for idx in rand_list}
446+
rand_list = random.sample(range(0, range_limit-1), sampling_size)
447+
random_indices = [idx * 4 for idx in rand_list]
448448
logging.info("Processing paired files in parallel")
449449
if num_threads >= 2:
450450
pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with
@@ -498,6 +498,29 @@ def _subsample_fastq_subset(
498498
compressed : whether the file is compressed
499499
"""
500500
line_index = 0
501+
read_block = []
502+
with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open(
503+
output_file, "w+"
504+
) as file_out:
505+
for line in file_in:
506+
read_block.append(line)
507+
if len(read_block) == 4:
508+
if line_index in random_indices:
509+
file_out.writelines(read_block)
510+
read_block = []
511+
line_index += 4
512+
#lines = [file_in.readline() for _ in range(4)]
513+
"""
514+
while lines[3]:
515+
#lines = [file_in.readline() for _ in range(4)]
516+
# Break if the end of the file is reached
517+
if len(lines) < 4 : # No more lines to read
518+
break
519+
# Write to output if current index is in random_indices
520+
if line_index in random_indices:
521+
file_out.writelines(lines)
522+
line_index += 4
523+
lines = [file_in.readline() for _ in range(4)]
501524
502525
with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open(
503526
output_file, "w+"
@@ -508,7 +531,7 @@ def _subsample_fastq_subset(
508531
file_out.writelines(lines)
509532
line_index += 4
510533
lines = [file_in.readline() for _ in range(4)]
511-
534+
"""
512535

513536
def run_trimming(
514537
output_dir: Path,
@@ -609,9 +632,9 @@ def multiprocess_trim_galore(trim_galore_cmd: List, fastq_paired_files: List[Pat
609632
def parse_args():
610633
"""Parse command line arguments."""
611634
parser = argparse.ArgumentParser(description="STAR's arguments")
612-
parser.add_argument("--genome_file", required=True, help="Genome file path")
613-
parser.add_argument("--output_dir", required=True, help="Output directory path")
614-
parser.add_argument("--short_read_fastq_dir", required=True, help="Short read directory path")
635+
parser.add_argument("--genome_file", help="Genome file path")
636+
parser.add_argument("--output_dir", help="Output directory path")
637+
parser.add_argument("--short_read_fastq_dir", help="Short read directory path")
615638
parser.add_argument(
616639
"--delete_pre_trim_fastq",
617640
action="store_true",
@@ -682,7 +705,6 @@ def parse_args():
682705
parser.add_argument(
683706
"--run_star",
684707
type=bool,
685-
default=True,
686708
help="If True will run STAR alignment given an input dataset of fastq files.",
687709
required=False,
688710
)

0 commit comments

Comments
 (0)