Skip to content

Commit

Permalink
Extended KMersTransformer to support editable sliding window
Browse files Browse the repository at this point in the history
  • Loading branch information
Piotr Tynecki committed Oct 23, 2020
1 parent 731c890 commit 18fd3fc
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 9 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
All notable changes to this project will be documented in this file.


## [0.0.9] - 23.10.2020
### Changed
* Extended `KMersTransformer` to support a configurable sliding window;


## [0.0.8] - 11.10.2020
### Added
* Initial online documentation;
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
author = 'Piotr Tynecki, Iwona Świętochowska, Yana Minina, Przemysław Mitura, Wojciech Łaguna'

# The full version, including alpha/beta/rc tags
release = version = '0.0.8'
release = version = '0.0.9'


# -- General configuration ---------------------------------------------------
Expand Down
16 changes: 9 additions & 7 deletions phages2050/features/transformers/kmers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@

class KMersTransformer(BaseEstimator, TransformerMixin):
"""
K-mer transformer is responsible to extract set of
words which are subsequences of length (6 by default)
contained within a biological sequence
K-mer transformer is responsible for extracting a set of words -
using a configurable sliding window - which are subsequences of
a given length (6 by default) contained within a biological sequence
Each of the word is called k-mer and are composed of
nucleotides (i.e. A, T, G, and C)
Each word is called a k-mer and is composed of nucleotides
(i.e. A, T, G, and C). Any word that includes other characters
is removed from the output
Example:
fname = 'NC_001604.fasta'
Expand All @@ -37,9 +38,10 @@ class KMersTransformer(BaseEstimator, TransformerMixin):
kmt.transform(sample)
"""

def __init__(self, size: int = 6):
def __init__(self, size: int = 6, sliding_window: int = 1):
self.accepted_chars: Set[str] = {"A", "C", "T", "G"}
self.size: int = size
self.sliding_window: int = sliding_window

def _extract_kmers_from_sequence(self, sequence: str) -> str:
"""
Expand All @@ -57,7 +59,7 @@ def _extract_kmers_from_sequence(self, sequence: str) -> str:
return " ".join(
[
sequence[x : x + self.size]
for x in range(len(sequence) - self.size + 1)
for x in range(0, len(sequence) - self.size + 1, self.sliding_window)
if not set(sequence[x : x + self.size]) - self.accepted_chars
]
)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
dependencies = list(map(str.strip, filter(None, dependencies.split("\n"))))


version = "0.0.8"
version = "0.0.9"

setup(
name="phages2050",
Expand Down

0 comments on commit 18fd3fc

Please sign in to comment.