Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added normalisation and unit test cases #118

Open
wants to merge 2 commits into
base: release/0.9
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added src/koheesio/.vs/slnx.sqlite
Binary file not shown.
9 changes: 9 additions & 0 deletions src/koheesio/clipping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import List
from pydantic import BaseModel

class ClipConfig(BaseModel):
    """Bounds of the interval that ``clip_data`` limits values to."""

    # Lower bound: results are never smaller than this value.
    min_value: float
    # Upper bound: inputs larger than this are lowered to it.
    max_value: float

def clip_data(data: List[float], config: ClipConfig) -> List[float]:
    """Clamp every element of *data* into the range described by *config*.

    Each value is first capped at ``config.max_value`` and the capped value
    is then raised to ``config.min_value`` when it is smaller, so the lower
    bound wins if the two bounds are inverted.

    Args:
        data: Values to clip.
        config: Holder of the ``min_value`` / ``max_value`` bounds.

    Returns:
        A new list with one clipped value per input element.
    """
    lower = config.min_value
    upper = config.max_value
    clipped = []
    for value in data:
        capped = min(value, upper)
        clipped.append(max(lower, capped))
    return clipped
40 changes: 40 additions & 0 deletions src/koheesio/clipping_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import unittest
from clipping import clip_data, ClipConfig

class TestClipData(unittest.TestCase):
    """Unit tests for ``clip_data``."""

    def test_clip_data(self):
        """One value below, one inside and one above the range."""
        bounds = ClipConfig(min_value=10, max_value=20)
        self.assertEqual(clip_data([5, 15, 25], bounds), [10, 15, 20])

    def test_clip_data_beyond_limits(self):
        """Values on both sides of the range plus the exact bounds."""
        bounds = ClipConfig(min_value=0, max_value=20)
        clipped = clip_data([-10, 0, 10, 20, 30], bounds)
        self.assertEqual(clipped, [0, 0, 10, 20, 20])

    def test_clip_data_all_below_min(self):
        """Every value is raised to the lower bound."""
        bounds = ClipConfig(min_value=0, max_value=20)
        self.assertEqual(clip_data([-10, -5, -1], bounds), [0, 0, 0])

    def test_clip_data_all_above_max(self):
        """Every value is lowered to the upper bound."""
        bounds = ClipConfig(min_value=10, max_value=20)
        self.assertEqual(clip_data([25, 30, 35], bounds), [20, 20, 20])

    def test_clip_data_empty_list(self):
        """An empty input produces an empty output."""
        bounds = ClipConfig(min_value=10, max_value=20)
        self.assertEqual(clip_data([], bounds), [])

if __name__ == "__main__":
unittest.main()
11 changes: 11 additions & 0 deletions src/koheesio/exponential.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import List
from pydantic import BaseModel

class ExpTransformConfig(BaseModel):
    """Settings for ``exp_transform``."""

    # Base that is raised to the power of each data value; ``exp_transform``
    # rejects zero and negative bases at call time.
    base: float

def exp_transform(data: List[float], config: ExpTransformConfig) -> List[float]:
    """Raise ``config.base`` to the power of each element of *data*.

    Args:
        data: Exponents to apply to the base.
        config: Holder of the ``base`` to exponentiate.

    Returns:
        A new list containing ``config.base ** x`` for every ``x`` in *data*.

    Raises:
        ValueError: If ``config.base`` is zero or negative.
    """
    base = config.base
    # The base is validated even for empty input so that a bad configuration
    # is always reported.
    if base <= 0:
        raise ValueError("Base must be a positive number.")
    return [base ** exponent for exponent in data]
37 changes: 37 additions & 0 deletions src/koheesio/exponential_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import unittest
from exponential import exp_transform, ExpTransformConfig

class TestExpTransform(unittest.TestCase):
    """Unit tests for ``exp_transform``."""

    def test_exp_transform_base_2(self):
        """Powers of two for small integer exponents."""
        cfg = ExpTransformConfig(base=2)
        self.assertEqual(exp_transform([1, 2, 3], cfg), [2, 4, 8])

    def test_exp_transform_base_e(self):
        """A fractional base yields float results."""
        cfg = ExpTransformConfig(base=2.71828)
        outputs = exp_transform([1, 2, 3], cfg)
        self.assertTrue(all(isinstance(item, float) for item in outputs))

    def test_exp_transform_large_values(self):
        """Large exponents still yield float results."""
        cfg = ExpTransformConfig(base=10)
        outputs = exp_transform([10, 20], cfg)
        self.assertTrue(all(isinstance(item, float) for item in outputs))

    def test_exp_transform_with_invalid_base(self):
        """A negative base is rejected with ValueError."""
        cfg = ExpTransformConfig(base=-1)
        with self.assertRaises(ValueError):
            exp_transform([1, 2, 3], cfg)

    def test_exp_transform_empty_data(self):
        """An empty input produces an empty output."""
        cfg = ExpTransformConfig(base=2)
        self.assertEqual(exp_transform([], cfg), [])

if __name__ == "__main__":
unittest.main()
10 changes: 10 additions & 0 deletions src/koheesio/min_max.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import List

def min_max_normalize(data: List[float], new_min: float, new_max: float) -> List[float]:
    """Linearly map *data* from its own value range onto ``[new_min, new_max]``.

    Args:
        data: Values to rescale; must contain at least one element.
        new_min: Target value for the smallest element of *data*.
        new_max: Target value for the largest element of *data*.

    Returns:
        A new list of rescaled values. When every element of *data* is the
        same, each output is ``new_min``.

    Raises:
        ValueError: If *data* is empty.
    """
    if len(data) == 0:
        raise ValueError("Data list cannot be empty.")

    lowest, highest = min(data), max(data)
    if lowest == highest:
        # No spread to stretch: collapse everything onto the lower target.
        return [new_min] * len(data)

    source_span = highest - lowest
    target_span = new_max - new_min
    rescaled = []
    for value in data:
        rescaled.append(new_min + (value - lowest) * target_span / source_span)
    return rescaled
34 changes: 34 additions & 0 deletions src/koheesio/min_max_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest
from min_max import min_max_normalize

class TestMinMaxNormalize(unittest.TestCase):
    """Unit tests for ``min_max_normalize``."""

    def test_standard_case(self):
        """Evenly spaced positive values map onto [0, 1]."""
        scaled = min_max_normalize([10, 20, 30, 40, 50], 0, 1)
        self.assertEqual(scaled, [0.0, 0.25, 0.5, 0.75, 1.0])

    def test_identical_values(self):
        """A constant input collapses onto the lower target bound."""
        scaled = min_max_normalize([10, 10, 10], 0, 1)
        self.assertEqual(scaled, [0.0, 0.0, 0.0])

    def test_negative_values(self):
        """Input symmetric around zero maps onto [0, 1]."""
        scaled = min_max_normalize([-50, -25, 0, 25, 50], 0, 1)
        self.assertEqual(scaled, [0.0, 0.25, 0.5, 0.75, 1.0])

    def test_empty_data(self):
        """An empty input is rejected with ValueError."""
        with self.assertRaises(ValueError):
            min_max_normalize([], 0, 1)

    def test_negative_target_range(self):
        """The target range may lie entirely below zero."""
        scaled = min_max_normalize([10, 20, 30, 40, 50], -5, -1)
        self.assertEqual(scaled, [-5.0, -4.0, -3.0, -2.0, -1.0])

if __name__ == "__main__":
unittest.main()
21 changes: 21 additions & 0 deletions src/koheesio/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from pydantic import BaseModel

class NormalizeConfig(BaseModel):
    """Target range used by ``normalize``."""

    # Output value assigned to the smallest input element.
    min_value: float
    # Output value assigned to the largest input element.
    max_value: float

def normalize(data, config: NormalizeConfig):
    """Linearly rescale *data* into ``[config.min_value, config.max_value]``.

    The smallest element maps to ``config.min_value`` and the largest to
    ``config.max_value``. The bounds may be inverted, in which case the
    ordering of the output is reversed.

    Args:
        data: Sized collection of numeric values; must not be empty.
        config: Holder of the target ``min_value`` / ``max_value``.

    Returns:
        A new list of rescaled values. When every element of *data* is the
        same, each output is ``config.min_value``.

    Raises:
        ValueError: If *data* is empty.
    """
    # len() rather than truthiness so array-like inputs are not asked for an
    # (ambiguous) boolean value.
    if len(data) == 0:
        raise ValueError("Data list cannot be empty.")

    min_data = min(data)
    range_data = max(data) - min_data

    if range_data == 0:
        # All values are identical; dividing by the zero range would raise
        # ZeroDivisionError, so collapse onto the lower target bound instead.
        return [config.min_value for _ in data]

    range_config = config.max_value - config.min_value

    return [
        ((value - min_data) / range_data) * range_config + config.min_value
        for value in data
    ]
10 changes: 10 additions & 0 deletions src/koheesio/scale.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import List

def scale_data(data: List[float], multiplier: float) -> List[float]:
    """Multiply every element of *data* by *multiplier*.

    Args:
        data: Values to scale; each must be an ``int`` or ``float``.
        multiplier: Factor applied to every element; ``int`` or ``float``.

    Returns:
        A new list holding ``element * multiplier`` for each input element.

    Raises:
        ValueError: If *multiplier* or any element of *data* is not an
            ``int`` or ``float``.
    """
    numeric_types = (int, float)

    # The multiplier is checked first so its error takes precedence.
    if not isinstance(multiplier, numeric_types):
        raise ValueError("Multiplier must be a numeric value.")
    if any(not isinstance(item, numeric_types) for item in data):
        raise ValueError("All elements in data must be numeric.")

    scaled = []
    for item in data:
        scaled.append(item * multiplier)
    return scaled
58 changes: 58 additions & 0 deletions src/koheesio/scale_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import unittest
from scale import scale_data

class TestScaleData(unittest.TestCase):
    """Unit tests for ``scale_data``."""

    def test_scale_positive_numbers(self):
        """Test scaling a list of positive numbers."""
        data = [1, 2, 3, 4, 5]
        multiplier = 2
        result = scale_data(data, multiplier)
        expected = [2, 4, 6, 8, 10]
        self.assertEqual(result, expected)

    def test_scale_negative_numbers(self):
        """Test scaling a list of negative numbers."""
        data = [-1, -2, -3, -4, -5]
        multiplier = 3
        result = scale_data(data, multiplier)
        expected = [-3, -6, -9, -12, -15]
        self.assertEqual(result, expected)

    # This body had lost its ``def`` line; the header is restored so the
    # case is collected and reported as its own test.
    def test_scale_mixed_numbers(self):
        """Test scaling a list with both positive and negative numbers."""
        data = [-1, 0, 1]
        multiplier = 5
        result = scale_data(data, multiplier)
        expected = [-5, 0, 5]
        self.assertEqual(result, expected)

    def test_scale_with_zero_multiplier(self):
        """Test that a zero multiplier zeroes every element."""
        data = [10, -10, 100]
        multiplier = 0
        result = scale_data(data, multiplier)
        expected = [0, 0, 0]
        self.assertEqual(result, expected)

    def test_scale_with_one_multiplier(self):
        """Test that a multiplier of one leaves the values unchanged."""
        data = [1.5, 2.5, 3.5]
        multiplier = 1
        result = scale_data(data, multiplier)
        expected = [1.5, 2.5, 3.5]
        self.assertEqual(result, expected)

    def test_scale_empty_list(self):
        """Test that an empty list scales to an empty list."""
        data = []
        multiplier = 2
        result = scale_data(data, multiplier)
        expected = []
        self.assertEqual(result, expected)

    # This body had also lost its ``def`` line; see the note above.
    def test_scale_large_numbers(self):
        """Test scaling a list with very large numbers."""
        data = [1e10, 2e10, 3e10]
        multiplier = 10
        result = scale_data(data, multiplier)
        expected = [1e11, 2e11, 3e11]
        self.assertEqual(result, expected)

if __name__ == '__main__':
unittest.main()
14 changes: 14 additions & 0 deletions src/koheesio/standardisation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import statistics
from typing import List, Optional

from pydantic import BaseModel

class StandardizeConfig(BaseModel):
    """Optional precomputed statistics for ``standardize``.

    A field left as ``None`` tells ``standardize`` to derive that statistic
    from the data it is given.
    """

    # Annotated as Optional so the ``None`` default matches the declared type
    # and an explicit ``mean=None`` / ``std_dev=None`` is accepted.
    mean: Optional[float] = None
    std_dev: Optional[float] = None

def standardize(data: List[float], config: StandardizeConfig) -> List[float]:
    """Shift and scale *data* to ``(x - mean) / std_dev``.

    The mean and standard deviation come from *config* when set; a statistic
    left as ``None`` is computed from *data* with ``statistics.mean`` /
    ``statistics.stdev``.

    Args:
        data: Values to standardize.
        config: Optional precomputed ``mean`` and ``std_dev``.

    Returns:
        A new list with one standardized value per input element.

    Raises:
        ValueError: If the standard deviation in use is zero.
    """
    center = config.mean
    if center is None:
        center = statistics.mean(data)

    spread = config.std_dev
    if spread is None:
        spread = statistics.stdev(data)

    if spread == 0:
        raise ValueError("Standard deviation cannot be zero.")

    standardized = []
    for value in data:
        standardized.append((value - center) / spread)
    return standardized
40 changes: 40 additions & 0 deletions src/koheesio/standardize_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import unittest
from standardisation import standardize, StandardizeConfig

class TestStandardize(unittest.TestCase):
    """Unit tests for ``standardize``."""

    def test_standardize_identical_values(self):
        """Constant data has zero spread and is rejected."""
        cfg = StandardizeConfig()
        with self.assertRaises(ValueError):
            standardize([10, 10, 10], cfg)

    def test_standardize_with_precomputed_values(self):
        """Supplied mean and standard deviation are used as given."""
        cfg = StandardizeConfig(mean=20, std_dev=10)
        self.assertEqual(standardize([10, 20, 30], cfg), [-1.0, 0.0, 1.0])

    def test_standardize_empty_data(self):
        """Empty data with no precomputed statistics is rejected."""
        cfg = StandardizeConfig()
        with self.assertRaises(ValueError):
            standardize([], cfg)

    def test_standardize_with_custom_mean_and_std_dev(self):
        """A unit standard deviation leaves plain offsets from the mean."""
        cfg = StandardizeConfig(mean=3, std_dev=1)
        scores = standardize([1, 2, 3, 4, 5], cfg)
        self.assertEqual(scores, [-2.0, -1.0, 0.0, 1.0, 2.0])

    def test_standardize_with_small_variance(self):
        """Tightly clustered data still standardizes to a zero sum."""
        cfg = StandardizeConfig()
        scores = standardize([1.001, 1.002, 1.003, 1.004, 1.005], cfg)
        self.assertAlmostEqual(sum(scores), 0, places=5)

if __name__ == "__main__":
unittest.main()
42 changes: 42 additions & 0 deletions src/koheesio/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import unittest

from normalize import normalize, NormalizeConfig

class TestNormalize(unittest.TestCase):
    """Unit tests for ``normalize``."""

    def test_normalize_standard(self):
        """Evenly spaced positive values map onto [0, 1]."""
        data = [10, 20, 30, 40, 50]
        config = NormalizeConfig(min_value=0, max_value=1)
        result = normalize(data, config)
        expected = [0.0, 0.25, 0.5, 0.75, 1.0]
        self.assertEqual(result, expected)

    def test_normalize_negative_values(self):
        """Input symmetric around zero maps onto [-1, 1]."""
        data = [-50, -25, 0, 25, 50]
        config = NormalizeConfig(min_value=-1, max_value=1)
        result = normalize(data, config)
        expected = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.assertEqual(result, expected)

    def test_normalize_float_values(self):
        """Fractional inputs map onto [0, 1] up to rounding error."""
        data = [0.1, 0.2, 0.3, 0.4, 0.5]
        config = NormalizeConfig(min_value=0, max_value=1)
        result = normalize(data, config)
        expected = [0.0, 0.25, 0.5, 0.75, 1.0]
        # Compare with a tolerance: the inputs are not exactly representable
        # in binary floating point, so pinning the exact rounding noise would
        # tie the test to one particular order of arithmetic operations.
        self.assertEqual(len(result), len(expected))
        for actual, wanted in zip(result, expected):
            self.assertAlmostEqual(actual, wanted)

    def test_normalize_inverted_config(self):
        """Inverted bounds reverse the ordering of the output."""
        data = [10, 20, 30, 40, 50]
        config = NormalizeConfig(min_value=1, max_value=0)
        result = normalize(data, config)
        expected = [1.0, 0.75, 0.5, 0.25, 0.0]
        self.assertEqual(result, expected)

    def test_normalize_empty_data(self):
        """An empty input is rejected with ValueError."""
        data = []
        config = NormalizeConfig(min_value=0, max_value=1)
        with self.assertRaises(ValueError):
            normalize(data, config)

if __name__ == '__main__':
unittest.main()
11 changes: 11 additions & 0 deletions src/koheesio/z_score_normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import List
import statistics

def z_score_normalize(data: List[float]) -> List[float]:
    """Convert *data* to z-scores using its mean and sample standard deviation.

    Args:
        data: Values to normalize; must contain at least one element.

    Returns:
        A new list holding ``(x - mean) / std_dev`` for each element. When
        the data has no spread (a single element, or all elements equal),
        every z-score is ``0.0``.

    Raises:
        ValueError: If *data* is empty.
    """
    if not data:
        raise ValueError("Data list cannot be empty.")

    mean = statistics.mean(data)
    # statistics.stdev needs at least two points and raises otherwise; a lone
    # value has no spread, so it is handled like the zero-variance case.
    std_dev = statistics.stdev(data) if len(data) > 1 else 0.0
    if std_dev == 0:
        return [0.0 for _ in data]
    return [(x - mean) / std_dev for x in data]
41 changes: 41 additions & 0 deletions src/koheesio/z_score_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import unittest
from z_score_normalize import z_score_normalize

class TestZScoreNormalize(unittest.TestCase):
    """Unit tests for ``z_score_normalize``."""

    def _assert_lists_almost_equal(self, actual, expected, places=5):
        """Assert two sequences have equal length and near-equal elements.

        ``assertAlmostEqual`` subtracts its arguments, so handing it whole
        lists raises TypeError unless they are exactly equal; the comparison
        is therefore done element by element.
        """
        self.assertEqual(len(actual), len(expected))
        for got, wanted in zip(actual, expected):
            self.assertAlmostEqual(got, wanted, places=places)

    def test_large_numbers(self):
        """Large magnitudes give the same z-scores as small ones."""
        data = [1e10, 2e10, 3e10, 4e10, 5e10]
        result = z_score_normalize(data)
        expected = [-1.2649110640673518, -0.6324555320336759, 0.0, 0.6324555320336759, 1.2649110640673518]
        self._assert_lists_almost_equal(result, expected)

    def test_mixed_positive_and_negative_values(self):
        """Data symmetric around zero gives symmetric z-scores."""
        data = [-10, -5, 0, 5, 10]
        result = z_score_normalize(data)
        expected = [-1.2649110640673518, -0.6324555320336759, 0.0, 0.6324555320336759, 1.2649110640673518]
        self._assert_lists_almost_equal(result, expected)

    def test_precomputed_mean_and_std_dev(self):
        """The function agrees with z-scores computed by hand."""
        from statistics import mean, stdev
        data = [10, 20, 30, 40, 50]
        mean_value = mean(data)
        std_dev = stdev(data)
        by_hand = [(x - mean_value) / std_dev for x in data]
        expected = [-1.2649110640673518, -0.6324555320336759, 0.0, 0.6324555320336759, 1.2649110640673518]
        self._assert_lists_almost_equal(by_hand, expected)
        # Exercise the function under test as well, not only the hand
        # computation above.
        self._assert_lists_almost_equal(z_score_normalize(data), by_hand)

    def test_data_with_highly_skewed_distribution(self):
        """An extreme outlier scores above 1 and the minimum below 0."""
        data = [1, 2, 3, 4, 1000]
        result = z_score_normalize(data)
        self.assertTrue(result[-1] > 1.0)
        self.assertTrue(result[0] < 0.0)

    def test_already_normalized_data(self):
        """Normalizing z-scores again leaves them unchanged."""
        data = [-1.2649110640673518, -0.6324555320336759, 0.0, 0.6324555320336759, 1.2649110640673518]
        result = z_score_normalize(data)
        expected = data
        self._assert_lists_almost_equal(result, expected)

if __name__ == "__main__":
unittest.main()