generated from streamlit/streamlit-hello
-
Notifications
You must be signed in to change notification settings - Fork 2
/
segment_feature.py
204 lines (180 loc) · 6.91 KB
/
segment_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_squared_error
class SegmentFeature(BaseEstimator):
    """
    Compute one feature per segment of a collection of segmented signals.

    Takes a list of segmented signals (signals together with their list of
    breakpoints) and the name of the feature to compute, and returns the
    feature values of every segment in a single ``pandas.DataFrame``.

    Parameters
    ----------
    features_names : list of str, default=None
        Feature to compute per segment. ``None`` is equivalent to
        ``["mean"]``. For now the list must contain exactly one name, which
        must belong to ``ALL_FEATURE_NAMES``:
            - 'mean',
            - 'min',
            - 'max',
            - 'mean_of_min_max' (the mean between the max and the min),
            - 'variance',
            - 'slope',
            - 'scaled_complexity_invariance',
            - 'linear_residuals',
            - 'length' (adds a ``length_feat`` column; the per-dimension
              columns then hold the segment mean).

    Raises
    ------
    AssertionError
        If a name is not in ``ALL_FEATURE_NAMES`` or if the list does not
        contain exactly one name.
    """

    ALL_FEATURE_NAMES = [
        "mean",
        "min",
        "max",
        "mean_of_min_max",
        "variance",
        "slope",
        "scaled_complexity_invariance",
        "linear_residuals",
        "length",
    ]

    # Feature name -> name of the static helper computing it on a sub-signal.
    # "length" has no helper: `transform` derives it from the breakpoints.
    # The insertion order fixes the key order of the dict built by
    # `feature_func`.
    _FEATURE_HELPERS = {
        "mean": "get_mean",
        "min": "get_min",
        "max": "get_max",
        "mean_of_min_max": "get_mean_of_min_max",
        "variance": "get_var",
        "slope": "get_slope",
        "scaled_complexity_invariance": "get_scaled_complexity_invariance",
        "linear_residuals": "get_linear_residuals",
    }

    def __init__(self, features_names=None) -> None:
        # `None` stands for the historical default ["mean"]. A fresh list is
        # built per instance so that instances never share (and cannot
        # corrupt) one mutable default object.
        if features_names is None:
            features_names = ["mean"]

        # Explicit raises instead of `assert` statements so the validation
        # still runs under `python -O`. AssertionError and the messages are
        # kept unchanged for callers that already rely on them.
        for feature_name in features_names:
            if feature_name not in SegmentFeature.ALL_FEATURE_NAMES:
                raise AssertionError(
                    f"Choose an existing feature, not {feature_name}."
                )
        if len(features_names) != 1:
            raise AssertionError("For now, you can only choose one feature.")

        # Stored as received (no copy), as scikit-learn's `get_params` /
        # `clone` machinery expects from constructor arguments.
        self.features_names = features_names

    def fit(self, *args, **kwargs):
        """Do nothing and return ``self`` (there is nothing to learn)."""
        return self

    def transform(self, b_transform_segmentation) -> pd.DataFrame:
        """
        Compute the selected feature on every segment of every signal.

        Signals are assumed to be multivariate.

        Parameters
        ----------
        b_transform_segmentation : object
            Must expose ``list_of_multivariate_signals`` and
            ``list_of_bkps``, two sequences of the same length. Each entry of
            ``list_of_bkps`` holds the end index of every segment of the
            matching signal (so the last breakpoint is presumably the number
            of samples of the signal -- to be confirmed against the caller).

        Returns
        -------
        pd.DataFrame
            One row per segment, all signals stacked, with a fresh 0..n-1
            index. Columns: the feature columns
            ``dim_<d>_<feature>_feat``, then ``signal_index``,
            ``segment_start``, ``segment_end`` and ``segment_length``. When
            the selected feature is ``"length"``, a leading ``length_feat``
            column (copy of ``segment_length``) is inserted as well.
        """
        feature_name = self.features_names[0]
        signals_and_bkps = zip(
            b_transform_segmentation.list_of_multivariate_signals,
            b_transform_segmentation.list_of_bkps,
        )

        list_of_df = []
        for signal_index, (multivariate_signal, bkps) in enumerate(
            signals_and_bkps
        ):
            segment_features = self.transform_single_multivariate_signal(
                multivariate_signal=multivariate_signal, bkps=bkps
            )
            # One column per dimension, renamed to dim_<d>_<feature>_feat.
            single_signal_df = (
                pd.DataFrame(segment_features)
                .add_prefix("dim_")
                .add_suffix(f"_{feature_name}_feat")
            )
            single_signal_df["signal_index"] = signal_index

            # A segment starts where the previous one ends, the first one at
            # 0. Unpacking (instead of `[0] + bkps[:-1]`) also works when
            # `bkps` is a NumPy array, for which `+` would be an element-wise
            # addition rather than a concatenation.
            segment_ends = list(bkps)
            single_signal_df["segment_start"] = [0, *segment_ends[:-1]]
            single_signal_df["segment_end"] = segment_ends
            single_signal_df["segment_length"] = (
                single_signal_df["segment_end"]
                - single_signal_df["segment_start"]
            )
            list_of_df.append(single_signal_df)

        segment_features_df = pd.concat(list_of_df).reset_index(drop=True)

        if "length" in self.features_names:
            # With the current one-feature restriction the insertion
            # position is always 0, i.e. `length_feat` is the first column.
            segment_features_df.insert(
                len(self.features_names) - 1,
                "length_feat",
                segment_features_df["segment_length"].values,
            )
        return segment_features_df

    def transform_single_multivariate_signal(self, multivariate_signal, bkps):
        """Return the selected feature for each segment of one signal.

        The signal is split along its first axis at ``bkps[:-1]`` (the last
        breakpoint closes the last segment, so it is not a split point).

        Output is a list of length n_segments; each item holds the feature
        value of one segment, with one entry per dimension.
        """
        feature_name = self.features_names[0]
        return [
            self._compute_segment_feature(sub_multivariate_signal, feature_name)
            for sub_multivariate_signal in np.split(
                multivariate_signal, bkps[:-1]
            )
        ]

    def _compute_segment_feature(self, sub_multivariate_signal, feature_name):
        """Compute `feature_name` on one segment, one value per dimension."""
        # "length" (like any name without a helper) keeps the historical
        # behaviour: the per-dimension columns hold the segment mean, the
        # length itself being added by `transform`.
        helper_name = self._FEATURE_HELPERS.get(feature_name, "get_mean")
        helper = getattr(self, helper_name)

        # These helpers already reduce along axis 0, dimension by dimension.
        if helper_name in ("get_mean", "get_min", "get_max"):
            return helper(sub_multivariate_signal)

        # The remaining helpers are written for a univariate sub-signal, so
        # they are applied to each dimension (column) separately.
        # Assumes a (n_samples, n_dims) layout for 2-D input.
        values = np.asarray(sub_multivariate_signal)
        if values.ndim < 2:
            return helper(values)
        return np.array(
            [helper(values[:, dim]) for dim in range(values.shape[1])]
        )

    def feature_func(self, sub_signal):
        """Return a dict of features computed on the whole sub-signal.

        Keys are the names of `self.features_names` that have a helper (every
        feature but 'length'), in the order of `_FEATURE_HELPERS`; values are
        what the matching ``get_*`` helper returns for `sub_signal`.

        Output is a dict of size `n_features`.
        """
        return {
            name: getattr(self, helper_name)(sub_signal)
            for name, helper_name in self._FEATURE_HELPERS.items()
            if name in self.features_names
        }

    @staticmethod
    def get_mean(sub_signal):
        """Return the mean of the sub-signal along its first axis."""
        return np.mean(sub_signal, axis=0)

    @staticmethod
    def get_min(sub_signal):
        """Return the minimum of the sub-signal along its first axis."""
        return np.min(sub_signal, axis=0)

    @staticmethod
    def get_max(sub_signal):
        """Return the maximum of the sub-signal along its first axis."""
        return np.max(sub_signal, axis=0)

    @staticmethod
    def get_mean_of_min_max(sub_signal):
        """Return the midpoint between the min and the max of the sub-signal.

        For E-SAX (Extended SAX).

        The min and the max are taken over all the values of `sub_signal`:
        TO BE UPDATED TO MULTIVARIATE SETTING.
        """
        return (sub_signal.min() + sub_signal.max()) / 2

    @staticmethod
    def get_slope(sub_signal):
        """Return the value of the slope on the sub-signal.

        The slope is the difference between the last and the first sample
        divided by ``n_samples - 1``; a one-sample sub-signal therefore makes
        the denominator zero.

        TO BE UPDATED TO MULTIVARIATE SETTING.
        """
        n_samples = sub_signal.shape[0]
        return (sub_signal[-1] - sub_signal[0]) / (n_samples - 1)

    @staticmethod
    def get_var(sub_signal):
        """Return the variance over all the values of the sub-signal.

        TO BE UPDATED TO MULTIVARIATE SETTING.
        """
        return sub_signal.var()

    @staticmethod
    def get_scaled_complexity_invariance(sub_signal):
        """Return the scaled complexity estimate of the sub-signal.

        From the CSAX paper, equation 6 and not equation 5: the square root
        of the sum of the squared successive differences, divided by
        ``len(sub_signal) - 1``.

        `np.diff` works along the last axis:
        TO BE UPDATED TO MULTIVARIATE SETTING.
        """
        return (np.sqrt((np.diff(sub_signal) ** 2).sum())) / (
            len(sub_signal) - 1
        )

    @staticmethod
    def get_linear_residuals(sub_signal):
        """Return the RMSE between the sub-signal and its end-to-end line.

        The reference line joins the first and the last sample of
        `sub_signal` with ``len(sub_signal)`` evenly spaced points.

        TO BE UPDATED TO MULTIVARIATE SETTING.
        """
        y_linear = np.linspace(
            start=sub_signal[0], stop=sub_signal[-1], num=len(sub_signal)
        )
        # `mean_squared_error(..., squared=False)` was deprecated in
        # scikit-learn 1.4 and removed in 1.6. Taking the square root of the
        # per-output MSE and then the uniform average returns the very same
        # value on every scikit-learn version.
        per_output_mse = mean_squared_error(
            y_true=sub_signal, y_pred=y_linear, multioutput="raw_values"
        )
        return np.mean(np.sqrt(per_output_mse))