|
25 | 25 | summarize_bootstrap, |
26 | 26 | matrix_permutation, |
27 | 27 | fisher_r_to_z, |
| 28 | + fisher_z_to_r, |
28 | 29 | _calc_pvalue, |
29 | 30 | _bootstrap_isc, |
30 | 31 | ) |
|
36 | 37 | concatenate, |
37 | 38 | _bootstrap_apply_func, |
38 | 39 | _df_meta_to_arr, |
39 | | - isiterable, |
40 | 40 | ) |
41 | 41 | from .design_matrix import Design_Matrix |
42 | 42 | from joblib import Parallel, delayed |
@@ -114,29 +114,40 @@ def __init__(self, data=None, Y=None, matrix_type=None, labels=[], **kwargs): |
114 | 114 | self.issymmetric = symmetric_all[0] |
115 | 115 | self.matrix_type = matrix_type_all[0] |
116 | 116 | self.is_single_matrix = False |
117 | | - elif (isinstance(data, str) or isinstance(data, Path)) and ( |
118 | | - (".h5" in data) or (".hdf5" in data) |
119 | | - ): |
120 | | - f = dd.io.load(data) |
121 | | - self.data = f["data"] |
122 | | - self.Y = pd.DataFrame( |
123 | | - f["Y"], |
124 | | - columns=[ |
125 | | - e.decode("utf-8") if isinstance(e, bytes) else e |
126 | | - for e in f["Y_columns"] |
127 | | - ], |
128 | | - index=[ |
| 117 | + elif isinstance(data, str) or isinstance(data, Path): |
| 118 | + to_load = str(data) |
| 119 | +            # Data is a string or path pointing to an h5 file |
| 120 | + if (".h5" in to_load) or (".hdf5" in to_load): |
| 121 | + f = dd.io.load(data) |
| 122 | + self.data = f["data"] |
| 123 | + self.Y = pd.DataFrame( |
| 124 | + f["Y"], |
| 125 | + columns=[ |
| 126 | + e.decode("utf-8") if isinstance(e, bytes) else e |
| 127 | + for e in f["Y_columns"] |
| 128 | + ], |
| 129 | + index=[ |
| 130 | + e.decode("utf-8") if isinstance(e, bytes) else e |
| 131 | + for e in f["Y_index"] |
| 132 | + ], |
| 133 | + ) |
| 134 | + self.matrix_type = f["matrix_type"] |
| 135 | + self.is_single_matrix = f["is_single_matrix"] |
| 136 | + self.issymmetric = f["issymmetric"] |
| 137 | + self.labels = [ |
129 | 138 | e.decode("utf-8") if isinstance(e, bytes) else e |
130 | | - for e in f["Y_index"] |
131 | | - ], |
132 | | - ) |
133 | | - self.matrix_type = f["matrix_type"] |
134 | | - self.is_single_matrix = f["is_single_matrix"] |
135 | | - self.issymmetric = f["issymmetric"] |
136 | | - self.labels = [ |
137 | | - e.decode("utf-8") if isinstance(e, bytes) else e for e in f["labels"] |
138 | | - ] |
139 | | - return |
| 139 | + for e in f["labels"] |
| 140 | + ] |
| 141 | + return |
| 142 | +            # Data is a string or path but not an h5 file |
| 143 | + else: |
| 144 | + ( |
| 145 | + self.data, |
| 146 | + self.issymmetric, |
| 147 | + self.matrix_type, |
| 148 | + self.is_single_matrix, |
| 149 | + ) = self._import_single_data(data, matrix_type=matrix_type) |
| 150 | + # Data is not a string or path |
140 | 151 | else: |
141 | 152 | ( |
142 | 153 | self.data, |
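
With the refactor above, any string or `Path` input now funnels through a single branch: paths containing ".h5"/".hdf5" are loaded via deepdish, and everything else falls back to `_import_single_data`. A minimal usage sketch, assuming this branch of nltools is installed; the file names are hypothetical placeholders for files saved elsewhere (e.g. with `Adjacency.write`):

```python
from pathlib import Path
from nltools.data import Adjacency

# Hypothetical files created earlier (e.g. with Adjacency.write)
from_h5 = Adjacency("sub_distance.h5")           # ".h5"/".hdf5" in the path -> deepdish branch
from_csv = Adjacency(Path("sub_distance.csv"),   # any other str/Path -> _import_single_data
                     matrix_type="distance")
```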
@@ -511,6 +522,30 @@ def mean(self, axis=0): |
511 | 522 | elif axis == 1: |
512 | 523 | return np.nanmean(self.data, axis=axis) |
513 | 524 |
|
| 525 | + def sum(self, axis=0): |
| 526 | + """Calculate sum of Adjacency |
| 527 | +
|
| 528 | + Args: |
| 529 | +            axis: (int) calculate sum over features (0) or data (1). |
| 530 | +                For data it will be on the upper triangle. |
| 531 | +
|
| 532 | + Returns: |
| 533 | +            sum: float if single, adjacency if axis=0, np.array if axis=1 |
| 534 | +                and multiple |
| 535 | +
|
| 536 | + """ |
| 537 | + |
| 538 | + if self.is_single_matrix: |
| 539 | + return np.nansum(self.data) |
| 540 | + else: |
| 541 | + if axis == 0: |
| 542 | + return Adjacency( |
| 543 | + data=np.nansum(self.data, axis=axis), |
| 544 | + matrix_type=self.matrix_type + "_flat", |
| 545 | + ) |
| 546 | + elif axis == 1: |
| 547 | + return np.nansum(self.data, axis=axis) |
| 548 | + |
514 | 549 | def std(self, axis=0): |
515 | 550 | """Calculate standard deviation of Adjacency |
516 | 551 |
|
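
A short sketch of how the new `sum` method might be called; the toy matrices below are invented for illustration:

```python
import numpy as np
from nltools.data import Adjacency

# Two 4x4 symmetric matrices stacked into one Adjacency object
m1 = np.ones((4, 4)) - np.eye(4)
m2 = 2 * (np.ones((4, 4)) - np.eye(4))
adj = Adjacency([m1, m2], matrix_type="similarity")

edge_totals = adj.sum(axis=0)    # Adjacency: one value per edge, summed across matrices
matrix_totals = adj.sum(axis=1)  # np.array: one value per matrix (upper triangle only)

single = Adjacency(m1, matrix_type="similarity")
single_total = single.sum()      # float: nansum over the single matrix
```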
@@ -752,6 +787,13 @@ def r_to_z(self): |
752 | 787 | out.data = fisher_r_to_z(out.data) |
753 | 788 | return out |
754 | 789 |
|
| 790 | + def z_to_r(self): |
| 791 | +    """Convert Fisher z-scores back into r values for each element of data object""" |
| 792 | + |
| 793 | + out = self.copy() |
| 794 | + out.data = fisher_z_to_r(out.data) |
| 795 | + return out |
| 796 | + |
755 | 797 | def threshold(self, upper=None, lower=None, binarize=False): |
756 | 798 | """Threshold Adjacency instance. Provide upper and lower values or |
757 | 799 | percentages to perform two-sided thresholding. Binarize will return |
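
A quick round-trip sketch pairing the existing `r_to_z` with the new `z_to_r`; the correlation matrix is made up:

```python
import numpy as np
from nltools.data import Adjacency

r = np.array([[1.0, 0.3, 0.5],
              [0.3, 1.0, 0.2],
              [0.5, 0.2, 1.0]])
adj = Adjacency(r, matrix_type="similarity")

z = adj.r_to_z()    # Fisher r-to-z on each off-diagonal element
back = z.z_to_r()   # inverse transform added in this diff
assert np.allclose(adj.data, back.data)
```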
@@ -1067,7 +1109,7 @@ def isc( |
1067 | 1109 | exclude_self_corr=exclude_self_corr, |
1068 | 1110 | random_state=random_state, |
1069 | 1111 | ) |
1070 | | - for i in range(n_bootstraps) |
| 1112 | + for _ in range(n_bootstraps) |
1071 | 1113 | ) |
1072 | 1114 |
|
1073 | 1115 | stats["p"] = _calc_pvalue(all_bootstraps - stats["isc"], stats["isc"], tail) |
@@ -1185,57 +1227,85 @@ def plot_mds( |
1185 | 1227 | ax.xaxis.set_visible(False) |
1186 | 1228 | ax.yaxis.set_visible(False) |
1187 | 1229 |
|
1188 | | - def distance_to_similarity(self, beta=1): |
1189 | | - """Convert distance matrix to similarity matrix |
| 1230 | + def distance_to_similarity(self, metric="correlation", beta=1): |
| 1231 | + """Convert distance matrix to similarity matrix. |
| 1232 | +
|
| 1233 | + Note: currently only implemented for correlation and euclidean. |
1190 | 1234 |
|
1191 | 1235 | Args: |
1192 | | - beta: (float) parameter to scale exponential function (default: 1) |
| 1236 | +            metric: (str) can only be 'correlation' or 'euclidean' |
| 1237 | +            beta: (float) parameter to scale exponential function (default: 1); only used for euclidean |
1193 | 1238 |
|
1194 | 1239 | Returns: |
1195 | 1240 | out: (Adjacency) Adjacency object |
1196 | 1241 |
|
1197 | 1242 | """ |
1198 | 1243 | if self.matrix_type == "distance": |
1199 | | - return Adjacency( |
1200 | | - np.exp(-beta * self.squareform() / self.squareform().std()), |
1201 | | - labels=self.labels, |
1202 | | - matrix_type="similarity", |
1203 | | - ) |
| 1244 | + if metric == "correlation": |
| 1245 | +                return Adjacency(1 - self.squareform(), labels=self.labels, matrix_type="similarity") |
| 1246 | + elif metric == "euclidean": |
| 1247 | + return Adjacency( |
| 1248 | + np.exp(-beta * self.squareform() / self.squareform().std()), |
| 1249 | + labels=self.labels, |
| 1250 | + matrix_type="similarity", |
| 1251 | + ) |
| 1252 | + else: |
| 1253 | + raise ValueError('metric can only be ["correlation","euclidean"]') |
1204 | 1254 | else: |
1205 | 1255 | raise ValueError("Matrix is not a distance matrix.") |
1206 | 1256 |
|
1207 | | - def similarity_to_distance(self): |
1208 | | - """Convert similarity matrix to distance matrix""" |
1209 | | - if self.matrix_type == "similarity": |
1210 | | - return Adjacency( |
1211 | | - 1 - self.squareform(), labels=self.labels, matrix_type="distance" |
1212 | | - ) |
1213 | | - else: |
1214 | | - raise ValueError("Matrix is not a similarity matrix.") |
| 1257 | + def cluster_summary(self, clusters=None, metric="mean", summary="within"): |
| 1258 | + """This function provides summaries of clusters within Adjacency matrices. |
1215 | 1259 |
|
1216 | | - def within_cluster_mean(self, clusters=None): |
1217 | | - """This function calculates mean within cluster labels |
| 1260 | + It can compute mean/median of within and between cluster values. Requires a |
| 1261 | + list of cluster ids indicating the row/column of each cluster. |
1218 | 1262 |
|
1219 | 1263 | Args: |
1220 | 1264 | clusters: (list) list of cluster labels |
| 1265 | +            metric: (str) summary method, 'mean' or 'median'. If None, return all values |
| 1266 | +            summary: (str) summarize 'within' clusters or 'between' clusters |
| 1267 | +
|
1221 | 1268 | Returns: |
1222 |  | -            dict: (dict) within cluster means |
 | 1269 | +            dict: (dict) within or between cluster summaries |
| 1270 | +
|
1223 | 1271 | """ |
| 1272 | + if metric not in ["mean", "median", None]: |
| 1273 | + raise ValueError("metric must be ['mean','median', None]") |
1224 | 1274 |
|
1225 | 1275 | distance = pd.DataFrame(self.squareform()) |
1226 | 1276 | clusters = np.array(clusters) |
1227 | 1277 |
|
1228 | 1278 | if len(clusters) != distance.shape[0]: |
1229 | 1279 | raise ValueError("Cluster labels must be same length as distance matrix") |
1230 | 1280 |
|
1231 | | - out = pd.DataFrame(columns=["Mean", "Label"], index=None) |
1232 | 1281 | out = {} |
1233 | 1282 | for i in list(set(clusters)): |
1234 | | - out[i] = np.mean( |
1235 | | - distance.loc[clusters == i, clusters == i].values[ |
1236 | | - np.triu_indices(sum(clusters == i), k=1) |
1237 | | - ] |
1238 | | - ) |
| 1283 | + if summary == "within": |
| 1284 | + if metric == "mean": |
| 1285 | + out[i] = np.mean( |
| 1286 | + distance.loc[clusters == i, clusters == i].values[ |
| 1287 | + np.triu_indices(sum(clusters == i), k=1) |
| 1288 | + ] |
| 1289 | + ) |
| 1290 | + elif metric == "median": |
| 1291 | + out[i] = np.median( |
| 1292 | + distance.loc[clusters == i, clusters == i].values[ |
| 1293 | + np.triu_indices(sum(clusters == i), k=1) |
| 1294 | + ] |
| 1295 | + ) |
| 1296 | + elif metric is None: |
| 1297 | + out[i] = distance.loc[clusters == i, clusters == i].values[ |
| 1298 | + np.triu_indices(sum(clusters == i), k=1) |
| 1299 | + ] |
| 1300 | + elif summary == "between": |
| 1301 | + if metric == "mean": |
| 1302 | + out[i] = distance.loc[clusters == i, clusters != i].mean().mean() |
| 1303 | + elif metric == "median": |
| 1304 | + out[i] = ( |
| 1305 | + distance.loc[clusters == i, clusters != i].median().median() |
| 1306 | + ) |
| 1307 | + elif metric is None: |
| 1308 | + out[i] = distance.loc[clusters == i, clusters != i] |
1239 | 1309 | return out |
1240 | 1310 |
|
1241 | 1311 | def regress(self, X, mode="ols", **kwargs): |
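
A hedged sketch combining the updated `distance_to_similarity` (with its new `metric` argument) and `cluster_summary`, which replaces `within_cluster_mean`; the points and cluster labels are invented for illustration:

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform
from nltools.data import Adjacency

# Six points in two well-separated groups -> toy 6x6 distance matrix
np.random.seed(0)
pts = np.vstack([np.random.randn(3, 2), np.random.randn(3, 2) + 5])
adj = Adjacency(squareform(pdist(pts)), matrix_type="distance")

sim_corr = adj.distance_to_similarity(metric="correlation")       # 1 - distance
sim_euc = adj.distance_to_similarity(metric="euclidean", beta=1)  # exp(-beta * d / d.std())

clusters = [1, 1, 1, 2, 2, 2]
within_means = adj.cluster_summary(clusters=clusters, metric="mean", summary="within")
between_medians = adj.cluster_summary(clusters=clusters, metric="median", summary="between")
raw_within = adj.cluster_summary(clusters=clusters, metric=None, summary="within")
```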
@@ -1281,11 +1351,11 @@ def regress(self, X, mode="ols", **kwargs): |
1281 | 1351 | def social_relations_model(self, summarize_results=True, nan_replace=True): |
1282 | 1352 | """Estimate the social relations model from a matrix for a round-robin design. |
1283 | 1353 |
|
1284 | | - X_{ij} = m + \alpha_i + \beta_j + g_{ij} + \episolon_{ijl} |
| 1354 | + X_{ij} = m + \alpha_i + \beta_j + g_{ij} + \epsilon_{ijl} |
1285 | 1355 |
|
1286 | 1356 | where X_{ij} is the score for person i rating person j, m is the group mean, |
1287 | 1357 | \alpha_i is person i's actor effect, \beta_j is person j's partner effect, g_{ij} |
1288 | | - is the relationship effect and \episolon_{ijl} is the error in measure l for actor i and partner j. |
| 1358 | + is the relationship effect and \epsilon_{ijl} is the error in measure l for actor i and partner j. |
1289 | 1359 |
|
1290 | 1360 | This model is primarily concerned with partioning the variance of the various effects. |
1291 | 1361 |
|
@@ -1551,7 +1621,7 @@ def fix_missing(data): |
1551 | 1621 | return (X, coord) |
1552 | 1622 |
|
1553 | 1623 | if nan_replace: |
1554 | | - data, coord = replace_missing(self) |
| 1624 | + data, _ = replace_missing(self) |
1555 | 1625 | else: |
1556 | 1626 | data = self.copy() |
1557 | 1627 |
|
|