function [data, clustPoints, idx, centers, slopes, lengths] = ...
    generateData( ...
        slope, ...
        slopeStd, ...
        numClusts, ...
        xClustAvgSep, ...
        yClustAvgSep, ...
        lengthAvg, ...
        lengthStd, ...
        lateralStd, ...
        totalPoints ...
    )
% GENERATEDATA Generates 2D data for clustering; data is created along
%              straight lines, which can be more or less parallel depending
%              on the slopeStd argument.
%
% [data clustPoints idx centers slopes lengths] =
%    GENERATEDATA(slope, slopeStd, numClusts, xClustAvgSep, yClustAvgSep, ...
%                 lengthAvg, lengthStd, lateralStd, totalPoints)
%
% Inputs:
%        slope - Base direction of the lines on which clusters are based.
%     slopeStd - Standard deviation of the slope; used to obtain a random
%                slope variation from the normal distribution, which is
%                added to the base slope in order to obtain the final slope
%                of each cluster.
%    numClusts - Number of clusters (and therefore of lines) to generate.
%                Must be a non-negative integer scalar.
% xClustAvgSep - Average separation of line centers along the X axis.
% yClustAvgSep - Average separation of line centers along the Y axis.
%    lengthAvg - The base length of lines on which clusters are based.
%    lengthStd - Standard deviation of line length; used to obtain a random
%                length variation from the normal distribution, which is
%                added to the base length in order to obtain the final
%                length of each line (the absolute value is used).
%   lateralStd - "Cluster fatness", i.e., the standard deviation of the
%                distance from each point to the respective line, in both x
%                and y directions; this distance is obtained from the
%                normal distribution.
%  totalPoints - Total points in generated data (will be randomly divided
%                among clusters). Must be a non-negative integer scalar,
%                equal to or larger than numClusts.
%
% Outputs:
%         data - Matrix (totalPoints x 2) with the generated data.
%  clustPoints - Vector (numClusts x 1) containing number of points in each
%                cluster.
%          idx - Vector (totalPoints x 1) containing the cluster indices of
%                each point.
%      centers - Matrix (numClusts x 2) containing centers from where
%                clusters were generated.
%       slopes - Vector (numClusts x 1) containing the effective slopes
%                used to generate clusters.
%      lengths - Vector (numClusts x 1) containing the effective lengths
%                used to generate clusters.
%
% Errors:
%    generateData:invalidInput  - numClusts or totalPoints is not a
%                                 non-negative integer scalar, or points
%                                 were requested with zero clusters.
%    generateData:tooFewPoints  - totalPoints is smaller than numClusts.
%
% ----------------------------------------------------------
% Usage example:
%
%   [data cp idx] = GENERATEDATA(1, 0.5, 5, 15, 15, 5, 1, 2, 200);
%
% This creates 5 clusters with a total of 200 points, with a base slope
% of 1 (std=0.5), separated in average by 15 units in both x and y
% directions, with average length of 5 units (std=1) and a "fatness" or
% spread of 2 units.
%
% To take a quick look at the clusters just do:
%
%   scatter(data(:,1), data(:,2), 8, idx);

%  N. Fachada
%  Instituto Superior Técnico, Lisboa, Portugal

% Both counts are used as array sizes and loop bounds, so reject anything
% that is not a finite, real, non-negative integer scalar up front instead
% of failing later with an unrelated size or indexing error.
isCount = @(v) (isnumeric(v) || islogical(v)) && isscalar(v) && ...
    isreal(v) && isfinite(double(v)) && v >= 0 && mod(double(v), 1) == 0;
if ~isCount(numClusts) || ~isCount(totalPoints)
    error('generateData:invalidInput', ...
        'numClusts and totalPoints must be non-negative integer scalars.');
end

% Make sure totalPoints >= numClusts
if totalPoints < numClusts
    error('generateData:tooFewPoints', ...
        ' Number of points must be equal or larger than the number of clusters.');
end

% Points cannot be distributed when there is no cluster to receive them.
if numClusts == 0 && totalPoints > 0
    error('generateData:invalidInput', ...
        'At least one cluster is required when totalPoints is positive.');
end

% Determine number of points in each cluster: random positive weights,
% normalized to sum to one and scaled to the requested total.
clustPoints = abs(randn(numClusts, 1));
clustPoints = clustPoints / sum(clustPoints);
clustPoints = round(clustPoints * totalPoints);

% Rounding may leave the total slightly off; make sure totalPoints is
% respected.
while sum(clustPoints) < totalPoints
    % If one point is missing add it to the smallest cluster
    [smallest, pos] = min(clustPoints);
    clustPoints(pos(1)) = smallest + 1;
end
while sum(clustPoints) > totalPoints
    % If there is one extra point, remove it from the largest cluster
    [largest, pos] = max(clustPoints);
    clustPoints(pos(1)) = largest - 1;
end

% Make sure there are no empty clusters. Because totalPoints >= numClusts,
% whenever a cluster is empty some other cluster holds at least two points,
% so the donor cluster never becomes empty itself.
emptyClusts = find(clustPoints == 0);
for k = 1:numel(emptyClusts)
    % Get a point from the largest cluster and assign it to the empty one
    [largest, pos] = max(clustPoints);
    clustPoints(pos(1)) = largest - 1;
    clustPoints(emptyClusts(k)) = 1;
end

% Initialize data matrix
data = zeros(sum(clustPoints), 2);

% Initialize idx (vector containing the cluster indices of each point)
idx = zeros(totalPoints, 1);

% Initialize lengths vector
lengths = zeros(numClusts, 1);

% Determine cluster centers (uniformly spread over an area that grows with
% the number of clusters and the requested average separation)
xCenters = xClustAvgSep * numClusts * (rand(numClusts, 1) - 0.5);
yCenters = yClustAvgSep * numClusts * (rand(numClusts, 1) - 0.5);
centers = [xCenters yCenters];

% Determine cluster slopes
slopes = slope + slopeStd * randn(numClusts, 1);

% Number of points assigned to all previous clusters, for each cluster
% (exclusive prefix sum), i.e., the row offset of each cluster in data.
offsets = cumsum(clustPoints) - clustPoints;

% Create clusters. The order of rand/randn calls below is kept exactly as
% in the original implementation so that seeded runs stay reproducible.
for c = 1:numClusts
    % Determine length of line where this cluster will be based
    lengths(c) = abs(lengthAvg + lengthStd * randn);
    % X component of the unit vector along the line (constant per cluster)
    cosTheta = cos(atan(slopes(c)));
    % Create points for this cluster
    for p = 1:clustPoints(c)
        % Determine where in the line the next point will be projected
        position = lengths(c) * rand - lengths(c) / 2;
        % Determine x and y coordinates of the point projection
        delta_x = cosTheta * position;
        delta_y = delta_x * slopes(c);
        % Add the point's lateral displacement in x and then in y
        delta_x = delta_x + lateralStd * randn;
        delta_y = delta_y + lateralStd * randn;
        % Determine the actual point
        data(offsets(c) + p, :) = [(xCenters(c) + delta_x) (yCenters(c) + delta_y)];
    end
    % Update idx
    idx(offsets(c) + 1 : offsets(c) + clustPoints(c)) = c;
end
0 commit comments