-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathall-maximal-sets-cardinality.cc
250 lines (225 loc) · 8.52 KB
/
all-maximal-sets-cardinality.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// Copyright 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ---
// An algorithm for finding all maximal sets based on the cardinality
// property.
// ---
// Author: Roberto Bayardo
#include "all-maximal-sets-cardinality.h"
#include <assert.h>
#include <algorithm>
#include <iostream>
#include <vector>
#include "data-source-iterator.h"
#include "set-properties.h"
namespace google_extremal_sets {
namespace {
// Returns true if the elements in set #2 are all contained by
// set #1.
inline bool DoesSubsume(
std::vector<uint32_t>::const_iterator it1,
const std::vector<uint32_t>::const_iterator it1_end,
const uint32_t* it2,
const uint32_t* const it2_end) {
while (it2 != it2_end) {
it1 = std::lower_bound(it1, it1_end, *it2);
if (it1 == it1_end || *it1 > *it2)
return false;
++it1;
++it2;
}
return true;
}
} // namespace
bool AllMaximalSetsCardinality::FindAllMaximalSets(
DataSourceIterator* data,
uint32_t max_item_id,
uint32_t max_items_in_ram,
OutputModeEnum output_mode) {
Init();
// The index_us vector contains the previous itemsets whose
// cardinality is the same as the current itemset. We delay their
// indexing until they can potentially be subsumed; that is, when
// the data iterator reaches itemsets with a higher cardinality.
std::vector<SetProperties*> index_us;
// Vars set by the data source iterator.
int result;
uint32_t set_id;
std::vector<uint32_t> current_set;
// This outer loop supports multiple passes over the data in the
// case where the dataset exceeds the bound on max_items_in_ram. As
// long as resume_offset == 0, we will continue retaining itemsets
// in RAM. Otherwise itemsets from the data iterator will be used
// only to perform subsumption checks against existing candidates,
// and will be indexed during a subsequent pass.
off_t resume_offset = 0;
do { // while (resume_offset != 0)
if (!PrepareForDataScan(data, max_item_id, resume_offset))
return false; // IO error
resume_offset = 0;
uint32_t items_in_ram = 0;
int current_set_size = -1;
// This loop scans the input data from beginning to end.
while ((result = data->Next(&set_id, ¤t_set)) > 0) {
DeleteSubsumedCandidates(current_set);
// If current_set has higher cardinality than the itemsets within
// index_us, we move them from index_us into the candidate index.
if (current_set.size() != static_cast<uint32_t>(current_set_size)) {
IndexSets(index_us);
index_us.clear();
current_set_size = current_set.size();
}
if (resume_offset == 0) {
// Copy the current_set into RAM and place a pointer to it in index_us.
index_us.push_back(SetProperties::Create(set_id, current_set));
items_in_ram += current_set.size();
++input_sets_count_;
// Check if we've exceeded the RAM limit and if so stop
// retaining any further itemsets in memory until the next
// scan.
if (items_in_ram >= max_items_in_ram) {
resume_offset = data->Tell();
std::cerr << "; Halting indexing at input set number "
<< input_sets_count_ << " with id " << set_id << std::endl;
// Force the sets in index_us to get added to the index.
current_set_size = -1;
}
} // if (resume_offset = 0)
} // while ((result = data->Next())
if (result != 0) // IO error
return false;
// At this point, any remaining candidate set and any remaining set
// in index_us is maximal!
DumpMaximalSets(&index_us, output_mode);
} while (resume_offset != 0);
return true;
}
void AllMaximalSetsCardinality::Init() {
maximal_sets_count_ = input_sets_count_ = subsumption_checks_count_ = 0;
}
bool AllMaximalSetsCardinality::PrepareForDataScan(
DataSourceIterator* data, uint32_t max_item_id, off_t resume_offset) {
assert(candidates_.size() == 0);
candidates_.resize(max_item_id);
std::cerr << "; Starting new dataset scan at offset: "
<< resume_offset << std::endl;
return data->Seek(resume_offset);
}
void AllMaximalSetsCardinality::IndexSets(const std::vector<SetProperties*>& index_us) {
for (std::vector<SetProperties*>::const_iterator it = index_us.begin();
it != index_us.end();
++it) {
SetProperties* itemset_to_index = *it;
uint32_t item_id_to_index = itemset_to_index->item[0];
if (item_id_to_index >= candidates_.size())
candidates_.resize(item_id_to_index + 1);
candidates_[item_id_to_index].push_back(itemset_to_index);
}
}
inline SetProperties* AllMaximalSetsCardinality::NextCandidate(
const CandidateList& candidates,
const std::vector<uint32_t>& current_set,
unsigned int current_index,
int* candidate_index) {
do {
++(*candidate_index);
if (static_cast<unsigned int>(*candidate_index) == candidates.size())
return 0;
} while (!candidates[*candidate_index]);
SetProperties* candidate = candidates[*candidate_index];
if (current_set.size() - current_index < candidate->size)
return 0; // remaining sets are too big to be subsumed
return candidate;
}
void AllMaximalSetsCardinality::DeleteSubsumedCandidates(
const std::vector<uint32_t>& current_set) {
std::vector<uint32_t>::const_iterator current_begin = current_set.begin();
std::vector<uint32_t>::const_iterator current_end = current_set.end();
SetProperties* candidate = 0;
for (unsigned int i = 0; i < current_set.size(); ++i, ++current_begin) {
if (candidates_.size() <= current_set[i])
return;
CandidateList& candidates = candidates_[current_set[i]];
int candidate_index = -1;
while ((candidate =
NextCandidate(
candidates,
current_set,
i,
&candidate_index))) {
// We must explicitly check subsumption. We need not check every
// item in each set because we already know:
// (1) the candidate does not contain any items within
// current_set[0] through current_set[i - 1].
// (2) the candidate's first item is already known to be
// the same as current_set[i].
// We adjust the iterators accordingly.
if (DoesSubsume(
current_begin, current_end,
candidate->begin() + 1, candidate->end())) {
// Candidate is not maximal, so we delete it. Note that we
// must preserve the cardinality based ordering, so we NULL
// out the pointer to the deleted entry rather than performing
// any swapping. TODO: Occasionally it might be beneficial to
// compress out the holes left by the NULL entries if we have
// accumulated a significant number of them.
candidates[candidate_index] = 0;
SetProperties::Delete(candidate);
}
++subsumption_checks_count_;
}
}
}
void AllMaximalSetsCardinality::DumpMaximalSets(
std::vector<SetProperties*>* unindexed_sets,
OutputModeEnum output_mode) {
for (unsigned int i = 0; i < unindexed_sets->size(); ++i) {
SetProperties* maximal_set = (*unindexed_sets)[i];
FoundMaximalSet(*maximal_set, output_mode);
SetProperties::Delete(maximal_set);
}
unindexed_sets->clear();
for (unsigned int i = 0; i < candidates_.size(); ++i) {
CandidateList& candidate_set = candidates_[i];
for (unsigned int j = 0; j < candidate_set.size(); ++j) {
SetProperties* maximal_set = candidate_set[j];
if (maximal_set) {
FoundMaximalSet(*maximal_set, output_mode);
SetProperties::Delete(maximal_set);
}
}
candidate_set.clear();
}
candidates_.clear();
std::cout << std::flush;
}
void AllMaximalSetsCardinality::FoundMaximalSet(
const SetProperties& maximal_set, OutputModeEnum output_mode) {
++maximal_sets_count_;
switch (output_mode) {
case COUNT_ONLY:
break;
case ID:
std::cout << maximal_set.set_id << '\n';
break;
case ID_AND_ITEMS:
std::cout << maximal_set << '\n';
break;
default:
std::cout << "Huh?\n";
assert(0);
break;
}
}
} // namespace google_extremal_sets