From a1aaaa18b3f3b930fb86a734785d8a821c6d77ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Wa=C5=9B?=
Date: Sat, 2 Nov 2024 10:54:17 +0100
Subject: [PATCH] Make Faker deterministic

---
 docs/src/main/sphinx/connector/faker.md       |  3 --
 .../plugin/faker/FakerPageSourceProvider.java | 25 ++++++++++--
 .../io/trino/plugin/faker/FakerSplit.java     |  2 +-
 .../trino/plugin/faker/FakerSplitManager.java |  4 +-
 .../trino/plugin/faker/TestFakerQueries.java  | 40 ++++++++++++++-----
 5 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/docs/src/main/sphinx/connector/faker.md b/docs/src/main/sphinx/connector/faker.md
index a254e301e9c47..5c1b3db1cf6eb 100644
--- a/docs/src/main/sphinx/connector/faker.md
+++ b/docs/src/main/sphinx/connector/faker.md
@@ -284,7 +284,4 @@ CREATE TABLE generator.default.customer (
 
 ## Limitations
 
-- Generated data is not deterministic. There is no way to specify a seed for
-  the random generator. The same query reading from catalogs using this
-  connector, executed multiple times, returns different results each time.
 - It is not possible to choose the locale used by the Datafaker's generators.
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSourceProvider.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSourceProvider.java
index b08497c014eb6..aea5fce42add1 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSourceProvider.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerPageSourceProvider.java
@@ -26,20 +26,27 @@
 
 import java.util.List;
 import java.util.Random;
+import java.util.random.RandomGeneratorFactory;
 
 import static com.google.common.collect.ImmutableList.toImmutableList;
+import static java.util.random.RandomGenerator.JumpableGenerator;
 
 public class FakerPageSourceProvider
         implements ConnectorPageSourceProvider
 {
-    private final Random random;
+    private final JumpableGenerator jumpableRandom;
     private final Faker faker;
 
     @Inject
     public FakerPageSourceProvider()
     {
-        random = new Random();
-        faker = new Faker(random);
+        // Every split should generate data in a sequence that does not overlap with other splits.
+        // To make data generation deterministic, use a generator with the same seed,
+        // but advance its state by a different offset for every split.
+        // A jumpable random generator's state can be advanced forward by a big distance in a single call.
+        // Xoroshiro128PlusPlus has a period of 2^128 - 1, and a jump distance of 2^64.
+        jumpableRandom = (JumpableGenerator) RandomGeneratorFactory.of("Xoroshiro128PlusPlus").create(1);
+        faker = new Faker(Random.from(jumpableRandom.copy()));
     }
 
     @Override
@@ -58,7 +65,17 @@ public ConnectorPageSource createPageSource(
         FakerTableHandle fakerTable = (FakerTableHandle) table;
         FakerSplit fakerSplit = (FakerSplit) split;
 
-        return new FakerPageSource(faker, random, handles, fakerTable.constraint(), fakerSplit.limit());
+        Random random = random(fakerSplit.splitNumber());
+        return new FakerPageSource(new Faker(random), random, handles, fakerTable.constraint(), fakerSplit.limit());
+    }
+
+    private Random random(long index)
+    {
+        JumpableGenerator jumpableRandom = this.jumpableRandom.copy();
+        for (long i = 0; i < index; i++) {
+            jumpableRandom.jump();
+        }
+        return Random.from(jumpableRandom);
     }
 
     public void validateGenerator(String generator)
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplit.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplit.java
index 0df3b56902da8..08f155c1b96f8 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplit.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplit.java
@@ -20,7 +20,7 @@
 
 import static java.util.Objects.requireNonNull;
 
-public record FakerSplit(List addresses, long limit)
+public record FakerSplit(List addresses, long splitNumber, long limit)
         implements ConnectorSplit
 {
     public FakerSplit
diff --git a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplitManager.java b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplitManager.java
index c0ea4578b480e..5b94f7d97bbe5 100644
--- a/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplitManager.java
+++ b/plugin/trino-faker/src/main/java/io/trino/plugin/faker/FakerSplitManager.java
@@ -70,11 +70,11 @@ public ConnectorSplitSource getSplits(
         ImmutableList.Builder splits = ImmutableList.builder();
         for (long i = 0; i < splitCount - 1; i++) {
             HostAddress address = addresses.get((int) (i % addresses.size()));
-            splits.add(new FakerSplit(ImmutableList.of(address), MAX_ROWS_PER_SPLIT));
+            splits.add(new FakerSplit(ImmutableList.of(address), i, MAX_ROWS_PER_SPLIT));
         }
         HostAddress address = addresses.get((int) ((splitCount - 1) % addresses.size()));
         long limit = fakerTable.limit() % MAX_ROWS_PER_SPLIT;
-        splits.add(new FakerSplit(ImmutableList.of(address), limit == 0 ? MAX_ROWS_PER_SPLIT : limit));
+        splits.add(new FakerSplit(ImmutableList.of(address), splitCount - 1, limit == 0 ? MAX_ROWS_PER_SPLIT : limit));
         return new FixedSplitSource(splits.build());
     }
 }
diff --git a/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java b/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java
index a11c9ca085139..e6c610059a835 100644
--- a/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java
+++ b/plugin/trino-faker/src/test/java/io/trino/plugin/faker/TestFakerQueries.java
@@ -186,19 +186,39 @@ rnd_nchar char(1000) NOT NULL,
     @Test
     void testSelectLimit()
     {
-        @Language("SQL")
-        String tableQuery = "CREATE TABLE faker.default.single_column (rnd_bigint bigint NOT NULL)";
-        assertUpdate(tableQuery);
+        assertUpdate("CREATE TABLE faker.default.single_column (rnd_bigint bigint NOT NULL)");
 
-        @Language("SQL")
-        String testQuery = "SELECT count(rnd_bigint) FROM (SELECT rnd_bigint FROM single_column LIMIT 5) a";
-        assertQuery(testQuery, "VALUES (5)");
+        assertQuery("SELECT count(rnd_bigint) FROM (SELECT rnd_bigint FROM single_column LIMIT 5) a",
+                "VALUES (5)");
 
-        testQuery = "SELECT count(rnd_bigint) FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a".formatted(2*MAX_ROWS_PER_SPLIT);
-        assertQuery(testQuery, "VALUES (%d)".formatted(2*MAX_ROWS_PER_SPLIT));
+        assertQuery("""
+                SELECT count(rnd_bigint)
+                FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a""".formatted(2 * MAX_ROWS_PER_SPLIT),
+                "VALUES (%d)".formatted(2 * MAX_ROWS_PER_SPLIT));
+
+        assertQuery("SELECT count(distinct rnd_bigint) FROM single_column LIMIT 5",
+                "VALUES (1000)");
+
+        assertQuery("""
+                SELECT count(rnd_bigint)
+                FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a""".formatted(MAX_ROWS_PER_SPLIT),
+                "VALUES (%d)".formatted(MAX_ROWS_PER_SPLIT));
+
+        // generating data should be deterministic
+        String testQuery = """
+                SELECT to_hex(checksum(rnd_bigint))
+                FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a""".formatted(3 * MAX_ROWS_PER_SPLIT);
+        assertQuery(testQuery, "VALUES ('1FB3289AC3A44EEA')");
+        assertQuery(testQuery, "VALUES ('1FB3289AC3A44EEA')");
+        assertQuery(testQuery, "VALUES ('1FB3289AC3A44EEA')");
+
+        // there should be no overlap between data generated from different splits
+        assertQuery("""
+                SELECT count(1)
+                FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a
+                JOIN (SELECT rnd_bigint FROM single_column LIMIT %d) b ON a.rnd_bigint = b.rnd_bigint""".formatted(2 * MAX_ROWS_PER_SPLIT, 5 * MAX_ROWS_PER_SPLIT),
+                "VALUES (%d)".formatted(2 * MAX_ROWS_PER_SPLIT));
 
-        testQuery = "SELECT count(distinct rnd_bigint) FROM single_column LIMIT 5";
-        assertQuery(testQuery, "VALUES (1000)");
         assertUpdate("DROP TABLE faker.default.single_column");
     }
 