Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add third data quality metric #11939

Draft
wants to merge 17 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ polyglot java import org.enso.base.Time_Utils
polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator
polyglot java import org.enso.table.data.column.operation.CountNothing
polyglot java import org.enso.table.data.column.operation.CountUntrimmed
polyglot java import org.enso.table.data.column.operation.CountWhitespace
polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation
polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation
polyglot java import org.enso.table.data.column.operation.unary.IsFiniteOperation
Expand Down Expand Up @@ -2223,6 +2224,14 @@ type Column
if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else
CountUntrimmed.apply self.java_column sample_size

## PRIVATE
Counts the number of text values containing non-trivial whitespace.
Used as a data quality indicator in the Table Viz.
Returns `Nothing` when the column's value type is neither text nor `Mixed`;
otherwise `sample_size` is forwarded to the Java helper `CountWhitespace.apply`.
count_non_trivial_whitespace : Integer -> Integer | Nothing
count_non_trivial_whitespace self sample_size:Integer=Column.default_sample_size =
if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else
CountWhitespace.apply self.java_column sample_size

## PRIVATE
Default size for sampling data quality indicators.
default_sample_size -> Integer =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,11 @@ make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_ta
number_untrimmed = case all_rows_count > Column.default_sample_size of
False -> JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", columns.map .count_untrimmed]]
True -> JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", columns.map .count_untrimmed]]
[number_nothing, number_untrimmed]
number_non_triv = case all_rows_count > Column.default_sample_size of
False -> JS_Object.from_pairs [["name", "Count non trivial whitespace"], ["percentage_value", columns.map .count_non_trivial_whitespace]]
True -> JS_Object.from_pairs [["name", "Count non trivial whitespace (sampled)"], ["percentage_value", columns.map .count_non_trivial_whitespace]]
JS_Object.from_pairs
[number_nothing, number_untrimmed, number_non_triv]
pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"], child_label]
JS_Object.from_pairs pairs

Expand Down
89 changes: 68 additions & 21 deletions std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,25 @@ public static boolean has_leading_trailing_whitespace(String s) {
}

var trailing = Text_Utils.take_suffix(s, 1);
if (trailing != null && is_all_whitespace(trailing)) {
return true;
return trailing != null && is_all_whitespace(trailing);
}

/**
* Checks if the string contains any non trivial whitespace.
*
* @param s the string to check
* @return whether the string contains any of the non trivial whitespace listed
*/
public static boolean has_non_trivial_whitespace(String s) {
List<String> trivialWhiteSpaceList =
List.of(
"\u200A", "\u200B", "\u205F", "\u2004", "\u2005", "\u2006", "\u2008", "\u2009",
"\u2007", "\r", "\n", "\t", "\u2002", "\u00A0", "\u3000", "\u2003");

for (String white_space_to_check : trivialWhiteSpaceList) {
if (s.contains(white_space_to_check)) {
return true;
}
}

return false;
Expand Down Expand Up @@ -240,8 +257,12 @@ public static int compare_normalized_ignoring_case(String a, String b, Locale lo
public static boolean contains(String string, String substring) {
  // {@code StringSearch} does not handle empty strings as we would want, so the two empty-input
  // cases are answered here, before a searcher is constructed.
  if (substring.isEmpty()) {
    // The empty string counts as contained in every string, including the empty one.
    return true;
  }
  if (string.isEmpty()) {
    // A non-empty substring cannot occur inside an empty string.
    return false;
  }
  StringSearch searcher = new StringSearch(substring, string);
  return searcher.first() != StringSearch.DONE;
}
Expand All @@ -268,8 +289,12 @@ public static boolean ends_with(String string, String suffix) {
public static boolean contains_case_insensitive(String string, String substring, Locale locale) {
// {@code StringSearch} does not handle empty strings as we would want, so we need these special
// cases.
if (substring.isEmpty()) return true;
if (string.isEmpty()) return false;
if (substring.isEmpty()) {
return true;
}
if (string.isEmpty()) {
return false;
}

Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
StringSearch searcher = new StringSearch(fold.apply(substring), fold.apply(string));
Expand Down Expand Up @@ -335,12 +360,18 @@ public static String take_suffix(String str, long grapheme_length) {
* @return a UTF-16 code unit span of the first needle or null if not found.
*/
public static Utf16Span span_of(String haystack, String needle) {
  // An empty needle is reported as a zero-length span at the start of the haystack.
  if (needle.isEmpty()) {
    return new Utf16Span(0, 0);
  }
  // A non-empty needle cannot be found in an empty haystack; answered without building a searcher.
  if (haystack.isEmpty()) {
    return null;
  }

  StringSearch search = new StringSearch(needle, haystack);
  int pos = search.first();
  if (pos == StringSearch.DONE) {
    return null;
  }
  // The end of the span comes from the match length reported by the searcher, not from
  // needle.length().
  return new Utf16Span(pos, pos + search.getMatchLength());
}

Expand All @@ -356,11 +387,15 @@ public static Utf16Span last_span_of(String haystack, String needle) {
int afterLast = haystack.length();
return new Utf16Span(afterLast, afterLast);
}
if (haystack.isEmpty()) return null;
if (haystack.isEmpty()) {
return null;
}

StringSearch search = new StringSearch(needle, haystack);
int pos = search.last();
if (pos == StringSearch.DONE) return null;
if (pos == StringSearch.DONE) {
return null;
}
return new Utf16Span(pos, pos + search.getMatchLength());
}

Expand All @@ -372,10 +407,13 @@ public static Utf16Span last_span_of(String haystack, String needle) {
* @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
*/
public static List<Utf16Span> span_of_all(String haystack, String needle) {
if (needle.isEmpty())
if (needle.isEmpty()) {
throw new IllegalArgumentException(
"The operation `span_of_all` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
}
if (haystack.isEmpty()) {
return List.of();
}

StringSearch search = new StringSearch(needle, haystack);
ArrayList<Utf16Span> occurrences = new ArrayList<>();
Expand All @@ -396,10 +434,13 @@ public static List<Utf16Span> span_of_all(String haystack, String needle) {
* @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
*/
public static List<Utf16Span> span_of_all_multiple(String haystack, List<String> needles) {
if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty))
if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty)) {
throw new IllegalArgumentException(
"The operation `span_of_all_multiple` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
}
if (haystack.isEmpty()) {
return List.of();
}

StringSearch stringSearches[] =
IntStream.range(0, needles.size())
Expand Down Expand Up @@ -514,10 +555,13 @@ public static long[] utf16_indices_to_grapheme_indices(String text, List<Long> c
*/
public static GraphemeSpan span_of_case_insensitive(
String haystack, String needle, Locale locale, boolean searchForLast) {
if (needle.isEmpty())
if (needle.isEmpty()) {
throw new IllegalArgumentException(
"The operation `span_of_case_insensitive` does not support searching for an empty term.");
if (haystack.isEmpty()) return null;
}
if (haystack.isEmpty()) {
return null;
}

CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
Expand Down Expand Up @@ -545,11 +589,14 @@ public static GraphemeSpan span_of_case_insensitive(
*/
public static List<GraphemeSpan> span_of_all_case_insensitive(
String haystack, String needle, Locale locale) {
if (needle.isEmpty())
if (needle.isEmpty()) {
throw new IllegalArgumentException(
"The operation `span_of_all_case_insensitive` does not support searching for an empty"
+ " term.");
if (haystack.isEmpty()) return List.of();
}
if (haystack.isEmpty()) {
return List.of();
}

CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
Expand Down Expand Up @@ -647,11 +694,11 @@ public static String normalize(String str) {
/**
 * Normalizes the string to its canonical Unicode form using the specified name and mode.
 *
 * <p>The work is delegated to the {@code Normalizer2} instance obtained for {@code name} and
 * {@code mode}; {@code null} is passed as the first argument of {@code getInstance}.
 *
 * @param str the string to normalize
 * @param name the normalization name, must be "nfc", "nfkc", or "nfkc_cf"
 * @param mode the normalization mode
 * @return the normalized string
 * @see <a
 *     href="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.html">Normalizer2</a>
 * @see <a
 *     href="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.Mode.html">Normalizer2.Mode</a>
 */
public static String normalizeWithMode(String str, String name, Mode mode) {
return Normalizer2.getInstance(null, name, mode).normalize(str);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package org.enso.table.data.column.operation;

import java.util.Random;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.storage.ColumnStorage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.data.table.Column;
import org.graalvm.polyglot.Context;

/**
 * Data quality metric counting the cells whose text contains non trivial whitespace, as decided
 * by {@code Text_Utils.has_non_trivial_whitespace}.
 *
 * <p>When the storage holds more rows than the requested sample size, rows are sampled with a
 * fixed seed (so repeated runs give the same answer) and the sampled count is scaled up to the
 * full storage size.
 */
public class CountWhitespace {
  // Default seed for random number generation (no specific reason for this value, just stability
  // on result).
  private static final long RANDOM_SEED = 672716252;

  // Default sample size for counting cells that contain non trivial whitespace.
  public static final long DEFAULT_SAMPLE_SIZE = 10000;

  /**
   * Counts the number of cells in the column with non trivial whitespace.
   *
   * @param column the column to inspect
   * @param sampleSize the maximum number of cells to inspect
   * @return the (possibly extrapolated) count; see {@link #applyToStorage(ColumnStorage, long)}
   * @throws InterruptedException declared because the cached count of a {@code StringStorage} is
   *     awaited
   */
  public static Long apply(Column column, long sampleSize) throws InterruptedException {
    return applyToStorage(column.getStorage(), sampleSize);
  }

  /**
   * Counts the number of cells in the given storage with non trivial whitespace.
   *
   * <p>For a {@code StringStorage} queried with {@link #DEFAULT_SAMPLE_SIZE} the storage's cached
   * count is returned (whatever {@code cachedWhitespaceCount()} yields, including {@code null});
   * every other combination is computed on the spot with the current polyglot context.
   *
   * @param storage the storage to inspect
   * @param sampleSize the maximum number of cells to inspect
   * @return the (possibly extrapolated) count
   * @throws InterruptedException declared because the cached count is awaited
   */
  public static Long applyToStorage(ColumnStorage storage, long sampleSize)
      throws InterruptedException {
    if (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) {
      return stringStorage.cachedWhitespaceCount();
    }
    return compute(storage, sampleSize, Context.getCurrent());
  }

  /**
   * Internal method performing the calculation on a storage.
   *
   * @param storage the storage to inspect
   * @param sampleSize the maximum number of cells to inspect; when it is smaller than the storage
   *     size, that many random cells (drawn with replacement) are inspected and the result is
   *     extrapolated
   * @param context polyglot context polled for safepoints after every inspected cell, or {@code
   *     null} to skip the polling
   * @return the exact count when the whole storage is scanned, otherwise the sampled count scaled
   *     to the storage size (rounded up and capped at the size); 0 for a non-positive sample size
   * @throws ArithmeticException if sampling is required and the storage size does not fit an
   *     {@code int}
   */
  public static long compute(ColumnStorage storage, long sampleSize, Context context) {
    long size = storage.getSize();

    if (sampleSize >= size) {
      // The sample covers everything: scan every cell and return the exact count.
      long count = 0;
      for (long i = 0; i < size; i++) {
        if (cellHasNonTrivialWhitespace(storage, i)) {
          count++;
        }
        pollSafepoint(context);
      }
      return count;
    }

    if (sampleSize <= 0) {
      // Nothing may be inspected, so nothing is counted (and no division by zero below).
      return 0;
    }

    // Loop-invariant: the bound for the random index only depends on the storage size.
    int bound = Math.toIntExact(size);
    var rng = new Random(RANDOM_SEED);
    long hits = 0;
    for (long i = 0; i < sampleSize; i++) {
      if (cellHasNonTrivialWhitespace(storage, rng.nextInt(bound))) {
        hits++;
      }
      pollSafepoint(context);
    }

    // Scale the sampled hit ratio up to the whole storage, never exceeding its size.
    return Math.min(size, (long) Math.ceil((double) hits / sampleSize * size));
  }

  /** Checks whether the cell at the given index is a string with non trivial whitespace. */
  private static boolean cellHasNonTrivialWhitespace(ColumnStorage storage, long index) {
    return storage.getItemAsObject(index) instanceof String str
        && Text_Utils.has_non_trivial_whitespace(str);
  }

  /** Polls the context's safepoint, if a context was supplied. */
  private static void pollSafepoint(Context context) {
    if (context != null) {
      context.safepoint();
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.enso.base.CompareException;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.operation.CountUntrimmed;
import org.enso.table.data.column.operation.CountWhitespace;
import org.enso.table.data.column.operation.map.BinaryMapOperation;
import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
import org.enso.table.data.column.operation.map.MapOperationStorage;
Expand All @@ -23,10 +24,12 @@

/** A column storing strings. */
public final class StringStorage extends SpecializedStorage<String> {

private static final Logger LOGGER = org.slf4j.LoggerFactory.getLogger(StringStorage.class);

private final TextType type;
private Future<Long> untrimmedCount;
private Future<Long> whitespaceCount;

/**
* @param data the underlying data
Expand All @@ -40,6 +43,10 @@ public StringStorage(String[] data, int size, TextType type) {
untrimmedCount =
CompletableFuture.supplyAsync(
() -> CountUntrimmed.compute(this, CountUntrimmed.DEFAULT_SAMPLE_SIZE, null));

whitespaceCount =
CompletableFuture.supplyAsync(
() -> CountWhitespace.compute(this, CountWhitespace.DEFAULT_SAMPLE_SIZE, null));
}

@Override
Expand Down Expand Up @@ -80,6 +87,29 @@ public Long cachedUntrimmedCount() throws InterruptedException {
}
}

/**
 * Counts the number of cells in the column with non trivial whitespace, using the value computed
 * in the background for this storage. If the calculation fails then it returns null.
 *
 * <p>If the background computation was cancelled, the value is recomputed synchronously with the
 * current polyglot context and stored as the new cached result.
 *
 * @return the number of cells with non trivial whitespace, or {@code null} if the computation
 *     failed
 * @throws InterruptedException if the calling thread is interrupted while waiting for the result
 */
public Long cachedWhitespaceCount() throws InterruptedException {
  // Inspect the whitespace future itself (not the untrimmed one) to decide on a recompute.
  if (whitespaceCount.isCancelled()) {
    // Need to recompute the value, as was cancelled.
    whitespaceCount =
        CompletableFuture.completedFuture(
            CountWhitespace.compute(
                this, CountWhitespace.DEFAULT_SAMPLE_SIZE, Context.getCurrent()));
  }

  try {
    return whitespaceCount.get();
  } catch (ExecutionException e) {
    LOGGER.error("Failed to compute non trivial whitespace count", e);
    return null;
  }
}

private static MapOperationStorage<String, SpecializedStorage<String>> buildOps() {
MapOperationStorage<String, SpecializedStorage<String>> t = ObjectStorage.buildObjectOps();
t.add(
Expand Down Expand Up @@ -277,6 +307,7 @@ public StorageType inferPreciseTypeShrunk() {
}

private abstract static class StringComparisonOp extends StringBooleanOp {

public StringComparisonOp(String name) {
super(name);
}
Expand Down
Loading
Loading