From f406c0d31bb01c43d58c43dee9cb2a3f3b01e0e8 Mon Sep 17 00:00:00 2001
From: Sylvain Lesage
Date: Mon, 14 Mar 2022 15:13:37 +0100
Subject: [PATCH] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20truncate=20cell=20conten?=
 =?UTF-8?q?ts=20instead=20of=20removing=20rows=20(#178)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a ROWS_MIN_NUMBER environment variable, which defines the minimum
number of rows to return. If the size of these rows exceeds the
ROWS_MAX_BYTES limit, the cells themselves are truncated (converted to
strings, then cut to 100 bytes, a hardcoded limit). In that case, the
new field "truncated_cells" contains the list of cells (column names)
that have been truncated.

BREAKING CHANGE: 🧨 The /rows response format has changed
---
 .env.example                                |  3 +
 .github/workflows/unit-tests.yml            |  1 +
 Makefile                                    |  4 +-
 README.md                                   | 16 ++--
 src/datasets_preview_backend/config.py      |  2 +
 src/datasets_preview_backend/constants.py   |  4 +
 src/datasets_preview_backend/io/cache.py    | 97 ++++++++++++++++++---
 src/datasets_preview_backend/models/row.py  |  4 +-
 src/datasets_preview_backend/routes/rows.py |  8 +-
 9 files changed, 115 insertions(+), 24 deletions(-)

diff --git a/.env.example b/.env.example
index 91436d2912..6e289c410b 100644
--- a/.env.example
+++ b/.env.example
@@ -41,6 +41,9 @@
 # Max number of rows in the /rows endpoint response
 # ROWS_MAX_NUMBER=100
 
+# Min number of rows in the /rows endpoint response
+# ROWS_MIN_NUMBER=10
+
 # Number of uvicorn workers
 # WEB_CONCURRENCY = 2
 
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 29146e8677..cc6182a5be 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -28,6 +28,7 @@ jobs:
         run: sudo docker run -d -p 27018:27017 mongo:latest
       - name: Run unit tests
         env:
+          ROWS_MIN_NUMBER: 2
           ROWS_MAX_NUMBER: 5
           MONGO_CACHE_DATABASE: datasets_preview_cache_test
           MONGO_QUEUE_DATABASE: datasets_preview_queue_test
diff --git a/Makefile b/Makefile
index 51c4954b5a..7abc378041 100644
--- a/Makefile
+++ b/Makefile
@@ -14,11 +14,11 @@ watch:
 
 .PHONY: test
 test:
-	ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -x tests
+	ROWS_MIN_NUMBER=2 ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -x tests
 
 .PHONY: coverage
 coverage:
-	ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests
+	ROWS_MIN_NUMBER=2 ROWS_MAX_NUMBER=5 MONGO_CACHE_DATABASE="datasets_preview_cache_test" MONGO_QUEUE_DATABASE="datasets_preview_queue_test" poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests
 
 # Check that source code meets quality standards + security
 .PHONY: quality
diff --git a/README.md b/README.md
index 21368b84d6..5ed1c691c6 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ Set environment variables to configure the following aspects:
 - `MONGO_URL`: the URL used to connect to the mongo db server. Defaults to `"mongodb://localhost:27018"`.
 - `ROWS_MAX_BYTES`: max size of the /rows endpoint response in bytes. Defaults to `1_000_000` (1 MB).
 - `ROWS_MAX_NUMBER`: max number of rows in the /rows endpoint response. Defaults to `100`. 
+- `ROWS_MIN_NUMBER`: min number of rows in the /rows endpoint response. Defaults to `10`. - `WEB_CONCURRENCY`: the number of workers. For now, it's ignored and hardcoded to 1 because the cache is not shared yet. Defaults to `2`. For example: @@ -605,7 +606,7 @@ Parameters: Responses: -- `200`: JSON content that provides the types of the columns (see features at https://huggingface.co/docs/datasets/about_dataset_features.html) and the data rows, with the following structure. Note that the features are ordered and this order can be used to display the columns in a table for example. Binary values are transmitted in UTF-8 encoded base64 strings. The number of rows depends on `ROWS_MAX_BYTES` and `ROWS_MAX_NUMBER`. +- `200`: JSON content that provides the types of the columns (see features at https://huggingface.co/docs/datasets/about_dataset_features.html) and the data rows, with the following structure. Note that the features are ordered and this order can be used to display the columns in a table for example. Binary values are transmitted in UTF-8 encoded base64 strings. The number of rows depends on `ROWS_MAX_BYTES`, `ROWS_MIN_NUMBER` and `ROWS_MAX_NUMBER`. Note that the content of a cell might be truncated to fit within the limits, in which case the `truncated_cells` array will contain the name of the cell (see the last element in the example), and the cell content will always be a string. ```json { @@ -654,7 +655,8 @@ Responses: "hypothesis": "The cat did not sit on the mat.", "label": -1, "idx": 0 - } + }, + "truncated_cells": [] }, { "dataset": "glue", @@ -666,7 +668,8 @@ Responses: "hypothesis": "The cat sat on the mat.", "label": -1, "idx": 1 - } + }, + "truncated_cells": [] }, { "dataset": "glue", @@ -674,11 +677,12 @@ Responses: "split": "test", "row_idx": 2, "row": { - "premise": "When you've got no snow, it's really hard to learn a snow sport so we looked at all the different ways I could mimic being on snow without actually being on snow.", - "hypothesis": "When you've got snow, it's really hard to learn a snow sport so we looked at all the different ways I could mimic being on snow without actually being on snow.", + "premise": "When you've got no snow, it's really hard to learn a snow sport so we lo", + "hypothesis": "When you've got snow, it's really hard to learn a snow sport so we looke", "label": -1, "idx": 2 - } + }, + "truncated_cells": ["premise", "hypothesis"] } ] } diff --git a/src/datasets_preview_backend/config.py b/src/datasets_preview_backend/config.py index cb10246453..5e8964ed4d 100644 --- a/src/datasets_preview_backend/config.py +++ b/src/datasets_preview_backend/config.py @@ -16,6 +16,7 @@ DEFAULT_MONGO_URL, DEFAULT_ROWS_MAX_BYTES, DEFAULT_ROWS_MAX_NUMBER, + DEFAULT_ROWS_MIN_NUMBER, DEFAULT_WEB_CONCURRENCY, ) from datasets_preview_backend.utils import ( @@ -43,6 +44,7 @@ MONGO_URL = get_str_value(d=os.environ, key="MONGO_URL", default=DEFAULT_MONGO_URL) ROWS_MAX_BYTES = get_int_value(d=os.environ, key="ROWS_MAX_BYTES", default=DEFAULT_ROWS_MAX_BYTES) ROWS_MAX_NUMBER = get_int_value(d=os.environ, key="ROWS_MAX_NUMBER", default=DEFAULT_ROWS_MAX_NUMBER) +ROWS_MIN_NUMBER = get_int_value(d=os.environ, key="ROWS_MIN_NUMBER", default=DEFAULT_ROWS_MIN_NUMBER) WEB_CONCURRENCY = get_int_value(d=os.environ, key="WEB_CONCURRENCY", default=DEFAULT_WEB_CONCURRENCY) # Ensure datasets library uses the expected revision for canonical datasets diff --git a/src/datasets_preview_backend/constants.py b/src/datasets_preview_backend/constants.py index adda51940d..745220806b 100644 
--- a/src/datasets_preview_backend/constants.py +++ b/src/datasets_preview_backend/constants.py @@ -13,6 +13,7 @@ DEFAULT_MONGO_URL: str = "mongodb://localhost:27018" DEFAULT_ROWS_MAX_BYTES: int = 1_000_000 DEFAULT_ROWS_MAX_NUMBER: int = 100 +DEFAULT_ROWS_MIN_NUMBER: int = 10 DEFAULT_WEB_CONCURRENCY: int = 2 DEFAULT_HF_TOKEN: Optional[str] = None @@ -25,6 +26,9 @@ DEFAULT_REFRESH_PCT: int = 1 +# below 100 bytes, the cell content will not be truncated +DEFAULT_MIN_CELL_BYTES: int = 100 + # these datasets take too much time, we block them beforehand DATASETS_BLOCKLIST: List[str] = [ "imthanhlv/binhvq_news21_raw", diff --git a/src/datasets_preview_backend/io/cache.py b/src/datasets_preview_backend/io/cache.py index 4f945397ae..9da76c962d 100644 --- a/src/datasets_preview_backend/io/cache.py +++ b/src/datasets_preview_backend/io/cache.py @@ -28,6 +28,7 @@ from mongoengine.queryset.queryset import QuerySet from datasets_preview_backend.config import MONGO_CACHE_DATABASE, MONGO_URL +from datasets_preview_backend.constants import DEFAULT_MIN_CELL_BYTES from datasets_preview_backend.exceptions import ( Status400Error, Status500Error, @@ -134,6 +135,7 @@ class RowItem(TypedDict): split: str row_idx: int row: Dict[str, Any] + truncated_cells: List[str] class DbRow(Document): @@ -150,6 +152,7 @@ def to_item(self) -> RowItem: "split": self.split_name, "row_idx": self.row_idx, "row": self.row, + "truncated_cells": [], } meta = {"collection": "rows", "db_alias": "cache"} @@ -484,8 +487,8 @@ def get_splits_response(dataset_name: str) -> Tuple[Union[SplitsResponse, None], return splits_response, None, 200 -def get_size_in_bytes(row: RowItem): - return sys.getsizeof(orjson_dumps(row)) +def get_size_in_bytes(obj: Any): + return sys.getsizeof(orjson_dumps(obj)) # ^^ every row is transformed here in a string, because it corresponds to # the size the row will contribute in the JSON response to /rows endpoint. # The size of the string is measured in bytes. 
@@ -494,17 +497,85 @@ def get_size_in_bytes(row: RowItem): # dataset viewer table on the hub) -def to_row_items(rows: QuerySet[DbRow], rows_max_bytes: Optional[int]) -> List[RowItem]: +def truncate_cell(cell: Any, min_cell_bytes: int) -> str: + return orjson_dumps(cell)[:min_cell_bytes].decode("utf8", "ignore") + + +# Mutates row_item, and returns it anyway +def truncate_row_item(row_item: RowItem) -> RowItem: + min_cell_bytes = DEFAULT_MIN_CELL_BYTES + row = {} + for column_name, cell in row_item["row"].items(): + # for now: all the cells, but the smallest ones, are truncated + cell_bytes = get_size_in_bytes(cell) + if cell_bytes > min_cell_bytes: + row_item["truncated_cells"].append(column_name) + row[column_name] = truncate_cell(cell, min_cell_bytes) + else: + row[column_name] = cell + row_item["row"] = row + return row_item + + +# Mutates row_items, and returns them anyway +def truncate_row_items(row_items: List[RowItem], rows_max_bytes: int) -> List[RowItem]: + # compute the current size + rows_bytes = sum(get_size_in_bytes(row_item) for row_item in row_items) + + # Loop backwards, so that the last rows are truncated first + for row_item in reversed(row_items): + previous_size = get_size_in_bytes(row_item) + row_item = truncate_row_item(row_item) + new_size = get_size_in_bytes(row_item) + rows_bytes += new_size - previous_size + row_idx = row_item["row_idx"] + logger.debug(f"the size of the rows is now ({rows_bytes}) after truncating row idx={row_idx}") + if rows_bytes < rows_max_bytes: + break + return row_items + + +def to_row_items( + rows: QuerySet[DbRow], rows_max_bytes: Optional[int], rows_min_number: Optional[int] +) -> List[RowItem]: row_items = [] - bytes = 0 - for idx, row in enumerate(rows): + rows_bytes = 0 + if rows_min_number is None: + rows_min_number = 0 + else: + logger.debug(f"min number of rows in the response: '{rows_min_number}'") + if rows_max_bytes is not None: + logger.debug(f"max number of bytes in the response: '{rows_max_bytes}'") + + # two restrictions must be enforced: + # - at least rows_min_number rows + # - at most rows_max_bytes bytes + # To enforce this: + # 1. first get the first rows_min_number rows + for row in rows[:rows_min_number]: + row_item = row.to_item() + if rows_max_bytes is not None: + rows_bytes += get_size_in_bytes(row_item) + row_items.append(row_item) + + # 2. if the total is over the bytes limit, truncate the values, iterating backwards starting + # from the last rows, until getting under the threshold + if rows_max_bytes is not None and rows_bytes >= rows_max_bytes: + logger.debug( + f"the size of the first {rows_min_number} rows ({rows_bytes}) is above the max number of bytes" + f" ({rows_max_bytes}), they will be truncated" + ) + return truncate_row_items(row_items, rows_max_bytes) + + # 3. 
else: add the remaining rows until the end, or until the bytes threshold + for idx, row in enumerate(rows[rows_min_number:]): row_item = row.to_item() if rows_max_bytes is not None: - bytes += get_size_in_bytes(row_item) - if bytes >= rows_max_bytes: + rows_bytes += get_size_in_bytes(row_item) + if rows_bytes >= rows_max_bytes: logger.debug( - f"the rows in the split have been truncated to {idx} row(s) to keep the size ({bytes}) under the" - f" limit ({rows_max_bytes})" + f"the rows in the split have been truncated to {rows_min_number + idx} row(s) to keep the size" + f" ({rows_bytes}) under the limit ({rows_max_bytes})" ) break row_items.append(row_item) @@ -512,7 +583,11 @@ def to_row_items(rows: QuerySet[DbRow], rows_max_bytes: Optional[int]) -> List[R def get_rows_response( - dataset_name: str, config_name: str, split_name: str, rows_max_bytes: Optional[int] = None + dataset_name: str, + config_name: str, + split_name: str, + rows_max_bytes: Optional[int] = None, + rows_min_number: Optional[int] = None, ) -> Tuple[Union[RowsResponse, None], Union[ErrorItem, None], int]: try: split = DbSplit.objects(dataset_name=dataset_name, config_name=config_name, split_name=split_name).get() @@ -540,7 +615,7 @@ def get_rows_response( rows = DbRow.objects(dataset_name=dataset_name, config_name=config_name, split_name=split_name).order_by( "+row_idx" ) - row_items = to_row_items(rows, rows_max_bytes) + row_items = to_row_items(rows, rows_max_bytes, rows_min_number) rows_response: RowsResponse = { "columns": [column.to_item() for column in columns], "rows": row_items, diff --git a/src/datasets_preview_backend/models/row.py b/src/datasets_preview_backend/models/row.py index e5016e3a74..5e9e5480a1 100644 --- a/src/datasets_preview_backend/models/row.py +++ b/src/datasets_preview_backend/models/row.py @@ -31,11 +31,9 @@ def get_rows( elif not isinstance(dataset, Dataset): raise TypeError("load_dataset should return a Dataset") rows_plus_one = list(itertools.islice(dataset, ROWS_MAX_NUMBER + 1)) - # ^^ to be able to detect if a split has exactly DEFAULT_ROWS_MAX_NUMBER rows + # ^^ to be able to detect if a split has exactly ROWS_MAX_NUMBER rows if len(rows_plus_one) <= ROWS_MAX_NUMBER: logger.debug(f"all the rows in the split have been fetched ({len(rows_plus_one)})") else: logger.debug(f"the rows in the split have been truncated ({ROWS_MAX_NUMBER} rows)") return rows_plus_one[:ROWS_MAX_NUMBER] - # ^^ note that DEFAULT_ROWS_MAX_BYTES is not enforced here, but in typed_row.py - # after the type of the fields is known (ie: the row can be converted to JSON) diff --git a/src/datasets_preview_backend/routes/rows.py b/src/datasets_preview_backend/routes/rows.py index f1d9834bb0..c006e5bbca 100644 --- a/src/datasets_preview_backend/routes/rows.py +++ b/src/datasets_preview_backend/routes/rows.py @@ -3,7 +3,11 @@ from starlette.requests import Request from starlette.responses import Response -from datasets_preview_backend.config import MAX_AGE_LONG_SECONDS, ROWS_MAX_BYTES +from datasets_preview_backend.config import ( + MAX_AGE_LONG_SECONDS, + ROWS_MAX_BYTES, + ROWS_MIN_NUMBER, +) from datasets_preview_backend.exceptions import StatusError from datasets_preview_backend.io.cache import get_rows_response from datasets_preview_backend.routes._utils import get_response @@ -21,7 +25,7 @@ async def rows_endpoint(request: Request) -> Response: if not isinstance(dataset_name, str) or not isinstance(config_name, str) or not isinstance(split_name, str): raise StatusError("Parameters 'dataset', 'config' and 'split' are 
required", 400) rows_response, rows_error, status_code = get_rows_response( - dataset_name, config_name, split_name, ROWS_MAX_BYTES + dataset_name, config_name, split_name, ROWS_MAX_BYTES, ROWS_MIN_NUMBER ) return get_response(rows_response or rows_error, status_code, MAX_AGE_LONG_SECONDS) except StatusError as err: