Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: LEAP-237: Patch ORM Leak vulnerability in open source #5012

Merged
merged 3 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion label_studio/core/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import re
from datetime import timedelta

from label_studio.core.utils.params import get_bool_env
from label_studio.core.utils.params import get_bool_env, get_env_list

formatter = 'standard'
JSON_LOG = get_bool_env('JSON_LOG', False)
Expand Down Expand Up @@ -668,3 +668,9 @@ def collect_versions_dummy(**kwargs):
REAL_HOSTNAME = os.getenv('HOSTNAME') # we have to use getenv, because we don't use LABEL_STUDIO_ prefix
GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS = get_bool_env('GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS', False)
PUBLIC_API_DOCS = get_bool_env('PUBLIC_API_DOCS', False)

# By default, we disallow filters with foreign keys in data manager for security reasons.
# Add to this list (either here in code, or via the env) to allow specific filters that rely on foreign keys.
DATA_MANAGER_FILTER_ALLOWLIST = list(
set(get_env_list('DATA_MANAGER_FILTER_ALLOWLIST') + ['updated_by__active_organization'])
)
18 changes: 15 additions & 3 deletions label_studio/core/utils/params.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from typing import Callable, Optional, Sequence, TypeVar

from rest_framework.exceptions import ValidationError

Expand Down Expand Up @@ -125,16 +126,27 @@ def get_bool_env(key, default):
return get_env(key, default, is_bool=True)


def get_env_list_int(key, default=None):
T = TypeVar('T')


def get_env_list(
key: str, default: Optional[Sequence[T]] = None, value_transform: Callable[[str], T] = str
) -> Sequence[T]:
"""
"1,2,3" in env variable => [1, 2, 3] in python
"foo,bar,baz" in env variable => ["foo", "bar", "baz"] in python.
Use value_transform to convert the strings to any other type.
"""
value = get_env(key)
if not value:
if default is None:
return []
return default
return [int(el) for el in value.split(',')]

return [value_transform(el) for el in value.split(',')]


def get_env_list_int(key, default=None) -> Sequence[int]:
return get_env_list(key, default=default, value_transform=int)


def get_all_env_with_prefix(prefix=None, is_bool=True, default_value=None):
Expand Down
37 changes: 32 additions & 5 deletions label_studio/data_manager/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
import logging
from collections import OrderedDict
from typing import Tuple
from urllib.parse import unquote

import ujson as json
Expand Down Expand Up @@ -337,11 +338,37 @@ def preprocess_filter(_filter, *_):
return _filter


def preprocess_field_name(raw_field_name, only_undefined_field=False):
field_name = raw_field_name.replace('filter:', '')
field_name = field_name.replace('tasks:', '')
ascending = False if field_name[0] == '-' else True # detect direction
field_name = field_name[1:] if field_name[0] == '-' else field_name # remove direction
def preprocess_field_name(raw_field_name, only_undefined_field=False) -> Tuple[str, bool]:
"""Transform a field name (as specified in the datamanager views endpoint) to
a django ORM field name. Also handle dotted accesses to task.data.

Edit with care; it's critical that this function not be changed in ways that
introduce vulnerabilities in the vein of the ORM Leak (see #5012). In particular
it is not advisable to use `replace` or other calls that replace all instances
of a string within this function.

Returns: Django ORM field name: str, Sort is ascending: bool
"""

field_name = raw_field_name
ascending = True

# Descending marker `-` may come at the beginning of the string
if field_name.startswith('-'):
ascending = False
field_name = field_name[1:]

# For security reasons, these must only be removed when they fall at the beginning of the string (or after `-`).
optional_prefixes = ['filter:', 'tasks:']
for prefix in optional_prefixes:
if field_name.startswith(prefix):
field_name = field_name[len(prefix) :]

# Descending marker may also come after other prefixes. Double negative is not allowed.
if ascending and field_name.startswith('-'):
ascending = False
field_name = field_name[1:]

if field_name.startswith('data.'):
if only_undefined_field:
field_name = f'data__{settings.DATA_UNDEFINED_NAME}'
Expand Down
44 changes: 44 additions & 0 deletions label_studio/data_manager/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import ujson as json
from data_manager.models import Filter, FilterGroup, View
from django.conf import settings
from django.db import transaction
from projects.models import Project
from rest_framework import serializers
Expand All @@ -18,6 +19,49 @@ class Meta:
model = Filter
fields = '__all__'

def validate_column(self, column: str) -> str:
"""
Ensure that the passed filter expression starts with 'filter:tasks:' and contains
no foreign key traversals. This means either the filter expression contains no '__'
substrings, or that it's the task.data json field that's accessed.

Users depending on foreign key traversals in views can allowlist them via the
DATA_MANAGER_FILTER_ALLOWLIST setting in the env.

Edit with care. The validations below are critical for security.
"""

column_copy = column

# We may support 'filter:annotations:' in the future, but we don't as of yet.
required_prefix = 'filter:tasks:'
optional_prefix = '-'

if not column_copy.startswith(required_prefix):
raise serializers.ValidationError(f'Filter "{column}" should start with "{required_prefix}"')

column_copy = column_copy[len(required_prefix) :]

if column_copy.startswith(optional_prefix):
column_copy = column_copy[len(optional_prefix) :]

if column_copy.startswith('data.'):
# Allow underscores if the filter is based on the `task.data` JSONField, because these don't leverage foreign keys.
return column

# Specific filters relying on foreign keys can be allowlisted
if column_copy in settings.DATA_MANAGER_FILTER_ALLOWLIST:
return column

# But in general, we don't allow foreign keys
if '__' in column_copy:
raise serializers.ValidationError(
f'"__" is not generally allowed in filters. Consider asking your administrator to add "{column_copy}" '
'to DATA_MANAGER_FILTER_ALLOWLIST, but note that some filter expressions may pose a security risk'
)

return column


class FilterGroupSerializer(serializers.ModelSerializer):
filters = FilterSerializer(many=True)
Expand Down
Loading
Loading