diff --git a/conda.yaml b/conda.yaml
index c8f5d5fa1..1d69db8b8 100644
--- a/conda.yaml
+++ b/conda.yaml
@@ -6,6 +6,7 @@ maven>=3.6.0
 pytest>=6.0.1
 pytest-cov>=2.10.1
 sphinx>=3.2.1
+tzlocal>=2.1
 fastapi>=0.61.1
 uvicorn>=0.11.3
 pyarrow>=0.15.1
diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py
index 5205a49f1..9b4b15a4f 100644
--- a/dask_sql/physical/rex/core/call.py
+++ b/dask_sql/physical/rex/core/call.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 import operator
 from functools import reduce
 from typing import Any, Union, Callable
@@ -9,6 +10,7 @@
 import dask.dataframe as dd
 import dask.array as da
 import pandas as pd
+from tzlocal import get_localzone
 
 from dask_sql.physical.rex import RexConverter
 from dask_sql.physical.rex.base import BaseRexPlugin
@@ -16,6 +18,7 @@ from dask_sql.datacontainer import DataContainer
 
 logger = logging.getLogger(__name__)
 
+SeriesOrScalar = Union[dd.Series, Any]
 
 
 class Operation:
@@ -25,7 +28,7 @@ def __init__(self, f: Callable):
         """Init with the given function"""
         self.f = f
 
-    def __call__(self, *operands) -> Union[dd.Series, Any]:
+    def __call__(self, *operands) -> SeriesOrScalar:
         """Call the stored function"""
         return self.f(*operands)
 
@@ -94,11 +97,8 @@ def __init__(self):
         super().__init__(self.case)
 
     def case(
-        self,
-        where: Union[dd.Series, Any],
-        then: Union[dd.Series, Any],
-        other: Union[dd.Series, Any],
-    ) -> Union[dd.Series, Any]:
+        self, where: SeriesOrScalar, then: SeriesOrScalar, other: SeriesOrScalar,
+    ) -> SeriesOrScalar:
         """
         Returns `then` where `where`, else `other`.
         """
@@ -124,7 +124,7 @@ class IsFalseOperation(Operation):
     def __init__(self):
         super().__init__(self.false_)
 
-    def false_(self, df: Union[dd.Series, Any],) -> Union[dd.Series, Any]:
+    def false_(self, df: SeriesOrScalar,) -> SeriesOrScalar:
         """
         Returns true where `df` is false (where `df` can also be just a scalar).
         Returns false on nan.
@@ -141,7 +141,7 @@ class IsTrueOperation(Operation):
     def __init__(self):
         super().__init__(self.true_)
 
-    def true_(self, df: Union[dd.Series, Any],) -> Union[dd.Series, Any]:
+    def true_(self, df: SeriesOrScalar,) -> SeriesOrScalar:
        """
         Returns true where `df` is true (where `df` can also be just a scalar).
         Returns false on nan.
@@ -158,7 +158,7 @@ class NotOperation(Operation):
    def __init__(self):
         super().__init__(self.not_)
 
-    def not_(self, df: Union[dd.Series, Any],) -> Union[dd.Series, Any]:
+    def not_(self, df: SeriesOrScalar,) -> SeriesOrScalar:
         """
         Returns not `df` (where `df` can also be just a scalar).
         """
@@ -174,7 +174,7 @@ class IsNullOperation(Operation):
     def __init__(self):
         super().__init__(self.null)
 
-    def null(self, df: Union[dd.Series, Any],) -> Union[dd.Series, Any]:
+    def null(self, df: SeriesOrScalar,) -> SeriesOrScalar:
         """
         Returns true where `df` is null (where `df` can also be just a scalar).
""" @@ -184,15 +184,15 @@ def null(self, df: Union[dd.Series, Any],) -> Union[dd.Series, Any]: return pd.isna(df) or df is None or np.isnan(df) -class LikeOperation(Operation): - """The like operator (regex for SQL with some twist)""" +class RegexOperation(Operation): + """An abstract regex operation, which transforms the SQL regex into something python can understand""" def __init__(self): - super().__init__(self.like) + super().__init__(self.regex) - def like( - self, test: Union[dd.Series, Any], regex: str, escape: str = None, - ) -> Union[dd.Series, Any]: + def regex( + self, test: SeriesOrScalar, regex: str, escape: str = None, + ) -> SeriesOrScalar: """ Returns true, if the string test matches the given regex (maybe escaped by escape) @@ -219,30 +219,15 @@ def like( if char == "]": in_char_range = False - elif char == "[": - in_char_range = True - # These chars have a special meaning in regex # whereas in SQL they have not, so we need to # add additional escaping - elif char in [ - "#", - "$", - "^", - ".", - "|", - "~", - "-", - "+", - "*", - "?", - "(", - ")", - "{", - "}", - ]: + elif char in self.replacement_chars: char = "\\" + char + elif char == "[": + in_char_range = True + # The needed "\" is printed above, so we continue elif char == escape: escaped = True @@ -268,6 +253,38 @@ def like( return bool(re.match(transformed_regex, test)) +class LikeOperation(RegexOperation): + replacement_chars = [ + "#", + "$", + "^", + ".", + "|", + "~", + "-", + "+", + "*", + "?", + "(", + ")", + "{", + "}", + "[", + "]", + ] + + +class SimilarOperation(RegexOperation): + replacement_chars = [ + "#", + "$", + "^", + ".", + "~", + "-", + ] + + class PositionOperation(Operation): """The position operator (get the position of a string)""" @@ -354,6 +371,51 @@ def overlay(self, s, replace, start, length=None): return s +class ExtractOperation(Operation): + def __init__(self): + super().__init__(self.extract) + + def extract(self, what, df: SeriesOrScalar): + input_df = df + if is_frame(df): + df = df.dt + else: + df = pd.to_datetime(df) + + if what == "CENTURY": + return da.trunc(df.year / 100) + elif what == "DAY": + return df.day + elif what == "DECADE": + return da.trunc(df.year / 10) + elif what == "DOW": + return (df.dayofweek + 1) % 7 + elif what == "DOY": + return df.dayofyear + elif what == "HOUR": + return df.hour + elif what == "MICROSECOND": + return df.microsecond + elif what == "MILLENNIUM": + return da.trunc(df.year / 1000) + elif what == "MILLISECOND": + return da.trunc(1000 * df.microsecond) + elif what == "MINUTE": + return df.minute + elif what == "MONTH": + return df.month + elif what == "QUARTER": + return df.quarter + elif what == "SECOND": + return df.second + elif what == "WEEK": + return df.week + elif what == "YEAR": + return df.year + else: # pragma: no cover + raise NotImplementedError(f"Extraction of {what} is not (yet) implemented.") + + class RexCallPlugin(BaseRexPlugin): """ RexCall is used for expressions, which calculate something. 
@@ -389,6 +451,7 @@ class RexCallPlugin(BaseRexPlugin):
         # special operations
         "case": CaseOperation(),
         "like": LikeOperation(),
+        "similar to": SimilarOperation(),
         "not": NotOperation(),
         "is null": IsNullOperation(),
         "is not null": NotOperation().of(IsNullOperation()),
@@ -431,6 +494,13 @@ class RexCallPlugin(BaseRexPlugin):
         "overlay": OverlayOperation(),
         "substring": SubStringOperation(),
         "initcap": TensorScalarOperation(lambda x: x.str.title(), lambda x: x.title()),
+        # date/time operations
+        "extract": ExtractOperation(),
+        "localtime": Operation(lambda *args: pd.Timestamp.now()),
+        "localtimestamp": Operation(lambda *args: pd.Timestamp.now()),
+        "current_time": Operation(lambda *args: pd.Timestamp.now()),
+        "current_date": Operation(lambda *args: pd.Timestamp.now()),
+        "current_timestamp": Operation(lambda *args: pd.Timestamp.now()),
     }
 
     def convert(
@@ -438,7 +508,7 @@
         self,
         rex: "org.apache.calcite.rex.RexNode",
         dc: DataContainer,
         context: "dask_sql.Context",
-    ) -> Union[dd.Series, Any]:
+    ) -> SeriesOrScalar:
         # Prepare the operands by turning the RexNodes into python expressions
         operands = [
             RexConverter.convert(o, dc, context=context) for o in rex.getOperands()
diff --git a/dask_sql/utils.py b/dask_sql/utils.py
index 712fa21e1..bc42284a3 100644
--- a/dask_sql/utils.py
+++ b/dask_sql/utils.py
@@ -3,6 +3,7 @@ from collections import defaultdict
 from dask_sql.datacontainer import DataContainer
 import re
+from datetime import datetime
 import logging
 
 import dask.dataframe as dd
@@ -36,7 +37,12 @@ def is_frame(df):
     """
     Check if something is a dataframe (and not a scalar or none)
     """
-    return df is not None and not np.isscalar(df) and not isinstance(df, type(pd.NA))
+    return (
+        df is not None
+        and not np.isscalar(df)
+        and not isinstance(df, type(pd.NA))
+        and not isinstance(df, datetime)
+    )
 
 
 class Pluggable:
diff --git a/docs/pages/sql.rst b/docs/pages/sql.rst
index d019284b2..8957b2527 100644
--- a/docs/pages/sql.rst
+++ b/docs/pages/sql.rst
@@ -148,9 +148,15 @@ Binary Operations: ``AND``, ``OR``, ``>``, ``>=``, ``<``, ``<=``, ``=``, ``<>``,
 
 Unary Math Operations: ``ABS``, ``ACOS``, ``ASIN``, ``ATAN``, ``ATAN2``, ``CBRT``, ``CEIL``, ``COS``, ``COT``, ``DEGREES``, ``EXP``, ``FLOOR``, ``LOG10``, ``LN``, ``POWER``, ``RADIANS``, ``ROUND``, ``SIGN``, ``SIN``, ``TAN``, ``TRUNCATE``
 
-String operations: ``||``, ``CHAR_LENGTH``, ``UPPER``, ``LOWER``, ``POSITION``, ``TRIM``, ``OVERLAY``, ``SUBSTRING``, ``INITCAP``
+String operations: ``LIKE``, ``SIMILAR TO``, ``||``, ``CHAR_LENGTH``, ``UPPER``, ``LOWER``, ``POSITION``, ``TRIM``, ``OVERLAY``, ``SUBSTRING``, ``INITCAP``
 
-Special Operations: ``CASE``, ``LIKE``, ``NOT``, ``IS NULL``, ``IS NOT NULL``, ``IS TRUE``, ``IS NOT TRUE``, ``IS FALSE:``, ``IS NOT FALSE``, ``IS UNKNOWN``, ``IS NOT UNKNOWN``, ``EXISTS``
+Date operations: ``EXTRACT``, ``YEAR``, ``QUARTER``, ``MONTH``, ``WEEK``, ``DAYOFYEAR``, ``DAYOFMONTH``, ``DAYOFWEEK``, ``HOUR``, ``MINUTE``, ``SECOND``, ``LOCALTIME``, ``LOCALTIMESTAMP``, ``CURRENT_TIME``, ``CURRENT_DATE``, ``CURRENT_TIMESTAMP``
+
+.. note::
+
+    Due to a `bug/inconsistency `_ in Apache Calcite, both ``CURRENT_TIME`` and ``LOCALTIME`` return a time without a timezone and therefore behave identically.
+
+Special Operations: ``CASE``, ``NOT``, ``IS NULL``, ``IS NOT NULL``, ``IS TRUE``, ``IS NOT TRUE``, ``IS FALSE``, ``IS NOT FALSE``, ``IS UNKNOWN``, ``IS NOT UNKNOWN``, ``EXISTS``
 
 Aggregations
 ~~~~~~~~~~~~
diff --git a/tests/integration/test_postgres.py b/tests/integration/test_postgres.py
index e7c1c85ba..c555e6172 100644
--- a/tests/integration/test_postgres.py
+++ b/tests/integration/test_postgres.py
@@ -223,6 +223,14 @@ def test_string_operations(assert_query_gives_same_result):
         SELECT
             s,
             s || 'hello' || s,
+            s SIMILAR TO '%(b|d)%',
+            s SIMILAR TO '%(B|c)%',
+            s SIMILAR TO '%[a-zA-Z]%',
+            s SIMILAR TO '.*',
+            s LIKE '%(b|d)%',
+            s LIKE '%(B|c)%',
+            s LIKE '%[a-zA-Z]%',
+            s LIKE '.*',
             CHAR_LENGTH(s),
             UPPER(s),
             LOWER(s),
diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py
index 443b17ad0..1bf161d34 100644
--- a/tests/integration/test_rex.py
+++ b/tests/integration/test_rex.py
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import numpy as np
 import pandas as pd
 import dask.dataframe as dd
@@ -117,12 +119,21 @@ def test_like(c, string_table):
     df = c.sql(
         """
         SELECT * FROM string_table
-        WHERE a LIKE '%n[a-z]rmal st_i%'
+        WHERE a SIMILAR TO '%n[a-z]rmal st_i%'
         """
     ).compute()
 
     assert_frame_equal(df, string_table.iloc[[0]])
 
+    df = c.sql(
+        """
+        SELECT * FROM string_table
+        WHERE a LIKE '%n[a-z]rmal st_i%'
+        """
+    ).compute()
+
+    assert len(df) == 0
+
     df = c.sql(
         """
         SELECT * FROM string_table
@@ -132,6 +143,15 @@ def test_like(c, string_table):
 
     assert_frame_equal(df, string_table.iloc[[1]])
 
+    df = c.sql(
+        """
+        SELECT * FROM string_table
+        WHERE a SIMILAR TO '^|()-*r[r]$' ESCAPE 'r'
+        """
+    ).compute()
+
+    assert_frame_equal(df, string_table.iloc[[2]])
+
     df = c.sql(
         """
         SELECT * FROM string_table
@@ -387,3 +407,54 @@ def test_string_functions(c):
     assert_frame_equal(
         df.head(1), expected_df,
     )
+
+
+def test_date_functions(c):
+    date = datetime(2021, 10, 3, 15, 53, 42, 47)
+
+    df = dd.from_pandas(pd.DataFrame({"d": [date]}), npartitions=1)
+    c.register_dask_table(df, "df")
+
+    df = c.sql(
+        """
+        SELECT
+            EXTRACT(CENTURY FROM d) AS "century",
+            EXTRACT(DAY FROM d) AS "day",
+            EXTRACT(DECADE FROM d) AS "decade",
+            EXTRACT(DOW FROM d) AS "dow",
+            EXTRACT(DOY FROM d) AS "doy",
+            EXTRACT(HOUR FROM d) AS "hour",
+            EXTRACT(MICROSECOND FROM d) AS "microsecond",
+            EXTRACT(MILLENNIUM FROM d) AS "millennium",
+            EXTRACT(MILLISECOND FROM d) AS "millisecond",
+            EXTRACT(MINUTE FROM d) AS "minute",
+            EXTRACT(MONTH FROM d) AS "month",
+            EXTRACT(QUARTER FROM d) AS "quarter",
+            EXTRACT(SECOND FROM d) AS "second",
+            EXTRACT(WEEK FROM d) AS "week",
+            EXTRACT(YEAR FROM d) AS "year"
+        FROM df
+        """
+    ).compute()
+
+    expected_df = pd.DataFrame(
+        {
+            "century": [20],
+            "day": [3],
+            "decade": [202],
+            "dow": [0],
+            "doy": [276],
+            "hour": [15],
+            "microsecond": [47],
+            "millennium": [2],
+            "millisecond": [47000],
+            "minute": [53],
+            "month": [10],
+            "quarter": [4],
+            "second": [42],
+            "week": [39],
+            "year": [2021],
+        }
+    )
+
+    assert_frame_equal(df, expected_df, check_dtype=False)
diff --git a/tests/unit/test_call.py b/tests/unit/test_call.py
index d97fd5534..42b2b5e6c 100644
--- a/tests/unit/test_call.py
+++ b/tests/unit/test_call.py
@@ -1,5 +1,6 @@
 from unittest.mock import MagicMock
 import operator
+from datetime import datetime
 
 import dask.dataframe as dd
 import numpy as np
@@ -99,6 +100,8 @@ def test_like():
     assert op("a string", r"%a%") == True
     assert op("another string", r"a%") == True
     assert op("another string", r"s%") == False
+
+    op = call.SimilarOperation()
     assert op("normal", r"n[a-z]rm_l") == True
r"n[a-z]rm_l") == True assert op("not normal", r"n[a-z]rm_l") == False @@ -179,3 +182,24 @@ def test_string_operations(): assert ops_mapping["substring"](a, 2) == " normal string" assert ops_mapping["substring"](a, 2, 2) == " n" assert ops_mapping["initcap"](a) == "A Normal String" + + +def test_dates(): + op = call.ExtractOperation() + + date = datetime(2021, 10, 3, 15, 53, 42, 47) + assert int(op("CENTURY", date)) == 20 + assert op("DAY", date) == 3 + assert int(op("DECADE", date)) == 202 + assert op("DOW", date) == 0 + assert op("DOY", date) == 276 + assert op("HOUR", date) == 15 + assert op("MICROSECOND", date) == 47 + assert op("MILLENNIUM", date) == 2 + assert op("MILLISECOND", date) == 47000 + assert op("MINUTE", date) == 53 + assert op("MONTH", date) == 10 + assert op("QUARTER", date) == 4 + assert op("SECOND", date) == 42 + assert op("WEEK", date) == 39 + assert op("YEAR", date) == 2021