From 6eefb4012a84cef7549f8fe26281950a5f912196 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 6 Feb 2024 20:50:05 +0000 Subject: [PATCH 01/33] feat: Support pd.to_datetime for scalars, iterables and series. --- bigframes/core/compile/scalar_op_compiler.py | 28 ++++++ bigframes/core/tools/__init__.py | 19 ++++ bigframes/core/tools/datetimes.py | 92 +++++++++++++++++++ bigframes/operations/__init__.py | 11 +++ bigframes/pandas/__init__.py | 22 +++++ tests/system/small/test_pandas.py | 50 ++++++++++ .../pandas/core/tools/__init__.py | 0 .../pandas/core/tools/datetimes.py | 32 +++++++ 8 files changed, 254 insertions(+) create mode 100644 bigframes/core/tools/__init__.py create mode 100644 bigframes/core/tools/datetimes.py create mode 100644 third_party/bigframes_vendored/pandas/core/tools/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/tools/datetimes.py diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index bf0755acc7..17ddea4847 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -40,6 +40,14 @@ # ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. _FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) +UNIT_TO_US_CONVERSION_FACTORS = { + "D": 24 * 60 * 60 * 1e6, + "s": 1e6, + "ms": 1e3, + "us": 1, + "ns": 1e-3, +} + class ScalarOpCompiler: # Mapping of operation name to implemenations @@ -656,6 +664,26 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): return x.isin(matchable_ibis_values) +@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) +def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): + if x.type() == ibis_dtypes.str: + if op.format is None: + return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) + return x.to_timestamp(op.format).cast( + ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None) + ) + + if x.type() in (ibis_dtypes.timestamp, ibis_dtypes.Timestamp(timezone="UTC")): + return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) + + x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(op.unit, 1e-3) + x_converted = x_converted.cast(ibis_dtypes.int64) + x_datetime = x_converted.to_timestamp(unit="us") + if op.utc: + x_datetime = x_datetime.cast(ibis_dtypes.Timestamp(timezone="UTC")) + return x_datetime + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): if not hasattr(op.func, "bigframes_remote_function"): diff --git a/bigframes/core/tools/__init__.py b/bigframes/core/tools/__init__.py new file mode 100644 index 0000000000..38563510a7 --- /dev/null +++ b/bigframes/core/tools/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.core.tools.datetimes import to_datetime + +__all__ = [ + "to_datetime", +] diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py new file mode 100644 index 0000000000..a689c8492c --- /dev/null +++ b/bigframes/core/tools/datetimes.py @@ -0,0 +1,92 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Mapping +from datetime import datetime +from typing import Iterable, Literal, overload, Union + +import pandas as pd + +import bigframes.constants as constants +import bigframes.core.global_session as global_session +import bigframes.dataframe +import bigframes.operations as ops +import bigframes.series + + +@overload +def to_datetime( + arg: Union[int, float, str, datetime], + *, + utc: bool = False, + format: str | None = None, + unit: str | None = None, +): + ... + + +@overload +def to_datetime( + arg: Union[Iterable, pd.Series, pd.DataFrame, Mapping], + *, + utc: bool = False, + format: str | None = None, + unit: str | None = None, +): + ... + + +def to_datetime( + arg: Union[bigframes.series.Series, bigframes.dataframe.DataFrame], + *, + utc: bool = False, + format: str | None = None, + unit: str | None = None, +): + if isinstance(arg, (int, float, str, datetime)): + return pd.to_datetime( + arg, + utc=utc, + format=format, + unit=unit, + ) + + if isinstance(arg, (Mapping, pd.DataFrame, bigframes.dataframe.DataFrame)): + raise NotImplementedError( + "Conversion of Mapping, pandas.DataFrame, or bigframes.dataframe.DataFrame " + f"to datetime is not implemented. {constants.FEEDBACK_LINK}" + ) + + if ~isinstance(arg, bigframes.series.Series): + # TODO: Currently, data upload is performed using pandas DataFrames + # combined with the `read_pandas` method due to the BigFrames DataFrame + # constructor's limitations in handling various data types. Plan to update + # the upload process to utilize the BigPandas DataFrame constructor directly + # once it is enhanced for more related datatypes. + arg = global_session.with_default_session( + bigframes.session.Session.read_pandas, + arg if isinstance(arg, pd.Series) else pd.DataFrame(arg), + ) + + if len(arg.columns) != 1: + raise ValueError("Input must be 1-dimensional.") + + arg = arg[arg.columns[0]] + return arg._apply_unary_op( + ops.ToDatetimeOp( + utc=utc, + format=format, + unit=unit, + ) + ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index b40f42a3e8..579f3d48e3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -404,6 +404,17 @@ def output_type(self, *input_types): return input_types[0] +@dataclasses.dataclass(frozen=True) +class ToDatetimeOp(UnaryOp): + name: typing.ClassVar[str] = "to_datetime" + utc: bool = (False,) + format: str | None = (None,) + unit: str | None = (None,) + + def output_type(self, *input_types): + return input_types[0] + + # Binary Ops fillna_op = create_binary_op(name="fillna") cliplower_op = create_binary_op(name="clip_lower") diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 554acda202..385c706f2e 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -52,6 +52,7 @@ import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape +import bigframes.core.tools import bigframes.dataframe import bigframes.operations as ops import bigframes.series @@ -61,6 +62,7 @@ import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile +import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes import third_party.bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq @@ -635,6 +637,25 @@ def read_gbq_function(function_name: str): read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) + +def to_datetime( + arg, + *, + utc: bool = False, + format: str | None = None, + unit: str | None = None, +): + return bigframes.core.tools.to_datetime( + arg, + utc=utc, + format=format, + unit=unit, + ) + + +to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__ + + # pandas dtype attributes NA = pandas.NA BooleanDtype = pandas.BooleanDtype @@ -680,6 +701,7 @@ def read_gbq_function(function_name: str): "read_pandas", "read_pickle", "remote_function", + "to_datetime", # pandas dtype attributes "NA", "BooleanDtype", diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 0910c0b7e2..27e981247a 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from datetime import datetime + import pandas as pd import pytest @@ -477,3 +479,51 @@ def test_qcut(scalars_dfs, q): pd_result = pd_result.astype("Int64") pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("arg", "utc", "unit", "format"), + [ + (173872738, False, None, None), + (32787983.23, True, "s", None), + ("2023-01-01", False, None, "%Y-%m-%d"), + (datetime(2023, 1, 1, 12, 0), False, None, None), + ], +) +def test_to_datetime_scalar(arg, utc, unit, format): + bf_result = bpd.to_datetime(arg, utc=utc, unit=unit, format=format) + pd_result = pd.to_datetime(arg, utc=utc, unit=unit, format=format) + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("arg", "utc", "unit", "format"), + [ + ([173872738], False, None, None), + ([32787983.23], True, "s", None), + ([datetime(2023, 1, 1, 12, 0)], False, None, None), + (["2023-01-01 12:00"], False, None, "%Y-%m-%d %H:%M"), + (["2023-01-01"], True, None, None), + (["01-31-2023 14:00", "02-01-2023 15:00"], False, None, "%m-%d-%Y %H:%M"), + ], +) +def test_to_datetime_iterable(arg, utc, unit, format): + bf_result = ( + bpd.to_datetime(arg, utc=utc, unit=unit, format=format) + .to_pandas() + .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") + ) + pd_result = pd.Series( + pd.to_datetime(arg, utc=utc, unit=unit, format=format) + ).dt.floor("us") + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) + + +def test_to_datetime_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col = "int64_too" + bf_result = bpd.to_datetime(scalars_df[col]).to_pandas().astype("datetime64[ns]") + pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col])).dt.floor("us") diff --git a/third_party/bigframes_vendored/pandas/core/tools/__init__.py b/third_party/bigframes_vendored/pandas/core/tools/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py new file mode 100644 index 0000000000..5b67ce5e87 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -0,0 +1,32 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py + +from typing import Literal + + +def to_datetime( + arg, + *, + utc=False, + format=None, + unit=None, +): + """ + This function converts a scalar, array-like or Series to a pandas datetime object. + + arg (int, float, str, datetime, list, tuple, 1-d array, Series): + The object to convert to a datetime. If a DataFrame is provided, the method + expects minimally the following columns: "year", "month", "day". The column + “year” must be specified in 4-digit format. + + utc (bool, default False): + Control timezone-related parsing, localization and conversion. If True, the + function always returns a timezone-aware UTC-localized timestamp or series. + If False (default), inputs will not be coerced to UTC. + + format (str, default None): + The strftime to parse time, e.g. "%d/%m/%Y". + + unit (str, default 'ns'): + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or + float number. + """ From 033e3381745b8323065cb871db550732efde1165 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 6 Feb 2024 21:06:28 +0000 Subject: [PATCH 02/33] update test and docstring --- bigframes/core/tools/datetimes.py | 20 +++++++++---------- bigframes/pandas/__init__.py | 6 +++--- tests/system/small/test_pandas.py | 11 ++++++++-- .../pandas/core/tools/datetimes.py | 5 +++++ 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index a689c8492c..0fee5da0e3 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -14,7 +14,7 @@ from collections.abc import Mapping from datetime import datetime -from typing import Iterable, Literal, overload, Union +from typing import Iterable, Optional, overload, Union import pandas as pd @@ -29,9 +29,9 @@ def to_datetime( arg: Union[int, float, str, datetime], *, - utc: bool = False, - format: str | None = None, - unit: str | None = None, + utc: Optional[bool] = False, + format: Optional[str] = None, + unit: Optional[str] = None, ): ... @@ -40,9 +40,9 @@ def to_datetime( def to_datetime( arg: Union[Iterable, pd.Series, pd.DataFrame, Mapping], *, - utc: bool = False, - format: str | None = None, - unit: str | None = None, + utc: Optional[bool] = False, + format: Optional[str] = None, + unit: Optional[str] = None, ): ... @@ -50,9 +50,9 @@ def to_datetime( def to_datetime( arg: Union[bigframes.series.Series, bigframes.dataframe.DataFrame], *, - utc: bool = False, - format: str | None = None, - unit: str | None = None, + utc: Optional[bool] = False, + format: Optional[str] = None, + unit: Optional[str] = None, ): if isinstance(arg, (int, float, str, datetime)): return pd.to_datetime( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 385c706f2e..768ab2e352 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -641,9 +641,9 @@ def read_gbq_function(function_name: str): def to_datetime( arg, *, - utc: bool = False, - format: str | None = None, - unit: str | None = None, + utc: Optional[bool] = False, + format: Optional[str] = None, + unit: Optional[str] = None, ): return bigframes.core.tools.to_datetime( arg, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 27e981247a..1590200c56 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -525,5 +525,12 @@ def test_to_datetime_iterable(arg, utc, unit, format): def test_to_datetime_series(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col = "int64_too" - bf_result = bpd.to_datetime(scalars_df[col]).to_pandas().astype("datetime64[ns]") - pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col])).dt.floor("us") + bf_result = ( + bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") + ) + pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col], unit="s")) + print(bf_result) + print(pd_result) + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_names=False + ) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 5b67ce5e87..3b7bbbe4e4 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -13,6 +13,11 @@ def to_datetime( """ This function converts a scalar, array-like or Series to a pandas datetime object. + .. note:: + BigQuery only supports precision up to microseconds (us). Therefore, when working + with timestamps that have a finer granularity than microseconds, be aware that + the additional precision will not be represented in BigQuery. + arg (int, float, str, datetime, list, tuple, 1-d array, Series): The object to convert to a datetime. If a DataFrame is provided, the method expects minimally the following columns: "year", "month", "day". The column From e4feb09940304c7e839ceca7b236e6c6a46fdc12 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 6 Feb 2024 22:20:09 +0000 Subject: [PATCH 03/33] update types --- bigframes/core/compile/scalar_op_compiler.py | 5 ++-- bigframes/core/tools/datetimes.py | 30 ++++---------------- bigframes/operations/__init__.py | 6 ++-- bigframes/pandas/__init__.py | 2 +- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 17ddea4847..0db4ff70e1 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -675,8 +675,9 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() in (ibis_dtypes.timestamp, ibis_dtypes.Timestamp(timezone="UTC")): return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) - - x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(op.unit, 1e-3) + + unit = op.unit if op.unit is not None else 'ns' + x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(unit, 1e-3) x_converted = x_converted.cast(ibis_dtypes.int64) x_datetime = x_converted.to_timestamp(unit="us") if op.utc: diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 0fee5da0e3..c7cdd605a0 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -24,33 +24,13 @@ import bigframes.operations as ops import bigframes.series +local_scalars = Union[int, float, str, datetime] +local_iterables = Union[Iterable, pd.Series, pd.DataFrame, Mapping] -@overload def to_datetime( - arg: Union[int, float, str, datetime], + arg: Union[local_scalars, local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame], *, - utc: Optional[bool] = False, - format: Optional[str] = None, - unit: Optional[str] = None, -): - ... - - -@overload -def to_datetime( - arg: Union[Iterable, pd.Series, pd.DataFrame, Mapping], - *, - utc: Optional[bool] = False, - format: Optional[str] = None, - unit: Optional[str] = None, -): - ... - - -def to_datetime( - arg: Union[bigframes.series.Series, bigframes.dataframe.DataFrame], - *, - utc: Optional[bool] = False, + utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, ): @@ -83,7 +63,7 @@ def to_datetime( raise ValueError("Input must be 1-dimensional.") arg = arg[arg.columns[0]] - return arg._apply_unary_op( + return arg._apply_unary_op( # type: ignore ops.ToDatetimeOp( utc=utc, format=format, diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 579f3d48e3..37188e490e 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -407,9 +407,9 @@ def output_type(self, *input_types): @dataclasses.dataclass(frozen=True) class ToDatetimeOp(UnaryOp): name: typing.ClassVar[str] = "to_datetime" - utc: bool = (False,) - format: str | None = (None,) - unit: str | None = (None,) + utc: bool = False + format: typing.Optional[str] = None + unit: typing.Optional[str] = None def output_type(self, *input_types): return input_types[0] diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 768ab2e352..25978d7be6 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -641,7 +641,7 @@ def read_gbq_function(function_name: str): def to_datetime( arg, *, - utc: Optional[bool] = False, + utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, ): From 35f14f50ce3af6b4d2bc0f62cbed54c19ee291c8 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 6 Feb 2024 22:25:05 +0000 Subject: [PATCH 04/33] format update --- bigframes/core/compile/scalar_op_compiler.py | 4 ++-- bigframes/core/tools/datetimes.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 0db4ff70e1..5efdb9be31 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -675,8 +675,8 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() in (ibis_dtypes.timestamp, ibis_dtypes.Timestamp(timezone="UTC")): return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) - - unit = op.unit if op.unit is not None else 'ns' + + unit = op.unit if op.unit is not None else "ns" x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(unit, 1e-3) x_converted = x_converted.cast(ibis_dtypes.int64) x_datetime = x_converted.to_timestamp(unit="us") diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index c7cdd605a0..88b7b1084a 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -27,8 +27,14 @@ local_scalars = Union[int, float, str, datetime] local_iterables = Union[Iterable, pd.Series, pd.DataFrame, Mapping] + def to_datetime( - arg: Union[local_scalars, local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame], + arg: Union[ + local_scalars, + local_iterables, + bigframes.series.Series, + bigframes.dataframe.DataFrame, + ], *, utc: bool = False, format: Optional[str] = None, @@ -63,7 +69,7 @@ def to_datetime( raise ValueError("Input must be 1-dimensional.") arg = arg[arg.columns[0]] - return arg._apply_unary_op( # type: ignore + return arg._apply_unary_op( # type: ignore ops.ToDatetimeOp( utc=utc, format=format, From 22ede7d3e42c5427eda11eea34d6a5a593de44ea Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 6 Feb 2024 22:27:01 +0000 Subject: [PATCH 05/33] remove import. --- bigframes/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 88b7b1084a..ae0f193d6b 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -14,7 +14,7 @@ from collections.abc import Mapping from datetime import datetime -from typing import Iterable, Optional, overload, Union +from typing import Iterable, Optional, Union import pandas as pd From af274cbf906d35bd20082ac721c7a795bf0ea4b9 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 6 Feb 2024 22:34:47 +0000 Subject: [PATCH 06/33] update docstring --- .../pandas/core/tools/datetimes.py | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 3b7bbbe4e4..9408b98f58 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -18,20 +18,24 @@ def to_datetime( with timestamps that have a finer granularity than microseconds, be aware that the additional precision will not be represented in BigQuery. - arg (int, float, str, datetime, list, tuple, 1-d array, Series): - The object to convert to a datetime. If a DataFrame is provided, the method - expects minimally the following columns: "year", "month", "day". The column - “year” must be specified in 4-digit format. - - utc (bool, default False): - Control timezone-related parsing, localization and conversion. If True, the - function always returns a timezone-aware UTC-localized timestamp or series. - If False (default), inputs will not be coerced to UTC. - - format (str, default None): - The strftime to parse time, e.g. "%d/%m/%Y". - - unit (str, default 'ns'): - The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or - float number. + Args: + arg (int, float, str, datetime, list, tuple, 1-d array, Series): + The object to convert to a datetime. If a DataFrame is provided, the method + expects minimally the following columns: "year", "month", "day". The column + “year” must be specified in 4-digit format. + + utc (bool, default False): + Control timezone-related parsing, localization and conversion. If True, the + function always returns a timezone-aware UTC-localized timestamp or series. + If False (default), inputs will not be coerced to UTC. + + format (str, default None): + The strftime to parse time, e.g. "%d/%m/%Y". + + unit (str, default 'ns'): + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or + float number. + + Returns: + Timestamp, datetime.datetime or bigframes.series.Series: Return type depends on input. """ From fe955dbb03b7776085a2faf06bae2b476de1650e Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 6 Feb 2024 23:37:35 +0000 Subject: [PATCH 07/33] update arg conversion --- bigframes/core/tools/datetimes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index ae0f193d6b..6637807b6b 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -61,8 +61,7 @@ def to_datetime( # the upload process to utilize the BigPandas DataFrame constructor directly # once it is enhanced for more related datatypes. arg = global_session.with_default_session( - bigframes.session.Session.read_pandas, - arg if isinstance(arg, pd.Series) else pd.DataFrame(arg), + bigframes.session.Session.read_pandas, pd.DataFrame(arg) ) if len(arg.columns) != 1: From 8c1f6337ff9d347cce4eca655f564203d82176dc Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 7 Feb 2024 01:41:09 +0000 Subject: [PATCH 08/33] update examples --- bigframes/core/compile/scalar_op_compiler.py | 28 ++++++++----------- tests/system/small/test_pandas.py | 1 - .../pandas/core/tools/datetimes.py | 20 +++++++++++++ 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 5efdb9be31..fb7d7402ee 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -666,23 +666,19 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): - if x.type() == ibis_dtypes.str: - if op.format is None: - return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) - return x.to_timestamp(op.format).cast( - ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None) - ) - - if x.type() in (ibis_dtypes.timestamp, ibis_dtypes.Timestamp(timezone="UTC")): - return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) - unit = op.unit if op.unit is not None else "ns" - x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(unit, 1e-3) - x_converted = x_converted.cast(ibis_dtypes.int64) - x_datetime = x_converted.to_timestamp(unit="us") - if op.utc: - x_datetime = x_datetime.cast(ibis_dtypes.Timestamp(timezone="UTC")) - return x_datetime + if x.type() == ibis_dtypes.str: + x = x.to_timestamp(op.format) if op.format else x + elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): + return x + elif x.type() != ibis_dtypes.timestamp: + unit = op.unit if op.unit is not None else "ns" + x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(unit, 1e-3) + x_converted = x_converted.cast(ibis_dtypes.int64) + x = x_converted.to_timestamp(unit="us") + x.execute() + + return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 1590200c56..50affcd903 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -20,7 +20,6 @@ import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal - @pytest.mark.parametrize( ("ordered"), [ diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 9408b98f58..96c7f3a7dc 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -18,6 +18,26 @@ def to_datetime( with timestamps that have a finer granularity than microseconds, be aware that the additional precision will not be represented in BigQuery. + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> scalar = 123456.789 + >>> bpd.to_datetime(scalar, unit = 's') + Timestamp('1970-01-02 10:17:36.789000') + + >>> list_str = ["01-31-2021 14:30", "02-28-2021 15:45"] + >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M") + 0 2021-01-31 14:30:00 + 1 2021-02-28 15:45:00 + Name: 0, dtype: timestamp[us][pyarrow] + + >>> series_str = bpd.Series(["01-31-2021 14:30", "02-28-2021 15:45"]) + >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M", utc=True) + 0 2021-01-31 14:30:00+00:00 + 1 2021-02-28 15:45:00+00:00 + Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] + Args: arg (int, float, str, datetime, list, tuple, 1-d array, Series): The object to convert to a datetime. If a DataFrame is provided, the method From 637ca212380e8235b4e09ee794c4c9e0540fe9d9 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 7 Feb 2024 01:43:07 +0000 Subject: [PATCH 09/33] update format --- tests/system/small/test_pandas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 50affcd903..1590200c56 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -20,6 +20,7 @@ import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal + @pytest.mark.parametrize( ("ordered"), [ From 23fbf15b0f1386081e14ebc1613478fc48f9f898 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 7 Feb 2024 23:32:15 +0000 Subject: [PATCH 10/33] update code examples, and working logic. --- bigframes/core/compile/scalar_op_compiler.py | 83 +++++++++++++++++-- bigframes/core/tools/datetimes.py | 6 ++ tests/system/small/test_pandas.py | 15 +++- .../pandas/core/tools/datetimes.py | 16 ++-- 4 files changed, 101 insertions(+), 19 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index fb7d7402ee..8e341f3960 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -41,13 +41,19 @@ _FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) UNIT_TO_US_CONVERSION_FACTORS = { - "D": 24 * 60 * 60 * 1e6, - "s": 1e6, - "ms": 1e3, + "D": 24 * 60 * 60 * 1000 * 1000, + "h": 60 * 60 * 1000 * 1000, + "m": 60 * 1000 * 1000, + "s": 1000 * 1000, + "ms": 1000, "us": 1, "ns": 1e-3, } +TIMEZONE_POS_REGEX = r"[\+]\d{2}:\d{2}$" +TIMEZONE_NEG_REGEX = r"[\-]\d{2}:\d{2}$" +UTC_REGEX = r"[Zz]$" + class ScalarOpCompiler: # Mapping of operation name to implemenations @@ -668,15 +674,80 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() == ibis_dtypes.str: - x = x.to_timestamp(op.format) if op.format else x + # This is not a exact match of Pandas behavior, but this ensures + # UTC str to be properly handled. + x = ( + ibis.case() + .when( + x.re_search(TIMEZONE_POS_REGEX), + ( + ibis.timestamp( + ( + x.substr(0, x.length() - 6).to_timestamp(op.format) + if op.format + else x.substr(0, x.length() - 6) + ) + ) + .cast(ibis_dtypes.Timestamp(timezone="UTC")) + .cast(ibis_dtypes.int64) + - x.substr(x.length() - 5, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["h"] + - x.substr(x.length() - 2, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["m"] + ) + .to_timestamp(unit="us") + .cast(ibis_dtypes.Timestamp(timezone="UTC")), + ) + .when( + x.re_search(TIMEZONE_NEG_REGEX), + ( + ibis.timestamp( + ( + x.substr(0, x.length() - 6).to_timestamp(op.format) + if op.format + else x.substr(0, x.length() - 6) + ) + ) + .cast(ibis_dtypes.Timestamp(timezone="UTC")) + .cast(ibis_dtypes.int64) + + x.substr(x.length() - 5, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["h"] + + x.substr(x.length() - 2, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["m"] + ) + .to_timestamp(unit="us") + .cast(ibis_dtypes.Timestamp(timezone="UTC")), + ) + .when( + x.re_search(UTC_REGEX), + ( + x.substr(0, x.length() - 1).to_timestamp(op.format) + if op.format + else x.substr(0, x.length() - 1) + ).cast(ibis_dtypes.Timestamp(timezone="UTC")), + ) + .else_( + (x.to_timestamp(op.format) if op.format else x).cast( + ibis_dtypes.Timestamp(timezone="UTC") + ) + ) + .end() + ) + elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): return x elif x.type() != ibis_dtypes.timestamp: unit = op.unit if op.unit is not None else "ns" x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(unit, 1e-3) x_converted = x_converted.cast(ibis_dtypes.int64) - x = x_converted.to_timestamp(unit="us") - x.execute() + # Note: Due to an issue where casting directly to a non-UTC + # timezone does not work, we first cast to UTC. This seems + # to bypass a potential bug in Ibis's cast function, allowing + # for subsequent casting to a non-UTC timezone. Further + # investigation is needed to confirm this behavior. + x = x_converted.to_timestamp(unit="us").cast( + ibis_dtypes.Timestamp(timezone="UTC") + ) return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 6637807b6b..56edce7d1e 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -68,6 +68,12 @@ def to_datetime( raise ValueError("Input must be 1-dimensional.") arg = arg[arg.columns[0]] + + if not utc and arg.dtype not in ("Int64", "Float64"): + raise NotImplementedError( + f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}" + ) + return arg._apply_unary_op( # type: ignore ops.ToDatetimeOp( utc=utc, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 1590200c56..4fc9cdf023 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,6 +16,7 @@ import pandas as pd import pytest +import pytz import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal @@ -502,10 +503,15 @@ def test_to_datetime_scalar(arg, utc, unit, format): [ ([173872738], False, None, None), ([32787983.23], True, "s", None), - ([datetime(2023, 1, 1, 12, 0)], False, None, None), - (["2023-01-01 12:00"], False, None, "%Y-%m-%d %H:%M"), - (["2023-01-01"], True, None, None), - (["01-31-2023 14:00", "02-01-2023 15:00"], False, None, "%m-%d-%Y %H:%M"), + ( + [datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("America/New_York"))], + True, + None, + None, + ), + (["2023-01-01"], True, None, "%Y-%m-%d"), + (["2023-02-01T15:00:00+07:22"], True, None, None), + (["01-31-2023 14:00", "02-01-2023 15:00"], True, None, "%m-%d-%Y %H:%M"), ], ) def test_to_datetime_iterable(arg, utc, unit, format): @@ -514,6 +520,7 @@ def test_to_datetime_iterable(arg, utc, unit, format): .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) + print(bf_result) pd_result = pd.Series( pd.to_datetime(arg, utc=utc, unit=unit, format=format) ).dt.floor("us") diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 96c7f3a7dc..2de082adb5 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -27,22 +27,20 @@ def to_datetime( Timestamp('1970-01-02 10:17:36.789000') >>> list_str = ["01-31-2021 14:30", "02-28-2021 15:45"] - >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M") - 0 2021-01-31 14:30:00 - 1 2021-02-28 15:45:00 - Name: 0, dtype: timestamp[us][pyarrow] + >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M", utc=True) + 0 2021-01-31 14:30:00+00:00 + 1 2021-02-28 15:45:00+00:00 + Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] - >>> series_str = bpd.Series(["01-31-2021 14:30", "02-28-2021 15:45"]) + >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45z"]) >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M", utc=True) - 0 2021-01-31 14:30:00+00:00 + 0 2021-01-31 06:30:00+00:00 1 2021-02-28 15:45:00+00:00 Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] Args: arg (int, float, str, datetime, list, tuple, 1-d array, Series): - The object to convert to a datetime. If a DataFrame is provided, the method - expects minimally the following columns: "year", "month", "day". The column - “year” must be specified in 4-digit format. + The object to convert to a datetime. utc (bool, default False): Control timezone-related parsing, localization and conversion. If True, the From c6d254d952ebca8dc02494d91e674d5bfa10c84e Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 7 Feb 2024 23:36:26 +0000 Subject: [PATCH 11/33] docstring update. --- third_party/bigframes_vendored/pandas/core/tools/datetimes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 2de082adb5..5e4873ae4b 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py -from typing import Literal +from bigframes import constants def to_datetime( @@ -57,3 +57,5 @@ def to_datetime( Returns: Timestamp, datetime.datetime or bigframes.series.Series: Return type depends on input. """ + + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 0692c79ca421a0821cb1fd20458006cc58c918c8 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 7 Feb 2024 23:46:05 +0000 Subject: [PATCH 12/33] type update. --- bigframes/core/tools/datetimes.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 56edce7d1e..b28f44ad6c 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -63,18 +63,17 @@ def to_datetime( arg = global_session.with_default_session( bigframes.session.Session.read_pandas, pd.DataFrame(arg) ) - if len(arg.columns) != 1: raise ValueError("Input must be 1-dimensional.") - + arg = arg[arg.columns[0]] - if not utc and arg.dtype not in ("Int64", "Float64"): + if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore raise NotImplementedError( f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}" ) - return arg._apply_unary_op( # type: ignore + return arg._apply_unary_op( # type: ignore ops.ToDatetimeOp( utc=utc, format=format, From f436149741489e38f8235b7d1971b07fb881fedb Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 7 Feb 2024 23:48:06 +0000 Subject: [PATCH 13/33] format update. --- bigframes/core/tools/datetimes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index b28f44ad6c..9157a93cb7 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -65,15 +65,15 @@ def to_datetime( ) if len(arg.columns) != 1: raise ValueError("Input must be 1-dimensional.") - + arg = arg[arg.columns[0]] - if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore + if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore raise NotImplementedError( f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}" ) - return arg._apply_unary_op( # type: ignore + return arg._apply_unary_op( # type: ignore ops.ToDatetimeOp( utc=utc, format=format, From 87d1749f54b6b678f720e7f2654ecc32015819f2 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 7 Feb 2024 23:54:51 +0000 Subject: [PATCH 14/33] Update docstring format --- bigframes/core/tools/datetimes.py | 10 +++----- bigframes/pandas/__init__.py | 10 ++++++-- .../pandas/core/tools/datetimes.py | 25 +++++++++++++------ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 9157a93cb7..a2af2259c4 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -23,15 +23,13 @@ import bigframes.dataframe import bigframes.operations as ops import bigframes.series - -local_scalars = Union[int, float, str, datetime] -local_iterables = Union[Iterable, pd.Series, pd.DataFrame, Mapping] +import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes def to_datetime( arg: Union[ - local_scalars, - local_iterables, + vendored_pandas_datetimes.local_scalars, + vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, ], @@ -39,7 +37,7 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -): +) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime)): return pd.to_datetime( arg, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 25978d7be6..5320e84e21 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -17,6 +17,7 @@ from __future__ import annotations from collections import namedtuple +from datetime import datetime import inspect import sys import typing @@ -639,12 +640,17 @@ def read_gbq_function(function_name: str): def to_datetime( - arg, + arg: Union[ + vendored_pandas_datetimes.local_scalars, + vendored_pandas_datetimes.local_iterables, + bigframes.series.Series, + bigframes.dataframe.DataFrame, + ], *, utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -): +) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]: return bigframes.core.tools.to_datetime( arg, utc=utc, diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 5e4873ae4b..cf97042013 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -1,6 +1,14 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py -from bigframes import constants +from datetime import datetime +from typing import Iterable, Mapping, Union + +import pandas as pd + +from bigframes import constants, series + +local_scalars = Union[int, float, str, datetime] +local_iterables = Union[Iterable, pd.Series, pd.DataFrame, Mapping] def to_datetime( @@ -9,9 +17,9 @@ def to_datetime( utc=False, format=None, unit=None, -): +) -> Union[pd.Timestamp, datetime, series.Series]: """ - This function converts a scalar, array-like or Series to a pandas datetime object. + This function converts a scalar, array-like or Series to a datetime object. .. note:: BigQuery only supports precision up to microseconds (us). Therefore, when working @@ -19,19 +27,26 @@ def to_datetime( the additional precision will not be represented in BigQuery. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None + Converting a Scalar to datetime: + >>> scalar = 123456.789 >>> bpd.to_datetime(scalar, unit = 's') Timestamp('1970-01-02 10:17:36.789000') + Converting a List of Strings without Timezone Information: + >>> list_str = ["01-31-2021 14:30", "02-28-2021 15:45"] >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M", utc=True) 0 2021-01-31 14:30:00+00:00 1 2021-02-28 15:45:00+00:00 Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] + Converting a Series of Strings with Timezone Information: + >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45z"]) >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M", utc=True) 0 2021-01-31 06:30:00+00:00 @@ -41,15 +56,12 @@ def to_datetime( Args: arg (int, float, str, datetime, list, tuple, 1-d array, Series): The object to convert to a datetime. - utc (bool, default False): Control timezone-related parsing, localization and conversion. If True, the function always returns a timezone-aware UTC-localized timestamp or series. If False (default), inputs will not be coerced to UTC. - format (str, default None): The strftime to parse time, e.g. "%d/%m/%Y". - unit (str, default 'ns'): The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. @@ -57,5 +69,4 @@ def to_datetime( Returns: Timestamp, datetime.datetime or bigframes.series.Series: Return type depends on input. """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From b180fe37adb85c9ba12d2b6cbbf88bfef0f5060f Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 8 Feb 2024 01:11:03 +0000 Subject: [PATCH 15/33] remove import --- bigframes/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index a2af2259c4..123f8eb627 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -14,7 +14,7 @@ from collections.abc import Mapping from datetime import datetime -from typing import Iterable, Optional, Union +from typing import Optional, Union import pandas as pd From 3f0f7db49843b42df93b92a4f0430f31657cb5a7 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 8 Feb 2024 01:32:40 +0000 Subject: [PATCH 16/33] remove empty line --- bigframes/core/compile/scalar_op_compiler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 8e341f3960..e7182348f4 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -672,7 +672,6 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): - if x.type() == ibis_dtypes.str: # This is not a exact match of Pandas behavior, but this ensures # UTC str to be properly handled. @@ -733,7 +732,6 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): ) .end() ) - elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): return x elif x.type() != ibis_dtypes.timestamp: From dc6cfcd50690f2a10575e5563d1db6acdec43f86 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 8 Feb 2024 18:04:54 +0000 Subject: [PATCH 17/33] Remove extra code --- bigframes/core/compile/scalar_op_compiler.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index e7182348f4..72f5599ef2 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -680,12 +680,10 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): .when( x.re_search(TIMEZONE_POS_REGEX), ( - ibis.timestamp( - ( - x.substr(0, x.length() - 6).to_timestamp(op.format) - if op.format - else x.substr(0, x.length() - 6) - ) + ( + x.substr(0, x.length() - 6).to_timestamp(op.format) + if op.format + else x.substr(0, x.length() - 6) ) .cast(ibis_dtypes.Timestamp(timezone="UTC")) .cast(ibis_dtypes.int64) @@ -700,12 +698,10 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): .when( x.re_search(TIMEZONE_NEG_REGEX), ( - ibis.timestamp( - ( - x.substr(0, x.length() - 6).to_timestamp(op.format) - if op.format - else x.substr(0, x.length() - 6) - ) + ( + x.substr(0, x.length() - 6).to_timestamp(op.format) + if op.format + else x.substr(0, x.length() - 6) ) .cast(ibis_dtypes.Timestamp(timezone="UTC")) .cast(ibis_dtypes.int64) From 68ec37ef4c329f618490675b968fdeec7b9fbace Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 8 Feb 2024 18:44:05 +0000 Subject: [PATCH 18/33] remove prints. --- bigframes/core/compile/scalar_op_compiler.py | 4 +++- tests/system/small/test_pandas.py | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 72f5599ef2..b175affeeb 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -732,7 +732,9 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): return x elif x.type() != ibis_dtypes.timestamp: unit = op.unit if op.unit is not None else "ns" - x_converted = x * UNIT_TO_US_CONVERSION_FACTORS.get(unit, 1e-3) + if unit not in UNIT_TO_US_CONVERSION_FACTORS: + raise ValueError(f"Cannot convert input with unit '{unit}'.") + x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit] x_converted = x_converted.cast(ibis_dtypes.int64) # Note: Due to an issue where casting directly to a non-UTC # timezone does not work, we first cast to UTC. This seems diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 4fc9cdf023..9457c08638 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -520,7 +520,6 @@ def test_to_datetime_iterable(arg, utc, unit, format): .to_pandas() .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]") ) - print(bf_result) pd_result = pd.Series( pd.to_datetime(arg, utc=utc, unit=unit, format=format) ).dt.floor("us") @@ -536,8 +535,6 @@ def test_to_datetime_series(scalars_dfs): bpd.to_datetime(scalars_df[col], unit="s").to_pandas().astype("datetime64[s]") ) pd_result = pd.Series(pd.to_datetime(scalars_pandas_df[col], unit="s")) - print(bf_result) - print(pd_result) pd.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) From 8b8d61a903dfe1e9bad0e23de49f9f83ea94bfd1 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 18:39:57 +0000 Subject: [PATCH 19/33] Code logic updates. --- bigframes/core/compile/scalar_op_compiler.py | 83 +++++++++++--------- bigframes/core/tools/datetimes.py | 4 +- 2 files changed, 49 insertions(+), 38 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index b175affeeb..bb13afe08d 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -40,6 +40,7 @@ # ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. _FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) +# Datetime constants UNIT_TO_US_CONVERSION_FACTORS = { "D": 24 * 60 * 60 * 1000 * 1000, "h": 60 * 60 * 1000 * 1000, @@ -54,6 +55,14 @@ TIMEZONE_NEG_REGEX = r"[\-]\d{2}:\d{2}$" UTC_REGEX = r"[Zz]$" +# The length of the timezone offset in a datetime string is 6 characters, +# accounting for the "+" or "-" sign and the "HH:MM" format of the offset. +TIMEZONE_OFFSET_LENGTH = 6 + +# The UTC indicator in a datetime string is represented by a single character, +# either "Z" or "z", standing for Zulu time, which is another notation for UTC. +UTC_INDICATOR_LENGTH = 1 + class ScalarOpCompiler: # Mapping of operation name to implemenations @@ -673,6 +682,8 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() == ibis_dtypes.str: + # Ibis lacks built-in support for timezone-aware datetime strings. + # So we manually process timezone data with case expressions. # This is not a exact match of Pandas behavior, but this ensures # UTC str to be properly handled. x = ( @@ -680,46 +691,24 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): .when( x.re_search(TIMEZONE_POS_REGEX), ( - ( - x.substr(0, x.length() - 6).to_timestamp(op.format) - if op.format - else x.substr(0, x.length() - 6) + _extract_datetime(x, op, TIMEZONE_OFFSET_LENGTH).cast( + ibis_dtypes.int64 ) - .cast(ibis_dtypes.Timestamp(timezone="UTC")) - .cast(ibis_dtypes.int64) - - x.substr(x.length() - 5, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["h"] - - x.substr(x.length() - 2, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["m"] - ) - .to_timestamp(unit="us") - .cast(ibis_dtypes.Timestamp(timezone="UTC")), + - _extract_timezone_as_us(x) + ).to_timestamp(unit="us"), ) .when( x.re_search(TIMEZONE_NEG_REGEX), ( - ( - x.substr(0, x.length() - 6).to_timestamp(op.format) - if op.format - else x.substr(0, x.length() - 6) + _extract_datetime(x, op, TIMEZONE_OFFSET_LENGTH).cast( + ibis_dtypes.int64 ) - .cast(ibis_dtypes.Timestamp(timezone="UTC")) - .cast(ibis_dtypes.int64) - + x.substr(x.length() - 5, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["h"] - + x.substr(x.length() - 2, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["m"] - ) - .to_timestamp(unit="us") - .cast(ibis_dtypes.Timestamp(timezone="UTC")), + + _extract_timezone_as_us(x) + ).to_timestamp(unit="us"), ) .when( x.re_search(UTC_REGEX), - ( - x.substr(0, x.length() - 1).to_timestamp(op.format) - if op.format - else x.substr(0, x.length() - 1) - ).cast(ibis_dtypes.Timestamp(timezone="UTC")), + _extract_datetime(x, op, UTC_INDICATOR_LENGTH), ) .else_( (x.to_timestamp(op.format) if op.format else x).cast( @@ -731,16 +720,19 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): return x elif x.type() != ibis_dtypes.timestamp: + # The default unit is set to "ns" (nanoseconds) for consistency + # with pandas, where "ns" is the default unit for datetime operations. unit = op.unit if op.unit is not None else "ns" if unit not in UNIT_TO_US_CONVERSION_FACTORS: raise ValueError(f"Cannot convert input with unit '{unit}'.") x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit] x_converted = x_converted.cast(ibis_dtypes.int64) - # Note: Due to an issue where casting directly to a non-UTC - # timezone does not work, we first cast to UTC. This seems - # to bypass a potential bug in Ibis's cast function, allowing - # for subsequent casting to a non-UTC timezone. Further - # investigation is needed to confirm this behavior. + # Note: Due to an issue where casting directly to a timestamp + # without a timezone does not work, we first cast to UTC. This + # approach appears to bypass a potential bug in Ibis's cast function, + # allowing for subsequent casting to a timestamp type without timezone + # information. Further investigation is needed to confirm this behavior. + x = x_converted.to_timestamp(unit="us").cast( ibis_dtypes.Timestamp(timezone="UTC") ) @@ -748,6 +740,25 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) +def _extract_datetime(x: ibis_types.Value, op: ops.ToDatetimeOp, tz_offset_len: int): + x_datetime = ( + x.substr(0, x.length() - tz_offset_len).to_timestamp(op.format) + if op.format + else x.substr(0, x.length() - tz_offset_len) + ).cast(ibis_dtypes.Timestamp(timezone="UTC")) + + return x_datetime + + +def _extract_timezone_as_us(x: ibis_types.Value): + return ( + x.substr(x.length() - 5, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["h"] + + x.substr(x.length() - 2, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["m"] + ) + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): if not hasattr(op.func, "bigframes_remote_function"): diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 123f8eb627..6aed72a6b2 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -52,11 +52,11 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - if ~isinstance(arg, bigframes.series.Series): + if not isinstance(arg, bigframes.series.Series): # TODO: Currently, data upload is performed using pandas DataFrames # combined with the `read_pandas` method due to the BigFrames DataFrame # constructor's limitations in handling various data types. Plan to update - # the upload process to utilize the BigPandas DataFrame constructor directly + # the upload process to utilize the BigFrames DataFrame constructor directly # once it is enhanced for more related datatypes. arg = global_session.with_default_session( bigframes.session.Session.read_pandas, pd.DataFrame(arg) From 5e5842b003fcdad6b0328d3a51093e9feb95f192 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 18:46:36 +0000 Subject: [PATCH 20/33] Add constants. --- bigframes/core/compile/scalar_op_compiler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index bb13afe08d..60740cdd62 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -58,6 +58,8 @@ # The length of the timezone offset in a datetime string is 6 characters, # accounting for the "+" or "-" sign and the "HH:MM" format of the offset. TIMEZONE_OFFSET_LENGTH = 6 +HOURS_TO_END_LENGTH = TIMEZONE_OFFSET_LENGTH - 1 +MINUTES_TO_END_LENGTH = TIMEZONE_OFFSET_LENGTH - 4 # The UTC indicator in a datetime string is represented by a single character, # either "Z" or "z", standing for Zulu time, which is another notation for UTC. @@ -752,9 +754,9 @@ def _extract_datetime(x: ibis_types.Value, op: ops.ToDatetimeOp, tz_offset_len: def _extract_timezone_as_us(x: ibis_types.Value): return ( - x.substr(x.length() - 5, 2).cast(ibis_dtypes.int64) + x.substr(x.length() - HOURS_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) * UNIT_TO_US_CONVERSION_FACTORS["h"] - + x.substr(x.length() - 2, 2).cast(ibis_dtypes.int64) + + x.substr(x.length() - MINUTES_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) * UNIT_TO_US_CONVERSION_FACTORS["m"] ) From d4a71b0c860b7a7f3db850a5daed905d3e567a0c Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 18:50:58 +0000 Subject: [PATCH 21/33] Update comments --- bigframes/core/tools/datetimes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 6aed72a6b2..093fa0a670 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -53,6 +53,8 @@ def to_datetime( ) if not isinstance(arg, bigframes.series.Series): + # This block ensures compatibility with local data formats, including + # iterables and pandas.Series # TODO: Currently, data upload is performed using pandas DataFrames # combined with the `read_pandas` method due to the BigFrames DataFrame # constructor's limitations in handling various data types. Plan to update From e0d1f8c7ed520c4f67a6a50b14bafe10cc25d9dc Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 18:55:27 +0000 Subject: [PATCH 22/33] Move datetime helpers to the end of file. --- bigframes/core/compile/scalar_op_compiler.py | 38 ++++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 60740cdd62..6c20790414 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -742,25 +742,6 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None)) -def _extract_datetime(x: ibis_types.Value, op: ops.ToDatetimeOp, tz_offset_len: int): - x_datetime = ( - x.substr(0, x.length() - tz_offset_len).to_timestamp(op.format) - if op.format - else x.substr(0, x.length() - tz_offset_len) - ).cast(ibis_dtypes.Timestamp(timezone="UTC")) - - return x_datetime - - -def _extract_timezone_as_us(x: ibis_types.Value): - return ( - x.substr(x.length() - HOURS_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["h"] - + x.substr(x.length() - MINUTES_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["m"] - ) - - @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): if not hasattr(op.func, "bigframes_remote_function"): @@ -1246,3 +1227,22 @@ def is_null(value) -> bool: def _ibis_num(number: float): return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) + + +def _extract_datetime(x: ibis_types.Value, op: ops.ToDatetimeOp, tz_offset_len: int): + x_datetime = ( + x.substr(0, x.length() - tz_offset_len).to_timestamp(op.format) + if op.format + else x.substr(0, x.length() - tz_offset_len) + ).cast(ibis_dtypes.Timestamp(timezone="UTC")) + + return x_datetime + + +def _extract_timezone_as_us(x: ibis_types.Value): + return ( + x.substr(x.length() - HOURS_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["h"] + + x.substr(x.length() - MINUTES_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) + * UNIT_TO_US_CONVERSION_FACTORS["m"] + ) From d0db6992de37cf0dd103d37c8d259159ca2ab37b Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 18:56:17 +0000 Subject: [PATCH 23/33] Update helper --- bigframes/core/compile/scalar_op_compiler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 6c20790414..15c99b197a 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1234,9 +1234,8 @@ def _extract_datetime(x: ibis_types.Value, op: ops.ToDatetimeOp, tz_offset_len: x.substr(0, x.length() - tz_offset_len).to_timestamp(op.format) if op.format else x.substr(0, x.length() - tz_offset_len) - ).cast(ibis_dtypes.Timestamp(timezone="UTC")) - - return x_datetime + ) + return x_datetime.cast(ibis_dtypes.Timestamp(timezone="UTC")) def _extract_timezone_as_us(x: ibis_types.Value): From 958ca00f0b468f1b484ef3d0425929a849820525 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 19:01:38 +0000 Subject: [PATCH 24/33] update format --- bigframes/core/compile/scalar_op_compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 15c99b197a..bd66d7bf24 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -724,17 +724,17 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): elif x.type() != ibis_dtypes.timestamp: # The default unit is set to "ns" (nanoseconds) for consistency # with pandas, where "ns" is the default unit for datetime operations. - unit = op.unit if op.unit is not None else "ns" + unit = op.unit or "ns" if unit not in UNIT_TO_US_CONVERSION_FACTORS: raise ValueError(f"Cannot convert input with unit '{unit}'.") x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit] x_converted = x_converted.cast(ibis_dtypes.int64) + # Note: Due to an issue where casting directly to a timestamp # without a timezone does not work, we first cast to UTC. This # approach appears to bypass a potential bug in Ibis's cast function, # allowing for subsequent casting to a timestamp type without timezone # information. Further investigation is needed to confirm this behavior. - x = x_converted.to_timestamp(unit="us").cast( ibis_dtypes.Timestamp(timezone="UTC") ) From 6ef47fb6d23632408ca0d012757e034ea2b96afd Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 20:12:09 +0000 Subject: [PATCH 25/33] String process logic updated. --- bigframes/core/compile/scalar_op_compiler.py | 70 ++------------------ tests/system/small/test_pandas.py | 2 +- 2 files changed, 6 insertions(+), 66 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index bd66d7bf24..c25bc5d490 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -51,20 +51,6 @@ "ns": 1e-3, } -TIMEZONE_POS_REGEX = r"[\+]\d{2}:\d{2}$" -TIMEZONE_NEG_REGEX = r"[\-]\d{2}:\d{2}$" -UTC_REGEX = r"[Zz]$" - -# The length of the timezone offset in a datetime string is 6 characters, -# accounting for the "+" or "-" sign and the "HH:MM" format of the offset. -TIMEZONE_OFFSET_LENGTH = 6 -HOURS_TO_END_LENGTH = TIMEZONE_OFFSET_LENGTH - 1 -MINUTES_TO_END_LENGTH = TIMEZONE_OFFSET_LENGTH - 4 - -# The UTC indicator in a datetime string is represented by a single character, -# either "Z" or "z", standing for Zulu time, which is another notation for UTC. -UTC_INDICATOR_LENGTH = 1 - class ScalarOpCompiler: # Mapping of operation name to implemenations @@ -684,41 +670,8 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() == ibis_dtypes.str: - # Ibis lacks built-in support for timezone-aware datetime strings. - # So we manually process timezone data with case expressions. - # This is not a exact match of Pandas behavior, but this ensures - # UTC str to be properly handled. - x = ( - ibis.case() - .when( - x.re_search(TIMEZONE_POS_REGEX), - ( - _extract_datetime(x, op, TIMEZONE_OFFSET_LENGTH).cast( - ibis_dtypes.int64 - ) - - _extract_timezone_as_us(x) - ).to_timestamp(unit="us"), - ) - .when( - x.re_search(TIMEZONE_NEG_REGEX), - ( - _extract_datetime(x, op, TIMEZONE_OFFSET_LENGTH).cast( - ibis_dtypes.int64 - ) - + _extract_timezone_as_us(x) - ).to_timestamp(unit="us"), - ) - .when( - x.re_search(UTC_REGEX), - _extract_datetime(x, op, UTC_INDICATOR_LENGTH), - ) - .else_( - (x.to_timestamp(op.format) if op.format else x).cast( - ibis_dtypes.Timestamp(timezone="UTC") - ) - ) - .end() - ) + x = x.to_timestamp(op.format) if op.format else timestamp(x) + print(x) elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): return x elif x.type() != ibis_dtypes.timestamp: @@ -1229,19 +1182,6 @@ def _ibis_num(number: float): return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) -def _extract_datetime(x: ibis_types.Value, op: ops.ToDatetimeOp, tz_offset_len: int): - x_datetime = ( - x.substr(0, x.length() - tz_offset_len).to_timestamp(op.format) - if op.format - else x.substr(0, x.length() - tz_offset_len) - ) - return x_datetime.cast(ibis_dtypes.Timestamp(timezone="UTC")) - - -def _extract_timezone_as_us(x: ibis_types.Value): - return ( - x.substr(x.length() - HOURS_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["h"] - + x.substr(x.length() - MINUTES_TO_END_LENGTH, 2).cast(ibis_dtypes.int64) - * UNIT_TO_US_CONVERSION_FACTORS["m"] - ) +@ibis.udf.scalar.builtin +def timestamp(a: str) -> ibis_dtypes.timestamp: + """Compute the Hamming distance between two strings.""" diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 9457c08638..14cb77c6bc 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,7 +16,6 @@ import pandas as pd import pytest -import pytz import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal @@ -511,6 +510,7 @@ def test_to_datetime_scalar(arg, utc, unit, format): ), (["2023-01-01"], True, None, "%Y-%m-%d"), (["2023-02-01T15:00:00+07:22"], True, None, None), + (["01-31-2023 14:30 -0800"], True, None, "%m-%d-%Y %H:%M %z"), (["01-31-2023 14:00", "02-01-2023 15:00"], True, None, "%m-%d-%Y %H:%M"), ], ) From a08ea2e9af500f3858e19a412451ca9b4bc13bba Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 20:15:51 +0000 Subject: [PATCH 26/33] update import --- tests/system/small/test_pandas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 14cb77c6bc..ec61329aa5 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,6 +16,7 @@ import pandas as pd import pytest +import pytz import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal From 6732fd9ebf366beee8e0f7c255e17a6869eca771 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 20:17:10 +0000 Subject: [PATCH 27/33] remove print --- bigframes/core/compile/scalar_op_compiler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index c25bc5d490..927d2af4fe 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -671,7 +671,6 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): if x.type() == ibis_dtypes.str: x = x.to_timestamp(op.format) if op.format else timestamp(x) - print(x) elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"): return x elif x.type() != ibis_dtypes.timestamp: From 7c54aaad50501e30f79e591c9ed7bbf87af85c40 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 20:27:02 +0000 Subject: [PATCH 28/33] update docstring --- third_party/bigframes_vendored/pandas/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index cf97042013..efdd8e399d 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -47,8 +47,8 @@ def to_datetime( Converting a Series of Strings with Timezone Information: - >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45z"]) - >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M", utc=True) + >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45+00:00"]) + >>> bpd.to_datetime(series_str, format=format="%Y-%m-%dT%H:%M:%S%Z", utc=True) 0 2021-01-31 06:30:00+00:00 1 2021-02-28 15:45:00+00:00 Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] From 1b68883de43955ec03e92fc0e5a5efea1a9ee632 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 20:28:47 +0000 Subject: [PATCH 29/33] update docstring --- third_party/bigframes_vendored/pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index efdd8e399d..eebb63e772 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -48,7 +48,7 @@ def to_datetime( Converting a Series of Strings with Timezone Information: >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45+00:00"]) - >>> bpd.to_datetime(series_str, format=format="%Y-%m-%dT%H:%M:%S%Z", utc=True) + >>> bpd.to_datetime(series_str, format="%Y-%m-%dT%H:%M:%S%Z", utc=True) 0 2021-01-31 06:30:00+00:00 1 2021-02-28 15:45:00+00:00 Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] From 7057758eedb56d32b479cc8d21f24a6bfad386ff Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 20:40:10 +0000 Subject: [PATCH 30/33] update docstring --- .../bigframes_vendored/pandas/core/tools/datetimes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index eebb63e772..c88e87738b 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -26,6 +26,10 @@ def to_datetime( with timestamps that have a finer granularity than microseconds, be aware that the additional precision will not be represented in BigQuery. + The format strings for specifying datetime representations in BigQuery and pandas + are not completely identical. Ensure that the format string provided is compatible + with BigQuery. + **Examples:** >>> import bigframes.pandas as bpd @@ -48,7 +52,7 @@ def to_datetime( Converting a Series of Strings with Timezone Information: >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45+00:00"]) - >>> bpd.to_datetime(series_str, format="%Y-%m-%dT%H:%M:%S%Z", utc=True) + >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M%Z", utc=True) 0 2021-01-31 06:30:00+00:00 1 2021-02-28 15:45:00+00:00 Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] From 22abed06be65d140e3d3a5c10973b83a89d7fa7c Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 20:53:25 +0000 Subject: [PATCH 31/33] update note --- third_party/bigframes_vendored/pandas/core/tools/datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index c88e87738b..16049bf010 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -26,6 +26,7 @@ def to_datetime( with timestamps that have a finer granularity than microseconds, be aware that the additional precision will not be represented in BigQuery. + .. note:: The format strings for specifying datetime representations in BigQuery and pandas are not completely identical. Ensure that the format string provided is compatible with BigQuery. From a4e981bf518764e6774a416a4001b2ed795fe441 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 21:16:57 +0000 Subject: [PATCH 32/33] update docstring --- bigframes/core/compile/scalar_op_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 927d2af4fe..3bcdd70581 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1183,4 +1183,4 @@ def _ibis_num(number: float): @ibis.udf.scalar.builtin def timestamp(a: str) -> ibis_dtypes.timestamp: - """Compute the Hamming distance between two strings.""" + """Convert string to timestamp.""" From 24347a27508628ea9a27cb089726741500e2ab4b Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 9 Feb 2024 21:34:25 +0000 Subject: [PATCH 33/33] Update code examples --- third_party/bigframes_vendored/pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 16049bf010..8a8a562bae 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -56,7 +56,7 @@ def to_datetime( >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M%Z", utc=True) 0 2021-01-31 06:30:00+00:00 1 2021-02-28 15:45:00+00:00 - Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] + dtype: timestamp[us, tz=UTC][pyarrow] Args: arg (int, float, str, datetime, list, tuple, 1-d array, Series):