diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index cadd8e5145..4a3bb16a39 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -129,12 +129,15 @@ def cut( if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") - if labels is not False: + if labels is not None and labels is not False: raise NotImplementedError( - f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + "The 'labels' parameter must be either False or None. " + "Please provide a valid value for 'labels'." ) - return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) + return x._apply_window_op( + agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec() + ) def qcut( diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 452abf047c..cc2e79c100 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -229,7 +229,7 @@ def skips_nulls(self): class CutOp(WindowOp): - def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): + def __init__(self, bins: typing.Union[int, pd.IntervalIndex], labels=None): if isinstance(bins, int): if not bins > 0: raise ValueError("`bins` should be a positive integer.") @@ -239,6 +239,8 @@ def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): self._bins_int = 0 self._bins = bins + self._labels = labels + def _as_ibis(self, x: ibis_types.Column, window=None): out = ibis.case() @@ -247,12 +249,37 @@ def _as_ibis(self, x: ibis_types.Column, window=None): col_max = _apply_window_if_present(x.max(), window) bin_width = (col_max - col_min) / self._bins - for this_bin in range(self._bins_int - 1): - out = out.when( - x <= (col_min + (this_bin + 1) * bin_width), - dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), - ) - out = out.when(x.notnull(), self._bins - 1) + if self._labels is False: + for this_bin in range(self._bins_int - 1): + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + dtypes.literal_to_ibis_scalar( + this_bin, force_dtype=Int64Dtype() + ), + ) + out = out.when(x.notnull(), self._bins - 1) + else: + interval_struct = None + adj = (col_max - col_min) * 0.001 + for this_bin in range(self._bins_int): + left_edge = ( + col_min + this_bin * bin_width - (0 if this_bin > 0 else adj) + ) + right_edge = col_min + (this_bin + 1) * bin_width + interval_struct = ibis.struct( + { + "left_exclusive": left_edge, + "right_inclusive": right_edge, + } + ) + + if this_bin < self._bins_int - 1: + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + interval_struct, + ) + else: + out = out.when(x.notnull(), interval_struct) else: for interval in self._bins: condition = (x > interval.left) & (x <= interval.right) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a79ddb64cd..0910c0b7e2 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -397,6 +397,30 @@ def test_cut(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +def test_cut_default_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5) + bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas() + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + + @pytest.mark.parametrize( ("bins",), [ @@ -424,7 +448,6 @@ def test_cut_with_interval(scalars_dfs, bins): ], name=pd_result.name, ) - pd_result.index = pd_result.index.astype("Int64") pd.testing.assert_series_equal( bf_result, pd_result_converted, check_index=False, check_dtype=False diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 4835a24dc7..d6af223456 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -85,7 +85,10 @@ def test_method_matches_session(method_name: str): def test_cut_raises_with_labels(): - with pytest.raises(NotImplementedError, match="Only labels=False"): + with pytest.raises( + NotImplementedError, + match="The 'labels' parameter must be either False or None.", + ): mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 55975c3fc1..fbd1d2d052 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -38,6 +38,15 @@ def cut( Cut with an integer (equal-width bins): + >>> bpd.cut(s, bins=4) + 0 {'left_exclusive': -0.01, 'right_inclusive': 2.5} + 1 {'left_exclusive': -0.01, 'right_inclusive': 2.5} + 2 {'left_exclusive': 2.5, 'right_inclusive': 5.0} + 3 {'left_exclusive': 7.5, 'right_inclusive': 10.0} + dtype: struct[pyarrow] + + Cut with an integer (equal-width bins) and labels=False: + >>> bpd.cut(s, bins=4, labels=False) 0 0 1 0 @@ -50,7 +59,7 @@ def cut( >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) - >>> bpd.cut(s, bins=interval_index, labels=False) + >>> bpd.cut(s, bins=interval_index) 0 1 {'left_exclusive': 0, 'right_inclusive': 1} 2 {'left_exclusive': 1, 'right_inclusive': 5} @@ -60,7 +69,7 @@ def cut( Cut with an iterable of tuples: >>> bins_tuples = [(0, 1), (1, 4), (5, 20)] - >>> bpd.cut(s, bins=bins_tuples, labels=False) + >>> bpd.cut(s, bins=bins_tuples) 0 1 {'left_exclusive': 0, 'right_inclusive': 1} 2 @@ -82,9 +91,7 @@ def cut( labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the - bins. This affects the type of the output container (see below). - If True, raises an error. When `ordered=False`, labels must be - provided. + bins. This affects the type of the output container. Returns: Series: A Series representing the respective bin for each value