Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 26 additions & 22 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2303,7 +2303,7 @@ def melt(
self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
)

_NUMERICAL_DISCRIBE_AGGS = (
_NUMERIC_DESCRIBE_AGGS = (
"count",
"mean",
"std",
Expand All @@ -2313,41 +2313,53 @@ def melt(
"75%",
"max",
)
_NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")
_NON_NUMERIC_DESCRIBE_AGGS = ("count", "nunique")

def describe(self, include: None | Literal["all"] = None) -> DataFrame:

allowed_non_numeric_types = {
bigframes.dtypes.STRING_DTYPE,
bigframes.dtypes.BOOL_DTYPE,
bigframes.dtypes.BYTES_DTYPE,
}

if include is None:
numeric_df = self._drop_non_numeric(permissive=False)
if len(numeric_df.columns) == 0:
# Describe eligible non-numerical columns
result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
# Describe eligible non-numeric columns
result = self.select_dtypes(include=allowed_non_numeric_types).agg(
self._NON_NUMERIC_DESCRIBE_AGGS
)
else:
# Otherwise, only describe numerical columns
result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS)
# Otherwise, only describe numeric columns
result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS)
return typing.cast(DataFrame, result)

elif include == "all":
numeric_result = typing.cast(
DataFrame,
self._drop_non_numeric(permissive=False).agg(
self._NUMERICAL_DISCRIBE_AGGS
self._NUMERIC_DESCRIBE_AGGS
),
)
string_result = typing.cast(

non_numeric_result = typing.cast(
DataFrame,
self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
self.select_dtypes(include=allowed_non_numeric_types).agg(
self._NON_NUMERIC_DESCRIBE_AGGS
),
)

if len(numeric_result.columns) == 0:
return string_result
elif len(string_result.columns) == 0:
return non_numeric_result
elif len(non_numeric_result.columns) == 0:
return numeric_result
else:
import bigframes.core.reshape as rs

# Use reindex after join to preserve the original column order.
return rs.concat(
[numeric_result, string_result], axis=1
[non_numeric_result, numeric_result], axis=1
)._reindex_columns(self.columns)

else:
Expand Down Expand Up @@ -2549,26 +2561,18 @@ def unstack(self, level: LevelsType = -1):
return DataFrame(pivot_block)

def _drop_non_numeric(self, permissive=True) -> DataFrame:
numerical_types = (
numeric_types = (
set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
if permissive
else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
)
non_numeric_cols = [
col_id
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
if dtype not in numerical_types
if dtype not in numeric_types
]
return DataFrame(self._block.drop_columns(non_numeric_cols))

def _drop_non_string(self) -> DataFrame:
string_cols = [
col_id
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
if dtype == bigframes.dtypes.STRING_DTYPE
]
return DataFrame(self._block.select_columns(string_cols))

def _drop_non_bool(self) -> DataFrame:
non_bool_cols = [
col_id
Expand Down
2 changes: 1 addition & 1 deletion bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
return True

if dtype == dtypes.STRING_DTYPE:
if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE):
return isinstance(op, (CountOp, NuniqueOp))

# For all other types, support no aggregation
Expand Down
26 changes: 13 additions & 13 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2619,15 +2619,15 @@ def test_df_describe(scalars_dfs):

@skip_legacy_pandas
@pytest.mark.parametrize("include", [None, "all"])
def test_df_describe_non_numerical(scalars_dfs, include):
def test_df_describe_non_numeric(scalars_dfs, include):
scalars_df, scalars_pandas_df = scalars_dfs

non_numerical_columns = ["string_col"]
non_numeric_columns = ["string_col", "bytes_col", "bool_col"]

modified_bf = scalars_df[non_numerical_columns]
modified_bf = scalars_df[non_numeric_columns]
bf_result = modified_bf.describe(include=include).to_pandas()

modified_pd_df = scalars_pandas_df[non_numerical_columns]
modified_pd_df = scalars_pandas_df[non_numeric_columns]
pd_result = modified_pd_df.describe(include=include)

# Reindex results with the specified keys and their order, because
Expand All @@ -2639,8 +2639,8 @@ def test_df_describe_non_numerical(scalars_dfs, include):
).rename(index={"unique": "nunique"})

pd.testing.assert_frame_equal(
pd_result[non_numerical_columns].astype("Int64"),
bf_result[non_numerical_columns],
pd_result[non_numeric_columns].astype("Int64"),
bf_result[non_numeric_columns],
check_index_type=False,
)

Expand All @@ -2649,12 +2649,12 @@ def test_df_describe_non_numerical(scalars_dfs, include):
def test_df_describe_mixed_types_include_all(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

numerical_columns = [
numeric_columns = [
"int64_col",
"float64_col",
]
non_numerical_columns = ["string_col"]
supported_columns = numerical_columns + non_numerical_columns
non_numeric_columns = ["string_col"]
supported_columns = numeric_columns + non_numeric_columns

modified_bf = scalars_df[supported_columns]
bf_result = modified_bf.describe(include="all").to_pandas()
Expand All @@ -2678,14 +2678,14 @@ def test_df_describe_mixed_types_include_all(scalars_dfs):
).rename(index={"unique": "nunique"})

pd.testing.assert_frame_equal(
pd_result[numerical_columns].astype("Float64"),
bf_result[numerical_columns],
pd_result[numeric_columns].astype("Float64"),
bf_result[numeric_columns],
check_index_type=False,
)

pd.testing.assert_frame_equal(
pd_result[non_numerical_columns].astype("Int64"),
bf_result[non_numerical_columns],
pd_result[non_numeric_columns].astype("Int64"),
bf_result[non_numeric_columns],
check_index_type=False,
)

Expand Down
39 changes: 15 additions & 24 deletions tests/unit/operations/test_aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,38 +55,29 @@
first_op,
]
)
_STRING_SUPPORTED_OPS = set([count_op, nunique_op])


@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_numerical_support_all(dtype, op):
def test_is_agg_op_supported_numeric_support_all(dtype, op):
assert is_agg_op_supported(dtype, op) is True


@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_support_ops(dtype, op):
assert is_agg_op_supported(dtype, op) is True


@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_not_support_ops(dtype, op):
assert is_agg_op_supported(dtype, op) is False


@pytest.mark.parametrize(
"dtype",
("dtype", "supported_ops"),
[
dtypes.BYTES_DTYPE,
dtypes.DATE_DTYPE,
dtypes.TIME_DTYPE,
dtypes.DATETIME_DTYPE,
dtypes.TIMESTAMP_DTYPE,
dtypes.GEO_DTYPE,
(dtypes.STRING_DTYPE, {count_op, nunique_op}),
(dtypes.BYTES_DTYPE, {count_op, nunique_op}),
(dtypes.DATE_DTYPE, set()),
(dtypes.TIME_DTYPE, set()),
(dtypes.DATETIME_DTYPE, set()),
(dtypes.TIMESTAMP_DTYPE, set()),
(dtypes.GEO_DTYPE, set()),
],
)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_non_numerical_no_support(dtype, op):
assert is_agg_op_supported(dtype, op) is False
def test_is_agg_op_supported_non_numeric(dtype, supported_ops):
for op in supported_ops:
assert is_agg_op_supported(dtype, op) is True

for op in _ALL_OPS - supported_ops:
assert is_agg_op_supported(dtype, op) is False