From 3c13aa5bf7752f5dc5a38410d3ded4157b50465b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 17 Sep 2024 19:34:29 +0000 Subject: [PATCH 1/6] feat: Support bool and bytes types in describe(include='all') --- bigframes/dataframe.py | 55 +++++++++++++++------------- bigframes/operations/aggregations.py | 2 +- tests/system/small/test_dataframe.py | 2 +- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 862c8dc2c8..f8267b456c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -32,6 +32,7 @@ Mapping, Optional, Sequence, + Set, Tuple, Union, ) @@ -2316,11 +2317,20 @@ def melt( _NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique") def describe(self, include: None | Literal["all"] = None) -> DataFrame: + + allowed_non_numerical_types = { + bigframes.dtypes.STRING_DTYPE, + bigframes.dtypes.BOOL_DTYPE, + bigframes.dtypes.BYTES_DTYPE, + } + if include is None: numeric_df = self._drop_non_numeric(permissive=False) if len(numeric_df.columns) == 0: # Describe eligible non-numerical columns - result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS) + result = self._filter_columns_by_type(allowed_non_numerical_types).agg( + self._NON_NUMERICAL_DESCRIBE_AGGS + ) else: # Otherwise, only describe numerical columns result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS) @@ -2333,21 +2343,24 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame: self._NUMERICAL_DISCRIBE_AGGS ), ) - string_result = typing.cast( + + non_numeric_result = typing.cast( DataFrame, - self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS), + self._filter_columns_by_type(allowed_non_numerical_types).agg( + self._NON_NUMERICAL_DESCRIBE_AGGS + ), ) if len(numeric_result.columns) == 0: - return string_result - elif len(string_result.columns) == 0: + return non_numeric_result + elif len(non_numeric_result.columns) == 0: return numeric_result else: import bigframes.core.reshape as rs # Use reindex after join to preserve the original column order. return rs.concat( - [numeric_result, string_result], axis=1 + [non_numeric_result, numeric_result], axis=1 )._reindex_columns(self.columns) else: @@ -2548,34 +2561,24 @@ def unstack(self, level: LevelsType = -1): ) return DataFrame(pivot_block) + def _filter_columns_by_type(self, types: Set[bigframes.dtypes.Dtype]) -> DataFrame: + target_cols = [ + col_id + for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) + if dtype in types + ] + return DataFrame(self._block.select_columns(target_cols)) + def _drop_non_numeric(self, permissive=True) -> DataFrame: numerical_types = ( set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) if permissive else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) ) - non_numeric_cols = [ - col_id - for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype not in numerical_types - ] - return DataFrame(self._block.drop_columns(non_numeric_cols)) - - def _drop_non_string(self) -> DataFrame: - string_cols = [ - col_id - for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype == bigframes.dtypes.STRING_DTYPE - ] - return DataFrame(self._block.select_columns(string_cols)) + return self._filter_columns_by_type(numerical_types) def _drop_non_bool(self) -> DataFrame: - non_bool_cols = [ - col_id - for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype not in bigframes.dtypes.BOOL_BIGFRAMES_TYPES - ] - return DataFrame(self._block.drop_columns(non_bool_cols)) + return self._filter_columns_by_type(set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES)) def _raise_on_non_numeric(self, op: str): if not all( diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index f20429e449..d071889ac4 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -568,7 +568,7 @@ def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool: if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: return True - if dtype == dtypes.STRING_DTYPE: + if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE): return isinstance(op, (CountOp, NuniqueOp)) # For all other types, support no aggregation diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b4c81bfbef..9777b1ae33 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2622,7 +2622,7 @@ def test_df_describe(scalars_dfs): def test_df_describe_non_numerical(scalars_dfs, include): scalars_df, scalars_pandas_df = scalars_dfs - non_numerical_columns = ["string_col"] + non_numerical_columns = ["string_col", "bytes_col", "bool_col"] modified_bf = scalars_df[non_numerical_columns] bf_result = modified_bf.describe(include=include).to_pandas() From 6aa7fa5132985773ac715d41c547c8e56292ed9d Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 17 Sep 2024 21:19:46 +0000 Subject: [PATCH 2/6] update aggregation unit tests --- tests/unit/operations/test_aggregations.py | 39 +++++++++------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/tests/unit/operations/test_aggregations.py b/tests/unit/operations/test_aggregations.py index 4cb6934c9d..68ad48ac29 100644 --- a/tests/unit/operations/test_aggregations.py +++ b/tests/unit/operations/test_aggregations.py @@ -55,38 +55,29 @@ first_op, ] ) -_STRING_SUPPORTED_OPS = set([count_op, nunique_op]) @pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) @pytest.mark.parametrize("op", _ALL_OPS) -def test_is_agg_op_supported_numerical_support_all(dtype, op): +def test_is_agg_op_supported_numeric_support_all(dtype, op): assert is_agg_op_supported(dtype, op) is True -@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE]) -@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS) -def test_is_agg_op_supported_string_support_ops(dtype, op): - assert is_agg_op_supported(dtype, op) is True - - -@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE]) -@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS) -def test_is_agg_op_supported_string_not_support_ops(dtype, op): - assert is_agg_op_supported(dtype, op) is False - - @pytest.mark.parametrize( - "dtype", + ("dtype", "supported_ops"), [ - dtypes.BYTES_DTYPE, - dtypes.DATE_DTYPE, - dtypes.TIME_DTYPE, - dtypes.DATETIME_DTYPE, - dtypes.TIMESTAMP_DTYPE, - dtypes.GEO_DTYPE, + (dtypes.STRING_DTYPE, {count_op, nunique_op}), + (dtypes.BYTES_DTYPE, {count_op, nunique_op}), + (dtypes.DATE_DTYPE, set()), + (dtypes.TIME_DTYPE, set()), + (dtypes.DATETIME_DTYPE, set()), + (dtypes.TIMESTAMP_DTYPE, set()), + (dtypes.GEO_DTYPE, set()), ], ) -@pytest.mark.parametrize("op", _ALL_OPS) -def test_is_agg_op_supported_non_numerical_no_support(dtype, op): - assert is_agg_op_supported(dtype, op) is False +def test_is_agg_op_supported_non_numeric(dtype, supported_ops): + for op in supported_ops: + assert is_agg_op_supported(dtype, op) is True + + for op in _ALL_OPS - supported_ops: + assert is_agg_op_supported(dtype, op) is False From 3b7bdfe07ce27763f04cd3966f4ab20c5ccaac04 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 18 Sep 2024 22:30:18 +0000 Subject: [PATCH 3/6] fix typo and remove unnecessary helper --- bigframes/dataframe.py | 28 ++++++++++------------------ tests/system/small/test_dataframe.py | 26 +++++++++++++------------- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f8267b456c..56716afe11 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2304,7 +2304,7 @@ def melt( self._block.melt(id_col_ids, val_col_ids, var_name, value_name) ) - _NUMERICAL_DISCRIBE_AGGS = ( + _NUMERIC_DESCRIBE_AGGS = ( "count", "mean", "std", @@ -2314,7 +2314,7 @@ def melt( "75%", "max", ) - _NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique") + _NON_NUMERIC_DESCRIBE_AGGS = ("count", "nunique") def describe(self, include: None | Literal["all"] = None) -> DataFrame: @@ -2328,26 +2328,26 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame: numeric_df = self._drop_non_numeric(permissive=False) if len(numeric_df.columns) == 0: # Describe eligible non-numerical columns - result = self._filter_columns_by_type(allowed_non_numerical_types).agg( - self._NON_NUMERICAL_DESCRIBE_AGGS + result = self.select_dtypes(include=allowed_non_numerical_types).agg( + self._NON_NUMERIC_DESCRIBE_AGGS ) else: # Otherwise, only describe numerical columns - result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS) + result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS) return typing.cast(DataFrame, result) elif include == "all": numeric_result = typing.cast( DataFrame, self._drop_non_numeric(permissive=False).agg( - self._NUMERICAL_DISCRIBE_AGGS + self._NUMERIC_DESCRIBE_AGGS ), ) non_numeric_result = typing.cast( DataFrame, - self._filter_columns_by_type(allowed_non_numerical_types).agg( - self._NON_NUMERICAL_DESCRIBE_AGGS + self.select_dtypes(include=allowed_non_numerical_types).agg( + self._NON_NUMERIC_DESCRIBE_AGGS ), ) @@ -2561,24 +2561,16 @@ def unstack(self, level: LevelsType = -1): ) return DataFrame(pivot_block) - def _filter_columns_by_type(self, types: Set[bigframes.dtypes.Dtype]) -> DataFrame: - target_cols = [ - col_id - for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype in types - ] - return DataFrame(self._block.select_columns(target_cols)) - def _drop_non_numeric(self, permissive=True) -> DataFrame: numerical_types = ( set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) if permissive else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) ) - return self._filter_columns_by_type(numerical_types) + return self.select_dtypes(include=numerical_types) def _drop_non_bool(self) -> DataFrame: - return self._filter_columns_by_type(set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES)) + return self.select_dtypes(include=bigframes.dtypes.BOOL_BIGFRAMES_TYPES) def _raise_on_non_numeric(self, op: str): if not all( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9777b1ae33..0a637e983f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2619,15 +2619,15 @@ def test_df_describe(scalars_dfs): @skip_legacy_pandas @pytest.mark.parametrize("include", [None, "all"]) -def test_df_describe_non_numerical(scalars_dfs, include): +def test_df_describe_non_numeric(scalars_dfs, include): scalars_df, scalars_pandas_df = scalars_dfs - non_numerical_columns = ["string_col", "bytes_col", "bool_col"] + non_numeric_columns = ["string_col", "bytes_col", "bool_col"] - modified_bf = scalars_df[non_numerical_columns] + modified_bf = scalars_df[non_numeric_columns] bf_result = modified_bf.describe(include=include).to_pandas() - modified_pd_df = scalars_pandas_df[non_numerical_columns] + modified_pd_df = scalars_pandas_df[non_numeric_columns] pd_result = modified_pd_df.describe(include=include) # Reindex results with the specified keys and their order, because @@ -2639,8 +2639,8 @@ def test_df_describe_non_numerical(scalars_dfs, include): ).rename(index={"unique": "nunique"}) pd.testing.assert_frame_equal( - pd_result[non_numerical_columns].astype("Int64"), - bf_result[non_numerical_columns], + pd_result[non_numeric_columns].astype("Int64"), + bf_result[non_numeric_columns], check_index_type=False, ) @@ -2649,12 +2649,12 @@ def test_df_describe_non_numerical(scalars_dfs, include): def test_df_describe_mixed_types_include_all(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - numerical_columns = [ + numeric_columns = [ "int64_col", "float64_col", ] - non_numerical_columns = ["string_col"] - supported_columns = numerical_columns + non_numerical_columns + non_numeric_columns = ["string_col"] + supported_columns = numeric_columns + non_numeric_columns modified_bf = scalars_df[supported_columns] bf_result = modified_bf.describe(include="all").to_pandas() @@ -2678,14 +2678,14 @@ def test_df_describe_mixed_types_include_all(scalars_dfs): ).rename(index={"unique": "nunique"}) pd.testing.assert_frame_equal( - pd_result[numerical_columns].astype("Float64"), - bf_result[numerical_columns], + pd_result[numeric_columns].astype("Float64"), + bf_result[numeric_columns], check_index_type=False, ) pd.testing.assert_frame_equal( - pd_result[non_numerical_columns].astype("Int64"), - bf_result[non_numerical_columns], + pd_result[non_numeric_columns].astype("Int64"), + bf_result[non_numeric_columns], check_index_type=False, ) From abbd5cdb8e5128c728c71e898a43240cde7b492c Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 18 Sep 2024 22:33:35 +0000 Subject: [PATCH 4/6] remove unnecessary dep --- bigframes/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 56716afe11..d396b1e057 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -32,7 +32,6 @@ Mapping, Optional, Sequence, - Set, Tuple, Union, ) From 31c1a7457a42e4228a538cd2707ad29e899d3831 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 18 Sep 2024 22:39:05 +0000 Subject: [PATCH 5/6] fix wording --- bigframes/dataframe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d396b1e057..76fa9e74b2 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2317,7 +2317,7 @@ def melt( def describe(self, include: None | Literal["all"] = None) -> DataFrame: - allowed_non_numerical_types = { + allowed_non_numeric_types = { bigframes.dtypes.STRING_DTYPE, bigframes.dtypes.BOOL_DTYPE, bigframes.dtypes.BYTES_DTYPE, @@ -2326,12 +2326,12 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame: if include is None: numeric_df = self._drop_non_numeric(permissive=False) if len(numeric_df.columns) == 0: - # Describe eligible non-numerical columns - result = self.select_dtypes(include=allowed_non_numerical_types).agg( + # Describe eligible non-numeric columns + result = self.select_dtypes(include=allowed_non_numeric_types).agg( self._NON_NUMERIC_DESCRIBE_AGGS ) else: - # Otherwise, only describe numerical columns + # Otherwise, only describe numeric columns result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS) return typing.cast(DataFrame, result) @@ -2345,7 +2345,7 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame: non_numeric_result = typing.cast( DataFrame, - self.select_dtypes(include=allowed_non_numerical_types).agg( + self.select_dtypes(include=allowed_non_numeric_types).agg( self._NON_NUMERIC_DESCRIBE_AGGS ), ) @@ -2561,12 +2561,12 @@ def unstack(self, level: LevelsType = -1): return DataFrame(pivot_block) def _drop_non_numeric(self, permissive=True) -> DataFrame: - numerical_types = ( + numeric_types = ( set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) if permissive else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) ) - return self.select_dtypes(include=numerical_types) + return self.select_dtypes(include=numeric_types) def _drop_non_bool(self) -> DataFrame: return self.select_dtypes(include=bigframes.dtypes.BOOL_BIGFRAMES_TYPES) From 6edb185308c6d2deb3ff507ca311f5684b4024bc Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 19 Sep 2024 22:52:35 +0000 Subject: [PATCH 6/6] restore logic in drop_non_xxx to fix tests --- bigframes/dataframe.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 76fa9e74b2..817a02f492 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2566,10 +2566,20 @@ def _drop_non_numeric(self, permissive=True) -> DataFrame: if permissive else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) ) - return self.select_dtypes(include=numeric_types) + non_numeric_cols = [ + col_id + for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) + if dtype not in numeric_types + ] + return DataFrame(self._block.drop_columns(non_numeric_cols)) def _drop_non_bool(self) -> DataFrame: - return self.select_dtypes(include=bigframes.dtypes.BOOL_BIGFRAMES_TYPES) + non_bool_cols = [ + col_id + for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) + if dtype not in bigframes.dtypes.BOOL_BIGFRAMES_TYPES + ] + return DataFrame(self._block.drop_columns(non_bool_cols)) def _raise_on_non_numeric(self, op: str): if not all(