From 3c13aa5bf7752f5dc5a38410d3ded4157b50465b Mon Sep 17 00:00:00 2001
From: Shenyang Cai <sycai@google.com>
Date: Tue, 17 Sep 2024 19:34:29 +0000
Subject: [PATCH 1/6] feat: Support bool and bytes types in
 describe(include='all')

---
 bigframes/dataframe.py               | 55 +++++++++++++++-------------
 bigframes/operations/aggregations.py |  2 +-
 tests/system/small/test_dataframe.py |  2 +-
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 862c8dc2c8..f8267b456c 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -32,6 +32,7 @@
     Mapping,
     Optional,
     Sequence,
+    Set,
     Tuple,
     Union,
 )
@@ -2316,11 +2317,20 @@ def melt(
     _NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")
 
     def describe(self, include: None | Literal["all"] = None) -> DataFrame:
+
+        allowed_non_numerical_types = {
+            bigframes.dtypes.STRING_DTYPE,
+            bigframes.dtypes.BOOL_DTYPE,
+            bigframes.dtypes.BYTES_DTYPE,
+        }
+
         if include is None:
             numeric_df = self._drop_non_numeric(permissive=False)
             if len(numeric_df.columns) == 0:
                 # Describe eligible non-numerical columns
-                result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
+                result = self._filter_columns_by_type(allowed_non_numerical_types).agg(
+                    self._NON_NUMERICAL_DESCRIBE_AGGS
+                )
             else:
                 # Otherwise, only describe numerical columns
                 result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS)
@@ -2333,21 +2343,24 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
                     self._NUMERICAL_DISCRIBE_AGGS
                 ),
             )
-            string_result = typing.cast(
+
+            non_numeric_result = typing.cast(
                 DataFrame,
-                self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
+                self._filter_columns_by_type(allowed_non_numerical_types).agg(
+                    self._NON_NUMERICAL_DESCRIBE_AGGS
+                ),
             )
 
             if len(numeric_result.columns) == 0:
-                return string_result
-            elif len(string_result.columns) == 0:
+                return non_numeric_result
+            elif len(non_numeric_result.columns) == 0:
                 return numeric_result
             else:
                 import bigframes.core.reshape as rs
 
                 # Use reindex after join to preserve the original column order.
                 return rs.concat(
-                    [numeric_result, string_result], axis=1
+                    [non_numeric_result, numeric_result], axis=1
                 )._reindex_columns(self.columns)
 
         else:
@@ -2548,34 +2561,24 @@ def unstack(self, level: LevelsType = -1):
         )
         return DataFrame(pivot_block)
 
+    def _filter_columns_by_type(self, types: Set[bigframes.dtypes.Dtype]) -> DataFrame:
+        target_cols = [
+            col_id
+            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
+            if dtype in types
+        ]
+        return DataFrame(self._block.select_columns(target_cols))
+
     def _drop_non_numeric(self, permissive=True) -> DataFrame:
         numerical_types = (
             set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
             if permissive
             else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
         )
-        non_numeric_cols = [
-            col_id
-            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
-            if dtype not in numerical_types
-        ]
-        return DataFrame(self._block.drop_columns(non_numeric_cols))
-
-    def _drop_non_string(self) -> DataFrame:
-        string_cols = [
-            col_id
-            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
-            if dtype == bigframes.dtypes.STRING_DTYPE
-        ]
-        return DataFrame(self._block.select_columns(string_cols))
+        return self._filter_columns_by_type(numerical_types)
 
     def _drop_non_bool(self) -> DataFrame:
-        non_bool_cols = [
-            col_id
-            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
-            if dtype not in bigframes.dtypes.BOOL_BIGFRAMES_TYPES
-        ]
-        return DataFrame(self._block.drop_columns(non_bool_cols))
+        return self._filter_columns_by_type(set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES))
 
     def _raise_on_non_numeric(self, op: str):
         if not all(
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
index f20429e449..d071889ac4 100644
--- a/bigframes/operations/aggregations.py
+++ b/bigframes/operations/aggregations.py
@@ -568,7 +568,7 @@ def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
     if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
         return True
 
-    if dtype == dtypes.STRING_DTYPE:
+    if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE):
         return isinstance(op, (CountOp, NuniqueOp))
 
     # For all other types, support no aggregation
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index b4c81bfbef..9777b1ae33 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -2622,7 +2622,7 @@ def test_df_describe(scalars_dfs):
 def test_df_describe_non_numerical(scalars_dfs, include):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    non_numerical_columns = ["string_col"]
+    non_numerical_columns = ["string_col", "bytes_col", "bool_col"]
 
     modified_bf = scalars_df[non_numerical_columns]
     bf_result = modified_bf.describe(include=include).to_pandas()

From 6aa7fa5132985773ac715d41c547c8e56292ed9d Mon Sep 17 00:00:00 2001
From: Shenyang Cai <sycai@google.com>
Date: Tue, 17 Sep 2024 21:19:46 +0000
Subject: [PATCH 2/6] update aggregation unit tests

---
 tests/unit/operations/test_aggregations.py | 39 +++++++++-------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/tests/unit/operations/test_aggregations.py b/tests/unit/operations/test_aggregations.py
index 4cb6934c9d..68ad48ac29 100644
--- a/tests/unit/operations/test_aggregations.py
+++ b/tests/unit/operations/test_aggregations.py
@@ -55,38 +55,29 @@
         first_op,
     ]
 )
-_STRING_SUPPORTED_OPS = set([count_op, nunique_op])
 
 
 @pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
 @pytest.mark.parametrize("op", _ALL_OPS)
-def test_is_agg_op_supported_numerical_support_all(dtype, op):
+def test_is_agg_op_supported_numeric_support_all(dtype, op):
     assert is_agg_op_supported(dtype, op) is True
 
 
-@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
-@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS)
-def test_is_agg_op_supported_string_support_ops(dtype, op):
-    assert is_agg_op_supported(dtype, op) is True
-
-
-@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
-@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS)
-def test_is_agg_op_supported_string_not_support_ops(dtype, op):
-    assert is_agg_op_supported(dtype, op) is False
-
-
 @pytest.mark.parametrize(
-    "dtype",
+    ("dtype", "supported_ops"),
     [
-        dtypes.BYTES_DTYPE,
-        dtypes.DATE_DTYPE,
-        dtypes.TIME_DTYPE,
-        dtypes.DATETIME_DTYPE,
-        dtypes.TIMESTAMP_DTYPE,
-        dtypes.GEO_DTYPE,
+        (dtypes.STRING_DTYPE, {count_op, nunique_op}),
+        (dtypes.BYTES_DTYPE, {count_op, nunique_op}),
+        (dtypes.DATE_DTYPE, set()),
+        (dtypes.TIME_DTYPE, set()),
+        (dtypes.DATETIME_DTYPE, set()),
+        (dtypes.TIMESTAMP_DTYPE, set()),
+        (dtypes.GEO_DTYPE, set()),
     ],
 )
-@pytest.mark.parametrize("op", _ALL_OPS)
-def test_is_agg_op_supported_non_numerical_no_support(dtype, op):
-    assert is_agg_op_supported(dtype, op) is False
+def test_is_agg_op_supported_non_numeric(dtype, supported_ops):
+    for op in supported_ops:
+        assert is_agg_op_supported(dtype, op) is True
+
+    for op in _ALL_OPS - supported_ops:
+        assert is_agg_op_supported(dtype, op) is False

From 3b7bdfe07ce27763f04cd3966f4ab20c5ccaac04 Mon Sep 17 00:00:00 2001
From: Shenyang Cai <sycai@google.com>
Date: Wed, 18 Sep 2024 22:30:18 +0000
Subject: [PATCH 3/6] fix typo and remove unnecessary helper

---
 bigframes/dataframe.py               | 28 ++++++++++------------------
 tests/system/small/test_dataframe.py | 26 +++++++++++++-------------
 2 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index f8267b456c..56716afe11 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2304,7 +2304,7 @@ def melt(
             self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
         )
 
-    _NUMERICAL_DISCRIBE_AGGS = (
+    _NUMERIC_DESCRIBE_AGGS = (
         "count",
         "mean",
         "std",
@@ -2314,7 +2314,7 @@ def melt(
         "75%",
         "max",
     )
-    _NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")
+    _NON_NUMERIC_DESCRIBE_AGGS = ("count", "nunique")
 
     def describe(self, include: None | Literal["all"] = None) -> DataFrame:
 
@@ -2328,26 +2328,26 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
             numeric_df = self._drop_non_numeric(permissive=False)
             if len(numeric_df.columns) == 0:
                 # Describe eligible non-numerical columns
-                result = self._filter_columns_by_type(allowed_non_numerical_types).agg(
-                    self._NON_NUMERICAL_DESCRIBE_AGGS
+                result = self.select_dtypes(include=allowed_non_numerical_types).agg(
+                    self._NON_NUMERIC_DESCRIBE_AGGS
                 )
             else:
                 # Otherwise, only describe numerical columns
-                result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS)
+                result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS)
             return typing.cast(DataFrame, result)
 
         elif include == "all":
             numeric_result = typing.cast(
                 DataFrame,
                 self._drop_non_numeric(permissive=False).agg(
-                    self._NUMERICAL_DISCRIBE_AGGS
+                    self._NUMERIC_DESCRIBE_AGGS
                 ),
             )
 
             non_numeric_result = typing.cast(
                 DataFrame,
-                self._filter_columns_by_type(allowed_non_numerical_types).agg(
-                    self._NON_NUMERICAL_DESCRIBE_AGGS
+                self.select_dtypes(include=allowed_non_numerical_types).agg(
+                    self._NON_NUMERIC_DESCRIBE_AGGS
                 ),
             )
 
@@ -2561,24 +2561,16 @@ def unstack(self, level: LevelsType = -1):
         )
         return DataFrame(pivot_block)
 
-    def _filter_columns_by_type(self, types: Set[bigframes.dtypes.Dtype]) -> DataFrame:
-        target_cols = [
-            col_id
-            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
-            if dtype in types
-        ]
-        return DataFrame(self._block.select_columns(target_cols))
-
     def _drop_non_numeric(self, permissive=True) -> DataFrame:
         numerical_types = (
             set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
             if permissive
             else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
         )
-        return self._filter_columns_by_type(numerical_types)
+        return self.select_dtypes(include=numerical_types)
 
     def _drop_non_bool(self) -> DataFrame:
-        return self._filter_columns_by_type(set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES))
+        return self.select_dtypes(include=bigframes.dtypes.BOOL_BIGFRAMES_TYPES)
 
     def _raise_on_non_numeric(self, op: str):
         if not all(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 9777b1ae33..0a637e983f 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -2619,15 +2619,15 @@ def test_df_describe(scalars_dfs):
 
 @skip_legacy_pandas
 @pytest.mark.parametrize("include", [None, "all"])
-def test_df_describe_non_numerical(scalars_dfs, include):
+def test_df_describe_non_numeric(scalars_dfs, include):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    non_numerical_columns = ["string_col", "bytes_col", "bool_col"]
+    non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
 
-    modified_bf = scalars_df[non_numerical_columns]
+    modified_bf = scalars_df[non_numeric_columns]
     bf_result = modified_bf.describe(include=include).to_pandas()
 
-    modified_pd_df = scalars_pandas_df[non_numerical_columns]
+    modified_pd_df = scalars_pandas_df[non_numeric_columns]
     pd_result = modified_pd_df.describe(include=include)
 
     # Reindex results with the specified keys and their order, because
@@ -2639,8 +2639,8 @@ def test_df_describe_non_numerical(scalars_dfs, include):
     ).rename(index={"unique": "nunique"})
 
     pd.testing.assert_frame_equal(
-        pd_result[non_numerical_columns].astype("Int64"),
-        bf_result[non_numerical_columns],
+        pd_result[non_numeric_columns].astype("Int64"),
+        bf_result[non_numeric_columns],
         check_index_type=False,
     )
 
@@ -2649,12 +2649,12 @@ def test_df_describe_non_numerical(scalars_dfs, include):
 def test_df_describe_mixed_types_include_all(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    numerical_columns = [
+    numeric_columns = [
         "int64_col",
         "float64_col",
     ]
-    non_numerical_columns = ["string_col"]
-    supported_columns = numerical_columns + non_numerical_columns
+    non_numeric_columns = ["string_col"]
+    supported_columns = numeric_columns + non_numeric_columns
 
     modified_bf = scalars_df[supported_columns]
     bf_result = modified_bf.describe(include="all").to_pandas()
@@ -2678,14 +2678,14 @@ def test_df_describe_mixed_types_include_all(scalars_dfs):
     ).rename(index={"unique": "nunique"})
 
     pd.testing.assert_frame_equal(
-        pd_result[numerical_columns].astype("Float64"),
-        bf_result[numerical_columns],
+        pd_result[numeric_columns].astype("Float64"),
+        bf_result[numeric_columns],
         check_index_type=False,
     )
 
     pd.testing.assert_frame_equal(
-        pd_result[non_numerical_columns].astype("Int64"),
-        bf_result[non_numerical_columns],
+        pd_result[non_numeric_columns].astype("Int64"),
+        bf_result[non_numeric_columns],
         check_index_type=False,
     )
 

From abbd5cdb8e5128c728c71e898a43240cde7b492c Mon Sep 17 00:00:00 2001
From: Shenyang Cai <sycai@google.com>
Date: Wed, 18 Sep 2024 22:33:35 +0000
Subject: [PATCH 4/6] remove unnecessary dep

---
 bigframes/dataframe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 56716afe11..d396b1e057 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -32,7 +32,6 @@
     Mapping,
     Optional,
     Sequence,
-    Set,
     Tuple,
     Union,
 )

From 31c1a7457a42e4228a538cd2707ad29e899d3831 Mon Sep 17 00:00:00 2001
From: Shenyang Cai <sycai@google.com>
Date: Wed, 18 Sep 2024 22:39:05 +0000
Subject: [PATCH 5/6] fix wording

---
 bigframes/dataframe.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index d396b1e057..76fa9e74b2 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2317,7 +2317,7 @@ def melt(
 
     def describe(self, include: None | Literal["all"] = None) -> DataFrame:
 
-        allowed_non_numerical_types = {
+        allowed_non_numeric_types = {
             bigframes.dtypes.STRING_DTYPE,
             bigframes.dtypes.BOOL_DTYPE,
             bigframes.dtypes.BYTES_DTYPE,
@@ -2326,12 +2326,12 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
         if include is None:
             numeric_df = self._drop_non_numeric(permissive=False)
             if len(numeric_df.columns) == 0:
-                # Describe eligible non-numerical columns
-                result = self.select_dtypes(include=allowed_non_numerical_types).agg(
+                # Describe eligible non-numeric columns
+                result = self.select_dtypes(include=allowed_non_numeric_types).agg(
                     self._NON_NUMERIC_DESCRIBE_AGGS
                 )
             else:
-                # Otherwise, only describe numerical columns
+                # Otherwise, only describe numeric columns
                 result = numeric_df.agg(self._NUMERIC_DESCRIBE_AGGS)
             return typing.cast(DataFrame, result)
 
@@ -2345,7 +2345,7 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
 
             non_numeric_result = typing.cast(
                 DataFrame,
-                self.select_dtypes(include=allowed_non_numerical_types).agg(
+                self.select_dtypes(include=allowed_non_numeric_types).agg(
                     self._NON_NUMERIC_DESCRIBE_AGGS
                 ),
             )
@@ -2561,12 +2561,12 @@ def unstack(self, level: LevelsType = -1):
         return DataFrame(pivot_block)
 
     def _drop_non_numeric(self, permissive=True) -> DataFrame:
-        numerical_types = (
+        numeric_types = (
             set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
             if permissive
             else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
         )
-        return self.select_dtypes(include=numerical_types)
+        return self.select_dtypes(include=numeric_types)
 
     def _drop_non_bool(self) -> DataFrame:
         return self.select_dtypes(include=bigframes.dtypes.BOOL_BIGFRAMES_TYPES)

From 6edb185308c6d2deb3ff507ca311f5684b4024bc Mon Sep 17 00:00:00 2001
From: Shenyang Cai <sycai@google.com>
Date: Thu, 19 Sep 2024 22:52:35 +0000
Subject: [PATCH 6/6] restore logic in drop_non_xxx to fix tests

---
 bigframes/dataframe.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 76fa9e74b2..817a02f492 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2566,10 +2566,20 @@ def _drop_non_numeric(self, permissive=True) -> DataFrame:
             if permissive
             else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
         )
-        return self.select_dtypes(include=numeric_types)
+        non_numeric_cols = [
+            col_id
+            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
+            if dtype not in numeric_types
+        ]
+        return DataFrame(self._block.drop_columns(non_numeric_cols))
 
     def _drop_non_bool(self) -> DataFrame:
-        return self.select_dtypes(include=bigframes.dtypes.BOOL_BIGFRAMES_TYPES)
+        non_bool_cols = [
+            col_id
+            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
+            if dtype not in bigframes.dtypes.BOOL_BIGFRAMES_TYPES
+        ]
+        return DataFrame(self._block.drop_columns(non_bool_cols))
 
     def _raise_on_non_numeric(self, op: str):
         if not all(