Skip to content
Merged
38 changes: 35 additions & 3 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1939,21 +1939,53 @@ def test_cummax_int(scalars_df_index, scalars_pandas_df_index):
)


@pytest.mark.parametrize(
    "kwargs",
    [
        {},
        {"normalize": True},
        {"ascending": True},
    ],
    ids=[
        "default",
        "normalize",
        "ascending",
    ],
)
def test_value_counts(scalars_dfs, kwargs):
    """Check ``Series.value_counts`` against pandas for each keyword set.

    Args:
        scalars_dfs: Fixture unpacked as ``(scalars_df, scalars_pandas_df)``;
            presumably the same data as a bigframes and a pandas DataFrame
            (fixture is defined outside this view -- confirm).
        kwargs: Keyword arguments forwarded unchanged to ``value_counts`` on
            both objects (default, ``normalize=True`` or ``ascending=True``).
    """
    # The comparison is only meaningful on pandas 2.x and later.
    if pd.__version__.startswith("1."):
        pytest.skip("pandas 1.x produces different column labels.")
    scalars_df, scalars_pandas_df = scalars_dfs
    col_name = "int64_too"

    # Both sides receive exactly the same keyword arguments so the results
    # can be compared directly.
    bf_result = scalars_df[col_name].value_counts(**kwargs).to_pandas()
    pd_result = scalars_pandas_df[col_name].value_counts(**kwargs)

    pd.testing.assert_series_equal(
        bf_result,
        pd_result,
    )


def test_value_counts_with_na(scalars_dfs):
    """Compare ``value_counts(dropna=False)`` on ``int64_col`` with pandas.

    The pandas result is relabelled to the ``count`` / column-name convention
    before comparison, and row order is ignored.
    """
    bf_df, pandas_df = scalars_dfs
    column = "int64_col"

    actual = bf_df[column].value_counts(dropna=False).to_pandas()
    expected = pandas_df[column].value_counts(dropna=False)

    # Older pandas versions may not set these labels; bigframes tries to
    # emulate pandas 2.0+, so align the expected result with that naming.
    expected.name = "count"
    expected.index.name = column

    # bigframes value_counts does not honor the ordering of the original
    # data, so the comparison must not depend on row order.
    assert_series_equal(actual, expected, ignore_order=True)


def test_value_counts_w_cut(scalars_dfs):
if pd.__version__.startswith("1."):
pytest.skip("value_counts results different in pandas 1.x.")
Expand Down
56 changes: 56 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4216,6 +4216,62 @@ def value_counts(
"""
Return a Series containing counts of unique rows in the DataFrame.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> df = bpd.DataFrame({'num_legs': [2, 4, 4, 6, 7],
... 'num_wings': [2, 0, 0, 0, bpd.NA]},
... index=['falcon', 'dog', 'cat', 'ant', 'octopus'],
... dtype='Int64')
>>> df
num_legs num_wings
falcon 2 2
dog 4 0
cat 4 0
ant 6 0
octopus 7 <NA>
<BLANKLINE>
[5 rows x 2 columns]

``value_counts`` sorts the result by counts in descending order by default:

>>> df.value_counts()
num_legs num_wings
4 0 2
2 2 1
6 0 1
Name: count, dtype: Int64

You can normalize the counts to return relative frequencies by setting ``normalize=True``:

>>> df.value_counts(normalize=True)
num_legs num_wings
4 0 0.5
2 2 0.25
6 0 0.25
Name: proportion, dtype: Float64

You can get the rows in ascending order of the counts by setting ``ascending=True``:

>>> df.value_counts(ascending=True)
num_legs num_wings
2 2 1
6 0 1
4 0 2
Name: count, dtype: Int64

You can include the counts of the rows with ``NA`` values by setting ``dropna=False``:

>>> df.value_counts(dropna=False)
num_legs num_wings
4 0 2
2 2 1
6 0 1
7 <NA> 1
Name: count, dtype: Int64

Args:
subset (label or list of labels, optional):
Columns to use when counting unique combinations.
Expand Down
75 changes: 75 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2035,6 +2035,59 @@ def value_counts(
first element is the most frequently-occurring element.
Excludes NA values by default.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> s = bpd.Series([3, 1, 2, 3, 4, bpd.NA], dtype="Int64")

>>> s
0 3
1 1
2 2
3 3
4 4
5 <NA>
dtype: Int64

``value_counts`` sorts the result by counts in descending order by default:

>>> s.value_counts()
3 2
1 1
2 1
4 1
Name: count, dtype: Int64

You can normalize the counts to return relative frequencies by setting ``normalize=True``:

>>> s.value_counts(normalize=True)
3 0.4
1 0.2
2 0.2
4 0.2
Name: proportion, dtype: Float64

You can get the values in ascending order of the counts by setting ``ascending=True``:

>>> s.value_counts(ascending=True)
1 1
2 1
4 1
3 2
Name: count, dtype: Int64

You can include the counts of the ``NA`` values by setting ``dropna=False``:

>>> s.value_counts(dropna=False)
3 2
1 1
2 1
4 1
<NA> 1
Name: count, dtype: Int64

Args:
normalize (bool, default False):
If True then the object returned will contain the relative
Expand Down Expand Up @@ -2167,3 +2220,25 @@ def iat(self):
def at(self):
    """Access a single value for a row/column label pair."""
    # This definition has no implementation of its own: it always raises
    # with the shared abstract-method message.
    message = constants.ABSTRACT_METHOD_ERROR_MESSAGE
    raise NotImplementedError(message)

@property
def values(self):
    """
    Return Series as ndarray or ndarray-like depending on the dtype.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> bpd.Series([1, 2, 3]).values
        array([1, 2, 3], dtype=object)

        >>> bpd.Series(list('aabc')).values
        array(['a', 'a', 'b', 'c'], dtype=object)

    Returns:
        numpy.ndarray or ndarray-like: Values in the Series.

    """
    # This definition has no implementation of its own: it always raises
    # with the shared abstract-method message.
    message = constants.ABSTRACT_METHOD_ERROR_MESSAGE
    raise NotImplementedError(message)