From 08abf0c63e3466709d70c5a15c4c980a887a8638 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 11 Jun 2024 22:20:24 +0000 Subject: [PATCH 1/7] feat: bigframes.bigquery.json_set --- bigframes/bigquery/__init__.py | 56 ++++++++++++++++++- bigframes/core/compile/scalar_op_compiler.py | 9 +++ bigframes/dtypes.py | 5 ++ bigframes/operations/__init__.py | 13 +++++ .../ibis/backends/bigquery/registry.py | 9 +++ .../ibis/expr/operations/json.py | 15 ++++- 6 files changed, 104 insertions(+), 3 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 85a9010a7d..ad008bc301 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -36,6 +36,10 @@ import bigframes.series as series +# Array functions defined from +# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions + + def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. @@ -154,6 +158,56 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) +# JSON functions defined from +# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions + + +def json_set( + series: series.Series, + json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], +) -> series.Series: + """Produces a new JSON value by inserting or replacing values at specified paths. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] + >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) + 0 {"a":100,"b":"hi"} + Name: data, dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): + Pairs of JSON path and the new value to insert/replace. + + Returns: + bigframes.series.Series: A new Series with the transformed JSON data. + + """ + # SQLGlot parser does not support the "create_if_missing => true" syntax, so + # create_if_missing is not currently implemented. + json_path_value_tuples = [] + for json_path_value_pair in json_path_value_pairs: + if len(json_path_value_pair) != 2: + raise ValueError( + "Incorrect format: Expected (, ), but found: " + + f"{json_path_value_pair}" + ) + json_path_value_tuples.append(tuple(json_path_value_pair)) + + return series._apply_unary_op( + ops.JSONSet( + json_path_value_pairs=tuple(json_path_value_tuples), + ) + ) + + def vector_search( base_table: str, column_to_search: str, @@ -301,4 +355,4 @@ def vector_search( df = query._session.read_gbq(sql) df.index.names = index_labels - return df + return df \ No newline at end of file diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 6b8e60434e..a0ec40f200 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -894,6 +894,15 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp): return typing.cast(ibis_types.ArrayValue, x).join(op.delimiter) +# JSON Ops +@scalar_op_compiler.register_unary_op(ops.JSONSet, pass_op=True) +def json_set_op_impl(x: ibis_types.Value, op: ops.JSONSet): + return vendored_ibis_ops.JSONSet( + x, + json_path_value_pairs=op.json_path_value_pairs, + ).to_expr() + + ### Binary Ops def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5de8f896a9..441f1f10b2 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -240,6 +240,11 @@ def is_struct_like(type: ExpressionType) -> bool: ) +def is_json_like(type: ExpressionType) -> bool: + # TODO: Add JSON type support + return type == STRING_DTYPE + + def is_numeric(type: ExpressionType) -> bool: return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index c10b743631..ee4e75e86c 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -602,6 +602,19 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE +## JSON Ops +@dataclasses.dataclass(frozen=True) +class JSONSet(UnaryOp): + name: typing.ClassVar[str] = "json_set" + json_path_value_pairs: typing.Tuple[typing.Tuple[str, typing.Any], ...] + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError("Input type must be an JSON or JSON-formatted string type.") + return input_type + + # Binary Ops fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index ecef2115e5..f2dd0b7602 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -58,6 +58,14 @@ def _array_aggregate(translator, op: vendored_ibis_ops.ArrayAggregate): return f"ARRAY_AGG({arg} IGNORE NULLS {order_by_sql})" +def _json_set(translator, op: vendored_ibis_ops.JSONSet): + arg = translator.translate(op.arg) + json_path_value_pairs_list = [ + translator.translate(item) for pair in op.json_path_value_pairs for item in pair + ] + return f"JSON_SET(PARSE_JSON({arg}), {', '.join(json_path_value_pairs_list)})" + + patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore @@ -67,6 +75,7 @@ def _array_aggregate(translator, op: vendored_ibis_ops.ArrayAggregate): vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore ibis_reductions.Quantile: _quantile, # type:ignore vendored_ibis_ops.ArrayAggregate: _array_aggregate, # type:ignore + vendored_ibis_ops.JSONSet: _json_set, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index 772c2e8ff4..2ea2e2aed7 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -1,9 +1,20 @@ # Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/json.py from __future__ import annotations +import ibis.common.typing as ibis_typing import ibis.expr.datatypes as dt -from ibis.expr.operations.core import Unary +import ibis.expr.operations.core as ibis_ops_core +import ibis.expr.rules as rlz -class ToJsonString(Unary): +class ToJsonString(ibis_ops_core.Unary): dtype = dt.string + + +class JSONSet(ibis_ops_core.Unary): + json_path_value_pairs: ibis_typing.VarTuple[ + ibis_typing.VarTuple[ibis_ops_core.Value[dt.Any]] + ] + + shape = rlz.shape_like("arg") + dtype = rlz.dtype_like("arg") From f9a30256cdae40557037d97e8139500828ce4181 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 12 Jun 2024 22:45:46 +0000 Subject: [PATCH 2/7] add tests --- bigframes/bigquery/__init__.py | 3 +- tests/system/small/bigquery/test_json.py | 80 ++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 tests/system/small/bigquery/test_json.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index ad008bc301..f6686344bd 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -166,7 +166,8 @@ def json_set( series: series.Series, json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], ) -> series.Series: - """Produces a new JSON value by inserting or replacing values at specified paths. + """Produces a new JSON value within a Series by inserting or replacing values at + specified paths. **Examples:** diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py new file mode 100644 index 0000000000..da49f45202 --- /dev/null +++ b/tests/system/small/bigquery/test_json.py @@ -0,0 +1,80 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import pandas as pd + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def _get_series_from_json(json_data): + sql = " UNION ALL ".join( + [f"SELECT JSON '{json.dumps(data)}' AS data" for data in json_data] + ) + return bpd.read_gbq(sql)["data"] + + +def test_json_set(): + init_json = [ + {"a": 1}, + ] + s = _get_series_from_json(init_json) + actual = bbq.json_set(s, json_path_value_pairs=[("$.a", 10)]) + + expected_json = [ + {"a": 10}, + ] + expected = _get_series_from_json(expected_json) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_set_w_nested_json(): + init_json = [ + {"a": {"b": {"c": "tester", "d": []}}}, + ] + s = _get_series_from_json(init_json) + actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b.c", "user")]) + + expected_json = [ + {"a": {"b": {"c": "user", "d": []}}}, + ] + expected = _get_series_from_json(expected_json) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_set_w_ordered_pairs(): + init_json = [ + {"a": {"b": {"c": {}}}}, + ] + s = _get_series_from_json(init_json) + actual = bbq.json_set( + s, json_path_value_pairs=[("$.a.b.e", "user"), ("$.a.b.e", "dev")] + ) + + expected_json = [ + {"a": {"b": {"c": {}, "e": "dev"}}}, + ] + expected = _get_series_from_json(expected_json) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) From ad53e78ae27be39587d55b6d6e6a335058d47e8f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 12 Jun 2024 22:50:10 +0000 Subject: [PATCH 3/7] fix --- bigframes/bigquery/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index f6686344bd..235ad22568 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -356,4 +356,4 @@ def vector_search( df = query._session.read_gbq(sql) df.index.names = index_labels - return df \ No newline at end of file + return df From a60779454e0fb8bf29d939d46b818b36a198a008 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 12 Jun 2024 23:03:08 +0000 Subject: [PATCH 4/7] fixes --- tests/system/small/bigquery/test_json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index da49f45202..17690cacdf 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -62,7 +62,7 @@ def test_json_set_w_nested_json(): def test_json_set_w_ordered_pairs(): - init_json = [ + init_json: object = [ {"a": {"b": {"c": {}}}}, ] s = _get_series_from_json(init_json) @@ -70,7 +70,7 @@ def test_json_set_w_ordered_pairs(): s, json_path_value_pairs=[("$.a.b.e", "user"), ("$.a.b.e", "dev")] ) - expected_json = [ + expected_json: object = [ {"a": {"b": {"c": {}, "e": "dev"}}}, ] expected = _get_series_from_json(expected_json) From b5ffb59e11ccb8628bbf3a53ffc47bc2258dcdb6 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 27 Jun 2024 18:04:08 +0000 Subject: [PATCH 5/7] update to binary operator for series supports --- bigframes/bigquery/__init__.py | 35 +++++---- bigframes/core/compile/scalar_op_compiler.py | 7 +- bigframes/operations/__init__.py | 26 +++---- tests/system/small/bigquery/test_json.py | 76 ++++++++++--------- .../ibis/backends/bigquery/registry.py | 7 +- .../ibis/expr/operations/json.py | 6 +- 6 files changed, 84 insertions(+), 73 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 235ad22568..7595b08237 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -177,8 +177,11 @@ def json_set( >>> bpd.options.display.progress_bar = None >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] - >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) - 0 {"a":100,"b":"hi"} + >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100)]) + 0 {"a":100} + Name: data, dtype: string + >>> bbq.json_set(s, json_path_value_pairs=[("$.b", "hi")]) + 0 {"a":1,"b":"hi"} Name: data, dtype: string Args: @@ -193,19 +196,23 @@ def json_set( """ # SQLGlot parser does not support the "create_if_missing => true" syntax, so # create_if_missing is not currently implemented. - json_path_value_tuples = [] - for json_path_value_pair in json_path_value_pairs: - if len(json_path_value_pair) != 2: - raise ValueError( - "Incorrect format: Expected (, ), but found: " - + f"{json_path_value_pair}" - ) - json_path_value_tuples.append(tuple(json_path_value_pair)) - - return series._apply_unary_op( - ops.JSONSet( - json_path_value_pairs=tuple(json_path_value_tuples), + + # Currently limited to single JSON path/value pairs (binary operations only). + if len(json_path_value_pairs) != 1: + raise ValueError( + "Expected exactly one JSON path and value pair but found " + + f"{len(json_path_value_pairs)} pairs." ) + + if len(json_path_value_pairs[0]) != 2: + raise ValueError( + "Incorrect format: Expected (, ), but found: " + + f"{json_path_value_pairs[0]}" + ) + + json_path, json_value = json_path_value_pairs[0] + return series._apply_binary_op( + json_value, ops.JSONSet(json_path=json_path), alignment="left" ) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a0ec40f200..dbcf38f9ca 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -895,11 +895,12 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp): # JSON Ops -@scalar_op_compiler.register_unary_op(ops.JSONSet, pass_op=True) -def json_set_op_impl(x: ibis_types.Value, op: ops.JSONSet): +@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) +def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): return vendored_ibis_ops.JSONSet( x, - json_path_value_pairs=op.json_path_value_pairs, + json_value=y, + json_path=op.json_path, ).to_expr() diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index ee4e75e86c..b288a82f3e 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -602,19 +602,6 @@ def output_type(self, *input_types): return dtypes.STRING_DTYPE -## JSON Ops -@dataclasses.dataclass(frozen=True) -class JSONSet(UnaryOp): - name: typing.ClassVar[str] = "json_set" - json_path_value_pairs: typing.Tuple[typing.Tuple[str, typing.Any], ...] - - def output_type(self, *input_types): - input_type = input_types[0] - if not dtypes.is_json_like(input_type): - raise TypeError("Input type must be an JSON or JSON-formatted string type.") - return input_type - - # Binary Ops fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE) @@ -720,6 +707,19 @@ def output_type(self, *input_types): strconcat_op = StrConcatOp() +## JSON Ops +@dataclasses.dataclass(frozen=True) +class JSONSet(BinaryOp): + name: typing.ClassVar[str] = "json_set" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError("Input type must be an JSON or JSON-formatted string type.") + return input_type + + # Ternary Ops @dataclasses.dataclass(frozen=True) class WhereOp(TernaryOp): diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 17690cacdf..0d653b55be 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -15,6 +15,7 @@ import json import pandas as pd +import pytest import bigframes.bigquery as bbq import bigframes.pandas as bpd @@ -27,16 +28,17 @@ def _get_series_from_json(json_data): return bpd.read_gbq(sql)["data"] -def test_json_set(): - init_json = [ - {"a": 1}, - ] - s = _get_series_from_json(init_json) - actual = bbq.json_set(s, json_path_value_pairs=[("$.a", 10)]) +@pytest.mark.parametrize( + ("json_path", "expected_json"), + [ + pytest.param("$.a", [{"a": 10}], id="simple"), + pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"), + ], +) +def test_json_set_at_json_path(json_path, expected_json): + s = _get_series_from_json([{"a": {"b": {"c": "tester", "d": []}}}]) + actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) - expected_json = [ - {"a": 10}, - ] expected = _get_series_from_json(expected_json) pd.testing.assert_series_equal( actual.to_pandas(), @@ -44,16 +46,19 @@ def test_json_set(): ) -def test_json_set_w_nested_json(): - init_json = [ - {"a": {"b": {"c": "tester", "d": []}}}, - ] - s = _get_series_from_json(init_json) - actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b.c", "user")]) +@pytest.mark.parametrize( + ("json_value", "expected_json"), + [ + pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"), + pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"), + pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"), + pytest.param([1, 1], [{"a": {"b": 1}}, {"a": {"b": 1}}], id="series"), + ], +) +def test_json_set_at_json_value_type(json_value, expected_json): + s = _get_series_from_json([{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}]) + actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) - expected_json = [ - {"a": {"b": {"c": "user", "d": []}}}, - ] expected = _get_series_from_json(expected_json) pd.testing.assert_series_equal( actual.to_pandas(), @@ -61,20 +66,21 @@ def test_json_set_w_nested_json(): ) -def test_json_set_w_ordered_pairs(): - init_json: object = [ - {"a": {"b": {"c": {}}}}, - ] - s = _get_series_from_json(init_json) - actual = bbq.json_set( - s, json_path_value_pairs=[("$.a.b.e", "user"), ("$.a.b.e", "dev")] - ) - - expected_json: object = [ - {"a": {"b": {"c": {}, "e": "dev"}}}, - ] - expected = _get_series_from_json(expected_json) - pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), - ) +@pytest.mark.parametrize( + ("json_path_value_pairs"), + [ + pytest.param( + [("$.a", 1), ("$.b", 2)], + id="two_pairs", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + [("$.a", 1, 100)], + id="invalid_pair", + marks=pytest.mark.xfail(raises=ValueError), + ), + ], +) +def test_json_set_w_invalid_param(json_path_value_pairs): + s = _get_series_from_json([{"a": {"b": {"c": {}, "e": "dev"}}}]) + bbq.json_set(s, json_path_value_pairs=json_path_value_pairs) diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index f2dd0b7602..863661759a 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -60,10 +60,9 @@ def _array_aggregate(translator, op: vendored_ibis_ops.ArrayAggregate): def _json_set(translator, op: vendored_ibis_ops.JSONSet): arg = translator.translate(op.arg) - json_path_value_pairs_list = [ - translator.translate(item) for pair in op.json_path_value_pairs for item in pair - ] - return f"JSON_SET(PARSE_JSON({arg}), {', '.join(json_path_value_pairs_list)})" + json_value = translator.translate(op.json_value) + json_path = translator.translate(op.json_path) + return f"JSON_SET(PARSE_JSON({arg}), {json_path}, {json_value})" patched_ops = { diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index 2ea2e2aed7..c3eaff206c 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -1,7 +1,6 @@ # Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/json.py from __future__ import annotations -import ibis.common.typing as ibis_typing import ibis.expr.datatypes as dt import ibis.expr.operations.core as ibis_ops_core import ibis.expr.rules as rlz @@ -12,9 +11,8 @@ class ToJsonString(ibis_ops_core.Unary): class JSONSet(ibis_ops_core.Unary): - json_path_value_pairs: ibis_typing.VarTuple[ - ibis_typing.VarTuple[ibis_ops_core.Value[dt.Any]] - ] + json_value: ibis_ops_core.Value[dt.Any] + json_path: ibis_ops_core.Value[dt.String] shape = rlz.shape_like("arg") dtype = rlz.dtype_like("arg") From fdb9c40990256ae378fa6f5488eb29cf7fd0391b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 28 Jun 2024 21:28:56 +0000 Subject: [PATCH 6/7] supports more than two pairs, defines as ibis udf --- bigframes/bigquery/__init__.py | 27 +++++++--------- bigframes/core/compile/scalar_op_compiler.py | 32 ++++++++++++++++--- bigframes/dtypes.py | 6 ++++ bigframes/operations/__init__.py | 19 ++++++++--- .../ibis/backends/bigquery/registry.py | 8 ----- .../ibis/expr/operations/json.py | 9 ------ 6 files changed, 59 insertions(+), 42 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 7595b08237..e89cb8201b 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -197,23 +197,18 @@ def json_set( # SQLGlot parser does not support the "create_if_missing => true" syntax, so # create_if_missing is not currently implemented. - # Currently limited to single JSON path/value pairs (binary operations only). - if len(json_path_value_pairs) != 1: - raise ValueError( - "Expected exactly one JSON path and value pair but found " - + f"{len(json_path_value_pairs)} pairs." - ) - - if len(json_path_value_pairs[0]) != 2: - raise ValueError( - "Incorrect format: Expected (, ), but found: " - + f"{json_path_value_pairs[0]}" + for json_path_value_pair in json_path_value_pairs: + if len(json_path_value_pair) != 2: + raise ValueError( + "Incorrect format: Expected (, ), but found: " + + f"{json_path_value_pair}" + ) + + json_path, json_value = json_path_value_pair + series = series._apply_binary_op( + json_value, ops.JSONSet(json_path=json_path), alignment="left" ) - - json_path, json_value = json_path_value_pairs[0] - return series._apply_binary_op( - json_value, ops.JSONSet(json_path=json_path), alignment="left" - ) + return series def vector_search( diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index dbcf38f9ca..0bc9f2e370 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -897,11 +897,21 @@ def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp): # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): - return vendored_ibis_ops.JSONSet( - x, - json_value=y, - json_path=op.json_path, - ).to_expr() + if x.type().is_json(): + return json_set( + json_obj=x, + json_path=op.json_path, + json_value=y, + ).to_expr() + else: + # Enabling JSON type eliminates the need for less efficient string conversions. + return vendored_ibis_ops.ToJsonString( + json_set( + json_obj=parse_json(x), + json_path=op.json_path, + json_value=y, + ) + ).to_expr() ### Binary Ops @@ -1479,3 +1489,15 @@ def float_floor(a: float) -> float: def float_ceil(a: float) -> float: """Convert string to timestamp.""" return 0 # pragma: NO COVER + + +@ibis.udf.scalar.builtin(name="parse_json") +def parse_json(a: str) -> ibis_dtypes.JSON: + """Converts a JSON-formatted STRING value to a JSON value.""" + + +@ibis.udf.scalar.builtin(name="json_set") +def json_set( + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str, json_value +) -> ibis_dtypes.JSON: + """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 441f1f10b2..160802ded9 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -245,6 +245,12 @@ def is_json_like(type: ExpressionType) -> bool: return type == STRING_DTYPE +def is_json_encoding_type(type: ExpressionType) -> bool: + # Types can be converted into JSON. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_encodings + return type != GEO_DTYPE + + def is_numeric(type: ExpressionType) -> bool: return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index b288a82f3e..145c415ca0 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -714,10 +714,21 @@ class JSONSet(BinaryOp): json_path: str def output_type(self, *input_types): - input_type = input_types[0] - if not dtypes.is_json_like(input_type): - raise TypeError("Input type must be an JSON or JSON-formatted string type.") - return input_type + left_type = input_types[0] + right_type = input_types[1] + if not dtypes.is_json_like(left_type): + raise TypeError( + "Input type must be an valid JSON object or JSON-formatted string type." + + f" Received type: {left_type}" + ) + if not dtypes.is_json_encoding_type(right_type): + raise TypeError( + "The value to be assigned must be a type that can be encoded as JSON." + + f"Received type: {right_type}" + ) + + # After JSON type implementation, ONLY return JSON data. + return left_type # Ternary Ops diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index 863661759a..ecef2115e5 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -58,13 +58,6 @@ def _array_aggregate(translator, op: vendored_ibis_ops.ArrayAggregate): return f"ARRAY_AGG({arg} IGNORE NULLS {order_by_sql})" -def _json_set(translator, op: vendored_ibis_ops.JSONSet): - arg = translator.translate(op.arg) - json_value = translator.translate(op.json_value) - json_path = translator.translate(op.json_path) - return f"JSON_SET(PARSE_JSON({arg}), {json_path}, {json_value})" - - patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore @@ -74,7 +67,6 @@ def _json_set(translator, op: vendored_ibis_ops.JSONSet): vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore ibis_reductions.Quantile: _quantile, # type:ignore vendored_ibis_ops.ArrayAggregate: _array_aggregate, # type:ignore - vendored_ibis_ops.JSONSet: _json_set, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index c3eaff206c..1eb0554137 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -3,16 +3,7 @@ import ibis.expr.datatypes as dt import ibis.expr.operations.core as ibis_ops_core -import ibis.expr.rules as rlz class ToJsonString(ibis_ops_core.Unary): dtype = dt.string - - -class JSONSet(ibis_ops_core.Unary): - json_value: ibis_ops_core.Value[dt.Any] - json_path: ibis_ops_core.Value[dt.String] - - shape = rlz.shape_like("arg") - dtype = rlz.dtype_like("arg") From 537e4b59abbfbcb5f429354fb5d10e4d3e4af6ec Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 28 Jun 2024 22:20:11 +0000 Subject: [PATCH 7/7] add more tests --- bigframes/bigquery/__init__.py | 7 +-- tests/system/small/bigquery/test_json.py | 57 +++++++++++++++++++----- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index e89cb8201b..ec26d14f33 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -177,11 +177,8 @@ def json_set( >>> bpd.options.display.progress_bar = None >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] - >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100)]) - 0 {"a":100} - Name: data, dtype: string - >>> bbq.json_set(s, json_path_value_pairs=[("$.b", "hi")]) - 0 {"a":1,"b":"hi"} + >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) + 0 {"a":100,"b":"hi"} Name: data, dtype: string Args: diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 0d653b55be..ff759b8fda 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -14,6 +14,7 @@ import json +import geopandas as gpd # type: ignore import pandas as pd import pytest @@ -23,9 +24,13 @@ def _get_series_from_json(json_data): sql = " UNION ALL ".join( - [f"SELECT JSON '{json.dumps(data)}' AS data" for data in json_data] + [ + f"SELECT {id} AS id, JSON '{json.dumps(data)}' AS data" + for id, data in enumerate(json_data) + ] ) - return bpd.read_gbq(sql)["data"] + df = bpd.read_gbq(sql).set_index("id").sort_index() + return df["data"] @pytest.mark.parametrize( @@ -52,7 +57,7 @@ def test_json_set_at_json_path(json_path, expected_json): pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"), pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"), pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"), - pytest.param([1, 1], [{"a": {"b": 1}}, {"a": {"b": 1}}], id="series"), + pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"), ], ) def test_json_set_at_json_value_type(json_value, expected_json): @@ -66,21 +71,49 @@ def test_json_set_at_json_value_type(json_value, expected_json): ) +def test_json_set_w_more_pairs(): + s = _get_series_from_json([{"a": 2}, {"b": 5}, {"c": 1}]) + actual = bbq.json_set( + s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])] + ) + expected = _get_series_from_json( + [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}] + ) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + @pytest.mark.parametrize( - ("json_path_value_pairs"), + ("series", "json_path_value_pairs"), [ pytest.param( - [("$.a", 1), ("$.b", 2)], - id="two_pairs", + _get_series_from_json([{"a": 10}]), + [("$.a", 1, 100)], + id="invalid_json_path_value_pairs", marks=pytest.mark.xfail(raises=ValueError), ), pytest.param( - [("$.a", 1, 100)], - id="invalid_pair", - marks=pytest.mark.xfail(raises=ValueError), + _get_series_from_json([{"a": 10}]), + [ + ( + "$.a", + bpd.read_pandas( + gpd.GeoSeries.from_wkt(["POINT (1 2)", "POINT (2 1)"]) + ), + ) + ], + id="invalid_json_value_type", + marks=pytest.mark.xfail(raises=TypeError), + ), + pytest.param( + bpd.Series([1, 2]), + [("$.a", 1)], + id="invalid_series_type", + marks=pytest.mark.xfail(raises=TypeError), ), ], ) -def test_json_set_w_invalid_param(json_path_value_pairs): - s = _get_series_from_json([{"a": {"b": {"c": {}, "e": "dev"}}}]) - bbq.json_set(s, json_path_value_pairs=json_path_value_pairs) +def test_json_set_w_invalid(series, json_path_value_pairs): + bbq.json_set(series, json_path_value_pairs=json_path_value_pairs)