From 9b472544004d36a241b7c5ebf7bceeffdadab5d4 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 18 Nov 2023 01:48:05 +0000 Subject: [PATCH 1/6] "fix: dataframes to_gbq now creates dataset if it doesn't exist --- bigframes/dataframe.py | 12 ++++++++-- tests/system/conftest.py | 18 ++++++++++++++ tests/system/small/test_dataframe.py | 35 ++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1f1275e217..5eb7489ccd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2385,7 +2385,14 @@ def to_gbq( ) if_exists = "replace" - if "." not in destination_table: + table_parts = destination_table.split(".") + default_project = self._block.expr.session.bqclient.project + + if len(table_parts) == 2: + destination_dataset = f"{default_project}.{table_parts[0]}" + elif len(table_parts) == 3: + destination_dataset = f"{table_parts[0]}.{table_parts[1]}" + else: raise ValueError( f"Got invalid value for destination_table {repr(destination_table)}. " "Should be of the form 'datasetId.tableId' or 'projectId.datasetId.tableId'." @@ -2400,11 +2407,12 @@ def to_gbq( f"Valid options include None or one of {dispositions.keys()}." ) + self._session.bqclient.create_dataset(destination_dataset, exists_ok=True) job_config = bigquery.QueryJobConfig( write_disposition=dispositions[if_exists], destination=bigquery.table.TableReference.from_string( destination_table, - default_project=self._block.expr.session.bqclient.project, + default_project=default_project, ), ) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index f9f69c6c8e..64c95fef04 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -1002,3 +1002,21 @@ def floats_bf(session, floats_pd): @pytest.fixture() def floats_product_bf(session, floats_product_pd): return session.read_pandas(floats_product_pd) + + +@pytest.fixture() +def delete_bigquery_dataset(request, bigquery_client: bigquery.Client): + dataset_id = request.param + # Clean up the dataset before and after tests to handle residuals + # from test failures. + if dataset_id: + bigquery_client.delete_dataset( + dataset_id, delete_contents=True, not_found_ok=True + ) + + yield + + if dataset_id: + bigquery_client.delete_dataset( + dataset_id, delete_contents=True, not_found_ok=True + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a0cf25807c..84685f6bff 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3523,3 +3523,38 @@ def test_df_dot_operator_series( bf_result, pd_result, ) + + +@pytest.mark.parametrize( + ("table_id", "dataset_id", "delete_bigquery_dataset"), + [ + (None, None, None), + ( + "scalars_df", + "bigframes-dev.to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", + "to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", + ), + ( + "scalars_df", + "bigframes-dev.to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", + "bigframes-dev.to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", + ), + ], + indirect=["delete_bigquery_dataset"], +) +def test_gbq_write_and_create_dataset( + session, scalars_df_index, table_id, dataset_id, delete_bigquery_dataset +): + # The expected scenario is that if the dataset doesn't exist, + # it will be created. Do not use the same dataset id for other tests. + destination_table = f"{dataset_id}.{table_id}" if table_id else None + + result_table = scalars_df_index.to_gbq(destination_table) + assert ( + result_table == destination_table + if destination_table + else result_table is not None + ) + + loaded_scalars_df_index = session.read_gbq(result_table) + assert not loaded_scalars_df_index.empty From 13a9dee1990c141ae19811f03afab6afd3a5aee0 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 18 Nov 2023 01:48:05 +0000 Subject: [PATCH 2/6] fix: dataframes to_gbq now creates dataset if it doesn't exist --- tests/system/small/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 84685f6bff..9679b93105 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3542,7 +3542,7 @@ def test_df_dot_operator_series( ], indirect=["delete_bigquery_dataset"], ) -def test_gbq_write_and_create_dataset( +def test_to_gbq_and_create_dataset( session, scalars_df_index, table_id, dataset_id, delete_bigquery_dataset ): # The expected scenario is that if the dataset doesn't exist, From 82d47e8bae2a67519b021df4d90503d3889c8eaa Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 18 Nov 2023 01:48:05 +0000 Subject: [PATCH 3/6] fix: dataframes to_gbq now creates dataset if it doesn't exist --- tests/system/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 64c95fef04..fb334cd1bc 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -1007,7 +1007,7 @@ def floats_product_bf(session, floats_product_pd): @pytest.fixture() def delete_bigquery_dataset(request, bigquery_client: bigquery.Client): dataset_id = request.param - # Clean up the dataset before and after tests to handle residuals + # Delete the dataset before and after tests to handle residuals # from test failures. if dataset_id: bigquery_client.delete_dataset( From a4a2e50a01213dc189c397e8b34d6eac7beb02e7 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 21 Nov 2023 19:07:08 +0000 Subject: [PATCH 4/6] update test --- tests/system/conftest.py | 18 ------------------ tests/system/small/test_dataframe.py | 26 +++----------------------- 2 files changed, 3 insertions(+), 41 deletions(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index fb334cd1bc..f9f69c6c8e 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -1002,21 +1002,3 @@ def floats_bf(session, floats_pd): @pytest.fixture() def floats_product_bf(session, floats_product_pd): return session.read_pandas(floats_product_pd) - - -@pytest.fixture() -def delete_bigquery_dataset(request, bigquery_client: bigquery.Client): - dataset_id = request.param - # Delete the dataset before and after tests to handle residuals - # from test failures. - if dataset_id: - bigquery_client.delete_dataset( - dataset_id, delete_contents=True, not_found_ok=True - ) - - yield - - if dataset_id: - bigquery_client.delete_dataset( - dataset_id, delete_contents=True, not_found_ok=True - ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9679b93105..ab95d1ae3a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3525,29 +3525,9 @@ def test_df_dot_operator_series( ) -@pytest.mark.parametrize( - ("table_id", "dataset_id", "delete_bigquery_dataset"), - [ - (None, None, None), - ( - "scalars_df", - "bigframes-dev.to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", - "to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", - ), - ( - "scalars_df", - "bigframes-dev.to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", - "bigframes-dev.to_gbq_dataset_creation_fdjhfjksdhjkfdshjk", - ), - ], - indirect=["delete_bigquery_dataset"], -) -def test_to_gbq_and_create_dataset( - session, scalars_df_index, table_id, dataset_id, delete_bigquery_dataset -): - # The expected scenario is that if the dataset doesn't exist, - # it will be created. Do not use the same dataset id for other tests. - destination_table = f"{dataset_id}.{table_id}" if table_id else None +def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): + dataset_id = dataset_id_not_created + destination_table = f"{dataset_id}.scalars_df" result_table = scalars_df_index.to_gbq(destination_table) assert ( From 75a1810d1cc1340c210c46d59b9e065481dda97b Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 12 Dec 2023 22:21:12 +0000 Subject: [PATCH 5/6] update create dataset method. --- bigframes/dataframe.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1e7ff6015e..eed1a1641e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -34,6 +34,7 @@ Union, ) +import google.api_core.exceptions import google.cloud.bigquery as bigquery import numpy import pandas @@ -2530,7 +2531,11 @@ def to_gbq( f"Valid options include None or one of {dispositions.keys()}." ) - self._session.bqclient.create_dataset(destination_dataset, exists_ok=True) + try: + self._session.bqclient.get_dataset() + except google.api_core.exceptions.NotFound: + self._session.bqclient.create_dataset(destination_dataset, exists_ok=True) + job_config = bigquery.QueryJobConfig( write_disposition=dispositions[if_exists], destination=bigquery.table.TableReference.from_string( From 4eeec451c6958f02c1c6141da367d2054183e1ad Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 12 Dec 2023 23:05:21 +0000 Subject: [PATCH 6/6] fix --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index eed1a1641e..fca345a157 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2532,7 +2532,7 @@ def to_gbq( ) try: - self._session.bqclient.get_dataset() + self._session.bqclient.get_dataset(destination_dataset) except google.api_core.exceptions.NotFound: self._session.bqclient.create_dataset(destination_dataset, exists_ok=True)