From 156e518c46b5efc7bcfc674c9cccbd2492bcacbe Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Mon, 12 May 2025 20:15:23 +0200 Subject: [PATCH 01/13] chore(deps): update dependency db-dtypes to v1.4.3 (#2178) Co-authored-by: Lingqing Gan --- samples/geography/requirements.txt | 2 +- samples/magics/requirements.txt | 2 +- samples/notebooks/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 2b5a71c8c..3ff1b2944 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -6,7 +6,7 @@ click===8.1.8; python_version == '3.9' click==8.2.0; python_version >= '3.10' click-plugins==1.1.1 cligj==0.7.2 -db-dtypes==1.4.2 +db-dtypes==1.4.3 Fiona==1.10.1 geojson==3.2.0 geopandas==1.0.1 diff --git a/samples/magics/requirements.txt b/samples/magics/requirements.txt index 2c9e158c0..b000aa50c 100644 --- a/samples/magics/requirements.txt +++ b/samples/magics/requirements.txt @@ -1,5 +1,5 @@ bigquery_magics==0.9.0 -db-dtypes==1.4.2 +db-dtypes==1.4.3 google.cloud.bigquery==3.31.0 google-cloud-bigquery-storage==2.31.0 ipython===8.18.1 diff --git a/samples/notebooks/requirements.txt b/samples/notebooks/requirements.txt index d1e2f39fb..d80ffcd09 100644 --- a/samples/notebooks/requirements.txt +++ b/samples/notebooks/requirements.txt @@ -1,5 +1,5 @@ bigquery-magics==0.9.0 -db-dtypes==1.4.2 +db-dtypes==1.4.3 google-cloud-bigquery==3.31.0 google-cloud-bigquery-storage==2.31.0 ipython===8.18.1; python_version == '3.9' From 57f940d957613b4d80fb81ea40a1177b73856189 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Mon, 12 May 2025 13:10:11 -0700 Subject: [PATCH 02/13] feat: add ability to set autodetect_schema query param in update_table (#2171) * Add ability to set autodetect_schema query_param * fixup! Add ability to set autodetect_schema query_param * fixup! Add ability to set autodetect_schema query_param * fixup! Add ability to set autodetect_schema query_param --------- Co-authored-by: Chalmer Lowe --- google/cloud/bigquery/client.py | 11 ++++++++ tests/system/test_client.py | 47 +++++++++++++++++++++++++++++++++ tests/unit/test_client.py | 12 ++++++--- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index e7cafc47e..8ad1586f4 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -1389,6 +1389,7 @@ def update_table( self, table: Table, fields: Sequence[str], + autodetect_schema: bool = False, retry: retries.Retry = DEFAULT_RETRY, timeout: TimeoutType = DEFAULT_TIMEOUT, ) -> Table: @@ -1419,6 +1420,10 @@ def update_table( fields (Sequence[str]): The fields of ``table`` to change, spelled as the :class:`~google.cloud.bigquery.table.Table` properties. + autodetect_schema (bool): + Specifies if the schema of the table should be autodetected when + updating the table from the underlying source. Only applicable + for external tables. retry (Optional[google.api_core.retry.Retry]): A description of how to retry the API call. 
timeout (Optional[float]): @@ -1438,12 +1443,18 @@ def update_table( path = table.path span_attributes = {"path": path, "fields": fields} + if autodetect_schema: + query_params = {"autodetect_schema": True} + else: + query_params = {} + api_response = self._call_api( retry, span_name="BigQuery.updateTable", span_attributes=span_attributes, method="PATCH", path=path, + query_params=query_params, data=partial, headers=headers, timeout=timeout, diff --git a/tests/system/test_client.py b/tests/system/test_client.py index 9df572b14..6584ca03c 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -978,6 +978,53 @@ def test_update_table_constraints(self): ) self.assertIsNone(reference_table3.table_constraints, None) + def test_update_table_autodetect_schema(self): + dataset = self.temp_dataset(_make_dataset_id("bq_update_table_test")) + + # Create an external table, restrict schema to one field + TABLE_NAME = "test_table" + set_schema = [bigquery.SchemaField("username", "STRING", mode="NULLABLE")] + table_arg = Table(dataset.table(TABLE_NAME)) + + # Create an external_config and include it in the table arguments + external_config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.AVRO) + external_config.source_uris = SOURCE_URIS_AVRO + external_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_AVRO + external_config.schema = set_schema + table_arg.external_data_configuration = external_config + + self.assertFalse(_table_exists(table_arg)) + + table = helpers.retry_403(Config.CLIENT.create_table)(table_arg) + self.to_delete.insert(0, table) + self.assertTrue(_table_exists(table)) + + self.assertEqual(table.schema, set_schema) + + # Update table with schema autodetection + updated_table_arg = Table(dataset.table(TABLE_NAME)) + + # Update the external_config and include it in the table arguments + updated_external_config = copy.deepcopy(external_config) + updated_external_config.autodetect = True + updated_external_config.schema = None + updated_table_arg.external_data_configuration = updated_external_config + + # PATCH call with autodetect_schema=True to trigger schema inference + updated_table = Config.CLIENT.update_table( + updated_table_arg, ["external_data_configuration"], autodetect_schema=True + ) + + # The updated table should have a schema inferred from the reference + # file, which has all four fields. 
+ expected_schema = [ + bigquery.SchemaField("username", "STRING", mode="NULLABLE"), + bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"), + bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"), + bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"), + ] + self.assertEqual(updated_table.schema, expected_schema) + @staticmethod def _fetch_single_page(table, selected_fields=None): iterator = Config.CLIENT.list_rows(table, selected_fields=selected_fields) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 34ef680dd..b8140df66 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -2385,7 +2385,7 @@ def test_update_table(self): "resourceTags": {"123456789012/key": "value"}, } conn.api_request.assert_called_once_with( - method="PATCH", data=sent, path="/" + path, timeout=7.5 + method="PATCH", data=sent, path="/" + path, timeout=7.5, query_params={} ) self.assertEqual(updated_table.description, table.description) self.assertEqual(updated_table.friendly_name, table.friendly_name) @@ -2439,6 +2439,7 @@ def test_update_table_w_custom_property(self): path="/%s" % path, data={"newAlphaProperty": "unreleased property"}, timeout=DEFAULT_TIMEOUT, + query_params={}, ) self.assertEqual( updated_table._properties["newAlphaProperty"], "unreleased property" @@ -2475,6 +2476,7 @@ def test_update_table_only_use_legacy_sql(self): path="/%s" % path, data={"view": {"useLegacySql": True}}, timeout=DEFAULT_TIMEOUT, + query_params={}, ) self.assertEqual(updated_table.view_use_legacy_sql, table.view_use_legacy_sql) @@ -2567,9 +2569,10 @@ def test_update_table_w_query(self): "schema": schema_resource, }, timeout=DEFAULT_TIMEOUT, + query_params={}, ) - def test_update_table_w_schema_None(self): + def test_update_table_w_schema_None_autodetect_schema(self): # Simulate deleting schema: not sure if back-end will actually # allow this operation, but the spec says it is optional. 
path = "projects/%s/datasets/%s/tables/%s" % ( @@ -2611,7 +2614,9 @@ def test_update_table_w_schema_None(self): with mock.patch( "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes" ) as final_attributes: - updated_table = client.update_table(table, ["schema"]) + updated_table = client.update_table( + table, ["schema"], autodetect_schema=True + ) final_attributes.assert_called_once_with( {"path": "/%s" % path, "fields": ["schema"]}, client, None @@ -2623,6 +2628,7 @@ def test_update_table_w_schema_None(self): sent = {"schema": {"fields": None}} self.assertEqual(req[1]["data"], sent) self.assertEqual(req[1]["path"], "/%s" % path) + self.assertEqual(req[1]["query_params"], {"autodetect_schema": True}) self.assertEqual(len(updated_table.schema), 0) def test_update_table_delete_property(self): From 2d173a5bd8a29dfebe492bc3a79469da7f0fcfbd Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Tue, 13 May 2025 07:53:27 -0400 Subject: [PATCH 03/13] chore(python): remove docs from templates (#2164) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(python): remove docs from templates Source-Link: https://github.com/googleapis/synthtool/commit/3fca64a4bb1772258f8cc939a9192b17dbbbf335 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:3b3a31be60853477bc39ed8d9bac162cac3ba083724cecaad54eb81d4e4dae9c * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * remove replacement in owlbot.py --------- Co-authored-by: Owl Bot Co-authored-by: Anthonios Partheniou Co-authored-by: Chalmer Lowe --- .github/.OwlBot.lock.yaml | 4 ++-- owlbot.py | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 51b21a62b..cea9eb68f 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. 
docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:a7aef70df5f13313ddc027409fc8f3151422ec2a57ac8730fce8fa75c060d5bb -# created: 2025-04-10T17:00:10.042601326Z + digest: sha256:3b3a31be60853477bc39ed8d9bac162cac3ba083724cecaad54eb81d4e4dae9c +# created: 2025-04-16T22:40:03.123475241Z diff --git a/owlbot.py b/owlbot.py index 8cfa2b097..60759adbe 100644 --- a/owlbot.py +++ b/owlbot.py @@ -109,16 +109,6 @@ python.py_samples() -s.replace( - "docs/conf.py", - r'\{"members": True\}', - '{"members": True, "inherited-members": True}', -) -s.replace( - "docs/conf.py", - r"exclude_patterns = \[", - '\\g<0>\n "google/cloud/bigquery_v2/**", # Legacy proto-based types.', -) s.replace( "samples/**/noxfile.py", 'BLACK_VERSION = "black==22.3.0"', From 02176377d5e2fc25b5cd4f46aa6ebfb1b6a960a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 14 May 2025 04:36:37 -0500 Subject: [PATCH 04/13] fix: ensure AccessEntry equality and repr uses the correct `entity_type` (#2182) * fix: ensure AccessEntry equality and repr uses the correct `entity_type` * add a test for access_entries --- google/cloud/bigquery/dataset.py | 4 +-- tests/unit/test_dataset.py | 44 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/dataset.py b/google/cloud/bigquery/dataset.py index d225b7106..f788275cd 100644 --- a/google/cloud/bigquery/dataset.py +++ b/google/cloud/bigquery/dataset.py @@ -512,7 +512,7 @@ def __ne__(self, other): return not self == other def __repr__(self): - return f"" + return f"" def _key(self): """A tuple key that uniquely describes this field. @@ -531,7 +531,7 @@ def _key(self): properties["condition"] = condition_key prop_tup = tuple(sorted(properties.items())) - return (self.role, self._entity_type, self.entity_id, prop_tup) + return (self.role, self.entity_type, self.entity_id, prop_tup) def __hash__(self): return hash(self._key()) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 941430827..5cce2a9a7 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -613,6 +613,15 @@ def test_equality_and_hash_without_condition(self): assert hash(entry1) == hash(entry2) assert hash(entry1) != hash(entry3) # Usually true + def test_equality_and_hash_from_api_repr(self): + """Compare equal entries where one was created via from_api_repr.""" + entry1 = AccessEntry("OWNER", "specialGroup", "projectOwners") + entry2 = AccessEntry.from_api_repr( + {"role": "OWNER", "specialGroup": "projectOwners"} + ) + assert entry1 == entry2 + assert hash(entry1) == hash(entry2) + def test_equality_and_hash_with_condition(self, condition_1, condition_2): cond1a = Condition( condition_1.expression, condition_1.title, condition_1.description @@ -746,6 +755,13 @@ def test_dataset_property_with_condition(self, condition_1): assert "dataset" in entry._properties assert "condition" in entry._properties + def test_repr_from_api_repr(self): + """Check that repr() includes the correct entity_type when the object is initialized from a dictionary.""" + api_repr = {"role": "OWNER", "userByEmail": "owner@example.com"} + entry = AccessEntry.from_api_repr(api_repr) + entry_str = repr(entry) + assert entry_str == "" + class TestDatasetReference(unittest.TestCase): @staticmethod @@ -1097,6 +1113,34 @@ def test_ctor_explicit(self): self.assertIsNone(dataset.location) self.assertEqual(dataset.is_case_insensitive, False) + def test_access_entries_getter_from_api_repr(self): + 
"""Check that `in` works correctly when Dataset is made via from_api_repr().""" + from google.cloud.bigquery.dataset import AccessEntry + + dataset = self._get_target_class().from_api_repr( + { + "datasetReference": {"projectId": "my-proj", "datasetId": "my_dset"}, + "access": [ + { + "role": "OWNER", + "userByEmail": "uilma@example.com", + }, + { + "role": "READER", + "groupByEmail": "rhubbles@example.com", + }, + ], + } + ) + assert ( + AccessEntry("OWNER", "userByEmail", "uilma@example.com") + in dataset.access_entries + ) + assert ( + AccessEntry("READER", "groupByEmail", "rhubbles@example.com") + in dataset.access_entries + ) + def test_access_entries_setter_non_list(self): dataset = self._make_one(self.DS_REF) with self.assertRaises(TypeError): From ebfd0a83d43bcb96f65f5669437220aa6138b766 Mon Sep 17 00:00:00 2001 From: Chalmer Lowe Date: Wed, 14 May 2025 13:34:26 -0400 Subject: [PATCH 05/13] feat: Add dtype parameters to to_geodataframe functions (#2176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Add dtype parameters to to_geodataframe This change adds support for `bool_dtype`, `int_dtype`, `float_dtype`, and `string_dtype` parameters to the `to_geodataframe` method in `RowIterator` and `QueryJob`. These parameters allow you to specify the desired pandas dtypes for boolean, integer, float, and string columns when converting BigQuery results to GeoDataFrames. The changes include: - Updating `RowIterator.to_geodataframe` to accept and pass these dtype parameters to the underlying `to_dataframe` method. - Updating `QueryJob.to_geodataframe` to accept and pass these dtype parameters to the underlying `RowIterator.to_geodataframe` method. - Adding unit tests to verify the correct handling of these parameters. * updates to several tests re geopandas as well as imports * updates to enum import * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Update pyproject.toml Co-authored-by: Tim Sweña (Swast) * Update testing/constraints-3.9.txt Co-authored-by: Tim Sweña (Swast) --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Owl Bot Co-authored-by: Tim Sweña (Swast) --- docs/conf.py | 3 +- google/cloud/bigquery/job/query.py | 36 ++++++++++ google/cloud/bigquery/table.py | 44 ++++++++++++ noxfile.py | 4 +- pyproject.toml | 8 ++- testing/constraints-3.9.txt | 2 +- tests/unit/job/test_query_pandas.py | 6 ++ tests/unit/test_table.py | 9 ++- tests/unit/test_table_pandas.py | 103 ++++++++++++++++++++++++++++ 9 files changed, 205 insertions(+), 10 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 826298090..df1c18b68 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # autodoc/autosummary flags autoclass_content = "both" -autodoc_default_options = {"members": True, "inherited-members": True} +autodoc_default_options = {"members": True} autosummary_generate = True @@ -109,7 +109,6 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = [ - "google/cloud/bigquery_v2/**", # Legacy proto-based types. 
"_build", "**/.nox/**/*", "samples/AUTHORING_GUIDE.md", diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index f14039bc0..f9b99b7fb 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -2102,6 +2102,10 @@ def to_geodataframe( create_bqstorage_client: bool = True, max_results: Optional[int] = None, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "geopandas.GeoDataFrame": """Return a GeoPandas GeoDataFrame from a QueryJob @@ -2152,6 +2156,34 @@ def to_geodataframe( identifies which one to use to construct a GeoPandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. + bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. 
BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type Returns: geopandas.GeoDataFrame: @@ -2175,6 +2207,10 @@ def to_geodataframe( progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, ) def __iter__(self): diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 503ca4e71..e084468f6 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2727,6 +2727,10 @@ def to_geodataframe( progress_bar_type: Optional[str] = None, create_bqstorage_client: bool = True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "geopandas.GeoDataFrame": """Create a GeoPandas GeoDataFrame by loading all pages of a query. @@ -2778,6 +2782,34 @@ def to_geodataframe( identifies which one to use to construct a geopandas GeoDataFrame. This option can be ommitted if there's only one GEOGRAPHY column. + bool_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``) + to convert BigQuery Boolean type, instead of relying on the default + ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type + int_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) + to convert BigQuery Integer types, instead of relying on the default + ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("int64")``. A list of BigQuery + Integer types can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types + float_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) + to convert BigQuery Float type, instead of relying on the default + ``numpy.dtype("float64")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("float64")``. BigQuery Float + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types + string_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to + convert BigQuery String type, instead of relying on the default + ``numpy.dtype("object")``. If you explicitly set the value to ``None``, + then the data type will be ``numpy.dtype("object")``. 
BigQuery String + type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type Returns: geopandas.GeoDataFrame: @@ -2829,6 +2861,10 @@ def to_geodataframe( progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=bool_dtype, + int_dtype=int_dtype, + float_dtype=float_dtype, + string_dtype=string_dtype, ) return geopandas.GeoDataFrame( @@ -2932,6 +2968,10 @@ def to_geodataframe( progress_bar_type=None, create_bqstorage_client=True, geography_column: Optional[str] = None, + bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE, + int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, + float_dtype: Union[Any, None] = None, + string_dtype: Union[Any, None] = None, ) -> "pandas.DataFrame": """Create an empty dataframe. @@ -2941,6 +2981,10 @@ def to_geodataframe( progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. geography_column (str): Ignored. Added for compatibility with RowIterator. + bool_dtype (Any): Ignored. Added for compatibility with RowIterator. + int_dtype (Any): Ignored. Added for compatibility with RowIterator. + float_dtype (Any): Ignored. Added for compatibility with RowIterator. + string_dtype (Any): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. diff --git a/noxfile.py b/noxfile.py index c2b4bbb50..1922a68a5 100644 --- a/noxfile.py +++ b/noxfile.py @@ -109,9 +109,7 @@ def default(session, install_extras=True): # that logic (and the associated tests) we avoid installing the [ipython] extra # which has a downstream effect of then avoiding installing bigquery_magics. if install_extras and session.python == UNIT_TEST_PYTHON_VERSIONS[0]: - install_target = ( - ".[bqstorage,pandas,ipywidgets,geopandas,tqdm,opentelemetry,bigquery_v2]" - ) + install_target = ".[bqstorage,pandas,ipywidgets,geopandas,matplotlib,tqdm,opentelemetry,bigquery_v2]" elif install_extras: # run against all other UNIT_TEST_PYTHON_VERSIONS install_target = ".[all]" else: diff --git a/pyproject.toml b/pyproject.toml index 38d74cdd0..9c91a2fc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,11 @@ pandas = [ ipywidgets = ["ipywidgets >= 7.7.1", "ipykernel >= 6.2.0"] geopandas = ["geopandas >= 0.9.0, < 2.0.0", "Shapely >= 1.8.4, < 3.0.0"] ipython = ["ipython >= 7.23.1", "bigquery-magics >= 0.6.0"] -tqdm = ["tqdm >= 4.7.4, < 5.0.0"] +matplotlib = [ + "matplotlib >= 3.7.1, <= 3.9.2; python_version == '3.9'", + "matplotlib >= 3.10.3; python_version >= '3.10'", +] +tqdm = ["tqdm >= 4.23.4, < 5.0.0"] opentelemetry = [ "opentelemetry-api >= 1.1.0", "opentelemetry-sdk >= 1.1.0", @@ -93,7 +97,7 @@ bigquery_v2 = [ "protobuf >= 3.20.2, < 7.0.0, != 4.21.0, != 4.21.1, != 4.21.2, != 4.21.3, != 4.21.4, != 4.21.5", # For the legacy proto-based types. 
] all = [ - "google-cloud-bigquery[bqstorage,pandas,ipywidgets,geopandas,ipython,tqdm,opentelemetry,bigquery_v2]", + "google-cloud-bigquery[bqstorage,pandas,ipywidgets,geopandas,ipython,matplotlib,tqdm,opentelemetry,bigquery_v2]", ] [tool.setuptools.dynamic] diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index cb6c29f3b..60a155f0d 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -29,4 +29,4 @@ pyarrow==4.0.0 python-dateutil==2.8.2 requests==2.21.0 Shapely==1.8.4 -tqdm==4.7.4 +matplotlib==3.7.1 diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 2cda59bd1..d82f0dfe3 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -22,6 +22,7 @@ from ..helpers import make_connection from .helpers import _make_client from .helpers import _make_job_resource +from google.cloud.bigquery.enums import DefaultPandasDTypes try: from google.cloud import bigquery_storage @@ -30,6 +31,7 @@ except (ImportError, AttributeError): bigquery_storage = None + try: import shapely except (ImportError, AttributeError): @@ -1019,5 +1021,9 @@ def test_query_job_to_geodataframe_delegation(wait_for_query): progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, geography_column=geography_column, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, ) assert df is row_iterator.to_geodataframe.return_value diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 253006547..8daa4ce43 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -31,6 +31,7 @@ from google.cloud.bigquery import exceptions from google.cloud.bigquery import external_config from google.cloud.bigquery import schema +from google.cloud.bigquery.enums import DefaultPandasDTypes from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.dataset import DatasetReference @@ -4065,7 +4066,7 @@ def test_to_dataframe_no_tqdm(self): def test_to_dataframe_tqdm_error(self): pytest.importorskip("pandas") - pytest.importorskip("tqdm") + tqdm = pytest.importorskip("tqdm") mock.patch("tqdm.tqdm_gui", new=None) mock.patch("tqdm.notebook.tqdm", new=None) mock.patch("tqdm.tqdm", new=None) @@ -4100,7 +4101,7 @@ def test_to_dataframe_tqdm_error(self): for warning in warned: # pragma: NO COVER self.assertIn( warning.category, - [UserWarning, DeprecationWarning], + [UserWarning, DeprecationWarning, tqdm.TqdmExperimentalWarning], ) def test_to_dataframe_w_empty_results(self): @@ -5639,6 +5640,10 @@ def test_rowiterator_to_geodataframe_delegation(self, to_dataframe): progress_bar_type, create_bqstorage_client, geography_as_object=True, + bool_dtype=DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, ) self.assertIsInstance(df, geopandas.GeoDataFrame) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 94737732b..43d64d77d 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -261,3 +261,106 @@ def test_to_dataframe_with_jobs_query_response(class_under_test): "Tiffani", ] assert list(df["number"]) == [6, 325, 26, 10, 17, 22, 6, 229, 8] + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_default_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = mock.Mock(spec=True) + mock_client = 
mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (1 2)"], + "bool_col": [True], + "int_col": [123], + "float_col": [1.23], + "string_col": ["abc"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + rows.to_geodataframe(geography_column="geo_col") + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=bigquery.enums.DefaultPandasDTypes.BOOL_DTYPE, + int_dtype=bigquery.enums.DefaultPandasDTypes.INT_DTYPE, + float_dtype=None, + string_dtype=None, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + ) + + +@mock.patch("google.cloud.bigquery.table.geopandas") +def test_rowiterator_to_geodataframe_with_custom_dtypes( + mock_geopandas, monkeypatch, class_under_test +): + mock_geopandas.GeoDataFrame = mock.Mock(spec=True) + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + schema = [ + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("string_col", "STRING"), + ] + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, schema) + + mock_df = pandas.DataFrame( + { + "geo_col": ["POINT (3 4)"], + "bool_col": [False], + "int_col": [456], + "float_col": [4.56], + "string_col": ["def"], + } + ) + rows.to_dataframe = mock.Mock(return_value=mock_df) + + custom_bool_dtype = "bool" + custom_int_dtype = "int32" + custom_float_dtype = "float32" + custom_string_dtype = "string" + + rows.to_geodataframe( + geography_column="geo_col", + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + ) + + rows.to_dataframe.assert_called_once_with( + None, # bqstorage_client + None, # dtypes + None, # progress_bar_type + True, # create_bqstorage_client + geography_as_object=True, + bool_dtype=custom_bool_dtype, + int_dtype=custom_int_dtype, + float_dtype=custom_float_dtype, + string_dtype=custom_string_dtype, + ) + mock_geopandas.GeoDataFrame.assert_called_once_with( + mock_df, crs="EPSG:4326", geometry="geo_col" + ) From 2140a51bac95ab600759bdee576cf3a41c7dc834 Mon Sep 17 00:00:00 2001 From: Chalmer Lowe Date: Wed, 14 May 2025 13:59:44 -0400 Subject: [PATCH 06/13] chore: Fix two types of warnings in unit tests (#2183) * Fix two types of warnings in unit tests This commit addresses two warnings that appear when running unit tests: 1. `PytestRemovedIn9Warning` in `tests/unit/test_opentelemetry_tracing.py`: Removed a `@pytest.mark.skipif` decorator from a fixture. The skip condition is already present on the test methods using the fixture. 2. `FutureWarning` in `tests/unit/test_client.py`: Updated calls to `client.query()` to include `job_retry=None` when `job_id` is also specified. This is to avoid ambiguity as BigQuery cannot retry a failed job with the exact same ID. 
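Editor's note, an illustrative sketch only (not part of the patch): the calling pattern the updated tests settle on looks like the following, assuming an authenticated `bigquery.Client` and a caller-chosen job ID.

    from google.cloud import bigquery

    client = bigquery.Client()
    # A job with a caller-supplied ID cannot be retried under that same ID,
    # so pass job_retry=None whenever job_id is given explicitly to avoid
    # the FutureWarning described above.
    job = client.query("SELECT 1;", job_id="my-fixed-job-id", job_retry=None)
    rows = job.result()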
* Update tests/unit/test_client.py * Update tests/unit/test_client.py * Update linting * adds more examples of functions where job_retry is needed --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- tests/unit/test_client.py | 52 ++++++++++++++++++------ tests/unit/test_opentelemetry_tracing.py | 1 - 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index b8140df66..a35338698 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -4719,7 +4719,7 @@ def test_query_w_api_method_query_and_job_id_fails(self): client._connection = make_connection({}) with self.assertRaises(TypeError) as exc: - client.query(query, job_id="abcd", api_method="QUERY") + client.query(query, job_id="abcd", api_method="QUERY", job_retry=None) self.assertIn( "`job_id` was provided, but the 'QUERY' `api_method` was requested", exc.exception.args[0], @@ -4774,7 +4774,11 @@ def test_query_w_explicit_project(self): conn = client._connection = make_connection(resource) client.query( - query, job_id=job_id, project="other-project", location=self.LOCATION + query, + job_id=job_id, + project="other-project", + location=self.LOCATION, + job_retry=None, ) # Check that query actually starts the job. @@ -4833,7 +4837,11 @@ def test_query_w_explicit_job_config(self): original_config_copy = copy.deepcopy(job_config) client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) # Check that query actually starts the job. @@ -4884,7 +4892,11 @@ def test_query_preserving_explicit_job_config(self): original_config_copy = copy.deepcopy(job_config) client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) # Check that query actually starts the job. @@ -4940,7 +4952,13 @@ def test_query_preserving_explicit_default_job_config(self): ) conn = client._connection = make_connection(resource) - client.query(query, job_id=job_id, location=self.LOCATION, job_config=None) + client.query( + query, + job_id=job_id, + location=self.LOCATION, + job_config=None, + job_retry=None, + ) # Check that query actually starts the job. conn.api_request.assert_called_once_with( @@ -4978,7 +4996,11 @@ def test_query_w_invalid_job_config(self): with self.assertRaises(TypeError) as exc: client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) self.assertIn("Expected an instance of QueryJobConfig", exc.exception.args[0]) @@ -5027,7 +5049,11 @@ def test_query_w_explicit_job_config_override(self): job_config.default_dataset = None client.query( - query, job_id=job_id, location=self.LOCATION, job_config=job_config + query, + job_id=job_id, + location=self.LOCATION, + job_config=job_config, + job_retry=None, ) # Check that query actually starts the job. @@ -5072,7 +5098,7 @@ def test_query_w_client_default_config_no_incoming(self): ) conn = client._connection = make_connection(resource) - client.query(query, job_id=job_id, location=self.LOCATION) + client.query(query, job_id=job_id, location=self.LOCATION, job_retry=None) # Check that query actually starts the job. 
conn.api_request.assert_called_once_with( @@ -5114,7 +5140,7 @@ def test_query_w_client_location(self): ) conn = client._connection = make_connection(resource) - client.query(query, job_id=job_id, project="other-project") + client.query(query, job_id=job_id, project="other-project", job_retry=None) # Check that query actually starts the job. conn.api_request.assert_called_once_with( @@ -5178,7 +5204,7 @@ def test_query_w_udf_resources(self): config.udf_resources = udf_resources config.use_legacy_sql = True - job = client.query(QUERY, job_config=config, job_id=JOB) + job = client.query(QUERY, job_config=config, job_id=JOB, job_retry=None) self.assertIsInstance(job, QueryJob) self.assertIs(job._client, client) @@ -5234,7 +5260,7 @@ def test_query_w_query_parameters(self): config = QueryJobConfig() config.query_parameters = query_parameters - job = client.query(QUERY, job_config=config, job_id=JOB) + job = client.query(QUERY, job_config=config, job_id=JOB, job_retry=None) self.assertIsInstance(job, QueryJob) self.assertIs(job._client, client) @@ -5277,7 +5303,7 @@ def test_query_job_rpc_fail_w_random_error(self): ) with job_begin_patcher: with pytest.raises(Unknown, match="Not sure what went wrong."): - client.query("SELECT 1;", job_id="123") + client.query("SELECT 1;", job_id="123", job_retry=None) def test_query_job_rpc_fail_w_conflict_job_id_given(self): from google.api_core.exceptions import Conflict @@ -5293,7 +5319,7 @@ def test_query_job_rpc_fail_w_conflict_job_id_given(self): ) with job_begin_patcher: with pytest.raises(Conflict, match="Job already exists."): - client.query("SELECT 1;", job_id="123") + client.query("SELECT 1;", job_id="123", job_retry=None) def test_query_job_rpc_fail_w_conflict_random_id_job_fetch_fails(self): from google.api_core.exceptions import Conflict diff --git a/tests/unit/test_opentelemetry_tracing.py b/tests/unit/test_opentelemetry_tracing.py index 546cc02bd..57132a1b9 100644 --- a/tests/unit/test_opentelemetry_tracing.py +++ b/tests/unit/test_opentelemetry_tracing.py @@ -42,7 +42,6 @@ TEST_SPAN_ATTRIBUTES = {"foo": "baz"} -@pytest.mark.skipif(opentelemetry is None, reason="Require `opentelemetry`") @pytest.fixture def setup(): importlib.reload(opentelemetry_tracing) From 110ad603cf61566c3421e26a028d897135e526d3 Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Wed, 14 May 2025 20:42:29 +0200 Subject: [PATCH 07/13] chore(deps): update all dependencies to v3.32.0 (#2179) Co-authored-by: Chalmer Lowe --- samples/desktopapp/requirements.txt | 2 +- samples/geography/requirements.txt | 2 +- samples/magics/requirements.txt | 2 +- samples/notebooks/requirements.txt | 2 +- samples/snippets/requirements.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/desktopapp/requirements.txt b/samples/desktopapp/requirements.txt index b98f4ace9..743d0fe35 100644 --- a/samples/desktopapp/requirements.txt +++ b/samples/desktopapp/requirements.txt @@ -1,2 +1,2 @@ -google-cloud-bigquery==3.31.0 +google-cloud-bigquery==3.32.0 google-auth-oauthlib==1.2.2 diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 3ff1b2944..434a594cb 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -12,7 +12,7 @@ geojson==3.2.0 geopandas==1.0.1 google-api-core==2.24.2 google-auth==2.40.1 -google-cloud-bigquery==3.31.0 +google-cloud-bigquery==3.32.0 google-cloud-bigquery-storage==2.31.0 google-cloud-core==2.4.3 google-crc32c==1.7.1 diff --git a/samples/magics/requirements.txt 
b/samples/magics/requirements.txt index b000aa50c..bb60f2a67 100644 --- a/samples/magics/requirements.txt +++ b/samples/magics/requirements.txt @@ -1,6 +1,6 @@ bigquery_magics==0.9.0 db-dtypes==1.4.3 -google.cloud.bigquery==3.31.0 +google.cloud.bigquery==3.32.0 google-cloud-bigquery-storage==2.31.0 ipython===8.18.1 pandas==2.2.3 diff --git a/samples/notebooks/requirements.txt b/samples/notebooks/requirements.txt index d80ffcd09..17f43bf78 100644 --- a/samples/notebooks/requirements.txt +++ b/samples/notebooks/requirements.txt @@ -1,6 +1,6 @@ bigquery-magics==0.9.0 db-dtypes==1.4.3 -google-cloud-bigquery==3.31.0 +google-cloud-bigquery==3.32.0 google-cloud-bigquery-storage==2.31.0 ipython===8.18.1; python_version == '3.9' ipython==9.2.0; python_version >= '3.10' diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 4b88c6b70..c31815d69 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -google-cloud-bigquery==3.31.0 +google-cloud-bigquery==3.32.0 From 5805066d9dfb696e7f514569567a5432ee98ad2b Mon Sep 17 00:00:00 2001 From: Chalmer Lowe Date: Thu, 15 May 2025 05:11:00 -0400 Subject: [PATCH 08/13] refactor: Fix DeprecationWarnings for datetime methods in job tests (#2185) * Fix DeprecationWarnings for datetime methods in job tests Replaced calls to deprecated `datetime.datetime.utcnow()` with `datetime.datetime.now(datetime.UTC)` in `tests/unit/job/test_base.py`. Replaced calls to deprecated `datetime.datetime.utcfromtimestamp()` with `datetime.datetime.fromtimestamp(timestamp, datetime.UTC)` in `tests/unit/job/helpers.py`. These changes address the specific warnings identified in the issue for these two files. * Update tests/unit/job/test_base.py * Update tests/unit/job/test_base.py * Updates datetime code related to UTC --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- tests/unit/job/helpers.py | 4 +++- tests/unit/job/test_base.py | 2 +- tests/unit/test__pandas_helpers.py | 2 +- tests/unit/test_client.py | 17 ++++++++--------- tests/unit/test_dataset.py | 4 +++- tests/unit/test_query.py | 17 +++++++++-------- tests/unit/test_table.py | 8 ++++++-- 7 files changed, 31 insertions(+), 23 deletions(-) diff --git a/tests/unit/job/helpers.py b/tests/unit/job/helpers.py index 3642c7229..24ba2fa99 100644 --- a/tests/unit/job/helpers.py +++ b/tests/unit/job/helpers.py @@ -106,7 +106,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.006 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.ETAG = "ETAG" self.FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID) self.RESOURCE_URL = "{}/bigquery/v2/projects/{}/jobs/{}".format( diff --git a/tests/unit/job/test_base.py b/tests/unit/job/test_base.py index 2d2f0c13c..22a0fa450 100644 --- a/tests/unit/job/test_base.py +++ b/tests/unit/job/test_base.py @@ -331,7 +331,7 @@ def _datetime_and_millis(): import datetime from google.cloud._helpers import _millis - now = datetime.datetime.utcnow().replace( + now = datetime.datetime.now(datetime.timezone.utc).replace( microsecond=123000, tzinfo=datetime.timezone.utc, # stats timestamps have ms precision ) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 48c085c1d..d6ea5df7e 100644 --- 
a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -886,7 +886,7 @@ def test_list_columns_and_indexes_with_named_index_same_as_column_name( @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_dataframe_to_json_generator(module_under_test): - utcnow = datetime.datetime.utcnow() + utcnow = datetime.datetime.now(datetime.timezone.utc) dataframe = pandas.DataFrame( { "a_series": [1, 2, 3, 4], diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index a35338698..468068321 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -5853,7 +5853,7 @@ def test_insert_rows_w_schema(self): from google.cloud.bigquery.schema import SchemaField WHEN_TS = 1437767599.006 - WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS).replace(tzinfo=UTC) + WHEN = datetime.datetime.fromtimestamp(WHEN_TS, UTC).replace(tzinfo=UTC) PATH = "projects/%s/datasets/%s/tables/%s/insertAll" % ( self.PROJECT, self.DS_ID, @@ -5914,7 +5914,7 @@ def test_insert_rows_w_list_of_dictionaries(self): from google.cloud.bigquery.table import Table WHEN_TS = 1437767599.006 - WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS).replace(tzinfo=UTC) + WHEN = datetime.datetime.fromtimestamp(WHEN_TS, UTC).replace(tzinfo=UTC) PATH = "projects/%s/datasets/%s/tables/%s/insertAll" % ( self.PROJECT, self.DS_ID, @@ -6097,6 +6097,7 @@ def _row_data(row): ) def test_insert_rows_w_repeated_fields(self): + from google.cloud._helpers import UTC from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table @@ -6126,12 +6127,8 @@ def test_insert_rows_w_repeated_fields(self): ( 12, [ - datetime.datetime( - 2018, 12, 1, 12, 0, 0, tzinfo=datetime.timezone.utc - ), - datetime.datetime( - 2018, 12, 1, 13, 0, 0, tzinfo=datetime.timezone.utc - ), + datetime.datetime(2018, 12, 1, 12, 0, 0, tzinfo=UTC), + datetime.datetime(2018, 12, 1, 13, 0, 0, tzinfo=UTC), ], [1.25, 2.5], ), @@ -6966,7 +6963,9 @@ def test_list_rows(self): ) WHEN_TS = 1437767599006000 - WHEN = datetime.datetime.utcfromtimestamp(WHEN_TS / 1e6).replace(tzinfo=UTC) + WHEN = datetime.datetime.fromtimestamp( + WHEN_TS / 1e6, datetime.timezone.utc + ).replace(tzinfo=UTC) WHEN_1 = WHEN + datetime.timedelta(microseconds=1) WHEN_2 = WHEN + datetime.timedelta(microseconds=2) ROWS = 1234 diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 5cce2a9a7..3fd2579af 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -945,7 +945,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.006 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.ETAG = "ETAG" self.DS_FULL_ID = "%s:%s" % (self.PROJECT, self.DS_ID) self.RESOURCE_URL = "http://example.com/path/to/resource" diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 40ef080f7..0d967bdb8 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -637,9 +637,9 @@ def test_to_api_repr_w_timestamp_datetime(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_timestamp_micros(self): - from google.cloud._helpers import _microseconds_from_datetime + from google.cloud._helpers import _microseconds_from_datetime, UTC - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) seconds = _microseconds_from_datetime(now) / 1.0e6 EXPECTED = { "parameterType": {"type": "TIMESTAMP"}, @@ -650,9 
+650,9 @@ def test_to_api_repr_w_timestamp_micros(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_datetime_datetime(self): - from google.cloud._helpers import _datetime_to_rfc3339 + from google.cloud._helpers import _datetime_to_rfc3339, UTC - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) EXPECTED = { "parameterType": {"type": "DATETIME"}, "parameterValue": { @@ -664,9 +664,9 @@ def test_to_api_repr_w_datetime_datetime(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_datetime_string(self): - from google.cloud._helpers import _datetime_to_rfc3339 + from google.cloud._helpers import _datetime_to_rfc3339, UTC - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) now_str = _datetime_to_rfc3339(now) EXPECTED = { "parameterType": {"type": "DATETIME"}, @@ -1047,9 +1047,10 @@ def test_to_api_repr_w_datetime_str(self): self.assertEqual(param.to_api_repr(), EXPECTED) def test_to_api_repr_w_datetime_datetime(self): + from google.cloud._helpers import UTC # type: ignore from google.cloud.bigquery._helpers import _RFC3339_MICROS_NO_ZULU - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) now_str = now.strftime(_RFC3339_MICROS_NO_ZULU) EXPECTED = { "parameterType": { @@ -1089,7 +1090,7 @@ def test_to_api_repr_w_timestamp_str(self): def test_to_api_repr_w_timestamp_timestamp(self): from google.cloud._helpers import UTC # type: ignore - now = datetime.datetime.utcnow() + now = datetime.datetime.now(UTC) now = now.astimezone(UTC) now_str = str(now) EXPECTED = { diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 8daa4ce43..92fa0e2ec 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -395,7 +395,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.006 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.ETAG = "ETAG" self.TABLE_FULL_ID = "%s:%s.%s" % (self.PROJECT, self.DS_ID, self.TABLE_NAME) self.RESOURCE_URL = "http://example.com/path/to/resource" @@ -1952,7 +1954,9 @@ def _setUpConstants(self): from google.cloud._helpers import UTC self.WHEN_TS = 1437767599.125 - self.WHEN = datetime.datetime.utcfromtimestamp(self.WHEN_TS).replace(tzinfo=UTC) + self.WHEN = datetime.datetime.fromtimestamp(self.WHEN_TS, UTC).replace( + tzinfo=UTC + ) self.EXP_TIME = datetime.datetime(2015, 8, 1, 23, 59, 59, tzinfo=UTC) def test_ctor(self): From 7ec2848379d5743bbcb36700a1153540c451e0e0 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 19 May 2025 10:52:16 -0700 Subject: [PATCH 09/13] fix: ensure SchemaField.field_dtype returns a string (#2188) * fix: ensure SchemaField.field_dtype returns a string * fix cover tests * fix unit 3.9 --- google/cloud/bigquery/_pandas_helpers.py | 154 ++++++++++------------- google/cloud/bigquery/schema.py | 28 ++--- tests/unit/test__pandas_helpers.py | 113 ++++++----------- tests/unit/test_schema.py | 5 - 4 files changed, 113 insertions(+), 187 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 457eb9078..6691e7ef6 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -508,31 +508,37 @@ def dataframe_to_bq_schema(dataframe, bq_schema): bq_schema_unused = set() bq_schema_out = [] - unknown_type_fields = [] - + unknown_type_columns = [] + dataframe_reset_index 
= dataframe.reset_index() for column, dtype in list_columns_and_indexes(dataframe): - # Use provided type from schema, if present. + # Step 1: use provided type from schema, if present. bq_field = bq_schema_index.get(column) if bq_field: bq_schema_out.append(bq_field) bq_schema_unused.discard(bq_field.name) continue - # Otherwise, try to automatically determine the type based on the + # Step 2: try to automatically determine the type based on the # pandas dtype. bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name) if bq_type is None: - sample_data = _first_valid(dataframe.reset_index()[column]) + sample_data = _first_valid(dataframe_reset_index[column]) if ( isinstance(sample_data, _BaseGeometry) and sample_data is not None # Paranoia ): bq_type = "GEOGRAPHY" - bq_field = schema.SchemaField(column, bq_type) - bq_schema_out.append(bq_field) + if bq_type is not None: + bq_schema_out.append(schema.SchemaField(column, bq_type)) + continue + + # Step 3: try with pyarrow if available + bq_field = _get_schema_by_pyarrow(column, dataframe_reset_index[column]) + if bq_field is not None: + bq_schema_out.append(bq_field) + continue - if bq_field.field_type is None: - unknown_type_fields.append(bq_field) + unknown_type_columns.append(column) # Catch any schema mismatch. The developer explicitly asked to serialize a # column, but it was not found. @@ -543,98 +549,70 @@ def dataframe_to_bq_schema(dataframe, bq_schema): ) ) - # If schema detection was not successful for all columns, also try with - # pyarrow, if available. - if unknown_type_fields: - if not pyarrow: - msg = "Could not determine the type of columns: {}".format( - ", ".join(field.name for field in unknown_type_fields) - ) - warnings.warn(msg) - return None # We cannot detect the schema in full. - - # The augment_schema() helper itself will also issue unknown type - # warnings if detection still fails for any of the fields. - bq_schema_out = augment_schema(dataframe, bq_schema_out) + if unknown_type_columns != []: + msg = "Could not determine the type of columns: {}".format( + ", ".join(unknown_type_columns) + ) + warnings.warn(msg) + return None # We cannot detect the schema in full. - return tuple(bq_schema_out) if bq_schema_out else None + return tuple(bq_schema_out) -def augment_schema(dataframe, current_bq_schema): - """Try to deduce the unknown field types and return an improved schema. +def _get_schema_by_pyarrow(name, series): + """Attempt to detect the type of the given series by leveraging PyArrow's + type detection capabilities. - This function requires ``pyarrow`` to run. If all the missing types still - cannot be detected, ``None`` is returned. If all types are already known, - a shallow copy of the given schema is returned. + This function requires the ``pyarrow`` library to be installed and + available. If the series type cannot be determined or ``pyarrow`` is not + available, ``None`` is returned. Args: - dataframe (pandas.DataFrame): - DataFrame for which some of the field types are still unknown. - current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]): - A BigQuery schema for ``dataframe``. The types of some or all of - the fields may be ``None``. + name (str): + the column name of the SchemaField. + series (pandas.Series): + The Series data for which to detect the data type. 
Returns: - Optional[Sequence[google.cloud.bigquery.schema.SchemaField]] + Optional[google.cloud.bigquery.schema.SchemaField]: + A tuple containing the BigQuery-compatible type string (e.g., + "STRING", "INTEGER", "TIMESTAMP", "DATETIME", "NUMERIC", "BIGNUMERIC") + and the mode string ("NULLABLE", "REPEATED"). + Returns ``None`` if the type cannot be determined or ``pyarrow`` + is not imported. """ - # pytype: disable=attribute-error - augmented_schema = [] - unknown_type_fields = [] - for field in current_bq_schema: - if field.field_type is not None: - augmented_schema.append(field) - continue - - arrow_table = pyarrow.array(dataframe.reset_index()[field.name]) - - if pyarrow.types.is_list(arrow_table.type): - # `pyarrow.ListType` - detected_mode = "REPEATED" - detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq( - arrow_table.values.type.id - ) - - # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds - # it to such datetimes, causing them to be recognized as TIMESTAMP type. - # We thus additionally check the actual data to see if we need to overrule - # that and choose DATETIME instead. - # Note that this should only be needed for datetime values inside a list, - # since scalar datetime values have a proper Pandas dtype that allows - # distinguishing between timezone-naive and timezone-aware values before - # even requiring the additional schema augment logic in this method. - if detected_type == "TIMESTAMP": - valid_item = _first_array_valid(dataframe[field.name]) - if isinstance(valid_item, datetime) and valid_item.tzinfo is None: - detected_type = "DATETIME" - else: - detected_mode = field.mode - detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id) - if detected_type == "NUMERIC" and arrow_table.type.scale > 9: - detected_type = "BIGNUMERIC" - if detected_type is None: - unknown_type_fields.append(field) - continue + if not pyarrow: + return None - new_field = schema.SchemaField( - name=field.name, - field_type=detected_type, - mode=detected_mode, - description=field.description, - fields=field.fields, - ) - augmented_schema.append(new_field) + arrow_table = pyarrow.array(series) + if pyarrow.types.is_list(arrow_table.type): + # `pyarrow.ListType` + mode = "REPEATED" + type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.values.type.id) + + # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds + # it to such datetimes, causing them to be recognized as TIMESTAMP type. + # We thus additionally check the actual data to see if we need to overrule + # that and choose DATETIME instead. + # Note that this should only be needed for datetime values inside a list, + # since scalar datetime values have a proper Pandas dtype that allows + # distinguishing between timezone-naive and timezone-aware values before + # even requiring the additional schema augment logic in this method. 
+ if type == "TIMESTAMP": + valid_item = _first_array_valid(series) + if isinstance(valid_item, datetime) and valid_item.tzinfo is None: + type = "DATETIME" + else: + mode = "NULLABLE" # default mode + type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id) + if type == "NUMERIC" and arrow_table.type.scale > 9: + type = "BIGNUMERIC" - if unknown_type_fields: - warnings.warn( - "Pyarrow could not determine the type of columns: {}.".format( - ", ".join(field.name for field in unknown_type_fields) - ) - ) + if type is not None: + return schema.SchemaField(name, type, mode) + else: return None - return augmented_schema - # pytype: enable=attribute-error - def dataframe_to_arrow(dataframe, bq_schema): """Convert pandas dataframe to Arrow table, using BigQuery schema. diff --git a/google/cloud/bigquery/schema.py b/google/cloud/bigquery/schema.py index 749b0a00e..1f1aab7a4 100644 --- a/google/cloud/bigquery/schema.py +++ b/google/cloud/bigquery/schema.py @@ -284,15 +284,13 @@ def name(self): return self._properties.get("name", "") @property - def field_type(self): + def field_type(self) -> str: """str: The type of the field. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type """ type_ = self._properties.get("type") - if type_ is None: # Shouldn't happen, but some unit tests do this. - return None return cast(str, type_).upper() @property @@ -397,20 +395,16 @@ def _key(self): Returns: Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`. """ - field_type = self.field_type.upper() if self.field_type is not None else None - - # Type can temporarily be set to None if the code needs a SchemaField instance, - # but has not determined the exact type of the field yet. - if field_type is not None: - if field_type == "STRING" or field_type == "BYTES": - if self.max_length is not None: - field_type = f"{field_type}({self.max_length})" - elif field_type.endswith("NUMERIC"): - if self.precision is not None: - if self.scale is not None: - field_type = f"{field_type}({self.precision}, {self.scale})" - else: - field_type = f"{field_type}({self.precision})" + field_type = self.field_type + if field_type == "STRING" or field_type == "BYTES": + if self.max_length is not None: + field_type = f"{field_type}({self.max_length})" + elif field_type.endswith("NUMERIC"): + if self.precision is not None: + if self.scale is not None: + field_type = f"{field_type}({self.precision}, {self.scale})" + else: + field_type = f"{field_type}({self.precision})" policy_tags = ( None if self.policy_tags is None else tuple(sorted(self.policy_tags.names)) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index d6ea5df7e..d87c65581 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -1568,31 +1568,7 @@ def test_augment_schema_type_detection_succeeds(module_under_test): # set to "datetime64[ns]", and pyarrow converts that to pyarrow.TimestampArray. # We thus cannot expect to get a DATETIME date when converting back to the # BigQuery type. 
- - current_schema = ( - schema.SchemaField("bool_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("int_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("float_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("time_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("timestamp_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("date_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("bytes_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("string_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), - ) - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - - # there should be no relevant warnings - unwanted_warnings = [ - warning for warning in warned if "Pyarrow could not" in str(warning) - ] - assert not unwanted_warnings - - # the augmented schema must match the expected - expected_schema = ( + expected_schemas = ( schema.SchemaField("bool_field", field_type="BOOL", mode="NULLABLE"), schema.SchemaField("int_field", field_type="INT64", mode="NULLABLE"), schema.SchemaField("float_field", field_type="FLOAT64", mode="NULLABLE"), @@ -1607,8 +1583,13 @@ def test_augment_schema_type_detection_succeeds(module_under_test): ), ) - by_name = operator.attrgetter("name") - assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) + for col_name, expected_schema in zip(dataframe, expected_schemas): + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field == expected_schema @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @@ -1639,30 +1620,20 @@ def test_augment_schema_repeated_fields(module_under_test): ] ) - current_schema = ( - schema.SchemaField("string_array", field_type=None, mode="NULLABLE"), - schema.SchemaField("timestamp_array", field_type=None, mode="NULLABLE"), - schema.SchemaField("datetime_array", field_type=None, mode="NULLABLE"), - ) - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - - # there should be no relevant warnings - unwanted_warnings = [ - warning for warning in warned if "Pyarrow could not" in str(warning) - ] - assert not unwanted_warnings - # the augmented schema must match the expected - expected_schema = ( + expected_schemas = ( schema.SchemaField("string_array", field_type="STRING", mode="REPEATED"), schema.SchemaField("timestamp_array", field_type="TIMESTAMP", mode="REPEATED"), schema.SchemaField("datetime_array", field_type="DATETIME", mode="REPEATED"), ) - by_name = operator.attrgetter("name") - assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) + for col_name, expected_schema in zip(dataframe, expected_schemas): + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field == expected_schema @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @@ -1681,24 +1652,21 @@ def test_augment_schema_type_detection_fails(module_under_test): }, ] ) - current_schema = [ - schema.SchemaField("status", field_type="STRING", 
mode="NULLABLE"), - schema.SchemaField("struct_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("struct_field_2", field_type=None, mode="NULLABLE"), - ] - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - assert augmented_schema is None + expected_schemas = ( + schema.SchemaField("status", field_type="STRING", mode="NULLABLE"), + # Could not determine the type of these columns + None, + None, + ) - expected_warnings = [ - warning for warning in warned if "could not determine" in str(warning) - ] - assert len(expected_warnings) == 1 - warning_msg = str(expected_warnings[0]) - assert "pyarrow" in warning_msg.lower() - assert "struct_field" in warning_msg and "struct_field_2" in warning_msg + for col_name, expected_schema in zip(dataframe, expected_schemas): + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field == expected_schema @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @@ -1706,23 +1674,14 @@ def test_augment_schema_type_detection_fails_array_data(module_under_test): dataframe = pandas.DataFrame( data=[{"all_none_array": [None, float("NaN")], "empty_array": []}] ) - current_schema = [ - schema.SchemaField("all_none_array", field_type=None, mode="NULLABLE"), - schema.SchemaField("empty_array", field_type=None, mode="NULLABLE"), - ] - - with warnings.catch_warnings(record=True) as warned: - augmented_schema = module_under_test.augment_schema(dataframe, current_schema) - assert augmented_schema is None - - expected_warnings = [ - warning for warning in warned if "could not determine" in str(warning) - ] - assert len(expected_warnings) == 1 - warning_msg = str(expected_warnings[0]) - assert "pyarrow" in warning_msg.lower() - assert "all_none_array" in warning_msg and "empty_array" in warning_msg + for col_name in dataframe: + with warnings.catch_warnings(record=True) as warned: + schema_field = module_under_test._get_schema_by_pyarrow( + col_name, dataframe[col_name] + ) + assert warned == [] + assert schema_field is None @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 3f2304a70..c63a8312c 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -640,11 +640,6 @@ def test___repr__(self): expected = "SchemaField('field1', 'STRING', 'NULLABLE', None, None, (), None)" self.assertEqual(repr(field1), expected) - def test___repr__type_not_set(self): - field1 = self._make_one("field1", field_type=None) - expected = "SchemaField('field1', None, 'NULLABLE', None, None, (), None)" - self.assertEqual(repr(field1), expected) - def test___repr__evaluable_no_policy_tags(self): field = self._make_one("field1", "STRING", "REQUIRED", "Description") field_repr = repr(field) From cb646ceea172bf199f366ae0592546dff2d3bcb2 Mon Sep 17 00:00:00 2001 From: Lingqing Gan Date: Mon, 19 May 2025 12:10:12 -0700 Subject: [PATCH 10/13] feat: support job reservation (#2186) * feat: support job reservation * Update google/cloud/bigquery/job/base.py Co-authored-by: Chalmer Lowe --------- Co-authored-by: Chalmer Lowe --- google/cloud/bigquery/job/base.py | 32 +++++++++++++++++++++++ tests/unit/job/test_base.py | 42 +++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/google/cloud/bigquery/job/base.py 
b/google/cloud/bigquery/job/base.py index eaa9d3460..5eb700ce7 100644 --- a/google/cloud/bigquery/job/base.py +++ b/google/cloud/bigquery/job/base.py @@ -224,6 +224,26 @@ def job_timeout_ms(self, value): else: self._properties.pop("jobTimeoutMs", None) + @property + def reservation(self): + """str: Optional. The reservation that job would use. + + User can specify a reservation to execute the job. If reservation is + not set, reservation is determined based on the rules defined by the + reservation assignments. The expected format is + projects/{project}/locations/{location}/reservations/{reservation}. + + Raises: + ValueError: If ``value`` type is not None or of string type. + """ + return self._properties.setdefault("reservation", None) + + @reservation.setter + def reservation(self, value): + if value and not isinstance(value, str): + raise ValueError("Reservation must be None or a string.") + self._properties["reservation"] = value + @property def labels(self): """Dict[str, str]: Labels for the job. @@ -488,6 +508,18 @@ def location(self): """str: Location where the job runs.""" return _helpers._get_sub_prop(self._properties, ["jobReference", "location"]) + @property + def reservation_id(self): + """str: Name of the primary reservation assigned to this job. + + Note that this could be different than reservations reported in + the reservation field if parent reservations were used to execute + this job. + """ + return _helpers._get_sub_prop( + self._properties, ["statistics", "reservation_id"] + ) + def _require_client(self, client): """Check client or verify over-ride. diff --git a/tests/unit/job/test_base.py b/tests/unit/job/test_base.py index 22a0fa450..aa3d49ce3 100644 --- a/tests/unit/job/test_base.py +++ b/tests/unit/job/test_base.py @@ -443,6 +443,16 @@ def test_state(self): status["state"] = state self.assertEqual(job.state, state) + def test_reservation_id(self): + reservation_id = "RESERVATION-ID" + client = _make_client(project=self.PROJECT) + job = self._make_one(self.JOB_ID, client) + self.assertIsNone(job.reservation_id) + stats = job._properties["statistics"] = {} + self.assertIsNone(job.reservation_id) + stats["reservation_id"] = reservation_id + self.assertEqual(job.reservation_id, reservation_id) + def _set_properties_job(self): client = _make_client(project=self.PROJECT) job = self._make_one(self.JOB_ID, client) @@ -1188,15 +1198,18 @@ def test_fill_query_job_config_from_default(self): job_config = QueryJobConfig() job_config.dry_run = True job_config.maximum_bytes_billed = 1000 + job_config.reservation = "reservation_1" default_job_config = QueryJobConfig() default_job_config.use_query_cache = True default_job_config.maximum_bytes_billed = 2000 + default_job_config.reservation = "reservation_2" final_job_config = job_config._fill_from_default(default_job_config) self.assertTrue(final_job_config.dry_run) self.assertTrue(final_job_config.use_query_cache) self.assertEqual(final_job_config.maximum_bytes_billed, 1000) + self.assertEqual(final_job_config.reservation, "reservation_1") def test_fill_load_job_from_default(self): from google.cloud.bigquery import LoadJobConfig @@ -1204,15 +1217,18 @@ def test_fill_load_job_from_default(self): job_config = LoadJobConfig() job_config.create_session = True job_config.encoding = "UTF-8" + job_config.reservation = "reservation_1" default_job_config = LoadJobConfig() default_job_config.ignore_unknown_values = True default_job_config.encoding = "ISO-8859-1" + default_job_config.reservation = "reservation_2" final_job_config = 
job_config._fill_from_default(default_job_config) self.assertTrue(final_job_config.create_session) self.assertTrue(final_job_config.ignore_unknown_values) self.assertEqual(final_job_config.encoding, "UTF-8") + self.assertEqual(final_job_config.reservation, "reservation_1") def test_fill_from_default_conflict(self): from google.cloud.bigquery import QueryJobConfig @@ -1232,10 +1248,12 @@ def test_fill_from_empty_default_conflict(self): job_config = QueryJobConfig() job_config.dry_run = True job_config.maximum_bytes_billed = 1000 + job_config.reservation = "reservation_1" final_job_config = job_config._fill_from_default(default_job_config=None) self.assertTrue(final_job_config.dry_run) self.assertEqual(final_job_config.maximum_bytes_billed, 1000) + self.assertEqual(final_job_config.reservation, "reservation_1") @mock.patch("google.cloud.bigquery._helpers._get_sub_prop") def test__get_sub_prop_wo_default(self, _get_sub_prop): @@ -1338,3 +1356,27 @@ def test_job_timeout_properties(self): job_config.job_timeout_ms = None assert job_config.job_timeout_ms is None assert "jobTimeoutMs" not in job_config._properties + + def test_reservation_miss(self): + job_config = self._make_one() + self.assertEqual(job_config.reservation, None) + + def test_reservation_hit(self): + job_config = self._make_one() + job_config._properties["reservation"] = "foo" + self.assertEqual(job_config.reservation, "foo") + + def test_reservation_update_in_place(self): + job_config = self._make_one() + job_config.reservation = "bar" # update in place + self.assertEqual(job_config.reservation, "bar") + + def test_reservation_setter_invalid(self): + job_config = self._make_one() + with self.assertRaises(ValueError): + job_config.reservation = object() + + def test_reservation_setter(self): + job_config = self._make_one() + job_config.reservation = "foo" + self.assertEqual(job_config._properties["reservation"], "foo") From bf58ca5425809b08895eebfa74a8ef5c559a69ac Mon Sep 17 00:00:00 2001 From: Chalmer Lowe Date: Mon, 19 May 2025 16:54:24 -0400 Subject: [PATCH 11/13] ci: Update to the CI/CD pipeline via github workflow to help cut turn-around time (#2189) Update to the CI/CD pipeline via github workflow to help cut turn-around time. * added github workflow * changed the number of pytest-xdist workers from "auto" to "8" (based on local tests and discussion with Tim, choosing auto sometimes takes longer to run than choosing a smaller number. I suspect this is partly because for small or short tests the overhead needed to setup a worker exceeds the time savings of having extra workers). * modified numerous tests to explicitly include a project path to avoid an attempt to find the project by making an external call via the pydata-google-auth workflow (which opens an input and waits for response from the user that never comes). --- .github/workflows/unittest.yml | 89 ++++++++++++++++++++++++++++++++++ noxfile.py | 2 +- tests/unit/test_magics.py | 30 +++++++++++- 3 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/unittest.yml diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml new file mode 100644 index 000000000..24c9ddbaf --- /dev/null +++ b/.github/workflows/unittest.yml @@ -0,0 +1,89 @@ +on: + pull_request: + branches: + - main +name: unittest +jobs: + unit: + # Use `ubuntu-latest` runner. 
+ runs-on: ubuntu-latest + strategy: + matrix: + python: ['3.9', '3.11', '3.12', '3.13'] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run unit tests + env: + COVERAGE_FILE: .coverage-${{ matrix.python }} + run: | + nox -s unit-${{ matrix.python }} + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-artifact-${{ matrix.python }} + path: .coverage-${{ matrix.python }} + include-hidden-files: true + + unit_noextras: + # Use `ubuntu-latest` runner. + runs-on: ubuntu-latest + strategy: + matrix: + python: ['3.9', '3.13'] + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run unit_noextras tests + env: + COVERAGE_FILE: .coverage-unit-noextras-${{ matrix.python }} + run: | + nox -s unit_noextras-${{ matrix.python }} + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-artifact-unit-noextras-${{ matrix.python }} + path: .coverage-unit-noextras-${{ matrix.python }} + include-hidden-files: true + + cover: + runs-on: ubuntu-latest + needs: + - unit + - unit_noextras + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + - name: Install coverage + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install coverage + - name: Download coverage results + uses: actions/download-artifact@v4 + with: + path: .coverage-results/ + - name: Report coverage results + run: | + find .coverage-results -type f -name '*.zip' -exec unzip {} \; + coverage combine .coverage-results/**/.coverage* + coverage report --show-missing --fail-under=100 diff --git a/noxfile.py b/noxfile.py index 1922a68a5..575bbb100 100644 --- a/noxfile.py +++ b/noxfile.py @@ -128,7 +128,7 @@ def default(session, install_extras=True): # Run py.test against the unit tests. 
session.run( "py.test", - "-n=auto", + "-n=8", "--quiet", "-W default::PendingDeprecationWarning", "--cov=google/cloud/bigquery", diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index 0f1e030cb..a9a12283b 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -480,6 +480,7 @@ def test_bigquery_magic_without_optional_arguments(monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -831,6 +832,7 @@ def test_bigquery_magic_w_max_results_query_job_results_fails(monkeypatch): assert close_transports.called +@pytest.mark.usefixtures("ipython_interactive") def test_bigquery_magic_w_table_id_invalid(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) @@ -861,6 +863,7 @@ def test_bigquery_magic_w_table_id_invalid(monkeypatch): assert "Traceback (most recent call last)" not in output +@pytest.mark.usefixtures("ipython_interactive") def test_bigquery_magic_w_missing_query(monkeypatch): ip = IPython.get_ipython() monkeypatch.setattr(bigquery, "bigquery_magics", None) @@ -1354,6 +1357,8 @@ def test_bigquery_magic_w_progress_bar_type_w_context_setter(monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1383,6 +1388,8 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + with run_query_patch as run_query_mock: ip.run_cell_magic( "bigquery", "--progress_bar_type=tqdm_gui", "SELECT 17 as num" @@ -1565,6 +1572,8 @@ def test_bigquery_magic_with_string_params(ipython_ns_cleanup, monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1605,6 +1614,8 @@ def test_bigquery_magic_with_dict_params(ipython_ns_cleanup, monkeypatch): run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1689,6 +1700,7 @@ def test_bigquery_magic_with_option_value_incorrect(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" sql = "SELECT @foo AS foo" @@ -1719,6 +1731,8 @@ def test_bigquery_magic_with_dict_params_negative_value( run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1760,6 +1774,8 @@ def test_bigquery_magic_with_dict_params_array_value(ipython_ns_cleanup, monkeyp run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1801,6 +1817,8 @@ def 
test_bigquery_magic_with_dict_params_tuple_value(ipython_ns_cleanup, monkeyp run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) + magics.context.project = "unit-test-project" + query_job_mock = mock.create_autospec( google.cloud.bigquery.job.QueryJob, instance=True ) @@ -1852,6 +1870,7 @@ def test_bigquery_magic_valid_query_in_existing_variable( magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" ipython_ns_cleanup.append((ip, "custom_query")) ipython_ns_cleanup.append((ip, "query_results_df")) @@ -1892,6 +1911,7 @@ def test_bigquery_magic_nonexisting_query_variable(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True @@ -1917,7 +1937,7 @@ def test_bigquery_magic_empty_query_variable_name(monkeypatch): magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) - + magics.context.project = "unit-test-project" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True ) @@ -1940,6 +1960,7 @@ def test_bigquery_magic_query_variable_non_string(ipython_ns_cleanup, monkeypatc magics.context.credentials = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" run_query_patch = mock.patch( "google.cloud.bigquery.magics.magics._run_query", autospec=True @@ -1968,9 +1989,14 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch): google.auth.credentials.Credentials, instance=True ) + magics.context.project = "unit-test-project" cell_body = "$123foo" # 123foo is not valid Python identifier - with io.capture_output() as captured_io: + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + + with run_query_patch, io.capture_output() as captured_io: ip.run_cell_magic("bigquery", "", cell_body) # If "$" prefixes a string that is not a Python identifier, we do not treat such From 06ee3df6c5346e5041ebab80da8b2a299378444c Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 19 May 2025 16:02:21 -0700 Subject: [PATCH 12/13] Revert "fix: table iterator should not use bqstorage when page_size is not None (#2154)" (#2191) This reverts commit e89a707b162182ededbf94cc9a0f7594bc2be475. --- google/cloud/bigquery/table.py | 11 ++--------- tests/unit/test_dbapi_cursor.py | 1 - tests/unit/test_table.py | 7 ------- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index e084468f6..3f472c490 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1897,11 +1897,6 @@ def total_bytes_processed(self) -> Optional[int]: """total bytes processed from job statistics, if present.""" return self._total_bytes_processed - @property - def page_size(self) -> Optional[int]: - """The maximum number of rows in each page of results from this request, if present.""" - return self._page_size - def _is_almost_completely_cached(self): """Check if all results are completely cached. 
@@ -1953,7 +1948,7 @@ def _should_use_bqstorage(self, bqstorage_client, create_bqstorage_client): if self._is_almost_completely_cached(): return False - if self.max_results is not None or self.page_size is not None: + if self.max_results is not None: return False try: @@ -2023,9 +2018,7 @@ def _maybe_warn_max_results( bqstorage_client: The BigQuery Storage client intended to use for downloading result rows. """ - if bqstorage_client is not None and ( - self.max_results is not None or self.page_size is not None - ): + if bqstorage_client is not None and self.max_results is not None: warnings.warn( "Cannot use bqstorage_client if max_results is set, " "reverting to fetching data with the REST endpoint.", diff --git a/tests/unit/test_dbapi_cursor.py b/tests/unit/test_dbapi_cursor.py index cba9030de..6fca4cec0 100644 --- a/tests/unit/test_dbapi_cursor.py +++ b/tests/unit/test_dbapi_cursor.py @@ -161,7 +161,6 @@ def _mock_rows( mock_rows, ) mock_rows.max_results = None - mock_rows.page_size = None type(mock_rows).job_id = mock.PropertyMock(return_value="test-job-id") type(mock_rows).location = mock.PropertyMock(return_value="test-location") type(mock_rows).num_dml_affected_rows = mock.PropertyMock( diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 92fa0e2ec..4791c6511 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -2858,13 +2858,6 @@ def test__should_use_bqstorage_returns_false_if_max_results_set(self): ) self.assertFalse(result) - def test__should_use_bqstorage_returns_false_if_page_size_set(self): - iterator = self._make_one(page_size=10, first_page_response=None) # not cached - result = iterator._should_use_bqstorage( - bqstorage_client=None, create_bqstorage_client=True - ) - self.assertFalse(result) - def test__should_use_bqstorage_returns_false_w_warning_if_missing_dependency(self): iterator = self._make_one(first_page_response=None) # not cached From b140fca726488106693aaf14695cb7bb9b4b2796 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 19 May 2025 16:27:29 -0700 Subject: [PATCH 13/13] chore(main): release 3.33.0 (#2180) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> Co-authored-by: Lingqing Gan --- CHANGELOG.md | 15 +++++++++++++++ google/cloud/bigquery/version.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff1bd7acc..2f7166d44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,21 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history +## [3.33.0](https://github.com/googleapis/python-bigquery/compare/v3.32.0...v3.33.0) (2025-05-19) + + +### Features + +* Add ability to set autodetect_schema query param in update_table ([#2171](https://github.com/googleapis/python-bigquery/issues/2171)) ([57f940d](https://github.com/googleapis/python-bigquery/commit/57f940d957613b4d80fb81ea40a1177b73856189)) +* Add dtype parameters to to_geodataframe functions ([#2176](https://github.com/googleapis/python-bigquery/issues/2176)) ([ebfd0a8](https://github.com/googleapis/python-bigquery/commit/ebfd0a83d43bcb96f65f5669437220aa6138b766)) +* Support job reservation ([#2186](https://github.com/googleapis/python-bigquery/issues/2186)) ([cb646ce](https://github.com/googleapis/python-bigquery/commit/cb646ceea172bf199f366ae0592546dff2d3bcb2)) + + +### Bug Fixes + +* Ensure AccessEntry equality and repr uses the correct `entity_type` 
([#2182](https://github.com/googleapis/python-bigquery/issues/2182)) ([0217637](https://github.com/googleapis/python-bigquery/commit/02176377d5e2fc25b5cd4f46aa6ebfb1b6a960a6)) +* Ensure SchemaField.field_dtype returns a string ([#2188](https://github.com/googleapis/python-bigquery/issues/2188)) ([7ec2848](https://github.com/googleapis/python-bigquery/commit/7ec2848379d5743bbcb36700a1153540c451e0e0)) + ## [3.32.0](https://github.com/googleapis/python-bigquery/compare/v3.31.0...v3.32.0) (2025-05-12) diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index fe13d2477..8304ac025 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "3.32.0" +__version__ = "3.33.0"
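
Taken together, the 3.33.0 patches above expose two user-facing knobs: a job-level `reservation` setting on job configurations and an `autodetect_schema` flag on `Client.update_table`. Below is a minimal usage sketch, not part of the patch series itself; the project, dataset, table, source URIs, and reservation name are placeholders for illustration only.

```python
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project

# Route a query to a specific reservation (feat #2186). The expected format is
# projects/{project}/locations/{location}/reservations/{reservation}.
job_config = bigquery.QueryJobConfig()
job_config.reservation = (
    "projects/my-project/locations/us/reservations/my-reservation"
)
query_job = client.query("SELECT 1", job_config=job_config)
query_job.result()
# reservation_id reports the primary reservation actually assigned to the job,
# which can differ from the requested one if parent reservations were used.
print(query_job.reservation_id)

# Refresh an external table's schema on update (feat #2171). Passing
# autodetect_schema=True asks BigQuery to re-infer the schema from the
# underlying source files when applying the PATCH.
external_config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.AVRO)
external_config.source_uris = ["gs://my-bucket/data/*.avro"]  # placeholder URIs
external_config.autodetect = True

table = bigquery.Table("my-project.my_dataset.my_external_table")
table.external_data_configuration = external_config

updated = client.update_table(
    table, ["external_data_configuration"], autodetect_schema=True
)
print([field.name for field in updated.schema])
```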