insert_rows_from_dataframe doesn't support nullable boolean or nullable Int64 dtypes #1815

@tswast

Description

Is your feature request related to a problem? Please describe.

I'm working on an alternative loading mechanism in BigQuery DataFrames based on the streaming API to avoid quota limits for load jobs.

Describe the solution you'd like

I'd like the following code to insert the DataFrame's rows without error:

import pandas
import google.cloud.bigquery

df = pandas.DataFrame(
  {
    "bool_col": pandas.Series([True, False, True, pandas.NA, False, False, True, True, False], dtype="boolean"),
  }
)

bqclient = google.cloud.bigquery.Client()
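# Assumes my_dataset.my_table already exists with a nullable BOOL column named bool_col.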
bqclient.insert_rows_from_dataframe(bqclient.get_table("my_dataset.my_table"), df)

This results in the following error:

In [11]: bqclient.insert_rows_from_dataframe(bqclient.get_table("my_dataset.my_table"), df)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [11], in <cell line: 1>()
----> 1 bqclient.insert_rows_from_dataframe(bqclient.get_table("my_dataset.my_table"), df)

File ~/src/github.com/googleapis/python-bigquery/google/cloud/bigquery/client.py:3662, in Client.insert_rows_from_dataframe(self, table, dataframe, selected_fields, chunk_size, **kwargs)
   3660 for _ in range(chunk_count):
   3661     rows_chunk = itertools.islice(rows_iter, chunk_size)
-> 3662     result = self.insert_rows(table, rows_chunk, selected_fields, **kwargs)
   3663     insert_results.append(result)
   3665 return insert_results

File ~/src/github.com/googleapis/python-bigquery/google/cloud/bigquery/client.py:3605, in Client.insert_rows(self, table, rows, selected_fields, **kwargs)
   3596     raise ValueError(
   3597         (
   3598             "Could not determine schema for table '{}'. Call client.get_table() "
   3599             "or pass in a list of schema fields to the selected_fields argument."
   3600         ).format(table)
   3601     )
   3603 json_rows = [_record_field_to_json(schema, row) for row in rows]
-> 3605 return self.insert_rows_json(table, json_rows, **kwargs)

File ~/src/github.com/googleapis/python-bigquery/google/cloud/bigquery/client.py:3801, in Client.insert_rows_json(self, table, json_rows, row_ids, skip_invalid_rows, ignore_unknown_values, template_suffix, retry, timeout)
   3799 # We can always retry, because every row has an insert ID.
   3800 span_attributes = {"path": path}
-> 3801 response = self._call_api(
   3802     retry,
   3803     span_name="BigQuery.insertRowsJson",
   3804     span_attributes=span_attributes,
   3805     method="POST",
   3806     path=path,
   3807     data=data,
   3808     timeout=timeout,
   3809 )
   3810 errors = []
   3812 for error in response.get("insertErrors", ()):

File ~/src/github.com/googleapis/python-bigquery/google/cloud/bigquery/client.py:827, in Client._call_api(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)
    823 if span_name is not None:
    824     with create_span(
    825         name=span_name, attributes=span_attributes, client=self, job_ref=job_ref
    826     ):
--> 827         return call()
    829 return call()

File /opt/miniconda3/envs/dev-3.10/lib/python3.10/site-packages/google/api_core/retry.py:349, in Retry.__call__.<locals>.retry_wrapped_func(*args, **kwargs)
    345 target = functools.partial(func, *args, **kwargs)
    346 sleep_generator = exponential_sleep_generator(
    347     self._initial, self._maximum, multiplier=self._multiplier
    348 )
--> 349 return retry_target(
    350     target,
    351     self._predicate,
    352     sleep_generator,
    353     self._timeout,
    354     on_error=on_error,
    355 )

File /opt/miniconda3/envs/dev-3.10/lib/python3.10/site-packages/google/api_core/retry.py:191, in retry_target(target, predicate, sleep_generator, timeout, on_error, **kwargs)
    189 for sleep in sleep_generator:
    190     try:
--> 191         return target()
    193     # pylint: disable=broad-except
    194     # This function explicitly must deal with broad exceptions.
    195     except Exception as exc:

File /opt/miniconda3/envs/dev-3.10/lib/python3.10/site-packages/google/cloud/_http/__init__.py:479, in JSONConnection.api_request(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)
    476 # Making the executive decision that any dictionary
    477 # data will be sent properly as JSON.
    478 if data and isinstance(data, dict):
--> 479     data = json.dumps(data)
    480     content_type = "application/json"
    482 response = self._make_request(
    483     method=method,
    484     url=url,
   (...)
    490     extra_api_info=extra_api_info,
    491 )

File /opt/miniconda3/envs/dev-3.10/lib/python3.10/json/__init__.py:231, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
    226 # cached encoder
    227 if (not skipkeys and ensure_ascii and
    228     check_circular and allow_nan and
    229     cls is None and indent is None and separators is None and
    230     default is None and not sort_keys and not kw):
--> 231     return _default_encoder.encode(obj)
    232 if cls is None:
    233     cls = JSONEncoder

File /opt/miniconda3/envs/dev-3.10/lib/python3.10/json/encoder.py:199, in JSONEncoder.encode(self, o)
    195         return encode_basestring(o)
    196 # This doesn't pass the iterator directly to ''.join() because the
    197 # exceptions aren't as detailed.  The list call should be roughly
    198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
    200 if not isinstance(chunks, (list, tuple)):
    201     chunks = list(chunks)

File /opt/miniconda3/envs/dev-3.10/lib/python3.10/json/encoder.py:257, in JSONEncoder.iterencode(self, o, _one_shot)
    252 else:
    253     _iterencode = _make_iterencode(
    254         markers, self.default, _encoder, self.indent, floatstr,
    255         self.key_separator, self.item_separator, self.sort_keys,
    256         self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)

File /opt/miniconda3/envs/dev-3.10/lib/python3.10/json/encoder.py:179, in JSONEncoder.default(self, o)
    160 def default(self, o):
    161     """Implement this method in a subclass such that it returns
    162     a serializable object for ``o``, or calls the base implementation
    163     (to raise a ``TypeError``).
   (...)
    177 
    178     """
--> 179     raise TypeError(f'Object of type {o.__class__.__name__} '
    180                     f'is not JSON serializable')

TypeError: Object of type bool_ is not JSON serializable
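
The error suggests that values from the nullable "boolean" column reach json.dumps as NumPy scalars (numpy.bool_), and the missing values as pandas.NA, neither of which the standard JSON encoder can serialize; the nullable Int64 dtype mentioned in the title presumably fails the same way with numpy.int64. Until the client handles these dtypes, one possible client-side workaround is to coerce every value to a plain Python type before inserting. A minimal sketch, assuming the same placeholder table as above (the to_json_safe helper is illustrative, not part of the library):

import pandas
import google.cloud.bigquery

df = pandas.DataFrame(
    {"bool_col": pandas.Series([True, pandas.NA, False], dtype="boolean")}
)


def to_json_safe(value):
    # Illustrative helper (not part of google-cloud-bigquery): map pandas.NA and
    # other missing values to None, and unwrap NumPy scalars (numpy.bool_,
    # numpy.int64, ...) into native Python values that json.dumps accepts.
    if pandas.isna(value):
        return None
    if hasattr(value, "item"):
        return value.item()
    return value


rows = [
    {column: to_json_safe(value) for column, value in zip(df.columns, row)}
    for row in df.itertuples(index=False, name=None)
]

bqclient = google.cloud.bigquery.Client()
table = bqclient.get_table("my_dataset.my_table")  # placeholder table with a BOOL column
errors = bqclient.insert_rows(table, rows)  # rows are plain dicts keyed by column name

This bypasses insert_rows_from_dataframe and builds the row dicts by hand, so it is only a stopgap until the conversion is handled inside the client.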

Describe alternatives you've considered

  • Load jobs, but these have quota limitations and I'd like to provide an alternative (a sketch of the load-job path follows this list).
  • Pulling even more into BigQuery DataFrames' third_party, but I'd like to contribute this fix here instead.
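
For reference, the load-job path from the first bullet would look roughly like the sketch below; load_table_from_dataframe serializes the frame through pyarrow, which as far as I know already copes with the nullable dtypes, but every call consumes load-job quota:

import pandas
import google.cloud.bigquery

df = pandas.DataFrame(
    {"bool_col": pandas.Series([True, pandas.NA, False], dtype="boolean")}
)

bqclient = google.cloud.bigquery.Client()

# Each call creates a load job against the placeholder table, which is exactly
# the quota-limited path this feature request is trying to avoid.
load_job = bqclient.load_table_from_dataframe(df, "my_dataset.my_table")
load_job.result()  # Wait for completion and surface any job errors.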

Labels

  • api: bigquery (Issues related to the googleapis/python-bigquery API.)
  • type: feature request ('Nice-to-have' improvement, new feature or different behavior or design.)
