From c993c5845f8a2ca001ba18d4af988edd3e53e8bf Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 6 Oct 2021 07:43:32 -0700 Subject: [PATCH] fix(BigQuery): explicitly quote columns in select_star (#16822) * fix (BigQuery): explicitly quote columns in select_star * Fix test * Fix SELECT * in BQ * Add unit tests * Remove type changes --- superset/db_engine_specs/bigquery.py | 120 +++++++++-- superset/result_set.py | 4 +- .../db_engine_specs/test_bigquery.py | 190 ++++++++++++++++++ 3 files changed, 296 insertions(+), 18 deletions(-) create mode 100644 tests/unit_tests/db_engine_specs/test_bigquery.py diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py index 1b7b73e543..a862df58d9 100644 --- a/superset/db_engine_specs/bigquery.py +++ b/superset/db_engine_specs/bigquery.py @@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin from flask_babel import gettext as __ from marshmallow import fields, Schema from marshmallow.exceptions import ValidationError -from sqlalchemy import literal_column +from sqlalchemy import column +from sqlalchemy.engine.base import Engine from sqlalchemy.engine.url import make_url -from sqlalchemy.sql.expression import ColumnClause +from sqlalchemy.sql import sqltypes from typing_extensions import TypedDict from superset.databases.schemas import encrypted_field_properties, EncryptedString @@ -282,20 +283,6 @@ class BigQueryEngineSpec(BaseEngineSpec): "clustering": {"cols": cluster_columns}, } - @classmethod - def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]: - """ - BigQuery dialect requires us to not use backtick in the fieldname which are - nested. - Using literal_column handles that issue. - https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column - Also explicility specifying column names so we don't encounter duplicate - column names in the result. - """ - return [ - literal_column(c["name"]).label(c["name"].replace(".", "__")) for c in cols - ] - @classmethod def epoch_to_dttm(cls) -> str: return "TIMESTAMP_SECONDS({col})" @@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec): ma_plugin.converter.add_attribute_function(encrypted_field_properties) spec.components.schema(cls.__name__, schema=cls.parameters_schema) return spec.to_dict()["components"]["schemas"][cls.__name__] + + @classmethod + def select_star( # pylint: disable=too-many-arguments + cls, + database: "Database", + table_name: str, + engine: Engine, + schema: Optional[str] = None, + limit: int = 100, + show_cols: bool = False, + indent: bool = True, + latest_partition: bool = True, + cols: Optional[List[Dict[str, Any]]] = None, + ) -> str: + """ + Remove array structures from `SELECT *`. + + BigQuery supports structures and arrays of structures, eg: + + author STRUCT + trailer ARRAY> + + When loading metadata for a table each key in the struct is displayed as a + separate pseudo-column, eg: + + - author + - author.name + - author.email + - trailer + - trailer.key + - trailer.value + + When generating the `SELECT *` statement we want to remove any keys from + structs inside an array, since selecting them results in an error. The correct + select statement should look like this: + + SELECT + `author`, + `author`.`name`, + `author`.`email`, + `trailer` + FROM + table + + Selecting `trailer.key` or `trailer.value` results in an error, as opposed to + selecting `author.name`, since they are keys in a structure inside an array. + + This method removes any array pseudo-columns. + """ + if cols: + # For arrays of structs, remove the child columns, otherwise the query + # will fail. + array_prefixes = { + col["name"] for col in cols if isinstance(col["type"], sqltypes.ARRAY) + } + cols = [ + col + for col in cols + if "." not in col["name"] + or col["name"].split(".")[0] not in array_prefixes + ] + + return super().select_star( + database, + table_name, + engine, + schema, + limit, + show_cols, + indent, + latest_partition, + cols, + ) + + @classmethod + def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]: + """ + Label columns using their fully qualified name. + + BigQuery supports columns of type `struct`, which are basically dictionaries. + When loading metadata for a table with struct columns, each key in the struct + is displayed as a separate pseudo-column, eg: + + author STRUCT + + Will be shown as 3 columns: + + - author + - author.name + - author.email + + If we select those fields: + + SELECT `author`, `author`.`name`, `author`.`email` FROM table + + The resulting columns will be called "author", "name", and "email", This may + result in a clash with other columns. To prevent that, we explicitly label + the columns using their fully qualified name, so we end up with "author", + "author__name" and "author__email", respectively. + """ + return [column(c["name"]).label(c["name"].replace(".", "__")) for c in cols] diff --git a/superset/result_set.py b/superset/result_set.py index 1f925138d2..b95b5e680d 100644 --- a/superset/result_set.py +++ b/superset/result_set.py @@ -25,7 +25,7 @@ import numpy as np import pandas as pd import pyarrow as pa -from superset import db_engine_specs +from superset.db_engine_specs import BaseEngineSpec from superset.typing import DbapiDescription, DbapiResult from superset.utils import core as utils @@ -76,7 +76,7 @@ class SupersetResultSet: self, data: DbapiResult, cursor_description: DbapiDescription, - db_engine_spec: Type[db_engine_specs.BaseEngineSpec], + db_engine_spec: Type[BaseEngineSpec], ): self.db_engine_spec = db_engine_spec data = data or [] diff --git a/tests/unit_tests/db_engine_specs/test_bigquery.py b/tests/unit_tests/db_engine_specs/test_bigquery.py new file mode 100644 index 0000000000..dcf1de718f --- /dev/null +++ b/tests/unit_tests/db_engine_specs/test_bigquery.py @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument, import-outside-toplevel, protected-access + +from flask.ctx import AppContext +from pytest_mock import MockFixture +from sqlalchemy import select +from sqlalchemy.dialects.sqlite.base import SQLiteDialect +from sqlalchemy.sql import sqltypes + + +def test_get_fields(app_context: AppContext) -> None: + """ + Test the custom ``_get_fields`` method. + + The method adds custom labels (aliases) to the columns to prevent + collision when referencing record fields. Eg, if we had these two + columns: + + name STRING + project STRUCT + + One could write this query: + + SELECT + `name`, + `project`.`name` + FROM + the_table + + But then both columns would get aliased as "name". + + The custom method will replace the fields so that the final query + looks like this: + + SELECT + `name` AS `name`, + `project`.`name` AS project__name + FROM + the_table + + """ + from superset.db_engine_specs.bigquery import BigQueryEngineSpec + + columns = [{"name": "limit"}, {"name": "name"}, {"name": "project.name"}] + fields = BigQueryEngineSpec._get_fields(columns) + + # generic SQL + query = select(fields) + assert ( + str(query) + == 'SELECT "limit" AS "limit", name AS name, "project.name" AS project__name' + ) + + # BigQuery-specific SQL + try: + from pybigquery.sqlalchemy_bigquery import BigQueryDialect + except ModuleNotFoundError: + return + + assert str(query.compile(dialect=BigQueryDialect())) == ( + "SELECT `limit` AS `limit`, `name` AS `name`, " + "`project`.`name` AS `project__name`" + ) + + +def test_select_star(mocker: MockFixture, app_context: AppContext) -> None: + """ + Test the ``select_star`` method. + + The method removes pseudo-columns from structures inside arrays. While these + pseudo-columns show up as "columns" for metadata reasons, we can't select them + in the query, as opposed to fields from non-array structures. + """ + from superset.db_engine_specs.bigquery import BigQueryEngineSpec + + cols = [ + { + "name": "trailer", + "type": sqltypes.ARRAY(sqltypes.JSON()), + "nullable": True, + "comment": None, + "default": None, + "precision": None, + "scale": None, + "max_length": None, + }, + { + "name": "trailer.key", + "type": sqltypes.String(), + "nullable": True, + "comment": None, + "default": None, + "precision": None, + "scale": None, + "max_length": None, + }, + { + "name": "trailer.value", + "type": sqltypes.String(), + "nullable": True, + "comment": None, + "default": None, + "precision": None, + "scale": None, + "max_length": None, + }, + { + "name": "trailer.email", + "type": sqltypes.String(), + "nullable": True, + "comment": None, + "default": None, + "precision": None, + "scale": None, + "max_length": None, + }, + ] + + # mock the database so we can compile the query + database = mocker.MagicMock() + database.compile_sqla_query = lambda query: str( + query.compile(dialect=SQLiteDialect()) + ) + + # use SQLite dialect so we don't need the BQ dependency + engine = mocker.MagicMock() + engine.dialect = SQLiteDialect() + + sql = BigQueryEngineSpec.select_star( + database=database, + table_name="my_table", + engine=engine, + schema=None, + limit=100, + show_cols=True, + indent=True, + latest_partition=False, + cols=cols, + ) + assert ( + sql + == """SELECT trailer AS trailer +FROM my_table +LIMIT ? +OFFSET ?""" + ) + + # BigQuery-specific SQL + try: + from pybigquery.sqlalchemy_bigquery import BigQueryDialect + except ModuleNotFoundError: + return + + database.compile_sqla_query = lambda query: str( + query.compile(dialect=BigQueryDialect()) + ) + engine.dialect = BigQueryDialect() + + sql = BigQueryEngineSpec.select_star( + database=database, + table_name="my_table", + engine=engine, + schema=None, + limit=100, + show_cols=True, + indent=True, + latest_partition=False, + cols=cols, + ) + assert ( + sql + == """SELECT `trailer` AS `trailer` +FROM `my_table` +LIMIT :param_1""" + )