fix(BigQuery): explicitly quote columns in select_star (#16822)

* fix (BigQuery): explicitly quote columns in select_star

* Fix test

* Fix SELECT * in BQ

* Add unit tests

* Remove type changes
This commit is contained in:
Beto Dealmeida 2021-10-06 07:43:32 -07:00 committed by GitHub
parent 191033cb44
commit c993c5845f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 296 additions and 18 deletions

View File

@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin
from flask_babel import gettext as __
from marshmallow import fields, Schema
from marshmallow.exceptions import ValidationError
from sqlalchemy import literal_column
from sqlalchemy import column
from sqlalchemy.engine.base import Engine
from sqlalchemy.engine.url import make_url
from sqlalchemy.sql.expression import ColumnClause
from sqlalchemy.sql import sqltypes
from typing_extensions import TypedDict
from superset.databases.schemas import encrypted_field_properties, EncryptedString
@ -282,20 +283,6 @@ class BigQueryEngineSpec(BaseEngineSpec):
"clustering": {"cols": cluster_columns},
}
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
    """
    Build the list of projected fields for ``SELECT``.

    The BigQuery dialect requires us to not use backticks in field names
    that are nested (eg ``author.name``), so ``literal_column`` is used to
    emit each name verbatim instead of quoting it:
    https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column

    Columns are also explicitly labeled (dots replaced by ``__``) so we
    don't encounter duplicate column names in the result.
    """
    return [
        literal_column(c["name"]).label(c["name"].replace(".", "__")) for c in cols
    ]
@classmethod
def epoch_to_dttm(cls) -> str:
    """
    Return the SQL expression template converting epoch seconds to a timestamp.

    ``{col}`` is substituted with the actual column name by the caller.
    """
    return "TIMESTAMP_SECONDS({col})"
@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec):
ma_plugin.converter.add_attribute_function(encrypted_field_properties)
spec.components.schema(cls.__name__, schema=cls.parameters_schema)
return spec.to_dict()["components"]["schemas"][cls.__name__]
@classmethod
def select_star(  # pylint: disable=too-many-arguments
    cls,
    database: "Database",
    table_name: str,
    engine: Engine,
    schema: Optional[str] = None,
    limit: int = 100,
    show_cols: bool = False,
    indent: bool = True,
    latest_partition: bool = True,
    cols: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """
    Remove array structures from `SELECT *`.

    BigQuery supports structures and arrays of structures, eg:

        author STRUCT<name STRING, email STRING>
        trailer ARRAY<STRUCT<key STRING, value STRING>>

    When loading metadata for a table each key in the struct is displayed as a
    separate pseudo-column, eg:

        - author
        - author.name
        - author.email
        - trailer
        - trailer.key
        - trailer.value

    When generating the `SELECT *` statement we want to remove any keys from
    structs inside an array, since selecting them results in an error. The
    correct select statement should look like this:

        SELECT
          `author`,
          `author`.`name`,
          `author`.`email`,
          `trailer`
        FROM
          table

    Selecting `trailer.key` or `trailer.value` results in an error, as opposed
    to selecting `author.name`, since they are keys in a structure inside an
    array.

    This method removes any array pseudo-columns.
    """
    if cols:
        # Names of all columns typed as ARRAY; pseudo-columns nested under
        # any of these cannot be selected and must be dropped, otherwise
        # the query will fail.
        array_prefixes = {
            col["name"] for col in cols if isinstance(col["type"], sqltypes.ARRAY)
        }
        # Keep a column only when no dotted prefix of its name is an array.
        # Checking every prefix (not just the first segment) also covers
        # arrays nested inside structs, eg `author.emails.key` when
        # `author.emails` is an ARRAY.
        cols = [
            col
            for col in cols
            if not cls._is_array_pseudo_column(col["name"], array_prefixes)
        ]
    return super().select_star(
        database,
        table_name,
        engine,
        schema,
        limit,
        show_cols,
        indent,
        latest_partition,
        cols,
    )

@staticmethod
def _is_array_pseudo_column(name: str, array_prefixes: Any) -> bool:
    """
    Return True if ``name`` is a pseudo-column nested under an ARRAY column.

    ``array_prefixes`` is the set of column names whose type is ARRAY; we
    test every proper dotted prefix of ``name`` (eg "a.b.c" -> "a", "a.b")
    against it.
    """
    parts = name.split(".")
    return any(".".join(parts[:i]) in array_prefixes for i in range(1, len(parts)))
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
    """
    Label columns using their fully qualified name.

    BigQuery supports columns of type `struct`, which are basically
    dictionaries. When loading metadata for a table with struct columns,
    each key in the struct is displayed as a separate pseudo-column, eg:

        author STRUCT<name STRING, email STRING>

    Will be shown as 3 columns:

        - author
        - author.name
        - author.email

    If we select those fields:

        SELECT `author`, `author`.`name`, `author`.`email` FROM table

    The resulting columns will be called "author", "name", and "email". This
    may result in a clash with other columns. To prevent that, we explicitly
    label the columns using their fully qualified name, so we end up with
    "author", "author__name" and "author__email", respectively.
    """
    labeled_columns = []
    for col in cols:
        qualified_name = col["name"]
        # Replace the dots of nested fields so each label is unique.
        alias = qualified_name.replace(".", "__")
        labeled_columns.append(column(qualified_name).label(alias))
    return labeled_columns

View File

@ -25,7 +25,7 @@ import numpy as np
import pandas as pd
import pyarrow as pa
from superset import db_engine_specs
from superset.db_engine_specs import BaseEngineSpec
from superset.typing import DbapiDescription, DbapiResult
from superset.utils import core as utils
@ -76,7 +76,7 @@ class SupersetResultSet:
self,
data: DbapiResult,
cursor_description: DbapiDescription,
db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
db_engine_spec: Type[BaseEngineSpec],
):
self.db_engine_spec = db_engine_spec
data = data or []

View File

@ -0,0 +1,190 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=unused-argument, import-outside-toplevel, protected-access
from flask.ctx import AppContext
from pytest_mock import MockFixture
from sqlalchemy import select
from sqlalchemy.dialects.sqlite.base import SQLiteDialect
from sqlalchemy.sql import sqltypes
def test_get_fields(app_context: AppContext) -> None:
    """
    Test the custom ``_get_fields`` method.

    The method adds custom labels (aliases) to the columns to prevent
    collision when referencing record fields. Eg, if we had these two
    columns:

        name STRING
        project STRUCT<name STRING>

    One could write this query:

        SELECT
          `name`,
          `project`.`name`
        FROM
          the_table

    But then both columns would get aliased as "name".

    The custom method will replace the fields so that the final query
    looks like this:

        SELECT
          `name` AS `name`,
          `project`.`name` AS project__name
        FROM
          the_table
    """
    from superset.db_engine_specs.bigquery import BigQueryEngineSpec

    field_names = ["limit", "name", "project.name"]
    labeled_fields = BigQueryEngineSpec._get_fields(
        [{"name": field_name} for field_name in field_names]
    )
    statement = select(labeled_fields)

    # generic SQL: reserved words and dotted names are quoted as a whole
    expected_generic = (
        'SELECT "limit" AS "limit", name AS name, "project.name" AS project__name'
    )
    assert str(statement) == expected_generic

    # BigQuery-specific SQL; skip silently when the optional dialect is missing
    try:
        from pybigquery.sqlalchemy_bigquery import BigQueryDialect
    except ModuleNotFoundError:
        return

    expected_bigquery = (
        "SELECT `limit` AS `limit`, `name` AS `name`, "
        "`project`.`name` AS `project__name`"
    )
    assert str(statement.compile(dialect=BigQueryDialect())) == expected_bigquery
def test_select_star(mocker: MockFixture, app_context: AppContext) -> None:
    """
    Test the ``select_star`` method.

    The method removes pseudo-columns from structures inside arrays. While these
    pseudo-columns show up as "columns" for metadata reasons, we can't select them
    in the query, as opposed to fields from non-array structures.
    """
    from superset.db_engine_specs.bigquery import BigQueryEngineSpec

    def make_col(name, type_):
        # Every metadata key except name/type is identical across the test
        # columns, so build the dict in one place instead of repeating it.
        return {
            "name": name,
            "type": type_,
            "nullable": True,
            "comment": None,
            "default": None,
            "precision": None,
            "scale": None,
            "max_length": None,
        }

    cols = [
        make_col("trailer", sqltypes.ARRAY(sqltypes.JSON())),
        make_col("trailer.key", sqltypes.String()),
        make_col("trailer.value", sqltypes.String()),
        make_col("trailer.email", sqltypes.String()),
    ]

    # mock the database so we can compile the query
    database = mocker.MagicMock()
    database.compile_sqla_query = lambda query: str(
        query.compile(dialect=SQLiteDialect())
    )

    # use SQLite dialect so we don't need the BQ dependency
    engine = mocker.MagicMock()
    engine.dialect = SQLiteDialect()

    sql = BigQueryEngineSpec.select_star(
        database=database,
        table_name="my_table",
        engine=engine,
        schema=None,
        limit=100,
        show_cols=True,
        indent=True,
        latest_partition=False,
        cols=cols,
    )
    assert (
        sql
        == """SELECT trailer AS trailer
FROM my_table
LIMIT ?
OFFSET ?"""
    )

    # BigQuery-specific SQL; skip silently when the optional dialect is missing
    try:
        from pybigquery.sqlalchemy_bigquery import BigQueryDialect
    except ModuleNotFoundError:
        return

    database.compile_sqla_query = lambda query: str(
        query.compile(dialect=BigQueryDialect())
    )
    engine.dialect = BigQueryDialect()
    sql = BigQueryEngineSpec.select_star(
        database=database,
        table_name="my_table",
        engine=engine,
        schema=None,
        limit=100,
        show_cols=True,
        indent=True,
        latest_partition=False,
        cols=cols,
    )
    assert (
        sql
        == """SELECT `trailer` AS `trailer`
FROM `my_table`
LIMIT :param_1"""
    )