fix(BigQuery): explicitly quote columns in select_star (#16822)

* fix (BigQuery): explicitly quote columns in select_star

* Fix test

* Fix SELECT * in BQ

* Add unit tests

* Remove type changes
This commit is contained in:
Beto Dealmeida 2021-10-06 07:43:32 -07:00 committed by GitHub
parent 191033cb44
commit c993c5845f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 296 additions and 18 deletions

View File

@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin
from flask_babel import gettext as __
from marshmallow import fields, Schema
from marshmallow.exceptions import ValidationError
from sqlalchemy import literal_column
from sqlalchemy import column
from sqlalchemy.engine.base import Engine
from sqlalchemy.engine.url import make_url
from sqlalchemy.sql.expression import ColumnClause
from sqlalchemy.sql import sqltypes
from typing_extensions import TypedDict
from superset.databases.schemas import encrypted_field_properties, EncryptedString
@ -282,20 +283,6 @@ class BigQueryEngineSpec(BaseEngineSpec):
"clustering": {"cols": cluster_columns},
}
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
    """
    Build the list of projected fields for ``SELECT``.

    The BigQuery dialect requires us to not use backticks in field names
    that are nested (eg ``author.name``), so ``literal_column`` is used to
    emit each name verbatim instead of quoting it:
    https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column

    Columns are also explicitly labeled (dots replaced by ``__``) so we
    don't encounter duplicate column names in the result.
    """
    return [
        literal_column(c["name"]).label(c["name"].replace(".", "__")) for c in cols
    ]
@classmethod
def epoch_to_dttm(cls) -> str:
    """
    Return the SQL expression template converting epoch seconds to a timestamp.

    ``{col}`` is substituted with the actual column name by the caller.
    """
    return "TIMESTAMP_SECONDS({col})"
@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec):
ma_plugin.converter.add_attribute_function(encrypted_field_properties)
spec.components.schema(cls.__name__, schema=cls.parameters_schema)
return spec.to_dict()["components"]["schemas"][cls.__name__]
@classmethod
def select_star(  # pylint: disable=too-many-arguments
    cls,
    database: "Database",
    table_name: str,
    engine: Engine,
    schema: Optional[str] = None,
    limit: int = 100,
    show_cols: bool = False,
    indent: bool = True,
    latest_partition: bool = True,
    cols: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """
    Remove array structures from `SELECT *`.

    BigQuery supports structures and arrays of structures, eg:

        author STRUCT<name STRING, email STRING>
        trailer ARRAY<STRUCT<key STRING, value STRING>>

    When loading metadata for a table each key in the struct is displayed as a
    separate pseudo-column, eg:

        - author
        - author.name
        - author.email
        - trailer
        - trailer.key
        - trailer.value

    When generating the `SELECT *` statement we want to remove any keys from
    structs inside an array, since selecting them results in an error. The
    correct select statement should look like this:

        SELECT
          `author`,
          `author`.`name`,
          `author`.`email`,
          `trailer`
        FROM
          table

    Selecting `trailer.key` or `trailer.value` results in an error, as opposed
    to selecting `author.name`, since they are keys in a structure inside an
    array.

    This method removes any array pseudo-columns.
    """
    if cols:
        # Names of all columns typed as ARRAY; pseudo-columns nested under
        # any of these cannot be selected and must be dropped, otherwise
        # the query will fail.
        array_prefixes = {
            col["name"] for col in cols if isinstance(col["type"], sqltypes.ARRAY)
        }
        # Keep a column only when no dotted prefix of its name is an array.
        # Checking every prefix (not just the first segment) also covers
        # arrays nested inside structs, eg `author.emails.key` when
        # `author.emails` is an ARRAY.
        cols = [
            col
            for col in cols
            if not cls._is_array_pseudo_column(col["name"], array_prefixes)
        ]
    return super().select_star(
        database,
        table_name,
        engine,
        schema,
        limit,
        show_cols,
        indent,
        latest_partition,
        cols,
    )

@staticmethod
def _is_array_pseudo_column(name: str, array_prefixes: Any) -> bool:
    """
    Return True if ``name`` is a pseudo-column nested under an ARRAY column.

    ``array_prefixes`` is the set of column names whose type is ARRAY; we
    test every proper dotted prefix of ``name`` (eg "a.b.c" -> "a", "a.b")
    against it.
    """
    parts = name.split(".")
    return any(".".join(parts[:i]) in array_prefixes for i in range(1, len(parts)))
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
    """
    Label columns using their fully qualified name.

    BigQuery supports columns of type `struct`, which are basically
    dictionaries. When loading metadata for a table with struct columns,
    each key in the struct is displayed as a separate pseudo-column, eg:

        author STRUCT<name STRING, email STRING>

    Will be shown as 3 columns:

        - author
        - author.name
        - author.email

    If we select those fields:

        SELECT `author`, `author`.`name`, `author`.`email` FROM table

    The resulting columns will be called "author", "name", and "email". This
    may result in a clash with other columns. To prevent that, we explicitly
    label the columns using their fully qualified name, so we end up with
    "author", "author__name" and "author__email", respectively.
    """
    labeled_columns = []
    for col in cols:
        qualified_name = col["name"]
        # Replace the dots of nested fields so each label is unique.
        alias = qualified_name.replace(".", "__")
        labeled_columns.append(column(qualified_name).label(alias))
    return labeled_columns

View File

@ -25,7 +25,7 @@ import numpy as np
import pandas as pd
import pyarrow as pa
from superset import db_engine_specs
from superset.db_engine_specs import BaseEngineSpec
from superset.typing import DbapiDescription, DbapiResult
from superset.utils import core as utils
@ -76,7 +76,7 @@ class SupersetResultSet:
self,
data: DbapiResult,
cursor_description: DbapiDescription,
db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
db_engine_spec: Type[BaseEngineSpec],
):
self.db_engine_spec = db_engine_spec
data = data or []

View File

@ -0,0 +1,190 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=unused-argument, import-outside-toplevel, protected-access
from flask.ctx import AppContext
from pytest_mock import MockFixture
from sqlalchemy import select
from sqlalchemy.dialects.sqlite.base import SQLiteDialect
from sqlalchemy.sql import sqltypes
def test_get_fields(app_context: AppContext) -> None:
    """
    Test the custom ``_get_fields`` method.

    The method adds custom labels (aliases) to the columns to prevent
    collision when referencing record fields. Eg, if we had these two
    columns:

        name STRING
        project STRUCT<name STRING>

    One could write this query:

        SELECT
          `name`,
          `project`.`name`
        FROM
          the_table

    But then both columns would get aliased as "name".

    The custom method will replace the fields so that the final query
    looks like this:

        SELECT
          `name` AS `name`,
          `project`.`name` AS project__name
        FROM
          the_table
    """
    from superset.db_engine_specs.bigquery import BigQueryEngineSpec

    field_names = ["limit", "name", "project.name"]
    labeled_fields = BigQueryEngineSpec._get_fields(
        [{"name": field_name} for field_name in field_names]
    )
    statement = select(labeled_fields)

    # generic SQL: reserved words and dotted names are quoted as a whole
    expected_generic = (
        'SELECT "limit" AS "limit", name AS name, "project.name" AS project__name'
    )
    assert str(statement) == expected_generic

    # BigQuery-specific SQL; skip silently when the optional dialect is missing
    try:
        from pybigquery.sqlalchemy_bigquery import BigQueryDialect
    except ModuleNotFoundError:
        return

    expected_bigquery = (
        "SELECT `limit` AS `limit`, `name` AS `name`, "
        "`project`.`name` AS `project__name`"
    )
    assert str(statement.compile(dialect=BigQueryDialect())) == expected_bigquery
def test_select_star(mocker: MockFixture, app_context: AppContext) -> None:
    """
    Test the ``select_star`` method.

    The method removes pseudo-columns from structures inside arrays. While these
    pseudo-columns show up as "columns" for metadata reasons, we can't select them
    in the query, as opposed to fields from non-array structures.
    """
    from superset.db_engine_specs.bigquery import BigQueryEngineSpec

    def make_col(name, type_):
        # Every metadata key except name/type is identical across the test
        # columns, so build the dict in one place instead of repeating it.
        return {
            "name": name,
            "type": type_,
            "nullable": True,
            "comment": None,
            "default": None,
            "precision": None,
            "scale": None,
            "max_length": None,
        }

    cols = [
        make_col("trailer", sqltypes.ARRAY(sqltypes.JSON())),
        make_col("trailer.key", sqltypes.String()),
        make_col("trailer.value", sqltypes.String()),
        make_col("trailer.email", sqltypes.String()),
    ]

    # mock the database so we can compile the query
    database = mocker.MagicMock()
    database.compile_sqla_query = lambda query: str(
        query.compile(dialect=SQLiteDialect())
    )

    # use SQLite dialect so we don't need the BQ dependency
    engine = mocker.MagicMock()
    engine.dialect = SQLiteDialect()

    sql = BigQueryEngineSpec.select_star(
        database=database,
        table_name="my_table",
        engine=engine,
        schema=None,
        limit=100,
        show_cols=True,
        indent=True,
        latest_partition=False,
        cols=cols,
    )
    assert (
        sql
        == """SELECT trailer AS trailer
FROM my_table
LIMIT ?
OFFSET ?"""
    )

    # BigQuery-specific SQL; skip silently when the optional dialect is missing
    try:
        from pybigquery.sqlalchemy_bigquery import BigQueryDialect
    except ModuleNotFoundError:
        return

    database.compile_sqla_query = lambda query: str(
        query.compile(dialect=BigQueryDialect())
    )
    engine.dialect = BigQueryDialect()
    sql = BigQueryEngineSpec.select_star(
        database=database,
        table_name="my_table",
        engine=engine,
        schema=None,
        limit=100,
        show_cols=True,
        indent=True,
        latest_partition=False,
        cols=cols,
    )
    assert (
        sql
        == """SELECT `trailer` AS `trailer`
FROM `my_table`
LIMIT :param_1"""
    )