mirror of https://github.com/apache/superset.git
fix(BigQuery): explicitly quote columns in select_star (#16822)
* fix(BigQuery): explicitly quote columns in select_star
* Fix test
* Fix SELECT * in BQ
* Add unit tests
* Remove type changes
This commit is contained in:
parent
191033cb44
commit
c993c5845f
|
@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin
|
|||
from flask_babel import gettext as __
|
||||
from marshmallow import fields, Schema
|
||||
from marshmallow.exceptions import ValidationError
|
||||
from sqlalchemy import literal_column
|
||||
from sqlalchemy import column
|
||||
from sqlalchemy.engine.base import Engine
|
||||
from sqlalchemy.engine.url import make_url
|
||||
from sqlalchemy.sql.expression import ColumnClause
|
||||
from sqlalchemy.sql import sqltypes
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from superset.databases.schemas import encrypted_field_properties, EncryptedString
|
||||
|
@ -282,20 +283,6 @@ class BigQueryEngineSpec(BaseEngineSpec):
|
|||
"clustering": {"cols": cluster_columns},
|
||||
}
|
||||
|
||||
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
    """
    Build the SELECT expressions for the given column metadata.

    The BigQuery dialect rejects backticks around nested field names,
    so each column is wrapped with ``literal_column`` rather than a
    quoting construct:
    https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column

    Every expression is also explicitly labeled with its fully
    qualified name (dots replaced by double underscores), so that
    fields from different structs cannot collide into duplicate
    column names in the result set.
    """
    fields = []
    for col in cols:
        name = col["name"]
        # e.g. "author.name" is selected but aliased as "author__name"
        fields.append(literal_column(name).label(name.replace(".", "__")))
    return fields
|
||||
|
||||
@classmethod
|
||||
def epoch_to_dttm(cls) -> str:
|
||||
return "TIMESTAMP_SECONDS({col})"
|
||||
|
@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec):
|
|||
ma_plugin.converter.add_attribute_function(encrypted_field_properties)
|
||||
spec.components.schema(cls.__name__, schema=cls.parameters_schema)
|
||||
return spec.to_dict()["components"]["schemas"][cls.__name__]
|
||||
|
||||
@classmethod
def select_star(  # pylint: disable=too-many-arguments
    cls,
    database: "Database",
    table_name: str,
    engine: Engine,
    schema: Optional[str] = None,
    limit: int = 100,
    show_cols: bool = False,
    indent: bool = True,
    latest_partition: bool = True,
    cols: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """
    Build a ``SELECT *`` statement, dropping array pseudo-columns.

    BigQuery supports structs and arrays of structs, eg:

        author STRUCT<name STRING, email STRING>
        trailer ARRAY<STRUCT<key STRING, value STRING>>

    When table metadata is loaded, every key inside a struct shows up
    as its own pseudo-column ("author", "author.name", "author.email",
    "trailer", "trailer.key", "trailer.value"). Keys that live inside
    an *array* of structs cannot be selected directly — doing so is an
    error — while keys of a plain struct can. The correct statement
    therefore looks like:

        SELECT
          `author`,
          `author`.`name`,
          `author`.`email`,
          `trailer`
        FROM
          table

    This method filters out the pseudo-columns derived from arrays
    before delegating to the base implementation.
    """
    if cols:
        # Names of every column whose type is an ARRAY; nested keys
        # under these prefixes would make the query fail.
        array_prefixes = {
            col["name"]
            for col in cols
            if isinstance(col["type"], sqltypes.ARRAY)
        }

        def keep(col: Dict[str, Any]) -> bool:
            # Keep top-level columns and keys of non-array structs.
            prefix, separator, _ = col["name"].partition(".")
            return not separator or prefix not in array_prefixes

        cols = [col for col in cols if keep(col)]

    return super().select_star(
        database,
        table_name,
        engine,
        schema,
        limit,
        show_cols,
        indent,
        latest_partition,
        cols,
    )
|
||||
|
||||
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
    """
    Label every selected column with its fully qualified name.

    BigQuery structs are exposed in the table metadata as one
    pseudo-column per key, eg:

        author STRUCT<name STRING, email STRING>

    shows up as three columns: "author", "author.name" and
    "author.email". Selecting them naively:

        SELECT `author`, `author`.`name`, `author`.`email` FROM table

    would yield result columns named "author", "name" and "email",
    which may clash with other columns. To avoid that, each column is
    explicitly labeled with its fully qualified name — "author",
    "author__name" and "author__email", respectively.
    """
    labeled = []
    for col in cols:
        name = col["name"]
        labeled.append(column(name).label(name.replace(".", "__")))
    return labeled
|
||||
|
|
|
@ -25,7 +25,7 @@ import numpy as np
|
|||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
|
||||
from superset import db_engine_specs
|
||||
from superset.db_engine_specs import BaseEngineSpec
|
||||
from superset.typing import DbapiDescription, DbapiResult
|
||||
from superset.utils import core as utils
|
||||
|
||||
|
@ -76,7 +76,7 @@ class SupersetResultSet:
|
|||
self,
|
||||
data: DbapiResult,
|
||||
cursor_description: DbapiDescription,
|
||||
db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
|
||||
db_engine_spec: Type[BaseEngineSpec],
|
||||
):
|
||||
self.db_engine_spec = db_engine_spec
|
||||
data = data or []
|
||||
|
|
|
@ -0,0 +1,190 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# pylint: disable=unused-argument, import-outside-toplevel, protected-access
|
||||
|
||||
from flask.ctx import AppContext
|
||||
from pytest_mock import MockFixture
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.dialects.sqlite.base import SQLiteDialect
|
||||
from sqlalchemy.sql import sqltypes
|
||||
|
||||
|
||||
def test_get_fields(app_context: AppContext) -> None:
    """
    Test the custom ``_get_fields`` method.

    The method attaches explicit labels (aliases) to every column so
    that record fields cannot collide. Given these two columns:

        name STRING
        project STRUCT<name STRING>

    the naive query:

        SELECT
          `name`,
          `project`.`name`
        FROM
          the_table

    would alias both result columns as "name". With the custom method
    the final query becomes:

        SELECT
          `name` AS `name`,
          `project`.`name` AS project__name
        FROM
          the_table

    """
    from superset.db_engine_specs.bigquery import BigQueryEngineSpec

    cols = [{"name": "limit"}, {"name": "name"}, {"name": "project.name"}]
    fields = BigQueryEngineSpec._get_fields(cols)

    # generic SQL
    query = select(fields)
    expected = (
        'SELECT "limit" AS "limit", name AS name, "project.name" AS project__name'
    )
    assert str(query) == expected

    # BigQuery-specific SQL; skipped when the optional dialect is absent
    try:
        from pybigquery.sqlalchemy_bigquery import BigQueryDialect
    except ModuleNotFoundError:
        return

    compiled = str(query.compile(dialect=BigQueryDialect()))
    assert compiled == (
        "SELECT `limit` AS `limit`, `name` AS `name`, "
        "`project`.`name` AS `project__name`"
    )
|
||||
|
||||
|
||||
def test_select_star(mocker: MockFixture, app_context: AppContext) -> None:
    """
    Test the ``select_star`` method.

    The method drops pseudo-columns that belong to structures nested
    inside arrays: they show up as "columns" in the table metadata but
    cannot actually be selected, unlike fields of non-array structures.
    """
    from superset.db_engine_specs.bigquery import BigQueryEngineSpec

    def make_col(name, type_):
        # Column metadata in the shape produced by the inspector.
        return {
            "name": name,
            "type": type_,
            "nullable": True,
            "comment": None,
            "default": None,
            "precision": None,
            "scale": None,
            "max_length": None,
        }

    cols = [
        make_col("trailer", sqltypes.ARRAY(sqltypes.JSON())),
        make_col("trailer.key", sqltypes.String()),
        make_col("trailer.value", sqltypes.String()),
        make_col("trailer.email", sqltypes.String()),
    ]

    # mock the database so we can compile the query
    database = mocker.MagicMock()
    database.compile_sqla_query = lambda query: str(
        query.compile(dialect=SQLiteDialect())
    )

    # use SQLite dialect so we don't need the BQ dependency
    engine = mocker.MagicMock()
    engine.dialect = SQLiteDialect()

    def run_select_star() -> str:
        return BigQueryEngineSpec.select_star(
            database=database,
            table_name="my_table",
            engine=engine,
            schema=None,
            limit=100,
            show_cols=True,
            indent=True,
            latest_partition=False,
            cols=cols,
        )

    sql = run_select_star()
    assert (
        sql
        == """SELECT trailer AS trailer
FROM my_table
LIMIT ?
OFFSET ?"""
    )

    # BigQuery-specific SQL; skipped when the optional dialect is absent
    try:
        from pybigquery.sqlalchemy_bigquery import BigQueryDialect
    except ModuleNotFoundError:
        return

    database.compile_sqla_query = lambda query: str(
        query.compile(dialect=BigQueryDialect())
    )
    engine.dialect = BigQueryDialect()

    sql = run_select_star()
    assert (
        sql
        == """SELECT `trailer` AS `trailer`
FROM `my_table`
LIMIT :param_1"""
    )
|
Loading…
Reference in New Issue