feat: run BQ queries as single statement (#11904)

* feat: run BQ queries as single statement

* Update deps

* Fix lint

* Update superset/sql_lab.py

Co-authored-by: Ville Brofeldt <33317356+villebro@users.noreply.github.com>

Beto Dealmeida, 2020-12-03 13:20:23 -08:00, committed by GitHub
commit 54bf70733f (parent 04f993e222)
4 changed files with 22 additions and 7 deletions

setup.py

@@ -106,7 +106,11 @@ setup(
     ],
     extras_require={
         "athena": ["pyathena>=1.10.8,<1.11"],
-        "bigquery": ["pandas_gbq>=0.10.0", "pybigquery>=0.4.10"],
+        "bigquery": [
+            "pandas_gbq>=0.10.0",
+            "pybigquery>=0.4.10",
+            "google-cloud-bigquery>=2.4.0",
+        ],
         "clickhouse": ["clickhouse-sqlalchemy>= 0.1.4, <0.2"],
         "cockroachdb": ["cockroachdb>=0.3.5, <0.4"],
         "cors": ["flask-cors>=2.0.0"],

superset/db_engine_specs/base.py

@@ -156,6 +156,7 @@ class BaseEngineSpec:  # pylint: disable=too-many-public-methods
     arraysize = 0
     max_column_name_length = 0
     try_remove_schema_from_table_name = True  # pylint: disable=invalid-name
+    run_multiple_statements_as_one = False

     # default matching patterns for identifying column types
     db_column_types: Dict[utils.DbColumnType, Tuple[Pattern[Any], ...]] = {
@@ -454,7 +455,7 @@ class BaseEngineSpec:  # pylint: disable=too-many-public-methods

     @staticmethod
     def csv_to_df(**kwargs: Any) -> pd.DataFrame:
-        """ Read csv into Pandas DataFrame
+        """Read csv into Pandas DataFrame
         :param kwargs: params to be passed to DataFrame.read_csv
         :return: Pandas DataFrame containing data from csv
         """
@@ -466,7 +467,7 @@ class BaseEngineSpec:  # pylint: disable=too-many-public-methods

     @classmethod
     def df_to_sql(cls, df: pd.DataFrame, **kwargs: Any) -> None:
-        """ Upload data from a Pandas DataFrame to a database. For
+        """Upload data from a Pandas DataFrame to a database. For
         regular engines this calls the DataFrame.to_sql() method. Can be
         overridden for engines that don't work well with to_sql(), e.g.
         BigQuery.
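The new flag defaults to False, so every existing engine spec keeps the statement-splitting behavior and each engine opts in explicitly. A minimal sketch of such an opt-in (the engine below is hypothetical, not part of this commit):

from superset.db_engine_specs.base import BaseEngineSpec


class MyScriptingEngineSpec(BaseEngineSpec):
    """Hypothetical spec for an engine whose session state only survives
    when the whole script arrives in one execute() call."""

    engine = "myscripting"  # hypothetical driver name
    engine_name = "My Scripting Engine"  # hypothetical display name

    # Opt in: SQL Lab will send the full, unsplit query text to a single
    # cursor.execute() instead of one call per statement.
    run_multiple_statements_as_one = True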

superset/db_engine_specs/bigquery.py

@@ -39,6 +39,10 @@ class BigQueryEngineSpec(BaseEngineSpec):
     engine_name = "Google BigQuery"
     max_column_name_length = 128

+    # BigQuery doesn't maintain context when running multiple statements in the
+    # same cursor, so we need to run all statements at once
+    run_multiple_statements_as_one = True
+
     """
     https://www.python.org/dev/peps/pep-0249/#arraysize
     raw_connections bypass the pybigquery query execution context and deal with
superset/sql_lab.py

@@ -296,7 +296,7 @@ def _serialize_and_expand_data(
     return (data, selected_columns, all_columns, expanded_columns)


-def execute_sql_statements(  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
+def execute_sql_statements(  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements, too-many-branches
     query_id: int,
     rendered_query: str,
     return_results: bool,
@@ -322,9 +322,15 @@ def execute_sql_statements(  # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
         raise SqlLabException("Results backend isn't configured.")

     # Breaking down into multiple statements
-    parsed_query = ParsedQuery(rendered_query)
-    statements = parsed_query.get_statements()
-    logger.info("Query %s: Executing %i statement(s)", str(query_id), len(statements))
+    if not db_engine_spec.run_multiple_statements_as_one:
+        parsed_query = ParsedQuery(rendered_query)
+        statements = parsed_query.get_statements()
+        logger.info(
+            "Query %s: Executing %i statement(s)", str(query_id), len(statements)
+        )
+    else:
+        statements = [rendered_query]
+        logger.info("Query %s: Executing query as a single statement", str(query_id))

     logger.info("Query %s: Set query to 'running'", str(query_id))
     query.status = QueryStatus.RUNNING
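
A standalone sketch of the dispatch above, substituting sqlparse.split() for ParsedQuery.get_statements() (ParsedQuery is built on sqlparse); the flag value mirrors what BigQueryEngineSpec now declares:

import sqlparse

SCRIPT = "DECLARE x INT64 DEFAULT 1; SET x = 2; SELECT x;"
run_multiple_statements_as_one = True  # as set by BigQueryEngineSpec

if not run_multiple_statements_as_one:
    # One execute() per statement: three statements here.
    statements = [s for s in sqlparse.split(SCRIPT) if s.strip()]
else:
    # The whole script goes to the database in a single execute().
    statements = [SCRIPT]

print(len(statements))  # 1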