superset/tests/db_engine_specs_test.py
Maxime Beauchemin b839608c32
[sql lab] a better approach at limiting queries (#4947)
* [sql lab] a better approach at limiting queries

Currently there are two mechanisms that we use to enforce the row
limiting constraints, depending on the database engine:
1. use dbapi's `cursor.fetchmany()`
2. wrap the SQL into a limiting subquery

Method 1 isn't great as it can result in the database server storing
larger than required result sets in memory expecting another fetch
command while we know we don't need that.

Method 2 has a positive side of working with all database engines,
whether they use LIMIT, ROWNUM, TOP or whatever else since sqlalchemy
does the work as specified for the dialect. On the downside though
the query optimizer might not be able to optimize this as much as an
approach that doesn't use a subquery.

Since most modern DBs use the LIMIT syntax, this adds a regex approach
that modifies the query to force a LIMIT clause without using a subquery
for the databases that support this syntax, and uses method 2 for all
others.

* Fixing build

* Fix lint

* Added more tests

* Fix tests
2018-05-14 14:44:05 -05:00

183 lines
7.1 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import textwrap
from superset.db_engine_specs import (
HiveEngineSpec, MssqlEngineSpec, MySQLEngineSpec)
from superset.models.core import Database
from .base_tests import SupersetTestCase
class DbEngineSpecsTestCase(SupersetTestCase):
    """Tests for Hive progress parsing and SQL row-limit rewriting.

    The ``*_progress`` tests pass lists of Hive log lines to
    ``HiveEngineSpec.progress`` and compare the returned value with an
    expected number. The remaining tests pass SQL text through
    ``apply_limit_to_sql`` on ``MssqlEngineSpec`` or ``MySQLEngineSpec``
    and compare the rewritten statement with an expected string.

    NOTE: the content of every multi-line string literal (including the
    line holding the closing quotes) is kept flush-left on purpose, so
    the exact characters handed to the code under test stay unchanged.
    """

    def test_0_progress(self):
        """Log with only PerfLogger lines reports a progress of 0."""
        log = """
17/02/07 18:26:27 INFO log.PerfLogger: <PERFLOG method=compile from=org.apache.hadoop.hive.ql.Driver>
17/02/07 18:26:27 INFO log.PerfLogger: <PERFLOG method=parse from=org.apache.hadoop.hive.ql.Driver>
""".split('\n')  # noqa ignore: E501
        self.assertEqual(0, HiveEngineSpec.progress(log))

    def test_number_of_jobs_progress(self):
        """Log that only announces the total job count reports 0."""
        log = """
17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
""".split('\n')
        self.assertEqual(0, HiveEngineSpec.progress(log))

    def test_job_1_launched_progress(self):
        """Job 1 of 2 launched, no stage lines yet: reports 0."""
        log = """
17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
""".split('\n')
        self.assertEqual(0, HiveEngineSpec.progress(log))

    def test_job_1_launched_stage_1_0_progress(self):
        """Job 1 of 2 with Stage-1 at map 0% / reduce 0%: reports 0."""
        log = """
17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 0%, reduce = 0%
""".split('\n')  # noqa ignore: E501
        self.assertEqual(0, HiveEngineSpec.progress(log))

    def test_job_1_launched_stage_1_map_40_progress(self):
        """Job 1 of 2 with Stage-1 at map 40% / reduce 0%: reports 10."""
        log = """
17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 0%, reduce = 0%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 40%, reduce = 0%
""".split('\n')  # noqa ignore: E501
        self.assertEqual(10, HiveEngineSpec.progress(log))

    def test_job_1_launched_stage_1_map_80_reduce_40_progress(self):
        """Job 1 of 2 with Stage-1 at map 80% / reduce 40%: reports 30."""
        log = """
17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 0%, reduce = 0%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 40%, reduce = 0%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 80%, reduce = 40%
""".split('\n')  # noqa ignore: E501
        self.assertEqual(30, HiveEngineSpec.progress(log))

    def test_job_1_launched_stage_2_stages_progress(self):
        """Job 1 of 2 with both Stage-1 and Stage-2 lines: reports 12.

        The last Stage-1 line is map 100% / reduce 0% and the only
        Stage-2 line is map 0% / reduce 0%.
        """
        log = """
17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 0%, reduce = 0%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 40%, reduce = 0%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 80%, reduce = 40%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-2 map = 0%, reduce = 0%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 100%, reduce = 0%
""".split('\n')  # noqa ignore: E501
        self.assertEqual(12, HiveEngineSpec.progress(log))

    def test_job_2_launched_stage_2_stages_progress(self):
        """Job 2 of 2 launched, its Stage-1 at map 40%: reports 60."""
        log = """
17/02/07 19:15:55 INFO ql.Driver: Total jobs = 2
17/02/07 19:15:55 INFO ql.Driver: Launching Job 1 out of 2
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 100%, reduce = 0%
17/02/07 19:15:55 INFO ql.Driver: Launching Job 2 out of 2
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 0%, reduce = 0%
17/02/07 19:16:09 INFO exec.Task: 2017-02-07 19:16:09,173 Stage-1 map = 40%, reduce = 0%
""".split('\n')  # noqa ignore: E501
        self.assertEqual(60, HiveEngineSpec.progress(log))

    def get_generic_database(self):
        """Return a ``Database`` built with the ``mysql://localhost`` URI."""
        return Database(sqlalchemy_uri='mysql://localhost')

    def sql_limit_regex(
            self, sql, expected_sql,
            engine_spec_class=MySQLEngineSpec,
            limit=1000):
        """Assert that limiting ``sql`` produces ``expected_sql``.

        :param sql: statement handed to ``apply_limit_to_sql``
        :param expected_sql: exact statement expected back
        :param engine_spec_class: engine spec whose
            ``apply_limit_to_sql`` is called (``MySQLEngineSpec`` by
            default)
        :param limit: row limit passed through (1000 by default)
        """
        main = self.get_generic_database()
        limited = engine_spec_class.apply_limit_to_sql(sql, limit, main)
        self.assertEqual(expected_sql, limited)

    def test_wrapped_query(self):
        """MssqlEngineSpec wraps the query in an ``inner_qry`` subquery."""
        self.sql_limit_regex(
            'SELECT * FROM a',
            'SELECT * \nFROM (SELECT * FROM a) AS inner_qry \n LIMIT 1000',
            MssqlEngineSpec,
        )

    def test_wrapped_semi(self):
        """A trailing semicolon does not appear in the wrapped query."""
        self.sql_limit_regex(
            'SELECT * FROM a;',
            'SELECT * \nFROM (SELECT * FROM a) AS inner_qry \n LIMIT 1000',
            MssqlEngineSpec,
        )

    def test_wrapped_semi_tabs(self):
        """Whitespace around a trailing semicolon is dropped as well."""
        self.sql_limit_regex(
            'SELECT * FROM a \t \n ; \t \n ',
            'SELECT * \nFROM (SELECT * FROM a) AS inner_qry \n LIMIT 1000',
            MssqlEngineSpec,
        )

    def test_simple_limit_query(self):
        """A query without LIMIT gets ``LIMIT 1000`` appended."""
        self.sql_limit_regex(
            'SELECT * FROM a',
            'SELECT * FROM a LIMIT 1000',
        )

    def test_modify_limit_query(self):
        """An existing ``LIMIT 9999`` is replaced by ``LIMIT 1000``."""
        self.sql_limit_regex(
            'SELECT * FROM a LIMIT 9999',
            'SELECT * FROM a LIMIT 1000',
        )

    def test_modify_newline_query(self):
        """A LIMIT clause on its own line is replaced too."""
        self.sql_limit_regex(
            'SELECT * FROM a\nLIMIT 9999',
            'SELECT * FROM a LIMIT 1000',
        )

    def test_modify_lcase_limit_query(self):
        """A lower-case, tab-separated ``limit`` is replaced too."""
        self.sql_limit_regex(
            'SELECT * FROM a\tlimit 9999',
            'SELECT * FROM a LIMIT 1000',
        )

    def test_limit_query_with_limit_subquery(self):
        """Only the outer LIMIT changes; the subquery LIMIT is kept."""
        self.sql_limit_regex(
            'SELECT * FROM (SELECT * FROM a LIMIT 10) LIMIT 9999',
            'SELECT * FROM (SELECT * FROM a LIMIT 10) LIMIT 1000',
        )

    def test_limit_with_expr(self):
        """``'LIMIT 777'`` inside a string literal is left untouched.

        The real LIMIT clause is split over two lines here.
        """
        self.sql_limit_regex(
            textwrap.dedent("""\
SELECT
'LIMIT 777' AS a
, b
FROM
table
LIMIT
99990"""),
            textwrap.dedent("""\
SELECT
'LIMIT 777' AS a
, b
FROM
table LIMIT 1000"""),
        )

    def test_limit_expr_and_semicolon(self):
        """Same as above, with a trailing `` ;`` that must be dropped."""
        self.sql_limit_regex(
            textwrap.dedent("""\
SELECT
'LIMIT 777' AS a
, b
FROM
table
LIMIT 99990 ;"""),
            textwrap.dedent("""\
SELECT
'LIMIT 777' AS a
, b
FROM
table LIMIT 1000"""),
        )