Basic integration with ClickHouse (#1844)

* db_engine_specs: added engine spec for clickhouse

* models: group by time without join on subquery

if this is enabled, the subquery is removed when grouping by time,
and the group by time is rewritten inline as group by timeslot, ...

this in particular fixes the ClickHouse backend; a sketch of the
resulting query shape follows below
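
a minimal sketch of the join-free shape, written against plain
SQLAlchemy 1.x (the `events` table, `ts` column and the day grain are
illustrative, not from this patch):

    import sqlalchemy as sa

    events = sa.table('events', sa.column('ts'), sa.column('value'))
    # the timeslot expression lands directly in SELECT and GROUP BY,
    # so no subquery/join is emitted
    timeslot = sa.literal_column(
        "toStartOfDay(toDateTime(ts))").label('__timestamp')
    qry = (sa.select([timeslot, sa.func.count().label('count')])
           .select_from(events)
           .group_by(timeslot))
    print(qry)  # SELECT toStartOfDay(toDateTime(ts)) AS __timestamp, count(*) ... GROUP BY ...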

* docs: added clickhouse

* models: allow using secondary dttm columns as constraints
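
for illustration, a hedged sketch of what the added time_filters logic
amounts to (table and column names are made up): when the engine spec
sets time_secondary_columns and a non-main dttm column is picked, a
second range predicate on main_dttm_col is added so the indexed date
column still prunes the scan:

    import sqlalchemy as sa

    events = sa.table('events', sa.column('event_date'), sa.column('created_at'))
    window = ('2017-04-01', '2017-05-01')
    qry = sa.select([sa.func.count()]).select_from(events).where(sa.and_(
        sa.column('event_date').between(*window),  # main dttm col (indexed)
        sa.column('created_at').between(*window),  # secondary dttm col from the query
    ))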

* superset/forms: configurable default slice since time
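
a minimal sketch of the new knob in superset_config.py (the value shown
is only an example; when unset the code falls back to "1 year ago", see
the viz hunk below):

    # superset_config.py
    SUPERSET_DEFAULT_SINCE = "7 days ago"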

* superset/models: respect dialect quoting
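
for context, a hedged illustration of what compiling a column against
the database's own dialect buys over naive string formatting (the
column name is illustrative):

    import sqlalchemy as sa
    from sqlalchemy.dialects import mysql, postgresql

    col = sa.column('from')  # reserved word, must be quoted per dialect
    print(col.compile(dialect=mysql.dialect()))       # `from`
    print(col.compile(dialect=postgresql.dialect()))  # "from"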

* dttm handling, codeclimate tweaks
Marek Vavruša 2017-05-03 13:57:39 -07:00 committed by Maxime Beauchemin
parent 59a6f447ec
commit 9ff351532a
4 changed files with 62 additions and 7 deletions


@@ -211,6 +211,9 @@ Here's a list of some of the recommended packages.
 | Vertica       | ``pip install                       | ``vertica+vertica_python://``                   |
 |               | sqlalchemy-vertica-python``         |                                                 |
 +---------------+-------------------------------------+-------------------------------------------------+
+| ClickHouse    | ``pip install                       | ``clickhouse://``                               |
+|               | sqlalchemy-clickhouse``             |                                                 |
++---------------+-------------------------------------+-------------------------------------------------+
 Note that many other databases are supported, the main criterion being the
 existence of a functional SqlAlchemy dialect and Python driver. Googling
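
For reference, connection strings for this dialect generally take the
usual SQLAlchemy form below; user, password, host, port and database are
placeholders, and sqlalchemy-clickhouse speaks ClickHouse's HTTP
interface (commonly port 8123):

    clickhouse://user:password@host:8123/database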


@@ -370,6 +370,9 @@ class SqlaTable(Model, BaseDatasource):
         if granularity not in self.dttm_cols:
             granularity = self.main_dttm_col
+        # Database spec supports join-free timeslot grouping
+        time_groupby_inline = self.database.db_engine_spec.time_groupby_inline
         cols = {col.column_name: col for col in self.columns}
         metrics_dict = {m.metric_name: m for m in self.metrics}
@@ -386,7 +389,7 @@ class SqlaTable(Model, BaseDatasource):
         if timeseries_limit_metric:
             timeseries_limit_metric_expr = \
                 timeseries_limit_metric.sqla_col
-        if metrics:
+        if metrics_exprs:
             main_metric_expr = metrics_exprs[0]
         else:
             main_metric_expr = literal_column("COUNT(*)").label("ccount")
@@ -436,13 +439,20 @@ class SqlaTable(Model, BaseDatasource):
             dttm_col = cols[granularity]
             time_grain = extras.get('time_grain_sqla')
+            time_filters = []
             if is_timeseries:
                 timestamp = dttm_col.get_timestamp_expression(time_grain)
                 select_exprs += [timestamp]
                 groupby_exprs += [timestamp]
-            time_filter = dttm_col.get_time_filter(from_dttm, to_dttm)
+            # Use main dttm column to support index with secondary dttm columns
+            if self.database.db_engine_spec.time_secondary_columns and \
+                    self.main_dttm_col in self.dttm_cols and \
+                    self.main_dttm_col != dttm_col.column_name:
+                time_filters.append(cols[self.main_dttm_col].
+                                    get_time_filter(from_dttm, to_dttm))
+            time_filters.append(dttm_col.get_time_filter(from_dttm, to_dttm))
         select_exprs += metrics_exprs
         qry = sa.select(select_exprs)
@@ -509,7 +519,7 @@ class SqlaTable(Model, BaseDatasource):
                 having = template_processor.process_template(having)
                 having_clause_and += [sa.text('({})'.format(having))]
         if granularity:
-            qry = qry.where(and_(*([time_filter] + where_clause_and)))
+            qry = qry.where(and_(*(time_filters + where_clause_and)))
         else:
             qry = qry.where(and_(*where_clause_and))
         qry = qry.having(and_(*having_clause_and))
@@ -522,7 +532,8 @@ class SqlaTable(Model, BaseDatasource):
             qry = qry.limit(row_limit)
-        if is_timeseries and timeseries_limit and groupby:
+        if is_timeseries and \
+                timeseries_limit and groupby and not time_groupby_inline:
             # some SQL dialects require order by expressions
             # to also be in the select clause -- others, e.g. Vertica,
             # require a unique inner alias
@@ -620,8 +631,7 @@ class SqlaTable(Model, BaseDatasource):
             if not any_date_col and dbcol.is_time:
                 any_date_col = col.name
-            quoted = "{}".format(
-                column(dbcol.column_name).compile(dialect=db_dialect))
+            quoted = "{}".format(col.compile(dialect=db_dialect))
             if dbcol.sum:
                 metrics.append(M(
                     metric_name='sum__' + dbcol.column_name,


@@ -50,7 +50,9 @@ class BaseEngineSpec(object):
     engine = 'base'  # str as defined in sqlalchemy.engine.engine
     cursor_execute_kwargs = {}
     time_grains = tuple()
+    time_groupby_inline = False
     limit_method = LimitMethod.FETCH_MANY
+    time_secondary_columns = False
 
     @classmethod
     def fetch_data(cls, cursor, limit):
@classmethod
def fetch_data(cls, cursor, limit):
@@ -865,6 +867,44 @@ class AthenaEngineSpec(BaseEngineSpec):
     def epoch_to_dttm(cls):
         return "from_unixtime({col})"
 
+
+class ClickHouseEngineSpec(BaseEngineSpec):
+    """Dialect for ClickHouse analytical DB."""
+    engine = 'clickhouse'
+
+    time_secondary_columns = True
+    time_groupby_inline = True
+    time_grains = (
+        Grain('Time Column', _('Time Column'), '{col}'),
+        Grain('minute', _('minute'),
+              "toStartOfMinute(toDateTime({col}))"),
+        Grain('5 minute', _('5 minute'),
+              "toDateTime(intDiv(toUInt32(toDateTime({col})), 300)*300)"),
+        Grain('10 minute', _('10 minute'),
+              "toDateTime(intDiv(toUInt32(toDateTime({col})), 600)*600)"),
+        Grain('hour', _('hour'),
+              "toStartOfHour(toDateTime({col}))"),
+        Grain('day', _('day'),
+              "toStartOfDay(toDateTime({col}))"),
+        Grain('month', _('month'),
+              "toStartOfMonth(toDateTime({col}))"),
+        Grain('quarter', _('quarter'),
+              "toStartOfQuarter(toDateTime({col}))"),
+        Grain('year', _('year'),
+              "toStartOfYear(toDateTime({col}))"),
+    )
+
+    @classmethod
+    def convert_dttm(cls, target_type, dttm):
+        tt = target_type.upper()
+        if tt == 'DATE':
+            return "toDate('{}')".format(dttm.strftime('%Y-%m-%d'))
+        if tt == 'DATETIME':
+            return "toDateTime('{}')".format(
+                dttm.strftime('%Y-%m-%d %H:%M:%S'))
+        return "'{}'".format(dttm.strftime('%Y-%m-%d %H:%M:%S'))
+
+
 engines = {
     o.engine: o for o in globals().values()
     if inspect.isclass(o) and issubclass(o, BaseEngineSpec)}
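
A quick hedged check of what the new convert_dttm produces (outputs are
derived from the code above; the import path is an assumption):

    from datetime import datetime
    from superset.db_engine_specs import ClickHouseEngineSpec  # assumed path

    ClickHouseEngineSpec.convert_dttm('DATE', datetime(2017, 5, 3))
    # -> "toDate('2017-05-03')"
    ClickHouseEngineSpec.convert_dttm('DATETIME', datetime(2017, 5, 3, 13, 57))
    # -> "toDateTime('2017-05-03 13:57:00')"
    ClickHouseEngineSpec.convert_dttm('STRING', datetime(2017, 5, 3, 13, 57))
    # -> "'2017-05-03 13:57:00'"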


@@ -130,7 +130,9 @@ class BaseViz(object):
         # [column_name in list_of_values]. `__` prefix is there to avoid
         # potential conflicts with columns that would be named `from` or `to`
         since = (
-            extra_filters.get('__from') or form_data.get("since", "1 year ago")
+            extra_filters.get('__from') or
+            form_data.get("since") or
+            config.get("SUPERSET_DEFAULT_SINCE", "1 year ago")
         )
         from_dttm = utils.parse_human_datetime(since)