superset/panoramix/models.py

752 lines
26 KiB
Python
Raw Normal View History

2015-07-15 13:12:32 -04:00
from flask.ext.appbuilder import Model
from datetime import timedelta
2015-08-05 20:36:33 -04:00
from flask.ext.appbuilder.models.mixins import AuditMixin
from flask import request, redirect, flash, Response
2015-07-29 20:33:37 -04:00
from sqlalchemy import Column, Integer, String, ForeignKey, Text, Boolean, DateTime
2015-08-13 00:22:02 -04:00
from sqlalchemy import create_engine, MetaData, desc
2015-08-03 11:34:58 -04:00
from sqlalchemy import Table as sqlaTable
2015-07-15 13:12:32 -04:00
from sqlalchemy.orm import relationship
from dateutil.parser import parse
2015-08-05 20:36:33 -04:00
from pydruid import client
from pydruid.utils.filters import Dimension, Filter
from pandas import read_sql_query
from sqlalchemy.sql import table, literal_column
from sqlalchemy import select, and_, text, String
2015-07-15 13:12:32 -04:00
2015-08-06 03:00:17 -04:00
from copy import deepcopy, copy
2015-08-07 19:25:19 -04:00
from collections import namedtuple
from datetime import datetime
import logging
import json
import sqlparse
import requests
2015-08-07 19:25:19 -04:00
import textwrap
2015-07-15 13:12:32 -04:00
2015-09-05 12:23:46 -04:00
from panoramix import db, get_session
2015-08-03 11:34:58 -04:00
2015-08-07 19:25:19 -04:00
# Result of a query: the resulting dataframe, the query string that was
# issued (for display/debugging), and how long it took to run.
# BUG FIX: the typename passed to namedtuple was the literal 'namedtuple',
# which made repr() and pickling misleading; it should match the bound name.
QueryResult = namedtuple('QueryResult', ['df', 'query', 'duration'])
2015-08-03 11:34:58 -04:00
class Queryable(object):
    """Mixin for queryable models (Table, Datasource).

    Exposes sorted name listings over the related ``columns`` collection.
    """

    @property
    def column_names(self):
        """Sorted names of all related columns."""
        return sorted(col.column_name for col in self.columns)

    @property
    def groupby_column_names(self):
        """Sorted names of the columns flagged as groupable."""
        return sorted(
            col.column_name for col in self.columns if col.groupby)

    @property
    def filterable_column_names(self):
        """Sorted names of the columns flagged as filterable."""
        return sorted(
            col.column_name for col in self.columns if col.filterable)
class Database(Model, AuditMixin):
    """A database connection (SQLAlchemy URI) that Tables live in."""
    __tablename__ = 'dbs'
    id = Column(Integer, primary_key=True)
    database_name = Column(String(255), unique=True)
    sqlalchemy_uri = Column(String(1024))

    def __repr__(self):
        return self.database_name

    def get_sqla_engine(self):
        """Build a SQLAlchemy engine from the stored URI."""
        return create_engine(self.sqlalchemy_uri)

    def get_table(self, table_name):
        """Reflect ``table_name`` from this database into a sqla Table."""
        meta = MetaData()
        return sqlaTable(
            table_name, meta,
            autoload=True,
            autoload_with=self.get_sqla_engine())
2015-08-03 11:34:58 -04:00
2015-08-06 17:49:18 -04:00
class Table(Model, Queryable, AuditMixin):
    """A table in a SQL Database, queryable through the sqlalchemy
    expression API.

    Columns and metrics are stored as related TableColumn / SqlMetric
    rows; ``fetch_metadata`` (re)derives them by reflecting the table.
    """
    __tablename__ = 'tables'
    id = Column(Integer, primary_key=True)
    table_name = Column(String(255), unique=True)
    main_datetime_column_id = Column(Integer, ForeignKey('table_columns.id'))
    main_datetime_column = relationship(
        'TableColumn', foreign_keys=[main_datetime_column_id])
    default_endpoint = Column(Text)
    database_id = Column(Integer, ForeignKey('dbs.id'), nullable=False)
    database = relationship(
        'Database', backref='tables', foreign_keys=[database_id])
    baselink = "tableview"

    @property
    def name(self):
        return self.table_name

    @property
    def table_link(self):
        # HTML link to this table's explore view
        url = "/panoramix/table/{}/".format(self.id)
        return '<a href="{url}">{self.table_name}</a>'.format(**locals())

    @property
    def metrics_combo(self):
        """(metric_name, verbose_name) pairs, sorted by verbose name."""
        return sorted(
            [(m.metric_name, m.verbose_name) for m in self.metrics],
            key=lambda x: x[1])

    def query_bkp(
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            limit_spec=None,
            filter=None,
            is_timeseries=True,
            timeseries_limit=15, row_limit=None):
        """
        Unused, legacy way of querying by building a SQL string without
        using the sqlalchemy expression API (new approach which supports
        all dialects)
        """
        from pandas import read_sql_query
        qry_start_dttm = datetime.now()
        metrics_exprs = [
            "{} AS {}".format(m.expression, m.metric_name)
            for m in self.metrics if m.metric_name in metrics]
        from_dttm_iso = from_dttm.isoformat()
        to_dttm_iso = to_dttm.isoformat()
        if metrics:
            main_metric_expr = [
                m.expression for m in self.metrics
                if m.metric_name == metrics[0]][0]
        else:
            main_metric_expr = "COUNT(*)"
        select_exprs = []
        groupby_exprs = []
        if groupby:
            select_exprs = copy(groupby)
            groupby_exprs = [s for s in groupby]
            inner_groupby_exprs = [s for s in groupby]
        select_exprs += metrics_exprs
        if granularity != "all":
            select_exprs += ['ds as timestamp']
            groupby_exprs += ['ds']
        select_exprs = ",\n".join(select_exprs)
        groupby_exprs = ",\n".join(groupby_exprs)
        where_clause = [
            "ds >= '{from_dttm_iso}'",
            "ds < '{to_dttm_iso}'",
        ]
        # BUG FIX: ``filter`` defaults to None; don't iterate over None
        for col, op, eq in filter or []:
            if op in ('in', 'not in'):
                l = ["'{}'".format(s) for s in eq.split(",")]
                l = ", ".join(l)
                op = op.upper()
                where_clause.append(
                    "{col} {op} ({l})".format(**locals())
                )
        where_clause = " AND\n".join(where_clause).format(**locals())
        on_clause = " AND ".join(["{g} = __{g}".format(g=g) for g in groupby])
        limiting_join = ""
        if timeseries_limit and groupby:
            # Two-phase query: the inner SELECT picks the top series,
            # the JOIN restricts the outer query to them.
            inner_select = ", ".join(
                ["{g} as __{g}".format(g=g) for g in inner_groupby_exprs])
            inner_groupby_exprs = ", ".join(inner_groupby_exprs)
            limiting_join = (
                "JOIN ( \n"
                "    SELECT {inner_select} \n"
                "    FROM {self.table_name} \n"
                "    WHERE \n"
                "        {where_clause}\n"
                "    GROUP BY {inner_groupby_exprs}\n"
                "    ORDER BY {main_metric_expr} DESC\n"
                "    LIMIT {timeseries_limit}\n"
                ") z ON {on_clause}\n"
            ).format(**locals())
        sql = (
            "SELECT\n"
            "    {select_exprs}\n"
            "FROM {self.table_name}\n"
            "{limiting_join}"
            "WHERE\n"
            "    {where_clause}\n"
            "GROUP BY\n"
            "    {groupby_exprs}\n"
        ).format(**locals())
        df = read_sql_query(
            sql=sql,
            con=self.database.get_sqla_engine()
        )
        # BUG FIX: dedent's return value was previously discarded (no-op)
        sql = textwrap.dedent(sql)
        return QueryResult(
            df=df, duration=datetime.now() - qry_start_dttm, query=sql)

    def query(
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            limit_spec=None,
            filter=None,
            is_timeseries=True,
            timeseries_limit=15, row_limit=None):
        """Run a query against this table through the sqlalchemy
        expression API and return a QueryResult.

        When ``timeseries_limit`` is set and there is a groupby, a
        subquery joined on the groupby columns restricts the result to
        the top series ranked by the first metric.
        """
        qry_start_dttm = datetime.now()
        timestamp = literal_column(
            self.main_datetime_column.column_name).label('timestamp')
        metrics_exprs = [
            literal_column(m.expression).label(m.metric_name)
            for m in self.metrics if m.metric_name in metrics]
        if metrics:
            main_metric_expr = literal_column(
                [m.expression for m in self.metrics
                 if m.metric_name == metrics[0]][0])
        else:
            main_metric_expr = literal_column("COUNT(*)")
        select_exprs = []
        groupby_exprs = []
        if groupby:
            select_exprs = [literal_column(s) for s in groupby]
            groupby_exprs = [literal_column(s) for s in groupby]
            inner_groupby_exprs = [
                literal_column(s).label('__' + s) for s in groupby]
        if granularity != "all":
            select_exprs += [timestamp]
            groupby_exprs += [timestamp]
        select_exprs += metrics_exprs
        qry = select(select_exprs)
        from_clause = table(self.table_name)
        qry = qry.group_by(*groupby_exprs)
        where_clause_and = [
            timestamp >= from_dttm.isoformat(),
            timestamp < to_dttm.isoformat(),
        ]
        # BUG FIX: ``filter`` defaults to None; don't iterate over None
        for col, op, eq in filter or []:
            if op in ('in', 'not in'):
                values = eq.split(",")
                cond = literal_column(col).in_(values)
                if op == 'not in':
                    cond = ~cond
                where_clause_and.append(cond)
        qry = qry.where(and_(*where_clause_and))
        qry = qry.order_by(desc(main_metric_expr))
        qry = qry.limit(row_limit)
        if timeseries_limit and groupby:
            # Two-phase approach: join against the top-N series
            subq = select(inner_groupby_exprs)
            subq = subq.select_from(table(self.table_name))
            subq = subq.where(and_(*where_clause_and))
            subq = subq.group_by(*inner_groupby_exprs)
            subq = subq.order_by(desc(main_metric_expr))
            subq = subq.limit(timeseries_limit)
            on_clause = []
            for gb in groupby:
                on_clause.append(
                    literal_column(gb) == literal_column("__" + gb))
            from_clause = from_clause.join(subq.alias(), and_(*on_clause))
        qry = qry.select_from(from_clause)
        engine = self.database.get_sqla_engine()
        # literal_binds inlines parameter values so the query string is
        # self-contained for display
        sql = str(qry.compile(engine, compile_kwargs={"literal_binds": True}))
        df = read_sql_query(
            sql=sql,
            con=engine
        )
        sql = sqlparse.format(sql, reindent=True)
        return QueryResult(
            df=df, duration=datetime.now() - qry_start_dttm, query=sql)

    def fetch_metadata(self):
        """Reflect the table from its database and sync columns and
        derived metrics into the metadata database."""
        try:
            table = self.database.get_table(self.table_name)
        except Exception as e:
            flash(str(e))
            flash(
                "Table doesn't seem to exist in the specified database, "
                "couldn't fetch column information", "danger")
            return
        TC = TableColumn
        M = SqlMetric
        metrics = []
        any_date_col = None
        for col in table.columns:
            try:
                datatype = str(col.type)
            except Exception:
                # Some dialects raise when stringifying exotic types
                datatype = "UNKNOWN"
            dbcol = (
                db.session
                .query(TC)
                .filter(TC.table == self)
                .filter(TC.column_name == col.name)
                .first()
            )
            db.session.flush()
            if not dbcol:
                dbcol = TableColumn(column_name=col.name)
                if (
                        str(datatype).startswith('VARCHAR') or
                        str(datatype).startswith('STRING')):
                    # String columns are groupable/filterable by default
                    dbcol.groupby = True
                    dbcol.filterable = True
                db.session.merge(self)
                self.columns.append(dbcol)
            if not any_date_col and 'date' in datatype.lower():
                # Remember the first date-ish column as a fallback
                any_date_col = dbcol
            if dbcol.sum:
                metrics.append(M(
                    metric_name='sum__' + dbcol.column_name,
                    verbose_name='sum__' + dbcol.column_name,
                    metric_type='sum',
                    expression="SUM({})".format(dbcol.column_name)
                ))
            if dbcol.max:
                metrics.append(M(
                    metric_name='max__' + dbcol.column_name,
                    verbose_name='max__' + dbcol.column_name,
                    metric_type='max',
                    expression="MAX({})".format(dbcol.column_name)
                ))
            if dbcol.min:
                metrics.append(M(
                    metric_name='min__' + dbcol.column_name,
                    verbose_name='min__' + dbcol.column_name,
                    metric_type='min',
                    expression="MIN({})".format(dbcol.column_name)
                ))
            if dbcol.count_distinct:
                metrics.append(M(
                    metric_name='count_distinct__' + dbcol.column_name,
                    verbose_name='count_distinct__' + dbcol.column_name,
                    metric_type='count_distinct',
                    expression="COUNT(DISTINCT {})".format(dbcol.column_name)
                ))
            dbcol.type = datatype
            db.session.merge(self)
            db.session.commit()
        metrics.append(M(
            metric_name='count',
            verbose_name='COUNT(*)',
            metric_type='count',
            expression="COUNT(*)"
        ))
        for metric in metrics:
            m = (
                db.session.query(M)
                .filter(M.metric_name == metric.metric_name)
                .filter(M.table == self)
                .first()
            )
            metric.table = self
            if not m:
                db.session.add(metric)
                db.session.commit()
        if not self.main_datetime_column:
            self.main_datetime_column = any_date_col
2015-08-03 11:34:58 -04:00
2015-08-07 19:25:19 -04:00
2015-08-06 17:49:18 -04:00
class SqlMetric(Model, AuditMixin):
    """A metric: a named aggregate SQL expression defined on a Table."""
    __tablename__ = 'sql_metrics'
    id = Column(Integer, primary_key=True)
    metric_name = Column(String(512))
    verbose_name = Column(String(1024))
    metric_type = Column(String(32))
    table_id = Column(Integer, ForeignKey('tables.id'))
    table = relationship(
        'Table', backref='metrics', foreign_keys=[table_id])
    expression = Column(Text)
    description = Column(Text)
2015-08-03 11:34:58 -04:00
class TableColumn(Model, AuditMixin):
    """A column of a Table, with flags driving UI exposure and metrics."""
    __tablename__ = 'table_columns'
    id = Column(Integer, primary_key=True)
    table_id = Column(Integer, ForeignKey('tables.id'))
    table = relationship('Table', backref='columns', foreign_keys=[table_id])
    column_name = Column(String(256))
    is_dttm = Column(Boolean, default=True)
    is_active = Column(Boolean, default=True)
    type = Column(String(32), default='')
    groupby = Column(Boolean, default=False)
    count_distinct = Column(Boolean, default=False)
    sum = Column(Boolean, default=False)
    max = Column(Boolean, default=False)
    min = Column(Boolean, default=False)
    filterable = Column(Boolean, default=False)
    description = Column(Text, default='')

    def __repr__(self):
        return self.column_name

    @property
    def isnum(self):
        """Whether the stored type is one of the known numeric types."""
        return self.type in ('LONG', 'DOUBLE', 'FLOAT')
2015-07-29 20:33:37 -04:00
class Cluster(Model, AuditMixin):
    """A Druid cluster, described by its coordinator and broker endpoints."""
    __tablename__ = 'clusters'
    id = Column(Integer, primary_key=True)
    cluster_name = Column(String(255), unique=True)
    coordinator_host = Column(String(256))
    coordinator_port = Column(Integer)
    coordinator_endpoint = Column(String(256))
    broker_host = Column(String(256))
    broker_port = Column(Integer)
    broker_endpoint = Column(String(256))
    metadata_last_refreshed = Column(DateTime)

    def __repr__(self):
        return self.cluster_name

    def get_pydruid_client(self):
        """Build a PyDruid client pointed at this cluster's broker."""
        return client.PyDruid(
            "http://{0}:{1}/".format(self.broker_host, self.broker_port),
            self.broker_endpoint)

    def refresh_datasources(self):
        """Fetch the datasource names from the coordinator and sync each
        one into the metadata database."""
        endpoint = (
            "http://{self.coordinator_host}:{self.coordinator_port}/"
            "{self.coordinator_endpoint}/datasources"
        ).format(self=self)
        for datasource in json.loads(requests.get(endpoint).text):
            Datasource.sync_to_db(datasource, self)
2015-07-29 20:33:37 -04:00
2015-08-03 11:34:58 -04:00
class Datasource(Model, AuditMixin, Queryable):
    """A Druid datasource, queryable through its Cluster's broker."""
    baselink = "datasourcemodelview"
    __tablename__ = 'datasources'
    id = Column(Integer, primary_key=True)
    datasource_name = Column(String(255), unique=True)
    is_featured = Column(Boolean, default=False)
    is_hidden = Column(Boolean, default=False)
    description = Column(Text)
    default_endpoint = Column(Text)
    user_id = Column(Integer, ForeignKey('ab_user.id'))
    owner = relationship('User', backref='datasources', foreign_keys=[user_id])
    cluster_name = Column(
        String(255), ForeignKey('clusters.cluster_name'))
    cluster = relationship(
        'Cluster', backref='datasources', foreign_keys=[cluster_name])

    @property
    def metrics_combo(self):
        """(metric_name, verbose_name) pairs, sorted by verbose name."""
        return sorted(
            [(m.metric_name, m.verbose_name) for m in self.metrics],
            key=lambda x: x[1])

    @property
    def name(self):
        return self.datasource_name

    def __repr__(self):
        return self.datasource_name

    @property
    def datasource_link(self):
        # HTML link to this datasource's explore view
        url = "/panoramix/datasource/{}/".format(self.datasource_name)
        return '<a href="{url}">{self.datasource_name}</a>'.format(**locals())

    def get_metric_obj(self, metric_name):
        """Return the parsed JSON definition of the named metric."""
        return [
            m.json_obj for m in self.metrics
            if m.metric_name == metric_name
        ][0]

    def latest_metadata(self):
        """Return the column metadata of the most recent segment, or None."""
        client = self.cluster.get_pydruid_client()
        results = client.time_boundary(datasource=self.datasource_name)
        if not results:
            return
        # NOTE(review): reads 'minTime' into a variable named max_time —
        # verify which boundary is actually intended here
        max_time = results[0]['result']['minTime']
        max_time = parse(max_time)
        # Query a 2-second window around the boundary timestamp
        intervals = (max_time - timedelta(seconds=1)).isoformat() + '/'
        intervals += (max_time + timedelta(seconds=1)).isoformat()
        segment_metadata = client.segment_metadata(
            datasource=self.datasource_name,
            intervals=intervals)
        if segment_metadata:
            return segment_metadata[-1]['columns']

    def generate_metrics(self):
        """Generate metrics for every column of the datasource."""
        for col in self.columns:
            col.generate_metrics()

    @classmethod
    def sync_to_db(cls, name, cluster):
        """Fetch the latest metadata for datasource ``name`` from the
        cluster and merge it into the metadata database."""
        session = get_session()
        datasource = session.query(cls).filter_by(datasource_name=name).first()
        if not datasource:
            datasource = cls(datasource_name=name)
            session.add(datasource)
        datasource.cluster = cluster
        cols = datasource.latest_metadata()
        if not cols:
            return
        for col in cols:
            col_obj = (
                session
                .query(Column)
                .filter_by(datasource_name=name, column_name=col)
                .first()
            )
            datatype = cols[col]['type']
            if not col_obj:
                col_obj = Column(datasource_name=name, column_name=col)
                session.add(col_obj)
            if datatype == "STRING":
                # String dimensions are groupable/filterable by default
                col_obj.groupby = True
                col_obj.filterable = True
            if col_obj:
                col_obj.type = cols[col]['type']
                col_obj.datasource = datasource
                col_obj.generate_metrics()
        #session.commit()

    def query(
            self, groupby, metrics,
            granularity,
            from_dttm, to_dttm,
            limit_spec=None,
            filter=None,
            is_timeseries=True,
            timeseries_limit=None,
            row_limit=None):
        """Run a Druid groupby query and return a QueryResult.

        When ``timeseries_limit`` is set for a timeseries request, a
        first "granularity: all" query selects the top series and a
        second query is filtered down to them.
        """
        qry_start_dttm = datetime.now()
        query_str = ""
        aggregations = {
            m.metric_name: m.json_obj
            for m in self.metrics if m.metric_name in metrics
        }
        # A non-string granularity is interpreted as a duration
        if not isinstance(granularity, basestring):
            granularity = {"type": "duration", "duration": granularity}
        qry = dict(
            datasource=self.datasource_name,
            dimensions=groupby,
            aggregations=aggregations,
            granularity=granularity,
            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
        )
        filters = None
        # BUG FIX: ``filter`` defaults to None; treat that as "no filters"
        for col, op, eq in filter or []:
            cond = None
            if op == '==':
                cond = Dimension(col) == eq
            elif op == '!=':
                cond = ~(Dimension(col) == eq)
            elif op in ('in', 'not in'):
                fields = []
                splitted = eq.split(',')
                if len(splitted) > 1:
                    for s in eq.split(','):
                        s = s.strip()
                        fields.append(Filter.build_filter(Dimension(col) == s))
                    cond = Filter(type="or", fields=fields)
                else:
                    cond = Dimension(col) == eq
                if op == 'not in':
                    cond = ~cond
            if filters:
                # AND the new condition into the accumulated filter
                filters = Filter(type="and", fields=[
                    Filter.build_filter(cond),
                    Filter.build_filter(filters)
                ])
            else:
                filters = cond
        if filters:
            qry['filter'] = filters
        client = self.cluster.get_pydruid_client()
        orig_filters = filters
        if timeseries_limit and is_timeseries:
            # Limit on the number of timeseries, doing a two-phases query
            pre_qry = deepcopy(qry)
            pre_qry['granularity'] = "all"
            pre_qry['limit_spec'] = {
                "type": "default",
                "limit": timeseries_limit,
                "columns": [{
                    # NOTE(review): when ``metrics`` is empty this falls
                    # back to a Metric object rather than its name — verify
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
            client.groupby(**pre_qry)
            query_str += "// Two phase query\n// Phase 1\n"
            query_str += json.dumps(client.query_dict, indent=2) + "\n"
            query_str += "//\nPhase 2 (built based on phase one's results)\n"
            df = client.export_pandas()
            if df is not None and not df.empty:
                # Build an OR filter matching the top series found above
                dims = qry['dimensions']
                filters = []
                for index, row in df.iterrows():
                    fields = []
                    for dim in dims:
                        f = Filter.build_filter(Dimension(dim) == row[dim])
                        fields.append(f)
                    if len(fields) > 1:
                        filt = Filter(type="and", fields=fields)
                        filters.append(Filter.build_filter(filt))
                    elif fields:
                        filters.append(fields[0])
                if filters:
                    ff = Filter(type="or", fields=filters)
                    if not orig_filters:
                        qry['filter'] = ff
                    else:
                        qry['filter'] = Filter(type="and", fields=[
                            Filter.build_filter(ff),
                            Filter.build_filter(orig_filters)])
                qry['limit_spec'] = None
        if row_limit:
            qry['limit_spec'] = {
                "type": "default",
                "limit": row_limit,
                "columns": [{
                    "dimension": metrics[0] if metrics else self.metrics[0],
                    "direction": "descending",
                }],
            }
        client.groupby(**qry)
        query_str += json.dumps(client.query_dict, indent=2)
        df = client.export_pandas()
        return QueryResult(
            df=df,
            query=query_str,
            duration=datetime.now() - qry_start_dttm)
2015-07-15 13:12:32 -04:00
2015-08-07 19:25:19 -04:00
class Metric(Model):
    """A Druid metric: a named aggregation stored as a JSON blob."""
    __tablename__ = 'metrics'
    id = Column(Integer, primary_key=True)
    metric_name = Column(String(512))
    verbose_name = Column(String(1024))
    metric_type = Column(String(32))
    datasource_name = Column(
        String(256),
        ForeignKey('datasources.datasource_name'))
    datasource = relationship('Datasource', backref='metrics')
    json = Column(Text)
    description = Column(Text)

    @property
    def json_obj(self):
        """The parsed ``json`` blob; an empty dict when unparseable."""
        try:
            return json.loads(self.json)
        except Exception:
            return {}
2015-07-15 13:12:32 -04:00
class Column(Model, AuditMixin):
    """A column of a Druid datasource; its flags drive which Metric rows
    get generated for it."""
    __tablename__ = 'columns'
    id = Column(Integer, primary_key=True)
    datasource_name = Column(
        String(256),
        ForeignKey('datasources.datasource_name'))
    datasource = relationship('Datasource', backref='columns')
    column_name = Column(String(256))
    is_active = Column(Boolean, default=True)
    type = Column(String(32))
    groupby = Column(Boolean, default=False)
    count_distinct = Column(Boolean, default=False)
    sum = Column(Boolean, default=False)
    max = Column(Boolean, default=False)
    min = Column(Boolean, default=False)
    filterable = Column(Boolean, default=False)
    description = Column(Text)

    def __repr__(self):
        return self.column_name

    @property
    def isnum(self):
        """Whether the stored type is one of the known numeric types."""
        return self.type in ('LONG', 'DOUBLE', 'FLOAT')

    def generate_metrics(self):
        """Create the Metric rows implied by this column's aggregation
        flags, skipping any that already exist."""
        M = Metric
        metrics = [M(
            metric_name='count',
            verbose_name='COUNT(*)',
            metric_type='count',
            json=json.dumps({'type': 'count', 'name': 'count'})
        )]
        # Somehow we need to reassign this for UDAFs
        corrected_type = (
            'DOUBLE' if self.type in ('DOUBLE', 'FLOAT') else self.type)
        if self.sum and self.isnum:
            name = 'sum__' + self.column_name
            metrics.append(M(
                metric_name=name,
                metric_type='sum',
                verbose_name='SUM({})'.format(self.column_name),
                json=json.dumps({
                    'type': corrected_type.lower() + 'Sum',
                    'name': name,
                    'fieldName': self.column_name})
            ))
        if self.min and self.isnum:
            name = 'min__' + self.column_name
            metrics.append(M(
                metric_name=name,
                metric_type='min',
                verbose_name='MIN({})'.format(self.column_name),
                json=json.dumps({
                    'type': corrected_type.lower() + 'Min',
                    'name': name,
                    'fieldName': self.column_name})
            ))
        if self.max and self.isnum:
            name = 'max__' + self.column_name
            metrics.append(M(
                metric_name=name,
                metric_type='max',
                verbose_name='MAX({})'.format(self.column_name),
                json=json.dumps({
                    'type': corrected_type.lower() + 'Max',
                    'name': name,
                    'fieldName': self.column_name})
            ))
        if self.count_distinct:
            name = 'count_distinct__' + self.column_name
            metrics.append(M(
                metric_name=name,
                verbose_name='COUNT(DISTINCT {})'.format(self.column_name),
                metric_type='count_distinct',
                json=json.dumps({
                    'type': 'cardinality',
                    'name': name,
                    'fieldNames': [self.column_name]})
            ))
        session = get_session()
        for metric in metrics:
            # NOTE(review): this lookup filters on Cluster.cluster_name
            # without an explicit join to Cluster — verify the intent
            existing = (
                session.query(M)
                .filter(M.metric_name == metric.metric_name)
                .filter(M.datasource_name == self.datasource_name)
                .filter(Cluster.cluster_name == self.datasource.cluster_name)
                .first()
            )
            metric.datasource_name = self.datasource_name
            if not existing:
                session.add(metric)
                session.commit()