""" Caravel wrapper around pandas.DataFrame.

TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
                 dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
TODO(bkyryliuk): recognize integer encoded enums.

"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import pandas as pd
import numpy as np


# Percentage of sampled values that must parse as datetimes before an
# 'object' column is reported as 'datetime_string'.
INFER_COL_TYPES_THRESHOLD = 95
# Upper bound on the number of rows sampled for that check.
INFER_COL_TYPES_SAMPLE_SIZE = 100


# http://pandas.pydata.org/pandas-docs/stable/internals.html#
# subclassing-pandas-data-structures
class CaravelDataFrame(object):
    """Wraps a pandas.DataFrame and exposes its rows and column metadata."""

    def __init__(self, df):
        """Stores ``df`` with its null cells swapped for ``None``.

        :param df: pandas.DataFrame to wrap.
        """
        self.__df = df.where((pd.notnull(df)), None)

    @property
    def size(self):
        """Number of rows in the wrapped frame."""
        return len(self.__df.index)

    @property
    def data(self):
        """Rows of the wrapped frame as a list of ``{column: value}`` dicts."""
        return self.__df.to_dict(orient='records')

    @property
    def columns_dict(self):
        """Provides metadata about columns for data visualization.

        :return: list of dicts, one per column, with the fields name, type,
                 is_date, is_dim and (only when an aggregate applies) agg;
                 None when the frame is empty.
        """
        if self.__df.empty:
            return None

        columns = []
        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
        sample = self.__df
        if sample_size:
            # Random subset keeps the per-value datetime probing bounded.
            sample = self.__df.sample(sample_size)

        dtypes = self.__df.dtypes
        for col in dtypes.keys():
            dtype = dtypes[col]
            column = {
                'name': col,
                'type': dtype.name,
                'is_date': is_date(dtype),
                'is_dim': is_dimension(dtype, col),
            }
            agg = agg_func(dtype, col)

            if column['type'] == 'object':
                # check if encoded datetime
                if (datetime_conversion_rate(sample[col]) >
                        INFER_COL_TYPES_THRESHOLD):
                    column.update({
                        'type': 'datetime_string',
                        'is_date': True,
                        'is_dim': False,
                    })
                    # Date-like strings are neither dimensions nor
                    # aggregated.
                    agg = None

            # 'agg' is optional attribute: only present when an aggregate
            # was actually chosen.
            if agg:
                column['agg'] = agg
            columns.append(column)

        return columns


# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
    """Returns the percentage of values that pd.to_datetime accepts.

    :param data_series: iterable of values, e.g. a pandas.Series.
    :return: number between 0 and 100; 0 when there are no values.
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Best effort: any failure just means "not a datetime".
            continue
        success += 1
    if not total:
        # No values to inspect; avoid dividing by zero.
        return 0
    return 100 * success / total


def is_date(dtype):
    """Returns True when the dtype name starts with 'datetime'."""
    return dtype.name.startswith('datetime')


def is_dimension(dtype, column_name):
    """Returns True for object or boolean columns that are not ids.

    :param dtype: numpy dtype of the column.
    :param column_name: name of the column.
    """
    if is_id(column_name):
        return False
    # The builtins are what the np.object / np.bool aliases pointed to;
    # those aliases no longer exist in current NumPy releases.
    return dtype == object or dtype == bool


def is_id(column_name):
    """Returns True when the column name starts or ends with 'id'.

    This is a plain prefix/suffix test, so names such as 'idle' or 'paid'
    match as well.
    """
    return column_name.startswith('id') or column_name.endswith('id')


def agg_func(dtype, column_name):
    """Picks the default aggregate for a column.

    :param dtype: numpy dtype of the column.
    :param column_name: name of the column.
    :return: 'count_distinct' for id columns, 'sum' for numeric columns,
             None otherwise.
    """
    # consider checking for key substring too.
    if is_id(column_name):
        return 'count_distinct'
    if np.issubdtype(dtype, np.number):
        return 'sum'
    return None