# panoramix/data/__init__.py
# Loads the bundled example datasets and registers them in Panoramix.
import csv
import gzip
import json
import os
from datetime import datetime

import pandas as pd
from flask.ext.appbuilder import Base
from sqlalchemy import Column, DateTime, Integer, String, Table

from panoramix import app, db, models
config = app.config
DATA_FOLDER = os.path.join(config.get("BASE_DIR"), 'data')
def load_world_bank_health_n_pop():
"""
Details on how the data was loaded from
http://data.worldbank.org/data-catalog/health-nutrition-and-population-statistics
DIR = ""
df_country = pd.read_csv(DIR + '/HNP_Country.csv')
df_country.columns = ['country_code'] + list(df_country.columns[1:])
df_country = df_country[['country_code', 'Region']]
df_country.columns = ['country_code', 'region']
df = pd.read_csv(DIR + '/HNP_Data.csv')
del df['Unnamed: 60']
df.columns = ['country_name', 'country_code'] + list(df.columns[2:])
ndf = df.merge(df_country, how='inner')
dims = ('country_name', 'country_code', 'region')
vv = [str(i) for i in range(1960, 2015)]
mdf = pd.melt(ndf, id_vars=dims + ('Indicator Code',), value_vars=vv)
mdf['year'] = mdf.variable + '-01-01'
dims = dims + ('year',)
pdf = mdf.pivot_table(values='value', columns='Indicator Code', index=dims)
pdf = pdf.reset_index()
pdf.to_csv(DIR + '/countries.csv')
pdf.to_json(DIR + '/countries.json', orient='records')
"""
with gzip.open(os.path.join(DATA_FOLDER, 'countries.json.gz')) as f:
pdf = pd.read_json(f)
pdf.to_sql(
'wb_health_population',
db.engine,
if_exists='replace',
chunksize=500,
dtype={
'year': DateTime(),
'country_code': String(3),
'country_name': String(255),
'region': String(255),
},
index=False)
def load_birth_names():
    """Load the ``birth_names`` example dataset and its metadata.

    Creates the physical ``birth_names`` table from the bundled
    ``birth_names.csv.gz`` file, then registers (or reuses on re-run) the
    Panoramix metadata on top of it: the ``main`` Database entry, the
    SqlaTable reference, a set of example slices and a "Births" dashboard.
    """
    BirthNames = Table(
        "birth_names", Base.metadata,
        Column("id", Integer, primary_key=True),
        Column("state", String(10)),
        Column("year", Integer),
        Column("name", String(128)),
        Column("num", Integer),
        Column("ds", DateTime),
        Column("gender", String(10)),
        Column("sum_boys", Integer),
        Column("sum_girls", Integer),
    )
    # Best-effort drop so the loader can be re-run; the table may not
    # exist yet on a fresh database.
    try:
        BirthNames.drop(db.engine)
    except Exception:
        pass
    BirthNames.create(db.engine)
    session = db.session()

    filepath = os.path.join(DATA_FOLDER, 'birth_names.csv.gz')
    with gzip.open(filepath, mode='rt') as f:
        bb_csv = csv.reader(f)
        for i, (state, year, name, gender, num) in enumerate(bb_csv):
            # Row 0 is the CSV header; jumpy data before 1965
            # (string compare works for 4-digit years).
            if i == 0 or year < "1965":
                continue
            if num == "NA":
                num = 0
            ds = datetime(int(year), 1, 1)
            db.engine.execute(
                BirthNames.insert(),
                state=state,
                year=year,
                ds=ds,
                name=name, num=num, gender=gender,
                sum_boys=num if gender == 'boy' else 0,
                sum_girls=num if gender == 'girl' else 0,
            )
            if i % 1000 == 0:
                print("{} loaded out of 82527 rows".format(i))
                session.commit()
    session.commit()
    print("Done loading table!")
    print("-" * 80)

    print("Creating database reference")
    DB = models.Database
    dbobj = session.query(DB).filter_by(database_name='main').first()
    if not dbobj:
        dbobj = DB(database_name="main")
    print(config.get("SQLALCHEMY_DATABASE_URI"))
    dbobj.sqlalchemy_uri = config.get("SQLALCHEMY_DATABASE_URI")
    session.add(dbobj)
    session.commit()

    print("Creating table reference")
    TBL = models.SqlaTable
    obj = session.query(TBL).filter_by(table_name='birth_names').first()
    if not obj:
        obj = TBL(table_name='birth_names')
    obj.main_dttm_col = 'ds'
    obj.default_endpoint = "/panoramix/datasource/table/1/?viz_type=table&granularity=ds&since=100+years&until=now&row_limit=10&where=&flt_col_0=ds&flt_op_0=in&flt_eq_0=&flt_col_1=ds&flt_op_1=in&flt_eq_1=&slice_name=TEST&datasource_name=birth_names&datasource_id=1&datasource_type=table"
    obj.database = dbobj
    obj.columns = [
        models.TableColumn(column_name="num", sum=True, type="INTEGER"),
        models.TableColumn(column_name="sum_boys", sum=True, type="INTEGER"),
        models.TableColumn(column_name="sum_girls", sum=True, type="INTEGER"),
        models.TableColumn(column_name="ds", is_dttm=True, type="DATETIME"),
    ]
    session.add(obj)
    session.commit()
    obj.fetch_metadata()
    tbl = obj

    print("Creating some slices")

    def get_slice_json(slice_name, **kwargs):
        """Return JSON-encoded slice params; **kwargs override defaults."""
        defaults = {
            "compare_lag": "10",
            "compare_suffix": "o10Y",
            "datasource_id": "1",
            "datasource_name": "birth_names",
            "datasource_type": "table",
            "limit": "25",
            "flt_col_1": "gender",
            "flt_eq_1": "",
            "flt_op_1": "in",
            "granularity": "ds",
            "groupby": [],
            "metric": 'sum__num',
            "metrics": ["sum__num"],
            "row_limit": config.get("ROW_LIMIT"),
            "since": "100 years",
            "slice_name": slice_name,
            "until": "now",
            "viz_type": "table",
            "where": "",
            "markup_type": "markdown",
        }
        d = defaults.copy()
        d.update(kwargs)
        return json.dumps(d, indent=4, sort_keys=True)

    Slice = models.Slice
    slices = []

    def get_or_create_slice(slice_name, viz_type, params):
        """Fetch a slice by name or create it; track it for the dashboard."""
        slc = session.query(Slice).filter_by(slice_name=slice_name).first()
        if not slc:
            slc = Slice(
                slice_name=slice_name,
                viz_type=viz_type,
                datasource_type='table',
                table=tbl,
                params=params)
            session.add(slc)
        slices.append(slc)
        return slc

    get_or_create_slice(
        "Girls", 'table',
        get_slice_json(
            "Girls", groupby=['name'], flt_eq_1="girl", row_limit=50))
    get_or_create_slice(
        "Boys", 'table',
        get_slice_json(
            "Boys", groupby=['name'], flt_eq_1="boy", row_limit=50))
    get_or_create_slice(
        "Participants", 'big_number',
        get_slice_json(
            "Participants", viz_type="big_number", granularity="ds",
            compare_lag="5", compare_suffix="over 5Y"))
    get_or_create_slice(
        "Genders", 'pie',
        get_slice_json("Genders", viz_type="pie", groupby=['gender']))
    get_or_create_slice(
        "Gender by State", 'dist_bar',
        get_slice_json(
            "Gender by State", flt_eq_1="other", viz_type="dist_bar",
            metrics=['sum__sum_girls', 'sum__sum_boys'],
            groupby=['state'], flt_op_1='not in', flt_col_1='state'))
    get_or_create_slice(
        "Trends", 'line',
        get_slice_json(
            "Trends", viz_type="line", groupby=['name'],
            granularity='ds', rich_tooltip='y', show_legend='y'))
    # Markdown rendered by the "markup" viz on the dashboard.
    code = """
### Birth Names Dashboard
The source dataset came from [here](https://github.com/hadley/babynames)
![img](http://monblog.system-linux.net/image/tux/baby-tux_overlord59-tux.png)
"""
    get_or_create_slice(
        "Title", 'markup',
        get_slice_json(
            "Title", viz_type="markup", markup_type="markdown",
            code=code))
    get_or_create_slice(
        "Name Cloud", 'word_cloud',
        get_slice_json(
            "Name Cloud", viz_type="word_cloud", size_from="10",
            groupby=['name'], size_to="70", rotation="square",
            limit='100'))
    get_or_create_slice(
        "Pivot Table", 'pivot_table',
        get_slice_json(
            "Pivot Table", viz_type="pivot_table", metrics=['sum__num'],
            groupby=['name'], columns=['state']))

    print("Creating a dashboard")
    Dash = models.Dashboard
    dash = session.query(Dash).filter_by(dashboard_title="Births").first()
    if not dash:
        # NOTE: the hard-coded slice_ids assume a fresh metadata DB where
        # the slices created above receive ids 1-8.
        dash = Dash(
            dashboard_title="Births",
            position_json="""
        [
            {
                "size_y": 4,
                "size_x": 2,
                "col": 3,
                "slice_id": "1",
                "row": 3
            },
            {
                "size_y": 4,
                "size_x": 2,
                "col": 1,
                "slice_id": "2",
                "row": 3
            },
            {
                "size_y": 2,
                "size_x": 2,
                "col": 1,
                "slice_id": "3",
                "row": 1
            },
            {
                "size_y": 2,
                "size_x": 2,
                "col": 3,
                "slice_id": "4",
                "row": 1
            },
            {
                "size_y": 3,
                "size_x": 7,
                "col": 5,
                "slice_id": "5",
                "row": 4
            },
            {
                "size_y": 5,
                "size_x": 11,
                "col": 1,
                "slice_id": "6",
                "row": 7
            },
            {
                "size_y": 3,
                "size_x": 3,
                "col": 9,
                "slice_id": "7",
                "row": 1
            },
            {
                "size_y": 3,
                "size_x": 4,
                "col": 5,
                "slice_id": "8",
                "row": 1
            }
        ]
        """
        )
        session.add(dash)
        # Link the example slices only when the dashboard is newly created,
        # so re-running the loader does not duplicate the associations.
        for s in slices:
            dash.slices.append(s)
    session.commit()