[db engine] Add support for Elasticsearch (#8441)

* [db engine] Add support for Elasticsearch
Daniel Vaz Gaspar 2019-10-28 16:04:14 +00:00 committed by GitHub
parent 148cec4690
commit a757b43164
4 changed files with 83 additions and 0 deletions


@@ -109,6 +109,7 @@ The following RDBMS are currently supported:
- `Apache Spark SQL <https://spark.apache.org/sql/>`_
- `BigQuery <https://cloud.google.com/bigquery/>`_
- `ClickHouse <https://clickhouse.yandex/>`_
- `Elasticsearch <https://www.elastic.co/products/elasticsearch/>`_
- `Exasol <https://www.exasol.com/>`_
- `Google Sheets <https://www.google.com/sheets/about/>`_
- `Greenplum <https://greenplum.org/>`_


@@ -377,6 +377,8 @@ Here's a list of some of the recommended packages.
+------------------+---------------------------------------+-------------------------------------------------+
| ClickHouse | ``pip install sqlalchemy-clickhouse`` | |
+------------------+---------------------------------------+-------------------------------------------------+
| Elasticsearch | ``pip install elasticsearch-dbapi`` | ``elasticsearch+http://`` |
+------------------+---------------------------------------+-------------------------------------------------+
| Exasol | ``pip install sqlalchemy-exasol`` | ``exa+pyodbc://`` |
+------------------+---------------------------------------+-------------------------------------------------+
| Google Sheets | ``pip install gsheetsdb`` | ``gsheets://`` |
@@ -434,6 +436,38 @@ The connection string for BigQuery looks like this ::
To be able to upload data, e.g. sample data, the Python library `pandas_gbq` is required.
Elasticsearch
-------------
The connection string for Elasticsearch looks like this ::
elasticsearch+http://{user}:{password}@{host}:9200/
Using HTTPS ::
elasticsearch+https://{user}:{password}@{host}:9200/
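As a quick sanity check outside Superset, here is a minimal Python sketch (assuming the ``elasticsearch-dbapi`` package from the table above is installed and a hypothetical cluster is reachable at ``localhost:9200``) that opens the same style of URI through SQLAlchemy ::

from sqlalchemy import create_engine, text

# Hypothetical credentials and host; substitute your own.
engine = create_engine("elasticsearch+http://user:password@localhost:9200/")

with engine.connect() as conn:
    # elasticsearch-dbapi exposes indices as tables, so plain SQL works;
    # the index and column names below are placeholders.
    for row in conn.execute(text('select agent from "logstash-2019.10.28" limit 5')):
        print(row)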
Elasticsearch has a default limit of 10000 rows, so you can either increase this limit on your cluster
or set Superset's row limit in the config ::
ROW_LIMIT = 10000
You can query multiple indices in SQL Lab, for example ::
select timestamp, agent from "logstash-*"
However, to use visualizations over multiple indices you need to create an alias index on your cluster ::
POST /_aliases
{
"actions" : [
{ "add" : { "index" : "logstash-**", "alias" : "logstash_all" } }
]
}
Then register your table with the ``alias`` name ``logstash_all``.
Snowflake
---------


@@ -115,6 +115,7 @@ setup(
"mysql": ["mysqlclient==1.4.2.post1"],
"postgres": ["psycopg2-binary==2.7.5"],
"presto": ["pyhive[presto]>=0.4.0"],
"elasticsearch": ["elasticsearch-dbapi>=0.1.0, <0.2.0"],
"druid": ["pydruid==0.5.7", "requests==2.22.0"],
},
python_requires="~=3.6",


@@ -0,0 +1,47 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=C,R,W
from datetime import datetime
from typing import Dict
from superset.db_engine_specs.base import BaseEngineSpec
class ElasticSearchEngineSpec(BaseEngineSpec):
engine = "elasticsearch"
time_groupby_inline = True
time_secondary_columns = True
allows_joins = False
allows_subqueries = True
_time_grain_functions = {
None: "{col}",
"PT1S": "HISTOGRAM({col}, INTERVAL 1 SECOND)",
"PT1M": "HISTOGRAM({col}, INTERVAL 1 MINUTE)",
"PT1H": "HISTOGRAM({col}, INTERVAL 1 HOUR)",
"P1D": "HISTOGRAM({col}, INTERVAL 1 DAY)",
"P1M": "HISTOGRAM({col}, INTERVAL 1 MONTH)",
"P1Y": "HISTOGRAM({col}, INTERVAL 1 YEAR)",
}
type_code_map: Dict[int, str] = {} # loaded from get_datatype only if needed
@classmethod
def convert_dttm(cls, target_type: str, dttm: datetime) -> str:
if target_type.upper() in ("DATETIME", "DATE"):
return f"'{dttm.isoformat()}'"
return f"'{dttm.strftime('%Y-%m-%d %H:%M:%S')}'"