[db engine] Add support for Elasticsearch (#8441)

* [db engine] Add support for Elasticsearch
Daniel Vaz Gaspar 2019-10-28 16:04:14 +00:00 committed by GitHub
parent 148cec4690
commit a757b43164
4 changed files with 83 additions and 0 deletions


@@ -109,6 +109,7 @@ The following RDBMS are currently supported:
- `Apache Spark SQL <https://spark.apache.org/sql/>`_
- `BigQuery <https://cloud.google.com/bigquery/>`_
- `ClickHouse <https://clickhouse.yandex/>`_
- `Elasticsearch <https://www.elastic.co/products/elasticsearch/>`_
- `Exasol <https://www.exasol.com/>`_
- `Google Sheets <https://www.google.com/sheets/about/>`_
- `Greenplum <https://greenplum.org/>`_


@@ -377,6 +377,8 @@ Here's a list of some of the recommended packages.
+------------------+---------------------------------------+-------------------------------------------------+
| ClickHouse | ``pip install sqlalchemy-clickhouse`` | |
+------------------+---------------------------------------+-------------------------------------------------+
| Elasticsearch | ``pip install elasticsearch-dbapi`` | ``elasticsearch+http://`` |
+------------------+---------------------------------------+-------------------------------------------------+
| Exasol | ``pip install sqlalchemy-exasol`` | ``exa+pyodbc://`` |
+------------------+---------------------------------------+-------------------------------------------------+
| Google Sheets | ``pip install gsheetsdb`` | ``gsheets://`` |
@@ -434,6 +436,38 @@ The connection string for BigQuery looks like this ::
To be able to upload data, e.g. sample data, the Python library `pandas_gbq` is required.
Elasticsearch
-------------
The connection string for Elasticsearch looks like this ::
elasticsearch+http://{user}:{password}@{host}:9200/
Using HTTPS ::
elasticsearch+https://{user}:{password}@{host}:9200/
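As a quick sanity check outside Superset, here is a minimal Python sketch (assuming the ``elasticsearch-dbapi`` package from the table above is installed and a hypothetical cluster is reachable at ``localhost:9200``) that opens the same style of URI through SQLAlchemy ::

from sqlalchemy import create_engine, text

# Hypothetical credentials and host; substitute your own.
engine = create_engine("elasticsearch+http://user:password@localhost:9200/")

with engine.connect() as conn:
    # elasticsearch-dbapi exposes indices as tables, so plain SQL works;
    # the index and column names below are placeholders.
    for row in conn.execute(text('select agent from "logstash-2019.10.28" limit 5')):
        print(row)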
Elasticsearch has a default limit of 10000 rows, so you can either increase this limit on your cluster
or set Superset's row limit in the config ::
ROW_LIMIT = 10000
You can query multiple indices in SQL Lab, for example ::
select timestamp, agent from "logstash-*"
However, to use visualizations over multiple indices you need to create an alias index on your cluster ::
POST /_aliases
{
"actions" : [
{ "add" : { "index" : "logstash-**", "alias" : "logstash_all" } }
]
}
Then register your table with the ``alias`` name ``logstash_all``.
Snowflake
---------


@@ -115,6 +115,7 @@ setup(
"mysql": ["mysqlclient==1.4.2.post1"],
"postgres": ["psycopg2-binary==2.7.5"],
"presto": ["pyhive[presto]>=0.4.0"],
"elasticsearch": ["elasticsearch-dbapi>=0.1.0, <0.2.0"],
"druid": ["pydruid==0.5.7", "requests==2.22.0"],
},
python_requires="~=3.6",


@@ -0,0 +1,47 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=C,R,W
from datetime import datetime
from typing import Dict
from superset.db_engine_specs.base import BaseEngineSpec
class ElasticSearchEngineSpec(BaseEngineSpec):
engine = "elasticsearch"
time_groupby_inline = True
time_secondary_columns = True
allows_joins = False
allows_subqueries = True
_time_grain_functions = {
None: "{col}",
"PT1S": "HISTOGRAM({col}, INTERVAL 1 SECOND)",
"PT1M": "HISTOGRAM({col}, INTERVAL 1 MINUTE)",
"PT1H": "HISTOGRAM({col}, INTERVAL 1 HOUR)",
"P1D": "HISTOGRAM({col}, INTERVAL 1 DAY)",
"P1M": "HISTOGRAM({col}, INTERVAL 1 MONTH)",
"P1Y": "HISTOGRAM({col}, INTERVAL 1 YEAR)",
}
type_code_map: Dict[int, str] = {} # loaded from get_datatype only if needed
@classmethod
def convert_dttm(cls, target_type: str, dttm: datetime) -> str:
if target_type.upper() in ("DATETIME", "DATE"):
return f"'{dttm.isoformat()}'"
return f"'{dttm.strftime('%Y-%m-%d %H:%M:%S')}'"