From 0d418be7f95de1b21c0a939f6b92284d35519c31 Mon Sep 17 00:00:00 2001 From: Jonathan Roberts Date: Wed, 10 Mar 2021 17:02:15 +0000 Subject: [PATCH 1/6] Add pyodbc to setup deps --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 98089dc7..3a01183e 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,8 @@ def run(self): "requests", # Postgres "psycopg2-binary", + # Databricks + "pyodbc", # Sqlalchemy "sqlalchemy>1.3", # Athena From 857d0cb9e196539195c2838e4e82cf1de3d56f5b Mon Sep 17 00:00:00 2001 From: Jonathan Roberts Date: Wed, 10 Mar 2021 17:37:47 +0000 Subject: [PATCH 2/6] Add base databricks client and register as db --- src/tentaclio/__init__.py | 1 + src/tentaclio/clients/__init__.py | 1 + src/tentaclio/clients/databricks_client.py | 7 +++++++ 3 files changed, 9 insertions(+) create mode 100644 src/tentaclio/clients/databricks_client.py diff --git a/src/tentaclio/__init__.py b/src/tentaclio/__init__.py index fd2f4eab..de786ce3 100644 --- a/src/tentaclio/__init__.py +++ b/src/tentaclio/__init__.py @@ -64,6 +64,7 @@ # Db registry DB_REGISTRY.register("postgresql", PostgresClient) DB_REGISTRY.register("awsathena+rest", AthenaClient) +DB_REGISTRY.register("databricks+pyodbc", DatabricksClient) COPIER_REGISTRY.register("s3+s3", S3Client("s3://")) diff --git a/src/tentaclio/clients/__init__.py b/src/tentaclio/clients/__init__.py index 2f4b9e29..20b2786e 100644 --- a/src/tentaclio/clients/__init__.py +++ b/src/tentaclio/clients/__init__.py @@ -15,3 +15,4 @@ from .base_client import * # noqa from .local_fs_client import * # noqa from .google_drive_client import * # noqa +from .databricks_client import * # noqa diff --git a/src/tentaclio/clients/databricks_client.py b/src/tentaclio/clients/databricks_client.py new file mode 100644 index 00000000..b4fc83c7 --- /dev/null +++ b/src/tentaclio/clients/databricks_client.py @@ -0,0 +1,7 @@ +from . import sqla_client + + +class DatabricksClient(sqla_client.SQLAlchemyClient): + """Databricks client, backed by a pyodbc + SQLAlchemy connection""" + + pass From 20f2ca2362d81c8327e530a8c2b2b655a3e79f64 Mon Sep 17 00:00:00 2001 From: Jonathan Roberts Date: Wed, 17 Mar 2021 10:27:39 +0000 Subject: [PATCH 3/6] Build odbc connection string map from url --- src/tentaclio/clients/databricks_client.py | 36 +++++++++++- tests/unit/clients/test_databricks_client.py | 61 ++++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 tests/unit/clients/test_databricks_client.py diff --git a/src/tentaclio/clients/databricks_client.py b/src/tentaclio/clients/databricks_client.py index b4fc83c7..a74964f4 100644 --- a/src/tentaclio/clients/databricks_client.py +++ b/src/tentaclio/clients/databricks_client.py @@ -1,7 +1,39 @@ +"""Databricks query client.""" +from typing import Dict +import urllib + +from sqlalchemy.engine import Connection, create_engine + from . import sqla_client class DatabricksClient(sqla_client.SQLAlchemyClient): - """Databricks client, backed by a pyodbc + SQLAlchemy connection""" + """Databricks client, backed by a pyodbc + SQLAlchemy connection.""" + + def _connect(self) -> Connection: + odbc_connection_map = self._build_odbc_connection_dict() + connection_url = build_odbc_connection_string(**odbc_connection_map) + + if self.engine is None: + self.engine = create_engine( + f"mssql+pyodbc:///?odbc_connect={connection_url}" + ) + return self.engine.connect() + + def _build_odbc_connection_dict(self) -> Dict: + odbc_connection_string_map = { + "UID": "token", + "PWD": self.username, + "HOST": self.host, + "PORT": self.port, + "Schema": self.database, + } + if self.url.query: + odbc_connection_string_map.update(self.url.query) + return odbc_connection_string_map + - pass +def build_odbc_connection_string(**kwargs) -> str: + """Build a url formatted odbc connection string from kwargs.""" + connection_url = ";".join([f"{k}={v}" for k, v in kwargs.items()]) + return urllib.parse.quote(connection_url) diff --git a/tests/unit/clients/test_databricks_client.py b/tests/unit/clients/test_databricks_client.py new file mode 100644 index 00000000..6255ea10 --- /dev/null +++ b/tests/unit/clients/test_databricks_client.py @@ -0,0 +1,61 @@ +import pytest +from tentaclio.clients.databricks_client import ( + build_odbc_connection_string, + DatabricksClient, +) +from typing import Dict + + +@pytest.mark.parametrize( + "url, expected", + [ + ( + "databricks+pyodbc://my_t0k3n@db_host:443/database", + { + "UID": "token", + "PWD": "my_t0k3n", + "HOST": "db_host", + "PORT": 443, + "Schema": "database", + } + ), + ( + "databricks+pyodbc://my_t0k3n@db_host:443/", + { + "UID": "token", + "PWD": "my_t0k3n", + "HOST": "db_host", + "PORT": 443, + "Schema": "", + } + ), + ( + "databricks+pyodbc://my_t0k3n@db_host:443/database" + "?HTTPPath=sql/protocolv1/&AuthMech=3&SparkServerType=3" + "&ThriftTransport=2&SSL=1&IgnoreTransactions=1&DRIVER=/path/to/driver", + { + "UID": "token", + "PWD": "my_t0k3n", + "HOST": "db_host", + "PORT": 443, + "Schema": "database", + "AuthMech": '3', + "HTTPPath": "sql/protocolv1/", + "IgnoreTransactions": "1", + "SSL": "1", + "ThriftTransport": "2", + "SparkServerType": "3", + "DRIVER": "/path/to/driver" + } + ) + ], +) +def test_build_odbc_connection_dict(url: str, expected: Dict): + output = DatabricksClient(url)._build_odbc_connection_dict() + assert output == expected + + +def test_build_odbc_connection_string(): + conn_dict = {"UID": "user", "PWD": "p@ssw0rd", "HOST": "db_host", "PORT": 443} + output = build_odbc_connection_string(**conn_dict) + assert output == "UID%3Duser%3BPWD%3Dp%40ssw0rd%3BHOST%3Ddb_host%3BPORT%3D443" From db5150e4de95a4f00fd9ea16e1e7cf597db286e2 Mon Sep 17 00:00:00 2001 From: Jonathan Roberts Date: Wed, 17 Mar 2021 11:45:18 +0000 Subject: [PATCH 4/6] Remove params kwarg from sqla_client --- src/tentaclio/clients/sqla_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tentaclio/clients/sqla_client.py b/src/tentaclio/clients/sqla_client.py index d38aa732..ac110f63 100644 --- a/src/tentaclio/clients/sqla_client.py +++ b/src/tentaclio/clients/sqla_client.py @@ -119,19 +119,19 @@ def delete_schema(self, meta_data: MetaData) -> None: # Query methods: @decorators.check_conn - def query(self, sql_query: str, params: dict = None, **kwargs) -> result.ResultProxy: + def query(self, sql_query: str, **kwargs) -> result.ResultProxy: """Execute a read-only SQL query, and return results. This will not commit any changes to the database. """ - return self.conn.execute(sql_query, params=params, **kwargs) + return self.conn.execute(sql_query, **kwargs) @decorators.check_conn - def execute(self, sql_query: str, params: dict = None, **kwargs) -> None: + def execute(self, sql_query: str, **kwargs) -> None: """Execute a raw SQL query command.""" trans = self.conn.begin() try: - self.conn.execute(sql_query, params=params, **kwargs) + self.conn.execute(sql_query, **kwargs) except Exception: trans.rollback() raise From 6736a0f105203e8d7557bebb47613eb57f5f3f5a Mon Sep 17 00:00:00 2001 From: Jonathan Roberts Date: Mon, 22 Mar 2021 16:44:48 +0000 Subject: [PATCH 5/6] Update readme to include databricks conn --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 91812b76..6f0dc855 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,7 @@ The supported url protocols are: * `sftp://path/to/file` * `http://host.com/path/to/resource` * `https://host.com/path/to/resource` +* `databricks+pyodbc://host/database` * `postgresql://host/database::table` will allow you to write from a csv format into a database with the same column names (note that the table goes after `::` :warning:). You can add the credentials for any of the urls in order to access protected resources. @@ -287,6 +288,23 @@ The token file has been saved in a default location '~/.tentaclio_google_drive.j The `credentials.json` file is not longer need, feel free to delete it. +## Configuring access to Databricks + +In order to use Tentaclio to connect to a Databricks cluster or SQL endpoint, it is necessary to install the required +[ODBC driver](https://databricks.com/spark/odbc-drivers-download) for your operating system. + +Once installed, it is possible to access Databricks as you would any supported URL protocol. However, +it is likely that you will have to pass some [additional variables](https://docs.databricks.com/integrations/bi/jdbc-odbc-bi.html) +in the URL query string, including the path to the installed driver. + +For example, if your Databricks connection requires you to set DRIVER and HTTPPATH values, +the URL should look like this: + +``` +databricks+pyodbc://@/?DRIVER=&HTTPPath= +``` + + ## Quick note on protocols structural subtyping. In order to abstract concrete dependencies from the implementation of data related functions (or in any part of the system really) we use typed [protocols](https://mypy.readthedocs.io/en/latest/protocols.html#simple-user-defined-protocols). This allows a more flexible dependency injection than using subclassing or [more complex approches](http://code.activestate.com/recipes/413268/). This idea is heavily inspired by how this exact thing is done in [go](https://www.youtube.com/watch?v=ifBUfIb7kdo). Learn more about this principle in our [tech blog](https://tech.octopus.energy/news/2019/03/21/python-interfaces-a-la-go.html). From 019aa5a42b022387b5712050f212757a42bcdec6 Mon Sep 17 00:00:00 2001 From: Jonathan Roberts Date: Tue, 23 Mar 2021 11:17:37 +0000 Subject: [PATCH 6/6] Bump version -> 0.0.15 --- CHANGELOG.txt | 4 ++++ setup.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 5f1e28d3..acddafce 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.15] - 2021-03-23 +### Addition + - Add Databricks db client + ## [0.0.14] - 2021-03-17 ### Fix - Temporarily pin sqlalchemy to <1.4 due to deprecated ResultProxy interface diff --git a/setup.py b/setup.py index 23d30654..4432ebdb 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from setuptools.command.install import install -VERSION = "0.0.14" +VERSION = "0.0.15" REPO_ROOT = pathlib.Path(__file__).parent