Merge pull request #8 from datarevenue-berlin/ah-adlfs
Implement azure data lake filesystem
Showing 11 changed files with 278 additions and 12 deletions.
One of the existing modules gains an optional import of the new filesystem, so that importing the package registers the `adl` scheme (hunk `@@ -18,3 +18,7 @@`):

```diff
@@ -18,3 +18,7 @@
 except ImportError:
     pass

+try:
+    import drfs.filesystems.azure_datalake
+except ImportError:
+    pass
```
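Once that import succeeds, the `adl` scheme should resolve through `get_fs` like any other registered filesystem. A minimal sketch, assuming a store named `mystore` and placeholder credentials, and assuming that `opts` is forwarded to the filesystem constructor the same way the luigi target below forwards its storage options:

```python
from drfs.filesystems import get_fs

# Hypothetical store name and service-principal credentials -- not real values.
fs = get_fs(
    "adl://mystore/raw/events.csv",
    opts={"tenant_id": "...", "client_id": "...", "client_secret": "..."},
    rtype="instance",
)
print(type(fs))  # expected: AzureDataLakeFileSystem, once the module is importable
```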
New file (99 added lines): the `drfs.filesystems.azure_datalake` module.

```python
from azure.datalake.store import lib, AzureDLFileSystem
from drfs.filesystems.base import FileSystemBase, FILESYSTEMS


class AzureDataLakeFileSystem(FileSystemBase):
    fs_cls = AzureDLFileSystem
    scheme = "adl"
    is_remote = True
    supports_scheme = False

    def __init__(self, tenant_id=None, client_id=None, client_secret=None, **kwargs):
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.kwargs = kwargs
        # self.kwargs['store_name'] = kwargs['host']
        token = lib.auth(
            tenant_id=self.tenant_id,
            client_id=self.client_id,
            client_secret=self.client_secret,
        )
        self.kwargs["token"] = token
        self.fs = AzureDLFileSystem(**self.kwargs)

    def _parse_store_name(self, path):
        from drfs.path import RemotePath

        if not isinstance(path, RemotePath):
            path = RemotePath(path)

        store_name, path = path.hostname, path.path
        if store_name == "":
            raise ValueError(
                "Can't connect without store name. Please provide the path in the "
                "following form: 'adl://STORE_NAME/folder/file.extension'!"
            )
        return store_name, path

    def _connect(self, path):
        self.fs.kwargs["store_name"], path = self._parse_store_name(path)
        self.fs.connect()
        return path

    def _add_store_name(self, p):
        from drfs.path import RemotePath

        parts = p.parts
        part0 = parts[0].split("/")[2]
        drv = parts[0].replace(part0, self.fs.kwargs["store_name"])
        return RemotePath(drv, part0, *parts[1:])

    def ls(self, path, *args, **kwargs):
        path = self._connect(path)
        return [self._add_store_name(p) for p in super().ls(path, *args, **kwargs)]

    def open(self, path, *args, **kwargs):
        path = self._connect(path)
        return [self._add_store_name(p) for p in super().open(path, *args, **kwargs)]

    def exists(self, path, *args, **kwargs):
        path = self._connect(path)
        return [self._add_store_name(p) for p in super().exists(path, *args, **kwargs)]

    def remove(self, path, *args, **kwargs):
        path = self._connect(path)
        return [self._add_store_name(p) for p in super().remove(path, *args, **kwargs)]

    def mv(self, path, *args, **kwargs):
        path = self._connect(path)
        return [self._add_store_name(p) for p in super().mv(path, *args, **kwargs)]

    def makedirs(self, path, *args, **kwargs):
        path = self._connect(path)
        return [
            self._add_store_name(p) for p in super().makedirs(path, *args, **kwargs)
        ]

    def rmdir(self, path, *args, **kwargs):
        path = self._connect(path)
        return [self._add_store_name(p) for p in super().rmdir(path, *args, **kwargs)]

    def info(self, path, *args, **kwargs):
        path = self._connect(path)
        return [self._add_store_name(p) for p in super().info(path, *args, **kwargs)]

    def walk(self, *args, **kwargs):
        arg0 = self._connect(args[0])
        return [
            self._add_store_name(p) for p in super().walk(arg0, *args[1:], **kwargs)
        ]

    def glob(self, *args, **kwargs):
        arg0 = self._connect(args[0])
        return [
            self._add_store_name(p) for p in super().glob(arg0, *args[1:], **kwargs)
        ]


FILESYSTEMS[AzureDataLakeFileSystem.scheme] = AzureDataLakeFileSystem
```
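For direct use, the class takes service-principal credentials and full `adl://` URLs; each call parses the store name out of the URL, connects, and returns `RemotePath` objects with the store name added back. A short sketch, assuming a data lake store named `mystore` and placeholder credentials:

```python
from drfs.filesystems.azure_datalake import AzureDataLakeFileSystem

# Placeholder tenant/client values -- substitute a real service principal.
fs = AzureDataLakeFileSystem(
    tenant_id="00000000-0000-0000-0000-000000000000",
    client_id="my-client-id",
    client_secret="my-client-secret",
)

# ls() connects to the store named in the URL and returns RemotePath objects
# carrying the adl:// scheme and store name.
files = fs.ls("adl://mystore/raw/")
print(files)
```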
New file (35 added lines): a luigi target built on `FileSystemTarget` that infers the filesystem from the path.

```python
import os

from drfs.filesystems import get_fs

try:
    from luigi.target import FileSystemTarget
except ImportError:
    raise ImportError('Could not import luigi library. Try installing it.')


class FileTarget(FileSystemTarget):

    def __init__(self, path, **kwargs):
        """Target for any kind of storage. Infers file system automatically.

        Parameters
        ----------
        path: str
            Path to the file.
        **kwargs
            Will be used as filesystem options. (Options from settings are used
            by default, you can overwrite them here.)
        """
        super(FileTarget, self).__init__(str(path))
        self.storage_options = kwargs

    @property
    def fs(self):
        return get_fs(self.path, opts=self.storage_options, rtype='instance')

    def open(self, *args, **kwargs):
        return self.fs.open(self.path, *args, **kwargs)

    def makedirs(self, *args, **kwargs):
        self.fs.makedirs(os.path.dirname(self.path), *args, **kwargs)
```
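A sketch of how this target might be wired into a task; the task, paths, credentials, and the import path of `FileTarget` are illustrative and not part of the diff:

```python
import luigi

# The module path of FileTarget is not visible in this diff; adjust the import
# to wherever the file was actually added.
from drfs.targets import FileTarget  # hypothetical import path


class ExportReport(luigi.Task):
    """Toy task writing to Azure Data Lake via the generic FileTarget."""

    def output(self):
        # Extra keyword arguments become storage_options, i.e. filesystem
        # options per the docstring above; the store name and credentials
        # here are placeholders.
        return FileTarget(
            "adl://mystore/reports/report.csv",
            tenant_id="...", client_id="...", client_secret="...",
        )

    def run(self):
        with self.output().open("w") as f:
            f.write("col_a,col_b\n1,2\n")
```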
import pytest | ||
from azure.datalake.store import AzureDLFileSystem | ||
from mock import MagicMock | ||
|
||
from drfs.filesystems import azure_datalake | ||
|
||
|
||
@pytest.fixture(autouse=True) | ||
def mock_azure_fs_native(monkeypatch): | ||
fs = MagicMock(spec=AzureDLFileSystem) | ||
fs.ls.return_value = [ | ||
'folder/directory/file.txt', | ||
'folder/directory/file2.txt', | ||
'folder/directory/file3.txt' | ||
] | ||
fs.glob.return_value = [ | ||
'folder/directory/file.txt', | ||
'folder/directory/file2.txt', | ||
'folder/directory/file3.txt' | ||
] | ||
fs.kwargs = {} | ||
cls = MagicMock() | ||
cls.return_value = fs | ||
monkeypatch.setattr(azure_datalake, 'AzureDLFileSystem', cls) | ||
monkeypatch.setattr(azure_datalake.lib, 'auth', lambda *args, **kwargs: 'token') | ||
|
||
|
||
def test_custom_connect(): | ||
fs = azure_datalake.AzureDataLakeFileSystem() | ||
path = fs._connect('adl://intvanprofi/some/path.txt') | ||
assert fs.fs.kwargs['store_name'] == 'intvanprofi' | ||
assert not path.startswith('adl://intvanprofi') | ||
|
||
|
||
def test_ls(): | ||
fs = azure_datalake.AzureDataLakeFileSystem() | ||
res = fs.ls('adl://intvanprofi/some/path/to/directory') | ||
|
||
fs.fs.ls.assert_called_once_with('/some/path/to/directory') | ||
for p in res: | ||
assert p.hostname == 'intvanprofi' | ||
assert p.scheme == 'adl' | ||
|
||
|
||
def test_glob(): | ||
fs = azure_datalake.AzureDataLakeFileSystem() | ||
res = fs.glob('adl://intvanprofi/some/path/to/*.csv') | ||
|
||
fs.fs.glob.assert_called_once_with('/some/path/to/*.csv') | ||
|
||
for p in res: | ||
assert p.hostname == 'intvanprofi' | ||
assert p.scheme == 'adl' |
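Because the fixture patches out both `AzureDLFileSystem` and `lib.auth`, these tests run without Azure credentials. Assuming `pytest`, `mock`, and `azure-datalake-store` are installed, something like the following would select them (the `-k` expression is an assumption; adjust it to the actual test file name):

```python
import pytest

# Run only the Azure Data Lake tests from the current directory.
pytest.main(["-v", "-k", "azure"])
```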