Implement multipart upload with default block_size=100MB (#23)
* implement multipart upload with default block_size=100MB
* fix get_sync imports
kayibal authored Nov 14, 2017
1 parent 05867f6 commit 060d09f
Showing 3 changed files with 29 additions and 11 deletions.
17 changes: 11 additions & 6 deletions sparsity/io.py
@@ -28,7 +28,7 @@ def traildb_to_coo(db, fieldname):
     sparse.coo_matrix((np.ones(num_events), (r_idx, c_idx)))


-def to_npz(sf, filename):
+def to_npz(sf, filename, block_size=None):
     data = _csr_to_dict(sf.data)
     data['metadata'] = \
         {'multiindex': True if isinstance(sf.index, pd.MultiIndex) else False}
@@ -40,17 +40,22 @@ def to_npz(sf, filename):
         fp = open(filename, 'wb')
         np.savez(fp, **data)
     else:
-        _save_npz_s3(data, filename)
+        _save_npz_s3(data, filename, block_size)


-def _save_npz_s3(data, filename):
+def _save_npz_s3(data, filename, block_size=None):
+    if block_size is None:
+        block_size = 2**20 * 100  # 100 MB
     buffer = BytesIO()
     np.savez(buffer, **data)
     buffer.seek(0)
     fs = S3FileSystem()
-    fp = fs.open(filename, 'wb')
-    fp.write(buffer.read())
+    with fs.open(filename, 'wb', block_size) as s3f:
+        while True:
+            data = buffer.read(block_size)
+            if len(data) == 0:
+                break
+            s3f.write(data)

 def read_npz(filename):
     open_f = open if not filename.startswith('s3://') \
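Under the hood, s3fs opens the destination with the given block_size and switches to S3's multipart upload once more than one block has been written; feeding the serialized buffer to the file object in block_size chunks keeps memory usage bounded rather than handing over one giant read(). A minimal standalone sketch of the same pattern, assuming s3fs is installed and AWS credentials are configured; the bucket and key are hypothetical:

from io import BytesIO

import numpy as np
from s3fs import S3FileSystem

block_size = 2**20 * 100  # 100 MB, the default this commit introduces

# Serialize the arrays into an in-memory npz archive, then stream it out.
buffer = BytesIO()
np.savez(buffer, values=np.arange(10))
buffer.seek(0)

fs = S3FileSystem()
# Opening with block_size makes s3fs buffer writes and issue a multipart
# upload as soon as the written data exceeds a single block.
with fs.open('s3://my-bucket/frame.npz', 'wb', block_size=block_size) as s3f:
    for chunk in iter(lambda: buffer.read(block_size), b''):
        s3f.write(chunk)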
19 changes: 16 additions & 3 deletions sparsity/sparse_frame.py
@@ -593,9 +593,22 @@ def read_npz(cls, filename):
""""Read from numpy npz format."""
return cls(*read_npz(filename))

def to_npz(self, filename):
"""Save to numpy npz format."""
to_npz(self, filename)
def to_npz(self, filename, block_size=None):
"""Save to numpy npz format.
Parameters
----------
filename: str
path to local file ot s3 path starting with `s3://`
block_size: int
block size in bytes only has effect if uploading to s3
if set to None default block will be 100MB
Returns
-------
None
"""
to_npz(self, filename, block_size)


def _aligned_csr_elop(a, b, a_idx, b_idx, op='_plus_', how='outer'):
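From the caller's side the new parameter is optional. A hedged usage sketch, with the frame contents and the s3 destination made up for illustration (AWS requires multipart parts of at least 5MB):

import numpy as np
import sparsity as sp

sf = sp.SparseFrame(np.identity(3), columns=['a', 'b', 'c'])

sf.to_npz('frame.npz')                        # local file; block_size is ignored
sf.to_npz('s3://my-bucket/frame.npz')         # s3 upload with the 100MB default
sf.to_npz('s3://my-bucket/frame.npz',
          block_size=2**20 * 8)               # s3 upload with 8MB blocks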
4 changes: 2 additions & 2 deletions sparsity/test/test_dask_sparse_frame.py
@@ -8,13 +8,13 @@
 import pytest
 import sparsity as sp
 import sparsity.dask as dsp
-from dask.async import get_sync
+from dask.local import get_sync
 from sparsity import sparse_one_hot
 from sparsity.dask.reshape import one_hot_encode

 from .conftest import tmpdir

-dask.context.set_options(get=dask.async.get_sync)
+dask.context.set_options(get=dask.local.get_sync)


 @pytest.fixture
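get_sync moved when the dask.async module was renamed to dask.local (around dask 0.15.1), which is what this import fix tracks. A codebase that has to run against both old and new dask releases could hedge with a fallback import, sketched here:

try:
    from dask.local import get_sync   # dask >= 0.15.1
except ImportError:                   # older releases still ship dask.async
    from dask.async import get_sync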
