feat: add pre-commit hooks and format existing code
This commit introduces pre-commit hooks to ensure that Python and Rust code adheres to the specified style guidelines and quality standards. The following hooks have been added to the .pre-commit-config.yaml file:

- pre-commit-hooks:
  - trailing-whitespace: Removes trailing whitespace
  - end-of-file-fixer: Ensures files end with a newline
  - check-yaml: Validates YAML files
  - check-json: Validates JSON files
  - check-added-large-files: Prevents large files from being added
  - check-merge-conflict: Detects merge conflict markers

- black:
  - Formats Python code according to Black's style guide with a max line length of 120 characters

- flake8:
  - Lints Python code with Flake8 plus the flake8-bugbear plugin, ignoring F401 errors and allowing a max line length of 120 characters (a sketch of the kind of bug bugbear catches follows this list)

- mypy:
  - Performs static type checking on Python code, ignoring missing imports via the --ignore-missing-imports flag

- local:
  - cargo fmt: Formats Rust code using rustfmt

These hooks will run automatically before each commit, helping to maintain a consistent code style and quality across the project.
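
As a concrete illustration, here is a minimal sketch of the class of defect flake8-bugbear is added to catch. This is hypothetical code, not from this repository; the function names are invented for the example:

    from typing import List, Optional


    # flake8-bugbear flags B006 here: the default list is created once at
    # definition time and shared across every call, so appended tags leak
    # between invocations.
    def add_tag(tag: str, tags: List[str] = []) -> List[str]:
        tags.append(tag)
        return tags


    # The bugbear-clean pattern: default to None and build a fresh list
    # on each call.
    def add_tag_fixed(tag: str, tags: Optional[List[str]] = None) -> List[str]:
        if tags is None:
            tags = []
        tags.append(tag)
        return tags

Black then keeps whichever version is committed formatted within the configured 120-character limit.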

Additionally, this commit reformats the current codebase to comply with the new style guidelines enforced by the pre-commit hooks.

BREAKING CHANGE: Developers must now have pre-commit installed and configured to contribute to the project.
yanghao14 committed Jul 27, 2024
1 parent 36316e0 commit 9dbd569
Showing 4 changed files with 130 additions and 43 deletions.
39 changes: 39 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,39 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-json
      - id: check-added-large-files
      - id: check-merge-conflict

  - repo: https://github.com/psf/black
    rev: 24.4.2
    hooks:
      - id: black
        language_version: python3
        args: ["--line-length", "120"]

  - repo: https://github.com/PyCQA/flake8
    rev: 7.1.0
    hooks:
      - id: flake8
        additional_dependencies: [flake8-bugbear]
        args: ["--max-line-length", "120", "--ignore=F401"]

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.11.0
    hooks:
      - id: mypy
        args: ["--ignore-missing-imports"]

  - repo: local
    hooks:
      - id: cargo-fmt
        name: cargo fmt
        entry: cargo fmt --
        language: system
        types: [rust]
        pass_filenames: false
2 changes: 2 additions & 0 deletions python/Makefile
@@ -30,6 +30,8 @@ setup-venv: ## Setup the virtualenv
 setup: ## Setup the requirements
 	$(info --- Setup dependencies ---)
 	pip install "$(MATURIN_VERSION)"
+	pip install pre-commit
+	pre-commit install

 .PHONY: build
 build: setup ## Build Python binding of delta-rs
15 changes: 3 additions & 12 deletions python/hudi/_internal.pyi
@@ -21,7 +21,6 @@ import pyarrow

 __version__: str

-
 @dataclass(init=False)
 class HudiFileSlice:
     file_group_id: str
@@ -33,24 +32,16 @@ class HudiFileSlice:

     def base_file_relative_path(self) -> str: ...

-
 @dataclass(init=False)
 class HudiTable:
-
     def __init__(
-            self,
-            table_uri: str,
-            options: Optional[Dict[str, str]] = None,
+        self,
+        table_uri: str,
+        options: Optional[Dict[str, str]] = None,
     ): ...
-
     def get_schema(self) -> "pyarrow.Schema": ...
-
     def split_file_slices(self, n: int) -> List[List[HudiFileSlice]]: ...
-
     def get_file_slices(self) -> List[HudiFileSlice]: ...
-
     def read_file_slice(self, base_file_relative_path) -> pyarrow.RecordBatch: ...
-
     def read_snapshot(self) -> List["pyarrow.RecordBatch"]: ...
-
     def read_snapshot_as_of(self, timestamp: str) -> List["pyarrow.RecordBatch"]: ...
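
For orientation before the test diff, a minimal usage sketch of the interface stubbed above. The import path and table URI are assumptions, and the timestamp option mirrors the one exercised in the tests below:

    import pyarrow as pa

    from hudi import HudiTable  # import path assumed from the package layout

    # Hypothetical table URI -- replace with a real Hudi table location.
    table = HudiTable("/tmp/trips_table")

    print(table.get_schema().names)  # Hudi meta-columns plus the data columns

    # Latest snapshot as one pyarrow Table.
    t = pa.Table.from_batches(table.read_snapshot())

    # Time travel: read the table as of an earlier commit timestamp.
    old = HudiTable("/tmp/trips_table", {"hoodie.read.as.of.timestamp": "20240402123035233"})
    old_t = pa.Table.from_batches(old.read_snapshot())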
117 changes: 86 additions & 31 deletions python/tests/test_table_read.py
@@ -28,20 +28,32 @@ def test_sample_table(get_sample_table):
     table_path = get_sample_table
     table = HudiTable(table_path)

-    assert table.get_schema().names == ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
-                                        '_hoodie_partition_path', '_hoodie_file_name', 'ts', 'uuid', 'rider', 'driver',
-                                        'fare', 'city']
+    assert table.get_schema().names == [
+        "_hoodie_commit_time",
+        "_hoodie_commit_seqno",
+        "_hoodie_record_key",
+        "_hoodie_partition_path",
+        "_hoodie_file_name",
+        "ts",
+        "uuid",
+        "rider",
+        "driver",
+        "fare",
+        "city",
+    ]

     file_slices = table.get_file_slices()
     assert len(file_slices) == 5
-    assert set(f.commit_time for f in file_slices) == {'20240402123035233', '20240402144910683'}
+    assert set(f.commit_time for f in file_slices) == {"20240402123035233", "20240402144910683"}
     assert all(f.num_records == 1 for f in file_slices)
     file_slice_paths = [f.base_file_relative_path() for f in file_slices]
-    assert set(file_slice_paths) == {'chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet',
-                                     'san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet',
-                                     'san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet',
-                                     'san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet',
-                                     'sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet'}
+    assert set(file_slice_paths) == {
+        "chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",
+        "san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet",
+        "san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet",
+        "san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet",
+        "sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet",
+    }

     batch = table.read_file_slice(file_slice_paths[0])
     t = pa.Table.from_batches([batch])
@@ -54,28 +66,71 @@ def test_sample_table(get_sample_table):

     batches = table.read_snapshot()
     t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
-    assert t.to_pylist() == [{'_hoodie_commit_time': '20240402144910683', 'ts': 1695046462179,
-                              'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 339.0},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
-                              'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
-                              'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
-                              'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
-                              'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
+    assert t.to_pylist() == [
+        {
+            "_hoodie_commit_time": "20240402144910683",
+            "ts": 1695046462179,
+            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
+            "fare": 339.0,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695091554788,
+            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
+            "fare": 27.7,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695115999911,
+            "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
+            "fare": 17.85,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695159649087,
+            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
+            "fare": 19.1,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695516137016,
+            "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
+            "fare": 34.15,
+        },
+    ]

-    table = HudiTable(table_path, {
-        "hoodie.read.as.of.timestamp": "20240402123035233"})
+    table = HudiTable(table_path, {"hoodie.read.as.of.timestamp": "20240402123035233"})
     batches = table.read_snapshot()
     t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
-    assert t.to_pylist() == [{'_hoodie_commit_time': '20240402123035233', 'ts': 1695046462179,
-                              'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 33.9},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
-                              'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
-                              'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
-                              'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
-                             {'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
-                              'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
+    assert t.to_pylist() == [
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695046462179,
+            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
+            "fare": 33.9,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695091554788,
+            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
+            "fare": 27.7,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695115999911,
+            "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
+            "fare": 17.85,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695159649087,
+            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
+            "fare": 19.1,
+        },
+        {
+            "_hoodie_commit_time": "20240402123035233",
+            "ts": 1695516137016,
+            "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
+            "fare": 34.15,
+        },
+    ]
