Skip to content

Commit

Permalink
feat: Update for tree sitter 0.22 with prebuilt wheels (#46)
Browse files Browse the repository at this point in the history
* feat: update for tree-sitter==0.22 with pre-build wheels

* style: fix style

* ci: update build, remove cibuildwheel

* chore: update python version to 3.9 as tree-sitter 0.22 can not use 3.8

* ci: use build pkg for build

* ci: try to fix building for macos

* ci: add separate step for macos with venv

* ci: add separate step for macos with venv

* ci: add separate step for macos with venv

* ci: add separate wheels build/install for macos arm64 for languages

* ci: add separate wheels build/install for macos arm64 for languages

* docs: add note about m1
  • Loading branch information
k4black authored May 15, 2024
1 parent f352975 commit acd8217
Show file tree
Hide file tree
Showing 11 changed files with 170 additions and 164 deletions.
22 changes: 5 additions & 17 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,19 @@ jobs:
echo "VERSION"
echo "CHANGELOG"
external-build-workflow:
needs: update-version-and-changelog
uses: ./.github/workflows/reusable-build.yml
with:
CIBW_SKIP: "pp* cp36-* cp37-*"
CIBW_BUILD: "cp*-macosx* cp*-manylinux* cp*-win*"
CIBW_ARCHS_MACOS: "x86_64 arm64"
CIBW_ARCHS_LINUX: "x86_64 aarch64"
CIBW_ARCHS_WINDOWS: "x86"
VERSION: ${{ github.ref_name }}
secrets: inherit

release-python-package:
needs: [external-build-workflow, update-version-and-changelog]
needs: [update-version-and-changelog]
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/codebleu
permissions:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
steps:
- uses: actions/download-artifact@v4
with:
name: artifact # if `name: artifact` is omitted, the action will create extra parent dir
path: dist
- name: Build wheel
run: |
python3 -m pip install --upgrade build
python3 -m build --wheel --sdist --outdir ./dist
- uses: pypa/gh-action-pypi-publish@release/v1

sync-to-hf-hub:
Expand Down
50 changes: 27 additions & 23 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ jobs:
cache: 'pip' # caching pip dependencies
- name: Install dependencies
run: |
python -m pip install -e .[test]
python -m pip install -e .[all,test]
- name: Run isort check
run: python -m isort codebleu --check
- name: Run black check
run: python -m black codebleu --check
- name: Run ruff check
run: python -m ruff codebleu
run: python -m ruff check codebleu
- name: Run mypy check
run: python -m mypy codebleu

Expand All @@ -41,37 +41,33 @@ jobs:
cache: 'pip' # caching pip dependencies
- name: Install lib from source and dependencies
run: |
python -m pip install -e .[test]
python -m pip install -e .[all,test]
- name: Run tests
run: python -m pytest

external-build-workflow:
needs: [fast-tests-python]
uses: ./.github/workflows/reusable-build.yml
with:
CIBW_SKIP: "pp* cp36-* cp37-*"
CIBW_BUILD: "cp*-macosx* cp*-manylinux* cp*-win*"
CIBW_ARCHS_MACOS: "x86_64 arm64"
CIBW_ARCHS_LINUX: "x86_64 aarch64"
CIBW_ARCHS_WINDOWS: "x86"
secrets: inherit

full-tests-python:
needs: [fast-tests-python, external-build-workflow]
needs: [fast-tests-python]
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
python-version: ['3.9', '3.10', '3.11', '3.12']
os: [ubuntu-latest, macos-latest, macos-13, windows-latest] # at the moment macos-latest=macos-14 is exclusive M1 chip, macos-13 is intel
fail-fast: false
name: Test wheel on ${{ matrix.os }} and Python ${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
with:
# for macos-13 get macos-latest artifacts
name: wheels-${{ matrix.os == 'macos-13' && 'macos-latest' || matrix.os }}
path: dist
- name: Build wheel (macos for "error externally-managed-environment")
if: startsWith(matrix.os, 'macos')
run: |
python3 -m venv .venv
source .venv/bin/activate
python3 -m pip install --upgrade build wheel setuptools
python3 -m build --wheel --sdist --outdir ./dist --no-isolation
- name: Build wheel (all other)
if: "!startsWith(matrix.os, 'macos')"
run: |
python3 -m pip install --upgrade build
python3 -m build --wheel --sdist --outdir ./dist
- name: Show dist files
run: ls -lah ./dist
shell: bash
Expand All @@ -84,14 +80,22 @@ jobs:
run: |
rm -rf ./dist/*.tar.gz
shell: bash
- name: Build tree-sitter languages for arm64 (not available on PyPI for now)
if: startsWith(matrix.os, 'macos-latest')
shell: bash
run: |
languages="python java javascript c-sharp c cpp go ruby rust php"
for lang in $languages; do
python3 -m pip install git+https://github.com/tree-sitter/tree-sitter-$lang
done
- name: Install lib and dependencies
run: |
# force install package from local dist directory
pip uninstall -y codebleu || true
# TODO: check the sdist package is not installed
pip install --upgrade --no-deps --no-index --find-links=./dist codebleu
# install dependencies for the package and tests
pip install .[test]
# install dependencies for the package languages and tests
pip install .[all,test]
- name: Test itself
run: python -m pytest --cov-report=xml
- name: Upload coverage
Expand Down
20 changes: 17 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,20 @@ or directly from git repo (require internet connection to download tree-sitter):
pip install git+https://github.com/k4black/codebleu.git
```

You also have to install the tree-sitter language packages you need (e.g. python, rust, etc.):
```bash
pip install tree-sitter-python
```
Or you can install all languages:
```bash
pip install codebleu[all]
```

Note: At the moment (May 2024), precompiled languages are NOT available for arm64 (M1) macOS, so you have to build and install the tree-sitter languages manually from source, for example:
```bash
pip install git+https://github.com/tree-sitter/tree-sitter-python.git
```


## Usage

Expand Down Expand Up @@ -96,11 +110,11 @@ Make your own fork and clone it:
git clone https://github.com/k4black/codebleu
```

For development, you need to install library (for so file to compile) with `test` extra:
For development, you need to install the library with the `all` extra (precompiled languages) and the `test` extra:
(require internet connection to download tree-sitter)
```bash
python -m pip install -e .[test]
python -m pip install -e .\[test\] # for macos
python -m pip install -e .[all,test]
python -m pip install -e .\[all,test\] # for macos
```

For testing just run pytest:
Expand Down
26 changes: 10 additions & 16 deletions codebleu/codebleu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,9 @@
from typing import Callable, Dict, List, Optional, Tuple, Union

from . import bleu, dataflow_match, syntax_match, weighted_ngram_match
from .utils import AVAILABLE_LANGS, get_tree_sitter_language

PACKAGE_DIR = Path(__file__).parent
AVAILABLE_LANGS = [
"java",
"javascript",
"c_sharp",
"php",
"c",
"cpp",
"python",
"go",
"ruby",
"rust",
] # keywords available


def calc_codebleu(
Expand All @@ -28,7 +17,6 @@ def calc_codebleu(
weights: Tuple[float, float, float, float] = (0.25, 0.25, 0.25, 0.25),
tokenizer: Optional[Callable] = None,
keywords_dir: Path = PACKAGE_DIR / "keywords",
lang_so_file: Path = PACKAGE_DIR / "my-languages.so",
) -> Dict[str, float]:
"""Calculate CodeBLEU score
Expand All @@ -48,7 +36,9 @@ def calc_codebleu(
assert lang in AVAILABLE_LANGS, f"Language {lang} is not supported (yet). Available languages: {AVAILABLE_LANGS}"
assert len(weights) == 4, "weights should be a tuple of 4 floats (alpha, beta, gamma, theta)"
assert keywords_dir.exists(), f"keywords_dir {keywords_dir} does not exist"
assert lang_so_file.exists(), f"lang_so_file {lang_so_file} does not exist"

# get the tree-sitter language for a given language
tree_sitter_language = get_tree_sitter_language(lang)

# preprocess inputs
references = [[x.strip() for x in ref] if isinstance(ref, list) else [ref.strip()] for ref in references]
Expand Down Expand Up @@ -80,10 +70,14 @@ def make_weights(reference_tokens, key_word_list):
weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights, tokenized_hyps)

# calculate syntax match
syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang, str(lang_so_file))
syntax_match_score = syntax_match.corpus_syntax_match(
references, hypothesis, lang, tree_sitter_language=tree_sitter_language
)

# calculate dataflow match
dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang, str(lang_so_file))
dataflow_match_score = dataflow_match.corpus_dataflow_match(
references, hypothesis, lang, tree_sitter_language=tree_sitter_language
)

alpha, beta, gamma, theta = weights
code_bleu_score = (
Expand Down
11 changes: 7 additions & 4 deletions codebleu/dataflow_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Licensed under the MIT license.
import logging

from tree_sitter import Language, Parser
from tree_sitter import Parser

from .parser import (
DFG_csharp,
Expand All @@ -17,6 +17,7 @@
remove_comments_and_docstrings,
tree_to_token_index,
)
from .utils import get_tree_sitter_language

dfg_function = {
"python": DFG_python,
Expand All @@ -36,10 +37,12 @@ def calc_dataflow_match(references, candidate, lang, langso_so_file):
return corpus_dataflow_match([references], [candidate], lang, langso_so_file)


def corpus_dataflow_match(references, candidates, lang, langso_so_file):
LANGUAGE = Language(langso_so_file, lang)
def corpus_dataflow_match(references, candidates, lang, tree_sitter_language=None):
if not tree_sitter_language:
tree_sitter_language = get_tree_sitter_language(lang)

parser = Parser()
parser.set_language(LANGUAGE)
parser.language = tree_sitter_language
parser = [parser, dfg_function[lang]]
match_count = 0
total_count = 0
Expand Down
17 changes: 10 additions & 7 deletions codebleu/syntax_match.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from tree_sitter import Language, Parser
from tree_sitter import Parser

from .parser import (
DFG_csharp,
Expand All @@ -13,6 +13,7 @@
DFG_ruby,
remove_comments_and_docstrings,
)
from .utils import get_tree_sitter_language

dfg_function = {
"python": DFG_python,
Expand All @@ -25,14 +26,16 @@
}


def calc_syntax_match(references, candidate, lang, lang_so_file):
return corpus_syntax_match([references], [candidate], lang, lang_so_file)
def calc_syntax_match(references, candidate, lang):
return corpus_syntax_match([references], [candidate], lang)


def corpus_syntax_match(references, candidates, lang, lang_so_file):
tree_sitter_language = Language(lang_so_file, lang)
def corpus_syntax_match(references, candidates, lang, tree_sitter_language=None):
if not tree_sitter_language:
tree_sitter_language = get_tree_sitter_language(lang)

parser = Parser()
parser.set_language(tree_sitter_language)
parser.language = tree_sitter_language
match_count = 0
match_count_candidate_to_reference = 0
total_count = 0
Expand Down Expand Up @@ -61,7 +64,7 @@ def get_all_sub_trees(root_node):
node_stack.append([root_node, depth])
while len(node_stack) != 0:
cur_node, cur_depth = node_stack.pop()
sub_tree_sexp_list.append([cur_node.sexp(), cur_depth])
sub_tree_sexp_list.append([str(cur_node), cur_depth])
for child_node in cur_node.children:
if len(child_node.children) != 0:
depth = cur_depth + 1
Expand Down
75 changes: 75 additions & 0 deletions codebleu/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@

from itertools import chain

from tree_sitter import Language

# Language identifiers accepted by this package. `get_tree_sitter_language`
# asserts that the requested language is in this list before loading a grammar.
AVAILABLE_LANGS = [
"java",
"javascript",
"c_sharp",
"php",
"c",
"cpp",
"python",
"go",
"ruby",
"rust",
] # keywords available


def pad_sequence(
sequence,
Expand Down Expand Up @@ -104,3 +119,63 @@ def ngrams(
history.append(item)
yield tuple(history)
del history[0]


def get_tree_sitter_language(lang: str) -> Language:
    """
    Get the tree-sitter language for a given language.

    The grammar is loaded lazily from the separately installed
    ``tree_sitter_<lang>`` module, so only the grammar that is actually
    requested has to be installed.

    :param lang: the language name to get the tree-sitter language for
        (must be one of ``AVAILABLE_LANGS``)
    :return: the tree-sitter language
    :raises AssertionError: if ``lang`` is not in ``AVAILABLE_LANGS``
    :raises ImportError: if the grammar module for ``lang`` cannot be imported
    """
    import importlib

    assert lang in AVAILABLE_LANGS, f"Language {lang} not available. Available languages: {AVAILABLE_LANGS}"

    # Every supported grammar lives in a module named `tree_sitter_<lang>`
    # (e.g. `tree_sitter_c_sharp`), so one dynamic import replaces the
    # per-language if/elif chain.
    try:
        grammar = importlib.import_module(f"tree_sitter_{lang}")
    except ImportError as err:
        # Show the hyphenated distribution name (`tree-sitter-c-sharp`) in the
        # install hint, and chain the original error so the root cause is kept.
        package_name = "tree-sitter-" + lang.replace("_", "-")
        raise ImportError(
            f"Tree-sitter language for {lang} not available. "
            f"Please install the language parser using `pip install {package_name}`."
        ) from err

    # `tree_sitter_php` may not provide `language()`; fall back to
    # `language_php()` in that case.
    if lang == "php" and not hasattr(grammar, "language"):
        return Language(grammar.language_php())

    return Language(grammar.language())
Loading

0 comments on commit acd8217

Please sign in to comment.