From acd82178f75186d8d29743c14bfb8209146fa0ab Mon Sep 17 00:00:00 2001 From: Konstantin Chernyshev <38007247+k4black@users.noreply.github.com> Date: Wed, 15 May 2024 08:13:06 +0200 Subject: [PATCH] feat: Update for tree sitter 0.22 with prebuilt wheels (#46) * feat: update for tree-sitter==0.22 with pre-build wheels * style: fix style * ci: update build, remove cibuildwheel * chore: update python version to 3.9 as tree-sitter 0.22 can not use 3.8 * ci: use build pkg for build * ci: try to fix building for macos * ci: add separate step for macos with venv * ci: add separate step for macos with venv * ci: add separate step for macos with venv * ci: add separate wheels build/install for macos arm64 for languages * ci: add separate wheels build/install for macos arm64 for languages * docs: add note about m1 --- .github/workflows/publish.yml | 22 ++------- .github/workflows/test.yml | 50 +++++++++++--------- README.md | 20 ++++++-- codebleu/codebleu.py | 26 ++++------- codebleu/dataflow_match.py | 11 +++-- codebleu/syntax_match.py | 17 ++++--- codebleu/utils.py | 75 ++++++++++++++++++++++++++++++ evaluate_app/README.md | 2 +- evaluate_app/requirements.txt | 2 +- pyproject.toml | 22 +++++++-- setup.py | 87 ----------------------------------- 11 files changed, 170 insertions(+), 164 deletions(-) delete mode 100644 setup.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f933a52..dc09fef 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -46,20 +46,8 @@ jobs: echo "VERSION" echo "CHANGELOG" - external-build-workflow: - needs: update-version-and-changelog - uses: ./.github/workflows/reusable-build.yml - with: - CIBW_SKIP: "pp* cp36-* cp37-*" - CIBW_BUILD: "cp*-macosx* cp*-manylinux* cp*-win*" - CIBW_ARCHS_MACOS: "x86_64 arm64" - CIBW_ARCHS_LINUX: "x86_64 aarch64" - CIBW_ARCHS_WINDOWS: "x86" - VERSION: ${{ github.ref_name }} - secrets: inherit - release-python-package: - needs: [external-build-workflow, 
update-version-and-changelog] + needs: [update-version-and-changelog] runs-on: ubuntu-latest environment: name: pypi @@ -67,10 +55,10 @@ jobs: permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - - uses: actions/download-artifact@v4 - with: - name: artifact # if `name: artifact` is omitted, the action will create extra parent dir - path: dist + - name: Build wheel + run: | + python3 -m pip install --upgrade build + python3 -m build --wheel --sdist --outdir ./dist - uses: pypa/gh-action-pypi-publish@release/v1 sync-to-hf-hub: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7dabf27..e4700c0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,13 +18,13 @@ jobs: cache: 'pip' # caching pip dependencies - name: Install dependencies run: | - python -m pip install -e .[test] + python -m pip install -e .[all,test] - name: Run isort check run: python -m isort codebleu --check - name: Run black check run: python -m black codebleu --check - name: Run ruff check - run: python -m ruff codebleu + run: python -m ruff check codebleu - name: Run mypy check run: python -m mypy codebleu @@ -41,37 +41,33 @@ jobs: cache: 'pip' # caching pip dependencies - name: Install lib from source and dependencies run: | - python -m pip install -e .[test] + python -m pip install -e .[all,test] - name: Run tests run: python -m pytest - external-build-workflow: - needs: [fast-tests-python] - uses: ./.github/workflows/reusable-build.yml - with: - CIBW_SKIP: "pp* cp36-* cp37-*" - CIBW_BUILD: "cp*-macosx* cp*-manylinux* cp*-win*" - CIBW_ARCHS_MACOS: "x86_64 arm64" - CIBW_ARCHS_LINUX: "x86_64 aarch64" - CIBW_ARCHS_WINDOWS: "x86" - secrets: inherit - full-tests-python: - needs: [fast-tests-python, external-build-workflow] + needs: [fast-tests-python] strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] os: [ubuntu-latest, 
macos-latest, macos-13, windows-latest] # at the moment macos-latest=macos-14 is exclusive M1 chip, macos-13 is intel fail-fast: false name: Test wheel on ${{ matrix.os }} and Python ${{ matrix.python-version }} runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - with: - # for macos-13 get macos-latest artifacts - name: wheels-${{ matrix.os == 'macos-13' && 'macos-latest' || matrix.os }} - path: dist + - name: Build wheel (macos for "error externally-managed-environment") + if: startsWith(matrix.os, 'macos') + run: | + python3 -m venv .venv + source .venv/bin/activate + python3 -m pip install --upgrade build wheel setuptools + python3 -m build --wheel --sdist --outdir ./dist --no-isolation + - name: Build wheel (all other) + if: "!startsWith(matrix.os, 'macos')" + run: | + python3 -m pip install --upgrade build + python3 -m build --wheel --sdist --outdir ./dist - name: Show dist files run: ls -lah ./dist shell: bash @@ -84,14 +80,22 @@ jobs: run: | rm -rf ./dist/*.tar.gz shell: bash + - name: Build tree-sitter languages for arm64 (not available on PyPI for now) + if: startsWith(matrix.os, 'macos-latest') + shell: bash + run: | + languages="python java javascript c-sharp c cpp go ruby rust php" + for lang in $languages; do + python3 -m pip install git+https://github.com/tree-sitter/tree-sitter-$lang + done - name: Install lib and dependencies run: | # force install package from local dist directory pip uninstall -y codebleu || true # TODO: check the sdist package is not installed pip install --upgrade --no-deps --no-index --find-links=./dist codebleu - # install dependencies for the package and tests - pip install .[test] + # install dependencies for the package languages and tests + pip install .[all,test] - name: Test itself run: python -m pytest --cov-report=xml - name: Upload coverage diff --git a/README.md b/README.md index 4328491..fd9ff4b 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,20 @@ or directly 
from git repo (require internet connection to download tree-sitter): pip install git+https://github.com/k4black/codebleu.git ``` +Also you have to install tree-sitter language you need (e.g. python, rust, etc): +```bash +pip install tree-sitter-python +``` +Or you can install all languages: +```bash +pip install codebleu[all] +``` + +Note: At the moment (May 2024) precompiled languages are NOT available for arm64 (M1) MacOS, so you have to install and build tree-sitter languages manually, for example: +```bash +pip install git+https://github.com/tree-sitter/tree-sitter-python.git +``` + ## Usage @@ -96,11 +110,11 @@ Make your own fork and clone it: git clone https://github.com/k4black/codebleu ``` -For development, you need to install library (for so file to compile) with `test` extra: +For development, you need to install library with `all` precompiled languages and `test` extra: (require internet connection to download tree-sitter) ```bash -python -m pip install -e .[test] -python -m pip install -e .\[test\] # for macos +python -m pip install -e .[all,test] +python -m pip install -e .\[all,test\] # for macos ``` For testing just run pytest: diff --git a/codebleu/codebleu.py b/codebleu/codebleu.py index 65d8959..27855cf 100644 --- a/codebleu/codebleu.py +++ b/codebleu/codebleu.py @@ -5,20 +5,9 @@ from typing import Callable, Dict, List, Optional, Tuple, Union from . 
import bleu, dataflow_match, syntax_match, weighted_ngram_match +from .utils import AVAILABLE_LANGS, get_tree_sitter_language PACKAGE_DIR = Path(__file__).parent -AVAILABLE_LANGS = [ - "java", - "javascript", - "c_sharp", - "php", - "c", - "cpp", - "python", - "go", - "ruby", - "rust", -] # keywords available def calc_codebleu( @@ -28,7 +17,6 @@ def calc_codebleu( weights: Tuple[float, float, float, float] = (0.25, 0.25, 0.25, 0.25), tokenizer: Optional[Callable] = None, keywords_dir: Path = PACKAGE_DIR / "keywords", - lang_so_file: Path = PACKAGE_DIR / "my-languages.so", ) -> Dict[str, float]: """Calculate CodeBLEU score @@ -48,7 +36,9 @@ def calc_codebleu( assert lang in AVAILABLE_LANGS, f"Language {lang} is not supported (yet). Available languages: {AVAILABLE_LANGS}" assert len(weights) == 4, "weights should be a tuple of 4 floats (alpha, beta, gamma, theta)" assert keywords_dir.exists(), f"keywords_dir {keywords_dir} does not exist" - assert lang_so_file.exists(), f"lang_so_file {lang_so_file} does not exist" + + # get the tree-sitter language for a given language + tree_sitter_language = get_tree_sitter_language(lang) # preprocess inputs references = [[x.strip() for x in ref] if isinstance(ref, list) else [ref.strip()] for ref in references] @@ -80,10 +70,14 @@ def make_weights(reference_tokens, key_word_list): weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights, tokenized_hyps) # calculate syntax match - syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang, str(lang_so_file)) + syntax_match_score = syntax_match.corpus_syntax_match( + references, hypothesis, lang, tree_sitter_language=tree_sitter_language + ) # calculate dataflow match - dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang, str(lang_so_file)) + dataflow_match_score = dataflow_match.corpus_dataflow_match( + references, hypothesis, lang, tree_sitter_language=tree_sitter_language + ) alpha, 
beta, gamma, theta = weights code_bleu_score = ( diff --git a/codebleu/dataflow_match.py b/codebleu/dataflow_match.py index f110a91..dfe1ebe 100644 --- a/codebleu/dataflow_match.py +++ b/codebleu/dataflow_match.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. import logging -from tree_sitter import Language, Parser +from tree_sitter import Parser from .parser import ( DFG_csharp, @@ -17,6 +17,7 @@ remove_comments_and_docstrings, tree_to_token_index, ) +from .utils import get_tree_sitter_language dfg_function = { "python": DFG_python, @@ -36,10 +37,12 @@ def calc_dataflow_match(references, candidate, lang, langso_so_file): return corpus_dataflow_match([references], [candidate], lang, langso_so_file) -def corpus_dataflow_match(references, candidates, lang, langso_so_file): - LANGUAGE = Language(langso_so_file, lang) +def corpus_dataflow_match(references, candidates, lang, tree_sitter_language=None): + if not tree_sitter_language: + tree_sitter_language = get_tree_sitter_language(lang) + parser = Parser() - parser.set_language(LANGUAGE) + parser.language = tree_sitter_language parser = [parser, dfg_function[lang]] match_count = 0 total_count = 0 diff --git a/codebleu/syntax_match.py b/codebleu/syntax_match.py index 0050c1a..860ae12 100644 --- a/codebleu/syntax_match.py +++ b/codebleu/syntax_match.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from tree_sitter import Language, Parser +from tree_sitter import Parser from .parser import ( DFG_csharp, @@ -13,6 +13,7 @@ DFG_ruby, remove_comments_and_docstrings, ) +from .utils import get_tree_sitter_language dfg_function = { "python": DFG_python, @@ -25,14 +26,16 @@ } -def calc_syntax_match(references, candidate, lang, lang_so_file): - return corpus_syntax_match([references], [candidate], lang, lang_so_file) +def calc_syntax_match(references, candidate, lang): + return corpus_syntax_match([references], [candidate], lang) -def corpus_syntax_match(references, candidates, lang, lang_so_file): - tree_sitter_language = Language(lang_so_file, lang) +def corpus_syntax_match(references, candidates, lang, tree_sitter_language=None): + if not tree_sitter_language: + tree_sitter_language = get_tree_sitter_language(lang) + parser = Parser() - parser.set_language(tree_sitter_language) + parser.language = tree_sitter_language match_count = 0 match_count_candidate_to_reference = 0 total_count = 0 @@ -61,7 +64,7 @@ def get_all_sub_trees(root_node): node_stack.append([root_node, depth]) while len(node_stack) != 0: cur_node, cur_depth = node_stack.pop() - sub_tree_sexp_list.append([cur_node.sexp(), cur_depth]) + sub_tree_sexp_list.append([str(cur_node), cur_depth]) for child_node in cur_node.children: if len(child_node.children) != 0: depth = cur_depth + 1 diff --git a/codebleu/utils.py b/codebleu/utils.py index ab8a6e0..468df81 100644 --- a/codebleu/utils.py +++ b/codebleu/utils.py @@ -7,6 +7,21 @@ from itertools import chain +from tree_sitter import Language + +AVAILABLE_LANGS = [ + "java", + "javascript", + "c_sharp", + "php", + "c", + "cpp", + "python", + "go", + "ruby", + "rust", +] # keywords available + def pad_sequence( sequence, @@ -104,3 +119,63 @@ def ngrams( history.append(item) yield tuple(history) del history[0] + + +def get_tree_sitter_language(lang: str) -> Language: + """ + Get the tree-sitter language for a given language. 
+ :param lang: the language name to get the tree-sitter language for + :return: the tree-sitter language + """ + assert lang in AVAILABLE_LANGS, f"Language {lang} not available. Available languages: {AVAILABLE_LANGS}" + + try: + if lang == "java": + import tree_sitter_java + + return Language(tree_sitter_java.language()) + elif lang == "javascript": + import tree_sitter_javascript + + return Language(tree_sitter_javascript.language()) + elif lang == "c_sharp": + import tree_sitter_c_sharp + + return Language(tree_sitter_c_sharp.language()) + elif lang == "php": + import tree_sitter_php + + try: + return Language(tree_sitter_php.language()) # type: ignore[attr-defined] + except AttributeError: + return Language(tree_sitter_php.language_php()) + elif lang == "c": + import tree_sitter_c + + return Language(tree_sitter_c.language()) + elif lang == "cpp": + import tree_sitter_cpp + + return Language(tree_sitter_cpp.language()) + elif lang == "python": + import tree_sitter_python + + return Language(tree_sitter_python.language()) + elif lang == "go": + import tree_sitter_go + + return Language(tree_sitter_go.language()) + elif lang == "ruby": + import tree_sitter_ruby + + return Language(tree_sitter_ruby.language()) + elif lang == "rust": + import tree_sitter_rust + + return Language(tree_sitter_rust.language()) + else: + assert False, "Not reachable" + except ImportError: + raise ImportError( + f"Tree-sitter language for {lang} not available. Please install the language parser using `pip install tree-sitter-{lang}`." + ) diff --git a/evaluate_app/README.md b/evaluate_app/README.md index 90a6f15..ebeb9b3 100644 --- a/evaluate_app/README.md +++ b/evaluate_app/README.md @@ -65,7 +65,7 @@ Each of the scores is in range `[0, 1]`, where `1` is the best score. [//]: # (*Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. 
If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*) -Using pip package (`pip install codebleu`): +Using pip package (`pip install codebleu`), also you have to install tree-sitter language you need (e.g. `pip install tree-sitter-python` or `pip install codebleu[all]` to install all languages): ```python from codebleu import calc_codebleu diff --git a/evaluate_app/requirements.txt b/evaluate_app/requirements.txt index b0bfbc0..8814f79 100644 --- a/evaluate_app/requirements.txt +++ b/evaluate_app/requirements.txt @@ -1,2 +1,2 @@ git+https://github.com/huggingface/evaluate@main -codebleu>=0.2.0,<1.0.0 +codebleu>=0.5.0,<1.0.0 diff --git a/pyproject.toml b/pyproject.toml index 58abe9e..dad4edf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0.0", "wheel", "tree-sitter>=0.20.0,<0.22.0", "requests>=2.0.0,<3.0.0"] +requires = ["setuptools>=61.0.0", "wheel"] build-backend = "setuptools.build_meta" @@ -9,12 +9,12 @@ description = "Unofficial CodeBLEU implementation that supports Linux, MacOS and readme = "README.md" license = {text = "MIT License"} authors = [ - {name = "Konstantin Chernyshev", email = "kdchernyshev@gmail.com"}, + {name = "Konstantin Chernyshev", email = "kdchernyshev+github@gmail.com"}, ] keywords = ["codebleu", "code", "bleu", "nlp", "natural language processing", "programming", "evaluate", "evaluation", "code generation", "metrics"] dynamic = ["version"] -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", @@ -23,7 +23,7 @@ classifiers = [ ] dependencies = [ - "tree-sitter >=0.20.0,<0.22.0", + "tree-sitter >=0.22.0,<0.23.0", "setuptools >=61.0.0", # distutils removed in 3.12, but distutils.ccompiler used in tree-sitter ] @@ -37,7 +37,7 @@ exclude = ["tests", "tests.*", "codebleu.parser.tree-sitter"] 
[tool.setuptools.package-data] -"*" = ["py.typed", "*.txt", "*.so", "*.dylib", "*.dll", "keywords/*"] +"*" = ["py.typed", "*.txt", "keywords/*"] [project.scripts] @@ -47,6 +47,18 @@ codebleu = "codebleu.__main__:main" homepage = "https://github.com/k4black/codebleu" [project.optional-dependencies] +all = [ + "tree-sitter-python ~=0.21", + "tree-sitter-go ~=0.21", + "tree-sitter-javascript ~=0.21", + "tree-sitter-ruby ~=0.21", + "tree-sitter-php ~=0.22", + "tree-sitter-java ~=0.21", + "tree-sitter-c-sharp ~=0.21", + "tree-sitter-c ~=0.21", + "tree-sitter-cpp ~=0.22", + "tree-sitter-rust ~=0.21", +] test = [ "pytest >=7.0.0,<9.0.0", "pytest-cov >=4.0.0,<6.0.0", diff --git a/setup.py b/setup.py deleted file mode 100644 index e4d6aac..0000000 --- a/setup.py +++ /dev/null @@ -1,87 +0,0 @@ -from __future__ import annotations - -import io -import shutil -import zipfile -from pathlib import Path - -import requests -from setuptools import setup -from setuptools.dist import Distribution -from tree_sitter import Language - -ROOT = Path(__file__).parent - - -tree_sitter_languages = { - "go": "https://github.com/tree-sitter/tree-sitter-go/archive/refs/tags/v0.20.0.zip", - "javascript": "https://github.com/tree-sitter/tree-sitter-javascript/archive/refs/tags/v0.20.3.zip", - "python": "https://github.com/tree-sitter/tree-sitter-python/archive/refs/tags/v0.20.4.zip", - "ruby": "https://github.com/tree-sitter/tree-sitter-ruby/archive/refs/tags/v0.19.0.zip", - "php": "https://github.com/tree-sitter/tree-sitter-php/archive/refs/tags/v0.20.0.zip", # 0.21.1 not working - "java": "https://github.com/tree-sitter/tree-sitter-java/archive/refs/tags/v0.20.2.zip", - "c-sharp": "https://github.com/tree-sitter/tree-sitter-c-sharp/archive/refs/tags/v0.20.0.zip", - "c": "https://github.com/tree-sitter/tree-sitter-c/archive/refs/tags/v0.20.7.zip", - "cpp": "https://github.com/tree-sitter/tree-sitter-cpp/archive/refs/tags/v0.20.3.zip", - "rust": 
"https://github.com/tree-sitter/tree-sitter-rust/archive/refs/tags/v0.20.1.zip", -} - - -def download_tree_sitter_languages(languages: dict[str, str], languages_folder: Path) -> list[str]: - if languages_folder.exists(): - shutil.rmtree(languages_folder) - languages_folder.mkdir(parents=True) - - extracted_folders: list[str] = [] - for lang, url in languages.items(): - # Download the ZIP file - response = requests.get(url) - response.raise_for_status() - - # Extract the ZIP file - with zipfile.ZipFile(io.BytesIO(response.content)) as zip_f: - zip_f.extractall(languages_folder) - extracted_folders.append(zip_f.namelist()[0]) # get the name of the extracted folder - - return extracted_folders - - -def build_tree_sitter_languages(languages: dict[str, str], languages_folder: Path, target_lib_file: Path) -> str: - extracted_folders = download_tree_sitter_languages(languages, languages_folder) - - Language.build_library( - str(target_lib_file), - [str(languages_folder / lang_folder) for lang_folder in extracted_folders], - ) - - return str(target_lib_file) - - -build_tree_sitter_languages( - tree_sitter_languages, - ROOT / "tree_sitter_languages", - ROOT / "codebleu" / "my-languages.so", -) - - -# tree_sitter_extension = Extension( -# 'codebleu.tree_sitter', -# sources=[], -# include_dirs=[], -# libraries=[], -# extra_objects=[ -# -# ], -# ) - - -class PlatformSpecificDistribution(Distribution): - """Distribution which always forces a binary package with platform name""" - - def has_ext_modules(self): - return True - - -setup( - distclass=PlatformSpecificDistribution, -)