Skip to content

Commit

Permalink
Merge branch 'main-dev' into 25-nodejs-bindings-and-an-npm-package
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian authored Oct 5, 2023
2 parents 4fcd790 + 87f72bd commit 41ada28
Show file tree
Hide file tree
Showing 17 changed files with 2,570 additions and 1,287 deletions.
9 changes: 4 additions & 5 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@ AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortBlocksOnASingleLine: Always
AllowShortIfStatementsOnASingleLine: Always
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: true
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: true
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakTemplateDeclarations: Yes
AlwaysBreakAfterReturnType: None
PenaltyReturnTypeOnItsOwnLine: 200
Expand All @@ -46,7 +45,7 @@ BraceWrapping:
IndentBraces: false


SortIncludes: false
SortIncludes: true
SortUsingDeclarations: true

SpaceAfterCStyleCast: false
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/prerelease.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
- name: Build locally
run: python -m pip install .
- name: Test with PyTest
run: pytest scripts/test.py
run: pytest scripts/


test_python_37:
Expand Down Expand Up @@ -68,6 +68,6 @@ jobs:
run: python -m pip install .

- name: Test with PyTest
run: pytest scripts/test.py
run: pytest scripts/


31 changes: 30 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -117,29 +117,58 @@
"__verbose_abort": "cpp",
"strstream": "cpp",
"filesystem": "cpp",
"stringzilla.h": "c"
"stringzilla.h": "c",
"__memory": "c"
},
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
"cSpell.words": [
"abababab",
"allowoverlap",
"basicsize",
"bigram",
"cibuildwheel",
"endregion",
"endswith",
"getitem",
"getslice",
"initproc",
"intp",
"itemsize",
"keeplinebreaks",
"keepseparator",
"kwargs",
"kwds",
"kwnames",
"levenstein",
"maxsplit",
"memcpy",
"MODINIT",
"napi",
"nargsf",
"ndim",
"newfunc",
"NOARGS",
"NOMINMAX",
"NOTIMPLEMENTED",
"numpy",
"pytest",
"Pythonic",
"quadgram",
"readlines",
"releasebuffer",
"richcompare",
"SIMD",
"splitlines",
"startswith",
"stringzilla",
"Strs",
"strzl",
"substr",
"SWAR",
"TPFLAGS",
"Vardanian",
"vectorcallfunc",
"XDECREF",
"Zilla"
]
}
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ if(STRINGZILLA_INSTALL)
endif()

if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK})
add_executable(stringzilla_test scripts/test.cpp)
add_executable(stringzilla_test scripts/test.c)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")

Expand Down
42 changes: 28 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Coming soon.
## Quick Start: Python 🐍

1. Install via pip: `pip install stringzilla`
2. Import classes: `from stringzilla import Str, File, Strs`
1. Import the classes you need: `from stringzilla import Str, Strs, File`

### Basic Usage

Expand All @@ -45,8 +45,8 @@ StringZilla offers two mostly interchangeable core classes:
```python
from stringzilla import Str, File

text1 = Str('some-string')
text2 = File('some-file.txt')
text_from_str = Str('some-string')
text_from_file = Str(File('some-file.txt'))
```

The `Str` is designed to replace long Python `str` strings and wrap our C-level API.
Expand All @@ -58,11 +58,12 @@ A standard dataset pre-processing use case would be to map a sizeable textual da

- Length: `len(text) -> int`
- Indexing: `text[42] -> str`
- Slicing: `text[42:46] -> str`
- Slicing: `text[42:46] -> Str`
- String conversion: `str(text) -> str`
- Substring check: `'substring' in text -> bool`

### Advanced Operations

- `'substring' in text -> bool`
- `text.contains('substring', start=0, end=9223372036854775807) -> bool`
- `text.find('substring', start=0, end=9223372036854775807) -> int`
- `text.count('substring', start=0, end=9223372036854775807, allowoverlap=False) -> int`
Expand Down Expand Up @@ -93,6 +94,19 @@ lines.append('Pythonic string')
lines.extend(shuffled_copy)
```

### Low-Level Python API

The StringZilla CPython bindings implement vector-call conventions for faster calls.

```py
import stringzilla as sz

contains: bool = sz.contains("haystack", "needle", start=0, end=9223372036854775807)
offset: int = sz.find("haystack", "needle", start=0, end=9223372036854775807)
count: int = sz.count("haystack", "needle", start=0, end=9223372036854775807, allowoverlap=False)
levenstein: int = sz.levenstein("needle", "nidl")
```

## Quick Start: C 🛠️

There is an ABI-stable C99 interface, in case you have a database, an operating system, or a runtime you want to integrate with StringZilla.
Expand All @@ -101,24 +115,24 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating
#include "stringzilla.h"

// Initialize your haystack and needle
strzl_haystack_t haystack = {your_text, your_text_length};
strzl_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};
sz_haystack_t haystack = {your_text, your_text_length};
sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};

// Perform string-level operations
size_t character_count = strzl_naive_count_char(haystack, 'a');
size_t character_position = strzl_naive_find_char(haystack, 'a');
size_t substring_position = strzl_naive_find_substr(haystack, needle);
size_t character_count = sz_count_char_swar(haystack, 'a');
size_t character_position = sz_find_char_swar(haystack, 'a');
size_t substring_position = sz_find_substr_swar(haystack, needle);

// Perform collection-level operations
strzl_array_t array = {your_order, your_count, your_get_begin, your_get_length, your_handle};
strzl_sort(&array, &your_config);
sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle};
sz_sort(&array, &your_config);
```
## Contributing 👾
Future development plans include:
- Replace PyBind11 with CPython.
- [x] Replace PyBind11 with CPython.
- Reverse-order operations in Python #12.
- Bindings for JavaScript #25, Java, and Rust.
- Faster string sorting algorithm.
Expand All @@ -135,7 +149,7 @@ CPython:
```sh
# Clean up and install
rm -rf build && pip install -e . && pytest scripts/test.py -s -x
rm -rf build && pip install -e . && pytest scripts/ -s -x
# Install without dependencies
pip install -e . --no-index --no-deps
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools>=42", "wheel", "cmake>=3.22", "pybind11"]
requires = ["setuptools>=42", "wheel", "cmake>=3.22", "numpy"]
build-backend = "setuptools.build_meta"

[tool.pytest.ini_options]
Expand All @@ -10,7 +10,7 @@ filterwarnings = ["error"]

[tool.cibuildwheel]
test-requires = ["pytest"]
test-command = "pytest {project}/scripts/test.py -x"
test-command = "pytest {project}/scripts/ -x"
build-verbosity = 0
skip = ["*musllinux*", "*i686*", "pp*"]

Expand Down
Loading

0 comments on commit 41ada28

Please sign in to comment.