Skip to content

Commit

Permalink
Fix: Buffer width for NodeJS bindings
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Oct 5, 2023
1 parent 014257c commit b1cf5e5
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 132 deletions.
148 changes: 56 additions & 92 deletions javascript/lib.c
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
/**
* @file lib.c
* @author Ash Vardanian
* @brief JavaScript bindings for StringZilla.
* @date 2023-09-18
* @file lib.c
* @brief JavaScript bindings for StringZilla.
* @author Ash Vardanian
* @date September 18, 2023
*
* @copyright Copyright (c) 2023
*
* @see NodeJS docs: https://nodejs.org/api/n-api.html
* @copyright Copyright (c) 2023
* @see NodeJS docs: https://nodejs.org/api/n-api.html
*/

#include <node_api.h>
Expand All @@ -18,49 +17,39 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
napi_get_cb_info(env, info, &argc, args, NULL, NULL);

// Extract the C string from the JavaScript string for haystack and needle
struct strzl_haystack_t strzl_haystack = {NULL, 0};
struct strzl_needle_t strzl_needle = {NULL, 0, 0};
sz_haystack_t haystack_sz = {NULL, 0};
sz_needle_t needle_sz = {NULL, 0, 0};

// For haystack
napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len);
char *haystack = malloc(strzl_haystack.len);
napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len);
strzl_haystack.ptr = haystack;
napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length);
haystack_sz.start = malloc(haystack_sz.length + 1);
napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length);

// For needle
napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len);
char *needle = malloc(strzl_needle.len);
napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len);
strzl_needle.ptr = needle;

// Perform the find operation
#if defined(__AVX2__)
uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
#elif defined(__ARM_NEON)
uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle);
#else
uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle);
#endif
napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length);
needle_sz.start = malloc(needle_sz.length + 1);
napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length);

// Perform the find operation
sz_size_t result = sz_find_substr(haystack_sz, needle_sz);

// Cleanup
free(haystack);
free(needle);
free(haystack_sz.start);
free(needle_sz.start);

// Convert the result to JavaScript BigInt and return
napi_value js_result;

// In JavaScript, if `find` is unable to find the specified value, then it should return -1
if (result == strzl_haystack.len)
napi_create_bigint_int64(env, -1, &js_result);
if (result == haystack_sz.length) napi_create_bigint_int64(env, -1, &js_result);
else
napi_create_bigint_uint64(env, result, &js_result);

return js_result;
}

size_t count_char(strzl_haystack_t strzl_haystack, char needle) {
size_t result = strzl_naive_count_char(strzl_haystack, needle);

/* Thin wrapper used by the bindings for single-character needles:
 * hands `haystack_sz` and `needle` to `sz_count_char` and returns its result as `size_t`. */
size_t count_char(sz_haystack_t haystack_sz, char needle) {
    return sz_count_char(haystack_sz, needle);
}

Expand All @@ -70,91 +59,66 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
napi_get_cb_info(env, info, &argc, args, NULL, NULL);

// Extract the C string from the JavaScript string for haystack and needle
struct strzl_haystack_t strzl_haystack = {NULL, 0};
struct strzl_needle_t strzl_needle = {NULL, 0, 0};
sz_haystack_t haystack_sz = {NULL, 0};
sz_needle_t needle_sz = {NULL, 0, 0};

// For haystack
napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len);
char *haystack = malloc(strzl_haystack.len);
napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len);
strzl_haystack.ptr = haystack;
napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length);
haystack_sz.start = malloc(haystack_sz.length + 1);
napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length);

// For needle
napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len);
char *needle = malloc(strzl_needle.len);
napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len);
strzl_needle.ptr = needle;
napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length);
needle_sz.start = malloc(needle_sz.length + 1);
napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length);

bool overlap = false;
if (argc > 2) {
napi_get_value_bool(env, args[2], &overlap);
}
if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); }

size_t result;
void const *haystack_start = haystack_sz.start, *needle_start = needle_sz.start;

if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len)
result = 0;
else if (strzl_needle.len == 1)
result = count_char(strzl_haystack, strzl_needle.ptr[0]);
size_t count = 0;
if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; }
else if (needle_sz.length == 1) { count = count_char(haystack_sz, needle_sz.start[0]); }
else if (overlap) {
while (strzl_haystack.len) {
#if defined(__AVX2__)
size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
#elif defined(__ARM_NEON)
size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
#else
size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
#endif

bool found = offset != strzl_haystack.len;
result += found;
strzl_haystack.ptr += offset + found;
strzl_haystack.len -= offset + found;
while (haystack_sz.length) {
sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
int found = offset != haystack_sz.length;
count += found;
haystack_sz.start += offset + found;
haystack_sz.length -= offset + found;
}
}

else {
while (strzl_haystack.len) {
#if defined(__AVX2__)
size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
#elif defined(__ARM_NEON)
size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
#else
size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
#endif

bool found = offset != strzl_haystack.len;
result += found;
strzl_haystack.ptr += offset + strzl_needle.len;
strzl_haystack.len -= offset + strzl_needle.len * found;
while (haystack_sz.length) {
sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
int found = offset != haystack_sz.length;
count += found;
haystack_sz.start += offset + needle_sz.length;
haystack_sz.length -= offset + needle_sz.length * found;
}
}

// Cleanup
free(haystack);
free(needle);
free(haystack_start);
free(needle_start);

// Convert the result to JavaScript `BigInt` and return
napi_value js_result;
napi_create_bigint_uint64(env, result, &js_result);
// Convert the `count` to JavaScript `BigInt` and return
napi_value js_count;
napi_create_bigint_uint64(env, count, &js_count);

return js_result;
return js_count;
}

napi_value Init(napi_env env, napi_value exports) {
// Define the "find" property
napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};

// Define the "count" property
napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0};

// Define an array of property descriptors
napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0};
napi_property_descriptor properties[] = {findDesc, countDesc};

// Define the number of properties in the array
size_t propertyCount = sizeof(properties) / sizeof(properties[0]);

// Define the properties on the `exports` object
size_t propertyCount = sizeof(properties) / sizeof(properties[0]);
napi_define_properties(env, exports, propertyCount, properties);

return exports;
Expand Down
17 changes: 0 additions & 17 deletions javascript/stringzilla.d.ts

This file was deleted.

24 changes: 22 additions & 2 deletions javascript/stringzilla.js
Original file line number Diff line number Diff line change
@@ -1,2 +1,22 @@
const stringzilla = require('bindings')('stringzilla');
module.exports = stringzilla;
const compiled = require('bindings')('stringzilla');

module.exports = {
/**
* Searches for a short string in a long one.
*
* @param {string} haystack
* @param {string} needle
* @returns {bigint}
*/
find: compiled.find,

/**
* Searches for a substring in a larger string.
*
* @param {string} haystack
* @param {string} needle
* @param {boolean} overlap
* @returns {bigint}
*/
count: compiled.count
};
24 changes: 15 additions & 9 deletions python/lib.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
/**
* @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
* native Python strings, Apache Arrow collections, and more.
* @file lib.c
* @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
* native Python strings, Apache Arrow collections, and more.
* @author Ash Vardanian
* @date July 10, 2023
* @copyright Copyright (c) 2023
*
* - Doesn't use PyBind11, NanoBind, Boost.Python, or any other high-level libs, only CPython API.
* - To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls.
Expand Down Expand Up @@ -646,7 +650,7 @@ static int Str_in(Str *self, PyObject *arg) {
sz_haystack_t haystack;
haystack.start = self->start;
haystack.length = self->length;
size_t position = sz_find_substr_auto(haystack, needle_struct);
size_t position = sz_find_substr(haystack, needle_struct);
return position != haystack.length;
}

Expand Down Expand Up @@ -881,7 +885,7 @@ static int Str_find_( //
haystack.length = normalized_length;

// Perform contains operation
size_t offset = sz_find_substr_auto(haystack, needle);
size_t offset = sz_find_substr(haystack, needle);
if (offset == haystack.length) { *offset_out = -1; }
else { *offset_out = (Py_ssize_t)offset; }

Expand Down Expand Up @@ -1008,11 +1012,13 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
haystack.start += normalized_offset;
haystack.length = normalized_length;

size_t count = needle.length == 1 ? sz_count_char_swar(haystack, *needle.start) : 0;
if (needle.length != 1) {
size_t count = 0;
if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; }
else if (needle.length == 1) { count = sz_count_char(haystack, needle.start[0]); }
else if (needle.length != 1) {
if (allowoverlap) {
while (haystack.length) {
size_t offset = sz_find_substr_auto(haystack, needle);
sz_size_t offset = sz_find_substr(haystack, needle);
int found = offset != haystack.length;
count += found;
haystack.start += offset + found;
Expand All @@ -1021,7 +1027,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
}
else {
while (haystack.length) {
size_t offset = sz_find_substr_auto(haystack, needle);
sz_size_t offset = sz_find_substr(haystack, needle);
int found = offset != haystack.length;
count += found;
haystack.start += offset + needle.length;
Expand Down Expand Up @@ -1207,7 +1213,7 @@ static Strs *Str_split_(
sz_haystack_t text_remaining;
text_remaining.start = text.start + last_start;
text_remaining.length = text.length - last_start;
sz_size_t offset_in_remaining = sz_find_substr_auto(text_remaining, separator);
sz_size_t offset_in_remaining = sz_find_substr(text_remaining, separator);

// Reallocate offsets array if needed
if (offsets_count >= offsets_capacity) {
Expand Down
14 changes: 7 additions & 7 deletions scripts/test.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <assert.h>

#include <stringzilla.h>

Expand All @@ -18,8 +18,8 @@ void populate_random_string(char *buffer, int length, int variability) {
buffer[length] = '\0';
}

// Test function for sz_find_substr_auto
void test_sz_find_substr_auto() {
// Test function for sz_find_substr
void test_sz_find_substr() {
char buffer[MAX_LENGTH + 1];
char pattern[6]; // Maximum length of 5 + 1 for '\0'

Expand All @@ -40,19 +40,19 @@ void test_sz_find_substr_auto() {

// Comparing the result of your function with the standard library function.
const char *result_libc = strstr(buffer, pattern);
uint64_t result_stringzilla = sz_find_substr_auto(haystack, needle);
uint64_t result_stringzilla = sz_find_substr(haystack, needle);

assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) ||
(!result_libc && result_stringzilla == (uint64_t)-1)) &&
"Test failed for sz_find_substr_auto");
"Test failed for sz_find_substr");
}
}
}

int main() {
srand((unsigned int)time(NULL));

test_sz_find_substr_auto();
test_sz_find_substr();
// Add calls to other test functions as you implement them

printf("All tests passed!\n");
Expand Down
12 changes: 7 additions & 5 deletions stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#ifndef STRINGZILLA_H_
#define STRINGZILLA_H_

#include <stdint.h> // `uint8_t`
#include <ctype.h> // `tolower`
#include <search.h> // `qsort_s`
#include <stddef.h> // `sz_size_t`
#include <string.h> // `memcpy`
#include <stdint.h> // `uint8_t`
#include <stdlib.h> // `qsort_r`
#include <search.h> // `qsort_s`
#include <ctype.h> // `tolower`
#include <string.h> // `memcpy`

#if defined(__AVX2__)
#include <x86intrin.h>
Expand Down Expand Up @@ -427,7 +427,9 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {

#endif // Arm Neon

inline static sz_size_t sz_find_substr_auto(sz_haystack_t h, sz_needle_t n) {
/**
 *  @brief  Generic entry point for single-character counting.
 *          Forwards the haystack `h` and the character `n` to `sz_count_char_swar`
 *          and returns whatever that implementation reports.
 */
inline static sz_size_t sz_count_char(sz_haystack_t h, char n) {
    sz_size_t const matches = sz_count_char_swar(h, n);
    return matches;
}

inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
if (h.length < n.length) return h.length;

switch (n.length) {
Expand Down

0 comments on commit b1cf5e5

Please sign in to comment.