From 3f89c290700618eae78eaa289bdb88d1cfb3514d Mon Sep 17 00:00:00 2001 From: Tyge Lovset Date: Wed, 20 Jul 2022 23:13:11 +0200 Subject: [PATCH] Added cregex_replace*() [implemented in utf8code.c]. Added examples/regex_replace.c. Docs not ready, i.e. API not fixed. Some other refactoring and minor fixes/improvements. cstr_assign_sv() now returns char* like the other cstr_assign*(). --- examples/regex2.c | 2 +- examples/regex_match.c | 6 +-- examples/regex_replace.c | 35 +++++++++++++++ include/stc/ccommon.h | 2 +- include/stc/cregex.h | 60 +++++++++++++++---------- include/stc/cstr.h | 10 +++-- include/stc/csview.h | 4 +- src/cregex.c | 95 ++++++++++------------------------------ src/utf8code.c | 75 ++++++++++++++++++++++++++++++- 9 files changed, 181 insertions(+), 108 deletions(-) create mode 100644 examples/regex_replace.c diff --git a/examples/regex2.c b/examples/regex2.c index 82247da5..1f3163f7 100644 --- a/examples/regex2.c +++ b/examples/regex2.c @@ -22,7 +22,7 @@ int main() printf("input: %s\n", inputs[i]); if (cregex_match(&re, inputs[i], 20, m, 0) > 0) { - c_forrange (j, cregex_captures(re)) + c_forrange (j, cregex_captures(&re)) { printf(" submatch %" PRIuMAX ": %" c_PRIsv "\n", j, c_ARGsv(m[j])); } diff --git a/examples/regex_match.c b/examples/regex_match.c index 2b135bb7..5680b55e 100644 --- a/examples/regex_match.c +++ b/examples/regex_match.c @@ -13,14 +13,14 @@ int main() { int res = cregex_compile(&re, "[+-]?([0-9]*\\.)?\\d+([Ee][+-]?\\d+)?", 0); printf("%d\n", res); - cregmatch m[10]; + csview m[10]; if (cregex_match(&re, s, 10, m, 0) > 0) { printf("Found digits at position %" PRIuMAX "-%" PRIuMAX "\n", m[0].str - s, m[0].str - s + m[0].size); } else { printf("Could not find any digits\n"); } - while (cregex_match(&re, s, 10, m, creg_next) > 0) { + while (cregex_match(&re, s, 10, m, cregex_NEXT) > 0) { printf("%" c_PRIsv " ; ", c_ARGsv(m[0])); } puts(""); @@ -28,7 +28,7 @@ int main() res = cregex_compile(&re, "(.+)\\b(.+)", 0); printf("groups: %d\n", res); if ((res = cregex_match(&re, "hello@wørld", 10, m, 0)) > 0) { - c_forrange (i, res) + c_forrange (i, res) printf("match: [%" c_PRIsv "]\n", c_ARGsv(m[i])); } else printf("err: %d\n", res); diff --git a/examples/regex_replace.c b/examples/regex_replace.c new file mode 100644 index 00000000..1216701f --- /dev/null +++ b/examples/regex_replace.c @@ -0,0 +1,35 @@ +#define i_implement +#include +#include +#include + +cstr sub_20y(int i, csview m) { + if (i == 1) { // year + int year; + sscanf(m.str, "%4d", &year); + return cstr_from_fmt("%04d", year - 20); + } + return cstr_from_sv(m); +} + +int main() +{ + const char* pattern = "\\b(\\d\\d\\d\\d)-(1[0-2]|0[1-9])-(3[01]|[12][0-9]|0[1-9])\\b"; + const char* input = "start date: 2015-12-31, end date: 2022-02-28"; + + c_auto (cregex, re) + c_auto (cstr, str1, str2) + { + printf("input: %s\n", input); + /* European date format */ + str1 = cregex_replace(input, pattern, "\\3.\\2.\\1"); + printf("euros: %s\n", cstr_str(&str1)); + + /* US date format, and subtract 20 years: */ + str2 = cregex_replace_fn(input, pattern, "\\1/\\3/\\2", sub_20y, 0, 0); + printf("us-20: %s\n", cstr_str(&str2)); + } +} + +#include "../src/cregex.c" +#include "../src/utf8code.c" diff --git a/include/stc/ccommon.h b/include/stc/ccommon.h index 3a6d8f4e..e87e7678 100644 --- a/include/stc/ccommon.h +++ b/include/stc/ccommon.h @@ -112,7 +112,7 @@ typedef const char* crawstr; #define crawstr_cmp(xp, yp) strcmp(*(xp), *(yp)) #define crawstr_hash(p) c_strhash(*(p)) #define c_strlen_lit(literal) (sizeof "" literal - 1U) -#define c_sv(lit) c_make(csview){lit, c_strlen_lit(lit)} +#define c_sv(lit) (c_make(csview){lit, c_strlen_lit(lit)}) #define c_PRIsv ".*s" #define c_ARGsv(sv) (int)(sv).size, (sv).str diff --git a/include/stc/cregex.h b/include/stc/cregex.h index 448f9405..11e21b06 100644 --- a/include/stc/cregex.h +++ b/include/stc/cregex.h @@ -34,32 +34,33 @@ THE SOFTWARE. #include "forward.h" // csview typedef enum { - creg_nomatch = -1, - creg_matcherror = -2, - creg_outofmemory = -3, - creg_unmatchedleftparenthesis = -4, - creg_unmatchedrightparenthesis = -5, - creg_toomanysubexpressions = -6, - creg_toomanycharacterclasses = -7, - creg_malformedcharacterclass = -8, - creg_missingoperand = -9, - creg_unknownoperator = -10, - creg_operandstackoverflow = -11, - creg_operatorstackoverflow = -12, - creg_operatorstackunderflow = -13, + creg_success = 1, + creg_nomatch = 0, + creg_matcherror = -1, + creg_outofmemory = -2, + creg_unmatchedleftparenthesis = -3, + creg_unmatchedrightparenthesis = -4, + creg_toomanysubexpressions = -5, + creg_toomanycharacterclasses = -6, + creg_malformedcharacterclass = -7, + creg_missingoperand = -8, + creg_unknownoperator = -9, + creg_operandstackoverflow = -10, + creg_operatorstackoverflow = -11, + creg_operatorstackunderflow = -12, } cregex_error_t; enum { /* compile flags */ - creg_dotall = 1<<0, - creg_caseless = 1<<1, + cregex_DOTALL = 1<<0, + cregex_CASELESS = 1<<1, /* execution flags */ - creg_fullmatch = 1<<2, - creg_next = 1<<3, - creg_startend = 1<<4, + cregex_FULLMATCH = 1<<2, + cregex_NEXT = 1<<3, + cregex_STARTEND = 1<<4, /* limits */ - creg_max_classes = 16, - creg_max_captures = 32, + cregex_MAXCLASSES = 16, + cregex_MAXCAPTURES = 32, }; typedef struct { @@ -76,15 +77,26 @@ static inline cregex cregex_init(void) { int cregex_compile(cregex *self, const char* pattern, int cflags); /* number of capture groups in a regex pattern */ -int cregex_captures(cregex rx); +int cregex_captures(const cregex* self); /* return number of capture groups on success, or (negative) error code on failure. */ int cregex_match(const cregex *self, const char* string, - size_t nmatch, cregmatch match[], int mflags); + unsigned nmatch, csview match[], int mflags); -void cregex_replace(const char* src, char* dst, int dsize, - int nmatch, const cregmatch match[]); +/* replace regular expression */ +void cregex_build_replace(const char* repl, unsigned nmatch, const csview match[], + cstr (*mfun)(int i, csview match), cstr* out); +cstr cregex_replace_re(const char* input, const cregex* re, const char* repl, + cstr (*mfun)(int i, csview match), int cflags, unsigned count); + +cstr cregex_replace_fn(const char* input, const char* pattern, const char* replace, + cstr (*mfun)(int i, csview match), int cflags, unsigned count); +static inline +cstr cregex_replace(const char* input, const char* pattern, const char* replace) + { return cregex_replace_fn(input, pattern, replace, NULL, 0, 0); } + +/* destroy regex */ void cregex_drop(cregex* self); #endif diff --git a/include/stc/cstr.h b/include/stc/cstr.h index 441fe94a..8395f127 100644 --- a/include/stc/cstr.h +++ b/include/stc/cstr.h @@ -404,9 +404,9 @@ STC_DEF char* cstr_reserve(cstr* self, const size_t cap) { if (cap > cstr_s_cap) { char* data = (char *)c_malloc(cap + 1); const size_t len = cstr_s_size(self); - memcpy(data, self->sml.data, len); + memcpy(data, self->sml.data, cstr_s_cap + 1); self->lon.data = data; - cstr_l_set_size(self, len); + self->lon.size = len; cstr_l_set_cap(self, cap); return data; } @@ -525,7 +525,8 @@ STC_DEF int cstr_vfmt(cstr* self, const char* fmt, va_list args) { STC_DEF cstr cstr_from_fmt(const char* fmt, ...) { cstr s = cstr_null; - va_list args; va_start(args, fmt); + va_list args; + va_start(args, fmt); cstr_vfmt(&s, fmt, args); va_end(args); return s; @@ -533,7 +534,8 @@ STC_DEF cstr cstr_from_fmt(const char* fmt, ...) { STC_DEF int cstr_printf(cstr* self, const char* fmt, ...) { cstr s = cstr_null; - va_list args; va_start(args, fmt); + va_list args; + va_start(args, fmt); const int n = cstr_vfmt(&s, fmt, args); va_end(args); cstr_drop(self); *self = s; diff --git a/include/stc/csview.h b/include/stc/csview.h index e74ce844..39bfa354 100644 --- a/include/stc/csview.h +++ b/include/stc/csview.h @@ -120,8 +120,8 @@ STC_INLINE csview cstr_substr_ex(const cstr* self, intptr_t pos, size_t n) STC_INLINE csview cstr_slice_ex(const cstr* self, intptr_t p1, intptr_t p2) { return csview_slice_ex(csview_from_s(self), p1, p2); } -STC_INLINE csview cstr_assign_sv(cstr* self, csview sv) - { return c_make(csview){cstr_assign_n(self, sv.str, sv.size), sv.size}; } +STC_INLINE char* cstr_assign_sv(cstr* self, csview sv) + { return cstr_assign_n(self, sv.str, sv.size); } STC_INLINE void cstr_append_sv(cstr* self, csview sv) { cstr_append_n(self, sv.str, sv.size); } diff --git a/src/cregex.c b/src/cregex.c index b326b4fc..c30b66da 100644 --- a/src/cregex.c +++ b/src/cregex.c @@ -38,9 +38,9 @@ THE SOFTWARE. typedef uint32_t Rune; /* Utf8 code point */ typedef int32_t Token; /* max character classes per program */ -#define NCLASS creg_max_classes +#define NCLASS cregex_MAXCLASSES /* max subexpressions */ -#define NSUBEXP creg_max_captures +#define NSUBEXP cregex_MAXCAPTURES /* max rune ranges per character class */ #define NCCRUNE (NSUBEXP * 2) @@ -91,7 +91,7 @@ typedef struct Reprog /* * Sub expression matches */ -typedef cregmatch Resub; +typedef csview Resub; /* * substitution list @@ -228,11 +228,11 @@ utfruneicase(const char *s, Rune c) * save a new match in mp */ static void -_renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids) +_renewmatch(Resub *mp, unsigned ms, Resublist *sp, int nsubids) { int i; - if (mp==NULL || ms<=0) + if (mp==NULL || ms==0) return; if (mp[0].str == NULL || sp->m[0].str < mp[0].str || (sp->m[0].str == mp[0].str && sp->m[0].size > mp[0].size)) { @@ -249,7 +249,7 @@ _renewmatch(Resub *mp, int ms, Resublist *sp, int nsubids) static Relist* _renewthread(Relist *lp, /* _relist to add to */ Reinst *ip, /* instruction to add */ - int ms, + unsigned ms, Resublist *sep) /* pointers to subexpressions */ { Relist *p; @@ -281,7 +281,7 @@ _renewthread(Relist *lp, /* _relist to add to */ static Relist* _renewemptythread(Relist *lp, /* _relist to add to */ Reinst *ip, /* instruction to add */ - int ms, + unsigned ms, const char *sp) /* pointers to subexpressions */ { Relist *p; @@ -806,8 +806,8 @@ regcomp1(Reprog *progp, Parser *par, const char *s, int cflags) free(progp); return NULL; } - pp->flags.caseless = (cflags & creg_caseless) != 0; - pp->flags.dotall = (cflags & creg_dotall) != 0; + pp->flags.caseless = (cflags & cregex_CASELESS) != 0; + pp->flags.dotall = (cflags & cregex_DOTALL) != 0; par->freep = pp->firstinst; par->classp = pp->cclass; par->errors = 0; @@ -930,10 +930,10 @@ runematch(Rune s, Rune r, bool icase) * <0 if we ran out of _relist space */ static int -regexec1(const Reprog *progp, /* program to run */ +regexec1(const Reprog *progp, /* program to run */ const char *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ + Resub *mp, /* subexpression elements */ + unsigned ms, /* number of elements at mp */ Reljunk *j, int mflags ) @@ -1057,7 +1057,7 @@ regexec1(const Reprog *progp, /* program to run */ /* efficiency: advance and re-evaluate */ continue; case END: /* Match! */ - match = !(mflags & creg_fullmatch) || + match = !(mflags & cregex_FULLMATCH) || ((s == j->eol || r == 0 || r == '\n') && (tlp->se.m[0].str == bol || tlp->se.m[0].str[-1] == '\n')); tlp->se.m[0].size = s - tlp->se.m[0].str; @@ -1082,8 +1082,8 @@ regexec1(const Reprog *progp, /* program to run */ static int regexec2(const Reprog *progp, /* program to run */ const char *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ + Resub *mp, /* subexpression elements */ + unsigned ms, /* number of elements at mp */ Reljunk *j, int mflags ) @@ -1109,7 +1109,7 @@ regexec2(const Reprog *progp, /* program to run */ static int regexec(const Reprog *progp, /* program to run */ const char *bol, /* string to run machine on */ - int ms, /* number of elements at mp */ + unsigned ms, /* number of elements at mp */ Resub mp[], /* subexpression elements */ int mflags) { @@ -1123,10 +1123,10 @@ regexec(const Reprog *progp, /* program to run */ j.starts = bol; j.eol = NULL; - if (mp && mp->str && ms>0) { - if (mflags & creg_startend) + if (ms && mp->size) { + if (mflags & cregex_STARTEND) j.starts = mp->str, j.eol = mp->str + mp->size; - else if (mflags & creg_next) + else if (mflags & cregex_NEXT) j.starts = mp->str + mp->size; } @@ -1157,55 +1157,6 @@ regexec(const Reprog *progp, /* program to run */ * API functions */ -/* substitute into one string using the matches from the last regexec() */ -void cregex_replace( - const char *sp, /* source string */ - char *dp, /* destination string */ - int dlen, - int ms, /* number of elements pointed to by mp */ - const cregmatch mp[]) /* subexpression elements */ -{ - const char *ssp, *ep; - int i; - - ep = dp+dlen-1; - while (*sp != '\0') { - if (*sp == '\\') { - switch (*++sp) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - i = *sp - '0'; - if (mp[i].str != NULL && mp != NULL && ms > i) - for (ssp = mp[i].str; ssp < (mp[i].str + mp[i].size); ssp++) - if (dp < ep) - *dp++ = *ssp; - break; - case '\\': - if (dp < ep) - *dp++ = '\\'; - break; - case '\0': - sp--; - break; - default: - if (dp < ep) - *dp++ = *sp; - break; - } - } else if (*sp == '&') { - if (mp[0].str != NULL && mp != NULL && ms > 0) - for (ssp = mp[0].str; ssp < (mp[0].str + mp[0].size); ssp++) - if (dp < ep) - *dp++ = *ssp; - } else { - if (dp < ep) - *dp++ = *sp; - } - sp++; - } - *dp = '\0'; -} - int cregex_compile(cregex *rx, const char* pattern, int cflags) { Parser par; rx->prog = regcomp1(rx->prog, &par, pattern, cflags); @@ -1214,15 +1165,15 @@ int cregex_compile(cregex *rx, const char* pattern, int cflags) { return par.errors; } -int cregex_captures(cregex rx) { - return rx.prog ? 1 + rx.prog->nsubids : 0; +int cregex_captures(const cregex* self) { + return self->prog ? 1 + self->prog->nsubids : 0; } int cregex_match(const cregex *rx, const char* string, - size_t nmatch, cregmatch match[], int mflags) { + unsigned nmatch, csview match[], int mflags) { int res = regexec(rx->prog, string, nmatch, match, mflags); switch (res) { - case 1: return 1 + rx->prog->nsubids; + case 1: return creg_success; case 0: return creg_nomatch; default: return creg_matcherror; } diff --git a/src/utf8code.c b/src/utf8code.c index dff10409..44120cee 100644 --- a/src/utf8code.c +++ b/src/utf8code.c @@ -1,6 +1,7 @@ #include #define i_header #include +#include #include "utf8tabs.inc" const uint8_t utf8_dtab[] = { @@ -16,7 +17,7 @@ const uint8_t utf8_dtab[] = { 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, }; unsigned utf8_encode(char *out, uint32_t c) @@ -220,3 +221,75 @@ void cstr_lowercase(cstr* self) { void cstr_uppercase(cstr* self) { cstr_take(self, cstr_tocase(self, fn_toupper)); } + + +void cregex_build_replace(const char* repl, unsigned nmatch, const csview match[], + cstr (*mfun)(int i, csview match), cstr* sub) { + cstr_clear(sub); + unsigned len = 0, cap = cstr_capacity(*sub); + char* dst = cstr_data(sub); + + while (*repl != '\0') { + if (*repl == '\\') { + const char num = *++repl; + int i; + switch (num) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + i = num - '0'; + if (i < nmatch) { + csview m; + cstr s = cstr_null; + if (mfun) { s = mfun(i, match[i]); m = cstr_sv(&s); } + else m = match[i]; + if (len + m.size >= cap) + dst = cstr_reserve(sub, cap = cap*3/2 + m.size); + for (const char* rp = m.str; rp != (m.str + m.size); ++rp) + dst[len++] = *rp; + cstr_drop(&s); + } + ++repl; + case '\0': + continue; + } + } + if (len == cap) + dst = cstr_reserve(sub, cap = cap*3/2 + 4); + dst[len++] = *repl++; + } + _cstr_set_size(sub, len); +} + + +cstr cregex_replace_re(const char* input, const cregex* re, const char* repl, + cstr (*mfun)(int i, csview match), int cflags, unsigned count) { + cstr out = cstr_null; + cstr sub = cstr_null; + size_t from = 0; + csview match[cregex_MAXCAPTURES]; + unsigned nmatch = cregex_captures(re); + if (!count) count = ~0; + + while (count-- && cregex_match(re, input + from, nmatch, match, 0) > 0) { + cregex_build_replace(repl, nmatch, match, mfun, &sub); + const size_t pos = match[0].str - input; + cstr_append_n(&out, input + from, pos - from); + cstr_append_s(&out, sub); + from = pos + match[0].size; + } + cstr_append(&out, input + from); + cstr_drop(&sub); + return out; +} + + +cstr cregex_replace_fn(const char* input, const char* pattern, const char* repl, + cstr (*mfun)(int i, csview match), int cflags, unsigned count) { + cregex re = cregex_init(); + int res = cregex_compile(&re, pattern, cflags); + if (res < 0) + return cstr_new("[[cregex_replace_fn]]: invalid pattern"); + cstr out = cregex_replace_re(input, &re, repl, mfun, cflags, count); + cregex_drop(&re); + return out; +}