diff --git a/ext/miniscript/src/miniscript/utilities/UTF8StringTokenizer.cpp b/ext/miniscript/src/miniscript/utilities/UTF8StringTokenizer.cpp new file mode 100644 index 000000000..d7bfd6375 --- /dev/null +++ b/ext/miniscript/src/miniscript/utilities/UTF8StringTokenizer.cpp @@ -0,0 +1,56 @@ +#include +#include + +#include +#include +#include +#include + +using std::string; + +using miniscript::utilities::Character; +using miniscript::utilities::UTF8StringTokenizer; +using miniscript::utilities::UTF8CharacterIterator; + +void UTF8StringTokenizer::tokenize(const string& str, const string& delimiters, bool emptyTokens) +{ + idx = 0; + tokens.clear(); + // + string token; + // + UTF8CharacterIterator u8It(str); + UTF8CharacterIterator delimiterU8It(delimiters); + // iterate string value + for (; u8It.hasNext(); ) { + auto c = u8It.next(); + // iterate delimiters + delimiterU8It.reset(); + // + auto foundDelimiter = false; + for (; delimiterU8It.hasNext(); ) { + // check if delimiter character is our current string value char + auto dc = delimiterU8It.next(); + // got a delimiter? + if (c == dc) { + foundDelimiter = true; + // yep, add token to elements if we have any + if (emptyTokens == true || token.empty() == false) { + tokens.push_back(token); + token.clear(); + } + // + break; + } + } + // + if (foundDelimiter == false) { + // no delimiter, add char to token + token+= Character::toString(c); + } + } + // do we have a token still? add it to elements + if (emptyTokens == true || token.empty() == false) { + tokens.push_back(token); + } +} diff --git a/ext/miniscript/src/miniscript/utilities/UTF8StringTokenizer.h b/ext/miniscript/src/miniscript/utilities/UTF8StringTokenizer.h new file mode 100644 index 000000000..496ba6ce1 --- /dev/null +++ b/ext/miniscript/src/miniscript/utilities/UTF8StringTokenizer.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include + +#include +#include + +using std::string; +using std::vector; + +/** + * UTF8 string tokenizer class + * @author Andreas Drewke + */ +class miniscript::utilities::UTF8StringTokenizer +{ + +private: + vector tokens; + int idx { 0 }; + +public: + /** + * Public constructor + */ + UTF8StringTokenizer() {}; + + /** + * Tokenize + * @param str string to tokenize + * @param delimiters delimiters + * @param emptyTokens include empty tokens + */ + void tokenize(const string& str, const string& delimiters, bool emptyTokens = false); + + /** + * @return number of tokens + */ + inline int32_t countTokens() { + return tokens.size(); + } + + /** + * @return has more tokens + */ + inline bool hasMoreTokens() { + return idx != tokens.size(); + } + + /** + * @return next token + */ + inline const string& nextToken() { + return tokens[idx++]; + } + + /** + * @return tokens + */ + inline const vector& getTokens() { + return tokens; + } + +}; diff --git a/ext/miniscript/src/miniscript/utilities/UTF8StringTools.cpp b/ext/miniscript/src/miniscript/utilities/UTF8StringTools.cpp new file mode 100644 index 000000000..e24e15afb --- /dev/null +++ b/ext/miniscript/src/miniscript/utilities/UTF8StringTools.cpp @@ -0,0 +1,274 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include + +using std::regex; +using std::regex_match; +using std::regex_replace; +using std::string; +using std::string_view; + +using miniscript::utilities::UTF8StringTools; + +using miniscript::utilities::Character; +using miniscript::utilities::UTF8StringTokenizer; +using miniscript::utilities::UTF8CharacterIterator; + +const string UTF8StringTools::replace( + const string& str, + const string& what, + const string& by, + int64_t beginIndex, + UTF8CharacterIterator::UTF8PositionCache* cache) +{ + auto binaryBeginIndex = getUTF8BinaryIndex(str, beginIndex, cache); + // + string result = str; + if (what.empty()) return result; + while ((binaryBeginIndex = result.find(what, binaryBeginIndex)) != std::string::npos) { + result.replace(binaryBeginIndex, what.length(), by); + binaryBeginIndex += by.length(); + } + return result; +} + +int64_t UTF8StringTools::firstIndexOf( + const string& str, + const string& what, + int64_t beginIndex, + UTF8CharacterIterator::UTF8PositionCache* cache +) { + // utf8 character iterator + UTF8CharacterIterator u8It(str, cache); + u8It.seekCharacterPosition(beginIndex); + // + auto index = str.find(what, u8It.getBinaryPosition()); + if (index == string::npos) { + return string::npos; + } else { + u8It.seekBinaryPosition(index); + return static_cast(u8It.getCharacterPosition()); + } +} + +int64_t UTF8StringTools::lastIndexOf( + const string& str, + const string& what, + int64_t endIndex, + UTF8CharacterIterator::UTF8PositionCache* cache +) { + // utf8 character iterator + UTF8CharacterIterator u8It(str, cache); + if (endIndex == string::npos) endIndex = getLength(str, cache); + // + int64_t binaryIndex = 0; + int64_t index = string::npos; + while (true == true) { + auto whatBinaryIndex = str.find(what, binaryIndex); + if (whatBinaryIndex == string::npos) { + return index; + } else { + u8It.seekBinaryPosition(whatBinaryIndex); + if (u8It.getCharacterPosition() < endIndex) { + index = u8It.getCharacterPosition(); + binaryIndex = whatBinaryIndex + what.size(); + } else { + return index; + } + } + } + // + return index; +} + +int64_t UTF8StringTools::firstIndexOfChars(const string& str, const string& what, int64_t beginIndex, UTF8CharacterIterator::UTF8PositionCache* strCache, UTF8CharacterIterator::UTF8PositionCache* whatCache) { + // utf8 character iterator + UTF8CharacterIterator whatU8It(what, whatCache); + // + int64_t index = string::npos; + while (whatU8It.hasNext() == true) { + auto whatChar = Character::toString(whatU8It.next()); + auto whatIndex = UTF8StringTools::indexOf(str, whatChar, beginIndex, strCache); + if (whatIndex != string::npos) index = index == string::npos?whatIndex:(::miniscript::math::Math::min(index, whatIndex)); + } + // + if (index == string::npos) { + return string::npos; + } else { + return index; + } +} + +int64_t UTF8StringTools::lastIndexOfChars(const string& str, const string& what, int64_t endIndex, UTF8CharacterIterator::UTF8PositionCache* strCache, UTF8CharacterIterator::UTF8PositionCache* whatCache) { + // utf8 character iterator + UTF8CharacterIterator whatU8It(what, whatCache); + if (endIndex == string::npos) endIndex = getLength(str, strCache); + // + int64_t currentIndex = 0; + int64_t index = string::npos; + while (true == true) { + auto hit = false; + while (whatU8It.hasNext() == true) { + auto whatChar = Character::toString(whatU8It.next()); + auto whatIndex = UTF8StringTools::indexOf(str, whatChar, currentIndex, strCache); + if (whatIndex != string::npos) { + hit = true; + index = ::miniscript::math::Math::max(index, whatIndex); + } + // + currentIndex++; + } + if (hit == false) break; + } + // + if (index == string::npos) { + return string::npos; + } else { + return index; + } +} + +const string_view UTF8StringTools::viewSubstring(const string_view& str, int64_t beginIndex, int64_t endIndex, UTF8CharacterIterator::UTF8PositionCache* cache) { + // utf8 character iterator + UTF8CharacterIterator u8It(str, cache); + u8It.seekCharacterPosition(beginIndex); + auto binaryBeginIndex = u8It.getBinaryPosition(); + // + if (endIndex == string::npos) { + return str.substr(binaryBeginIndex); + } else { + u8It.seekCharacterPosition(endIndex) ; + auto binaryEndIndex = u8It.getBinaryPosition(); + // + return str.substr(binaryBeginIndex, binaryEndIndex - binaryBeginIndex); + } +} + +bool UTF8StringTools::equalsIgnoreCase( + const string& string1, + const string& string2, + UTF8CharacterIterator::UTF8PositionCache* string1Cache, + UTF8CharacterIterator::UTF8PositionCache* string2Cache +) { + if (getLength(string1, string1Cache) != getLength(string2, string2Cache)) return false; + // utf8 character iterator + UTF8CharacterIterator string1U8It(string1, string1Cache); + UTF8CharacterIterator string2U8It(string1, string2Cache); + // + while (true == true) { + auto string1Next = string1U8It.hasNext(); + auto string2Next = string2U8It.hasNext(); + if (string1Next != string2Next) return false; + if (string1Next == false) return true; + auto c1 = Character::toUpperCase(string1U8It.next()); + auto c2 = Character::toUpperCase(string2U8It.next()); + if (c1 != c2) return false; + } + // + return false; +} + +const string UTF8StringTools::trim(const string& str, UTF8CharacterIterator::UTF8PositionCache* cache) { + auto result = viewTrim(string_view(str), cache); + return string(result.data(), result.size()); +} + +const string_view UTF8StringTools::viewTrim(const string_view& str, UTF8CharacterIterator::UTF8PositionCache* cache) { + // utf8 character iterator + UTF8CharacterIterator u8It(str, cache); + // + int64_t firstNonSpace = string::npos; + int64_t lastNonSpace = string::npos; + while (u8It.hasNext() == true) { + auto c = u8It.next(); + if (Character::isSpace(c) == false) { + if (firstNonSpace == string::npos) firstNonSpace = u8It.getCharacterPosition() - 1; + lastNonSpace = u8It.getCharacterPosition() - 1; + } + } + // + if (firstNonSpace == string::npos) return string(); + // + return viewSubstring(str, firstNonSpace, lastNonSpace + 1); +} + +const string UTF8StringTools::toLowerCase(const string& str, UTF8CharacterIterator::UTF8PositionCache* cache) { + string result; + // utf8 character iterator + UTF8CharacterIterator u8It(str, cache); + // + while (u8It.hasNext() == true) Character::appendToString(result, Character::toLowerCase(u8It.next())); + // + return result; +} + +const string UTF8StringTools::toUpperCase(const string& str, UTF8CharacterIterator::UTF8PositionCache* cache) { + string result; + // utf8 character iterator + UTF8CharacterIterator u8It(str, cache); + // + while (u8It.hasNext() == true) Character::appendToString(result, Character::toUpperCase(u8It.next())); + // + return result; +} + +bool UTF8StringTools::regexMatch(const string& str, const string& pattern) { + // TODO: return found groups + return regex_match(str, regex(pattern, std::regex::ECMAScript)); +} + +bool UTF8StringTools::regexSearch(const string& str, const string& pattern) { + // TODO: return found groups + return regex_search(str, regex(pattern, std::regex::ECMAScript)); +} + +const string UTF8StringTools::regexReplace(const string& str, const string& pattern, const string& by) { + return regex_replace(str, regex(pattern, std::regex::ECMAScript), by); +} + +const vector UTF8StringTools::tokenize(const string& str, const string& delimiters, bool emptyTokens) { + UTF8StringTokenizer t; + t.tokenize(str, delimiters, emptyTokens); + return t.getTokens(); +} + +const string UTF8StringTools::padLeft(const string& str, const string& by, int64_t toLength, UTF8CharacterIterator::UTF8PositionCache* cache) { + auto result = str; + while (getLength(result) < toLength) result = by + result; + return result; +} + +const string UTF8StringTools::padRight(const string& str, const string& by, int64_t toLength, UTF8CharacterIterator::UTF8PositionCache* cache) { + auto result = str; + UTF8CharacterIterator::UTF8PositionCache resultCache; + if (cache != nullptr) resultCache = *cache; + while (getLength(result, &resultCache) < toLength) result = result + by; + return result; +} + +int64_t UTF8StringTools::getLength(const string& str, UTF8CharacterIterator::UTF8PositionCache* cache) { + UTF8CharacterIterator u8It(str, cache); + u8It.seekCharacterPosition(4611686018427387903); // 2 ^ 62 - 1 + return u8It.getCharacterPosition(); +} + +const string UTF8StringTools::getCharAt(const string& str, int64_t index, UTF8CharacterIterator::UTF8PositionCache* cache) { + // utf8 character iterator + UTF8CharacterIterator u8It(str, cache); + u8It.seekCharacterPosition(index); + // + return u8It.hasNext() == true?Character::toString(u8It.next()):string(); +} + +int64_t UTF8StringTools::getUTF8BinaryIndex(const string& str, int64_t charIdx, UTF8CharacterIterator::UTF8PositionCache* cache) { + UTF8CharacterIterator u8It(str, cache); + u8It.seekCharacterPosition(charIdx); + return u8It.getBinaryPosition(); +} diff --git a/ext/miniscript/src/miniscript/utilities/UTF8StringTools.h b/ext/miniscript/src/miniscript/utilities/UTF8StringTools.h new file mode 100644 index 000000000..950e62d6f --- /dev/null +++ b/ext/miniscript/src/miniscript/utilities/UTF8StringTools.h @@ -0,0 +1,327 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include + +using std::string; +using std::string_view; +using std::vector; + +/** + * UTF8 String tools class + * @author Andreas Drewke + */ +class miniscript::utilities::UTF8StringTools final +{ +public: + /** + * Checks if string starts with prefix + * @param str string + * @param prefix prefix string + * @return if string starts with prefix + */ + inline static const bool startsWith(const string& str, const string& prefix) { + return str.find(prefix) == 0; + } + + /** + * Checks if string starts with prefix + * @param str string + * @param prefix prefix string + * @return if string starts with prefix + */ + inline static const bool viewStartsWith(const string_view& str, const string& prefix) { + return str.find(prefix) == 0; + } + + /** + * Checks if string ends with suffix + * @param str string + * @param suffix suffix string + * @return if string ends with suffix + */ + inline static const bool endsWith(const string& str, const string& suffix) { + return + str.size() >= suffix.size() && + str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0; + } + + /** + * Checks if string ends with suffix + * @param str string + * @param suffix suffix string + * @return if string ends with suffix + */ + inline static const bool viewEndsWith(const string_view& str, const string& suffix) { + return + str.size() >= suffix.size() && + str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0; + } + + /** + * Replace string with another string + * @param str string + * @param what what to replace + * @param by to replace by + * @param beginIndex index to begin with + * @param cache str UTF8 position cache + * @return replace result + */ + static const string replace( + const string& str, + const string& what, + const string& by, + int64_t beginIndex = 0, + ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr + ); + + /** + * Finds first index of given string + * @param str string + * @param what what + * @param beginIndex begin index + * @param cache str UTF8 position cache + * @return index or string::npos if not found + */ + inline static int64_t indexOf( + const string& str, + const string& what, + int64_t beginIndex = 0, + ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr + ) { + return firstIndexOf(str, what, beginIndex, cache); + } + + /** + * Finds first index of given string + * @param str string + * @param what what + * @param beginIndex begin index + * @param cache str UTF8 position cache + * @return index or string::npos if not found + */ + static int64_t firstIndexOf( + const string& str, + const string& what, + int64_t beginIndex = 0, + ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr + ); + + /** + * Finds last index of given string + * @param str string + * @param what what + * @param endIndex end index or string::npos + * @param cache str UTF8 position cache + * @return index or string::npos if not found + */ + static int64_t lastIndexOf( + const string& str, + const string& what, + int64_t endIndex = string::npos, + ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr + ); + + /** + * Finds first index of characters provided within given string + * @param str string + * @param what what + * @param beginIndex begin index + * @param srcCache str UTF8 position cache + * @param whatCache what UTF8 position cache + * @return index or string::npos if not found + */ + static int64_t firstIndexOfChars(const string& str, const string& what, int64_t beginIndex = 0, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* srcCache = nullptr, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* whatCache = nullptr); + + /** + * Finds last index of characters provided within given string + * @param str string + * @param what what + * @param endIndex end index or string::npos + * @param srcCache str UTF8 position cache + * @param whatCache what UTF8 position cache + * @return index or string::npos if not found + */ + static int64_t lastIndexOfChars(const string& str, const string& what, int64_t endIndex = string::npos, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* srcCache = nullptr, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* whatCache = nullptr); + + /** + * Returns substring of given string from begin index to end index + * @param str string + * @param beginIndex begin index + * @param endIndex end index or string::npos + * @param cache str UTF8 position cache + * @return substring result + */ + inline static const string substring(const string& str, int64_t beginIndex, int64_t endIndex = string::npos, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* srcCache = nullptr, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr) { + auto result = viewSubstring(string_view(str), beginIndex, endIndex, cache); + return string(result.data(), result.size()); + } + + /** + * Returns substring of given string from begin index to end index + * @param str string + * @param beginIndex begin index + * @param endIndex end index or string::npos + * @param cache str UTF8 position cache + * @return substring result + */ + static const string_view viewSubstring(const string_view& str, int64_t beginIndex, int64_t endIndex, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Checks if strings equal ignoring case + * @param string1 string 1 + * @param string2 string 2 + * @param string1Cache string1 UTF8 position cache + * @param string2Cache string2 UTF8 position cache + * @return equality + */ + static bool equalsIgnoreCase( + const string& string1, + const string& string2, + ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* string1Cache = nullptr, + ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* string2Cache = nullptr + ); + + /** + * Trim string + * @param str string + * @param cache UTF8 position cache + * @return trimmed string + */ + static const string trim(const string& str, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Trim string + * @param str string + * @param cache UTF8 position cache + * @return trimmed string + */ + static const string_view viewTrim(const string_view& str, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Transform string to lower case + * @param str string + * @param cache UTF8 position cache + * @return lowercase string + */ + static const string toLowerCase(const string& str, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Transform string to upper case + * @param str string + * @param cache UTF8 position cache + * @return uppercase string + */ + static const string toUpperCase(const string& str, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Check if pattern matches whole string + * @param str string + * @param pattern pattern + * @return if pattern matches whole string + */ + static bool regexMatch(const string& str, const string& pattern); + + /** + * Do regex pattern search + * @param str string + * @param pattern pattern + * @return if search was successful + */ + static bool regexSearch(const string& str, const string& pattern); + + /** + * Replace regex pattern with given string + * @param str string + * @param pattern pattern + * @param by replace string + * @return replace result + */ + static const string regexReplace(const string& str, const string& pattern, const string& by); + + /** + * Tokenize + * @param str string + * @param delimiters delimiters + * @param emptyTokens include empty tokens + * @return tokens + */ + static const vector tokenize(const string& str, const string& delimiters, bool emptyTokens = false); + + /** + * Pad a string left + * @param str string + * @param by by + * @param toLength to length + * @param cache str UTF8 position cache + * @return padded string + */ + static const string padLeft(const string& str, const string& by, int64_t toLength, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Pad a string right + * @param str string + * @param by by + * @param toLength to length + * @param cache str UTF8 position cache + * @return padded string + */ + static const string padRight(const string& str, const string& by, int64_t toLength, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Indent a string + * @param str string + * @param with with + * @param count count + * @return resulting string + */ + inline static const string indent(const string& str, const string& with, int64_t count) { + string result; + for (auto i = 0; i < count; i++) result+= with; + return result + str; + } + + /** + * Generate a string + * @param what what + * @param count count + * @return resulting string + */ + inline static const string generate(const string& what, int64_t count = 1) { + string result; + for (auto i = 0; i < count; i++) result+= what; + return result; + } + + /** + * Get UTF8 string length + * @param str string + * @param cache UTF8 position cache + * @return UTF8 string length + */ + static int64_t getLength(const string& str, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Get UTF8 character at given index + * @param str string + * @param index index + * @param cache UTF8 position cache + */ + static const string getCharAt(const string& str, int64_t index, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + + /** + * Get UTF8 binary buffer index + * @param str string + * @param charIdx character index + * @param cache UTF8 position cache + * @return UTF binary buffer position from given character/code point index + */ + static int64_t getUTF8BinaryIndex(const string& str, int64_t charIdx, ::miniscript::utilities::UTF8CharacterIterator::UTF8PositionCache* cache = nullptr); + +}; +