From: Philippe Proulx Date: Fri, 27 Oct 2023 19:21:05 +0000 (+0000) Subject: src/cpp-common: add `bt2c::StrScanner` class X-Git-Url: http://drtracing.org/?a=commitdiff_plain;h=dc64abe778d9e0dbed08a4f2fc30a8cf18147c2f;p=babeltrace.git src/cpp-common: add `bt2c::StrScanner` class This patch adds the `bt2c::StrScanner` class, defined in `str-scanner.hpp` and implemented in `str-scanner.cpp`. A string scanner is a simple lexical scanner. This one is a stripped-down version of yactfr's `yactfr::internal::StrScanner`, stripped-down because yactfr uses this to parse TSDL, therefore it needs more features. What's left for the `bt2c::StrScanner` version is: tryScanLitStr(): Tries to scan a double-quoted literal string, possibly containing escape sequences. tryScanConstInt(): Tries to scan a constant unsigned or signed decimal integer string. tryScanConstReal(): Tries to scan a real number string. tryScanToken(): Tries to scan an exact string. skipWhitespaces(): Skips the next whitespaces. See the specific comments in `str-scanner.hpp` for more details. I could have used the `GScanner` API [1], as we do to parse the value of the `--params` CLI option of `babeltrace2`, but: • `yactfr::internal::StrScanner` is already working, tested, and documented. `bt2c::StrScanner` is a much lighter version of it. • Should we ever make an effort to remove the GLib dependency, this part will already be done. • The `GScanner` API doesn't support `\u` escape sequences in literal strings (needed for JSON strings) out of the box, so we'd need this part on our side anyway. `bt2c::StrScanner` could eventually replace `GScanner` elsewhere in the tree, but it would require a few more features (which already exist in `yactfr::internal::StrScanner`, that is). This is part of an effort to implement a JSON parser to support CTF2‑SPEC‑2.0 [2]. [1]: https://docs.gtk.org/glib/struct.Scanner.html [2]: https://diamon.org/ctf/CTF2-SPEC-2.0.html Signed-off-by: Philippe Proulx Change-Id: I8317917124218618278611794f32a67be4b9a6dd Reviewed-on: https://review.lttng.org/c/babeltrace/+/7410 Reviewed-on: https://review.lttng.org/c/babeltrace/+/12681 --- diff --git a/src/Makefile.am b/src/Makefile.am index a8b4e2af..920139db 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -176,6 +176,8 @@ cpp_common_libcpp_common_la_SOURCES = \ cpp-common/bt2c/regex.hpp \ cpp-common/bt2c/safe-ops.hpp \ cpp-common/bt2c/std-int.hpp \ + cpp-common/bt2c/str-scanner.cpp \ + cpp-common/bt2c/str-scanner.hpp \ cpp-common/bt2c/text-loc.cpp \ cpp-common/bt2c/text-loc.hpp \ cpp-common/bt2c/text-loc-str.cpp \ diff --git a/src/cpp-common/bt2c/str-scanner.cpp b/src/cpp-common/bt2c/str-scanner.cpp new file mode 100644 index 00000000..d30ea0fb --- /dev/null +++ b/src/cpp-common/bt2c/str-scanner.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2015-2024 Philippe Proulx + * + * SPDX-License-Identifier: MIT + */ + +#include +#include + +#include "cpp-common/bt2s/string-view.hpp" + +#include "str-scanner.hpp" + +namespace bt2c { + +/* clang-format off */ + +const bt2c::Regex StrScanner::_realRegex { + "^" /* Start of target */ + "-?" /* Optional negation */ + "(?:0|[1-9]\\d*)" /* Integer part */ + "(?=[eE.]\\d)" /* Assertion: need fraction/exponent part */ + "(?:\\.\\d+)?" /* Optional fraction part */ + "(?:[eE][+-]?\\d+)?" /* Optional exponent part */ +}; + +/* clang-format on */ + +StrScanner::StrScanner(const bt2s::string_view str, const std::size_t baseOffset, + const Logger& logger) : + _mStr {str}, + _mAt {str.begin()}, _mLineBegin {str.begin()}, _mBaseOffset {baseOffset}, + _mLogger {logger, "STR-SCANNER"} +{ +} + +StrScanner::StrScanner(const bt2s::string_view str, const Logger& logger) : + StrScanner {str, 0, logger} +{ +} + +void StrScanner::reset() +{ + this->at(_mStr.begin()); + _mNbLines = 0; + _mLineBegin = _mStr.begin(); +} + +void StrScanner::skipWhitespaces() noexcept +{ + while (!this->isDone()) { + switch (*_mAt) { + case '\n': + this->_checkNewline(); + /* Fall through */ + case ' ': + case '\t': + case '\v': + case '\r': + this->_incrAt(); + break; + default: + return; + } + } +} + +void StrScanner::_appendEscapedUnicodeChar(const Iter at) +{ + /* Create array of four hex characters */ + std::array hexCpBuf; + + std::copy(at, at + 4, hexCpBuf.begin()); + + /* Validate hex characters */ + for (const auto ch : hexCpBuf) { + if (!std::isxdigit(ch)) { + BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW( + Error, this->loc(), "In `\\u` escape sequence: unexpected character `{:c}`.", ch); + } + } + + /* Convert hex characters to integral codepoint (always works) */ + const auto cp = std::strtoull(hexCpBuf.data(), nullptr, 16); + + /* + * Append UTF-8 bytes from integral codepoint. + * + * See . + */ + if (cp <= 0x7f) { + _mStrBuf.push_back(cp); + } else if (cp <= 0x7ff) { + _mStrBuf.push_back(static_cast((cp >> 6) + 0xc0)); + _mStrBuf.push_back(static_cast((cp & 0x3f) + 0x80)); + } else if (cp > 0xd800 && cp <= 0xdfff) { + /* Unsupported surrogate pairs */ + BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW( + Error, this->loc(), "In `\\u` escape sequence: unsupported surrogate codepoint U+{:X}.", + static_cast(cp)); + } else { + BT_ASSERT(cp <= 0xffff); + _mStrBuf.push_back(static_cast((cp >> 12) + 0xe0)); + _mStrBuf.push_back(static_cast(((cp >> 6) & 0x3f) + 0x80)); + _mStrBuf.push_back(static_cast((cp & 0x3f) + 0x80)); + } +} + +bool StrScanner::_tryAppendEscapedChar(const bt2s::string_view escapeSeqStartList) +{ + if (this->charsLeft() < 2) { + /* Need at least `\` and another character */ + return false; + } + + if (_mAt[0] != '\\') { + /* Not an escape sequence */ + return false; + } + + /* Try each character of `escapeSeqStartList` */ + for (const auto escapeSeqStart : escapeSeqStartList) { + if (_mAt[1] == '"' || _mAt[1] == '\\' || _mAt[1] == escapeSeqStart) { + /* Escape sequence detected */ + if (_mAt[1] == 'u') { + /* `\u` escape sequence */ + if (this->charsLeft() < 6) { + /* Need `\u` + four hex characters */ + BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW( + Error, this->loc(), "`\\u` escape sequence needs four hexadecimal digits."); + } + + this->_appendEscapedUnicodeChar(_mAt + 2); + this->_incrAt(6); + } else { + /* Single-character escape sequence */ + switch (_mAt[1]) { + case 'a': + _mStrBuf.push_back('\a'); + break; + case 'b': + _mStrBuf.push_back('\b'); + break; + case 'f': + _mStrBuf.push_back('\f'); + break; + case 'n': + _mStrBuf.push_back('\n'); + break; + case 'r': + _mStrBuf.push_back('\r'); + break; + case 't': + _mStrBuf.push_back('\t'); + break; + case 'v': + _mStrBuf.push_back('\v'); + break; + default: + /* As is */ + _mStrBuf.push_back(_mAt[1]); + break; + } + + this->_incrAt(2); + } + + return true; + } + } + + return false; +} + +bt2s::string_view StrScanner::tryScanLitStr(const bt2s::string_view escapeSeqStartList) +{ + this->skipWhitespaces(); + + /* Backup if we can't completely scan */ + const auto initAt = _mAt; + const auto initLineBegin = _mLineBegin; + const auto initNbLines = _mNbLines; + + /* First character: `"` or alpha */ + const auto c = this->_tryScanAnyChar(); + + if (c < 0) { + return {}; + } + + if (c != '"') { + /* Not a literal string */ + this->at(initAt); + _mLineBegin = initLineBegin; + _mNbLines = initNbLines; + return {}; + } + + /* Reset string buffer */ + _mStrBuf.clear(); + + /* + * Scan inner string, processing escape sequences during the + * process. + */ + while (!this->isDone()) { + /* Check for illegal control character */ + if (std::iscntrl(*_mAt)) { + BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW( + Error, this->loc(), "Illegal control character {:#02x} in literal string.", + static_cast(*_mAt)); + } + + /* Try to append an escaped character first */ + if (this->_tryAppendEscapedChar(escapeSeqStartList)) { + continue; + } + + /* End of literal string? */ + if (*_mAt == '"') { + /* Skip `"` */ + this->_incrAt(); + return _mStrBuf; + } + + /* Check for newline */ + this->_checkNewline(); + + /* Append regular character and go to next one */ + _mStrBuf.push_back(*_mAt); + this->_incrAt(); + } + + /* Couldn't find end of string */ + this->at(initAt); + _mLineBegin = initLineBegin; + _mNbLines = initNbLines; + return {}; +} + +bool StrScanner::tryScanToken(const bt2s::string_view token) noexcept +{ + this->skipWhitespaces(); + + /* Backup if we can't completely scan */ + const auto initAt = _mAt; + + /* Try to scan token completely */ + auto tokenAt = token.begin(); + + while (tokenAt < token.end() && _mAt != _mStr.end()) { + if (*_mAt != *tokenAt) { + /* Mismatch */ + this->at(initAt); + return false; + } + + this->_incrAt(); + ++tokenAt; + } + + if (tokenAt != token.end()) { + /* Wrapped string ends before end of token */ + this->at(initAt); + return false; + } + + /* Success */ + return true; +} + +bt2s::optional StrScanner::tryScanConstReal() noexcept +{ + this->skipWhitespaces(); + + /* + * Validate JSON number format (with fraction and/or exponent part). + * + * This is needed because std::strtod() accepts more formats which + * JSON doesn't support. + */ + if (!_realRegex.match(_mStr.substr(_mAt - _mStr.begin()))) { + return bt2s::nullopt; + } + + /* Parse */ + char *strEnd = nullptr; + const auto val = std::strtod(&(*_mAt), &strEnd); + + if (val == HUGE_VAL || (val == 0 && &(*_mAt) == strEnd) || errno == ERANGE) { + /* Couldn't parse */ + errno = 0; + return bt2s::nullopt; + } + + /* Success: update character pointer and return value */ + this->at(_mStr.begin() + (strEnd - _mStr.data())); + return val; +} + +} /* namespace bt2c */ diff --git a/src/cpp-common/bt2c/str-scanner.hpp b/src/cpp-common/bt2c/str-scanner.hpp new file mode 100644 index 00000000..254b387d --- /dev/null +++ b/src/cpp-common/bt2c/str-scanner.hpp @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2015-2024 Philippe Proulx + * + * SPDX-License-Identifier: MIT + */ + +#ifndef BABELTRACE_CPP_COMMON_BT2C_STR_SCANNER_HPP +#define BABELTRACE_CPP_COMMON_BT2C_STR_SCANNER_HPP + +#include +#include +#include + +#include "common/assert.h" +#include "cpp-common/bt2c/logging.hpp" +#include "cpp-common/bt2c/regex.hpp" +#include "cpp-common/bt2s/string-view.hpp" + +#include "text-loc.hpp" + +namespace bt2c { + +/* + * String scanner. + * + * A string scanner (lexer) wraps an input string view and scans + * specific characters and sequences of characters, managing a + * current position. + * + * When you call the various tryScan*() methods to try to scan some + * contents, the methods advance the current position on success. They + * also automatically skip initial whitespaces. + */ +class StrScanner final +{ +public: + using Iter = bt2s::string_view::const_iterator; + + /* + * Builds a string scanner, wrapping the string `str`. + * + * When the string scanner logs or appends a cause to the error of + * the current thread, it uses `baseOffset` to format the text + * location part of the message. + */ + explicit StrScanner(bt2s::string_view str, std::size_t baseOffset, const Logger& logger); + + /* + * Alternative constructor setting the `baseOffset` parameter to 0. + */ + explicit StrScanner(bt2s::string_view str, const Logger& logger); + + /* + * Returns the current position. + */ + Iter at() const noexcept + { + return _mAt; + } + + /* + * Sets the current position to `at`. + * + * NOTE: This may corrupt the current text location (loc()) if the + * string between at() and `at` includes one or more + * newline characters. + */ + void at(const Iter at) noexcept + { + BT_ASSERT_DBG(at >= _mStr.begin() && at <= _mStr.end()); + _mAt = at; + } + + /* + * Returns the viewed string, the one with which this string scanner + * was built. + */ + bt2s::string_view str() const noexcept + { + return _mStr; + } + + /* + * Returns the number of characters left until `str().end()`. + */ + std::size_t charsLeft() const noexcept + { + return _mStr.end() - _mAt; + } + + /* + * Returns the current text location considering `_mBaseOffset`. + */ + TextLoc loc() const noexcept + { + return TextLoc {_mBaseOffset + static_cast(_mAt - _mStr.begin()), _mNbLines, + static_cast(_mAt - _mLineBegin)}; + } + + /* + * Returns whether or not the end of the string is reached. + */ + bool isDone() const noexcept + { + return _mAt == _mStr.end(); + } + + /* + * Resets this string scanner, setting the current position + * to `str().begin()`. + */ + void reset(); + + /* + * Tries to scan a double-quoted literal string, considering the + * characters of `escapeSeqStartList`, `\`, and `"` as escape + * sequence starting characters, setting the current position to + * after the closing double quote on success. + * + * If `escapeSeqStartList` includes `u`, then a `\u` escape sequence + * is interpreted as in JSON: four hexadecimal characters which + * represent the value of a single Unicode codepoint. + * + * Valid examples: + * + * "salut!" + * "en circulation\nYves?" + * "\u03c9 often represents angular velocity in physics" + * + * Returns a view of the escaped string, without beginning/end + * double quotes, on success, or an empty view if there's no + * double-quoted literal string (or if the method reaches + * `str().end()` before a closing `"`). + * + * Logs and appends a cause to the error of the current thread, + * throwing `Error`, if the scanning method finds an invalid escape + * sequence or an illegal control character. + * + * The returned string view remains valid as long as you don't call + * any method of this object. + */ + bt2s::string_view tryScanLitStr(bt2s::string_view escapeSeqStartList); + + /* + * Tries to scan and decode a constant integer string, possibly + * negative if `ValT` (either `unsigned long long` or `long long`) + * is signed. + * + * Valid examples: + * + * 9283 + * -42 + * 0 + * + * Returns `bt2s::nullopt` if the method couldn't scan a + * constant integer. + * + * Sets the current position to after this constant integer string + * on success. + */ + template + bt2s::optional tryScanConstInt() noexcept; + + /* + * Tries to scan and decode a constant unsigned integer string. + * + * Returns `bt2s::nullopt` if the method couldn't scan a constant + * unsigned integer. + * + * Sets the current position to after this constant unsigned integer + * string on success. + */ + bt2s::optional tryScanConstUInt() noexcept + { + return this->tryScanConstInt(); + } + + /* + * Tries to scan and decode a constant signed integer string, + * possibly negative. + * + * Returns `bt2s::nullopt` if the method couldn't scan a constant + * signed integer. + * + * Sets the current position to after this constant signed integer + * string on success. + */ + bt2s::optional tryScanConstSInt() noexcept + { + return this->tryScanConstInt(); + } + + /* + * Tries to scan and decode a constant real number string, returning + * `bt2s::nullopt` if not possible. + * + * The format of the real number string to scan is the JSON + * () number one, _with_ a fraction or an + * exponent part. Without a fraction/exponent part, this method + * returns `bt2s::nullopt`: use tryScanConstInt() to try scanning a + * constant integer instead. + * + * Valid examples: + * + * 17.2 + * -42.192 + * 8e9 + * 17E12 + * 9.14e+6 + * -13.2777E-4 + * 0.0 + * -0.0 + * + * Sets the current position to after this constant real number + * string on success. + */ + bt2s::optional tryScanConstReal() noexcept; + + /* + * Tries to scan the specific token `token`, setting the current + * position to after this string and returning `true` on success. + */ + bool tryScanToken(bt2s::string_view token) noexcept; + + /* + * Skips the next whitespaces, updating the current position. + */ + void skipWhitespaces() noexcept; + +private: + /* + * Tries to negate `ullVal` as a signed integer value if `ValT` is + * signed and `negate` is true, returning `bt2s::nullopt` if it + * can't. + * + * Always succeeds when `ValT` is unsigned. + */ + template + static bt2s::optional _tryNegateConstInt(unsigned long long ullVal, bool negate) noexcept; + + /* + * Handles a `\u` escape sequence, appending the UTF-8-encoded + * Unicode character to `_mStrBuf` on success, or throwing `Error` + * on error. + * + * `at` is the position of the first hexadecimal character + * after `\u`. + */ + void _appendEscapedUnicodeChar(Iter at); + + /* + * Tries to append an escaped character to `_mStrBuf` from the + * escape sequence characters at the current positin, considering + * the characters of `escapeSeqStartList`, `\`, and `"` as escape + * sequence starting characters. + */ + bool _tryAppendEscapedChar(bt2s::string_view escapeSeqStartList); + + /* + * Tries to scan any character, returning it and advancing the + * current position on success, or returning -1 if the current + * position is `str().end()`. + */ + int _tryScanAnyChar() noexcept + { + if (this->isDone()) { + return -1; + } + + const auto c = *_mAt; + + this->_incrAt(); + return c; + } + + /* + * Checks if the character at the current position is a newline, + * updating the line count and line beginning position if so. + */ + void _checkNewline() noexcept + { + if (*_mAt == '\n') { + ++_mNbLines; + _mLineBegin = _mAt + 1; + } + } + + /* + * Increments `_mAt` by `count`. + */ + void _incrAt(const std::size_t count = 1) noexcept + { + _mAt += count; + BT_ASSERT_DBG(_mAt <= _mStr.end()); + } + + /* + * Decrements `_mAt` by `count`. + */ + void _decrAt(const std::size_t count = 1) noexcept + { + _mAt -= count; + BT_ASSERT_DBG(_mAt >= _mStr.begin()); + } + +private: + /* Viewed string, given by user */ + bt2s::string_view _mStr; + + /* Current position within `_mStr` */ + Iter _mAt; + + /* Beginning of the current line */ + Iter _mLineBegin; + + /* Number of lines scanned so far */ + std::size_t _mNbLines = 0; + + /* String buffer, used by tryScanToken() and tryScanLitStr() */ + std::string _mStrBuf; + + /* Real number string regex */ + static const bt2c::Regex _realRegex; + + /* Base offset for error messages */ + std::size_t _mBaseOffset; + + /* Logging configuration */ + Logger _mLogger; +}; + +template +bt2s::optional StrScanner::_tryNegateConstInt(const unsigned long long ullVal, + const bool negate) noexcept +{ + /* Check for overflow */ + if (std::is_signed::value) { + constexpr auto llMaxAsUll = + static_cast(std::numeric_limits::max()); + + if (negate) { + if (ullVal > llMaxAsUll + 1) { + return bt2s::nullopt; + } + } else { + if (ullVal > llMaxAsUll) { + return bt2s::nullopt; + } + } + } + + /* Success: cast and negate if needed */ + auto val = static_cast(ullVal); + + if (negate) { + val *= static_cast(-1); + } + + return val; +} + +template +bt2s::optional StrScanner::tryScanConstInt() noexcept +{ + static_assert(std::is_same::value || + std::is_same::value, + "`ValT` is `long long` or `unsigned long long`."); + + this->skipWhitespaces(); + + /* Backup if we can't scan completely */ + const auto initAt = _mAt; + + /* Scan initial character */ + const auto c = this->_tryScanAnyChar(); + + if (c < 0) { + /* Nothing left */ + return bt2s::nullopt; + } + + /* Check for negation */ + const bool negate = (c == '-'); + + if (negate && !std::is_signed::value) { + /* Can't negate an unsigned integer */ + this->at(initAt); + return bt2s::nullopt; + } + + if (!negate) { + /* No negation: rewind */ + this->_decrAt(); + } + + /* + * Only allow a digit at this point: std::strtoull() below supports + * an initial `+`, but this scanner doesn't. + */ + if (this->isDone() || !std::isdigit(*_mAt)) { + this->at(initAt); + return bt2s::nullopt; + } + + /* Parse */ + char *strEnd = nullptr; + const auto ullVal = std::strtoull(&(*_mAt), &strEnd, 10); + + if ((ullVal == 0 && &(*_mAt) == strEnd) || errno == ERANGE) { + /* Couldn't parse */ + errno = 0; + this->at(initAt); + return bt2s::nullopt; + } + + /* Negate if needed */ + const auto val = this->_tryNegateConstInt(ullVal, negate); + + if (!val) { + /* Couldn't negate */ + this->at(initAt); + return bt2s::nullopt; + } + + /* Success: update current position and return value */ + this->at(_mStr.begin() + (strEnd - _mStr.data())); + return val; +} + +} /* namespace bt2c */ + +#endif /* BABELTRACE_CPP_COMMON_BT2C_STR_SCANNER_HPP */