--- /dev/null
+/*
+ * Copyright (c) 2015-2024 Philippe Proulx <pproulx@efficios.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <array>
+#include <cmath>
+
+#include "cpp-common/bt2s/string-view.hpp"
+
+#include "str-scanner.hpp"
+
+namespace bt2c {
+
+/* clang-format off */
+
+/*
+ * Regex validating the JSON number format, requiring a fraction and/or
+ * an exponent part after the integer part.
+ *
+ * The lookahead assertion accepts either `.` followed by a digit, or
+ * `e`/`E` followed by an optional sign and a digit, so that signed
+ * exponents directly following the integer part (for example `8e+9` or
+ * `1E-4`) are accepted like the exponent subpattern below allows.
+ */
+const bt2c::Regex StrScanner::_realRegex {
+    "^"                         /* Start of target */
+    "-?"                        /* Optional negation */
+    "(?:0|[1-9]\\d*)"           /* Integer part */
+    "(?=\\.\\d|[eE][+-]?\\d)"   /* Assertion: need fraction/exponent part */
+    "(?:\\.\\d+)?"              /* Optional fraction part */
+    "(?:[eE][+-]?\\d+)?"        /* Optional exponent part */
+};
+
+/* clang-format on */
+
+/*
+ * Builds a string scanner viewing `str`, with both the current position
+ * and the beginning of the current line set to the beginning of `str`.
+ *
+ * `baseOffset` is kept to compute text locations (see loc()), and the
+ * logger of the scanner is built from `logger` with the `STR-SCANNER`
+ * tag.
+ */
+StrScanner::StrScanner(const bt2s::string_view str, const std::size_t baseOffset,
+                       const Logger& logger) :
+    _mStr {str},
+    _mAt {str.begin()},
+    _mLineBegin {str.begin()},
+    _mBaseOffset {baseOffset},
+    _mLogger {logger, "STR-SCANNER"}
+{
+}
+
+/*
+ * Builds a string scanner viewing `str`, delegating to the main
+ * constructor with a base offset of zero.
+ */
+StrScanner::StrScanner(const bt2s::string_view str, const Logger& logger) :
+    StrScanner {str, std::size_t {0}, logger}
+{
+}
+
+/*
+ * Rewinds this scanner to the beginning of the viewed string, also
+ * resetting the line count and the beginning of the current line.
+ */
+void StrScanner::reset()
+{
+    const auto strBegin = _mStr.begin();
+
+    _mLineBegin = strBegin;
+    _mNbLines = 0;
+    this->at(strBegin);
+}
+
+/*
+ * Advances the current position past any run of space, horizontal tab,
+ * vertical tab, carriage return, and newline characters, stopping at
+ * the first other character or at the end of the viewed string.
+ *
+ * Each skipped newline updates the line tracking state through
+ * _checkNewline().
+ */
+void StrScanner::skipWhitespaces() noexcept
+{
+    for (; !this->isDone(); this->_incrAt()) {
+        const auto ch = *_mAt;
+
+        if (ch == '\n') {
+            /* Account for the new line before moving past it */
+            this->_checkNewline();
+            continue;
+        }
+
+        if (ch != ' ' && ch != '\t' && ch != '\v' && ch != '\r') {
+            /* Not a whitespace: stop here */
+            return;
+        }
+    }
+}
+
+/*
+ * Decodes the four hexadecimal characters starting at `at` as a single
+ * Unicode codepoint and appends its UTF-8 encoding to `_mStrBuf`.
+ *
+ * The caller must guarantee that at least four characters are
+ * available from `at`.
+ *
+ * Logs, appends a cause to the error of the current thread, and throws
+ * `Error` if one of the four characters isn't a hexadecimal digit or
+ * if the codepoint is a surrogate (U+D800 to U+DFFF), which this
+ * scanner doesn't support.
+ */
+void StrScanner::_appendEscapedUnicodeChar(const Iter at)
+{
+    /*
+     * Create array of four hex characters, plus a terminating null
+     * character (value-initialized) which std::strtoull() below
+     * requires to know where to stop reading.
+     */
+    std::array<char, 5> hexCpBuf {};
+
+    std::copy(at, at + 4, hexCpBuf.begin());
+
+    /* Validate hex characters (not the null terminator) */
+    for (auto it = hexCpBuf.begin(); it != hexCpBuf.begin() + 4; ++it) {
+        const auto ch = *it;
+
+        /*
+         * Convert to `unsigned char` first: passing a negative value
+         * to std::isxdigit() is undefined behaviour.
+         */
+        if (!std::isxdigit(static_cast<unsigned char>(ch))) {
+            BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW(
+                Error, this->loc(), "In `\\u` escape sequence: unexpected character `{:c}`.", ch);
+        }
+    }
+
+    /* Convert hex characters to integral codepoint (always works) */
+    const auto cp = std::strtoull(hexCpBuf.data(), nullptr, 16);
+
+    /*
+     * Append UTF-8 bytes from integral codepoint.
+     *
+     * See <https://en.wikipedia.org/wiki/UTF-8#Encoding>.
+     */
+    if (cp <= 0x7f) {
+        /* Single byte */
+        _mStrBuf.push_back(static_cast<char>(cp));
+    } else if (cp <= 0x7ff) {
+        /* Two bytes */
+        _mStrBuf.push_back(static_cast<char>((cp >> 6) + 0xc0));
+        _mStrBuf.push_back(static_cast<char>((cp & 0x3f) + 0x80));
+    } else if (cp >= 0xd800 && cp <= 0xdfff) {
+        /* Unsupported surrogate pairs (whole U+D800 to U+DFFF range) */
+        BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW(
+            Error, this->loc(), "In `\\u` escape sequence: unsupported surrogate codepoint U+{:X}.",
+            static_cast<unsigned int>(cp));
+    } else {
+        /* Three bytes: four hex digits can't exceed U+FFFF */
+        BT_ASSERT(cp <= 0xffff);
+        _mStrBuf.push_back(static_cast<char>((cp >> 12) + 0xe0));
+        _mStrBuf.push_back(static_cast<char>(((cp >> 6) & 0x3f) + 0x80));
+        _mStrBuf.push_back(static_cast<char>((cp & 0x3f) + 0x80));
+    }
+}
+
+/*
+ * Tries to decode an escape sequence at the current position, appending
+ * the resulting character(s) to `_mStrBuf` and advancing the current
+ * position past the sequence on success.
+ *
+ * The character following `\` must be `"`, `\`, or one of the
+ * characters of `escapeSeqStartList`. `"` and `\` are always accepted,
+ * even when `escapeSeqStartList` is empty.
+ *
+ * Returns `false`, without changing anything, if there's no escape
+ * sequence at the current position.
+ *
+ * Throws `Error` on an incomplete or invalid `\u` escape sequence.
+ */
+bool StrScanner::_tryAppendEscapedChar(const bt2s::string_view escapeSeqStartList)
+{
+    if (this->charsLeft() < 2) {
+        /* Need at least `\` and another character */
+        return false;
+    }
+
+    if (_mAt[0] != '\\') {
+        /* Not an escape sequence */
+        return false;
+    }
+
+    const auto escChar = _mAt[1];
+
+    /*
+     * `"` and `\` always start an escape sequence; check them outside
+     * the loop so that they work with an empty `escapeSeqStartList`.
+     */
+    bool isEscapeSeq = (escChar == '"' || escChar == '\\');
+
+    /* Otherwise, try each character of `escapeSeqStartList` */
+    for (auto it = escapeSeqStartList.begin(); !isEscapeSeq && it != escapeSeqStartList.end();
+         ++it) {
+        isEscapeSeq = (*it == escChar);
+    }
+
+    if (!isEscapeSeq) {
+        return false;
+    }
+
+    /* Escape sequence detected */
+    if (escChar == 'u') {
+        /* `\u` escape sequence */
+        if (this->charsLeft() < 6) {
+            /* Need `\u` + four hex characters */
+            BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW(
+                Error, this->loc(), "`\\u` escape sequence needs four hexadecimal digits.");
+        }
+
+        this->_appendEscapedUnicodeChar(_mAt + 2);
+        this->_incrAt(6);
+        return true;
+    }
+
+    /* Single-character escape sequence */
+    switch (escChar) {
+    case 'a':
+        _mStrBuf.push_back('\a');
+        break;
+    case 'b':
+        _mStrBuf.push_back('\b');
+        break;
+    case 'f':
+        _mStrBuf.push_back('\f');
+        break;
+    case 'n':
+        _mStrBuf.push_back('\n');
+        break;
+    case 'r':
+        _mStrBuf.push_back('\r');
+        break;
+    case 't':
+        _mStrBuf.push_back('\t');
+        break;
+    case 'v':
+        _mStrBuf.push_back('\v');
+        break;
+    default:
+        /* As is */
+        _mStrBuf.push_back(escChar);
+        break;
+    }
+
+    this->_incrAt(2);
+    return true;
+}
+
+/*
+ * Tries to scan a double-quoted literal string after skipping initial
+ * whitespaces, decoding escape sequences into `_mStrBuf`.
+ *
+ * On success, sets the current position to after the closing `"` and
+ * returns a view of `_mStrBuf`.
+ *
+ * Returns an empty view if the next character isn't `"` or if the end
+ * of the viewed string is reached before a closing `"`; in both cases
+ * the current position and the line tracking state are the ones which
+ * followed the initial whitespace skipping.
+ *
+ * Throws `Error` (after logging and appending a cause) on an illegal
+ * control character or an invalid `\u` escape sequence.
+ */
+bt2s::string_view StrScanner::tryScanLitStr(const bt2s::string_view escapeSeqStartList)
+{
+    this->skipWhitespaces();
+
+    /*
+     * First character must be `"`.
+     *
+     * Peek instead of consuming so that nothing needs to be restored
+     * here, whatever the value of the character (including bytes
+     * having their most significant bit set).
+     */
+    if (this->isDone() || *_mAt != '"') {
+        /* Not a literal string */
+        return {};
+    }
+
+    /* Backup if we can't completely scan */
+    const auto initAt = _mAt;
+    const auto initLineBegin = _mLineBegin;
+    const auto initNbLines = _mNbLines;
+
+    /* Skip opening `"` */
+    this->_incrAt();
+
+    /* Reset string buffer */
+    _mStrBuf.clear();
+
+    /*
+     * Scan inner string, processing escape sequences during the
+     * process.
+     */
+    while (!this->isDone()) {
+        /*
+         * Check for illegal control character.
+         *
+         * Convert to `unsigned char` first: passing a negative value
+         * to std::iscntrl() is undefined behaviour.
+         */
+        const auto uch = static_cast<unsigned char>(*_mAt);
+
+        if (std::iscntrl(uch)) {
+            BT_CPPLOGE_TEXT_LOC_APPEND_CAUSE_AND_THROW(
+                Error, this->loc(), "Illegal control character {:#02x} in literal string.",
+                static_cast<unsigned int>(uch));
+        }
+
+        /* Try to append an escaped character first */
+        if (this->_tryAppendEscapedChar(escapeSeqStartList)) {
+            continue;
+        }
+
+        /* End of literal string? */
+        if (*_mAt == '"') {
+            /* Skip `"` */
+            this->_incrAt();
+            return _mStrBuf;
+        }
+
+        /* Check for newline */
+        this->_checkNewline();
+
+        /* Append regular character and go to next one */
+        _mStrBuf.push_back(*_mAt);
+        this->_incrAt();
+    }
+
+    /* Couldn't find end of string: restore initial state */
+    this->at(initAt);
+    _mLineBegin = initLineBegin;
+    _mNbLines = initNbLines;
+    return {};
+}
+
+/*
+ * Tries to scan exactly the characters of `token` after skipping
+ * initial whitespaces.
+ *
+ * Returns `true` and leaves the current position after the token on
+ * success. Otherwise, returns `false` and restores the position which
+ * followed the initial whitespace skipping.
+ */
+bool StrScanner::tryScanToken(const bt2s::string_view token) noexcept
+{
+    this->skipWhitespaces();
+
+    /* Where to go back to on failure */
+    const auto startAt = _mAt;
+
+    for (const auto expectedCh : token) {
+        /*
+         * Fail on either a premature end of the viewed string or a
+         * mismatching character.
+         */
+        if (this->isDone() || *_mAt != expectedCh) {
+            this->at(startAt);
+            return false;
+        }
+
+        this->_incrAt();
+    }
+
+    /* Whole token matched */
+    return true;
+}
+
+/*
+ * Tries to scan and decode a constant real number after skipping
+ * initial whitespaces.
+ *
+ * Returns the decoded value and sets the current position to after the
+ * number on success, or returns `bt2s::nullopt` if the text at the
+ * current position doesn't match `_realRegex` or if std::strtod()
+ * reports a range error.
+ *
+ * NOTE(review): std::strtod() reads until it finds a character which
+ * can't be part of a number; this assumes that the memory following
+ * the viewed string is readable up to such a character (for example, a
+ * null-terminated buffer) -- confirm with callers. std::strtod() also
+ * honours the current `LC_NUMERIC` locale for the decimal point.
+ */
+bt2s::optional<double> StrScanner::tryScanConstReal() noexcept
+{
+    this->skipWhitespaces();
+
+    /*
+     * Validate JSON number format (with fraction and/or exponent part).
+     *
+     * This is needed because std::strtod() accepts more formats which
+     * JSON doesn't support.
+     */
+    if (!_realRegex.match(_mStr.substr(_mAt - _mStr.begin()))) {
+        return bt2s::nullopt;
+    }
+
+    /* Parse */
+    const auto numBegin = &(*_mAt);
+    char *strEnd = nullptr;
+
+    /*
+     * Clear `errno` first: std::strtod() only sets it on error, so a
+     * stale `ERANGE` from an unrelated call would otherwise make a
+     * valid number fail below.
+     */
+    errno = 0;
+
+    const auto val = std::strtod(numBegin, &strEnd);
+
+    if (strEnd == numBegin || errno == ERANGE || val == HUGE_VAL || val == -HUGE_VAL) {
+        /* Couldn't parse */
+        errno = 0;
+        return bt2s::nullopt;
+    }
+
+    /* Success: update character pointer and return value */
+    this->at(_mStr.begin() + (strEnd - _mStr.data()));
+    return val;
+}
+
+} /* namespace bt2c */
--- /dev/null
+/*
+ * Copyright (c) 2015-2024 Philippe Proulx <pproulx@efficios.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef BABELTRACE_CPP_COMMON_BT2C_STR_SCANNER_HPP
+#define BABELTRACE_CPP_COMMON_BT2C_STR_SCANNER_HPP
+
+#include <cstdlib>
+#include <limits>
+#include <string>
+
+#include "common/assert.h"
+#include "cpp-common/bt2c/logging.hpp"
+#include "cpp-common/bt2c/regex.hpp"
+#include "cpp-common/bt2s/string-view.hpp"
+
+#include "text-loc.hpp"
+
+namespace bt2c {
+
+/*
+ * String scanner.
+ *
+ * A string scanner (lexer) wraps an input string view and scans
+ * specific characters and sequences of characters, managing a
+ * current position.
+ *
+ * When you call the various tryScan*() methods to try to scan some
+ * contents, the methods advance the current position on success. They
+ * also automatically skip initial whitespaces.
+ */
+class StrScanner final
+{
+public:
+    /* Position type within the viewed string */
+    using Iter = bt2s::string_view::const_iterator;
+
+    /*
+     * Builds a string scanner, wrapping the string `str`.
+     *
+     * When the string scanner logs or appends a cause to the error of
+     * the current thread, it uses `baseOffset` to format the text
+     * location part of the message.
+     */
+    explicit StrScanner(bt2s::string_view str, std::size_t baseOffset, const Logger& logger);
+
+    /*
+     * Alternative constructor setting the `baseOffset` parameter to 0.
+     */
+    explicit StrScanner(bt2s::string_view str, const Logger& logger);
+
+    /*
+     * Returns the current position.
+     */
+    Iter at() const noexcept
+    {
+        return _mAt;
+    }
+
+    /*
+     * Sets the current position to `at`.
+     *
+     * `at` must be within `str().begin()` and `str().end()`
+     * (inclusive); this is only checked in debug mode.
+     *
+     * NOTE: This may corrupt the current text location (loc()) if the
+     * string between at() and `at` includes one or more
+     * newline characters.
+     */
+    void at(const Iter at) noexcept
+    {
+        BT_ASSERT_DBG(at >= _mStr.begin() && at <= _mStr.end());
+        _mAt = at;
+    }
+
+    /*
+     * Returns the viewed string, the one with which this string scanner
+     * was built.
+     */
+    bt2s::string_view str() const noexcept
+    {
+        return _mStr;
+    }
+
+    /*
+     * Returns the number of characters left until `str().end()`.
+     */
+    std::size_t charsLeft() const noexcept
+    {
+        return static_cast<std::size_t>(_mStr.end() - _mAt);
+    }
+
+    /*
+     * Returns the current text location considering `_mBaseOffset`.
+     *
+     * The location is built from, in this order: the base offset plus
+     * the offset of the current position from `str().begin()`, the
+     * number of newlines scanned so far, and the offset of the current
+     * position from the beginning of the current line.
+     */
+    TextLoc loc() const noexcept
+    {
+        return TextLoc {_mBaseOffset + static_cast<std::size_t>(_mAt - _mStr.begin()), _mNbLines,
+                        static_cast<std::size_t>(_mAt - _mLineBegin)};
+    }
+
+    /*
+     * Returns whether or not the end of the string is reached.
+     */
+    bool isDone() const noexcept
+    {
+        return _mAt == _mStr.end();
+    }
+
+    /*
+     * Resets this string scanner, setting the current position
+     * to `str().begin()`.
+     */
+    void reset();
+
+    /*
+     * Tries to scan a double-quoted literal string, considering the
+     * characters of `escapeSeqStartList`, `\`, and `"` as escape
+     * sequence starting characters, setting the current position to
+     * after the closing double quote on success.
+     *
+     * If `escapeSeqStartList` includes `u`, then a `\u` escape sequence
+     * is interpreted as in JSON: four hexadecimal characters which
+     * represent the value of a single Unicode codepoint.
+     *
+     * Valid examples:
+     *
+     *     "salut!"
+     *     "en circulation\nYves?"
+     *     "\u03c9 often represents angular velocity in physics"
+     *
+     * Returns a view of the escaped string, without beginning/end
+     * double quotes, on success, or an empty view if there's no
+     * double-quoted literal string (or if the method reaches
+     * `str().end()` before a closing `"`).
+     *
+     * Logs and appends a cause to the error of the current thread,
+     * throwing `Error`, if the scanning method finds an invalid escape
+     * sequence or an illegal control character.
+     *
+     * The returned string view remains valid as long as you don't call
+     * any method of this object.
+     */
+    bt2s::string_view tryScanLitStr(bt2s::string_view escapeSeqStartList);
+
+    /*
+     * Tries to scan and decode a constant integer string, possibly
+     * negative if `ValT` (either `unsigned long long` or `long long`)
+     * is signed.
+     *
+     * Valid examples:
+     *
+     *     9283
+     *     -42
+     *     0
+     *
+     * Returns `bt2s::nullopt` if the method couldn't scan a
+     * constant integer.
+     *
+     * Sets the current position to after this constant integer string
+     * on success.
+     */
+    template <typename ValT>
+    bt2s::optional<ValT> tryScanConstInt() noexcept;
+
+    /*
+     * Tries to scan and decode a constant unsigned integer string.
+     *
+     * Returns `bt2s::nullopt` if the method couldn't scan a constant
+     * unsigned integer.
+     *
+     * Sets the current position to after this constant unsigned integer
+     * string on success.
+     */
+    bt2s::optional<unsigned long long> tryScanConstUInt() noexcept
+    {
+        return this->tryScanConstInt<unsigned long long>();
+    }
+
+    /*
+     * Tries to scan and decode a constant signed integer string,
+     * possibly negative.
+     *
+     * Returns `bt2s::nullopt` if the method couldn't scan a constant
+     * signed integer.
+     *
+     * Sets the current position to after this constant signed integer
+     * string on success.
+     */
+    bt2s::optional<long long> tryScanConstSInt() noexcept
+    {
+        return this->tryScanConstInt<long long>();
+    }
+
+    /*
+     * Tries to scan and decode a constant real number string, returning
+     * `bt2s::nullopt` if not possible.
+     *
+     * The format of the real number string to scan is the JSON
+     * (<https://www.json.org/>) number one, _with_ a fraction or an
+     * exponent part. Without a fraction/exponent part, this method
+     * returns `bt2s::nullopt`: use tryScanConstInt() to try scanning a
+     * constant integer instead.
+     *
+     * Valid examples:
+     *
+     *     17.2
+     *     -42.192
+     *     8e9
+     *     17E12
+     *     9.14e+6
+     *     -13.2777E-4
+     *     0.0
+     *     -0.0
+     *
+     * Sets the current position to after this constant real number
+     * string on success.
+     */
+    bt2s::optional<double> tryScanConstReal() noexcept;
+
+    /*
+     * Tries to scan the specific token `token`, setting the current
+     * position to after this string and returning `true` on success.
+     */
+    bool tryScanToken(bt2s::string_view token) noexcept;
+
+    /*
+     * Skips the next whitespaces, updating the current position.
+     */
+    void skipWhitespaces() noexcept;
+
+private:
+    /*
+     * Tries to negate `ullVal` as a signed integer value if `ValT` is
+     * signed and `negate` is true, returning `bt2s::nullopt` if it
+     * can't.
+     *
+     * Always succeeds when `ValT` is unsigned.
+     */
+    template <typename ValT>
+    static bt2s::optional<ValT> _tryNegateConstInt(unsigned long long ullVal, bool negate) noexcept;
+
+    /*
+     * Handles a `\u` escape sequence, appending the UTF-8-encoded
+     * Unicode character to `_mStrBuf` on success, or throwing `Error`
+     * on error.
+     *
+     * `at` is the position of the first hexadecimal character
+     * after `\u`.
+     */
+    void _appendEscapedUnicodeChar(Iter at);
+
+    /*
+     * Tries to append an escaped character to `_mStrBuf` from the
+     * escape sequence characters at the current position, considering
+     * the characters of `escapeSeqStartList`, `\`, and `"` as escape
+     * sequence starting characters.
+     */
+    bool _tryAppendEscapedChar(bt2s::string_view escapeSeqStartList);
+
+    /*
+     * Tries to scan any character, returning it and advancing the
+     * current position on success, or returning -1 if the current
+     * position is `str().end()`.
+     *
+     * The returned character is converted through `unsigned char` so
+     * that it's never negative: a byte having its most significant bit
+     * set can therefore not be confused with the -1 end marker where
+     * `char` is a signed type.
+     */
+    int _tryScanAnyChar() noexcept
+    {
+        if (this->isDone()) {
+            return -1;
+        }
+
+        const auto c = static_cast<unsigned char>(*_mAt);
+
+        this->_incrAt();
+        return c;
+    }
+
+    /*
+     * Checks if the character at the current position is a newline,
+     * updating the line count and line beginning position if so.
+     *
+     * The current position must not be `str().end()`.
+     */
+    void _checkNewline() noexcept
+    {
+        if (*_mAt == '\n') {
+            ++_mNbLines;
+            _mLineBegin = _mAt + 1;
+        }
+    }
+
+    /*
+     * Increments `_mAt` by `count`.
+     */
+    void _incrAt(const std::size_t count = 1) noexcept
+    {
+        _mAt += count;
+        BT_ASSERT_DBG(_mAt <= _mStr.end());
+    }
+
+    /*
+     * Decrements `_mAt` by `count`.
+     */
+    void _decrAt(const std::size_t count = 1) noexcept
+    {
+        _mAt -= count;
+        BT_ASSERT_DBG(_mAt >= _mStr.begin());
+    }
+
+private:
+    /* Viewed string, given by user */
+    bt2s::string_view _mStr;
+
+    /* Current position within `_mStr` */
+    Iter _mAt;
+
+    /* Beginning of the current line */
+    Iter _mLineBegin;
+
+    /* Number of lines scanned so far */
+    std::size_t _mNbLines = 0;
+
+    /* String buffer, used by tryScanLitStr() */
+    std::string _mStrBuf;
+
+    /* Real number string regex */
+    static const bt2c::Regex _realRegex;
+
+    /* Base offset for error messages */
+    std::size_t _mBaseOffset;
+
+    /* Logging configuration */
+    Logger _mLogger;
+};
+
+/*
+ * Converts `ullVal` to `ValT`, negating the result if `negate` is true.
+ *
+ * When `ValT` is signed, returns `bt2s::nullopt` if the result doesn't
+ * fit: `ullVal` greater than the `long long` maximum without negation,
+ * or greater than this maximum plus one with negation.
+ *
+ * When `ValT` is unsigned, always succeeds; a negation wraps around
+ * (modular arithmetic).
+ */
+template <typename ValT>
+bt2s::optional<ValT> StrScanner::_tryNegateConstInt(const unsigned long long ullVal,
+                                                    const bool negate) noexcept
+{
+    /* Check for overflow */
+    if (std::is_signed<ValT>::value) {
+        constexpr auto llMaxAsUll =
+            static_cast<unsigned long long>(std::numeric_limits<long long>::max());
+
+        /* The magnitude of the minimum value is one more than the maximum */
+        const auto maxMagnitude = negate ? llMaxAsUll + 1 : llMaxAsUll;
+
+        if (ullVal > maxMagnitude) {
+            return bt2s::nullopt;
+        }
+    }
+
+    if (!negate || ullVal == 0) {
+        /* Nothing to negate: cast only */
+        return static_cast<ValT>(ullVal);
+    }
+
+    /*
+     * Negate without ever representing the positive magnitude as a
+     * `ValT`: when `ValT` is signed and `ullVal` is the `long long`
+     * maximum plus one, this magnitude doesn't fit, and negating the
+     * cast result would be a signed integer overflow.
+     *
+     * `-(ullVal - 1) - 1` is equal to `-ullVal`, and each intermediate
+     * value fits a signed `ValT` thanks to the check above. When
+     * `ValT` is unsigned, this is a plain modular negation.
+     */
+    return static_cast<ValT>(-static_cast<ValT>(ullVal - 1) - 1);
+}
+
+/*
+ * Tries to scan and decode a constant decimal integer after skipping
+ * initial whitespaces, accepting a leading `-` only when `ValT` is
+ * signed.
+ *
+ * Returns the decoded value and sets the current position to after the
+ * integer on success. Otherwise, returns `bt2s::nullopt` and restores
+ * the position which followed the initial whitespace skipping.
+ *
+ * NOTE(review): std::strtoull() reads until it finds a non-digit
+ * character; this assumes that the memory following the viewed string
+ * is readable up to such a character (for example, a null-terminated
+ * buffer) -- confirm with callers.
+ */
+template <typename ValT>
+bt2s::optional<ValT> StrScanner::tryScanConstInt() noexcept
+{
+    static_assert(std::is_same<ValT, long long>::value ||
+                      std::is_same<ValT, unsigned long long>::value,
+                  "`ValT` is `long long` or `unsigned long long`.");
+
+    this->skipWhitespaces();
+
+    /* Backup if we can't scan completely */
+    const auto initAt = _mAt;
+
+    if (this->isDone()) {
+        /* Nothing left */
+        return bt2s::nullopt;
+    }
+
+    /*
+     * Check for negation by peeking at the current character: nothing
+     * is consumed unless it's really `-`.
+     */
+    const bool negate = (*_mAt == '-');
+
+    if (negate) {
+        if (!std::is_signed<ValT>::value) {
+            /* Can't negate an unsigned integer */
+            return bt2s::nullopt;
+        }
+
+        /* Skip `-` */
+        this->_incrAt();
+    }
+
+    /*
+     * Only allow a digit at this point: std::strtoull() below supports
+     * initial whitespaces and an initial `+`/`-`, but this scanner
+     * doesn't.
+     *
+     * Convert to `unsigned char` first: passing a negative value to
+     * std::isdigit() is undefined behaviour.
+     */
+    if (this->isDone() || !std::isdigit(static_cast<unsigned char>(*_mAt))) {
+        this->at(initAt);
+        return bt2s::nullopt;
+    }
+
+    /* Parse */
+    const auto digitsBegin = &(*_mAt);
+    char *strEnd = nullptr;
+
+    /*
+     * Clear `errno` first: std::strtoull() only sets it on error, so a
+     * stale `ERANGE` from an unrelated call would otherwise make a
+     * valid integer fail below.
+     */
+    errno = 0;
+
+    const auto ullVal = std::strtoull(digitsBegin, &strEnd, 10);
+
+    if (strEnd == digitsBegin || errno == ERANGE) {
+        /* Couldn't parse */
+        errno = 0;
+        this->at(initAt);
+        return bt2s::nullopt;
+    }
+
+    /* Negate if needed */
+    const auto val = this->_tryNegateConstInt<ValT>(ullVal, negate);
+
+    if (!val) {
+        /* Couldn't negate */
+        this->at(initAt);
+        return bt2s::nullopt;
+    }
+
+    /* Success: update current position and return value */
+    this->at(_mStr.begin() + (strEnd - _mStr.data()));
+    return val;
+}
+
+} /* namespace bt2c */
+
+#endif /* BABELTRACE_CPP_COMMON_BT2C_STR_SCANNER_HPP */