From: Simon Marchi Date: Fri, 3 May 2024 16:54:10 +0000 (-0400) Subject: cpp-common/bt2c: add `UnicodeConv` class X-Git-Url: http://drtracing.org/?a=commitdiff_plain;h=fb9a08c30c8bc0073dfe91be775aceee0951ac18;p=babeltrace.git cpp-common/bt2c: add `UnicodeConv` class Add the `UnicodeConv` class, which currently implements conversion from (UTF-16, UTF-32) × (BE, LE) to UTF-8. The `UnicodeConv` class uses `g_iconv` from GLib internally to make the conversions. It has a vector member that is used as an output buffer. Public conversion methods accept a span of `const std::uint8_t`, and return a span of `const std::uint8_t` which is a view on that internal vector. Add a test which: ✤ Converts some hardcoded UTF-16 and UTF-32 strings to UTF-8 and verifies the result against a reference UTF-8 string. ✤ Feeds the UTF-16 and UTF-32 conversion methods with truncated strings and verifies that they throw and append specific causes to the error of the current thread. Change-Id: I962bd49261a3d9779ed6a24a26c7800a24beb719 Signed-off-by: Simon Marchi Signed-off-by: Philippe Proulx Reviewed-on: https://review.lttng.org/c/babeltrace/+/12715 --- diff --git a/src/Makefile.am b/src/Makefile.am index eeb50a95..1fffd088 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -195,6 +195,8 @@ cpp_common_libcpp_common_la_SOURCES = \ cpp-common/bt2c/text-loc-str.cpp \ cpp-common/bt2c/text-loc-str.hpp \ cpp-common/bt2c/type-traits.hpp \ + cpp-common/bt2c/unicode-conv.cpp \ + cpp-common/bt2c/unicode-conv.hpp \ cpp-common/bt2c/uuid.hpp \ cpp-common/bt2c/val-req.hpp \ cpp-common/bt2c/vector.hpp \ diff --git a/src/cpp-common/bt2c/unicode-conv.cpp b/src/cpp-common/bt2c/unicode-conv.cpp new file mode 100644 index 00000000..11cdd51a --- /dev/null +++ b/src/cpp-common/bt2c/unicode-conv.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024 EfficiOS, Inc. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include + +#include "common/assert.h" +#include "cpp-common/bt2/exc.hpp" + +#include "unicode-conv.hpp" + +namespace bt2c { +namespace { + +const auto invalidGIConv = reinterpret_cast(-1); + +} /* namespace */ + +UnicodeConv::UnicodeConv(const bt2c::Logger& parentLogger) : + _mLogger {parentLogger, "UNICODE-CONV"}, _mUtf16BeToUtf8IConv {invalidGIConv}, + _mUtf16LeToUtf8IConv {invalidGIConv}, _mUtf32BeToUtf8IConv {invalidGIConv}, + _mUtf32LeToUtf8IConv {invalidGIConv} +{ +} + +namespace { + +void tryCloseGIConv(const GIConv conv) noexcept +{ + if (conv != invalidGIConv) { + g_iconv_close(conv); + } +}; + +} /* namespace */ + +UnicodeConv::~UnicodeConv() +{ + tryCloseGIConv(_mUtf16BeToUtf8IConv); + tryCloseGIConv(_mUtf16LeToUtf8IConv); + tryCloseGIConv(_mUtf32BeToUtf8IConv); + tryCloseGIConv(_mUtf32LeToUtf8IConv); +} + +ConstBytes UnicodeConv::_justDoIt(const char * const srcEncoding, GIConv& conv, + const ConstBytes data, const std::size_t codeUnitSize) +{ + /* Create iconv conversion descriptor if not created already */ + if (conv == invalidGIConv) { + conv = g_iconv_open("UTF-8", srcEncoding); + + if (conv == invalidGIConv) { + BT_CPPLOGE_ERRNO_APPEND_CAUSE_AND_THROW(bt2::Error, "g_iconv_open() failed", + ": from-encoding={}, to-encoding=UTF-8", + srcEncoding); + } + } + + /* + * Compute a dumb, but safe upper bound for the UTF-8 output buffer. + * + * The input string can encode up to `data.size() / codeUnitSize` + * codepoints. Then, each code point can take up to four bytes in + * UTF-8. 
+ */ + _mBuf.resize(data.size() / codeUnitSize * 4); + + /* Convert */ + gsize inBytesLeft = data.size(); + gsize outBytesLeft = _mBuf.size(); + auto inBuf = const_cast(reinterpret_cast(data.data())); + auto outBuf = reinterpret_cast(_mBuf.data()); + + if (g_iconv(conv, &inBuf, &inBytesLeft, &outBuf, &outBytesLeft) == -1) { + BT_CPPLOGE_ERRNO_APPEND_CAUSE_AND_THROW( + bt2::Error, "g_iconv() failed", + ": input-byte-offset={}, from-encoding={}, to-encoding=UTF-8", + data.size() - inBytesLeft, srcEncoding); + } + + /* + * When g_iconv() is successful, assert that it consumed all input. + * + * The (underlying) iconv() documentation outlines three + * failure modes: + * + * 1. Insufficient output buffer space. + * 2. Invalid multibyte sequence in input. + * 3. Incomplete multibyte sequence in input. + * + * For any malformed input, iconv() will return error 2 or 3. + * + * This suggests that, barring input errors, a successful conversion + * will consume all input bytes. + */ + BT_ASSERT(inBytesLeft == 0); + return {_mBuf.data(), _mBuf.size() - outBytesLeft}; +} + +ConstBytes UnicodeConv::utf8FromUtf16Be(const ConstBytes data) +{ + return this->_justDoIt("UTF-16BE", _mUtf16BeToUtf8IConv, data, 2); +} + +ConstBytes UnicodeConv::utf8FromUtf16Le(const ConstBytes data) +{ + return this->_justDoIt("UTF-16LE", _mUtf16LeToUtf8IConv, data, 2); +} + +ConstBytes UnicodeConv::utf8FromUtf32Be(const ConstBytes data) +{ + return this->_justDoIt("UTF-32BE", _mUtf32BeToUtf8IConv, data, 4); +} + +ConstBytes UnicodeConv::utf8FromUtf32Le(const ConstBytes data) +{ + return this->_justDoIt("UTF-32LE", _mUtf32LeToUtf8IConv, data, 4); +} + +} /* namespace bt2c */ diff --git a/src/cpp-common/bt2c/unicode-conv.hpp b/src/cpp-common/bt2c/unicode-conv.hpp new file mode 100644 index 00000000..25900970 --- /dev/null +++ b/src/cpp-common/bt2c/unicode-conv.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024 EfficiOS, Inc. 
+ * + * SPDX-License-Identifier: MIT + */ + +#ifndef BABELTRACE_CPP_COMMON_BT2C_UNICODE_CONV_HPP +#define BABELTRACE_CPP_COMMON_BT2C_UNICODE_CONV_HPP + +#include +#include + +#include + +#include "logging.hpp" + +#include "aliases.hpp" + +namespace bt2c { + +/* + * A Unicode converter offers the utf8FromUtf*() methods to convert + * UTF-16 and UTF-32 data to UTF-8. + * + * IMPORTANT: The conversion methods aren't thread-safe: a `UnicodeConv` + * instance keeps an internal buffer where it writes the resulting UTF-8 + * data. + */ +class UnicodeConv final +{ +public: + explicit UnicodeConv(const bt2c::Logger& parentLogger); + ~UnicodeConv(); + + /* + * Converts the UTF-16BE data `data` to UTF-8 and returns it. + * + * `data.data()` must not return `nullptr`. + * + * The returned data belongs to this Unicode converter and remains + * valid as long as you don't call another method of this. + * + * Logs a message, appends a cause to the error of the current + * thread, and throws an error if any conversion error occurs, + * including incomplete data in `data`. + */ + ConstBytes utf8FromUtf16Be(ConstBytes data); + + /* + * Converts the UTF-16LE data `data` to UTF-8 and returns it. + * + * `data.data()` must not return `nullptr`. + * + * The returned data belongs to this Unicode converter and remains + * valid as long as you don't call another method of this. + * + * Logs a message, appends a cause to the error of the current + * thread, and throws an error if any conversion error occurs, + * including incomplete data in `data`. + */ + ConstBytes utf8FromUtf16Le(ConstBytes data); + + /* + * Converts the UTF-32BE data `data` to UTF-8 and returns it. + * + * `data.data()` must not return `nullptr`. + * + * The returned data belongs to this Unicode converter and remains + * valid as long as you don't call another method of this. 
+ * + * Logs a message, appends a cause to the error of the current + * thread, and throws an error if any conversion error occurs, + * including incomplete data in `data`. + */ + ConstBytes utf8FromUtf32Be(ConstBytes data); + + /* + * Converts the UTF-32LE data `data` to UTF-8 and returns it. + * + * `data.data()` must not return `nullptr`. + * + * The returned data belongs to this Unicode converter and remains + * valid as long as you don't call another method of this. + * + * Logs a message, appends a cause to the error of the current + * thread, and throws an error if any conversion error occurs, + * including incomplete data in `data`. + */ + ConstBytes utf8FromUtf32Le(ConstBytes data); + +private: + ConstBytes _justDoIt(const char *sourceEncoding, GIConv& converter, const ConstBytes data, + std::size_t codeUnitSize); + + bt2c::Logger _mLogger; + GIConv _mUtf16BeToUtf8IConv; + GIConv _mUtf16LeToUtf8IConv; + GIConv _mUtf32BeToUtf8IConv; + GIConv _mUtf32LeToUtf8IConv; + std::vector _mBuf; +}; + +} /* namespace bt2c */ + +#endif /* BABELTRACE_CPP_COMMON_BT2C_UNICODE_CONV_HPP */ diff --git a/tests/Makefile.am b/tests/Makefile.am index b1953af9..0b63ad9d 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -110,9 +110,22 @@ cpp_common_test_uuid_LDADD = \ $(COMMON_TEST_LDADD) \ $(top_builddir)/src/cpp-common/vendor/fmt/libfmt.la +noinst_PROGRAMS += \ + cpp-common/test-unicode-conv + +cpp_common_test_unicode_conv_SOURCES = \ + cpp-common/test-unicode-conv.cpp + +cpp_common_test_unicode_conv_LDADD = \ + $(top_builddir)/src/cpp-common/vendor/fmt/libfmt.la \ + $(top_builddir)/src/cpp-common/libcpp-common.la \ + $(top_builddir)/src/lib/libbabeltrace2.la \ + $(COMMON_TEST_LDADD) + TESTS_CPP_COMMON = \ cpp-common/test-c-string-view \ - cpp-common/test-uuid + cpp-common/test-uuid \ + cpp-common/test-unicode-conv TESTS_LIB = \ lib/test-bt-uuid \ diff --git a/tests/cpp-common/test-unicode-conv.cpp b/tests/cpp-common/test-unicode-conv.cpp new file mode 100644 index 
00000000..b7d910f0 --- /dev/null +++ b/tests/cpp-common/test-unicode-conv.cpp @@ -0,0 +1,193 @@ +/* + * SPDX-License-Identifier: GPL-2.0-only + * + * Copyright (C) 2024 EfficiOS, Inc. + */ + +#include + +#include "cpp-common/bt2c/call.hpp" +#include "cpp-common/bt2c/unicode-conv.hpp" +#include "cpp-common/vendor/fmt/core.h" + +#include "tap/tap.h" + +namespace { + +constexpr std::uint8_t refUtf8String[] = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x21, 0x20, 0xc3, 0x85, + 0xc3, 0xa5, 0xc3, 0x89, 0xc3, 0xa9, 0xc3, 0x9c, 0xc3, 0xbc, 0x20, 0xf0, 0x9f, 0x8c, 0x8d, 0xf0, + 0x9f, 0x9a, 0x80, 0x20, 0xd0, 0x9f, 0xd1, 0x80, 0xd0, 0xb8, 0xd0, 0xb2, 0xd0, 0xb5, 0xd1, 0x82, + 0x20, 0xce, 0x93, 0xce, 0xb5, 0xce, 0xb9, 0xce, 0xac, 0x20, 0xcf, 0x83, 0xce, 0xbf, 0xcf, 0x85, + 0x20, 0xe4, 0xbd, 0xa0, 0xe5, 0xa5, 0xbd, 0x20, 0xe2, 0x88, 0x91, 0xe2, 0x88, 0x8f, 0x00, +}; + +constexpr std::uint8_t utf16BeString[] = { + 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20, 0x00, 0x57, + 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x21, 0x00, 0x20, 0x00, 0xc5, 0x00, 0xe5, + 0x00, 0xc9, 0x00, 0xe9, 0x00, 0xdc, 0x00, 0xfc, 0x00, 0x20, 0xd8, 0x3c, 0xdf, 0x0d, 0xd8, 0x3d, + 0xde, 0x80, 0x00, 0x20, 0x04, 0x1f, 0x04, 0x40, 0x04, 0x38, 0x04, 0x32, 0x04, 0x35, 0x04, 0x42, + 0x00, 0x20, 0x03, 0x93, 0x03, 0xb5, 0x03, 0xb9, 0x03, 0xac, 0x00, 0x20, 0x03, 0xc3, 0x03, 0xbf, + 0x03, 0xc5, 0x00, 0x20, 0x4f, 0x60, 0x59, 0x7d, 0x00, 0x20, 0x22, 0x11, 0x22, 0x0f, 0x00, 0x00, +}; + +constexpr std::uint8_t utf16LeString[] = { + 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20, 0x00, 0x57, 0x00, + 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x21, 0x00, 0x20, 0x00, 0xc5, 0x00, 0xe5, 0x00, + 0xc9, 0x00, 0xe9, 0x00, 0xdc, 0x00, 0xfc, 0x00, 0x20, 0x00, 0x3c, 0xd8, 0x0d, 0xdf, 0x3d, 0xd8, + 0x80, 0xde, 0x20, 0x00, 0x1f, 0x04, 0x40, 0x04, 0x38, 0x04, 0x32, 0x04, 0x35, 0x04, 0x42, 0x04, + 0x20, 0x00, 0x93, 
0x03, 0xb5, 0x03, 0xb9, 0x03, 0xac, 0x03, 0x20, 0x00, 0xc3, 0x03, 0xbf, 0x03, + 0xc5, 0x03, 0x20, 0x00, 0x60, 0x4f, 0x7d, 0x59, 0x20, 0x00, 0x11, 0x22, 0x0f, 0x22, 0x00, 0x00, +}; + +constexpr std::uint8_t utf32BeString[] = { + 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x6c, + 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x57, + 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x64, + 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, 0xe5, + 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, 0xfc, + 0x00, 0x00, 0x00, 0x20, 0x00, 0x01, 0xf3, 0x0d, 0x00, 0x01, 0xf6, 0x80, 0x00, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x04, 0x1f, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00, 0x04, 0x38, 0x00, 0x00, 0x04, 0x32, + 0x00, 0x00, 0x04, 0x35, 0x00, 0x00, 0x04, 0x42, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x03, 0x93, + 0x00, 0x00, 0x03, 0xb5, 0x00, 0x00, 0x03, 0xb9, 0x00, 0x00, 0x03, 0xac, 0x00, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x03, 0xc3, 0x00, 0x00, 0x03, 0xbf, 0x00, 0x00, 0x03, 0xc5, 0x00, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x4f, 0x60, 0x00, 0x00, 0x59, 0x7d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x22, 0x11, + 0x00, 0x00, 0x22, 0x0f, 0x00, 0x00, 0x00, 0x00, +}; + +constexpr std::uint8_t utf32LeString[] = { + 0x48, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x0d, 0xf3, 0x01, 0x00, 0x80, 0xf6, 0x01, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x1f, 
0x04, 0x00, 0x00, 0x40, 0x04, 0x00, 0x00, 0x38, 0x04, 0x00, 0x00, 0x32, 0x04, 0x00, 0x00, + 0x35, 0x04, 0x00, 0x00, 0x42, 0x04, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x93, 0x03, 0x00, 0x00, + 0xb5, 0x03, 0x00, 0x00, 0xb9, 0x03, 0x00, 0x00, 0xac, 0x03, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0xc3, 0x03, 0x00, 0x00, 0xbf, 0x03, 0x00, 0x00, 0xc5, 0x03, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x60, 0x4f, 0x00, 0x00, 0x7d, 0x59, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x11, 0x22, 0x00, 0x00, + 0x0f, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* + * A UTF-16BE string that abruptly ends in the middle of a code point + * (but with complete code units). + */ +constexpr std::uint8_t utf16BeTruncCodePoint[] = { + 0x00, 0x43, 0x00, 0x68, 0x00, 0x61, 0x00, 0x74, 0x00, 0x6f, 0x00, 0x6e, 0x00, 0x20, 0xd8, 0x3d, +}; + +/* + * A UTF-16BE string that abruptly ends in the middle of a code unit. + */ +constexpr std::uint8_t utf16BeTruncCodeUnit[] = { + 0x00, 0x43, 0x00, 0x68, 0x00, 0x61, 0x00, 0x74, 0x00, 0x6f, 0x00, 0x6e, 0x00, +}; + +/* + * A UTF-32BE string that abruptly ends in the middle of a code unit. + */ +constexpr std::uint8_t utf32BeTruncCodeUnit[] = { + 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, 0x6f, + 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x64, + 0x00, 0x00, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x20, 0x00, 0x01, 0xf9, +}; + +std::string dump(const bt2c::ConstBytes bytes) +{ + std::string res; + + for (const auto byte : bytes) { + res += fmt::format("{:02x} ", byte); + } + + return res; +} + +/* + * Checks that `result` matches `refUtf8String` after a conversion from + * the encoding named `sourceEncoding`. 
+ */ +void checkPass(const bt2c::ConstBytes result, const char * const sourceEncoding) +{ + bool passed = ok(result.size() == sizeof(refUtf8String), "%s to UTF-8: length is expected", + sourceEncoding); + + passed &= ok(std::memcmp(result.data(), refUtf8String, + std::min(result.size(), sizeof(refUtf8String))) == 0, + "%s to UTF-8: content is expected", sourceEncoding); + + if (!passed) { + diag("Expected: %s\n", dump(refUtf8String).c_str()); + diag("Actual: %s\n", dump(result).c_str()); + } +} + +/* + * Checks that calling `f()` throws `bt2::Error` and appends a cause + * having the message `expectedCauseMsg` to the error of the current + * thread. + */ +template +void checkFail(FuncT&& f, const char * const testName, const bt2c::CStringView expectedCauseMsg) +{ + const auto gotError = bt2c::call([&f] { + try { + f(); + } catch (const bt2::Error&) { + return true; + } + + return false; + }); + + ok(gotError, "%s - got error", testName); + + const auto error = bt_current_thread_take_error(); + const auto msg = bt_error_cause_get_message(bt_error_borrow_cause_by_index(error, 0)); + + if (!ok(expectedCauseMsg == msg, "%s - error cause message is expected", testName)) { + diag("Expecting `%s`", msg); + } + + bt_error_release(error); +} + +} /* namespace */ + +int main() +{ + plan_tests(14); + + const bt2c::Logger logger {"test-module", "test-tag", bt2c::Logger::Level::None}; + bt2c::UnicodeConv conv {logger}; + + checkPass(conv.utf8FromUtf16Be(utf16BeString), "UTF-16BE"); + checkPass(conv.utf8FromUtf16Le(utf16LeString), "UTF-16LE"); + checkPass(conv.utf8FromUtf32Be(utf32BeString), "UTF-32BE"); + checkPass(conv.utf8FromUtf32Le(utf32LeString), "UTF-32LE"); + + checkFail( + [&conv] { + conv.utf8FromUtf16Be(utf16BeTruncCodePoint); + }, + "truncated code point", + "g_iconv() failed: Invalid argument: input-byte-offset=14, from-encoding=UTF-16BE, to-encoding=UTF-8"); + + checkFail( + [&conv] { + conv.utf8FromUtf16Be(utf16BeTruncCodeUnit); + }, + "truncated code unit", + 
"g_iconv() failed: Invalid argument: input-byte-offset=12, from-encoding=UTF-16BE, to-encoding=UTF-8"); + + checkFail( + [&conv] { + conv.utf8FromUtf32Be(utf32BeTruncCodeUnit); + }, + "truncated code unit", + "g_iconv() failed: Invalid argument: input-byte-offset=32, from-encoding=UTF-32BE, to-encoding=UTF-8"); + + return exit_status(); +}