14#include "arccore/base/CoreArray.h"
15#include "arccore/base/BasicTranscoder.h"
17#ifdef ARCCORE_HAS_GLIB
// Tells whether the code point `wc` is a white-space character.
// When ARCCORE_HAS_GLIB is defined the answer comes from g_unichar_isspace().
// NOTE(review): the second return (std::iswspace) is presumably the non-GLib
// fallback; the '#else'/'#endif' and the braces are not visible in this listing.
32bool _isSpace(Int32 wc)
34#ifdef ARCCORE_HAS_GLIB
35 return g_unichar_isspace(wc);
37 return std::iswspace(wc);
// Returns the upper-case mapping of the code point `wc`.
// When ARCCORE_HAS_GLIB is defined the mapping comes from g_unichar_toupper().
// NOTE(review): the second return (std::towupper) is presumably the non-GLib
// fallback; the '#else'/'#endif' and the braces are not visible in this listing.
40Int32 _toUpper(Int32 wc)
42#ifdef ARCCORE_HAS_GLIB
43 return g_unichar_toupper(wc);
45 return std::towupper(wc);
// Returns the lower-case mapping of the code point `wc`.
// When ARCCORE_HAS_GLIB is defined the mapping comes from g_unichar_tolower().
// NOTE(review): the second return (std::towlower) is presumably the non-GLib
// fallback; the '#else'/'#endif' and the braces are not visible in this listing.
48Int32 _toLower(Int32 wc)
50#ifdef ARCCORE_HAS_GLIB
51 return g_unichar_tolower(wc);
53 return std::towlower(wc);
// Reports an invalid sequence met while decoding: prints a warning with the
// current value of `wc` and the value of `pos` on std::cout.
// The visible callers pass fixed constants for `pos` (1, 2, 4, 5, 7, 8, 9), so
// it identifies the check that failed rather than an offset in the input.
// NOTE(review): `wc` is taken by non-const reference and the function returns
// an int that callers return directly, but the statements after the print are
// not visible here — confirm what is written to `wc` and what is returned.
57int _invalidChar(Int32 pos, Int32& wc)
59 std::cout <<
"WARNING: Invalid sequence '" << wc <<
"' in conversion input (position=" << pos <<
")\n";
// Reports that the input ended in the middle of a multi-byte sequence: prints
// a warning with the current value of `wc` on std::cout.
// NOTE(review): `wc` is taken by non-const reference and the int result is
// returned directly by callers, but the statements after the print are not
// visible here — confirm what is written to `wc` and what is returned.
64int _notEnoughChar(Int32& wc)
66 std::cout <<
"WARNING: Invalid sequence '" << wc <<
"' in conversion input (unexpected eof)\n";
// Fragment of the code-point -> UTF-8 encoder (signature not visible here).
// The chain below picks the encoded length from the magnitude of `wc`.
// NOTE(review): the assignments made in each branch are not visible;
// presumably they set `count` to 3, 4 and 5 bytes for these three thresholds.
93 else if (wc < 0x10000)
95 else if (wc < 0x200000)
97 else if (wc < 0x4000000)
// Trailing bytes are filled from the last one backwards: each receives the
// 0x80 marker plus the low six bits of `wc`.
// NOTE(review): the statements between these lines (presumably shifting `wc`
// and setting the lead byte r[0]) are not visible.
103 r[5] = 0x80 | (wc & 0x3f);
108 r[4] = 0x80 | (wc & 0x3f);
113 r[3] = 0x80 | (wc & 0x3f);
118 r[2] = 0x80 | (wc & 0x3f);
123 r[1] = 0x80 | (wc & 0x3f);
// Append the `count` encoded bytes to the output array.
130 for (
int i = 0; i < count; ++i)
131 utf8.add((
Byte)r[i]);
// Fragment of the UTF-8 -> code-point decoder (signature and the tests on the
// lead byte are not visible here). Callers add the returned value to their
// read index, so it is the number of bytes consumed.
// `s` points at the byte at `index`, `c` is that lead byte and `n` is the
// number of bytes remaining from `index` to the end of the input.
150 const Byte* s = uchar.
data() + index;
151 unsigned char c = s[0];
152 Int64 n = uchar.
size() - index;
// NOTE(review): the condition guarding this rejection is not visible.
159 return _invalidChar(1, wc);
// Two-byte form. `(s[k] ^ 0x80) < 0x40` holds exactly when s[k] has the bit
// pattern 10xxxxxx, i.e. is a continuation byte.
163 return _notEnoughChar(wc);
164 if (!((s[1] ^ 0x80) < 0x40))
165 return _invalidChar(2, wc);
// 5 payload bits from the lead byte, 6 from the continuation byte.
166 wc = ((Int32)(c & 0x1f) << 6) | (Int32)(s[1] ^ 0x80);
// Three-byte form. The extra test requires s[1] >= 0xa0 when c < 0xe1, which
// rejects encodings of values that would fit in a shorter form.
172 return _notEnoughChar(wc);
173 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0)))
174 return _invalidChar(4, wc);
175 wc = ((Int32)(c & 0x0f) << 12) | ((Int32)(s[1] ^ 0x80) << 6) | (Int32)(s[2] ^ 0x80);
// Four-byte form; s[1] >= 0x90 is required when c < 0xf1 (same shortest-form
// check as above).
181 return _notEnoughChar(wc);
182 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90)))
183 return _invalidChar(5, wc);
184 wc = ((Int32)(c & 0x07) << 18) | ((Int32)(s[1] ^ 0x80) << 12) | ((Int32)(s[2] ^ 0x80) << 6) | (Int32)(s[3] ^ 0x80);
// Five-byte form (values beyond the four-byte range); s[1] >= 0x88 is required
// when c < 0xf9.
194 return _notEnoughChar(wc);
195 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (c >= 0xf9 || s[1] >= 0x88)))
196 return _invalidChar(7, wc);
197 wc = ((Int32)(c & 0x03) << 24) | ((Int32)(s[1] ^ 0x80) << 18) | ((Int32)(s[2] ^ 0x80) << 12) | ((Int32)(s[3] ^ 0x80) << 6) | (Int32)(s[4] ^ 0x80);
// Six-byte form; s[1] >= 0x84 is required when c < 0xfd.
202 return _notEnoughChar(wc);
203 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (s[5] ^ 0x80) < 0x40 && (c >= 0xfd || s[1] >= 0x84)))
204 return _invalidChar(8, wc);
205 wc = ((Int32)(c & 0x01) << 30) | ((Int32)(s[1] ^ 0x80) << 24) | ((Int32)(s[2] ^ 0x80) << 18) | ((Int32)(s[3] ^ 0x80) << 12) | ((Int32)(s[4] ^ 0x80) << 6) | (Int32)(s[5] ^ 0x80);
// NOTE(review): presumably the fall-through for lead bytes matching none of
// the forms above; the enclosing condition is not visible.
208 return _invalidChar(9, wc);
// Surrogate handling of the UTF-16 -> code-point decoder (signature and the
// statements after each warning are not visible in this listing).
// A unit in [0xd800, 0xdc00) is a high surrogate and must be followed by a low
// surrogate in [0xdc00, 0xe000).
229 if (wc >= 0xd800 && wc < 0xdc00) {
// High surrogate is the last unit of the input: the pair is truncated.
230 if ((index + 1) == uchar.
size()) {
231 std::cout <<
"WARNING: utf16_to_ucs4(): Invalid sequence in conversion input (unexpected eof)\n";
235 Int32 wc2 = uchar[index + 1];
// The unit after a high surrogate is not a low surrogate.
236 if (!(wc2 >= 0xdc00 && wc2 < 0xe000)) {
237 std::cout <<
"WARNING: utf16_to_ucs4(): Invalid sequence (1) '" << wc2 <<
"' in conversion input\n";
// Combine the pair: 10 bits from each surrogate, offset by 0x10000.
241 wc = (0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00));
// Low surrogate without a preceding high surrogate. The upper bound is 0xe000
// (exclusive), the same bound used for `wc2` above: units 0xe000..0xffff are
// ordinary code points and must not be reported as invalid.
244 else if (wc >= 0xdc00 && wc < 0xe000) {
245 std::cout <<
"WARNING: utf16_to_ucs4(): Invalid sequence (2) '" << wc <<
"' in conversion input\n";
// Fragment of the code-point -> UTF-16 encoder (signature and branch
// conditions are not visible here).
// The code point is stored as a single 16-bit unit.
274 uchar.add((
UChar)wc);
// NOTE(review): the condition that leads to this warning is not visible.
278 std::cout <<
"WARNING: ucs4_to_utf16(): Invalid sequence in conversion input\n";
283 uchar.add((
UChar)wc);
// Surrogate pair: after subtracting 0x10000, the quotient by 0x400 (upper
// bits) goes into the high surrogate and the remainder (low 10 bits) into the
// low surrogate.
287 uchar.add((
UChar)((wc - 0x10000) / 0x400 + 0xd800));
288 uchar.add((
UChar)((wc - 0x10000) % 0x400 + 0xdc00));
// NOTE(review): same message as above but written to std::cerr, whereas the
// other visible warnings use std::cout — confirm which stream is intended.
291 std::cerr <<
"WARNING: ucs4_to_utf16(): Invalid sequence in conversion input\n";
298namespace Arcane::Impl
// Computes a length for the UChar string `ustr`, returned as Int64.
// NOTE(review): the scanning loop and the return statements are not visible in
// this listing; presumably the string is zero-terminated and the count is in
// UChar units — confirm.
304Int64 BasicTranscoder::
305stringLen(
const UChar* ustr)
// Null pointer or string whose first unit is 0: handled before any scan.
307 if (!ustr || ustr[0] == 0)
// The first unit is known to be non-zero here, so the cursor starts at the
// second unit.
309 const UChar* u = ustr + 1;
// UTF-16 -> UTF-8 conversion method of BasicTranscoder.
// NOTE(review): the method name and parameter list are not visible in this
// listing; `utf16` is the input and `utf8` the output as used below.
319void BasicTranscoder::
// Each iteration decodes one code point from `utf16`; the decoder's return
// value is the number of units consumed and advances `i`.
322 for (
Int64 i = 0, n = utf16.
size(); i < n;) {
324 i += utf16_to_ucs4(utf16, i, wc);
// Re-encode the code point and append it to `utf8`.
325 ucs4_to_utf8(wc, utf8);
// UTF-8 -> UTF-16 conversion method of BasicTranscoder.
// NOTE(review): the method name and parameter list are not visible in this
// listing; `utf8` is the input and `utf16` the output as used below.
332void BasicTranscoder::
// Each iteration decodes one code point from `utf8`; the decoder's return
// value is the number of bytes consumed and advances `i`.
335 for (
Int64 i = 0, n = utf8.
size(); i < n;) {
337 i += utf8_to_ucs4(utf8, i, wc);
// Re-encode the code point and append it to `utf16`.
338 ucs4_to_utf16(wc, utf16);
// Re-encodes the UTF-8 text code point by code point into `out_utf8`.
// NOTE(review): the declaration of `utf8` and the statements between the
// decode and the encode are not visible in this listing; presumably
// white-space code points are substituted there — confirm.
345void BasicTranscoder::
346replaceWS(CoreArray<Byte>& out_utf8)
// Decode one code point per iteration; `i` advances by the bytes consumed.
351 for (Int64 i = 0, n = utf8.
size(); i < n;) {
353 i += utf8_to_ucs4(utf8, i, wc);
357 ucs4_to_utf8(wc, out_utf8);
// White-space processing method of BasicTranscoder working on `out_utf8`.
// NOTE(review): the method name, the loop header and the branch conditions
// are not visible in this listing; presumably this collapses runs of
// white space — confirm.
364void BasicTranscoder::
// Read from a copy of `out_utf8` so the re-encoded text can be appended to
// `out_utf8` itself.
367 Impl::CoreArray<Byte> copy_utf8(out_utf8);
368 Span<const Byte> utf8(copy_utf8);
// Both flags start as true and are cleared when a code point is emitted below.
377 bool old_is_space =
true;
378 bool has_spaces_only =
true;
383 i += utf8_to_ucs4(utf8, i, wc);
// A code point is written to the output and both flags are reset.
390 old_is_space =
false;
391 ucs4_to_utf8(wc, out_utf8);
392 has_spaces_only =
false;
// Case where the last state was "space" but some non-space text was emitted
// and the output is not empty.
// NOTE(review): the action taken is not visible; presumably a trailing space
// is removed.
395 if (old_is_space && (!has_spaces_only)) {
396 if (out_utf8.size() > 0)
// Upper-casing method of BasicTranscoder working on `out_utf8`.
// NOTE(review): the method name and parameter list are not visible in this
// listing, nor is any clearing of `out_utf8` before the loop.
409void BasicTranscoder::
// Read from a copy of `out_utf8` so the converted text can be appended to
// `out_utf8` itself.
412 CoreArray<Byte> copy_utf8(out_utf8);
413 Span<const Byte> utf8(copy_utf8.view());
// Decode one code point per iteration, map it with _toUpper() and append its
// UTF-8 encoding to `out_utf8`.
415 for (
Int64 i = 0, n = utf8.
size(); i < n;) {
417 i += utf8_to_ucs4(utf8, i, wc);
418 Int32 upper_wc = _toUpper(wc);
419 ucs4_to_utf8(upper_wc, out_utf8);
// Lower-casing method of BasicTranscoder working on `out_utf8`.
// NOTE(review): the method name and parameter list are not visible in this
// listing, nor is any clearing of `out_utf8` before the loop.
426void BasicTranscoder::
// Read from a copy of `out_utf8` so the converted text can be appended to
// `out_utf8` itself.
429 CoreArray<Byte> copy_utf8(out_utf8);
430 Span<const Byte> utf8(copy_utf8.view());
// Decode one code point per iteration, map it with _toLower() and append its
// UTF-8 encoding to `out_utf8`. Despite its name, `upper_wc` holds the
// lower-case code point here.
432 for (
Int64 i = 0, n = utf8.
size(); i < n;) {
434 i += utf8_to_ucs4(utf8, i, wc);
435 Int32 upper_wc = _toLower(wc);
436 ucs4_to_utf8(upper_wc, out_utf8);
// Extraction method of BasicTranscoder: copies to `out_utf8` the code points
// whose position lies in [pos, pos + len).
// NOTE(review): the method name, the parameter list and the declaration of
// `utf8` are not visible in this listing; presumably positions count code
// points, not bytes — confirm.
443void BasicTranscoder::
// Position counter compared against the window below.
// NOTE(review): the statement that advances `current_pos` is not visible.
447 Int64 current_pos = 0;
448 for (
Int64 i = 0, n = utf8.
size(); i < n;) {
450 i += utf8_to_ucs4(utf8, i, wc);
// Keep only the code points inside the requested window.
451 if (current_pos >= pos && current_pos < (pos + len)) {
454 ucs4_to_utf8(wc, out_utf8);
// Encodes code point 0 into the output.
// NOTE(review): presumably a terminating null — confirm.
459 ucs4_to_utf8(0, out_utf8);
Internal array for Arccore.
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
View of an array of elements of type T.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
constexpr __host__ __device__ Int64 arccoreCheckLargeArraySize(size_t size)
Checks that size can be converted into an 'Int64' to serve as an array size.
unsigned char Byte
Type of a byte.
unsigned short UChar
Type of a unicode character.
std::int32_t Int32
Signed integer type of 32 bits.