14#include "arccore/base/CoreArray.h"
15#include "arccore/base/BasicTranscoder.h"
17#ifdef ARCCORE_HAS_GLIB
32bool _isSpace(Int32 wc)
34#ifdef ARCCORE_HAS_GLIB
35 return g_unichar_isspace(wc);
37 return std::iswspace(wc);
40Int32 _toUpper(Int32 wc)
42#ifdef ARCCORE_HAS_GLIB
43 return g_unichar_toupper(wc);
45 return std::towupper(wc);
48Int32 _toLower(Int32 wc)
50#ifdef ARCCORE_HAS_GLIB
51 return g_unichar_tolower(wc);
53 return std::towlower(wc);
57int _invalidChar(Int32 pos, Int32& wc)
59 std::cout <<
"WARNING: Invalid sequence '" << wc <<
"' in conversion input (position=" << pos <<
")\n";
64int _notEnoughChar(Int32& wc)
66 std::cout <<
"WARNING: Invalid sequence '" << wc <<
"' in conversion input (unexpected eof)\n";
92 else if (wc < 0x10000)
94 else if (wc < 0x200000)
96 else if (wc < 0x4000000)
102 r[5] = 0x80 | (wc & 0x3f);
107 r[4] = 0x80 | (wc & 0x3f);
112 r[3] = 0x80 | (wc & 0x3f);
117 r[2] = 0x80 | (wc & 0x3f);
122 r[1] = 0x80 | (wc & 0x3f);
129 for (
int i = 0; i < count; ++i)
130 utf8.add((
Byte)r[i]);
148 const Byte* s = uchar.
data() + index;
149 unsigned char c = s[0];
150 Int64 n = uchar.
size() - index;
157 return _invalidChar(1, wc);
161 return _notEnoughChar(wc);
162 if (!((s[1] ^ 0x80) < 0x40))
163 return _invalidChar(2, wc);
164 wc = ((
Int32)(c & 0x1f) << 6) | (Int32)(s[1] ^ 0x80);
170 return _notEnoughChar(wc);
171 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0)))
172 return _invalidChar(4, wc);
173 wc = ((
Int32)(c & 0x0f) << 12) | ((Int32)(s[1] ^ 0x80) << 6) | (
Int32)(s[2] ^ 0x80);
179 return _notEnoughChar(wc);
180 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90)))
181 return _invalidChar(5, wc);
182 wc = ((
Int32)(c & 0x07) << 18) | ((Int32)(s[1] ^ 0x80) << 12) | ((
Int32)(s[2] ^ 0x80) << 6) | (Int32)(s[3] ^ 0x80);
192 return _notEnoughChar(wc);
193 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (c >= 0xf9 || s[1] >= 0x88)))
194 return _invalidChar(7, wc);
195 wc = ((
Int32)(c & 0x03) << 24) | ((Int32)(s[1] ^ 0x80) << 18) | ((
Int32)(s[2] ^ 0x80) << 12) | ((Int32)(s[3] ^ 0x80) << 6) | (
Int32)(s[4] ^ 0x80);
200 return _notEnoughChar(wc);
201 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (s[5] ^ 0x80) < 0x40 && (c >= 0xfd || s[1] >= 0x84)))
202 return _invalidChar(8, wc);
203 wc = ((
Int32)(c & 0x01) << 30) | ((Int32)(s[1] ^ 0x80) << 24) | ((
Int32)(s[2] ^ 0x80) << 18) | ((Int32)(s[3] ^ 0x80) << 12) | ((
Int32)(s[4] ^ 0x80) << 6) | (Int32)(s[5] ^ 0x80);
206 return _invalidChar(9, wc);
226 if (wc >= 0xd800 && wc < 0xdc00) {
227 if ((index + 1) == uchar.
size()) {
228 std::cout <<
"WARNING: utf16_to_ucs4(): Invalid sequence in conversion input (unexpected eof)\n";
232 Int32 wc2 = uchar[index + 1];
233 if (!(wc2 >= 0xdc00 && wc2 < 0xe000)) {
234 std::cout <<
"WARNING: utf16_to_ucs4(): Invalid sequence (1) '" << wc2 <<
"' in conversion input\n";
238 wc = (0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00));
// A lone low surrogate lies in [0xDC00, 0xDFFF], the same range the paired
// check on wc2 uses. The upper bound must therefore be 0xe000: with 0xe0000
// every value from 0xE000 up to 0xDFFFF would also take this branch and be
// reported as an invalid sequence.
241 else if (wc >= 0xdc00 && wc < 0xe000) {
242 std::cout <<
"WARNING: utf16_to_ucs4(): Invalid sequence (2) '" << wc <<
"' in conversion input\n";
271 uchar.add((
UChar)wc);
275 std::cout <<
"WARNING: ucs4_to_utf16(): Invalid sequence in conversion input\n";
280 uchar.add((
UChar)wc);
284 uchar.add( (
UChar) ((wc - 0x10000) / 0x400 + 0xd800) );
285 uchar.add( (
UChar) ((wc - 0x10000) % 0x400 + 0xdc00) );
288 std::cerr <<
"WARNING: ucs4_to_utf16(): Invalid sequence in conversion input\n";
298Int64 BasicTranscoder::
299stringLen(
const UChar* ustr)
301 if (!ustr || ustr[0] == 0)
303 const UChar* u = ustr + 1;
318 i += utf16_to_ucs4(utf16,i,wc);
319 ucs4_to_utf8(wc,utf8);
326void BasicTranscoder::
331 i += utf8_to_ucs4(utf8,i,wc);
332 ucs4_to_utf16(wc,utf16);
339void BasicTranscoder::
345 for (Int64 i = 0, n = utf8.
size(); i < n;) {
347 i += utf8_to_ucs4(utf8, i, wc);
351 ucs4_to_utf8(wc, out_utf8);
358void BasicTranscoder::
365 Int64 n = utf8.
size();
371 bool old_is_space =
true;
372 bool has_spaces_only =
true;
377 i += utf8_to_ucs4(utf8, i, wc);
384 old_is_space =
false;
385 ucs4_to_utf8(wc, out_utf8);
386 has_spaces_only =
false;
389 if (old_is_space && (!has_spaces_only)) {
390 if (out_utf8.
size() > 0)
403void BasicTranscoder::
409 for (Int64 i = 0, n = utf8.
size(); i < n;) {
411 i += utf8_to_ucs4(utf8, i, wc);
412 Int32 upper_wc = _toUpper(wc);
413 ucs4_to_utf8(upper_wc, out_utf8);
420void BasicTranscoder::
426 for (Int64 i = 0, n = utf8.
size(); i < n;) {
428 i += utf8_to_ucs4(utf8, i, wc);
429 Int32 upper_wc = _toLower(wc);
430 ucs4_to_utf8(upper_wc, out_utf8);
437void BasicTranscoder::
441 Int64 current_pos = 0;
442 for( Int64 i=0, n=utf8.
size(); i<n; ){
444 i += utf8_to_ucs4(utf8,i,wc);
445 if (current_pos>=pos && current_pos<(pos+len)){
448 ucs4_to_utf8(wc,out_utf8);
453 ucs4_to_utf8(0,out_utf8);
static void transcodeFromUtf16ToUtf8(Span< const UChar > utf16, CoreArray< Byte > &utf8)
Traduit depuis UTF16 vers UTF8.
Int64 size() const
Retourne la taille du tableau.
constexpr __host__ __device__ SizeType size() const noexcept
Retourne la taille du tableau.
constexpr __host__ __device__ pointer data() const noexcept
Pointeur sur le début de la vue.
Vue d'un tableau d'éléments de type T.
Espace de nom de Arccore.
unsigned short UChar
Type d'un caractère unicode.
constexpr __host__ __device__ Int64 arccoreCheckLargeArraySize(size_t size)
Vérifie que size peut être converti dans un 'Int64' pour servir de taille à un tableau.
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.
unsigned char Byte
Type d'un octet.