Arcane  4.1.12.0
User documentation
Loading...
Searching...
No Matches
BasicTranscoder.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* BasicTranscoder.cc (C) 2000-2025 */
9/* */
10/* Conversions between utf8 and utf16. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/base/CoreArray.h"
15#include "arccore/base/BasicTranscoder.h"
16
17#ifdef ARCCORE_HAS_GLIB
18#include <glib.h>
19#else
20#include <cwctype>
21#endif
22
23#include <iostream>
24
25/*---------------------------------------------------------------------------*/
26/*---------------------------------------------------------------------------*/
27
28namespace
29{
30using namespace Arcane;
31
//! Returns true when the code point \a wc is classified as white space.
bool _isSpace(Int32 wc)
{
#if defined(ARCCORE_HAS_GLIB)
  // GLib build: classification is delegated to g_unichar_isspace().
  const bool is_space = g_unichar_isspace(wc);
#else
  // Fallback: C library wide-character classification (depends on the
  // current C locale).
  const bool is_space = std::iswspace(wc);
#endif
  return is_space;
}
//! Returns the upper-case equivalent of the code point \a wc.
Int32 _toUpper(Int32 wc)
{
#if defined(ARCCORE_HAS_GLIB)
  // GLib build: conversion is delegated to g_unichar_toupper().
  const Int32 converted = g_unichar_toupper(wc);
#else
  // Fallback: C library wide-character conversion (depends on the
  // current C locale).
  const Int32 converted = std::towupper(wc);
#endif
  return converted;
}
//! Returns the lower-case equivalent of the code point \a wc.
Int32 _toLower(Int32 wc)
{
#if defined(ARCCORE_HAS_GLIB)
  // GLib build: conversion is delegated to g_unichar_tolower().
  const Int32 converted = g_unichar_tolower(wc);
#else
  // Fallback: C library wide-character conversion (depends on the
  // current C locale).
  const Int32 converted = std::towlower(wc);
#endif
  return converted;
}
56
/*!
 * \brief Reports an invalid input sequence and substitutes '?'.
 *
 * Prints a warning on std::cout containing the value currently held by
 * \a wc and the value of \a pos, then overwrites \a wc with '?'.
 *
 * \param pos value printed as 'position' in the warning
 * \param wc [in,out] printed, then replaced by '?'
 * \return always 1, which callers use as the number of input units consumed.
 */
int _invalidChar(Int32 pos, Int32& wc)
{
  constexpr int nb_consumed = 1;
  std::cout << "WARNING: Invalid sequence '" << wc
            << "' in conversion input (position=" << pos << ")\n";
  wc = '?';
  return nb_consumed;
}
63
/*!
 * \brief Reports a sequence truncated by the end of input and substitutes '?'.
 *
 * Prints a warning on std::cout containing the value currently held by
 * \a wc, then overwrites \a wc with '?'.
 *
 * \param wc [in,out] printed, then replaced by '?'
 * \return always 1, which callers use as the number of input units consumed.
 */
int _notEnoughChar(Int32& wc)
{
  constexpr int nb_consumed = 1;
  std::cout << "WARNING: Invalid sequence '" << wc
            << "' in conversion input (unexpected eof)\n";
  wc = '?';
  return nb_consumed;
}
70
71/*---------------------------------------------------------------------------*/
72/*---------------------------------------------------------------------------*/
73
74/*!
75 * \brief Convert a Unicode character (UCS4) to utf8.
76 *
77 * Routine retrieved from libiconv.
78 *
79 * One ucs4 character generates between 1 and 6 utf8 characters.
80 * The converted characters are added to the utf8 array.
81 *
82 * \param wc ucs4 value of the character to convert
83 * \param utf8[out] Array containing the converted utf8 characters
84 */
85void ucs4_to_utf8(Int32 wc, Impl::CoreArray<Byte>& utf8)
86{
87 Int32 r[6];
88 int count;
89 if (wc < 0x80)
90 count = 1;
91 else if (wc < 0x800)
92 count = 2;
93 else if (wc < 0x10000)
94 count = 3;
95 else if (wc < 0x200000)
96 count = 4;
97 else if (wc < 0x4000000)
98 count = 5;
99 else
100 count = 6;
101 switch (count) { /* note: code falls through cases! */
102 case 6:
103 r[5] = 0x80 | (wc & 0x3f);
104 wc = wc >> 6;
105 wc |= 0x4000000;
106 [[fallthrough]];
107 case 5:
108 r[4] = 0x80 | (wc & 0x3f);
109 wc = wc >> 6;
110 wc |= 0x200000;
111 [[fallthrough]];
112 case 4:
113 r[3] = 0x80 | (wc & 0x3f);
114 wc = wc >> 6;
115 wc |= 0x10000;
116 [[fallthrough]];
117 case 3:
118 r[2] = 0x80 | (wc & 0x3f);
119 wc = wc >> 6;
120 wc |= 0x800;
121 [[fallthrough]];
122 case 2:
123 r[1] = 0x80 | (wc & 0x3f);
124 wc = wc >> 6;
125 wc |= 0xc0;
126 [[fallthrough]];
127 case 1:
128 r[0] = wc;
129 }
130 for (int i = 0; i < count; ++i)
131 utf8.add((Byte)r[i]);
132}
133/*---------------------------------------------------------------------------*/
134/*---------------------------------------------------------------------------*/
135
136/*!
137 * \brief Convert a utf8 character to unicode (UCS4).
138 *
139 * Routine retrieved from libiconv.
140 *
141 * One ucs4 character is created from 1 to 6 utf8 characters.
142 *
143 * \param uchar Array containing the utf8 characters to convert
144 * \param index index of the first element of the array to convert
145 * \param wc [out] ucs4 value of the character.
146 * \return the number of utf8 characters read.
147 */
148Int64 utf8_to_ucs4(Span<const Byte> uchar, Int64 index, Int32& wc)
149{
150 const Byte* s = uchar.data() + index;
151 unsigned char c = s[0];
152 Int64 n = uchar.size() - index;
153 if (c < 0x80) {
154 wc = c;
155 return 1;
156 }
157
158 if (c < 0xc2)
159 return _invalidChar(1, wc);
160
161 if (c < 0xe0) {
162 if (n < 2)
163 return _notEnoughChar(wc);
164 if (!((s[1] ^ 0x80) < 0x40))
165 return _invalidChar(2, wc);
166 wc = ((Int32)(c & 0x1f) << 6) | (Int32)(s[1] ^ 0x80);
167 return 2;
168 }
169
170 if (c < 0xf0) {
171 if (n < 3)
172 return _notEnoughChar(wc);
173 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0)))
174 return _invalidChar(4, wc);
175 wc = ((Int32)(c & 0x0f) << 12) | ((Int32)(s[1] ^ 0x80) << 6) | (Int32)(s[2] ^ 0x80);
176 return 3;
177 }
178
179 if (c < 0xf8) {
180 if (n < 4)
181 return _notEnoughChar(wc);
182 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90)))
183 return _invalidChar(5, wc);
184 wc = ((Int32)(c & 0x07) << 18) | ((Int32)(s[1] ^ 0x80) << 12) | ((Int32)(s[2] ^ 0x80) << 6) | (Int32)(s[3] ^ 0x80);
185 return 4;
186 }
187
188 // We should never reach here
189 // because there are no longer (since the 2003 standard)
190 // UTF-8 characters encoded in 5 or 6 bytes.
191
192 if (c < 0xfc) {
193 if (n < 5)
194 return _notEnoughChar(wc);
195 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (c >= 0xf9 || s[1] >= 0x88)))
196 return _invalidChar(7, wc);
197 wc = ((Int32)(c & 0x03) << 24) | ((Int32)(s[1] ^ 0x80) << 18) | ((Int32)(s[2] ^ 0x80) << 12) | ((Int32)(s[3] ^ 0x80) << 6) | (Int32)(s[4] ^ 0x80);
198 return 5;
199 }
200 if (c < 0xfe) {
201 if (n < 6)
202 return _notEnoughChar(wc);
203 if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (s[5] ^ 0x80) < 0x40 && (c >= 0xfd || s[1] >= 0x84)))
204 return _invalidChar(8, wc);
205 wc = ((Int32)(c & 0x01) << 30) | ((Int32)(s[1] ^ 0x80) << 24) | ((Int32)(s[2] ^ 0x80) << 18) | ((Int32)(s[3] ^ 0x80) << 12) | ((Int32)(s[4] ^ 0x80) << 6) | (Int32)(s[5] ^ 0x80);
206 return 6;
207 }
208 return _invalidChar(9, wc);
209}
210
211/*---------------------------------------------------------------------------*/
212/*---------------------------------------------------------------------------*/
213
214/*!
215 * \brief Convert a utf16 character to unicode (UCS4).
216 *
217 * Routine retrieved from libiconv.
218 *
219 * One ucs4 character is created from 1 or 2 utf16 characters.
220 *
221 * \param uchar Array containing the utf16 characters to convert
222 * \param index index of the first element of the array to convert
223 * \param wc [out] ucs4 value of the character.
224 * \return the number of utf16 characters read.
225 */
226Int64 utf16_to_ucs4(Span<const UChar> uchar, Int64 index, Int32& wc)
227{
228 wc = uchar[index];
229 if (wc >= 0xd800 && wc < 0xdc00) {
230 if ((index + 1) == uchar.size()) {
231 std::cout << "WARNING: utf16_to_ucs4(): Invalid sequence in conversion input (unexpected eof)\n";
232 wc = 0x1A;
233 return 1;
234 }
235 Int32 wc2 = uchar[index + 1];
236 if (!(wc2 >= 0xdc00 && wc2 < 0xe000)) {
237 std::cout << "WARNING: utf16_to_ucs4(): Invalid sequence (1) '" << wc2 << "' in conversion input\n";
238 wc = 0x1A;
239 return 1;
240 }
241 wc = (0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00));
242 return 2;
243 }
244 else if (wc >= 0xdc00 && wc < 0xe0000) {
245 std::cout << "WARNING: utf16_to_ucs4(): Invalid sequence (2) '" << wc << "' in conversion input\n";
246 wc = 0x1A;
247 return 1;
248 }
249 return 1;
250}
251
252/*---------------------------------------------------------------------------*/
253/*---------------------------------------------------------------------------*/
254
255} // namespace
256
257/*---------------------------------------------------------------------------*/
258/*---------------------------------------------------------------------------*/
259
260/*!
261 * \brief Convert a character (UCS4) to utf16 big-endian.
262 *
263 * Routine retrieved from libiconv.
264 *
265 * One ucs4 character generates 1 or 2 utf16 characters. The
266 * converted characters are added to the uchar array
267 *
268 * \param wc ucs4 value of the character to convert
269 * \param uchar[out] Array containing the converted utf16 characters
270 */
271void ucs4_to_utf16(Int32 wc, Impl::CoreArray<UChar>& uchar)
272{
273 if (wc < 0xd800) {
274 uchar.add((UChar)wc);
275 return;
276 }
277 if (wc < 0xe000) {
278 std::cout << "WARNING: ucs4_to_utf16(): Invalid sequence in conversion input\n";
279 uchar.add(0x1A);
280 return;
281 }
282 if (wc < 0x10000) {
283 uchar.add((UChar)wc);
284 return;
285 }
286 if (wc < 0x110000) {
287 uchar.add((UChar)((wc - 0x10000) / 0x400 + 0xd800));
288 uchar.add((UChar)((wc - 0x10000) % 0x400 + 0xdc00));
289 return;
290 }
291 std::cerr << "WARNING: ucs4_to_utf16(): Invalid sequence in conversion input\n";
292 uchar.add(0x1A);
293}
294
295/*---------------------------------------------------------------------------*/
296/*---------------------------------------------------------------------------*/
297
298namespace Arcane::Impl
299{
300
301/*---------------------------------------------------------------------------*/
302/*---------------------------------------------------------------------------*/
303
304Int64 BasicTranscoder::
305stringLen(const UChar* ustr)
306{
307 if (!ustr || ustr[0] == 0)
308 return 0;
309 const UChar* u = ustr + 1;
310 while ((*u) != 0)
311 ++u;
312 return arccoreCheckLargeArraySize((std::size_t)(u - ustr));
313}
314
315/*---------------------------------------------------------------------------*/
316/*---------------------------------------------------------------------------*/
317
318//! Translates from UTF16 to UTF8
319void BasicTranscoder::
320transcodeFromUtf16ToUtf8(Span<const UChar> utf16, CoreArray<Byte>& utf8)
321{
322 for (Int64 i = 0, n = utf16.size(); i < n;) {
323 Int32 wc;
324 i += utf16_to_ucs4(utf16, i, wc);
325 ucs4_to_utf8(wc, utf8);
326 }
327}
328
329/*---------------------------------------------------------------------------*/
330/*---------------------------------------------------------------------------*/
331
332void BasicTranscoder::
333transcodeFromUtf8ToUtf16(Span<const Byte> utf8, CoreArray<UChar>& utf16)
334{
335 for (Int64 i = 0, n = utf8.size(); i < n;) {
336 Int32 wc;
337 i += utf8_to_ucs4(utf8, i, wc);
338 ucs4_to_utf16(wc, utf16);
339 }
340}
341
342/*---------------------------------------------------------------------------*/
343/*---------------------------------------------------------------------------*/
344
345void BasicTranscoder::
346replaceWS(CoreArray<Byte>& out_utf8)
347{
348 Impl::CoreArray<Byte> copy_utf8(out_utf8);
349 Span<const Byte> utf8(copy_utf8);
350 out_utf8.clear();
351 for (Int64 i = 0, n = utf8.size(); i < n;) {
352 Int32 wc;
353 i += utf8_to_ucs4(utf8, i, wc);
354 if (_isSpace(wc))
355 out_utf8.add(' ');
356 else
357 ucs4_to_utf8(wc, out_utf8);
358 }
359}
360
361/*---------------------------------------------------------------------------*/
362/*---------------------------------------------------------------------------*/
363
364void BasicTranscoder::
365collapseWS(CoreArray<Byte>& out_utf8)
366{
367 Impl::CoreArray<Byte> copy_utf8(out_utf8);
368 Span<const Byte> utf8(copy_utf8);
369 out_utf8.clear();
370 Int64 i = 0;
371 Int64 n = utf8.size();
372 // If the string is empty, return an empty string.
373 if (n == 1) {
374 out_utf8.add('\0');
375 return;
376 }
377 bool old_is_space = true;
378 bool has_spaces_only = true;
379 for (; i < n;) {
380 if (utf8[i] == 0)
381 break;
382 Int32 wc;
383 i += utf8_to_ucs4(utf8, i, wc);
384 if (_isSpace(wc)) {
385 if (!old_is_space)
386 out_utf8.add(' ');
387 old_is_space = true;
388 }
389 else {
390 old_is_space = false;
391 ucs4_to_utf8(wc, out_utf8);
392 has_spaces_only = false;
393 }
394 }
395 if (old_is_space && (!has_spaces_only)) {
396 if (out_utf8.size() > 0)
397 out_utf8.back() = 0;
398 }
399 else {
400 if (has_spaces_only)
401 out_utf8.add(' ');
402 out_utf8.add(0);
403 }
404}
405
406/*---------------------------------------------------------------------------*/
407/*---------------------------------------------------------------------------*/
408
409void BasicTranscoder::
410upperCase(CoreArray<Byte>& out_utf8)
411{
412 CoreArray<Byte> copy_utf8(out_utf8);
413 Span<const Byte> utf8(copy_utf8.view());
414 out_utf8.clear();
415 for (Int64 i = 0, n = utf8.size(); i < n;) {
416 Int32 wc;
417 i += utf8_to_ucs4(utf8, i, wc);
418 Int32 upper_wc = _toUpper(wc);
419 ucs4_to_utf8(upper_wc, out_utf8);
420 }
421}
422
423/*---------------------------------------------------------------------------*/
424/*---------------------------------------------------------------------------*/
425
426void BasicTranscoder::
427lowerCase(CoreArray<Byte>& out_utf8)
428{
429 CoreArray<Byte> copy_utf8(out_utf8);
430 Span<const Byte> utf8(copy_utf8.view());
431 out_utf8.clear();
432 for (Int64 i = 0, n = utf8.size(); i < n;) {
433 Int32 wc;
434 i += utf8_to_ucs4(utf8, i, wc);
435 Int32 upper_wc = _toLower(wc);
436 ucs4_to_utf8(upper_wc, out_utf8);
437 }
438}
439
440/*---------------------------------------------------------------------------*/
441/*---------------------------------------------------------------------------*/
442
443void BasicTranscoder::
444substring(CoreArray<Byte>& out_utf8, Span<const Byte> utf8, Int64 pos, Int64 len)
445{
446 // Copy the 'len' Unicode characters from the 'pos' position of the utf8 array
447 Int64 current_pos = 0;
448 for (Int64 i = 0, n = utf8.size(); i < n;) {
449 Int32 wc;
450 i += utf8_to_ucs4(utf8, i, wc);
451 if (current_pos >= pos && current_pos < (pos + len)) {
452 // To ensure the terminal null character is not added
453 if (wc != 0)
454 ucs4_to_utf8(wc, out_utf8);
455 }
456 ++current_pos;
457 }
458 // Add the terminal null character
459 ucs4_to_utf8(0, out_utf8);
460}
461
462/*---------------------------------------------------------------------------*/
463/*---------------------------------------------------------------------------*/
464
465} // namespace Arcane::Impl
466
467/*---------------------------------------------------------------------------*/
468/*---------------------------------------------------------------------------*/
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
Definition Span.h:539
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:327
View of an array of elements of type T.
Definition Span.h:635
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
constexpr __host__ __device__ Int64 arccoreCheckLargeArraySize(size_t size)
Checks that size can be converted into an 'Int64' to serve as an array size.
unsigned char Byte
Type of a byte.
Definition BaseTypes.h:43
unsigned short UChar
Type of a unicode character.
Definition BaseTypes.h:47
std::int32_t Int32
Signed integer type of 32 bits.