pegasus/src/Pegasus/Common/CommonUTF.cpp - annotate

Return to CommonUTF.cpp CVS log

Up to [Pegasus] / pegasus / src / Pegasus / Common

1 martin 1.23 //%LICENSE////////////////////////////////////////////////////////////////
2 martin 1.24 //
3 martin 1.23 // Licensed to The Open Group (TOG) under one or more contributor license 4 // agreements. Refer to the OpenPegasusNOTICE.txt file distributed with 5 // this work for additional information regarding copyright ownership. 6 // Each contributor licenses this file to you under the OpenPegasus Open 7 // Source License; you may not use this file except in compliance with the 8 // License.
9 martin 1.24 //
10 martin 1.23 // Permission is hereby granted, free of charge, to any person obtaining a 11 // copy of this software and associated documentation files (the "Software"), 12 // to deal in the Software without restriction, including without limitation 13 // the rights to use, copy, modify, merge, publish, distribute, sublicense, 14 // and/or sell copies of the Software, and to permit persons to whom the 15 // Software is furnished to do so, subject to the following conditions:
16 martin 1.24 //
17 martin 1.23 // The above copyright notice and this permission notice shall be included 18 // in all copies or substantial portions of the Software.
19 martin 1.24 //
20 martin 1.23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 martin 1.24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 martin 1.23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 23 // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 24 // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 25 // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 26 // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 martin 1.24 //
28 martin 1.23 //////////////////////////////////////////////////////////////////////////
29 david 1.1 //
30 kamal.locahana 1.22 //%////////////////////////////////////////////////////////////////////////////
31 david 1.1
32 chuck 1.7 #include <Pegasus/Common/Config.h> 33 #include <Pegasus/Common/Array.h>
34 yi.zhou 1.16 #include <Pegasus/Common/Logger.h>
35 david 1.1 #include "CommonUTF.h"
36 thilo.boehm 1.25 #include <Pegasus/Common/String.h>
37 chuck 1.7 #include <cstdio>
38 david 1.2 #include <cstring>
39 david.dillard 1.15 #include <cctype>
40 kumpf 1.3
41 yi.zhou 1.16 #ifdef PEGASUS_HAS_ICU 42 #include <unicode/uclean.h> 43 #endif 44
45 david 1.1 PEGASUS_NAMESPACE_BEGIN
46 kumpf 1.3
// Constants and lookup tables shared by the UTF-8 / UTF-16 conversion
// routines in this file:
//   halfBase, halfMask, halfShift - used to fold a UTF-16 surrogate pair into
//       a single value and to split such a value back into a pair.
//   firstByteMark[n]        - bits OR-ed into the first byte of an n-byte
//       UTF-8 sequence when encoding.
//   offsetsFromUTF8[t]      - value subtracted once the bytes of a sequence
//       with t trailing bytes have been accumulated when decoding.
//   trailingBytesForUTF8[b] - number of trailing bytes that follow the lead
//       byte b.
47 karl 1.19 const Uint32 halfBase = 0x0010000UL; 48 const Uint32 halfMask = 0x3FFUL; 49 const int halfShift = 10; 50 const Uint8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 51 52 const Uint32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 53 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 54 55 const char trailingBytesForUTF8[256] = { 56 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 57 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 58 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 59 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 60 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 61 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 62 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 63 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 64 };
65 david.dillard 1.12 inline Uint8 _hexCharToNumeric(Char16 c)
66 chuck 1.6 { 67 Uint8 n; 68
69 david.dillard 1.14 if (isdigit(c))
70 chuck 1.6 n = (c - '0');
71 david.dillard 1.14 else if (isupper(c))
72 chuck 1.6 n = (c - 'A' + 10); 73 else // if (islower(c)) 74 n = (c - 'a' + 10); 75 76 return n; 77 } 78
79 kumpf 1.3 // Note: Caller must ensure that "src" contains "size" bytes.
80 chip 1.11 Boolean isValid_U8(const Uint8 *src, int size)
81 david 1.1 { 82 Uint8 U8_char; 83 const Uint8 *srcptr = src+size; 84 switch (size) 85 {
86 chip 1.11 case 4: 87 if ((U8_char = (--srcptr)) < 0x80 \|\| U8_char > 0xBF) 88 { 89 return false; 90 } 91 case 3: 92 if ((U8_char = (--srcptr)) < 0x80 \|\| U8_char > 0xBF) 93 { 94 return false; 95 } 96 case 2: 97 if ((U8_char = (--srcptr)) > 0xBF) 98 { 99 return false; 100 } 101 switch (src) 102 { 103 case 0xE0: 104 if (U8_char < 0xA0) 105 { 106 return false; 107 chip 1.11 } 108 break; 109 case 0xF0: 110 if (U8_char < 0x90) 111 { 112 return false; 113 } 114 break; 115 case 0xF4: 116 if (U8_char > 0x8F) 117 { 118 return false; 119 } 120 break; 121 default: 122 if (U8_char < 0x80) 123 { 124 return false; 125 } 126 } 127 case 1: 128 chip 1.11 if (src >= 0x80 && src < 0xC2) 129 { 130 return false; 131 } 132 if (*src > 0xF4) 133 { 134 return false; 135 } 136 break;
137 david 1.2 default:
138 chip 1.11 { 139 return false;
140 david 1.2 }
141 david 1.1 142 } 143 return true;
144 chip 1.11 }
145 david 1.1 146 int UTF16toUTF8(const Uint16** srcHead,
147 chip 1.11 const Uint16* srcEnd, 148 Uint8** tgtHead, 149 Uint8* tgtEnd)
150 david 1.1 { 151 int returnCode = 0; 152 const Uint16* src = srcHead; 153 Uint8 tgt = *tgtHead; 154 while (src < srcEnd) 155 {
156 kumpf 1.20 if (*src < 128) 157 { 158 if (tgt == tgtEnd) 159 { 160 returnCode = -1; 161 break; 162 } 163
164 kamal.locahana 1.22 tgt++ = (Uint8)src++;
165 kumpf 1.20 continue; 166 }
167 mike 1.17
168 chip 1.11 Uint32 tempchar; 169 Uint16 numberOfBytes = 0; 170 const Uint16* oldsrc = src; 171 tempchar = src++; 172 if (tempchar >= FIRST_HIGH_SURROGATE 173 && tempchar <= LAST_HIGH_SURROGATE) 174 { 175 if (src < srcEnd) 176 { 177 Uint32 tempchar2 = src; 178 if (tempchar2 >= FIRST_LOW_SURROGATE && 179 tempchar2 <= LAST_LOW_SURROGATE) 180 { 181 tempchar = ((tempchar - FIRST_HIGH_SURROGATE) << halfShift) 182 + (tempchar2 - FIRST_LOW_SURROGATE) + halfBase; 183 ++src; 184 } 185 } 186 else 187 { 188 --src; 189 chip 1.11 returnCode = -1; 190 break; 191 } 192 } 193 if (tempchar < (Uint32)0x80) 194 { 195 numberOfBytes = 1; 196 } 197 else if (tempchar < (Uint32)0x800) 198 { 199 numberOfBytes = 2; 200 } 201 else if (tempchar < (Uint32)0x10000) 202 { 203 numberOfBytes = 3; 204 } 205 else if (tempchar < (Uint32)0x200000) 206 { 207 numberOfBytes = 4; 208 } 209 else 210 chip 1.11 { 211 numberOfBytes = 2; 212 tempchar = REPLACEMENT_CHARACTER; 213 } 214 215 tgt += numberOfBytes; 216 if (tgt > tgtEnd) 217 { 218 src = oldsrc; 219 tgt -= numberOfBytes; 220 returnCode = -1; 221 break; 222 } 223 224 switch (numberOfBytes) 225 { 226 case 4: 227 --tgt = (Uint8)((tempchar \| 0x80) & 0xBF); 228 tempchar >>= 6; 229 case 3: 230 --tgt = (Uint8)((tempchar \| 0x80) & 0xBF); 231 chip 1.11 tempchar >>= 6; 232 case 2: 233 --tgt = (Uint8)((tempchar \| 0x80) & 0xBF); 234 tempchar >>= 6; 235 case 1: 236 --tgt = (Uint8)(tempchar \| firstByteMark[numberOfBytes]); 237 } 238 tgt += numberOfBytes;
239 david 1.1 } 240 srcHead = src; 241 tgtHead = tgt; 242 return returnCode; 243 } 244 245 int UTF8toUTF16 (const Uint8** srcHead,
246 chip 1.11 const Uint8* srcEnd, 247 Uint16** tgtHead, 248 Uint16* tgtEnd)
249 david 1.1 { 250 int returnCode = 0; 251 const Uint8* src = srcHead; 252 Uint16 tgt = *tgtHead; 253 while (src < srcEnd) 254 {
255 chip 1.11 Uint32 tempchar = 0; 256 Uint16 moreBytes = trailingBytesForUTF8[src]; 257 if (src + moreBytes >= srcEnd) 258 { 259 returnCode = -1; 260 break; 261 } 262 switch (moreBytes) 263 { 264 case 3: 265 tempchar += src++; 266 tempchar <<= 6; 267 case 2: 268 tempchar += src++; 269 tempchar <<= 6; 270 case 1: 271 tempchar += src++; 272 tempchar <<= 6; 273 case 0: 274 tempchar += src++; 275 } 276 chip 1.11 tempchar -= offsetsFromUTF8[moreBytes]; 277 278 if (tgt >= tgtEnd) 279 { 280 src -= (moreBytes+1); 281 returnCode = -1; break; 282 } 283 if (tempchar <= MAX_BYTE) 284 { 285 if ((tempchar >= FIRST_HIGH_SURROGATE && 286 tempchar <= LAST_LOW_SURROGATE) \|\| 287 ((tempchar & 0xFFFE) == 0xFFFE)) 288 { 289 tgt++ = REPLACEMENT_CHARACTER; 290 } 291 else 292 { 293 tgt++ = (Uint16)tempchar; 294 } 295 } 296 else if (tempchar > MAX_UTF16) 297 chip 1.11 { 298 tgt++ = REPLACEMENT_CHARACTER; 299 } 300 else 301 { 302 if (tgt + 1 >= tgtEnd) 303 { 304 src -= (moreBytes+1); 305 returnCode = -1; 306 break; 307 } 308 tempchar -= halfBase; 309 tgt++ = (Uint16)((tempchar >> halfShift) + FIRST_HIGH_SURROGATE); 310 tgt++ = (Uint16)((tempchar & halfMask) + FIRST_LOW_SURROGATE); 311 }
312 david 1.1 } 313 srcHead = src; 314 tgtHead = tgt; 315 return returnCode; 316 }
317 david 1.5
318 mike 1.17 Boolean isUTF8Aux(const char *legal)
319 david 1.5 { 320 char numBytes = UTF_8_COUNT_TRAIL_BYTES(*legal)+1; 321 322 // Validate that the string is long enough to hold all the expected bytes. 323 // Note that if legal[0] == 0, numBytes will be 1. 324 for (char i=1; i<numBytes; i++) 325 { 326 if (legal[i] == 0) 327 { 328 return false; 329 } 330 } 331
332 kumpf 1.21 return isValid_U8((const Uint8 *)legal, numBytes);
333 david 1.5 }
334 chuck 1.6
335 chuck 1.9 Boolean isUTF8Str(const char *legal) 336 {
337 chip 1.11 /*char tmp[] = {0xCE,0x99,0xCE,0xBF,0xCF,0x8D,0xCE,0xBD,0xCE,
338 chuck 1.9 0xB9,0xCE,0xBA,0xCE,0xBF,0xCE,0xBD,0xCF,0x84, 339 0x00};*/
340 chip 1.11 // char tmp_[] = "class"; 341 // char * tmp = legal;
342 david.dillard 1.13 size_t count = 0; 343 const size_t size = strlen(legal);
344 chip 1.11 // printf("size = %d\n",size);
345 kumpf 1.21 while (count<size)
346 david.dillard 1.13 {
347 chip 1.11 // printf("count = %d\n",count);
348 kumpf 1.21 if (isUTF8(&legal[count]) == true) 349 {
350 david.dillard 1.13 UTF8_NEXT(legal,count);
351 kumpf 1.21 } 352 else 353 {
354 chip 1.11 // printf("bad string\n"); 355 return false; 356 }
357 david.dillard 1.13 }
358 chip 1.11 // printf("good string\n"); 359 return true;
360 chuck 1.9 /*
361 chip 1.11 printf("legal = %s\n\n", legal); 362 Uint32 count = 0; 363 Uint32 trailingBytes = 0;
364 kumpf 1.21 Uint32 size = strlen(legal);
365 chip 1.11 printf("size of legal is %d\n",size);
366 kumpf 1.21 while (count<size-1) 367 { 368 printf("count = %d\n", count); 369 if (isUTF8((char*)&legal[count]) == true)
370 chuck 1.9 {
371 kumpf 1.21 UTF8_NEXT(legal,trailingBytes);
372 chip 1.11 count += trailingBytes;
373 kumpf 1.21 } 374 else 375 {
376 chip 1.11 printf("CommonUTF8:: returning false; position[%d]",count);
377 kumpf 1.21 return false;
378 chip 1.11 }
379 kumpf 1.21 } 380 printf("CommonUTF8:: returning false; position[%d]",count);
381 chip 1.11 return true;*/
382 chuck 1.9 }
383 chuck 1.6 384 String escapeStringEncoder(const String& Str) 385 { 386 String escapeStr; 387 Uint16 escChar; 388 char hexencoding[6];
389 chip 1.11
390 kumpf 1.21 for (Uint32 i = 0; i < Str.size(); ++i)
391 chuck 1.6 {
392 kumpf 1.21 escChar = Str[i]; 393 if (escChar <= 0x7F)
394 chuck 1.6 {
395 kumpf 1.21 escapeStr.append(escChar);
396 chuck 1.6 }
397 kumpf 1.21 else 398 { 399 memset(hexencoding,0x00,sizeof(hexencoding));
400 chuck 1.6 sprintf(hexencoding, "%%%03X%X", escChar/16, escChar%16); 401 escapeStr.append(hexencoding);
402 kumpf 1.21 }
403 chip 1.11 }
404 kumpf 1.21 return escapeStr;
405 chuck 1.6 } 406 407 String escapeStringDecoder(const String& Str) 408 { 409 Uint32 i; 410
411 chip 1.11 Array<Uint16> utf16Chars;
412 chuck 1.6 413 for (i=0; i< Str.size(); ++i) 414 { 415 if (Str[i] == '%') 416 { 417 Uint8 digit1 = _hexCharToNumeric((Str[++i])); 418 Uint8 digit2 = _hexCharToNumeric((Str[++i])); 419 Uint8 digit3 = _hexCharToNumeric((Str[++i])); 420 Uint8 digit4 = _hexCharToNumeric((Str[++i])); 421
422 chip 1.11 Uint16 decodedChar = (digit1<<12) + (digit2<<8) +
423 chuck 1.6 (digit3<< 4) + (digit4); 424
425 chip 1.11 utf16Chars.append(decodedChar);
426 chuck 1.6 } 427 else 428 {
429 chip 1.11 utf16Chars.append((Uint16)Str[i]);
430 chuck 1.6 } 431 } 432 433 // If there was a string to decode... 434 if (Str.size() > 0) 435 { 436 utf16Chars.append('\0'); 437 return String((Char16 *)utf16Chars.getData()); 438 } 439 else 440 { 441 return String(); 442 } 443 } 444
#ifdef PEGASUS_HAS_ICU

// State shared by all callers of InitializeICU::initICUSuccessful().
Boolean InitializeICU::_initAttempted = false;
Boolean InitializeICU::_initSuccessful = false;
Mutex InitializeICU::_initMutex;

/**
    Initializes ICU on the first call and reports the outcome.

    u_init() is called at most once; later calls only return the stored
    result.  A failure is written to the log as a warning.

    @return true if ICU was initialized successfully, false otherwise.
*/
Boolean InitializeICU::initICUSuccessful()
{
    // Initialization has already been attempted: just report its result.
    if (_initAttempted)
    {
        return _initSuccessful;
    }

    AutoMutex lock(_initMutex);

    // Check again now that the mutex is held; another caller may have
    // completed the attempt in the meantime.
    if (!_initAttempted)
    {
        UErrorCode icuStatus = U_ZERO_ERROR;

        // Initialize ICU
        u_init(&icuStatus);

        if (!U_FAILURE(icuStatus))
        {
            _initSuccessful = true;
        }
        else
        {
            _initSuccessful = false;
            Logger::put(
                Logger::STANDARD_LOG , System::CIMSERVER,
                Logger::WARNING,
                "ICU initialization failed with error: $0.",
                icuStatus);
        }
        _initAttempted = true;
    }

    return _initSuccessful;
}

#endif

488 david 1.1 PEGASUS_NAMESPACE_END

No CVS admin address has been configured