pegasus/src/Pegasus/Common/CommonUTF.cpp - annotate

Return to CommonUTF.cpp CVS log

Up to [Pegasus] / pegasus / src / Pegasus / Common

//%2003////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2000, 2001, 2002 BMC Software, Hewlett-Packard Development
// Company, L. P., IBM Corp., The Open Group, Tivoli Systems.
// Copyright (c) 2003 BMC Software; Hewlett-Packard Development Company, L. P.;
// IBM Corp.; EMC Corporation, The Open Group.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// THE ABOVE COPYRIGHT NOTICE AND THIS PERMISSION NOTICE SHALL BE INCLUDED IN
// ALL COPIES OR SUBSTANTIAL PORTIONS OF THE SOFTWARE. THE SOFTWARE IS PROVIDED
// "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
// LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
//==============================================================================
//
// Author: Dave Rosckes (rosckes@us.ibm.com)
//
//
//%/////////////////////////////////////////////////////////////////////////////

31 chuck 1.7 #include <Pegasus/Common/Config.h> 32 #include <Pegasus/Common/Array.h>
33 david 1.1 #include "CommonUTF.h"
34 chuck 1.7 #include <cctype> 35 #include <cstdio>
36 david 1.2 #include <cstring>
37 kumpf 1.3
38 david 1.1 PEGASUS_NAMESPACE_BEGIN
39 kumpf 1.3
40 chuck 1.6 41 inline Uint8 _hexCharToNumeric(const Uint16 c) 42 { 43 Uint8 n; 44 45 if (isdigit(c)) 46 n = (c - '0'); 47 else if (isupper(c)) 48 n = (c - 'A' + 10); 49 else // if (islower(c)) 50 n = (c - 'a' + 10); 51 52 return n; 53 } 54
55 kumpf 1.3 // Note: Caller must ensure that "src" contains "size" bytes.
56 david 1.1 int isValid_U8(const Uint8 src, int size) 57 { 58 Uint8 U8_char; 59 const Uint8 srcptr = src+size; 60 switch (size) 61 { 62 case 4: 63 if ((U8_char = (--srcptr)) < 0x80 \|\| U8_char > 0xBF) 64 { 65 return false; 66 } 67 case 3: 68 if ((U8_char = (--srcptr)) < 0x80 \|\| U8_char > 0xBF) 69 { 70 return false; 71 } 72 case 2: 73 if ((U8_char = (--srcptr)) > 0xBF) 74 { 75 return false; 76 } 77 david 1.1 switch (src) 78 { 79 case 0xE0: 80 if (U8_char < 0xA0) 81 { 82 return false; 83 } 84 break; 85 case 0xF0: 86 if (U8_char < 0x90) 87 { 88 return false; 89 } 90 break; 91 case 0xF4: 92 if (U8_char > 0x8F) 93 { 94 return false; 95 } 96 break; 97 default: 98 david 1.1 if (U8_char < 0x80) 99 { 100 return false; 101 } 102 } 103 case 1: 104 if (src >= 0x80 && src < 0xC2) 105 { 106 return false; 107 } 108 if (*src > 0xF4) 109 { 110 return false; 111 } 112 break;
113 david 1.2 default: 114 { 115 return false; 116 }
117 david 1.1 118 } 119 return true; 120 } 121 122 int UTF16toUTF8(const Uint16** srcHead, 123 const Uint16* srcEnd, 124 Uint8** tgtHead, 125 Uint8* tgtEnd) 126 { 127 int returnCode = 0; 128 const Uint16* src = srcHead; 129 Uint8 tgt = tgtHead; 130 while (src < srcEnd) 131 { 132 Uint32 tempchar; 133 Uint16 numberOfBytes = 0; 134 const Uint16 oldsrc = src; 135 tempchar = src++; 136 if (tempchar >= FIRST_HIGH_SURROGATE 137 && tempchar <= LAST_HIGH_SURROGATE) 138 david 1.1 { 139 if (src < srcEnd) 140 { 141 Uint32 tempchar2 = src; 142 if (tempchar2 >= FIRST_LOW_SURROGATE && 143 tempchar2 <= LAST_LOW_SURROGATE) 144 { 145 tempchar = ((tempchar - FIRST_HIGH_SURROGATE) << halfShift) 146 + (tempchar2 - FIRST_LOW_SURROGATE) + halfBase; 147 ++src; 148 } 149 } 150 else 151 { 152 --src; 153 returnCode = -1; 154 break; 155 } 156 } 157 if (tempchar < (Uint32)0x80) 158 { 159 david 1.1 numberOfBytes = 1; 160 } 161 else if (tempchar < (Uint32)0x800) 162 { 163 numberOfBytes = 2; 164 } 165 else if (tempchar < (Uint32)0x10000) 166 { 167 numberOfBytes = 3; 168 } 169 else if (tempchar < (Uint32)0x200000) 170 { 171 numberOfBytes = 4; 172 } 173 else 174 { 175 numberOfBytes = 2; 176 tempchar = REPLACEMENT_CHARACTER; 177 } 178 179 tgt += numberOfBytes; 180 david 1.1 if (tgt > tgtEnd) 181 { 182 src = oldsrc; 183 tgt -= numberOfBytes; 184 returnCode = -1; 185 break; 186 } 187 188 switch (numberOfBytes) 189 { 190 case 4: 191 --tgt = (Uint8)((tempchar \| 0x80) & 0xBF); 192 tempchar >>= 6; 193 case 3: 194 --tgt = (Uint8)((tempchar \| 0x80) & 0xBF); 195 tempchar >>= 6; 196 case 2: 197 --tgt = (Uint8)((tempchar \| 0x80) & 0xBF); 198 tempchar >>= 6; 199 case 1: 200 --tgt = (Uint8)(tempchar \| firstByteMark[numberOfBytes]); 201 david 1.1 } 202 tgt += numberOfBytes; 203 } 204 srcHead = src; 205 tgtHead = tgt; 206 return returnCode; 207 } 208 209 int UTF8toUTF16 (const Uint8** srcHead, 210 const Uint8* srcEnd, 211 Uint16** tgtHead, 212 Uint16* tgtEnd) 213 { 214 int returnCode = 0; 215 const Uint8* src 
= srcHead; 216 Uint16 tgt = tgtHead; 217 while (src < srcEnd) 218 { 219 Uint32 tempchar = 0; 220 Uint16 moreBytes = trailingBytesForUTF8[src]; 221 if (src + moreBytes >= srcEnd) 222 david 1.1 { 223 returnCode = -1; 224 break; 225 } 226 switch (moreBytes) 227 { 228 case 3: 229 tempchar += src++; 230 tempchar <<= 6; 231 case 2: 232 tempchar += src++; 233 tempchar <<= 6; 234 case 1: 235 tempchar += src++; 236 tempchar <<= 6; 237 case 0: 238 tempchar += src++; 239 } 240 tempchar -= offsetsFromUTF8[moreBytes]; 241 242 if (tgt >= tgtEnd) 243 david 1.1 { 244 src -= (moreBytes+1); 245 returnCode = -1; break; 246 } 247 if (tempchar <= MAX_BYTE) 248 { 249 if ((tempchar >= FIRST_HIGH_SURROGATE && 250 tempchar <= LAST_LOW_SURROGATE) \|\| 251 ((tempchar & 0xFFFE) == 0xFFFE)) 252 { 253 tgt++ = REPLACEMENT_CHARACTER; 254 } 255 else 256 { 257 tgt++ = (Uint16)tempchar; 258 } 259 } 260 else if (tempchar > MAX_UTF16) 261 { 262 tgt++ = REPLACEMENT_CHARACTER; 263 } 264 david 1.1 else 265 { 266 if (tgt + 1 >= tgtEnd) 267 { 268 src -= (moreBytes+1); 269 returnCode = -1; 270 break; 271 } 272 tempchar -= halfBase; 273 tgt++ = (Uint16)((tempchar >> halfShift) + FIRST_HIGH_SURROGATE); 274 tgt++ = (Uint16)((tempchar & halfMask) + FIRST_LOW_SURROGATE); 275 } 276 } 277 srcHead = src; 278 *tgtHead = tgt; 279 return returnCode; 280 }
281 david 1.5 282 Boolean isUTF8(const char legal) 283 { 284 char numBytes = UTF_8_COUNT_TRAIL_BYTES(legal)+1; 285 286 // Validate that the string is long enough to hold all the expected bytes. 287 // Note that if legal[0] == 0, numBytes will be 1. 288 for (char i=1; i<numBytes; i++) 289 { 290 if (legal[i] == 0) 291 { 292 return false; 293 } 294 } 295 296 return (isValid_U8((const Uint8 *)legal, numBytes)); 297 }
298 chuck 1.6
299 humberto 1.7.4.1 Boolean isUTF8Str(const char legal) 300 { 301 /char tmp[] = {0xCE,0x99,0xCE,0xBF,0xCF,0x8D,0xCE,0xBD,0xCE, 302 0xB9,0xCE,0xBA,0xCE,0xBF,0xCE,0xBD,0xCF,0x84, 303 0x00};/ 304 // char tmp_[] = "class"; 305 // char tmp = legal; 306 Uint32 count = 0; 307 Uint32 size = strlen(legal); 308 // printf("size = %d\n",size); 309 while(count<size) 310 { 311 // printf("count = %d\n",count); 312 if(isUTF8(&legal[count]) == true){ 313 UTF8_NEXT(legal,count); 314 }else{ 315 // printf("bad string\n"); 316 return false; 317 } 318 } 319 // printf("good string\n"); 320 humberto 1.7.4.1 return true; 321 /* 322 printf("legal = %s\n\n", legal); 323 Uint32 count = 0; 324 Uint32 trailingBytes = 0; 325 Uint32 size = strlen(legal); 326 printf("size of legal is %d\n",size); 327 while(count<size-1) 328 { 329 printf("count = %d\n", count); 330 if(isUTF8((char)&legal[count]) == true){ 331 UTF8_NEXT(legal,trailingBytes); 332 count += trailingBytes; 333 } else{ 334 printf("CommonUTF8:: returning false; position[%d]",count); 335 return false; 336 } 337 } 338 printf("CommonUTF8:: returning false; position[%d]",count); 339 return true;/ 340 }
341 chuck 1.6 342 String escapeStringEncoder(const String& Str) 343 { 344 String escapeStr; 345 Uint16 escChar; 346 char hexencoding[6]; 347 348 for(Uint32 i = 0; i < Str.size(); ++i) 349 { 350 escChar = Str[i]; 351 if(escChar <= 0x7F) 352 { 353 escapeStr.append(escChar); 354 } 355 else 356 { 357 memset(hexencoding,0x00,sizeof(hexencoding)); 358 sprintf(hexencoding, "%%%03X%X", escChar/16, escChar%16); 359 escapeStr.append(hexencoding); 360 } 361 } 362 chuck 1.6 return(escapeStr); 363 } 364 365 String escapeStringDecoder(const String& Str) 366 { 367 Uint32 i; 368 369 Array<Uint16> utf16Chars; 370 371 for (i=0; i< Str.size(); ++i) 372 { 373 if (Str[i] == '%') 374 { 375 Uint8 digit1 = _hexCharToNumeric((Str[++i])); 376 Uint8 digit2 = _hexCharToNumeric((Str[++i])); 377 Uint8 digit3 = _hexCharToNumeric((Str[++i])); 378 Uint8 digit4 = _hexCharToNumeric((Str[++i])); 379 380 Uint16 decodedChar = (digit1<<12) + (digit2<<8) + 381 (digit3<< 4) + (digit4); 382 383 chuck 1.6 utf16Chars.append(decodedChar); 384 } 385 else 386 { 387 utf16Chars.append((Uint16)Str[i]); 388 } 389 } 390 391 // If there was a string to decode... 392 if (Str.size() > 0) 393 { 394 utf16Chars.append('\0'); 395 return String((Char16 *)utf16Chars.getData()); 396 } 397 else 398 { 399 return String(); 400 } 401 } 402
403 david 1.1 PEGASUS_NAMESPACE_END

No CVS admin address has been configured