(file) Return to CommonUTF.cpp CVS log (file) (dir) Up to [Pegasus] / pegasus / src / Pegasus / Common

  1 martin 1.23 //%LICENSE////////////////////////////////////////////////////////////////
  2 martin 1.24 //
  3 martin 1.23 // Licensed to The Open Group (TOG) under one or more contributor license
  4             // agreements.  Refer to the OpenPegasusNOTICE.txt file distributed with
  5             // this work for additional information regarding copyright ownership.
  6             // Each contributor licenses this file to you under the OpenPegasus Open
  7             // Source License; you may not use this file except in compliance with the
  8             // License.
  9 martin 1.24 //
 10 martin 1.23 // Permission is hereby granted, free of charge, to any person obtaining a
 11             // copy of this software and associated documentation files (the "Software"),
 12             // to deal in the Software without restriction, including without limitation
 13             // the rights to use, copy, modify, merge, publish, distribute, sublicense,
 14             // and/or sell copies of the Software, and to permit persons to whom the
 15             // Software is furnished to do so, subject to the following conditions:
 16 martin 1.24 //
 17 martin 1.23 // The above copyright notice and this permission notice shall be included
 18             // in all copies or substantial portions of the Software.
 19 martin 1.24 //
 20 martin 1.23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 21 martin 1.24 // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 22 martin 1.23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 23             // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 24             // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 25             // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 26             // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 27 martin 1.24 //
 28 martin 1.23 //////////////////////////////////////////////////////////////////////////
 29 david  1.1  //
 30 kamal.locahana 1.22 //%////////////////////////////////////////////////////////////////////////////
 31 david          1.1  
 32 chuck          1.7  #include <Pegasus/Common/Config.h>
 33                     #include <Pegasus/Common/Array.h>
 34 yi.zhou        1.16 #include <Pegasus/Common/Logger.h>
 35 david          1.1  #include "CommonUTF.h"
 36 thilo.boehm    1.25 #include <Pegasus/Common/String.h>
 37 chuck          1.7  #include <cstdio>
 38 david          1.2  #include <cstring>
 39 david.dillard  1.15 #include <cctype>
 40 kumpf          1.3  
 41 yi.zhou        1.16 #ifdef PEGASUS_HAS_ICU
 42                     #include <unicode/uclean.h>
 43                     #endif
 44                     
 45 david          1.1  PEGASUS_NAMESPACE_BEGIN
 46 kumpf          1.3  
 47 karl           1.19 const Uint32 halfBase = 0x0010000UL;
 48                     const Uint32 halfMask = 0x3FFUL;
 49                     const int halfShift  = 10;
 50                     const Uint8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 51                     
 52                     const Uint32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
 53                                  0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 54                     
 55                     const char trailingBytesForUTF8[256] = {
 56                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 57                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 58                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 59                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 60                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 61                         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 62                         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 63                         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
 64                     };
 65 david.dillard  1.12 inline Uint8 _hexCharToNumeric(Char16 c)
 66 chuck          1.6  {
 67                         Uint8 n;
 68                     
 69 david.dillard  1.14     if (isdigit(c))
 70 chuck          1.6          n = (c - '0');
 71 david.dillard  1.14     else if (isupper(c))
 72 chuck          1.6          n = (c - 'A' + 10);
 73                         else // if (islower(c))
 74                             n = (c - 'a' + 10);
 75                     
 76                         return n;
 77                     }
 78                     
 79 kumpf          1.3  // Note: Caller must ensure that "src" contains "size" bytes.
 80 chip           1.11 Boolean isValid_U8(const Uint8 *src, int size)
 81 david          1.1  {
 82                         Uint8 U8_char;
 83                         const Uint8 *srcptr = src+size;
 84                         switch (size)
 85                         {
 86 chip           1.11     case 4:
 87                             if ((U8_char = (*--srcptr)) < 0x80 || U8_char > 0xBF)
 88                             {
 89                             return false;
 90                             }
 91                         case 3:
 92                             if ((U8_char = (*--srcptr)) < 0x80 || U8_char > 0xBF)
 93                             {
 94                             return false;
 95                             }
 96                         case 2:
 97                             if ((U8_char = (*--srcptr)) > 0xBF)
 98                             {
 99                             return false;
100                             }
101                             switch (*src)
102                             {
103                             case 0xE0:
104                                 if (U8_char < 0xA0)
105                                 {
106                                 return false;
107 chip           1.11             }
108                                 break;
109                             case 0xF0:
110                                 if (U8_char < 0x90)
111                                 {
112                                 return false;
113                                 }
114                                 break;
115                             case 0xF4:
116                                 if (U8_char > 0x8F)
117                                 {
118                                 return false;
119                                 }
120                                 break;
121                             default:
122                                 if (U8_char < 0x80)
123                                 {
124                                 return false;
125                                 }
126                             }
127                         case 1:
128 chip           1.11         if (*src >= 0x80 && *src < 0xC2)
129                             {
130                             return false;
131                             }
132                             if (*src > 0xF4)
133                             {
134                             return false;
135                             }
136                             break;
137 david          1.2          default:
138 chip           1.11         {
139                             return false;
140 david          1.2              }
141 david          1.1  
142                         }
143                         return true;
144 chip           1.11 }
145 david          1.1  
146                     int UTF16toUTF8(const Uint16** srcHead,
147 chip           1.11         const Uint16* srcEnd,
148                             Uint8** tgtHead,
149                             Uint8* tgtEnd)
150 david          1.1  {
151                         int returnCode = 0;
152                         const Uint16* src = *srcHead;
153                         Uint8* tgt = *tgtHead;
154                         while (src < srcEnd)
155                         {
156 kumpf          1.20         if (*src < 128)
157                             {
158                                 if (tgt == tgtEnd)
159                                 {
160                                     returnCode = -1;
161                                     break;
162                                 }
163                     
164 kamal.locahana 1.22             *tgt++ = (Uint8)*src++;
165 kumpf          1.20             continue;
166                             }
167 mike           1.17 
168 chip           1.11     Uint32 tempchar;
169                         Uint16 numberOfBytes = 0;
170                         const Uint16* oldsrc = src;
171                         tempchar = *src++;
172                         if (tempchar >= FIRST_HIGH_SURROGATE
173                             && tempchar <= LAST_HIGH_SURROGATE)
174                         {
175                             if (src < srcEnd)
176                             {
177                             Uint32 tempchar2 = *src;
178                             if (tempchar2 >= FIRST_LOW_SURROGATE &&
179                                 tempchar2 <= LAST_LOW_SURROGATE)
180                             {
181                                 tempchar = ((tempchar - FIRST_HIGH_SURROGATE) << halfShift)
182                                   + (tempchar2 - FIRST_LOW_SURROGATE) + halfBase;
183                                 ++src;
184                             }
185                             }
186                             else
187                             {
188                             --src;
189 chip           1.11         returnCode = -1;
190                             break;
191                             }
192                         }
193                         if (tempchar < (Uint32)0x80)
194                         {
195                             numberOfBytes = 1;
196                         }
197                         else if (tempchar < (Uint32)0x800)
198                         {
199                             numberOfBytes = 2;
200                         }
201                         else if (tempchar < (Uint32)0x10000)
202                         {
203                             numberOfBytes = 3;
204                         }
205                         else if (tempchar < (Uint32)0x200000)
206                         {
207                             numberOfBytes = 4;
208                         }
209                         else
210 chip           1.11     {
211                             numberOfBytes = 2;
212                             tempchar = REPLACEMENT_CHARACTER;
213                         }
214                     
215                         tgt += numberOfBytes;
216                         if (tgt > tgtEnd)
217                         {
218                             src = oldsrc;
219                             tgt -= numberOfBytes;
220                             returnCode = -1;
221                             break;
222                         }
223                     
224                         switch (numberOfBytes)
225                         {
226                             case 4:
227                             *--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
228                             tempchar >>= 6;
229                             case 3:
230                             *--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
231 chip           1.11         tempchar >>= 6;
232                             case 2:
233                             *--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
234                             tempchar >>= 6;
235                             case 1:
236                             *--tgt =  (Uint8)(tempchar | firstByteMark[numberOfBytes]);
237                         }
238                         tgt += numberOfBytes;
239 david          1.1      }
240                         *srcHead = src;
241                         *tgtHead = tgt;
242                         return returnCode;
243                     }
244                     
245                     int UTF8toUTF16 (const Uint8** srcHead,
246 chip           1.11          const Uint8* srcEnd,
247                              Uint16** tgtHead,
248                              Uint16* tgtEnd)
249 david          1.1  {
250                         int returnCode = 0;
251                         const Uint8* src = *srcHead;
252                         Uint16* tgt = *tgtHead;
253                         while (src < srcEnd)
254                         {
255 chip           1.11     Uint32 tempchar = 0;
256                         Uint16 moreBytes = trailingBytesForUTF8[*src];
257                         if (src + moreBytes >= srcEnd)
258                         {
259                             returnCode = -1;
260                             break;
261                         }
262                         switch (moreBytes)
263                         {
264                             case 3:
265                             tempchar += *src++;
266                             tempchar <<= 6;
267                             case 2:
268                             tempchar += *src++;
269                             tempchar <<= 6;
270                             case 1:
271                             tempchar += *src++;
272                             tempchar <<= 6;
273                             case 0:
274                             tempchar += *src++;
275                         }
276 chip           1.11     tempchar -= offsetsFromUTF8[moreBytes];
277                     
278                         if (tgt >= tgtEnd)
279                         {
280                             src -= (moreBytes+1);
281                             returnCode = -1; break;
282                         }
283                         if (tempchar <= MAX_BYTE)
284                         {
285                             if ((tempchar >= FIRST_HIGH_SURROGATE &&
286                              tempchar <= LAST_LOW_SURROGATE) ||
287                             ((tempchar & 0xFFFE) == 0xFFFE))
288                             {
289                             *tgt++ = REPLACEMENT_CHARACTER;
290                             }
291                             else
292                             {
293                             *tgt++ = (Uint16)tempchar;
294                             }
295                         }
296                         else if (tempchar > MAX_UTF16)
297 chip           1.11     {
298                             *tgt++ = REPLACEMENT_CHARACTER;
299                         }
300                         else
301                         {
302                             if (tgt + 1 >= tgtEnd)
303                             {
304                             src -= (moreBytes+1);
305                             returnCode = -1;
306                             break;
307                             }
308                             tempchar -= halfBase;
309                             *tgt++ = (Uint16)((tempchar >> halfShift) + FIRST_HIGH_SURROGATE);
310                             *tgt++ = (Uint16)((tempchar & halfMask) + FIRST_LOW_SURROGATE);
311                         }
312 david          1.1      }
313                         *srcHead = src;
314                         *tgtHead = tgt;
315                         return returnCode;
316                     }
317 david          1.5  
318 mike           1.17 Boolean isUTF8Aux(const char *legal)
319 david          1.5  {
320                         char numBytes = UTF_8_COUNT_TRAIL_BYTES(*legal)+1;
321                     
322                         // Validate that the string is long enough to hold all the expected bytes.
323                         // Note that if legal[0] == 0, numBytes will be 1.
324                         for (char i=1; i<numBytes; i++)
325                         {
326                             if (legal[i] == 0)
327                             {
328                                 return false;
329                             }
330                         }
331                     
332 kumpf          1.21     return isValid_U8((const Uint8 *)legal, numBytes);
333 david          1.5  }
334 chuck          1.6  
335 chuck          1.9  Boolean isUTF8Str(const char *legal)
336                     {
337 chip           1.11     /*char tmp[] = {0xCE,0x99,0xCE,0xBF,0xCF,0x8D,0xCE,0xBD,0xCE,
338 chuck          1.9                        0xB9,0xCE,0xBA,0xCE,0xBF,0xCE,0xBD,0xCF,0x84,
339                                           0x00};*/
340 chip           1.11 //  char tmp_[] = "class";
341                     //  char * tmp = legal;
342 david.dillard  1.13     size_t count = 0;
343                         const size_t size = strlen(legal);
344 chip           1.11 //  printf("size = %d\n",size);
345 kumpf          1.21     while (count<size)
346 david.dillard  1.13     {
347 chip           1.11 //      printf("count = %d\n",count);
348 kumpf          1.21         if (isUTF8(&legal[count]) == true)
349                             {
350 david.dillard  1.13             UTF8_NEXT(legal,count);
351 kumpf          1.21         }
352                             else
353                             {
354 chip           1.11 //          printf("bad string\n");
355                                 return false;
356                             }
357 david.dillard  1.13     }
358 chip           1.11 //  printf("good string\n");
359                         return true;
360 chuck          1.9  /*
361 chip           1.11     printf("legal = %s\n\n", legal);
362                         Uint32 count = 0;
363                         Uint32 trailingBytes = 0;
364 kumpf          1.21     Uint32 size = strlen(legal);
365 chip           1.11     printf("size of legal is %d\n",size);
366 kumpf          1.21     while (count<size-1)
367                         {
368                             printf("count = %d\n", count);
369                             if (isUTF8((char*)&legal[count]) == true)
370 chuck          1.9          {
371 kumpf          1.21             UTF8_NEXT(legal,trailingBytes);
372 chip           1.11             count += trailingBytes;
373 kumpf          1.21         }
374                             else
375                             {
376 chip           1.11             printf("CommonUTF8:: returning false; position[%d]",count);
377 kumpf          1.21             return false;
378 chip           1.11         }
379 kumpf          1.21     }
380                         printf("CommonUTF8:: returning false; position[%d]",count);
381 chip           1.11     return true;*/
382 chuck          1.9  }
383 chuck          1.6  
384                     String escapeStringEncoder(const String& Str)
385                     {
386                         String escapeStr;
387                         Uint16 escChar;
388                         char hexencoding[6];
389 chip           1.11 
390 kumpf          1.21     for (Uint32 i = 0; i < Str.size(); ++i)
391 chuck          1.6      {
392 kumpf          1.21         escChar = Str[i];
393                             if (escChar <= 0x7F)
394 chuck          1.6          {
395 kumpf          1.21             escapeStr.append(escChar);
396 chuck          1.6          }
397 kumpf          1.21         else
398                             {
399                                 memset(hexencoding,0x00,sizeof(hexencoding));
400 chuck          1.6              sprintf(hexencoding, "%%%03X%X", escChar/16, escChar%16);
401                                 escapeStr.append(hexencoding);
402 kumpf          1.21         }
403 chip           1.11     }
404 kumpf          1.21     return escapeStr;
405 chuck          1.6  }
406                     
407                     String escapeStringDecoder(const String& Str)
408                     {
409                         Uint32 i;
410                     
411 chip           1.11     Array<Uint16> utf16Chars;
412 chuck          1.6  
413                         for (i=0; i< Str.size(); ++i)
414                         {
415                             if (Str[i] == '%')
416                             {
417                                 Uint8 digit1 = _hexCharToNumeric((Str[++i]));
418                                 Uint8 digit2 = _hexCharToNumeric((Str[++i]));
419                                 Uint8 digit3 = _hexCharToNumeric((Str[++i]));
420                                 Uint8 digit4 = _hexCharToNumeric((Str[++i]));
421                     
422 chip           1.11         Uint16 decodedChar = (digit1<<12) + (digit2<<8) +
423 chuck          1.6                                   (digit3<< 4) + (digit4);
424                     
425 chip           1.11             utf16Chars.append(decodedChar);
426 chuck          1.6          }
427                             else
428                             {
429 chip           1.11             utf16Chars.append((Uint16)Str[i]);
430 chuck          1.6          }
431                         }
432                     
433                         // If there was a string to decode...
434                         if (Str.size() > 0)
435                         {
436                             utf16Chars.append('\0');
437                             return String((Char16 *)utf16Chars.getData());
438                         }
439                         else
440                         {
441                             return String();
442                         }
443                     }
444                     
445 yi.zhou        1.16 #ifdef PEGASUS_HAS_ICU
446                     
447                     Boolean InitializeICU::_initAttempted = false;
448                     Boolean InitializeICU::_initSuccessful = false;
449                     Mutex InitializeICU::_initMutex;
450                     
451                     Boolean InitializeICU::initICUSuccessful()
452                     {
453                         if (!_initAttempted)
454                         {
455 kumpf          1.20         {
456 yi.zhou        1.16             AutoMutex lock(_initMutex);
457                     
458 kumpf          1.20             if (!_initAttempted)
459                                 {
460 yi.zhou        1.16                 UErrorCode _status = U_ZERO_ERROR;
461                     
462 kumpf          1.20                 // Initialize ICU
463 yi.zhou        1.16                 u_init(&_status);
464                     
465                                     if (U_FAILURE(_status))
466                                     {
467                                         _initSuccessful = false;
468 kumpf          1.20                     Logger::put(
469                                             Logger::STANDARD_LOG , System::CIMSERVER,
470                                             Logger::WARNING,
471                                             "ICU initialization failed with error: $0.",
472                                             _status);
473 yi.zhou        1.16                 }
474                                     else
475                                     {
476                                         _initSuccessful = true;
477                                     }
478                                     _initAttempted = true;
479 kumpf          1.20             }
480                             }
481 yi.zhou        1.16     }
482                     
483                         return _initSuccessful;
484                     }
485                     
486                     #endif
487                     
488 david          1.1  PEGASUS_NAMESPACE_END

No CVS admin address has been configured
Powered by
ViewCVS 0.9.2