(file) Return to CommonUTF.cpp CVS log (file) (dir) Up to [Pegasus] / pegasus / src / Pegasus / Common

  1 karl  1.4 //%2003////////////////////////////////////////////////////////////////////////
  2 david 1.1 //
  3 karl  1.4 // Copyright (c) 2000, 2001, 2002  BMC Software, Hewlett-Packard Development
  4           // Company, L. P., IBM Corp., The Open Group, Tivoli Systems.
  5           // Copyright (c) 2003 BMC Software; Hewlett-Packard Development Company, L. P.;
  6           // IBM Corp.; EMC Corporation, The Open Group.
  7 david 1.1 //
  8           // Permission is hereby granted, free of charge, to any person obtaining a copy
  9           // of this software and associated documentation files (the "Software"), to
 10           // deal in the Software without restriction, including without limitation the
 11           // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 12           // sell copies of the Software, and to permit persons to whom the Software is
 13           // furnished to do so, subject to the following conditions:
 14           // 
 15           // THE ABOVE COPYRIGHT NOTICE AND THIS PERMISSION NOTICE SHALL BE INCLUDED IN
 16           // ALL COPIES OR SUBSTANTIAL PORTIONS OF THE SOFTWARE. THE SOFTWARE IS PROVIDED
 17           // "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
 18           // LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 19           // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 20           // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21           // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 22           // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 23           //
 24           //==============================================================================
 25           //
 26           // Author: Dave Rosckes   (rosckes@us.ibm.com)
 27           //
 28 david 1.1 //
 29           //%/////////////////////////////////////////////////////////////////////////////
 30           
 31 chuck 1.7 #include <Pegasus/Common/Config.h>
 32           #include <Pegasus/Common/Array.h>
 33 david 1.1 #include "CommonUTF.h"
 34 chuck 1.7 #include <cctype>
 35           #include <cstdio>
 36 david 1.2 #include <cstring>
 37 kumpf 1.3 
 38 david 1.1 PEGASUS_NAMESPACE_BEGIN
 39 kumpf 1.3 
 40 chuck 1.6 
 41           inline Uint8 _hexCharToNumeric(const Uint16 c)
 42           {
 43               Uint8 n;
 44           
 45               if (isdigit(c))
 46                   n = (c - '0');
 47               else if (isupper(c))
 48                   n = (c - 'A' + 10);
 49               else // if (islower(c))
 50                   n = (c - 'a' + 10);
 51           
 52               return n;
 53           }
 54           
 55 kumpf 1.3 // Note: Caller must ensure that "src" contains "size" bytes.
 56 david 1.1 int isValid_U8(const Uint8 *src, int size)
 57           {
 58               Uint8 U8_char;
 59               const Uint8 *srcptr = src+size;
 60               switch (size)
 61               {
 62           	case 4:
 63           	    if ((U8_char = (*--srcptr)) < 0x80 || U8_char > 0xBF)
 64           	    {
 65           		return false;
 66           	    }
 67           	case 3:
 68           	    if ((U8_char = (*--srcptr)) < 0x80 || U8_char > 0xBF)
 69           	    {
 70           		return false;
 71           	    }
 72           	case 2:
 73           	    if ((U8_char = (*--srcptr)) > 0xBF)
 74           	    {
 75           		return false;
 76           	    }
 77 david 1.1 	    switch (*src)
 78           	    {
 79           		case 0xE0:
 80           		    if (U8_char < 0xA0)
 81           		    {
 82           			return false;
 83           		    }
 84           		    break;
 85           		case 0xF0:
 86           		    if (U8_char < 0x90)
 87           		    {
 88           			return false;
 89           		    }
 90           		    break;
 91           		case 0xF4:
 92           		    if (U8_char > 0x8F)
 93           		    {
 94           			return false;
 95           		    }
 96           		    break;
 97           		default:
 98 david 1.1 		    if (U8_char < 0x80)
 99           		    {
100           			return false;
101           		    }
102           	    }
103           	case 1:
104           	    if (*src >= 0x80 && *src < 0xC2)
105           	    {
106           		return false;
107           	    }
108           	    if (*src > 0xF4)
109           	    {
110           		return false;
111           	    }
112           	    break;
113 david 1.2         default:
114           	    {
115           		return false;
116                       }
117 david 1.1 
118               }
119               return true;
120           }	
121           
122           int UTF16toUTF8(const Uint16** srcHead,
123           		const Uint16* srcEnd, 
124           		Uint8** tgtHead,
125           		Uint8* tgtEnd)
126           {
127               int returnCode = 0;
128               const Uint16* src = *srcHead;
129               Uint8* tgt = *tgtHead;
130               while (src < srcEnd)
131               {
132           	Uint32 tempchar;
133           	Uint16 numberOfBytes = 0;
134           	const Uint16* oldsrc = src; 
135           	tempchar = *src++;
136           	if (tempchar >= FIRST_HIGH_SURROGATE
137           	    && tempchar <= LAST_HIGH_SURROGATE)
138 david 1.1 	{
139           	    if (src < srcEnd)
140           	    {
141           		Uint32 tempchar2 = *src;
142           		if (tempchar2 >= FIRST_LOW_SURROGATE &&
143           		    tempchar2 <= LAST_LOW_SURROGATE)
144           		{
145           		    tempchar = ((tempchar - FIRST_HIGH_SURROGATE) << halfShift)
146           		      + (tempchar2 - FIRST_LOW_SURROGATE) + halfBase;
147           		    ++src;
148           		} 
149           	    }
150           	    else
151           	    { 
152           		--src;
153           		returnCode = -1;
154           		break;
155           	    }
156           	}
157           	if (tempchar < (Uint32)0x80)
158           	{
159 david 1.1 	    numberOfBytes = 1;
160           	}
161           	else if (tempchar < (Uint32)0x800)
162           	{
163           	    numberOfBytes = 2;
164           	}
165           	else if (tempchar < (Uint32)0x10000)
166           	{
167           	    numberOfBytes = 3;
168           	}
169           	else if (tempchar < (Uint32)0x200000)
170           	{
171           	    numberOfBytes = 4;
172           	}
173           	else
174           	{
175           	    numberOfBytes = 2;
176           	    tempchar = REPLACEMENT_CHARACTER;
177           	}
178           
179           	tgt += numberOfBytes;
180 david 1.1 	if (tgt > tgtEnd)
181           	{
182           	    src = oldsrc;
183           	    tgt -= numberOfBytes;
184           	    returnCode = -1;
185           	    break;
186           	}
187           
188           	switch (numberOfBytes)
189           	{ 
190           	    case 4:
191           		*--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
192           		tempchar >>= 6;
193           	    case 3:
194           		*--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
195           		tempchar >>= 6;
196           	    case 2:
197           		*--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
198           		tempchar >>= 6;
199           	    case 1:
200           		*--tgt =  (Uint8)(tempchar | firstByteMark[numberOfBytes]);
201 david 1.1 	}
202           	tgt += numberOfBytes;
203               }
204               *srcHead = src;
205               *tgtHead = tgt;
206               return returnCode;
207           }
208           
209           int UTF8toUTF16 (const Uint8** srcHead,
210           		 const Uint8* srcEnd, 
211           		 Uint16** tgtHead,
212           		 Uint16* tgtEnd)
213           {
214               int returnCode = 0;
215               const Uint8* src = *srcHead;
216               Uint16* tgt = *tgtHead;
217               while (src < srcEnd)
218               {
219           	Uint32 tempchar = 0;
220           	Uint16 moreBytes = trailingBytesForUTF8[*src];
221           	if (src + moreBytes >= srcEnd)
222 david 1.1 	{
223           	    returnCode = -1;
224           	    break;
225           	}
226           	switch (moreBytes)
227           	{
228           	    case 3:
229           		tempchar += *src++;
230           		tempchar <<= 6;
231           	    case 2:
232           		tempchar += *src++;
233           		tempchar <<= 6;
234           	    case 1:
235           		tempchar += *src++;
236           		tempchar <<= 6;
237           	    case 0:
238           		tempchar += *src++;
239           	}
240           	tempchar -= offsetsFromUTF8[moreBytes];
241           
242           	if (tgt >= tgtEnd)
243 david 1.1 	{
244           	    src -= (moreBytes+1); 
245           	    returnCode = -1; break;
246           	}
247           	if (tempchar <= MAX_BYTE)
248           	{	
249           	    if ((tempchar >= FIRST_HIGH_SURROGATE &&
250           		 tempchar <= LAST_LOW_SURROGATE) ||
251           		((tempchar & 0xFFFE) ==	0xFFFE))
252           	    {
253           		*tgt++ = REPLACEMENT_CHARACTER;
254           	    }
255           	    else
256           	    {
257           		*tgt++ = (Uint16)tempchar; 
258           	    }
259           	}
260           	else if (tempchar > MAX_UTF16)
261           	{
262           	    *tgt++ = REPLACEMENT_CHARACTER;
263           	}
264 david 1.1 	else
265           	{
266           	    if (tgt + 1 >= tgtEnd)
267           	    {
268           		src -= (moreBytes+1);
269           		returnCode = -1;
270           		break;
271           	    }
272           	    tempchar -= halfBase;
273           	    *tgt++ = (Uint16)((tempchar >> halfShift) + FIRST_HIGH_SURROGATE);
274           	    *tgt++ = (Uint16)((tempchar & halfMask) + FIRST_LOW_SURROGATE);
275           	}
276               }
277               *srcHead = src;
278               *tgtHead = tgt;
279               return returnCode;
280           }
281 david 1.5 
282           Boolean isUTF8(const char *legal)
283           {
284               char numBytes = UTF_8_COUNT_TRAIL_BYTES(*legal)+1;
285           
286               // Validate that the string is long enough to hold all the expected bytes.
287               // Note that if legal[0] == 0, numBytes will be 1.
288               for (char i=1; i<numBytes; i++)
289               {
290                   if (legal[i] == 0)
291                   {
292                       return false;
293                   }
294               }
295           
296               return (isValid_U8((const Uint8 *)legal, numBytes));
297           }
298 chuck 1.6 
299 humberto 1.7.4.1 Boolean isUTF8Str(const char *legal)
300                  {
301                  	/*char tmp[] = {0xCE,0x99,0xCE,0xBF,0xCF,0x8D,0xCE,0xBD,0xCE,
302                                        0xB9,0xCE,0xBA,0xCE,0xBF,0xCE,0xBD,0xCF,0x84,
303                                        0x00};*/
304                  //	char tmp_[] = "class";
305                  //	char * tmp = legal;
306                  	Uint32 count = 0;
307                          Uint32 size = strlen(legal);
308                  //	printf("size = %d\n",size);
309                          while(count<size)
310                          {
311                  //		printf("count = %d\n",count);
312                                  if(isUTF8(&legal[count]) == true){
313                                  	UTF8_NEXT(legal,count);
314                  		}else{
315                  //			printf("bad string\n");
316                  			return false;
317                  		}
318                          }
319                  //	printf("good string\n");
320 humberto 1.7.4.1 	return true;
321                  /*
322                  	printf("legal = %s\n\n", legal);
323                  	Uint32 count = 0;
324                  	Uint32 trailingBytes = 0;
325                          Uint32 size = strlen(legal);
326                  	printf("size of legal is %d\n",size);
327                          while(count<size-1)
328                          {
329                  		printf("count = %d\n", count);
330                                  if(isUTF8((char*)&legal[count]) == true){
331                                  	UTF8_NEXT(legal,trailingBytes);
332                  			count += trailingBytes;
333                  		} else{
334                  			printf("CommonUTF8:: returning false; position[%d]",count);
335                  			 return false;	
336                  		}
337                          }
338                  	 printf("CommonUTF8:: returning false; position[%d]",count);
339                  	return true;*/
340                  }
341 chuck    1.6     
342                  String escapeStringEncoder(const String& Str)
343                  {
344                      String escapeStr;
345                      Uint16 escChar;
346                      char hexencoding[6];
347                      
348                      for(Uint32 i = 0; i < Str.size(); ++i)
349                      {
350                  	escChar = Str[i];
351                  	if(escChar <= 0x7F)
352                          {
353                  	    escapeStr.append(escChar);
354                          }
355                  	else
356                  	{
357                  	    memset(hexencoding,0x00,sizeof(hexencoding));
358                              sprintf(hexencoding, "%%%03X%X", escChar/16, escChar%16);
359                              escapeStr.append(hexencoding);
360                  	}
361                      }
362 chuck    1.6         return(escapeStr);
363                  }
364                  
365                  String escapeStringDecoder(const String& Str)
366                  {
367                      Uint32 i;
368                  
369                      Array<Uint16> utf16Chars; 
370                  
371                      for (i=0; i< Str.size(); ++i)
372                      {
373                          if (Str[i] == '%')
374                          {
375                              Uint8 digit1 = _hexCharToNumeric((Str[++i]));
376                              Uint8 digit2 = _hexCharToNumeric((Str[++i]));
377                              Uint8 digit3 = _hexCharToNumeric((Str[++i]));
378                              Uint8 digit4 = _hexCharToNumeric((Str[++i]));
379                  
380                  	    Uint16 decodedChar = (digit1<<12) + (digit2<<8) +
381                                                   (digit3<< 4) + (digit4);
382                  
383 chuck    1.6                 utf16Chars.append(decodedChar);				
384                          }
385                          else
386                          {
387                              utf16Chars.append((Uint16)Str[i]);	
388                          }
389                      }
390                  
391                      // If there was a string to decode...
392                      if (Str.size() > 0)
393                      {
394                          utf16Chars.append('\0');
395                          return String((Char16 *)utf16Chars.getData());
396                      }
397                      else
398                      {
399                          return String();
400                      }
401                  }
402                  
403 david    1.1     PEGASUS_NAMESPACE_END

No CVS admin address has been configured
Powered by
ViewCVS 0.9.2