1 karl 1.8 //%2004////////////////////////////////////////////////////////////////////////
|
2 david 1.1 //
|
3 karl 1.8 // Copyright (c) 2000, 2001, 2002 BMC Software; Hewlett-Packard Development
4 // Company, L.P.; IBM Corp.; The Open Group; Tivoli Systems.
5 // Copyright (c) 2003 BMC Software; Hewlett-Packard Development Company, L.P.;
|
6 karl 1.4 // IBM Corp.; EMC Corporation, The Open Group.
|
7 karl 1.8 // Copyright (c) 2004 BMC Software; Hewlett-Packard Development Company, L.P.;
8 // IBM Corp.; EMC Corporation; VERITAS Software Corporation; The Open Group.
|
9 david 1.1 //
10 // Permission is hereby granted, free of charge, to any person obtaining a copy
11 // of this software and associated documentation files (the "Software"), to
12 // deal in the Software without restriction, including without limitation the
13 // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
14 // sell copies of the Software, and to permit persons to whom the Software is
15 // furnished to do so, subject to the following conditions:
16 //
17 // THE ABOVE COPYRIGHT NOTICE AND THIS PERMISSION NOTICE SHALL BE INCLUDED IN
18 // ALL COPIES OR SUBSTANTIAL PORTIONS OF THE SOFTWARE. THE SOFTWARE IS PROVIDED
19 // "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
20 // LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
21 // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
23 // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 //
26 //==============================================================================
27 //
28 // Author: Dave Rosckes (rosckes@us.ibm.com)
29 //
30 david 1.1 //
31 //%/////////////////////////////////////////////////////////////////////////////
32
|
33 chuck 1.7 #include <Pegasus/Common/Config.h>
34 #include <Pegasus/Common/Array.h>
|
35 david 1.1 #include "CommonUTF.h"
|
36 chuck 1.7 #include <cctype>
37 #include <cstdio>
|
38 david 1.2 #include <cstring>
|
39 kumpf 1.3
|
40 david 1.1 PEGASUS_NAMESPACE_BEGIN
|
41 kumpf 1.3
|
42 chuck 1.6
43 inline Uint8 _hexCharToNumeric(const Uint16 c)
44 {
45 Uint8 n;
46
47 if (isdigit(c))
48 n = (c - '0');
49 else if (isupper(c))
50 n = (c - 'A' + 10);
51 else // if (islower(c))
52 n = (c - 'a' + 10);
53
54 return n;
55 }
56
|
57 kumpf 1.3 // Note: Caller must ensure that "src" contains "size" bytes.
|
58 david 1.1 int isValid_U8(const Uint8 *src, int size)
59 {
60 Uint8 U8_char;
61 const Uint8 *srcptr = src+size;
62 switch (size)
63 {
64 case 4:
65 if ((U8_char = (*--srcptr)) < 0x80 || U8_char > 0xBF)
66 {
67 return false;
68 }
69 case 3:
70 if ((U8_char = (*--srcptr)) < 0x80 || U8_char > 0xBF)
71 {
72 return false;
73 }
74 case 2:
75 if ((U8_char = (*--srcptr)) > 0xBF)
76 {
77 return false;
78 }
79 david 1.1 switch (*src)
80 {
81 case 0xE0:
82 if (U8_char < 0xA0)
83 {
84 return false;
85 }
86 break;
87 case 0xF0:
88 if (U8_char < 0x90)
89 {
90 return false;
91 }
92 break;
93 case 0xF4:
94 if (U8_char > 0x8F)
95 {
96 return false;
97 }
98 break;
99 default:
100 david 1.1 if (U8_char < 0x80)
101 {
102 return false;
103 }
104 }
105 case 1:
106 if (*src >= 0x80 && *src < 0xC2)
107 {
108 return false;
109 }
110 if (*src > 0xF4)
111 {
112 return false;
113 }
114 break;
|
115 david 1.2 default:
116 {
117 return false;
118 }
|
119 david 1.1
120 }
121 return true;
122 }
123
124 int UTF16toUTF8(const Uint16** srcHead,
125 const Uint16* srcEnd,
126 Uint8** tgtHead,
127 Uint8* tgtEnd)
128 {
129 int returnCode = 0;
130 const Uint16* src = *srcHead;
131 Uint8* tgt = *tgtHead;
132 while (src < srcEnd)
133 {
134 Uint32 tempchar;
135 Uint16 numberOfBytes = 0;
136 const Uint16* oldsrc = src;
137 tempchar = *src++;
138 if (tempchar >= FIRST_HIGH_SURROGATE
139 && tempchar <= LAST_HIGH_SURROGATE)
140 david 1.1 {
141 if (src < srcEnd)
142 {
143 Uint32 tempchar2 = *src;
144 if (tempchar2 >= FIRST_LOW_SURROGATE &&
145 tempchar2 <= LAST_LOW_SURROGATE)
146 {
147 tempchar = ((tempchar - FIRST_HIGH_SURROGATE) << halfShift)
148 + (tempchar2 - FIRST_LOW_SURROGATE) + halfBase;
149 ++src;
150 }
151 }
152 else
153 {
154 --src;
155 returnCode = -1;
156 break;
157 }
158 }
159 if (tempchar < (Uint32)0x80)
160 {
161 david 1.1 numberOfBytes = 1;
162 }
163 else if (tempchar < (Uint32)0x800)
164 {
165 numberOfBytes = 2;
166 }
167 else if (tempchar < (Uint32)0x10000)
168 {
169 numberOfBytes = 3;
170 }
171 else if (tempchar < (Uint32)0x200000)
172 {
173 numberOfBytes = 4;
174 }
175 else
176 {
177 numberOfBytes = 2;
178 tempchar = REPLACEMENT_CHARACTER;
179 }
180
181 tgt += numberOfBytes;
182 david 1.1 if (tgt > tgtEnd)
183 {
184 src = oldsrc;
185 tgt -= numberOfBytes;
186 returnCode = -1;
187 break;
188 }
189
190 switch (numberOfBytes)
191 {
192 case 4:
193 *--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
194 tempchar >>= 6;
195 case 3:
196 *--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
197 tempchar >>= 6;
198 case 2:
199 *--tgt = (Uint8)((tempchar | 0x80) & 0xBF);
200 tempchar >>= 6;
201 case 1:
202 *--tgt = (Uint8)(tempchar | firstByteMark[numberOfBytes]);
203 david 1.1 }
204 tgt += numberOfBytes;
205 }
206 *srcHead = src;
207 *tgtHead = tgt;
208 return returnCode;
209 }
210
211 int UTF8toUTF16 (const Uint8** srcHead,
212 const Uint8* srcEnd,
213 Uint16** tgtHead,
214 Uint16* tgtEnd)
215 {
216 int returnCode = 0;
217 const Uint8* src = *srcHead;
218 Uint16* tgt = *tgtHead;
219 while (src < srcEnd)
220 {
221 Uint32 tempchar = 0;
222 Uint16 moreBytes = trailingBytesForUTF8[*src];
223 if (src + moreBytes >= srcEnd)
224 david 1.1 {
225 returnCode = -1;
226 break;
227 }
228 switch (moreBytes)
229 {
230 case 3:
231 tempchar += *src++;
232 tempchar <<= 6;
233 case 2:
234 tempchar += *src++;
235 tempchar <<= 6;
236 case 1:
237 tempchar += *src++;
238 tempchar <<= 6;
239 case 0:
240 tempchar += *src++;
241 }
242 tempchar -= offsetsFromUTF8[moreBytes];
243
244 if (tgt >= tgtEnd)
245 david 1.1 {
246 src -= (moreBytes+1);
247 returnCode = -1; break;
248 }
249 if (tempchar <= MAX_BYTE)
250 {
251 if ((tempchar >= FIRST_HIGH_SURROGATE &&
252 tempchar <= LAST_LOW_SURROGATE) ||
253 ((tempchar & 0xFFFE) == 0xFFFE))
254 {
255 *tgt++ = REPLACEMENT_CHARACTER;
256 }
257 else
258 {
259 *tgt++ = (Uint16)tempchar;
260 }
261 }
262 else if (tempchar > MAX_UTF16)
263 {
264 *tgt++ = REPLACEMENT_CHARACTER;
265 }
266 david 1.1 else
267 {
268 if (tgt + 1 >= tgtEnd)
269 {
270 src -= (moreBytes+1);
271 returnCode = -1;
272 break;
273 }
274 tempchar -= halfBase;
275 *tgt++ = (Uint16)((tempchar >> halfShift) + FIRST_HIGH_SURROGATE);
276 *tgt++ = (Uint16)((tempchar & halfMask) + FIRST_LOW_SURROGATE);
277 }
278 }
279 *srcHead = src;
280 *tgtHead = tgt;
281 return returnCode;
282 }
|
283 david 1.5
284 Boolean isUTF8(const char *legal)
285 {
286 char numBytes = UTF_8_COUNT_TRAIL_BYTES(*legal)+1;
287
288 // Validate that the string is long enough to hold all the expected bytes.
289 // Note that if legal[0] == 0, numBytes will be 1.
290 for (char i=1; i<numBytes; i++)
291 {
292 if (legal[i] == 0)
293 {
294 return false;
295 }
296 }
297
298 return (isValid_U8((const Uint8 *)legal, numBytes));
299 }
|
300 chuck 1.6
301
302 String escapeStringEncoder(const String& Str)
303 {
304 String escapeStr;
305 Uint16 escChar;
306 char hexencoding[6];
307
308 for(Uint32 i = 0; i < Str.size(); ++i)
309 {
310 escChar = Str[i];
311 if(escChar <= 0x7F)
312 {
313 escapeStr.append(escChar);
314 }
315 else
316 {
317 memset(hexencoding,0x00,sizeof(hexencoding));
318 sprintf(hexencoding, "%%%03X%X", escChar/16, escChar%16);
319 escapeStr.append(hexencoding);
320 }
321 chuck 1.6 }
322 return(escapeStr);
323 }
324
325 String escapeStringDecoder(const String& Str)
326 {
327 Uint32 i;
328
329 Array<Uint16> utf16Chars;
330
331 for (i=0; i< Str.size(); ++i)
332 {
333 if (Str[i] == '%')
334 {
335 Uint8 digit1 = _hexCharToNumeric((Str[++i]));
336 Uint8 digit2 = _hexCharToNumeric((Str[++i]));
337 Uint8 digit3 = _hexCharToNumeric((Str[++i]));
338 Uint8 digit4 = _hexCharToNumeric((Str[++i]));
339
340 Uint16 decodedChar = (digit1<<12) + (digit2<<8) +
341 (digit3<< 4) + (digit4);
342 chuck 1.6
343 utf16Chars.append(decodedChar);
344 }
345 else
346 {
347 utf16Chars.append((Uint16)Str[i]);
348 }
349 }
350
351 // If there was a string to decode...
352 if (Str.size() > 0)
353 {
354 utf16Chars.append('\0');
355 return String((Char16 *)utf16Chars.getData());
356 }
357 else
358 {
359 return String();
360 }
361 }
362
|
363 david 1.1 PEGASUS_NAMESPACE_END
|