version 1.135, 2008/12/16 18:56:00
|
version 1.138, 2010/07/16 10:15:31
|
|
|
#include <Pegasus/Common/PegasusAssert.h> | #include <Pegasus/Common/PegasusAssert.h> |
#include <cstring> | #include <cstring> |
#include "InternalException.h" | #include "InternalException.h" |
#include "CommonUTF.h" |
|
#include "MessageLoader.h" | #include "MessageLoader.h" |
#include "StringRep.h" | #include "StringRep.h" |
| |
|
|
return x; | return x; |
} | } |
| |
// Copies n elements from q to p, converting each element from Q to P by
// plain assignment.
//
//   p - destination; must have room for at least n elements.
//   q - source; must provide at least n elements.
//   n - number of elements to copy (may be zero).
//
// The source and destination element types may differ (for example char to
// Uint16); every element is assigned individually.
template<class P, class Q>
static void _copy(P* p, const Q* q, size_t n)
{
    // The following employs loop unrolling for efficiency. Please do not
    // eliminate.

    // Copy in chunks of eight elements.
    while (n >= 8)
    {
        p[0] = q[0];
        p[1] = q[1];
        p[2] = q[2];
        p[3] = q[3];
        p[4] = q[4];
        p[5] = q[5];
        p[6] = q[6];
        p[7] = q[7];
        p += 8;
        q += 8;
        n -= 8;
    }

    // Copy a remaining chunk of four elements, if any.
    while (n >= 4)
    {
        p[0] = q[0];
        p[1] = q[1];
        p[2] = q[2];
        p[3] = q[3];
        p += 4;
        q += 4;
        n -= 4;
    }

    // Copy the last zero to three elements one at a time.
    while (n--)
        *p++ = *q++;
}
|
|
|
static Uint16* _find(const Uint16* s, size_t n, Uint16 c) | static Uint16* _find(const Uint16* s, size_t n, Uint16 c) |
{ | { |
// The following employs loop unrolling for efficiency. Please do not | // The following employs loop unrolling for efficiency. Please do not |
|
|
throw NullPointer(); | throw NullPointer(); |
} | } |
| |
// Limits used by _formatBadUTF8Chars() when it builds the diagnostic text for
// an invalid UTF-8 sequence:
//   BADUTF8_MAX_CLEAR_CHAR  - maximum number of bytes preceding the error
//                             position that are copied verbatim.
//   BADUTF8_MAX_CHAR_TO_HEX - upper bound applied to the count of bytes
//                             following the offending byte that are dumped in
//                             hexadecimal.
#define BADUTF8_MAX_CLEAR_CHAR 40
#define BADUTF8_MAX_CHAR_TO_HEX 10

// Throws an Exception carrying the localized "Common.String.BAD_UTF8"
// message, with the given index substituted for $0.
//
//   index - position of the byte sequence that is not valid UTF-8.
//
// This function never returns normally.
static void _StringThrowBadUTF8(Uint32 index)
{
    MessageLoaderParms parms(
        "Common.String.BAD_UTF8",
        "The byte sequence starting at index $0 "
        "is not valid UTF-8 encoding.",
        index);

    throw Exception(parms);
}
|
| |
static size_t _copyFromUTF8( |
static void _formatBadUTF8Chars( |
Uint16* dest, |
char* buffer, |
const char* src, |
Uint32 index, |
size_t n, |
const char* q, |
size_t& utf8_error_index) |
size_t n ) |
{ | { |
Uint16* p = dest; |
|
const Uint8* q = (const Uint8*)src; |
|
| |
// Process leading 7-bit ASCII characters (to avoid UTF8 overhead later). |
char tmp[20]; |
// Use loop-unrolling. |
const char* start; |
| |
while (n >=8 && ((q[0]|q[1]|q[2]|q[3]|q[4]|q[5]|q[6]|q[7]) & 0x80) == 0) |
size_t clearChar = |
{ |
(( index < BADUTF8_MAX_CLEAR_CHAR ) ? index : BADUTF8_MAX_CLEAR_CHAR ); |
p[0] = q[0]; |
size_t charToHex = |
p[1] = q[1]; |
((n-index-1) < BADUTF8_MAX_CHAR_TO_HEX ? |
p[2] = q[2]; |
(n-index-1) : BADUTF8_MAX_CHAR_TO_HEX ); |
p[3] = q[3]; |
|
p[4] = q[4]; |
|
p[5] = q[5]; |
|
p[6] = q[6]; |
|
p[7] = q[7]; |
|
p += 8; |
|
q += 8; |
|
n -= 8; |
|
} |
|
|
|
while (n >=4 && ((q[0]|q[1]|q[2]|q[3]) & 0x80) == 0) |
|
{ |
|
p[0] = q[0]; |
|
p[1] = q[1]; |
|
p[2] = q[2]; |
|
p[3] = q[3]; |
|
p += 4; |
|
q += 4; |
|
n -= 4; |
|
} |
|
| |
switch (n) |
if (index < BADUTF8_MAX_CLEAR_CHAR) |
{ | { |
case 0: |
start = q; |
return p - dest; |
} else |
case 1: |
|
if (q[0] < 128) |
|
{ | { |
p[0] = q[0]; |
start = &(q[ index - BADUTF8_MAX_CLEAR_CHAR]); |
return p + 1 - dest; |
|
} |
|
break; |
|
case 2: |
|
if (((q[0]|q[1]) & 0x80) == 0) |
|
{ |
|
p[0] = q[0]; |
|
p[1] = q[1]; |
|
return p + 2 - dest; |
|
} |
|
break; |
|
case 3: |
|
if (((q[0]|q[1]|q[2]) & 0x80) == 0) |
|
{ |
|
p[0] = q[0]; |
|
p[1] = q[1]; |
|
p[2] = q[2]; |
|
return p + 3 - dest; |
|
} |
|
break; |
|
} | } |
| |
// Process remaining characters. |
// Intialize the buffer with the first character as '\0' to be able to use |
|
// strnchat() and strcat() |
while (n) |
buffer[0] = 0; |
|
// Start the buffer with the valid UTF8 chars |
|
strncat(buffer,start,clearChar); |
|
for (size_t i = clearChar, j = 0; j <= charToHex; i++,j++ ) |
{ | { |
// Optimize for 7-bit ASCII case. |
tmp[0] = 0; |
|
sprintf(&(tmp[0])," 0x%02X",(Uint8)start[i]); |
|
strncat(buffer,&(tmp[0]),5); |
|
} |
| |
if (*q < 128) |
|
{ |
|
*p++ = *q++; |
|
n--; |
|
} | } |
else |
|
{ |
|
Uint8 c = UTF_8_COUNT_TRAIL_BYTES(*q) + 1; |
|
| |
if (c > n || !isValid_U8(q, c) || |
static void _StringThrowBadUTF8(Uint32 index, const char* q, size_t n) |
UTF8toUTF16(&q, q + c, &p, p + n) != 0) |
|
{ | { |
utf8_error_index = q - (const Uint8*)src; |
char buffer[1024]; |
return size_t(-1); |
|
} |
|
| |
n -= c; |
_formatBadUTF8Chars(&(buffer[0]),index,q,n); |
} |
|
} |
MessageLoaderParms parms( |
|
"Common.String.BAD_UTF8_LONG", |
|
"The byte sequence starting at index $0 " |
|
"is not valid UTF-8 encoding: $1", |
|
index,buffer); |
| |
return p - dest; |
throw Exception(parms); |
} | } |
| |
// Note: dest must be at least three times src (plus an extra byte for | // Note: dest must be at least three times src (plus an extra byte for |
|
|
return p - (Uint8*)dest; | return p - (Uint8*)dest; |
} | } |
| |
// Converts n bytes of 8-bit text at q into 16-bit units at p.
//
// When PEGASUS_STRING_NO_UTF8 is defined the bytes are widened one-for-one
// with _copy() and n is returned; utf8_error_index is not touched.
// Otherwise the work is delegated to _copyFromUTF8() and its result is
// returned unchanged, including its failure value and the
// utf8_error_index it reports.
static inline size_t _convert(
    Uint16* p, const char* q, size_t n, size_t& utf8_error_index)
{
#ifdef PEGASUS_STRING_NO_UTF8
    _copy(p, q, n);
    return n;
#else
    return _copyFromUTF8(p, q, n, utf8_error_index);
#endif
}
|
|
|
//============================================================================== | //============================================================================== |
// | // |
// class CString | // class CString |
|
|
if (rep->size == size_t(-1)) | if (rep->size == size_t(-1)) |
{ | { |
StringRep::free(rep); | StringRep::free(rep); |
_StringThrowBadUTF8((Uint32)utf8_error_index); |
_StringThrowBadUTF8((Uint32)utf8_error_index, data,size); |
} | } |
| |
rep->data[rep->size] = '\0'; | rep->data[rep->size] = '\0'; |
|
|
{ | { |
StringRep::free(_rep); | StringRep::free(_rep); |
_rep = &StringRep::_emptyRep; | _rep = &StringRep::_emptyRep; |
_StringThrowBadUTF8((Uint32)utf8_error_index); |
_StringThrowBadUTF8((Uint32)utf8_error_index,s2,n2); |
} | } |
| |
_rep->size = n1 + tmp; | _rep->size = n1 + tmp; |
|
|
{ | { |
StringRep::free(_rep); | StringRep::free(_rep); |
_rep = &StringRep::_emptyRep; | _rep = &StringRep::_emptyRep; |
_StringThrowBadUTF8((Uint32)utf8_error_index); |
_StringThrowBadUTF8((Uint32)utf8_error_index,s1,n1); |
} | } |
| |
_rep->size = n2 + tmp; | _rep->size = n2 + tmp; |
|
|
{ | { |
StringRep::free(_rep); | StringRep::free(_rep); |
_rep = &StringRep::_emptyRep; | _rep = &StringRep::_emptyRep; |
_StringThrowBadUTF8((Uint32)utf8_error_index); |
_StringThrowBadUTF8((Uint32)utf8_error_index,str,n); |
} | } |
| |
_rep->data[_rep->size] = 0; | _rep->data[_rep->size] = 0; |
|
|
{ | { |
StringRep::free(_rep); | StringRep::free(_rep); |
_rep = &StringRep::_emptyRep; | _rep = &StringRep::_emptyRep; |
_StringThrowBadUTF8((Uint32)utf8_error_index); |
_StringThrowBadUTF8((Uint32)utf8_error_index,str,size); |
} | } |
| |
_rep->size += tmp; | _rep->size += tmp; |
|
|
// Returns true when s1 and s2 hold the same sequence of 16-bit units.
//
// Two strings that share the same representation object are equal without
// further inspection; otherwise they are equal when their sizes match and
// their data compares equal byte for byte.
Boolean String::equal(const String& s1, const String& s2)
{
    // The inner parentheses make the grouping of && within || explicit.
    return (s1._rep == s2._rep) ||
        ((s1._rep->size == s2._rep->size) &&
        memcmp(s1._rep->data,
               s2._rep->data,
               s1._rep->size * sizeof(Uint16)) == 0);
}
| |
Boolean String::equal(const String& s1, const char* s2) | Boolean String::equal(const String& s1, const char* s2) |