version 1.111.2.7, 2005/09/30 12:57:15
|
version 1.111.6.2, 2005/10/08 01:59:52
|
|
|
// | // |
// Author: Mike Brasher (mbrasher@bmc.com) | // Author: Mike Brasher (mbrasher@bmc.com) |
// | // |
|
// Modified By: |
|
// Roger Kumpf, Hewlett-Packard Company (roger_kumpf@hp.com) |
|
// Josephine Eskaline Joyce, IBM (jojustin@in.ibm.com) for Bug#3297 |
|
// David Dillard, VERITAS Software Corp. (david.dillard@veritas.com) |
|
// Mike Brasher (mike-brasher@austin.rr.com) |
|
// |
//%///////////////////////////////////////////////////////////////////////////// | //%///////////////////////////////////////////////////////////////////////////// |
| |
#define PEGASUS_USE_INTERNAL_INLINES |
|
#include "String.h" |
|
#include <cassert> | #include <cassert> |
#include "InternalException.h" | #include "InternalException.h" |
#include "CommonUTF.h" | #include "CommonUTF.h" |
#include "CharSet.h" |
#include "MessageLoader.h" |
|
#include "StringRep.h" |
| |
#ifdef PEGASUS_STRING_ENABLE_ICU |
#ifdef PEGASUS_HAS_ICU |
#include <unicode/ustring.h> | #include <unicode/ustring.h> |
#include <unicode/uchar.h> | #include <unicode/uchar.h> |
#endif | #endif |
|
|
// | // |
// Compile-time macros (undefined by default). | // Compile-time macros (undefined by default). |
// | // |
// PEGASUS_STRING_ENABLE_ICU -- enables use of ICU package |
|
// |
|
// PEGASUS_STRING_NO_THROW -- suppresses throwing of exceptions | // PEGASUS_STRING_NO_THROW -- suppresses throwing of exceptions |
// | // |
// PEGASUS_STRING_NO_UTF8 -- don't generate slower UTF8 code. | // PEGASUS_STRING_NO_UTF8 -- don't generate slower UTF8 code. |
|
|
// | // |
//============================================================================== | //============================================================================== |
| |
|
const Uint8 _to_upper_tbl[256] = |
|
{ |
|
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, |
|
0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, |
|
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, |
|
0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, |
|
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, |
|
0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, |
|
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, |
|
0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, |
|
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47, |
|
0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F, |
|
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57, |
|
0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F, |
|
0x60,0x41,0x42,0x43,0x44,0x45,0x46,0x47, |
|
0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F, |
|
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57, |
|
0x58,0x59,0x5A,0x7B,0x7C,0x7D,0x7E,0x7F, |
|
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, |
|
0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, |
|
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, |
|
0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, |
|
0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7, |
|
0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, |
|
0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7, |
|
0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, |
|
0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7, |
|
0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, |
|
0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7, |
|
0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, |
|
0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7, |
|
0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, |
|
0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7, |
|
0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF, |
|
}; |
|
|
|
const Uint8 _to_lower_tbl[256] = |
|
{ |
|
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, |
|
0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, |
|
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, |
|
0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, |
|
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, |
|
0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, |
|
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, |
|
0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, |
|
0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67, |
|
0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, |
|
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, |
|
0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F, |
|
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, |
|
0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, |
|
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, |
|
0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, |
|
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, |
|
0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, |
|
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, |
|
0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, |
|
0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7, |
|
0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, |
|
0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7, |
|
0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, |
|
0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7, |
|
0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, |
|
0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7, |
|
0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, |
|
0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7, |
|
0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, |
|
0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7, |
|
0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF, |
|
}; |
|
|
// Converts 16-bit characters to upper case. | // Converts 16-bit characters to upper case. |
inline Uint16 _to_upper(Uint16 x) | inline Uint16 _to_upper(Uint16 x) |
{ | { |
return (x & 0xFF00) ? x : CharSet::to_upper(x); |
return (x & 0xFF00) ? x : _to_upper_tbl[x]; |
} | } |
| |
// Converts 16-bit characters to lower case. | // Converts 16-bit characters to lower case. |
inline Uint16 _to_lower(Uint16 x) | inline Uint16 _to_lower(Uint16 x) |
{ | { |
return (x & 0xFF00) ? x : CharSet::to_lower(x); |
return (x & 0xFF00) ? x : _to_lower_tbl[x]; |
} | } |
| |
// Rounds x to the next power of two (or just returns 8 if x < 8). | // Rounds x to the next power of two (or just returns 8 if x < 8). |
|
|
while (n-- && (*s1++ - *s2++) == 0) | while (n-- && (*s1++ - *s2++) == 0) |
; | ; |
| |
|
// |
|
|
return s1[-1] - s2[-1]; | return s1[-1] - s2[-1]; |
} | } |
| |
|
|
Uint16* p = dest; | Uint16* p = dest; |
const Uint8* q = (const Uint8*)src; | const Uint8* q = (const Uint8*)src; |
| |
// Process leading 7-bit ASCII characters (to avoid UTF8 overhead below |
// Process leading 7-bit ASCII characters (to avoid UTF8 overhead later). |
// this loop). Use factor-four loop-unrolling. |
// Use loop-unrolling. |
| |
while (n >= 4 && q[0] < 128 && q[1] < 128 && q[2] < 128 && q[3] < 128) |
while (n >=8 && ((q[0]|q[1]|q[2]|q[3]|q[4]|q[5]|q[6]|q[7]) & 0x80) == 0) |
|
{ |
|
p[0] = q[0]; |
|
p[1] = q[1]; |
|
p[2] = q[2]; |
|
p[3] = q[3]; |
|
p[4] = q[4]; |
|
p[5] = q[5]; |
|
p[6] = q[6]; |
|
p[7] = q[7]; |
|
p += 8; |
|
q += 8; |
|
n -= 8; |
|
} |
|
|
|
while (n >=4 && ((q[0]|q[1]|q[2]|q[3]) & 0x80) == 0) |
{ | { |
p[0] = q[0]; | p[0] = q[0]; |
p[1] = q[1]; | p[1] = q[1]; |
|
|
} | } |
break; | break; |
case 2: | case 2: |
if (q[0] < 128 && q[1] < 128) |
if (((q[0]|q[1]) & 0x80) == 0) |
{ | { |
p[0] = q[0]; | p[0] = q[0]; |
p[1] = q[1]; | p[1] = q[1]; |
|
|
} | } |
break; | break; |
case 3: | case 3: |
if (q[0] < 128 && q[1] < 128 && q[2] < 128) |
if (((q[0]|q[1]|q[2]) & 0x80) == 0) |
{ | { |
p[0] = q[0]; | p[0] = q[0]; |
p[1] = q[1]; | p[1] = q[1]; |
|
|
if (c > n || !isValid_U8(q, c) || | if (c > n || !isValid_U8(q, c) || |
UTF8toUTF16(&q, q + c, &p, p + n) != 0) | UTF8toUTF16(&q, q + c, &p, p + n) != 0) |
{ | { |
throw Exception("Bad UTF8 encoding"); |
MessageLoaderParms parms("Common.String.BAD_UTF8", |
|
"The byte sequence starting at index $0 " |
|
"is not valid UTF-8 encoding.", |
|
q - (const Uint8*)src); |
|
throw Exception(parms); |
} | } |
| |
n -= c; | n -= c; |
|
|
| |
void String::toLower() | void String::toLower() |
{ | { |
#ifdef PEGASUS_STRING_ENABLE_ICU |
#ifdef PEGASUS_HAS_ICU |
| |
if (InitializeICU::initICUSuccessful()) | if (InitializeICU::initICUSuccessful()) |
{ | { |
|
if (Atomic_get(&_rep->refs) != 1) |
|
_rep = StringRep::copy_on_write(_rep); |
|
|
|
// This will do a locale-insensitive, but context-sensitive convert. |
|
// Since context-sensitive casing looks at adjacent chars, this |
|
// prevents optimizations where the us-ascii is converted before |
|
// calling ICU. |
|
// The string may shrink or expand after the convert. |
|
|
//// First calculate size of resulting string. u_strToLower() returns | //// First calculate size of resulting string. u_strToLower() returns |
//// only the size when zero is passed as the destination size argument. | //// only the size when zero is passed as the destination size argument. |
| |
|
|
int32_t new_size = u_strToLower( | int32_t new_size = u_strToLower( |
NULL, 0, (UChar*)_rep->data, _rep->size, NULL, &err); | NULL, 0, (UChar*)_rep->data, _rep->size, NULL, &err); |
| |
|
err = U_ZERO_ERROR; |
|
|
//// Reserve enough space for the result. | //// Reserve enough space for the result. |
| |
if ((Uint32)new_size > _rep->cap) | if ((Uint32)new_size > _rep->cap) |
|
|
(UChar*)_rep->data, _rep->size, NULL, &err); | (UChar*)_rep->data, _rep->size, NULL, &err); |
| |
_rep->size = new_size; | _rep->size = new_size; |
|
return; |
} | } |
| |
#endif /* PEGASUS_STRING_ENABLE_ICU */ |
#endif /* PEGASUS_HAS_ICU */ |
| |
if (Atomic_get(&_rep->refs) != 1) | if (Atomic_get(&_rep->refs) != 1) |
_rep = StringRep::copy_on_write(_rep); | _rep = StringRep::copy_on_write(_rep); |
|
|
| |
void String::toUpper() | void String::toUpper() |
{ | { |
#ifdef PEGASUS_STRING_ENABLE_ICU |
#ifdef PEGASUS_HAS_ICU |
| |
if (InitializeICU::initICUSuccessful()) | if (InitializeICU::initICUSuccessful()) |
{ | { |
|
if (Atomic_get(&_rep->refs) != 1) |
|
_rep = StringRep::copy_on_write(_rep); |
|
|
|
// This will do a locale-insensitive, but context-sensitive convert. |
|
// Since context-sensitive casing looks at adjacent chars, this |
|
// prevents optimizations where the us-ascii is converted before |
|
// calling ICU. |
|
// The string may shrink or expand after the convert. |
|
|
//// First calculate size of resulting string. u_strToUpper() returns | //// First calculate size of resulting string. u_strToUpper() returns |
//// only the size when zero is passed as the destination size argument. | //// only the size when zero is passed as the destination size argument. |
| |
|
|
int32_t new_size = u_strToUpper( | int32_t new_size = u_strToUpper( |
NULL, 0, (UChar*)_rep->data, _rep->size, NULL, &err); | NULL, 0, (UChar*)_rep->data, _rep->size, NULL, &err); |
| |
|
err = U_ZERO_ERROR; |
|
|
//// Reserve enough space for the result. | //// Reserve enough space for the result. |
| |
if ((Uint32)new_size > _rep->cap) | if ((Uint32)new_size > _rep->cap) |
|
|
(UChar*)_rep->data, _rep->size, NULL, &err); | (UChar*)_rep->data, _rep->size, NULL, &err); |
| |
_rep->size = new_size; | _rep->size = new_size; |
|
|
|
return; |
} | } |
| |
#endif /* PEGASUS_STRING_ENABLE_ICU */ |
#endif /* PEGASUS_HAS_ICU */ |
| |
if (Atomic_get(&_rep->refs) != 1) | if (Atomic_get(&_rep->refs) != 1) |
_rep = StringRep::copy_on_write(_rep); | _rep = StringRep::copy_on_write(_rep); |
|
|
| |
int String::compareNoCase(const String& str1, const String& str2) | int String::compareNoCase(const String& str1, const String& str2) |
{ | { |
#ifdef PEGASUS_STRING_ENABLE_ICU |
#ifdef PEGASUS_HAS_ICU |
| |
if (InitializeICU::initICUSuccessful()) | if (InitializeICU::initICUSuccessful()) |
{ | { |
|
|
str1._rep->data, str2._rep->data, U_FOLD_CASE_DEFAULT); | str1._rep->data, str2._rep->data, U_FOLD_CASE_DEFAULT); |
} | } |
| |
#endif /* PEGASUS_STRING_ENABLE_ICU */ |
#endif /* PEGASUS_HAS_ICU */ |
| |
const Uint16* s1 = str1._rep->data; | const Uint16* s1 = str1._rep->data; |
const Uint16* s2 = str2._rep->data; | const Uint16* s2 = str2._rep->data; |
|
|
| |
Boolean String::equalNoCase_aux(const String& s1, const String& s2) | Boolean String::equalNoCase_aux(const String& s1, const String& s2) |
{ | { |
#ifdef PEGASUS_STRING_ENABLE_ICU |
#ifdef PEGASUS_HAS_ICU |
| |
return String::compareNoCase(s1, s2) == 0; | return String::compareNoCase(s1, s2) == 0; |
| |
#else /* PEGASUS_STRING_ENABLE_ICU */ |
#else /* PEGASUS_HAS_ICU */ |
| |
Uint16* p = (Uint16*)s1._rep->data; | Uint16* p = (Uint16*)s1._rep->data; |
Uint16* q = (Uint16*)s2._rep->data; | Uint16* q = (Uint16*)s2._rep->data; |
|
|
| |
return true; | return true; |
| |
#endif /* PEGASUS_STRING_ENABLE_ICU */ |
#endif /* PEGASUS_HAS_ICU */ |
} | } |
| |
Boolean String::equalNoCase(const String& s1, const char* s2) | Boolean String::equalNoCase(const String& s1, const char* s2) |
{ | { |
_check_null_pointer(s2); | _check_null_pointer(s2); |
| |
#if defined(PEGASUS_STRING_ENABLE_ICU) |
#if defined(PEGASUS_HAS_ICU) |
| |
return String::equalNoCase(s1, String(s2)); | return String::equalNoCase(s1, String(s2)); |
| |
|
|
if (!*p2) | if (!*p2) |
return false; | return false; |
| |
if (_to_upper(*p1++) != CharSet::to_upper(int(*p2++))) |
if (_to_upper(*p1++) != _to_upper_tbl[int(*p2++)]) |
return false; | return false; |
} | } |
| |
|
if (*p2) |
|
return false; |
|
|
return true; | return true; |
| |
#else /* PEGASUS_STRING_ENABLE_ICU */ |
#else /* PEGASUS_HAS_ICU */ |
| |
// ATTN: optimize this! | // ATTN: optimize this! |
return String::equalNoCase(s1, String(s2)); | return String::equalNoCase(s1, String(s2)); |
| |
#endif /* PEGASUS_STRING_ENABLE_ICU */ |
#endif /* PEGASUS_HAS_ICU */ |
} | } |
| |
Boolean String::equal(const String& s1, const String& s2) | Boolean String::equal(const String& s1, const String& s2) |
|
|
CString cstr = str.getCString(); | CString cstr = str.getCString(); |
const char* utf8str = cstr; | const char* utf8str = cstr; |
os << utf8str; | os << utf8str; |
|
return os; |
|
#else |
| |
#elif defined(PEGASUS_STRING_ENABLE_ICU) |
#if defined(PEGASUS_HAS_ICU) |
| |
if (InitializeICU::initICUSuccessful()) | if (InitializeICU::initICUSuccessful()) |
{ | { |
|
|
os << buf; | os << buf; |
os.flush(); | os.flush(); |
delete [] buf; | delete [] buf; |
|
return os; |
} | } |
| |
#endif /* PEGASUS_OS_OS400 */ |
#endif // PEGASUS_HAS_ICU |
| |
for (Uint32 i = 0, n = str.size(); i < n; i++) | for (Uint32 i = 0, n = str.size(); i < n; i++) |
{ | { |
|
|
} | } |
| |
return os; | return os; |
|
#endif // PEGASUS_OS_OS400 |
} | } |
| |
void String::_append_char_aux() | void String::_append_char_aux() |
|
|
} | } |
| |
9. Experimented to find the optimal initial size for a short string. | 9. Experimented to find the optimal initial size for a short string. |
Eight seems to offer the best tradoff between space and time. |
Eight seems to offer the best tradeoff between space and time. |
| |
10. Inlined all members of the Char16 class. | 10. Inlined all members of the Char16 class. |
| |
|
|
| |
This avoids slower UTF8 processing when not needed. | This avoids slower UTF8 processing when not needed. |
| |
|
BUG-4200 Review actions: |
|
|
|
1. Use PEGASUS_USE_EXPERIMENTAL_INTERFACES instead of |
|
PEGASUS_STRING_EXTENSIONS. |
|
|
|
Status: done |
|
|
|
2. Doc++ String.h |
|
|
|
Status: pending |
|
|
|
3. Look at PEP223 for security coding guidelines for strings. |
|
|
|
Status: pending |
|
|
|
4. Increasing the number of objects may break Windows 2000 build |
|
(limit of 2048 bytes for command line). See BUG-2754 |
|
|
|
Status: looking into the use of auto-generated linker files. |
|
|
|
5. Concerns about whether generating inlines and non-inline versions |
|
of functions will work with all compilers. |
|
|
|
Status: confident it will work on platforms except maybe Windows. |
|
|
================================================================================ | ================================================================================ |
*/ | */ |