I'm debugging some opensource code on a 64-bit Solaris system, using GCC, that converts 2byte characters (wchar_t
) to 4byte characters (wchar_t
). Because Solaris like some other Unixes define wchar_t as 4byte, not 2byte like in Windows.
Now I fixed the problem, through laying it out the pointer arithmetic over two lines, but I'm not sure what was wrong with the original code. Any clues?
Original Code
int StringCopy2to4bytes(const unsigned short* src, int src_size,
unsigned int* dst, int dst_size)
{
int cp_size = 0;
const unsigned short *src_end = NULL;
const unsigned int *dst_end = NULL;
unsigned int c1, c2;
src_end = src + src_size;
dst_end = dst + dst_size;
while (src < src_end)
{
c1 = *src++;
if ((c1 >= UNI_SUR_HIGH_START) && (c1 <= UNI_SUR_HIGH_END))
{
if (src < src_end)
{
c2 = *src;
if ((c2 >= UNI_SUR_LOW_START) && (c2 <= UNI_SUR_LOW_END))
{
c1 = ((c1 - UNI_SUR_HIGH_START) << UNI_SHIFT) +
(c1 - UNI_SUR_LOW_START ) + UNI_BASE;
++src;
}
}
else
return -1;
}
if (dst >= dst_end) return -2;
*dst++ = c1;
cp_size++;
}
return cp_size;
}
Fixed Code
int StringCopy2to4bytes(const unsigned short* src, int src_size,
unsigned int* dst, int dst_size)
{
int cp_size = 0;
const unsigned short *src_end = NULL;
const unsigned int *dst_end = NULL;
unsigned int c1, c2;
src_end = src + src_size;
dst_end = dst + dst_size;
while (src < src_end)
{
c1 = *src; //FIX
++src;
if ((c1 >= UNI_SUR_HIGH_START) && (c1 <= UNI_SUR_HIGH_END))
{
if (src < src_end)
{
c2 = *src;
if ((c2 >= UNI_SUR_LOW_START) && (c2 <= UNI_SUR_LOW_END))
{
c1 = ((c1 - UNI_SUR_HIGH_START) << UNI_SHIFT) +
(c1 - UNI_SUR_LOW_START ) + UNI_BASE;
++src;
}
}
else
return -1;
}
if (dst >= dst_end) return -2;
*dst = c1; //FIX
++dst;
cp_size++;
}
return cp_size;
}
Edit: for the record, the code isn't mine, I'm just using it, and happen to be debugging it, not that it makes a big difference, but the source is fairly big, so I'm trying to fix it with tweezers as it may be, not refactor everything, anyways bugs are bugs, and I need to fix it and mail the author about what was wrong.
The constants are:
/* unicode constants */
#define UNI_SHIFT ((int) 10 )
#define UNI_BASE ((unsigned int) 0x0010000UL)
#define UNI_MASK ((unsigned int) 0x3FFUL)
#define UNI_REPLACEMENT_CHAR ((unsigned int) 0x0000FFFD)
#define UNI_MAX_BMP ((unsigned int) 0x0000FFFF)
#define UNI_MAX_UTF16 ((unsigned int) 0x0010FFFF)
#define UNI_MAX_UTF32 ((unsigned int) 0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((unsigned int) 0x0010FFFF)
#define UNI_SUR_HIGH_START ((unsigned int) 0xD800)
#define UNI_SUR_HIGH_END ((unsigned int) 0xDBFF)
#define UNI_SUR_LOW_START ((unsigned int) 0xDC00)
#define UNI_SUR_LOW_END ((unsigned int) 0xDFFF)