/*
 *****************************************************************************
 *
 * $RCSfile: utfutil.c,v $
 * $Date: 1999/05/18 18:36:20 $
 * $Source: /home/richard/Xml/RCS/utfutil.c,v $
 * $Revision: 1.58 $
 * $Author: richard $
 *
 *****************************************************************************
 *
 * Copyright 1998, Brown University and Richard Goerwitz
 *
 *****************************************************************************
 *
 * Functions for use with Unicode/UTF/UCS characters and strings.
 * Mainly these just mirror the C library string and ctype routines we
 * all know and love.  E.g., uni_strcmp() is like strcmp(), but has
 * my_wchar_t * args.  Other routines include:
 *
 *   char *utf_16_to_utf_8(my_wchar_t *s)       convert UTF-16 to UTF-8
 *   my_wchar_t *utf_8_to_utf_16 (char *s)      convert UTF-8 to UTF-16
 *
 * Note that the uni_utf- routines below (that compare, find, etc.)
 * a UTF-8 string and/in a UTF-16 string are somewhat inefficient.
 * They use malloc() and free().
 *
 *****************************************************************************
 */

#include "utfutil.h"
#include "errabort.h"

#define TEN_BIT_MASK ((1 << 10) - 1)

/* Registry filled by uni_add_string(): uni_len slots of uni_lst are in
 * use, uni_buflen slots are allocated. */
static size_t uni_len = 0;
static size_t uni_buflen = 0;
static my_wchar_t **uni_lst = NULL;

#ifndef LONG_MAX
# define LONG_MAX ((long)(~0UL>>1))
#endif

/*
 * uni_add_string
 *
 * Duplicate uc (with uni_strdup()) and record the duplicate in a static
 * list of strings to be free'd later on by uni_free_strings().  Simple
 * way of doing memory management.  Used by the lexical analyzer every
 * time it uni_strdups a string.
 *
 * Returns a pointer to the duplicate (not to uc itself).  Calls
 * errabort() if memory cannot be obtained.
 */
my_wchar_t *
uni_add_string (my_wchar_t *uc)
{
    /* What are we doing? */
    xwrap (errdebug (7, "Adding new string to uni_lst.\n"));

    /* Grow the pointer table whenever the new entry would not fit;
     * 32 spare slots are requested each time to limit reallocations. */
    if (++uni_len > uni_buflen) {
        uni_buflen = uni_len + 32;
        if (uni_lst == NULL) {
            if ((uni_lst = malloc (uni_buflen * sizeof (my_wchar_t *))) == NULL)
                errabort (40, "malloc() error in %s\n", "uni_add_string()");
        } else {
            if ((uni_lst = realloc (uni_lst, uni_buflen * sizeof (my_wchar_t *))) == NULL)
                errabort (41, "realloc() error in %s\n", "uni_add_string()");
        }
    }

    if ((uni_lst[uni_len - 1] = uni_strdup (uc)) == NULL)
        errabort (40, "malloc() error in %s\n", "uni_add_string()");

    /* NOTE(review): errdebug() looks printf-style; "%d" then requires an
     * int, so the size_t count is converted explicitly. */
    xwrap (errdebug (7, "Added string to uni_lst, %s (len = %d)\n",
                     utf_16_to_utf_8 (uc), (int)uni_len));

    return uni_lst[uni_len - 1];
}

/*
 * uni_free_strings
 *
 * Free all strings allocated in uni_add_string(), free the list
 * itself, and reset the uni_lst list to NULL (and its counters to 0).
 */
void
uni_free_strings (void)
{
    size_t i;

    /* What are we doing? */
    xwrap (errdebug (7, "Garbage collecting uni_lst.\n"));

    for (i = 0; i < uni_len; i++) {
        xwrap (errdebug (7, "Freeing uni_lst string, %s\n",
                         utf_16_to_utf_8 (uni_lst[i])));
        free (uni_lst[i]);
    }

    free (uni_lst);
    uni_len = 0;
    uni_buflen = 0;
    uni_lst = NULL;

    /* What did we just do?  (i now equals the number of strings freed;
     * it is converted to int to match the "%d" conversion.) */
    xwrap (errdebug (5, "Freed %d uni_lst strings.\n", (int)i));
}

/*
 * uni_truncate_to:
 *
 * Truncate my_wchar_t *s (arg 1) to maxlen characters, and place the
 * result in a static buffer.  Trailing 0 counts as a character -
 * because the point of this routine is to trim a string down so it
 * fits into a buffer of a given size.
 *
 * Returns a pointer into static storage space that may change on
 * subsequent calls.  This routine is nondestructive.  Arg 1 (s) is
 * not changed.
*/ my_wchar_t * uni_truncate_to (my_wchar_t *s, size_t maxlen) { size_t i; static size_t buflen; static my_wchar_t *buf = NULL; if (buf == NULL) if ((buf = malloc ((buflen = maxlen) * sizeof (my_wchar_t))) == NULL) errabort (40, "malloc error in %s\n", "uni_truncate_to()"); if (maxlen > buflen) if ((buf = realloc (buf, (buflen = maxlen) * sizeof (my_wchar_t))) == NULL) errabort (41, "realloc error in %s\n", "uni_truncate_to()"); *buf = 0; if (maxlen > 0) { if ((i = uni_strlen (s)) < maxlen) uni_strcpy (buf, s); else { if (maxlen <= 4) { memcpy (buf, s, (maxlen - 1) * sizeof (my_wchar_t)); buf[maxlen - 1] = 0; } else { memcpy (buf, s, (maxlen - 4) * sizeof (my_wchar_t)); buf[maxlen - 4] = '.'; buf[maxlen - 3] = '.'; buf[maxlen - 2] = '.'; buf[maxlen - 1] = 0; } } } return buf; } /* * append_string_to_comma_delimited_string * * Append a string to a series of comma-delimited strings. E.g., * append "blue" to "red, white" to get "red, white, blue". If the * comma delimited string is NULL, malloc() it. Returns the new * comma-delimited string. Note that string is not added if it is * already present! */ my_wchar_t * append_string_to_comma_delimited_string (my_wchar_t *cdstring, my_wchar_t *string) { size_t i, len; if (! cdstring) /* nothing in "cdstring" yet */ cdstring = uni_concatenate_no_free (cdstring, string); else { /* add string to "cdstring" - if, that is, it's not * there already */ len = uni_strlen (string); for (i = 0; cdstring[i]; i++) { if (! i) { if (uni_strncmp (cdstring, string, len) == 0) if (cdstring[len] == 0 || cdstring[len] == ',') break; } else { if (cdstring[i] == ' ' && uni_strncmp (&cdstring[i+1], string, len) == 0 && (cdstring[i + 1 + len] == 0 || cdstring[i + 1 + len] == ',')) break; } } if (! cdstring[i]) { cdstring = uni_concatenate (cdstring, uni_strdup (utf_8_to_utf_16 (", "))); cdstring = uni_concatenate_no_free (cdstring, string); } } return cdstring; } /* * uni_utf_strip * * Strip out characters in cs from s. Return a pointer to s. 
This * routine is DESTRUCTIVE; it alters characters pointed to by s. */ my_wchar_t * uni_utf_strip (my_wchar_t *s, char *cs) { char *csp; my_wchar_t *sp1, *sp2; for (sp1 = sp2 = s; *sp2; sp2++) { for (csp = cs; *csp != '\0'; csp++) if (*csp == *sp2) break; if (*csp == '\0') *sp1++ = *sp2; } *sp1 = 0; return s; } /* * uni_downcase: my_wchar_t * -> my_wchar_t * * s -> s (side effect: changes s) * * Convert all letters in string s to lowercase; return s. Note * that the elements of s are altered; then s is returned. */ my_wchar_t * uni_downcase (my_wchar_t *wstr) { my_wchar_t *wp; for (wp = wstr; *wp != 0; wp++) *wp = uni_tolower (*wp); return wstr; } /* * uni_upcase: my_wchar_t * -> my_wchar_t * * s -> s (side effect: changes s) * * Convert all letters in string s to upper case; return s. Note * that the elements of s are altered; then s is returned. */ my_wchar_t * uni_upcase (my_wchar_t *wstr) { my_wchar_t *wp; for (wp = wstr; *wp != 0; wp++) *wp = uni_toupper (*wp); return wstr; } /* * uni_tolower * * Like tolower(), in ctype.h, but for use with Unicode chars. * Returns the lowercase equivalent to c, if there is one. * Otherwise, returns c. Note that Latin, Greek, Cyrillic, and * Armenian are the only scripts that use case. */ int uni_tolower (my_wchar_t c) { unsigned int uc; uc = (unsigned int)c; if (c == (my_wchar_t)EOF || uc <= 0xFFU) /* For backwards compatibility */ return tolower (c); /* Latin extended additional */ if ((uc >= 0x1E00U && uc <= 0x1E96U) || (uc >= 0x1EA0U && uc <= 0x1EFFU)) if (! (uc % 2)) /* add one to the even-numbered ones to downcase */ return (my_wchar_t)(uc + 1); /* Regular Greek (see extended below) */ if (uc >= 0x0391U && uc <= 0x03ABU) return (my_wchar_t)(uc + 32); /* Coptic (see also extended Greek below) */ if (uc >= 0x03E2 && uc <= 0x03EFU) if (! 
(uc % 2)) /* add one to the even-numbered ones to downcase */ return (my_wchar_t)(uc + 1); /* Greek extended; insane char arrangement; this only partly works */ if ((uc >= 0x1F00U && uc <= 0x1F6FU) && ((uc & 0x000FU) < 8)) return (my_wchar_t)(uc + 8); if ((uc >= 0x1F80U && uc <= 0x1FAFU) && ((uc & 0x000FU) < 8)) return (my_wchar_t)(uc + 8); if ((uc >= 0x1FB0U && uc <= 0x1FFFU) && ((uc & 0x000FU) < 4)) /* sic */ return (my_wchar_t)(uc + 8); /* Armenian uppercase letters; hey, this arrangement is sane */ if (uc >= 0x0531U && uc <= 0x0555U) return (my_wchar_t)(uc + 48); /* Cyrillic uppercase letters; another insane arrangement */ if (uc >= 0x0401U && uc <= 0x040FU) return (my_wchar_t)(uc + 80); if (uc >= 0x0410U && uc <= 0x042FU) return (my_wchar_t)(uc + 32); if (uc >= 0x0460U && uc <= 0x0481U) if (! (uc % 2)) /* add one to the even-numbered ones to downcase */ return (my_wchar_t)(uc + 1); if (uc >= 0x0490U && uc <= 0x04BFU) if (! (uc % 2)) /* add one to the even-numbered ones to downcase */ return (my_wchar_t)(uc + 1); if (uc >= 0x04C1U && uc <= 0x04C4U) if (uc % 2) /* add one to the ODD-numbered ones to downcase */ return (my_wchar_t)(uc + 1); if (uc >= 0x04D0U && uc <= 0x04FFU) if (! (uc % 2)) /* add one to the even-numbered ones to downcase */ return (my_wchar_t)(uc + 1); /* Isn't uppercase */ return c; } /* * uni_toupper * * Like toupper(), in ctype.h, but for use with Unicode chars. * Returns the uppercase equivalent to c, if there is one. * Otherwise, returns c. Note that Latin, Greek, Cyrillic, and * Armenian are the only scripts that use case. 
*/ int uni_toupper (my_wchar_t c) { unsigned int uc; uc = (unsigned int)c; if (c == (my_wchar_t)EOF || uc <= 0xFFU) /* For backwards compatibility */ return toupper (c); /* Latin extended additional */ if ((uc >= 0x1E00U && uc <= 0x1E96U) || (uc >= 0x1EA0U && uc <= 0x1EFFU)) /* subtract one from the odd-numbered ones to upcase */ if (uc % 2) return (my_wchar_t)(uc - 1); /* Regular Greek (see extended below) */ if (uc >= 0x03B1U && uc <= 0x03CBU) return (my_wchar_t)(uc - 32); /* Coptic (see also extended Greek below) */ if (uc >= 0x03E2 && uc <= 0x03EFU) if (uc % 2) /* subtract one from the odd-numbered ones to upcase */ return (my_wchar_t)(uc - 1); /* Greek extended; a really insane arrangement; this only partly works */ if ((uc >= 0x1F00U && uc <= 0x1F6FU) && ((uc & 0x000FU) >= 8)) return (my_wchar_t)(uc - 8); if ((uc >= 0x1F80U && uc <= 0x1FAFU) && ((uc & 0x000FU) >= 8)) return (my_wchar_t)(uc - 8); if ((uc >= 0x1FB0U && uc <= 0x1FFFU) && ((uc & 0x000FU) >= 8) && ((uc & 0x000FU) < 12)) return (my_wchar_t)(uc - 8); /* Armenian uppercase letters; hey, this arrangement is sane */ if (uc >= 0x0561U && uc <= 0x0586U) return (my_wchar_t)(uc - 48); /* Cyrillic uppercase letters; another insane arrangement */ if (uc >= 0x0451U && uc <= 0x045FU) return (my_wchar_t)(uc - 80); if (uc >= 0x0430U && uc <= 0x044FU) return (my_wchar_t)(uc - 32); if (uc >= 0x0460U && uc <= 0x0481U) if (uc % 2) /* subtract one from the odd-numbered ones to upcase */ return (my_wchar_t)(uc - 1); if (uc >= 0x0490U && uc <= 0x04BFU) if (uc % 2) /* subtract one from the odd-numbered ones to upcase */ return (my_wchar_t)(uc - 1); if (uc >= 0x04C1U && uc <= 0x04C4U) if (! 
(uc % 2)) /* subtract one from the EVEN-numbered ones to downcase */ return (my_wchar_t)(uc - 1); if (uc >= 0x04D0U && uc <= 0x04FFU) if (uc % 2) /* subtract one from the odd-numbered ones to upcase */ return (my_wchar_t)(uc - 1); /* Isn't lowercase */ return c; } /* * uni_utf_any * * Returns nonnull if s (first arg) begins with a character from cs * (a string of UTF-8 characters). Returns NULL otherwise. */ my_wchar_t * uni_utf_any (my_wchar_t *s, char *cs) { my_wchar_t *p, *s2; /* returns a pointer to a static buffer */ if ((s2 = uni_strdup (utf_8_to_utf_16 (cs))) == NULL) errabort (40, "malloc() error in %s\n", "uni_utf_any()"); for (p = s2; *p != 0; p++) if (*p == *s) { free (s2); return s + 1; } free (s2); return NULL; } /* * uni_isascii * * Like isascii(), in ctype.h, but for use with Unicode chars. * Returns true if c is a 7-bit ASCII character. */ int uni_isascii (my_wchar_t c) { unsigned int uc; uc = (unsigned int)c; if (uc >= 0 && uc < 0x80) /* Is ASCII */ return 1; /* Isn't ASCII */ return 0; } /* * uni_isspace * * Like isspace(), in ctype.h, but for use with Unicode chars. * Returns true if c is in the ISO-8859-1 range, and isspace(c) * returns true. Also returns true if c is a Unicode "space * character" or a line/paragraph separator. * * See also uni_isblank(), which counts only spaces or tabs. 
*/
int
uni_isspace (my_wchar_t c)
{
    unsigned int uc;

    /* Unicode/ISO 8859-1 no-break space */
    if (c == 0x00A0U)
        return 1;

    uc = (unsigned int)c;

    /* For backwards compatibility */
    if (uc <= 0xFFU)
        return isspace ((int)uc);

    /* EOF stored in a my_wchar_t is never a space; answer directly
     * instead of passing a possibly out-of-range value to isspace() */
    if (c == (my_wchar_t)EOF)
        return 0;

    /* Look for Unicode "space characters" (table 6-18 of 2.0 standard) */
    if ((uc >= 0x2000U && uc <= 0x200FU)
        /* || uc == 0x3000U - ideographic space; not really a space */
        || uc == 0xFEFFU    /* zero-width no-break space */
        || uc == 0x2028U    /* line separator (throw it in for good measure) */
        || uc == 0x2029U)   /* paragraph separator */
        /* Is a space */
        return 1;

    /* Isn't a space */
    return 0;
}

/*
 * uni_isblank
 *
 * Like isblank(), in ctype.h, but for use with Unicode chars.
 * Returns true (1) if c is in the ISO-8859-1 range, and is a space,
 * a tab or the no-break space.  Also returns true if c is a Unicode
 * "space character" (U+2000..U+200F, the ideographic space U+3000 or
 * the zero-width no-break space U+FEFF).  Returns 0 for everything
 * else, including newlines and other vertical whitespace.
 */
int
uni_isblank (my_wchar_t c)
{
    unsigned int uc;

    /* Unicode/ISO 8859-1 no-break space */
    if (c == 0x00A0U)
        return 1;

    uc = (unsigned int)c;

    /* ISO-8859-1 range: only the space and the tab are blanks */
    if (uc <= 0xFFU)
        return (uc == (unsigned int)'\t' || uc == (unsigned int)'\x20');

    if (c == (my_wchar_t)EOF)
        return 0;

    /* Look for Unicode "space characters" (table 6-18 of 2.0 standard).
     * Each alternative must be compared against uc: a bare constant
     * is always true. */
    if ((uc >= 0x2000U && uc <= 0x200FU)
        || uc == 0x3000U    /* ideographic space */
        || uc == 0xFEFFU)   /* zero-width no-break space */
        /* Is a blank */
        return 1;

    /* Isn't a blank */
    return 0;
}

/*
 * uni_isdigit
 *
 * Like isdigit(), in ctype.h, but for use with Unicode chars.
 * Returns true if c is a Unicode digit.
 *
 * To check whether c is an ASCII digit, use isascii(c) &&
 * isdigit(c).
*/
int
uni_isdigit (my_wchar_t c)
{
    /* Code point of the digit zero in every script handled beyond
     * ISO-8859-1; each script's ten digits are consecutive. */
    static const unsigned int zero_of[] = {
        0x0660U,    /* Arabic-Indic digits */
        0x06F0U,    /* Eastern Arabic-Indic digits */
        0x0966U,    /* Devanagari digits */
        0x09E6U,    /* Bengali (follows Devanagari) */
        0x0A66U,    /* Gurmukhi (follows Devanagari) */
        0x0AE6U,    /* Gujarati (follows Devanagari) */
        0x0B66U,    /* Oriya (follows Devanagari) */
        0x0BE6U,    /* Tamil (follows Devanagari) */
        0x0C66U,    /* Telegu (follows Devanagari) */
        0x0CE6U,    /* Kannada (follows Devanagari) */
        0x0D66U,    /* Malayalam (follows Devanagari) */
        0x0E50U,    /* Thai digits */
        0x0ED0U,    /* Lao (follows Thai) */
        0x0F20U     /* Tibetan (follows Thai) */
    };
    unsigned int uc;
    size_t k;

    uc = (unsigned int)c;

    /* for backwards compatibility */
    if (uc <= 0xFFU)
        return isdigit ((int)uc);

    /* EOF stored in a my_wchar_t is not a digit; answer directly
     * instead of passing a possibly out-of-range value to isdigit() */
    if (c == (my_wchar_t)EOF)
        return 0;

    for (k = 0; k < sizeof zero_of / sizeof zero_of[0]; k++)
        if (uc >= zero_of[k] && uc <= zero_of[k] + 9U)
            /* Is a digit */
            return 1;

    /* Isn't a digit */
    return 0;
}

/*
 * uni_digit_value
 *
 * Return the value, as an integer, of a Unicode digit.  E.g., for
 * the Arabic 0x06F0 return zero.  I.e., this routine converts all
 * Unicode digits to integers.  Return -1 on failure.
*/
int
uni_digit_value (my_wchar_t c)
{
    /* First code point (the digit zero) of each run of ten digits. */
    static const unsigned int first_digit[] = {
        '0',        /* ASCII digits */
        0x0660,     /* Arabic-Indic digits */
        0x06F0,     /* Eastern Arabic-Indic digits */
        0x0966,     /* Devanagari digits */
        0x09E6,     /* Bengali (follows Devanagari) */
        0x0A66,     /* Gurmukhi (follows Devanagari) */
        0x0AE6,     /* Gujarati (follows Devanagari) */
        0x0B66,     /* Oriya (follows Devanagari) */
        0x0BE6,     /* Tamil (follows Devanagari) */
        0x0C66,     /* Telegu (follows Devanagari) */
        0x0CE6,     /* Kannada (follows Devanagari) */
        0x0D66,     /* Malayalam (follows Devanagari) */
        0x0E50,     /* Thai digits */
        0x0ED0,     /* Lao (follows Thai) */
        0x0F20      /* Tibetan (follows Thai) */
    };
    const unsigned int code = (int )c;
    size_t script;

    for (script = 0;
         script < sizeof first_digit / sizeof first_digit[0];
         script++) {
        const unsigned int zero = first_digit[script];

        if (code >= zero && code <= zero + 9)
            return code - zero;
    }

    /* Probably isn't even a digit */
    return -1;
}

/*
 * uni_tokenize: my_wchar_t * -> my_wchar_t *
 *               unicode str  -> unicode str2 (returns pointer to static buffer)
 *
 * Tokenize English text in a manner reminiscent of strtok().  See
 * the man page on strtok for the calling conventions.  The only
 * real differences between strtok() and uni_tokenize are 1) that
 * strtok() operates on chars, while uni_tokenize operates on wide
 * chars, 2) that uni_tokenize takes no second argument (whitespace
 * is the only separator it knows), and 3) uni_tokenize does not
 * modify its argument; rather it returns a pointer to a static
 * buffer that may change on subsequent calls.
*/ my_wchar_t * uni_tokenize (my_wchar_t *str) { size_t i; my_wchar_t *p; static my_wchar_t *pos_p; static my_wchar_t *buf_p = NULL; static size_t buf_p_size = 0; if (str != NULL) { i = uni_strlen (str) + 1; if (buf_p == NULL) { buf_p_size = i; if ((buf_p = malloc (sizeof (my_wchar_t) * i)) == NULL) errabort (40, "malloc error in %s\n", "uni_tokenize()"); } else if (i > buf_p_size) { buf_p_size = i; if ((buf_p = realloc (buf_p, sizeof (my_wchar_t) * i)) == NULL) errabort (41, "realloc error in %s\n", "uni_tokenize()"); } memcpy (buf_p, str, sizeof (my_wchar_t) * i); pos_p = buf_p; } /* move past initial whitespace */ for (; uni_isspace (*pos_p); pos_p++); if (*pos_p == 0) return NULL; /* save the beginning of this token; then find its end */ for (p = pos_p; ! uni_isspace (*pos_p) && *pos_p != 0; pos_p++); /* if we reached EOL, stop; otherwise, increment pos_p */ if (*pos_p != 0) *pos_p++ = 0; return p; } /* * uni_strtol * * Like strtol(), but works with any digit (e.g., Arabic, Thai), and * recognizes not only the ASCII hyphen, but also the true Unicode * minus sign. */ long uni_strtol (my_wchar_t *s, my_wchar_t **endptr, int base) { long total, max; my_wchar_t c, *start; int neg, intval, okay; start = s; while (uni_isspace (*s)) s++; c = *s++; neg = 0; /* ASCII plus is Unicode plus; but ASCII hyphen and Unicode minus * are different */ if (c == '+') c = *s++; else if (c == 0x2212 || c == '-') { neg = 1; c = *s++; } if (base == 0 || base == 16) /* autodetect base 16; strip off leading 0X or 0x */ if (uni_digit_value (c) == 0 && (*s == 'x' || *s == 'X')) { s++; c = *s++; base = 16; } #define out_of_range() { okay = 0; errno = ERANGE; break; } /* What if there are no digits at all? */ if ((base <= 10 && uni_digit_value (c) == -1) || (base > 10 && (uni_digit_value (c) == -1 && ! 
(c < 0x80 && isalpha (c))))) { /* If there were no digits, reset endptr to start */ endptr = &start; return 0; } else { /* Okay, we have at least _some_ digits to process */ if (base == 0) /* if base is zero, autodetect octal or base-ten */ base = (uni_digit_value (c) == 0) ? 8 : 10; max = (LONG_MAX > base) ? (LONG_MAX / base) : LONG_MAX; okay = 1; total = 0; do { if (base <= 10 || uni_isdigit (c)) { intval = uni_digit_value (c); if (intval == -1) break; } else { /* ASCII alphabetic character */ if (c < 0x80 && isalpha (c)) intval = (tolower (c) - 'a') + 10; else break; } if (total > max) out_of_range (); total *= base; if (total > max) if (intval > (LONG_MAX - total)) out_of_range (); total += intval; } while ((c = *s++)); if (endptr != NULL) *endptr = okay ? (s - 1) : s; if (okay) return neg ? -total : total; /* yes, this assumes LONG_MIN is -LONG_MAX */ return neg ? -LONG_MAX : LONG_MAX; } } /* * uni_strcmp: * * Compare Unicode strings, à la strcmp(s1, s2). Returns 1 if s1 is * lexically greater than s2, 0 if they are identical, and -1 if s1 * is lexically less than s2. Note that no locale variations are * considered. */ int uni_strcmp (my_wchar_t *s1, my_wchar_t *s2) { while (*s1 == *s2++) if (*s1++ == 0) return 0; /* Convert to integral type and compare */ if ((unsigned int)*s1 > (unsigned int)*(s2 - 1)) return 1; return -1; } /* * uni_strcasecmp: * * Compare Unicode strings, à la strcasecmp(s1, s2). Returns 1 * if s1 is lexically greater than s2, 0 if they are identical, and * -1 if s1 is lexically less than s2. Note that no locale * variations are considered for the comparison. But they do come * into play for the downcasing that happens to every character * BEFORE it is compared. 
*/
int
uni_strcasecmp (my_wchar_t *s1, my_wchar_t *s2)
{
    /* Walk both strings in lock step, comparing the lowercase form of
     * each pair of characters. */
    for (;; s1++, s2++) {
        const unsigned int left = (unsigned int)uni_tolower (*s1);
        const unsigned int right = (unsigned int)uni_tolower (*s2);

        if (left != right)
            return (left > right) ? 1 : -1;
        if (*s1 == 0)
            return 0;
    }
}

/*
 * uni_utf_strcmp:
 *
 * Compare Unicode string to UTF-8 (or strict 7-bit ASCII) string.
 * Works like uni_strcmp(), except that it first converts the second
 * argument to a Unicode string.
 *
 * Note that uni_utf_strcmp() calls utf_8_to_utf_16(), and will
 * reset its (static) buffer.  This may screw things up if you just
 * used utf_8_to_utf_16() on another string, and didn't uni_strdup()
 * its return value.
 */
int
uni_utf_strcmp (my_wchar_t *s1, const char *utf_s)
{
    my_wchar_t *wide;
    int verdict;

    /* utf_8_to_utf_16() hands back a static buffer; compare against a
     * private copy of it */
    wide = uni_strdup (utf_8_to_utf_16 (utf_s));
    if (wide == NULL)
        errabort (40, "malloc() error in %s\n", "uni_utf_strcmp()");

    verdict = uni_strcmp (s1, wide);
    free (wide);

    return verdict;
}

/*
 * uni_utf_strncmp:
 *
 * Compare 16-bit Unicode string to a UTF-8 string, after first
 * converting the UTF-8 string to UTF-16.  Like uni_utf_strcmp(),
 * but only looks at the first n characters.  Returns 1 if, up to
 * character n, s1 is lexically greater than s2; returns 0 if they
 * are identical up to that point; returns -1 if s1 is lexically
 * less than s2 up to that point.  Note that no locale variations
 * are considered.
*/
int
uni_utf_strncmp (my_wchar_t *s1, char *utf_s, size_t n)
{
    my_wchar_t *wp;
    int result;

    /* Nothing to compare; decide before allocating so that no copy is
     * left behind unfreed. */
    if (n == 0)
        return 0;

    /* utf_8_to_utf_16() returns a pointer to a static buffer */
    if ((wp = uni_strdup (utf_8_to_utf_16 (utf_s))) == NULL)
        errabort (40, "malloc() error in %s\n", "uni_utf_strncmp()");

    result = uni_strncmp (s1, wp, n);
    free (wp);

    return result;
}

/*
 * uni_utf_strcasecmp:
 *
 * Compare Unicode string to UTF-8 (or strict 7-bit ASCII) string
 * case-insensitively.  Works like uni_strcasecmp(), except that it
 * first converts the second argument to a Unicode string.
 *
 * Note that uni_utf_strcasecmp() calls utf_8_to_utf_16(), and will
 * reset its (static) buffer.  This may screw things up if you just
 * used utf_8_to_utf_16() on another string, and didn't uni_strdup()
 * its return value.
 *
 * Note that locale considerations may come into play, not in the
 * comparison, but in the downcasing of each character that happens
 * before it is compared to its correspondent in the other string.
 */
int
uni_utf_strcasecmp (my_wchar_t *s1, char *utf_s)
{
    my_wchar_t *wp;
    int result;

    /* utf_8_to_utf_16() returns a pointer to a static buffer */
    if ((wp = uni_strdup (utf_8_to_utf_16 (utf_s))) == NULL)
        errabort (40, "malloc() error in %s\n", "uni_utf_strcasecmp()");

    result = uni_strcasecmp (s1, wp);
    free (wp);

    return result;
}

/*
 * uni_utf_strncasecmp:
 *
 * Compare the first n (arg 3) characters in s1 (arg 1) to
 * the first n characters in utf_s (arg 2), after converting utf_s
 * to UTF-16 internally.  The comparison is case insensitive.  The
 * return values are analogous to those of strncasecmp().
* * Note that uni_utf_strncasecmp() calls utf_8_to_utf_16(), and will * reset its (static) buffer. This may screw things up if you just * used utf_8_to_utf_16() on another string, and didn't uni_strdup() * its return value. * * Note that locale considerations may come into play, not in the * comarison, but in the downcasing of each character that happens * before it is compared to its correspondent in the other string. */ int uni_utf_strncasecmp (my_wchar_t *s1, char *utf_s, size_t n) { my_wchar_t *wp, *s2; if (n == 0) return 0; /* returns a pointer to a static buffer */ if ((s2 = wp = uni_strdup (utf_8_to_utf_16 (utf_s))) == NULL) errabort (40, "malloc() error in %s\n", "uni_utf_strcasecmp()"); do { if (uni_tolower (*s1) != uni_tolower (*s2++)) { /* Convert to integral type and compare */ if ((unsigned int)uni_tolower(*s1) > (unsigned int)uni_tolower(*(s2 - 1))) { free (wp); return 1; } free (wp); return -1; } else if (*s1++ == 0) break; } while (--n != 0); free (wp); return 0; } /* * uni_strncmp: * * Compare 16-bit Unicode strings, à la strncmp(s1, s2). Like * uni_strcmp(), but only looks at the first n characters. Returns * 1 if, up to character n, s1 is lexically greater than s2; returns * 0 if they are identical up to that point; returns -1 if s1 is * lexically less than s2 up to that point. Note that no locale * variations are considered. */ int uni_strncmp (my_wchar_t *s1, my_wchar_t *s2, size_t n) { if (n == 0) return 0; do { if (*s1 != *s2++) { /* Convert to integral type and compare */ if ((unsigned int)*s1 > (unsigned int)*(s2 - 1)) return 1; return -1; } else if (*s1++ == 0) break; } while (--n != 0); return 0; } /* * uni_strncasecmp: * * Compare 16-bit Unicode strings, à la strncasecmp(s1, s2). * Like uni_strcasecmp(), but only looks at the first n characters. 
* Returns 1 if, up to character n, s1 (mapped to lowercase) is * lexically greater than s2 (mapped to lowercase); returns 0 if * they are identical up to that point; returns -1 if s1 is * lexically less than s2 up to that point. Note that no locale * variations are considered for the char comparison (they do, * however, come into play for downcasing). */ int uni_strncasecmp (my_wchar_t *s1, my_wchar_t *s2, size_t n) { if (n == 0) return 0; do { if (uni_tolower(*s1) != uni_tolower(*s2++)) { /* Convert to integral type and compare */ if ((unsigned int)uni_tolower(*s1) > (unsigned int)uni_tolower(*(s2 - 1))) return 1; return -1; } else if (*s1++ == 0) break; } while (--n != 0); return 0; } /* * uni_strpbrk: * * Find the first instance of any Unicode character in s2 in UTF-16 * string s1. Like strpbrk(). Returns NULL if none of the chars in * s2 occur in s1. */ my_wchar_t * uni_strpbrk (my_wchar_t *s1, my_wchar_t *s2) { my_wchar_t *p1, *p2; for (p1 = s1; *p1 != 0; p1++) for (p2 = s2; *p2 != 0; p2++) if (*p1 == *p2) return p1; return NULL; } /* * uni_strspn: * * Calculate the length of the initial span of characters from cset * s2 occurring in string s1. I.e., tell us how long the initial * string of chars from s2 is, starting with s1[0]. */ size_t uni_strspn (my_wchar_t *s1, my_wchar_t *s2) { my_wchar_t *p1, *p2; for (p1 = s1; *p1 != 0; p1++) { for (p2 = s2; *p2 != 0; p2++) if (*p1 == *p2) break; if (*p2 == 0) break; } return (size_t)(p1 - s1); } /* * uni_utf_strpbrk: * * Find the first instance of any character in s2 in UTF-16 string * s1. s2 is a UTF-8 string (that gets converted to UTF-16). This * routine won't work right if there are characters in s2 that * occupy the extended (two-char) UTF-16 range. Returns NULL if * there none of the chars in s2 occur in s2. 
*/ my_wchar_t * uni_utf_strpbrk(my_wchar_t *s1, char *s2) { my_wchar_t *new_s2; my_wchar_t *p1, *p2; /* returns a pointer to a static buffer */ if ((new_s2 = uni_strdup (utf_8_to_utf_16 (s2))) == NULL) errabort (40, "malloc() error in %s\n", "uni_utf_strpbrk()"); for (p1 = s1; *p1 != 0; p1++) for (p2 = new_s2; *p2 != 0; p2++) if (*p1 == *p2) { free (new_s2); return p1; } free (new_s2); return NULL; } /* * uni_strcat: * * Like strcat(), but for Unicode/UTF-16 "strings". Appends string * src to string dest. Returns a pointer to dest. Assumes that * there's enough room in dest to hold whatever is there already * plus the string in src. */ my_wchar_t * uni_strcat (my_wchar_t *dest, my_wchar_t *src) { size_t i, len; /* find the end of the string in dest */ for (len = 0; dest[len] != 0; len++); for (i = 0; src[i] != 0; i++) dest[len + i] = src[i]; dest[len + i] = 0; return dest; } /* * uni_utf_strcat: * * Like uni_strcat() above, but takes a UTF-8 string as its second * argument. I.e., appends string src to string dest. Returns a * pointer to dest, after converting src to UTF-16. Assumes that * there's enough room in dest to hold whatever is there already * plus the string in src. */ my_wchar_t * uni_utf_strcat (my_wchar_t *dest, char *src) { my_wchar_t *wsrc; size_t i, len; /* returns a pointer to a static buffer */ if ((wsrc = uni_strdup (utf_8_to_utf_16 (src))) == NULL) errabort (40, "malloc error in %s\n", "uni_utf_strcat()"); /* find the end of the string in dest */ for (len = 0; dest[len] != 0; len++); for (i = 0; wsrc[i] != 0; i++) dest[len + i] = wsrc[i]; dest[len + i] = 0; free (wsrc); return dest; } /* * uni_strcpy: * * Like strcpy(), but for Unicode/UTF-16 "strings". Copies string * src into destination dest. Returns a pointer to dest. Assumes * that there's enough room in dest to hold the string in src. 
*/ my_wchar_t * uni_strcpy (my_wchar_t *dest, my_wchar_t *src) { size_t i; for (i = 0; src[i] != 0; i++) dest[i] = src[i]; dest[i] = 0; return dest; } /* * uni_utf_strcpy: * * Like strcpy(), but for Unicode/UTF-16 "strings". Copies string * src into destination dest. Returns a pointer to dest. Assumes * that there's enough room in dest to hold the string in src. */ my_wchar_t * uni_utf_strcpy (my_wchar_t *dest, char *src) { size_t i; my_wchar_t *wsrc; /* returns a pointer to a static buffer */ if ((wsrc = uni_strdup (utf_8_to_utf_16 (src))) == NULL) errabort (40, "malloc error in %s\n", "uni_utf_strcpy()"); for (i = 0; wsrc[i] != 0; i++) dest[i] = wsrc[i]; dest[i] = 0; free (wsrc); return dest; } /* * uni_strchr: * * Like strchr(), but for Unicode/UTF-16 "strings". Returns a * pointer to the first occurrence of c in string ucp. Returns NULL * if there is no such character. */ my_wchar_t * uni_strchr (my_wchar_t *ucp, int c) { size_t len; for (len = 0; ucp[len] != c && ucp[len] != 0; len++); if (ucp[len]) return &ucp[len]; return NULL; } /* * uni_strstr: * * Like strstr(), but for Unicode/UTF-16 "strings". Returns a * pointer to the first occurrence of needle in haystack. Returns * NULL if there is no such string in haystack. */ my_wchar_t * uni_strstr (my_wchar_t *haystack, my_wchar_t *needle) { my_wchar_t *p; size_t needle_len; needle_len = uni_strlen (needle); for (p = haystack; *p != 0; p++) if (*p == *needle) if (uni_strncmp (p, needle, needle_len) == 0) return p; return NULL; } /* * uni_utf_strstr: * * Like uni_strstr(), but finds UTF-8 needle in a UTF-16 haystack by * converting the UTF-8 needle to UTF-16, and then doing the search. * Returns a pointer to the first occurrence of needle in haystack. * Returns NULL if there is no such string in haystack. 
*/ my_wchar_t * uni_utf_strstr (my_wchar_t *haystack, char *needle) { my_wchar_t *p; my_wchar_t *wneedle; size_t wneedle_len; /* returns a pointer to a static buffer */ if ((wneedle = uni_strdup (utf_8_to_utf_16 (needle))) == NULL) errabort (40, "malloc() error in %s\n", "uni_utf_strstr()"); wneedle_len = uni_strlen (wneedle); for (p = haystack; *p != 0; p++) if (*p == *wneedle) if (uni_strncmp (p, wneedle, wneedle_len) == 0) { free (wneedle); return p; } free (wneedle); return NULL; } /* * uni_strlen: * * Like strlen(), but for Unicode/UTF-16 "strings". Returns the * number of 16-bit Unicode/UTF-16 characters in ucp minus the 0 * (trailing null). */ size_t uni_strlen (my_wchar_t *ucp) { size_t len; for (len = 0; ucp[len] != 0; len++); return len; } /* * uni_strdup: * * Like strdup(), but for Unicode/UTF-16 "strings". Calls malloc(). * Returns NULL on malloc() failure. */ my_wchar_t * uni_strdup (my_wchar_t *ucp) { size_t len; my_wchar_t *new_ucp; len = uni_strlen (ucp); if ((new_ucp = malloc ((len + 1) * sizeof (my_wchar_t))) == NULL) return NULL; memcpy (new_ucp, ucp, len * sizeof (my_wchar_t)); new_ucp[len] = 0; return new_ucp; } /* * uni_concatenate * * Concatenate two malloc'd strings by reallocating and extending * the first string, then appending the second, and finally freeing * the storage used by the second (cf. uni_concatenate_no_free() * below, which does not free the second string): * * string1 "hello " (len 7) + * string2 "there" (len 6) = string1 "hello there" (len 12) * * Note well: After uni_concatenate() has returned, string2 is no * longer a valid pointer. I.e., it no longer points at a valid * string. BEFORE YOU USE IT AGAIN, be sure to re-initializing it. 
*/ my_wchar_t * uni_concatenate (my_wchar_t *s1, my_wchar_t *s2) { size_t l1, l2; if (s2 == NULL) return s1; if (s1 == NULL) { s1 = uni_strdup (s2); free (s2); return s1; } l1 = uni_strlen (s1); l2 = uni_strlen (s2); if ((s1 = realloc (s1, (l1 + l2 + 1) * sizeof (my_wchar_t))) == NULL) errabort (41, "realloc() error in %s\n", "uni_concatenate()"); memcpy (s1 + l1, s2, (l2 + 1) * sizeof (my_wchar_t)); /* NB: s2 won't be useful after this! */ free (s2); return s1; } /* * uni_concatenate_no_free * * Concatenate two malloc'd strings by reallocating and extending * the first string, then appending the second. But don't free * the second string (cf. uni_concatenate() above). * * string1 "hello " (len 7) + * string2 "there" (len 6) = string1 "hello there" (len 12) */ my_wchar_t * uni_concatenate_no_free (my_wchar_t *s1, my_wchar_t *s2) { size_t l1, l2; if (s2 == NULL) return s1; if (s1 == NULL) return uni_strdup (s2); l1 = uni_strlen (s1); l2 = uni_strlen (s2); if ((s1 = realloc (s1, (l1 + l2 + 1) * sizeof (my_wchar_t))) == NULL) errabort (41, "realloc() error in %s\n", "uni_concatenate_no_free()"); memcpy (s1 + l1, s2, (l2 + 1) * sizeof (my_wchar_t)); return s1; } /* * uni_map_whitespace_to_space * * Destructively maps whitespace chars to spaces, '\x20'; also, maps * \r\n to '\x20'. Note that this routine DOES NOT map sequences of * whitespace characters to a single space. uni_map_spaces_to_space() * below handles this job. This routine is what you use on CDATA * attributes. * * Whitespace characters here are not defined as you might expect * (namely, as Unicode whitespace, a la uni_isspace()). Rather they * are merely defined as ASCII \r\n\t\x20 (isspace()). */ my_wchar_t * uni_map_whitespace_to_space (my_wchar_t *s) { my_wchar_t *p, *p2; if (s == NULL) return NULL; p = s; for (p2 = s; *p != 0; p++) { if (*p > 0x80 || ! 
isspace (*p)) *p2++ = *p; else { /* map CR-LF to a single space */ if (*p == '\r' && *(p + 1) == '\n') p++; /* map other ASCII whitespace chars to a space */ *p2++ = '\x20'; } } *p2 = 0; return s; } /* * uni_map_spaces_to_space * * Destructively maps sequences of whitespace chars in (my_wchar_t *)s * to a single space ('\x20'). Trims off leading and trailing white * space as well. (This has to happen to all non-CDATA attributes.) * * NOTE WELL: Whitespace characters here are defined as \r\n\t\x20 * (isspace() is what's used - NOT uni_isspace())! */ my_wchar_t * uni_map_spaces_to_space (my_wchar_t *s) { my_wchar_t *p, *p2; if (s == NULL) return NULL; p = s; /* skip over leading whitespace */ while (*p < 0x80 && isspace (*p)) p++; for (p2 = s; *p != 0; p++) { if (*p > 0x80 || ! isspace (*p)) *p2++ = *p; else if (*(p + 1) > 0x80 || ! isspace (*(p + 1))) *p2++ = '\x20'; } /* strip off trailing spaces (if there are any) */ if (p2 > s) while (*(p2 - 1) == '\x20') p2--; *p2 = 0; return s; } /* * uni_ptr_ptr_cmp: * * Used when bsearching or qsorting arrays of (my_wchar_t *) strings * (i.e., arrays of pointers to wchars). After dereferencing its * two void * arguments, this routine does simple uni_strcmp on the * resulting (my_wchar_t *) strings. Return values are those of * uni_strcmp. */ int uni_ptr_ptr_cmp (const void *x1, const void *x2) { my_wchar_t *p1, *p2; p1 = *(my_wchar_t **)x1; p2 = *(my_wchar_t **)x2; return uni_strcmp (p1, p2); } /* * uni_ptr_ptr_casecmp: * * Like uni_ptr_ptr_cmp above, but case-insensitive. Used when * bsearching or qsorting arrays of (my_wchar_t *) strings (i.e., * arrays of pointers to wchars). After dereferencing its two void * * arguments, this routine does simple uni_strcasecmp on the * resulting (my_wchar_t *) strings. Return values are those of * uni_strcasecmp. 
*/ int uni_ptr_ptr_casecmp (const void *x1, const void *x2) { my_wchar_t *p1, *p2; p1 = *(my_wchar_t **)x1; p2 = *(my_wchar_t **)x2; return uni_strcasecmp (p1, p2); } /* * uni_utf_ptr_ptr_casecmp: * * Like uni_ptr_ptr_casecmp above, but with a UTF-8 second argument. * Used when bsearching or qsorting arrays of (my_wchar_t *) strings * (i.e., arrays of pointers to wchars). After dereferencing its * two void * arguments, this routine does simple uni_utf_strcasecmp * on the resulting (my_wchar_t *) strings. Return values are those of * uni_utf_strcasecmp. */ int uni_utf_ptr_ptr_casecmp (const void *x1, const void *x2) { my_wchar_t *p1; char *p2; p1 = *(my_wchar_t **)x1; p2 = *(char **)x2; return uni_utf_strcasecmp (p1, p2); } /* * utf_16_to_ucs_4 * * Convert UTF-16 character string (i.e., a my_wchar_t array) into an * array of UCS-4 characters. Returns a pointer into static storage * that may change on subsequent calls. */ u_int32_t * utf_16_to_ucs_4 (my_wchar_t *uc) { size_t len, i, j; static size_t new_uc_buflen = 0; static u_int32_t *new_uc = NULL; len = uni_strlen (uc); if (new_uc == NULL) { new_uc_buflen = len + 1; if ((new_uc = malloc (new_uc_buflen * sizeof (u_int32_t))) == NULL) errabort (40, "malloc error in %s\n", "utf_16_to_ucs_4()"); } else if (len > new_uc_buflen) { new_uc_buflen = len + 1; if ((new_uc = realloc (new_uc, new_uc_buflen * sizeof (u_int32_t))) == NULL) errabort (41, "realloc error in %s\n", "utf_16_to_ucs_4()"); } for (i = j = 0; i < len; i++, j++) { if (! (uc[i] >= 0xD800 && uc[i] <= 0xDBFF && uc[i + 1] >= 0xDC00 && uc[i + 1] <= 0xDFFF)) new_uc[j] = uc[i]; else { /* we have a UTF-16 extended (two-char) sequence; merge in next wchar */ new_uc[j] = (((uc[i] - 0xD800U) << 10) | (uc[i + 1] - 0xDC00U)) + 0x0010000U; i++; } } new_uc[j] = 0; return new_uc; } /* * utf_16_to_int * * Convert first UTF-16 character in a UTF-16 string (i.e., a * my_wchar_t array) into an integer. 
Useful if we want to display the * scalar value of a UTF-16 character, and we don't know if it's one * or two-bytes long. */ int utf_16_to_int (my_wchar_t *uc) { unsigned int ul1, ul2; ul1 = uc[0]; ul2 = uc[1]; if (ul1 >= 0xD800U && ul1 <= 0xDBFFU) { if (ul2 < 0xDC00U || ul2 > 0xDFFFU) { /* Oops; first wchar is okay, second wchar is wrong */ xwrap (errdebug (7, "corrupt UTF-16 char %X,%X\n", ul1, ul2)); /* 0xFFFDU is the Unicode "don't know what this is" char */ return ul1; } else { /* we have a UTF-16 extended (two-char) sequence; merge in next wchar */ return (((ul1 - 0xD800U) << 10) | (ul2 - 0xDC00U)) + 0x0010000U; } } return ul1; } /* * utf_16_to_utf_8: * * Convert a UTF-16 string (a my_wchar_t[] array) to a UTF-8 string. * Copies the UTF-8 chars into a static buffer and returns a pointer * to that buffer. Note that the buffer's contents may change on * subsequent calls, so be sure to strdup anything that you want to * keep! */ char * utf_16_to_utf_8 (my_wchar_t *uc) { size_t pos; char *p, *q; my_wchar_t *up; unsigned int c; static size_t utf_8_bufsize; static unsigned char *utf_8_buf = NULL; if (utf_8_buf == NULL) { utf_8_bufsize = 256; if ((utf_8_buf = malloc (utf_8_bufsize)) == NULL) errabort (40, "malloc() failure in %s\n", "utf_16_to_utf_8()"); } pos = 0; utf_8_buf[0] = '\0'; for (up = uc; *up != 0; up++) { c = (unsigned int)*up; if (c >= 0xD800U && c <= 0xDBFFU) if ((unsigned int)*(++up) < 0xDC00U || (unsigned int)*(up) > 0xDFFFU) { /* Oops; first wchar is okay, second wchar is wrong */ xwrap (errdebug (7, "corrupt UTF-16 char %X,%X\n", c, *(up--))); /* 0xFFFDU is the Unicode "don't know what this is" char */ c = 0xFFFDU; } else /* we have a UTF-16 extended (two-char) sequence; merge in next wchar */ c = (((c - 0xD800U) << 10) | ((unsigned int)*(up) - 0xDC00U)) + 0x0010000U; /* c is *uc, unless *uc contained a UTF-16 char (see above) */ if ((p = int_2_utf_8_string (c)) == NULL) return NULL; for (q = p; *q != '\0'; q++, pos++) { /* the " - 1" here leaves 
room for the trailing null */ if (pos >= (utf_8_bufsize - 1)) { utf_8_bufsize += 256; if ((utf_8_buf = realloc (utf_8_buf, utf_8_bufsize)) == NULL) errabort (41, "realloc() failure in %s\n", "utf_16_to_utf_8()"); } utf_8_buf[pos] = *q; } /* append the trailing '\0' */ utf_8_buf[pos] = *q; } return utf_8_buf; } /* * ucs_2_to_utf_8: * * Convert a Unicode (char = 16-bit) string to a UTF-8 string. * Copies the UTF-8 chars into a static buffer and returns a pointer * to that buffer. Note that the buffer's contents may change on * subsequent calls, so be sure to strdup anything that you want to * keep! * * Normally, utf_16_to_utf_8() should be used instead of this * function, since utf_16_to_utf_8() does everything this one does, * and also handles two-wchar UTF-16 extended characters. (UCS-2 is * the strict sixteen-bits-per-char subset of UCS-4; UTF-16 includes * a few two-wchar characters). */ char * ucs_2_to_utf_8 (my_wchar_t *uc) { size_t pos; char *p, *q; my_wchar_t *up; static size_t utf_8_bufsize; static unsigned char *utf_8_buf = NULL; if (utf_8_buf == NULL) { utf_8_bufsize = 256; if ((utf_8_buf = malloc (utf_8_bufsize)) == NULL) errabort (40, "malloc() failure in %s\n", "ucs_2_to_utf_8"); } pos = 0; for (up = uc; *up != 0; up++) { if ((p = int_2_utf_8_string ((unsigned int)*up)) == NULL) return NULL; for (q = p; *q != '\0'; q++, pos++) { if (pos >= utf_8_bufsize) { utf_8_bufsize += 256; if ((utf_8_buf = realloc (utf_8_buf, utf_8_bufsize)) == NULL) errabort (41, "realloc() failure in %s\n", "ucs_2_to_utf_8"); } utf_8_buf[pos] = *q; } /* append the trailing '\0' */ utf_8_buf[pos] = *q; } return utf_8_buf; } /* * int_2_utf_16_string: * * Convert UCS-4 char, i (here, an unsigned int), to a UTF-16 string * that sits in a static buffer. Returns a pointer to that buffer. * Note that the buffer's contents may change on subsequent calls. 
*/
/* Values below 0x10000 are stored as a single wchar; values up to
 * 0x10FFFF become a high/low surrogate pair; anything larger is
 * reported with errwarn() and replaced by U+FFFD. */
my_wchar_t *
int_2_utf_16_string (unsigned int i)
{
  static my_wchar_t buf[3];
  unsigned int offset;

  buf[0] = buf[1] = buf[2] = 0;

  if (i < 0x00010000U)
    buf[0] = i;
  else if (i > 0x0010FFFFU)
    {
      errwarn (153, "discarding out-of-range char, %X\n", i);
      /* the Unicode "dunno" character */
      buf[0] = 0x0000FFFD;
    }
  else
    {
      /* The offset needs up to 20 bits.  Keep it in an unsigned int:
       * parking it in a buf[] element would cut it down to the width
       * of my_wchar_t (16 bits, going by the comments in this file)
       * before the high ten bits are extracted. */
      offset = i - 0x10000U;
      buf[0] = ((offset >> 10) & TEN_BIT_MASK) + 0xD800U;
      buf[1] = (offset & TEN_BIT_MASK) + 0xDC00U;
    }

  return &buf[0];
}

/*
 * int_2_utf_8_string:
 *
 * Convert UCS-4 char, i (an unsigned int), to a variable-length
 * UTF-8 string of one to six bytes.  Copies the UTF-8 chars into a
 * static buffer and returns a pointer to that buffer.
 *
 * Note that the buffer's contents may change on subsequent calls,
 * so be sure to strdup anything that you want to keep!
 *
 * Note also that this routine will convert any UCS-4 quantity
 * (0 through 0x7FFFFFFF) to UTF-8.  Normally, though, we will be
 * using only UCS-2, so some of this code shouldn't get used.
 * Larger values are reported with errwarn() and encoded as U+FFFD.
 */
char *
int_2_utf_8_string (unsigned int i)
{
  unsigned int bits_left, pos;
  static unsigned char utf_8_buf[7] = { 0 };

  /* Is i out of range?  0x7FFFFFFF itself still fits the six-byte
   * form (1 + 5 * 6 = 31 payload bits), so only larger values are. */
  if (i > 0x7FFFFFFFU)
    {
      /* Replace it with the Unicode "replacement character" */
      errwarn (153, "replacing out-of-range UCS-4 char, %X\n", i);
      i = 0x0000FFFDU;
    }

  /* Lead byte: a length marker plus the topmost payload bits.
   * bits_left is the number of payload bits still to be emitted. */
  if (i < 0x00000080U)
    {
      bits_left = 0;
      utf_8_buf[0] = i & 0x7FU;
    }
  else if (i < 0x00000800U)
    {
      bits_left = 6;
      utf_8_buf[0] = ((i >> bits_left) & 0x1FU) | 0xC0;
    }
  else if (i < 0x00010000U)
    {
      bits_left = 12;
      utf_8_buf[0] = ((i >> bits_left) & 0x0FU) | 0xE0;
    }
  else if (i < 0x00200000U)
    {
      bits_left = 18;
      utf_8_buf[0] = ((i >> bits_left) & 0x07U) | 0xF0;
    }
  else if (i < 0x04000000U)
    {
      bits_left = 24;
      utf_8_buf[0] = ((i >> bits_left) & 0x03U) | 0xF8;
    }
  else
    {
      bits_left = 30;
      utf_8_buf[0] = ((i >> bits_left) & 0x01U) | 0xFC;
    }

  /* continuation bytes: six payload bits each, marked 10xxxxxx */
  pos = 1;
  while (bits_left)
    {
      bits_left -= 6;
      utf_8_buf[pos++] = ((i >> bits_left) & 0x3FU) | 0x80;
    }

  /* append trailing nil */
  utf_8_buf[pos] = '\0';

  return (char *) utf_8_buf;
}

/*
 * utf_8_to_utf_16:
 *
 * Converts UTF-8 string to UTF-16/Unicode string.  Returns a
 * pointer into a static buffer that may change on subsequent
 * calls.
 *
 * NOTE WELL: This routine uses code similar to what's used in
 * utf_8() in fileutil.c.  If you find a bug here, doubtless you'll
 * find a bug there as well.
*/
/* Malformed input never aborts the conversion: a byte that cannot
 * start a sequence (a stray continuation byte, 0xFE or 0xFF), a
 * sequence cut short by a non-continuation byte, and a value above
 * 0x10FFFF each produce one U+FFFD, and decoding resumes with the
 * next byte.  Calls errabort() if the buffer cannot be (re)allocated. */
my_wchar_t *
utf_8_to_utf_16 (const char *s)
{
  unsigned int c, i, cont;
  size_t pos, pos2, uni_pos, bytes_left, len;
  static size_t ucp_len;
  static my_wchar_t *ucp = NULL;

  /* len counts the trailing '\0' too; it is decoded like any other
   * ASCII byte, which is what terminates the output */
  len = strlen (s) + 1;

  if (ucp == NULL)
    {
      /* Unless my_wchar_t is 8 bits, this is more than enough storage:
       * no path below emits more wchars than it consumes bytes */
      if ((ucp = malloc (ucp_len = len * sizeof (my_wchar_t))) == NULL)
        errabort (40, "malloc() error in %s\n", "utf_8_to_utf_16");
    }
  else if ((len * sizeof (my_wchar_t)) > ucp_len)
    {
      if ((ucp = realloc (ucp, ucp_len = len * sizeof (my_wchar_t))) == NULL)
        errabort (41, "realloc() error in %s\n", "utf_8_to_utf_16");
    }

  uni_pos = 0;
  pos = 0;
  while (pos < len)
    {
      c = (unsigned char)s[pos];

      /* lead byte: payload bits go to i, bytes_left is the number of
       * continuation bytes that must follow */
      if ((c & 0x80) == 0x00)
        {
          /* ASCII-range */
          i = c;
          bytes_left = 0;
        }
      else if ((c & 0xE0) == 0xC0)
        {
          i = c & 0x1FU;
          bytes_left = 1;
        }
      else if ((c & 0xF0) == 0xE0)
        {
          i = c & 0x0FU;
          bytes_left = 2;
        }
      else if ((c & 0xF8) == 0xF0)
        {
          i = c & 0x07U;
          bytes_left = 3;
        }
      else if ((c & 0xFC) == 0xF8)
        {
          i = c & 0x03U;
          bytes_left = 4;
        }
      else if ((c & 0xFE) == 0xFC)
        {
          i = c & 0x01U;
          bytes_left = 5;
        }
      else
        {
          /* a continuation byte with no lead byte, or 0xFE/0xFF: stand
           * in U+FFFD for this one byte and carry on */
          errwarn (154, "out-of-sync UTF-8 char at offset %ld in %s\n",
                   (long) pos, s);
          i = 0xFFFDU;
          bytes_left = 0;
        }

      for (pos2 = pos + 1; pos2 <= pos + bytes_left; pos2++)
        {
          cont = (unsigned char)s[pos2];
          if ((cont & 0xC0) != 0x80)
            {
              /* The sequence is cut short.  The trailing '\0' is not
               * a continuation byte either, so this test also keeps
               * the scan inside the string. */
              xwrap (errdebug (7, "corrupt UTF-8 char in string %s\n", s));
              i = 0xFFFDU;
              break;
            }
          i = (i << 6) | (cont & 0x3F);
        }

      /* after a complete sequence pos2 is the next lead byte; after a
       * truncated one it is the offending byte, which is decoded anew */
      pos = pos2;

      if (i < 0x00010000U)
        ucp[uni_pos++] = i;
      else if (i > 0x0010FFFFU)
        {
          /* Some UCS-4 characters can't be represented in UTF-16.
           * Replace them with the Unicode "replacement character" */
          errwarn (153, "replacing out-of-range char, %X\n", i);
          ucp[uni_pos++] = 0x0000FFFDU;
        }
      else
        {
          /* Map i to UTF-16.  Add the "high half" zone to ucp first.
           * Then add the "low half" zone. */
          i -= 0x10000U;
          ucp[uni_pos++] = ((i >> 10) & TEN_BIT_MASK) + 0xD800U;
          ucp[uni_pos++] = (i & TEN_BIT_MASK) + 0xDC00U;
        }
    }

  return ucp;
}

#ifdef STANDALONE_UTFUTIL_TEST

#include "readcfg.h"

xmlparse_environment xmlparse_env;

/*
 * Stand-alone test driver.  Reads lines from stdin and runs each one
 * through the search, classification, number-conversion, tokenizing,
 * round-trip, case-mapping and truncation routines, printing what it
 * finds; then exercises the two whitespace-mapping routines on fixed
 * strings.
 */
int
main (int argc, char **argv)
{
  long val;
  size_t n;
  /* lbuf receives the UTF-8 form of wlinebuf; a single wchar can take
   * three bytes of UTF-8 (e.g. U+FFFD), so it is three times as big */
  char linebuf[2048], lbuf[3 * 2048];
  my_wchar_t wlinebuf[2048], wbuf[2048], *wp, **endp;

  readcfg (argc, argv);

  while (fgets (linebuf, sizeof linebuf, stdin) != NULL)
    {
      /* fgets() keeps the newline; drop it */
      n = strlen (linebuf);
      if (n > 0 && linebuf[n - 1] == '\n')
        linebuf[n - 1] = '\0';

      wlinebuf[0] = 0;
      uni_strcat (wlinebuf, utf_8_to_utf_16 (linebuf));
      wp = uni_add_string (wlinebuf);
      printf ("string = %s\n", utf_16_to_utf_8 (wp));
      printf ("uni_strlen (%s) = %lu\n", linebuf,
              (unsigned long) uni_strlen (wlinebuf));
      printf ("Does it contain a zero? ");
      fputs (uni_strchr (wlinebuf, '0') ? "yes\n" : "no\n", stdout);

      if (! (uni_strchr (wlinebuf, '0')
             == uni_strstr (wlinebuf, utf_8_to_utf_16 ("0"))))
        printf ("oops; uni_strchr and uni_strstr disagree on the 0's position!\n");
      if (! (uni_strchr (wlinebuf, '0') == uni_utf_strstr (wlinebuf, "0")))
        printf ("oops; uni_strchr and uni_utf_strstr disagree on the 0's position!\n");
      if (uni_strchr (wlinebuf, '0')
          && ! uni_isdigit (*(uni_strchr (wlinebuf, '0'))))
        printf ("Digit not found where digit was expected\n");
      if (uni_strchr (wlinebuf, '0')
          && ! uni_utf_any (uni_strchr (wlinebuf, '0'), "0123"))
        printf ("0 (zero) not found where expected\n");
      if (uni_strchr (wlinebuf, '0')
          && ! uni_isascii (*(uni_strchr (wlinebuf, '0'))))
        printf ("0 mistakenly analyzed as a non-ASCII char\n");

      if (uni_strchr (wlinebuf, '0'))
        {
          wp = uni_strchr (wlinebuf, '0');
          printf ("Exchanging this zero for Gujarati zero\n");
          *wp = 0x0AE6;
          if (! uni_isdigit (*wp))
            printf ("Gujarati zero not detected as a digit\n");
          if (uni_digit_value (*wp) != 0)
            printf ("Gujarati zero not detected as zero\n");
          if (uni_isascii (*wp))
            printf ("0AE6 mistakenly analyzed as an ASCII char\n");
        }

      if ((val = uni_strtol (wlinebuf, NULL, 0)) != LONG_MAX)
        {
          printf ("integer value of %s is %ld\n", linebuf, val);
          wp = &wlinebuf[0];
          endp = &wp;
          if (uni_strtol (wlinebuf, endp, 0) != val)
            printf ("problem with endp section of uni_strtol()\n");
          printf ("attempt to convert this as a base-10 number ");
          printf ("%s\n",
                  (uni_strtol (wlinebuf, endp, 10) != val) ? "failed" : "succeeded");
          if (wlinebuf[0] != 0 && *endp != &wlinebuf[0] && **endp == 0)
            printf ("the entire string was converted\n");
        }

      uni_strcpy (wlinebuf, utf_8_to_utf_16 (linebuf));
      printf ("tokenizing string: ");
      wp = uni_tokenize (wlinebuf);
      while (wp != NULL)
        {
          printf ("%s ", utf_16_to_utf_8 (wp));
          wp = uni_tokenize (NULL);
        }
      printf ("\n");

      strcpy (lbuf, utf_16_to_utf_8 (wlinebuf));
      uni_strcpy (wbuf, utf_8_to_utf_16 (lbuf));
      if (uni_strcmp (wlinebuf, wbuf) != 0)
        printf ("bad UTF-8 -> UTF-16 or UTF-16 -> UTF-8 conversion\n");

      /* tolower()/toupper() are only defined for unsigned char values
       * (and EOF), so wchars above 0xFF are left alone */
      for (wp = wlinebuf; *wp != 0; wp++)
        if (*wp <= 0xFF)
          *wp = tolower (*wp);
      printf ("string (lowercased) = %s\n", utf_16_to_utf_8 (wlinebuf));
      for (wp = wlinebuf; *wp != 0; wp++)
        if (*wp <= 0xFF)
          *wp = toupper (*wp);
      printf ("string (uppercased) = %s\n", utf_16_to_utf_8 (wlinebuf));

      printf ("string truncated to 8 or fewer chars = %s\n",
              utf_16_to_utf_8 (uni_truncate_to (wlinebuf, 8)));
      printf ("string truncated to 2 or fewer chars = %s\n",
              utf_16_to_utf_8 (uni_truncate_to (wlinebuf, 2)));
      printf ("\n");
    }

  wp = utf_8_to_utf_16 ("\r\n\t ");
  printf ("Testing whitespace-collapsing routine...\n");
  printf ("String of whitespace collapses to: \"%s\".\n",
          utf_16_to_utf_8 (uni_map_spaces_to_space (wp)));
  wp = utf_8_to_utf_16 ("\n\r\t hello \t\r\n");
  printf ("String \" hello \" collapses to: \"%s\".\n",
          utf_16_to_utf_8 (uni_map_spaces_to_space (wp)));
  wp = utf_8_to_utf_16 ("\n\r\t he l lo \t\r\n");
  printf ("String \" he l lo \" collapses to: \"%s\".\n",
          utf_16_to_utf_8 (uni_map_spaces_to_space (wp)));

  wp = utf_8_to_utf_16 ("\r\n\t ");
  printf ("Testing whitespace-mapping routine...\n");
  printf ("Characters in whitespace-only string collapse to: \"%s\".\n",
          utf_16_to_utf_8 (uni_map_whitespace_to_space (wp)));
  wp = utf_8_to_utf_16 ("\r\n\thello\t\r\n");
  printf ("String \"\\r\\n\\thello\\t\\r\\n\" maps to: \"%s\".\n",
          utf_16_to_utf_8 (uni_map_whitespace_to_space (wp)));
  wp = utf_8_to_utf_16 ("\n\r\t he \rl\n lo \t\r\n");
  printf ("String \"\\n\\r\\t he \\rl\\n lo \\t\\r\\n\" collapses to: \"%s\".\n",
          utf_16_to_utf_8 (uni_map_whitespace_to_space (wp)));

  uni_free_strings ();
  exit (0);
}

#endif /* STANDALONE_UTFUTIL_TEST */