/*
 *****************************************************************************
 *
 * $RCSfile: langcode.c,v $
 * $Date: 1999/05/18 18:36:20 $
 * $Source: /home/richard/Xml/RCS/langcode.c,v $
 * $Revision: 1.2 $
 * $Author: richard $
 *
 *****************************************************************************
 *
 * Copyright 1998, Brown University and Richard Goerwitz
 *
 *****************************************************************************
 *
 * Code for checking xml:lang attribute values.  Just one entry
 * point: langcode_ok (my_wchar_t *lc), which returns 1 if lc is a
 * valid xml:lang attribute value, and 0 if not.
 *
 * It is assumed that all whitespace mappings and entity expansions
 * will already have been performed.
 *
 *****************************************************************************
 */

#include "langcode.h"
#include "utfutil.h"

/*
 * Return nonzero if c is an ASCII letter (A-Z or a-z).
 *
 * This deliberately avoids isalpha(): my_wchar_t values may lie outside
 * the range of unsigned char, for which isalpha() is undefined, and
 * isalpha() is locale-dependent whereas language tags are plain ASCII.
 */
static int
langcode_is_alpha (my_wchar_t c)
{
  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

/*
 * Return a pointer to the first character at or after p that is not an
 * ASCII letter.  The terminating 0 is not a letter, so the scan always
 * stops inside a properly terminated string.
 */
static my_wchar_t *
langcode_skip_alpha (my_wchar_t *p)
{
  while (langcode_is_alpha (*p))
    p++;
  return p;
}

/*
 * langcode_ok: validate an xml:lang attribute value.
 *
 * Accepted forms (all letters are ASCII, matched case-insensitively by
 * the project comparison helpers):
 *
 *   i-NAME[-sub]*     NAME is 3-8 letters (IANA-registered name)
 *   x[-sub]+          user-defined; at least one subtag is required
 *   LL[-CC][-sub]*    LL must appear in the ISO 639 table below; the
 *                     first subtag, if present, is either 2 letters
 *                     found in the ISO 3166 table below, or 3-8 letters
 *
 * Every further "sub" is one or more letters; an empty subtag (a dash
 * followed directly by another dash or by the end of the string) is
 * rejected.
 *
 * lc is not modified; the function works on a copy obtained from
 * uni_strdup() and releases it with free() before returning.
 *
 * Returns 1 if lc is acceptable, 0 otherwise.  A NULL lc, an empty
 * string, or a failed copy (NULL from uni_strdup) all yield 0.
 */
int
langcode_ok (my_wchar_t *lc)
{
  /* Both tables must stay sorted: they are searched with bsearch().
     The element counts are derived from the arrays themselves, so new
     codes can be added without touching anything else.  The trailing
     NULL is a sentinel and is excluded from the counts. */
  static const char *langcodes[] = {
    "AA", "AB", "AF", "AM", "AR", "AS", "AY", "AZ", "BA", "BE", "BG", "BH",
    "BI", "BN", "BO", "BR", "CA", "CO", "CS", "CY", "DA", "DE", "DZ", "EL",
    "EN", "EO", "ES", "ET", "EU", "FA", "FI", "FJ", "FO", "FR", "FY", "GA",
    "GD", "GL", "GN", "GU", "HA", "HI", "HR", "HU", "HY", "IA", "IE", "IK",
    "IN", "IS", "IT", "IW", "JA", "JI", "JV", "KA", "KK", "KL", "KM", "KN",
    "KO", "KS", "KU", "KY", "LA", "LN", "LO", "LT", "LV", "MG", "MI", "MK",
    "ML", "MN", "MO", "MR", "MS", "MT", "MY", "NA", "NE", "NL", "NO", "OC",
    "OM", "OR", "PA", "PL", "PS", "PT", "QU", "RM", "RN", "RO", "RU", "RW",
    "SA", "SD", "SG", "SH", "SI", "SK", "SL", "SM", "SN", "SO", "SQ", "SR",
    "SS", "ST", "SU", "SV", "SW", "TA", "TE", "TG", "TH", "TI", "TK", "TL",
    "TN", "TO", "TR", "TS", "TT", "TW", "UK", "UR", "UZ", "VI", "VO", "WO",
    "XH", "YO", "ZH", "ZU", NULL
  };
  static const char *countrycodes[] = {
    "AD", "AE", "AF", "AG", "AI", "AL", "AM", "AN", "AO", "AQ", "AR", "AS",
    "AT", "AU", "AW", "AZ", "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
    "BJ", "BM", "BN", "BO", "BR", "BS", "BT", "BV", "BW", "BY", "BZ", "CA",
    "CC", "CD", "CF", "CG", "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CR",
    "CU", "CV", "CX", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO", "DZ", "EC",
    "EE", "EG", "EH", "ER", "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
    "FX", "GA", "GB", "GD", "GE", "GF", "GH", "GI", "GL", "GM", "GN", "GP",
    "GQ", "GR", "GS", "GT", "GU", "GW", "GY", "HK", "HM", "HN", "HR", "HT",
    "HU", "ID", "IE", "IL", "IN", "IO", "IQ", "IR", "IS", "IT", "JM", "JO",
    "JP", "KE", "KG", "KH", "KI", "KM", "KN", "KP", "KR", "KW", "KY", "KZ",
    "LA", "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY", "MA",
    "MC", "MD", "MG", "MH", "MK", "ML", "MM", "MN", "MO", "MP", "MQ", "MR",
    "MS", "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", "NC", "NE", "NF",
    "NG", "NI", "NL", "NO", "NP", "NR", "NU", "NZ", "OM", "PA", "PE", "PF",
    "PG", "PH", "PK", "PL", "PM", "PN", "PR", "PT", "PW", "PY", "QA", "RE",
    "RO", "RU", "RW", "SA", "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
    "SK", "SL", "SM", "SN", "SO", "SR", "ST", "SV", "SY", "SZ", "TC", "TD",
    "TF", "TG", "TH", "TJ", "TK", "TM", "TN", "TO", "TP", "TR", "TT", "TV",
    "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", "VA", "VC", "VE", "VG",
    "VI", "VN", "VU", "WF", "WS", "YE", "YT", "YU", "ZA", "ZM", "ZW", NULL
  };

  my_wchar_t *tmp_lc, *wp, *start, *end, saved;
  int found;
  int ok = 0;

  if (lc == NULL)
    return 0;

  /* modify a copy of lc - not lc itself */
  tmp_lc = uni_strdup (lc);
  if (tmp_lc == NULL)
    return 0;
  wp = tmp_lc;

  /* the empty string is never a valid language code */
  if (*wp == 0)
    goto done;

  if (uni_utf_strncasecmp (wp, "i-", 2) == 0)
    {
      /* i- precedes a non-ISO 639 language name; the name itself must
         be 3-8 letters long, e.g., i-cherokee */
      start = wp + 2;
      end = langcode_skip_alpha (start);
      if (end - start < 3 || end - start > 8)
        goto done;
      wp = end;
    }
  else if (uni_utf_strncasecmp (wp, "x-", 2) == 0)
    {
      /* x- signals a user-defined language name scheme; leave wp on
         the dash so the generic subtag loop below checks what follows */
      wp++;
    }
  else
    {
      /* section off the initial letters and temporarily terminate the
         copy there so they can be looked up as a string */
      wp = langcode_skip_alpha (wp);
      saved = *wp;
      *wp = 0;
      found = bsearch (&tmp_lc, langcodes,
                       sizeof langcodes / sizeof langcodes[0] - 1,
                       sizeof langcodes[0],
                       uni_utf_ptr_ptr_casecmp) != NULL;
      *wp = saved;

      /* not an ISO 639 language code */
      if (!found)
        goto done;

      if (*wp == '-')
        {
          /* first subtag: a 2-letter ISO 3166 country code, or a
             3-8 letter name */
          start = wp + 1;
          end = langcode_skip_alpha (start);
          if (end - start < 2 || end - start > 8)
            goto done;

          if (end - start == 2)
            {
              /* make sure the country code is valid */
              saved = *end;
              *end = 0;
              found = bsearch (&start, countrycodes,
                               sizeof countrycodes / sizeof countrycodes[0] - 1,
                               sizeof countrycodes[0],
                               uni_utf_ptr_ptr_casecmp) != NULL;
              *end = saved;
              if (!found)
                goto done;
            }
          wp = end;
        }
    }

  /* any number of further subtags, each a dash followed by at least
     one letter */
  while (*wp == '-')
    {
      start = wp + 1;
      end = langcode_skip_alpha (start);
      if (end == start)
        goto done;
      wp = end;
    }

  /* success only if the whole string was consumed */
  ok = (*wp == 0);

 done:
  free (tmp_lc);
  return ok;
}

#ifdef STANDALONE_LANGCODE_TEST

xmlparse_environment xmlparse_env;

/*
 * Test driver: read candidate language codes from standard input, one
 * per line, and report what langcode_ok() returns for each.
 */
int
main (void)
{
  char p[1024];
  char *nl;
  my_wchar_t *wp;

  /* fgets() bounds the read to the buffer, unlike gets() */
  while (fgets (p, sizeof p, stdin))
    {
      /* fgets() keeps the newline; cut the line there */
      for (nl = p; *nl != '\0' && *nl != '\n'; nl++)
        ;
      *nl = '\0';

      /* NOTE(review): ownership of the buffer returned by
         utf_8_to_utf_16() is not visible here - confirm whether it
         must be freed by the caller. */
      wp = utf_8_to_utf_16 (p);
      printf ("for %s langcode_ok() returns %d\n", p, langcode_ok (wp));
    }
  return 0;
}

#endif /* STANDALONE_LANGCODE_TEST */