/* ***************************************************************************** * * $RCSfile: fileutil.c,v $ * $Date: 1999/06/19 21:18:50 $ * $Source: /home/richard/Xml/RCS/fileutil.c,v $ * $Revision: 1.116 $ * $Author: richard $ * ***************************************************************************** * * Copyright 1998, 1999 Brown University and Richard Goerwitz * ***************************************************************************** * * General utility routines for reading files, recognizing file * formats, etc. * * getline() - reads in complete lines from a file, with no UCS to * UTF-8 conversions (backslash line-continuations are okay). Used * to read in lines from configuration files. Call syntax for * getline: * * getline (FILE *f, char *filename, int *lineno_p) * * where f is the result of fopen'ing the file called filename (arg * 2), and where lineno_p is a pointer to an integer initialized to * zero before the first call to getline. Returns a pointer to a * (changeable) static buffer. Strcpy() anything you want to keep! * * read_entire_xml_file (struct xml_file *xf) - reads the contents of * xf->file into a static (my_wchar_t *) buffer and returns a pointer * to that buffer. Returns NULL on ferror. * * add_xml_file (struct xml_file *parent_xf, struct xml_file *child_xf) * Makes child_xf a child of parent_xf. This is useful if you want * child_xf to be freed automatically when parent_xf is freed, and * if you want processing errors reported for both files together * (as when you have a parent document instance and a child external * DTD). Note that making child_xf a child of parent_xf implies a * merger of their respective entity and notation tables. Duplicate * entries will be flagged via add_xml_warning(); conflicting ones * will be flagged as errors via add_xml_error(). * * create_xml_file (char *fname) - Open fname, figure out its format, * and return a pointer to an xml_file structure that contains all * that we need to process it. 
See also free_xml_file() below. * Returns a pointer to an xml_file. * * create_xml_tmpfile (char *URI, char *tmpfname) - Same as above * (create_xml_file()), except that it takes both a URI and a * temporary filename (generated, presumably, by tmpnam) as args. * Returns a pointer to an xml_file. Note that tmpfname will be * unlinked when this xml_file is freed (see free_xml_file() * below). * * rewind_xml_file (xml_file *xf) - Rewind xf to the beginning (resets * xf->lineno to zero, and calls rewind (xf->file). * * free_xml_file (xml_file *xf) - Free xf and all its child xml_file * structures (on which, see add_xml_file() above). * * add_xml_error(xml_file *xf, int num, my_wchar_t *text) - adds notice * of a processing error to an xml_file struct; num (arg 2) refers * to an entry in the message catalog * * add_xml_warning(xml_file *xf, int num, my_wchar_t *text) - adds * notice of a processing problem (a warning; not an error) to an * xml_file struct * * report_all_errors_and_warnings (struct xml_file *xf) - send to * stdout a report of all errors (see add_xml_error()) and warnings * (see add_xml_warning()) encountered while processing xf and any * of its children (on children, see add_xml_file()). Return the * total number of actual errors (not warnings) issued. 
* * create_xml_element (xml_file *xf, my_wchar_t *name, enum content_types type, void *content_model) * create new xml_element structure for name (arg 2), with a null * attribute list; content_model is null for type = empty or Any; * for type = mixed, it is an array of (my_wchar_t *) pointers (last * one is NULL); for type = children, it's a tree * * create_xml_attribute (xml_file *xf, my_wchar_t *name, enum attribute_types * type, size_t nmtoklen, my_wchar_t **nmtokens, enum default_types * default_type, my_wchar_t *default_val) - create an xml_attribute * structure with name = name (arg1), type = type (arg 2), and with * other appropriate settings (e.g., if type is either 'notation' * or 'enumeration' then nmtokens is a string list) * * free_xml_element (struct xml_element *xe) - free an xml_element * struct (called by free_xml_file(), and by some of the error * handling routines in the parser) * * free_xml_attribute (struct xml_attribute *xa) - free an * xml_attribute struct (called by free_xml_element() above, and also * by some error-handling routines in the parser) * * Error numbers from 140 to 159. 
* ***************************************************************************** */ #include "fileutil.h" #include "isotabs.h" #include "errabort.h" #include "getmessage.h" #include "grammutil.h" #include "hashutil.h" #include "lockutil.h" #include "nfadfa.h" #include "parsutil.h" #include "sjistabs.h" #include "strutil.h" #include "utfutil.h" #include "xtrautil.h" #define TEN_BIT_MASK ((1U << 10) - 1U) enum hashtype { element_names, notation_names, parameter_entity_names, external_entity_names, unparsed_entity_names, entity_names, id_refs, ids }; enum malformation_type { its_okay, improperly_terminated, spurious_byte_order_char, spurious_whitespace }; /* used by merge_rg_htables; NOFREE must be zero */ #define FREEOLD 1 #define NOFREE 0 #define skip_whitespace(p) { while (uni_isspace (*p)) p++; } static my_wchar_t *check_for_encoding_decl (xml_file *); static enum malformation_type parse_and_remove_text_decl (struct xml_file *, my_wchar_t *); static int is_recursive (struct xml_file *, struct xml_file *); static int merge_rg_htables (struct xml_file *, struct xml_file *, enum hashtype, int); static xml_error *create_xml_error (int, my_wchar_t *, int, long); static xml_warning *create_xml_warning (int, my_wchar_t *, int, long); static void free_xml_error (xml_error *); static void free_xml_warning (xml_warning *); static int (*(detect_input_format (xml_file *)))(xml_file *); static int empty_file (xml_file *); static int ascii_compatible (xml_file *); static int ucs_4_big_endian (xml_file *); static int ucs_4_little_endian (xml_file *); static int shift_jis (xml_file *); static int iso8859_1 (xml_file *); static int iso8859_2 (xml_file *); static int iso8859_3 (xml_file *); static int iso8859_4 (xml_file *); static int iso8859_5 (xml_file *); static int iso8859_6 (xml_file *); static int iso8859_7 (xml_file *); static int iso8859_8 (xml_file *); static int iso8859_9 (xml_file *); static int iso8859_14 (xml_file *); static int iso8859_15 (xml_file *); static int 
iso8859 (xml_file *, int); static int utf_16_big_endian (xml_file *); static int utf_16_little_endian (xml_file *); static int ucs_2_big_endian (xml_file *); static int ucs_2_little_endian (xml_file *); static int utf_8 (xml_file *); /* used qsort */ static int report_errors_and_warnings (struct xml_file *, int *); /* used by qsort */ int compare_xml_errors (const void *, const void *); int compare_xml_warnings (const void *, const void *); /* list of built-in entites (ones that don't have to be declared; if * you change this here, change it also in the create_xml_(tmp)file * routines below (where it is defined somewhat differently) */ const char *built_in_entity_list[] = { "amp", "gt", "lt", "apos", "quot", NULL }; /* * getline * * Like fgets(), in a way, but concatenates lines ending in a * backslash (\) and takes no "buffer" or "length" argument, * but rather copies each line into a static buffer that may * change on subsequent calls. Returns NULL on EOF. * * NOTE WELL: This routine assumes ASCII input. It's used * only on things like configuration files, which shouldn't * be using anything above character 0x80! */ char * getline (FILE *f, char *filename, int *lineno_p) { size_t len; char *p, *p2; static size_t buflen; static char *buf = NULL; if (buf == NULL) { buflen = 256; if ((buf = malloc (buflen * sizeof (char))) == NULL) errabort (40, "malloc error in %s\n", "getline()"); } /* Grab a line - or at least as much as buf will currently hold */ if (! fgets (buf, buflen, f)) { xwrap (errdebug (5, "fgets() finished for %s\n", filename)); return NULL; } /* increment line number passed to us as a pointer to an int */ xwrap (errdebug (5, "read line #%d from %s: %s\n", ++*lineno_p, filename, buf)); /* Warn the user now if there was a problem. 
*/ if (ferror (f)) errwarn (140, "I/O error reading %s\n", filename); len = strlen (buf); if (len > 0) { /* If the buffer was too small to hold the line, then expand it */ while (buf[len - 1] != '\n') { xwrap (errdebug (7, "no trailing newline; buffer too small; expanding it\n")); buflen *= 2; if ((buf = realloc (buf, buflen)) == NULL) errabort (41, "realloc error in %s\n", "getline()"); p = fgets (&buf[len], buflen - len, f); len += strlen (&buf[len]); if (p == NULL) { xwrap (errdebug (7, "ran out of input before finding a newline\n")); goto fgets_failed; } } /* Check for a line-final backslash; if present, append next line */ if (len > 1 && buf[len - 1] == '\n' && buf[len - 2] == '\\') { xwrap (errdebug (7, "line ends in backslash; concatenating with next\n")); do { len -= 2; p = fgets (&buf[len], buflen - len, f); for (p2 = &buf[len]; isspace (*p2); p2++) continue; strcpy (&buf[len], p2); len += strlen (&buf[len]); if (p == NULL) { xwrap (errdebug (7, "ran out of input before finding newline\n")); goto fgets_failed; } /* Same thing as above; expand buffer if it's too small */ while (buf[len - 1] != '\n') { xwrap (errdebug (7, "buffer too small; expanding it\n")); buflen *= 2; if ((buf = realloc (buf, buflen)) == NULL) errabort (41, "realloc error in %s\n", "getline()"); if (! fgets (&buf[len], buflen - len, f)) goto fgets_failed; len += strlen (&buf[len]); } (*lineno_p)++; } while (buf[len - 1] == '\n' && buf[len - 2] == '\\'); } fgets_failed: /* If we get to here, either fgets() failed, or we got a * full line. Either way, return whatever we've got. */ xwrap (errdebug (3, "read line #%d from %s: %s\n", *lineno_p, filename, buf)); return buf; } /* no data */ xwrap (errdebug (5, "no more data from %s\n", filename)); return NULL; } /* * read_entire_xml_file * * Read in the entire text of xml_file xf, and return it as a * pointer to a static (my_wchar_t *) buffer. Note that the contents * of this buffer may change on subsequent calls. 
Returns NULL * on error (presumably a read error). */ my_wchar_t * read_entire_xml_file (struct xml_file *parent, struct xml_file *xf) { size_t i, len; my_wchar_t *errmsg; static size_t wp_buflen; static my_wchar_t *wp = NULL; struct xml_file *error_xf; if (wp == NULL) { wp_buflen = 64; if ((wp = malloc (wp_buflen * sizeof (my_wchar_t))) == NULL) errabort (40, "malloc() error in %s\n", "read_entire_xml_file()"); } *wp = 0; /* put an initial chunk into wp */ rewind_xml_file (xf); i = read_xml_file (xf, wp, wp_buflen); for (len = i; i > 0; len += i) { /* check for errors */ if (ferror (xf->file)) { errmsg = utf_8_to_utf_16 ("(token unspecified)"); add_xml_error (xf, 320, errmsg); return wp; } len--; /* nix trailing nil */ if ((len + 64) > wp_buflen) { wp_buflen = len + 64; if ((wp = realloc (wp, wp_buflen * sizeof (my_wchar_t))) == NULL) errabort (41, "realloc() error in %s\n", "read_entire_xml_file()"); } i = read_xml_file (xf, &wp[len], wp_buflen - len); } /* check and remove TextDecl */ switch (parse_and_remove_text_decl (xf, wp)) { case its_okay: break; case improperly_terminated: /* * errmsg = utf_8_to_utf_16 (">"); * add_xml_error (xf, 364, errmsg); */ break; case spurious_byte_order_char: /* * errmsg = utf_8_to_utf_16 ("(at start of external entity)"); * add_xml_error (xf, ???, errmsg); */ break; case spurious_whitespace: error_xf = xf; errmsg = utf_8_to_utf_16 (" "); if (xf->child == maybe) { errmsg = utf_8_to_utf_16 ("(at start of external entity)"); error_xf = (parent ? parent : xf); } add_xml_error (error_xf, 386, errmsg); break; } xwrap (errdebug (7, "read in external entity replacement text\n")); return wp; } /* * parse_and_remove_text_decl * * If there's a TextDecl (a truncated XML decl - one lacking a * 'standalone' decl), then parse it lightly, and strip it out. * This routine is destructive, in the sense that it alters the * contents of its argument, wstr. 
* * Note well: If you change how this works, change also how * the EncodingDecl rule is handled in parsutil.y. */ static enum malformation_type parse_and_remove_text_decl (struct xml_file *xf, my_wchar_t *wstr) { my_wchar_t c, *p, *wp, *wp2; int i, found_xml_decl, old_lineno; enum malformation_type how_is_it_malformed; int starts_with_space, starts_with_byte_order_char; xwrap (errdebug (7, "checking for text decl in external entity (will remove if found)\n")); xwrap (errdebug (7, "external entity = %s\n", utf_16_to_utf_8 (uni_truncate_to (wstr, 20)))); /* assume for now that the TextDecl is well-formed */ how_is_it_malformed = its_okay; wp = &wstr[0]; found_xml_decl = 0; old_lineno = xf->lineno; /* strip out string-initial byte-order chars */ starts_with_byte_order_char = 0; while (*wp == 0xFEFF || *wp == 0xFFFE) wp++; if (wp > &wstr[0]) { starts_with_byte_order_char = 1; if ((wp - &wstr[0]) > 1) how_is_it_malformed = spurious_byte_order_char; } /* hmmm; if a TextDecl gets found later on, these spaces are invalid */ starts_with_space = 0; if (uni_isspace (*wp)) { skip_whitespace (wp); starts_with_space = 1; } /* now, look for xml declaration */ if (uni_utf_strncmp (wp, "' && *wp != 0) wp++; } else { c = *wp2; *wp2 = 0; for (p = wp; p < wp2; p++) *wp2 = c; wp = wp2 + 1; } } /* XML spec requires whitespace here (which is silly) */ skip_whitespace (wp); if (uni_utf_strncasecmp (wp, "encoding", 8) == 0) { wp += 8; skip_whitespace (wp); wp++; skip_whitespace (wp); c = *wp++; if ((wp2 = uni_strchr (wp, c)) == NULL) { while (*wp != '>' && *wp != 0) wp++; } else { /* skip past encoding decl value */ wp = wp2 + 1; } } /* XML spec requires whitespace here (which is silly) */ skip_whitespace (wp); if (uni_utf_strncasecmp (wp, "standalone", 10) == 0) { wp += 10; skip_whitespace (wp); if (*wp == '=') { wp++; skip_whitespace (wp); } if (*wp == '"' || *wp == '\'') { c = *wp++; if ((wp2 = uni_strchr (wp, c)) == NULL) while (*wp != '>' && *wp != 0) wp++; else wp = wp2 + 1; } } 
skip_whitespace (wp); if (*wp != '>') while (*wp != '>' && *wp != 0) wp++; if (*wp != '>') found_xml_decl = 0; else { if (*(wp - 1) != '?') /* oops; TextDecl ends in >, not ?> */ how_is_it_malformed = improperly_terminated; wp++; } } if (found_xml_decl) { /* remove the text decl from wstr */ for (i = 0; *wp != 0; wp++) wstr[i++] = *wp; wstr[i] = 0; if (starts_with_space) how_is_it_malformed = spurious_whitespace; } else if (starts_with_byte_order_char) { /* skip over, ... */ for (wp = &wstr[0]; *wp == 0xFEFF || *wp == 0xFFFE; wp++); /* ...and remove, initial byte-order characters */ for (i = 0; *wp != 0; wp++) wstr[i++] = *wp; wstr[i] = 0; } /* nothing to return (although wp may have been modified) */ xwrap (errdebug (5, "removed%s text decl from external entity\n", found_xml_decl ? "" : " no")); xwrap (errdebug (7, "external entity now = %s\n", utf_16_to_utf_8 (uni_truncate_to (wstr, 20)))); /* restore original line position */ xf->lineno = old_lineno; return how_is_it_malformed; } /* * set_encoding: * * Manually sets the encoding for xf (arg 1) according to the string * supplied as arg 2. E.g., if arg 2 is "UTF-8", xf's get_xml_char * routine is set to the utf_8() function. Comparisons are case- * insensitive, and offer some leeway (e.g., "utf-8", "utf_8", are * both okay as well). * * Note that by the time this function gets called, xf's (arg 1) * file format has already been detected and xf->get_xml_char * has been set already. We can therefore check whether encoding * (arg 2) and xf->get_xml_char are incompatible, and emit an error * message if they are not. */ struct xml_file * set_encoding (struct xml_file *xf, my_wchar_t *encoding) { my_wchar_t *tmp; size_t i, j, len; int format_error = 0; tmp = uni_strdup (encoding); len = uni_strlen (tmp); for (i = j = 0; i < len; i++) { tmp[i] = uni_toupper (tmp[i]); if (tmp[i] < 0x80 && isalnum (tmp[i])) tmp[j++] = tmp[i]; else if (tmp[i] >= 0x80 || (tmp[i] != '-' && tmp[i] != '_' && ! 
isalnum (tmp[i]))) format_error = 1; } if (format_error) add_unique_error (xf, 383, encoding); tmp[j] = 0; switch (tmp[0]) { case 'A': if (uni_utf_strcmp (tmp, "ASCII") == 0) { if (xf->get_xml_char != utf_8) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); } break; case 'E': if (uni_utf_strcmp (tmp, "EUCJP") == 0) { /* unsupported encoding */ add_unique_error (xf, 382, encoding); } else { /* unknown encoding */ add_unique_error (xf, 381, encoding); } break; case 'I': if (uni_utf_strcmp (tmp, "ISO2022JP") == 0) { /* unsupported encoding */ add_unique_error (xf, 382, encoding); goto done; } else if (uni_utf_strncmp (tmp, "ISO10646UCS", 11) == 0) { if (tmp[11] && ! tmp[12]) { switch (tmp[11]) { case '2': /* if utf-16 was already autodetected, we can shift to ucs-2... */ if (xf->get_xml_char == utf_16_big_endian) xf->get_xml_char = ucs_2_big_endian; else if (xf->get_xml_char == utf_16_little_endian) xf->get_xml_char = ucs_2_little_endian; else if (! (xf->get_xml_char == ucs_2_big_endian || xf->get_xml_char == ucs_2_little_endian)) /* ...but otherwise ucs-2 encoding is incompatible * with the detected file format */ add_unique_error (xf, 384, encoding); goto done; case '4': if (! 
(xf->get_xml_char == ucs_4_big_endian || xf->get_xml_char == ucs_4_little_endian)) add_unique_error (xf, 384, encoding); goto done; } } } else if (uni_utf_strncmp (tmp, "ISO8859", 7) == 0) { switch (tmp[7]) { case '1': switch (tmp[8]) { case 0: if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_1; else if (xf->get_xml_char != iso8859_1) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; case '4': if (tmp[9] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_14; else if (xf->get_xml_char != iso8859_14) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '5': if (tmp[9] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_15; else if (xf->get_xml_char != iso8859_15) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } } break; case '2': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_2; else if (xf->get_xml_char != iso8859_2) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '3': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_3; else if (xf->get_xml_char != iso8859_3) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '4': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_4; else if (xf->get_xml_char != iso8859_4) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '5': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_5; else if (xf->get_xml_char != iso8859_5) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '6': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_6; else if (xf->get_xml_char != 
iso8859_6) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '7': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_7; else if (xf->get_xml_char != iso8859_7) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '8': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_8; else if (xf->get_xml_char != iso8859_8) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } case '9': if (tmp[8] == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = iso8859_9; else if (xf->get_xml_char != iso8859_9) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); goto done; } } } /* unknown encoding */ add_unique_error (xf, 381, encoding); done: break; case 'S': if (uni_utf_strcmp (tmp, "SHIFTJIS") == 0) { if (xf->get_xml_char == utf_8) xf->get_xml_char = shift_jis; else if (xf->get_xml_char != shift_jis) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); } else { /* unknown encoding */ add_unique_error (xf, 381, encoding); } break; case 'U': if (uni_utf_strcmp (tmp, "USASCII") == 0) { if (xf->get_xml_char != utf_8) /* encoding is incompatible with actual file format */ add_unique_error (xf, 384, encoding); } else if (uni_utf_strcmp (tmp, "UTF8") == 0) { if (xf->get_xml_char != utf_8) add_unique_error (xf, 384, encoding); } else if (uni_utf_strncmp (tmp, "UTF16", 6) == 0) { if (! (xf->get_xml_char == utf_16_big_endian || xf->get_xml_char == utf_16_little_endian)) add_unique_error (xf, 384, encoding); } else if (uni_utf_strcmp (tmp, "UCS2") == 0) { if (xf->get_xml_char == utf_16_big_endian) xf->get_xml_char = ucs_2_big_endian; else if (xf->get_xml_char == utf_16_little_endian) xf->get_xml_char = ucs_2_little_endian; else if (! 
(xf->get_xml_char == ucs_2_big_endian || xf->get_xml_char == ucs_2_little_endian)) /* ...but otherwise ucs-2 encoding is incompatible * with the detected file format */ add_unique_error (xf, 384, encoding); } else if (uni_utf_strcmp (tmp, "UCS4") == 0) { if (! (xf->get_xml_char == ucs_4_big_endian || xf->get_xml_char == ucs_4_little_endian)) add_unique_error (xf, 384, encoding); } else /* unknown encoding */ add_unique_error (xf, 381, encoding); break; default: /* unknown encoding */ add_unique_error (xf, 381, encoding); break; } free (tmp); return xf; } /* * read_xml_file: * * Read xml_file xf, doing whatever conversions are necessary to turn * it into UTF-16/Unicode, storing the read characters in buf. Arg 3, * how_many_wchars, give the count of 16-bit Unicode chars to store * in buf. * * Places up to how_many_wchars - 1 wchars in buf, then appends a 0. * Returns the number of characters placed in buf (including the 0); */ size_t read_xml_file (struct xml_file *xf, my_wchar_t *buf, size_t how_many_wchars) { my_wchar_t c = 0; size_t i, stop; if (how_many_wchars < 2) errabort (15, "buffer too small in %s\n", "read_xml_file()"); if (! xf->file || feof (xf->file)) return 0; stop = how_many_wchars - 1; for (i = 0; i < stop; i++) { if ((c = xf->get_xml_char (xf)) != (my_wchar_t)EOF) buf[i] = c; else break; } buf[i] = 0; if (i == 0 && c == (my_wchar_t)EOF) return 0; return i + 1; } /* * is_recursive * * An xml_file can't have itself as its own ancestor. In other * words, it can't have itself as its child. (Same thing.) The way * xml_file structures are implemented, they have both parents and * children. When considering whether to add a child, one must * first go through the parent, the parent's parent, etc. to see if * they point to the same URL/file as the potential child. If any * of them do, then the child must be rejected. Otherwise, the * parser may go into an infinite loop. 
* * Returns 1 if adding the child would cause an infinite loop, i.e., * if it is its own parent. Otherwise returns 0. */ static int is_recursive (struct xml_file *parent_xf, struct xml_file *child_xf) { struct xml_file *tmp_xf; tmp_xf = parent_xf; while (tmp_xf) { if (tmp_xf == child_xf || strcmp (tmp_xf->filename, child_xf->filename) == 0) { /* Oops. Parent must have contained an external entity * reference that pointed to a file that eventually pointed * back to one of its ancestors. This kind of recursion is * unacceptable. */ add_xml_error (parent_xf, 770, utf_8_to_utf_16 (child_xf->filename)); return 1; } tmp_xf = tmp_xf->parent; } /* We're okay; there is no recursion. */ return 0; } /* * add_xml_file_nomerge * * Adds a new child xml_file (child_xf) to parent_xf, but don't * merge hashtables. The only reason to use this routine is to make * sure that this child is garbage collected later on. Use it on * files holding external entities. * * See add_xml_file below for a fuller version of this routine, * meant for more extensive merging of the parent and child data * structures. Use add_xml_file below to merge the external DTD * with the internal one. * * Returns the number of children, total, for parent_xf. Returns 0 * on error. */ size_t add_xml_file_nomerge (struct xml_file *parent_xf, struct xml_file *child_xf) { size_t i; /* What are we doing? 
*/ xwrap (errdebug (7, "converting a parent xml_file into a pseudo-child\n")); if (parent_xf == child_xf) errabort (158, "can't make parent_xf a child of itself\n"); if (is_recursive (parent_xf, child_xf)) return 0; /* check to be sure child_xf isn't already a child of parent_xf */ for (i = 0; i < parent_xf->filelen; i++) if (parent_xf->xml_files[i] == child_xf) { xwrap (errdebug (5, "child_xf is already a child of parent_xf\n")); return parent_xf->filelen; } if (child_xf->child == yes || child_xf->child == maybe) errabort (156, "xml_file is already someone else's child\n"); if (parent_xf->filelen > 0) parent_xf->xml_files = realloc (parent_xf->xml_files, ++parent_xf->filelen * sizeof (xml_file *)); else { parent_xf->filelen = 1; parent_xf->xml_files = malloc (sizeof (xml_file *)); } parent_xf->xml_files[parent_xf->filelen - 1] = child_xf; /* give the child a copy of the parent's tables */ merge_rg_htables (child_xf, parent_xf, element_names, NOFREE); merge_rg_htables (child_xf, parent_xf, notation_names, NOFREE); merge_rg_htables (child_xf, parent_xf, parameter_entity_names, NOFREE); merge_rg_htables (child_xf, parent_xf, external_entity_names, NOFREE); merge_rg_htables (child_xf, parent_xf, unparsed_entity_names, NOFREE); merge_rg_htables (child_xf, parent_xf, entity_names, NOFREE); merge_rg_htables (child_xf, parent_xf, id_refs, NOFREE); merge_rg_htables (child_xf, parent_xf, ids, NOFREE); /* this is a child xml_file structure */ child_xf->child = maybe; child_xf->parent = parent_xf; xwrap (errdebug (5, "converted a parent xml_file into a pseudo-child\n")); return parent_xf->filelen; } /* * add_xml_file * * Adds a new child xml_file (child_xf) to parent_xf. That is, * makes child_xf (arg2; an unassociated parent xml_file) into a * child of parent_xf (arg1). * * Xml_file structures are added to a top-level xml_file structure * while parsing its associated file. See fileutil.h for info on * the xml_file structure. 
See also dtdutil.c on how public and * system identifiers are resolved (new xml_file structs are * generated this way). * * Returns the number of children, total, for parent_xf. Returns 0 * on error. */ size_t add_xml_file (struct xml_file *parent_xf, struct xml_file *child_xf) { size_t i; /* What are we doing? */ xwrap (errdebug (7, "converting a parent xml_file into a child\n")); if (parent_xf == child_xf) errabort (158, "can't make parent_xf a child of itself\n"); if (is_recursive (parent_xf, child_xf)) return 0; /* check to be sure child_xf isn't already a child of parent_xf */ for (i = 0; i < parent_xf->filelen; i++) if (parent_xf->xml_files[i] == child_xf) { xwrap (errdebug (5, "child_xf is already a child of parent_xf\n")); return parent_xf->filelen; } if (child_xf->child == yes || child_xf->child == maybe) errabort (156, "xml_file is already someone else's child\n"); if (parent_xf->filelen > 0) parent_xf->xml_files = realloc (parent_xf->xml_files, ++parent_xf->filelen * sizeof (xml_file *)); else { parent_xf->filelen = 1; parent_xf->xml_files = malloc (sizeof (xml_file *)); } parent_xf->xml_files[parent_xf->filelen - 1] = child_xf; /* child's hashtables are just pointers to the parent's */ merge_rg_htables (parent_xf, child_xf, element_names, FREEOLD); merge_rg_htables (parent_xf, child_xf, notation_names, FREEOLD); merge_rg_htables (parent_xf, child_xf, parameter_entity_names, FREEOLD); merge_rg_htables (parent_xf, child_xf, external_entity_names, FREEOLD); merge_rg_htables (parent_xf, child_xf, unparsed_entity_names, FREEOLD); merge_rg_htables (parent_xf, child_xf, entity_names, FREEOLD); merge_rg_htables (parent_xf, child_xf, id_refs, FREEOLD); merge_rg_htables (parent_xf, child_xf, ids, FREEOLD); /* this is a child xml_file structure */ child_xf->child = yes; child_xf->parent = parent_xf; xwrap (errdebug (5, "converted a parent xml_file into a child\n")); return parent_xf->filelen; } /* * merge_rg_htables * * Merge elements in xf2's hash table into 
xf1's (reject * duplicates). Returns a count of elements added to xf1's * hashtable. * * If freeme is set (i.e., nonzero; use the FREEME define above), * merge_rg_htables() will free all storage used by xf2's * hashtables, than point all of xf2's hashtables at xf1's. This * behavior is useful if xf2 is a child of xf1, and we want * everything defined in it (e.g., new entities, element types) to * be defined also in the parent. * * A typical case when we wouldn't want to do this is if we are * going through an external parameter entity. In this case, we * want to read entity, element, etc. definitions, and enter them * into the local hashtables. But we don't want these changes * reflected in the parent. * * In such cases, call merge_rg_htables() with arg 4 (freeme) unset * (i.e., zero; in fact, use the NOFREE define). */ int merge_rg_htables (struct xml_file *xf1, struct xml_file *xf2, enum hashtype type, int freeme) { size_t i; void *cmodel; int count = 0; const char **pp; my_wchar_t **wps, **wps2; struct rg_htable *ht; struct xml_element *xe, *xe2; struct xml_unparsed_entity *xue; struct rg_htable_item it, *result; switch (type) { case element_names: ht = xf2->element_names; xwrap (errdebug (7, "merging element tables\n")); result = rg_get_htable_items (ht); while (result != NULL) { xe = (xml_element *)result->data; cmodel = xe->content_model; if (! 
freeme) { /* If we're not freeing the merged xf, then we need to make a * copy of the content model if its type is children or mixed */ if (xe->type == children) cmodel = copy_cmnode (xe->content_model); else if (xe->type == mixed) if ((wps = (my_wchar_t **)xe->content_model)) { for (i = 0; wps[i] != NULL; i++); wps2 = malloc ((i + 1) * sizeof (my_wchar_t *)); for (i = 0; wps[i] != NULL; i++) wps2[i] = uni_strdup (wps[i]); wps2[i] = NULL; cmodel = wps2; } } if ((xe2 = add_element (xf1, result->uni_key, xe->type, cmodel)) == NULL) { if (freeme) free_xml_element (xe); } else { count++; xe2->flags |= xe->flags; if ((xe2->ancestorlen = xe->ancestorlen) == 0) xe2->ancestors = NULL; else { if ((xe2->ancestors = malloc (xe2->ancestorlen * (sizeof (struct xml_element *)))) == NULL) errabort (40, "malloc error in %s\n", "merge_rg_htables()"); memcpy (xe2->ancestors, xe->ancestors, xe2->ancestorlen * (sizeof (struct xml_element *))); if (freeme) free (xe->ancestors); } if ((xe2->parentlen = xe->parentlen) == 0) xe2->parents = NULL; else { if ((xe2->parents = malloc (xe2->parentlen * (sizeof (struct xml_element *)))) == NULL) errabort (40, "malloc error in %s\n", "merge_rg_htables()"); memcpy (xe2->parents, xe->parents, xe2->parentlen * (sizeof (struct xml_element *))); if (freeme) free (xe->parents); } if (xe->attlist) { for (i = 0; i < xe->attlistlen; i++) if (! 
add_attribute (xf1, result->uni_key, xe->attlist[i])) if (freeme) free_xml_attribute (xe->attlist[i]); if (freeme) free (xe->attlist); } if (freeme) { if (xe->compiled_content_model) free (xe->compiled_content_model); free (xe); } } if (freeme) free (result->uni_key); result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable (xf2->element_names); xf2->element_names = xf1->element_names; } xwrap (errdebug (5, "merged element tables\n")); break; case notation_names: ht = xf2->notation_names; xwrap (errdebug (7, "merging notation tables\n")); result = rg_get_htable_items (ht); while (result != NULL) { if (add_notname (xf1, result->uni_key, result->data)) count++; result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable_and_data (xf2->notation_names); xf2->notation_names = xf1->notation_names; } xwrap (errdebug (5, "merged notation tables\n")); break; case parameter_entity_names: ht = xf2->parameter_entity_names; xwrap (errdebug (7, "merging parameter entity tables\n")); result = rg_get_htable_items (ht); while (result != NULL) { if (add_peref (xf1, result->uni_key, result->data)) count++; result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable_and_data (xf2->parameter_entity_names); xf2->parameter_entity_names = xf1->parameter_entity_names; } xwrap (errdebug (5, "merged parameter entity tables\n")); break; case external_entity_names: ht = xf2->external_entity_names; xwrap (errdebug (7, "merging external entity tables\n")); result = rg_get_htable_items (ht); while (result != NULL) { /* don't bother re-reading file; add entity text directly */ if (add_ext_eref_text (xf1, result->uni_key, result->data)) count++; result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable_and_data (xf2->external_entity_names); xf2->external_entity_names = xf1->external_entity_names; } xwrap (errdebug (5, "merged external entity tables\n")); break; case unparsed_entity_names: ht = xf2->unparsed_entity_names; xwrap (errdebug (7, "merging unparsed 
entity tables\n")); result = rg_get_htable_items (ht); while (result != NULL) { xue = (xml_unparsed_entity *)result->data; if (add_uperef (xf1, result->uni_key, xue->notname, xue->sysid)) count++; if (freeme) { free (xue->notname); free (xue->sysid); } result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable_and_data (xf2->unparsed_entity_names); xf2->unparsed_entity_names = xf1->unparsed_entity_names; } xwrap (errdebug (5, "merged unparsed entity tables\n")); break; case entity_names: ht = xf2->entity_names; xwrap (errdebug (7, "merging general entity tables\n")); /* rip out & <, etc. from parent; avoid silly error msgs */ for (pp = &built_in_entity_list[0]; *pp != NULL; pp++) { it.key = NULL; it.uni_key = uni_strdup (utf_8_to_utf_16 (*pp)); it.data = NULL; rg_delete_item (xf1->entity_names, it); free (it.uni_key); } result = rg_get_htable_items (ht); while (result != NULL) { if (add_eref (xf1, result->uni_key, result->data)) count++; result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable_and_data (xf2->entity_names); xf2->entity_names = xf1->entity_names; } xwrap (errdebug (5, "merged general entity tables\n")); break; case id_refs: ht = xf2->idrefs; xwrap (errdebug (7, "merging idrefs tables\n")); result = rg_get_htable_items (ht); while (result != NULL) { if (add_idref (xf1, result->uni_key, result->data)) count++; result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable_and_data (xf2->idrefs); xf2->idrefs = xf1->idrefs; } xwrap (errdebug (5, "merged idrefs tables\n")); break; case ids: ht = xf2->ids; xwrap (errdebug (7, "merging id tables\n")); result = rg_get_htable_items (ht); while (result != NULL) { if (xmlparse_env.keep_children == yes) /* only do this if we're keeping the parse tree around */ { if (add_id (xf1, result->data)) count++; } else { /* If we're not keeping the parse tree around, we should * not be doing a merge of xml_file structs with entries * in their ids hashtables; actually, we shouldn't be doing * any 
merging in this case, anyway, because if we have * any "ids" entries, we've reached document content, and * we should, in that case, be done merging files (which * all gets done, theoretically, while reading the DTD). */ errabort (53, "merged xml_file structs while in content\n"); } if (result->key && freeme) free (result->key); if (result->uni_key && freeme) free (result->uni_key); result = rg_get_htable_items (NULL); } if (freeme) { rg_free_htable (xf2->ids); xf2->ids = xf1->ids; } xwrap (errdebug (5, "merged id tables\n")); break; } /* how many elements did we add to xf2's hashtable */ return count; } /* * create_xml_file * * Allocate an xml_file structure, and initialize it; open fname * (arg 1). Return NULL if fname can't be opened. Note that we * have to figure out here what input format is used in fname, and * set get_xml_char accordingly. */ xml_file * create_xml_file (char *fname) { my_wchar_t *wp; xml_file *xf; struct rg_htable_item it; /* if you change this var, change it also in create_xml_tmpfile() */ const char **pp, *builtins[] = { "amp", "&", "gt", ">", "lt", "<", "apos", "'", "quot", "\"", NULL }; /* What are we doing? 
*/ xwrap (errdebug (7, "allocating and initializing new xml_file structure\n")); if ((xf = malloc (sizeof (xml_file))) == NULL) errabort (40, "malloc error in %s\n", "create_xml_file ()"); memset (xf, 0, sizeof (xml_file)); if (strcoll (fname, DUMMY_FILE_NAME) == 0) xf->file = NULL; else { errno = 0; if ((xf->file = fopen_and_readlock (fname)) == NULL) { if (errno == EMFILE) errabort (21, "too many open files; aborting\n"); errwarn (141, "error opening %s\n", fname); return NULL; } } /* make sure xf->filename is an absolute path */ xf->filename = strdup (absolutize (fname)); xf->tmpfilename = NULL; /* assume yes until proven otherwise */ xf->standalone = maybe; xf->lineno = 0; /* when initially created, it has no parent */ xf->child = no; xf->parent = NULL; xf->lastchar = 0; /* use this to tell how many unmatched cond_levels = 0; xf->element_names = rg_create_htable (128); xf->notation_names = rg_create_htable (10); xf->parameter_entity_names = rg_create_htable (30); xf->external_entity_names = rg_create_htable (128); xf->unparsed_entity_names = rg_create_htable (128); xf->entity_names = rg_create_htable (128); /* ID and IDREFs table */ xf->idrefs = rg_create_htable (128); xf->ids = rg_create_htable (64); /* & > < ' " must be built-in to entity table */ for (pp = &builtins[0]; *pp != NULL; pp += 2) { it.key = NULL; it.uni_key = uni_strdup (utf_8_to_utf_16 (*pp)); it.data = uni_strdup (utf_8_to_utf_16 (*(pp + 1))); if (it.uni_key == NULL || it.data == NULL) errabort (40, "malloc() error in %s\n", "create_xml_file()"); rg_add_item (xf->entity_names, it); } /* will hold the parse tree; see parstree.c */ xf->parstree = NULL; if (xf->file == NULL) /* if xf->file is DUMMY_FILE_NAME ("/dev/null") */ xf->get_xml_char = empty_file; else { if ((xf->get_xml_char = detect_input_format (xf)) == NULL) { errwarn (142, "error reading %s\n", fname); return NULL; } if ((wp = check_for_encoding_decl (xf))) set_encoding (xf, wp); } /* What did we just do? 
*/ xwrap (errdebug (5, "allocated and initialized new xml_file structure\n")); return xf; } /* * create_xml_tmpfile * * Allocate an xml_file structure, and initialize it; open tmpfname * (arg 2). Return NULL if tmpfname can't be opened. Note we have * to figure out here what input format is used in fname, and set * get_xml_char accordingly. Note that when this structure gets * freed, the tempfile will get unlinked. */ xml_file * create_xml_tmpfile (char *URI, char *tmpfname) { my_wchar_t *wp; xml_file *xf; struct rg_htable_item it; /* if you change this var, change it also in create_xml_file() */ const char **pp, *builtins[] = { "amp", "&", "gt", ">", "lt", "<", "apos", "'", "quot", "\"", NULL }; /* What are we doing? */ xwrap (errdebug (7, "allocating and initializing new xml_file structure (w/ temp file)\n")); if ((xf = malloc (sizeof (xml_file))) == NULL) errabort (40, "malloc error in %s\n", "create_xml_tmpfile ()"); memset (xf, 0, sizeof (xml_file)); if (strcoll (tmpfname, DUMMY_FILE_NAME) == 0) xf->file = NULL; else { errno = 0; if ((xf->file = fopen_and_readlock (tmpfname)) == NULL) { if (errno == EMFILE) errabort (21, "too many open files; aborting\n"); errwarn (141, "error opening %s\n", tmpfname); return NULL; } } xf->filename = strdup (URI); /* make sure xf->tmpfilename is an absolute path */ xf->tmpfilename = strdup (absolutize (tmpfname)); /* assume yes until proven otherwise */ xf->standalone = maybe; xf->lineno = 0; /* when initially created, it has no parent */ xf->child = no; xf->parent = NULL; xf->lastchar = 0; /* use this to tell how many unmatched cond_levels = 0; xf->element_names = rg_create_htable (128); xf->notation_names = rg_create_htable (10); xf->parameter_entity_names = rg_create_htable (30); xf->external_entity_names = rg_create_htable (128); xf->unparsed_entity_names = rg_create_htable (128); xf->entity_names = rg_create_htable (128); /* ID and IDREFs table */ xf->idrefs = rg_create_htable (128); xf->ids = rg_create_htable (64); /* & 
> < ' " must be built-in to entity table */ for (pp = &builtins[0]; *pp != NULL; pp += 2) { it.key = NULL; it.uni_key = uni_strdup (utf_8_to_utf_16 (*pp)); it.data = uni_strdup (utf_8_to_utf_16 (*(pp + 1))); if (it.uni_key == NULL || it.data == NULL) errabort (40, "malloc() error in %s\n", "create_xml_tmpfile()"); rg_add_item (xf->entity_names, it); } /* will hold the parse tree; see parstree.c */ xf->parstree = NULL; if (xf->file == NULL) /* if xf->file is DUMMY_FILE_NAME ("/dev/null") */ xf->get_xml_char = empty_file; else { if ((xf->get_xml_char = detect_input_format (xf)) == NULL) { errwarn (142, "error reading %s\n", tmpfname); return NULL; } if ((wp = check_for_encoding_decl (xf))) set_encoding (xf, wp); } /* What did we just do? */ xwrap (errdebug (5, "allocated and initialized new xml_file structure (w/ temp file)\n")); return xf; } /* * check_for_encoding_decl * * Checks xf (arg 1) for an initial TextDecl or XMLDecl, and, inside * of that, for an encoding decl. If it finds an encoding decl, it * returns a pointer to the value of that decl (in a static buffer * that may be overwritten on subsequent calls). Otherwise, returns * a NULL pointer. * * This function should only get called after we autodetect our file * format, and we are getting ready to check the results against the * format the user is claiming he or she is using in the encoding * decl. I.e., it should only get called inside of create_xml_file() * and create_xml_tmpfile(). 
*/ static my_wchar_t * check_for_encoding_decl (struct xml_file *xf) { size_t i, count = 0; my_wchar_t c, *wp, *wp2; static my_wchar_t *wbuf = NULL; static size_t buflen = 64; int (*get_xml_char_function)(xml_file *); /* initialize static buffer; give us some space */ if (wbuf == NULL) { buflen = 64; if ((wbuf = malloc (buflen * sizeof (my_wchar_t))) == NULL) errabort (40, "malloc error in %s\n", "check_for_encoding_decl()"); } /* start from the beginning */ rewind_xml_file (xf); /* handle UTF-8 like ASCII; utf_8() chokes if the encoding is ISO-8859-x */ get_xml_char_function = (xf->get_xml_char == utf_8) ? ascii_compatible : xf->get_xml_char; c = get_xml_char_function (xf); count++; if (c == 0xFEFF || c == 0xFFFE) { c = get_xml_char_function (xf); count++; } if (c == '<') { c = get_xml_char_function (xf); count++; /* see if the file begins with a PI (the only PI that can begin * a valid doc is an XML decl) */ if (c == '?') { /* find the end of the XML decl */ while (c != '>' && c != '<' && c != (my_wchar_t)EOF) { c = get_xml_char_function (xf); count++; } if (c == '<') { /* oops; '<' or EOF should't appear inside an XML decl */ rewind_xml_file (xf); return NULL; } else { if ((count + 1) > buflen) { buflen = count + 1; if ((wbuf = realloc (wbuf, buflen * sizeof (my_wchar_t))) == NULL) errabort (41, "realloc error in %s\n", "check_for_encoding_decl()"); } rewind_xml_file (xf); for (i = 0; i < count; i++) wbuf[i] = get_xml_char_function (xf); wbuf[i] = 0; if ((wp = uni_utf_strstr (wbuf, "'; wp2++) { if (*wp2 == *wp) { *wp2 = 0; /* wp points, e.g., at: stuff */ rewind_xml_file (xf); return wp + 1; } } } } } } } } } rewind_xml_file (xf); return NULL; } /* * rewind_xml_file * * Seek to the beginning of an xml_file, and reset the (arg 1). * Return NULL if fname can't be opened. Note that we have to * figure out here what input format is used in fname, and set * get_xml_char accordingly. */ int rewind_xml_file (struct xml_file *xf) { /* What are we doing? 
*/ xwrap (errdebug (7, "rewinding xml_file\n")); if (xf->file == NULL) return 0; if (fseek(xf->file, 0L, SEEK_SET) == -1) { if (errno != EBADF) errabort (148, "seek error %d for file %s\n", errno, xf->filename); else /* Can't reset file pos to the beginning */ errabort (149, "file %s must be seekable\n", xf->filename); } xf->lineno = 0; /* What did we just do? */ xwrap (errdebug (5, "rewound xml_file\n")); /* pretty much meaningless */ return xf->lineno; } void free_xml_file (struct xml_file *xf) { size_t i; struct rg_htable_item *result; /* What are we doing? */ xwrap (errdebug (7, "freeing up xml_file structure\n")); for (i = 0; i < xf->filelen; i++) free_xml_file (xf->xml_files[i]); xf->filelen = 0; for (i = 0; i < xf->errlen; i++) free_xml_error (xf->xml_errors[i]); xf->errlen = 0; for (i = 0; i < xf->warnlen; i++) free_xml_warning (xf->xml_warnings[i]); xf->warnlen = 0; if (xf->file) if (fclose_and_unlock (xf->file) == EOF) errwarn (150, "error closing %s\n", xf->filename); /* holds the full parse of xf (see parstree.c) */ if (xf->parstree) { if (xf->child != no) errabort (42, "unexpectedly nonnull value in %s\n", "free_xml_file()"); free_xml_node (xf->parstree); } if (xf->child != yes) { /* free hashtables for non-children, i.e., for the top-level * xml_file struct, and for pseudo-children, too (pseudo * children are marked as xf->child == maybe; pseudo children * are added to their parent via add_xml_file_nomerge() and * are there just waiting for garbage collection) */ rg_free_htable_and_data (xf->notation_names); rg_free_htable_and_data (xf->parameter_entity_names); rg_free_htable_and_data (xf->external_entity_names); rg_free_htable_and_data (xf->entity_names); rg_free_htable_and_data (xf->idrefs); /* complex structure must be freed separately */ result = rg_get_htable_items (xf->element_names); while (result != NULL) { free (result->uni_key); free_xml_element (result->data); result = rg_get_htable_items (NULL); } rg_free_htable (xf->element_names); /* 
complex structure must be freed separately */ result = rg_get_htable_items (xf->unparsed_entity_names); while (result != NULL) { if (result->data != NULL) { free (((xml_unparsed_entity *)result->data)->notname); free (((xml_unparsed_entity *)result->data)->sysid); } result = rg_get_htable_items (NULL); } rg_free_htable_and_data (xf->unparsed_entity_names); /* ID table - only free ID */ result = rg_get_htable_items (xf->ids); while (result != NULL) { if (result->key != NULL) free (result->key); if (result->uni_key != NULL) free (result->uni_key); /* * If xmlparse_env.keep_children == yes, then result->data * is a pointer into the parse tree; if not, it's just a * dummy entry. Either way, don't touch it. If it's a live * pointer, it'll get freed as part of the parse tree. */ result = rg_get_htable_items (NULL); } rg_free_htable (xf->ids); } free (xf->filename); if (xf->tmpfilename) { if (unlink (xf->tmpfilename) == -1) errwarn (13, "can't unlink %s\n", xf->tmpfilename); free (xf->tmpfilename); } free (xf); /* What did we just do? */ xwrap (errdebug (5, "freed xml_file structure\n")); } /* * remove_all_errors_and_warnings_except * * Removes all error and warning messages in xf (arg 1) except those * given as arguments to this function. Error and warning messages * are held in xf->xml_errors and xf->xml_warnings fields, and are * identified by integers in their own "num" fields (e.g., xf->xml_ * errors[i]->num). Only those with "num" fields having integers * that match an argument to this function are kept. All others * are deallocated and removed. * * See the "messages" file for a list of error numbers. Be sure * that the last error number is zero (the zero just marks the end * of the list). * * Returns a count of error/warning messages that were removed. */ int remove_all_errors_and_warnings_except (struct xml_file *xf, ...) 
{ size_t i, j; va_list valst; int errnum, count; struct xml_error *tmpe; struct xml_warning *tmpw; if (xf == NULL) return 0; /* count of error or warning structs removed */ count = 0; /* remove xml_error structs */ for (i = j = 0; i < xf->errlen; i++) { #ifdef STDC_HEADERS va_start (valst, xf); #else va_start (valst); #endif while ((errnum = va_arg (valst, int)) != 0) { if (xf->xml_errors[i]->num == errnum) { if (i != j) { tmpe = xf->xml_errors[j]; xf->xml_errors[j] = xf->xml_errors[i]; xf->xml_errors[i] = tmpe; } j++; break; } } va_end (valst); } xf->errlen = j; for (j = xf->errlen; j < i; j++) free_xml_error (xf->xml_errors[j]); xf->xml_errors = realloc (xf->xml_errors, xf->errlen * sizeof (struct xml_error *)); if (xf->xml_errors == NULL) errabort (40, "malloc() error in %s\n", "remove_all_errors_and_warnings_except()"); /* remove xml_warning structs */ for (i = j = 0; i < xf->warnlen; i++) { #ifdef STDC_HEADERS va_start (valst, xf); #else va_start (valst); #endif while ((errnum = va_arg (valst, int)) != 0) { if (xf->xml_warnings[i]->num == errnum) { if (i != j) { tmpw = xf->xml_warnings[j]; xf->xml_warnings[j] = xf->xml_warnings[i]; xf->xml_warnings[i] = tmpw; } j++; break; } } va_end (valst); } xf->warnlen = j; for (j = xf->warnlen; j < i; j++) free_xml_warning (xf->xml_warnings[j]); xf->xml_warnings = realloc (xf->xml_warnings, xf->warnlen * sizeof (struct xml_warning *)); if (xf->xml_warnings == NULL) errabort (40, "malloc() error in %s\n", "remove_all_errors_and_warnings_except()"); /* return a count of however many were removed */ return count; } /* * report_all_errors_and_warnings * * Prints a report to stderr of error and warning messages created * while parsing (xml_file )xf and all its children xml_file * structs. Returns the total number of errors and warnings * reported for all files. 
*/ int report_all_errors_and_warnings (struct xml_file *xf, int *total) { size_t i; int errors = 0; for (i = 0; i < xf->filelen; i++) errors += report_all_errors_and_warnings (xf->xml_files[i], total); errors += report_errors_and_warnings (xf, total); return errors; } /* * report_errors_and_warnings * * Prints a report to stderr of error and warning messages created * while parsing xf. Returns the total number of errors reported * (not warnings). */ static int report_errors_and_warnings (struct xml_file *xf, int *total) { char numbuf[32]; char *tmp, *format; size_t eidx, widx; static char *program_name; program_name = xmlparse_env.program_name; if (program_name == NULL) program_name = "xmlparse"; /* errors and warnings must be sorted, in ascending order */ sort_xml_errors_and_warnings (xf); #ifndef HAVE_VSNPRINTF # define snprintf(a,b,c,d) sprintf(a,c,d) #endif eidx = widx = 0; while (eidx < xf->errlen || widx < xf->warnlen) { if (*total >= xmlparse_env.max_errors) { snprintf (numbuf, 32, "%d", 5); if ((format = getmessage (xmlparse_env.message_catalog, numbuf)) != NULL) { fprintf (stderr, "error (%d): ", 5); fprintf (stderr, format, xf->filename); } return xf->errlen; } else if (eidx == xf->errlen) { snprintf (numbuf, 32, "%d", xf->xml_warnings[widx]->num); if ((format = getmessage (xmlparse_env.message_catalog, numbuf)) != NULL) { fprintf (stderr, "warning (%d): ", xf->xml_warnings[widx]->num); tmp = replace (utf_16_to_utf_8 (xf->xml_warnings[widx]->text), "\n", " "); fprintf (stderr, format, xf->filename, xf->xml_warnings[widx]->lineno, tmp); } (*total)++; widx++; } else if (widx == xf->warnlen) { snprintf (numbuf, 32, "%d", xf->xml_errors[eidx]->num); if ((format = getmessage (xmlparse_env.message_catalog, numbuf)) != NULL) { fprintf (stderr, "error (%d): ", xf->xml_errors[eidx]->num); tmp = replace (utf_16_to_utf_8 (xf->xml_errors[eidx]->text), "\n", " "); fprintf (stderr, format, xf->filename, xf->xml_errors[eidx]->lineno, tmp); } (*total)++; eidx++; } else 
if (xf->xml_errors[eidx]->lineno > xf->xml_warnings[widx]->lineno) { snprintf (numbuf, 32, "%d", xf->xml_warnings[widx]->num); if ((format = getmessage (xmlparse_env.message_catalog, numbuf)) != NULL) { fprintf (stderr, "warning (%d): ", xf->xml_warnings[widx]->num); tmp = replace (utf_16_to_utf_8 (xf->xml_warnings[widx]->text), "\n", " "); fprintf (stderr, format, xf->filename, xf->xml_warnings[widx]->lineno, tmp); } (*total)++; widx++; } else { snprintf (numbuf, 32, "%d", xf->xml_errors[eidx]->num); if ((format = getmessage (xmlparse_env.message_catalog, numbuf)) != NULL) { fprintf (stderr, "error (%d): ", xf->xml_errors[eidx]->num); tmp = replace (utf_16_to_utf_8 (xf->xml_errors[eidx]->text), "\n", " "); fprintf (stderr, format, xf->filename, xf->xml_errors[eidx]->lineno, tmp); } (*total)++; eidx++; } } #ifndef HAVE_VSNPRINTF # undef snprintf #endif return xf->errlen; } /* * sort_xml_errors_and_warnings * * Make sure that the error and warning numbers in xf (arg 1) are * sorted by line number. Returns the total number of errors and * warnings for xf. */ int sort_xml_errors_and_warnings (struct xml_file *xf) { qsort (xf->xml_errors, xf->errlen, sizeof (xml_error *), compare_xml_errors); qsort (xf->xml_warnings, xf->warnlen, sizeof (xml_warning *), compare_xml_warnings); return xf->errlen + xf->warnlen; } /* * compare_xml_errors: * * Used by qsort, which takes (as its last arg) a pointer to a * function returning int. In this case the function takes two * arguments, both void pointers (as required by qsort and also * bsearch). But they are assumed to be pointers to xml_error * structs. * * Returns 1 if arg 1 points to an xml_error struct registering an * error later in the file than the one pointed to by arg2. Returns * 0 if they occur in the same spot (same line, that is). Otherwise * returns -1. 
*/ int compare_xml_errors (const void *xep1, const void *xep2) { size_t lineno1, lineno2; struct xml_error *xe1, *xe2; xe1 = *(xml_error **)xep1; xe2 = *(xml_error **)xep2; lineno1 = xe1->lineno; lineno2 = xe2->lineno; if (lineno1 > lineno2) return 1; else if (lineno1 == lineno2) { /* -- overkill -- * if (xe2 > xe1) * return 1; * else if (xe2 == xe1) */ return 0; } return -1; } /* * compare_xml_warnings: * * Used by qsort, which takes (as its last arg) a pointer to a * function returning int. In this case the function takes two * arguments, both void pointers (as required by qsort and also * bsearch). But they are assumed to be pointers to xml_warning * structs. * * Returns 1 if arg 1 points to an xml_warning struct registering an * error later in the file than the one pointed to by arg2. Returns * 0 if they occur in the same spot (same line, that is). Otherwise * returns -1. */ int compare_xml_warnings (const void *xwp1, const void *xwp2) { size_t lineno1, lineno2; struct xml_warning *xw1, *xw2; xw1 = *(xml_warning **)xwp1; xw2 = *(xml_warning **)xwp2; lineno1 = xw1->lineno; lineno2 = xw2->lineno; if (lineno1 > lineno2) return 1; else if (lineno1 == lineno2) { /* -- overkill -- * if (xw2 > xw1) * return 1; * else if (xw2 == xw1) */ return 0; } return -1; } /* * add_xml_error: * * Add an xml_error to an xml_file struct's error list. */ xml_error * add_xml_error (xml_file *xf, int num, my_wchar_t *text) { if (xf->errlen > 0) xf->xml_errors = realloc (xf->xml_errors, ++xf->errlen * sizeof (xml_error *)); else { xf->errlen = 1; xf->xml_errors = malloc (sizeof (xml_error *)); } xf->xml_errors[xf->errlen - 1] = create_xml_error (num, text, xf->lineno ? xf->lineno : 1, xf->file ? 
ftell (xf->file) : 0); xwrap (errdebug (5, "added error #%d - %s, line %d\n", num, xf->filename, xf->lineno)); return xf->xml_errors[xf->errlen - 1]; } /* * create_xml_error * * Create an xml_error structure, with num, text, lineno, and offset * fields all initialized to the values provided here as arguments. * Returns a pointer to a malloc'd xml_error structure. Aborts on * malloc() failures. Note that to free the xml_error struct, one * should call free_xml_error(). */ static xml_error * create_xml_error (int num, my_wchar_t *text, int lineno, long offset) { xml_error *xe; xwrap (errdebug (5, "creating error struct to hold text: %s\n", text ? utf_16_to_utf_8 (text) : "(null)")); if ((xe = malloc (sizeof (xml_error))) == NULL) errabort (40, "malloc error in %s\n", "create_xml_error()"); xe->num = num; xe->text = text ? uni_strdup (text) : uni_strdup (utf_8_to_utf_16 ("")); if (xe->text == NULL) errabort (40, "malloc error in %s\n", "create_xml_error()"); xe->lineno = lineno; xe->offset = offset; return xe; } /* * add_xml_warning: * * Add an xml_warning to an xml_file struct's warning list. */ xml_warning * add_xml_warning (xml_file *xf, int num, my_wchar_t *text) { if (xf->warnlen > 0) xf->xml_warnings = realloc (xf->xml_warnings, ++xf->warnlen * sizeof (xml_warning *)); else { xf->warnlen = 1; xf->xml_warnings = malloc (sizeof (xml_warning *)); } xf->xml_warnings[xf->warnlen - 1] = create_xml_warning (num, text, xf->lineno ? xf->lineno : 1, xf->file ? ftell (xf->file) : 0); xwrap (errdebug (5, "added warning - %s, line %d\n", xf->filename, xf->lineno)); return xf->xml_warnings[xf->warnlen - 1]; } /* * create_xml_warning * * Create an xml_warning structure, with num, text, lineno, and * offset fields all initialized to the values provided here as * arguments. Returns a pointer to a malloc'd xml_warning * structure. Aborts on malloc() failures. Note that to free the * xml_warning struct, one should call free_xml_warning(). 
*/ static xml_warning * create_xml_warning (int num, my_wchar_t *text, int lineno, long offset) { xml_warning *xw; xwrap (errdebug (5, "creating warning struct to hold text: %s\n", text ? utf_16_to_utf_8 (text) : "(null)")); if ((xw = malloc (sizeof (xml_warning))) == NULL) errabort (40, "malloc warning in %s\n", "create_xml_warning()"); xw->num = num; xw->text = text ? uni_strdup (text) : uni_strdup (utf_8_to_utf_16 ("")); if (xw->text == NULL) errabort (40, "malloc error in %s\n", "create_xml_warning()"); xw->lineno = lineno; xw->offset = offset; return xw; } /* * free_xml_error * * See above, create_xml_error(). */ void free_xml_error (xml_error *xe) { if (xe != NULL) { if (xe->text != NULL) free (xe->text); free (xe); } } /* * free_xml_warning * * See above, create_xml_warning(). */ void free_xml_warning (xml_warning *xw) { if (xw != NULL) { if (xw->text != NULL) free (xw->text); free (xw); } } struct xml_element * create_xml_element (struct xml_file *xf, my_wchar_t *name, enum content_types type, void *content_model) { struct xml_element *xe; if ((xe = malloc (sizeof (struct xml_element))) == NULL) errabort (40, "malloc() error in %s\n", "create_xml_element()"); memset (xe, 0, sizeof (struct xml_element)); /* note that the last_closed_at field (a kind of a kludge) isn't used until * we start parsing the document, and we find an end tag for an element that's * not open - in which case we note where the element was last closed */ xe->type = type; xe->name = name ? 
uni_strdup (name) : NULL; /* don't know who our possible ancestors are yet */ xe->ancestorlen = 0; xe->ancestors = NULL; /* don't know who our immediate ancestors (parents) are yet */ xe->parentlen = 0; xe->parents = NULL; if (xe->type == empty || xe->type == Any || xe->type == dummy) if (xe->content_model != NULL) errwarn (42, "unexpected nonnull value in %s\n", "create_xml_element()"); /* for elements with "mixed" content models, the content_model field * is an array of (my_wchar_t *)strings, the last of which is NULL */ xe->content_model = content_model; xe->compiled_content_model = NULL; xe->attlistlen = 0; xe->attlist = NULL; if (xf && in_external_dtd_subset (xf)) /* set, if it's defined in the external DTD subset */ xe->flags |= DEFINED_EXTERNALLY; return xe; } /* * create_xml_attribute * * Malloc and initialize an xml_attribute structure. Only certain * argument combinations are possible here. Arg 1 is the attribute * name. Arg 2 is the attribute type. For all attribute types * except notation and enumeration, nmtoklen (arg 3) must be zero * and nmtokens will be NULL (arg 4). For notation and enumeration * types, nmtokens is a string list, and nmtoklen is its length. * Arg 5, default_type, is one of required, implied, defaulted, or * fixed, and arg 6, default_val (which is only set if arg 5 is * 'defaulted' or 'fixed'), is a (my_wchar_t *) string containing the * default value for the attribute. * * Returns a pointer to the newly created xml_attribute. Be sure * to free it later with free_xml_attribute(). */ struct xml_attribute * create_xml_attribute (struct xml_file *xf, my_wchar_t *name, enum attribute_types type, size_t nmtoklen, my_wchar_t **nmtokens, enum default_types default_type, my_wchar_t *default_val) { struct xml_attribute *xa; if (! 
(type == notation || type == enumeration) && nmtokens != NULL) /* * except where type == enumeration or notation, token list should * be null */ errwarn (42, "unexpected nonnull value, %s\n", "create_xml_attribute()"); if ((xa = malloc (sizeof (struct xml_attribute))) == NULL) errabort (40, "malloc() error in %s\n", "create_xml_attribute()"); memset (xa, 0, sizeof (struct xml_attribute)); xa->name = uni_strdup (name); xa->type = type; xa->nmtoklen = nmtokens ? nmtoklen : 0U; xa->nmtokens = nmtokens; xa->default_type = default_type; xa->flags = 0; if (in_external_dtd_subset (xf)) /* set, if it's defined in the external DTD subset */ xa->flags |= DEFINED_EXTERNALLY; if (default_val) { if (! (xa->default_type == defaulted || xa->default_type == fixed)) errwarn (42, "unexpectedly nonnull value in %s\n", "create_xml_attribute()"); xa->default_val = uni_strdup (default_val); } return xa; } /* * free_xml_element * * Free an xml_element struct created above with * create_xml_element(). */ void free_xml_element (struct xml_element *xe) { size_t i; my_wchar_t **wpp; if (xe->name) free (xe->name); if (xe->ancestors) free (xe->ancestors); if (xe->parents) free (xe->parents); if (xe->content_model) { if (xe->type == children) { /* if type is children, content model is a cmnode struct */ free_cmnode (xe->content_model); } else if (xe->type == mixed) { /* for elements with "mixed" content models, the content_model * field is an array of (my_wchar_t *)strings, the last of which * is NULL */ for (wpp = xe->content_model; *wpp != NULL; wpp++) free (*wpp); free (xe->content_model); } else /* if content model is nonnull, type should be mixed or children */ errabort (42, "unexpectedly nonnull val, %s\n", "free_xml_element()"); } if (xe->compiled_content_model) free_dfa (xe->compiled_content_model); if (xe->attlist) { for (i = 0; i < xe->attlistlen; i++) free_xml_attribute (xe->attlist[i]); free (xe->attlist); } free (xe); } /* * free_xml_attribute * * Free an xml_attribute struct 
created above with * create_xml_attribute(). */ void free_xml_attribute (struct xml_attribute *xa) { size_t i; free (xa->name); if (xa->nmtokens) { for (i = 0; i < xa->nmtoklen; i++) free (xa->nmtokens[i]); free (xa->nmtokens); } if (xa->default_val) free (xa->default_val); free (xa); } /* * detect_input_format: * * Figure out what input file format a given xml file (xf->file) * uses, and set up a character reading function for that file * (xf->get_xml_char). Returns NULL if xf->file doesn't behave * itself. Aborts on invalid formats or IO errors. * * The bottom line is that after you run detect_input_format() on * an xml_file, xf, you should be able to call xf->get_xml_char(xf) * repeatedly, and get UTF-16 characters set according to the local * host's native byte order. All necessary conversions will be done * on the fly (e.g., if xf->file is in little-endian UCS-4, it will * be re-mapped, where possible, to UTF-16, and byte-swapped). * * Note that when xf->get_xml_char(xf) hits end-of-file, it returns * EOF, like normal stdio routines. */ static int (*(detect_input_format (xml_file *xf)))(xml_file *) { unsigned char buf[4]; int (*get_xml_char_function)(xml_file *); /* What are we doing? 
*/ xwrap (errdebug (7, "figuring out what file format %s uses...\n", xf->filename)); /* default */ get_xml_char_function = utf_8; if (xf->file == NULL) /* if xf->file is a dummy file, return UTF-8 routine */ return get_xml_char_function; if (fread (&buf[0], 1, 4, xf->file) < 4) { if (ferror (xf->file)) return NULL; /* if it's a null or short file, assume utf_8 */ get_xml_char_function = utf_8; } else switch (buf[0]) { case 0: if (memcmp ("\x00\x00\x3c", &buf[1], 3) == 0) get_xml_char_function = ucs_4_big_endian; else if (memcmp ("\x00\x3c\x00", &buf[1], 3) == 0) errabort (145, "bad byte order, %s, in %s\n", "2143", xf->filename); else if (memcmp ("\x3c\x00\x00", &buf[1], 3) == 0) errabort (145, "bad byte order, %s, in %s\n", "3412", xf->filename); else if (memcmp ("\x3c\x00\x3f", &buf[1], 3) == 0) { errwarn (146, "missing byte order mark, %s\n", xf->filename); get_xml_char_function = utf_16_big_endian; } break; case 0x3C: if (memcmp ("\x00\x00\x00", &buf[1], 3) == 0) get_xml_char_function = ucs_4_little_endian; else if (memcmp ("\x3f\x78\x6d", &buf[1], 3) == 0) { /* UTF-8 or some ASCII-compatible encoding */ get_xml_char_function = utf_8; } else if (memcmp ("\x00\x3f\x00", &buf[1], 3) == 0) { errwarn (146, "missing byte order mark, %s\n", xf->filename); get_xml_char_function = utf_16_little_endian; } break; case 0xFE: if (buf[1] == 0xFF) get_xml_char_function = utf_16_big_endian; else /* Who knows what we have in this case? */ get_xml_char_function = utf_8; break; case 0xFF: if (buf[1] == 0xFE) get_xml_char_function = utf_16_little_endian; else /* Who knows what we have in this case? 
*/ get_xml_char_function = utf_8; break; case 0x4C: errabort (147, "can't read EBCDIC, %s\n", xf->filename); get_xml_char_function = NULL; break; default: get_xml_char_function = utf_8; break; } /* go back to the beginning */ rewind_xml_file (xf); xwrap (errdebug (5, "figured out what file format %s uses...\n", xf->filename)); return get_xml_char_function; } /* * empty_file * * This function's job is just to return EOF. It's used for files * like DUMMY_FILE_NAME (usually = "/dev/null") that never hold any * data. */ static int empty_file (struct xml_file *xf) { return EOF; } /* * ascii_compatible: * * This function returns a UTF-16 character read from an input XML * file, xf->file in ASCII(-compatible) format (it converts on the * fly). Note that this routine is never called directly. Call it * via xf->get_xml_char (xf). */ static int ascii_compatible (struct xml_file *xf) { int c; if ((c = fgetc (xf->file)) == EOF) { xf->lineno++; return EOF; } else { /* translate \r into \n; delete \n if after \r */ if (xf->lastchar == '\r' && c == '\n') { xf->lastchar = c; return ascii_compatible (xf); } xf->lastchar = c; if (c == '\r') c = '\n'; if (c == '\n') xf->lineno++; return c; } } /* * ucs_4_big_endian: * * This function returns a UTF-16 character read from an input XML * file, xf->file in UCS-4 format (it converts on the fly, doing a * byte-swap if the local host has a different byte order). Note * that this routine is never called directly. Call it via * xf->get_xml_char (xf). */ static int ucs_4_big_endian (struct xml_file *xf) { int v, count; char errbuf[40]; static int chunk_left = 0; static union { u_int32_t i; unsigned char c[4]; } i, j; if (chunk_left) { /* Last time we were called, we had a UCS-4 char that was bigger * than 16 bits. So we returned first the "high half" zone. * Now we have to return the "low half" zone. 
*/ chunk_left = 0; return (int )j.i; } else { /* Read in a UCS-4 character (four bytes) */ i.i = 0; count = fread (&i.c[0], 1, 4, xf->file); switch (count) { case 4: /* got what we needed */ break; case 0: /* got nothing (EOF?) */ if (ferror (xf->file)) errwarn (140, "error reading %s\n", xf->filename); return EOF; case EOF: xf->lineno++; return EOF; default: /* got something - not enough */ errwarn (151, "short read (%d) from %s\n", count, xf->filename); return EOF; } /* If the localhost's byte order isn't big endian, swap bytes */ if (is_big_endian) j = i; else { j.c[0] = i.c[3]; j.c[1] = i.c[2]; j.c[2] = i.c[1]; j.c[3] = i.c[0]; } /* If none of the high sixteen bits are set, return a regular * Unicode character (UCS-4 in this range is the same as UCS-2 * and UTF-16). Otherwise, we have to play the UTF-16 game. */ if (j.i < 0x00010000U) { /* translate \r to \n; delete \n after \r */ if (xf->lastchar == '\r' && j.i == '\n') { xf->lastchar = j.i; return ucs_4_big_endian (xf); } xf->lastchar = j.i; if (j.i == '\r') j.i = '\n'; /* 0x2028 is the Unicode linefeed character */ if (j.i == '\n' || j.i == 0x2028U) xf->lineno++; return (int )j.i; } else { /* Some UCS-4 characters can't be represented in UTF-16 */ if (j.i > 0x0010FFFFU) { sprintf (errbuf, "byte offset %ld", ftell (xf->file) - 4); xf->lineno++; add_xml_error (xf, 332, utf_8_to_utf_16 (errbuf)); xf->lineno--; xf->lastchar = 0x0000FFFD; return (int )0x0000FFFD; } /* Map j to UCS-16 and put the result in i; return the "high * half" zone first; on the next call return the "low half" * zone. */ chunk_left = 1; j.i -= 0x10000U; i.i = ((j.i >> 10) & TEN_BIT_MASK) + 0xD800U; j.i = (j.i & TEN_BIT_MASK) + 0xDC00U; xf->lastchar = i.i; return (int )i.i; } } } /* * ucs_4_little_endian: * * This function returns a UTF-16 character read from an input XML * file, xf->file in UCS-4 format (it converts on the fly, doing a * byte-swap if the local host has a different byte order). 
Note * that this routine is never called directly. Call it via * xf->get_xml_char (xf). */ static int ucs_4_little_endian (struct xml_file *xf) { int v, count; char errbuf[40]; static int chunk_left = 0; static union { u_int32_t i; unsigned char c[4]; } i, j; if (chunk_left) { /* Last time we were called, we had a UCS-4 char that was bigger * than 16 bits. So we returned first the "high half" zone. * Now we have to return the "low half" zone. */ chunk_left = 0; return (int )j.i; } else { /* Read in a UCS-4 character (four bytes) */ i.i = 0; count = fread (&i.c[0], 1, 4, xf->file); switch (count) { case 4: /* got what we needed */ break; case 0: /* got nothing (EOF?) */ if (ferror (xf->file)) errwarn (140, "error reading %s\n", xf->filename); return EOF; case EOF: xf->lineno++; return EOF; default: /* got something - not enough */ errwarn (151, "short read (%d) from %s\n", count, xf->filename); return EOF; } /* If the localhost's byte order is big endian, swap bytes */ if (! is_big_endian) j = i; else { j.c[0] = i.c[3]; j.c[1] = i.c[2]; j.c[2] = i.c[1]; j.c[3] = i.c[0]; } /* If none of the high sixteen bits are set, return a regular * Unicode character (UCS-4 in this range is the same as UCS-2 * and UTF-16). Otherwise, we have to play the UTF-16 game. 
*/ if (j.i < 0x00010000U) { /* translate \r to \n; delete \n after \r */ if (xf->lastchar == '\r' && j.i == '\n') { xf->lastchar = j.i; return ucs_4_little_endian (xf); } xf->lastchar = j.i; if (j.i == '\r') j.i = '\n'; /* 0x2028 is the Unicode linefeed character */ if (j.i == '\n' || j.i == 0x2028U) xf->lineno++; return (int )j.i; } else { /* Some UCS-4 characters can't be represented in UTF-16 */ if (j.i > 0x0010FFFFU) { sprintf (errbuf, "byte offset %ld", ftell (xf->file) - 4); xf->lineno++; add_xml_error (xf, 332, utf_8_to_utf_16 (errbuf)); xf->lineno--; xf->lastchar = 0x0000FFFD; return (int )0x0000FFFD; } /* Map j to UCS-16 and put the result in i; return the "high * half" zone first; on the next call return the "low half" * zone. */ chunk_left = 1; j.i -= 0x10000U; i.i = ((j.i >> 10) & TEN_BIT_MASK) + 0xD800U; j.i = (j.i & TEN_BIT_MASK) + 0xDC00U; xf->lastchar = i.i; return (int )i.i; } } } /* these are just wrapper functions for iso8859() */ static int iso8859_1 (xml_file *xf) { return iso8859 (xf, 1); } static int iso8859_2 (xml_file *xf) { return iso8859 (xf, 2); } static int iso8859_3 (xml_file *xf) { return iso8859 (xf, 3); } static int iso8859_4 (xml_file *xf) { return iso8859 (xf, 4); } static int iso8859_5 (xml_file *xf) { return iso8859 (xf, 5); } static int iso8859_6 (xml_file *xf) { return iso8859 (xf, 6); } static int iso8859_7 (xml_file *xf) { return iso8859 (xf, 7); } static int iso8859_8 (xml_file *xf) { return iso8859 (xf, 8); } static int iso8859_9 (xml_file *xf) { return iso8859 (xf, 9); } static int iso8859_14 (xml_file *xf) { return iso8859 (xf, 14); } static int iso8859_15 (xml_file *xf) { return iso8859 (xf, 15); } /* * iso8859 * * This function returns a UTF-16 character read from an input XML * file, xf->file, in ISO-8859 format. The second argument, level, * gives the particular ISO-8859 charset being used by the input * stream (e.g., 8 for ISO-8859-8). This function is called by * the iso8859_1, iso8859_2, etc. routines. 
*/ static int iso8859 (xml_file *xf, int level) { int c; unsigned char uc; if ((c = fgetc (xf->file)) == EOF) { xf->lineno++; return EOF; } else { /* translate \r into \n; delete \n if after \r */ if (xf->lastchar == '\r' && c == '\n') { xf->lastchar = c; return iso8859 (xf, level); } uc = (char)c; switch (level) { case 1: c = unicode_8859_1[uc]; break; case 2: c = unicode_8859_2[uc]; break; case 3: c = unicode_8859_3[uc]; break; case 4: c = unicode_8859_4[uc]; break; case 5: c = unicode_8859_5[uc]; break; case 6: c = unicode_8859_6[uc]; break; case 7: c = unicode_8859_7[uc]; break; case 8: c = unicode_8859_8[uc]; break; case 9: c = unicode_8859_9[uc]; break; case 14: c = unicode_8859_14[uc]; break; case 15: c = unicode_8859_15[uc]; break; default: c = unicode_8859_1[uc]; break; } xf->lastchar = c; if (c == '\r') c = '\n'; if (c == '\n') xf->lineno++; return c; } } /* * utf_16_big_endian: * * This function returns a UTF-16 character read from an input XML * UTF-16 file, xf->file. Does a byte swap if the local host is * little-endian. Note that this routine is never called directly. * Call it via xf->get_xml_char (xf). */ static int utf_16_big_endian (struct xml_file *xf) { int v, count; union { u_int16_t i; unsigned char c[2]; } i, j; i.i = 0; count = fread (&i.c[0], 1, 2, xf->file); switch (count) { case 2: /* got what we needed */ break; case 0: /* got nothing (EOF?) */ if (ferror (xf->file)) errwarn (140, "error reading %s\n", xf->filename); return EOF; case EOF: xf->lineno++; return EOF; default: /* got something - not enough */ errwarn (151, "short read (%d) from %s\n", count, xf->filename); return EOF; } /* If the localhost's byte order isn't big endian, swap bytes */ if (! 
is_big_endian) { j.c[0] = i.c[1]; j.c[1] = i.c[0]; i = j; } if (i.i == 0xFEFF && ftell (xf->file) == 2) /* skip over file-initial byte-order marker, FFFE */ return utf_16_big_endian (xf); /* translate \r to \n; delete \n after \r */ if (xf->lastchar == '\r' && i.i == '\n') { xf->lastchar = i.i; return utf_16_big_endian (xf); } xf->lastchar = i.i; if (i.i == '\r') i.i = '\n'; /* 0x2028 is the Unicode linefeed character */ if (i.i == '\n' || i.i == 0x2028U) xf->lineno++; return (int )i.i; } /* * utf_16_little_endian: * * This function returns a UTF-16 character read from an input XML * UTF-16 file, xf->file. Does a byte swap if the local host is * big-endian. Note that this routine is never called directly. * Call it via xf->get_xml_char (xf). */ static int utf_16_little_endian (struct xml_file *xf) { int v, count; union { u_int16_t i; unsigned char c[2]; } i, j; i.i = 0; count = fread (&i.c[0], 1, 2, xf->file); switch (count) { case 2: /* got what we needed */ break; case 0: /* got nothing (EOF?) */ if (ferror (xf->file)) errwarn (140, "error reading %s\n", xf->filename); return EOF; case EOF: xf->lineno++; return EOF; default: /* got something - not enough */ errwarn (151, "short read (%d) from %s\n", count, xf->filename); return EOF; } /* If the localhost's byte order is big endian, swap bytes */ if (is_big_endian) { j.c[0] = i.c[1]; j.c[1] = i.c[0]; i = j; } if (i.i == 0xFEFF && ftell (xf->file) == 2) /* skip over file-initial byte-order marker, FFFE */ return utf_16_little_endian (xf); /* translate \r to \n; delete \n after \r */ if (xf->lastchar == '\r' && i.i == '\n') { xf->lastchar = i.i; return utf_16_little_endian (xf); } xf->lastchar = i.i; if (i.i == '\r') i.i = '\n'; /* 0x2028 is the Unicode linefeed character */ if (i.i == '\n' || i.i == 0x2028U) xf->lineno++; return (int )i.i; } /* * ucs_2_big_endian: * * This function returns a UTF-16 character read from an input XML * UTF-16 file, xf->file. Does a byte swap if the local host is * little-endian. 
Note that this routine is never called directly. * Call it via xf->get_xml_char (xf). * * The difference between ucs_2_big_endian and utf_16_big_endian * is that this latter function accepts multi-byte characters that * are technically out of the UCS-2 range with no error message. * This function, however, records an error message. */ static int ucs_2_big_endian (struct xml_file *xf) { int v, count; char errbuf[40]; union { u_int16_t i; unsigned char c[2]; } i, j; i.i = 0; count = fread (&i.c[0], 1, 2, xf->file); switch (count) { case 2: /* got what we needed */ break; case 0: /* got nothing (EOF?) */ if (ferror (xf->file)) errwarn (140, "error reading %s\n", xf->filename); return EOF; case EOF: xf->lineno++; return EOF; default: /* got something - not enough */ errwarn (151, "short read (%d) from %s\n", count, xf->filename); return EOF; } /* If the localhost's byte order isn't big endian, swap bytes */ if (! is_big_endian) { j.c[0] = i.c[1]; j.c[1] = i.c[0]; i = j; } if (i.i == 0xFEFF && ftell (xf->file) == 2) /* skip over file-initial byte-order marker, FFFE */ return ucs_2_big_endian (xf); /* translate \r to \n; delete \n after \r */ if (xf->lastchar == '\r' && i.i == '\n') { xf->lastchar = i.i; return ucs_2_big_endian (xf); } xf->lastchar = i.i; if (i.i == '\r') i.i = '\n'; /* 0x2028 is the Unicode linefeed character */ if (i.i == '\n' || i.i == 0x2028U) xf->lineno++; /* It's byte one of a two-byte UTF-16 character; no go in UCS-2 */ if (i.i >= 0xD800U && i.i <= 0xDBFFU) { sprintf (errbuf, "byte offset %ld", ftell (xf->file) - 4); xf->lineno++; add_xml_error (xf, 332, utf_8_to_utf_16 (errbuf)); xf->lineno--; } return (int )i.i; } /* * ucs_2_little_endian: * * This function returns a UTF-16 character read from an input XML * UTF-16 file, xf->file. Does a byte swap if the local host is * big-endian. Note that this routine is never called directly. * Call it via xf->get_xml_char (xf). 
* * The diff between ucs_2_little_endian and utf_16_little_endian * is that this latter function accepts multi-byte characters that * are technically out of the UCS-2 range with no error message. * This function, however, records an error message. */ static int ucs_2_little_endian (struct xml_file *xf) { int v, count; char errbuf[40]; union { u_int16_t i; unsigned char c[2]; } i, j; i.i = 0; count = fread (&i.c[0], 1, 2, xf->file); switch (count) { case 2: /* got what we needed */ break; case 0: /* got nothing (EOF?) */ if (ferror (xf->file)) errwarn (140, "error reading %s\n", xf->filename); return EOF; case EOF: xf->lineno++; return EOF; default: /* got something - not enough */ errwarn (151, "short read (%d) from %s\n", count, xf->filename); return EOF; } /* If the localhost's byte order is big endian, swap bytes */ if (is_big_endian) { j.c[0] = i.c[1]; j.c[1] = i.c[0]; i = j; } if (i.i == 0xFEFF && ftell (xf->file) == 2) /* skip over file-initial byte-order marker, FFFE */ return ucs_2_little_endian (xf); /* translate \r to \n; delete \n after \r */ if (xf->lastchar == '\r' && i.i == '\n') { xf->lastchar = i.i; return ucs_2_little_endian (xf); } xf->lastchar = i.i; if (i.i == '\r') i.i = '\n'; /* 0x2028 is the Unicode linefeed character */ if (i.i == '\n' || i.i == 0x2028U) xf->lineno++; /* It's byte one of a two-byte UTF-16 character; no go in UCS-2 */ if (i.i >= 0xD800U && i.i <= 0xDFFFU) { sprintf (errbuf, "byte offset %ld", ftell (xf->file) - 4); xf->lineno++; add_xml_error (xf, 332, utf_8_to_utf_16 (errbuf)); xf->lineno--; } return (int )i.i; } /* * shift_jis: * * This function returns a UTF-16 character read from an input XML * Shift-JIS file, xf->file. Shift-JIS is a multi-byte ASCII- * compatible encoding (like UTF-8, but much simpler to decode). * * Note that this routine is never called directly. Call it via * xf->get_xml_char (xf). 
*/
static int
shift_jis (struct xml_file *xf)
{
  char errbuf[40];
  int byte, gap;
  unsigned int lead_nibble;
  size_t table_shift;
  u_int16_t result;
  union
  {
    u_int16_t i;
    unsigned char c[2];
  } code, swapped;

  byte = fgetc (xf->file);
  if (byte == EOF)
    {
      xf->lineno++;
      return EOF;
    }

  /* Bytes whose high nibble is 8, 9, E or F introduce a two-byte
   * character; anything else stands alone.  The bytes are stored as
   * if the host were little-endian (first byte in c[1], second in
   * c[0]) and swapped afterwards when it is not.
   */
  code.i = 0;
  lead_nibble = byte & 0xF0U;
  if (lead_nibble == 0x80 || lead_nibble == 0x90
      || lead_nibble == 0xE0 || lead_nibble == 0xF0)
    {
      code.c[1] = (unsigned char) byte;
      byte = fgetc (xf->file);
      if (byte == EOF)
        {
          errwarn (151, "short read (0) from file %s\n", xf->filename);
          return EOF;
        }
      code.c[0] = (unsigned char) byte;
    }
  else
    code.c[0] = byte;

  /* If the localhost's byte order is big endian, swap bytes */
  if (is_big_endian)
    {
      swapped.c[0] = code.c[1];
      swapped.c[1] = code.c[0];
      code = swapped;
    }

  /* Beyond the end of the conversion table: report it, then carry
   * on with whatever comes next in the file.
   */
  if (code.i > max_sjis_char)
    {
      sprintf (errbuf, "byte offset %ld", ftell (xf->file));
      xf->lineno++;
      add_xml_error (xf, 332, utf_8_to_utf_16 (errbuf));
      xf->lineno--;
      return shift_jis (xf);
    }

  /* Walk the gap list until the code falls below the start of a gap
   * (a table hit) or inside one (an error).  As used here, each
   * sjis_gaps row is {first code of gap, last code of gap, amount to
   * subtract from codes above the gap}, and a zero in the third slot
   * ends the list.  If the list runs out first, the result stays at
   * 0xFFFD.
   */
  table_shift = 0;
  result = 0xFFFD;
  for (gap = 0; sjis_gaps[gap][2]; gap++)
    {
      if (code.i < sjis_gaps[gap][0])
        {
          result = unicode_sjis[code.i - table_shift];
          break;
        }
      if (code.i <= sjis_gaps[gap][1])
        {
          /* Character is in one of the gaps */
          sprintf (errbuf, "byte offset %ld", ftell (xf->file));
          xf->lineno++;
          add_xml_error (xf, 333, utf_8_to_utf_16 (errbuf));
          xf->lineno--;
          return shift_jis (xf);
        }
      /* The unicode_sjis table is sparse, and as we move up
       * through it, we have to adjust by increasing amounts for
       * gaps in the sequence.
       */
      table_shift = sjis_gaps[gap][2];
    }

  /* A "\n" that directly follows "\r" was already delivered as the
   * translated "\r"; swallow it and fetch the next character.
   */
  if (xf->lastchar == '\r' && result == '\n')
    {
      xf->lastchar = result;
      return shift_jis (xf);
    }

  xf->lastchar = result;
  if (result == '\r')
    result = '\n';
  /* 0x2028 is the Unicode linefeed character */
  if (result == '\n' || result == 0x2028U)
    xf->lineno++;
  return (int) result;
}

/*
 * utf_8
 *
 * Routine for returning 16-bit UTF-16 characters, one by one, taken
 * and converted from a UTF-8 input file, xf->file.  This routine is
 * never called directly.  Call it via xf->get_xml_char (xf) (where
 * xf is an xml_file structure).
 *
 * NOTE WELL:  This routine is similar to a routine, utf_8_to_int()
 * in utfutil.c.
If you find a bug here, doubtless you'll find a but * there as well. */ static int utf_8 (struct xml_file *xf) { char errbuf[40]; unsigned char buffer[7]; int c, pos, bytes_left = 0; unsigned int count, j; int maybe_a_unicode_line_break; static unsigned int i, chunk_left = 0; if (chunk_left) { chunk_left = 0; xf->lastchar = i; return (int )i; } else { i = 0; maybe_a_unicode_line_break = 0; if ((c = fgetc (xf->file)) == EOF) { xf->lineno++; return EOF; } /* Check first for ASCII-range char */ if ((c & 0x80) == 0x00) { /* translate \r into \n; delete \n if after \r */ if (xf->lastchar == '\r' && c == '\n') { xf->lastchar = c; return utf_8 (xf); } xf->lastchar = c; if (c == '\r') c = '\n'; if (c == '\n') xf->lineno++; return c; } else if ((c & 0xC0) == 0x80) { sprintf (errbuf, "byte offset %ld", ftell (xf->file) - 1); xf->lineno++; add_xml_error (xf, 330, utf_8_to_utf_16 (errbuf)); xf->lineno--; return utf_8 (xf); } else if ((c & 0xE0) == 0xC0) { i = (unsigned int)(c & (~0xE0U)); bytes_left = 1; } else if ((c & 0xF0) == 0xE0) { i = (unsigned int)(c & (~0xF0U)); bytes_left = 2; if (i == 0x08) /* Unicode linebreak is 0x2028; UTF-8 byte #1 is 0x08 | 0xE0 */ maybe_a_unicode_line_break = 1; } else if ((c & 0xF8) == 0xF0) { i = (unsigned int)(c & (~0xF8U)); bytes_left = 3; } else if ((c & 0xFC) == 0xF8) { i = (unsigned int)(c & (~0xFCU)); bytes_left = 4; } else if ((c & 0xFE) == 0xFC) { i = (unsigned int)(c & (~0xFEU)); bytes_left = 5; } if ((count = fread (buffer, 1, bytes_left, xf->file)) < bytes_left) { errwarn (151, "short read (%d) from file %s\n", count, xf->filename); return EOF; } if (maybe_a_unicode_line_break) /* Unicode linebreak is 0x2028; we've seen the low 4 bytes already */ if ((buffer[0] & 0x3F) == 0x02 && (buffer[1] & 0x3F) == 0x08) xf->lineno++; for (pos = 0; pos < bytes_left; pos++) { if ((buffer[pos] & 0xC0) != 0x80) { sprintf (errbuf, "byte offset %ld", ftell (xf->file) - bytes_left); xf->lineno++; add_xml_error (xf, 331, utf_8_to_utf_16 (errbuf)); 
xf->lineno--; return utf_8 (xf); } i = (i << 6) | (buffer[pos] & 0x3F); } if (i < 0x00010000U) { xf->lastchar = i; return (int )i; } else { /* Some UCS-4 characters can't be represented in UTF-16 */ if (i > 0x0010FFFFU) { sprintf (errbuf, "byte offset %ld", ftell (xf->file) - bytes_left); xf->lineno++; add_xml_error (xf, 332, utf_8_to_utf_16 (errbuf)); xf->lineno--; return utf_8 (xf); } /* Map i to UCS-16 and put the result in i; return the "high * half" zone first; on the next call return the "low half" * zone. */ chunk_left = 1; i -= 0x10000U; j = ((i >> 10) & TEN_BIT_MASK) + 0xD800U; i = (i & TEN_BIT_MASK) + 0xDC00U; xf->lastchar = j; return (int )j; } } } #ifdef STANDALONE_FILEUTIL_TEST #include "readcfg.h" #include #include #include xmlparse_environment xmlparse_env; int main (int argc, char **argv) { FILE *f; int i = 0; char *line = NULL; my_wchar_t *errmsg, *tmp; struct xml_file *xf, *xf2; struct xml_element *xe; struct xml_attribute *xa; my_wchar_t **string_list; readcfg (argc, argv); if ((f = fopen ("Test/fileutil.input", "r")) != NULL) { while ((line = getline (f, "Test/fileutil.input", &i))) printf ("%s", line); fclose (f); } if ((xf = create_xml_file ("Test/fileutil.input"))) { tmp = read_entire_xml_file (NULL, xf); if (tmp == NULL) { printf ("eek; read_entire_xml_file() failed\n"); exit (1); } line = utf_16_to_utf_8 (tmp); printf ("%s\n", line); rewind_xml_file (xf); if ((line = utf_16_to_utf_8 (read_entire_xml_file (NULL, xf)))) printf ("%s\n", line); errmsg = utf_8_to_utf_16 ("(this is not a real error)"); add_xml_error (xf, 320, errmsg); errmsg = utf_8_to_utf_16 ("(this is not a real warning)"); add_xml_warning (xf, 320, errmsg); rewind_xml_file (xf); errmsg = utf_8_to_utf_16 ("(this is not a real error)"); add_xml_error (xf, 320, errmsg); errmsg = utf_8_to_utf_16 ("(this is not a real warning)"); add_xml_warning (xf, 320, errmsg); sort_xml_errors_and_warnings (xf); if ((xf2 = create_xml_file ("Test/dtdutil.input"))) { add_xml_file (xf, xf2); 
sort_xml_errors_and_warnings (xf); line = tmpnam (NULL); if ((i = open (line, O_EXCL | O_CREAT | O_RDWR, 0600)) != -1) close (i); if ((xf2 = create_xml_tmpfile (line, line))) { errmsg = utf_8_to_utf_16 ("(this is not a real (child) error)"); add_xml_error (xf2, 320, errmsg); errmsg = utf_8_to_utf_16 ("(this is not a real (child) warning)"); add_xml_warning (xf2, 320, errmsg); sort_xml_errors_and_warnings (xf); add_xml_file (xf, xf2); printf ("Xml parent with two children created\n"); } i = 0; report_all_errors_and_warnings (xf, &i); printf ("Xml parent freed, with its children\n"); } } /* Don't do anything with the line variable; it gets used * below. */ if ((xa = create_xml_attribute (xf, utf_8_to_utf_16 ("HREF"), cdata, 0, NULL, implied, NULL))) printf ("created cdata attribute, HREF\n"); free_xml_attribute (xa); if ((xa = create_xml_attribute (xf, utf_8_to_utf_16 ("ID"), id, 0, NULL, implied, NULL))) printf ("created ID attribute, ID\n"); free_xml_attribute (xa); printf ("Deliberately creating a malformed ID attribute (w/ fixed default value):\n"); tmp = uni_strdup (utf_8_to_utf_16 ("666")); if ((xa = create_xml_attribute (xf, utf_8_to_utf_16 ("ID"), id, 0, NULL, fixed, tmp))) printf ("created ID attribute _with default_ (an error), ID\n"); free_xml_attribute (xa); free (tmp); printf ("This should prompt error 42:\n"); string_list = malloc (3 * sizeof (my_wchar_t *)); string_list[0] = uni_strdup (utf_8_to_utf_16 ("value1")); string_list[1] = uni_strdup (utf_8_to_utf_16 ("value2")); string_list[2] = uni_strdup (utf_8_to_utf_16 ("value3")); if ((xa = create_xml_attribute (xf, utf_8_to_utf_16 ("HREF"), cdata, 3, string_list, implied, NULL))) printf ("created cdata attribute with nonnull nmtoken list (an error), HREF\n"); /* add an element, and see if it still frees properly */ if ((xe = create_xml_element (NULL, NULL, empty, NULL))) printf ("created empty element\n"); xe->attlistlen = 1; xe->attlist = malloc (sizeof (my_wchar_t *)); xe->attlist[0] = xa; 
free_xml_element (xe); if ((xe = create_xml_element (NULL, NULL, Any, NULL))) printf ("created element with unrestricted content\n"); free_xml_element (xe); if ((xe = create_xml_element (NULL, NULL, mixed, NULL))) printf ("created element with mixed content model\n"); string_list = malloc (3 * sizeof (my_wchar_t *)); string_list[0] = uni_strdup (utf_8_to_utf_16 ("element1")); string_list[1] = uni_strdup (utf_8_to_utf_16 ("element2")); string_list[2] = NULL; xe->content_model = string_list; free_xml_element (xe); if ((xe = create_xml_element (NULL, NULL, children, NULL))) printf ("created empty element\n"); free_xml_element (xe); free_xml_file (xf); if (line) if (access (line, F_OK) == -1) printf ("Temporary file successfully removed\n"); exit (0); } #endif /* STANDALONE_FILEUTIL_TEST */