/* *****************************************************************************
 *
 * $RCSfile: grammutil.c,v $
 * $Date: 1999/08/19 17:20:20 $
 * $Source: /home/richard/Xml/RCS/grammutil.c,v $
 * $Revision: 1.93 $
 * $Author: richard $
 *
 *****************************************************************************
 *
 * Copyright 1998, 1999 Brown University and Richard Goerwitz
 *
 *****************************************************************************
 *
 * Utilities for adding new elements, attributes, etc. to internal
 * hashtables; utilities for resolving general, parameter-entity,
 * and other references; utilities for checking attributes; etc.
 *
 * Set your editor's column/screen width to at least 100, or this
 * file may end up a bit hard to read.
 *
 *****************************************************************************
 */

#include "grammutil.h"
#include "errabort.h"
#include "hashutil.h"
#include "langcode.h"
#include "namespace.h"
#include "nfadfa.h"
#include "parstree.h"
#include "parsutil.h"
#include "utfutil.h"
#include "xtrautil.h"

static void check_for_duplicate_names (xml_file *, my_wchar_t **);
static int basic_attribute_integrity_check (xml_file *, xml_attribute *, name_val *, int);
static int expands_recursively (xml_file *, my_wchar_t *, my_wchar_t, my_wchar_t *);
static my_wchar_t *state_to_string (enum where_am_i);

/*
 * expand_element
 *
 * Provide the xml_element struct corresponding to an element,
 * elname (arg 2).  Note that this routine doesn't really expand
 * anything (in the sense that, say, expand_eref() provides the
 * declared expansion text for a given entity reference).  It
 * just returns more information on elname, in the form of an
 * xml_element struct.
 *
 * Returns a pointer to the xml_element struct stored in xf's
 * element_names hashtable (not a copy).  Returns NULL if no such
 * element has been declared.
*/
/* Lookup only: the pointer handed back is the data stored for elname in
 * xf->element_names; NULL means elname has not been declared. */
struct xml_element *
expand_element (struct xml_file *xf, my_wchar_t *elname)
{
    struct rg_htable_item probe;
    struct rg_htable_item *found;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "expanding elname, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (elname, 20))));

    /* A NULL key makes rg_find_item() search on uni_key instead. */
    probe.data = NULL;
    probe.uni_key = elname;
    probe.key = NULL;

    found = rg_find_item (xf->element_names, probe);
    if (found != NULL) {
        xwrap (errdebug (5, "expanded element as xml_element struct\n"));
        return found->data;
    }

    /* An element used in markup that doesn't resolve is an error. */
    xwrap (errdebug (5, "oops; element name, %s, undefined\n",
                     utf_16_to_utf_8 (uni_truncate_to (elname, 20))));
    return NULL;
}

/*
 * expand_notname
 *
 * Provide the system identifier corresponding to a given notation
 * name (notations can be declared with a public or a system
 * identifier, but it's expected that the public identifier will
 * resolve to a system identifier).
 *
 * Returns a pointer to a static buffer that may get overwritten
 * on subsequent calls.  Returns NULL if no such notation has been
 * declared.
*/
my_wchar_t *
expand_notname (struct xml_file *xf, my_wchar_t *notname)
{
    static my_wchar_t *buf = NULL;  /* reused across calls; only ever grows */
    static size_t buflen = 0;       /* capacity of buf, in my_wchar_t units */
    struct rg_htable_item probe, *found;
    size_t needed;
    char *shown;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "expanding notname, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (notname, 20))));

    /* A NULL key makes rg_find_item() search on uni_key instead. */
    probe.data = NULL;
    probe.uni_key = notname;
    probe.key = NULL;

    found = rg_find_item (xf->notation_names, probe);
    if (found == NULL) {
        /* A notation name that doesn't resolve is an error. */
        xwrap (errdebug (5, "oops; notation name, %s, undefined\n",
                         utf_16_to_utf_8 (uni_truncate_to (notname, 20))));
        return NULL;
    }

    /* Room for the sysid plus its terminator. */
    needed = uni_strlen (found->data) + 1;
    if (needed > buflen) {
        buflen = needed;
        if (buf != NULL) {
            buf = realloc (buf, buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (41, "realloc() error in %s\n", "expand_notname()");
        } else {
            buf = malloc (buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (40, "malloc() error in %s\n", "expand_notname()");
        }
    }

    /* Hand back a private copy of the sysid, terminator included. */
    memcpy (buf, found->data, needed * sizeof (my_wchar_t));

    if (xmlparse_env.debug_level >= 5) {
        /* The first conversion is duplicated so that the second one cannot
         * clobber it before both are printed. */
        shown = strdup (utf_16_to_utf_8 (uni_truncate_to (buf, 20)));
        xwrap (errdebug (5, "expanded notname, %s -> %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (notname, 20)), shown));
        free (shown);
    }

    return buf;
}

/*
 * expand_peref
 *
 * Provide expansion text for peref.  Note that expand_peref()
 * returns a pointer into a static buffer that may change on
 * subsequent calls.  Note that PErefs are, except when used in
 * entity values, expanded with one leading and one trailing blank
 * space, '\x20' (XML 1.0 paragraph 4.4.8).  The third argument to
 * expand_peref() controls whether these spaces are added or not.
 * If it is nonzero, they are added; otherwise not.
 *
 * Returns NULL on an unresolvable PERef.
*/
my_wchar_t *
expand_peref (struct xml_file *xf, my_wchar_t *peref, int with_whitespace)
{
    static my_wchar_t *buf = NULL;  /* reused across calls; only ever grows */
    static size_t buflen = 0;       /* capacity of buf, in my_wchar_t units */
    struct rg_htable_item probe, *found;
    size_t textlen;
    char *shown;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "expanding peref, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (peref, 20))));

    /* A NULL key makes rg_find_item() search on uni_key instead. */
    probe.data = NULL;
    probe.uni_key = peref;
    probe.key = NULL;

    found = rg_find_item (xf->parameter_entity_names, probe);
    if (found == NULL) {
        /* A PEReference that doesn't resolve is an error. */
        xwrap (errdebug (5, "oops; parameter entity, %s, undefined\n",
                         utf_16_to_utf_8 (uni_truncate_to (peref, 20))));
        return NULL;
    }

    /* Always leave room for two padding blanks and the terminator, whether
     * or not this particular call ends up using the blanks. */
    textlen = uni_strlen (found->data);
    if (textlen + 3 > buflen) {
        buflen = textlen + 3;
        if (buf != NULL) {
            buf = realloc (buf, buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (41, "realloc() error in %s\n", "expand_peref()");
        } else {
            buf = malloc (buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (40, "malloc() error in %s\n", "expand_peref()");
        }
    }

    if (with_whitespace) {
        /* XML 1.0 paragraph 4.4.8: outside quoted entity values in entity
         * decls, the replacement text is padded with one blank each side. */
        buf[0] = '\x20';
        memcpy (buf + 1, found->data, textlen * sizeof (my_wchar_t));
        buf[textlen + 1] = '\x20';
        buf[textlen + 2] = 0;
    } else {
        /* Inside an entity value: plain copy, terminator included. */
        memcpy (buf, found->data, (textlen + 1) * sizeof (my_wchar_t));
    }

    if (xmlparse_env.debug_level >= 5) {
        /* The first conversion is duplicated so that the second one cannot
         * clobber it before both are printed. */
        shown = strdup (utf_16_to_utf_8 (uni_truncate_to (buf, 20)));
        xwrap (errdebug (5, "expanded peref, %s -> \"%s\"\n",
                         utf_16_to_utf_8 (uni_truncate_to (peref, 20)), shown));
        free (shown);
    }

    return buf;
}

/*
 * expand_eref
 *
 * Provide expansion text for eref.  Note that expand_eref()
 * returns a pointer into a static buffer that may change on
 * subsequent calls.
*/
my_wchar_t *
expand_eref (struct xml_file *xf, my_wchar_t *eref)
{
    static my_wchar_t *buf = NULL;  /* reused across calls; only ever grows */
    static size_t buflen = 0;       /* capacity of buf, in my_wchar_t units */
    struct rg_htable_item probe, *found;
    size_t needed;
    char *shown;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "expanding eref, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (eref, 20))));

    /* A NULL key makes rg_find_item() search on uni_key instead. */
    probe.data = NULL;
    probe.uni_key = eref;
    probe.key = NULL;

    found = rg_find_item (xf->entity_names, probe);
    if (found == NULL) {
        /* This entity reference is not (yet?) defined. */
        xwrap (errdebug (5, "oops; entity, %s, undefined\n",
                         utf_16_to_utf_8 (uni_truncate_to (eref, 20))));
        return NULL;
    }

    /* Room for the replacement text plus its terminator. */
    needed = uni_strlen (found->data) + 1;
    if (needed > buflen) {
        buflen = needed;
        if (buf != NULL) {
            buf = realloc (buf, buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (41, "realloc() error in %s\n", "expand_eref()");
        } else {
            buf = malloc (buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (40, "malloc() error in %s\n", "expand_eref()");
        }
    }

    /* Copy the entity text, terminator included, into the static buffer. */
    memcpy (buf, found->data, needed * sizeof (my_wchar_t));

    if (xmlparse_env.debug_level >= 5) {
        /* The first conversion is duplicated so that the second one cannot
         * clobber it before both are printed. */
        shown = strdup (utf_16_to_utf_8 (uni_truncate_to (buf, 20)));
        xwrap (errdebug (5, "expanded entity ref, %s -> %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (eref, 20)), shown));
        free (shown);
    }

    return buf;
}

/*
 * expand_ext_eref
 *
 * Provide expansion text for ext_eref (a reference to an external
 * entity).  Note that expand_ext_eref() returns a pointer into a
 * static buffer that may change on subsequent calls.
*/
my_wchar_t *
expand_ext_eref (struct xml_file *xf, my_wchar_t *ext_eref)
{
    static my_wchar_t *buf = NULL;  /* reused across calls; only ever grows */
    static size_t buflen = 0;       /* capacity of buf, in my_wchar_t units */
    struct rg_htable_item probe, *found;
    size_t needed;
    char *shown;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "expanding ext_eref, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (ext_eref, 20))));

    /* A NULL key makes rg_find_item() search on uni_key instead. */
    probe.data = NULL;
    probe.uni_key = ext_eref;
    probe.key = NULL;

    found = rg_find_item (xf->external_entity_names, probe);
    if (found == NULL) {
        /* This entity reference is not (yet?) defined. */
        xwrap (errdebug (5, "oops; external entity, %s, undefined\n",
                         utf_16_to_utf_8 (uni_truncate_to (ext_eref, 20))));
        return NULL;
    }

    /* Room for the replacement text plus its terminator. */
    needed = uni_strlen (found->data) + 1;
    if (needed > buflen) {
        buflen = needed;
        if (buf != NULL) {
            buf = realloc (buf, buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (41, "realloc() error in %s\n", "expand_ext_eref()");
        } else {
            buf = malloc (buflen * sizeof (my_wchar_t));
            if (buf == NULL)
                errabort (40, "malloc() error in %s\n", "expand_ext_eref()");
        }
    }

    /* Copy the external entity text, terminator included. */
    memcpy (buf, found->data, needed * sizeof (my_wchar_t));

    if (xmlparse_env.debug_level >= 5) {
        /* The first conversion is duplicated so that the second one cannot
         * clobber it before both are printed. */
        shown = strdup (utf_16_to_utf_8 (uni_truncate_to (buf, 20)));
        xwrap (errdebug (5, "expanded entity ref, %s -> %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (ext_eref, 20)), shown));
        free (shown);
    }

    return buf;
}

/*
 * expand_uperef
 *
 * Provide an expansion for uperef.  Note that expand_uperef()
 * returns a pointer to an xml_unparsed_entity struct, so it isn't
 * really "expanding" it, per se.  Rather, it is returning more
 * information on it (information that comes in the form of an
 * xml_unparsed_entity structure).
*/ struct xml_unparsed_entity * expand_uperef (struct xml_file *xf, my_wchar_t *uperef) { struct rg_htable_item it, *result; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "expanding uperef, %s\n", utf_16_to_utf_8 (uni_truncate_to (uperef, 20)))); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = uperef; it.data = NULL; if ((result = rg_find_item (xf->unparsed_entity_names, it)) == NULL) { /* it's an error if an unparsed eref doesn't resolve */ xwrap (errdebug (5, "oops; unparsed entity, %s, undefined\n", utf_16_to_utf_8 (uni_truncate_to (uperef, 20)))); return NULL; } xwrap (errdebug (5, "expanded uperef as xml_unparsed_entity struct\n")); return result->data; } /* * expand_uperef_as_sysid * * Provide the system identifier associated with uperef. Note that * expand_uperef_as_sysid() returns a pointer into a static buffer * that may change on subsequent calls. Returns NULL if no such * unparsed entity has been defined. */ my_wchar_t * expand_uperef_as_sysid (struct xml_file *xf, my_wchar_t *uperef) { char *tmp; static size_t len = 0; static size_t buflen = 0; static my_wchar_t *buf = NULL; struct rg_htable_item it, *result; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "expanding uperef, %s\n", utf_16_to_utf_8 (uni_truncate_to (uperef, 20)))); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = uperef; it.data = NULL; result = rg_find_item (xf->unparsed_entity_names, it); if (result == NULL) { /* it's an error if an unparsed eref doesn't resolve */ xwrap (errdebug (5, "oops; unparsed entity, %s, undefined\n", utf_16_to_utf_8 (uni_truncate_to (uperef, 20)))); return NULL; } len = uni_strlen (result->data) + 1; if (len > buflen) { buflen = len; if (buf == NULL) { if ((buf = malloc (buflen * sizeof (my_wchar_t))) == NULL) errabort (40, "malloc() error in %s\n", "expand_uperef_as_sysid()"); } else { if ((buf = realloc (buf, buflen * sizeof (my_wchar_t))) == NULL) errabort 
(41, "realloc() error in %s\n", "expand_uperef_as_sysid()"); } } /* copy system identifier string into static buffer */ memcpy (buf, ((struct xml_unparsed_entity *)result->data)->sysid, len * sizeof (my_wchar_t)); if (xmlparse_env.debug_level >= 5) { tmp = strdup (utf_16_to_utf_8 (uni_truncate_to (buf, 20))); xwrap (errdebug (5, "expanded uperef, %s -> %s\n", utf_16_to_utf_8 (uni_truncate_to (uperef, 20)), tmp)); free (tmp); } return buf; } /* * expand_uperef_as_notname * * Provide the notation associated with uperef. Note that * expand_uperef_as_notname() returns a pointer into a static buffer * that may change on subsequent calls. Returns NULL if no such * unparsed entity, uperef, has been defined. * * Note that if there is an uparsed entity in the DTD is invalid, * and points to an undeclared NOTATION, then this routine will, if * invoked on that entity name, will return a string that if passed * to expand_notname(), will trigger a NULL return value. * * Put more simply: If the document isn't valid, don't count on * this function returning anything useful. 
*/
my_wchar_t *
expand_uperef_as_notname (struct xml_file *xf, my_wchar_t *uperef)
{
    char *tmp;
    size_t len;
    static size_t buflen = 0;       /* capacity of buf, in my_wchar_t units */
    static my_wchar_t *buf = NULL;  /* reused across calls; only ever grows */
    struct xml_unparsed_entity *entity;
    struct rg_htable_item it, *result;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "expanding uperef, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (uperef, 20))));

    /* if it.key == NULL, then rg_find_item will use it.uni_key */
    it.key = NULL;
    it.uni_key = uperef;
    it.data = NULL;

    result = rg_find_item (xf->unparsed_entity_names, it);
    if (result == NULL) {
        /* it's an error if an unparsed entity ref doesn't resolve */
        xwrap (errdebug (5, "oops; unparsed entity, %s, undefined\n",
                         utf_16_to_utf_8 (uni_truncate_to (uperef, 20))));
        return NULL;
    }

    /* The table stores xml_unparsed_entity structs, not strings, so the
     * length has to be measured on the notname member that is copied below.
     * Measuring the struct itself sized the buffer and the copy from
     * unrelated bytes. */
    entity = result->data;
    len = uni_strlen (entity->notname) + 1;

    if (len > buflen) {
        buflen = len;
        if (buf == NULL) {
            if ((buf = malloc (buflen * sizeof (my_wchar_t))) == NULL)
                errabort (40, "malloc() error in %s\n", "expand_uperef_as_notname()");
        } else {
            if ((buf = realloc (buf, buflen * sizeof (my_wchar_t))) == NULL)
                errabort (41, "realloc() error in %s\n", "expand_uperef_as_notname()");
        }
    }

    /* copy notation name (terminator included) into static buffer */
    memcpy (buf, entity->notname, len * sizeof (my_wchar_t));

    if (xmlparse_env.debug_level >= 5) {
        /* first conversion is duplicated so the second cannot clobber it */
        tmp = strdup (utf_16_to_utf_8 (uni_truncate_to (buf, 20)));
        xwrap (errdebug (5, "expanded uperef, %s -> %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (uperef, 20)), tmp));
        free (tmp);
    }

    return buf;
}

/*
 * expand_attribute
 *
 * Look up the attribute named attname (arg 3) in the attribute list
 * of element elname (arg 2), for a given xml_file struct, xf.
 * Returns a pointer to the matching xml_attribute struct held in
 * the element's attlist.  Returns NULL if the element has not been
 * declared, or if it has no attribute by that name.
*/
xml_attribute *
expand_attribute (xml_file *xf, my_wchar_t *elname, my_wchar_t *attname)
{
    size_t i;
    char *tmp;
    struct xml_element *xe;
    struct xml_attribute *retval;

    if (xmlparse_env.debug_level >= 7) {
        /* first conversion is duplicated so the second cannot clobber it */
        tmp = strdup (utf_16_to_utf_8 (uni_truncate_to (attname, 20)));
        xwrap (errdebug (7, "trying to expand attname %s for element %s\n", tmp,
                         utf_16_to_utf_8 (uni_truncate_to (elname, 20))));
        free (tmp);
    }

    if ((xe = expand_element (xf, elname)) == NULL) {
        xwrap (errdebug (5, "no such element, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (elname, 20))));
        return NULL;
    }

    /* Scan the whole list; if several entries carry the same name, the
     * last one wins (the loop deliberately has no early exit). */
    retval = NULL;
    for (i = 0; i < xe->attlistlen; i++)
        if (uni_strcmp (xe->attlist[i]->name, attname) == 0)
            retval = xe->attlist[i];

    if (xmlparse_env.debug_level >= 5) {
        /* Report the attribute name in the first slot and the element name
         * in the last; the message used to print elname in both places. */
        tmp = strdup (utf_16_to_utf_8 (uni_truncate_to (elname, 20)));
        xwrap (errdebug (5, "attribute %s%s found for element %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (attname, 20)),
                         retval ? "" : " not", tmp));
        free (tmp);
    }

    return retval;
}

/*
 * expand_id
 *
 * ID attribute values are unique for a given XML document.  This
 * routine returns a pointer into the parse tree for xf (arg 1)
 * where a given ID attribute occurs.  The ID attribute value being
 * sought is given as (my_wchar_t *)idstring (arg 2).
 *
 * Returns NULL if the idstring can't be resolved (presumably
 * because no such ID has been encountered [yet] in xf).
 *
 * Note: This routine isn't much use if xmlparse_env.keep_children
 * is set to "no", since this means the parser is discarding nodes
 * in the parse tree shortly after generating them.
*/ struct name_val * expand_id (struct xml_file *xf, my_wchar_t *idstring) { struct rg_htable_item item, *result; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "expanding id, %s\n", utf_16_to_utf_8 (uni_truncate_to (idstring, 20)))); /* return NULL if ID is already taken */ item.key = NULL; item.uni_key = idstring; item.data = NULL; if ((result = rg_find_item (xf->ids, item)) == NULL) { if (xmlparse_env.debug_level >= 5) xwrap (errdebug (5, "id, %s, not defined\n", utf_16_to_utf_8 (uni_truncate_to (idstring, 20)))); return NULL; } /* report ids are now in the hashtable */ if (xmlparse_env.debug_level >= 5) xwrap (errdebug (5, "expanded id, %s, as xml_node\n", utf_16_to_utf_8 (uni_truncate_to (idstring, 20)))); if (xmlparse_env.keep_children == no) errabort (54, "tried to dereference ID attribute without any parse tree\n"); return result->data; } /* * add_element * * Add a given element to an xml_file's element_names hashtable, * with information on the element's content model and attribute * names/types/defaults. Returns NULL if the element is already * present. Otherwise returns a pointer to the new element. */ struct xml_element * add_element (xml_file *xf, my_wchar_t *name, enum content_types type, void *content_model) { size_t attlistlen; struct xml_element *xe; struct xml_attribute **attlist; struct rg_htable_item it, *result; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "adding %s to element_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->element_names->no_items)); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = name; it.data = NULL; if ((result = rg_find_item (xf->element_names, it)) == NULL) { /* Not defined already; proceed to add it to the element_names * hashtable for xf. Do a quick check to make sure that the * element name doesn't have any malformed namespaces (e.g., * "hello:" or ":goodbye"). 
*/ check_for_colon (xf, name, 1376, 1377, 1378); if ((xe = create_xml_element (xf, name, type, content_model)) == NULL) errabort (40, "malloc() error in %s\n", "add_element()"); switch (type) { case children: if (content_model) xe->compiled_content_model = make_dfa (xf, content_model); break; case mixed: check_for_duplicate_names (xf, content_model); break; case dummy: case empty: case Any: /* what should we check here? */ break; } it.uni_key = uni_strdup (name); it.data = xe; rg_add_item (xf->element_names, it); xwrap (errdebug (5, "added %s to element_names hashtable (new size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->element_names->no_items)); /* return a pointer to the element we just created */ return it.data; } else { /* eek; it's already defined; if it's a dummy element (created * to make add_attribute() work for an as yet undeclared element), * then delete the dummy and try again; otherwise, it's an error */ xe = result->data; if (xe->type == dummy) { attlist = xe->attlist; xe->attlist = NULL; attlistlen = xe->attlistlen; xe->attlistlen = 0; xwrap (errdebug (5, "duplicate dummy element exists; deleting it\n")); rg_delete_item (xf->element_names, it); free_xml_element (xe); xe = add_element (xf, name, type, content_model); xe->attlistlen = attlistlen; xe->attlist = attlist; return xe; } else { /* element has already been fully defined */ add_xml_error (xf, 650, uni_truncate_to (name, 20)); /* element structs are fairly complex; comparing them is hard */ xwrap (errdebug (3, "error adding %s to element_names hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->element_names->no_items)); return NULL; } } } /* * check_for_duplicate_names * * Checks for duplicate strings in an array of my_wchar_t strings. * This routine is used to determine whether a symbol is used twice * in a Mixed content model. Assume the last string is NULL. 
*/
static void
check_for_duplicate_names (struct xml_file *xf, my_wchar_t **wps)
{
    size_t i, j;

    if (wps == NULL)
        return;

    /* Compare each entry against every earlier one; each matching pair
     * produces its own warning 681. */
    for (i = 0; wps[i] != NULL; i++)
        for (j = 0; j < i; j++)
            if (uni_strcmp (wps[j], wps[i]) == 0)
                add_xml_warning (xf, 681, wps[j]);
}

/*
 * add_attribute
 *
 * Adds an attribute to a given element, elname's, attribute list, for
 * a given xml_file struct, xf.  Returns zero if the attribute is
 * already present (the caller frees xa in that case).  Otherwise
 * returns the number of attributes in the xml_element struct.
 *
 * If elname has not been declared yet, a dummy element is created to
 * hold the attribute list.  A default value, if any, is normalized
 * and entity-expanded in place (xa->default_val is replaced) and
 * then checked, even when the declaration is a duplicate.
 */
size_t
add_attribute (xml_file *xf, my_wchar_t *elname, struct xml_attribute *xa)
{
    char *tmp;
    size_t i, len;
    int already_declared;
    struct name_val *nv;
    struct xml_element *xe;
    my_wchar_t *wtmp, *expansion;

    if (xmlparse_env.debug_level >= 7) {
        /* first conversion is duplicated so the second cannot clobber it */
        tmp = strdup (utf_16_to_utf_8 (uni_truncate_to (xa->name, 20)));
        xwrap (errdebug (7, "adding attribute %s to element, %s\n", tmp,
                         utf_16_to_utf_8 (uni_truncate_to (elname, 20))));
        free (tmp);
    }

    if ((xe = expand_element (xf, elname)) == NULL) {
        /* create a dummy element to hold the attlist */
        if ((xe = add_element (xf, elname, dummy, NULL)) == NULL)
            errabort (45, "unexpected return value from %s\n", "add_attribute()");
    }

    /* Check if the attribute is already declared for this element */
    already_declared = 0;
    for (i = 0; i < xe->attlistlen; i++)
        if (uni_strcmp (xe->attlist[i]->name, xa->name) == 0) {
            /* duplicate attribute definition (discard the second one) */
            add_xml_warning (xf, 581, uni_truncate_to (xa->name, 20));
            already_declared++;
        }

    /* Only do all these checks if this is the first time we've seen
     * this attribute (xa).  Note that the xml:space and xml:lang
     * attribute checks could easily be moved to the beginning of this
     * function.  Do not move the ID check, however. */
    if (! already_declared) {
        /* Now that we're namespace-aware, check for malformed
         * namespaces in the attribute name (e.g., "hello:" or
         * ":goodbye").  When check_for_colon() returns zero, "xmlns" and
         * "xmlns:..." attributes should really be #FIXED or #REQUIRED.
         * (This is the same test the old goto-based version performed:
         * "xmlns" is tried first, then the "xmlns:" prefix.) */
        if (! check_for_colon (xf, xa->name, 1380, 1381, 1382)) {
            if (uni_utf_strcmp (xa->name, "xmlns") == 0
                || uni_utf_strncmp (xa->name, "xmlns:", 6) == 0) {
                if (xa->default_type != fixed && xa->default_type != required)
                    add_xml_warning (xf, 1390, xa->name);
            }
        }

        /* If this attribute is xml:space, then it has to be an
         * enumerated attribute type with exactly two possible values,
         * default and preserve (in either order). */
        if (uni_utf_strcmp (xa->name, "xml:space") == 0) {
            if (xa->type != enumeration)
                /* XML 1.0 spec, par. 2.10 says it has to be enumerated */
                add_xml_error (xf, 584, uni_truncate_to (xa->name, 20));
            else if (xa->nmtoklen != 2)
                add_xml_error (xf, 584, uni_truncate_to (xa->name, 20));
            else if (! ((uni_utf_strcmp (xa->nmtokens[0], "preserve") == 0
                         && uni_utf_strcmp (xa->nmtokens[1], "default") == 0)
                        || (uni_utf_strcmp (xa->nmtokens[0], "default") == 0
                            && uni_utf_strcmp (xa->nmtokens[1], "preserve") == 0)))
                add_xml_error (xf, 584, uni_truncate_to (xa->name, 20));
        }

        /* If this attribute is xml:lang, then it has to be a CDATA,
         * nmtoken or enumerated attribute type. */
        if (uni_utf_strcmp (xa->name, "xml:lang") == 0)
            if (xa->type != cdata && xa->type != nmtoken && xa->type != enumeration)
                add_xml_error (xf, 583, uni_truncate_to (xa->name, 20));

        /* XML standard, section 3.3.1: only one ID attribute per element */
        if (xa->type == id) {
            for (i = 0; i < xe->attlistlen; i++)
                if (xe->attlist[i]->type == id) {
                    /* Build "elname (id "attname")" for the message:
                     * 6 + 2 characters of punctuation plus the terminator. */
                    len = uni_strlen (elname) + uni_strlen (xa->name) + 9;
                    wtmp = malloc (len * sizeof (my_wchar_t));
                    /* was unchecked; a failed malloc went straight into
                     * uni_strcpy() */
                    if (wtmp == NULL)
                        errabort (40, "malloc() error in %s\n", "add_attribute()");
                    uni_strcpy (wtmp, elname);
                    uni_utf_strcat (wtmp, " (id \"");
                    uni_strcat (wtmp, xa->name);
                    uni_utf_strcat (wtmp, "\")");
                    add_xml_error (xf, 580, wtmp);
                    free (wtmp);
                }
        }

        /* Make room in (xml_element *)xe 's attlist for a new attribute */
        xe->attlistlen++;
        if (xe->attlistlen == 1) {
            if ((xe->attlist = malloc (sizeof (xml_attribute *))) == NULL)
                errabort (40, "malloc() error in %s\n", "add_attribute()");
        } else {
            xe->attlist = realloc (xe->attlist, xe->attlistlen * sizeof (xml_attribute *));
            if (xe->attlist == NULL)
                errabort (41, "realloc() error in %s\n", "add_attribute()");
        }

        /* now, finally, add the new attribute */
        xe->attlist[xe->attlistlen - 1] = xa;
    }

    /* If a default is given, make sure that default is a valid instance
     * of the attribute. */
    if (xa->default_type == defaulted || xa->default_type == fixed) {
        if (xa->type == id)
            /* Makes no sense for an ID attribute to be #FIXED or defaulted */
            add_xml_warning (xf, 587, uni_truncate_to (xa->name, 20));

        if (xa->default_val) {
            /* See s 3.3.3 of the XML standard on attribute whitespace.
             * Keep the raw text around for diagnostics. */
            wtmp = uni_strdup (xa->default_val);
            /* was unchecked; wtmp is dereferenced below */
            if (wtmp == NULL)
                errabort (40, "malloc() error in %s\n", "add_attribute()");

            uni_map_whitespace_to_space (xa->default_val);
            if (xa->type != cdata)
                /* non-CDATA attributes get further whitespace normalization */
                uni_map_spaces_to_space (xa->default_val);

            expansion = map_entities (xf, xa->default_val,
                                      MAP_CHAR_ENTITIES | MAP_GENERAL_ENTITIES |
                                      NO_EXTERNAL_EREFS | ABORT_ON_FAILURE, 0);
            free (xa->default_val);
            xa->default_val = expansion;

            if (expansion == NULL) {
                /* if there was an entity-expansion error, treat as implied */
                add_xml_error (xf, 621, uni_truncate_to (wtmp, 20));
                xa->default_type = implied;
            } else if (xa->type != cdata) {
                /* Does whitespace normalization give us a blank namespace? */
                if (uni_utf_strcmp (xa->name, "xmlns") == 0)
                    if (*wtmp && ! *xa->default_val)
                        add_xml_warning (xf, 620, uni_truncate_to (xa->name, 20));
            }
            free (wtmp);

            /* if there was an error, xa->default_val was reset to NULL */
            if (xa->default_val) {
                /* Now check to make sure that the default is okay */
                nv = create_name_val (xa->name, xa->default_val, NULL, NULL,
                                      xf->lineno, yes, NULL);
                /* was unchecked; nv is handed to the checkers below */
                if (nv == NULL)
                    errabort (40, "malloc() error in %s\n", "add_attribute()");

                if (! already_declared) {
                    /* Do full checks */
                    if (! check_attribute (xf, elname, nv, DONT_INSERT_IDS))
                        add_xml_error (xf, 589, uni_truncate_to (xa->default_val, 20));
                } else {
                    /* Don't do full checks (this attribute was already
                     * declared, and will be discarded anyway) */
                    if (! basic_attribute_integrity_check (xf, xa, nv, DONT_INSERT_IDS))
                        add_xml_error (xf, 589, uni_truncate_to (xa->default_val, 20));
                }
                free_name_val (nv);

                if (xf->standalone == yes && in_external_dtd_subset (xf))
                    add_xml_warning (xf, 588, uni_truncate_to (xa->name, 20));
            }
        }
    }

    if (already_declared)
        /* caller frees xa on zero return value */
        return 0;

    if (xmlparse_env.debug_level >= 5) {
        /* first conversion is duplicated so the second cannot clobber it */
        tmp = strdup (utf_16_to_utf_8 (uni_truncate_to (xa->name, 20)));
        xwrap (errdebug (5, "added attribute %s to element %s (count = %d)\n", tmp,
                         utf_16_to_utf_8 (uni_truncate_to (elname, 20)),
                         xe->attlistlen));
        free (tmp);
    }
    return xe->attlistlen;
}

/*
 * add_notname
 *
 * Add a given notation name to an xml_file's notation_names table,
 * along with its associated sysid.  Returns zero on error (e.g., if
 * the notation name is already in the table).  Otherwise returns
 * the number of elements in the hash table (presumably at least 1).
*/
size_t
add_notname (struct xml_file *xf, my_wchar_t *name, my_wchar_t *value)
{
    struct rg_htable_item entry;
    struct rg_htable_item *existing;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "adding %s to notation_names hashtable (size = %d)\n",
                         utf_16_to_utf_8 (uni_truncate_to (name, 20)),
                         xf->notation_names->no_items));

    /* A NULL key makes rg_find_item() search on uni_key instead. */
    entry.data = NULL;
    entry.uni_key = name;
    entry.key = NULL;

    existing = rg_find_item (xf->notation_names, entry);
    if (existing != NULL) {
        /* Already declared: always warn, and warn once more when the new
         * sysid differs from the stored one.  The table is left untouched. */
        add_xml_warning (xf, 700, uni_truncate_to (name, 20));
        if (uni_strcmp (existing->data, value) != 0)
            add_xml_warning (xf, 701, uni_truncate_to (name, 20));
        xwrap (errdebug (3, "error adding %s to notation_names hashtable (size = %d)\n",
                         utf_16_to_utf_8 (uni_truncate_to (name, 20)),
                         xf->notation_names->no_items));
        return 0;
    }

    /* New notation: the table stores its own copies of name and sysid. */
    entry.uni_key = uni_strdup (name);
    entry.data = uni_strdup (value);
    if (entry.uni_key == NULL || entry.data == NULL)
        errabort (40, "malloc() error in %s\n", "add_notname()");
    rg_add_item (xf->notation_names, entry);

    xwrap (errdebug (5, "added %s to notation_names hashtable (new size = %d)\n",
                     utf_16_to_utf_8 (uni_truncate_to (name, 20)),
                     xf->notation_names->no_items));

    /* Report how many items are now in the hashtable. */
    return (size_t)xf->notation_names->no_items;
}

/*
 * add_peref
 *
 * Add a given entity name to an xml_file's parameter_entity_names
 * hashtable, along with its associated expansion.  Returns zero on
 * error (e.g., if the key is already present).  Otherwise returns
 * the number of elements in the hash table (presumably at least 1).
*/
size_t
add_peref (struct xml_file *xf, my_wchar_t *name, my_wchar_t *value)
{
    struct rg_htable_item it, *result;

    if (xmlparse_env.debug_level >= 7)
        xwrap (errdebug (7, "adding %s to parameter_entity_name hashtable (size = %d)\n",
                         utf_16_to_utf_8 (uni_truncate_to (name, 20)),
                         xf->parameter_entity_names->no_items));

    /* if it.key == NULL, then rg_find_item will use it.uni_key */
    it.key = NULL;
    it.uni_key = name;
    it.data = NULL;

    if ((result = rg_find_item (xf->parameter_entity_names, it)) != NULL) {
        /* eek; it's already defined: always warn, and warn once more when
         * the new replacement text differs from the stored one */
        add_xml_warning (xf, 750, uni_truncate_to (name, 20));
        if (uni_strcmp (result->data, value) != 0)
            add_xml_warning (xf, 751, uni_truncate_to (name, 20));
        xwrap (errdebug (3, "error adding %s to parameter_entity_name hashtable (size = %d)\n",
                         utf_16_to_utf_8 (uni_truncate_to (name, 20)),
                         xf->parameter_entity_names->no_items));
        /* Zero here = error */
        return 0;
    }

    /* Not defined yet.  Good.  Now do a quick test for recursion. */
    if (expands_recursively (xf, value, '%', name)) {
        /* The name must go through utf_16_to_utf_8() like every other
         * message here; a my_wchar_t string cannot be printed with %s. */
        xwrap (errdebug (5, "parameter entity expands recursively, %s\n",
                         utf_16_to_utf_8 (uni_truncate_to (name, 20))));
        add_xml_error (xf, 1002, uni_truncate_to (name, 20));
        return 0;
    }

    /* the table stores its own copies of the name and replacement text */
    it.uni_key = uni_strdup (name);
    it.data = uni_strdup (value);
    if (it.uni_key == NULL || it.data == NULL)
        errabort (40, "malloc() error in %s\n", "add_peref()");
    rg_add_item (xf->parameter_entity_names, it);

    if (xmlparse_env.debug_level >= 5) {
        xwrap (errdebug (7, "parameter entity's expansion is - \"%s\"\n",
                         utf_16_to_utf_8 (uni_truncate_to (value, 20))));
        xwrap (errdebug (5, "added %s to parameter_entity_name hashtable (new size = %d)\n",
                         utf_16_to_utf_8 (uni_truncate_to (name, 20)),
                         xf->parameter_entity_names->no_items));
    }

    /* report how many items are now in the hashtable */
    return (size_t)xf->parameter_entity_names->no_items;
}

/*
 * expands_recursively
 *
 * Return true if entity replacement text, text, contains a
 * reference to entity name, name (be sure to prepend c to name).
* Name (arg1) should be the name of the entity whose replacement * text is text (arg2). If name is a parameter entity, c should be * '%'. If it's a general entity, then c should be '&'. * * This is a very simplistic test, and only catches the most obvious * cases of recursion. The map_entities() routine catches other, * less obvious, cases by just checking the depth of recursion. * Past a certain point it's obvious there's a problem. * * This code is very similar to what we find in check_gt_and_lt() * in parsutil.y. If there are bugs in check_gt_and_lt(), there * are probably bugs here as well. */ static int expands_recursively (struct xml_file *xf, my_wchar_t *text, my_wchar_t c, my_wchar_t *name) { size_t j; my_wchar_t *wp; enum where_am_i state; int marked_section_nesting_level; if (! (c == '&' || c == '%')) errabort (46, "unexpected value for c in %s\n", "expands_recursively()"); /* This code is very similar to what we find in check_gt_and_lt() in * parsutil.y. If there are bugs in check_gt_and_lt(), there are * probably bugs here as well. */ state = nowhere; marked_section_nesting_level = 0; for (wp = text; *wp != 0; ++wp) { switch (*wp) { case '%': case '&': if (*wp != c) /* just move up to next semicolon */ while (*wp && *wp != ';') wp++; else /* If we're not in markup, etc., and if there's more of wp left... */ if (state == nowhere && *++wp) { /* ... then see if wp now starts with "name;" */ for (j = 0; *wp != ';' && name[j] != 0; j++, wp++) if (name[j] != *wp) break; /* If it does start with "name;", we have recursion - at * least if expands_recursively() was called properly. 
*/ if (*wp == ';' && name[j] == 0) return 1; else while (*wp && *wp != ';') wp++; } break; case '\'': switch (state) { case in_markup: state = in_single_quote; break; case in_single_quote: state = in_markup; default: break; } break; case '"': switch (state) { case in_markup: state = in_double_quote; break; case in_double_quote: state = in_markup; default: break; } break; case '<': switch (state) { case in_markup: /* can't parse this; just say it's okay (non-recursive) */ return 0; case in_pi: case in_comment: case in_single_quote: case in_double_quote: break; case in_marked_section: if (*(wp + 1) == '!' && *(wp + 2) == '[') { marked_section_nesting_level++; wp += 2; } break; default: switch (*(wp + 1)) { case '?': state = in_pi; wp++; break; case '!': if (*(wp + 2) == '-' && *(wp + 3) == '-') { state = in_comment; wp += 3; } else if (*(wp + 2) == '[') { marked_section_nesting_level = 1; state = in_marked_section; wp += 2; } else state = in_markup; break; default: state = in_markup; break; } break; } break; case '?': if (state == in_pi && *(wp + 1) == '>') { state = nowhere; wp++; } break; case ']': if (state == in_marked_section && *(wp + 1) == ']' && *(wp + 2) == '>') { /* marked sections must nest properly */ if (--marked_section_nesting_level == 0) state = nowhere; else if (marked_section_nesting_level < 0) /* can't parse it; just say it's okay */ return 0; wp += 2; } break; case '-': if (state == in_comment && *(wp + 1) == '-' && *(wp + 2) == '>') { state = nowhere; wp += 2; } break; case '>': switch (state) { case in_markup: state = nowhere; case in_pi: case in_comment: case in_marked_section: case in_single_quote: case in_double_quote: break; default: /* can't parse it; just say it's okay */ return 0; } break; default: break; } } /* Text is probably okay if we get to here and don't find recursion */ return 0; } /* * add_ext_peref * * Add a given entity name to an xml_file's parameter_entity_names * hashtable, along with its expansion - which in this case 
comes * from an external file. Returns NULL if the parameter entity has * already been declared (and is thus in the table) or if there is * an error reading the file containing the external data (somewhat * unlikely). Otherwise returns the contents of external_xf, minus * the TextDecl - and mapped according to map_what). */ my_wchar_t * add_ext_peref (struct xml_file *xf, my_wchar_t *name, struct xml_file *external_xf, int map_what) { my_wchar_t *data, *oldval; struct rg_htable_item it; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "adding %s to parameter_entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->parameter_entity_names->no_items)); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = name; it.data = NULL; if (rg_find_item (xf->parameter_entity_names, it) == NULL) { /* Not defined yet. Good. Now see if we can actually read external_xf */ if ((data = read_entire_xml_file (xf, external_xf)) == NULL) return NULL; /* Do a brief, naive check for recursion */ if (expands_recursively (xf, data, '%', name)) { xwrap (errdebug (5, "ext PE expands recursively, %s\n", uni_truncate_to (name, 20))); add_xml_error (xf, 1002, uni_truncate_to (name, 20)); return NULL; } else { /* Now, finally, add the replacement text to the external entity hash table */ it.uni_key = uni_strdup (name); it.data = map_entities (xf, data, map_what, 0); if (it.data == NULL || it.uni_key == NULL) errabort (40, "malloc() error in %s\n", "add_ext_peref()"); rg_add_item (xf->parameter_entity_names, it); xwrap (errdebug (5, "added %s to parameter_entity_name hashtable (new size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->parameter_entity_names->no_items)); } } else { /* eek; it's already defined */ add_xml_warning (xf, 750, name); if ((oldval = expand_peref (xf, name, WITHOUT_WHITESPACE)) == NULL) add_xml_warning (xf, 751, uni_truncate_to (name, 20)); else { /* See if we can actually read external_xf 
*/ if ((data = read_entire_xml_file (xf, external_xf)) == NULL) return NULL; /* Quick and dirty check for recursion */ if (expands_recursively (xf, data, '%', name)) { xwrap (errdebug (5, "ext PE expands recursively, %s\n", uni_truncate_to (name, 20))); add_xml_error (xf, 1002, uni_truncate_to (name, 20)); return NULL; } else { /* data here has to be freed, unlike data just above */ data = map_entities (xf, data, map_what, 0); if (data && uni_strcmp (oldval, data) != 0) { add_xml_warning (xf, 751, uni_truncate_to (name, 20)); free (data); } } } xwrap (errdebug (3, "error adding %s to parameter_entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->parameter_entity_names->no_items)); return NULL; } /* report how many items are now in the hashtable */ return it.data; } /* * add_eref * * Add a given entity name to a given xml_file's entity_names * hashtable, along with its expansion. Returns zero on error * (e.g., if the key is already present). Otherwise returns the * number of elements in the hash table (presumably at least 1). */ size_t add_eref (struct xml_file *xf, my_wchar_t *name, my_wchar_t *value) { size_t len; my_wchar_t *tmp, *expansion; struct rg_htable_item it, *result; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "adding %s to entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->entity_names->no_items)); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = name; it.data = NULL; if (rg_find_item (xf->entity_names, it) == NULL && rg_find_item (xf->external_entity_names, it) == NULL && rg_find_item (xf->unparsed_entity_names, it) == NULL) { /* Not defined. Good. Now do a simple-minded recursion test. 
*/ if (expands_recursively (xf, value, '&', name)) { xwrap (errdebug (5, "general entity expands recursively, %s\n", uni_truncate_to (name, 20))); add_xml_error (xf, 1002, uni_truncate_to (name, 20)); return 0; } else { it.uni_key = uni_strdup (name); it.data = uni_strdup (value); if (it.uni_key == NULL || it.data == NULL) errabort (40, "malloc() error in %s\n", "add_eref()"); rg_add_item (xf->entity_names, it); if (xmlparse_env.debug_level >= 5) { xwrap (errdebug (7, "entity's expansion is - \"%s\"\n", utf_16_to_utf_8 (uni_truncate_to (value, 20)))); xwrap (errdebug (5, "added %s to entity_name hashtable (new size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->entity_names->no_items)); } } } else { if (! is_builtin (name)) { /* eek; it's already defined & it's not built-in */ add_xml_warning (xf, 800, uni_truncate_to (name, 20)); xwrap (errdebug (3, "error adding %s to entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->entity_names->no_items)); } if ((result = rg_find_item (xf->entity_names, it)) == NULL) add_xml_warning (xf, 801, uni_truncate_to (name, 20)); else { /* simple test; map_entities() below does this better */ if (expands_recursively (xf, value, '&', name)) { xwrap (errdebug (5, "general entity expands recursively, %s\n", uni_truncate_to (name, 20))); add_xml_error (xf, 1002, uni_truncate_to (name, 20)); } else { /* if we haven't yet expanded general entities; do that now */ expansion = map_entities (xf, value, MAP_GENERAL_ENTITIES, 0); /* compare old value to new value */ if (expansion == NULL || uni_strcmp (result->data, expansion) != 0) { /* incompatible redeclaration */ len = uni_strlen (name) + uni_strlen (result->data) + uni_strlen (value) + 30; tmp = malloc (len * sizeof (my_wchar_t)); uni_strcpy (tmp, name); uni_utf_strcat (tmp, " (old value \""); uni_strcat (tmp, result->data); uni_utf_strcat (tmp, "\"; new value \""); uni_strcat (tmp, value); uni_utf_strcat (tmp, "\")"); add_xml_warning (xf, 
801, tmp); free (tmp); } if (expansion) free (expansion); } } /* Zero here = error */ return 0; } /* report how many items are now in the hashtable */ return (size_t)xf->entity_names->no_items; } /* * add_ext_eref * * Add a given entity name to an xml_file's external_entity_names * hashtable, along with its expansion. Returns NULL if the key is * already present or if there is an error reading the external * entity data (somewhat unlikely). Otherwise returns a pointer to * the expansion text of the entity. */ my_wchar_t * add_ext_eref (struct xml_file *xf, my_wchar_t *name, xml_file *external_xf, int map_what) { size_t len; struct rg_htable_item it; my_wchar_t *tmp, *data, *expansion, *oldval; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "adding %s to external_entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->external_entity_names->no_items)); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = name; it.data = NULL; if (rg_find_item (xf->external_entity_names, it) == NULL && rg_find_item (xf->entity_names, it) == NULL && rg_find_item (xf->unparsed_entity_names, it) == NULL) { /* See if we can actually read external_xf */ if ((data = read_entire_xml_file (xf, external_xf)) == NULL) return NULL; /* Do a naive, quick check for recursion */ if (expands_recursively (xf, data, '%', name)) { xwrap (errdebug (5, "ext entity expands recursively, %s\n", uni_truncate_to (name, 20))); add_xml_error (xf, 1002, uni_truncate_to (name, 20)); return NULL; } else { /* Now, finally, add the replacement text to the external entity hash table */ it.uni_key = uni_strdup (name); it.data = map_entities (xf, data, map_what, 0); if (it.data == NULL || it.uni_key == NULL) errabort (40, "malloc() error in %s\n", "add_ext_eref()"); rg_add_item (xf->external_entity_names, it); xwrap (errdebug (5, "added %s to external_entity_name hashtable (new size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), 
xf->external_entity_names->no_items)); } } else { /* eek; it's already defined */ add_xml_error (xf, is_builtin (name) ? 754 : 800, uni_truncate_to (name, 20)); if ((oldval = expand_ext_eref (xf, name)) == NULL) add_xml_warning (xf, 801, uni_truncate_to (name, 20)); else { /* See if we can actually read external_xf */ if ((data = read_entire_xml_file (xf, external_xf)) == NULL) return NULL; /* Do a naive, quick check for recursion */ if (expands_recursively (xf, data, '%', name)) { xwrap (errdebug (5, "ext entity expands recursively, %s\n", uni_truncate_to (name, 20))); add_xml_error (xf, 1002, uni_truncate_to (name, 20)); } else { /* expansion has to be freed later, unlike data above */ expansion = map_entities (xf, data, map_what, 0); if (! expansion || uni_strcmp (oldval, expansion) != 0) { add_xml_warning (xf, 801, uni_truncate_to (name, 20)); /* incompatible redeclaration */ len = uni_strlen (name) + uni_strlen (oldval) + uni_strlen (data) + 30; tmp = malloc (len * sizeof (my_wchar_t)); uni_strcpy (tmp, name); uni_utf_strcat (tmp, " (old value \""); uni_strcat (tmp, oldval); uni_utf_strcat (tmp, "\"; new value \""); uni_strcat (tmp, data); uni_utf_strcat (tmp, "\")"); add_xml_warning (xf, 801, tmp); free (tmp); } if (expansion) free (expansion); expansion = NULL; } } xwrap (errdebug (3, "error adding %s to external_entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->external_entity_names->no_items)); return NULL; } /* return expansion if all went well */ return it.data; } /* * add_ext_eref_text * * Add a given entity name to an xml_file's external_entity_names * hashtable, along with its expansion. Unlike add_ext_eref() up * above, this routine adds straight text to xf's external entity * name table. It's assumed that this text was formerly read in * from an external file at some point, and is just being manually * inserted into this table, so we don't have to read the file in * again or map any entities. 
* * Returns NULL if the name is already present. Otherwise returns * a pointer to the expansion text of the entity. */ my_wchar_t * add_ext_eref_text (struct xml_file *xf, my_wchar_t *name, my_wchar_t *text) { struct rg_htable_item it; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "adding text to external_entity_name hashtable (size = %d) for %s\n", xf->external_entity_names->no_items, utf_16_to_utf_8 (uni_truncate_to (name, 20)))); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = name; it.data = NULL; if (! (rg_find_item (xf->external_entity_names, it) == NULL && rg_find_item (xf->entity_names, it) == NULL && rg_find_item (xf->unparsed_entity_names, it) == NULL)) return NULL; else { /* Do a naive, quick check for recursion; ERROR MESSAGE WAS ALREADY EMITTED */ if (expands_recursively (xf, text, '%', name)) return NULL; else { /* Now, finally, add the replacement text to the external entity hash table */ it.uni_key = uni_strdup (name); it.data = uni_strdup (text); if (it.data == NULL || it.uni_key == NULL) errabort (40, "malloc() error in %s\n", "add_ext_eref()"); rg_add_item (xf->external_entity_names, it); xwrap (errdebug (5, "added text to external_entity_name hashtable (new size = %d) for %s\n", xf->external_entity_names->no_items, utf_16_to_utf_8 (uni_truncate_to (name, 20)))); } } /* return expansion if all went well */ return it.data; } /* * add_uperef * * Add a given entity name to an xml_file's unparsed_entity_names * hashtable, along with its "expansion." Returns zero if the key * is already present or if there is an error reading the external * entity data (somewhat unlikely). Otherwise returns the number of * elements in the hash table (presumably at least 1). * * Note that in the case of unparsed entities, the keys in the * xf->unparsed_entity_names hashtable are the entity names; the * values are a) the external URIs associated with those names, and * b) the notation names associated with them. 
* * Put differently, xf->unparsed_entity_names stores information as * follows: keys = unparsed entity name; value = xml_unparsed_entity * struct (consists of two fields: 1) a notation name, 2) a system * identifier). The system identifier is NOT the system identifier * associated with the notation (which you can get via the notation * name hashtable (notname). Rather, it is the system identifier * given in the entity declaration itself. */ size_t add_uperef (xml_file *xf, my_wchar_t *name, my_wchar_t *notname, my_wchar_t *sysid) { struct xml_unparsed_entity *xn; struct rg_htable_item it, *result; if (xmlparse_env.debug_level >= 7) xwrap (errdebug (7, "adding %s to unparsed_entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->unparsed_entity_names->no_items)); /* if it.key == NULL, then rg_find_item will use it.uni_key */ it.key = NULL; it.uni_key = name; it.data = NULL; if ((result = rg_find_item (xf->unparsed_entity_names, it)) == NULL && (result = rg_find_item (xf->entity_names, it)) == NULL && (result = rg_find_item (xf->external_entity_names, it)) == NULL) { /* Get the system identifier associated with this notation name */ if (expand_notname (xf, notname) == NULL) /* * as-yet undeclared notation isn't an error acc. to spec; * at the end of the DTD we'll have to go and check for * notname again (see s. 
3.3.1, entity name constraint) */ add_xml_warning (xf, 810, uni_truncate_to (notname, 20)); if ((xn = malloc (sizeof (struct xml_unparsed_entity))) == NULL) errabort (40, "malloc() error in %s\n,", "add_uperef()"); it.uni_key = uni_strdup (name); xn->notname = uni_strdup (notname); xn->sysid = uni_strdup (sysid); if (xn->notname == NULL || xn->sysid == NULL) errabort (40, "malloc() error in %s\n", "add_uperef()"); it.data = xn; rg_add_item (xf->unparsed_entity_names, it); xwrap (errdebug (5, "added %s to unparsed_entity_name hashtable (new size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->unparsed_entity_names->no_items)); } else { /* eek; it's already defined */ add_xml_warning (xf, 800, uni_truncate_to (name, 20)); if ((xn = expand_uperef (xf, name)) == NULL || uni_strcmp (((xml_unparsed_entity *)result->data)->notname, xn->notname) != 0 || uni_strcmp (((xml_unparsed_entity *)result->data)->sysid, xn->sysid) != 0) add_xml_warning (xf, 801, uni_truncate_to (name, 20)); xwrap (errdebug (3, "error adding %s to unparsed_entity_name hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (name, 20)), xf->unparsed_entity_names->no_items)); return 0; } /* report how many items are now in the hashtable */ return (size_t)xf->unparsed_entity_names->no_items; } /* * add_idref * * Add a given val (presumed to be a value for an attribute of * type IDREF) to the xf->idrefs hashtable, along with the line * number where it occurs. * * This routine is different from most other "add" routines in that * it gets called while parsing the actual document content. The * others get called when parsing the DTD. 
*/
/*
 * Implementation notes for add_idref():
 *
 *   - The value stored for each IDREF is a text list of line numbers
 *     ("1, 2, 3").  If lines (arg 3) is non-NULL it is used as the text
 *     to store/append; otherwise xf->lineno is formatted and used.
 *   - Once a list would reach 192 characters, "..." is appended instead
 *     and the list is left alone from then on.
 *   - Returns the number of entries in xf->idrefs after the call.
 */
size_t
add_idref (struct xml_file *xf, my_wchar_t *val, my_wchar_t *lines)
{
  /* a fresh list buffer holds LIST_CAPACITY units; growth stops at LIST_LIMIT */
  enum { LIST_CAPACITY = 196, LIST_LIMIT = 192 };
  char numbuf[64];
  size_t len, extra, capacity;
  my_wchar_t *list;
  struct rg_htable_item item, *result;

  item.key = NULL;
  item.uni_key = val;
  item.data = NULL;

  if ((result = rg_find_item (xf->idrefs, item)) == NULL)
    {
      /*
       * Always allocate a full-size list buffer.  The old code stored an
       * exact-size copy of lines here, which later appends overflowed.
       * Four spare units leave room for a trailing "..." and the NUL.
       */
      capacity = LIST_CAPACITY;
      if (lines && uni_strlen (lines) + 4 > capacity)
        capacity = uni_strlen (lines) + 4;

      item.uni_key = uni_strdup (val);
      list = malloc (capacity * sizeof (my_wchar_t));
      if (item.uni_key == NULL || list == NULL)
        errabort (40, "malloc error in %s\n", "add_idref()");

      if (lines)
        /* lines has a list of line numbers, e.g., "1, 2, 3..." */
        uni_strcpy (list, lines);
      else
        {
          /* lines is null; use current lineno */
          sprintf (numbuf, "%d", xf->lineno);
          uni_strcpy (list, utf_8_to_utf_16 (numbuf));
        }
      item.data = list;
      /* insert in both cases; this call used to be skipped when lines was given */
      rg_add_item (xf->idrefs, item);
      xwrap (errdebug (5, "added %s to idrefs hashtable (new size = %d)\n",
                       utf_16_to_utf_8 (uni_truncate_to (val, 20)),
                       xf->idrefs->no_items));
    }
  else
    {
      /* IDREF exists; add lines or our current lineno to the list */
      list = (my_wchar_t *)result->data;
      len = uni_strlen (list);
      /* a trailing '.' means the list was already closed off with "..." */
      if (len == 0 || list[len - 1] != '.')
        {
          if (lines)
            extra = uni_strlen (lines);
          else
            {
              sprintf (numbuf, "%d", xf->lineno);
              extra = strlen (numbuf);
            }

          if ((len + extra + 2) >= LIST_LIMIT)
            /* no more room */
            uni_utf_strcat (list, "...");
          else
            {
              if (len > 0)
                uni_utf_strcat (list, ", ");
              if (lines)
                uni_strcat (list, lines);
              else
                uni_utf_strcat (list, numbuf);
            }
          xwrap (errdebug (3, "added line number, %d, to entry \"%s\" in idefs hashtable\n",
                           xf->lineno, utf_16_to_utf_8 (uni_truncate_to (val, 20))));
        }
    }

  /* report how many idrefs are now in the hashtable */
  return (size_t)xf->idrefs->no_items;
}

/*
 * add_id
 *
 * ID attribute values are unique for a
given XML document. * This routine adds a given ID value to the document's "ids" * table. Returns NULL if such a value is already present. * * This routine is different from most other "add" routines in that * it gets called while parsing the actual document content. The * others get called when parsing the DTD. */ size_t add_id (struct xml_file *xf, struct name_val *nv) { size_t len; my_wchar_t *tmp; struct name_val *old_nv; struct rg_htable_item item, *result; static struct name_val *dummy_nv = NULL; /* return NULL if ID is already taken */ item.key = NULL; item.uni_key = nv->val; item.data = NULL; if ((result = rg_find_item (xf->ids, item)) == NULL) { if (xmlparse_env.keep_children == yes) /* we'll keep the parse tree around; insert pointer into it */ item.data = nv; else { /* we'll discard the parse tree; don't insert pointer into it */ tmp = utf_8_to_utf_16 ("dummy ID value"); dummy_nv = create_name_val (tmp, tmp, NULL, NULL, nv->lineno, yes, NULL); item.data = dummy_nv; } item.uni_key = uni_strdup (nv->val); rg_add_item (xf->ids, item); xwrap (errdebug (5, "added %s to ids hashtable (new size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (nv->name, 20)), xf->ids->no_items)); } else { /* eek; XML spec says ID values must be unique */ xwrap (errdebug (3, "error adding %s to ids hashtable (size = %d)\n", utf_16_to_utf_8 (uni_truncate_to (nv->name, 20)), xf->ids->no_items)); old_nv = (struct name_val *)result->data; if (old_nv->parent == NULL) add_xml_error (xf, 661, uni_truncate_to (nv->name, 20)); else { len = uni_strlen (nv->name) + uni_strlen (old_nv->parent->name) + uni_strlen (old_nv->name) + uni_strlen (old_nv->val) + 12; tmp = malloc (len * sizeof (my_wchar_t)); uni_strcpy (tmp, nv->name); uni_utf_strcat (tmp, " (cf. 
"); uni_strcat (tmp, old_nv->parent->name); uni_utf_strcat (tmp, " "); uni_strcat (tmp, old_nv->name); uni_utf_strcat (tmp, "=\""); uni_strcat (tmp, old_nv->val); uni_utf_strcat (tmp, "\")"); add_xml_error (xf, 661, uni_truncate_to (nv->name, 20)); free (tmp); } return 0; } /* report how many ids are now in the hashtable */ return (size_t)xf->ids->no_items; } #define cleanup(buf) { free (buf); free (stack.states); return NULL; } /* * map_entities: * * Map all &entity; refs in a string, s, to their expansion text. * Map_what (arg 3) determines what gets mapped (char, general, * and/or parameter entities). * * NOTE WELL: Returns a pointer to a malloc'd buffer THAT MUST BE * FREED, eventually. This is not my usual programming style, but * it's warranted here, to facilitate recursion. * * Options are passed via map_what (arg 3): * * MAP_CHAR_ENTITIES map character entities to chars * MAP_GENERAL_ENTITIES expand general entities * FLAG_BYPASSED_AMPERSANDS flag unexpanded (PI, comment, etc.) &s * UNPARSED_ENTITIES_OK uperefs are okay; no warning msgs * MAP_PARAMETER_ENTITIES expand parameter entities * MAP_AMP_AND_LT convert && and &< to & and < * PAD_PARAMETER_ENTITIES add leading/trailing spaces to perefs * NO_EXTERNAL_EREFS emit error msg if a general external * occurs (used in attribute values) * ABORT_ON_FAILURE return NULL on entity format or resolu- * tion errors * UNRESOLVABLES_OKAY squelch err msgs about unresolvable * entities (overrides ABORT_ON_FAILURE * for unresolvable entities) * * The final argument, depth, is always 0 (it gets incremented * internally). 
*/
/*
 * Implementation notes for map_entities():
 *
 *   - s must be writable: while a reference name is being looked up the
 *     terminating ';' is temporarily overwritten with NUL and then
 *     restored.
 *   - NULL is returned when the nesting depth exceeds 31 (error 1002) and
 *     on the ABORT_ON_FAILURE paths.
 *   - When an entity reference is expanded, the function returns
 *     (text so far) + (recursively mapped expansion) + (recursively mapped
 *     remainder of s) instead of continuing its own scan.
 */
my_wchar_t *
map_entities (struct xml_file *xf, my_wchar_t *s, int map_what, size_t depth)
{
  char *p;
  long int l;
  int lineno, errcode;
  struct xml_unparsed_entity *xue;
  struct state_stack stack = { NULL, 0, 0 };
  my_wchar_t c, *wp, *tmp, *tmp2, *ref, *wpbuf, **endptr;
  size_t i, j, k, len, pad, slen, tmplen, base, wpbuflen;

  if (++depth > 31)
    {
      /* If we're this deep, there is probably recursion */
      add_xml_error (xf, 1002, uni_truncate_to (s, 20));
      return NULL;
    }

  /* What are we doing? */
  xwrap (errdebug (7, "mapping entities in %s\n", utf_16_to_utf_8 (uni_truncate_to (s, 20))));

  slen = uni_strlen (s);
  wpbuflen = len = slen + 1;
  if ((wpbuf = malloc (wpbuflen * sizeof (my_wchar_t))) == NULL)
    errabort (40, "malloc() error in %s\n", "map_entities()");

  lineno = 1;
  push_state (&stack, nowhere);

  /* i indexes the input, s; j indexes the output, wpbuf */
  for (i = j = 0; i < slen; i++)
    {
      switch (s[i])
        {
        case '\n':
          lineno++;
          wpbuf[j++] = s[i];
          break;

        case '\'':
          switch (top_state (&stack))
            {
            case in_markup:
              push_state (&stack, in_single_quote);
              break;
            case in_single_quote:
              pop_state (&stack);
              /* fall through */
            default:
              break;
            }
          wpbuf[j++] = s[i];
          break;

        case '"':
          switch (top_state (&stack))
            {
            case in_markup:
              push_state (&stack, in_double_quote);
              break;
            case in_double_quote:
              pop_state (&stack);
              /* fall through */
            default:
              break;
            }
          wpbuf[j++] = s[i];
          break;

        case '<':
          if (top_state (&stack) == nowhere)
            {
              switch (s[i + 1])
                {
                case '?':
                  push_state (&stack, in_pi);
                  wpbuf[j++] = s[i++];
                  break;
                case '!':
                  if (s[i + 2] == '-' && s[i + 3] == '-')
                    {
                      /* in a comment */
                      push_state (&stack, in_comment);
                      for (k = 0; k < 3; k++)
                        wpbuf[j++] = s[i++];
                    }
                  else if (uni_utf_strncmp (&s[i + 2], "[CDATA[", 7) == 0)
                    {
                      /* in a CDATA section */
                      push_state (&stack, in_cdsect);
                      for (k = 0; k < 8; k++)
                        wpbuf[j++] = s[i++];
                    }
                  else
                    /* Not in a comment or CDATA section */
                    push_state (&stack, in_markup);
                  break;
                default:
                  push_state (&stack, in_markup);
                  break;
                }
            }
          wpbuf[j++] = s[i];
          break;

        case ']':
          if (top_state (&stack) == in_cdsect)
            if (uni_utf_strncmp (&s[i], "]]>", 3) == 0)
              {
                /* finished a CDATA section */
                pop_state (&stack);
                wpbuf[j++] = s[i++];
                wpbuf[j++] = s[i++];
              }
          wpbuf[j++] = s[i];
          break;

        case '?':
          if (top_state (&stack) == in_pi && s[i + 1] == '>')
            {
              /* finished PI */
              pop_state (&stack);
              wpbuf[j++] = s[i++];
            }
          wpbuf[j++] = s[i];
          break;

        case '-':
          if (top_state (&stack) == in_comment && s[i + 1] == '-' && s[i + 2] == '>')
            {
              /* finished a comment */
              pop_state (&stack);
              wpbuf[j++] = s[i++];
              wpbuf[j++] = s[i++];
            }
          wpbuf[j++] = s[i];
          break;

        case '>':
          if (top_state (&stack) == in_markup)
            pop_state (&stack);
          wpbuf[j++] = s[i];
          break;

        case '%':
          if (top_state (&stack) == in_comment
              || top_state (&stack) == in_cdsect
              || top_state (&stack) == in_pi)
            wpbuf[j++] = s[i];
          else if (! ((map_what & MAP_PARAMETER_ENTITIES) && ! uni_isspace (s[i + 1])))
            wpbuf[j++] = s[i];
          else
            {
              /* find the trailing ';' */
              for (k = i + 1; s[k] != 0 && s[k] != ';'; k++)
                {
                  if (uni_isspace (s[k])
                      || (s[k] <= 0x80
                          && (iscntrl (s[k])
                              || (ispunct (s[k]) && ! uni_utf_any (&s[k],":.-_")))))
                    break;
                }
              /* temporarily terminate the name at s[k]; c remembers what was there */
              c = s[k];
              s[k] = 0;
              if (c != ';')
                {
                  /* found null or junk chars before terminating ';' */
                  s[k] = c;
                  add_unique_error (xf, depth ? 1003 : 1010, uni_truncate_to (&s[i], (k - i) + 1));
                  if (map_what & ABORT_ON_FAILURE)
                    cleanup (wpbuf);
                  wpbuf[j++] = s[i];
                }
              else
                {
                  /* if we're not in a quoted entity val, add leading/trailing space */
                  pad = (map_what & PAD_PARAMETER_ENTITIES) ? WITH_WHITESPACE : WITHOUT_WHITESPACE;
                  if ((tmp = expand_peref (xf, &s[i + 1], pad)) == NULL)
                    {
                      s[k] = c;
                      /* unresolvables_okay overrides err msg and abort_on_failure */
                      if (map_what & UNRESOLVABLES_OKAY)
                        xwrap (errdebug (7, "skipping unresolvable peref, %s\n",
                                         utf_16_to_utf_8 (uni_truncate_to (&s[i], 20))));
                      else
                        {
                          /* reference to an undefined entity is usually an error */
                          add_xml_error (xf, 1012, uni_truncate_to (&s[i], 20));
                          if (map_what & ABORT_ON_FAILURE)
                            cleanup (wpbuf);
                        }
                      wpbuf[j++] = s[i];
                    }
                  else
                    {
                      /* code here is the same as for general entities */
                      if (xmlparse_env.debug_level >= 7)
                        {
                          p = strdup (utf_16_to_utf_8 (tmp));
                          xwrap (errdebug (7, "in map_entities(): expanded \"%s\" as \"%s\"\n",
                                           utf_16_to_utf_8 (&s[i + 1]), p));
                          free (p);
                        }
                      /* jumps into the general-entity code below; relies on
                       * tmp, k and c as set up here */
                      goto map_entity_recursively;
                    }
                }
              s[k] = c;
            } /* end of '%' section */
          break;

        case '&':
          if (top_state (&stack) != nowhere
              && top_state (&stack) != in_single_quote
              && top_state (&stack) != in_double_quote)
            {
              if (s[i + 1] == '#')
                /* character entities are handled below */
                goto char_entity;
              if ((map_what & FLAG_BYPASSED_AMPERSANDS)
                  && ((top_state (&stack) == in_pi)
                      || (top_state (&stack) == in_cdsect)
                      || (top_state (&stack) == in_comment)))
                {
                  /* flag bypassed ampersands */
                  if ((tmp = malloc (32 * sizeof (my_wchar_t))) == NULL)
                    errabort (40, "malloc error in %s\n", "map_entities()");
                  uni_strcpy (tmp, uni_truncate_to (&s[i], 2));
                  uni_utf_strcat (tmp, " (");
                  uni_strcat (tmp, state_to_string (top_state (&stack)));
                  uni_utf_strcat (tmp, ")");
                  add_xml_warning (xf, 822, tmp);
                  free (tmp);
                }
              wpbuf[j++] = s[i];
            }
          else if (map_what & (MAP_CHAR_ENTITIES | MAP_GENERAL_ENTITIES | MAP_AMP_AND_LT))
            {
              switch (s[i + 1])
                {
                case '&':
                case '<':
                  /* if we're mapping && and &< to '&' and '<', omit this char */
                  (map_what & MAP_AMP_AND_LT) ? i++ : (wpbuf[j++] = s[i++]);
                  /* but don't omit the '&' or '<'; rather, add it to wpbuf */
                  wpbuf[j++] = s[i];
                  break;

                case '#':
                  /* note the goto above */
                char_entity:
                  if (! (map_what & MAP_CHAR_ENTITIES))
                    wpbuf[j++] = s[i];
                  else
                    {
                      /* move past '&#'; set default base to 10 */
                      i += 2;
                      base = 10;
                      if (s[i] == 'x')
                        {
                          i++;
                          base = 16;
                        }
                      /* endptr must point at a real pointer object for
                       * uni_strtol() to write through; this line had been
                       * mangled into a stray non-ASCII character */
                      wp = &s[i];
                      endptr = &wp;
                      /* now try to convert the digits to a wchar; clear
                       * errno first so a stale ERANGE can't be mistaken
                       * for a conversion failure */
                      errno = 0;
                      l = uni_strtol (&s[i], endptr, base);
                      if (l == LONG_MAX && errno == ERANGE)
                        {
                          /* oops; can't convert */
                          if (s[i - 1] == 'x')
                            i--;
                          i -= 2;
                          wpbuf[j++] = s[i];
                        }
                      else if (**endptr != ';')
                        {
                          /* oops; there were non-digits in the char ref */
                          if (s[i - 1] == 'x')
                            i--;
                          add_xml_error (xf, depth ? 1003 : 1010, uni_truncate_to (&s[i - 2], 20));
                          if (map_what & ABORT_ON_FAILURE)
                            cleanup (wpbuf);
                          i -= 2;
                          wpbuf[j++] = s[i];
                        }
                      else
                        {
                          errcode = 0;
                          if (l <= 0 || l > 0x0010FFFFU)
                            /* oops; too big to be a valid Unicode char */
                            errcode = depth ? 1003 : 1010;
                          else if (l >= 0xD800 && l <= 0xDFFF)
                            /* Oops; char entity can't specify a value in the surrogate block */
                            errcode = 1054;
                          else if (l <= 0x1F && ! (l == 0x09 || l == 0x0A || l == 0x0D))
                            /* shouldn't be using control characters */
                            errcode = 1056;
                          else if (l == 0xFFFE)
                            /* byte-order marker is a valid Unicode character; FFFE is not */
                            errcode = 1055;
                          else if (l == 0xFFFF)
                            /* FFFF will never be a valid Unicode character */
                            errcode = 1057;
                          else if (l >= 0x80 && l <= 0x9F)
                            {
                              if (s[i - 1] == 'x')
                                i--;
                              /* A "C1" (as opposed to ASCII "C0") control character */
                              add_xml_warning (xf, 1058, uni_truncate_to (&s[i - 2], 20));
                            }
                          if (errcode)
                            {
                              if (s[i - 1] == 'x')
                                i--;
                              add_xml_error (xf, errcode, uni_truncate_to (&s[i - 2], 20));
                              if (map_what & ABORT_ON_FAILURE)
                                cleanup (wpbuf);
                              /* substitute the Unicode "dunno what this is" char, FFFD */
                              l = 0xFFFDU;
                            }
                          /* Convert integral l to a character */
                          tmp = int_2_utf_16_string (l);
                          if ((tmplen = uni_strlen (tmp)) > 1)
                            if ((len += tmplen) > wpbuflen)
                              {
                                /* buffer isn't big enough; enlarge it */
                                wpbuflen = len + uni_strlen (&s[i]) + 1;
                                wpbuf = realloc (wpbuf, wpbuflen * sizeof (my_wchar_t));
                                if (wpbuf == NULL)
                                  errabort (41, "realloc() error in %s\n", "map_entities()");
                              }
                          memcpy (&wpbuf[j], tmp, tmplen * sizeof (my_wchar_t));
                          /* leave i on the ';'; the loop increment steps past it */
                          i += (*endptr - &s[i]);
                          j += tmplen;
                        }
                    }
                  break;

                default:
                  if (! (map_what & MAP_GENERAL_ENTITIES) || uni_isspace (s[i + 1]))
                    wpbuf[j++] = s[i];
                  else
                    {
                      /* skip to next semicolon */
                      for (k = i + 1; s[k] != 0 && s[k] != ';'; k++)
                        {
                          if (uni_isspace (s[k])
                              || (s[k] <= 0x80
                                  && (iscntrl (s[k])
                                      || (ispunct (s[k]) && ! uni_utf_any (&s[k],":.-_")))))
                            break;
                        }
                      /* temporarily terminate the name at s[k]; c remembers what was there */
                      c = s[k];
                      s[k] = 0;
                      if (c != ';')
                        {
                          /* found null or junk chars before terminating ';' */
                          s[k] = c;
                          add_unique_error (xf, depth ? 1003 : 1010,
                                            uni_truncate_to (&s[i], (k - i) + 1));
                          if (map_what & ABORT_ON_FAILURE)
                            cleanup (wpbuf);
                          wpbuf[j++] = s[i];
                        }
                      else
                        {
                          tmp = expand_ext_eref (xf, &s[i + 1]);
                          if (map_what & NO_EXTERNAL_EREFS && tmp)
                            {
                              /* XML 1.0 says ext erefs in attvals are fatal errors */
                              add_xml_error (xf, 1211, uni_truncate_to (tmp, 20));
                              if (map_what & ABORT_ON_FAILURE)
                                cleanup (wpbuf);
                            }
                          if (tmp == NULL && (tmp = expand_eref (xf, &s[i + 1])) == NULL)
                            {
                              /* Depending on what the programmer tells us, either flag
                               * unparsed entities as errors, or else ignore them.  In
                               * either event, we don't want to expand them. */
                              if (! (map_what & UNPARSED_ENTITIES_OK))
                                {
                                  /* 1014 -> references to unparsed entities aren't allowed */
                                  xue = expand_uperef (xf, &s[i + 1]);
                                  s[k] = c;
                                  if (! xue)
                                    /* if not an uperef, jump to the normal "unresolvable" section */
                                    goto unresolvable_eref;
                                  else
                                    {
                                      add_xml_error (xf, 1014, uni_truncate_to (&s[i], 20));
                                      if (map_what & ABORT_ON_FAILURE)
                                        cleanup (wpbuf);
                                    }
                                }
                              else
                                /* programmer is telling me UNPARSED_ENTITIES are to be ignored */
                                if (! expand_uperef (xf, &s[i + 1]))
                                  {
                                    /* not an unparsed entity; in fact, not ANY declared entity */
                                    s[k] = c;
                                    /* unresolvables_okay overrides err msg and abort_on_failure */
                                  unresolvable_eref:
                                    if (map_what & UNRESOLVABLES_OKAY)
                                      xwrap (errdebug (7, "skipping unresolvable eref, %s\n",
                                                       utf_16_to_utf_8 (uni_truncate_to (&s[i], 20))));
                                    else
                                      {
                                        /* usually unresolvable references are an error */
                                        add_xml_error (xf, 1012, uni_truncate_to (&s[i], 20));
                                        if (map_what & ABORT_ON_FAILURE)
                                          cleanup (wpbuf);
                                      }
                                  }
                              wpbuf[j++] = s[i];
                            }
                          else
                            {
                            map_entity_recursively:
                              s[k] = c;
                              xwrap (errdebug (7, "recursively expand \"%s\"\n", utf_16_to_utf_8 (tmp)));
                              /* work on a private copy of the replacement text */
                              tmp = uni_strdup (tmp);
                              if ((ref = map_entities (xf, tmp, map_what, depth)))
                                {
                                  free (tmp);
                                  /* recursively expand entity replacement text */
                                  wpbuf[j] = 0;
                                  tmp2 = uni_strdup (ref);
                                  if (xmlparse_env.debug_level >= 7)
                                    {
                                      xwrap (errdebug (7, "recursively expanded -> \"%s\"\n",
                                                       utf_16_to_utf_8 (ref)));
                                      xwrap (errdebug (7, "concat expansion with \"%s\"\n",
                                                       utf_16_to_utf_8 (wpbuf)));
                                    }
                                  /* ref is our own malloc'd buffer and has just
                                   * been duplicated; it used to leak here */
                                  free (ref);
                                  /* NOTE(review): presumably uni_concatenate() takes
                                   * ownership of both arguments - confirm in utfutil */
                                  tmp = uni_concatenate (wpbuf, tmp2);
                                  if ((tmp2 = map_entities (xf, &s[k + 1], map_what, 0)) == NULL)
                                    if (map_what & ABORT_ON_FAILURE)
                                      cleanup (tmp);
                                  free (stack.states);
                                  return uni_concatenate (tmp, tmp2);
                                }
                              else
                                {
                                  if (map_what & ABORT_ON_FAILURE)
                                    {
                                      free (tmp);
                                      cleanup (wpbuf);
                                    }
                                  /* probably a recursion problem; just expand
                                   * entity reference once and leave it at that */
                                  len += (tmplen = uni_strlen (tmp));
                                  if (len > wpbuflen)
                                    {
                                      wpbuflen = len + uni_strlen (&s[i]) + 1;
                                      if ((wpbuf = realloc (wpbuf, wpbuflen * sizeof (my_wchar_t))) == NULL)
                                        errabort (41, "realloc() error in %s\n", "map_entities()");
                                    }
                                  memcpy (&wpbuf[j], tmp, tmplen * sizeof (my_wchar_t));
                                  /* free the copy only now; it used to be freed
                                   * before the uni_strlen()/memcpy() above */
                                  free (tmp);
                                  j += tmplen;
                                  /* leave i on the ';' so the loop increment
                                   * resumes just after it (k + 1 skipped a char) */
                                  i = k;
                                }
                            }
                          s[k] = c;
                        }
                    } /* end of default: in switch statement */
                  break;
                }
            } /* end of '&' section */
          break;

        default: /* default section */
          wpbuf[j++] = s[i];
          break;
        }
    }

  wpbuf[j] = 0;
  free (stack.states);
  return wpbuf;
}

/*
 * check_attribute
 *
 * Check whether the attribute name/value pair attname/attval (args
 * 3 and 4) for
element elname (arg 2) is valid, i.e., follows that * attribute's declaration. Returns zero if the attribute is not * valid. * * Call this function only after expanding the entities inside of * nv->val. We're supposed to be checking their replacement text. * Entities not in quotes get expanded by the lexer. We have to * expand ones in attribute values "manually" in the parser. */ int check_attribute (struct xml_file *xf, my_wchar_t *elname, struct name_val *nv, int insert_ids) { my_wchar_t *tmp; size_t lineno, len, i; struct xml_attribute *xa; xwrap (errdebug (7, "Checking attribute, %s\n", utf_16_to_utf_8 (nv->name))); /* see if element elname is defined */ if (expand_element (xf, elname) == NULL) { /* error - element isn't defined; use attribute's own line no. */ if (! nv->lineno) add_xml_error (xf, 1203, uni_truncate_to (elname, 20)); else { lineno = xf->lineno; xf->lineno = nv->lineno; add_xml_error (xf, 1203, uni_truncate_to (elname, 20)); xf->lineno = lineno; } return 0; } /* see if attribute nv->name is defined for elname */ if ((xa = expand_attribute (xf, elname, nv->name)) == NULL) { /* if it's not defined, see if the user made a casing error */ len = uni_strlen (nv->name); tmp = malloc ((len + 1) * sizeof (my_wchar_t)); memcpy (tmp, nv->name, (len + 1) * sizeof (my_wchar_t)); /* try all uppercase */ for (i = 0; tmp[i]; i++) tmp[i] = uni_toupper (tmp[i]); if (expand_attribute (xf, elname, tmp)) add_xml_error (xf, 1116, nv->name); else { /* try all lowercase */ for (i = 0; tmp[i]; i++) tmp[i] = uni_tolower (tmp[i]); if (expand_attribute (xf, elname, tmp)) add_xml_error (xf, 1116, nv->name); } /* element is defined; but nv->name isn't one of its attributes */ len = uni_strlen (elname) + 4 + uni_strlen (nv->name); if ((tmp = realloc (tmp, len * sizeof (my_wchar_t))) == NULL) errabort (41, "realloc error in %s\n", "check_attribute()"); uni_strcpy (tmp, elname); uni_utf_strcat (tmp, " ("); uni_strcat (tmp, nv->name); uni_utf_strcat (tmp, ")"); /* error - 
element isn't defined; use attribute's own line no. */ if (nv->lineno == 0) add_xml_error (xf, 1202, tmp); else { lineno = xf->lineno; xf->lineno = nv->lineno; add_xml_error (xf, 1202, tmp); xf->lineno = lineno; } free (tmp); return 0; } /* values for attributes declared with #FIXED keyword must match default */ if (xa->default_type == fixed) if (uni_strcmp (nv->val, xa->default_val) != 0) { len = uni_strlen (nv->name) + uni_strlen (xa->default_val) + uni_strlen (nv->val) + 17; tmp = malloc (len * sizeof (my_wchar_t)); uni_strcpy (tmp, nv->name); uni_utf_strcat (tmp, "=\""); uni_strcat (tmp, nv->val); uni_utf_strcat (tmp, "\" (default \""); uni_strcat (tmp, xa->default_val); uni_utf_strcat (tmp, "\")"); if (nv->lineno == 0) add_xml_error (xf, 1200, tmp); else { lineno = xf->lineno; xf->lineno = nv->lineno; add_xml_error (xf, 1200, tmp); xf->lineno = lineno; } free (tmp); } /* make sure xml:lang tag has a reasonable value (see langcode.c) */ if (uni_utf_strcmp (nv->name, "xml:lang") == 0) if (! langcode_ok (nv->val)) { if (nv->lineno == 0) add_xml_error (xf, 1220, uni_truncate_to (nv->val, 20)); else { lineno = xf->lineno; xf->lineno = nv->lineno; add_xml_error (xf, 1220, uni_truncate_to (nv->val, 20)); xf->lineno = lineno; } } /* check for excessively long attribute values */ if (! insert_ids || xa->default_type == implied || xa->default_type == required) if (uni_strlen (nv->val) > 32767) add_xml_warning (xf, 1223, uni_truncate_to (nv->val, 100)); /* Check things like whether NMTOKEN attributes have legal characters */ return basic_attribute_integrity_check (xf, xa, nv, insert_ids); } #define error_1221(c,s) { tmp_s = malloc (sizeof (my_wchar_t) * (16 + uni_strlen (c) + uni_strlen (s))); \ uni_strcpy (tmp_s, c); \ uni_utf_strcat (tmp_s, " (in "); \ uni_strcat (tmp_s, s); \ uni_utf_strcat (tmp_s, ")"); \ add_xml_error (xf, 1221, tmp_s); \ free (tmp_s); \ } /* * basic_attribute_integrity_check * * Check whether attribute/value nv for attribute xa has a legal * value. 
E.g., if xa->type is ENTITY, make sure that value is an * unparsed entity. Or if xa->type is ID, then add value to the * "ids" hash table (it'll be checked later on). Or if xa->type * is NMTOKEN, make sure value has no illegal characters in it. * * If insert_ids (arg 4) is nonnull, inserts any IDs it finds into * the xf->ids hashtable; and so also with IDREFs and the xf->idrefs * hashtable. Normally insert_ids will be set unless we're still in * the DTD, and we're just checking attribute defaults. * * Returns 1 if value is OK; otherwise returns 0. */ static int basic_attribute_integrity_check (xml_file *xf, xml_attribute *xa, name_val *nv, int insert_ids) { size_t i; int errors, old_lineno; my_wchar_t *tmp, *tmp_s; #define RETURN(i) { xf->lineno = old_lineno; return i; } old_lineno = xf->lineno; if (nv->lineno > 0) xf->lineno = nv->lineno; /* Do some basic integrity checks. */ switch (xa->type) { case cdata: /* Dunno what we should check here; any reasonable char is okay. * NB: the XML 1.0 spec doesn't say that CDATA atts should be * treated, with regard to entity expansion, any differently * than other attribute types. Bottom line: Don't get CDATA * sections (which don't do any entity expansion) and CDATA * attributes mixed up. */ break; case id: /* This isn't a very good check; really we should do a full scan * to make sure all ID values match the Name production in the * XML 1.0 spec. See also nmtoken(s) below. */ if (uni_isspace (nv->val[0]) || uni_isdigit (nv->val[0]) || (nv->val[0] < 0x80 && ! (isalpha (nv->val[0]) || uni_utf_any (&nv->val[0], "_:&")))) error_1221 (uni_truncate_to (nv->val, 2), nv->val); if (nv->val[0]) for (i = 1; nv->val[i] != 0; i++) if (uni_isspace (nv->val[i]) || (nv->val[i] < 0x80 && ! 
(isalnum (nv->val[i]) || uni_utf_any (&nv->val[i], ".-_:&;")))) error_1221 (uni_truncate_to (&nv->val[i], 2), nv->val); /* add this id to the ids hash table */ if (insert_ids) add_id (xf, nv); break; case idref: /* add this idref attribute's value to the idrefs hash table */ if (uni_isspace (nv->val[0]) || uni_isdigit (nv->val[0]) || (nv->val[0] < 0x80 && ! (isalpha (nv->val[0]) || uni_utf_any (&nv->val[0], "_:&")))) error_1221 (uni_truncate_to (nv->val, 2), nv->val); if (nv->val[0]) for (i = 1; nv->val[i] != 0; i++) if (uni_isspace (nv->val[i]) || (nv->val[i] < 0x80 && ! (isalnum (nv->val[i]) || uni_utf_any (&nv->val[i], ".-_:&;")))) error_1221 (uni_truncate_to (&nv->val[i], 2), nv->val); if (insert_ids) add_idref (xf, nv->val, NULL); break; case idrefs: /* add each idref in nv->val to the idrefs hash table */ tmp = uni_tokenize (nv->val); while (tmp != NULL) { if (uni_isdigit (nv->val[0]) || (tmp[0] < 0x80 && ! (isalpha (tmp[0]) || uni_utf_any (&tmp[0], "_:&")))) error_1221 (uni_truncate_to (tmp, 2), tmp); if (tmp[0]) for (i = 1; tmp[i] != 0; i++) if (tmp[i] < 0x80 && ! (isalnum (tmp[i]) || uni_utf_any (&tmp[i], ".-_:&;"))) error_1221 (uni_truncate_to (&tmp[i], 2), tmp); if (insert_ids) add_idref (xf, tmp, NULL); /* unlike strtok(), returns ptr to static buf */ tmp = uni_tokenize (NULL); } break; case entity: if (! expand_uperef (xf, nv->val)) { /* unparsed entity nv->val is not declared */ add_unique_error (xf, 1012, uni_truncate_to (nv->val, 20)); RETURN (0); } break; case entities: /* check each entity to see if it's declared */ errors = 0; tmp = uni_tokenize (nv->val); while (tmp != NULL) { if (! expand_uperef (xf, tmp)) { /* unparsed entity is not declared */ add_unique_error (xf, 1012, uni_truncate_to (tmp, 20)); errors++; } /* unlike strtok(), returns ptr to static buf */ tmp = uni_tokenize (NULL); } if (errors) RETURN (0); break; case nmtoken: /* This isn't a very good check, but it's enough for ASCII * documents. 
On the TO-DO list is to make this really do a * check for matches against the Nmtoken pattern. See also * "id" above. */ errors = 0; for (i = 0; nv->val[i] != 0; i++) if (uni_isspace (nv->val[i]) || (nv->val[i] < 0x80 && ! (isalnum (nv->val[i]) || uni_utf_any (&nv->val[i], ".-_:&;")))) { error_1221 (uni_truncate_to (&nv->val[i], 2), nv->val); errors++; } if (errors) RETURN (0); break; case nmtokens: /* This isn't a very good check. See above on nmtoken. */ errors = 0; for (i = 0; nv->val[i] != 0; i++) if (nv->val[i] < 0x80 && ! isspace (nv->val[i]) && ! ((isalnum (nv->val[i]) || uni_utf_any (&nv->val[i], ".-_:&;")))) { error_1221 (uni_truncate_to (&nv->val[i], 2), nv->val); errors++; } if (errors) RETURN (0); break; case notation: /* nv->val must resolve as a declared notation */ if (! expand_notname (xf, nv->val)) { add_xml_error (xf, 1013, uni_truncate_to (nv->val, 20)); RETURN (0); } /* fall through */ case enumeration: /* make sure nv->val matches one of the declared values * enumerated in the original AttDef */ for (i = 0; i < xa->nmtoklen; i++) if (xa->nmtokens[i]) if (uni_strcmp (xa->nmtokens[i], nv->val) == 0) break; if (i == xa->nmtoklen) { /* an error; see if there's a case-sensitivity problem */ for (i = 0; i < xa->nmtoklen; i++) if (xa->nmtokens[i]) if (uni_strcasecmp (xa->nmtokens[i], nv->val) == 0) break; if (i < xa->nmtoklen) add_xml_error (xf, 1116, uni_truncate_to (nv->name, 20)); add_xml_error (xf, 1205, uni_truncate_to (nv->name, 20)); RETURN (0); } break; }; RETURN (1); } /* * check_dup_enum_vals * * XML 1.0 spec, par. 3.3.1, says that XML enumerated attribute * value lists should not use the same value twice for any two * attributes of a single element. This is a stupid SGML compat * restriction. But I guess it can't hurt to issue a warning. * * Returns 1 if all is well. Returns 0 if any warning messages were * generated. I.e., a zero return value indicates a problem. 
*/
int
check_dup_enum_vals (struct xml_file *xf, struct xml_element *xe)
{
  int errors = 0, duplicate;
  size_t i, j, k, l, limit;

  /* go through every attribute defined for this element */
  for (i = 0; i < xe->attlistlen; i++)
    {
      /* only enumerated attributes are of interest */
      if (xe->attlist[i]->type != enumeration)
        continue;

      /*
       * For each value of attribute i, see whether the same value was
       * already used, either by an earlier enumerated attribute
       * (k < i, all of its values) or earlier in this attribute's own
       * list (k == i, values 0 .. j-1).
       */
      for (j = 0; j < xe->attlist[i]->nmtoklen; j++)
        {
          /* empty slots cannot be compared; skip them */
          if (xe->attlist[i]->nmtokens[j] == NULL)
            continue;

          /* A flag is used here instead of a goto to a label sitting
           * right before the closing brace: a label must be followed
           * by a statement, so "next: }" is not portable C. */
          duplicate = 0;
          for (k = 0; k <= i && ! duplicate; k++)
            {
              if (xe->attlist[k]->type != enumeration)
                continue;

              limit = (k < i) ? xe->attlist[k]->nmtoklen : j;
              for (l = 0; l < limit; l++)
                {
                  if (xe->attlist[k]->nmtokens[l] != NULL
                      && uni_strcmp (xe->attlist[k]->nmtokens[l],
                                     xe->attlist[i]->nmtokens[j]) == 0)
                    {
                      duplicate = 1;
                      break;
                    }
                }
            }

          if (duplicate)
            {
              /* just issue a warning; no error.  At most one warning
               * is issued per value, however often it was repeated. */
              add_xml_warning (xf, 1210, xe->attlist[i]->nmtokens[j]);
              errors++;
            }
        }
    }

  return ! errors;
}


/*
 * check_content_models
 *
 * For every element declared in both the internal and external DTD
 * subset, checks for undeclared elements in content models.  Returns
 * the number of elements with content models of type "mixed" or
 * "children" (and that therefore had to be checked).
 *
 * For "children" models the work is delegated to
 * check_leaves_in_cmnode().  For "mixed" models each name other than
 * #PCDATA that is undeclared (or only present as a dummy element)
 * draws warning 672 with the text "name (containing-element)".
 */
int
check_content_models (xml_file *xf)
{
  size_t len;
  int count = 0;
  my_wchar_t **wpp, *tmp;
  struct xml_element *xe, *xe2;
  struct rg_htable_item *result;

  /* nothing declared, nothing to check */
  if (! xf->element_names)
    return 0;

  /* for each element...  (passing NULL asks for the next item) */
  for (result = rg_get_htable_items (xf->element_names);
       result != NULL;
       result = rg_get_htable_items (NULL))
    {
      /* ...check to see if every string in its content model (barring
       * #PCDATA, if mixed) corresponds to another declared element */
      xe = (struct xml_element *)result->data;
      if (! xe->content_model)
        continue;

      if (xe->type == children)
        {
          /* make sure all the leaves in a cmnode (i.e., in a
           * given content-model tree) name valid, declared
           * elements; issue error 672 on undeclared elements */
          check_leaves_in_cmnode (xf, result->uni_key, xe->content_model);
          count++;
        }
      else if (xe->type == mixed)
        {
          /* here the content model is walked as a NULL-terminated
           * array of element names */
          for (wpp = xe->content_model; *wpp != NULL; wpp++)
            {
              if (uni_utf_strcmp (*wpp, "#PCDATA") == 0)
                continue;
              xe2 = expand_element (xf, *wpp);
              if (xe2 != NULL && xe2->type != dummy)
                continue;

              /* Oops, content model names an undeclared element.
               * Two names + " (" + ")" + terminator -> +4. */
              len = uni_strlen (result->uni_key) + 4 + uni_strlen (*wpp);
              if ((tmp = malloc (len * sizeof (my_wchar_t))) == NULL)
                errabort (41, "malloc error in %s\n", "check_content_models()");
              uni_strcpy (tmp, *wpp);
              uni_utf_strcat (tmp, " (");
              uni_strcat (tmp, result->uni_key);
              uni_utf_strcat (tmp, ")");
              add_xml_warning (xf, 672, tmp);
              free (tmp);
            }
          count++;
        }
    }

  return count;
}


/*
 * state_to_string
 *
 * Turn a where_am_i enum into a my_wchar_t string.  Returns a pointer
 * to a static my_wchar_t buffer that may change on subsequent calls.
*/
static my_wchar_t *
state_to_string (enum where_am_i st)
{
  char *tmp;
  /* static, so the pointer handed back stays usable after return;
   * the longest name below (in_marked_section) has 17 characters */
  static my_wchar_t wbuf[32];

  /* pick the printable name that matches the enum constant */
  switch (st)
    {
    case nowhere: tmp = "nowhere"; break;
    case in_markup: tmp = "in_markup"; break;
    case in_single_quote: tmp = "in_single_quote"; break;
    case in_double_quote: tmp = "in_double_quote"; break;
    case in_pi: tmp = "in_pi"; break;
    case in_comment: tmp = "in_comment"; break;
    case in_marked_section: tmp = "in_marked_section"; break;
    case in_entity: tmp = "in_entity"; break;
    case in_cdsect: tmp = "in_cdsect"; break;
    /* any value not listed above */
    default: tmp = "???"; break;
    }

  /* convert the name and copy the result into the static buffer */
  uni_strcpy (wbuf, utf_8_to_utf_16 (tmp));
  return wbuf;
}


#ifdef STANDALONE_GRAMMUTIL_TEST

#include "readcfg.h"
#include "dtdutil.h"
#include "strutil.h"

xmlparse_environment xmlparse_env;

/*
 * main
 *
 * Stand-alone test driver, compiled only when the macro
 * STANDALONE_GRAMMUTIL_TEST is defined.  It registers a notation, an
 * unparsed entity, an element and an attribute by hand, then reads
 * names line by line from Test/grammutil.input and runs them through
 * the add_* and expand_* routines, and finally calls map_entities()
 * on a few fixed strings.  Results go to standard output; the
 * program always exits with status 0.
 *
 * NOTE(review): several string literals and one comment below look
 * as if markup and entity references were decoded or stripped at
 * some point.  They are flagged individually; restore them from a
 * pristine copy of the file.
 */
int
main (int argc, char **argv)
{
  char linebuf[2048];
  struct name_val *nv;
  struct xml_file *xf, *xf2;
  struct xml_attribute *xa;
  my_wchar_t wlinebuf[2048 * 4];
  my_wchar_t *wp, *w_tmp, *w_tmp2, *w_tmp3;

  /* zero out the xmlparse_env structure */
  memset (&xmlparse_env, 0, sizeof (xmlparse_env));
  /* and set it to retain all of the parse tree */
  xmlparse_env.keep_children = yes;
  readcfg (argc, argv);

  /* NOTE(review): the result is used below (xf->file) without a
   * NULL check; confirm that create_xml_file() cannot fail here */
  xf = create_xml_file ("Test/grammutil.input");

  /* create a notation called "GIF" */
  w_tmp = uni_strdup (utf_8_to_utf_16 ("GIF"));
  w_tmp2 = uni_strdup (utf_8_to_utf_16 ("http://www.mainsite.com/support"));
  if (! add_notname (xf, w_tmp, w_tmp2))
    printf ("Error adding notation GIF to xf\n");
  free (w_tmp2);

  /* create an unparsed entity, fig.bump, that uses the GIF notation */
  wp = uni_strdup (utf_8_to_utf_16 ("fig.bump"));
  w_tmp2 = uni_strdup (utf_8_to_utf_16 ("/Images/bump.gif"));
  if (! add_uperef (xf, wp, w_tmp, w_tmp2))
    printf ("Error adding unparsed entity, fig.bump, to xf\n");
  free (w_tmp);
  free (w_tmp2);
  free (wp);

  /* create an element called "IMG", and give it an attribute, "SRC" */
  w_tmp = uni_strdup (utf_8_to_utf_16 ("IMG"));
  if (! add_element (xf, w_tmp, empty, NULL))
    printf ("Error adding element IMG to xf\n");

  /* create an attribute called "SRC" */
  w_tmp2 = uni_strdup (utf_8_to_utf_16 ("SRC"));
  if ((xa = create_xml_attribute (xf, w_tmp2, entity, 0, NULL, required, NULL)) == NULL)
    printf ("Error creating attribute SRC\n");

  /* add SRC attribute to IMG element definition */
  if (! add_attribute (xf, w_tmp, xa))
    printf ("Error adding attribute SRC to element IMG in xml_file xf.\n");
  if (expand_attribute (xf, w_tmp, w_tmp2) != xa)
    printf ("SRC was not correctly registered as an attribute of IMG.\n");

  /* pretend we're parsing a file, and we just ran into an IMG tag
   * whose SRC attribute has the value fig.bump.
   * NOTE(review): the example markup that used to close this comment,
   * and the start of the message printed below, appear to be lost. */
  w_tmp3 = uni_strdup (utf_8_to_utf_16 ("fig.bump"));
  if ((nv = create_name_val (w_tmp2, w_tmp3, NULL, NULL, 0, no, NULL)))
    {
      if (! check_attribute (xf, w_tmp, nv, INSERT_IDS))
        printf (" doesn't seem to validate (it should).\n");
      free_name_val (nv);
    }
  free (w_tmp3);
  free (w_tmp2);
  free (w_tmp);

  /* first pass over the input file: every line becomes the name of a
   * parameter entity, a general entity and an external entity */
  while (fgets (linebuf, 2048, xf->file))
    {
      trim (linebuf, "\n");
      wp = uni_strdup (utf_8_to_utf_16 (linebuf));

      /* the replacement text is the name with -expansion appended */
      uni_strcpy (wlinebuf, wp);
      uni_strcat (wlinebuf, utf_8_to_utf_16 ("-expansion"));

      /* parameter entity: add, then expand in both whitespace modes */
      if (! add_peref (xf, wp, wlinebuf))
        printf ("insert failed; key already present\n");
      else
        {
          if ((w_tmp = expand_peref (xf, wp, WITHOUT_WHITESPACE)))
            printf ("PEref \"%s\" expands to -> \"%s\"\n", linebuf, utf_16_to_utf_8 (w_tmp));
          else
            printf ("Problem: PEref has no expansion.\n");
          if ((w_tmp = expand_peref (xf, wp, WITH_WHITESPACE)))
            printf ("PEref \"%s\" expands (with whitespace) to -> \"%s\"\n", linebuf, utf_16_to_utf_8 (w_tmp));
          else
            printf ("Problem: PEref has no expansion.\n");
        }

      /* general entity under the same name */
      if (! add_eref (xf, wp, wlinebuf))
        printf ("eref insert failed; key already present\n");
      else
        {
          if ((w_tmp = expand_eref (xf, wp)))
            printf ("Eref \"%s\" expands to -> \"%s\"\n", linebuf, utf_16_to_utf_8 (w_tmp));
          else
            printf ("Problem: Eref has no expansion.\n");
        }

      /* external entity under the same name, backed by junkdata.data;
       * w_tmp is passed on without being duplicated or freed here */
      w_tmp = utf_8_to_utf_16 ("junkdata.data");
      if ((xf2 = resolve_pub_or_sysid (xf, NULL, w_tmp)) == NULL)
        printf ("Error: Can't create xf2.\n");
      else
        {
          if (add_ext_eref (xf, wp, xf2, MAP_CHAR_ENTITIES | MAP_PARAMETER_ENTITIES) <= 0)
            printf ("external eref insert failed; already present\n");
          else
            {
              if ((w_tmp = expand_ext_eref (xf, wp)))
                printf ("External eref \"%s\" expands to -> \"%s\"\n", linebuf, utf_16_to_utf_8 (w_tmp));
              else
                printf ("Problem: External eref has no expansion.\n");
            }
        }
      free (wp);
    }

  /* a name that was never added must not resolve */
  wp = uni_strdup (utf_8_to_utf_16 ("shouldn't exist"));
  printf ("Now, let's try one that shouldn't resolve: \"shouldn't exist\"\n");
  if (! (w_tmp = expand_eref (xf, wp)))
    printf ("Good; it doesn't resolve.\n");
  else
    /* NOTE(review): this prints linebuf, which still holds the last
     * line read from the file, not the name that was looked up */
    printf ("Oh no, it resolves: \"%s\" -> \"%s\"\n", linebuf, utf_16_to_utf_8 (w_tmp));
  free (wp);

  /* map_entities() on a string with one general and one parameter
   * entity reference, under three different flag combinations; each
   * result is freed by the caller */
  w_tmp = uni_strdup (utf_8_to_utf_16 ("<&stuff;, stuff, %stuff;>!"));
  printf ("Mapping entities in: <&stuff;, stuff, %%stuff;>!\n");
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES | MAP_CHAR_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_PARAMETER_ENTITIES | MAP_CHAR_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  free (w_tmp);
  free_xml_file (xf);

  /* second pass with a fresh xml_file: this time the names from the
   * input file are registered as external entities only */
  xf = create_xml_file ("Test/grammutil.input");
  while (fgets (linebuf, 2048, xf->file))
    {
      trim (linebuf, "\n");
      wp = uni_strdup (utf_8_to_utf_16 (linebuf));
      w_tmp = utf_8_to_utf_16 ("junkdata.data");
      if ((xf2 = resolve_pub_or_sysid (xf, NULL, w_tmp)) == NULL)
        printf ("Error: Can't create xf2.\n");
      else
        {
          if (add_ext_eref (xf, wp, xf2, MAP_CHAR_ENTITIES | MAP_PARAMETER_ENTITIES) <= 0)
            printf ("insert failed; key already present\n");
          else
            {
              if ((w_tmp = expand_ext_eref (xf, wp)))
                printf ("External eref \"%s\" expands to -> \"%s\"\n", linebuf, utf_16_to_utf_8 (w_tmp));
              else
                printf ("Problem: External eref has no expansion.\n");
            }
        }
      free (wp);
    }

  /* same mapping test as before, now against external entities */
  w_tmp = uni_strdup (utf_8_to_utf_16 ("<&stuff;, stuff, %stuff;>!"));
  printf ("Mapping entities in: <&stuff;, stuff, %%stuff;>!\n");
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES | MAP_CHAR_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_PARAMETER_ENTITIES | MAP_CHAR_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  free (w_tmp);

  /* malformed references.
   * NOTE(review): the test string contains a U+FFFD replacement
   * character, which presumably stands in for a character reference
   * that was decoded by accident; the original text is unknown. */
  w_tmp = uni_strdup (utf_8_to_utf_16 ("junk;&#junk;�&;, bad stuff, %error"));
  printf ("Mapping entities in: junk;&#junk;�&;, bad stuff, %%error\n");
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES | MAP_CHAR_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_PARAMETER_ENTITIES | MAP_CHAR_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  free (w_tmp);

  /* NOTE(review): the two string literals in the next line contain
   * unescaped double quotes and will not compile as written.  They
   * look like a test of the predefined entities whose references
   * were decoded into plain characters; restore the original text
   * from a pristine copy before building this driver. */
  w_tmp = uni_strdup (utf_8_to_utf_16 (""<>&'&" &<,&&")); printf ("Mapping entities in: "<>&'&" &<,&&\n");
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_GENERAL_ENTITIES | MAP_CHAR_ENTITIES, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  w_tmp2 = map_entities (xf, w_tmp, MAP_PARAMETER_ENTITIES | MAP_CHAR_ENTITIES | MAP_AMP_AND_LT, 0);
  printf ("\t -> %s\n", utf_16_to_utf_8 (w_tmp2));
  free (w_tmp2);
  free (w_tmp);

  /* ID bookkeeping: look up a missing ID, add it, look it up again,
   * then try to add it a second time */
  w_tmp = uni_strdup (utf_8_to_utf_16 ("ID"));
  w_tmp2 = uni_strdup (utf_8_to_utf_16 ("1"));
  if ((nv = create_name_val (w_tmp, w_tmp2, NULL, NULL, 0, no, NULL)))
    {
      printf ("Expanding nonexistent ID (should fail).\n");
      /* NOTE(review): this looks up the attribute name (w_tmp), while
       * the lookup after add_id() uses the value (w_tmp2) - confirm
       * which one is intended */
      if (! expand_id (xf, w_tmp))
        printf ("Failed.\n");
      printf ("Adding ID 1 via add_id().\n");
      if (! add_id (xf, nv))
        printf ("Failed.\n");
      if (expand_id (xf, w_tmp2) != nv)
        printf ("Oops; add/expand ID routines aren't working.\n");
      printf ("Adding ID 1 again (should fail).\n");
      if (! add_id (xf, nv))
        printf ("Failed.\n");
      free_name_val (nv);
    }
  free (w_tmp2);
  free (w_tmp);
  free_xml_file (xf);

  exit (0);
}

#endif /* STANDALONE_GRAMMUTIL_TEST */