/* ***************************************************************************** * * $RCSfile: nfadfa.c,v $ * $Date: 1999/05/18 18:36:20 $ * $Source: /home/richard/Xml/RCS/nfadfa.c,v $ * $Revision: 1.12 $ * $Author: richard $ * ***************************************************************************** * * Copyright 1998, 1999 Brown University and Richard Goerwitz * ***************************************************************************** * * Utilities for converting possibly nondeterministic content models * housed in cmnode structures into DFAs. There is only one entry * point: * * struct dfa_state **make_dfa (xml_file *xf, cmnode *cmn) * * Where xf is just the file we're currently parsing (this_file in * the parser), and where cmnode is the content_model field for an * xml_element structure, e.g., (xml_element)xe->content_model. * * The return value is a pointer to the base of an array of dfa_state * pointers. These pointers point to DFA states. To check whether a * given set of child xml_nodes in a parse tree actually match a * given DFA, use check_node_against_dfa(). Note that this checking * may be done incrementally. See parstree.c for the specifics on * how this is done. (The long and short of it is that you can call * check_node_against_dfa() once for each child seen, then reset it * at the end). * ***************************************************************************** */ #include "nfadfa.h" #include "errabort.h" #include "utfutil.h" static size_t how_many_nfa_states = 0; static struct nfa_state **nfa_states = NULL; static size_t how_many_dfa_states = 0; static struct dfa_state **dfa_states = NULL; static nfa_state *cmnode_to_nfa_state (cmnode *, nfa_state *, nfa_state *); static dfa_state *nfa_to_dfa (xml_file *, nfa_state *); static int add_state_to_dfa_states (dfa_state *); static void add_nfa_e_closure (dfa_state *, nfa_state *); static void free_nfa_states (void); static void free_nfa_state (nfa_state *); static void free_nfa_transition (nfa_trans *); static nfa_state *create_nfa_state (int); static nfa_state *add_nfa_transition (nfa_state *, nfa_state *, my_wchar_t *); static void free_dfa_state (dfa_state *); static void free_dfa_transition (dfa_trans *); static dfa_state *create_dfa_state (void); static dfa_state *add_dfa_transition (dfa_state *, nfa_trans *); static int add_nfa_state_to_dfa_state_set (dfa_state *, nfa_state *); #ifdef STANDALONE_NFADFA_TEST static int print_nfa (void); static int print_dfa (void); #endif struct dfa_state ** make_dfa (struct xml_file *xf, struct cmnode *cmn) { struct nfa_state *nfa; struct dfa_state **dfa_states_pointer; /* convert the cmnode tree (see parstree.c) into an nfa */ nfa = cmnode_to_nfa_state (cmn, NULL, NULL); #ifdef STANDALONE_NFADFA_TEST /* let's see what the NFA looks like */ print_nfa (); #endif /* convert the nfa into a dfa; nfa_to_dfa() stores its results in * the file-level static array, (struct dfa_state *)dfa_states */ nfa_to_dfa (xf, nfa); /* add a trailing NULL pointer to the dfa_states array */ add_state_to_dfa_states (NULL); /* frees nfa_states (a struct pointer, file-level static; see above) */ free_nfa_states (); #ifdef STANDALONE_NFADFA_TEST /* and now let's see what the DFA looks like */ print_dfa (); #endif dfa_states_pointer = dfa_states; /* get ready for the next content model */ how_many_dfa_states = 0; dfa_states = NULL; /* what did we just do? */ xwrap (errdebug (5, "Made DFA from cmnode struct provided by parser.\n")); return dfa_states_pointer; } static dfa_state * nfa_to_dfa (struct xml_file *xf, struct nfa_state *ns) { size_t i, j, k; int still_going; my_wchar_t *ambiguous = NULL; struct dfa_state *start, *ds; /* seed the dfa with the e-closure of the ns initial state */ start = create_dfa_state (); add_nfa_e_closure (start, ns); /* add state ds to dfa_states list */ if (add_nfa_state_to_dfa_state_set (start, ns)) add_state_to_dfa_states (start); still_going = 1; while (still_going) { still_going = 0; for (i = 0; i < how_many_dfa_states; i++) { ds = dfa_states[i]; if (ds->marked == 0) { ds->marked = still_going = 1; for (j = 0; j < ds->scount; j++) { /* for every transition from every nfa in ds->nfa_states... */ for (k = 0; k < ds->nfa_states[j]->tcount; k++) /* * ...add a corresponding transition to the dfa, if it's * not already there (if it's already there, add_dfa_transi- * tion just adds the new state to that transition's end * state set) */ if (ds->nfa_states[j]->transitions[k]->label) if (! add_dfa_transition (ds, ds->nfa_states[j]->transitions[k])) /* build up comma-delimited list of ambiguous tokens */ ambiguous = append_string_to_comma_delimited_string ( ambiguous, ds->nfa_states[j]->transitions[k]->label); } for (j = 0; j < ds->tcount; j++) { /* for every transition out of ds, add to that transition's * end state set the epsilon closure of the nfa states that * make it up */ for (k = 0; k < ds->transitions[j]->new_state->scount; k++) add_nfa_e_closure (ds->transitions[j]->new_state, ds->transitions[j]->new_state->nfa_states[k]); } for (j = 0; j < ds->tcount; j++) { /* for every state in the dfa_states list, add it to the * dfa_states list, if it's not already there; if it's * already there, use the existing state in place of the * new duplicate */ for (k = 0; k < how_many_dfa_states; k++) { if (ds->transitions[j]->new_state->scount == dfa_states[k]->scount && ! memcmp (ds->transitions[j]->new_state->nfa_states, dfa_states[k]->nfa_states, dfa_states[k]->scount * sizeof (struct nfa_state *))) { /* it's a duplicate state; free() and replace it */ free_dfa_state (ds->transitions[j]->new_state); ds->transitions[j]->new_state = dfa_states[k]; break; } } if (k == how_many_dfa_states) /* it's not a duplicate state; add it */ add_state_to_dfa_states (ds->transitions[j]->new_state); } } } } if (ambiguous) add_xml_error (xf, 680, uni_truncate_to (ambiguous, 30)); /* turned (possibly nondeterministic) NFA into a DFA */ xwrap (errdebug (5, "Made DFA from cmnode struct provided by parser.\n")); return start; } static void add_nfa_e_closure (struct dfa_state *ds, struct nfa_state *ns) { size_t i; for (i = ns->tcount; i > 0; i--) { /* ...then add every state reachable from ns by an epsilon * transition to ds, */ if (ns->transitions[i - 1]->label != NULL) /* * NULL-labeled transitions (epsilon transitions) are all * at the end; since we are going backwards, we can just * break at the first nonnull label */ break; else { /* Remember: NULL-labeled transitions are epsilon moves */ if (add_nfa_state_to_dfa_state_set (ds, ns->transitions[i - 1]->new_state)) /* and so on, recursively */ add_nfa_e_closure (ds, ns->transitions[i - 1]->new_state); } } } static struct nfa_state * cmnode_to_nfa_state (struct cmnode *cmn, struct nfa_state *init, struct nfa_state *fin) { struct nfa_state *mid; if (init == NULL) init = create_nfa_state (INIT); if (fin == NULL) fin = create_nfa_state (FINAL); switch (cmn->type) { case qmark: add_nfa_transition (init, fin, NULL); cmnode_to_nfa_state (cmn->left, init, fin); break; case star: add_nfa_transition (init, fin, NULL); cmnode_to_nfa_state (cmn->left, init, init); break; case plus: mid = create_nfa_state (NORMAL); add_nfa_transition (mid, fin, NULL); cmnode_to_nfa_state (cmn->left, init, mid); cmnode_to_nfa_state (cmn->left, mid, mid); break; case slash: cmnode_to_nfa_state (cmn->left, init, fin); cmnode_to_nfa_state (cmn->right, init, fin); break; case comma: mid = create_nfa_state (NORMAL); cmnode_to_nfa_state (cmn->left, init, mid); cmnode_to_nfa_state (cmn->right, mid, fin); break; case leaf: /* cmn->text was allocated w/ uni_add_string() */ add_nfa_transition (init, fin, cmn->text); break; } return init; } static struct nfa_state * add_nfa_transition (struct nfa_state *start, struct nfa_state *end, my_wchar_t *label) { size_t i; struct nfa_trans *nt; if (start == NULL || end == NULL) errabort (43, "unexpectedly null value in %s\n", "add_nfa_transition()"); if ((nt = malloc (sizeof (struct nfa_trans))) == NULL) errabort (40, "malloc error in %s\n", "add_nfa_transition()"); nt->label = label; nt->new_state = end; if (++start->tcount == 1) { start->transitions = malloc (start->tcount * sizeof (struct nfa_trans *)); if (start->transitions == NULL) errabort (40, "malloc error in %s\n", "add_nfa_transition()"); } else { start->transitions = realloc (start->transitions, start->tcount * sizeof (struct nfa_trans *)); if (start->transitions == NULL) errabort (41, "realloc error in %s\n", "add_nfa_transition()"); } for (i = 0; i < (start->tcount - 1); i++) { if (! start->transitions[i]->label) break; else if (label) if (uni_strcmp (label, start->transitions[i]->label) >= 0) break; } /* make room, if need be, for nt at offset i */ if (i < (start->tcount - 1)) { #ifdef STANDALONE_NFADFA_TEST fprintf (stderr, "i = %d; start->tcount = %d; moving offset %d [len %d bytes] to offset %d\n", i, start->tcount, i, (start->tcount - i - 1) * sizeof (struct nfa_trans *), i + 1); #endif memmove (&start->transitions[i + 1], &start->transitions[i], (start->tcount - i - 1) * sizeof (struct nfa_trans *)); } /* now, finally, insert nt */ start->transitions[i] = nt; return start; } static struct nfa_state * create_nfa_state (int type) { struct nfa_state *new_ns; if ((new_ns = malloc (sizeof (struct nfa_state))) == NULL) errabort (40, "malloc error in %s\n", "create_nfa_state()"); if (++how_many_nfa_states == 1) nfa_states = malloc (sizeof (struct nfa_state *)); else nfa_states = realloc (nfa_states, how_many_nfa_states * sizeof (struct nfa_state *)); nfa_states[how_many_nfa_states - 1] = new_ns; memset (new_ns, 0, sizeof (struct nfa_state)); new_ns->type = type; new_ns->tcount = 0; new_ns->transitions = NULL; return new_ns; } static void free_nfa_states (void) { size_t i; if (nfa_states) { for (i = 0; i < how_many_nfa_states; i++) free_nfa_state (nfa_states[i]); if (nfa_states) free (nfa_states); how_many_nfa_states = 0; nfa_states = NULL; } } static void free_nfa_state (struct nfa_state *ns) { size_t i; if (ns) { for (i = 0; i < ns->tcount; i++) free_nfa_transition (ns->transitions[i]); if (ns->transitions) free (ns->transitions); free (ns); } } static void free_nfa_transition (struct nfa_trans *nt) { if (nt) /* nt->label will be freed later by uni_free_strings() */ free (nt); } static struct dfa_state * add_dfa_transition (struct dfa_state *ds, struct nfa_trans *nt) { size_t i; struct dfa_trans *dt; if (ds == NULL || nt == NULL || nt->label == NULL) errabort (43, "unexpectedly null value in %s\n", "add_dfa_transition()"); for (i = 0; i < ds->tcount; i++) { switch (uni_strcmp (nt->label, ds->transitions[i]->label)) { case -1: /* keep looking */ continue; case 0: /* transition already present */ add_nfa_state_to_dfa_state_set (ds->transitions[i]->new_state, nt->new_state); if (nt->label != ds->transitions[i]->label) /* * if nt->label, ds->transitions[i]->label point to the * same string, we have a model like (a?)+, which is like * (a?)(a*), which is ambiguous in the NFA, but not in the * original content model, where the two a's are the same * string; if nt->label, ds->transitions[i]->label are * different, however, then we have ambiguity even in the * content model */ return NULL; /* signal an ambiguous content model */ case 1: /* not present; create it */ break; } } if ((dt = malloc (sizeof (struct dfa_trans))) == NULL) errabort (40, "malloc error in %s\n", "add_dfa_transition()"); /* don't uni_strdup(); uni_free_strings() will clean up later */ dt->label = nt->label; dt->new_state = create_dfa_state (); add_nfa_state_to_dfa_state_set (dt->new_state, nt->new_state); ds->tcount++; ds->transitions = realloc (ds->transitions, ds->tcount * sizeof (struct dfa_trans *)); if (ds->transitions == NULL) errabort (41, "realloc error in %s\n", "add_dfa_transition()"); /* make room, if need be, for dt at offset i */ if (i < (ds->tcount - 1)) { #ifdef STANDALONE_NFADFA_TEST fprintf (stderr, "i = %d; ds->tcount = %d; moving offset %d [len %d bytes] to offset %d\n", i, ds->tcount, i, (ds->tcount - i - 1) * sizeof (struct nfa_trans *), i + 1); #endif memmove (&ds->transitions[i + 1], &ds->transitions[i], (ds->tcount - i - 1) * sizeof (struct dfa_trans *)); } /* now, finally, insert dt */ ds->transitions[i] = dt; return ds; } static int add_nfa_state_to_dfa_state_set (struct dfa_state *ds, struct nfa_state *ns) { size_t i; if (ds == NULL || ns == NULL) errabort (43, "unexpectedly null value in %s\n", "add_nfa_state_to_dfa_state_set()"); for (i = 0; i < ds->scount; i++) { if (ns == ds->nfa_states[i]) /* nfa state already added to dfa state */ return 0; else if (ns > ds->nfa_states[i]) /* add ns at position i to ds->nfa_states */ break; } if (ns->type == INIT) /* if ds contains the initial state of the nfa, mark it initial */ ds->type = (ds->type & FINAL) | INIT; else if (ns->type == FINAL) /* if ds contains the final state of the nfa, mark it final */ ds->type = (ds->type & INIT) | FINAL; if (++ds->scount == 1) { ds->nfa_states = malloc (ds->scount * sizeof (struct nfa_state *)); if (ds->nfa_states == NULL) errabort (40, "malloc error in %s\n", "add_nfa_state_to_dfa_state_set()"); } else { ds->nfa_states = realloc (ds->nfa_states, ds->scount * sizeof (struct nfa_state *)); if (ds->nfa_states == NULL) errabort (41, "realloc error in %s\n", "add_nfa_state_to_dfa_state_set()"); } if (i < (ds->scount - 1)) { #ifdef STANDALONE_NFADFA_TEST fprintf (stderr, "i = %d; ds->scount = %d; moving offset %d [len %d bytes] to offset %d\n", i, ds->scount, i, (ds->scount - i - 1) * sizeof (struct nfa_trans *), i + 1); #endif memmove (&ds->nfa_states[i + 1], &ds->nfa_states[i], (ds->scount - i - 1) * sizeof (struct nfa_state *)); } /* now, finally, insert ns */ ds->nfa_states[i] = ns; return 1; } static int add_state_to_dfa_states (struct dfa_state *new_ds) { /* dfa_states is file-level static; so too is how_many_dfa_states */ if (++how_many_dfa_states == 1) dfa_states = malloc (sizeof (struct dfa_state *)); else dfa_states = realloc (dfa_states, how_many_dfa_states * sizeof (struct dfa_state *)); dfa_states[how_many_dfa_states - 1] = new_ds; /* How many states are in the DFA now? */ return how_many_dfa_states; } static struct dfa_state * create_dfa_state (void) { struct dfa_state *new_ds; if ((new_ds = malloc (sizeof (struct dfa_state))) == NULL) errabort (40, "malloc error in %s\n", "create_dfa_state()"); memset (new_ds, 0, sizeof (struct dfa_state)); new_ds->type = NORMAL; new_ds->scount = 0; new_ds->nfa_states = NULL; new_ds->tcount = 0; new_ds->transitions = NULL; new_ds->marked = 0; return new_ds; } void free_dfa (struct dfa_state **dfa_states) { size_t i; if (dfa_states) { for (i = 0; dfa_states[i] != NULL; i++) free_dfa_state (dfa_states[i]); free (dfa_states); } } static void free_dfa_state (struct dfa_state *ds) { size_t i; if (ds) { /* don't free individual nfa_states; free_nfa_states() does that */ free (ds->nfa_states); /* DO free individual dfa_states, however */ for (i = 0; i < ds->tcount; i++) free_dfa_transition (ds->transitions[i]); if (ds->transitions) free (ds->transitions); free (ds); } } static void free_dfa_transition (struct dfa_trans *dt) { if (dt) /* dt->label will get freed later by uni_free_strings() */ free (dt); } #ifdef STANDALONE_NFADFA_TEST #include "hashutil.h" #include "parsutil.h" #include "readcfg.h" xmlparse_environment xmlparse_env; static int print_dfa (void) { size_t i, j, k; if (dfa_states) { for (i = 0; dfa_states[i] != NULL; i++) { fprintf (stderr, "DFA state #%d:\n", i + 1); fprintf (stderr, "\tType = %s", (dfa_states[i]->type & INIT) ? "initial " : ""); fprintf (stderr, "%s", (dfa_states[i]->type & FINAL) ? "final " : ""); fprintf (stderr, "%s", (dfa_states[i]->type & NORMAL) ? "normal" : ""); fprintf (stderr, "\n"); fprintf (stderr, "\tComprises %d NFA states.\n", dfa_states[i]->scount); fprintf (stderr, "\t%d transitions:\n", dfa_states[i]->tcount); for (j = 0; j < dfa_states[i]->tcount; j++) { fprintf (stderr, "\t\tTransition #%d:\n", j + 1); fprintf (stderr, "\t\t\tLabel = %s\n", utf_16_to_utf_8 (dfa_states[i]->transitions[j]->label)); for (k = 0; dfa_states[k] != NULL; k++) if (dfa_states[i]->transitions[j]->new_state == dfa_states[k]) break; if (dfa_states[k]) fprintf (stderr, "\t\t\tEnd state = #%d\n", k + 1); else fprintf (stderr, "\t\t\tEnd state = (error)\n"); } fprintf (stderr, "\n"); } fprintf (stderr, "\n"); return i; } fprintf (stderr, "(No DFA to print)\n\n"); return 0; } static int print_nfa (void) { my_wchar_t *label; size_t i, j, k; if (nfa_states) { for (i = 0; i < how_many_nfa_states; i++) { fprintf (stderr, "NFA state #%d:\n", i + 1); fprintf (stderr, "\tType = %s", (nfa_states[i]->type & INIT) ? "initial " : ""); fprintf (stderr, "%s", (nfa_states[i]->type & FINAL) ? "final " : ""); fprintf (stderr, "%s", (nfa_states[i]->type & NORMAL) ? "normal" : ""); fprintf (stderr, "\n"); fprintf (stderr, "\t%d transitions:\n", nfa_states[i]->tcount); for (j = 0; j < nfa_states[i]->tcount; j++) { fprintf (stderr, "\t\tTransition #%d:\n", j + 1); label = nfa_states[i]->transitions[j]->label; fprintf (stderr, "\t\t\tLabel = %s\n", label ? utf_16_to_utf_8 (label) : "(epsilon)"); for (k = 0; nfa_states[k] != NULL; k++) if (nfa_states[i]->transitions[j]->new_state == nfa_states[k]) break; if (nfa_states[k]) fprintf (stderr, "\t\t\tEnd state = #%d\n", k + 1); else fprintf (stderr, "\t\t\tEnd state = (error)\n"); } fprintf (stderr, "\n"); } return i; } return 0; } int main (int argc, char **argv) { size_t i; struct rg_htable *ht; struct xml_element *xe; struct rg_htable_item *result; /* zero out the xmlparse_env structure */ memset (&xmlparse_env, 0, sizeof (xmlparse_env)); /* and set it to retain all of the parse tree */ xmlparse_env.keep_children = yes; readcfg (argc, argv); if (argc < 2) errabort (11, "no files to process\n"); optind = argc - 1; xmlparse_env.filelen = 1; xmlparse_env.xml_files = malloc (sizeof (xml_file) * xmlparse_env.filelen); for (i = 0; xmlparse_env.filelen > i; i++) xmlparse_env.xml_files[i] = create_xml_file (argv[optind + i]); for (i = 0; i < xmlparse_env.filelen; i++) parse_xml_file (xmlparse_env.xml_files[i]); for (i = 0; i < xmlparse_env.filelen; i++) if ((ht = xmlparse_env.xml_files[i]->element_names)) { result = rg_get_htable_items (ht); while (result != NULL) { xe = (struct xml_element *)result->data; if (xe->type == children) xe->compiled_content_model = make_dfa (xmlparse_env.xml_files[i], xe->content_model); result = rg_get_htable_items (NULL); } } return 0; } #endif /* STANDALONE_NFADFA_TEST */