/* ***************************************************************************** * * $RCSfile: hashutil.c,v $ * $Date: 1999/05/18 18:36:20 $ * $Source: /home/richard/Xml/RCS/hashutil.c,v $ * $Revision: 1.7 $ * $Author: richard $ * ***************************************************************************** * * Copyright 1998, Brown University and Richard Goerwitz * ***************************************************************************** * * General-purpose hashing utilities - somewhat complicated in that * they are capable of dealing with Unicode and/or regular strings. * Automatic conversions are done, where necessary. * * Note that it is possible to have more than one rg_htable open at * once: * * struct rg_htable *rg_create_htable (unsigned int nel); * Create and return a hash table optimized to hold nel * items. Returns NULL on malloc() error (note that most * routines here use NULL to signal other sorts of things). * * void rg_free_htable (struct rg_htable *ht); * Destroy ht, i.e., free all storage associated with it. * Note that this routine does not free storage associated * with the elements that were inserted (this, as with the * old C hsearch routines, is up to the programmer). * * void rg_free_htable_and_data (struct rg_htable *ht); * Destroy ht, i.e., free all storage associated with it. * Note that this routine DOES try to free storage associated * with the elements that were inserted into ht. Specifically * it assume that all item.key, item.uni_key, and item.data * fields are either NULL or point to malloc'd storage that * should be freed. * * struct rg_htable_item *rg_add_item (struct rg_htable *ht, * struct rg_htable_item it); * Add item it to hash table ht. Complexities: "it" is an * item structure, consisting of key, uni_key, and data * fields. If you want to insert a Unicode string into the * table, set it.key to NULL, and set it.uni_key to the * my_wchar_t array that holds your Unicode string. Your data * can be of any type. If it.key/uni_key isn't already * present, rg_add_item returns NULL. Otherwise, if the key * is already there, it overwrites the old item and returns * a static (item *) pointer (on which, see below). * * Typical code: * * struct item it, *result; * * it.uni_key = NULL; * it.key = strdup (some_word); * it.data = (void *)strdup (some_data); * if ((result = rg_add_item (htable, it)) != NULL) { * emit_warning_message ("overwrote existing key, %s\n", some_word); * if (result->key) free (result->key); * if (result->uni_key) free (result->uni_key); * free (result->data); * } * * Note that if you want to insert a Unicode string, you must * set it.key above to NULL, and then set it.uni_key to point * to a my_wchar_t array: * * it.key = NULL; * it.uni_key = some_utf_16_string; * * struct rg_htable_item *rg_delete_item (struct rg_htable *ht, * struct rg_htable_item it); * Delete item it from hash table ht. Returns a pointer to * a temporary copy of the deleted item, in much the same * way as rg_add_item does above when it overwrites existing * items. * * struct rg_htable_item *rg_get_htable_items (struct rg_htable *); * Use this function to get a listing of every item in hash * table ht. It is invoked as follows (assume that ht is an * already-created and filled-out hash table): * * struct rg_htable_item *result; * result = rg_get_htable_items (ht); * while (result != NULL) { * do something with result->key or uni_key and result->data * ... * result = rg_get_htable_items (NULL); * } * * struct rg_htable_item *rg_find_item (struct rg_htable *ht, * struct rg_htable_item it); * If an item whose key = it.key exists in ht, rg_find_item * returns a pointer to that item. Otherwise it returns * NULL. NULL here means failure, that is (compare this * with rg_add_item() above). * * All of the structure declarations are in hashutil.h, as are the * prototypes for the above functions. * * The reason, incidentally, that rg_add_item copies any items it * overwrites into a static buffer is so that that item's key and * data fields (which typically point to malloc'd storage) can be * freed by the user. In this case a nonnull return value does not * so much indicate success (as opposed to failure) as simply that * there may be work left to do. * * The only error numbers used here are 40 and 41 (malloc and realloc * failure). * ***************************************************************************** */ #include "hashutil.h" #include "errabort.h" #include "utfutil.h" static unsigned int rg_getprime (unsigned int); static unsigned int str_hfunc (char *, unsigned int); static unsigned int uni_hfunc (my_wchar_t *, unsigned int); static int uni_compare (my_wchar_t *, my_wchar_t *); #ifndef CHAR_BIT # define CHAR_BIT 8 #endif struct rg_htable * rg_create_htable (unsigned int nel) { struct rg_htable *ht; if ((ht = malloc (sizeof (struct rg_htable))) == NULL) return NULL; ht->no_items = 0U; ht->len = rg_getprime (nel / 3); ht->buckets = calloc (ht->len, sizeof (struct rg_htable_bucket)); if (ht->buckets == NULL) return NULL; return ht; } /* * rg_free_htable * * Normal routine for freeing a table created via rg_create_htable. * Does not free the data pointed to by items in that table. If you * want to free these, too, do it by hand, or see below on * rg_free_htable_and_data(). */ void rg_free_htable (struct rg_htable *ht) { unsigned int i; if (ht != NULL) { for (i = ht->len; i > 0; i--) free (ht->buckets[i - 1].items); free (ht->buckets); free (ht); } } /* * rg_free_htable_and_data * * Dangerous. Not only frees up all storage occupied by the hash * table, ht, itself, but also frees up all storage pointed to by * entries therein. Cf. rg_free_htable() above. * * NOTE THAT THIS ROUTINE ASSUMES THAT result->key, result->uni_key, * AND result->data ALL POINT TO MALLOC'D STORAGE. This assumption * will not hold for all tables!!! */ void rg_free_htable_and_data (struct rg_htable *ht) { struct rg_htable_item *result; result = rg_get_htable_items (ht); while (result != NULL) { if (result->key) free (result->key); if (result->uni_key) free (result->key); if (result->data) free (result->data); result = rg_get_htable_items (NULL); } rg_free_htable (ht); } /* * rg_add_item: * * Add item to hash table ht. If an item with the same key is * already present, clobber it. */ struct rg_htable_item * rg_add_item (struct rg_htable *ht, struct rg_htable_item item) { int j; void *key, *tablekey; size_t i, how_many_bytes; unsigned int subscript; struct rg_htable_bucket *phb; static struct rg_htable_item tmp_phb; int (*compare_strings)(); if (item.key) { /* use regular string methods */ compare_strings = strcmp; subscript = str_hfunc (item.key, ht->len); key = item.key; item.uni_key = NULL; } else { /* use the unicode string methods */ compare_strings = uni_compare; subscript = uni_hfunc (item.uni_key, ht->len); key = item.uni_key; } phb = &ht->buckets[subscript]; /* * If phb->len is zero, then we need to initialize buflen and the * item array. */ if (phb->len == 0) { /* number of items in ht will be increasing by one */ ht->no_items++; phb->len = 1; phb->buflen = 1; phb->items = malloc (sizeof (struct rg_htable_item) * phb->buflen); if (phb->items == NULL) errabort (40, "malloc error in %s\n", "rg_add_item()"); phb->items[0] = item; return NULL; } /* * Item array is already initialized. Add another item to it and * increment the len field. If there's not enough space in the item * array, increment buflen and enlarge the array accordingly. */ for (i = 0; phb->len > i; i++) { /* * Go through items in bucket pbh one by one, stopping at the * first one whose key is lexically less than item.key. Then * move all remaining items down one, and insert the new item * into the vacant spot. Note that we may get automatic UTF-8 * <-> UTF-16 conversions here. */ if (item.key) { tablekey = (void *)phb->items[i].key; if (tablekey == NULL && phb->items[i].uni_key) tablekey = (void *)utf_16_to_utf_8 (phb->items[i].uni_key); } else { tablekey = (void *)phb->items[i].uni_key; if (tablekey == NULL && phb->items[i].key) tablekey = (void *)utf_8_to_utf_16 (phb->items[i].key); } j = compare_strings (key, tablekey); if (j < 0) { /* number of items in ht will be increasing by one */ ht->no_items++; /* length of this particular bucket will increase, too */ phb->len++; if (phb->len > phb->buflen) { phb->buflen += (phb->buflen / 2) + 1; phb->items = realloc (phb->items, sizeof (struct rg_htable_item) * phb->buflen); if (phb->items == NULL) errabort (41, "realloc error in %s\n", "rg_add_item()"); } how_many_bytes = sizeof (struct rg_htable_item) * (phb->len - (i + 1)); /* * Checker reports that memmove is accessing the memory 3 * bytes before &phb->items[i]. I have no idea why it's * doing this. But then I haven't looked at memmove's src. */ memmove (&phb->items[i+1], &phb->items[i], how_many_bytes); phb->items[i] = item; return NULL; } /* * If an item with an identical key exists, replace it with * item. But, in case the programmer wants to free the objects * its key and data fields point to, return a pointer to a * temporary copy of the replaced item. */ else if (j == 0) { /* don't increment ht->no_items or phb->len */ tmp_phb = item; phb->items[i] = item; return &tmp_phb; } } /* * Item.key is lexically greater than anything else in phb->items; put * the new item at at end of the item array in the current bucket, phb. */ ht->no_items++; phb->len++; if (phb->len > phb->buflen) { phb->buflen += (phb->buflen / 2) + 1; phb->items = realloc (phb->items, sizeof (struct rg_htable_item) * phb->buflen); if (phb->items == NULL) errabort (41, "realloc error in %s\n", "rg_add_item()"); } phb->items[phb->len - 1] = item; return NULL; } /* * rg_delete_item: * * Delete item from hash table ht, if indeed an item with the same key * is already present. Return a pointer to a static buffer that will * temporarily hold the information contained in the deleted item. This * is so the user can free any storage associated with it. Return NULL * if no matching item is found in the table. */ struct rg_htable_item * rg_delete_item (struct rg_htable *ht, struct rg_htable_item item) { int j; void *key, *tablekey; size_t i, how_many_bytes; unsigned int subscript; struct rg_htable_bucket *phb; static struct rg_htable_item tmp_phb; int (*compare_strings)(); if (item.key) { /* use regular string methods */ compare_strings = strcmp; subscript = str_hfunc (item.key, ht->len); key = item.key; item.uni_key = NULL; } else { /* use the unicode string methods */ compare_strings = uni_compare; subscript = uni_hfunc (item.uni_key, ht->len); key = item.uni_key; } phb = &ht->buckets[subscript]; /* * If phb->len is zero, then there are no items in this bucket. So * there can't be an item in the table with a matching key. We have * nothing to do, in other words. Just return NULL. */ if (phb->len == 0) return NULL; /* * Okay, item.key hashes to a bucket with stuff in it. Check to see * if any of the items present has the same key field as item. If * so, then remove it after copying it into a static buffer. */ for (i = 0; phb->len > i; i++) { /* * Go through items in bucket. If we get to one with a key * lexically greater than item.key (i.e., j < 0), then there's * no matching item to delete. Just return NULL. Note that * we can get automatic UTF-8 <-> UTF-16 conversions here. */ if (item.key) { tablekey = (void *)phb->items[i].key; if (tablekey == NULL && phb->items[i].uni_key) tablekey = (void *)utf_16_to_utf_8 (phb->items[i].uni_key); } else { tablekey = (void *)phb->items[i].uni_key; if (tablekey == NULL && phb->items[i].key) tablekey = (void *)utf_8_to_utf_16 (phb->items[i].key); } j = compare_strings (key, tablekey); if (j < 0) return NULL; else if (j == 0) { tmp_phb = phb->items[i]; /* move the rest of items in the bucket back by one */ if ((how_many_bytes = sizeof (struct rg_htable_item) * (phb->len - (i + 1))) > 0) memmove (&phb->items[i], &phb->items[i+1], how_many_bytes); /* decrement number of items in hash table by one */ ht->no_items--; /* decrement number of items in the current bucket by one */ phb->len--; return &tmp_phb; } } /* * All the items in phb->items have keys lexically less than * item.key. Nothing to do. Return NULL. */ return NULL; } /* * rg_find_item: * * If an entry with the same key as item exists in hash table * ht, then return a pointer to that entry (i.e., to the item * struct having that key). Otherwise return NULL. NULL, that * is, indicates that no such item exists in ht. */ struct rg_htable_item * rg_find_item (struct rg_htable *ht, struct rg_htable_item item) { int j; size_t i; void *key, *tablekey; unsigned int subscript; struct rg_htable_bucket *phb; int (*compare_strings)(); if (item.key) { /* use regular string methods */ compare_strings = strcmp; subscript = str_hfunc (item.key, ht->len); key = item.key; item.uni_key = NULL; } else { /* use the unicode string methods */ compare_strings = uni_compare; subscript = uni_hfunc (item.uni_key, ht->len); key = item.uni_key; } phb = &ht->buckets[subscript]; for (i = 0; phb->len > i; i++) { /* Note that we may get automatic UTF-8 <-> UTF-16 conversions * here if the hashtable has keys of a different type than the * key being used in (item)it. */ if (item.key) { tablekey = (void *)phb->items[i].key; if (tablekey == NULL && phb->items[i].uni_key) tablekey = (void *)utf_16_to_utf_8 (phb->items[i].uni_key); } else { tablekey = (void *)phb->items[i].uni_key; if (tablekey == NULL && phb->items[i].key) tablekey = (void *)utf_8_to_utf_16 (phb->items[i].key); } j = compare_strings (key, tablekey); if (j == 0) return &phb->items[i]; else if (j < 0) return NULL; } return NULL; } /* * rg_get_htable_items: * * Produce, successively, each item in hash table ht. To obtain * the first item in ht, use rg_get_htable_items (ht). To obtain * the remaining items in ht, call rg_get_htable_items() with a * NULL argument. When finished, rg_get_htable_items() returns * a NULL pointer. */ struct rg_htable_item * rg_get_htable_items (struct rg_htable *ht) { static size_t j; static unsigned int i; static struct rg_htable *saved_ht; if (ht != NULL) { saved_ht = ht; if (saved_ht->no_items == 0) return NULL; j = i = 0; while (saved_ht->buckets[i].len == 0) i++; } else if (++j == saved_ht->buckets[i].len) { j = 0; do { if (++i == saved_ht->len) return NULL; } while (saved_ht->buckets[i].len == 0); } return &saved_ht->buckets[i].items[j]; } static unsigned int rg_getprime (unsigned int begin) { if (begin < 10) return 11U; else if (begin < 20) return 23U; else if (begin < 50) return 53U; else if (begin < 200) return 211U; else if (begin < 1000) return 1009U; else if (begin < 5000) return 5003U; else if (begin < 25000) return 25013U; else if (begin < 50000) return 50021U; else if (begin < 150000) return 150001U; else if (begin < 1000000) return 1000003U; else if (begin < 1000000000) return 2147483069U; else return 1U; } /* * str_hfunc * * Simple hash function for regular strings (nil-terminated arrays * of single-byte characters). */ static unsigned int str_hfunc (char *key, unsigned int m) { unsigned int i, j; unsigned int offset = 0; i = strlen (key); for (j = 0; i > j; j++) offset = ((offset << CHAR_BIT) + key[j]) % m; return offset; } /* * uni_hfunc * * Simple hash function for unicode (wide-char) strings. These * must be 0-terminated. */ static unsigned int uni_hfunc (my_wchar_t *uni_key, unsigned int m) { char *p; unsigned int hashval; /* convert to UTF-8 and then generate hash */ if ((p = strdup (utf_16_to_utf_8 (uni_key))) == NULL) errabort (40, "malloc error in %s\n", "uni_hfunc()"); hashval = str_hfunc (p, m); free (p); return hashval; } /* * uni_compare: * * Compare Unicode strings, à la strcmp(s1, s2). Returns 1 if s1 is * lexically greater than s2, 0 if they are identical, and -1 if s1 * is lexically less than s2. Note that no locale variations are * considered. */ int uni_compare (my_wchar_t *s1, my_wchar_t *s2) { while (*s1 == *s2++) if (*s1++ == 0) return 0; /* Convert to integral type and compare */ if ((unsigned int)*s1 > (unsigned int)*(s2 - 1)) return 1; return -1; } #ifdef STANDALONE_HASHUTIL_TEST #include #include "readcfg.h" void rg_print_htable (struct rg_htable *ht) { size_t j; unsigned int i; printf ("--> Printing hash table:\n"); printf ("--> \tlen = %u\n", ht->len); for (i = 0; ht->len > i; i++) { printf ("--> \tContents of hash bucket #%u:\n", i+1); printf ("--> \tbuffer size = %u\n", ht->buckets[i].buflen); for (j = 0; ht->buckets[i].len > j; j++) if (ht->buckets[i].items[j].key) printf ("--> \t\tItem #%u: key = %s, data = %s\n", j+1, ht->buckets[i].items[j].key, (char *)ht->buckets[i].items[j].data); else { printf ("--> \t\tItem #%u: uni_key = %s, data = ", j+1, utf_16_to_utf_8 (ht->buckets[i].items[j].uni_key)); printf ("%s\n", utf_16_to_utf_8 (ht->buckets[i].items[j].data)); } } } xmlparse_environment xmlparse_env; int main (int argc, char **argv) { int i; FILE *intext; struct rg_htable *ht; struct rg_htable_item item, *result; char linebuf[2048], otherbuf[2056]; my_wchar_t uni_otherbuf[2056 * 4]; readcfg (argc, argv); if (argc < 2 || ! (intext = fopen (argv[argc - 1], "r"))) { printf ("Eek; can't find any input file.\n"); exit (3); } printf ("Creating hash table...\n"); if ((ht = rg_create_htable (10)) == NULL) { printf ("Error creating hash table.\n"); exit (1); } #ifdef DEBUG rg_print_htable (ht); #else printf ("Number of buckets in table: %u\n", ht->len); #endif /* DEBUG */ printf ("Finding/entering items into the hash table.\n"); while (fgets (linebuf, 2048, intext)) { if (*linebuf != '\0') linebuf[strlen (linebuf) - 1] = '\0'; item.uni_key = NULL; item.key = strdup (linebuf); strcpy (otherbuf, linebuf); strcat (otherbuf, "-data"); item.data = (void *)strdup (&otherbuf[0]); if ((result = rg_find_item (ht, item))) { printf ("\tKey \"%s\" already in the hash table.\n", item.key); printf ("\tOld data: %s\n", (char *)result->data); rg_add_item (ht, item); #ifdef DEBUG rg_print_htable (ht); #endif /* DEBUG */ if ((result = rg_find_item (ht, item)) == NULL) { printf ("\tError: new item was not properly inserted.\n"); exit (2); } else printf ("\tNew data: %s\n", (char *)result->data); } else { printf ("\tKey \"%s\" not found in table.\n", item.key); printf ("\tInserting...\n"); rg_add_item (ht, item); #ifdef DEBUG rg_print_htable (ht); #endif /* DEBUG */ if ((result = rg_find_item (ht, item)) == NULL) { printf ("\tError: new item was not properly inserted.\n"); exit (2); } else printf ("\tDone.\n"); } } rg_print_htable (ht); printf ("Total number of items in table: %u\n", ht->no_items); printf ("Full inventory of items:\n"); result = rg_get_htable_items (ht); i = 0; while (result != NULL) { printf ("Item #%d: key (%s), data (%s)\n", ++i, result->key, (char *)result->data); result = rg_get_htable_items (NULL); } printf ("Now we'll try removing some items (key = \"from\", \"stg\", \"brown\").\n"); item.key = "from"; if (rg_delete_item (ht, item) == NULL) printf ("Failure deleting item (key = \"from\").\n"); item.key = "stg"; if (rg_delete_item (ht, item) == NULL) printf ("Failure deleting item (key = \"stg\").\n"); item.key = "brown"; if (rg_delete_item (ht, item) == NULL) printf ("Failure deleting item (key = \"brown\").\n"); printf ("Now we'll try removing an item not in the list (key = \"richard\").\n"); item.key = "richard"; if (rg_delete_item (ht, item) != NULL) printf ("Whoops. Should have failed.\n"); printf ("New inventory of items:\n"); result = rg_get_htable_items (ht); i = 0; while (result != NULL) { printf ("Item #%d: key (%s), data (%s)\n", ++i, result->key, (char *)result->data); result = rg_get_htable_items (NULL); } printf ("Freeing up old table.\n"); result = rg_get_htable_items (ht); while (result != NULL) { free (result->key); free (result->data); result = rg_get_htable_items (NULL); } rg_free_htable (ht); printf ("Done.\n"); /* Unicode stuff */ printf ("Creating hash table again (with Unicode strings)...\n"); if ((ht = rg_create_htable (10)) == NULL) { printf ("Error creating hash table.\n"); exit (1); } #ifdef DEBUG rg_print_htable (ht); #else printf ("Number of buckets in table: %u\n", ht->len); #endif /* DEBUG */ printf ("Finding/entering items into the hash table.\n"); rewind (intext); while (fgets (linebuf, 2048, intext)) { if (*linebuf != '\0') linebuf[strlen (linebuf) - 1] = '\0'; item.key = NULL; item.uni_key = uni_strdup (utf_8_to_utf_16 (linebuf)); uni_strcpy (uni_otherbuf, item.uni_key); uni_strcat (uni_otherbuf, utf_8_to_utf_16 ("-data")); item.data = (void *)uni_strdup (uni_otherbuf); if ((result = rg_find_item (ht, item))) { printf ("\tKey \"%s\" already in the hash table.\n", utf_16_to_utf_8 (item.uni_key)); printf ("\tOld data: %s\n", utf_16_to_utf_8 (result->data)); rg_add_item (ht, item); if ((result = rg_find_item (ht, item)) == NULL) { printf ("\tError: new item was not properly inserted.\n"); exit (2); } else printf ("\tNew data: %s\n", utf_16_to_utf_8 (result->data)); } else { printf ("\tKey \"%s\" not found in table.\n", utf_16_to_utf_8 (item.uni_key)); printf ("\tInserting...\n"); rg_add_item (ht, item); if ((result = rg_find_item (ht, item)) == NULL) { printf ("\tError: new item was not properly inserted.\n"); exit (2); } else printf ("\tDone.\n"); } } rg_print_htable (ht); printf ("Total number of items in table: %u\n", ht->no_items); printf ("Full inventory of items:\n"); result = rg_get_htable_items (ht); i = 0; while (result != NULL) { printf ("Item #%d: key (%s), data (", ++i, utf_16_to_utf_8 (result->uni_key)); printf ("%s)\n", utf_16_to_utf_8 (result->data)); result = rg_get_htable_items (NULL); } printf ("Now we'll try removing some items (key = \"from\", \"stg\", \"brown\").\n"); item.key = "from"; if (rg_delete_item (ht, item) == NULL) printf ("Failure deleting item (key = \"from\").\n"); item.key = "stg"; if (rg_delete_item (ht, item) == NULL) printf ("Failure deleting item (key = \"stg\").\n"); item.key = NULL; item.uni_key = utf_8_to_utf_16 ("brown"); if (rg_delete_item (ht, item) == NULL) printf ("Failure deleting item (key = \"brown\").\n"); printf ("Now we'll try removing an item not in the list (key = \"richard\").\n"); item.uni_key = utf_8_to_utf_16 ("richard"); if (rg_delete_item (ht, item) != NULL) printf ("Whoops. Should have failed.\n"); printf ("New inventory of items:\n"); result = rg_get_htable_items (ht); i = 0; while (result != NULL) { printf ("Item #%d: key (%s), data (", ++i, utf_16_to_utf_8 (result->uni_key)); printf ("%s)\n", utf_16_to_utf_8 (result->data)); result = rg_get_htable_items (NULL); } printf ("Freeing up old table.\n"); result = rg_get_htable_items (ht); while (result != NULL) { free (result->key); free (result->data); result = rg_get_htable_items (NULL); } rg_free_htable (ht); printf ("Done.\n"); return 0; } #endif /* STANDALONE_HASHUTIL_TEST */