/* ***************************************************************************** * * $RCSfile: xtrautil.c,v $ * $Date: 1999/03/12 17:55:17 $ * $Source: /home/richard/Xml/RCS/xtrautil.c,v $ * $Revision: 1.13 $ * $Author: richard $ * ***************************************************************************** * * Copyright 1998, Brown University and Richard Goerwitz * ***************************************************************************** * * Extra utilities that don't fit in anywhere else: * * push_state (state_struct *) - * pop_state (state_struct *) - routines for maniplating int-valued states * top_state (state_struct *) - * * absolutize (char *) - convert path relative to cwd into an absolute * absolutize_relative_to (char *, char *) - absolutize relative to path * expand_hostname_in_uri (char *) - http://goon/ -> http://goon.stg.brown.edu/ * * unescape_uri (char *) - destructively unescape %hex-escaped chars * escape_uri_string (char *) - %hex escape dangerous chars (calls malloc) * escape_uri (char *) - %hex escape all but reserved chars :/?# * * Note that absolutize() will work on URLs, in the sense that when it * sees something with the string "://" in it, absolutize() won't try to * interpret it as a path within the filesystem. * * Although I doubt they'll be used much, this file also has: * * space_2_plus (char *) - destructively map ' ' to '+' * plus_2_space (char *) - destructively map '+' to ' ' * ***************************************************************************** */ #include "general.h" #include "xtrautil.h" #include "errabort.h" #include #include #include #include /* Convert %hex-escaped char to regular char */ static char x2c (char *); /* * push_block * * Pushes a pointer to some block of data onto a list. If len * (arg 3) is nonzero, malloc len bytes, copy the contents of * block into this newly malloc'd block, then push a pointer to * that block instead of arg 2. * * Normally len will be zero, unless it's a string that is being * pushed. Note that strings must later be freed! * * This routine is used primarily in the parser (parsutil.y) to * accumulate pointers to things like attributes returned by left- * recursive rules (a : a b). */ struct malloc_block_list * push_block (struct malloc_block_list *mbl, void *block, size_t len) { if (mbl == NULL) { if ((mbl = malloc (sizeof (struct malloc_block_list))) == NULL) errabort (40, "malloc() error in %s\n", "push_block()"); mbl->buflen = 1; mbl->blocks = malloc (mbl->buflen * sizeof (void *)); } else { mbl->buflen++; if ((mbl->blocks = realloc (mbl->blocks, mbl->buflen * sizeof (void *))) == NULL) errabort (41, "realloc() error in %s\n", "push_block()"); } /* if len is zero, don't bother copying anything */ if (len == 0) mbl->blocks[mbl->buflen - 1] = block; else { mbl->blocks[mbl->buflen - 1] = malloc (len); if (mbl->blocks[mbl->buflen - 1] == NULL) errabort (40, "malloc() error in %s\n", "push_block()"); memcpy (mbl->blocks[mbl->buflen - 1], block, len); } return mbl; } /* * pop_block * * Opposite of push_block (see above). Basically, pop_block just * returns a pointer to the last block of data passed to the * push_block() routine. If, after doing this, the list becomes * empty, pop_block() frees that list. */ void * pop_block (struct malloc_block_list *mbl) { void *retval; if (mbl == NULL || mbl->buflen == 0) return NULL; --mbl->buflen; retval = mbl->blocks[mbl->buflen]; if (mbl->buflen == 0) { free (mbl->blocks); mbl->blocks = NULL; } return retval; } /* * push_state * * Push int-valued state st onto state_stack ss. Obviously, this * routine can be used with ints or values declared as enums. See * also top_state() and pop_state() below. * * It is assumed that ss->states will be NULL when this routine is * first called. So be sure to initialize ss to { NULL, 0, 0 }; */ int push_state (struct state_stack *ss, int st) { if (ss->states == NULL) { ss->buflen = 5; if ((ss->states = malloc (sizeof (st) * ss->buflen)) == NULL) errabort (40, "malloc() failure in %s\n", "push_state"); } if (++ss->len > ss->buflen) { ss->buflen = ss->len; if ((ss->states = realloc (ss->states, sizeof (st) * ss->buflen)) == NULL) errabort (41, "realloc() failure in %s\n", "push_state"); } ss->states[ss->len - 1] = st; return st; } /* * pop_state * * Pop an int-valued state off of state_stack ss. See also * push_state() above and top_state() below. If there is no * state on the stack, returns 0 (which is presumed to be the * initial state; set up your defines or enums accordingly). */ int pop_state (struct state_stack *ss) { if (ss->len) ss->len--; /* default to the start state */ if (ss->len == 0) return 0; return ss->states[ss->len]; } /* * top_state * * Return top state on state_stack ss (it will be an int). See also * push_state() and pop_state() above. If the stack is empty, * returns 0, which is presumed to be the initial state (hence, if * your states are set up as an enum, make sure the first state in * the list is the initial one). */ int top_state (struct state_stack *ss) { /* default to the start state */ if (ss->len == 0) return 0; return ss->states[ss->len - 1]; } /* * absolutize * * Convert path relative to cwd into an absolute path. Returns a * pointer to a static buffer that may be overwritten on subsequent * calls. * * Note that if path is a URL (i.e., a path containing the string * "://"), then absolutize() won't touch it except to collapse * strings of slashes other than the ones after the colon in "://" * down to a single slash. */ char * absolutize (char *path) { size_t len, len2; static size_t buflen; static char *buf = NULL; char wd[MAXPATHLEN + 1]; char *p, *p2, *is_url = NULL;; len = strlen (path); if (buf == NULL) { if ((buf = malloc (buflen = len + 2)) == NULL) errabort (40, "malloc() error in %s\n", "absolutize()"); } else { if ((len + 2) > buflen) if ((buf = realloc (buf, buflen = len + 2)) == NULL) errabort (41, "realloc() error in %s\n", "absolutize()"); } if (*path == '/' || (is_url = strstr (path, "://"))) memcpy (buf, path, len + 1); else { if (getcwd (wd, MAXPATHLEN) == NULL) errabort (80, "can't determine working dir\n"); len2 = strlen (wd); if ((len2 + len + 2) > buflen) if ((buf = realloc (buf, buflen = len2 + len + 3)) == NULL) errabort (41, "realloc() error in %s\n", "absolutize()"); memcpy (buf, wd, len2 + 1); for (p = buf + len2; p > buf; p--, len2--) if (! (*p == '/' || *p == '\0')) break; if (p > buf) *(p + 1) = '/'; memcpy (buf + len2 + 2, path, len + 1); } /* collapse sequences of slashes down to a single slash, unless they * follow a colon, ':', in which case allow as many as three slashes * in a row after the colon */ for (p = p2 = buf; *p != '\0'; p++) { if (is_url && *p == ':') { *p2++ = *p; if (*(p + 1) == '/' && *(p + 2) == '/') { *p2++ = *++p; *p2++ = *++p; } continue; } else if (! (*p == '/' && *(p + 1) == '/')) *p2++ = *p; } if (! is_url) if (! (--p2 > buf && *p2 == '/')) p2++; *p2 = '\0'; return buf; } /* * absolutize_relative_to * * Convert a path (arg 1) relative to absolute_path (arg 2) into an * absolute path. I.e., absolutize the relative_path given as arg1. * Returns a pointer to static storage space that may change on * subsequent calls. * * NOTE WELL: If absolute_path is only used up to the last slash. If * it doesn't end in a slash, that means that its last (filename) * component will be ignored for purposes of absolutizing arg 1. */ char * absolutize_relative_to (char *relative_path, char *absolute_path) { int is_url; size_t len, len2; static size_t buflen; static char *buf = NULL; char *p, *p2, *absolute_dir; if (*relative_path == '\0') return NULL; is_url = 0; if (strstr (absolute_path, "://")) is_url = 1; else if (*absolute_path != '/') errabort (16, "no leading slash in %s\n", absolute_path); len = strlen (relative_path); if (buf == NULL) { if ((buf = malloc (buflen = 64)) == NULL) errabort (40, "malloc() error in %s\n", "absolutize_relative_to()"); } else { if ((len + 2) > buflen) if ((buf = realloc (buf, buflen = len + 2)) == NULL) errabort (41, "realloc() error in %s\n", "absolutize_relative_to()"); } /* if relative_path is not really relative, just copy it into * buf; otherwise, append it to absolute_path, with (if needed) * an intervening slash */ if ((! is_url && *relative_path == '/') || (strstr (relative_path, "://") && (is_url = 1))) memcpy (buf, relative_path, len + 1); else { absolute_dir = strdup (absolute_path); if (is_url && *relative_path == '#') { /* relative_path is a fragment URL; just check absolute_dir * to see if it contains any directory slashes or question * marks (everything after '?' is QUERY_STRING; skip it) */ for (p = p2 = absolute_dir; *p != '\0'; p++) { if (is_url && *p == ':' && *(p + 1) == '/' && *(p + 2) == '/') p += 3; else if (*p == '/') /* p2 gets used later on */ p2 = p; else if (*p == '?') { *p = '\0'; break; } } } else { /* relative_path is not a fragment URL beginning with '#', so * then find the last slash in absolute_dir, and cut the string * off (e.g.: /usr/local/sgml/stuff.xml -> /usr/local/sgml) */ for (p = p2 = absolute_dir; *p != '\0'; p++) { if (is_url && *p == ':' && *(p + 1) == '/' && *(p + 2) == '/') p += 3; else if (*p == '/') p2 = p; else if (is_url && *p == '?') /* everything after '?' is QUERY_STRING; skip it */ break; } if (! is_url || p2 > absolute_dir) *p2 = '\0'; } len2 = strlen (absolute_dir); if ((len2 + len + 2) > buflen) if ((buf = realloc (buf, buflen = len2 + len + 2)) == NULL) errabort (41, "realloc() error in %s\n", "absolutize_relative_to()"); memcpy (buf, absolute_dir, len2); free (absolute_dir); /* Tack on trailing slash if this isn't a fragment URL. Note, * though: if p2 == absolute_dir, then we may have a URL like * http://host, in which case we have to tack on a slash, even * if the URL is a fragment URL */ if (p2 == absolute_dir || ! (is_url && *relative_path == '#')) if (len2 == 0 || buf[len2 - 1] != '/') { buf[len2] = '/'; len2++; } /* append relative path to absolute */ memcpy (buf + len2, relative_path, len + 1); } /* collapse sequences of slashes down to a single slash, unless they * follow a colon, ':', in which case allow as many as three slashes * in a row after the colon */ for (p = p2 = buf; *p != '\0'; p++) { if (is_url && *p == ':') { *p2++ = *p; if (*(p + 1) == '/' && *(p + 2) == '/') { *p2++ = *++p; *p2++ = *++p; } } else if (! (*p == '/' && *(p + 1) == '/')) *p2++ = *p; } if (! is_url) if (! (--p2 > buf && *p2 == '/')) p2++; *p2 = '\0'; return buf; } char * unescape_uri (char *uri) { int i, j; for (i = j = 0; uri[j]; i++, j++) { uri[i] = uri[j]; /* If we have a hex character escape, e.g., "%A4"... */ if (uri[j] == '%' && uri[j + 1] && uri[j + 2]) { /* ...substitute its value for uri[i] (which is now '%'). */ uri[i] = x2c (&uri[j+1]); j += 2; } } uri[i] = '\0'; return uri; } char * escape_uri (char *uri) { int i, j; char *new_uri; if (uri == NULL) errabort (43, "unexpectedly null uri in %s\n", "escape_uri()"); new_uri = malloc ((strlen (uri) * 3) + 1); for (i = 0, j = 0; uri[i] != '\0'; i++, j++) { if (uri[i] == '.' || uri[i] == '_' || uri[i] == '-' || uri[i] == '&' || uri[i] == '=' || uri[i] == '#' || uri[i] == '?' || uri[i] == ':' || uri[i] == '/' || (isascii (uri[i]) && isalnum (uri[i]))) new_uri[j] = uri[i]; else { /* if it's not already hex escaped, ... */ if (uri[i] == '%' && isxdigit (uri[i + 1]) && isxdigit (uri[i + 2])) { new_uri[j++] = uri[i++]; new_uri[j++] = uri[i++]; new_uri[j] = uri[i]; } else { /* ...escape it now */ sprintf (&new_uri[j], "%%%2x", (unsigned char)uri[i]); j += 2; } } } new_uri[j] = '\0'; new_uri = realloc (new_uri, j + 1); return new_uri; } char * escape_uri_string (char *uri) { int i, j; char *new_uri; new_uri = malloc ((strlen (uri) * 3) + 1); for (i = 0, j = 0; uri[i] != '\0'; i++, j++) if (isascii (uri[i]) && isalnum (uri[i])) new_uri[j] = uri[i]; else { sprintf (&new_uri[j], "%%%2x", (unsigned char)uri[i]); j += 2; } new_uri[j] = '\0'; new_uri = realloc (new_uri, j + 1); return new_uri; } char * plus_2_space (char *str) { int i; for (i = 0; str[i]; i++) if (str[i] == '+') str[i] = ' '; return str; } char * space_2_plus (char *str) { int i; for (i = 0; str[i]; i++) if (str[i] == ' ') str[i] = '+'; return str; } static char x2c (char *what) { char digit; digit = (what[0] >= 'A' ? ((what[0] & 0xdf) - 'A') + 10 : (what[0] - '0')); digit *= 16; digit += (what[1] >= 'A' ? ((what[1] & 0xdf) - 'A') + 10 : (what[1] - '0')); return digit; } /* * expand_hostname_in_uri * * Expands http://goon/ to http://goon.stg.brown.edu/. I.e., this * function fully expands hostnames in URLs. * * Cautions: Be sure to pass it a malloc'd string as uri (arg 1). * This URI will usually get freed. In its place a newly malloc'd * string is generally returned. So the normal calling pattern * should be: * * 1) uri = malloc (...) * 2) strcpy (uri, ...) * 3) uri = expand_hostname_in_uri (uri) * 4) free (uri) */ char * expand_hostname_in_uri (char *uri) { char c, *p, **pp; struct hostent *he; size_t i, len, count1, count2; char *hostname, *suffix, *new_uri, *new_hostname; /* if you change the 16 here, change the 17 below */ char prefix[16]; /* Code here is vaguely similar to what's in refers_to_localhost() */ if (uri == NULL) return uri; /* skip leading whitespace */ for (p = uri; isspace (*p); p++); /* find prefix (e.g., "http://") */ if (strncasecmp (p, "http://", 7) == 0) len = 7; else if (strncasecmp (p, "ftp://", 6) == 0) len = 6; else /* not an ftp: or http: URI; skip hostname check */ return uri; c = p[len]; p[len] = '\0'; memcpy (prefix, p, (len + 1) * sizeof (char)); p[len] = c; /* find hostname, e.g., "goon.stg.brown.edu" */ p = &p[len]; for (len = 0; p[len] && (isalnum (p[len]) || strchr ("-.", p[len])); len++) if (p[len] == '.' && p[len + 1] == '.') { len++; break; } c = p[len]; p[len] = '\0'; hostname = strdup (p); p[len] = c; /* after prefix and hostname, the rest of the URI is the "suffix" */ suffix = strdup (&p[len]); if (hostname == NULL || suffix == NULL) errabort (40, "malloc() error in %s\n", "expand_hostname_in_uri"); if (len && (he = gethostbyname (hostname))) /* URI better have been malloc'd */ free (uri); else { /* hostname is empty or unresolvable; punt */ free (hostname); free (suffix); return uri; } /* Expand, e.g., "www" to "www.stg.brown.edu" */ new_hostname = NULL; if (len && hostname[len - 1] == '.') new_hostname = hostname; else { if (strcasecmp (hostname, he->h_name) == 0) /* an exact match with canonical name; use it */ new_hostname = he->h_name; else { /* see if hostname's a substring of anything in he->h_aliases */ if (strncasecmp (hostname, he->h_name, len) == 0 && (! he->h_name[len] || he->h_name[len] == '.')) new_hostname = he->h_name; for (pp = he->h_aliases; *pp; pp++) { if (strncasecmp (hostname, *pp, len) == 0 && ((*pp)[len] == '\0' || (*pp)[len] == '.')) { if (! new_hostname) new_hostname = *pp; else { for (count1 = i = 0; new_hostname[i]; i++) if (new_hostname[i] == '.') count1++; for (count2 = i = 0; (*pp)[i]; i++) if ((*pp)[i] == '.') count2++; /* use the name with the most dots */ if (count2 > count1) new_hostname = *pp; } } } } } if (! new_hostname) /* use canonical hostname as last resort */ new_hostname = he->h_name; /* now build a new uri out of prefix, hostname, suffix */ len = strlen (new_hostname) + strlen (suffix) + 17; if ((new_uri = malloc (len * sizeof (char))) == NULL) errabort (40, "malloc() error in %s\n", "expand_hostname_in_uri()"); sprintf (new_uri, "%s%s%s", prefix, new_hostname, suffix); /* done with hostname, suffix */ free (hostname); free (suffix); return new_uri; } /* * refers_to_localhost * * Returns true if the hostname in uri (arg 1) refers to the local * host. */ int refers_to_localhost (char *uri) { size_t i, len; struct hostent *he, *local_he; static char **localaddrs = NULL; char c, *p, **pp, **pp2, *hostname; /* Code here is vaguely similar to what's in expand_hostname_in_uri() */ if (uri == NULL) return 0; /* get hostent for local host */ if (localaddrs == NULL) { if ((local_he = gethostbyname ("localhost")) == NULL) if ((local_he = gethostbyname ("127.0.0.1")) == NULL) /* no loopback interface, I guess; return false */ return 0; for (len = 0, pp = local_he->h_addr_list; *pp; pp++, len++) localaddrs = malloc ((len + 1) * sizeof (char *)); localaddrs[len] = NULL; for (i = 0; i < len; i++) { localaddrs[i] = malloc (local_he->h_length); memcpy (localaddrs[i], local_he->h_addr_list[i], local_he->h_length); } } /* skip leading whitespace */ for (p = uri; isspace (*p); p++); /* find prefix (e.g., "http://") */ if (strncasecmp (p, "http://", 7) == 0) len = 7; else if (strncasecmp (p, "ftp://", 6) == 0) len = 6; else /* not an ftp: or http: URI; assume the URI isn't local */ return 0; /* find hostname, e.g., "goon.stg.brown.edu" */ p = &p[len]; for (len = 0; p[len] && (isalnum (p[len]) || strchr ("-.", p[len])); len++) if (p[len] == '.' && p[len + 1] == '.') { len++; break; } c = p[len]; p[len] = '\0'; hostname = strdup (p); p[len] = c; /* If hostname is empty, ends in '.', or doesn't resolve, return false */ if (len == 0 || (he = gethostbyname (hostname)) == NULL) { free (hostname); return 0; } free (hostname); for (pp = localaddrs; *pp; pp++) for (pp2 = he->h_addr_list; *pp2; pp2++) if (memcmp (*pp, *pp2, he->h_length) == 0) /* "he" points to local host */ return 1; /* "he" doesn't point to an entry for the local host */ return 0; } #ifdef STANDALONE_XTRAUTIL_TEST #include "readcfg.h" #include "fileutil.h" #include "strutil.h" xmlparse_environment xmlparse_env; int main (int argc, char **argv) { int lineno = 0; char *tmp, *buf, *escaped_buf, *unescaped_buf; readcfg (argc, argv); while ((buf = getline (stdin, "stdin", &lineno)) != NULL) { trim (buf, "\n"); tmp = expand_hostname_in_uri (strdup (buf)); printf ("%s canonicalizes as -> %s\n", buf, tmp); free (tmp); printf ("%s absolutizes as -> %s\n", buf, absolutize (buf)); tmp = absolutize_relative_to (buf, "/oops"); printf ("Relative to \"/oops\", %s absolutizes as -> %s\n", buf, tmp ? tmp : "(null)"); tmp = absolutize_relative_to (buf, "/blast/it"); printf ("Relative to \"/blast/it\", %s absolutizes as -> %s\n", buf, tmp ? tmp : "(null)"); tmp = absolutize_relative_to (buf, "/"); printf ("Relative to \"/\", %s absolutizes as -> %s\n", buf, tmp ? tmp : "(null)"); tmp = absolutize_relative_to (buf, "http://goon.stg.brown.edu/index.html?cgi-data"); printf ("Relative to \"http://goon.stg.brown.edu/index.html?cgi-data/!!\", %s absolutizes as -> %s\n", buf, tmp ? tmp : "(null)"); tmp = absolutize_relative_to (buf, "http://goon.stg.brown.edu/index.html"); printf ("Relative to \"http://goon.stg.brown.edu/index.html\", %s absolutizes as -> %s\n", buf, tmp ? tmp : "(null)"); printf ("Relative to \"http:///\", %s absolutizes as -> %s\n", buf, absolutize_relative_to (buf, "http:///")); printf ("Relative to \"http://goon.stg.brown.edu\", %s absolutizes as -> %s\n", buf, absolutize_relative_to (buf, "http://goon.stg.brown.edu")); printf ("Testing URI-(un)escaping routines:\n"); escaped_buf = escape_uri (buf); printf ("Escaped URI: %s\n", escaped_buf); unescaped_buf = unescape_uri (strdup (escaped_buf)); printf ("Unscaped URI: %s\n", unescaped_buf); if (strcmp (buf, unescaped_buf) != 0) printf ("\tProblem with (un)escaping routines.\n"); else printf ("\t(Un)escaping routines are okay.\n"); free (unescaped_buf); free (escaped_buf); } return 0; } #endif /* STANDALONE_XTRAUTIL_TEST */