/* Parse ABNF to locate dangling references and undefined symbols * does limited syntax checking * * This is a quick&dirty hack donated to public domain by Chris Newman */ #include #include #include #include #define ERROR 0 #define COMMENT 1 #define RULEDEF 2 #define AMENDRULE 3 #define HEXMODE 4 #define BINMODE 5 #define DECMODE 6 #define ALTERNATE 7 #define REPEAT 8 #define BEGINGROUP 9 #define ENDGROUP 10 #define BEGINOPTION 11 #define ENDOPTION 12 #define RANGE 13 #define LITLIST 14 #define NUMBER 15 #define HEXNUMBER 16 #define QUOTED 17 #define PROSE 18 #define RULENAME 19 #define NEWLINE 20 static const char *token_name[] = { "error", "comment", "ruledef", "amendrule", "hexmode", "binmode", "decmode", "alternate", "repeat", "begingroup", "endgroup", "beginoption", "endoption", "range", "litlist", "number", "hexnumber", "quoted", "prose", "rulename", "newline" }; /* simple rulename datastructure */ struct rulename { struct rulename *next; int defined, referenced; int len; char data[1]; }; #define HTSIZE 1023 static struct rulename *htable[HTSIZE]; static unsigned char hex_table[256]; const static unsigned char hexchars[] = "0123456789ABCDEF"; static const char *baserules[] = { "ALPHA", "BIT", "CHAR", "CR", "CRLF", "CTL", "DIGIT", "DQUOTE", "HEXDIG", "HTAB", "LF", "LWSP", "OCTET", "SP", "VCHAR", "WSP", NULL }; const unsigned char cvt_to_lowercase[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; #define TOLOWER(c) (cvt_to_lowercase[(unsigned char)(c)]) /* returns pointer to end of token, NULL if no tokens left in line */ char *gettoken(char *ptr, int *tokentype, int hexmode) { *tokentype = ERROR; switch (*ptr++) { case '\0': return (NULL); case '\r': if (*ptr == '\n') ++ptr; case '\n': *tokentype = NEWLINE; break; case ';': *tokentype = COMMENT; while (*ptr != '\0' && *ptr != '\n' && *ptr != '\r') ++ptr; break; case '/': *tokentype = ALTERNATE; break; case '*': *tokentype = REPEAT; break; case '(': *tokentype = BEGINGROUP; break; case ')': *tokentype = ENDGROUP; break; case '[': *tokentype = BEGINOPTION; break; case ']': *tokentype = ENDOPTION; break; case '-': *tokentype = RANGE; break; case '.': *tokentype = LITLIST; break; case '=': *tokentype = RULEDEF; if (*ptr == '/') { *tokentype = AMENDRULE; ++ptr; } break; case '%': switch (*ptr++) { case 'x': case 'X': *tokentype = HEXMODE; break; case 'b': case 'B': *tokentype = BINMODE; break; case 'd': case 'D': *tokentype = DECMODE; break; default: return ("Invalid literal"); } break; case '"': while (*ptr && *ptr != '"') ++ptr; if (*ptr != '"') { return ("Unbalanced double quotes"); } ++ptr; *tokentype = QUOTED; break; case '<': while (*ptr && *ptr != '>') ++ptr; if (*ptr != '>') { return ("Unbalanced angle brakets"); } ++ptr; *tokentype = PROSE; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (hexmode) goto GETHEX; while (isdigit(*ptr)) ++ptr; *tokentype = NUMBER; break; default: /* deal with hex digits */ if (hexmode && isxdigit(ptr[-1])) { GETHEX: while (isxdigit(*ptr)) ++ptr; *tokentype = HEXNUMBER; break; } if (!isalpha(ptr[-1])) { return ("Invalid character"); } /* deal with rule names */ while (isalnum(*ptr) || *ptr == '-') ++ptr; *tokentype = RULENAME; break; } return (ptr); } /* find a rule in a hash table */ struct rulename *hashfind(const char *ptr, int len) { int i, hash; struct rulename **pptr; /* lookup in hash table */ hash = len; for (i = 0; i < len; ++i) { hash = (hash * 13 + TOLOWER(ptr[i])) % HTSIZE; } pptr = &htable[hash]; while (*pptr && ((*pptr)->len != len || strncasecmp(ptr, (*pptr)->data, len) != 0)) { pptr = &(*pptr)->next; } /* create new entry, if needed */ if (!*pptr) { *pptr = calloc(sizeof (struct rulename) + len, 1); if (!*pptr) { fprintf(stderr, "Out of memory\n"); exit(1); } (*pptr)->len = len; memcpy((*pptr)->data, ptr, len); } return (*pptr); } main(int argc, char **argv) { int web_cgi = 0, verbose = 0; const unsigned char *uscan; char *buf, *name, *src, *dst, *linestart; char *ptr, *nptr; const char **ruleptr; int hexmode, tokentype, line; int plevel = 0, blevel = 0; int bused, bsize, amount; struct rulename *lastrule; /* initialize hex table */ memset(hex_table, 16, sizeof (hex_table)); for (uscan = hexchars; *uscan; ++uscan) { hex_table[(unsigned) *uscan] = uscan - hexchars; } /* initialize default rules */ for (ruleptr = baserules; *ruleptr != NULL; ++ruleptr) { lastrule = hashfind(*ruleptr, strlen(*ruleptr)); lastrule->defined = lastrule->referenced = 1; } lastrule = NULL; /* get program name */ name = strrchr(*argv, '/'); name = name == NULL ? *argv : name + 1; if (strncmp("web", name, 3) == 0) web_cgi = 1; /* parse args */ while (*++argv != NULL && **argv == '-') switch ((*argv)[1]) { case 'w': web_cgi = 1; break; case 'v': verbose = 1; break; case 'h': printf("%s [-h] [-w]\n", name); printf(" -h display this help\n"); printf(" -w input is web CGI encoded\n"); exit(0); default: fprintf(stderr, "Invalid arguments, type `%s -h' for usage\n", name); exit(1); } if (web_cgi) printf("Content-Type: text/plain\n\n"); /* read the data into memory */ bsize = 4096; buf = malloc(bsize); if (buf != NULL) { for (bused = 0; (amount = fread(buf + bused, 1, bsize - bused, stdin)) != 0; bused += amount) { if (bused + amount == bsize) { if (bsize > 10*1024*1024) { fprintf(stderr, "Size limit exceeded\n"); exit(1); } buf = realloc(buf, bsize *= 2); if (buf == NULL) break; } } } if (buf == NULL) { fprintf(stderr, "Out of memory\n"); exit(1); } if (!feof(stdin)) { perror("input"); exit(1); } buf[bused] = '\0'; /* remove web CGI grunge */ if (web_cgi) { for (src = buf; *src != '\0' && !isspace(*src) && *src != ':' && *src != '='; ++src) ; if (*src == '=') { ++src; for (dst = buf; *src != '&' && *src != '\0'; ++src, ++dst) { if (*src == '%') { *dst = (hex_table[(unsigned char)src[1]] << 4) + hex_table[(unsigned char)src[2]]; src += 2; } else if (*src == '+') { *dst = ' '; } else { *dst = *src; } } *dst = '\0'; bused = dst - buf; } } /* remove xml RFC grunge */ for (src = buf; *src == ' ' || *src == '\t'; ++src) ; if (*src == '<' && src[1] == '?') { for (dst = buf; *src != '\0'; ++src) { if (*src == '\r' || *src == '\n') { *dst++ = *src; } else if (src[0] == '<' && src[1] == 'a' && (strncmp(src, "", 21) == 0 || strncmp(src, "", 21) == 0)) { src += 21; while (*src != '\0') { if (src[0] == '<' && src[1] == '!' && strncmp(src, "')) { *dst++ = *src++; } src += 3; continue; } if (src[0] == '<' && src[1] == '/' && strncmp(src, "", 10) == 0) { src += 10; break; } if (src[0] == '&') { if (strncmp(src, ">", 4) == 0) { *dst++ = '>'; src += 4; continue; } if (strncmp(src, "<", 4) == 0) { *dst++ = '<'; src += 4; continue; } if (strncmp(src, "&", 5) == 0) { *dst++ = '&'; src += 5; continue; } if (strncmp(src, "'", 6) == 0) { *dst++ = '\''; src += 6; continue; } if (strncmp(src, """, 6) == 0) { *dst++ = '"'; src += 6; continue; } } *dst++ = *src++; } continue; } } *dst = '\0'; } /* process lines */ ptr = linestart = buf; hexmode = 0; line = 1; /* get tokens */ for (;;) { /* skip whitespace */ while (*ptr == ' ' || *ptr == '\t') { ++ptr; hexmode = 0; } /* get token, done if at end of line */ if ((nptr = gettoken(ptr, &tokentype, hexmode)) == NULL) break; /* handle errors */ if (!tokentype) { printf("Error (line %d, col %d): %s\n", line, ptr - linestart, nptr); exit(1); } /* reset hexmode */ if (hexmode && tokentype != HEXNUMBER && tokentype != LITLIST && tokentype != RANGE) { hexmode = 0; } /* handle last rulename */ if (lastrule != NULL) { if (tokentype == RULEDEF) { lastrule->defined = 1; } else if (tokentype == AMENDRULE) { lastrule->defined = lastrule->referenced = 1; } else { lastrule->referenced = 1; } lastrule = NULL; } if (verbose && tokentype != NEWLINE) { printf("line %d, col %d, %s", line, ptr - linestart, token_name[tokentype], nptr - ptr, ptr); if (nptr != NULL) { printf(": %.*s", nptr - ptr, ptr); } putchar('\n'); } switch (tokentype) { case NEWLINE: hexmode = 0; ++line; ptr = linestart = nptr; continue; case RULENAME: lastrule = hashfind(ptr, nptr - ptr); break; case HEXMODE: hexmode = 1; break; case RULEDEF: case AMENDRULE: if (plevel || blevel) { UNBALANCED: printf("Error (line %d): Unbalanced %s\n", line, plevel ? "parenthesis" : "square brackets"); exit(1); } break; case BEGINGROUP: ++plevel; break; case ENDGROUP: if (--plevel < 0) goto UNBALANCED; break; case BEGINOPTION: ++blevel; break; case ENDOPTION: if (--blevel < 0) goto UNBALANCED; break; } /* advance */ ptr = nptr; } /* final warnings */ if (plevel) { printf("Error at end of text: Unbalanced parenthesis\n"); } if (blevel) { printf("Error at end of text: Unbalanced square brackets\n"); } /* deal with last rule on last line */ if (lastrule) { lastrule->referenced = 1; } /* list undefined and unreferenced rules */ for (line = 0; line < HTSIZE; ++line) { lastrule = htable[line]; while (lastrule) { if (!lastrule->defined) { printf("undefined rule: %s\n", lastrule->data); } if (!lastrule->referenced) { printf("unreferenced rule: %s\n", lastrule->data); } lastrule = lastrule->next; } } printf("ABNF validation (version 1.0) completed\n"); exit(0); }