Education Sampler 1992 [NeXTSTEP]

home *** CD-ROM | disk | FTP | other *** search

/ Education Sampler 1992 [NeXTSTEP] / Education_1992_Sampler.iso / Programming / Source / WAIS / ir / ircfiles.c < prev next >

Wrap

C/C++ Source or Header | 1992-02-02 | 22.7 KB | 1,023 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* this file defines a set of helper functions * for indexing common types of files. * -brewster 7/90 */ /* I encourage adding customizations. * (too bad they all have to be hard coded, but * C did not have convenient dynamic linking facilities) * * Add three functions to this file: * boolean foo_separator_function(char *line){} * void foo_header_function(char *line){} * long foo_date_function(char *line){} * void foo_finish_header_function(char *header){} * * then add the prototypes to ircfiles.h * then add the functions to the big case statement in irbuild.c * * * to do: * filter for digests * */ /* Change log: * 8/90 brewster added the library customizations * 6/91 and before - added a bunch of other filters - JG */ #include <string.h> #include <ctype.h> #include "cutil.h" #include "ircfiles.h" #define MAX_HEADER_LEN 100 static char* trim_trailing_newline _AP((char* string)); static char* trim_trailing_newline(string) char* string; { if(string) if(strlen(string) > 0) if(string[strlen(string) -1] == '\n') string[strlen(string) -1] = '\0'; return(string); } /* ================================= * === Groliers Customizations === * ================================= */ boolean groliers_separator_function(line) char *line; { if((strlen(line) > strlen("ARTICLE")) && substrcmp(line, "ARTICLE")){ /* printf("hit %s\n", line); */ return(true); } else{ return(false); } } char groliers_header[MAX_HEADER_LEN + 1]; void groliers_header_function(line) char *line; { if(groliers_separator_function(line)){ strncpy(groliers_header, line + strlen("ARTICLE") + 2, MAX_HEADER_LEN); } } void groliers_finish_header_function(header) char *header; { if(strlen(groliers_header) == 0){ strncpy(header, "Unknown Title", MAX_HEADER_LEN); } else{ strncpy(header, groliers_header, MAX_HEADER_LEN); } groliers_header[0] = '\0'; } /* ============================== * === RMail Customizations === * ============================== */ /* this is just a preliminary version. A good version would * produce a headline like gnu emacs RMAIL */ boolean mail_separator_function(line) char *line; { /* this should really look for a "<cr><cr>From " rather than "<cr>From " */ if((strlen(line) > strlen("From ")) && substrcmp(line, "From ")){ return(true); } else{ return(false); } } boolean rmail_separator_function(line) char *line; { if(0 == strcmp(line, "\n")){ return(true); } else{ return(false); } } /* This one is portable, but might get the wrong answer. I'm open to better code. - Jonny G */ static char *months[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", NULL}; long getdate(line) char *line; { char date[255], *temp; int day, month, year; char cmonth[25]; strcpy(date, line); temp = date; while(!isdigit(*temp)) temp++; sscanf(temp, "%d %s %d", &day, cmonth, &year); for(month = 0; months[month] != NULL; month++) if(!strcmp(cmonth, months[month])) break; if (year > 99) year = year % 100; if(day > 0 && month < 12 && year > 0) { return (10000 * year + 100 * (month+1) + day); } sscanf(temp, "%d/%d/%d", &month, &day, &year); if (year > 99) year = year % 100; if(day > 0 && month < 12 && year > 0) { return (10000 * year + 100 * (month+1) + day); } sscanf(temp, "%d/%d/%d", &year, &month, &day); if (year > 99) year = year % 100; if(day > 0 && month < 12 && year > 0) { return (10000 * year + 100 * (month+1) + day); } return 0; } long mail_date_function(line) char *line; { if((strlen(line) > strlen("Date: ")) && substrcmp(line, "Date: ")){ return(getdate(line+6)); } else return -1; } char mail_subject[MAX_HEADER_LEN + 1]; char mail_from[MAX_HEADER_LEN + 1]; void mail_header_function(line) char *line; { if((strlen(line) > strlen("Subject: ")) && substrcmp(line, "Subject: ") && (strlen(mail_subject) == 0)){ strcpy(mail_subject, "Re: "); s_strncat(mail_subject, line + strlen("Subject: "), MAX_HEADER_LEN, MAX_HEADER_LEN); trim_trailing_newline(mail_subject); } else if((strlen(line) > strlen("From: ")) && substrcmp(line, "From: ") && (strlen(mail_from) == 0)){ /* this should find the <foo@bar> field in the from list */ strncpy(mail_from, line + strlen("From: "), MAX_HEADER_LEN); trim_trailing_newline(mail_from); } } void mail_finish_header_function(header) char *header; { if(strlen(mail_subject) != 0 && strlen(mail_from) != 0){ /* trim the from line if needed */ if(strlen(mail_from) > 10){ mail_from[10] = '\0'; } strncpy(header, mail_from, MAX_HEADER_LEN); s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN); s_strncat(header, mail_subject, MAX_HEADER_LEN, MAX_HEADER_LEN); /* printf("%s\n", header); */ } else if(strlen(mail_subject) != 0){ strncpy(header, mail_subject, MAX_HEADER_LEN); } else if(strlen(mail_from) != 0){ strncpy(header, mail_from, MAX_HEADER_LEN); } else{ strcpy(header, "Unknown Subject"); } mail_from[0] = '\0'; mail_subject[0] = '\0'; } boolean mail_or_rmail_separator(line) char *line; { static boolean blank_line = false; if((strlen(line) > strlen("From ")) && substrcmp(line, "From ") && blank_line == true){ blank_line = false; return(true); } if(substrcmp(line, "")){ blank_line = true; return(true); } if(!strcmp(line, "\n")){ blank_line = true; } else{ blank_line = false; } return(false); } /* ======================================== * === Mail Digest Customizations ==== * ======================================== */ boolean mail_digest_separator_function(line) char *line; { if((strlen(line) > strlen("-----------------------------")) && substrcmp(line, "------------------------------")){ return(true); } else{ return(false); } } /* ======================================== * === Library Catalog Customizations === * ======================================== */ /* just use the title */ boolean catalog_separator_function(line) char *line; { if((strlen(line) > strlen("Call:")) && (substrcmp(line, "Call:"))){ return(true); } else{ return(false); } } char catalog_header[MAX_HEADER_LEN + 1]; void catalog_header_function(line) char *line; { if((strlen(line) > strlen("Title:")) && (substrcmp(line, "Title:"))){ strncpy(catalog_header, line + strlen("Title:"), MAX_HEADER_LEN); } } void catalog_finish_header_function(header) char *header; { if(strlen(catalog_header) == 0){ strcpy(header, "Unknown Title"); } else{ strncpy(header, catalog_header, MAX_HEADER_LEN); } catalog_header[0] = '\0'; } /* ============================ * === Bio Customizations === * ============================ */ /* customizations for a DB of genetic abstracts */ boolean hit_header = false; boolean bio_separator_function(line) char *line; { if((strlen(line) > strlen(">>>")) && substrcmp(line, ">>>")){ return(true); } else{ return(false); } } char bio_header[MAX_HEADER_LEN + 1]; void bio_header_function(line) char *line; { if(hit_header /* we just hit a seperator previous to this */ && (!bio_separator_function(line)) /* we are not on the separator now */ && strlen(bio_header) == 0){ /* and we have not saved the headline yet */ strcpy(bio_header, line); waislog(WLOG_MEDIUM, WLOG_INDEX, "storing line: %s", bio_header); hit_header = false; } } void bio_finish_header_function(header) char *header; { hit_header = true; /* turn on the flag */ if(strlen(bio_header) == 0){ strcpy(header, "Unknown Title"); } else{ strcpy(header, bio_header); } bio_header[0] = '\0'; } /* ================================= * === CMApp Customizations === * ================================= */ boolean cmapp_separator_function(line) char *line; { if((strlen(line) > strlen("@A")) && substrcmp(line, "@A")){ /* printf("hit %s\n", line); */ return(true); } else{ return(false); } } char cmapp_header[MAX_HEADER_LEN + 1]; void cmapp_header_function(line) char *line; { if((strlen(line) > strlen("APPLICATION:")) && substrcmp(line, "APPLICATION:")){ /* printf("hit %s\n", line); */ strncpy(cmapp_header, line + strlen("APPLICATION:"), MAX_HEADER_LEN); } } void cmapp_finish_header_function(header) char *header; { if(strlen(cmapp_header) == 0){ strncpy(header, "Unknown Title", MAX_HEADER_LEN); } else{ strncpy(header, cmapp_header, MAX_HEADER_LEN); } cmapp_header[0] = '\0'; } /* ================================= * === Jargon Customizations === * ================================= */ boolean jargon_separator_function(line) char *line; { if((strlen(line) > 0) && line[0] =='<'){ /* printf("hit %s\n", line); */ return(true); } else{ return(false); } } char jargon_header[MAX_HEADER_LEN + 1]; void jargon_header_function(line) char *line; { if((strlen(line) > 0) && line[0] =='<'){ char *end_ptr = strchr(line, '>'); if(NULL != end_ptr){ strncpy(jargon_header, (1+ line), MIN(MAX_HEADER_LEN, end_ptr - line)); jargon_header[end_ptr-line-1] = '\0'; } } } void jargon_finish_header_function(header) char *header; { if(strlen(jargon_header) == 0){ strncpy(header, "Introduction to the Jargon file", MAX_HEADER_LEN); } else{ strncpy(header, jargon_header, MAX_HEADER_LEN); } jargon_header[0] = '\0'; } /* ================================= * === Internet Resource Guide === * ================================= */ char irg_header[MAX_HEADER_LEN + 1]; boolean irg_header_set = FALSE; boolean irg_separator_function(line) char *line; { if(line[0] == 12){ /* control L */ irg_header_set = FALSE; return(true); } else return(false); } void irg_header_function(line) char *line; { if((irg_header_set == FALSE) && (line[0] == 32 )){ /* space */ strncpy(irg_header, line + strspn(line, " "), MAX_HEADER_LEN); irg_header_set = TRUE; } } void irg_finish_header_function(header) char *header; { if(strlen(irg_header) == 0){ strncpy(header, "Unknown Title", MAX_HEADER_LEN); } else{ strncpy(header, irg_header, MAX_HEADER_LEN); } irg_header[0] = '\0'; irg_header_set = FALSE; } /* ======================== * === Dash Separator === * ======================== */ /* * dash-seperate entries * used in Introduction to Algorithms bug.list, suggestions, etc. * --------------------... at least 20 dashes * header * item * .. * --------------------... at least 20 dashes */ boolean dash_hit_header = false; boolean dash_separator_function(line) char *line; { if((strlen(line) > 20) && substrcmp(line,"--------------------")){ /* printf("hit %s\n", line); */ return(true); } else{ return(false); } } char dash_header[MAX_HEADER_LEN + 1]; void dash_header_function(line) char *line; { if(dash_hit_header && (!dash_separator_function(line)) && strlen(dash_header) == 0) { strncpy(dash_header, line, MAX_HEADER_LEN); dash_hit_header = false; } } void dash_finish_header_function(header) char *header; { dash_hit_header = true; /* turn on the flag */ if (strlen(dash_header) == 0) { strcpy(header, "No Title"); } else { strncpy(header, dash_header, MAX_HEADER_LEN); } dash_header[0] = '\0'; } /* ============================ * === one_line Separator === * ============================ */ /* this is where each line is a document (good for databases) */ boolean one_line_hit_header = false; boolean one_line_separator_function(line) char *line; { return(true); } char one_line_header[MAX_HEADER_LEN + 1]; void one_line_header_function(line) char *line; { strncpy(one_line_header, line, MAX_HEADER_LEN); } void one_line_finish_header_function(header) char *header; { if (strlen(one_line_header) == 0) { strcpy(header, "No Title"); } else { strncpy(header, one_line_header, MAX_HEADER_LEN); } one_line_header[0] = '\0'; } /* ============================= * === Paragraph Separator === * ============================= */ /* paragraph files - seperated by a blank line. Next line is the header */ char para_header[MAX_HEADER_LEN +1]; static boolean para_start = true; boolean para_separator_function(line) char *line; { if (para_start == true) { para_start = false; return true; } if (strlen(line) < 2) para_start = true; return false; } void para_header_function(line) char *line; { if (para_header[0] == 0) strncpy(para_header, line, MAX_HEADER_LEN); } void para_finish_header_function(header) char *header; { if (strlen(para_header) == 0) { strcpy(header, "No Title"); } else { strncpy(header, para_header, MAX_HEADER_LEN); } para_header[0] = 0; } /* ========================== * === Seeker Separator === * ========================== */ boolean seeker_separator_function(line) char *line; { return(dash_separator_function(line)); } char seeker_header[MAX_HEADER_LEN + 1]; boolean in_headline = FALSE; void seeker_header_function(line) char *line; { if(strlen(line) > strlen("Headline:") && substrcmp(line, "Headline:")){ in_headline = TRUE; seeker_header[0] = '\0'; /* printf("hit headline!\n"); */ } else if(in_headline == TRUE && (strlen(seeker_header) < (MAX_HEADER_LEN - 1))){ s_strncat(seeker_header, line, MAX_HEADER_LEN, MAX_HEADER_LEN); trim_trailing_newline(seeker_header); } } void seeker_finish_header_function(header) char *header; { if (strlen(seeker_header) == 0) { strcpy(header, "No Title"); } else { strncpy(header, seeker_header, MAX_HEADER_LEN); } seeker_header[0] = '\0'; in_headline = TRUE; } /* ========================== * === RLIN Separator === * ========================== */ boolean rlin_separator_function(line) char *line; { return(dash_separator_function(line)); } char rlin_header[MAX_HEADER_LEN + 1]; boolean rlin_in_headline = FALSE; void rlin_header_function(line) char *line; { if(rlin_separator_function(line)){ rlin_in_headline = TRUE; rlin_header[0] = '\0'; /* printf("hit headline!\n"); */ } else if(rlin_in_headline == TRUE && (strlen(rlin_header) < (MAX_HEADER_LEN - 1))){ s_strncat(rlin_header, line, MAX_HEADER_LEN, MAX_HEADER_LEN); trim_trailing_newline(rlin_header); } } void rlin_finish_header_function(header) char *header; { if (strlen(rlin_header) == 0) { strcpy(header, "No Title"); } else { strncpy(header, rlin_header, MAX_HEADER_LEN); } rlin_header[0] = '\0'; in_headline = TRUE; } /* ======================================== * === MH_BBoard Customizations ==== * ======================================== */ /* gcardwel@uci.edu MH bboards use a series of control A's to do a blank line.. yuk! */ boolean mh_bboard_separator_function(line) char *line; { static boolean blank_line = false; if((strlen(line) > strlen("BBoard-ID: ")) && substrcmp(line, "BBoard-ID: ") && blank_line == true){ blank_line = false; return(true); } if(!strcmp(line, "\001\001\001\001\n")){ blank_line = true; } else{ blank_line = false; } return (false); } /* ========================== * === Objective-C code === * ========================== */ /*----------------------- FSA -------------------*/ #define fsa_max_edges 4 #define fsa_error_state (-1) typedef struct { int if_input; int then_goto; } fsa_edge; /* action (if non-NULL) is excuted before transfer to next state is made */ /* action takes as arg the int input that will decide the next state */ typedef struct { int default_goto; int n_edges; fsa_edge edges[fsa_max_edges]; int (*action)(); } fsa_vertex; int fsa_step(input, state_p, table) int input; int *state_p; fsa_vertex *table; { int next_state, e; int (*this_action)(); if(*state_p < 0) return(*state_p = fsa_error_state); this_action = table[*state_p].action; if(this_action) this_action(input); for(e=0; e<table[*state_p].n_edges; e++) if(input == table[*state_p].edges[e].if_input) { next_state = table[*state_p].edges[e].then_goto; break; } if(e >= table[*state_p].n_edges) next_state = table[*state_p].default_goto; if(next_state < 0) next_state = fsa_error_state; return(*state_p = next_state); } /* sends null char as last input, returns final state */ int fsa_run(s, state_p, table) char *s; int *state_p; fsa_vertex *table; { char *p; for(p=s; *p; p++) fsa_step((int) *p, state_p, table); fsa_step(0, state_p, table); return(*state_p); } /*----------------------- end FSA -------------------*/ static int wobjc_brace_level = 0; static int wobjc_paren_level = 0; static int wobjc_strip_state = 0; static int wobjc_context = 0; static boolean wobjc_separator = false; static char wobjc_class[MAX_HEADER_LEN + 1]; static char *wobjc_class_end = 0; static char wobjc_header[MAX_HEADER_LEN + 1]; static char *wobjc_header_end = 0; #define WOBJC_BLANK " \t\n\r" #define WOBJC_WORD "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM_0123456789" /* Flag next line as separator, when context fsa says so. */ static int wobjc_separate(input) int input; { return(wobjc_separator = true); } /* FSA to parse objective-C constructs. */ static fsa_vertex wobjc_context_fsa[] = { { 0, 1, {{ '@', 1 }}}, /* look for objc constructs */ { 0, 1, {{ 'i', 20 }}}, { 3, 1, {{ ' ', 2 }}}, /* look for @imp class */ { 4, 1, {{ 'A', 3 }}}, { 4, 3, {{ '+', 6 },{ '-', 8 },{ '@', 10 }}},/* in @imp */ { 4, 3, {{ '+', 6 },{ '-', 8 },{ '@', 10 }}, wobjc_separate}, { 6, 1, {{ '{', 7 }}}, /* look for -method: */ { 5, 1, {{ '{', 7 }}}, { 8, 1, {{ '{', 9 }}}, /* look for +method: */ { 5, 1, {{ '{', 9 }}}, { 4, 1, {{ 'e', 11 }}}, /* look for @end of @imp */ { 4, 1, {{ 'n', 12 }}}, { 4, 1, {{ 'd', 0 }}}, { 14, 1, {{ ' ', 13 }}}, /* look for @intf class */ { 15, 1, {{ 'A', 14 }}}, { 15, 1, {{ '@', 16 }}}, /* in @intf */ { 15, 1, {{ 'e', 17 }}}, /* look for @end of @intf */ { 15, 1, {{ 'n', 18 }}}, { 15, 1, {{ 'd', 19 }}}, { 0, 1, {{ '@', 1 }}, wobjc_separate}, { 0, 2, {{ 'm', 21 },{ 'n', 33 }}}, /* look for @impl */ { 0, 1, {{ 'p', 22 }}}, { 0, 1, {{ 'l', 23 }}}, { 0, 1, {{ 'e', 24 }}}, { 0, 1, {{ 'm', 25 }}}, { 0, 1, {{ 'e', 26 }}}, { 0, 1, {{ 'n', 27 }}}, { 0, 1, {{ 't', 28 }}}, { 0, 1, {{ 'a', 29 }}}, { 0, 1, {{ 't', 30 }}}, { 0, 1, {{ 'i', 31 }}}, { 0, 1, {{ 'o', 32 }}}, { 0, 1, {{ 'n', 2 }}}, { 0, 1, {{ 't', 34 }}}, /* look for @intf */ { 0, 1, {{ 'e', 35 }}}, { 0, 1, {{ 'r', 36 }}}, { 0, 1, {{ 'f', 37 }}}, { 0, 1, {{ 'a', 38 }}}, { 0, 1, {{ 'c', 39 }}}, { 0, 1, {{ 'e', 13 }}} }; /* Action to be used by stripping fsa in non-commented, non-quoted state. */ /* This runs context fsa. */ static int wobjc_process_stripped_code(input) int input; { int context_input; switch(input) { /* Increment brace/paren levels as appropriate. */ case '{': wobjc_brace_level++; break; case '}': if(wobjc_brace_level > 0) wobjc_brace_level--; break; case '(': wobjc_paren_level++; break; case ')': if(wobjc_paren_level > 0) wobjc_paren_level--; break; case '\"': break; case '\'': break; case '/': break; default: /* If in correct context and not in brace/paren/comment/quote, */ /* then record header info. */ if(wobjc_brace_level==0 && wobjc_paren_level==0) { /* Recording class or instance method. Ignore multiple blanks. */ if(wobjc_context==6 || wobjc_context==8) { if(!wobjc_header_end || wobjc_header_end==wobjc_header) { strcpy(wobjc_header, (wobjc_context==6 ? "+[" : "-[")); strcat(wobjc_header, wobjc_class); strcat(wobjc_header, " "); wobjc_header_end = wobjc_header + strlen(wobjc_header); } if((wobjc_header_end - wobjc_header)<(MAX_HEADER_LEN-5) && !(strchr(WOBJC_BLANK, *(wobjc_header_end-1)) && strchr(WOBJC_BLANK, input))) { *wobjc_header_end++ = input; *wobjc_header_end = 0; } } /* Recording class name for @implementation or @interface. */ if(strchr(WOBJC_WORD, input) && (wobjc_context==2 || wobjc_context==3 || wobjc_context==13 || wobjc_context==14)) { if(wobjc_context==2 || wobjc_context==13 || !wobjc_class_end) wobjc_class_end = wobjc_class; if(wobjc_context==13 || (wobjc_context==14 && !wobjc_header_end)) wobjc_header_end = wobjc_header; if((wobjc_class_end - wobjc_class_end)<(MAX_HEADER_LEN/2)) { *wobjc_class_end++ = input; *wobjc_class_end = 0; } if((wobjc_context==13 || wobjc_context==14) && (wobjc_header_end-wobjc_header_end)<(MAX_HEADER_LEN/2)) { *wobjc_header_end++ = input; *wobjc_header_end = 0; } } } /* Since not in comment/quote, run context fsa. */ /* Input is modified like this: */ /* Non-zero brace level => '{'. */ /* Else spaces => ' '. */ /* Else if in correct contexts, word letters => 'A'. */ context_input = input; if(wobjc_brace_level>0) context_input = '{'; else if(strchr(WOBJC_BLANK, input)) context_input = ' '; else if((wobjc_context==3 || wobjc_context==14) && strchr(WOBJC_WORD, input)) context_input = 'A'; fsa_step(context_input, &wobjc_context, wobjc_context_fsa); break; } return(true); } /* FSA to strip out comments and quotes. */ static fsa_vertex wobjc_strip_fsa[] = { { 0, 3, {{ '/', 1 },{ '\"', 5 },{ '\'', 7 }}, wobjc_process_stripped_code}, { 0, 2, {{ '*', 2 },{ '/', 4 }}}, /* look for comment */ { 2, 1, {{ '*', 3 }}}, /* in /* comment */ { 2, 2, {{ '/', 0 },{ '*', 3 }}}, { 4, 1, {{ '\n', 0 }, { '\0', 0 }}}, /* in // comment */ { 5, 2, {{ '\\', 6 },{ '\"', 0 }}}, /* in " quote */ { 5, 0, }, { 7, 2, {{ '\\', 8 },{ '\'', 0 }}}, /* in ' quote */ { 7, 0, } }; boolean wobjc_separator_function(line) char *line; { if(wobjc_separator) { wobjc_separator = false; return true; } else return false; } void wobjc_header_function(line) char *line; { /* Run stripping fsa, which will run context fsa. */ fsa_run(line, &wobjc_strip_state, wobjc_strip_fsa); return; } void wobjc_finish_header_function(header) char *header; { char *p; /* Flush terminal blanks and balance opening '[' if any. */ for(p=wobjc_header+strlen(wobjc_header); p>wobjc_header && strchr(WOBJC_BLANK, *(p-1)); p--); if(wobjc_header[0]=='+' || wobjc_header[0]=='-') *p++ = ']'; *p = 0; /* Copy out final header. */ strcpy(header, wobjc_header); wobjc_header[0] = 0; wobjc_header_end = wobjc_header; return; }