Usenet 1994 January

home *** CD-ROM | disk | FTP | other *** search

/ Usenet 1994 January / usenetsourcesnewsgroupsinfomagicjanuary1994.iso / sources / net / markov / markov3.l < prev next >

Wrap

Lex Description | 1987-03-06 | 11.5 KB | 479 lines

%{ /* * Copyright (c) 1986, 1987 by Joe Buck * * Permission is granted to use, redistribute, or modify this program, * as long as you don't pretend you wrote it. Send improvements or * bug reports to {ihnp4,hplabs,ames,sun}!oliveb!epimass!jbuck. * * The program generates simulated Usenet articles, given Usenet articles * as input. * * This program constructs a table of frequencies for a token appearing, * given the two preceding tokens. A "token" is a sequence of non-blank * characters. An entirely blank line is also treated as a token, as is * the beginning and end of an article. * * The program is designed to process news articles, rejecting text from * the header, signature, and included text, together with cruft inserted * by rn and notes. A paragraph of included text is treated like a token. * * After the table is built (and it can be big), articles are generated * on the standard output. */ #ifndef lint static char *sccs_id = "@(#)markov3.l 1.1 3/6/87 epimass!jbuck"; #endif #include <sys/types.h> /* for time_t */ int in_included_text = 0; %} %Start HDR BODY SIG %% <HDR>^[^ \t]+:.*\n ; /* Header line, e.g. "From: foo@bar.UUCP" */ <HDR>^[ \t]+[^ \t].*\n ; /* Continuation of header line */ <HDR>^[ \t]*$ BEGIN BODY; <BODY>^"-- "$ BEGIN SIG; <BODY>^[><)|#}].*\n { /* 50% rule gets people to change ">" to other characters; this gets most of them */ if (!in_included_text) { in_included_text = 1; process_token ("\n> ...\n\n"); } } <BODY>"]".*\n { /* should have been included in the above. My lex generates bad C code if I say [[><...] even though ed(1) says that's a valid regular expression. */ if (!in_included_text) { in_included_text = 1; process_token ("\n> ...\n\n"); } } <BODY>^"In article".*\n ; /* Reject rn crud */ <BODY>^"/* Written".*"*/"\n ; /* Also NOTES crud */ <BODY>^"/* End of text from".*"*/"\n ; /* NOTES */ <BODY>^"/* ----------".*"----------*/"\n ; /* NOTES */ <BODY>[ \t]+ ; /* Skip white space */ <BODY>\n[ \t\n]*\n { process_token ("\n"); /* Paragraph break */} <BODY>[^ \t\n]+ { in_included_text = 0; process_token (yytext); } <HDR>. ; /* Eat anything that escaped */ <HDR>\n ; <BODY>\n ; <SIG>. ; <SIG>\n ; %% void perror(), exit(); char *strcpy(), *malloc(); extern int optind; extern char *optarg; /* * hashtab is a hash table storing all the tokens we encounter. */ struct htentry { char *htext; struct htentry *hlink; }; #define HSIZE 3557 /* Should be prime */ #define Fprintf (void)fprintf #define Printf (void)printf struct htentry hashtab[HSIZE]; /* * node and succnode are portions of the big structure we're going to build. * node represents something like ("was", "a") in a binary tree. * a linked list of succnodes contain tokens that may follow ("was", "a") */ struct node { char *text; char *text2; int ocount; struct node *lc, *rc; struct succnode *succ; }; struct succnode { struct node *scnod; int count; struct succnode *link; }; struct node *prev_code = NULL; char *prev_token = NULL, **Argv; int init_state = HDR; int verbose = 0; struct node *root = NULL, *tknptr; struct succnode *start = NULL; int n_pairs = 0, n_tokens = 0, n_files = 0, n_total = 0; struct node *insert_token(); char *savetoken(); process_token (txt) char *txt; { struct node *code; char *token = savetoken (txt); /* We have a new token. Say the previous two tokens were "one" "way" * and the current token is "to". Then prev_code points to a node * for ("one", "way") and token is "to". This function adds ("way", "to") as a * successor to ("one","way") and makes prev_code point to ("way","to"). */ code = insert_token (prev_token, token); insert_pair (prev_code, code); prev_code = code; prev_token = token; return; } /* * here it is, the main function. */ main (argc, argv) int argc; char **argv; { int i, c, n_articles = 10, sflag = 0; char *dumpfile = NULL; extern int optind; extern char *optarg; while ((c = getopt (argc, argv, "pxvn:d:s:")) != EOF) { switch (c) { case 'v': verbose = 1; break; case 'p': /* Input is plain text, not Usenet stuff */ init_state = BODY; break; case 'n': /* # articles to generate */ n_articles = atoi (optarg); break; case 'd': /* where to dump the data structure */ dumpfile = optarg; break; case 's': /* Set the seed for rand; fall through */ srand (atoi (optarg)); case 'x': /* set flag to prevent srand */ sflag++; break; default: Fprintf (stderr, "Usage: markov3 [-pvx] [-s seed] [-n n_art] [-d dump] files\n"); exit (1); } } BEGIN init_state; /* initial state of lexical analyzer */ if (!sflag) /* set random number generator */ srand ((int)time ((time_t *)0)); /* Note: if optind == argc, there are no file arguments. yyin is left * initialized to stdin. */ if (optind < argc) { /* yyin is lex input stream. Point to first file. */ if ((yyin = fopen (argv[optind], "r")) == NULL) { perror (argv[optind]); exit (1); } optind++; /* skip to next file */ } Argv = argv; /* make it global so yywrap can access it */ n_files = 1; /* yylex puts all the input files through the lexical analyzer and builds * the database. */ (void) yylex (); if (dumpfile) dump_database (dumpfile); if (verbose) Fprintf (stderr, "Total of %d tokens (%d different), %d different pairs, %d files\n", n_total, n_tokens, n_pairs, n_files); /* Generate the articles, separated by form feeds */ for (i = 0; i < n_articles; i++) { if (i > 0) output_word ("\n\f\n"); generate_article (); } return 0; } /* * Lex calls this when EOF is reached. It opens the next file if there * is one. Lex interprets a return value of 1 to mean "all done" and 0 * to mean "keep going". */ yywrap () { (void) fclose (yyin); insert_pair (prev_code, (struct node *)0); prev_code = NULL; if (Argv[optind] == NULL) return 1; else if ((yyin = fopen (Argv[optind], "r")) == NULL) { perror (Argv[optind]); exit (1); } optind++; in_included_text = 0; if (verbose && n_files % 10 == 0) Fprintf (stderr, "%d files\n", n_files); n_files++; BEGIN init_state; return 0; } /* * This function saves a token in the hash table (if it isn't there * already) and returns a pointer to the stored copy. */ char * savetoken (txt) char *txt; { int h; char *p; struct htentry *hp; n_total++; for (p = txt, h = 0; *p; h += *p++); hp = hashtab + (h % HSIZE); while (hp->hlink) { if (strcmp (hp->htext, txt) == 0) { return hp->htext; } hp = hp->hlink; } /* OK, it's a new token. Make hp->hlink point to a new, * null block and make hp->htext point to the text. */ hp->hlink = (struct htentry *) malloc (sizeof *hp); hp->htext = malloc ((unsigned)(strlen (txt) + 1)); (void) strcpy (hp->htext, txt); hp->hlink->hlink = NULL; hp->hlink->htext = NULL; n_tokens++; return hp->htext; } /* * This recursive function inserts a token pair into the tree. */ struct node * insert_in_tree (p, txt, txt2) struct node *p; char *txt, *txt2; { int cmp; if (p == NULL) { /* Create a new node. */ p = (struct node *) malloc (sizeof *p); p->text = txt; p->text2 = txt2; p->lc = p->rc = NULL; p->succ = NULL; p->ocount = 1; tknptr = p; n_pairs++; if (verbose && n_pairs % 1000 == 0) Fprintf (stderr, "%d pairs\n", n_pairs); return p; } cmp = my_strcmp (p->text, txt); if (cmp == 0) cmp = my_strcmp (p->text2, txt2); if (cmp == 0) { /* It's a match. Increment the count. */ tknptr = p; p->ocount += 1; } /* Look in the subtrees. */ else if (cmp < 0) p->lc = insert_in_tree (p->lc, txt, txt2); else p->rc = insert_in_tree (p->rc, txt, txt2); return p; } /* * This just calls insert_in_tree starting at the root */ struct node * insert_token (txt, txt2) char *txt,*txt2; { root = insert_in_tree (root, txt, txt2); return tknptr; } /* * This function adds a successor. */ struct succnode * insert_in_succ_chain (sp, np) struct succnode *sp; struct node *np; { if (sp == NULL) { sp = (struct succnode *) malloc (sizeof *sp); sp->scnod = np; sp->count = 1; sp->link = NULL; } else if (sp->scnod == np) sp->count += 1; else sp->link = insert_in_succ_chain (sp->link, np); return sp; } /* * This calls insert_in_succ_chain starting at the right place. */ insert_pair (p1, p2) struct node *p1, *p2; { if (p1) p1->succ = insert_in_succ_chain (p1->succ, p2); else start = insert_in_succ_chain (start, p2); } /* * This function dumps the stored data structure onto a file. * Now if only I had a function to read it back in. */ char * pr_token (txt) char *txt; { if (txt[0] != '\n') return txt; return txt[1] ? "<INCL>" : "<LF>"; } treedump (tree, fp) struct node *tree; FILE *fp; { if (tree) { treedump (tree->rc, fp); Fprintf (fp, "( %s %s ) %d", pr_token (tree->text), pr_token (tree->text2), tree->ocount); chaindump (tree->succ, fp); treedump (tree->lc, fp); } } /* * Subroutine of treedump; it does one row. */ chaindump (p, fp) struct succnode *p; FILE *fp; { char *text; while (p) { if (p->scnod == NULL) text = "<EOF>"; else text = pr_token (p->scnod->text2); Fprintf (fp, " %s %d", text, p->count); p = p->link; } putc ('\n', fp); } /* * This routine generates the dump file (-d option) */ dump_database (file) char *file; { FILE *fp = fopen (file, "w"); if (fp == NULL) { Fprintf (stderr, "markov: can't open "); perror (file); exit (1); } Fprintf (fp, "START:"); chaindump (start, fp); treedump (root, fp); } /* roll (n) generates a uniformly distributed rv between 0 and n-1. * This code is stolen from "hack" and should be portable. If you * change this, remember that different systems have rand functions * with different ranges, and the bottom bits are often no good. */ #define roll(n) ((rand() >> 3) % n) /* * This function generates an article by traversing the * structure we've built. */ generate_article () { struct succnode *p = start; int ncounts = n_files; int n, accum; char *tp; while (1) { /* Roll the dice to find out the next token. The code below selects the * next token, and the new state, with a probability corresponding to the * frequency in the input. */ n = roll (ncounts); accum = p->count; while (accum <= n && p->link) { p = p->link; accum += p->count; } if (p->scnod == NULL) break; tp = p->scnod->text2; /* Check for "end of story" */ if (tp == NULL) break; output_word (tp); ncounts = p->scnod->ocount; p = p->scnod->succ; } output_word ("\n"); /* This will flush the buffer as well. */ return; } /* * This version handles null strings * */ my_strcmp (a, b) register char *a, *b; { if (a == NULL) return b ? -1 : 0; if (b == NULL) return 1; return strcmp (a, b); } #define LEN 75 output_word (word) char *word; { static char line[LEN+1]; static int room = LEN; int l; if (word == NULL) return; l = strlen (word); /* If word won't fit, or starts with \n, dump the current line */ if ((l >= room || word[0] == '\n') && line[0]) { Printf ("%s\n", line); line[0] = 0; room = LEN; } /* If word won't fit in the buffer or starts with \n, print it now */ if (l >= LEN) Printf ("%s\n", word); else if (word[0] == '\n') Printf ("%s", word); /* Otherwise fill it in */ else { (void)strcat (line, word); (void)strcat (line, " "); room -= (l + 1); } return; }