home *** CD-ROM | disk | FTP | other *** search
- # wordfreq - print number of occurrences of each word
- # input: text
- # output: number-word pairs sorted by number
-
- BEGIN {
-     # Pattern for a comment-only line: the line begins with '#'.
-     comment_only = /^#/;
-     # Word-length threshold consulted when words are counted.
-     min_lng = 3;
-     # Lowest occurrence count a word needs in order to be printed.
-     min_cnt = 1;
- }
-
- # A line matching comment_only contributes no words: skip it entirely.
- comment_only {
-     next;
- }
-
- # Main rule, run for every line not skipped above: report progress,
- # strip comments and punctuation, then count each sufficiently long word.
- {
-     fprintf("stderr","%u\n",FNR); # progress: current record number to stderr
-     # Strip trailing comments FIRST. The punctuation class {_p} is expected
-     # to include '#', so removing punctuation beforehand would leave no '#'
-     # for this pattern to anchor on and comment text would be counted.
-     gsub(/#.*$/,"");
-     gsub(/{_p}/,""); # remove punctuation
-     gsub(/[0-9+^"'`*\$\&~\<\>=\\\/\[\]\(\)\{\}-]+/," "); # change runs of digits/symbols to a single space
-     # min_lng is the minimum word length, so a word of exactly that
-     # length is counted (inclusive test, like the min_cnt test).
-     for ( i = 1 ; i <= NF ; i++ ) if ( length($i) >= min_lng ) count[$i]++;
- }
-
- # FINAL action: print "count word" for every word whose count is at
- # least min_cnt, then a summary, and clear the count table.
- FINAL {
-     # i = words printed, k = distinct words seen, m = sum of printed counts,
-     # w = current word, j = its count (declared local so it does not leak
-     # into the global scope).
-     local i = 0, k = 0, m = 0, w, j;
-
-     for ( w in count ) {
-         if ( min_cnt <= (j = count[w]) ) {
-             print j , w;
-             i++;
-             m += j;
-         }
-         k++; # every distinct word is tallied, printed or not
-     }
-     deletea count; # discard the whole array once it has been reported
-     printf("File: %s\n",FILENAME);
-     printf("Total Words: %lu\nTotal Output: %lu\nTotal Count Output: %lu\n\n",k,i,m);
- }
-