OS/2 Shareware BBS: Product

home *** CD-ROM | disk | FTP | other *** search

/ OS/2 Shareware BBS: Product / Product.zip / ISPSRC.ZIP / subset.X < prev next >

Wrap

Text File | 1992-08-14 | 5.4 KB | 194 lines

: Use /bin/sh
#
# $Id: subset.X,v 1.9 1992/01/09 09:29:53 geoff Exp $
#
# Copyright 1987, 1988, 1989, by Geoff Kuenning, Manhattan Beach, CA
# Permission for non-profit use is hereby granted.
# All other rights reserved.
# See "version.h" for a more complete copyright notice.
#
# $Log: subset.X,v $
# Revision 1.9 1992/01/09 09:29:53 geoff
# Fix a typo and a tiny documentation error.
#
# Revision 1.8 91/07/27 20:48:35 geoff
# Improve the commentary and the usage message. Just for neatness,
# generate dict.0 first rather than last. Remove intermediate
# dictionaries from the temp directory as they are used up.
#
# Revision 1.7 91/06/12 19:15:20 geoff
# Remember to exit after issuing a usage message. Add some warnings about
# anomalous behaviors of the script under semi-error conditions.
#
# Revision 1.6 89/04/28 01:17:08 geoff
# Change Header to Id; nobody cares about my pathnames.
#
# Revision 1.5 89/04/27 23:34:16 geoff
# Add (untested) support for a selectable flag marker.
#
# Revision 1.4 88/12/26 02:32:22 geoff
# Update the copyright notice.
#
# Revision 1.3 88/03/30 00:15:01 geoff
# Replace the -d switch with the -l switch.
#
# Revision 1.2 88/02/20 23:14:34 geoff
# Get rid of icombine. Fix some typos. Use a better temp file name.
# Run munchlist on the output dictionaries.
#
# Revision 1.1 87/09/25 00:28:20 geoff
# Initial revision
#
#
# Combine and resolve various dictionaries so they are proper
# subsets of one another, and so that maximal use is made of
# flags in the smaller ones.
#
# Usage:
#
#    subset [-b base] [-l langfile] small-dict bigger-dict ... biggest-dict
#
# The output is an equal number of successively-larger
# dictionaries. The smallest is written to "dict.0". Successive
# files are named "dict.1", "dict.2", and so forth, and each contains
# a list of words which should be added to the previous files to
# generate a dictionary.
# Words which are in smaller dictionaries are
# effectively propagated to the larger ones, so that the smaller ones
# are proper subsets of their siblings. If dictionaries are
# completely disjoint, this may result in an empty output dictionary.
# Affix flags are propagated to the smallest dictionary containing
# the root word; this expands the effectiveness of small dictionaries
# at no cost in hash table space.
#
# The -b switch is used to specify a different base name for the
# output files than "dict". (In other words, "-b english" would
# produce output in english.0, english.1, etc.).
#
# If the -l switch is specified, the language tables are gotten
# from the specified file; otherwise they come from $LIBDIR/!!DEFLANG!!.
#
# Input dictionaries should be "clean"; if non-word characters
# appear in the dictionaries, the script may produce incorrect output.
#

#
# Configuration.  The !!NAME!! tokens are kept exactly as found;
# NOTE(review): they look like placeholders substituted when this
# template is installed -- confirm against the build rules before
# touching these lines (including the trailing marker on SORTTMP).
#
LIBDIR=!!LIBDIR!!
TDIR=${TMPDIR-/usr/tmp}
# Common prefix for every scratch file; $$ makes it unique per process.
TMP=${TDIR}/sset$$.
SORTTMP="-T ${TDIR}" # !!SORTTMP!!
USAGE="Usage: subset [-b base] [-l langfile] dict-0 dict-1 ..."

#
# Parse leading options.  Parsing stops at the first argument that does
# not start with "-"; everything left over is an input dictionary.
#
langtabs=${LIBDIR}/!!DEFLANG!!
outbase=dict
while :
do
    case "$1" in
        -b)
            # -b needs a value; without this check the double shift
            # below would run off the end of the argument list.
            if [ $# -lt 2 ]
            then
                echo "$USAGE" 1>&2
                exit 1
            fi
            outbase="$2"
            shift; shift
            ;;
        -l)
            # Same guard as -b: the language file name is mandatory.
            if [ $# -lt 2 ]
            then
                echo "$USAGE" 1>&2
                exit 1
            fi
            langtabs="$2"
            shift; shift
            ;;
        -*)
            echo "$USAGE" 1>&2
            exit 1
            ;;
        *)
            break
            ;;
    esac
done

# At least two dictionaries are required.
if [ $# -lt 2 ]
then
    echo "$USAGE" 1>&2
    exit 1
fi

# Temp files
MUNCHOUTPUT=${TMP}a
MISSINGWORDS=${TMP}b
TEMPDICT=${TMP}c
FAKEDICT=${TMP}d
FAKEHASH=${TMP}e.hash

# Remove scratch files on HUP/INT/TERM (failure exit) and on PIPE
# (treated as a normal exit).  TMP never changes after this point, so
# expanding it when the trap fires is equivalent to expanding it here.
trap '/bin/rm -f "${TMP}"*; exit 1' 1 2 15
trap '/bin/rm -f "${TMP}"*; exit 0' 13

#
# Create a dummy dictionary to hold a compiled copy of the language
# tables.
#
echo 'QQQQQQQQ' > "$FAKEDICT"
buildhash -s "$FAKEDICT" "$langtabs" "$FAKEHASH" || {
    echo "Couldn't create fake hash file" 1>&2
    /bin/rm -f "${TMP}"*
    exit 1
}
# Only the hash file is needed from here on; drop the dummy dictionary
# and anything else created under its name.
/bin/rm -f "${FAKEDICT}"*
#
# Figure out what the flag-marking character is.
#
# The "flagmarker X" line is picked out of the "ispell -D" output.
# If the value starts with a backslash, keep only the character that
# follows it.  Backticks and expr are kept on purpose: the script
# targets plain /bin/sh.
#
flagmarker=`ispell -D -d "$FAKEHASH" \
  | sed -n '/^flagmarker/s/flagmarker //p'`
case "$flagmarker" in
    \\*)
        # \(.\) makes expr print the second character instead of a
        # match length.
        flagmarker=`expr "$flagmarker" : '.\(.\)'`
        ;;
esac
#
# (1) Use munchlist to create a list of roots and maximal suffixes.
#
# $SORTTMP is deliberately unquoted: it holds an option plus its value
# ("-T dir") and must split into two words.
munchlist -l "$langtabs" "$@" | sort $SORTTMP > "$MUNCHOUTPUT"
#
# (2) Use join to add the maximal suffixes to each dictionary's roots.
#     Re-expand this, combine with the original, and save for later.
#
# A literal newline, used with tr to put one word on each line.
newline='
'
dictno=0
for dictfile
do
    ispell -e -d "$FAKEHASH" < "$dictfile" | tr ' ' "$newline" \
      | sort -u $SORTTMP | join "-t$flagmarker" -a1 - "$MUNCHOUTPUT" \
      | ispell -e -d "$FAKEHASH" | tr ' ' "$newline" \
      | sort -u $SORTTMP > "${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
/bin/rm -f "$MUNCHOUTPUT"
#
# (3) For each adjacent pair of dictionaries, use comm to find words
#     in the smaller that are missing from the larger, and add them
#     to the larger.
#
# Drop the first dictionary from the argument list so the loops below
# visit dictionaries 1..n while dictionary 0 serves as the initial
# "previous" one.
shift
lastdict="${TEMPDICT}.0"
dictno=1
for dictfile
do
    comm -23 "$lastdict" "${TEMPDICT}.$dictno" > "$MISSINGWORDS.$dictno"
    # Only re-sort when something actually has to be merged in.
    if [ -s "$MISSINGWORDS.$dictno" ]
    then
        sort $SORTTMP -o "${TEMPDICT}.$dictno" \
          "${TEMPDICT}.$dictno" "$MISSINGWORDS.$dictno"
    fi
    lastdict="${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
/bin/rm -f "$MISSINGWORDS".*
#
# (4) For each pair of dictionaries, use comm to eliminate words in
#     the smaller from the larger, and shrink the result with munchlist.
#
# dict.0 is munched with the same language tables as every other
# munchlist run in this script.
munchlist -l "$langtabs" "${TEMPDICT}.0" > "$outbase.0"
lastdict="${TEMPDICT}.0"
dictno=1
for dictfile
do
    comm -13 "$lastdict" "${TEMPDICT}.$dictno" \
      | munchlist -l "$langtabs" > "$outbase.$dictno"
    # The previous expanded dictionary is no longer needed.
    /bin/rm -f "$lastdict"
    lastdict="${TEMPDICT}.$dictno"
    dictno=`expr $dictno + 1`
done
# ${TMP:?} aborts instead of expanding to "rm -f *" if TMP is unset.
/bin/rm -f "${TMP:?}"*