home *** CD-ROM | disk | FTP | other *** search
- : Use /bin/sh
- #
- # $Id: subset.X,v 1.9 1992/01/09 09:29:53 geoff Exp $
- #
- # Copyright 1987, 1988, 1989, by Geoff Kuenning, Manhattan Beach, CA
- # Permission for non-profit use is hereby granted.
- # All other rights reserved.
- # See "version.h" for a more complete copyright notice.
- #
- # $Log: subset.X,v $
- # Revision 1.9 1992/01/09 09:29:53 geoff
- # Fix a typo and a tiny documentation error.
- #
- # Revision 1.8 91/07/27 20:48:35 geoff
- # Improve the commentary and the usage message. Just for neatness,
- # generate dict.0 first rather than last. Remove intermediate
- # dictionaries from the temp directory as they are used up.
- #
- # Revision 1.7 91/06/12 19:15:20 geoff
- # Remember to exit after issuing a usage message. Add some warnings about
- # anomalous behaviors of the script under semi-error conditions.
- #
- # Revision 1.6 89/04/28 01:17:08 geoff
- # Change Header to Id; nobody cares about my pathnames.
- #
- # Revision 1.5 89/04/27 23:34:16 geoff
- # Add (untested) support for a selectable flag marker.
- #
- # Revision 1.4 88/12/26 02:32:22 geoff
- # Update the copyright notice.
- #
- # Revision 1.3 88/03/30 00:15:01 geoff
- # Replace the -d switch with the -l switch.
- #
- # Revision 1.2 88/02/20 23:14:34 geoff
- # Get rid of icombine. Fix some typos. Use a better temp file name.
- # Run munchlist on the output dictionaries.
- #
- # Revision 1.1 87/09/25 00:28:20 geoff
- # Initial revision
- #
- #
- # Combine and resolve various dictionaries so they are proper
- # subsets of one another, and so that maximal use is made of
- # flags in the smaller ones.
- #
- # Usage:
- #
- # subset [-b base] [-l langfile] small-dict bigger-dict ... biggest-dict
- #
- # The output is an equal number of successively-larger
- # dictionaries. The smallest is written to "dict.0". Successive
- # files are named "dict.1", "dict.2", and so forth, and each contains
- # a list of words which should be added to the previous files to
- # generate a dictionary. Words which are in smaller dictionaries are
- # effectively propagated to the larger ones, so that the smaller ones
- # are proper subsets of their siblings. If dictionaries are
- # completely disjoint, this may result in an empty output dictionary.
- # Affix flags are propagated to the smallest dictionary containing
- # the root word; this expands the effectiveness of small dictionaries
- # at no cost in hash table space.
- #
- # The -b switch is used to specify a different base name for the
- # output files than "dict". (In other words, "-b english" would
- # produce output in english.0, english.1, etc.).
- #
- # If the -l switch is specified, the language tables are gotten
- # from the specified file; otherwise they come from $LIBDIR/!!DEFLANG!!.
- #
- # Input dictionaries should be "clean"; if non-word characters
- # appear in the dictionaries, the script may produce incorrect output.
- #
#
# Configuration and scratch-file naming.
#
# LIBDIR carries a !!...!! placeholder, and the SORTTMP line carries a
# "!!SORTTMP!!" marker; both are kept verbatim (presumably they are
# rewritten when the script is installed -- confirm against the build).
#
LIBDIR=!!LIBDIR!!
#
# Scratch root: an explicit TMPDIR always wins.  Otherwise keep the
# historic /usr/tmp default where it is usable and fall back to /tmp.
#
if [ -d /usr/tmp ] && [ -w /usr/tmp ]
then
    default_tdir=/usr/tmp
else
    default_tdir=/tmp
fi
TDIR=${TMPDIR-$default_tdir}
#
# Keep every temporary file inside a freshly created private directory
# instead of under a guessable "sset<pid>." name in a shared directory.
# If mktemp is unavailable, fall back to mkdir with a restrictive umask;
# mkdir fails if the name already exists, so a pre-planted path is
# never reused.
#
TEMPDIR=$(mktemp -d "${TDIR}/ssetXXXXXX" 2>/dev/null) \
  || { TEMPDIR=${TDIR}/sset$$; (umask 077; mkdir "$TEMPDIR") 2>/dev/null; } \
  || { echo "subset: can't create a temporary directory in ${TDIR}" 1>&2; exit 1; }
#
# Remove the private directory on every exit path.  The rest of the
# script only deletes files matching ${TMP}*, which would leave the
# directory itself behind.
#
trap '/bin/rm -rf "$TEMPDIR"' 0
# TMP stays a filename *prefix*: callers append a suffix or glob ${TMP}*.
TMP=${TEMPDIR}/sset.
# SORTTMP is expanded unquoted by its users so that it splits into the
# option and its argument.
SORTTMP="-T ${TDIR}" # !!SORTTMP!!
USAGE="Usage: subset [-b base] [-l langfile] dict-0 dict-1 ..."
-
#
# Defaults, then leading command-line switches:
#   -b base      base name for the output files (default "dict")
#   -l langfile  language tables to use instead of ${LIBDIR}/!!DEFLANG!!
# Any other word starting with "-" earns the usage message.  Parsing
# stops at the first word that does not start with "-"; whatever is
# left in "$@" is the list of dictionaries.
#
outbase=dict
langtabs=${LIBDIR}/!!DEFLANG!!
more_options=yes
while [ "$more_options" = yes ]
do
    case "$1" in
        -l)
            langtabs="$2"
            # Two single shifts on purpose: a combined "shift 2" does
            # nothing at all in some shells when only one word is left.
            shift
            shift
            ;;
        -b)
            outbase="$2"
            shift
            shift
            ;;
        -*)
            echo "$USAGE" 1>&2
            exit 1
            ;;
        *)
            more_options=no
            ;;
    esac
done
-
#
# At least two dictionaries must remain after the switches; with fewer,
# print the usage message on stderr and give up.
#
case $# in
    0 | 1)
        echo "$USAGE" 1>&2
        exit 1
        ;;
esac
-
#
# Scratch files.  Each name is the common ${TMP} prefix plus its own
# suffix, so a single "${TMP}*" glob covers all of them.
#
MUNCHOUTPUT="${TMP}a"
MISSINGWORDS="${TMP}b"
TEMPDICT="${TMP}c"
FAKEDICT="${TMP}d"
FAKEHASH="${TMP}e.hash"

#
# Clean up when interrupted.  Signals 1, 2 and 15 count as failure;
# signal 13 removes the scratch files but exits with status 0.  The
# double quotes mean ${TMP} is expanded now, when the traps are set.
#
for sig in 1 2 15
do
    trap "/bin/rm -f ${TMP}*; exit 1" $sig
done
trap "/bin/rm -f ${TMP}*; exit 0" 13
-
#
# Create a dummy dictionary to hold a compiled copy of the language
# tables.  The one-word dictionary exists only so that buildhash has
# something to compile; $FAKEHASH is what the later ispell calls load
# with -d.
#
# Paths are quoted so that a -l file name (or temp directory) containing
# whitespace or glob characters is passed through as a single word.
#
echo 'QQQQQQQQ' > "$FAKEDICT"
buildhash -s "$FAKEDICT" "$langtabs" "$FAKEHASH" || {
    echo "Couldn't create fake hash file" 1>&2
    # ${TMP:?} aborts rather than letting the glob degrade to "rm -f *".
    /bin/rm -f "${TMP:?}"*
    exit 1
}
# The glob also removes anything else created under the dummy
# dictionary's name (presumably buildhash side files -- not verified).
/bin/rm -f "${FAKEDICT:?}"*
#
# Figure out what the flag-marking character is.
#
# The "flagmarker" line is pulled out of ispell's -D output for the
# fake hash file.  If the value starts with a backslash, only the
# character following it is kept.
#
flagmarker=$(ispell -D -d "$FAKEHASH" \
  | sed -n '/^flagmarker/s/flagmarker //p')
case "$flagmarker" in
    '')
        # Without a marker the later "join -t$flagmarker" has no
        # separator and the output dictionaries would silently be
        # wrong, so stop here instead.
        echo "Couldn't determine the flag marker" 1>&2
        /bin/rm -f "${TMP:?}"*
        exit 1
        ;;
    \\*)
        flagmarker=$(expr "$flagmarker" : '.\(.\)')
        ;;
esac
#
# (1) Use munchlist to create a list of roots and maximal suffixes.
#
# All input dictionaries are munched together and the sorted result is
# saved for the join in step (2).  $SORTTMP is left unquoted on purpose:
# it must split into the option and its argument.  The user-supplied
# language file and the output path are quoted so odd file names
# survive intact.
#
munchlist -l "$langtabs" "$@" | sort $SORTTMP > "$MUNCHOUTPUT"
#
# (2) Use join to add the maximal suffixes to each dictionary's roots.
# Re-expand this, combine with the original, and save for later.
#
# For every input dictionary: expand it to one word per line, sort
# uniquely, join against the munched list on the flag marker (-a1 keeps
# words that have no match), expand again, and store the sorted unique
# result as ${TEMPDICT}.<n>, with n counting from 0 in command-line
# order.
#
# File names are quoted so a dictionary path containing whitespace is
# read as one file; $SORTTMP stays unquoted so it splits into the
# option and its argument.
#
newline='
'
dictno=0
for dictfile
do
    ispell -e -d "$FAKEHASH" < "$dictfile" | tr ' ' "$newline" \
      | sort -u $SORTTMP | join "-t$flagmarker" -a1 - "$MUNCHOUTPUT" \
      | ispell -e -d "$FAKEHASH" | tr ' ' "$newline" \
      | sort -u $SORTTMP > "${TEMPDICT}.$dictno"
    dictno=$((dictno + 1))
done
# The munched list is not needed after this step.
/bin/rm -f "$MUNCHOUTPUT"
#
# (3) For each adjacent pair of dictionaries, use comm to find words
# in the smaller that are missing from the larger, and add them
# to the larger.
#
# The first argument is shifted away so the loop runs once per
# *remaining* dictionary; the loop variable only counts iterations,
# the real data lives in ${TEMPDICT}.<n>.  Step (4) relies on this
# shift as well.
#
firstdict="$1"          # NOTE(review): never read in the visible script
shift
lastdict="${TEMPDICT}.0"
dictno=1
for dictfile
do
    # Lines only in the smaller dictionary.
    comm -23 "$lastdict" "${TEMPDICT}.$dictno" > "$MISSINGWORDS.$dictno"
    if [ -s "$MISSINGWORDS.$dictno" ]
    then
        # Fold the missing words into the larger dictionary in place;
        # $SORTTMP is deliberately unquoted so it splits into words.
        sort $SORTTMP -o "${TEMPDICT}.$dictno" \
            "${TEMPDICT}.$dictno" "$MISSINGWORDS.$dictno"
    fi
    lastdict="${TEMPDICT}.$dictno"
    dictno=$((dictno + 1))
done
# ${MISSINGWORDS:?} keeps the glob from degrading to ".*" if the
# variable were ever empty.
/bin/rm -f "${MISSINGWORDS:?}".*
#
# (4) For each pair of dictionaries, use comm to eliminate words in
# the smaller from the larger, and shrink the result with munchlist.
#
# The smallest dictionary is written whole to $outbase.0.  It is munched
# with the same language tables (-l) as every other munchlist call in
# this script; without -l it would use munchlist's default tables even
# when the user selected a different language file.
#
munchlist -l "$langtabs" "${TEMPDICT}.0" > "$outbase.0"
lastdict="${TEMPDICT}.0"
dictno=1
# The positional parameters were already shifted in step (3), so this
# loop runs once per dictionary after the first; the loop variable is
# only a counter.
for dictfile
do
    # Words present only in the larger dictionary of the pair.
    comm -13 "$lastdict" "${TEMPDICT}.$dictno" \
      | munchlist -l "$langtabs" > "$outbase.$dictno"
    # The smaller dictionary has now been used for the last time.
    /bin/rm -f "$lastdict"
    lastdict="${TEMPDICT}.$dictno"
    dictno=$((dictno + 1))
done
# Final cleanup; ${TMP:?} aborts rather than letting the glob become
# "rm -f *" if the prefix were ever empty.
/bin/rm -f "${TMP:?}"*
-