home *** CD-ROM | disk | FTP | other *** search
Text File | 1991-10-19 | 32.2 KB | 1,074 lines |
- Newsgroups: comp.sources.misc
- From: goer@midway.uchicago.edu (Richard L. Goerwitz)
- Subject: v23i072: quranref - Holy Qur'an word and passage based retrievals, Part06/08
- Message-ID: <1991Oct19.022331.13057@sparky.imd.sterling.com>
- X-Md4-Signature: 8fe3852cf0ae92dbb24e3cd376a4b203
- Date: Sat, 19 Oct 1991 02:23:31 GMT
- Approved: kent@sparky.imd.sterling.com
-
- Submitted-by: goer@midway.uchicago.edu (Richard L. Goerwitz)
- Posting-number: Volume 23, Issue 72
- Archive-name: quranref/part06
- Environment: Icon
-
- ---- Cut Here and feed the following to sh ----
- #!/bin/sh
- # this is quranref.06 (part 6 of a multipart archive)
- # do not concatenate these parts, unpack them in order with /bin/sh
- # file iolib.icn continued
- #
- # _shar_seq_.tmp holds the number of the part that must be unpacked
- # next (each part rewrites it just before exiting).  If it cannot be
- # read, no earlier part has been unpacked in this directory.
- if test ! -r _shar_seq_.tmp; then
- echo 'Please unpack part 1 first!'
- exit 1
- fi
- # Read the expected part number in a subshell; the subshell exits
- # non-zero unless that number is 6, which aborts the whole script.
- (read Scheck
- if test "$Scheck" != 6; then
- echo Please unpack part "$Scheck" next!
- exit 1
- else
- exit 0
- fi
- ) < _shar_seq_.tmp || exit 1
- # _shar_wnt_.tmp is created when extraction of a file starts and is
- # removed when a file is skipped or finished.  If it is absent here,
- # iolib.icn is not being extracted, so this continuation is not
- # appended to it.
- if test ! -f _shar_wnt_.tmp; then
- echo 'x - still skipping iolib.icn'
- else
- echo 'x - continuing file iolib.icn'
- sed 's/^X//' << 'SHAR_EOF' >> 'iolib.icn' &&
- X er("getval","can't make a table for your terminal",4)
- X
- X return \tc_table[id] | fail
- X # er("getval","the current terminal doesn't support "||id,7)
- X
- Xend
- X
- X
- X
- Xprocedure Decode(s)
- X
- X    # Expands the escape notation used in termcap entries and returns
- X    # the resulting string:
- X    #
- X    #   \\  \^  \E  \b  \f  \n  \r  \t    the character named
- X    #   \d, \dd, \ddd  (d a digit)        character with that octal code
- X    #   \c  (any other character c)       c itself
- X    #   ^c                                the control character for c
- X    #
- X    # er() is called when a backslash-digit sequence cannot be read as
- X    # an octal number.
- X
- X    local new_s, chr, chr2
- X
- X    new_s := ""
- X
- X    s ? {
- X
- X        while new_s ||:= tab(upto('\\^')) do {
- X            chr := move(1)
- X            # For ^c the upper-cased character has bit 6 toggled.  This
- X            # equals "ord - 64" for @, A-Z, [, \, ], ^ and _, and it also
- X            # turns ^? into DEL (127), where a subtraction would hand
- X            # char() a negative value.
- X            if chr == "\\" then {
- X                new_s ||:= {
- X                    case chr2 := move(1) of {
- X                        "\\" : "\\"
- X                        "^"  : "^"
- X                        "E"  : "\e"
- X                        "b"  : "\b"
- X                        "f"  : "\f"
- X                        "n"  : "\n"
- X                        "r"  : "\r"
- X                        "t"  : "\t"
- X                        default : {
- X                            if any(&digits,chr2) then {
- X                                # Try two further characters, then one,
- X                                # then none, until an octal number results.
- X                                char(integer("8r"||chr2||move(2 to 0 by -1))) |
- X                                    er("Decode","bad termcap entry",3)
- X                            }
- X                            else chr2
- X                        }
- X                    }
- X                }
- X            }
- X            else new_s ||:= char(ixor(ord(map(move(1),&lcase,&ucase)), 64))
- X        }
- X        new_s ||:= tab(0)
- X    }
- X
- X    return new_s
- X
- Xend
- X
- X
- X
- Xprocedure igoto(cm,col,line)
- X
- X    # Returns the cursor-addressing string obtained by filling the
- X    # termcap cursor-motion template cm with the position col,line.
- X    # Positions are 1-based; 1 is subtracted from each value on output
- X    # unless the template contains %i.
- X    #
- X    # Template codes handled:
- X    #   %d        value as decimal
- X    #   %2  %3    value as decimal, zero-padded to 2 or 3 places
- X    #   %.        value as a single character
- X    #   %+x       value plus ord(x) as a single character
- X    #   %n        both values exclusive-or'ed with 96
- X    #   %i        stop subtracting 1 from the values
- X    #   %r        exchange line and column
- X    #   %%        a literal %
- X    #   %B        BCD encoding (16*(value/10) + value%10)
- X    #   %>xy      if value > ord(x) then add ord(y)
- X    #
- X    # The first output code consumes line; after every output code the
- X    # two values are exchanged, so the next one consumes the other.
- X    #
- X    # er() is called if col or line exceeds the terminal's "co" or "li"
- X    # value, or if an unknown % code is met.
- X
- X    local colline, range, increment, padding, str, outstr, chr, x, y
- X
- X    if \col > (tc_table["co"]) | \line > (tc_table["li"]) then {
- X        colline := string(\col) || "," || string(\line) | string(\col|line)
- X        range := "(" || tc_table["co"]-1 || "," || tc_table["li"]-1 || ")"
- X        er("igoto",colline || " out of range " || (\range|""),9)
- X    }
- X
- X    # Use the Iconish 1;1 upper left corner & not the C-ish 0 offsets
- X    increment := -1
- X    outstr := ""
- X
- X    cm ? {
- X        while outstr ||:= tab(find("%")) do {
- X            tab(match("%"))
- X            # %2 and %3 are treated as %d with a field width; a "d"
- X            # following the digit is swallowed if present.
- X            if padding := integer(tab(any('23')))
- X            then chr := (="d" | "d")
- X            else chr := move(1)
- X            # Codes that emit a value succeed here and trigger the swap;
- X            # everything else falls through to the second case.
- X            if case \chr of {
- X                "." : outstr ||:= char(line + increment)
- X                "+" : outstr ||:= char(line + ord(move(1)) + increment)
- X                "d" : {
- X                    str := string(line + increment)
- X                    outstr ||:= right(str, \padding, "0") | str
- X                }
- X            }
- X            then line :=: col
- X            else {
- X                case chr of {
- X                    "n" : line := ixor(line,96) & col := ixor(col,96)
- X                    "i" : increment := 0
- X                    "r" : line :=: col
- X                    "%" : outstr ||:= "%"
- X                    "B" : line := ior(ishift(line / 10, 4), line % 10)
- X                    ">" : {
- X                        x := move(1); y := move(1)
- X                        line > ord(x) & line +:= ord(y)
- X                        # keep the clause from failing when no adjustment
- X                        # was needed
- X                        &null
- X                    }
- X                # Report the failure under this procedure's own name
- X                # (the message used to say "goto").
- X                } | er("igoto","bad termcap entry",5)
- X            }
- X        }
- X        return outstr || tab(0)
- X    }
- X
- Xend
- X
- X
- X
- Xprocedure iputs(cp, affcnt)
- X
- X    # Sends the capability string cp to standard output.  Meant as a
- X    # drop-in for itlib's iputs() (the UNIX-only library that can cope,
- X    # albeit inelegantly, with terminals needing padding).  This version
- X    # cannot pad: a string that opens with a padding count followed by
- X    # "*" stops the program.  affcnt is accepted but not used.
- X
- X    static pad_chars
- X    initial pad_chars := &digits ++ '.'
- X
- X    if type(cp) ~== "string" then
- X        er("iputs","you can't iputs() a non-string value!",10)
- X
- X    # A leading run of digits/periods immediately followed by "*"
- X    # marks a padding request.
- X    if cp ? (tab(many(pad_chars)), ="*") then
- X        stop("iputs: iolib can't use terminals that require padding.")
- X
- X    writes(cp)
- X    return
- X
- Xend
- SHAR_EOF
- echo 'File iolib.icn is complete' &&
- true || echo 'restore of iolib.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= iscreen.icn ==============
- # An existing iscreen.icn is left alone unless the script was given -c.
- # _shar_wnt_.tmp is (re)created only when the file is really extracted.
- if test -f 'iscreen.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping iscreen.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting iscreen.icn (Text)'
- # Strip the protective leading X from every line of the here-document.
- sed 's/^X//' << 'SHAR_EOF' > 'iscreen.icn' &&
- X############################################################################
- X#
- X# Name: iscreen.icn
- X#
- X# Title: Icon screen functions
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.26
- X#
- X############################################################################
- X#
- X# This and future versions of iscreen are placed in the public domain - RLG
- X#
- X############################################################################
- X#
- X# This file contains some rudimentary screen functions for use with
- X# itlib.icn (termlib-like routines for Icon).
- X#
- X# clear() - clears the screen (tries several methods)
- X# emphasize() - initiates emphasized mode
- X# boldface() - initiates bold mode
- X# blink() - initiates blinking mode
- X# normal() - resets to normal mode
- X# message(s) - displays message s on 2nd-to-last line
- X# underline() - initiates underline mode
- X# status_line(s,s2,p) - draws status line s on the 3rd-to-last
- X# screen line; if s is too long for the terminal, s2 is used;
- X# if p is nonnull then it either centers, left-, or right-justi-
- X# fies, depending on the value, "c," "l," or "r."
- X# clear_emphasize() - horrible way of clearing the screen to all-
- X# emphasize mode; necessary for many terminals
- X#
- X############################################################################
- X#
- X# Requires: UNIX
- X#
- X# Links: itlib.icn (or your OS-specific port of itlib)
- X#
- X# See also: boldface.icn
- X#
- X############################################################################
- X
- X
- Xprocedure clear()
- X
- X    # Erases the screen, falling back on progressively cruder termcap
- X    # capabilities: "cl"; else home the cursor ("cm" or "ho"); then
- X    # "cd"; else clear each line with "ce" and home the cursor again.
- X    local row
- X
- X    normal()
- X
- X    # Step 1: clear-screen, or failing that just go to the top left.
- X    iputs(getval("cl")) |
- X        iputs(igoto(getval("cm"),1,1) | getval("ho"))
- X
- X    # Step 2: clear to end of display, or failing that wipe every line.
- X    iputs(getval("cd")) | {
- X        every row := 1 to getval("li") do {
- X            iputs(igoto(getval("cm"),1,row))
- X            iputs(getval("ce"))
- X        }
- X        iputs(igoto(getval("cm"),1,1))
- X    }
- X
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure boldface()
- X
- X    # Puts the terminal into bold mode using the "md" capability.  If
- X    # the terminal has no "md", the global boldface is rebound to
- X    # emphasize on the first call and emphasize() is used instead.
- X
- X    static bold_on, cookie_erase
- X    initial {
- X        (bold_on := getval("md")) | {
- X            # One global procedure value substituted for another.
- X            boldface := emphasize
- X            return emphasize()
- X        }
- X        # "mg" backspaces ("le", "bc" or plain \b); stays null when
- X        # the terminal has no "mg" entry.
- X        cookie_erase := repl(getval("le"|"bc") | "\b", getval("mg"))
- X    }
- X
- X    normal()
- X    # Emit whichever of the two strings are non-null, in this order.
- X    every iputs(\(bold_on | cookie_erase))
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure blink()
- X
- X    # Puts the terminal into blinking mode using the "mb" capability.
- X    # If the terminal has no "mb", the global blink is rebound to
- X    # emphasize on the first call and emphasize() is used instead.
- X
- X    static blink_on, cookie_erase
- X    initial {
- X        (blink_on := getval("mb")) | {
- X            # One global procedure value substituted for another.
- X            blink := emphasize
- X            return emphasize()
- X        }
- X        # "mg" backspaces ("le", "bc" or plain \b); stays null when
- X        # the terminal has no "mg" entry.
- X        cookie_erase := repl(getval("le"|"bc") | "\b", getval("mg"))
- X    }
- X
- X    normal()
- X    # Emit whichever of the two strings are non-null, in this order.
- X    every iputs(\(blink_on | cookie_erase))
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure emphasize()
- X
- X    # Puts the terminal into emphasized mode: standout ("so") when the
- X    # terminal has it, otherwise underline ("us").  With neither
- X    # capability only normal() is called.
- X
- X    static emph_on, cookie_erase
- X    initial {
- X        if emph_on := getval("so") then
- X            cookie_erase := repl(getval("le"|"bc") | "\b", getval("sg"))
- X        else if emph_on := getval("us") then
- X            cookie_erase := repl(getval("le"|"bc") | "\b", getval("ug"))
- X    }
- X
- X    normal()
- X    # Emit whichever of the two strings are non-null, in this order.
- X    every iputs(\(emph_on | cookie_erase))
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure underline()
- X
- X    # Puts the terminal into underline mode using the "us" capability;
- X    # without "us" only normal() is called.
- X
- X    static under_on, cookie_erase
- X    initial {
- X        if under_on := getval("us") then
- X            # "ug" backspaces ("le", "bc" or plain \b)
- X            cookie_erase := repl(getval("le"|"bc") | "\b", getval("ug"))
- X    }
- X
- X    normal()
- X    # Emit whichever of the two strings are non-null, in this order.
- X    every iputs(\(under_on | cookie_erase))
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure normal(mode)
- X
- X    # Returns the terminal to normal mode by sending the "end" strings
- X    # for standout ("se"), underline ("ue") and all attributes ("me").
- X    # A string identical to one already selected is sent only once.
- X    # Each is followed by its magic-cookie backspaces if the terminal
- X    # declares any ("sg", "ug", "mg").  The mode argument is accepted
- X    # but not used.
- X
- X    local ue_str, me_str
- X    static UN_emph_str, emph_cookie_str,
- X        UN_underline_str, underline_cookie_str,
- X        UN_bold_str, bold_cookie_str
- X
- X    initial {
- X
- X        # Find out code to turn off emphasize (reverse video) mode.
- X        if UN_emph_str := getval("se") then
- X            # Figure out how many backspaces we need to erase cookies.
- X            emph_cookie_str := repl(getval("le"|"bc") | "\b", getval("sg"))
- X
- X        # Figure out how to turn off underline mode.  The comparison is
- X        # guarded with \ because a lexical comparison against a null
- X        # value (terminal without "se") is a run-time error.
- X        if ue_str := getval("ue") & not (ue_str == \UN_emph_str) then {
- X            UN_underline_str := ue_str
- X            underline_cookie_str :=
- X                repl(getval("le"|"bc") | "\b", getval("ug"))
- X        }
- X
- X        # Figure out how to turn off boldface mode; same guarding, since
- X        # either of the two strings above may be null.
- X        if me_str := getval("me") &
- X            not (me_str == \UN_emph_str) &
- X            not (me_str == \UN_underline_str)
- X        then {
- X            UN_bold_str := me_str
- X            # Figure out how many backspaces we need to erase cookies.
- X            bold_cookie_str := repl(getval("le"|"bc") | "\b", getval("mg"))
- X        }
- X
- X    }
- X
- X    iputs(\UN_emph_str) &
- X        iputs(\emph_cookie_str)
- X
- X    iputs(\UN_underline_str) &
- X        iputs(\underline_cookie_str)
- X
- X    iputs(\UN_bold_str) &
- X        iputs(\bold_cookie_str)
- X
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure status_line(s,s2,p)
- X
- X    # Writes a status line, in emphasized mode, on the terminal's
- X    # third-to-last line.
- X    #
- X    #   s   the status text (default "")
- X    #   s2  optional shorter text, used when s is wider than the
- X    #       terminal (default "")
- X    #   p   placement: "c" centered (default), "l" left justified,
- X    #       "r" right justified
- X    #
- X    # er() is called if neither s nor s2 fits on the line, or if p is
- X    # not one of the three values above.
- X
- X    local width
- X
- X    /s := ""; /s2 := ""; /p := "c"
- X    width := getval("co")
- X    if *s > width then {
- X        # s2 is acceptable when it fits exactly, just as s would be.
- X        (*s2 <= width, s := s2) |
- X            er("status_line","Your terminal is too narrow.",4)
- X    }
- X
- X    case p of {
- X        "c" : s := center(s,width)
- X        "l" : s := left(s,width)
- X        "r" : s := right(s,width)
- X        # er(), not stop(): stop() would print the trailing 4 as part
- X        # of the message instead of treating it as an error number.
- X        default: er("status_line","Unknown option "||string(p),4)
- X    }
- X
- X    iputs(igoto(getval("cm"), 1, getval("li")-2))
- X    emphasize(); writes(s)
- X    normal()
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure message(s)
- X
- X    # Shows prompt s on the second-to-last screen line.  The last line
- X    # is only cleared, never written to, because of all the problems
- X    # with automatic scrolling.
- X
- X    local back
- X
- X    /s := ""
- X
- X    # Clear the last line (back = 0), then the one above it (back = 1),
- X    # leaving the cursor at the start of the latter.
- X    every back := 0 | 1 do {
- X        normal()
- X        iputs(igoto(getval("cm"), 1, getval("li") - back))
- X        iputs(getval("ce"))
- X    }
- X
- X    # Cut the prompt off before the last column if it reaches that far.
- X    writes(s[1:getval("co")]) | writes(s)
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure clear_underline()
- X
- X    # Horrible way of clearing the screen to all underline mode, but
- X    # the only apparent way we can do it "portably" using the termcap
- X    # capability database: every line but the last is overwritten with
- X    # blanks while underline mode is on, and the cursor is then sent
- X    # back to the upper left corner.
- X    #
- X    # Returns (the null value) like the other screen procedures, so a
- X    # call can be used in a conjunction without making it fail.
- X
- X    local i
- X
- X    underline()
- X    iputs(igoto(getval("cm"),1,1))
- X    if getval("am") then {
- X        # With automatic margins one unbroken stream of blanks wraps
- X        # from line to line by itself.
- X        underline()
- X        every 1 to (getval("li")-1) * getval("co") do
- X            writes(" ")
- X    }
- X    else {
- X        # Otherwise address each line explicitly.
- X        every i := 1 to getval("li")-1 do {
- X            iputs(igoto(getval("cm"), 1, i))
- X            underline()
- X            writes(repl(" ",getval("co")))
- X        }
- X    }
- X    iputs(igoto(getval("cm"),1,1))
- X    return
- X
- Xend
- X
- X
- X
- Xprocedure clear_emphasize()
- X
- X    # Horrible way of clearing the screen to all reverse-video, but
- X    # the only apparent way we can do it "portably" using the termcap
- X    # capability database: every line but the last is overwritten with
- X    # blanks while emphasized mode is on, and the cursor is then sent
- X    # back to the upper left corner.
- X    #
- X    # Returns (the null value) like the other screen procedures, so a
- X    # call can be used in a conjunction without making it fail.
- X
- X    local i
- X
- X    emphasize()
- X    iputs(igoto(getval("cm"),1,1))
- X    if getval("am") then {
- X        # With automatic margins one unbroken stream of blanks wraps
- X        # from line to line by itself.
- X        emphasize()
- X        every 1 to (getval("li")-1) * getval("co") do
- X            writes(" ")
- X    }
- X    else {
- X        # Otherwise address each line explicitly.
- X        every i := 1 to getval("li")-1 do {
- X            iputs(igoto(getval("cm"), 1, i))
- X            emphasize()
- X            writes(repl(" ",getval("co")))
- X        }
- X    }
- X    iputs(igoto(getval("cm"),1,1))
- X    return
- X
- Xend
- SHAR_EOF
- true || echo 'restore of iscreen.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= qur2rtv.icn ==============
- # An existing qur2rtv.icn is left alone unless the script was given -c.
- # _shar_wnt_.tmp is (re)created only when the file is really extracted.
- if test -f 'qur2rtv.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping qur2rtv.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting qur2rtv.icn (Text)'
- # Strip the protective leading X from every line of the here-document.
- sed 's/^X//' << 'SHAR_EOF' > 'qur2rtv.icn' &&
- X############################################################################
- X#
- X# Name: qur2rtv.icn
- X#
- X# Title: qur2rtv (Quran -> retrieve format converter)
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.2
- X#
- X############################################################################
- X#
- X# Program for converting the internet-accessible scan of M. H.
- X# Shakir's Quran translation into retrieve format. Reads standard
- X# input. Writes reformatted text to standard output. Assumes the
- X# sections will come in order (1 before 2; 2 before 3, etc.), but
- X# that they will all be directed into the same input stream.
- X# Naturally, it does not matter whether they have been concatenated
- X# into one, or remain split into several, files.
- X#
- X############################################################################
- X#
- X# Links: none
- X#
- X############################################################################
- X
- X
- Xprocedure main()
- X
- X # Reads the scanned translation from standard input and writes it
- X # to standard output in retrieve format: a "::section:verse" line
- X # followed by the text collected for that verse. Several known
- X # irregularities in the scan (stray text, mistyped or missing verse
- X # numbers) are special-cased below. Exits with status 0.
- X
- X local line, verse
- X # in case this ever gets encapsulated
- X static section, last_verse, text, skipped, extra_text, seenit
- X initial {
- X # section counts sections seen so far; last_verse is the number of
- X # the verse most recently written; skipped is nonnull while the
- X # previous input line was blank (true at the start).
- X last_verse := 1000
- X section := 0
- X extra_text := ""
- X skipped := 1
- X }
- X
- X # While you can read lines from stdin...
- X while line := trim(read(),'\t \x0D\x1A') # trim CR, tab, sp, ^Z
- X do {
- X
- X # ...scan them for text numbers, and output these in retrieve
- X # format, along with corresponding text.
- X line ? {
- X
- X # Housekeeping.
- X if pos(0) then {
- X skipped := 1 # note that the last line was blank
- X next # skip past empty lines
- X }
- X tab(many('\t ')) # tab past whitespace (if present)
- X
- X # Two cases where extra text has been tacked onto a file
- X # and has to be stripped out.
- X # Case 1: a line starting "THE SPIDER"; discard input up to and
- X # including a line consisting solely of ^Z or of "with".
- X ="THE SPIDER" & {
- X until read(&input) ? (tab(match("\x1A" | "with")), pos(0))
- X next
- X }
- X # Case 2: an invocation line met (once only) while at 65:12;
- X # discard input up to and including a line starting with ^Z or
- X # "and she".
- X if section = 65 & verse = 12 & /seenit & {
- X ="In the Name of Allah, the Beneficent, the Merciful."
- X }
- X then {
- X until read(&input) ? tab(match("\x1A" | "and she"))
- X seenit := 1
- X next
- X }
- X # More housekeeping (the text is rife with errors).
- X # Skip a stray ". " standing before a "2", and a stray "/ ".
- X (=". ", match("2"))
- X ="/ "
- X
- X # If the next line begins with a numerical reference, then
- X # write out the text of the preceding text (if in fact
- X # there *was* a preceding text block). Finally, write out the
- X # section/text reference (in retrieve format).
- X
- X # NB: & binds more loosely than |, so the test below reads
- X # (\skipped | any(...)) & (verse := is_it_a_verse()): a verse
- X # number is looked for only after a blank line or when the text
- X # collected so far ends in ".", "?" or ":".
- X if \skipped | any('.?:', \text, -1) &
- X verse := is_it_a_verse()
- X then {
- X write(\text)
- X # A verse number of -1, 0 or 1 starts a new section.
- X if -1 <= verse < 2 then {
- X section +:= 1
- X# # For debugging purposes.
- X# write(&errout, "resetting; text = \n", \text)
- X# write(&errout, "section now = ", section)
- X# write(&errout, "last_verse = ", last_verse)
- X# write(&errout, "verse = ", verse)
- X }
- X # Otherwise the number should be last_verse+1; if it is not, try
- X # the repairs below in turn and stop if none applies.
- X else if verse ~= (last_verse+1) then {
- X if verse = (last_verse+2) then
- X write(&errout, "LF missing, ",section,":",last_verse)
- X # Retry with every "1" in the number read as "7".
- X else if not (verse := map(verse, "1", "7")= (last_verse+1))
- X then if verse = 34 & last_verse = 35
- X then verse := 36
- X else if verse = 6 & last_verse = 3 & section = 47
- X then {
- X write(&errout,"extra text, ",section,":",last_verse)
- X # Discard input up to and including the next blank line.
- X until trim(read(&input)) == ""
- X next
- X } else if section = 43 & verse = 29 & last_verse = 30
- X then {
- X # Discard input up to and including the first line that
- X # contains this phrase.
- X find("disbelievers in it", !&input) |
- X stop("parsing error; get help")
- X next
- X }
- X else stop("error, ",section,":",last_verse,"\n",text)
- X }
- X last_verse := verse
- X write("::", section, ":", verse)
- X tab(many(' \t'))
- X # Text held back in extra_text belongs to this new verse.
- X text := extra_text || " " || tab(0)
- X extra_text := ""
- X } else {
- X # Dump the (rest of) the line onto text.
- X if /skipped & (extra_text == "") then
- X text ||:= " " || tab(0)
- X else {
- X # if we've had a blank line in this text block, but
- X # no verse number, then concatenate it with any other
- X # text we have after the last blank line
- X extra_text ||:= " " || tab(0)
- X }
- X }
- X }
- X # Reached only when no "next" was taken above, i.e. after a
- X # non-blank line has been processed in full.
- X skipped := &null
- X }
- X # Flush the "text" buffer.
- X \text ||:= " " || \extra_text
- X write(\text)
- X
- X exit(0)
- X # or fail
- X
- Xend
- X
- X
- X#
- X# From strings.icn in the IPL (written by Ralph Griswold).
- X#
- Xprocedure REplace(s1,s2,s3)
- X
- X    # Returns s1 with every occurrence of s2 replaced by s3, scanning
- X    # from left to right without overlap.  Fails if s2 is empty: an
- X    # empty pattern matches at every position without advancing, so the
- X    # replacement loop would never end.
- X
- X    local result, i
- X    result := ""
- X    i := *s2
- X    if i = 0 then fail
- X
- X    s1 ? {
- X        while result ||:= tab(find(s2)) do {
- X            result ||:= s3
- X            move(i)                 # step over the matched s2
- X        }
- X        return result || tab(0)
- X    }
- X
- Xend
- X
- X
- Xprocedure is_it_a_verse()
- X
- X local tmp
- X
- X #
- X # Can the first bit of text in &subject possibly be construed as a
- X # verse reference (with typos)? Let's see.
- X #
- X # Works on the caller's scanning environment: the matching below
- X # moves the caller's &pos. Returns the verse number (an integer,
- X # or a string of digits for a plain numeric reference) and fails
- X # if nothing at the current position looks like a reference.
- X #
- X
- X # I've seen "I 1." for 11.
- X return (="I 1. ", 11) |
- X # I've seen "I." or "l." for "1."
- X (tab(any('lI')), =".", 1) |
- X # I've even seen "S." for "5."
- X (="S", =".", 5) |
- X # The ordinary case: digits followed by ".", ":" or a space.
- X 1(tab(many(&digits)), tab(any('.: '))) | {
- X # If it's none of the above, then as long as it's at least two
- X # chars, try lots of mapping. If I took away the restriction that
- X # the sequence be at least two chars, I'd get lines which begin with
- X # the word "I" (as in "I am"). No go.
- X #
- X # The candidate scanned here is the text through the first space
- X # plus whatever follows up to the next upper-case letter.
- X (tab(find(" ")+1) || tab(upto(&ucase))) \ 1 ? {
- X *(tmp := 1(tab(many(&digits++'IOlS')), tab(any(':., ')))) > 1 &
- X integer(map(tmp, "IOlS", "1015"))
- X }
- X }
- X
- Xend
- SHAR_EOF
- true || echo 'restore of qur2rtv.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= makeind.icn ==============
- # An existing makeind.icn is left alone unless the script was given -c.
- # _shar_wnt_.tmp is (re)created only when the file is really extracted.
- if test -f 'makeind.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping makeind.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting makeind.icn (Text)'
- # Strip the protective leading X from every line of the here-document.
- sed 's/^X//' << 'SHAR_EOF' > 'makeind.icn' &&
- X############################################################################
- X#
- X# Name: makeind.icn
- X#
- X# Title: makeind.icn
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 2.4
- X#
- X############################################################################
- X#
- X# This file, makeind.icn, compiles into an indexing program which
- X# creates a series of files offering the user rapid access to
- X# individual elements (usually words) within a text file. Access is
- X# gained through a set of basic retrieval utilities contained in the
- X# file retrieve.icn, bmp2text.icn, retrops.icn, and others included
- X# with this package. In order to be indexable, files must interleave
- X# string coded bitfield-style designators with text in the following
- X# manner:
- X#
- X# ::001:001:001
- X# This is text.
- X# ::001:001:002
- X# This is more text.
- X#
- X# The lines beginning with :: (a double colon) mark bitfield-style
- X# location-designators. Location designators are strings with digit
- X# fields of fixed number and length separated either by nothing (as
- X# in, say 001001002), or better yet by non-digits (e.g. 001:001:002).
- X# NOTE WELL: The bitmaps must come in ascending order. For example,
- X# if we assume three-field bitmaps, 002:001:014 would come before
- X# 003:001:013. If your file is not sorted properly, depending on
- X# the structure of the file, retrieve() may 1) abort, 2) retrieve the
- X# wrong text, or 3) work perfectly fine. Unless you're absolutely
- X# sure of what you're doing, write a quick sort routine and put the
- X# file in order before invoking makeind on it.
- X#
- X# usage: makeind -f filename -m int -n int [-l int] [-s]
- X#
- X# When calling makeind, you must specify the filename to be indexed
- X# (-f filename), the maximum field value (-m max-value; e.g. if
- X# fields can go from 0 to 255, then -m 255 would be used), and the
- X# number of fields (-n field-number). The -s switch directs makeind
- X# to create a case-sensitive index. The default is case-insensitive.
- X# -l [int] tells makeind to create a .LIM file, which is only needed
- X# if you want to retrieve text by location marker, and not just via
- X# the index (for this, you'll need something to translate human-
- X# readable references into retrieve's native format).
- X#
- X# BUGS: This indexing routine is going to eat up a _tremendous_
- X# amount of memory when used on large files, since every token in the
- X# input file gets its own entry in wordtbl, and each entry gets a set
- X# as its corresponding value. If you don't have the memory, then you
- X# could use strings instead of sets (the insert routines will be just
- X# a tiny bit more complicated). Intermediate files could also be
- X# used. Drop me a line if you want help. Otherwise, make sure you
- X# have at *least* two megabytes core for every megabyte of text in
- X# the file you wish to index (or else a very, very good virtual
- X# memory management system).
- X#
- X# NOTE: The -S [field-sep] option is currently disabled because using
- X# it slows things down drastically. If you want to be able to
- X# specify what separator to use when breaking files down into
- X# individual words, consult ./gettokens.icn.
- X#
- X# NOTE ALSO: Makeind compresses the input file somewhat, and in the
- X# process, backs the old file up. If you need the old file back again,
- X# look for a file with the extension .BAK (the original name may be
- X# slightly permutated). Makeind will not overwrite this .BAK file, so
- X# please either erase or move it when you are finished. If you try to
- X# remake without doing this, makeind (ever cautious) will abort.
- X#
- X############################################################################
- X#
- X# Links: options.icn, codeobj.icn, ./indexutl.icn ./gettokens.icn
- X#
- X# See also: retrieve.icn, bmp2text.icn, expandrf.icn
- X#
- X############################################################################
- X
- X# IPL files to be linked in at compile time.
- Xlink options, codeobj
- X
- X# Global variable (for OS-dependencies).
- X# global IS # declared in indexutl.icn
- X
- X# Is is a record containing vital information on an indexed file, such
- X# as the field separator, the string-length of fields, etc. I've re-
- X# moved the record declaration from this file, and placed it in index-
- X# utl.icn.
- X# record is(FS, s_len, len, no, is_case_sensitive, r_field, hufftree)
- X
- X#
- X# Main procedure.
- X#
- Xprocedure main(a)
- X
- X    # Driver for makeind.  In order: parse the arguments, index the
- X    # text, write the .IND/.BMP pair, back the text file up, rewrite it
- X    # in compressed form together with the .UNT and .OFS files,
- X    # optionally write the .LIM file, and finally save the IS record in
- X    # the .IS file.  Exits with status 0 on success.
- X
- X    local usage, opts, src_name, lim_field, ind_name, bmp_name,
- X        major_fields, ofs_name, offset_tbl, is_file, lim_name,
- X        freq_tbl, bak_name, unt_name, code_tbl, word_tbl
- X
- X    # global IS         # IS contains stats for file being indexed
- X
- X    # OS-dependent globals (directory separator, maximum filename
- X    # length less room for the extensions makeind appends).
- X    initialize_os_params()
- X
- X    # Command line.  initialize_IS() creates the global IS record and
- X    # hands back the table of options, keyed by option letter.
- X    usage := "usage: makeind -f filename -m int -n int [-l int] [-s]"
- X    opts := initialize_IS(a)
- X    src_name := \opts["f"] | stop(usage)
- X    lim_field := opts["l"]                  # optional
- X
- X    # Pass 1: collect every token with the set of locations where it
- X    # occurs, plus character frequencies of the text proper; the
- X    # frequencies yield the Huffman tree and the code table.
- X    word_tbl := table()
- X    freq_tbl := table()
- X    create_index(src_name, word_tbl, freq_tbl)
- X    IS.hufftree := heap_2_huffman_tree(heap_init(freq_tbl))
- X    code_tbl := hash_huffcodes(IS.hufftree)
- X
- X    # Keys go to the .IND file, each with a pointer into the .BMP file
- X    # that holds the bitmaps for that key.
- X    ind_name := dir_name(src_name) || create_fname(src_name, "IND")
- X    bmp_name := dir_name(src_name) || create_fname(src_name, "BMP")
- X    write_tokens_and_offsets(ind_name, bmp_name, word_tbl)
- X
- X    # Move the original text out of the way so that its name can be
- X    # reused for the compressed text.  An existing backup is never
- X    # overwritten.
- X    bak_name := create_fname(src_name, "BAK")
- X    bak_name ?:= (tab(many('x')), tab(0))
- X    bak_name := dir_name(src_name) || bak_name
- X    close(open(bak_name)) &
- X        abort("makeind", "backup filename collision; aborting", 6)
- X    rename(src_name, bak_name) |
- X        abort("makeind", "cannot back up file to disk; aborting", 7)
- X
- X    # Pass 2 reads the backup: the bitmaps go (in human-readable form,
- X    # with their offsets into the main text file) to the .UNT file and
- X    # the compressed text goes to the original filename.  The offsets
- X    # of the major divisions of the .UNT file end up in the .OFS file.
- X    # Two thirds of the fields, but never fewer than one, count as
- X    # "major".
- X    major_fields := (IS.no * 2) / 3
- X    if major_fields <= 1 then major_fields := 1
- X    ofs_name := dir_name(src_name) || create_fname(src_name, "OFS")
- X    unt_name := dir_name(src_name) || create_fname(src_name, "UNT")
- X    offset_tbl := store_bitmaps_and_offsets(src_name, unt_name, bak_name,
- X                                            major_fields, code_tbl)
- X    # The .UNT and main text files are complete at this point; only
- X    # the .OFS file is still to be written.
- X    write_bitmaps_and_offsets(ofs_name, offset_tbl, major_fields)
- X
- X    # Optional .LIM file.  Say the text is the Bible and all bitmaps
- X    # for Genesis 1:9-2:10 are wanted: one has to know how far the
- X    # verses of chapter 1 run.  "-l 3" asks makeind to record such
- X    # pre-rollover bitmaps (read back from the .UNT file) for later use
- X    # by expandrf().
- X    if \lim_field then {
- X        lim_name := dir_name(src_name) || create_fname(src_name, "LIM")
- X        write_limits(lim_name, unt_name, lim_field)
- X        IS.r_field := lim_field
- X    }
- X
- X    # Save the IS record in the .IS file.
- X    is_file := open(dir_name(src_name) || create_fname(src_name, "IS"), "w") |
- X        abort("makeind","can't open .IS file",2)
- X    writes(is_file, encode(IS))
- X    close(is_file)
- X
- X    # All is well.  Exit with zero status.
- X    exit(0)
- X
- Xend
- X
- X
- X#
- X# initialize_IS
- X#
- X# Sets up main parameters for the current index file, such as the
- X# field separator to be used in tokenizing the file, the string and
- X# bit lengths of bitmap fields, the number of fields, and the size of
- X# the actual bitmaps (in bytes) as written to disk (comes out to the
- X# smallest multiple of eight greater than the field length times the
- X# field number). The marker length has to be set in the main
- X# procedure, so initialize_IS leaves it null for now.
- X#
- Xprocedure initialize_IS(a)
- X
- X    # Parses the argument list a with options(), creates the global IS
- X    # record and fills in its no, FS, is_case_sensitive, s_len and len
- X    # fields.  Stops with the usage message if the number of options is
- X    # outside 3..6, if -n is missing, or if -m is missing or not a
- X    # number.  Returns the option table (keys are option letters).
- X
- X    local usage, opt_table
- X    # global IS
- X
- X    usage:="usage: makeind -f filename -m int -n int [-l int] [-s]"
- X
- X    IS := is()                        # set up some IS fields
- X    opt_table := options(a, "f:m:n+sS:l+")
- X    3 <= *opt_table <= 6 | stop(usage)
- X    IS.no := \opt_table["n"] | stop(usage)
- X    # -m is documented as mandatory.  Without this check a missing -m
- X    # went unreported and left IS.s_len unset, because string() fails
- X    # on a null argument.
- X    integer(\opt_table["m"]) | stop(usage)
- X    IS.FS := \opt_table["S"] | "['.]?[^-0-9A-Za-z']+'?"
- X    IS.is_case_sensitive := opt_table["s"]    # normally is &null
- X
- X    #
- X    # Calculate string representation length for fields, as well as
- X    # the number of bits required for their integer representation.
- X    # I.e. if the opt_table["m"] value is 99, this will take two chars to
- X    # represent as a string ("99"), but 7 binary "digits" to represent
- X    # internally as a base-two integer.
- X    #
- X    IS.s_len := *string(opt_table["m"])
- X    IS.len := *exbase10(opt_table["m"], 2)
- X
- X    return opt_table
- X
- Xend
- X
- X
- X#
- X# create_index
- X#
- X# (A better name would be fill_out_index_and_char_frequency_table.)
- X#
- X# Places tokens in fname (the full text file) in a table (supplied as
- X# arg 2), with the set of each token's locations recorded as values
- X# for those tokens. IS.FS is not used. IS.s_len is the location
- X# marker string-representation field length. IS.len is the number of
- X# binary digits needed for an integer representation of a given field.
- X# IS.no is the number of fields. While creating a table for tokens in
- X# fname, create_index ALSO fills out a table of character frequencies
- X# in the entries (i.e. it doesn't count frequencies of chars used in
- X# the bitmaps).
- X#
- Xprocedure create_index(fname, wordtbl, char_tbl)
- X
- X    # Reads the text file fname and fills in two caller-supplied tables:
- X    #
- X    #   wordtbl   token -> set of bitmaps of the entries containing it;
- X    #             the key "" collects every bitmap in the file
- X    #   char_tbl  handed to count_chars() together with the text of each
- X    #             entry (the "::" marker lines are not counted)
- X    #
- X    # Calls abort() if fname cannot be opened or contains no lines.
- X    # Returns a short summary string.
- X
- X    local intext, line, bitmap, token, value
- X
- X    intext := open(fname) |
- X        abort("create_index","can't open index file, "||fname, 9)
- X    # Dummy key to hold all bitmaps in the text.
- X    insert(wordtbl, "", set())
- X
- X    # Seek past any garbage.  Take first :: initial line.
- X    match("::", line := !intext)
- X    # line is still null only if nothing could be read.  This has to be
- X    # checked before the loop, which scans line on entry (scanning a
- X    # null value is a run-time error).
- X    \line | abort("create_index", "empty input file, "||fname, 8)
- X
- X    repeat {
- X        line ? {
- X
- X            if ="::" then {
- X                bitmap := digits_2_bitmap(tab(0))
- X                # Insert every bitmap into the dummy entry for "".
- X                insert(wordtbl[""], bitmap)
- X                # A marker on the very last line has no text after it;
- X                # leave the loop rather than process the same marker
- X                # over and over.
- X                line := read(intext) | break
- X            } else {
- X
- X                value := line || "\n"
- X                # Concatenate every line in this entry into a single value.
- X                while line := read(intext) do {
- X                    line ? {
- X                        match("::") & break
- X                        value ||:= line || "\n"
- X                    }
- X                }
- X
- X                value := trim(value, '\n')
- X                # Maintain character frequency table (arg 3 above).
- X                count_chars("" ~== value, char_tbl)
- X                # Build a table of tokens.  NB gettokens() resides
- X                # in a separate file.  The table is arg 2.
- X                value ? {
- X                    every token := gettokens(IS.is_case_sensitive) do {
- X                        /wordtbl[token] := set()
- X                        insert(wordtbl[token], \bitmap)
- X                    }
- X                }
- X
- X                # If :: doesn't appear first on line, we're at EOF.  If
- X                # line == &subject, then to prevent an infinite loop, we
- X                # break.
- X                line == &subject & break
- X                match("::", line) | break
- X            }
- X        }
- X    }
- X
- X    close(intext)
- X    return "tokenized " || *wordtbl || " words; filled out char_tbl"
- X
- Xend
- X
- X
- X#
- X# write_tokens_and_offsets
- X#
- X# Writes to one file (the .IND file) a list of all tokens collected
- X# from the input file, one to a line, followed by a tab, and then a
- X# byte offset into another file (the .BMP file) where the bitmaps for
- X# that token are kept.
- X#
- X# token tab offset
- X#
- X# A seek to "offset" in the .BMP file will put you at the start of a
- X# block of bitmaps.
- X#
- Xprocedure write_tokens_and_offsets(index_fname, bitmap_fname, t)
- X
- X local outtokens, outbitmaps, index_lst, i, bitmap_length,
- X how_many_bitmaps, bits_needed, inverse_signal, inverse_set
- X
- X outtokens := open(index_fname, "w") |
- X abort("write_tokens_and_offsets","can't open "||index_fname,6)
- X outbitmaps := open(bitmap_fname, "w") |
- X abort("write_tokens_and_offsets","can't open "||bitmap_fname,5)
- X # Calculate the length of bitmaps (must be the smallest multiple of
- X # 8 >= (IS.len * IS.no)).
- X bitmap_length := ((IS.len * IS.no) <= seq(0,8))
- X index_lst := sort(t, 3)
- X bits_needed := 24 # bytes needed to hold no of bitmaps for keys
- X inverse_signal := 8388608 # 24th bit, which signals inverse storage
- X
- X every i := 1 to *index_lst-1 by 2 do {
- X
- X # Write token to index file with the offset of that token's
- X # bitmaps in the bitmap file.
- X write(outtokens, index_lst[i], "\t", where(outbitmaps))
- X
- X # Now write the bitmaps for the above token to the bitmap file.
- X # First write out the number of bitmaps in this block. 4 bytes
- X # are allotted to hold this count (23 bits). If the number of
- X # bitmaps for the current token exceeds 3/5 of the total number
- X # of bitmaps for the entire file, then add bits_needed-1 to the
- X # number. That is, set the highest bit to 1.
- X how_many_bitmaps := *index_lst[i+1]
- X if how_many_bitmaps > (inverse_signal-1) then { # just in case
- X abort("write_tokens_and_offsets",
- X "too many bitmaps for"||index_lst[i], bits_needed)
- X }
- X # "" is a dummy key containing all the bitmaps in the text.
- X # If the number of bitmaps for any key other than "" exceeds
- X # 3/5 that of "", then store those bitmaps where the key does
- X # NOT occur, and set the inverse_signal bit...
- X if index_lst[i] ~== "" &
- X how_many_bitmaps >= integer(*t[""] * 0.60)
- X then {
- X inverse_set := (index_lst[2] -- index_lst[i+1])
- SHAR_EOF
- true || echo 'restore of makeind.icn failed'
- fi
- echo 'End of part 6'
- echo 'File makeind.icn is continued in part 7'
- echo 7 > _shar_seq_.tmp
- exit 0
-
- exit 0 # Just in case...
- --
- Kent Landfield INTERNET: kent@sparky.IMD.Sterling.COM
- Sterling Software, IMD UUCP: uunet!sparky!kent
- Phone: (402) 291-8300 FAX: (402) 291-4362
- Please send comp.sources.misc-related mail to kent@uunet.uu.net.
-